From 972f9c280cb92579647438b0bb19ae2a46b25abb Mon Sep 17 00:00:00 2001
From: Alexey Kopytov
Date: Mon, 28 May 2018 10:12:01 +0300
Subject: [PATCH] Bug #79454: Inefficient InnoDB row stats implementation

This patch adds a new set of os_atomic_*() macros:

- os_atomic_*_nr() to atomically add/subtract a value to/from a
  variable without returning its original value;

- os_nonatomic_*_nr() to do the same non-atomically.

These semantics allow hardware-specific optimizations on some
architectures. For example, the STADD instruction available on AArch64
CPUs with LSE (Large System Extensions) performs an atomic
addition/subtraction while discarding the original value (i.e. not
loading it into a register). This not only results in better
scalability as compared to the standard LL/SC synchronization
mechanism provided by AArch64 without LSE, but is also faster than the
regular non-atomic add/subtract instructions on AArch64, since the
standard load/modify/store sequence is replaced with a single
instruction, similar to the code generated for x86(_64).

This patch maps the new family of macros to the optimized LSE-based
implementations on AArch64. It also changes InnoDB fuzzy counters to
use the "non-atomic no-return add/subtract" semantics. No changes in
behavior are introduced for other architectures, i.e. regular
add/subtract code is emitted.
---
 configure.cmake                       |   8 ++
 storage/innobase/include/os0atomic.h  | 116 +++++++++++++++++++++++++++++++++
 storage/innobase/include/sync0sync.h  |   1 -
 storage/innobase/include/ut0counter.h |  40 ++++++++--
 4 files changed, 160 insertions(+), 5 deletions(-)

diff --git a/configure.cmake b/configure.cmake
index b6ec2a4f6a7..586d6edb78c 100644
--- a/configure.cmake
+++ b/configure.cmake
@@ -763,3 +763,11 @@ IF(HAVE_LIBNUMA AND NOT WITH_NUMA)
   SET(HAVE_LIBNUMA 0)
   MESSAGE(STATUS "Disabling NUMA on user's request")
 ENDIF()
+
+# LSE atomics are an ARMv8.1 feature.  Keep the option OFF by default:
+# a binary built with -DHAVE_ARM64_LSE_ATOMICS executes STADD and would
+# fail with SIGILL on ARMv8.0 cores (e.g. Cortex-A53/A72) lacking LSE.
+OPTION(WITH_LSE "Enable Large System Extensions for AArch64" OFF)
+IF(WITH_LSE)
+  ADD_DEFINITIONS(-DHAVE_ARM64_LSE_ATOMICS)
+ENDIF()
diff --git a/storage/innobase/include/os0atomic.h b/storage/innobase/include/os0atomic.h
index 51137f34152..d161b77e807 100644
--- a/storage/innobase/include/os0atomic.h
+++ b/storage/innobase/include/os0atomic.h
@@ -262,6 +262,78 @@ bool os_compare_and_swap_thread_id(volatile os_thread_id_t *ptr,
     "Mutexes use GCC atomic builtins, rw_locks do not"
 #endif /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */
 
+/** Same functions with no return value. These may have optimized
+implementations on some architectures. */
+
+#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS)
+
+#define ARM64_LSE_ATOMIC_STADD(ptr, amount, w, r)   \
+  do {                                              \
+    __asm__ __volatile__("stadd" w " %" r "1, %0\n" \
+                         : "+Q"(*(ptr))             \
+                         : "r"(amount)              \
+                         : "memory");               \
+  } while (0)
+
+#define os_atomic_increment_nr(ptr, amount)                        \
+  do {                                                             \
+    static_assert(sizeof(*(ptr)) == 1 || sizeof(*(ptr)) == 2 ||   \
+                      sizeof(*(ptr)) == 4 || sizeof(*(ptr)) == 8, \
+                  "unsupported operand size");                     \
+    switch (sizeof(*(ptr))) {                                      \
+      case 1:                                                      \
+        ARM64_LSE_ATOMIC_STADD(ptr, amount, "b", "w");             \
+        break;                                                     \
+      case 2:                                                      \
+        ARM64_LSE_ATOMIC_STADD(ptr, amount, "h", "w");             \
+        break;                                                     \
+      case 4:                                                      \
+        ARM64_LSE_ATOMIC_STADD(ptr, amount, "", "w");              \
+        break;                                                     \
+      case 8:                                                      \
+        ARM64_LSE_ATOMIC_STADD(ptr, amount, "", "");               \
+        break;                                                     \
+    }                                                              \
+  } while (0)
+#else
+#define os_atomic_increment_nr(ptr, amount) os_atomic_increment(ptr, amount)
+#endif
+
+#define os_atomic_increment_lint_nr(ptr, amount) \
+  os_atomic_increment_nr(ptr, amount)
+
+#define os_atomic_increment_ulint_nr(ptr, amount) \
+  os_atomic_increment_nr(ptr, amount)
+
+#define os_atomic_increment_uint32_nr(ptr, amount) \
+  os_atomic_increment_nr(ptr, amount)
+
+#define os_atomic_increment_uint64_nr(ptr, amount) \
+  os_atomic_increment_nr(ptr, amount)
+
+/* Non-atomic version of the functions with no return value. */
+
+#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS)
+/* An atomic increment without fetching the original value is faster
+than a nonatomic one with fetching. */
+#define os_nonatomic_increment_nr(ptr, amount) \
+  os_atomic_increment_nr(ptr, amount)
+#else
+#define os_nonatomic_increment_nr(ptr, amount) (*(ptr) += (amount))
+#endif
+
+#define os_nonatomic_increment_lint_nr(ptr, amount) \
+  os_nonatomic_increment_nr(ptr, amount)
+
+#define os_nonatomic_increment_ulint_nr(ptr, amount) \
+  os_nonatomic_increment_nr(ptr, amount)
+
+#define os_nonatomic_increment_uint32_nr(ptr, amount) \
+  os_nonatomic_increment_nr(ptr, amount)
+
+#define os_nonatomic_increment_uint64_nr(ptr, amount) \
+  os_nonatomic_increment_nr(ptr, amount)
+
 /** Returns the resulting value, ptr is pointer to
 target, amount is the
 amount of increment. */
@@ -298,6 +370,50 @@ amount of increment. */
 amount to decrement. */
 
 #define os_atomic_decrement_uint64(ptr, amount) os_atomic_decrement(ptr, amount)
+
+/** Same functions with no return value. These may have optimized
+implementations on some architectures. */
+
+#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS)
+#define os_atomic_decrement_nr(ptr, amount) os_atomic_increment_nr(ptr, -(amount))
+#else
+#define os_atomic_decrement_nr(ptr, amount) os_atomic_decrement(ptr, amount)
+#endif
+
+#define os_atomic_decrement_lint_nr(ptr, amount) \
+  os_atomic_decrement_nr(ptr, amount)
+
+#define os_atomic_decrement_ulint_nr(ptr, amount) \
+  os_atomic_decrement_nr(ptr, amount)
+
+#define os_atomic_decrement_uint32_nr(ptr, amount) \
+  os_atomic_decrement_nr(ptr, amount)
+
+#define os_atomic_decrement_uint64_nr(ptr, amount) \
+  os_atomic_decrement_nr(ptr, amount)
+
+/* Non-atomic version of the functions with no return value. */
+
+#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS)
+/* An atomic decrement without fetching the original value is faster
+than a nonatomic one with fetching. */
+#define os_nonatomic_decrement_nr(ptr, amount) \
+  os_atomic_decrement_nr(ptr, amount)
+#else
+#define os_nonatomic_decrement_nr(ptr, amount) (*(ptr) -= (amount))
+#endif
+
+#define os_nonatomic_decrement_lint_nr(ptr, amount) \
+  os_nonatomic_decrement_nr(ptr, amount)
+
+#define os_nonatomic_decrement_ulint_nr(ptr, amount) \
+  os_nonatomic_decrement_nr(ptr, amount)
+
+#define os_nonatomic_decrement_uint32_nr(ptr, amount) \
+  os_nonatomic_decrement_nr(ptr, amount)
+
+#define os_nonatomic_decrement_uint64_nr(ptr, amount) \
+  os_nonatomic_decrement_nr(ptr, amount)
+
 #endif
 
 #define os_atomic_inc_ulint(m, v, d) os_atomic_increment_ulint(v, d)
diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h
index 7a799e11473..19bc085fbcb 100644
--- a/storage/innobase/include/sync0sync.h
+++ b/storage/innobase/include/sync0sync.h
@@ -42,7 +42,6 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #define sync0sync_h
 
 #include "univ.i"
-#include "ut0counter.h"
 
 #ifdef HAVE_PSI_INTERFACE
 
diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h
index daeb1c16cc2..b0ab9e26988 100644
--- a/storage/innobase/include/ut0counter.h
+++ b/storage/innobase/include/ut0counter.h
@@ -38,6 +38,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 #include "univ.i"
 
+#include "os0atomic.h"
 #include "os0thread.h"
 #include "ut0dbg.h"
 
@@ -113,6 +114,37 @@ struct single_indexer_t {
 
 #define default_indexer_t counter_indexer_t
 
+template <typename T>
+UNIV_INLINE void add_noreturn(T &val, T n) {
+  val += n;
+}
+
+template <typename T>
+UNIV_INLINE void sub_noreturn(T &val, T n) {
+  val -= n;
+}
+
+/* Template specializations for native word size */
+template <>
+inline void add_noreturn(ulint &val, ulint n) {
+  os_nonatomic_increment_ulint_nr(&val, n);
+}
+
+template <>
+inline void sub_noreturn(ulint &val, ulint n) {
+  os_nonatomic_decrement_ulint_nr(&val, n);
+}
+
+template <>
+inline void add_noreturn(lint &val, lint n) {
+  os_nonatomic_increment_lint_nr(&val, n);
+}
+
+template <>
+inline void sub_noreturn(lint &val, lint n) {
+  os_nonatomic_decrement_lint_nr(&val, n);
+}
+
 /** Class for using fuzzy counters. The counter is not protected by any
 mutex and the results are not guaranteed to be 100% accurate but close
 enough. Creates an array of counters and separates each element by the
@@ -151,7 +183,7 @@ class ib_counter_t {
 
     ut_ad(i < UT_ARR_SIZE(m_counter));
 
-    m_counter[i] += n;
+    add_noreturn(m_counter[i], n);
   }
 
   /** Use this if you can use a unique identifier, saves a
@@ -163,7 +195,7 @@ class ib_counter_t {
 
     ut_ad(i < UT_ARR_SIZE(m_counter));
 
-    m_counter[i] += n;
+    add_noreturn(m_counter[i], n);
   }
 
   /** If you can't use a good index id. Decrement by 1. */
@@ -176,7 +208,7 @@ class ib_counter_t {
 
     ut_ad(i < UT_ARR_SIZE(m_counter));
 
-    m_counter[i] -= n;
+    sub_noreturn(m_counter[i], n);
   }
 
   /** Use this if you can use a unique identifier, saves a
@@ -188,7 +220,7 @@ class ib_counter_t {
 
     ut_ad(i < UT_ARR_SIZE(m_counter));
 
-    m_counter[i] -= n;
+    sub_noreturn(m_counter[i], n);
   }
 
   /* @return total value - not 100% accurate, since it is not atomic. */