From a806624962b7b3bcf40242cfc70ef2da24587f78 Mon Sep 17 00:00:00 2001 From: Alexey Kopytov Date: Fri, 17 Mar 2017 23:44:38 +0300 Subject: [PATCH] Bug #79454: Inefficient InnoDB row stats implementation This patch adds a new set of os_atomic_*() macros: - os_atomic_*_nr() to atomically add/subtract a value to/from a variable without returning its original value; - os_nonatomic_*_nr() to do the same non-atomically. These semantics allow hardware-specific optimizations on some architectures. For example, the STADD instruction available on AArch64 CPUs with LSE (Large System Extensions) performs an atomic addition/subtraction while discarding the original value (i.e. not loading it into a register). This not only results in better scalability as compared to the standard LL/SC synchronization mechanism provided by AArch64 without LSE, but is also faster than the regular non-atomic add/subtract instructions on AArch64, since the standard load/modify/store sequence is replaced with a single instruction, similar to the code generated for x86(_64). This kind of optimization should theoretically be performed by the compiler, but compiler support is lagging behind even in the most recent GCC versions. This patch maps the new family of macros to the optimized LSE-based implementations on AArch64. It also changes InnoDB fuzzy counters to use the "non-atomic no-return add/subtract" semantics. No changes in behavior are introduced for other architectures, i.e. regular add/subtract code is emitted. The scope of this optimization is not limited to InnoDB fuzzy counters. There are many other counters that do not require the original value when updated. But those will be addressed separately based on the primitives introduced by this patch. 
--- configure.cmake | 5 ++ storage/innobase/include/os0atomic.h | 112 ++++++++++++++++++++++++++++++++++ storage/innobase/include/sync0sync.h | 1 - storage/innobase/include/ut0counter.h | 41 +++++++++++-- 4 files changed, 154 insertions(+), 5 deletions(-) diff --git a/configure.cmake b/configure.cmake index cb8aa67..7089ea3 100644 --- a/configure.cmake +++ b/configure.cmake @@ -924,6 +924,11 @@ IF(HAVE_LIBNUMA AND NOT WITH_NUMA) MESSAGE(STATUS "Disabling NUMA on user's request") ENDIF() +OPTION(WITH_LSE "Enable Large System Extensions for AArch64" OFF) +IF(WITH_LSE) + ADD_DEFINITIONS(-DHAVE_ARM64_LSE_ATOMICS) +ENDIF() + # needed for libevent CHECK_TYPE_SIZE("socklen_t" SIZEOF_SOCKLEN_T) IF(SIZEOF_SOCKLEN_T) diff --git a/storage/innobase/include/os0atomic.h b/storage/innobase/include/os0atomic.h index 8e2b06b..82358f2 100644 --- a/storage/innobase/include/os0atomic.h +++ b/storage/innobase/include/os0atomic.h @@ -295,6 +295,71 @@ amount of increment. */ # define os_atomic_increment_uint64(ptr, amount) \ os_atomic_increment(ptr, amount) +/**********************************************************//** +Same functions with no return value. These may have optimized implementations on +some architectures. 
*/ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) + +# define ARM64_LSE_ATOMIC_STADD(ptr, amount, w, r) \ + do { \ + __asm__ __volatile__( \ + "stadd" w " %" r "1, %0\n" \ + : "+Q" (*ptr) \ + : "r" (amount) \ + : "memory"); \ + } while(0) + +# define os_atomic_increment_nr(ptr, amount) \ + do { \ + switch (sizeof(*ptr)) { \ + case 1: ARM64_LSE_ATOMIC_STADD(ptr, amount, "b", "w"); break; \ + case 2: ARM64_LSE_ATOMIC_STADD(ptr, amount, "h", "w"); break; \ + case 4: ARM64_LSE_ATOMIC_STADD(ptr, amount, "", "w"); break; \ + case 8: ARM64_LSE_ATOMIC_STADD(ptr, amount, "", ""); break; \ + default: ut_ad(0); /* wrong operand size */ \ + } \ + } while (0) +#else +# define os_atomic_increment_nr(ptr, amount) \ + os_atomic_increment(ptr, amount) +#endif + +# define os_atomic_increment_lint_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) + +# define os_atomic_increment_ulint_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) + +# define os_atomic_increment_uint32_nr(ptr, amount ) \ + os_atomic_increment_nr(ptr, amount) + +# define os_atomic_increment_uint64_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) + +/* Non-atomic version of the functions with no return value. */ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) +/* Atomic increment w/o fetching is faster than nonatomic one with it +on ThunderX. 
*/ +# define os_nonatomic_increment_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) +#else +# define os_nonatomic_increment_nr(ptr, amount) (*(ptr) += (amount)) +#endif + +# define os_nonatomic_increment_lint_nr(ptr, amount) \ + os_nonatomic_increment_nr(ptr, amount) + +# define os_nonatomic_increment_ulint_nr(ptr, amount) \ + os_nonatomic_increment_nr(ptr, amount) + +# define os_nonatomic_increment_uint32_nr(ptr, amount ) \ + os_nonatomic_increment_nr(ptr, amount) + +# define os_nonatomic_increment_uint64_nr(ptr, amount) \ + os_nonatomic_increment_nr(ptr, amount) + /* Returns the resulting value, ptr is pointer to target, amount is the amount to decrement. */ @@ -318,6 +383,53 @@ amount to decrement. */ # define os_atomic_decrement_uint64(ptr, amount) \ os_atomic_decrement(ptr, amount) +/**********************************************************//** +Same functions with no return value. These may have optimized implementations on +some architectures. */ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) +# define os_atomic_decrement_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, -amount) +#else +# define os_atomic_decrement_nr(ptr, amount) \ + os_atomic_decrement(ptr, amount) +#endif + +# define os_atomic_decrement_lint_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) + +# define os_atomic_decrement_ulint_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) + +# define os_atomic_decrement_uint32_nr(ptr, amount ) \ + os_atomic_decrement_nr(ptr, amount) + +# define os_atomic_decrement_uint64_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) + +/* Non-atomic version of the functions with no return value. */ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) +/* Atomic decrement without fetching is faster than nonatomic one with it +on AArch64. 
*/ +# define os_nonatomic_decrement_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) +#else +# define os_nonatomic_decrement_nr(ptr, amount) (*(ptr) -= (amount)) +#endif + +# define os_nonatomic_decrement_lint_nr(ptr, amount) \ + os_nonatomic_decrement_nr(ptr, amount) + +# define os_nonatomic_decrement_ulint_nr(ptr, amount) \ + os_nonatomic_decrement_nr(ptr, amount) + +# define os_nonatomic_decrement_uint32_nr(ptr, amount ) \ + os_nonatomic_decrement_nr(ptr, amount) + +# define os_nonatomic_decrement_uint64_nr(ptr, amount) \ + os_nonatomic_decrement_nr(ptr, amount) + #endif #define os_atomic_inc_ulint(m,v,d) os_atomic_increment_ulint(v, d) diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 7fddada..289900e 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -35,7 +35,6 @@ Created 9/5/1995 Heikki Tuuri #define sync0sync_h #include "univ.i" -#include "ut0counter.h" #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h index 6b187ec..809613e 100644 --- a/storage/innobase/include/ut0counter.h +++ b/storage/innobase/include/ut0counter.h @@ -30,6 +30,7 @@ Created 2012/04/12 by Sunny Bains #include #include "univ.i" #include "os0thread.h" +#include "os0atomic.h" /** CPU cache line size */ #ifdef __powerpc__ @@ -108,6 +109,38 @@ struct single_indexer_t { #define default_indexer_t counter_indexer_t + +template +UNIV_INLINE void add_noreturn(T &val, T n) { + val += n; +} + +template +UNIV_INLINE void sub_noreturn(T &val, T n) { + val -= n; +} + +/* Template specializations for native word size */ +template <> +inline void add_noreturn(ulint &val, ulint n) { + os_nonatomic_increment_ulint_nr(&val, n); +} + +template <> +inline void sub_noreturn(ulint &val, ulint n) { + os_nonatomic_decrement_lint_nr(&val, n); +} + +template <> +inline void add_noreturn(lint &val, lint n) { + 
os_nonatomic_increment_lint_nr(&val, n); +} + +template <> +inline void sub_noreturn(lint &val, lint n) { + os_nonatomic_decrement_lint_nr(&val, n); +} + /** Class for using fuzzy counters. The counter is not protected by any mutex and the results are not guaranteed to be 100% accurate but close enough. Creates an array of counters and separates each element by the @@ -151,7 +184,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] += n; + add_noreturn(m_counter[i], n); } /** Use this if you can use a unique identifier, saves a @@ -163,7 +196,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] += n; + add_noreturn(m_counter[i], n); } /** If you can't use a good index id. Decrement by 1. */ @@ -176,7 +209,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] -= n; + sub_noreturn(m_counter[i], n); } /** Use this if you can use a unique identifier, saves a @@ -188,7 +221,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] -= n; + sub_noreturn(m_counter[i], n); } /* @return total value - not 100% accurate, since it is not atomic. */