From a806624962b7b3bcf40242cfc70ef2da24587f78 Mon Sep 17 00:00:00 2001 From: Alexey Kopytov Date: Fri, 17 Mar 2017 23:44:38 +0300 Subject: [PATCH] Bug #79454: Inefficient InnoDB row stats implementation This patch adds a new set of os_atomic_*() macros: - os_atomic_*_nr() to atomically add/subtract a value to/from a variable without returning its original value; - os_nonatomic_*_nr() to do the same non-atomically. These semantics allow hardware-specific optimizations on some architectures. For example, the STADD instruction available on AArch64 CPUs with LSE (Large System Extensions) performs an atomic addition/subtraction while discarding the original value (i.e. not loading it into a register). This not only results in better scalability as compared to the standard LL/SC synchronization mechanism provided by AArch64 without LSE, but is also faster than the regular non-atomic add/subtract instructions on AArch64, since the standard load/modify/store sequence is replaced with a single instruction, similar to the code generated for x86(_64). This kind of optimization should theoretically be performed by the compiler, but compiler support is lagging behind even in the most recent GCC versions. This patch maps the new family of macros to the optimized LSE-based implementations on AArch64. It also changes InnoDB fuzzy counters to use the "non-atomic no-return add/subtract" semantics. No changes in behavior are introduced for other architectures, i.e. regular add/subtract code is emitted. The scope of this optimization is not limited to InnoDB fuzzy counters. There are many other counters that do not require the original value when updated. But those will be addressed separately based on the primitives introduced by this patch. 
--- configure.cmake | 5 ++ storage/innobase/include/os0atomic.h | 112 ++++++++++++++++++++++++++++++++++ storage/innobase/include/sync0sync.h | 1 - storage/innobase/include/ut0counter.h | 41 +++++++++++-- 4 files changed, 154 insertions(+), 5 deletions(-) diff --git a/configure.cmake b/configure.cmake index cb8aa67..7089ea3 100644 --- a/configure.cmake +++ b/configure.cmake @@ -924,6 +924,11 @@ IF(HAVE_LIBNUMA AND NOT WITH_NUMA) MESSAGE(STATUS "Disabling NUMA on user's request") ENDIF() +OPTION(WITH_LSE "Enable Large System Extensions for AArch64" OFF) +IF(WITH_LSE) + ADD_DEFINITIONS(-DHAVE_ARM64_LSE_ATOMICS) +ENDIF() + # needed for libevent CHECK_TYPE_SIZE("socklen_t" SIZEOF_SOCKLEN_T) IF(SIZEOF_SOCKLEN_T) diff --git a/storage/innobase/include/os0atomic.h b/storage/innobase/include/os0atomic.h index 8e2b06b..82358f2 100644 --- a/storage/innobase/include/os0atomic.h +++ b/storage/innobase/include/os0atomic.h @@ -295,6 +295,71 @@ amount of increment. */ # define os_atomic_increment_uint64(ptr, amount) \ os_atomic_increment(ptr, amount) +/**********************************************************//** +Same functions with no return value. These may have optimized implementations on +some architectures. 
*/ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) + +# define ARM64_LSE_ATOMIC_STADD(ptr, amount, w, r) \ + do { \ + __asm__ __volatile__( \ + "stadd" w " %" r "1, %0\n" \ + : "+Q" (*ptr) \ + : "r" (amount) \ + : "memory"); \ + } while(0) + +# define os_atomic_increment_nr(ptr, amount) \ + do { \ + switch (sizeof(*ptr)) { \ + case 1: ARM64_LSE_ATOMIC_STADD(ptr, amount, "b", "w"); break; \ + case 2: ARM64_LSE_ATOMIC_STADD(ptr, amount, "h", "w"); break; \ + case 4: ARM64_LSE_ATOMIC_STADD(ptr, amount, "", "w"); break; \ + case 8: ARM64_LSE_ATOMIC_STADD(ptr, amount, "", ""); break; \ + default: ut_ad(0); /* wrong operand size */ \ + } \ + } while (0) +#else +# define os_atomic_increment_nr(ptr, amount) \ + os_atomic_increment(ptr, amount) +#endif + +# define os_atomic_increment_lint_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) + +# define os_atomic_increment_ulint_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) + +# define os_atomic_increment_uint32_nr(ptr, amount ) \ + os_atomic_increment_nr(ptr, amount) + +# define os_atomic_increment_uint64_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) + +/* Non-atomic version of the functions with no return value. */ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) +/* Atomic increment w/o fetching is faster than nonatomic one with it +on ThunderX. 
*/ +# define os_nonatomic_increment_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, amount) +#else +# define os_nonatomic_increment_nr(ptr, amount) (*(ptr) += (amount)) +#endif + +# define os_nonatomic_increment_lint_nr(ptr, amount) \ + os_nonatomic_increment_nr(ptr, amount) + +# define os_nonatomic_increment_ulint_nr(ptr, amount) \ + os_nonatomic_increment_nr(ptr, amount) + +# define os_nonatomic_increment_uint32_nr(ptr, amount ) \ + os_nonatomic_increment_nr(ptr, amount) + +# define os_nonatomic_increment_uint64_nr(ptr, amount) \ + os_nonatomic_increment_nr(ptr, amount) + /* Returns the resulting value, ptr is pointer to target, amount is the amount to decrement. */ @@ -318,6 +383,53 @@ amount to decrement. */ # define os_atomic_decrement_uint64(ptr, amount) \ os_atomic_decrement(ptr, amount) +/**********************************************************//** +Same functions with no return value. These may have optimized implementations on +some architectures. */ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) +# define os_atomic_decrement_nr(ptr, amount) \ + os_atomic_increment_nr(ptr, -amount) +#else +# define os_atomic_decrement_nr(ptr, amount) \ + os_atomic_decrement(ptr, amount) +#endif + +# define os_atomic_decrement_lint_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) + +# define os_atomic_decrement_ulint_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) + +# define os_atomic_decrement_uint32_nr(ptr, amount ) \ + os_atomic_decrement_nr(ptr, amount) + +# define os_atomic_decrement_uint64_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) + +/* Non-atomic version of the functions with no return value. */ + +#if defined(__aarch64__) && defined(HAVE_ARM64_LSE_ATOMICS) +/* Atomic decrement without fetching is faster than nonatomic one with it +on AArch64. 
*/ +# define os_nonatomic_decrement_nr(ptr, amount) \ + os_atomic_decrement_nr(ptr, amount) +#else +# define os_nonatomic_decrement_nr(ptr, amount) (*(ptr) -= (amount)) +#endif + +# define os_nonatomic_decrement_lint_nr(ptr, amount) \ + os_nonatomic_decrement_nr(ptr, amount) + +# define os_nonatomic_decrement_ulint_nr(ptr, amount) \ + os_nonatomic_decrement_nr(ptr, amount) + +# define os_nonatomic_decrement_uint32_nr(ptr, amount ) \ + os_nonatomic_decrement_nr(ptr, amount) + +# define os_nonatomic_decrement_uint64_nr(ptr, amount) \ + os_nonatomic_decrement_nr(ptr, amount) + #endif #define os_atomic_inc_ulint(m,v,d) os_atomic_increment_ulint(v, d) diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 7fddada..289900e 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -35,7 +35,6 @@ Created 9/5/1995 Heikki Tuuri #define sync0sync_h #include "univ.i" -#include "ut0counter.h" #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h index 6b187ec..809613e 100644 --- a/storage/innobase/include/ut0counter.h +++ b/storage/innobase/include/ut0counter.h @@ -30,6 +30,7 @@ Created 2012/04/12 by Sunny Bains #include #include "univ.i" #include "os0thread.h" +#include "os0atomic.h" /** CPU cache line size */ #ifdef __powerpc__ @@ -108,6 +109,38 @@ struct single_indexer_t { #define default_indexer_t counter_indexer_t + +template +UNIV_INLINE void add_noreturn(T &val, T n) { + val += n; +} + +template +UNIV_INLINE void sub_noreturn(T &val, T n) { + val -= n; +} + +/* Template specializations for native word size */ +template <> +inline void add_noreturn(ulint &val, ulint n) { + os_nonatomic_increment_ulint_nr(&val, n); +} + +template <> +inline void sub_noreturn(ulint &val, ulint n) { + os_nonatomic_decrement_lint_nr(&val, n); +} + +template <> +inline void add_noreturn(lint &val, lint n) { + 
os_nonatomic_increment_lint_nr(&val, n); +} + +template <> +inline void sub_noreturn(lint &val, lint n) { + os_nonatomic_decrement_lint_nr(&val, n); +} + /** Class for using fuzzy counters. The counter is not protected by any mutex and the results are not guaranteed to be 100% accurate but close enough. Creates an array of counters and separates each element by the @@ -151,7 +184,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] += n; + add_noreturn(m_counter[i], n); } /** Use this if you can use a unique identifier, saves a @@ -163,7 +196,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] += n; + add_noreturn(m_counter[i], n); } /** If you can't use a good index id. Decrement by 1. */ @@ -176,7 +209,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] -= n; + sub_noreturn(m_counter[i], n); } /** Use this if you can use a unique identifier, saves a @@ -188,7 +221,7 @@ class ib_counter_t { ut_ad(i < UT_ARR_SIZE(m_counter)); - m_counter[i] -= n; + sub_noreturn(m_counter[i], n); } /* @return total value - not 100% accurate, since it is not atomic. */