From 003d50ddb79e6025f3394dc5986ad6c9c338a276 Mon Sep 17 00:00:00 2001
From: Yibo Cai
Date: Tue, 4 Apr 2023 16:01:03 +0800
Subject: [PATCH] innobase: optimize hash_uint64 with hardware crc on Arm

hash_uint64 is listed high in the perf report of sysbench write-only
tests. Arm SPE (Statistical Profiling Extension) reveals that it suffers
non-trivial l1d-cache misses when reading the lookup table.

This patch combines the hardware crc32c and crc32 instructions to
calculate hash values. With fewer instructions and no memory accesses,
performance improves significantly. All unit tests pass.

x86 is not supported by this patch: SSE4.2 provides hardware crc32c but
no crc32, and crc32c alone doesn't introduce enough entropy. It is
possible to pass the unit tests with four crc32c instructions, but the
benefit is not huge.

Tested on Neoverse-N1, gcc-10.3, RelWithDebInfo, -march=armv8.2-a.
A big performance uplift is observed in the related microbenchmarks:
- Microbenchmarks.BM_RND_GEN:          117 -> 433 MB/s
- Microbenchmarks.BM_HASH_UINT64:      121 -> 252 MB/s
- Microbenchmarks.BM_HASH_UINT64_PAIR:  57 -> 165 MB/s

In the sysbench profiling result, hash_uint64 is no longer near the top
after applying this change.

NB: I also tried some fast universal hashing implementations, such as
clhash [1] and Google CityHash [2]; both failed the MySQL unit tests.

[1] https://github.com/lemire/clhash
[2] https://github.com/google/cityhash

Change-Id: Ic9f360c8df2e2fcdd6ebe04881d50946c1dc204b
---
 storage/innobase/include/ut0rnd.h | 38 +++++++++++++++++++++++++++++++
 unittest/gunit/innodb/ut0rnd-t.cc |  3 +++
 2 files changed, 41 insertions(+)

diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h
index 5a8dd225..d92d5652 100644
--- a/storage/innobase/include/ut0rnd.h
+++ b/storage/innobase/include/ut0rnd.h
@@ -44,6 +44,10 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include "ut0crc32.h"
 #include "ut0seq_lock.h"
 
+#ifdef CRC32_ARM64
+#include <arm_acle.h>
+#endif
+
 namespace ut {
 
 namespace detail {
@@ -55,6 +59,22 @@ constexpr uint64_t fast_hash_coeff_a1_64bit = 0xacb1f3526e25dd39;
 constexpr uint64_t fast_hash_coeff_a2_64bit = 0xd65b9175b5247ba3;
 constexpr uint64_t fast_hash_coeff_b_64bit = 0xf72a876a516b4b56;
 
+#ifdef CRC32_ARM64
+#ifdef CRC32_ARM64_DEFAULT
+MY_ATTRIBUTE((target("+crc")))
+#endif
+static inline uint32_t crc32cd(uint32_t crc, uint64_t data) {
+  return __crc32cd(crc, data);
+}
+
+#ifdef CRC32_ARM64_DEFAULT
+MY_ATTRIBUTE((target("+crc")))
+#endif
+static inline uint32_t crc32d(uint32_t crc, uint64_t data) {
+  return __crc32d(crc, data);
+}
+#endif // CRC32_ARM64
+
 }  // namespace detail
 
 /** The following function generates pseudo-random 64bit integers which
@@ -183,6 +203,24 @@ static inline uint64_t random_from_interval(uint64_t low, uint64_t high) {
 }
 
 static inline uint64_t hash_uint64(uint64_t value) {
+#ifdef CRC32_ARM64
+  /* Use hardware crc to generate the hash. Besides needing fewer
+  instructions, the biggest benefit is that it avoids memory accesses:
+  profiling shows non-trivial l1d-cache misses for the lookup-table
+  approach in sysbench tests. */
+  if (likely(ut_crc32_cpu_enabled)) {
+    /* Remix bits first (the opening steps of the splitmix64 finalizer). */
+    value = (value ^ (value >> 30)) * 0xbf58476d1ce4e5b9ULL;
+    value ^= (value >> 31);
+    /* One crc32c and one crc32, each seeded with its own initial value,
+    easily pass the unit tests; it is very hard (if not impossible) to pass
+    them with two crc32c. x86_64 has hardware crc32c but no crc32, so this
+    code path is enabled for arm64 only. */
+    const uint64_t hi = detail::crc32cd(0x01234567, value);
+    const uint32_t lo = detail::crc32d(0x89abcdef, value);
+    return (hi << 32) | lo;
+  }
+#endif
   /* This implements Tabulation Hashing. It is a simple yet effective
   algorithm that easily passes the unit tests that check distributions of hash
   values modulo different numbers. Other techniques like one from hash_uint64_pair_fast
diff --git a/unittest/gunit/innodb/ut0rnd-t.cc b/unittest/gunit/innodb/ut0rnd-t.cc
index 53afa68e..03d78b07 100644
--- a/unittest/gunit/innodb/ut0rnd-t.cc
+++ b/unittest/gunit/innodb/ut0rnd-t.cc
@@ -365,6 +365,7 @@ BENCHMARK(BM_RND_GEN_STD_LINEAR)
 #endif
 
 static void BM_RND_GEN(const size_t num_iterations) {
+  ut_crc32_init();
   benchmark_hasher(num_iterations,
                    [](uint64_t, uint64_t n) { return ut::random_64(); });
 }
@@ -373,6 +374,7 @@ BENCHMARK(BM_RND_GEN)
 
 /* Micro-benchmark raw uint64_t hash performance. */
 static void BM_HASH_UINT64(const size_t num_iterations) {
+  ut_crc32_init();
   benchmark_hasher(num_iterations,
                    [](uint64_t fold, uint64_t) { return ut::hash_uint64(fold); });
 }
@@ -389,6 +391,7 @@ BENCHMARK(BM_HASH_UINT64_OLD)
 
 /* Micro-benchmark raw pair of uint32_t hash performance. */
 static void BM_HASH_UINT64_PAIR(const size_t num_iterations) {
+  ut_crc32_init();
   benchmark_hasher(num_iterations, [](uint64_t fold, uint64_t n) {
     return ut::hash_uint64_pair(fold, ut::random_64());
   });
-- 
2.25.1
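
P.S. For anyone who wants to try the hashing scheme outside the InnoDB tree, below is a minimal standalone sketch (not part of the patch). It assumes an aarch64 toolchain with the CRC extension (e.g. g++ or clang++ with -march=armv8-a+crc); the hash_u64 name and the main() harness are illustrative only, while the mix constants and the two initial CRC values are the ones used in the patch. The tabulation-hashing fallback taken when the CRC extension is unavailable at runtime is omitted for brevity.

// Standalone sketch of the crc32c/crc32 hash used in the patch above.
// Build: g++ -O2 -march=armv8-a+crc sketch.cc -o sketch (on arm64).
#include <cstdint>
#include <cstdio>

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>

static inline uint64_t hash_u64(uint64_t value) {
  // Remix bits first (opening steps of the splitmix64 finalizer) so that
  // nearby inputs diverge before the CRC step.
  value = (value ^ (value >> 30)) * 0xbf58476d1ce4e5b9ULL;
  value ^= value >> 31;
  // Combine crc32c (Castagnoli) and crc32 (ISO) of the same 64-bit word,
  // each seeded with a different constant, into one 64-bit result.
  const uint64_t hi = __crc32cd(0x01234567U, value);  // high 32 bits
  const uint32_t lo = __crc32d(0x89abcdefU, value);   // low 32 bits
  return (hi << 32) | lo;
}

int main() {
  // Print a few hashes of small consecutive inputs; the outputs should
  // look unrelated to each other despite the inputs differing by one bit.
  for (uint64_t i = 0; i < 4; ++i) {
    std::printf("%llu -> %016llx\n", (unsigned long long)i,
                (unsigned long long)hash_u64(i));
  }
  return 0;
}
#else
int main() {
  std::puts("This sketch requires aarch64 with the CRC32 extension.");
  return 0;
}
#endif
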