From b6dc4459cbbb59cb347e9e94d5f535459483139a Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Mon, 3 Apr 2023 17:12:43 +0800 Subject: [PATCH] innobase: improve crc32c for arm64 Though a crc32 value is always 32 bits, the SSE _mm_crc32_u64 intrinsic accepts and returns 64-bit crc32 values. As a result, the crc32_u64 interface uses 64-bit crc32 values. >> static inline uint64_t update(uint64_t crc, uint64_t data); Arm __crc32cd accepts and returns 32-bit crc32 values. Type casting is required to be compatible with the interface. It looks like this confuses the compiler, which generates suboptimal code. Looking into the machine code of sliced crc32c on Arm, there are some unnecessary instructions in each iteration, as shown in the snapshot below. 18b5e40: f9400006 ldr x6, [x0] 18b5e44: 91002000 add x0, x0, #0x8 18b5e48: f94a9c05 ldr x5, [x0, #5432] 18b5e4c: f9553c03 ldr x3, [x0, #10872] 18b5e50: 9ac65c21 crc32cx w1, w1, x6 18b5e54: 9ac55c42 crc32cx w2, w2, x5 18b5e58: 2a0103e1 mov w1, w1 <-- unnecessary !!! 18b5e5c: 2a0203e2 mov w2, w2 <-- unnecessary !!! 18b5e60: 9ac35c83 crc32cx w3, w4, x3 18b5e64: 2a0303e4 mov w4, w3 <-- unnecessary !!! 18b5e68: eb07001f cmp x0, x7 18b5e6c: 54fffea1 b.ne 18b5e40 This patch changes the crc32_u64 parameter and return type to uint32_t on Arm. It eliminates the redundant opcodes. The machine code is more concise than the original. 18b5e38: f9400005 ldr x5, [x0] 18b5e3c: 91002000 add x0, x0, #0x8 18b5e40: f94a9c04 ldr x4, [x0, #5432] 18b5e44: f9553c03 ldr x3, [x0, #10872] 18b5e48: 9ac55d08 crc32cx w8, w8, x5 18b5e4c: 9ac45c21 crc32cx w1, w1, x4 18b5e50: 9ac35c42 crc32cx w2, w2, x3 18b5e54: eb06001f cmp x0, x6 18b5e58: 54ffff01 b.ne 18b5e38 A 7% performance uplift in Microbenchmarks.BM_CRC32_0_508 is observed on Arm Neoverse-N1. 
NOTE: build mysql with gcc-10.3, RelWithDebInfo Jira: ENTWLS-2963 Change-Id: Ib1c294c7c000ac2e220461058eb829b4659c710a --- storage/innobase/ut/crc32.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/storage/innobase/ut/crc32.cc b/storage/innobase/ut/crc32.cc index 03d6d3ae..fee83fbb 100644 --- a/storage/innobase/ut/crc32.cc +++ b/storage/innobase/ut/crc32.cc @@ -446,7 +446,12 @@ struct crc32_impl { static inline uint32_t update(uint32_t crc, unsigned char data); static inline uint32_t update(uint32_t crc, uint16_t data); static inline uint32_t update(uint32_t crc, uint32_t data); - static inline uint64_t update(uint64_t crc, uint64_t data); +#ifdef CRC32_ARM64 + using crc_u64_t = uint32_t; +#else + using crc_u64_t = uint64_t; +#endif + static inline crc_u64_t update(crc_u64_t crc, uint64_t data); }; #ifdef CRC32_x86_64 @@ -490,8 +495,8 @@ uint32_t crc32_impl::update(uint32_t crc, uint32_t data) { #ifdef CRC32_ARM64_DEFAULT MY_ATTRIBUTE((target("+crc"))) #endif /* CRC32_ARM64_DEFAULT */ -uint64_t crc32_impl::update(uint64_t crc, uint64_t data) { - return (uint64_t)__crc32cd((uint32_t)crc, data); +uint32_t crc32_impl::update(uint32_t crc, uint64_t data) { + return __crc32cd(crc, data); } #endif /* CRC32_ARM64 */ @@ -601,7 +606,7 @@ slice_len bytes. */ template struct Update_step_executor { template - static void run(uint64_t *crc, const uint64_t *data64) { + static void run(crc32_impl::crc_u64_t *crc, const uint64_t *data64) { crc[i] = algo_to_use::update(crc[i], *(data64 + i * (slice_len / 8))); } }; @@ -613,7 +618,7 @@ are from the end of the i-th slice to the end of the chunk. 
*/ template struct Combination_step_executor { template - static void run(uint64_t &combined, const uint64_t *crc) { + static void run(uint64_t &combined, const crc32_impl::crc_u64_t *crc) { combined ^= roll(crc[i]); } }; @@ -632,7 +637,7 @@ static inline uint32_t consume_chunk(uint32_t crc0, const unsigned char *data) { /* crc[i] is the hash for i-th slice, data[i*slice_len...(i+1)*slice_len) where the initial value for each crc[i] is zero, except crc[0] for which we use the initial value crc0 passed in by the caller. */ - uint64_t crc[slices_count]{crc0}; + crc32_impl::crc_u64_t crc[slices_count]{crc0}; /* Each iteration of the for() loop will eat 8 bytes (single uint64_t) from each slice. */ static_assert( -- 2.25.1