From dc942becd90e703b4ec9ebf199e7c041d767019c Mon Sep 17 00:00:00 2001
From: yuqilinaro
Date: Thu, 6 Apr 2017 05:04:46 +0000
Subject: [PATCH 1/2] Bug #85819 Add AArch64 optimized crc32c implementation

---
 cmake/build_configurations/compiler_options.cmake | 12 +++-
 config.h.cmake                                    |  3 +
 configure.cmake                                   |  6 ++
 storage/innobase/ut/ut0crc32.cc                   | 82 +++++++++++++++++++++++
 4 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/cmake/build_configurations/compiler_options.cmake b/cmake/build_configurations/compiler_options.cmake
index c112418..4d72f44 100644
--- a/cmake/build_configurations/compiler_options.cmake
+++ b/cmake/build_configurations/compiler_options.cmake
@@ -30,7 +30,11 @@ IF(UNIX)
 
   # Default GCC flags
   IF(CMAKE_COMPILER_IS_GNUCC)
-    SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
+    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+      SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -march=armv8-a+crypto+crc")
+    ELSE()
+      SET(COMMON_C_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
+    ENDIF()
     # Disable inline optimizations for valgrind testing to avoid false positives
     IF(WITH_VALGRIND)
       SET(COMMON_C_FLAGS "-fno-inline ${COMMON_C_FLAGS}")
@@ -54,7 +58,11 @@ IF(UNIX)
     SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 ${COMMON_C_FLAGS}")
   ENDIF()
   IF(CMAKE_COMPILER_IS_GNUCXX)
-    SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
+    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+      SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -march=armv8-a+crypto+crc")
+    ELSE()
+      SET(COMMON_CXX_FLAGS "-g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing")
+    ENDIF()
     # GCC 6 has C++14 as default, set it explicitly to the old default.
     EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion
                     OUTPUT_VARIABLE GXX_VERSION)
diff --git a/config.h.cmake b/config.h.cmake
index c89493a..d87ef82 100644
--- a/config.h.cmake
+++ b/config.h.cmake
@@ -448,4 +448,7 @@
 /* For default value of --early_plugin_load */
 #cmakedefine DEFAULT_EARLY_PLUGIN_LOAD @DEFAULT_EARLY_PLUGIN_LOAD@
 
+/* Support ARMv8 CRC instructions */
+#cmakedefine ENABLE_ARMV8_CRC32
+
 #endif
diff --git a/configure.cmake b/configure.cmake
index cb8aa67..e239187 100644
--- a/configure.cmake
+++ b/configure.cmake
@@ -929,3 +929,9 @@ CHECK_TYPE_SIZE("socklen_t" SIZEOF_SOCKLEN_T)
 IF(SIZEOF_SOCKLEN_T)
   SET(HAVE_SOCKLEN_T 1)
 ENDIF()
+
+# Enable crc32 on AArch64 Platform
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+  MESSAGE(STATUS "ARMv8 crc32 enabled.")
+  SET(ENABLE_ARMV8_CRC32 1)
+ENDIF()
diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc
index 979713f..b94408d 100644
--- a/storage/innobase/ut/ut0crc32.cc
+++ b/storage/innobase/ut/ut0crc32.cc
@@ -114,11 +114,20 @@ ut_crc32_swap_byteorder(
         | i >> 56);
 }
 
+
 /* CRC32 hardware implementation. */
+#ifdef ENABLE_ARMV8_CRC32
+#define ARM_CRC32_INTRINSIC
+#include <arm_acle.h>
+#include <arm_neon.h>
+#else
+#undef ARM_CRC32_INTRINSIC
+#endif
 
 /* Flag that tells whether the CPU supports CRC32 or not */
 bool ut_crc32_sse2_enabled = false;
+
 
 #if defined(__GNUC__) && defined(__x86_64__)
 /********************************************************************//**
 Fetches CPU info */
@@ -421,6 +430,70 @@ ut_crc32_byte_by_byte_hw(
 }
 #endif /* defined(__GNUC__) && defined(__x86_64__) */
 
+/*************
+ * For AArch64
+ */
+
+#ifdef ARM_CRC32_INTRINSIC
+uint32_t
+ut_crc32_byte_by_byte_aarch64(
+    const byte* buf,
+    ulint       len)
+{
+    uint32_t crc = 0xFFFFFFFFU;
+
+    ut_a(ut_crc32_sse2_enabled);
+
+    while (len > 0) {
+        crc = __crc32cb(crc, *buf++);
+        len--;
+    }
+
+    return(~crc);
+}
+
+
+uint32_t
+ut_crc32_aarch64(
+    const byte* buf,
+    ulint       len)
+{
+    register uint32_t crc = 0xFFFFFFFFU;
+    register const uint16_t *buf2;
+    register const uint32_t *buf4;
+    register const uint64_t *buf8;
+
+    ut_a(ut_crc32_sse2_enabled);
+
+    int64_t length = (int64_t)len;
+    buf8 = (const uint64_t *)(const void *)buf;
+    while ((length -= sizeof(uint64_t)) >= 0) {
+        crc = __crc32cd(crc, *buf8++);
+    }
+
+    /* The following is more efficient than the straight loop */
+    buf4 = (const uint32_t *)(const void *)buf8;
+    if (length & sizeof(uint32_t)) {
+        crc = __crc32cw(crc, *buf4++);
+        length -= 4;
+    }
+
+    buf2 = (const uint16_t *)(const void *)buf4;
+    if (length & sizeof(uint16_t)) {
+        crc = __crc32ch(crc, *buf2++);
+        length -= 2;
+    }
+
+    buf = (const uint8_t *)(const void *)buf2;
+    if (length & sizeof(uint8_t))
+        crc = __crc32cb(crc, *buf);
+
+    return(~crc);
+}
+
+#endif /*ARM_CRC32_INTRINSIC*/
+
+
 /* CRC32 software implementation. */
 
 /* Precalculated table used to generate the CRC32 if the CPU does not
@@ -727,6 +800,15 @@ ut_crc32_init()
 
 #endif /* defined(__GNUC__) && defined(__x86_64__) */
 
+#ifdef ARM_CRC32_INTRINSIC
+    ut_crc32_sse2_enabled = 0x1;
+    if (ut_crc32_sse2_enabled) {
+        ut_crc32 = ut_crc32_aarch64;
+        ut_crc32_legacy_big_endian = NULL;
+        ut_crc32_byte_by_byte = ut_crc32_byte_by_byte_aarch64;
+    }
+#endif
+
     if (!ut_crc32_sse2_enabled) {
         ut_crc32_slice8_table_init();
         ut_crc32 = ut_crc32_sw;

From 94dcdaf0f8bae40ece3f1ec2665c27e9131dcc86 Mon Sep 17 00:00:00 2001
From: guyuqi
Date: Fri, 7 Apr 2017 13:08:52 +0800
Subject: [PATCH 2/2] Update ut0crc32.cc

ARMv8 defines the PMULL crypto instruction. This patch optimizes the crc32c
calculation by using PMULL, when available, instead of the original linear
crc32 instructions.
---
 storage/innobase/ut/ut0crc32.cc | 107 ++++++++++++++++++++++++++++++++++------
 1 file changed, 93 insertions(+), 14 deletions(-)

diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc
index b94408d..b7d003c 100644
--- a/storage/innobase/ut/ut0crc32.cc
+++ b/storage/innobase/ut/ut0crc32.cc
@@ -116,10 +116,52 @@ ut_crc32_swap_byteorder(
 
 /* CRC32 hardware implementation. */
+
+/*For AArch64*/
 #ifdef ENABLE_ARMV8_CRC32
-#define ARM_CRC32_INTRINSIC
 #include <arm_acle.h>
 #include <arm_neon.h>
+
+#define ARM_CRC32_INTRINSIC
+
+#define CRC32C3X8(buffer,ITR) \
+    crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+    crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+    crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+
+#define CRC32C7X3X8(buffer,ITR) do {\
+    CRC32C3X8(buffer,(ITR)*7+0) \
+    CRC32C3X8(buffer,(ITR)*7+1) \
+    CRC32C3X8(buffer,(ITR)*7+2) \
+    CRC32C3X8(buffer,(ITR)*7+3) \
+    CRC32C3X8(buffer,(ITR)*7+4) \
+    CRC32C3X8(buffer,(ITR)*7+5) \
+    CRC32C3X8(buffer,(ITR)*7+6) \
+    } while(0)
+
+#define PREF4X64L1(buffer,PREF_OFFSET, ITR) \
+    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+    __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL1(buffer,PREF_OFFSET) \
+    PREF4X64L1(buffer,(PREF_OFFSET), 0) \
+    PREF4X64L1(buffer,(PREF_OFFSET), 4) \
+    PREF4X64L1(buffer,(PREF_OFFSET), 8) \
+    PREF4X64L1(buffer,(PREF_OFFSET), 12)
+
+#define PREF4X64L2(buffer,PREF_OFFSET, ITR) \
+    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+    __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL2(buffer,PREF_OFFSET) \
+    PREF4X64L2(buffer,(PREF_OFFSET), 0) \
+    PREF4X64L2(buffer,(PREF_OFFSET), 4) \
+    PREF4X64L2(buffer,(PREF_OFFSET), 8) \
+    PREF4X64L2(buffer,(PREF_OFFSET), 12)
 #else
 #undef ARM_CRC32_INTRINSIC
 #endif
@@ -430,15 +472,12 @@ ut_crc32_byte_by_byte_hw(
 }
 #endif /* defined(__GNUC__) && defined(__x86_64__) */
 
-/*************
- * For AArch64
- */
 
 #ifdef ARM_CRC32_INTRINSIC
 uint32_t
 ut_crc32_byte_by_byte_aarch64(
-    const byte* buf,
-    ulint       len)
+    const uint8_t* buf,
+    uint64_t       len)
 {
     uint32_t crc = 0xFFFFFFFFU;
@@ -452,11 +491,10 @@ ut_crc32_byte_by_byte_aarch64(
 
     return(~crc);
 }
-
 uint32_t
 ut_crc32_aarch64(
-    const byte* buf,
-    ulint       len)
+    const uint8_t* buf,
+    uint64_t       len)
 {
     register uint32_t crc = 0xFFFFFFFFU;
     register const uint16_t *buf2;
@@ -465,8 +503,52 @@ ut_crc32_aarch64(
 
     ut_a(ut_crc32_sse2_enabled);
 
+    uint32_t crc0, crc1, crc2;
     int64_t length = (int64_t)len;
     buf8 = (const uint64_t *)(const void *)buf;
+
+    /* Calculate reflected crc with PMULL Instruction */
+    const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+    uint64_t t0, t1;
+
+    /* crc done "by 3" for fixed input block size of 1024 bytes */
+    while ((length -= 1024) >= 0) {
+        /* Prefetch data for following block to avoid cache miss */
+        PREF1KL2(buf,1024*3);
+        /* Do first 8 bytes here for better pipelining */
+        crc0 = __crc32cd(crc, *buf8++);
+        crc1 = 0;
+        crc2 = 0;
+
+        /* Process block inline
+           Process crc0 last to avoid dependency with above */
+        CRC32C7X3X8(buf8,0);
+        CRC32C7X3X8(buf8,1);
+        CRC32C7X3X8(buf8,2);
+        CRC32C7X3X8(buf8,3);
+        CRC32C7X3X8(buf8,4);
+        CRC32C7X3X8(buf8,5);
+
+        buf8 += 42*3;
+        /* Prefetch data for following block to avoid cache miss */
+        PREF1KL1((uint8_t *)buf8,1024);
+
+        /* Merge crc0 and crc1 into crc2
+           crc1 multiply by K2
+           crc0 multiply by K1 */
+        t1 = (uint64_t)vmull_p64(crc1, k2);
+        t0 = (uint64_t)vmull_p64(crc0, k1);
+        crc = __crc32cd(crc2, *buf8++);
+        crc1 = __crc32cd(0, t1);
+        crc ^= crc1;
+        crc0 = __crc32cd(0, t0);
+        crc ^= crc0;
+    }
+
+    if(!(length += 1024))
+        return (~crc);
+
     while ((length -= sizeof(uint64_t)) >= 0) {
         crc = __crc32cd(crc, *buf8++);
     }
@@ -475,13 +557,11 @@ ut_crc32_aarch64(
     buf4 = (const uint32_t *)(const void *)buf8;
     if (length & sizeof(uint32_t)) {
         crc = __crc32cw(crc, *buf4++);
-        length -= 4;
     }
 
     buf2 = (const uint16_t *)(const void *)buf4;
     if (length & sizeof(uint16_t)) {
         crc = __crc32ch(crc, *buf2++);
-        length -= 2;
     }
 
     buf = (const uint8_t *)(const void *)buf2;
@@ -490,8 +570,7 @@ ut_crc32_aarch64(
 
     return(~crc);
 }
-
-#endif /*ARM_CRC32_INTRINSIC*/
+#endif
 
 /* CRC32 software implementation. */
 
@@ -504,7 +583,7 @@ static bool ut_crc32_slice8_table_initialized = false;
 /********************************************************************//**
 Initializes the table that is used to generate the CRC32 if the CPU does
 not have support for it. */
-static
+//static
 void
 ut_crc32_slice8_table_init()
 /*========================*/
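
Sanity check (not part of either patch): the byte-by-byte path added in patch 1 and kept by patch 2 as ut_crc32_byte_by_byte_aarch64() can be exercised outside InnoDB with a small standalone program. The sketch below is only an illustration; the file name crc32c_check.c, the helper name crc32c_bytes() and the build flag are assumptions, not something the patches provide. It runs the same __crc32cb() loop over the standard CRC-32C check string "123456789" and compares against the published check value 0xE3069283.

/* crc32c_check.c - minimal sketch, assuming GCC on an AArch64 host.
   Build (assumed): gcc -O2 -march=armv8-a+crc crc32c_check.c */
#include <arm_acle.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same byte-at-a-time scheme as ut_crc32_byte_by_byte_aarch64(): start from
   0xFFFFFFFF, fold in each byte with the CRC32CB instruction, invert at the end. */
static uint32_t crc32c_bytes(const uint8_t *buf, size_t len)
{
    uint32_t crc = 0xFFFFFFFFU;
    while (len-- > 0) {
        crc = __crc32cb(crc, *buf++);
    }
    return ~crc;
}

int main(void)
{
    const char *msg = "123456789";
    uint32_t crc = crc32c_bytes((const uint8_t *) msg, strlen(msg));

    /* 0xE3069283 is the standard CRC-32C (Castagnoli) check value for "123456789". */
    printf("crc32c = 0x%08X (expected 0xE3069283)\n", (unsigned) crc);
    return crc == 0xE3069283U ? 0 : 1;
}

Because the block-oriented ut_crc32_aarch64() path must produce the same value as this simple loop for any input, the same harness can be pointed at larger random buffers to cross-check the PMULL merge constants introduced in patch 2.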