diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc
index 979713f..c35802d 100644
--- a/storage/innobase/ut/ut0crc32.cc
+++ b/storage/innobase/ut/ut0crc32.cc
@@ -114,11 +114,21 @@ ut_crc32_swap_byteorder(
 		| i >> 56);
 }
 
+/* CRC32 hardware implementation. */
+//#if 0
+#ifdef ENABLE_ARMV8_CRC32
+#define ARM_CRC32_INTRINSIC
+#include <arm_acle.h>
+#include <arm_neon.h>
+#else
+#undef ARM_CRC32_INTRINSIC
+#endif
 
 /* Flag that tells whether the CPU supports CRC32 or not */
 bool	ut_crc32_sse2_enabled = false;
 
+
 #if defined(__GNUC__) && defined(__x86_64__)
 /********************************************************************//**
 Fetches CPU info */
@@ -421,17 +431,238 @@ ut_crc32_byte_by_byte_hw(
 }
 #endif /* defined(__GNUC__) && defined(__x86_64__) */
 
+/*************************For AArch64*********************************************
+ *******************************************************************************
+ */
+
+#ifdef ARM_CRC32_INTRINSIC
+
+/******************************************************
+ *
+ * For optimization based on crc+crypto instructions
+ *
+ ********************************************************/
+#define CRC32C3X8(buffer,ITR) \
+	crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+	crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+	crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+
+#define CRC32C7X3X8(buffer,ITR) do {\
+	CRC32C3X8(buffer,(ITR)*7+0) \
+	CRC32C3X8(buffer,(ITR)*7+1) \
+	CRC32C3X8(buffer,(ITR)*7+2) \
+	CRC32C3X8(buffer,(ITR)*7+3) \
+	CRC32C3X8(buffer,(ITR)*7+4) \
+	CRC32C3X8(buffer,(ITR)*7+5) \
+	CRC32C3X8(buffer,(ITR)*7+6) \
+	} while(0)
+
+
+#define PREF4X64L1(buffer,PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL1(buffer,PREF_OFFSET) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 0) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 4) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 8) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 12)
+
+#define PREF4X64L2(buffer,PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL2(buffer,PREF_OFFSET) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 0) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 4) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 8) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 12)
+
+uint32_t
+ut_crc32_byte_by_byte_aarch64(
+	const uint8_t*	buf,
+	uint64_t	len)
+{
+	uint32_t	crc = 0xFFFFFFFFU;
+
+	ut_a(ut_crc32_sse2_enabled);
+
+	while (len > 0) {
+		crc = __crc32cb(crc, *buf++);
+		len--;
+	}
+
+	return(~crc);
+}
+
+uint32_t
+ut_crc32_aarch64(
+	const uint8_t*	buf,
+	uint64_t	len)
+{
+	register uint32_t		crc = 0xFFFFFFFFU;
+	register const uint16_t*	buf2;
+	register const uint32_t*	buf4;
+	register const uint64_t*	buf8;
+
+	ut_a(ut_crc32_sse2_enabled);
+
+#if 0
+	int64_t	length = (int64_t)len;
+	buf8 = (const uint64_t *)(const void *)buf;
+
+	while ((length -= sizeof(uint64_t)) >= 0) {
+		crc = __crc32cd(crc, *buf8++);
+	}
+
+	/* The following is more efficient than the straight loop */
+	buf4 = (const uint32_t *)(const void *)buf8;
+	if (length & sizeof(uint32_t)) {
+		crc = __crc32cw(crc, *buf4++);
+	}
+
+	buf2 = (const uint16_t *)(const void *)buf4;
+	if (length & sizeof(uint16_t)) {
+		crc = __crc32ch(crc, *buf2++);
+	}
+
+	buf = (const uint8_t *)(const void *)buf2;
+	if (length & sizeof(uint8_t))
+		crc = __crc32cb(crc, *buf);
+#endif
+
+#if 0
+	/* Calculate byte-by-byte up to an 8-byte aligned address. After
+	this consume the input 8-bytes at a time. */
+	while (len > 0 && (reinterpret_cast<uintptr_t>(buf) & 7) != 0) {
+		crc = __crc32cb(crc, *buf++);
+		len--;
+	}
+
+	buf8 = (const uint64_t *)(const void *)buf;
+	while (len >= 128) {
+		/* This call is repeated 16 times. 16 * 8 = 128. */
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		crc = __crc32cd(crc, *buf8++);
+		len -= 128;
+	}
+
+	while (len >= 8) {
+		crc = __crc32cd(crc, *buf8++);
+		len -= 8;
+	}
+
+	buf = (const unsigned char *)buf8;
+
+	if (len) do {
+		crc = __crc32cb(crc, *buf++);
+	} while (--len);
+#endif /* if 0 */
+
+#if 1
+	uint32_t	crc0, crc1, crc2;
+	int64_t		length = (int64_t)len;
+	buf8 = (const uint64_t *)(const void *)buf;
+
+	/* Calculate reflected crc with PMULL Instruction */
+	const poly64_t	k1 = 0xe417f38a, k2 = 0x8f158014;
+	uint64_t	t0, t1;
+
+	/* crc done "by 3" for fixed input block size of 1024 bytes */
+	while ((length -= 1024) >= 0) {
+		/* Prefetch data for following block to avoid cache miss */
+		PREF1KL2(buf,1024*3);
+		/* Do first 8 bytes here for better pipelining */
+		crc0 = __crc32cd(crc, *buf8++);
+		crc1 = 0;
+		crc2 = 0;
+
+		/* Process block inline
+		Process crc0 last to avoid dependency with above */
+		CRC32C7X3X8(buf8,0);
+		CRC32C7X3X8(buf8,1);
+		CRC32C7X3X8(buf8,2);
+		CRC32C7X3X8(buf8,3);
+		CRC32C7X3X8(buf8,4);
+		CRC32C7X3X8(buf8,5);
+
+		buf8 += 42*3;
+		/* Prefetch data for following block to avoid cache miss */
+		PREF1KL1((uint8_t *)buf8,1024);
+
+		/* Merge crc0 and crc1 into crc2
+		crc1 multiply by K2
+		crc0 multiply by K1 */
+
+		t1 = (uint64_t)vmull_p64(crc1, k2);
+		t0 = (uint64_t)vmull_p64(crc0, k1);
+		crc = __crc32cd(crc2, *buf8++);
+		crc1 = __crc32cd(0, t1);
+		crc ^= crc1;
+		crc0 = __crc32cd(0, t0);
+		crc ^= crc0;
+	}
+
+	if (!(length += 1024))
+		return (~crc);
+
+	while ((length -= sizeof(uint64_t)) >= 0) {
+		crc = __crc32cd(crc, *buf8++);
+	}
+
+	/* The following is more efficient than the straight loop */
+	buf4 = (const uint32_t *)(const void *)buf8;
+	if (length & sizeof(uint32_t)) {
+		crc = __crc32cw(crc, *buf4++);
+	}
+
+	buf2 = (const uint16_t *)(const void *)buf4;
+	if (length & sizeof(uint16_t)) {
+		crc = __crc32ch(crc, *buf2++);
+	}
+
+	buf = (const uint8_t *)(const void *)buf2;
+	if (length & sizeof(uint8_t))
+		crc = __crc32cb(crc, *buf);
+#endif /* if 1 */
+
+	return(~crc);
+}
+
+#endif /* ARM_CRC32_INTRINSIC */
+
+
 /* CRC32 software implementation. */
 
 /* Precalculated table used to generate the CRC32 if the CPU does
 not have support for it */
 static uint32_t	ut_crc32_slice8_table[8][256];
-static bool	ut_crc32_slice8_table_initialized = false;
+bool	ut_crc32_slice8_table_initialized = false;
 
 /********************************************************************//**
 Initializes the table that is used to generate the CRC32 if the CPU does
 not have support for it. */
-static
+//static
 void
 ut_crc32_slice8_table_init()
 /*========================*/
@@ -563,8 +794,8 @@ ut_crc32_64_legacy_big_endian_sw(
 @return CRC-32C (polynomial 0x11EDC6F41) */
 uint32_t
 ut_crc32_sw(
-	const byte*	buf,
-	ulint		len)
+	const uint8_t*	buf,
+	uint64_t	len)
 {
 	uint32_t	crc = 0xFFFFFFFFU;
 
@@ -727,6 +958,15 @@ ut_crc32_init()
 
 #endif /* defined(__GNUC__) && defined(__x86_64__) */
 
+#ifdef ARM_CRC32_INTRINSIC
+	ut_crc32_sse2_enabled = 0x1;
+	if (ut_crc32_sse2_enabled) {
+		ut_crc32 = ut_crc32_aarch64;
+		ut_crc32_legacy_big_endian = NULL;
+		ut_crc32_byte_by_byte = ut_crc32_byte_by_byte_aarch64;
+	}
+#endif
+
 	if (!ut_crc32_sse2_enabled) {
 		ut_crc32_slice8_table_init();
 		ut_crc32 = ut_crc32_sw;
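
Note (not part of the patch): the hot path above is built on the ACLE CRC32C intrinsics (__crc32cb/__crc32ch/__crc32cw/__crc32cd from <arm_acle.h>) followed by a doubleword/word/halfword/byte tail ladder. Below is a minimal, standalone cross-check sketch of that idea; the file name, test data and main() harness are illustrative only, and unlike the patch it uses memcpy() loads so it stays alignment-safe without the byte-wise alignment prologue. It assumes an AArch64 toolchain with the CRC extension, e.g. gcc -O2 -march=armv8-a+crc crc32c_check.c, and validates the intrinsic ladder against a bit-by-bit software CRC-32C (reflected polynomial 0x82F63B78), which is the kind of check worth running before wiring ut_crc32_aarch64() into ut_crc32_init().

/* crc32c_check.c: compare ACLE CRC32C intrinsics against a bitwise model. */
#include <arm_acle.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bit-by-bit CRC-32C (Castagnoli), reflected polynomial 0x82F63B78. */
static uint32_t
crc32c_sw(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0xFFFFFFFFU;

	while (len--) {
		crc ^= *buf++;
		for (int i = 0; i < 8; i++) {
			crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78U : 0);
		}
	}
	return ~crc;
}

/* Hardware CRC-32C using the same 8/4/2/1-byte ladder as the patch. */
static uint32_t
crc32c_hw(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0xFFFFFFFFU;

	while (len >= 8) {
		uint64_t v;
		memcpy(&v, buf, 8);	/* alignment-safe 64-bit load */
		crc = __crc32cd(crc, v);
		buf += 8;
		len -= 8;
	}
	if (len & 4) {
		uint32_t v;
		memcpy(&v, buf, 4);
		crc = __crc32cw(crc, v);
		buf += 4;
	}
	if (len & 2) {
		uint16_t v;
		memcpy(&v, buf, 2);
		crc = __crc32ch(crc, v);
		buf += 2;
	}
	if (len & 1) {
		crc = __crc32cb(crc, *buf);
	}
	return ~crc;
}

int
main(void)
{
	uint8_t buf[3000];

	for (size_t i = 0; i < sizeof(buf); i++) {
		buf[i] = (uint8_t) (i * 131 + 7);
	}
	/* Exercise a range of lengths, including ones that hit every tail case. */
	for (size_t len = 0; len <= sizeof(buf); len += 61) {
		if (crc32c_sw(buf, len) != crc32c_hw(buf, len)) {
			printf("mismatch at len=%zu\n", len);
			return 1;
		}
	}
	printf("software and hardware CRC32C agree\n");
	return 0;
}

The same harness can be pointed at ut_crc32_aarch64() and ut_crc32_sw() inside the server tree to confirm that the 1024-byte "by 3" folding path and the slice-by-8 software path produce identical checksums.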