From c4f9b226195a948a2e057a0dc91a20cb6a8f53ae Mon Sep 17 00:00:00 2001
From: Yibo Cai
Date: Sun, 19 Mar 2023 17:15:22 +0800
Subject: [PATCH] Optimize FindLowestBitSet for Arm64

FindLowestBitSet is optimized on x86 by taking advantage of the fact
that the input cannot be zero. This patch implements similar refinement
on Arm64, and reduces assembly opcodes from 4 to 2. [1]
Microbenchmark shows 30% improvement on Graviton-3 (Neoverse-V1). [2]

[1] https://godbolt.org/z/9GWq79a69
[2] https://github.com/cyb70289/mytests/blob/master/bench-lsb.cc
---
 sql/join_optimizer/bit_utils.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sql/join_optimizer/bit_utils.h b/sql/join_optimizer/bit_utils.h
index 56a831d3..9f79f2db 100644
--- a/sql/join_optimizer/bit_utils.h
+++ b/sql/join_optimizer/bit_utils.h
@@ -82,6 +82,9 @@ inline size_t FindLowestBitSet(uint64_t x) {
   size_t idx;
   asm("bsfq %1,%q0" : "=r"(idx) : "rm"(x));
   return idx;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  // https://godbolt.org/z/9GWq79a69
+  return static_cast<size_t>(__builtin_ctzll(x));
 #else
   // The cast to unsigned at least gets rid of the sign extension.
   return static_cast<unsigned>(ffsll(x)) - 1u;
-- 
2.25.1