diff --git a/strings/ctype-utf8.cc b/strings/ctype-utf8.cc index b69b4b447f9..5157da58e73 100644 --- a/strings/ctype-utf8.cc +++ b/strings/ctype-utf8.cc @@ -37,6 +37,7 @@ #include #include "m_ctype.h" +#include "m_string.h" #include "my_byteorder.h" #include "my_compiler.h" #include "my_dbug.h" @@ -5581,19 +5582,10 @@ static void my_hash_sort_utf8(const CHARSET_INFO *cs, const uchar *s, size_t slen, uint64 *n1, uint64 *n2) { my_wc_t wc; int res; - const uchar *e = s + slen; + const uchar *e = skip_trailing_space(s, slen); const MY_UNICASE_INFO *uni_plane = cs->caseinfo; - uint64 tmp1; - uint64 tmp2; - - /* - Remove end space. We have to do this to be able to compare - 'A ' and 'A' as identical - */ - while (e > s && e[-1] == ' ') e--; - - tmp1 = *n1; - tmp2 = *n2; + uint64 tmp1 = *n1; + uint64 tmp2 = *n2; while ((s < e) && (res = my_mb_wc_utf8(&wc, s, e)) > 0) { my_tosort_unicode(uni_plane, &wc, cs->state); @@ -5708,6 +5700,27 @@ static int my_strnncoll_utf8(const CHARSET_INFO *cs, const uchar *s, return (int)(t_is_prefix ? t - te : ((se - s) - (te - t))); } +/** + Simultaneously skip space for two strings (ASCII spaces only). 
/**
  Skip the common run of leading ASCII spaces (0x20) in two byte ranges.

  Both cursors are advanced in lockstep: the same number of bytes is skipped
  in each string, and skipping stops as soon as either cursor reaches its end
  or either current byte is not 0x20. Only the single-byte ASCII space is
  recognised; no other whitespace is treated as a space.

  Helper for the my_strnncollsp_utf8 and my_strnncollsp_utf8mb4 functions.

  @param[in,out] sp  Cursor into the first string; advanced past the spaces.
  @param[in,out] tp  Cursor into the second string; advanced by the same
                     number of bytes as *sp.
  @param         se  End of the first string (one past its last byte).
  @param         te  End of the second string (one past its last byte).
*/
static inline void skip_space(const uchar **sp, const uchar **tp,
                              const uchar *const se, const uchar *const te) {
  /* Eight consecutive 0x20 bytes; the value is the same in any byte order. */
  const uint64_t eight_spaces = 0x2020202020202020ULL;

  /*
    Fast path: examine eight bytes of each string at a time.

    The remaining length is obtained by pointer subtraction rather than by
    computing "*sp + 8", so no pointer beyond the end of either range is
    ever formed. memcpy() is used for the loads because the cursors are not
    necessarily 8-byte aligned.
  */
  while (se - *sp >= 8 && te - *tp >= 8) {
    uint64_t s_word;
    uint64_t t_word;
    memcpy(&s_word, *sp, sizeof(s_word));
    memcpy(&t_word, *tp, sizeof(t_word));
    if (s_word != eight_spaces || t_word != eight_spaces) break;

    *sp += sizeof(s_word);
    *tp += sizeof(t_word);
  }

  /* Slow path: finish byte by byte (tails shorter than 8, or mixed words). */
  while (*sp < se && *tp < te && **sp == 0x20 && **tp == 0x20) {
    ++*sp;
    ++*tp;
  }
}