| Bug #99190 | my_tosort_unicode: unnecessary max_char check while utf8-mb3 processing | ||
|---|---|---|---|
| Submitted: | 6 Apr 2020 11:51 | Modified: | 6 Apr 2020 13:23 |
| Reporter: | Georgy Kirichenko | Email Updates: | |
| Status: | Verified | Impact on me: | |
| Category: | MySQL Server: Charsets | Severity: | S5 (Performance) |
| Version: | 8.0 | OS: | Any |
| Assigned to: | CPU Architecture: | Any | |
[6 Apr 2020 11:57]
Georgy Kirichenko
diff --git a/strings/ctype-utf8.cc b/strings/ctype-utf8.cc
index 37bae420ad3..13c7f291331 100644
--- a/strings/ctype-utf8.cc
+++ b/strings/ctype-utf8.cc
@@ -5002,13 +5002,25 @@ const MY_UNICASE_CHARACTER *my_unicase_pages_unicode520[4352] = {
MY_UNICASE_INFO my_unicase_unicode520 = {0x10FFFF, my_unicase_pages_unicode520};
-static inline void my_tosort_unicode(const MY_UNICASE_INFO *uni_plane,
- my_wc_t *wc, uint flags) {
+static inline void
+my_tosort_unicode_convert(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc,
+ uint flags)
+{
+ const MY_UNICASE_CHARACTER *page;
+
+ DBUG_ASSERT(*wc <= uni_plane->maxchar);
+
+ if ((page= uni_plane->page[*wc >> 8]))
+ *wc= (flags & MY_CS_LOWER_SORT) ?
+ page[*wc & 0xFF].tolower :
+ page[*wc & 0xFF].sort;
+}
+
+static inline void
+my_tosort_unicode(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc, uint flags)
+{
if (*wc <= uni_plane->maxchar) {
- const MY_UNICASE_CHARACTER *page;
- if ((page = uni_plane->page[*wc >> 8]))
- *wc = (flags & MY_CS_LOWER_SORT) ? page[*wc & 0xFF].tolower
- : page[*wc & 0xFF].sort;
+ my_tosort_unicode_convert(uni_plane, wc, flags);
} else {
*wc = MY_CS_REPLACEMENT_CHARACTER;
}
@@ -5238,7 +5250,10 @@ static size_t my_strxfrm_pad_unicode(uchar *str, uchar *strend) {
one wide character). This is inlined because the call overhead of
mb_wc() would otherwise be quite large.
*/
-template <class Mb_wc>
+
+typedef void (*tosort_unicode_t)(const MY_UNICASE_INFO *, my_wc_t *, uint);
+
+template <tosort_unicode_t tosort_unicode, class Mb_wc>
static inline size_t my_strnxfrm_unicode_tmpl(const CHARSET_INFO *cs,
Mb_wc mb_wc, uchar *dst,
size_t dstlen, uint nweights,
@@ -5286,7 +5301,7 @@ static inline size_t my_strnxfrm_unicode_tmpl(const CHARSET_INFO *cs,
goto pad;
src += res;
- my_tosort_unicode(uni_plane, &wc, cs->state);
+ tosort_unicode(uni_plane, &wc, cs->state);
dst = store16be(dst, wc);
}
@@ -5296,7 +5311,7 @@ static inline size_t my_strnxfrm_unicode_tmpl(const CHARSET_INFO *cs,
my_wc_t wc;
int res = mb_wc(&wc, src, se);
if (res > 0) {
- my_tosort_unicode(uni_plane, &wc, cs->state);
+ tosort_unicode(uni_plane, &wc, cs->state);
src += res;
*dst++ = (uchar)(wc >> 8);
}
@@ -5327,14 +5342,14 @@ size_t my_strnxfrm_unicode(const CHARSET_INFO *cs, uchar *dst, size_t dstlen,
// my_mb_wc_utf8 is so common that we special-case it; short-circuit away
// the thunk, and get it inlined.
if (cs->cset->mb_wc == my_mb_wc_utf8_thunk) {
- return my_strnxfrm_unicode_tmpl(cs, Mb_wc_utf8(), dst, dstlen, nweights,
- src, srclen, flags);
+ return my_strnxfrm_unicode_tmpl<my_tosort_unicode_convert>(cs, Mb_wc_utf8(),
+ dst, dstlen, nweights, src, srclen, flags);
} else {
// Fallback using a function pointer (which the compiler is unlikely
// to be able to optimize away).
Mb_wc_through_function_pointer mb_wc(cs);
- return my_strnxfrm_unicode_tmpl(cs, mb_wc, dst, dstlen, nweights, src,
- srclen, flags);
+ return my_strnxfrm_unicode_tmpl<my_tosort_unicode>(cs, mb_wc, dst, dstlen,
+ nweights, src, srclen, flags);
}
}
@@ -5696,8 +5711,8 @@ static int my_strnncoll_utf8(const CHARSET_INFO *cs, const uchar *s,
return bincmp(s, se, t, te);
}
- my_tosort_unicode(uni_plane, &s_wc, cs->state);
- my_tosort_unicode(uni_plane, &t_wc, cs->state);
+ my_tosort_unicode_convert(uni_plane, &s_wc, cs->state);
+ my_tosort_unicode_convert(uni_plane, &t_wc, cs->state);
if (s_wc != t_wc) {
return s_wc > t_wc ? 1 : -1;
@@ -5752,8 +5767,8 @@ static int my_strnncollsp_utf8(const CHARSET_INFO *cs, const uchar *s,
return bincmp(s, se, t, te);
}
- my_tosort_unicode(uni_plane, &s_wc, cs->state);
- my_tosort_unicode(uni_plane, &t_wc, cs->state);
+ my_tosort_unicode_convert(uni_plane, &s_wc, cs->state);
+ my_tosort_unicode_convert(uni_plane, &t_wc, cs->state);
if (s_wc != t_wc) {
return s_wc > t_wc ? 1 : -1;
[6 Apr 2020 13:23]
MySQL Verification Team
Hi Mr. Kirichenko, Thank you for your performance bug report. I found out that your patch improves performance, although not much. Still, any improvement is welcome. Thank you very much for your contribution. Verified as reported.

Description: my_tosort_unicode function checks an input character against uni_plane max char value here: 5004 static inline void my_tosort_unicode(const MY_UNICASE_INFO *uni_plane, 5005 my_wc_t *wc, uint flags) { -- Always true in case of utf8-mb3 5006 if (*wc <= uni_plane->maxchar) { 5007 const MY_UNICASE_CHARACTER *page; 5008 if ((page = uni_plane->page[*wc >> 8])) 5009 *wc = (flags & MY_CS_LOWER_SORT) ? page[*wc & 0xFF].tolower 5010 : page[*wc & 0xFF].sort; 5011 } else { 5012 *wc = MY_CS_REPLACEMENT_CHARACTER; 5013 } 5014 } But utf8-mb3 encodes only 2-bytes and there is no uniplanes with max char less than 65535 so such check is not required. Getting rid of this results in a small performance gain (tested on amd64 and aarch64 with sysbench ro test) How to repeat: Remove this check and compare sysbench performance results. Suggested fix: my_strnxfrm_unicode already has an utf8-mb3 optimization so I suggest to introduce a new function like my_tosort_unicode_convert without the check and call this function in my_strnxfrm_unicode version for utf8-mb3