| Bug #99190 | my_tosort_unicode: unnecessary max_char check while utf8-mb3 processing | ||
|---|---|---|---|
| Submitted: | 6 Apr 2020 11:51 | Modified: | 6 Apr 2020 13:23 | 
| Reporter: | Georgy Kirichenko | Email Updates: | |
| Status: | Verified | Impact on me: | |
| Category: | MySQL Server: Charsets | Severity: | S5 (Performance) | 
| Version: | 8.0 | OS: | Any | 
| Assigned to: | CPU Architecture: | Any | |
   [6 Apr 2020 11:57]
   Georgy Kirichenko        
  diff --git a/strings/ctype-utf8.cc b/strings/ctype-utf8.cc
index 37bae420ad3..13c7f291331 100644
--- a/strings/ctype-utf8.cc
+++ b/strings/ctype-utf8.cc
@@ -5002,13 +5002,25 @@ const MY_UNICASE_CHARACTER *my_unicase_pages_unicode520[4352] = {
 
 MY_UNICASE_INFO my_unicase_unicode520 = {0x10FFFF, my_unicase_pages_unicode520};
 
-static inline void my_tosort_unicode(const MY_UNICASE_INFO *uni_plane,
-                                     my_wc_t *wc, uint flags) {
+static inline void
+my_tosort_unicode_convert(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc,
+                          uint flags)
+{
+  const MY_UNICASE_CHARACTER *page;
+
+  DBUG_ASSERT(*wc <= uni_plane->maxchar);
+
+  if ((page= uni_plane->page[*wc >> 8]))
+    *wc= (flags & MY_CS_LOWER_SORT) ?
+         page[*wc & 0xFF].tolower :
+         page[*wc & 0xFF].sort;
+}
+
+static inline void
+my_tosort_unicode(const MY_UNICASE_INFO *uni_plane, my_wc_t *wc, uint flags)
+{
   if (*wc <= uni_plane->maxchar) {
-    const MY_UNICASE_CHARACTER *page;
-    if ((page = uni_plane->page[*wc >> 8]))
-      *wc = (flags & MY_CS_LOWER_SORT) ? page[*wc & 0xFF].tolower
-                                       : page[*wc & 0xFF].sort;
+    my_tosort_unicode_convert(uni_plane, wc, flags);
   } else {
     *wc = MY_CS_REPLACEMENT_CHARACTER;
   }
@@ -5238,7 +5250,10 @@ static size_t my_strxfrm_pad_unicode(uchar *str, uchar *strend) {
   one wide character). This is inlined because the call overhead of
   mb_wc() would otherwise be quite large.
 */
-template <class Mb_wc>
+
+typedef void (*tosort_unicode_t)(const MY_UNICASE_INFO *, my_wc_t *, uint);
+
+template <tosort_unicode_t tosort_unicode, class Mb_wc>
 static inline size_t my_strnxfrm_unicode_tmpl(const CHARSET_INFO *cs,
                                               Mb_wc mb_wc, uchar *dst,
                                               size_t dstlen, uint nweights,
@@ -5286,7 +5301,7 @@ static inline size_t my_strnxfrm_unicode_tmpl(const CHARSET_INFO *cs,
         goto pad;
       src += res;
 
-      my_tosort_unicode(uni_plane, &wc, cs->state);
+      tosort_unicode(uni_plane, &wc, cs->state);
 
       dst = store16be(dst, wc);
     }
@@ -5296,7 +5311,7 @@ static inline size_t my_strnxfrm_unicode_tmpl(const CHARSET_INFO *cs,
       my_wc_t wc;
       int res = mb_wc(&wc, src, se);
       if (res > 0) {
-        my_tosort_unicode(uni_plane, &wc, cs->state);
+        tosort_unicode(uni_plane, &wc, cs->state);
         src += res;
         *dst++ = (uchar)(wc >> 8);
       }
@@ -5327,14 +5342,14 @@ size_t my_strnxfrm_unicode(const CHARSET_INFO *cs, uchar *dst, size_t dstlen,
   // my_mb_wc_utf8 is so common that we special-case it; short-circuit away
   // the thunk, and get it inlined.
   if (cs->cset->mb_wc == my_mb_wc_utf8_thunk) {
-    return my_strnxfrm_unicode_tmpl(cs, Mb_wc_utf8(), dst, dstlen, nweights,
-                                    src, srclen, flags);
+    return my_strnxfrm_unicode_tmpl<my_tosort_unicode_convert>(cs, Mb_wc_utf8(),
+                                    dst, dstlen, nweights, src, srclen, flags);
   } else {
     // Fallback using a function pointer (which the compiler is unlikely
     // to be able to optimize away).
     Mb_wc_through_function_pointer mb_wc(cs);
-    return my_strnxfrm_unicode_tmpl(cs, mb_wc, dst, dstlen, nweights, src,
-                                    srclen, flags);
+    return my_strnxfrm_unicode_tmpl<my_tosort_unicode>(cs, mb_wc, dst, dstlen,
+                                    nweights, src, srclen, flags);
   }
 }
 
@@ -5696,8 +5711,8 @@ static int my_strnncoll_utf8(const CHARSET_INFO *cs, const uchar *s,
       return bincmp(s, se, t, te);
     }
 
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
+    my_tosort_unicode_convert(uni_plane, &s_wc, cs->state);
+    my_tosort_unicode_convert(uni_plane, &t_wc, cs->state);
 
     if (s_wc != t_wc) {
       return s_wc > t_wc ? 1 : -1;
@@ -5752,8 +5767,8 @@ static int my_strnncollsp_utf8(const CHARSET_INFO *cs, const uchar *s,
       return bincmp(s, se, t, te);
     }
 
-    my_tosort_unicode(uni_plane, &s_wc, cs->state);
-    my_tosort_unicode(uni_plane, &t_wc, cs->state);
+    my_tosort_unicode_convert(uni_plane, &s_wc, cs->state);
+    my_tosort_unicode_convert(uni_plane, &t_wc, cs->state);
 
     if (s_wc != t_wc) {
       return s_wc > t_wc ? 1 : -1;
 
   [6 Apr 2020 13:23]
   MySQL Verification Team        
  Hi Mr. Kirichenko, Thank you for your performance bug report. I found out that your patch improves performance, although not much. Still, any improvement is welcome. Thank you very much for your contribution. Verified as reported.


Description: my_tosort_unicode function checks an input character against uni_plane max char value here: 5004 static inline void my_tosort_unicode(const MY_UNICASE_INFO *uni_plane, 5005 my_wc_t *wc, uint flags) { -- Always true in case of utf8-mb3 5006 if (*wc <= uni_plane->maxchar) { 5007 const MY_UNICASE_CHARACTER *page; 5008 if ((page = uni_plane->page[*wc >> 8])) 5009 *wc = (flags & MY_CS_LOWER_SORT) ? page[*wc & 0xFF].tolower 5010 : page[*wc & 0xFF].sort; 5011 } else { 5012 *wc = MY_CS_REPLACEMENT_CHARACTER; 5013 } 5014 } But utf8-mb3 encodes only 2-bytes and there is no uniplanes with max char less than 65535 so such check is not required. Getting rid of this results in a small performance gain (tested on amd64 and aarch64 with sysbench ro test) How to repeat: Remove this check and compare sysbench performance results. Suggested fix: my_strnxfrm_unicode already has an utf8-mb3 optimization so I suggest to introduce a new function like my_tosort_unicode_convert without the check and call this function in my_strnxfrm_unicode version for utf8-mb3