diff -Nur mysql-server-8.0-orig/include/m_ctype.h mysql-server-8.0/include/m_ctype.h
--- mysql-server-8.0-orig/include/m_ctype.h 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/include/m_ctype.h 2020-07-29 15:10:44.927344700 +0300
@@ -256,6 +256,21 @@
   void (*hash_sort)(const CHARSET_INFO *cs, const uchar *key, size_t len,
                     uint64 *nr1, uint64 *nr2);
   bool (*propagate)(const CHARSET_INFO *cs, const uchar *str, size_t len);
+  /**
+    Hash the prefix of s (len bytes) made of its first count characters.
+    strip_trailing_spaces asks for the trailing spaces of that prefix to
+    be removed before hashing.
+  */
+  void (*hash_sort_count)(const CHARSET_INFO *cs, const uchar *s, size_t len,
+                          size_t count, uint64 *nr1, uint64 *nr2,
+                          bool strip_trailing_spaces);
+  /**
+    Space-aware comparison limited to the first count characters of both
+    strings.
+  */
+  int (*strnncollsp_count)(const CHARSET_INFO *cs, const uchar *s1,
+                           size_t len1, const uchar *s2, size_t len2,
+                           size_t count, bool strip_trailing_spaces);
 } MY_COLLATION_HANDLER;
 
 extern MY_COLLATION_HANDLER my_collation_mb_bin_handler;
@@ -585,6 +600,18 @@
                          const uchar *key, size_t len, uint64 *nr1,
                          uint64 *nr2);
 
+/*
+  Generic implementations of the hash_sort_count and strnncollsp_count
+  collation handler functions.
+*/
+void my_hash_sort_count(const CHARSET_INFO *cs, const uchar *s, size_t len,
+                        size_t count, uint64 *nr1, uint64 *nr2,
+                        bool strip_trailing_spaces);
+
+int my_strnncollsp_count(const CHARSET_INFO *cs, const uchar *s1, size_t len1,
+                         const uchar *s2, size_t len2, size_t count,
+                         bool strip_trailing_spaces);
+
 size_t my_strnxfrm_mb(const CHARSET_INFO *, uchar *dst, size_t dstlen,
                       uint nweights, const uchar *src, size_t srclen,
                       uint flags);
diff -Nur mysql-server-8.0-orig/sql/field.cc mysql-server-8.0/sql/field.cc
--- mysql-server-8.0-orig/sql/field.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/sql/field.cc 2020-07-29 15:10:44.933331000 +0300
@@ -6211,29 +6211,16 @@
 }
 
 int Field_string::cmp(const uchar *a_ptr, const uchar *b_ptr) const {
-  size_t a_len, b_len;
-
-  if (field_charset->mbmaxlen != 1) {
-    uint char_len = field_length / field_charset->mbmaxlen;
-    a_len = my_charpos(field_charset, a_ptr, a_ptr + field_length, char_len);
-    b_len = my_charpos(field_charset, b_ptr, b_ptr + field_length, char_len);
-  } else
-    a_len = b_len = field_length;
-
-  if (field_charset->pad_attribute == NO_PAD &&
-      !(table->in_use->variables.sql_mode & MODE_PAD_CHAR_TO_FULL_LENGTH)) {
-    /*
-      Our CHAR default behavior is to strip spaces. For PAD SPACE collations,
-      this doesn't matter, for but NO PAD, we need to do it ourselves here.
-    */
-    a_len = field_charset->cset->lengthsp(field_charset, (const char *)a_ptr,
-                                          a_len);
-    b_len = field_charset->cset->lengthsp(field_charset, (const char *)b_ptr,
-                                          b_len);
-  }
-
-  return field_charset->coll->strnncollsp(field_charset, a_ptr, a_len, b_ptr,
-                                          b_len);
+  /*
+    Our CHAR default behavior is to strip spaces. For PAD SPACE collations,
+    this doesn't matter, but for NO PAD, we need to ask for it here.
+  */
+  const bool strip_trailing_spaces =
+      field_charset->pad_attribute == NO_PAD &&
+      !(table->in_use->variables.sql_mode & MODE_PAD_CHAR_TO_FULL_LENGTH);
+  return field_charset->coll->strnncollsp_count(
+      field_charset, a_ptr, field_length, b_ptr, field_length,
+      field_length / field_charset->mbmaxlen, strip_trailing_spaces);
 }
 
 size_t Field_string::make_sort_key(uchar *to, size_t length) const {
diff -Nur mysql-server-8.0-orig/storage/heap/hp_hash.cc mysql-server-8.0/storage/heap/hp_hash.cc
--- mysql-server-8.0-orig/storage/heap/hp_hash.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/storage/heap/hp_hash.cc 2020-07-29 16:01:51.117301200 +0300
@@ -245,38 +245,37 @@
     }
     if (seg->type == HA_KEYTYPE_TEXT) {
       const CHARSET_INFO *cs = seg->charset;
-      size_t length = seg->length;
-      if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) {
-        size_t char_length;
-        char_length = my_charpos(cs, pos, pos + length, length / cs->mbmaxlen);
-        length = std::min(length, char_length);
-      }
-      if (cs->pad_attribute == NO_PAD) {
-        /*
-          MySQL specifies that CHAR fields are stripped of
-          trailing spaces before being returned from the database.
-          Normally this is done in Field_string::val_str(),
-          but since we don't involve the Field classes for
-          hashing, we need to do the same thing here
-          for NO PAD collations. (If not, hash_sort will ignore
-          the spaces for us, so we don't need to do it here.)
- */ - length = cs->cset->lengthsp(cs, (const char *)pos, length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos, seg->length, + seg->length / cs->mbmaxlen, &nr, &nr2, + cs->pad_attribute == NO_PAD); + } else { + size_t length = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + hashing, we need to do the same thing here + for NO PAD collations. (If not, hash_sort will ignore + the spaces for us, so we don't need to do it here.) + */ + length = cs->cset->lengthsp(cs, (const char *)pos, length); + } + cs->coll->hash_sort(cs, pos, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos, length, &nr, &nr2); } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const CHARSET_INFO *cs = seg->charset; uint pack_length = 2; /* Key packing is constant */ size_t length = uint2korr(pos); - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - size_t char_length; - char_length = - my_charpos(cs, pos + pack_length, pos + pack_length + length, - seg->length / cs->mbmaxlen); - length = std::min(length, char_length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos + pack_length, length, + seg->length / cs->mbmaxlen, &nr, &nr2, false); + } else { + cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); key += pack_length; } else { for (; pos < key; pos++) { @@ -305,39 +304,37 @@ } if (seg->type == HA_KEYTYPE_TEXT) { const CHARSET_INFO *cs = seg->charset; - size_t char_length = seg->length; - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - char_length = - my_charpos(cs, pos, pos + char_length, char_length / cs->mbmaxlen); - char_length = - std::min(char_length, size_t(seg->length)); /* QQ: ok to remove? 
*/ - } - if (cs->pad_attribute == NO_PAD) { - /* - MySQL specifies that CHAR fields are stripped of - trailing spaces before being returned from the database. - Normally this is done in Field_string::val_str(), - but since we don't involve the Field classes for - hashing, we need to do the same thing here - for NO PAD collations. (If not, hash_sort will ignore - the spaces for us, so we don't need to do it here.) - */ - char_length = cs->cset->lengthsp(cs, (const char *)pos, char_length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos, seg->length, + seg->length / cs->mbmaxlen, &nr, &nr2, + cs->pad_attribute == NO_PAD); + } else { + size_t length = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + hashing, we need to do the same thing here + for NO PAD collations. (If not, hash_sort will ignore + the spaces for us, so we don't need to do it here.) + */ + length = cs->cset->lengthsp(cs, (const char *)pos, length); + } + cs->coll->hash_sort(cs, pos, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos, char_length, &nr, &nr2); } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const CHARSET_INFO *cs = seg->charset; uint pack_length = seg->bit_start; size_t length = (pack_length == 1 ? 
(uint)*pos : uint2korr(pos)); - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - size_t char_length; - char_length = - my_charpos(cs, pos + pack_length, pos + pack_length + length, - seg->length / cs->mbmaxlen); - length = std::min(length, char_length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos + pack_length, length, + seg->length / cs->mbmaxlen, &nr, &nr2, false); + } else { + cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); } else { for (; pos < end; pos++) { nr ^= (uint64)((((uint)nr & 63) + nr2) * ((uint)*pos)) + (nr << 8); @@ -375,34 +372,31 @@ } if (seg->type == HA_KEYTYPE_TEXT) { const CHARSET_INFO *cs = seg->charset; - size_t char_length1; - size_t char_length2; const uchar *pos1 = rec1 + seg->start; const uchar *pos2 = rec2 + seg->start; - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - size_t char_length = seg->length / cs->mbmaxlen; - char_length1 = my_charpos(cs, pos1, pos1 + seg->length, char_length); - char_length1 = std::min(char_length1, size_t(seg->length)); - char_length2 = my_charpos(cs, pos2, pos2 + seg->length, char_length); - char_length2 = std::min(char_length2, size_t(seg->length)); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(cs, pos1, seg->length, pos2, + seg->length, seg->length / cs->mbmaxlen, + cs->pad_attribute == NO_PAD)) + return 1; } else { - char_length1 = char_length2 = seg->length; + size_t length1 = seg->length; + size_t length2 = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + hashing, we need to do the same thing here + for NO PAD collations. (If not, hash_sort will ignore + the spaces for us, so we don't need to do it here.) 
+ */ + length1 = cs->cset->lengthsp(cs, (const char *)pos1, length1); + length2 = cs->cset->lengthsp(cs, (const char *)pos2, length2); + } + if (cs->coll->strnncollsp(cs, pos1, length1, pos2, length2)) return 1; } - if (cs->pad_attribute == NO_PAD) { - /* - MySQL specifies that CHAR fields are stripped of - trailing spaces before being returned from the database. - Normally this is done in Field_string::val_str(), - but since we don't involve the Field classes for - internal comparisons, we need to do the same thing here - for NO PAD collations. (If not, strnncollsp will ignore - the spaces for us, so we don't need to do it here.) - */ - char_length1 = cs->cset->lengthsp(cs, (const char *)pos1, char_length1); - char_length2 = cs->cset->lengthsp(cs, (const char *)pos2, char_length2); - } - if (cs->coll->strnncollsp(cs, pos1, char_length1, pos2, char_length2)) - return 1; } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const uchar *pos1 = rec1 + seg->start; @@ -419,19 +413,16 @@ pos1 += 2; pos2 += 2; } - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - uint safe_length1 = char_length1; - uint safe_length2 = char_length2; - uint char_length = seg->length / cs->mbmaxlen; - char_length1 = my_charpos(cs, pos1, pos1 + char_length1, char_length); - char_length1 = std::min(char_length1, safe_length1); - char_length2 = my_charpos(cs, pos2, pos2 + char_length2, char_length); - char_length2 = std::min(char_length2, safe_length2); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(seg->charset, pos1, char_length1, pos2, + char_length2, + seg->length / cs->mbmaxlen, false)) + return 1; + } else { + if (cs->coll->strnncollsp(seg->charset, pos1, char_length1, pos2, + char_length2)) + return 1; } - - if (cs->coll->strnncollsp(seg->charset, pos1, char_length1, pos2, - char_length2)) - return 1; } else { if (memcmp(rec1 + seg->start, rec2 + seg->start, seg->length)) return 1; } @@ -457,38 +448,30 @@ } if (seg->type == 
HA_KEYTYPE_TEXT) { const CHARSET_INFO *cs = seg->charset; - uint char_length_key; - uint char_length_rec; const uchar *pos = rec + seg->start; - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - uint char_length = seg->length / cs->mbmaxlen; - char_length_key = my_charpos(cs, key, key + seg->length, char_length); - char_length_key = std::min(char_length_key, uint(seg->length)); - char_length_rec = my_charpos(cs, pos, pos + seg->length, char_length); - char_length_rec = std::min(char_length_rec, uint(seg->length)); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(cs, pos, seg->length, key, seg->length, + seg->length / cs->mbmaxlen, + cs->pad_attribute == NO_PAD)) + return 1; } else { - char_length_key = seg->length; - char_length_rec = seg->length; - } - - if (cs->pad_attribute == NO_PAD) { - /* - MySQL specifies that CHAR fields are stripped of - trailing spaces before being returned from the database. - Normally this is done in Field_string::val_str(), - but since we don't involve the Field classes for - internal comparisons, we need to do the same thing here - for NO PAD collations. (If not, strnncollsp will ignore - the spaces for us, so we don't need to do it here.) - */ - char_length_rec = - cs->cset->lengthsp(cs, (const char *)pos, char_length_rec); - char_length_key = - cs->cset->lengthsp(cs, (const char *)key, char_length_key); + size_t rec_len = seg->length; + size_t key_len = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + hashing, we need to do the same thing here + for NO PAD collations. (If not, hash_sort will ignore + the spaces for us, so we don't need to do it here.) 
+ */ + rec_len = cs->cset->lengthsp(cs, (const char *)pos, rec_len); + key_len = cs->cset->lengthsp(cs, (const char *)key, key_len); + } + if (cs->coll->strnncollsp(cs, pos, rec_len, key, key_len)) return 1; } - - if (cs->coll->strnncollsp(cs, pos, char_length_rec, key, char_length_key)) - return 1; } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const uchar *pos = rec + seg->start; @@ -499,20 +482,17 @@ uint char_length_key = uint2korr(key); pos += pack_length; key += 2; /* skip key pack length */ - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - uint char_length1, char_length2; - char_length1 = char_length2 = seg->length / cs->mbmaxlen; - char_length1 = my_charpos(cs, key, key + char_length_key, char_length1); - char_length_key = std::min(char_length_key, char_length1); - char_length2 = my_charpos(cs, pos, pos + char_length_rec, char_length2); - char_length_rec = std::min(char_length_rec, char_length2); + char_length_rec = std::min(char_length_rec, uint(seg->length)); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(cs, pos, char_length_rec, key, + char_length_key, + seg->length / cs->mbmaxlen, false)) + return 1; } else { - char_length_rec = std::min(char_length_rec, uint(seg->length)); + if (cs->coll->strnncollsp(seg->charset, pos, char_length_rec, key, + char_length_key)) + return 1; } - - if (cs->coll->strnncollsp(seg->charset, pos, char_length_rec, key, - char_length_key)) - return 1; } else { if (memcmp(rec + seg->start, key, seg->length)) return 1; } diff -Nur mysql-server-8.0-orig/strings/ctype-big5.cc mysql-server-8.0/strings/ctype-big5.cc --- mysql-server-8.0-orig/strings/ctype-big5.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-big5.cc 2020-07-29 15:10:44.950317200 +0300 @@ -6501,7 +6501,9 @@ my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER 
my_charset_big5_handler = {nullptr, /* init */ ismbchar_big5, diff -Nur mysql-server-8.0-orig/strings/ctype-bin.cc mysql-server-8.0/strings/ctype-bin.cc --- mysql-server-8.0-orig/strings/ctype-bin.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-bin.cc 2020-07-29 15:10:44.964250700 +0300 @@ -462,7 +462,9 @@ my_strcasecmp_bin, my_instr_bin, my_hash_sort_8bit_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_binary_handler = { nullptr, /* init */ @@ -476,7 +478,9 @@ my_strcasecmp_bin, my_instr_bin, my_hash_sort_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = { nullptr, /* init */ diff -Nur mysql-server-8.0-orig/strings/ctype-cp932.cc mysql-server-8.0/strings/ctype-cp932.cc --- mysql-server-8.0-orig/strings/ctype-cp932.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-cp932.cc 2020-07-29 15:10:44.978209700 +0300 @@ -18776,7 +18776,9 @@ my_strcasecmp_8bit, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {nullptr, /* init */ ismbchar_cp932, diff -Nur mysql-server-8.0-orig/strings/ctype-czech.cc mysql-server-8.0/strings/ctype-czech.cc --- mysql-server-8.0-orig/strings/ctype-czech.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-czech.cc 2020-07-29 15:10:44.985232500 +0300 @@ -665,7 +665,9 @@ my_strcasecmp_8bit, my_instr_simple, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_latin2_czech_ci = { 2, diff -Nur mysql-server-8.0-orig/strings/ctype-euc_kr.cc mysql-server-8.0/strings/ctype-euc_kr.cc --- mysql-server-8.0-orig/strings/ctype-euc_kr.cc 2020-06-16 16:51:03.000000000 +0300 +++ 
mysql-server-8.0/strings/ctype-euc_kr.cc 2020-07-29 15:10:44.999154700 +0300 @@ -9425,7 +9425,9 @@ my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = { nullptr, /* init */ diff -Nur mysql-server-8.0-orig/strings/ctype-eucjpms.cc mysql-server-8.0/strings/ctype-eucjpms.cc --- mysql-server-8.0-orig/strings/ctype-eucjpms.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-eucjpms.cc 2020-07-29 15:10:45.018124700 +0300 @@ -36493,7 +36493,9 @@ my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {nullptr, /* init */ ismbchar_eucjpms, diff -Nur mysql-server-8.0-orig/strings/ctype-gb18030.cc mysql-server-8.0/strings/ctype-gb18030.cc --- mysql-server-8.0-orig/strings/ctype-gb18030.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-gb18030.cc 2020-07-29 15:10:45.033061800 +0300 @@ -20360,7 +20360,9 @@ my_strcasecmp_gb18030, my_instr_mb, my_hash_sort_gb18030, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_gb18030_handler = { nullptr, diff -Nur mysql-server-8.0-orig/strings/ctype-gb2312.cc mysql-server-8.0/strings/ctype-gb2312.cc --- mysql-server-8.0-orig/strings/ctype-gb2312.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-gb2312.cc 2020-07-29 15:10:45.047025800 +0300 @@ -6521,7 +6521,9 @@ my_strcasecmp_mb, /* instr */ my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {nullptr, /* init */ ismbchar_gb2312, diff -Nur mysql-server-8.0-orig/strings/ctype-gbk.cc mysql-server-8.0/strings/ctype-gbk.cc --- 
mysql-server-8.0-orig/strings/ctype-gbk.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-gbk.cc 2020-07-29 15:10:45.061985200 +0300
@@ -10009,7 +10009,9 @@
     my_strcasecmp_mb,
     my_instr_mb,
     my_hash_sort_simple,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 static MY_CHARSET_HANDLER my_charset_handler = {nullptr, /* init */
                                                 ismbchar_gbk,
diff -Nur mysql-server-8.0-orig/strings/ctype-latin1.cc mysql-server-8.0/strings/ctype-latin1.cc
--- mysql-server-8.0-orig/strings/ctype-latin1.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-latin1.cc 2020-07-29 15:10:45.066971900 +0300
@@ -630,7 +630,9 @@
     my_strcasecmp_8bit,
     my_instr_simple,
     my_hash_sort_latin1_de,
-    my_propagate_complex};
+    my_propagate_complex,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 CHARSET_INFO my_charset_latin1_german2_ci = {
     31,
diff -Nur mysql-server-8.0-orig/strings/ctype-mb.cc mysql-server-8.0/strings/ctype-mb.cc
--- mysql-server-8.0-orig/strings/ctype-mb.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-mb.cc 2020-07-29 15:10:45.073956100 +0300
@@ -580,6 +580,80 @@
 }
 
+/**
+  Octet length of the prefix of s made of its first count characters.
+
+  my_charpos() may return an offset larger than len when s holds fewer
+  than count characters, so the result is clamped to len.
+
+  @param cs     character set
+  @param s      string
+  @param len    length of s in bytes
+  @param count  number of characters wanted
+
+  @return number of bytes, never more than len
+*/
+static size_t my_prefix_octet_length(const CHARSET_INFO *cs, const uchar *s,
+                                     size_t len, size_t count) {
+  /* Single-byte character sets: count is not applied, all of s is used */
+  if (cs->mbmaxlen == 1) return len;
+  const size_t octet_len = my_charpos(cs, s, s + len, count);
+  return octet_len < len ? octet_len : len;
+}
+
+/**
+  Generic hash sort with count function.
+
+  Evaluates the octet length of the first count characters of s and
+  calls the collation's hash_sort function on that prefix.
+
+  @param         cs     collation
+  @param         s      string to hash
+  @param         len    length of s in bytes
+  @param         count  maximum number of characters to hash
+  @param[in,out] nr1    hash state passed on to hash_sort
+  @param[in,out] nr2    hash state passed on to hash_sort
+  @param         strip_trailing_spaces  strip the trailing spaces of the
+                                        prefix (lengthsp) before hashing
+*/
+void my_hash_sort_count(const CHARSET_INFO *cs, const uchar *s, size_t len,
+                        size_t count, uint64 *nr1, uint64 *nr2,
+                        bool strip_trailing_spaces) {
+  size_t octet_len = my_prefix_octet_length(cs, s, len, count);
+  if (strip_trailing_spaces)
+    octet_len = cs->cset->lengthsp(cs, (const char *)s, octet_len);
+  cs->coll->hash_sort(cs, s, octet_len, nr1, nr2);
+}
+
+/**
+  Generic space-aware comparison with count function.
+
+  Evaluates the octet length of the first count characters of both
+  strings and calls the collation's strnncollsp function on the prefixes.
+
+  @param cs     collation
+  @param s1     first string
+  @param len1   length of s1 in bytes
+  @param s2     second string
+  @param len2   length of s2 in bytes
+  @param count  maximum number of characters to compare
+  @param strip_trailing_spaces  strip the trailing spaces of both prefixes
+                                (lengthsp) before comparing
+
+  @return the result of strnncollsp on the two prefixes
+*/
+int my_strnncollsp_count(const CHARSET_INFO *cs, const uchar *s1, size_t len1,
+                         const uchar *s2, size_t len2, size_t count,
+                         bool strip_trailing_spaces) {
+  size_t octet_len1 = my_prefix_octet_length(cs, s1, len1, count);
+  size_t octet_len2 = my_prefix_octet_length(cs, s2, len2, count);
+  if (strip_trailing_spaces) {
+    octet_len1 = cs->cset->lengthsp(cs, (const char *)s1, octet_len1);
+    octet_len2 = cs->cset->lengthsp(cs, (const char *)s2, octet_len2);
+  }
+  return cs->coll->strnncollsp(cs, s1, octet_len1, s2, octet_len2);
+}
+
 /*
   Fill the given buffer with 'maximum character' for given charset
   SYNOPSIS
       pad_max_char()
@@ -1343,4 +1417,6 @@
     my_strcasecmp_mb_bin,
     my_instr_mb,
     my_hash_sort_mb_bin,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
diff -Nur mysql-server-8.0-orig/strings/ctype-simple.cc mysql-server-8.0/strings/ctype-simple.cc
--- mysql-server-8.0-orig/strings/ctype-simple.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-simple.cc 2020-07-29 15:10:45.081933600 +0300
@@ -1578,4 +1578,6 @@
     my_strcasecmp_8bit,
     my_instr_simple,
     my_hash_sort_simple,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
diff -Nur mysql-server-8.0-orig/strings/ctype-sjis.cc mysql-server-8.0/strings/ctype-sjis.cc
--- mysql-server-8.0-orig/strings/ctype-sjis.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-sjis.cc 2020-07-29 15:10:45.096891800 +0300
@@ -17987,7 +17987,9 @@
     my_strcasecmp_8bit,
     my_instr_mb,
     my_hash_sort_simple,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 static MY_CHARSET_HANDLER my_charset_handler = {nullptr, /* init */
                                                 ismbchar_sjis,
diff -Nur mysql-server-8.0-orig/strings/ctype-tis620.cc mysql-server-8.0/strings/ctype-tis620.cc
--- mysql-server-8.0-orig/strings/ctype-tis620.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-tis620.cc 2020-07-29 15:10:45.102903700 +0300
@@ -892,7 +892,9 @@
     my_strcasecmp_8bit,
     my_instr_simple, /* QQ: To be fixed */
     my_hash_sort_simple,
-    my_propagate_simple};
+
my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = { nullptr, /* init */ diff -Nur mysql-server-8.0-orig/strings/ctype-uca.cc mysql-server-8.0/strings/ctype-uca.cc --- mysql-server-8.0-orig/strings/ctype-uca.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-uca.cc 2020-07-29 15:10:45.113847600 +0300 @@ -5188,7 +5188,9 @@ nullptr, my_instr_mb, my_hash_sort_ucs2_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_ucs2_unicode_ci = { 128, @@ -6035,14 +6037,16 @@ my_coll_uninit_uca, my_strnncoll_any_uca, my_strnncollsp_any_uca, my_strnxfrm_any_uca, my_strnxfrmlen_simple, my_like_range_mb, my_wildcmp_uca, my_strcasecmp_uca, my_instr_mb, - my_hash_sort_any_uca, my_propagate_complex}; + my_hash_sort_any_uca, my_propagate_complex, my_hash_sort_count, + my_strnncollsp_count}; MY_COLLATION_HANDLER my_collation_uca_900_handler = { my_coll_init_uca, /* init */ my_coll_uninit_uca, my_strnncoll_uca_900, my_strnncollsp_uca_900, my_strnxfrm_uca_900, my_strnxfrmlen_uca_900, my_like_range_mb, my_wildcmp_uca, my_strcasecmp_uca, my_instr_mb, - my_hash_sort_uca_900, my_propagate_uca_900}; + my_hash_sort_uca_900, my_propagate_uca_900, my_hash_sort_count, + my_strnncollsp_count}; /* We consider bytes with code more than 127 as a letter. 
@@ -7771,7 +7775,9 @@ nullptr, my_instr_mb, my_hash_sort_any_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; extern MY_CHARSET_HANDLER my_charset_utf32_handler; @@ -8631,7 +8637,9 @@ nullptr, my_instr_mb, my_hash_sort_any_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; extern MY_CHARSET_HANDLER my_charset_utf16_handler; @@ -9490,7 +9498,9 @@ nullptr, my_instr_mb, my_hash_sort_any_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; /** The array used for "type of characters" bit mask for each @@ -11421,7 +11431,9 @@ my_strcasecmp_mb_bin, my_instr_mb, my_hash_sort_mb_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_utf8mb4_0900_bin = { 309, diff -Nur mysql-server-8.0-orig/strings/ctype-ucs2.cc mysql-server-8.0/strings/ctype-ucs2.cc --- mysql-server-8.0-orig/strings/ctype-ucs2.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-ucs2.cc 2020-07-29 15:10:45.124821100 +0300 @@ -1360,7 +1360,9 @@ my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf16, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_utf16_bin_handler = { nullptr, /* init */ @@ -1374,7 +1376,9 @@ my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf16_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; MY_CHARSET_HANDLER my_charset_utf16_handler = { nullptr, /* init */ @@ -2288,7 +2292,9 @@ my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf32, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_utf32_bin_handler = { nullptr, /* init */ @@ -2302,7 +2308,9 @@ my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf32, - my_propagate_simple}; 
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 MY_CHARSET_HANDLER my_charset_utf32_handler = {nullptr, /* init */
                                                my_ismbchar_utf32,
@@ -2823,7 +2831,9 @@
     my_strcasecmp_mb2_or_mb4,
     my_instr_mb,
     my_hash_sort_ucs2,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler = {
     nullptr, /* init */
@@ -2837,7 +2847,9 @@
     my_strcasecmp_mb2_or_mb4,
     my_instr_mb,
     my_hash_sort_ucs2_bin,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 MY_CHARSET_HANDLER my_charset_ucs2_handler = {nullptr, /* init */
                                               my_ismbchar_ucs2, /* ismbchar */
diff -Nur mysql-server-8.0-orig/strings/ctype-ujis.cc mysql-server-8.0/strings/ctype-ujis.cc
--- mysql-server-8.0-orig/strings/ctype-ujis.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-ujis.cc 2020-07-29 15:10:45.141773600 +0300
@@ -35787,7 +35787,9 @@
     my_strcasecmp_mb,
     my_instr_mb,
     my_hash_sort_simple,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 static MY_CHARSET_HANDLER my_charset_handler = {nullptr, /* init */
                                                 ismbchar_ujis,
diff -Nur mysql-server-8.0-orig/strings/ctype-utf8.cc mysql-server-8.0/strings/ctype-utf8.cc
--- mysql-server-8.0-orig/strings/ctype-utf8.cc 2020-06-16 16:51:03.000000000 +0300
+++ mysql-server-8.0/strings/ctype-utf8.cc 2020-07-29 15:55:13.170853000 +0300
@@ -37,6 +37,7 @@
 #include
 
 #include "m_ctype.h"
+#include "m_string.h"
 #include "my_byteorder.h"
 #include "my_compiler.h"
 #include "my_dbug.h"
@@ -5704,6 +5705,182 @@
   return (res > 1) ? res : 0;
 }
 
+/**
+  Mix one sort weight into the hash state: low byte first, then the rest.
+
+  @param[in,out] nr1  first hash accumulator
+  @param[in,out] nr2  second hash accumulator
+  @param         wc   weight to add
+*/
+static inline void my_hash_add_weight_utf8(ulong *nr1, ulong *nr2,
+                                           my_wc_t wc) {
+  *nr1 ^= (((*nr1 & 63) + *nr2) * (wc & 0xFF)) + (*nr1 << 8);
+  *nr2 += 3;
+  *nr1 ^= (((*nr1 & 63) + *nr2) * (wc >> 8)) + (*nr1 << 8);
+  *nr2 += 3;
+}
+
+/**
+  Hash the first count characters of a utf8 string.
+
+  Spaces at the end of the hashed prefix are never hashed, so that two
+  values which my_strnncollsp_count_utf8() reports as equal ('A ' and 'A',
+  or a prefix that ends in spaces) get the same hash.
+
+  @param         cs     collation; cs->caseinfo supplies the sort weights
+  @param         s      string to hash
+  @param         len    length of s in bytes
+  @param         count  maximum number of characters to hash
+  @param[in,out] nr1    first hash accumulator
+  @param[in,out] nr2    second hash accumulator
+  @param         skip_trailing  unused: trailing spaces are always skipped
+*/
+static void my_hash_sort_count_utf8(const CHARSET_INFO *cs, const uchar *s,
+                                    size_t len, size_t count, uint64 *nr1,
+                                    uint64 *nr2,
+                                    bool skip_trailing MY_ATTRIBUTE((unused))) {
+  const MY_UNICASE_INFO *uni_plane = cs->caseinfo;
+  /* Spaces at the very end of the string can be dropped up front */
+  const uchar *e = skip_trailing_space(s, len);
+  /* Spaces read but not hashed yet; only hashed if a non-space follows */
+  size_t pending_spaces = 0;
+  ulong tmp1 = *nr1;
+  ulong tmp2 = *nr2;
+  my_wc_t wc;
+  int res;
+
+  for (; count && s < e && (res = my_mb_wc_utf8(&wc, s, e)) > 0;
+       --count, s += res) {
+    if (wc == ' ') {
+      ++pending_spaces;
+      continue;
+    }
+    for (; pending_spaces; --pending_spaces) {
+      my_wc_t space_wc = ' ';
+      my_tosort_unicode(uni_plane, &space_wc, cs->state);
+      my_hash_add_weight_utf8(&tmp1, &tmp2, space_wc);
+    }
+    my_tosort_unicode(uni_plane, &wc, cs->state);
+    my_hash_add_weight_utf8(&tmp1, &tmp2, wc);
+  }
+
+  *nr1 = tmp1;
+  *nr2 = tmp2;
+}
+
+/**
+  Advance both strings over a common run of spaces, at most *count of them.
+
+  @param[in,out] sp     position in the first string
+  @param[in,out] tp     position in the second string
+  @param         se     end of the first string
+  @param         te     end of the second string
+  @param[in,out] count  characters left to compare, reduced by one per
+                        space skipped
+*/
+static inline void skip_space_count(const uchar **sp, const uchar **tp,
+                                    const uchar *const se,
+                                    const uchar *const te, size_t *count) {
+  /* Fast path: eight spaces at a time while both sides have them */
+  while (*sp + 8 < se && *tp + 8 < te && *count >= 8) {
+    uint64_t s, t;
+    memcpy(&s, *sp, 8);
+    memcpy(&t, *tp, 8);
+    if (s != 0x2020202020202020ULL || t != 0x2020202020202020ULL) break;
+
+    *sp += 8;
+    *tp += 8;
+    *count -= 8;
+  }
+  while (*count > 0 && *sp < se && *tp < te && **sp == 0x20 && **tp == 0x20) {
+    ++*sp;
+    ++*tp;
+    --*count;
+  }
+}
+
+/**
+  Compare the first count characters of two utf8 strings, PAD SPACE style:
+  if one side runs out first, the other is equal only if what is left of
+  it (within count) is spaces.
+
+  @param cs     collation; cs->caseinfo supplies the sort weights
+  @param s      first string
+  @param slen   length of s in bytes
+  @param t      second string
+  @param tlen   length of t in bytes
+  @param count  maximum number of characters to compare
+  @param skip_trailing  drop the trailing spaces of both strings first
+
+  @return 0 if the compared prefixes are equal, a negative value if s sorts
+          before t, a positive value otherwise
+*/
+static int my_strnncollsp_count_utf8(const CHARSET_INFO *cs, const uchar *s,
+                                     size_t slen, const uchar *t, size_t tlen,
+                                     size_t count, bool skip_trailing) {
+  int swap = 1;
+  my_wc_t s_wc = 0, t_wc = 0;
+  const uchar *se = skip_trailing ? skip_trailing_space(s, slen) : s + slen;
+  const uchar *te = skip_trailing ? skip_trailing_space(t, tlen) : t + tlen;
+  const MY_UNICASE_INFO *uni_plane = cs->caseinfo;
+
+  while (count && s < se && t < te) {
+    /* aggressive space skipping improves performance */
+    if (*s == ' ' && *t == ' ') {
+      skip_space_count(&s, &t, se, te, &count);
+      continue;
+    }
+
+    const int s_res = my_mb_wc_utf8(&s_wc, s, se);
+    const int t_res = my_mb_wc_utf8(&t_wc, t, te);
+
+    if (s_res <= 0 || t_res <= 0) {
+      /*
+        Incorrect string, compare byte by byte value. count is a number
+        of characters, not of bytes left, so it must not be allowed to
+        carry the comparison past the end of either string.
+      */
+      const size_t s_left = (size_t)(se - s);
+      const size_t t_left = (size_t)(te - t);
+      return bincmp(s, s + (count < s_left ? count : s_left), t,
+                    t + (count < t_left ? count : t_left));
+    }
+
+    my_tosort_unicode(uni_plane, &s_wc, cs->state);
+    my_tosort_unicode(uni_plane, &t_wc, cs->state);
+
+    if (s_wc != t_wc) {
+      return s_wc > t_wc ? 1 : -1;
+    }
+
+    s += s_res;
+    t += t_res;
+    --count;
+  }
+  if (count == 0 || (s == se && t == te)) return 0;
+
+  /* One side is exhausted: keep scanning the longer one through s */
+  if (se - s < te - t) {
+    s = t;
+    se = te;
+    swap = -1;
+  }
+  /*
+    The following loop uses the fact that in UTF-8
+    all multibyte characters are greater than space,
+    and all multibyte head characters are greater than
+    space. It means if we meet a character greater
+    than space, it always means that the longer string
+    is greater. So we can reuse the same loop from the
+    8bit version, without having to process full multibyte
+    sequences.
+  */
+  for (; count && s < se; --count, ++s) {
+    if (*s != ' ') return (*s < ' ') ? -swap : swap;
+  }
+  return 0;
+}
+
 static uint my_mbcharlen_utf8(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)),
                               uint c) {
   if (c < 0x80)
@@ -5731,7 +5908,10 @@
     my_strcasecmp_utf8,
     my_instr_mb,
     my_hash_sort_utf8,
-    my_propagate_complex};
+    my_propagate_complex,
+    my_hash_sort_count_utf8,
+    my_strnncollsp_count_utf8,
+};
 
 static MY_COLLATION_HANDLER my_collation_utf8_bin_handler = {
     nullptr, /* init */
@@ -5745,7 +5925,9 @@
     my_strcasecmp_mb_bin,
     my_instr_mb,
     my_hash_sort_mb_bin,
-    my_propagate_simple};
+    my_propagate_simple,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 MY_CHARSET_HANDLER my_charset_utf8_handler = {nullptr, /* init */
                                               my_ismbchar_utf8,
@@ -7006,7 +7188,10 @@
     my_strcasecmp_utf8,
     my_instr_mb,
     my_hash_sort_utf8,
-    my_propagate_complex};
+    my_propagate_complex,
+    my_hash_sort_count_utf8,
+    my_strnncollsp_count_utf8,
+};
 
 static MY_CHARSET_HANDLER my_charset_filename_handler = {
     nullptr, /* init */
@@ -7749,7 +7934,9 @@
     my_strcasecmp_utf8mb4,
     my_instr_mb,
     my_hash_sort_utf8mb4,
-    my_propagate_complex};
+    my_propagate_complex,
+    my_hash_sort_count,
+    my_strnncollsp_count};
 
 static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = {
     nullptr, /* init */
@@ -7763,7 +7950,9 @@
     my_strcasecmp_mb_bin,
     my_instr_mb,
     my_hash_sort_mb_bin,
-
my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; MY_CHARSET_HANDLER my_charset_utf8mb4_handler = {nullptr, /* init */ my_ismbchar_utf8mb4, diff -Nur mysql-server-8.0-orig/strings/ctype-win1250ch.cc mysql-server-8.0/strings/ctype-win1250ch.cc --- mysql-server-8.0-orig/strings/ctype-win1250ch.cc 2020-06-16 16:51:03.000000000 +0300 +++ mysql-server-8.0/strings/ctype-win1250ch.cc 2020-07-29 15:10:45.164711100 +0300 @@ -591,7 +591,9 @@ my_strcasecmp_8bit, my_instr_simple, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_cp1250_czech_ci = { 34,