commit 9fd8a33d3dd5508fab248786c5a397f66657cfb9 Author: Georgy Kirichenko Date: Wed Mar 11 11:03:50 2020 -0400 extend collation API with char count parameter This patch adds two functions, hash_sort_count and strnncollsp_count, which allow passing the count of characters to process. This approach removes the need to evaluate the octet length of a string before hashing and comparison. Req:RR2019112801637 https://git.huawei.com/dbs/ARM_tuning/issues/131 Change-Id: I07ff732e5f3ba946044cdcb52f7970326b593fc9 diff --git a/include/m_ctype.h b/include/m_ctype.h index ffd8860dd52..f4d216e9182 100644 --- a/include/m_ctype.h +++ b/include/m_ctype.h @@ -256,6 +256,12 @@ typedef struct MY_COLLATION_HANDLER { void (*hash_sort)(const CHARSET_INFO *cs, const uchar *key, size_t len, uint64 *nr1, uint64 *nr2); bool (*propagate)(const CHARSET_INFO *cs, const uchar *str, size_t len); + void (*hash_sort_count)(const CHARSET_INFO *cs, const uchar *s, size_t len, + size_t count, uint64 *nr1, uint64 *nr2, + bool strip_trailing_spaces); + int (*strnncollsp_count)(const CHARSET_INFO *, const uchar *, size_t, + const uchar *, size_t, size_t, + bool strip_trailing_spaces); } MY_COLLATION_HANDLER; extern MY_COLLATION_HANDLER my_collation_mb_bin_handler; @@ -587,6 +593,14 @@ void my_hash_sort_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), const uchar *key, size_t len, uint64 *nr1, uint64 *nr2); +void my_hash_sort_count(const CHARSET_INFO *cs, const uchar *s, size_t len, + size_t count, uint64 *nr1, uint64 *nr2, + bool strip_trailing_spaces); + +int my_strnncollsp_count(const CHARSET_INFO *cs, const uchar *s1, size_t len1, + const uchar *cs2, size_t len2, size_t count, + bool strip_trailing_spaces); + size_t my_strnxfrm_mb(const CHARSET_INFO *, uchar *dst, size_t dstlen, uint nweights, const uchar *src, size_t srclen, uint flags); diff --git a/sql/field.cc b/sql/field.cc index bd592c962a3..f060a3bc25c 100644 --- a/sql/field.cc +++ b/sql/field.cc @@ -6407,29 +6407,11 @@ bool
Field_string::compatible_field_size(uint field_metadata, } int Field_string::cmp(const uchar *a_ptr, const uchar *b_ptr) const { - size_t a_len, b_len; - - if (field_charset->mbmaxlen != 1) { - uint char_len = field_length / field_charset->mbmaxlen; - a_len = my_charpos(field_charset, a_ptr, a_ptr + field_length, char_len); - b_len = my_charpos(field_charset, b_ptr, b_ptr + field_length, char_len); - } else - a_len = b_len = field_length; - - if (field_charset->pad_attribute == NO_PAD && - !(table->in_use->variables.sql_mode & MODE_PAD_CHAR_TO_FULL_LENGTH)) { - /* - Our CHAR default behavior is to strip spaces. For PAD SPACE collations, - this doesn't matter, for but NO PAD, we need to do it ourselves here. - */ - a_len = field_charset->cset->lengthsp(field_charset, (const char *)a_ptr, - a_len); - b_len = field_charset->cset->lengthsp(field_charset, (const char *)b_ptr, - b_len); - } - - return field_charset->coll->strnncollsp(field_charset, a_ptr, a_len, b_ptr, - b_len); + return field_charset->coll->strnncollsp_count( + field_charset, a_ptr, field_length, b_ptr, field_length, + field_length / field_charset->mbmaxlen, + field_charset->pad_attribute == NO_PAD && + !(table->in_use->variables.sql_mode & MODE_PAD_CHAR_TO_FULL_LENGTH)); } size_t Field_string::make_sort_key(uchar *to, size_t length) const { diff --git a/storage/heap/hp_hash.cc b/storage/heap/hp_hash.cc index 0b3034dea0a..9615ef80a1a 100644 --- a/storage/heap/hp_hash.cc +++ b/storage/heap/hp_hash.cc @@ -243,38 +243,37 @@ uint64 hp_hashnr(HP_KEYDEF *keydef, const uchar *key) { } if (seg->type == HA_KEYTYPE_TEXT) { const CHARSET_INFO *cs = seg->charset; - size_t length = seg->length; - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - size_t char_length; - char_length = my_charpos(cs, pos, pos + length, length / cs->mbmaxlen); - set_if_smaller(length, char_length); - } - if (cs->pad_attribute == NO_PAD) { - /* - MySQL specifies that CHAR fields are stripped of - trailing spaces before being 
returned from the database. - Normally this is done in Field_string::val_str(), - but since we don't involve the Field classes for - hashing, we need to do the same thing here - for NO PAD collations. (If not, hash_sort will ignore - the spaces for us, so we don't need to do it here.) - */ - length = cs->cset->lengthsp(cs, (const char *)pos, length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos, seg->length, + seg->length / cs->mbmaxlen, &nr, &nr2, + cs->pad_attribute == NO_PAD); + } else { + size_t length = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + hashing, we need to do the same thing here + for NO PAD collations. (If not, hash_sort will ignore + the spaces for us, so we don't need to do it here.) + */ + length = cs->cset->lengthsp(cs, (const char *)pos, length); + } + cs->coll->hash_sort(cs, pos, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos, length, &nr, &nr2); } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const CHARSET_INFO *cs = seg->charset; uint pack_length = 2; /* Key packing is constant */ size_t length = uint2korr(pos); - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - size_t char_length; - char_length = - my_charpos(cs, pos + pack_length, pos + pack_length + length, - seg->length / cs->mbmaxlen); - set_if_smaller(length, char_length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos + pack_length, length, + seg->length / cs->mbmaxlen, &nr, &nr2, false); + } else { + cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); key += pack_length; } else { for (; pos < key; pos++) { @@ -303,38 +302,37 @@ uint64 hp_rec_hashnr(HP_KEYDEF *keydef, const uchar *rec) { } if 
(seg->type == HA_KEYTYPE_TEXT) { const CHARSET_INFO *cs = seg->charset; - size_t char_length = seg->length; - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - char_length = - my_charpos(cs, pos, pos + char_length, char_length / cs->mbmaxlen); - set_if_smaller(char_length, seg->length); /* QQ: ok to remove? */ - } - if (cs->pad_attribute == NO_PAD) { - /* - MySQL specifies that CHAR fields are stripped of - trailing spaces before being returned from the database. - Normally this is done in Field_string::val_str(), - but since we don't involve the Field classes for - hashing, we need to do the same thing here - for NO PAD collations. (If not, hash_sort will ignore - the spaces for us, so we don't need to do it here.) - */ - char_length = cs->cset->lengthsp(cs, (const char *)pos, char_length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos, seg->length, + seg->length / cs->mbmaxlen, &nr, &nr2, + cs->pad_attribute == NO_PAD); + } else { + size_t length = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + hashing, we need to do the same thing here + for NO PAD collations. (If not, hash_sort will ignore + the spaces for us, so we don't need to do it here.) + */ + length = cs->cset->lengthsp(cs, (const char *)pos, length); + } + cs->coll->hash_sort(cs, pos, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos, char_length, &nr, &nr2); } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const CHARSET_INFO *cs = seg->charset; uint pack_length = seg->bit_start; size_t length = (pack_length == 1 ? 
(uint)*pos : uint2korr(pos)); - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - size_t char_length; - char_length = - my_charpos(cs, pos + pack_length, pos + pack_length + length, - seg->length / cs->mbmaxlen); - set_if_smaller(length, char_length); + if (seg->flag & HA_PART_KEY_SEG) { + cs->coll->hash_sort_count(cs, pos + pack_length, length, + seg->length / cs->mbmaxlen, &nr, &nr2, false); + } else { + cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); } - cs->coll->hash_sort(cs, pos + pack_length, length, &nr, &nr2); } else { for (; pos < end; pos++) { nr ^= (uint64)((((uint)nr & 63) + nr2) * ((uint)*pos)) + (nr << 8); @@ -372,34 +370,31 @@ int hp_rec_key_cmp(HP_KEYDEF *keydef, const uchar *rec1, const uchar *rec2) { } if (seg->type == HA_KEYTYPE_TEXT) { const CHARSET_INFO *cs = seg->charset; - size_t char_length1; - size_t char_length2; const uchar *pos1 = rec1 + seg->start; const uchar *pos2 = rec2 + seg->start; - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - size_t char_length = seg->length / cs->mbmaxlen; - char_length1 = my_charpos(cs, pos1, pos1 + seg->length, char_length); - set_if_smaller(char_length1, seg->length); - char_length2 = my_charpos(cs, pos2, pos2 + seg->length, char_length); - set_if_smaller(char_length2, seg->length); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(cs, pos1, seg->length, pos2, + seg->length, seg->length / cs->mbmaxlen, + cs->pad_attribute == NO_PAD)) + return 1; } else { - char_length1 = char_length2 = seg->length; - } - if (cs->pad_attribute == NO_PAD) { - /* - MySQL specifies that CHAR fields are stripped of - trailing spaces before being returned from the database. - Normally this is done in Field_string::val_str(), - but since we don't involve the Field classes for - internal comparisons, we need to do the same thing here - for NO PAD collations. (If not, strnncollsp will ignore - the spaces for us, so we don't need to do it here.) 
- */ - char_length1 = cs->cset->lengthsp(cs, (const char *)pos1, char_length1); - char_length2 = cs->cset->lengthsp(cs, (const char *)pos2, char_length2); + size_t length1 = seg->length; + size_t length2 = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + internal comparisons, we need to do the same thing here + for NO PAD collations. (If not, strnncollsp will ignore + the spaces for us, so we don't need to do it here.) + */ + length1 = cs->cset->lengthsp(cs, (const char *)pos1, length1); + length2 = cs->cset->lengthsp(cs, (const char *)pos2, length2); + } + if (cs->coll->strnncollsp(cs, pos1, length1, pos2, length2)) return 1; } - if (cs->coll->strnncollsp(cs, pos1, char_length1, pos2, char_length2)) - return 1; } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const uchar *pos1 = rec1 + seg->start; @@ -416,19 +411,16 @@ int hp_rec_key_cmp(HP_KEYDEF *keydef, const uchar *rec1, const uchar *rec2) { pos1 += 2; pos2 += 2; } - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - uint safe_length1 = char_length1; - uint safe_length2 = char_length2; - uint char_length = seg->length / cs->mbmaxlen; - char_length1 = my_charpos(cs, pos1, pos1 + char_length1, char_length); - set_if_smaller(char_length1, safe_length1); - char_length2 = my_charpos(cs, pos2, pos2 + char_length2, char_length); - set_if_smaller(char_length2, safe_length2); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(seg->charset, pos1, char_length1, pos2, + char_length2, + seg->length / cs->mbmaxlen, false)) + return 1; + } else { + if (cs->coll->strnncollsp(seg->charset, pos1, char_length1, pos2, + char_length2)) + return 1; } - - if (cs->coll->strnncollsp(seg->charset, pos1, char_length1, pos2, - char_length2)) - return 1; } else { if (memcmp(rec1 +
seg->start, rec2 + seg->start, seg->length)) return 1; } @@ -454,38 +446,30 @@ int hp_key_cmp(HP_KEYDEF *keydef, const uchar *rec, const uchar *key) { } if (seg->type == HA_KEYTYPE_TEXT) { const CHARSET_INFO *cs = seg->charset; - uint char_length_key; - uint char_length_rec; const uchar *pos = rec + seg->start; - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - uint char_length = seg->length / cs->mbmaxlen; - char_length_key = my_charpos(cs, key, key + seg->length, char_length); - set_if_smaller(char_length_key, seg->length); - char_length_rec = my_charpos(cs, pos, pos + seg->length, char_length); - set_if_smaller(char_length_rec, seg->length); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(cs, pos, seg->length, key, seg->length, + seg->length / cs->mbmaxlen, + cs->pad_attribute == NO_PAD)) + return 1; } else { - char_length_key = seg->length; - char_length_rec = seg->length; - } - - if (cs->pad_attribute == NO_PAD) { - /* - MySQL specifies that CHAR fields are stripped of - trailing spaces before being returned from the database. - Normally this is done in Field_string::val_str(), - but since we don't involve the Field classes for - internal comparisons, we need to do the same thing here - for NO PAD collations. (If not, strnncollsp will ignore - the spaces for us, so we don't need to do it here.) - */ - char_length_rec = - cs->cset->lengthsp(cs, (const char *)pos, char_length_rec); - char_length_key = - cs->cset->lengthsp(cs, (const char *)key, char_length_key); + size_t rec_len = seg->length; + size_t key_len = seg->length; + if (cs->pad_attribute == NO_PAD) { + /* + MySQL specifies that CHAR fields are stripped of + trailing spaces before being returned from the database. + Normally this is done in Field_string::val_str(), + but since we don't involve the Field classes for + internal comparisons, we need to do the same thing here + for NO PAD collations. (If not, strnncollsp will ignore + the spaces for us, so we don't need to do it here.)
+ */ + rec_len = cs->cset->lengthsp(cs, (const char *)pos, rec_len); + key_len = cs->cset->lengthsp(cs, (const char *)key, key_len); + } + if (cs->coll->strnncollsp(cs, pos, rec_len, key, key_len)) return 1; } - - if (cs->coll->strnncollsp(cs, pos, char_length_rec, key, char_length_key)) - return 1; } else if (seg->type == HA_KEYTYPE_VARTEXT1) /* Any VARCHAR segments */ { const uchar *pos = rec + seg->start; @@ -496,20 +480,17 @@ int hp_key_cmp(HP_KEYDEF *keydef, const uchar *rec, const uchar *key) { uint char_length_key = uint2korr(key); pos += pack_length; key += 2; /* skip key pack length */ - if (cs->mbmaxlen > 1 && (seg->flag & HA_PART_KEY_SEG)) { - uint char_length1, char_length2; - char_length1 = char_length2 = seg->length / cs->mbmaxlen; - char_length1 = my_charpos(cs, key, key + char_length_key, char_length1); - set_if_smaller(char_length_key, char_length1); - char_length2 = my_charpos(cs, pos, pos + char_length_rec, char_length2); - set_if_smaller(char_length_rec, char_length2); + set_if_smaller(char_length_rec, seg->length); + if (seg->flag & HA_PART_KEY_SEG) { + if (cs->coll->strnncollsp_count(cs, pos, char_length_rec, key, + char_length_key, + seg->length / cs->mbmaxlen, false)) + return 1; } else { - set_if_smaller(char_length_rec, seg->length); + if (cs->coll->strnncollsp(seg->charset, pos, char_length_rec, key, + char_length_key)) + return 1; } - - if (cs->coll->strnncollsp(seg->charset, pos, char_length_rec, key, - char_length_key)) - return 1; } else { if (memcmp(rec + seg->start, key, seg->length)) return 1; } diff --git a/strings/ctype-big5.cc b/strings/ctype-big5.cc index 9ce5b566c4a..0f91c25158d 100644 --- a/strings/ctype-big5.cc +++ b/strings/ctype-big5.cc @@ -6482,7 +6482,9 @@ static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler = { my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_big5_handler = 
{NULL, /* init */ ismbchar_big5, diff --git a/strings/ctype-bin.cc b/strings/ctype-bin.cc index 4afc29cf662..f1ba1651af0 100644 --- a/strings/ctype-bin.cc +++ b/strings/ctype-bin.cc @@ -462,7 +462,9 @@ MY_COLLATION_HANDLER my_collation_8bit_bin_handler = { my_strcasecmp_bin, my_instr_bin, my_hash_sort_8bit_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_binary_handler = { nullptr, /* init */ @@ -476,7 +478,9 @@ static MY_COLLATION_HANDLER my_collation_binary_handler = { my_strcasecmp_bin, my_instr_bin, my_hash_sort_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = { NULL, /* init */ diff --git a/strings/ctype-cp932.cc b/strings/ctype-cp932.cc index 1539aca9e2b..dc15346d9f5 100644 --- a/strings/ctype-cp932.cc +++ b/strings/ctype-cp932.cc @@ -18759,7 +18759,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = { my_strcasecmp_8bit, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {NULL, /* init */ ismbchar_cp932, diff --git a/strings/ctype-czech.cc b/strings/ctype-czech.cc index d849d954869..2166c38a2ab 100644 --- a/strings/ctype-czech.cc +++ b/strings/ctype-czech.cc @@ -665,7 +665,9 @@ static MY_COLLATION_HANDLER my_collation_latin2_czech_ci_handler = { my_strcasecmp_8bit, my_instr_simple, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_latin2_czech_ci = { 2, diff --git a/strings/ctype-euc_kr.cc b/strings/ctype-euc_kr.cc index 4586707d478..f1e775dff82 100644 --- a/strings/ctype-euc_kr.cc +++ b/strings/ctype-euc_kr.cc @@ -9408,7 +9408,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = { my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - 
my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = { NULL, /* init */ diff --git a/strings/ctype-eucjpms.cc b/strings/ctype-eucjpms.cc index 6d8d7af2b5a..b567e81f821 100644 --- a/strings/ctype-eucjpms.cc +++ b/strings/ctype-eucjpms.cc @@ -36466,7 +36466,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = { my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {NULL, /* init */ ismbchar_eucjpms, diff --git a/strings/ctype-gb18030.cc b/strings/ctype-gb18030.cc index c3ded4d7562..eedfdb0f8a2 100644 --- a/strings/ctype-gb18030.cc +++ b/strings/ctype-gb18030.cc @@ -20360,7 +20360,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = {nullptr, my_strcasecmp_gb18030, my_instr_mb, my_hash_sort_gb18030, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_gb18030_handler = { NULL, diff --git a/strings/ctype-gb2312.cc b/strings/ctype-gb2312.cc index 6f376797a88..aaf58a146e3 100644 --- a/strings/ctype-gb2312.cc +++ b/strings/ctype-gb2312.cc @@ -6504,7 +6504,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = { my_strcasecmp_mb, /* instr */ my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {NULL, /* init */ ismbchar_gb2312, diff --git a/strings/ctype-gbk.cc b/strings/ctype-gbk.cc index 957aaaffad8..98fc93f56b7 100644 --- a/strings/ctype-gbk.cc +++ b/strings/ctype-gbk.cc @@ -9990,7 +9990,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = {nullptr, /* init */ my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER 
my_charset_handler = {NULL, /* init */ ismbchar_gbk, diff --git a/strings/ctype-latin1.cc b/strings/ctype-latin1.cc index 05b59be1ef9..cee267b7490 100644 --- a/strings/ctype-latin1.cc +++ b/strings/ctype-latin1.cc @@ -621,7 +621,9 @@ static MY_COLLATION_HANDLER my_collation_german2_ci_handler = { my_strcasecmp_8bit, my_instr_simple, my_hash_sort_latin1_de, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_latin1_german2_ci = { 31, diff --git a/strings/ctype-mb.cc b/strings/ctype-mb.cc index 2d14465bff7..f3f73f380d9 100644 --- a/strings/ctype-mb.cc +++ b/strings/ctype-mb.cc @@ -576,6 +576,35 @@ void my_hash_sort_mb_bin(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), } } +/* + Generic hash sort with count function. + This function evaluates the octet length of the first count + characters and calls hash_sort function. + */ +void my_hash_sort_count(const CHARSET_INFO *cs, const uchar *s, size_t len, + size_t count, uint64 *nr1, uint64 *nr2, + bool strip_trailing_spaces) { + size_t octet_len = + cs->mbmaxlen == 1 ? len : my_charpos(cs, s, s + len, count); + if (strip_trailing_spaces) + octet_len = cs->cset->lengthsp(cs, (const char *)s, octet_len); + return cs->coll->hash_sort(cs, s, octet_len, nr1, nr2); +} + +int my_strnncollsp_count(const CHARSET_INFO *cs, const uchar *s1, size_t len1, + const uchar *s2, size_t len2, size_t count, + bool strip_trailing_spaces) { + size_t octet_len1 = + cs->mbmaxlen == 1 ? len1 : my_charpos(cs, s1, s1 + len1, count); + size_t octet_len2 = + cs->mbmaxlen == 1 ? 
len2 : my_charpos(cs, s2, s2 + len2, count); + if (strip_trailing_spaces) { + octet_len1 = cs->cset->lengthsp(cs, (const char *)s1, octet_len1); + octet_len2 = cs->cset->lengthsp(cs, (const char *)s2, octet_len2); + } + return cs->coll->strnncollsp(cs, s1, octet_len1, s2, octet_len2); +} + /* Fill the given buffer with 'maximum character' for given charset SYNOPSIS @@ -1325,4 +1354,6 @@ MY_COLLATION_HANDLER my_collation_mb_bin_handler = {nullptr, /* init */ my_strcasecmp_mb_bin, my_instr_mb, my_hash_sort_mb_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; diff --git a/strings/ctype-simple.cc b/strings/ctype-simple.cc index 6a9dbbb5d59..eeb38a93a63 100644 --- a/strings/ctype-simple.cc +++ b/strings/ctype-simple.cc @@ -1577,4 +1577,6 @@ MY_COLLATION_HANDLER my_collation_8bit_simple_ci_handler = { my_strcasecmp_8bit, my_instr_simple, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; diff --git a/strings/ctype-sjis.cc b/strings/ctype-sjis.cc index 44ff9ec5249..f9611495919 100644 --- a/strings/ctype-sjis.cc +++ b/strings/ctype-sjis.cc @@ -17970,7 +17970,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = { my_strcasecmp_8bit, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {NULL, /* init */ ismbchar_sjis, diff --git a/strings/ctype-tis620.cc b/strings/ctype-tis620.cc index 4ba983ffb3b..137b2d87d11 100644 --- a/strings/ctype-tis620.cc +++ b/strings/ctype-tis620.cc @@ -880,7 +880,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = { my_strcasecmp_8bit, my_instr_simple, /* QQ: To be fixed */ my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = { NULL, /* init */ diff --git a/strings/ctype-uca.cc 
b/strings/ctype-uca.cc index ff262219706..9cfcc83a74d 100644 --- a/strings/ctype-uca.cc +++ b/strings/ctype-uca.cc @@ -5203,7 +5203,9 @@ MY_COLLATION_HANDLER my_collation_ucs2_uca_handler = { NULL, my_instr_mb, my_hash_sort_ucs2_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_ucs2_unicode_ci = { 128, @@ -6050,14 +6052,16 @@ MY_COLLATION_HANDLER my_collation_any_uca_handler = { my_coll_uninit_uca, my_strnncoll_any_uca, my_strnncollsp_any_uca, my_strnxfrm_any_uca, my_strnxfrmlen_simple, my_like_range_mb, my_wildcmp_uca, my_strcasecmp_uca, my_instr_mb, - my_hash_sort_any_uca, my_propagate_complex}; + my_hash_sort_any_uca, my_propagate_complex, my_hash_sort_count, + my_strnncollsp_count}; MY_COLLATION_HANDLER my_collation_uca_900_handler = { my_coll_init_uca, /* init */ my_coll_uninit_uca, my_strnncoll_uca_900, my_strnncollsp_uca_900, my_strnxfrm_uca_900, my_strnxfrmlen_uca_900, my_like_range_mb, my_wildcmp_uca, my_strcasecmp_uca, my_instr_mb, - my_hash_sort_uca_900, my_propagate_uca_900}; + my_hash_sort_uca_900, my_propagate_uca_900, my_hash_sort_count, + my_strnncollsp_count}; /* We consider bytes with code more than 127 as a letter. 
@@ -7786,7 +7790,9 @@ MY_COLLATION_HANDLER my_collation_utf32_uca_handler = { NULL, my_instr_mb, my_hash_sort_any_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; extern MY_CHARSET_HANDLER my_charset_utf32_handler; @@ -8646,7 +8652,9 @@ MY_COLLATION_HANDLER my_collation_utf16_uca_handler = { NULL, my_instr_mb, my_hash_sort_any_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; extern MY_CHARSET_HANDLER my_charset_utf16_handler; @@ -9505,7 +9513,9 @@ MY_COLLATION_HANDLER my_collation_gb18030_uca_handler = { NULL, my_instr_mb, my_hash_sort_any_uca, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; /** The array used for "type of characters" bit mask for each @@ -11436,7 +11446,9 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_0900_bin_handler = { my_strcasecmp_mb_bin, my_instr_mb, my_hash_sort_mb_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_utf8mb4_0900_bin = { 309, diff --git a/strings/ctype-ucs2.cc b/strings/ctype-ucs2.cc index 930d8222cd9..fe93bafb07d 100644 --- a/strings/ctype-ucs2.cc +++ b/strings/ctype-ucs2.cc @@ -1360,7 +1360,9 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler = { my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf16, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_utf16_bin_handler = { nullptr, /* init */ @@ -1374,7 +1376,9 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler = { my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf16_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; MY_CHARSET_HANDLER my_charset_utf16_handler = { NULL, /* init */ @@ -2290,7 +2294,9 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler = { 
my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf32, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_utf32_bin_handler = { nullptr, /* init */ @@ -2304,7 +2310,9 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler = { my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_utf32, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; MY_CHARSET_HANDLER my_charset_utf32_handler = {NULL, /* init */ my_ismbchar_utf32, @@ -2825,7 +2833,9 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler = { my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_ucs2, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler = { nullptr, /* init */ @@ -2839,7 +2849,9 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler = { my_strcasecmp_mb2_or_mb4, my_instr_mb, my_hash_sort_ucs2_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; MY_CHARSET_HANDLER my_charset_ucs2_handler = {NULL, /* init */ my_ismbchar_ucs2, /* ismbchar */ diff --git a/strings/ctype-ujis.cc b/strings/ctype-ujis.cc index fd54b785c9d..9847adefc4e 100644 --- a/strings/ctype-ujis.cc +++ b/strings/ctype-ujis.cc @@ -35757,7 +35757,9 @@ static MY_COLLATION_HANDLER my_collation_ci_handler = { my_strcasecmp_mb, my_instr_mb, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; static MY_CHARSET_HANDLER my_charset_handler = {NULL, /* init */ ismbchar_ujis, diff --git a/strings/ctype-utf8.cc b/strings/ctype-utf8.cc index 5157da58e73..4544ecc665f 100644 --- a/strings/ctype-utf8.cc +++ b/strings/ctype-utf8.cc @@ -5936,6 +5936,123 @@ static uint my_ismbchar_utf8(const CHARSET_INFO *, const char *b, return (res > 1) ? 
res : 0; } +static void my_hash_sort_count_utf8(const CHARSET_INFO *cs, const uchar *s, + size_t len, size_t count, uint64 *nr1, + uint64 *nr2, + bool skip_trailing MY_ATTRIBUTE((unused))) { + my_wc_t wc; + int res; + const MY_UNICASE_INFO *uni_plane = cs->caseinfo; + ulong tmp1; + ulong tmp2; + + /* + Remove end space. We have to do this to be able to compare + 'A ' and 'A' as identical + */ + const uchar *e = skip_trailing_space(s, len); + + tmp1 = *nr1; + tmp2 = *nr2; + + while ((count--) && (s < e) && + (res = my_mb_wc_utf8(&wc, (uchar *)s, (uchar *)e)) > 0) { + my_tosort_unicode(uni_plane, &wc, cs->state); + tmp1 ^= (((tmp1 & 63) + tmp2) * (wc & 0xFF)) + (tmp1 << 8); + tmp2 += 3; + tmp1 ^= (((tmp1 & 63) + tmp2) * (wc >> 8)) + (tmp1 << 8); + tmp2 += 3; + s += res; + } + + *nr1 = tmp1; + *nr2 = tmp2; +} + +static inline void skip_space_count(const uchar **sp, const uchar **tp, + const uchar *const se, + const uchar *const te, size_t *count) { + while (*sp + 8 < se && *tp + 8 < te && *count >= 8) { + uint64_t s, t; + memcpy(&s, *sp, 8); + memcpy(&t, *tp, 8); + if (s != 0x2020202020202020ULL || t != 0x2020202020202020ULL) break; + + *sp += 8; + *tp += 8; + *count -= 8; + } + while (*count > 0 && *sp < se && *tp < te && **sp == 0x20 && **tp == 0x20) { + ++*sp; + ++*tp; + --*count; + } +} + +static int my_strnncollsp_count_utf8(const CHARSET_INFO *cs, const uchar *s, + size_t slen, const uchar *t, size_t tlen, + size_t count, bool skip_trailing) { + int s_res, t_res, res, swap = 1; + my_wc_t s_wc = 0, t_wc = 0; + const uchar *se = skip_trailing ? skip_trailing_space(s, slen) : s + slen; + const uchar *te = skip_trailing ? 
skip_trailing_space(t, tlen) : t + tlen; + const MY_UNICASE_INFO *uni_plane = cs->caseinfo; + + while (count && s < se && t < te) { + /* aggressive space skipping improves performance */ + if (*s == ' ' && *t == ' ') { + skip_space_count(&s, &t, se, te, &count); + continue; + } + + s_res = my_mb_wc_utf8(&s_wc, s, se); + t_res = my_mb_wc_utf8(&t_wc, t, te); + + if (s_res <= 0 || t_res <= 0) { + /* Incorrect string, compare byte by byte value */ + return bincmp(s, s + count, t, t + count); + } + + my_tosort_unicode(uni_plane, &s_wc, cs->state); + my_tosort_unicode(uni_plane, &t_wc, cs->state); + + if (s_wc != t_wc) { + return s_wc > t_wc ? 1 : -1; + } + + s += s_res; + t += t_res; + --count; + } + if (count == 0 || (s == se && t == te)) return 0; + + slen = (size_t)(se - s); + tlen = (size_t)(te - t); + res = 0; + + if (slen < tlen) { + slen = tlen; + s = t; + se = te; + swap = -1; + res = -res; + } + /* + The following loop uses the fact that in UTF-8 + all multibyte characters are greater than space, + and all multibyte head characters are greater than + space. It means if we meet a character greater + than space, it always means that the longer string + is greater. So we can reuse the same loop from the + 8bit version, without having to process full multibyte + sequences. + */ + for (; (count--) && s < se; s++) { + if (*s != ' ') return (*s < ' ') ?
-swap : swap; + } + return res; +} + static uint my_mbcharlen_utf8(const CHARSET_INFO *cs MY_ATTRIBUTE((unused)), uint c) { if (c < 0x80) @@ -5963,7 +6080,10 @@ static MY_COLLATION_HANDLER my_collation_utf8_general_ci_handler = { my_strcasecmp_utf8, my_instr_mb, my_hash_sort_utf8, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count_utf8, + my_strnncollsp_count_utf8, +}; static MY_COLLATION_HANDLER my_collation_utf8_bin_handler = { nullptr, /* init */ @@ -5977,7 +6097,9 @@ static MY_COLLATION_HANDLER my_collation_utf8_bin_handler = { my_strcasecmp_mb_bin, my_instr_mb, my_hash_sort_mb_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; MY_CHARSET_HANDLER my_charset_utf8_handler = {NULL, /* init */ my_ismbchar_utf8, @@ -7238,7 +7360,10 @@ static MY_COLLATION_HANDLER my_collation_filename_handler = { my_strcasecmp_utf8, my_instr_mb, my_hash_sort_utf8, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count_utf8, + my_strnncollsp_count_utf8, +}; static MY_CHARSET_HANDLER my_charset_filename_handler = { NULL, /* init */ @@ -7978,7 +8103,9 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_general_ci_handler = { my_strcasecmp_utf8mb4, my_instr_mb, my_hash_sort_utf8mb4, - my_propagate_complex}; + my_propagate_complex, + my_hash_sort_count, + my_strnncollsp_count}; static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = { nullptr, /* init */ @@ -7992,7 +8119,9 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = { my_strcasecmp_mb_bin, my_instr_mb, my_hash_sort_mb_bin, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; MY_CHARSET_HANDLER my_charset_utf8mb4_handler = {NULL, /* init */ my_ismbchar_utf8mb4, diff --git a/strings/ctype-win1250ch.cc b/strings/ctype-win1250ch.cc index 863966d60b7..7053b097e45 100644 --- a/strings/ctype-win1250ch.cc +++ b/strings/ctype-win1250ch.cc @@ -591,7 +591,9 @@ static MY_COLLATION_HANDLER 
my_collation_czech_ci_handler = { my_strcasecmp_8bit, my_instr_simple, my_hash_sort_simple, - my_propagate_simple}; + my_propagate_simple, + my_hash_sort_count, + my_strnncollsp_count}; CHARSET_INFO my_charset_cp1250_czech_ci = { 34,