From e559fde79fcc2aa71e98077f3e6520d9f37c9323 Mon Sep 17 00:00:00 2001
From: Dirkjan Bussink <d.bussink@gmail.com>
Date: Thu, 8 Feb 2024 10:03:08 +0100
Subject: [PATCH] Fix multibyte bugs in my_instr_mb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The my_instr_mb function makes the incorrect assumption in multiple
places that byte lengths for the input strings can be used to short cut
certain decisions. In the case of multibyte character sets and
collations, this can't be done though since characters with a different
byte length can be considered equal under collation rules.

Take for example 'a' and 'Å' which are considered equal under the
default MySQL 8 collation utf8mb4_0900_ai_ci:

mysql> select 'a' = 'Å';
+-------------+
| 'a' = 'Å'   |
+-------------+
|           1 |
+-------------+
1 row in set (0.00 sec)

The character 'a' is though encoded with 1 byte, but 'Å' is encoded with
3 bytes. When using LOCATE, it should also find the character:

mysql> select locate('a', 'Å');
+--------------------+
| locate('a', 'Å')   |
+--------------------+
|                  0 |
+--------------------+
1 row in set (0.00 sec)

It shows here that it's wrong, and it doesn't consider the character a
substring of the other character, which mismatches the behavior seen
when checking equality above.

This is due to a number of bugs. First of all, there is this check at
the beginning:

if (s_length <= b_length) {

To be explicit, both length variables here are byte lengths, not
character lengths of the inputs. When this check does not match and the
byte length of needle is longer than the haystack, the assumption is
made that no result can be found. This is wrong, because in case here of
a needle of 'Å' it will have 3 bytes, but the haystack of 'a' has 1
byte. We still need to search through with the proper collation
settings. Therefore the change here removes this check as it's an
incorrect optimization trying to short circuit.

There's another attempt at optimization here to reduce the maximum
string length searched:

end = b + b_length - s_length + 1;

This is also not correct because of the same reason. We can't assume
byte lengths match character lengths, so this is changed to:

end = b + b_length

to ensure that the whole haystack string is searched.

Lastly, there's another bug. The call to strnncoll again assumes that
you can truncate the haystack to the byte length. This goes wrong if the
needle is 'a' and the haystack is 'Å'. In that case, the haystack string
of 'Å' is 3 bytes long and will be cut off at 1 byte. This results in an
invalid UTF-8 input and no match on the search.

Instead, strnncoll has a prefix option, so we use that and pass in the
full string lenghts so that a proper search for a prefix is made.

The higher level Item_func_locate function also has a bug where
incorrectly byte length is used. The following is incorrect:

return start + 1;

As start here is just updated from the character position to the byte
position. Instead, we need to use start0 here to return the correct
offset.

These changes resolve the given bugs around multibyte characters and
substring searching.
---
 mysql-test/r/func_str.result | 25 +++++++++++++++
 mysql-test/t/func_str.test   | 12 ++++++++
 sql/item_func.cc             |  2 +-
 strings/ctype-mb.cc          | 60 +++++++++++++++++-------------------
 4 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/mysql-test/r/func_str.result b/mysql-test/r/func_str.result
index 72fdea0a3459..8dae718e58bd 100644
--- a/mysql-test/r/func_str.result
+++ b/mysql-test/r/func_str.result
@@ -5822,3 +5822,28 @@ SELECT QUOTE(x'80');
 ERROR HY000: Cannot convert string '\x80' from binary to utf8mb4
 SELECT QUOTE(_utf8mb4 x'80');
 ERROR HY000: Invalid utf8mb4 character string: '80'
+#
+# Bug: The locate function does not find correct substring
+#
+set names utf8mb4;
+select locate('a', 'Å');
+locate('a', 'Å')
+1
+select locate('Å', 'a');
+locate('Å', 'a')
+1
+select locate('aÅ', 'aÅ');
+locate('aÅ', 'aÅ')
+1
+select locate('Åa', 'Åa');
+locate('Åa', 'Åa')
+1
+select locate('ss', 'ß');
+locate('ss', 'ß')
+1
+select locate('ß', 'ss');
+locate('ß', 'ss')
+1
+select locate("", "ÅÅ", 2);
+locate("", "ÅÅ", 2)
+2
diff --git a/mysql-test/t/func_str.test b/mysql-test/t/func_str.test
index 58c5944c44e6..9620fa66b66d 100644
--- a/mysql-test/t/func_str.test
+++ b/mysql-test/t/func_str.test
@@ -2675,3 +2675,15 @@ SELECT QUOTE(NULL);
 SELECT QUOTE(x'80');
 --error ER_INVALID_CHARACTER_STRING
 SELECT QUOTE(_utf8mb4 x'80');
+
+--echo #
+--echo # Bug: The locate function does not find correct substring
+--echo #
+set names utf8mb4;
+select locate('a', 'Å');
+select locate('Å', 'a');
+select locate('aÅ', 'aÅ');
+select locate('Åa', 'Åa');
+select locate('ss', 'ß');
+select locate('ß', 'ss');
+select locate("", "ÅÅ", 2);
diff --git a/sql/item_func.cc b/sql/item_func.cc
index ac6875cbe6c7..002df36bf099 100644
--- a/sql/item_func.cc
+++ b/sql/item_func.cc
@@ -4212,7 +4212,7 @@ longlong Item_func_locate::val_int() {
   }
 
   if (!b->length())  // Found empty string at start
-    return start + 1;
+    return start0 + 1;
 
   if (!cs->coll->strstr(cs, a->ptr() + start,
                         static_cast<uint>(a->length() - start), b->ptr(),
diff --git a/strings/ctype-mb.cc b/strings/ctype-mb.cc
index b9e69874fba3..177c8e686a2d 100644
--- a/strings/ctype-mb.cc
+++ b/strings/ctype-mb.cc
@@ -359,42 +359,40 @@ unsigned my_instr_mb(const CHARSET_INFO *cs, const char *b, size_t b_length,
   const char *end, *b0;
   int res = 0;
 
-  if (s_length <= b_length) {
-    if (!s_length) {
-      if (nmatch) {
-        match->beg = 0;
-        match->end = 0;
-        match->mb_len = 0;
-      }
-      return 1; /* Empty string is always found */
+  if (!s_length) {
+    if (nmatch) {
+      match->beg = 0;
+      match->end = 0;
+      match->mb_len = 0;
     }
+    return 1; /* Empty string is always found */
+  }
 
-    b0 = b;
-    end = b + b_length - s_length + 1;
-
-    while (b < end) {
-      int mb_len;
-
-      if (!cs->coll->strnncoll(cs, pointer_cast<const uint8_t *>(b), s_length,
-                               pointer_cast<const uint8_t *>(s), s_length,
-                               false)) {
-        if (nmatch) {
-          match[0].beg = 0;
-          match[0].end = (unsigned)(b - b0);
-          match[0].mb_len = res;
-          if (nmatch > 1) {
-            match[1].beg = match[0].end;
-            match[1].end = match[0].end + (unsigned)s_length;
-            match[1].mb_len = 0; /* Not computed */
-          }
+  b0 = b;
+  end = b + b_length;
+
+  while (b < end) {
+    int mb_len;
+
+    if (!cs->coll->strnncoll(cs, pointer_cast<const uint8_t *>(b), b_length,
+                              pointer_cast<const uint8_t *>(s), s_length,
+                              true)) {
+      if (nmatch) {
+        match[0].beg = 0;
+        match[0].end = (unsigned)(b - b0);
+        match[0].mb_len = res;
+        if (nmatch > 1) {
+          match[1].beg = match[0].end;
+          match[1].end = match[0].end + (unsigned)s_length;
+          match[1].mb_len = 0; /* Not computed */
         }
-        return 2;
       }
-      mb_len = (mb_len = my_ismbchar(cs, b, end)) ? mb_len : 1;
-      b += mb_len;
-      b_length -= mb_len;
-      res++;
+      return 2;
     }
+    mb_len = (mb_len = my_ismbchar(cs, b, end)) ? mb_len : 1;
+    b += mb_len;
+    b_length -= mb_len;
+    res++;
   }
   return 0;
 }