Bug #87100 "Incorrect string value" is diplayed instead of "Data too long" sometimes.
Submitted: 17 Jul 2017 20:13 Modified: 9 Jul 2018 19:21
Reporter: Taku Amano (OCA) Email Updates:
Status: Closed Impact on me:
None 
Category:MySQL Server: Errors Severity:S3 (Non-critical)
Version:5.7.17, 8.0, 5.7.18 OS:Any
Assigned to: CPU Architecture:Any

[17 Jul 2017 20:13] Taku Amano
Description:
"Incorrect string value" may be displayed even if it is a valid string when trying to insert a string of more than column size into a TEXT type multi-byte column.

This problem occurs when a sequence of one character was broken while truncating a string to the maximum size of a column.

How to repeat:
# Steps to reproduce

Execute the following SQL.
```
SET SQL_MODE="TRADITIONAL,ANSI";
CREATE TEMPORARY TABLE t (c TINYTEXT CHARACTER SET utf8mb4);
-- C385 : Capital A, ring
INSERT INTO t (c) VALUES (REPEAT(X'C385', 128));
```

# Expected result

ERROR 1406 (22001): Data too long for column 'c' at row 1

# Actual result

ERROR 1366 (HY000): Incorrect string value: '\xC3\x85' for column 'c' at row 1

# Other cases of same result

## When the same character set

```
$ perl -e '$v = "Å" x 128; print qq{SET SQL_MODE="TRADITIONAL"; CREATE TEMPORARY TABLE t (c TINYTEXT CHARACTER SET utf8mb4); INSERT INTO t (c) VALUES ("$v")}' | mysql --default-character-set utf8mb4 -uroot -ppassword movabletype
```

# Normal cases

## When the column size matches the boundary of the characters.

```
SET SQL_MODE="TRADITIONAL,ANSI";
CREATE TEMPORARY TABLE t (c TINYTEXT CHARACTER SET utf8mb4);
-- C385 : Capital A, ring
INSERT INTO t (c) VALUES (CONCAT('a', REPEAT(X'C385', 128)));
```

## When character sets (or encodings) are different

```
$ perl -e '$v = "Å" x 128; print qq{SET SQL_MODE="TRADITIONAL"; CREATE TEMPORARY TABLE t (c TINYTEXT CHARACTER SET utf8mb4); INSERT INTO t (c) VALUES ("$v")}' | mysql --default-character-set utf8 -uroot -ppassword movabletype
```

Suggested fix:
diff --git a/mysql-test/suite/engines/funcs/r/in_text_boundary_error.result b/mysql-test/suite/engines/funcs/r/in_text_boundary_error.result
new file mode 100644
index 00000000000..43f4b36254b
--- /dev/null
+++ b/mysql-test/suite/engines/funcs/r/in_text_boundary_error.result
@@ -0,0 +1,12 @@
+SET SQL_MODE="TRADITIONAL,ANSI";
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1(c1 TINYTEXT CHARACTER SET utf8mb4);
+INSERT INTO t1 (c1) VALUES(REPEAT('A', 255));
+INSERT INTO t1 (c1) VALUES(REPEAT('A', 256));
+ERROR 22001: Data too long for column 'c1' at row 1
+INSERT INTO t1 (c1) VALUES(REPEAT(X'C385', 127));
+INSERT INTO t1 (c1) VALUES(REPEAT(X'C385', 128));
+ERROR 22001: Data too long for column 'c1' at row 1
+INSERT INTO t1 (c1) VALUES(CONCAT('a', REPEAT(X'C385', 128)));
+ERROR 22001: Data too long for column 'c1' at row 1
+DROP TABLE t1;
diff --git a/mysql-test/suite/engines/funcs/t/in_text_boundary_error.test b/mysql-test/suite/engines/funcs/t/in_text_boundary_error.test
new file mode 100644
index 00000000000..f7f582283b3
--- /dev/null
+++ b/mysql-test/suite/engines/funcs/t/in_text_boundary_error.test
@@ -0,0 +1,14 @@
+SET SQL_MODE="TRADITIONAL,ANSI";
+--disable_warnings
+DROP TABLE IF EXISTS t1;
+--enable_warnings
+CREATE TABLE t1(c1 TINYTEXT CHARACTER SET utf8mb4);
+INSERT INTO t1 (c1) VALUES(REPEAT('A', 255));
+--error 1406
+INSERT INTO t1 (c1) VALUES(REPEAT('A', 256));
+INSERT INTO t1 (c1) VALUES(REPEAT(X'C385', 127));
+--error 1406
+INSERT INTO t1 (c1) VALUES(REPEAT(X'C385', 128));
+--error 1406
+INSERT INTO t1 (c1) VALUES(CONCAT('a', REPEAT(X'C385', 128)));
+DROP TABLE t1;
diff --git a/sql-common/sql_string.cc b/sql-common/sql_string.cc
index aa93e684f1e..a4d4ddf2b3a 100644
--- a/sql-common/sql_string.cc
+++ b/sql-common/sql_string.cc
@@ -1045,13 +1045,24 @@ size_t well_formed_copy_nchars(const CHARSET_INFO *to_cs,
         to_length-= to_cs->mbminlen;
       }
 
-      set_if_smaller(from_length, to_length);
-      res= to_cs->cset->well_formed_len(to_cs, from, from + from_length,
+      res= to_cs->cset->well_formed_len(to_cs,
+                                        from, from + min(from_length, to_length),
                                         nchars, &well_formed_error);
       if (res > 0)
         memmove(to, from, res);
       *from_end_pos= from + res;
-      *well_formed_error_pos= well_formed_error ? from + res : NULL;
+      if (well_formed_error) {
+        const char *from_end= from + min(res + to_cs->mbmaxlen, from_length);
+        size_t extra;
+
+        extra= to_cs->cset->well_formed_len(to_cs,
+                                            from + res, from_end,
+                                            1, &well_formed_error);
+        *well_formed_error_pos= extra == 0 ? from + res : NULL;
+      }
+      else {
+        *well_formed_error_pos= NULL;
+      }
       *cannot_convert_error_pos= NULL;
       if (from_offset)
         res+= to_cs->mbminlen;
[18 Jul 2017 7:59] MySQL Verification Team
Hello Taku AMANO,

Thank you for the report and test case.

Thanks,
Umesh
[18 Jul 2017 8:01] MySQL Verification Team
In order to submit contributions you must first sign the Oracle Contribution Agreement (OCA). For additional information please check http://www.oracle.com/technetwork/community/oca-486395.html.
If you have any questions, please contact the MySQL community team - https://www.mysql.com/about/contact/?topic=community.
[18 Jul 2017 23:50] Taku Amano
Revised version of suggested fix.

Attachment: check-extra-well-formed-char.patch (application/octet-stream, text), 3.21 KiB.

[18 Jul 2017 23:53] Taku Amano
Thank you for reviewing.
I have submitted a document of OCA at yesterday. (And not yet accepted)

I have revised a suggested fix.
check-extra-well-formed-char.patch
Skipping extra check for real incorrect string, by this patch.
Difference from first version to this patch.
```
diff --git a/sql-common/sql_string.cc b/sql-common/sql_string.cc
index a4d4ddf2b3a..f64daf88cbc 100644
--- a/sql-common/sql_string.cc
+++ b/sql-common/sql_string.cc
@@ -1008,6 +1008,7 @@ size_t well_formed_copy_nchars(const CHARSET_INFO *to_cs,
     {
       int well_formed_error;
       uint from_offset;
+      size_t min_length;
 
       if ((from_offset= (from_length % to_cs->mbminlen)) &&
           (from_cs == &my_charset_bin))
@@ -1045,24 +1046,25 @@ size_t well_formed_copy_nchars(const CHARSET_INFO *to_cs,
         to_length-= to_cs->mbminlen;
       }
 
+      min_length = min(from_length, to_length);
       res= to_cs->cset->well_formed_len(to_cs,
-                                        from, from + min(from_length, to_length),
+                                        from, from + min_length,
                                         nchars, &well_formed_error);
       if (res > 0)
         memmove(to, from, res);
       *from_end_pos= from + res;
-      if (well_formed_error) {
+      if (well_formed_error &&
+          (min_length - res < to_cs->mbmaxlen))
+      {
         const char *from_end= from + min(res + to_cs->mbmaxlen, from_length);
         size_t extra;
 
         extra= to_cs->cset->well_formed_len(to_cs,
-                                            from + res, from_end,
+                                            *from_end_pos, from_end,
                                             1, &well_formed_error);
-        *well_formed_error_pos= extra == 0 ? from + res : NULL;
-      }
-      else {
-        *well_formed_error_pos= NULL;
+        well_formed_error= extra >= 1 ? 0 : 1;
       }
+      *well_formed_error_pos= well_formed_error ? *from_end_pos : NULL;
       *cannot_convert_error_pos= NULL;
       if (from_offset)
         res+= to_cs->mbminlen;
```
[1 Aug 2017 9:06] Taku Amano
Revised version of suggested fix.

(*) I confirm the code being submitted is offered under the terms of the OCA, and that I am authorized to contribute it.

Contribution: check-extra-well-formed-char.patch (application/octet-stream, text), 3.21 KiB.

[23 Aug 2017 4:14] Taku Amano
Hello Umesh,

Do I need to take any action?

Regards,
Taku AMANO
[9 Jul 2018 19:21] Paul DuBois
Posted by developer:
 
Fixed in 8.0.13.

In some cases when ER_DATA_TOO_LONG was the appropriate error,
ER_TRUNCATED_WRONG_VALUE_FOR_FIELD was reported instead.