Bug #74984 PARSER CONFUSES WITH 7BIT-CHARACTER STRING DETECTION
Submitted: 24 Nov 2014 15:11 Modified: 20 Feb 2015 17:58
Reporter: Gleb Shchepa Email Updates:
Status: Closed Impact on me:
None 
Category:MySQL Server: Parser Severity:S2 (Serious)
Version:5.7 OS:Any
Assigned to: CPU Architecture:Any

[24 Nov 2014 15:11] Gleb Shchepa
Description:
In some cases the parser collects string literals first and defers the encoding conversion operations -- this is a result of WL#7200's refactoring.
During those deferred operations the parser references a 7bit-ness flag that is cached by the lexical scanner:

PTI_text_literal_text_string::itemize():

   uint repertoire= thd->m_parser_state->m_lip.text_string_is_7bit() &&
                    my_charset_is_ascii_based(cs_cli) ?

However, m_lip.text_string_is_7bit() returns the 7bit status of the last processed text literal -- not the status of this->literal.
Thus, if an SQL statement contains a sequence of text literals in which some literals require encoding conversion but the last literal in the sequence is a pure 7bit string, then PTI_text_literal_text_string::itemize() may skip the conversion.

How to repeat:
CREATE DATABASE MYSQLTEST1 CHARACTER SET LATIN2;
USE MYSQLTEST1;
CREATE TABLE t1 (a VARCHAR(255) CHARACTER SET LATIN2);
SET CHARACTER SET cp1250_latin2;
INSERT INTO t1 VALUES ('£¥ª¯');
INSERT INTO t1 VALUES ('£¥ª¯' '');
SELECT HEX(a) FROM t1;
DROP DATABASE MYSQLTEST1;

The SELECT statement returns:

HEX(a)
A3A1AAAF
A3A5AAAF

This output is incorrect; both rows must contain the same value:

HEX(a)
A3A1AAAF
A3A1AAAF

Suggested fix:
Move the m_lip.text_string_is_7bit() call out of the deferred itemize() function and back into the parser grammar, so that the flag is captured when each literal is scanned:

diff --git a/sql/parse_tree_items.h b/sql/parse_tree_items.h
--- a/sql/parse_tree_items.h
+++ b/sql/parse_tree_items.h
@@ -489,10 +489,13 @@ class PTI_text_literal : public Item_string
   typedef Item_string super;
 
 protected:
+  bool is_7bit;
   LEX_STRING literal;
 
-  PTI_text_literal(const POS &pos, const LEX_STRING &literal_arg)
-  : super(pos), literal(literal_arg)
+  PTI_text_literal(const POS &pos,
+                   bool is_7bit_arg,
+                   const LEX_STRING &literal_arg)
+  : super(pos), is_7bit(is_7bit_arg), literal(literal_arg)
   {}
 };
 
@@ -502,8 +505,10 @@ class PTI_text_literal_text_string : public PTI_text_literal
   typedef PTI_text_literal super;
 
 public:
-  PTI_text_literal_text_string(const POS &pos, const LEX_STRING &literal)
-  : super(pos, literal)
+  PTI_text_literal_text_string(const POS &pos,
+                               bool is_7bit_arg,
+                               const LEX_STRING &literal)
+  : super(pos, is_7bit_arg, literal)
   {}
 
   virtual bool itemize(Parse_context *pc, Item **res)
@@ -515,8 +520,7 @@ public:
     LEX_STRING tmp;
     const CHARSET_INFO *cs_con= thd->variables.collation_connection;
     const CHARSET_INFO *cs_cli= thd->variables.character_set_client;
-    uint repertoire= thd->m_parser_state->m_lip.text_string_is_7bit() &&
-                     my_charset_is_ascii_based(cs_cli) ?
+    uint repertoire= is_7bit && my_charset_is_ascii_based(cs_cli) ?
                      MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
     if (thd->charset_is_collation_connection ||
         (repertoire == MY_REPERTOIRE_ASCII &&
@@ -539,8 +543,10 @@ class PTI_text_literal_nchar_string : public PTI_text_literal
   typedef PTI_text_literal super;
 
 public:
-  PTI_text_literal_nchar_string(const POS &pos, const LEX_STRING &literal)
-  : super(pos, literal)
+  PTI_text_literal_nchar_string(const POS &pos,
+                                bool is_7bit_arg,
+                                const LEX_STRING &literal)
+  : super(pos, is_7bit_arg, literal)
   {}
 
   virtual bool itemize(Parse_context *pc, Item **res)
@@ -548,8 +554,7 @@ public:
     if (super::itemize(pc, res))
       return true;
 
-    uint repertoire= pc->thd->m_parser_state->m_lip.text_string_is_7bit() ?
-                     MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
+    uint repertoire= is_7bit ? MY_REPERTOIRE_ASCII : MY_REPERTOIRE_UNICODE30;
     DBUG_ASSERT(my_charset_is_ascii_based(national_charset_info));
     init(literal.str, literal.length, national_charset_info,
          DERIVATION_COERCIBLE, repertoire);
@@ -566,9 +571,10 @@ class PTI_text_literal_underscore_charset : public PTI_text_literal
 
 public:
   PTI_text_literal_underscore_charset(const POS &pos,
+                                      bool is_7bit_arg,
                                       const CHARSET_INFO *cs_arg,
                                       const LEX_STRING &literal)
-  : super(pos, literal), cs(cs_arg)
+  : super(pos, is_7bit_arg, literal), cs(cs_arg)
   {}
 
   virtual bool itemize(Parse_context *pc, Item **res)
@@ -592,9 +598,9 @@ class PTI_text_literal_concat : public PTI_text_literal
   PTI_text_literal *head;
 
 public:
-  PTI_text_literal_concat(const POS &pos,
+  PTI_text_literal_concat(const POS &pos, bool is_7bit_arg,
                           PTI_text_literal *head_arg, const LEX_STRING &tail)
-  : super(pos, tail), head(head_arg)
+  : super(pos, is_7bit_arg, tail), head(head_arg)
   {}
 
   virtual bool itemize(Parse_context *pc, Item **res)
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -12399,19 +12399,23 @@ load_data_set_elem:
 text_literal:
           TEXT_STRING
           {
-            $$= NEW_PTN PTI_text_literal_text_string(@$, $1);
+            $$= NEW_PTN PTI_text_literal_text_string(@$,
+                YYTHD->m_parser_state->m_lip.text_string_is_7bit(), $1);
           }
         | NCHAR_STRING
           {
-            $$= NEW_PTN PTI_text_literal_nchar_string(@$, $1);
+            $$= NEW_PTN PTI_text_literal_nchar_string(@$,
+                YYTHD->m_parser_state->m_lip.text_string_is_7bit(), $1);
           }
         | UNDERSCORE_CHARSET TEXT_STRING
           {
-            $$= NEW_PTN PTI_text_literal_underscore_charset(@$, $1, $2);
+            $$= NEW_PTN PTI_text_literal_underscore_charset(@$,
+                YYTHD->m_parser_state->m_lip.text_string_is_7bit(), $1, $2);
           }
         | text_literal TEXT_STRING_literal
           {
-            $$= NEW_PTN PTI_text_literal_concat(@$, $1, $2);
+            $$= NEW_PTN PTI_text_literal_concat(@$,
+                YYTHD->m_parser_state->m_lip.text_string_is_7bit(), $1, $2);
           }
         ;
[20 Feb 2015 17:58] Paul DuBois
Noted in 5.7.6 changelog entry.

During token processing, the parser's check for whether a token contained
only 7-bit data could be applied to the wrong token.