Bug #69328 Encoding problem : 1 database utf8 with 1 table utf8 and 1 table latin1
Submitted: 27 May 2013 16:01 Modified: 13 Jan 2023 23:49
Reporter: Lucas Pouzac Email Updates:
Status: Not a Bug Impact on me:
None 
Category:Connector / J Severity:S2 (Serious)
Version:5.1.25 OS:Any
Assigned to: Assigned Account CPU Architecture:Any

[27 May 2013 16:01] Lucas Pouzac
Description:
I try to get the same character ('é') from a table in latin1 and utf8 via the driver, but can not get there.

ResultSet.getBytes(...) is correct but no ResultSet.getString(...)

Below is my test case.

How to repeat:
###############################################
# CREATING DATABASE
###############################################
DROP DATABASE encoding;
CREATE DATABASE encoding CHARACTER SET utf8 COLLATE utf8_bin;

use encoding

CREATE TABLE utf8_table (
  id varchar(50) NOT NULL,
  data longtext NOT NULL)
  CHARACTER SET utf8 COLLATE utf8_bin;

SET NAMES utf8;

INSERT INTO utf8_table VALUES ("é", "é");

CREATE TABLE latin1_table (
  id varchar(50) NOT NULL,
  data longtext NOT NULL)
  CHARACTER SET latin1 COLLATE latin1_bin;

SET NAMES latin1;

INSERT INTO latin1_table VALUES ("é", "é");

GRANT ALL PRIVILEGES ON encoding.* TO 'encoding'@'localhost' IDENTIFIED BY 'encoding' WITH GRANT OPTION;
GRANT ALL PRIVILEGES ON encoding.* TO 'encoding'@'%' IDENTIFIED BY 'encoding' WITH GRANT OPTION;

###############################################
# CREATING DATABASE - RESULTS OK
###############################################

SET NAMES utf8;
SELECT * FROM utf8_table;
+----+------+
| id | data |
+----+------+
| é  | é    |
+----+------+

SELECT * FROM latin1_table;
+------+------+
| id   | data |
+------+------+
| é   | é   |
+------+------+

SET NAMES latin1;

SELECT * FROM latin1_table;
+----+------+
| id | data |
+----+------+
| é  | é    |
+----+------+

SELECT * FROM utf8_table;
+----+------+
| id | data |
+----+------+
| �   | �     |
+----+------+

###############################################
# JAVA USE CASE (with mysql-connector-java-5.1.25
###############################################

import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;

public class TestMysqlEncoding {

    private String dbURL = "";

    private String user = "";

    private String password = "";

    private java.sql.Connection dbConnect = null;

    private java.sql.Statement dbStatement = null;

    public TestMysqlEncoding(String url, String user, String password) {
        this.dbURL = url;
        this.user = user;
        this.password = password;
    }

    public Boolean connect() {
        try {
            Class.forName("com.mysql.jdbc.Driver").newInstance();
            System.out.println(dbURL);
            this.dbConnect = DriverManager.getConnection(this.dbURL, this.user, this.password);
            this.dbStatement = this.dbConnect.createStatement();
            return true;
        } catch (Exception ex) {
            System.err.println(ex);
        }
        return false;
    }

    public ResultSet exec(String sql) {
        try {
            return this.dbStatement.executeQuery(sql);
        } catch (Exception ex) {
            System.err.println(ex);
        }
        return null;
    }

    public int execUpdate(String sql) {
        try {
            return this.dbStatement.executeUpdate(sql);
        } catch (Exception ex) {
            System.err.println(ex);
        }
        return -1;
    }

    public void close() {
        try {
            this.dbStatement.close();
            this.dbConnect.close();
            this.dbConnect.close();
        } catch (SQLException ex) {
            System.err.println(ex);
        }
    }

    public static void main(String[] args) {
        String[] tablesName = { "utf8_table", "latin1_table" };

        for (String tableName : tablesName) {
            System.out.println("***************************");
            System.out.println("** " + tableName);
            System.out.println("***************************");

            String[] paramsMysql = {
                    "useUnicode=true&characterEncoding=utf8",
                    "useUnicode=true&characterEncoding=latin1" };

            for (String paramMysql : paramsMysql) {
                TestMysqlEncoding mysqlCli = new TestMysqlEncoding("jdbc:mysql://localhost:3306/encoding?" + paramMysql, "encoding", "encoding");
                try {
                    if (mysqlCli.connect()) {
                        try {
                            ResultSet rs = mysqlCli.exec("SELECT * FROM " + tableName);
                            if (rs != null) {
                                while (rs.next()) {
                                    System.out.println("column1 value : " + rs.getString(1)
                                            + " (bytes : [" + bytesToString(rs.getBytes(1)) + "])"
                                            + ", column2 value " + rs.getString(2)
                                            + " (bytes : [" + bytesToString(rs.getBytes(2)) + "])");
                                }
                            }
                        } catch (Exception ex) {
                            System.err.println(ex);
                        }
                    } else {
                        System.err.println("Mysql connection failed !!!");
                    }
                } finally {
                    mysqlCli.close();
                }
            }
        }
    }

    private static String bytesToString(byte[] bytes) {
        String result = "";
        boolean first = true;
        for (byte b : bytes) {
            if (!first) {
                result += ",";
            }
            result += b;
            first = false;
        }
        return result;
    }
}

###############################################
# JAVA USE CASE - RESULTS KO
###############################################

***************************
** utf8_table
***************************
jdbc:mysql://localhost:3306/encoding?useUnicode=true&characterEncoding=utf8
column1 value : é (bytes : [-61,-87]), column2 value é (bytes : [-61,-87])
jdbc:mysql://localhost:3306/encoding?useUnicode=true&characterEncoding=latin1
column1 value : é (bytes : [-61,-87]), column2 value é (bytes : [-61,-87])
***************************
** latin1_table
***************************
jdbc:mysql://localhost:3306/encoding?useUnicode=true&characterEncoding=utf8
column1 value : é (bytes : [-61,-87]), column2 value é (bytes : [-61,-87])
jdbc:mysql://localhost:3306/encoding?useUnicode=true&characterEncoding=latin1
column1 value : é (bytes : [-61,-87]), column2 value é (bytes : [-61,-87])

Suggested fix:
It's just an idea, maybe it's not possible in all case.

Encoding set to thisRow.getString(internalColumnIndex, encoding, this.connection) it's not metadata.getCharacterSet() but characterSetResult between client and server.
[27 May 2013 21:50] Mark Matthews
> Suggested fix:
> It's just an idea, maybe it's not possible in all case.
> 
> Encoding set to thisRow.getString(internalColumnIndex, encoding, this.connection) it's not > metadata.getCharacterSet() but characterSetResult between client and server.

No, it's metadata.getCharacterSet(), which *is* characterSetResult, which by default is null with the JDBC driver, which means the server returns whatever character set it thinks is correct, and the driver does the decoding.

I'm guessing in your case, MySQL is lying about what character encoding your results are actually encoded in, or as has happened in the past, encodes them one way, but reports them differently.
[30 May 2013 13:48] Alexander Soklakov
Hi Lucas,

Thanks for your report. Verified as described.
[13 Jan 2023 19:42] Filipe Silva
The test case is missing one variable: the mismatch between `character_set_client` setting and the *real* character set the client application is using.

Mind that:

- "é" is code point 0xE9 in latin1
- code points 0xE9 has any meaning in utf8
- "é" is code point 0xC3 0xA9 in utf8
- code point 0xC3 in latin1 is "Ã"
- code point 0xA9 in latin1 is "©"

Regardless of what you specify in SET NAMES and then you insert these values in both tables, you are actually submitting to the server the bytes 0xC3 0xA9 in all cases, because those are the bytes representing the code points coming from your console application, which, I'm guessing is using UTF-8.

Then you read the data. Several conversion happen in between the data stored in the tables and the characters rendered by your console. Let's take your cases one by one:

- `SET NAMES utf8;` + `SELECT * FROM utf8_table;`
All fine here since all character sets match.

- `SET NAMES utf8;` + `SELECT * FROM latin1_table;`
Result rendered as "é". Note that these are the code points 0xC3 and 0xA9 in the latin1 table.
So, this happens: the bytes 0xC3 and 0xA9 are read from the database, each one is considered to be a latin1 char, then each one is converted to utf8 following the `SET NAMES` setting and finally rendered as UTF8 in your console.

- `SET NAMES latin1;` + `SELECT * FROM latin1_table;`
Result rendered as "é", but not because charsets are properly aligned.
In this case the bytes 0xC3 and 0xA9 are returned, each one is a latin1 char by itself and they both look like "é" together, but since your console is running in UTF-8, it "sees" this sequence of two bytes, 0xC3 and 0xA9, and decodes it as a UTF-8 code point, rendering the character "é".

- `SET NAMES latin1;` + `SELECT * FROM utf8_table;`
Like before, the bytes 0xC3 and 0xA9 are returned, but now they represent a UTF-8 character, i.e., "é". This character is then converted to latin1 following the `SET NAMES` setting, i.e. the two bytes end up being translated to 0xE9, and sent back to the console to render. But, since the console is using UTF-8 and the byte 0xE9 has no meaning there, it ends up rendering the generic invalid character placeholder "�".

The conclusion is: when using a mysql client running in a console that is using UTF-8, in order for you to actually store *true* latin1 code point bytes into the latin1 table, you'd have to execute `SET NAMES latin1` *and* `set character_set_client = utf8;`. The result after this is that you won't be able to see any "é" when you read from both tables if `set names latin1` but both tables will, correctly, return "é" if "set names utf8".

MySQL Connector/J handles all this correctly. As such, this is not a bug.