Bug #89860 XA may lost prepared transaction and cause different between master and slave.
Submitted: 1 Mar 2018 1:34 Modified: 31 May 2018 7:36
Reporter: Ze Yang (OCA) Email Updates:
Status: Verified Impact on me:
None 
Category:MySQL Server: XA transactions Severity:S2 (Serious)
Version:5.7, 5.7.21 OS:Any
Assigned to: CPU Architecture:Any

[1 Mar 2018 1:34] Ze Yang
Description:
I proposed one bug https://bugs.mysql.com/bug.php?id=88534. XA may lost prepared transaction and cause different between master and slave,  as the xa prepare first call binlog prepare, then innodb prepare.

Last days I thought this bug again. If call InnoDB prepare first, the binlog would lost, the master would have a prepared xa transaction but the slave not.

How to repeat:
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -1460,10 +1460,12 @@ int ha_prepare(THD *thd)
       thd->status_var.ha_prepare_count++;
       if (ht->prepare)
       {
-        DBUG_EXECUTE_IF("simulate_xa_failure_prepare", {
-          ha_rollback_trans(thd, true);
-          DBUG_RETURN(1);
-        });
+        if (ht->db_type == DB_TYPE_INNODB) {
+          DBUG_EXECUTE_IF("simulate_xa_failure_prepare", {
+            ha_rollback_trans(thd, true);
+            DBUG_RETURN(1);
+          });
+        }
         if (ht->prepare(ht, thd, true))
         {
           ha_rollback_trans(thd, true);

replication.test:

--disable_warnings
source include/master-slave.inc;
--enable_warnings
connection master;
CREATE TABLE ti (c1 INT) ENGINE=INNODB;
XA START 'x';
INSERT INTO ti VALUES(1);
XA END 'x';
SET @@session.debug = '+d,simulate_xa_failure_prepare';
--error ER_XA_RBROLLBACK
XA PREPARE 'x';
--echo #Master
XA RECOVER;

--sync_slave_with_master
connection slave;
--echo #Slave
XA RECOVER;

replication.result:

include/master-slave.inc
[connection master]
CREATE TABLE ti (c1 INT) ENGINE=INNODB;
XA START 'x';
INSERT INTO ti VALUES(1);
XA END 'x';
SET @@session.debug = '+d,simulate_xa_failure_prepare';
XA PREPARE 'x';
ERROR XA100: XA_RBROLLBACK: Transaction branch was rolled back
#Master
XA RECOVER;
formatID  gtrid_length  bqual_length  data
#Slave
XA RECOVER;
formatID  gtrid_length  bqual_length  data
1 1 0 x

--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -1479,6 +1479,7 @@ int ha_prepare(THD *thd)
       }
       ha_info= ha_info->next();
     }
+    DBUG_EXECUTE_IF("crash_after_xa_prepare", DBUG_SUICIDE(););

     DBUG_ASSERT(thd->get_transaction()->xid_state()->
                 has_state(XID_STATE::XA_IDLE));

replication.test:

-- source include/have_log_bin.inc
CREATE TABLE ti (c1 INT) ENGINE=INNODB;
XA START 'x';
INSERT INTO ti VALUES(1);
XA END 'x';
SET @@session.debug = '+d,crash_after_xa_prepare';
--exec echo "wait" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
--error 2013
XA PREPARE 'x';
--source include/wait_until_disconnected.inc
--let $_expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
--source include/start_mysqld.inc
XA RECOVER;
show binlog events in 'mysql.000001';

replication.result:
CREATE TABLE ti (c1 INT) ENGINE=INNODB;
XA START 'x';
INSERT INTO ti VALUES(1);
XA END 'x';
SET @@session.debug = '+d,crash_after_xa_prepare';
XA PREPARE 'x';
ERROR HY000: Lost connection to MySQL server during query
# restart
XA RECOVER;
formatID  gtrid_length  bqual_length  data
show binlog events in 'mysql.000001';
Log_name  Pos Event_type  Server_id End_log_pos Info
mysql.000001  4 Format_desc 1 123 Server ver: 5.7.19org-debug-log, Binlog ver: 4
mysql.000001  123 Previous_gtids  1 154
mysql.000001  154 Anonymous_Gtid  1 219 SET @@SESSION.GTID_NEXT= 'ANONYMOUS'
mysql.000001  219 Query 1 331 use `test`; CREATE TABLE ti (c1 INT) ENGINE=INNODB
mysql.000001  331 Anonymous_Gtid  1 396 SET @@SESSION.GTID_NEXT= 'ANONYMOUS'
mysql.000001  396 Query 1 483 XA START X'78',X'',1
mysql.000001  483 Table_map 1 528 table_id: 222 (test.ti)
mysql.000001  528 Write_rows  1 568 table_id: 222 flags: STMT_END_F
mysql.000001  568 Query 1 653 XA END X'78',X'',1
mysql.000001  653 XA_prepare  1 690 XA PREPARE X'78',X'',1

Suggested fix:
Two methods may could solve this problem.

1. Remain 'XA prepare': binlog prepare->engine prepare(ensure engine flush logs), 'XA COMMIT': binlog commit->engine commit. Write 'xa rollback' when we found the storage engine on master lost prepared transaction.

When the server restarted, we search the last binlog to check whether there's a 'xa prepare' which no 'xa commit/rollback' matched. If we find the not matched 'xa prepare x' in binlog, we check whether there's prepared xa 'x' in the storage engine.  If there's no prepared xa transacton, we write 'xa rollback x' to the binlog to cancel prepared transaction on slave.

But this method not friendly to HA. If we change to slave after the master crash, there's error happened.

2. 'XA PREPARE': engine prepare->binlog prepare, 'XA COMMIT' binlog commit-> engine commit. Add one table like the 'mysql.gtid_executed' table to record the xa prepared transactions when rotate binlog. Search the last binlog to decide whether the prepared transaction in storage engine should be rollbacked, committed or remained prepared.

Have to add a table to recored the xa prepared transaction when rotate binlog, as we would not know whether the prepared transaction's binlog was logged when server restart(the binlog may purged and 'xa prepare' may recorded in the purged binlog). So have to  add a table to record xa prepared transaction when rotate binlog, then we could use binlog to decide the transaction state.
[1 Mar 2018 12:56] MySQL Verification Team
Hello Michael Yang,

Thank you for the report and test case.
Verified as described with 5.7.21 src build after patching.

Thanks,
Umesh
[1 Mar 2018 13:00] MySQL Verification Team
5.7.21 - test results

Attachment: 89860_5.7.21.results (application/octet-stream, text), 4.68 KiB.

[31 May 2018 7:36] Ze Yang
This bug is the same with https://bugs.mysql.com/bug.php?id=76233.