Description:
When the slave is started with --skip-log-bin and GTID mode is enabled, a crash while the slave is replaying an XA transaction can leave the master and the slave inconsistent.
How to repeat:
diff --git a/sql/xa/sql_xa_second_phase.cc b/sql/xa/sql_xa_second_phase.cc
index d63fce8f48e..85388393d17 100644
--- a/sql/xa/sql_xa_second_phase.cc
+++ b/sql/xa/sql_xa_second_phase.cc
@@ -120,6 +120,12 @@ void Sql_cmd_xa_second_phase::setup_thd_context(THD *thd) {
std::tie(this->m_gtid_error, this->m_need_clear_owned_gtid) =
commit_owned_gtids(thd, true);
if (this->m_gtid_error) my_error(ER_XA_RBROLLBACK, MYF(0));
+
+ DBUG_EXECUTE_IF("simulate_crash_after_write_gtid_for_xa", {
+ ha_flush_logs();
+ DBUG_SUICIDE();
+ });
+
this->m_result = detached_xs->xa_trans_rolled_back() || this->m_gtid_error;
assert(thd_xs->is_binlogged() == false);
diff --git a/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent-slave.opt b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent-slave.opt
new file mode 100644
index 00000000000..789275fa25e
--- /dev/null
+++ b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent-slave.opt
@@ -0,0 +1 @@
+--skip-log-bin
diff --git a/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent.test b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent.test
new file mode 100644
index 00000000000..5895e4d5972
--- /dev/null
+++ b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent.test
@@ -0,0 +1,46 @@
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+--source include/master-slave.inc
+
+--source include/rpl_connection_slave.inc
+--source include/stop_slave_sql.inc
+
+--source include/rpl_connection_master.inc
+create table t1 (id int) engine = innodb;
+xa start 'zjy';
+insert into t1 values (1);
+xa end 'zjy';
+xa prepare 'zjy';
+xa commit 'zjy';
+
+--source include/rpl_connection_slave.inc
+--let $debug_point= simulate_crash_after_write_gtid_for_xa
+--source include/add_debug_point.inc
+
+-- exec echo "wait" > $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
+--source include/start_slave_sql.inc
+--source include/wait_until_disconnected.inc
+
+-- let $rpl_server_number= 2
+-- source include/rpl_start_server.inc
+-- enable_reconnect
+-- echo # Reconnecting to the slave server
+-- source include/wait_until_connected_again.inc
+
+--source include/start_slave.inc
+--source include/rpl_connection_master.inc
+--source include/sync_slave_sql_with_master.inc
+
+--source include/rpl_connection_master.inc
+--let $result=query_get_value("xa recover", data, 1)
+--let $assert_text=Should not have hanging transaction
+--let $assert_cond="$result" = "No such row"
+--source include/assert.inc
+
+--source include/rpl_connection_slave.inc
+--let $result=query_get_value("xa recover", data, 1)
+--let $assert_text=Should not have hanging transaction
+--let $assert_cond="$result" = "No such row"
+--source include/assert.inc
+
+drop table t1;
Suggested fix:
I think the key is that when the slave replays an "XA COMMIT" event, the execution sequence is as follows:
step-1. First write the GTID of the event to the table "mysql.gtid_executed"
step-2. Execute XA COMMIT
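If a crash occurs after step-1 and before step-2, the slave will never replay this event: its GTID is already recorded in "mysql.gtid_executed", so after the restart the event is treated as already applied and skipped, and the prepared XA transaction is left hanging on the slave (still listed by "xa recover") while it has been committed on the master.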
Description: When --skip-log-bin is set on the slave and GTID is turned on, if a crash occurs during the replay of the XA transaction, it may lead to inconsistency between the master and the slave. How to repeat: diff --git a/sql/xa/sql_xa_second_phase.cc b/sql/xa/sql_xa_second_phase.cc index d63fce8f48e..85388393d17 100644 --- a/sql/xa/sql_xa_second_phase.cc +++ b/sql/xa/sql_xa_second_phase.cc @@ -120,6 +120,12 @@ void Sql_cmd_xa_second_phase::setup_thd_context(THD *thd) { std::tie(this->m_gtid_error, this->m_need_clear_owned_gtid) = commit_owned_gtids(thd, true); if (this->m_gtid_error) my_error(ER_XA_RBROLLBACK, MYF(0)); + + DBUG_EXECUTE_IF("simulate_crash_after_write_gtid_for_xa", { + ha_flush_logs(); + DBUG_SUICIDE(); + }); + this->m_result = detached_xs->xa_trans_rolled_back() || this->m_gtid_error; assert(thd_xs->is_binlogged() == false); diff --git a/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent-slave.opt b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent-slave.opt new file mode 100644 index 00000000000..789275fa25e --- /dev/null +++ b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent-slave.opt @@ -0,0 +1 @@ +--skip-log-bin diff --git a/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent.test b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent.test new file mode 100644 index 00000000000..5895e4d5972 --- /dev/null +++ b/mysql-test/suite/rpl_gtid/t/bug_xa_inconsistent.test @@ -0,0 +1,46 @@ +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/master-slave.inc + +--source include/rpl_connection_slave.inc +--source include/stop_slave_sql.inc + +--source include/rpl_connection_master.inc +create table t1 (id int) engine = innodb; +xa start 'zjy'; +insert into t1 values (1); +xa end 'zjy'; +xa prepare 'zjy'; +xa commit 'zjy'; + +--source include/rpl_connection_slave.inc +--let $debug_point= simulate_crash_after_write_gtid_for_xa +--source include/add_debug_point.inc + +-- exec echo "wait" > $MYSQLTEST_VARDIR/tmp/mysqld.2.expect 
+--source include/start_slave_sql.inc +--source include/wait_until_disconnected.inc + +-- let $rpl_server_number= 2 +-- source include/rpl_start_server.inc +-- enable_reconnect +-- echo # Reconnecting to the slave server +-- source include/wait_until_connected_again.inc + +--source include/start_slave.inc +--source include/rpl_connection_master.inc +--source include/sync_slave_sql_with_master.inc + +--source include/rpl_connection_master.inc +--let $result=query_get_value("xa recover", data, 1) +--let $assert_text=Should not have hanging transaction +--let $assert_cond="$result" = "No such row" +--source include/assert.inc + +--source include/rpl_connection_slave.inc +--let $result=query_get_value("xa recover", data, 1) +--let $assert_text=Should not have hanging transaction +--let $assert_cond="$result" = "No such row" +--source include/assert.inc + +drop table t1; Suggested fix: I think the key is that when the slave replays the event of "XA COMMIT", the execution sequence is as like: step-1. First write the GTID of the event to the table "mysql.gtid_executed" step-2. Execute XA COMMIT If a crash occurs after step-1 and before step-2, the slave will never replay this event.