Description:
Optimizing a table with concurrent XA on the source may cause the replica to hang. The root cause of this problem is that the 'optimize table' operation for innodb is implemented as 'recreate + analyze'. The execlusive MDL lock is released after recreating but before writing 'optimize table' to binlog. An XA transaction may happen during this interval and write to binlog. The sequence of events in binlog is as follows:
xa start
dml
xa end
xa prepare
optimize table
The incorrect sequence of events in binlog leads replica to hang.
This issue is similar with https://bugs.mysql.com/bug.php?id=93761
How to repeat:
Some debug sync points are required:
diff --git a/sql/sql_admin.cc b/sql/sql_admin.cc
index 6ed8095577c..29cc3ee0566 100644
--- a/sql/sql_admin.cc
+++ b/sql/sql_admin.cc
@@ -1324,6 +1324,8 @@ static bool mysql_admin_table(
}
close_thread_tables(thd);
thd->mdl_context.release_transactional_locks();
+
+ DEBUG_SYNC(current_thd, "optimize_wait_after_recreating_table");
/* Clear references to TABLE and MDL_ticket after releasing them. */
table->table = nullptr;
table->mdl_request.ticket = nullptr;
@@ -1993,6 +1995,7 @@ bool Sql_cmd_optimize_table::execute(THD *thd) {
*/
res = write_bin_log(thd, true, thd->query().str, thd->query().length);
}
+ DEBUG_SYNC(current_thd, "optimize_after_writing_binlog");
thd->lex->query_block->m_table_list.first = first_table;
thd->lex->query_tables = first_table;
test script:
source include/have_debug_sync.inc;
source include/master-slave.inc;
connect (master2,localhost,root,,test,$MASTER_MYPORT,);
connection master;
create table t1(id int,c1 int);
insert into t1 values(1,1),(2,2);
set debug_sync="optimize_wait_after_recreating_table signal xa_start wait_for optimize_continue";
set debug_sync="optimize_after_writing_binlog signal xa_commit";
send optimize table t1;
connection master2;
set debug_sync="now wait_for xa_start";
xa start 'xa_test1';
insert into t1 values(3,3);
xa end 'xa_test1';
xa prepare 'xa_test1';
set debug_sync="now signal optimize_continue wait_for xa_commit";
xa commit 'xa_test1';
source include/sync_slave_sql_with_master.inc;
connection master;
reap;
drop table t1;
source include/sync_slave_sql_with_master.inc;
connection master;
disconnect master2;
source include/rpl_end.inc;
Suggested fix:
Perhaps the binlog of 'optimize table' should be written before releasing the execlusive MDL lock.
For InnoDB tables, it is more preferable to replace the 'optimize table' operation with 'alter + analyze' if XA is being used.