Description:
GTID protocol will put a FORMAT_DESCRIPTION_EVENT from the master with log_pos != 0(usually 123) after each reconnection if auto positioning is enabled. Since the log_pos(123) in Format_description_log_event may smaller than the pos that has been applyed by the worker thread, so a log_pos regression may appear. If we stop slave; start slave with log_pos regression, mts_recovery_groups may fail.
How to repeat:
Apply this patch and run mtr test below like ./mtr rpl.pos_regression, mysql will crash in mts_recovery_groups.
We can see log_pos regression in mysql-bin.000001 like this.
output of this mtr:
select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_worker_info;
Relay_log_name Relay_log_pos Master_log_name Master_log_pos
./slave-relay-bin.000003 707 mysql-bin.000001 838
select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_relay_log_info;
Relay_log_name Relay_log_pos Master_log_name Master_log_pos
./slave-relay-bin.000004 360 mysql-bin.000001 123
we can see:
Master_log_name.Master_log_pos in slave_relay_log_info < Master_log_name.Master_log_pos in slave_worker_info;
Relay_log_name.Relay_log_pos in slave_relay_log_info > Relay_log_name.Relay_log_pos in slave_worker_info;
--------------------------------------------------------------------------------
diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc
index 88193c33962..960f86f5948 100644
--- a/sql/rpl_slave.cc
+++ b/sql/rpl_slave.cc
@@ -6860,6 +6860,18 @@ bool mts_checkpoint_routine(Relay_log_info *rli, ulonglong period,
error= rli->flush_info(TRUE);
+ /*
+ * Pause the SQL thread if we checkpoint at pos 123
+ * */
+
+ DBUG_EXECUTE_IF("pause_after_checkpoint_at_123",
+ {
+ if (rli->gaq->lwm.group_master_log_pos == 123)
+ {
+ sleep(5);
+ }
+ };);
+
mysql_cond_broadcast(&rli->data_cond);
if (need_data_lock)
mysql_mutex_unlock(&rli->data_lock);
--------------------------------------------------------------------------------
pos_regression-master.opt:
--log-bin=mysql-bin
--gtid_mode=ON
--enforce-gtid-consistency
pos_regression-slave.opt:
--log-bin=mysql-bin
--gtid_mode=ON
--enforce-gtid-consistency
--slave_parallel_type='LOGICAL_CLOCK'
--slave_parallel_workers=1
--relay_log_info_repository='TABLE'
--relay_log_purge=0
pos_regression.test:
source include/master-slave.inc;
source include/have_innodb.inc;
source include/have_debug.inc;
source include/have_debug_sync.inc;
source include/have_gtid.inc;
source include/have_binlog_format_row.inc;
--source include/rpl_connection_master.inc
CREATE TABLE t1 (a INT) ENGINE = InnoDB;
INSERT INTO t1 VALUES (1);
--source include/sync_slave_sql_with_master.inc
--source include/stop_slave.inc
--source include/rpl_connection_master.inc
DELETE FROM t1 WHERE a = 1;
INSERT INTO t1 VALUES (2);
--source include/rpl_connection_slave.inc
SET GLOBAL debug= '+d,stop_io_after_reading_write_rows_log_event';
--source include/start_slave.inc
let $_sql_running= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
let $_io_running= query_get_value(SHOW SLAVE STATUS, Slave_IO_Running, 1);
--echo Slave_SQL_Running:$_sql_running
--echo Slave_IO_Running:$_io_running
SET GLOBAL debug= '-d,stop_io_after_reading_write_rows_log_event';
SET GLOBAL debug= '+d,pause_after_checkpoint_at_123';
start slave io_thread;
--echo Wait until SQL thread synced to master checkpoint
--let $wait_condition= SELECT Master_log_pos = "123" FROM mysql.slave_relay_log_info
--source include/wait_condition.inc
select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_worker_info;
select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_relay_log_info;
--source include/stop_slave.inc
--source include/start_slave.inc
select * from mysql.slave_relay_log_info;
--source include/rpl_connection_master.inc
--source include/sync_slave_sql_with_master.inc
Suggested fix:
diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc
index 88193c33962..e8ad3da2947 100644
--- a/sql/rpl_slave.cc
+++ b/sql/rpl_slave.cc
@@ -6831,6 +6831,14 @@ bool mts_checkpoint_routine(Relay_log_info *rli, ulonglong period,
else
mysql_mutex_assert_owner(&rli->data_lock);
+ /* Skip checkpoint if log_pos regression */
+ if (rli->gaq->lwm.group_master_log_pos <= rli->get_group_master_log_pos())
+ {
+ if (need_data_lock)
+ mysql_mutex_unlock(&rli->data_lock);
+ goto end;
+ }
+
/*
"Coordinator::commit_positions" {