MySQL Bugs: #110566: Contribution by Tencent: checkpoint may regression in mts

Bug #110566	Contribution by Tencent: checkpoint may regression in mts
Submitted:	30 Mar 2023 11:05	Modified:	31 Mar 2023 8:19
Reporter:	yewei Xu (OCA)	Email Updates:
Status:	Verified	Impact on me:	None
Category:	MySQL Server: Replication	Severity:	S2 (Serious)
Version:	5.7.41	OS:	Any
Assigned to:		CPU Architecture:	Any
Tags:	Contribution, MTS

Description:
GTID protocol will put a FORMAT_DESCRIPTION_EVENT from the master with log_pos != 0(usually 123) after each reconnection if auto positioning is enabled. Since the log_pos(123) in Format_description_log_event may smaller than the pos that has been applyed by the worker thread, so a log_pos regression may appear. If we stop slave; start slave with log_pos regression, mts_recovery_groups may fail.

How to repeat:
Apply this patch and run mtr test below like ./mtr rpl.pos_regression, mysql will crash in mts_recovery_groups.

We can see log_pos regression in mysql-bin.000001 like this.

output of this mtr:
select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_worker_info;
Relay_log_name  Relay_log_pos   Master_log_name Master_log_pos
./slave-relay-bin.000003        707     mysql-bin.000001        838
select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_relay_log_info;
Relay_log_name  Relay_log_pos   Master_log_name Master_log_pos
./slave-relay-bin.000004        360     mysql-bin.000001        123

we can see:
Master_log_name.Master_log_pos in slave_relay_log_info < Master_log_name.Master_log_pos in slave_worker_info;
Relay_log_name.Relay_log_pos in slave_relay_log_info > Relay_log_name.Relay_log_pos in slave_worker_info;

--------------------------------------------------------------------------------
diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc
index 88193c33962..960f86f5948 100644
--- a/sql/rpl_slave.cc
+++ b/sql/rpl_slave.cc
@@ -6860,6 +6860,18 @@ bool mts_checkpoint_routine(Relay_log_info *rli, ulonglong period,
 
   error= rli->flush_info(TRUE);
 
+ /*
+  *     Pause the SQL thread if we checkpoint at pos 123
+  *       */
+
+  DBUG_EXECUTE_IF("pause_after_checkpoint_at_123",
+                  {
+                    if (rli->gaq->lwm.group_master_log_pos == 123)
+                    {
+                      sleep(5);
+                    }
+                  };);
+
   mysql_cond_broadcast(&rli->data_cond);
   if (need_data_lock)
     mysql_mutex_unlock(&rli->data_lock);
--------------------------------------------------------------------------------
pos_regression-master.opt:

--log-bin=mysql-bin                                                                                                                                      
--gtid_mode=ON                                                                                                                                           
--enforce-gtid-consistency

pos_regression-slave.opt:

--log-bin=mysql-bin                                                                                                                                      
--gtid_mode=ON                                                                                                                                           
--enforce-gtid-consistency                                                                                                                               
--slave_parallel_type='LOGICAL_CLOCK'                                                                                                                    
--slave_parallel_workers=1                                                                                                                               
--relay_log_info_repository='TABLE'                                                                                                                      
--relay_log_purge=0

pos_regression.test:

source include/master-slave.inc;
source include/have_innodb.inc;
source include/have_debug.inc;
source include/have_debug_sync.inc;
source include/have_gtid.inc;
source include/have_binlog_format_row.inc;

--source include/rpl_connection_master.inc
CREATE TABLE t1 (a INT) ENGINE = InnoDB;
INSERT INTO t1 VALUES (1);

--source include/sync_slave_sql_with_master.inc
--source include/stop_slave.inc

--source include/rpl_connection_master.inc
DELETE FROM t1 WHERE a = 1;
INSERT INTO t1 VALUES (2);

--source include/rpl_connection_slave.inc

SET GLOBAL debug= '+d,stop_io_after_reading_write_rows_log_event';
--source include/start_slave.inc
let $_sql_running= query_get_value(SHOW SLAVE STATUS, Slave_SQL_Running, 1);
let $_io_running= query_get_value(SHOW SLAVE STATUS, Slave_IO_Running, 1);
--echo Slave_SQL_Running:$_sql_running
--echo Slave_IO_Running:$_io_running

SET GLOBAL debug= '-d,stop_io_after_reading_write_rows_log_event';
SET GLOBAL debug= '+d,pause_after_checkpoint_at_123';
start slave io_thread;

--echo Wait until SQL thread synced to master checkpoint
--let $wait_condition= SELECT Master_log_pos = "123" FROM mysql.slave_relay_log_info
--source include/wait_condition.inc

select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_worker_info;
select Relay_log_name,Relay_log_pos,Master_log_name,Master_log_pos from mysql.slave_relay_log_info;

--source include/stop_slave.inc
--source include/start_slave.inc

select * from mysql.slave_relay_log_info;

--source include/rpl_connection_master.inc
--source include/sync_slave_sql_with_master.inc

Suggested fix:
diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc
index 88193c33962..e8ad3da2947 100644
--- a/sql/rpl_slave.cc
+++ b/sql/rpl_slave.cc
@@ -6831,6 +6831,14 @@ bool mts_checkpoint_routine(Relay_log_info *rli, ulonglong period,
   else
     mysql_mutex_assert_owner(&rli->data_lock);
 
+  /* Skip checkpoint if log_pos regression */
+  if (rli->gaq->lwm.group_master_log_pos <= rli->get_group_master_log_pos())
+  {
+    if (need_data_lock)
+      mysql_mutex_unlock(&rli->data_lock);
+    goto end;
+  }
+
   /*
     "Coordinator::commit_positions" {

Hello  Yewei Xu,

Thank you for the report and contribution.

regards,
Umesh