Bug #113813 Semisync will abort due to wrong transaction binlog postion comparison
Submitted: 31 Jan 2:26 Modified: 31 Jan 12:58
Reporter: karry zhang (OCA) Email Updates:
Status: Verified Impact on me:
None 
Category:MySQL Server: Replication Severity:S3 (Non-critical)
Version:MySQL8.0,8.0.36 OS:Any
Assigned to: CPU Architecture:Any
Tags: Contribution

[31 Jan 2:26] karry zhang
Description:
Semisync will abort due to wrong transaction binlog postion comparison.

How to repeat:
You can repeat this problem by running the following test case:
 
CALL mtr.add_suppression("\\[Warning\\] .*MY-\\d+.* \\[Server\\] Plugin mysqlx reported.*");

--let $rpl_skip_start_slave= 1
source include/master-slave.inc;

connection master;
--let $MYSQLD_DATADIR= `select @@datadir`
--let $master_binlog= query_get_value(SHOW MASTER STATUS, File, 1)
--let $master_binlog_index = `SELECT @@log_bin_index`

--let $rpl_server_number= 1
source include/rpl_stop_server.inc;

--echo
--echo # Change the binlog index entry and binlog file name so that the binlog can reach master-bin.999998, making it easier to reproduce the problem.
--exec cat $master_binlog_index
--exec sed -i 's/000001/999998/g' $master_binlog_index
--exec cat $master_binlog_index
--exec mv $MYSQLD_DATADIR/$master_binlog $MYSQLD_DATADIR/master-bin.999998

--let $rpl_server_number= 1
source include/rpl_start_server.inc;

--echo 
--echo # enable semi-sync replication
connection slave;
SET @old_rpl_semi_sync_slave_enabled = @@GLOBAL.rpl_semi_sync_slave_enabled;
SET GLOBAL rpl_semi_sync_slave_enabled = ON;
source include/start_slave.inc;
SHOW SLAVE STATUS;

connection master;
SET @old_rpl_semi_sync_master_enabled = @@GLOBAL.rpl_semi_sync_master_enabled;
SET GLOBAL rpl_semi_sync_master_enabled = ON;

# Set a relatively long rpl_semi_sync_master_timeout to prevent the impact of timeout.
SET @old_rpl_semi_sync_master_timeout = @@GLOBAL.rpl_semi_sync_master_timeout;
SET global rpl_semi_sync_master_timeout = 1000000000;

SHOW VARIABLES LIKE "%rpl_semi_sync_master_enabled%";
SHOW MASTER STATUS;
SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo
--echo # create some data.
CREATE DATABASE tt;
USE tt;
CREATE TABLE t1(id int);
INSERT INTO t1 VALUES(0);
INSERT INTO t1 VALUES(1);
INSERT INTO t1 VALUES(2);
SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo
--echo # flush binary logs so that binlog reaches master-bin.1000000.
FLUSH BINARY LOGS;
SHOW MASTER STATUS;
SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo
--echo # semi-sync aborts due to wrong transaction binlog position comparison
INSERT INTO t1 VALUES(3);
INSERT INTO t1 VALUES(4);
SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo
--echo # clean up
DROP TABLE t1;
DROP DATABASE tt;
SET GLOBAL rpl_semi_sync_master_timeout = @old_rpl_semi_sync_master_timeout;
SET GLOBAL rpl_semi_sync_master_enabled = @old_rpl_semi_sync_master_enabled;

connection slave;
SET GLOBAL rpl_semi_sync_slave_enabled = @old_rpl_semi_sync_slave_enabled;

--source include/rpl_end.inc

The results of the test case are as follows:

CALL mtr.add_suppression("\\[Warning\\] .*MY-\\d+.* \\[Server\\] Plugin mysqlx reported.*");
include/master-slave.inc
Warnings:
Note	####	Sending passwords in plain text without SSL/TLS is extremely insecure.
Note	####	Storing MySQL user name or password information in the connection metadata repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START REPLICA; see the 'START REPLICA Syntax' in the MySQL Manual for more information.
[connection master]
include/rpl_stop_server.inc [server_number=1]

# Change the binlog index entry and binlog file name so that the binlog can reach master-bin.999998, making it easier to reproduce the problem.
./master-bin.000001
./master-bin.999998
include/rpl_start_server.inc [server_number=1]

# enable semi-sync replication
SET @old_rpl_semi_sync_slave_enabled = @@GLOBAL.rpl_semi_sync_slave_enabled;
SET GLOBAL rpl_semi_sync_slave_enabled = ON;
include/start_slave.inc
SHOW SLAVE STATUS;
Slave_IO_State	Master_Host	Master_User	Master_Port	Connect_Retry	Master_Log_File	Read_Master_Log_Pos	Relay_Log_File	Relay_Log_Pos	Relay_Master_Log_File	Slave_IO_Running	Slave_SQL_Running	Replicate_Do_DB	Replicate_Ignore_DB	Replicate_Do_Table	Replicate_Ignore_Table	Replicate_Wild_Do_Table	Replicate_Wild_Ignore_Table	Last_Errno	Last_Error	Skip_Counter	Exec_Master_Log_Pos	Relay_Log_Space	Until_Condition	Until_Log_File	Until_Log_Pos	Master_SSL_Allowed	Master_SSL_CA_File	Master_SSL_CA_Path	Master_SSL_Cert	Master_SSL_Cipher	Master_SSL_Key	Seconds_Behind_Master	Master_SSL_Verify_Server_Cert	Last_IO_Errno	Last_IO_Error	Last_SQL_Errno	Last_SQL_Error	Replicate_Ignore_Server_Ids	Master_Server_Id	Master_UUID	Master_Info_File	SQL_Delay	SQL_Remaining_Delay	Slave_SQL_Running_State	Master_Retry_Count	Master_Bind	Last_IO_Error_Timestamp	Last_SQL_Error_Timestamp	Master_SSL_Crl	Master_SSL_Crlpath	Retrieved_Gtid_Set	Executed_Gtid_Set	Auto_Position	Replicate_Rewrite_DB	Channel_Name	Master_TLS_Version	Master_public_key_path	Get_master_public_key	Network_Namespace
Waiting for source to send event	127.0.0.1	root	13000	1	master-bin.999999	157	slave-relay-bin.000003	375	master-bin.999999	Yes	Yes							0		0	157	803	None		0	No						0	No	0		0			1	2e6fad92-bfde-11ee-9201-7cd30ac417bc	mysql.slave_master_info	0	NULL	Replica has read all relay log; waiting for more updates	10								0					0	
Warnings:
Warning	1287	'SHOW SLAVE STATUS' is deprecated and will be removed in a future release. Please use SHOW REPLICA STATUS instead
SET @old_rpl_semi_sync_master_enabled = @@GLOBAL.rpl_semi_sync_master_enabled;
SET GLOBAL rpl_semi_sync_master_enabled = ON;
SET @old_rpl_semi_sync_master_timeout = @@GLOBAL.rpl_semi_sync_master_timeout;
SET global rpl_semi_sync_master_timeout = 1000000000;
SHOW VARIABLES LIKE "%rpl_semi_sync_master_enabled%";
Variable_name	Value
rpl_semi_sync_master_enabled	ON
SHOW MASTER STATUS;
File	Position	Binlog_Do_DB	Binlog_Ignore_DB	Executed_Gtid_Set
master-bin.999999	157			
SHOW STATUS LIKE 'Rpl_semi_sync%';
Variable_name	Value
Rpl_semi_sync_master_clients	1
Rpl_semi_sync_master_net_avg_wait_time	0
Rpl_semi_sync_master_net_wait_time	0
Rpl_semi_sync_master_net_waits	0
Rpl_semi_sync_master_no_times	0
Rpl_semi_sync_master_no_tx	0
Rpl_semi_sync_master_status	ON
Rpl_semi_sync_master_timefunc_failures	0
Rpl_semi_sync_master_tx_avg_wait_time	0
Rpl_semi_sync_master_tx_wait_time	0
Rpl_semi_sync_master_tx_waits	0
Rpl_semi_sync_master_wait_pos_backtraverse	0
Rpl_semi_sync_master_wait_sessions	0
Rpl_semi_sync_master_yes_tx	0
Rpl_semi_sync_slave_status	OFF

# create some data.
CREATE DATABASE tt;
USE tt;
CREATE TABLE t1(id int);
INSERT INTO t1 VALUES(0);
INSERT INTO t1 VALUES(1);
INSERT INTO t1 VALUES(2);
SHOW STATUS LIKE 'Rpl_semi_sync%';
Variable_name	Value
Rpl_semi_sync_master_clients	1
Rpl_semi_sync_master_net_avg_wait_time	0
Rpl_semi_sync_master_net_wait_time	0
Rpl_semi_sync_master_net_waits	5
Rpl_semi_sync_master_no_times	0
Rpl_semi_sync_master_no_tx	0
Rpl_semi_sync_master_status	ON
Rpl_semi_sync_master_timefunc_failures	0
Rpl_semi_sync_master_tx_avg_wait_time	331
Rpl_semi_sync_master_tx_wait_time	1656
Rpl_semi_sync_master_tx_waits	5
Rpl_semi_sync_master_wait_pos_backtraverse	0
Rpl_semi_sync_master_wait_sessions	0
Rpl_semi_sync_master_yes_tx	5
Rpl_semi_sync_slave_status	OFF

# flush binary logs so that binlog reaches master-bin.1000000.
FLUSH BINARY LOGS;
SHOW MASTER STATUS;
File	Position	Binlog_Do_DB	Binlog_Ignore_DB	Executed_Gtid_Set
master-bin.1000000	157			
SHOW STATUS LIKE 'Rpl_semi_sync%';
Variable_name	Value
Rpl_semi_sync_master_clients	1
Rpl_semi_sync_master_net_avg_wait_time	0
Rpl_semi_sync_master_net_wait_time	0
Rpl_semi_sync_master_net_waits	5
Rpl_semi_sync_master_no_times	0
Rpl_semi_sync_master_no_tx	0
Rpl_semi_sync_master_status	ON
Rpl_semi_sync_master_timefunc_failures	0
Rpl_semi_sync_master_tx_avg_wait_time	331
Rpl_semi_sync_master_tx_wait_time	1656
Rpl_semi_sync_master_tx_waits	5
Rpl_semi_sync_master_wait_pos_backtraverse	0
Rpl_semi_sync_master_wait_sessions	0
Rpl_semi_sync_master_yes_tx	5
Rpl_semi_sync_slave_status	OFF

# semi-sync aborts due to wrong transaction binlog position comparison
INSERT INTO t1 VALUES(3);
INSERT INTO t1 VALUES(4);
SHOW STATUS LIKE 'Rpl_semi_sync%';
Variable_name	Value
Rpl_semi_sync_master_clients	1
Rpl_semi_sync_master_net_avg_wait_time	0
Rpl_semi_sync_master_net_wait_time	0
Rpl_semi_sync_master_net_waits	5
Rpl_semi_sync_master_no_times	0
Rpl_semi_sync_master_no_tx	0
Rpl_semi_sync_master_status	ON
Rpl_semi_sync_master_timefunc_failures	0
Rpl_semi_sync_master_tx_avg_wait_time	331
Rpl_semi_sync_master_tx_wait_time	1656
Rpl_semi_sync_master_tx_waits	5
Rpl_semi_sync_master_wait_pos_backtraverse	0
Rpl_semi_sync_master_wait_sessions	0
Rpl_semi_sync_master_yes_tx	7
Rpl_semi_sync_slave_status	OFF

# clean up
DROP TABLE t1;
DROP DATABASE tt;
SET GLOBAL rpl_semi_sync_master_timeout = @old_rpl_semi_sync_master_timeout;
SET GLOBAL rpl_semi_sync_master_enabled = @old_rpl_semi_sync_master_enabled;
SET GLOBAL rpl_semi_sync_slave_enabled = @old_rpl_semi_sync_slave_enabled;
include/rpl_end.inc

You can see after executing:
INSERT INTO t1 VALUES(3); 
INSERT INTO t1 VALUES(4);

Rpl_semi_sync_master_tx_waits keeps 5.

Suggested fix:
Not using strcmp in ActiveTranx::compare. 

I will upload a fix patch later.
[31 Jan 7:18] MySQL Verification Team
Hello karry zhang,

Thank you for the report and feedback.
I quickly tried on 8.0.36 build but seems provided mtr test case is not complete(maybe include/have_semisync_plugin.inc missing?). Could you please resend the exact mtr test case to reproduce at our end? Thank you.

--
 ./mtr  --suite=rpl bug113813
Logging: ./mtr  --suite=rpl bug113813
MySQL Version 8.0.36
Checking supported features
Collecting tests
 - Adding combinations for rpl
Checking leftover processes
Removing old var directory
Creating var directory '/export/home/tmp/ushastry/mysql-8.0.36/mysql-test/var'
Installing system database
Using parallel: 1

==============================================================================
                  TEST NAME                       RESULT  TIME (ms) COMMENT
------------------------------------------------------------------------------
[ 25%] rpl.bug113813 'mix'                       [ fail ]
        Test ended at 2024-01-31 08:15:22

CURRENT_TEST: rpl.bug113813
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
WARNING: MYSQL_OPT_RECONNECT is deprecated and will be removed in a future version.
mysqltest: At line 27: Query 'SET @old_rpl_semi_sync_slave_enabled = @@GLOBAL.rpl_semi_sync_slave_enabled' failed.
ERROR 1193 (HY000): Unknown system variable 'rpl_semi_sync_slave_enabled'
safe_process[15387]: Child process: 15388, exit: 1

regards,
Umesh
[31 Jan 12:18] karry zhang
you can try the follow tese case on 8.0.36:

semisync.test
=============
source include/have_semisync_plugin.inc;

--let $rpl_skip_start_slave= 1
source include/master-slave.inc;

connection master;
--let $MYSQLD_DATADIR= `select @@datadir`
--let $master_binlog= query_get_value(SHOW MASTER STATUS, File, 1)
--let $master_binlog_index = `SELECT @@log_bin_index`

--let $rpl_server_number= 1
source include/rpl_stop_server.inc;

--echo
--echo # Change the binlog index entry and binlog file name so that the binlog can reach master-bin.999998, making it easier to reproduce the problem.
--exec cat $master_binlog_index
--exec sed -i 's/000001/999998/g' $master_binlog_index
--exec cat $master_binlog_index
--exec mv $MYSQLD_DATADIR/$master_binlog $MYSQLD_DATADIR/master-bin.999998

--let $rpl_server_number= 1
source include/rpl_start_server.inc;

connection master;
source include/install_semisync_source.inc;
SET @old_rpl_semi_sync_source_enabled = @@GLOBAL.rpl_semi_sync_source_enabled;
SET GLOBAL rpl_semi_sync_source_enabled = ON;

# Set a relatively long rpl_semi_sync_master_timeout to prevent the impact of timeout.
SET @old_rpl_semi_sync_source_timeout = @@GLOBAL.rpl_semi_sync_source_timeout;
SET global rpl_semi_sync_source_timeout = 100000;
SHOW VARIABLES LIKE "%rpl_semi_sync_source_enabled%";
SHOW MASTER STATUS;
#SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo 
--echo # enable semi-sync replication
connection slave;
--source include/install_semisync_replica.inc
SET @old_rpl_semi_sync_replica_enabled = @@GLOBAL.rpl_semi_sync_replica_enabled;
SET GLOBAL rpl_semi_sync_replica_enabled = ON;
source include/start_slave.inc;
SHOW GLOBAL VARIABLES LIKE 'rpl_semi_sync_replica_enabled';
SHOW STATUS LIKE 'Rpl_semi_sync_replica_status';
SHOW REPLICA STATUS;

--echo
--echo # create some data.
connection master;
SHOW STATUS LIKE 'Rpl_semi_sync%';
CREATE DATABASE tt;
USE tt;
CREATE TABLE t1(id int);
INSERT INTO t1 VALUES(0);
INSERT INTO t1 VALUES(1);
INSERT INTO t1 VALUES(2);
SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo
--echo # flush binary logs so that binlog reaches master-bin.1000000.
FLUSH BINARY LOGS;
SHOW MASTER STATUS;
SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo
--echo # semi-sync aborts due to wrong transaction binlog position comparison
INSERT INTO t1 VALUES(3);
INSERT INTO t1 VALUES(4);
SHOW STATUS LIKE 'Rpl_semi_sync%';

--echo
--echo # clean up
DROP TABLE t1;
DROP DATABASE tt;
source include/sync_slave_sql_with_master.inc;

connection slave;
source include/stop_slave.inc;
SET GLOBAL rpl_semi_sync_replica_enabled = @old_rpl_semi_sync_replica_enabled;
source include/uninstall_semisync_replica.inc;
source include/start_slave.inc;

connection master;
SET GLOBAL rpl_semi_sync_source_enabled = @old_rpl_semi_sync_source_enabled;
SET GLOBAL rpl_semi_sync_source_timeout = @old_rpl_semi_sync_source_timeout;
source include/uninstall_semisync_source.inc;

source include/rpl_end.inc;

semisync-master.opt
===================
$SEMISYNC_PLUGIN_OPT

semisync-slave.opt
==================
$SEMISYNC_PLUGIN_OPT
[31 Jan 12:58] MySQL Verification Team
Hello karry zhang,

Thank you for the report and feedback.

regards,
Umesh
[7 Feb 1:58] karry zhang
bugfix patch

(*) I confirm the code being submitted is offered under the terms of the OCA, and that I am authorized to contribute it.

Contribution: semisync_bugfix.patch (application/octet-stream, text), 922 bytes.

[7 Feb 8:24] MySQL Verification Team
Hello karry zhang,

Thank you for your contribution.

regards,
Umesh