From e8a7326b4f4e6c1f27617b4f7a7a2edc279bf73d Mon Sep 17 00:00:00 2001 From: Venkatesh Prasad Date: Thu, 17 Aug 2023 20:25:41 +0530 Subject: [PATCH] Bug#110127: semi-sync master wait for ack when semi-sync replica is down & async replaces it https://bugs.mysql.com/bug.php?id=110127 Description ----------- This is a regression introduced in the commit 650d2f7 (Bug#33534218 : replication_sender_observe_commit_only=ON leads to break semisync replication), which caused the before_send_hook and after_send_hook to never be called if the event is in the excluded group. Prior to 8.0.31, the dump thread performed the operations below: 1. Read events from binlog 2. For every event 2.1. Call before send hook 2.2. If the event is skipped 2.2.1. Send heartbeat event if necessary with the latest position 2.2.2. Update the exclude_end_pos to mark the position in binlog till where the replica receiver is synchronized. 2.3. If the event is not skipped 2.3.1. Send heartbeat event if necessary with the last exclude_group_end_pos. 2.3.2. Call send_packet() to send events to replica. 2.4. Call after send hook This way it ensured that before_send and after_send hooks are executed irrespective of whether the event is skipped or not. In 8.0.31 up to 8.0.34, 1. Read events from binlog 2. For every event 2.1. If the event is skipped 2.1.1. Send heartbeat event if necessary with the latest position 2.1.2. Update the exclude_end_pos to mark the position in binlog till where the replica receiver is synchronized. 2.2. If the event is not skipped 2.2.1. Send heartbeat event if necessary with the last exclude_group_end_pos. 2.2.2. Call before send hook 2.2.3. Call send_packet() to send events to replica. 2.2.4. Call after send hook Here, the above upstream commit made the before_send_hook and after_send_hook to be called only when an event is not skipped. 
However, this change caused the client connection thread to wait for the ACK from semi-sync replica even after it was added back as a semi-sync replica, as this change never reads the acknowledgement from the replica (after send hook is not executed). Solution -------- Make the before_send and after_send server hooks to be called even for heartbeat events, preserving the behavior prior to 8.0.31, while also preserving the original bugfix made in the commit 650d2f7, i.e., Observe_transmission_guard is created just before calling before_send_hook, so that reserve_header_hook is always called when sending a heartbeat event and magic number is properly encoded. --- ...mi_sync_source_wait_for_replica_ack.result | 48 +++++++++++ ...ync_source_wait_for_replica_ack-master.opt | 1 + ...sync_source_wait_for_replica_ack-slave.opt | 1 + ...semi_sync_source_wait_for_replica_ack.test | 84 +++++++++++++++++++ sql/rpl_binlog_sender.cc | 6 ++ 5 files changed, 140 insertions(+) create mode 100644 mysql-test/suite/rpl_gtid/r/rpl_semi_sync_source_wait_for_replica_ack.result create mode 100644 mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-master.opt create mode 100644 mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-slave.opt create mode 100644 mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack.test diff --git a/mysql-test/suite/rpl_gtid/r/rpl_semi_sync_source_wait_for_replica_ack.result b/mysql-test/suite/rpl_gtid/r/rpl_semi_sync_source_wait_for_replica_ack.result new file mode 100644 index 00000000000..cca20b0d3d5 --- /dev/null +++ b/mysql-test/suite/rpl_gtid/r/rpl_semi_sync_source_wait_for_replica_ack.result @@ -0,0 +1,48 @@ +include/master-slave.inc +Warnings: +Note #### Sending passwords in plain text without SSL/TLS is extremely insecure. +Note #### Storing MySQL user name or password information in the connection metadata repository is not secure and is therefore not recommended. 
Please consider using the USER and PASSWORD connection options for START REPLICA; see the 'START REPLICA Syntax' in the MySQL Manual for more information. +[connection master] +# +# 1) Setup semisync replication +include/install_semisync.inc +# +# 2) Set the rpl_semi_sync_source_timeout on source to a large value. +[connection master] +SET @saved_rpl_semi_sync_source_timeout = @@GLOBAL.rpl_semi_sync_source_timeout; +SET GLOBAL rpl_semi_sync_source_timeout = 36000000; +# +# 3) Switch the semi-sync replication on replica to async by restarting +# replica threads with rpl_semi_sync_replica_enabled=OFF +[connection slave] +SET GLOBAL rpl_semi_sync_replica_enabled=OFF; +STOP REPLICA; +START REPLICA; +# +# 4) Execute a transaction on source. This should wait for some time for the +# replica's acknowledgement. +[connection master] +CREATE DATABASE test1; +[connection master1] +# +# 5) Switch the async replication on replica to semi-sync by restarting +# replica threads with rpl_semi_sync_replica_enabled=ON. When replica +# threads are started the waiting transaction on source must proceed. +[connection slave] +SET GLOBAL rpl_semi_sync_replica_enabled = ON; +STOP REPLICA; +START REPLICA; +[connection master1] +[connection master] +# +# 6) Verify that the transaction on source server has finished and the +# statement has been replicated. 
+include/rpl_sync.inc +include/rpl_diff.inc +# +# 7) Cleanup +[connection master] +DROP DATABASE test1; +SET GLOBAL rpl_semi_sync_source_timeout = @saved_rpl_semi_sync_source_timeout; +include/uninstall_semisync.inc +include/rpl_end.inc diff --git a/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-master.opt b/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-master.opt new file mode 100644 index 00000000000..58029d28ace --- /dev/null +++ b/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-master.opt @@ -0,0 +1 @@ +$SEMISYNC_PLUGIN_OPT diff --git a/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-slave.opt b/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-slave.opt new file mode 100644 index 00000000000..58029d28ace --- /dev/null +++ b/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack-slave.opt @@ -0,0 +1 @@ +$SEMISYNC_PLUGIN_OPT diff --git a/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack.test b/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack.test new file mode 100644 index 00000000000..035cfba9373 --- /dev/null +++ b/mysql-test/suite/rpl_gtid/t/rpl_semi_sync_source_wait_for_replica_ack.test @@ -0,0 +1,84 @@ +################################################################################ +# PS-8869: semi-sync master waits for ack when semi-sync replica is down & when +# we replace it with async +# +# Steps to reproduce: +# 1) Setup semisync replication +# 2) Set the rpl_semi_sync_source_timeout on source to a large value. +# 3) Switch the semi-sync replication on replica to async by restarting +# replica threads with rpl_semi_sync_replica_enabled=OFF +# 4) Execute a transaction on source. This should wait for some time for the +# replica's acknowledgement. +# 5) Switch the async replication on replica to semi-sync by restarting +# replica threads with rpl_semi_sync_replica_enabled=ON. 
When replica +# threads are started the waiting transaction on source must proceed. +# 6) Verify that the transaction on source server has finished and the +# statement has been replicated. +# 7) Cleanup +################################################################################ + +# Test is independent of Binlog format. One of the three formats is enough +# for testing. Choosing 'Row' format. +--source include/have_binlog_format_row.inc +--source include/not_group_replication_plugin.inc +--source include/master-slave.inc + +--echo # +--echo # 1) Setup semisync replication +--source include/install_semisync.inc + +--echo # +--echo # 2) Set the rpl_semi_sync_source_timeout on source to a large value. +--source include/rpl_connection_master.inc +SET @saved_rpl_semi_sync_source_timeout = @@GLOBAL.rpl_semi_sync_source_timeout; +SET GLOBAL rpl_semi_sync_source_timeout = 36000000; + +--echo # +--echo # 3) Switch the semi-sync replication on replica to async by restarting +--echo # replica threads with rpl_semi_sync_replica_enabled=OFF +--source include/rpl_connection_slave.inc +SET GLOBAL rpl_semi_sync_replica_enabled=OFF; STOP REPLICA; START REPLICA; + +--echo # +--echo # 4) Execute a transaction on source. This should wait for some time for the +--echo # replica's acknowledgement. +--source include/rpl_connection_master.inc +--send CREATE DATABASE test1 + +# Sleep for some time to ensure that the query still keeps waiting for the semi-sync ACK from replica +--sleep 5 +--source include/rpl_connection_master1.inc +--let $wait_condition= SELECT State ="Waiting for semi-sync ACK from replica" FROM INFORMATION_SCHEMA.PROCESSLIST WHERE INFO LIKE "CREATE DATA%" +--source include/wait_condition_or_abort.inc + +--echo # +--echo # 5) Switch the async replication on replica to semi-sync by restarting +--echo # replica threads with rpl_semi_sync_replica_enabled=ON. When replica +--echo # threads are started the waiting transaction on source must proceed. 
+--source include/rpl_connection_slave.inc +SET GLOBAL rpl_semi_sync_replica_enabled = ON; STOP REPLICA; START REPLICA; + +--source include/rpl_connection_master1.inc +--let $wait_condition= SELECT COUNT(*)=1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = "test1" +--source include/wait_condition_or_abort.inc + +--source include/rpl_connection_master.inc +--reap + +--echo # +--echo # 6) Verify that the transaction on source server has finished and the +--echo # statement has been replicated. +--source include/rpl_sync.inc +--let $wait_condition= SELECT COUNT(*)=1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = "test1" +--source include/wait_condition_or_abort.inc + +--let $rpl_diff_statement= SELECT * FROM INFORMATION_SCHEMA.SCHEMATA +--source include/rpl_diff.inc + +--echo # +--echo # 7) Cleanup +--source include/rpl_connection_master.inc +DROP DATABASE test1; +SET GLOBAL rpl_semi_sync_source_timeout = @saved_rpl_semi_sync_source_timeout; +--source include/uninstall_semisync.inc +--source include/rpl_end.inc diff --git a/sql/rpl_binlog_sender.cc b/sql/rpl_binlog_sender.cc index 6fe49bf16d0..084eba5d3a9 100644 --- a/sql/rpl_binlog_sender.cc +++ b/sql/rpl_binlog_sender.cc @@ -625,6 +625,8 @@ int Binlog_sender::send_events(File_reader &reader, my_off_t end_pos) { auto now = now_in_nanosecs(); assert(now >= m_last_event_sent_ts); + if (before_send_hook(log_file, log_pos)) return 1; + // if enough time has elapsed so that we should send another heartbeat if (m_heartbeat_period > std::chrono::nanoseconds(0) && (now - m_last_event_sent_ts) >= m_heartbeat_period) { @@ -633,6 +635,10 @@ int Binlog_sender::send_events(File_reader &reader, my_off_t end_pos) { } else { exclude_group_end_pos = log_pos; } + + if (unlikely(after_send_hook(log_file, in_exclude_group ? log_pos : 0))) + return 1; + DBUG_PRINT("info", ("Event of type %s is skipped", Log_event::get_type_str(event_type))); } else { -- 2.34.1