Bug #104690 transaction cannot commit in AFTER consistencty mode when have unreachable node
Submitted: 23 Aug 2021 10:47 Modified: 25 Aug 2021 0:30
Reporter: phoenix Zhang (OCA) Email Updates:
Status: Verified Impact on me:
None 
Category:MySQL Server: Group Replication Severity:S3 (Non-critical)
Version:8.0.25, 8.0.26 OS:Any
Assigned to: CPU Architecture:Any
Tags: group_replication

[23 Aug 2021 10:47] phoenix Zhang
Description:
When in AFTER consistency mode, it one node become UNREACHABLE, transaction cannot commit success until the unreachable expel from group_replication cluster.

In this behavior, any backup node failover, the cluster will have some time that cannot provide RW service, which maybe an accient in real database system.

How to repeat:
first, add below cnf and test file.

$ cat mysql-test/suite/group_replication/t/gr_after_hang.cnf 
!include ../my.cnf

[mysqld.1]

[mysqld.2]

[mysqld.3]

[ENV]
SERVER_MYPORT_3=		@mysqld.3.port
SERVER_MYSOCK_3=		@mysqld.3.socket

$ cat mysql-test/suite/group_replication/t/gr_after_hang.test
--source include/have_group_replication_plugin.inc
--let $rpl_skip_group_replication_start= 1
--let $rpl_server_count= 3
--let $rpl_group_replication_single_primary_mode=1
--source include/group_replication.inc

--let $rpl_connection_name= server1
--source include/rpl_connection.inc
SET GLOBAL group_replication_member_expel_timeout=30;
SET GLOBAL group_replication_consistency='AFTER';
--source include/start_and_bootstrap_group_replication.inc
--let $rpl_connection_name= server2
--source include/rpl_connection.inc
SET GLOBAL group_replication_member_expel_timeout=30;
SET GLOBAL group_replication_consistency='AFTER';
--source include/start_group_replication.inc
--let $rpl_connection_name= server3
--source include/rpl_connection.inc
SET GLOBAL group_replication_member_expel_timeout=30;
SET GLOBAL group_replication_consistency='AFTER';
--source include/start_group_replication.inc

--let $rpl_connection_name= server1
--source include/rpl_connection.inc
SELECT * FROM performance_schema.replication_group_members;
CREATE TABLE t1 (a INT PRIMARY KEY , b INT);
--exec date
INSERT INTO t1 VALUES (1,1);

--let $rpl_connection_name= server3
--source include/rpl_connection.inc
SET PERSIST group_replication_local_address=@@group_replication_local_address;
--echo [server3 shutdown]
--let $_server_id= `SELECT @@server_id`
--let $expect_file_name= $MYSQLTEST_VARDIR/tmp/mysqld.$_server_id.expect
--let $mysqld_pid_file=`SELECT @@GLOBAL.pid_file`
--source include/expect_crash.inc
exec kill -9 `cat $mysqld_pid_file`;

connect (conn1,127.0.0.1,root,,test,$SERVER_MYPORT_1,);
connect (conn2,127.0.0.1,root,,test,$SERVER_MYPORT_1,);
connection conn1;
--echo [connect conn1]
--let $wait_condition=SELECT COUNT(*)=1 FROM performance_schema.replication_group_members WHERE MEMBER_STATE='UNREACHABLE'
--source include/wait_condition.inc
SELECT * FROM performance_schema.replication_group_members;
send INSERT INTO t1 VALUES (2,2);
sleep 5;

connection conn2;
--echo [connect conn2]
show processlist;

connection conn1;
--echo [connect conn1]
reap;
--exec date

And then run the testcase with command:
mysql-test/mtr gr_after_hang --nocheck-testcases

==============================================================================
                  TEST NAME                       RESULT  TIME (ms) COMMENT
------------------------------------------------------------------------------
include/group_replication.inc [rpl_server_count=3]
Warnings:
Note	####	Sending passwords in plain text without SSL/TLS is extremely insecure.
Note	####	Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
[connection server1]
[connection server1]
SET GLOBAL group_replication_member_expel_timeout=30;
SET GLOBAL group_replication_consistency='AFTER';
include/start_and_bootstrap_group_replication.inc
[connection server2]
SET GLOBAL group_replication_member_expel_timeout=30;
SET GLOBAL group_replication_consistency='AFTER';
include/start_group_replication.inc
[connection server3]
SET GLOBAL group_replication_member_expel_timeout=30;
SET GLOBAL group_replication_consistency='AFTER';
include/start_group_replication.inc
[connection server1]
SELECT * FROM performance_schema.replication_group_members;
CHANNEL_NAME	MEMBER_ID	MEMBER_HOST	MEMBER_PORT	MEMBER_STATE	MEMBER_ROLE	MEMBER_VERSION
group_replication_applier	2c4f2d31-03f7-11ec-9dec-c8f7507e5048	127.0.0.1	13002	ONLINE	SECONDARY	8.0.25
group_replication_applier	2c89f332-03f7-11ec-abbe-c8f7507e5048	127.0.0.1	13004	ONLINE	SECONDARY	8.0.25
group_replication_applier	2c963be8-03f7-11ec-ba4a-c8f7507e5048	127.0.0.1	13000	ONLINE	PRIMARY	8.0.25
CREATE TABLE t1 (a INT PRIMARY KEY , b INT);
Mon Aug 23 12:47:59 GMT 2021
INSERT INTO t1 VALUES (1,1);
[connection server3]
SET PERSIST group_replication_local_address=@@group_replication_local_address;
[server3 shutdown]
[connect conn1]
SELECT * FROM performance_schema.replication_group_members;
CHANNEL_NAME	MEMBER_ID	MEMBER_HOST	MEMBER_PORT	MEMBER_STATE	MEMBER_ROLE	MEMBER_VERSION
group_replication_applier	2c4f2d31-03f7-11ec-9dec-c8f7507e5048	127.0.0.1	13002	ONLINE	SECONDARY	8.0.25
group_replication_applier	2c89f332-03f7-11ec-abbe-c8f7507e5048	127.0.0.1	13004	UNREACHABLE	SECONDARY	8.0.25
group_replication_applier	2c963be8-03f7-11ec-ba4a-c8f7507e5048	127.0.0.1	13000	ONLINE	PRIMARY	8.0.25
INSERT INTO t1 VALUES (2,2);
[connect conn2]
show processlist;
Id	User	Host	db	Command	Time	State	Info	Time_ms	Rows_sent	Rows_examined
5	event_scheduler	localhost	NULL	Daemon	19	Waiting on empty queue	NULL	18855	0	0
8	root	localhost	test	Sleep	19		NULL	18718	1	1
10	root	localhost:43354	test	Sleep	19		NULL	18392	1	1
11	root	localhost:43356	test	Sleep	19		NULL	18531	1	1
12	root	localhost:43358	test	Sleep	8		NULL	7570	0	0
13	root	localhost:43364	test	Sleep	19		NULL	18207	1	1
14	root	localhost:43366	test	Sleep	19		NULL	18402	1	1
17	system user		NULL	Connect	18	waiting for handler commit	Group replication applier module	18200	0	0
20	system user		NULL	Query	18	Slave has read all relay log; waiting for more updates	NULL	8968	0	0
33	root	localhost:43416	test	Query	5	waiting for handler commit	INSERT INTO t1 VALUES (2,2)	5001	0	0
34	root	localhost:43418	test	Query	0	init	show processlist	0	0	0
[connect conn1]
Mon Aug 23 12:48:33 GMT 2021
[24 Aug 2021 9:49] MySQL Verification Team
Hello phoenix Zhang!

Thank you for the report and feedback.

regards,
Umesh
[25 Aug 2021 0:30] phoenix Zhang
I check the worklog of consistency, https://dev.mysql.com/worklog/task/?id=10379

it ways:
FR-08: When group_replication_consistency=AFTER or BEFORE_AND_AFTER,
       if there are unreachable members but the group still has a
       reachable majority, the transaction will wait until that
       members are reachable or leave the group.

If so, such behavior work as expected