Bug #91830 The GR master node inserts big data and cannot succeed for a long time
Submitted: 30 Jul 2018 11:05    Modified: 22 Mar 2019 8:35
Reporter: yghmgl yang           Status: Closed
Category: MySQL Server: Replication    Severity: S2 (Serious)
Version: 5.7.23                 OS: Any
Assigned to:                    CPU Architecture: Any

[30 Jul 2018 11:05] yghmgl yang
Description:
A large INSERT on the Group Replication (GR) primary node does not complete for a very long time.

How to repeat:
# 400MB.data is a 400 MB file
# The three nodes are on different hosts
# Network bandwidth is 128 MB/s
# If your bandwidth is not 128 MB/s, adjust the size of the data file to reproduce the problem; a file of about 4x the per-second bandwidth is enough.

test case:
create database testdbx;
use testdbx;
create table t1(id int PRIMARY KEY AUTO_INCREMENT,c1 longblob);
insert into t1(c1) values(load_file('/sda/soft/gendata/400MB.data'));
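
Any method of creating the input file works; below is a minimal, hypothetical C helper for generating the 400 MB file used by load_file() above (the path is the one from the test case, and the pseudo-random filler is only there to avoid a trivially compressible payload). Note that load_file() also requires the file to be readable by the server and allowed by secure_file_priv, and that max_allowed_packet must be at least the file size (it is set to 1 GB in the configuration below).

/* gen_400mb.c - hypothetical helper to create the test file used by load_file() */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sda/soft/gendata/400MB.data"; /* path assumed from the test case */
	const size_t total = 400UL * 1024 * 1024;          /* 400 MB */
	static char buf[1 << 20];                          /* 1 MB chunk */
	size_t i, written = 0;
	FILE *f = fopen(path, "wb");

	if (!f) {
		perror("fopen");
		return 1;
	}
	for (i = 0; i < sizeof(buf); i++)
		buf[i] = (char)rand();                     /* pseudo-random filler */
	while (written < total) {
		if (fwrite(buf, 1, sizeof(buf), f) != sizeof(buf)) {
			perror("fwrite");
			fclose(f);
			return 1;
		}
		written += sizeof(buf);
	}
	fclose(f);
	return 0;
}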

After the INSERT statement is executed, it neither succeeds nor fails even after 20 hours of waiting. All three nodes remain ONLINE, and monitoring each node with sar -n DEV 1 shows that a large number of packets are continuously being sent and received.

Suggested fix:
In xcom_base.c, lines 1731~1840:

retry_new:
		/* Find a free slot */

		assert(!synode_eq(current_message, null_synode));
		ep->msgno = current_message;
		while (is_busy(ep->msgno)) {
			while (/* ! ep->client_msg->p->force_delivery &&  */ too_far(incr_msgno(ep->msgno))) { /* Too far ahead of executor */
				TIMED_TASK_WAIT(&exec_wait, 1.0);
				DBGOUT(FN; TIMECEXP(ep->start_propose); TIMECEXP(ep->client_msg->p->a->expiry_time); TIMECEXP(task_now());

				    NDBG(enough_live_nodes(ep->site), d));
			}
			ep->msgno = incr_msgno(ep->msgno);
		}
		assert(!synode_eq(ep->msgno, null_synode));
		proposer_site = find_site_def_rw(ep->msgno);

		ep->site = proposer_site;

		/* See if we can do anything with this message */
#if 1
    	if (!ep->site || get_nodeno(ep->site) == VOID_NODE_NO) {
			/* Give up */
			deliver_to_app(NULL, ep->client_msg->p->a, delivery_failure);
			GOTO(next);
		}
#endif
		DBGOHK(FN; STRLIT("changing current message"));
		set_current_message(ep->msgno);

		brand_client_msg(ep->client_msg->p, ep->msgno);
		ep->client_msg->p->a->lsn = ep->msgno.msgno;

		for(;;) { /* Loop until the client message has been learned */
			/* Get a Paxos instance to send the client message */
			ep->p = get_cache(ep->msgno);
			assert(ep->p);
			if(ep->client_msg->p->force_delivery)
				ep->p->force_delivery = ep->client_msg->p->force_delivery;
			lock = lock_pax_machine(ep->p);
			assert(!lock);

			/* Set the client message as current proposal */
			assert(ep->client_msg->p);
			replace_pax_msg(&ep->p->proposer.msg, clone_pax_msg(ep->client_msg->p));
			assert(ep->p->proposer.msg);
			PAX_MSG_SANITY_CHECK(ep->p->proposer.msg);

			/* Create the prepare message */
			unchecked_replace_pax_msg(&ep->prepare_msg,
				pax_msg_new(ep->msgno, ep->site));
			DBGOUT(FN; PTREXP(ep->client_msg->p->a); STRLIT("pushing "); SYCEXP(ep->msgno));
			MAY_DBG(FN; COPY_AND_FREE_GOUT(dbg_app_data(ep->prepare_msg->a)));

			if(threephase || ep->p->force_delivery){
				push_msg_3p(ep->site, ep->p, ep->prepare_msg, ep->msgno, normal);
			}else{
				push_msg_2p(ep->site, ep->p);
			}

			ep->start_push = task_now();

			while (!finished(ep->p)) { /* Try to get a value accepted */
				/* We will wake up periodically, and whenever a message arrives */
				TIMED_TASK_WAIT(&ep->p->rv, ep->delay = wakeup_delay(ep->delay));
				if (!synode_eq(ep->msgno, ep->p->synode) || ep->p->proposer.msg == NULL) {
					DBGOHK(FN; STRLIT("detected stolen state machine, retry"); );
					/* unlock_pax_machine(ep->p); */
					GOTO(retry_new); /* Need to break out of both loops,
										and we have no "exit named loop" construction */
				}
				assert(synode_eq(ep->msgno, ep->p->synode) &&  ep->p->proposer.msg);
				if (finished(ep->p))
					break;
				 {
					 double	now = task_now();
					if ((ep->start_push + ep->delay) <= now) {
						PAX_MSG_SANITY_CHECK(ep->p->proposer.msg);
						DBGOUT(FN; STRLIT("retry pushing "); SYCEXP(ep->msgno));
						MAY_DBG(FN; COPY_AND_FREE_GOUT(dbg_app_data(ep->prepare_msg->a));
						    );
						DBGOUT(BALCEXP(ep->p->proposer.bal);
						    BALCEXP(ep->p->acceptor.promise));
						MAY_DBG(FN; dbg_reply_set(ep->site, "prep_node_set", ep->p->proposer.prep_nodeset);
						    dbg_reply_set(ep->site, "prop_node_set", ep->p->proposer.prop_nodeset);
						    );
						push_msg_3p(ep->site, ep->p, ep->prepare_msg, ep->msgno, normal);
						ep->start_push = now;
					}
				}
			}
			/* When we get here, we know the value for this message number,
			   but it may not be the value we tried to push,
			   so loop until we have a successfull push. */
			unlock_pax_machine(ep->p);
			MAY_DBG(FN; STRLIT(" found finished message ");
			    SYCEXP(ep->msgno);
			    STRLIT("seconds since last push ");
			    NPUT(task_now() - ep->start_push, f);
			    STRLIT("ep->client_msg ");
			    COPY_AND_FREE_GOUT(dbg_pax_msg(ep->client_msg->p));
			    );
			MAY_DBG(FN; STRLIT("ep->p->learner.msg ");
			    COPY_AND_FREE_GOUT(dbg_pax_msg(ep->p->learner.msg));
			    );
			if (match_my_msg(ep->p->learner.msg, ep->client_msg->p)){
				break;
			} else
				GOTO(retry_new);
		}
		
The primary node initiates a retry (retry_new) before the secondary nodes have finished receiving the data, so the primary keeps retrying and the network bandwidth is almost exhausted. From the logs I printed, retry_new fires about once every 10 seconds, and I feel that 10 seconds is too short for a payload of this size. I could not find a way to configure the retry_new interval, so I was not able to verify this idea.
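
To make the timing concrete, here is a rough back-of-the-envelope sketch using only the numbers in this report (400 MB payload, 128 MB/s links, three nodes, a retry roughly every 10 seconds); the assumption that the primary must serialize a full copy of the payload to each of the two remote peers over a single link is mine, not taken from the source.

/* retry_window.c - hypothetical back-of-the-envelope check of the retry interval */
#include <stdio.h>

int main(void)
{
	const double payload_mb   = 400.0;  /* size of the inserted row (from the test case) */
	const double bandwidth_mb = 128.0;  /* per-link bandwidth in MB/s (from the test case) */
	const int    remote_peers = 2;      /* 3-node group: the primary sends to 2 secondaries */
	const double retry_sec    = 10.0;   /* observed retry_new interval */

	/* Time just to serialize the outgoing copies on the primary's link,
	   ignoring protocol overhead, acks and the accept/learn phases. */
	double min_transfer = payload_mb * remote_peers / bandwidth_mb;

	printf("minimum transfer time: %.2f s (retry window: %.1f s)\n",
	       min_transfer, retry_sec);
	/* ~6.25 s of pure transfer leaves very little of the 10 s window for the
	   rest of the round, and restarting the transfer on every retry would be
	   consistent with the saturated links observed above. */
	return 0;
}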
[30 Jul 2018 11:40] yghmgl yang
my config

s1:

[mysqld]

port=3306
gtid_mode=ON
enforce_gtid_consistency=ON
max_allowed_packet=1073741824

master_info_repository=TABLE
relay_log_info_repository=TABLE
binlog_checksum=NONE
log_slave_updates=ON
log_bin=binlog
binlog_format=ROW
sync_binlog=1
binlog_order_commits=ON
innodb_flush_log_at_trx_commit=1
slave_parallel_type=LOGICAL_CLOCK
log-slave-updates=ON
slave-preserve-commit-order=1
relay_log=relay-log
slave_transaction_retries=10

transaction_write_set_extraction=XXHASH64
loose-group_replication_group_name="aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
loose-group_replication_start_on_boot=off
loose-group_replication_ip_whitelist ='100.180.244.254,100.180.253.162,100.180.46.183'
loose-group_replication_local_address="100.180.253.162:33060"
loose-group_replication_group_seeds="100.180.253.162:33060,100.180.244.254:33060,100.180.46.183:33060"

loose-group_replication_bootstrap_group=off

s2:

[mysqld]

port=3307
gtid_mode=ON
enforce_gtid_consistency=ON
max_allowed_packet=1073741824

master_info_repository=TABLE
relay_log_info_repository=TABLE
binlog_checksum=NONE
log_slave_updates=ON
log_bin=binlog
binlog_format=ROW
sync_binlog=1
binlog_order_commits=ON
innodb_flush_log_at_trx_commit=1
slave_parallel_type=LOGICAL_CLOCK
log-slave-updates=ON
slave-preserve-commit-order=1
relay_log=relay-log
slave_transaction_retries=10

transaction_write_set_extraction=XXHASH64
loose-group_replication_group_name="aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
loose-group_replication_start_on_boot=off
loose-group_replication_ip_whitelist ='100.180.244.254,100.180.253.162,100.180.46.183'
loose-group_replication_local_address="100.180.244.254:33060"
loose-group_replication_group_seeds="100.180.253.162:33060,100.180.244.254:33060,100.180.46.183:33060"
loose-group_replication_bootstrap_group=off

s3:

[mysqld]

port=3306
gtid_mode=ON
enforce_gtid_consistency=ON
max_allowed_packet=1073741824

master_info_repository=TABLE
relay_log_info_repository=TABLE
binlog_checksum=NONE
log_slave_updates=ON
log_bin=binlog
binlog_format=ROW
sync_binlog=1
binlog_order_commits=ON
innodb_flush_log_at_trx_commit=1
slave_parallel_type=LOGICAL_CLOCK
log-slave-updates=ON
slave-preserve-commit-order=1
relay_log=relay-log
slave_transaction_retries=10

transaction_write_set_extraction=XXHASH64
loose-group_replication_group_name="aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
loose-group_replication_start_on_boot=off
loose-group_replication_ip_whitelist ='100.180.244.254,100.180.253.162,100.180.46.183'
loose-group_replication_local_address="100.180.46.183:33060"
loose-group_replication_group_seeds="100.180.253.162:33060,100.180.244.254:33060,100.180.46.183:33060"
loose-group_replication_bootstrap_group=off
[8 Aug 2018 15:34] MySQL Verification Team
Hi,

I reproduced this, but I'm torn between accepting it as a bug and deciding that the platform is improperly sized for your use case. Since you have already provided an idea of how to fix the problem, I'll accept this as a bug and let the dev team decide how to proceed.

Thanks for the work you put into this, and for your report!
kind regards
Bogdan
[12 Aug 2018 11:00] yghmgl yang
Hi, thanks for verifying this.

MIN_CACHED is currently set to 10, that is, the data is kept in the cache for at most about 10 seconds. If MIN_CACHED were set to a larger value, the bug might be avoided. The current large-transaction limit is a 1 GB packet (anything larger than 1 GB reports an error during compression), so MIN_CACHED should be at least long enough for 1 GB of data to be transmitted and a response to be received; of course, transmitting 1 GB within MIN_CACHED seconds also requires sufficient bandwidth.
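
As a rough illustration of that sizing argument (interpreting MIN_CACHED as a number of seconds, as in the comment above; the helper and constants below are assumptions for illustration, not taken from the server source):

/* min_cached_sizing.c - hypothetical sketch of the sizing argument above */
#include <math.h>
#include <stdio.h>

/* Suggested MIN_CACHED value in seconds, assuming the cached message must
   survive at least until the largest allowed packet has been sent to every
   remote peer and acknowledged. */
static double suggested_min_cached(double max_packet_mb, double bandwidth_mb,
                                   int remote_peers, double margin)
{
	double transfer = max_packet_mb * remote_peers / bandwidth_mb;
	return ceil(transfer * margin);
}

int main(void)
{
	/* 1 GB packet limit, 128 MB/s link, 2 remote peers, 2x safety margin */
	printf("suggested MIN_CACHED: %.0f s\n",
	       suggested_min_cached(1024.0, 128.0, 2, 2.0)); /* -> 32 s */
	return 0;
}

With a 2x safety margin this gives roughly 32 seconds for a 1 GB packet on a 128 MB/s link, which matches the intuition above that 10 seconds is too short.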
[22 Mar 2019 8:35] Margaret Fisher
Posted by developer:
 
Fixed in MySQL 8.0.16 by WL #11610. Please see:
https://dev.mysql.com/doc/refman/8.0/en/group-replication-performance-message-fragmentatio...
https://dev.mysql.com/doc/refman/8.0/en/group-replication-communication-protocol.html
https://dev.mysql.com/doc/refman/8.0/en/group-replication-options.html#sysvar_group_replic...
https://dev.mysql.com/doc/refman/8.0/en/group-replication-functions-for-communication-prot...