diff --git a/mysql-test/suite/rpl/r/non_xid_slave_hang.result b/mysql-test/suite/rpl/r/non_xid_slave_hang.result new file mode 100644 index 0000000..7cf2fd0 --- /dev/null +++ b/mysql-test/suite/rpl/r/non_xid_slave_hang.result @@ -0,0 +1,27 @@ +include/master-slave.inc +[connection master] +set @parallel= @@global.slave_parallel_workers; +stop slave; +set global debug="+d,skip_xid_log_event"; +set global slave_parallel_workers=8; +set global slave_pr_mode='schema'; +start slave; +set binlog_format= statement; +create table test.t(id int); +begin; +insert into test.t values(1); +create temporary table test.t_tmp(id int); +commit; +set binlog_format= row; +create table test.a(id int) engine=INNODB; +insert into test.a values(1); +flush logs; +include/wait_for_slave_sql_error.inc [errno=1755] +include/stop_slave.inc +drop table test.a; +drop table test.t; +drop table test.a; +drop table test.t; +set global debug="-d,skip_xid_log_event"; +reset slave; +set global slave_parallel_workers= @parallel; diff --git a/mysql-test/suite/rpl/t/non_xid_slave_hang.test b/mysql-test/suite/rpl/t/non_xid_slave_hang.test new file mode 100644 index 0000000..a45f4e7 --- /dev/null +++ b/mysql-test/suite/rpl/t/non_xid_slave_hang.test @@ -0,0 +1,51 @@ +# Transactions without an Xid_log_event may cause parallel replication to stall. 
+# rb://93437 +# bug://73066,bug://72794 + +--disable_warnings + +--source include/have_debug.inc +--source include/master-slave.inc +--source include/have_binlog_format_row.inc + +--disable_query_log +call mtr.add_suppression("Slave:*"); +--enable_query_log + +connection slave; +set @parallel= @@global.slave_parallel_workers; + +stop slave; +set global debug="+d,skip_xid_log_event"; +set global slave_parallel_workers=8; +set global slave_pr_mode='schema'; +start slave; + +connection master; +set binlog_format= statement; +create table test.t(id int); +begin; +insert into test.t values(1); +create temporary table test.t_tmp(id int); +commit; + +set binlog_format= row; +create table test.a(id int) engine=INNODB; +insert into test.a values(1); +flush logs; + +connection slave; +--let $slave_sql_errno= 1755 +--source include/wait_for_slave_sql_error.inc +source include/stop_slave.inc; + +connection master; +drop table test.a; +drop table test.t; + +connection slave; +drop table test.a; +drop table test.t; +set global debug="-d,skip_xid_log_event"; +reset slave; +set global slave_parallel_workers= @parallel; diff --git a/sql/log_event.cc b/sql/log_event.cc index d8df3a1..467753c 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -2685,6 +2685,19 @@ Slave_worker *Log_event::get_slave_worker(Relay_log_info *rli) #endif Slave_committed_queue *gaq= rli->gaq; + if ((!rli->curr_group_seen_xid) && starts_group()) + { + char llbuff[22]; + llstr(rli->get_event_relay_log_pos(), llbuff); + my_error(ER_MTS_CANT_PARALLEL, MYF(0), + get_type_str(), rli->get_event_relay_log_name(), llbuff, + "Relay log is not complete, transaction delivery error." 
+ " set slave_parallel_workers=0 and start slave to replicate."); + + rli->mts_group_status= Relay_log_info::MTS_KILLED_GROUP; + return NULL; + } + /* checking partioning properties and perform corresponding actions */ // Beginning of a group designated explicitly with BEGIN or GTID @@ -2702,6 +2715,8 @@ Slave_worker *Log_event::get_slave_worker(Relay_log_info *rli) gaq->get_job_group(rli->gaq->assigned_group_index)-> worker_id != MTS_WORKER_UNDEF))) { + if (starts_group()) + rli->curr_group_seen_xid= FALSE; if (!rli->curr_group_seen_gtid && !rli->curr_group_seen_begin) { ulong gaq_idx; @@ -2971,6 +2986,7 @@ Slave_worker *Log_event::get_slave_worker(Relay_log_info *rli) // reclaiming resources allocated during the group scheduling free_root(&rli->mts_coor_mem_root, MYF(MY_KEEP_PREALLOC)); + rli->curr_group_seen_xid= TRUE; #ifndef DBUG_OFF w_rr++; #endif @@ -3060,6 +3076,26 @@ int Log_event::apply_event(Relay_log_info *rli) goto err; } /* + A relay log sequence such as: + BEGIN, Table_map, Update, Rotate (group never terminated) + may hang the slave, so fail with an error before that happens. + */ + if (!rli->curr_group_seen_xid) + { + char llbuff[22]; + llstr(rli->get_event_relay_log_pos(), llbuff); + my_error(ER_MTS_CANT_PARALLEL, MYF(0), + get_type_str(), rli->get_event_relay_log_name(), llbuff, + "Relay log is not complete, transaction delivery error." + " set slave_parallel_workers=0 and start slave to replicate." + ); + + rli->mts_group_status= Relay_log_info::MTS_KILLED_GROUP; + worker= NULL; + goto err; + } + + /* Marking sure the event will be executed in sequential mode. 
*/ if (wait_for_workers_to_finish(rli) == -1) diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h index 77f3592..93167c6 100644 --- a/sql/rpl_rli.h +++ b/sql/rpl_rli.h @@ -507,6 +507,7 @@ public: DYNAMIC_ARRAY curr_group_da; // deferred array to hold partition-info-free events bool curr_group_seen_gtid; // current group started with Gtid-event or not bool curr_group_seen_begin; // current group started with B-event or not + bool curr_group_seen_xid; // current group ended normally or not (cleared at group start, set after the group is scheduled) bool curr_group_isolated; // current group requires execution in isolation bool mts_end_group_sets_max_dbs; // flag indicates if partitioning info is discovered volatile ulong mts_wq_underrun_w_id; // Id of a Worker whose queue is getting empty diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc index 672a974..afb5420 100644 --- a/sql/rpl_slave.cc +++ b/sql/rpl_slave.cc @@ -5325,6 +5325,7 @@ int slave_start_workers(Relay_log_info *rli, ulong n, bool *mts_inited) rli->mts_coordinator_basic_nap= mts_coordinator_basic_nap; rli->mts_worker_underrun_level= mts_worker_underrun_level; rli->curr_group_seen_begin= rli->curr_group_seen_gtid= false; + rli->curr_group_seen_xid= TRUE; rli->curr_group_isolated= FALSE; rli->checkpoint_seqno= 0; rli->mts_last_online_stat= my_time(0); @@ -6376,6 +6377,13 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len) DBUG_RETURN(ret); } + DBUG_EXECUTE_IF("skip_xid_log_event", + if (event_type == XID_EVENT) + { + goto skip_relay_logging; + } + ); + switch (event_type) { case STOP_EVENT: /*