MySQL Bugs: #72794: Relay log without xid_log_event may case parallel replication hang

Bug #72794	Relay log without xid_log_event may case parallel replication hang
Submitted:	29 May 2014 4:02	Modified:	29 Oct 2014 13:32
Reporter:	qinglin zhang (OCA)	Email Updates:
Status:	Closed	Impact on me:	None
Category:	MySQL Server: Replication	Severity:	S2 (Serious)
Version:	5.6.15, 5.6.16	OS:	Any
Assigned to:		CPU Architecture:	Any

Description:
Description:
Relay log may not include xid_log_event when slave crash sometimes, when slave started and start slave again , it will continue consume Transactions without xid_log_event in the relay log. 
bug when a Transaction in relay log don't have xid_log_event and slave_parallel_workers > 0 ,it will cause two problems.
First. the Transaction in Relay_log_info::gaq will not move_queue_ahead, then all relay log thread will sleep all the time.
Second. when you run stop slave, the entry::usage in the map mapping_db_to_worker will always bigger than 0, and the workers will wait eternally.

the pt-pmp lie bellow:

     18 libaio::??,os_aio_linux_collect,os_aio_linux_handle,fil_aio_wait,io_handler_thread,start_thread,clone
     16 pthread_cond_wait,inline_mysql_cond_wait,pop_jobs_item,slave_worker_exec_job,handle_slave_worker,start_thread,clone
      3 pthread_cond_wait,os_cond_wait,os_event_wait_low,srv_worker_thread,start_thread,clone
      1 sigwait,signal_hand,start_thread,clone
      1 sigwaitinfo,timer_notify_thread,start_thread,clone
      1 select,os_thread_sleep,srv_master_sleep,srv_master_thread,start_thread,clone
      1 select,os_thread_sleep,page_cleaner_sleep_if_needed,buf_flush_page_cleaner_thread,start_thread,clone
      1 pthread_cond_wait,os_cond_wait,os_event_wait_low,srv_purge_coordinator_suspend,srv_purge_coordinator_thread,start_thread,clone
      1 pthread_cond_wait,os_cond_wait,os_event_wait_low,buf_dump_thread,start_thread,clone
      1 pthread_cond_wait,inline_mysql_cond_wait,wait_for_workers_to_finish,slave_stop_workers,handle_slave_sql,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,srv_monitor_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,srv_error_monitor_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,lock_wait_timeout_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,ib_wqueue_timedwait,fts_optimize_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,dict_stats_thread,start_thread,clone
      1 pthread_cond_timedwait,inline_mysql_cond_timedwait,slave_sleep<bool,connect_to_master,safe_connect,handle_slave_io,start_thread,clone
      1 poll,vio_io_wait,vio_socket_io_wait,vio_read,net_read_raw_loop,net_read_packet_header,net_read_packet,my_net_read,do_command,do_handle_one_connection,handle_one_connection,start_thread,clone
      1 poll,handle_connections_sockets,mysqld_main,__libc_start_main,_start

How to repeat:
diff --git a/mysql-test/suite/rpl/t/non_xid_slave_hang.test b/mysql-test/suite/rpl/t/non_xid_slave_hang.test                 
new file mode 100644
index 0000000..eeefd61
--- /dev/null
+++ b/mysql-test/suite/rpl/t/non_xid_slave_hang.test
@@ -0,0 +1,36 @@
+# rb://93437
+
+--source include/have_debug.inc
+--source include/master-slave.inc
+--source include/have_binlog_format_row.inc
+
+--disable_query_log
+call mtr.add_suppression("Slave:*");
+--enable_query_log
+
+connection slave;
+stop slave;
+set global debug="d,skip_xid_log_event";
+set global slave_parallel_workers=8;
+select sleep(30);
+start  slave;
+
+set global debug="d,skip_xid_log_event";
+
+connection master;
+create table test.a(id int) engine=INNODB;
+insert into test.a values(1);
+sync_slave_with_master;
+
+
+connection slave;
+source include/stop_slave.inc;
+RESET SLAVE;
+
+
+connection slave;
+drop table test.a;
+
+connection master;
+drop table test.a;
diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc
index 5f08321..ed489a8 100644
--- a/sql/rpl_slave.cc
+++ b/sql/rpl_slave.cc
@@ -6373,6 +6373,13 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
     DBUG_RETURN(ret);
   }
 
+  DBUG_EXECUTE_IF("skip_xid_log_event",
+    if (event_type == XID_EVENT)
+    {
+      goto skip_relay_logging;
+    }
+  );
+
   switch (event_type) {
   case STOP_EVENT:
     /*

Suggested fix:
diff --git a/sql/log_event.cc b/sql/log_event.cc
index d8df3a1..4ea992e 100644
--- a/sql/log_event.cc
+++ b/sql/log_event.cc
@@ -2704,6 +2704,7 @@ Slave_worker *Log_event::get_slave_worker(Relay_log_info *rli)
   {
     if (!rli->curr_group_seen_gtid && !rli->curr_group_seen_begin)
     {
+      rli->last_group_end_normal= false;
       ulong gaq_idx;
       rli->mts_groups_assigned++;
       // free memory used by relay thread
@@ -2742,6 +2743,21 @@ Slave_worker *Log_event::get_slave_worker(Relay_log_info *rli)
         return ret_worker;
       }
     }
+    else if ((starts_group() || is_gtid_event(this)) &&
+             (!rli->last_group_end_normal))
+    {
+      /** transactions of dml don't have xid_log_event */
+      ptr_group= gaq->get_job_group(rli->gaq->assigned_group_index);
+
+      if(ptr_group->worker)
+        ptr_group->worker->clear_job_group_entry_info(rli);
+
+      llstr(rli->get_event_relay_log_pos(), llbuff);
+      my_error(ER_MTS_CANT_PARALLEL, MYF(0),
+               get_type_str(), rli->get_event_relay_log_name(), llbuff,
+               "could not distribute the event to a Worker");
+      return ptr_group->worker;
+    }
     else
     {
       Log_event *ptr_curr_ev= this;
@@ -2970,7 +2986,7 @@ Slave_worker *Log_event::get_slave_worker(Relay_log_info *rli)
 
     // reclaiming resources allocated during the group scheduling
     free_root(&rli->mts_coor_mem_root, MYF(MY_KEEP_PREALLOC));
-
+    rli->last_group_end_normal= true;
 #ifndef DBUG_OFF
     w_rr++;
 #endif
diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
index 77f3592..719b588 100644
--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
@@ -507,6 +507,7 @@ public:
   DYNAMIC_ARRAY curr_group_da;  // deferred array to hold partition-info-free events
   bool curr_group_seen_gtid;   // current group started with Gtid-event or not
   bool curr_group_seen_begin;   // current group started with B-event or not
+  bool last_group_end_normal;    // last transaction end normally flag
   bool curr_group_isolated;     // current group requires execution in isolation
   bool mts_end_group_sets_max_dbs; // flag indicates if partitioning info is discovered
   volatile ulong mts_wq_underrun_w_id;  // Id of a Worker whose queue is getting empty
diff --git a/sql/rpl_rli_pdb.h b/sql/rpl_rli_pdb.h
index 221ab4e..29895d1 100644
--- a/sql/rpl_rli_pdb.h
+++ b/sql/rpl_rli_pdb.h
@@ -384,6 +384,7 @@ public:
   ulonglong set_master_log_pos(ulong val) { return master_log_pos= val; };
   bool commit_positions(Log_event *evt, Slave_job_group *ptr_g, bool force);
   bool reset_recovery_info();
+  void clear_job_group_entry_info(Relay_log_info*);
   /**
      Different from the parent method in that this does not delete
      rli_description_event.
diff --git a/sql/rpl_rli_pdb.cc b/sql/rpl_rli_pdb.cc
index 8e5c331..cfd06af 100644
--- a/sql/rpl_rli_pdb.cc
+++ b/sql/rpl_rli_pdb.cc
@@ -570,6 +570,22 @@ bool init_hash_workers(ulong slave_parallel_workers)
   DBUG_RETURN (!inited_hash_workers);
 }
 
+/** Clear entry info binded with the current transaction */
+void Slave_worker::clear_job_group_entry_info(Relay_log_info* rli)
+{
+  db_worker_hash_entry *entry= NULL;
+  mysql_mutex_lock(&slave_worker_hash_lock);
+  for (uint i =0; i< rli->curr_group_assigned_parts.elements; i++)
+  {
+    entry= * (db_worker_hash_entry **)
+      dynamic_array_ptr(&rli->curr_group_assigned_parts, i);
+
+    entry->usage--;
+    mysql_cond_signal(&slave_worker_hash_cond);
+  }
+  mysql_mutex_unlock(&slave_worker_hash_lock);
+}
+
 void destroy_hash_workers(Relay_log_info *rli)
 {
   DBUG_ENTER("destroy_hash_workers");
diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc
index 5f08321..b6a0298 100644
--- a/sql/rpl_slave.cc
+++ b/sql/rpl_slave.cc
@@ -5322,6 +5322,7 @@ int slave_start_workers(Relay_log_info *rli, ulong n, bool *mts_inited)
   rli->mts_coordinator_basic_nap= mts_coordinator_basic_nap;
   rli->mts_worker_underrun_level= mts_worker_underrun_level;
   rli->curr_group_seen_begin= rli->curr_group_seen_gtid= false;
+  rli->last_group_end_normal= true;
   rli->curr_group_isolated= FALSE;
   rli->checkpoint_seqno= 0;
   rli->mts_last_online_stat= my_time(0);

remove sync_slave_with master, and you will have the pt-pmp like this:

Thu May 29 12:20:30 CST 2014
      8 pthread_cond_wait,safe_cond_wait,inline_mysql_cond_wait,pop_jobs_item,slave_worker_exec_job,handle_slave_worker,pfs_spawn_thread,start_thread,clone
      6 libaio::??,os_aio_linux_collect,os_aio_linux_handle,fil_aio_wait,io_handler_thread,start_thread,clone
      3 poll,vio_io_wait,vio_socket_io_wait,vio_read,net_read_raw_loop,net_read_packet_header,net_read_packet,my_net_read,do_command,do_handle_one_connection,handle_one_connection,pfs_spawn_thread,start_thread,clone
      1 sigwait,signal_hand,pfs_spawn_thread,start_thread,clone
      1 sigwaitinfo,timer_notify_thread,start_thread,clone
      1 select,os_thread_sleep,srv_master_sleep,srv_master_thread,start_thread,clone
      1 select,os_thread_sleep,page_cleaner_sleep_if_needed,buf_flush_page_cleaner_thread,start_thread,clone
      1 pthread_cond_wait,os_cond_wait,os_event_wait_low,srv_purge_coordinator_suspend,srv_purge_coordinator_thread,start_thread,clone
      1 pthread_cond_wait,os_cond_wait,os_event_wait_low,buf_dump_thread,start_thread,clone
      1 pthread_cond_timedwait,safe_cond_timedwait,inline_mysql_cond_timedwait,terminate_slave_thread,terminate_slave_threads,stop_slave,mysql_execute_command,mysql_parse,dispatch_command,do_command,do_handle_one_connection,handle_one_connection,pfs_spawn_thread,start_thread,clone
      1 pthread_cond_timedwait,safe_cond_timedwait,inline_mysql_cond_timedwait,MYSQL_BIN_LOG::wait_for_update_relay_log,next_event,exec_relay_log_event,handle_slave_sql,pfs_spawn_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,srv_monitor_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,srv_error_monitor_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,lock_wait_timeout_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,ib_wqueue_timedwait,fts_optimize_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,dict_stats_thread,start_thread,clone
      1 poll,vio_io_wait,vio_socket_io_wait,vio_read,vio_read_buff,net_read_raw_loop,net_read_packet_header,net_read_packet,my_net_read,cli_safe_read,read_event,handle_slave_io,pfs_spawn_thread,start_thread,clone
      1 poll,handle_connections_sockets,mysqld_main,main

remove sync_slave_with master, and you will have the pt-pmp like this:

Thu May 29 12:20:30 CST 2014
      8 pthread_cond_wait,safe_cond_wait,inline_mysql_cond_wait,pop_jobs_item,slave_worker_exec_job,handle_slave_worker,pfs_spawn_thread,start_thread,clone
      6 libaio::??,os_aio_linux_collect,os_aio_linux_handle,fil_aio_wait,io_handler_thread,start_thread,clone
      3 poll,vio_io_wait,vio_socket_io_wait,vio_read,net_read_raw_loop,net_read_packet_header,net_read_packet,my_net_read,do_command,do_handle_one_connection,handle_one_connection,pfs_spawn_thread,start_thread,clone
      1 sigwait,signal_hand,pfs_spawn_thread,start_thread,clone
      1 sigwaitinfo,timer_notify_thread,start_thread,clone
      1 select,os_thread_sleep,srv_master_sleep,srv_master_thread,start_thread,clone
      1 select,os_thread_sleep,page_cleaner_sleep_if_needed,buf_flush_page_cleaner_thread,start_thread,clone
      1 pthread_cond_wait,os_cond_wait,os_event_wait_low,srv_purge_coordinator_suspend,srv_purge_coordinator_thread,start_thread,clone
      1 pthread_cond_wait,os_cond_wait,os_event_wait_low,buf_dump_thread,start_thread,clone
      1 pthread_cond_timedwait,safe_cond_timedwait,inline_mysql_cond_timedwait,terminate_slave_thread,terminate_slave_threads,stop_slave,mysql_execute_command,mysql_parse,dispatch_command,do_command,do_handle_one_connection,handle_one_connection,pfs_spawn_thread,start_thread,clone
      1 pthread_cond_timedwait,safe_cond_timedwait,inline_mysql_cond_timedwait,MYSQL_BIN_LOG::wait_for_update_relay_log,next_event,exec_relay_log_event,handle_slave_sql,pfs_spawn_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,srv_monitor_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,srv_error_monitor_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,lock_wait_timeout_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,ib_wqueue_timedwait,fts_optimize_thread,start_thread,clone
      1 pthread_cond_timedwait,os_cond_wait_timed,os_event_wait_time_low,dict_stats_thread,start_thread,clone
      1 poll,vio_io_wait,vio_socket_io_wait,vio_read,vio_read_buff,net_read_raw_loop,net_read_packet_header,net_read_packet,my_net_read,cli_safe_read,read_event,handle_slave_io,pfs_spawn_thread,start_thread,clone
      1 poll,handle_connections_sockets,mysqld_main,main

There should be a flush logs after insert at master, so it should be like this:
----
connection master;
create table test.a(id int) engine=INNODB;
insert into test.a values(1);
flush logs;
----

Hello Zhang,

Thank you for the bug report and test case.
Verified as described.

Thanks,
Umesh

// pt-pmp

 bin/pt-pmp --pid=1581
Mon Jun  2 16:45:56 IST 2014
      6 libaio::??(libaio.so.1),os_aio_linux_collect(os0file.cc:4918),os_aio_linux_handle(os0file.cc:5062),fil_aio_wait(fil0fil.cc:5568),io_handler_thread(srv0start.cc:480),start_thread(libpthread.so.0),clone(libc.so.6)
      5 poll(libc.so.6),vio_io_wait(viosocket.c:771),vio_socket_io_wait(viosocket.c:68),vio_read(viosocket.c:123),net_read_raw_loop(net_serv.cc:669),net_read_packet_header(net_serv.cc:751),net_read_packet(net_serv.cc:816),my_net_read(net_serv.cc:894),do_command(sql_parse.cc:970),do_handle_one_connection(sql_connect.cc:982),handle_one_connection(sql_connect.cc:898),pfs_spawn_thread(pfs.cc:1858),start_thread(libpthread.so.0),clone(libc.so.6)
      1 sigwait(libpthread.so.0),signal_hand(mysqld.cc:3179),pfs_spawn_thread(pfs.cc:1858),start_thread(libpthread.so.0),clone(libc.so.6)
      1 select(libc.so.6),os_thread_sleep(os0thread.cc:258),srv_master_sleep(srv0srv.cc:2294),srv_master_thread(srv0srv.cc:2338),start_thread(libpthread.so.0),clone(libc.so.6)
      1 select(libc.so.6),os_thread_sleep(os0thread.cc:258),page_cleaner_sleep_if_needed(buf0flu.cc:2379),buf_flush_page_cleaner_thread(buf0flu.cc:2420),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_wait,os_cond_wait(os0sync.cc:214),os_event_wait_low(os0sync.cc:610),srv_purge_coordinator_suspend(srv0srv.cc:2631),srv_purge_coordinator_thread(srv0srv.cc:2748),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_wait,os_cond_wait(os0sync.cc:214),os_event_wait_low(os0sync.cc:610),buf_dump_thread(buf0dump.cc:594),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_wait,inline_mysql_cond_wait(mysql_thread.h:1151),block_until_new_connection(mysqld.cc:2662),one_thread_per_connection_end(mysqld.cc:2763),do_handle_one_connection(sql_connect.cc:989),handle_one_connection(sql_connect.cc:898),pfs_spawn_thread(pfs.cc:1858),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_timedwait,os_cond_wait_timed(os0sync.cc:177),os_event_wait_time_low(os0sync.cc:723),srv_monitor_thread(srv0srv.cc:1532),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_timedwait,os_cond_wait_timed(os0sync.cc:177),os_event_wait_time_low(os0sync.cc:723),srv_error_monitor_thread(srv0srv.cc:1763),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_timedwait,os_cond_wait_timed(os0sync.cc:177),os_event_wait_time_low(os0sync.cc:723),lock_wait_timeout_thread(lock0wait.cc:503),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_timedwait,os_cond_wait_timed(os0sync.cc:177),os_event_wait_time_low(os0sync.cc:723),ib_wqueue_timedwait(ut0wqueue.cc:154),fts_optimize_thread(fts0opt.cc:2994),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_timedwait,os_cond_wait_timed(os0sync.cc:177),os_event_wait_time_low(os0sync.cc:723),dict_stats_thread(dict0stats_bg.cc:349),start_thread(libpthread.so.0),clone(libc.so.6)
      1 pthread_cond_timedwait,inline_mysql_cond_timedwait(mysql_thread.h:1188),MYSQL_BIN_LOG::wait_for_update_bin_log(binlog.cc:5838),mysql_binlog_send(rpl_master.cc:1531),com_binlog_dump(rpl_master.cc:761),dispatch_command(sql_parse.cc:1546),do_command(sql_parse.cc:1036),do_handle_one_connection(sql_connect.cc:982),handle_one_connection(sql_connect.cc:898),pfs_spawn_thread(pfs.cc:1858),start_thread(libpthread.so.0),clone(libc.so.6)
      1 poll(libc.so.6),handle_connections_sockets(mysqld.cc:6232),mysqld_main(mysqld.cc:5651),main(main.cc:25)

Hi, thank you for your quick response. a simple patch is flush relay log by transactions.

Copy/Paste for dev's:

mtr test case:

mysql-5.6.15]# more mysql-test/suite/rpl/t/non_xid_slave_hang.test

--source include/have_debug.inc
--source include/master-slave.inc
--source include/have_binlog_format_row.inc

--disable_query_log
call mtr.add_suppression("Slave:*");
--enable_query_log

connection slave;
stop slave;
set global debug="d,skip_xid_log_event";
set global slave_parallel_workers=8;
select sleep(30);
start  slave;

set global debug="d,skip_xid_log_event";

connection master;
create table test.a(id int) engine=INNODB;
insert into test.a values(1);
flush logs;
sync_slave_with_master;

connection slave;
source include/stop_slave.inc;
RESET SLAVE;

connection slave;
drop table test.a;

connection master;
drop table test.a;

// Usage

cd /path/to/source_dir/mysql-test
./mysql-test-run.pl --debug non_xid_slave_hang 

// Use pt-pmp or any of your convenient tool to get traces..

Relay log without xid_log_event solution

Attachment: slave_hang_solution.patch (application/octet-stream, text), 6.39 KiB.

Hello qinglin zhang,

In single thread replication mode, sql_thread rollbacks the partial transaction without XID when seeing the Format Description Event (FDE) in the relay log. But the same is not done by slave parallel workers in parallel replication.

We have fixed this issue our Facebook branch. You can find the fix here https://github.com/facebook/mysql-5.6/commit/9bcc118bee733a53e871867443d611e0606ee750.

Please find the information in the summary it is a port of upstream fix with changes
"This diff ports Oracles's patch for missing big transaction
with GTIDs. This diff also includes fix for missing big transaction
with GTIDs when parallel replication is enabled."

Thank you for your feedback. This issue was covered in bug 17326020 and the following was added to the MySQL 5.6.21 and 5.7.5 changelogs:

        When the I/O thread reconnected to a master using GTIDs and
        multithreaded slaves while in the middle of a transaction, it
        failed to abort the transaction, leaving a partial transaction
        in the relay log, and then retrieving the same transaction
        again. This occurred when performing a rotation of the relay
        log. Now when reconnecting, the server checks before rotating
        the log in such cases, and waits first for any ongoing
        transaction to complete.

Therefore nothing new will be added to cover this bug and I'm closing it as fixed.

$ bzr log -r 6210
------------------------------------------------------------
revno: 6210
committer: Joao Gramacho <joao.gramacho@oracle.com>
branch nick: mysql-5.6
timestamp: Mon 2014-10-20 08:24:05 +0100
message:
  Bug#18885916 RELAY LOG WITHOUT XID_LOG_EVENT MAY CASE PARALLEL
               REPLICATION HANG
  
  Problem:
  =======
  
  With MTS, GTIDs and auto positioning enabled, when a worker applies a
  partial transaction left on relaylog by an IO thread reconnection, it
  will wait for the XID log event to commit the transaction.
  
  Unfortunately, the SQL thread coordinator will reach the master's
  ROTATE event on the next relaylog file and will wait for all workers
  to finish their tasks before applying the ROTATE.
  
  Analysis:
  ========
  
  As the whole transaction is retrieved again by the IO thread after the
  reconnection, the slave must rollback the partial transaction once
  noticing this ROTATE from the master.
  
  This bug reports the same issue already fixed by BUG#17326020, and the
  reported issue is not reproducible anymore. So, this patch is just
  adding a new test case.