diff -rc --exclude='*.orig' --exclude=sql_yacc.cc --exclude=sql_yacc.h mysql-5.6.25/sql/rpl_rli.cc mysql-5.6.25-patched/sql/rpl_rli.cc *** mysql-5.6.25/sql/rpl_rli.cc 2015-05-05 19:05:54.000000000 +0800 --- mysql-5.6.25-patched/sql/rpl_rli.cc 2015-07-20 19:11:09.593212657 +0800 *************** *** 124,129 **** --- 124,131 ---- until_log_name[0]= ign_master_log_name_end[0]= 0; set_timespec_nsec(last_clock, 0); memset(&cache_buf, 0, sizeof(cache_buf)); + flushed_relay_log_id = 0; + flushed_relay_log_pos = 0; cached_charset_invalidate(); mysql_mutex_init(key_relay_log_info_log_space_lock, diff -rc --exclude='*.orig' --exclude=sql_yacc.cc --exclude=sql_yacc.h mysql-5.6.25/sql/rpl_rli.h mysql-5.6.25-patched/sql/rpl_rli.h *** mysql-5.6.25/sql/rpl_rli.h 2015-05-05 19:05:54.000000000 +0800 --- mysql-5.6.25-patched/sql/rpl_rli.h 2015-07-20 19:11:09.594212695 +0800 *************** *** 161,166 **** --- 161,168 ---- IO_CACHE used when opening cold relay logs. */ IO_CACHE cache_buf,*cur_log; + uint flushed_relay_log_id; + ulonglong flushed_relay_log_pos; /* Identifies when the recovery process is going on. diff -rc --exclude='*.orig' --exclude=sql_yacc.cc --exclude=sql_yacc.h mysql-5.6.25/sql/rpl_slave.cc mysql-5.6.25-patched/sql/rpl_slave.cc *** mysql-5.6.25/sql/rpl_slave.cc 2015-07-20 19:28:16.084213085 +0800 --- mysql-5.6.25-patched/sql/rpl_slave.cc 2015-07-20 19:11:09.596212769 +0800 *************** *** 213,218 **** --- 213,219 ---- static int get_master_uuid(MYSQL *mysql, Master_info *mi); int io_thread_init_commands(MYSQL *mysql, Master_info *mi); static Log_event* next_event(Relay_log_info* rli); + static Log_event* fast_next_event(Relay_log_info* rli); static int queue_event(Master_info* mi,const char* buf,ulong event_len); static void set_stop_slave_wait_timeout(unsigned long wait_timeout); static int terminate_slave_thread(THD *thd, *************** *** 4214,4220 **** DBUG_RETURN(1); } ! Log_event *ev = next_event(rli), **ptr_ev; DBUG_ASSERT(rli->info_thd==thd); --- 4215,4226 ---- DBUG_RETURN(1); } ! Log_event *ev, **ptr_ev; ! ! if (tcc_binlog_dump_tunning) ! ev = fast_next_event(rli); ! else ! ev = next_event(rli); DBUG_ASSERT(rli->info_thd==thd); *************** *** 8199,8204 **** --- 8205,8683 ---- DBUG_RETURN(0); } + static IO_CACHE *reopen_relay_log2(Relay_log_info *rli, const char **errmsg) + { + DBUG_ENTER("reopen_relay_log"); + DBUG_ASSERT(rli->cur_log_fd == -1); + + IO_CACHE *cur_log = rli->cur_log=&rli->cache_buf; + if ((rli->cur_log_fd=open_binlog_file(cur_log,rli->get_event_relay_log_name(), + errmsg)) <0) + DBUG_RETURN(0); + /* + We want to start exactly where we was before: + relay_log_pos Current log pos + pending Number of bytes already processed from the event + */ + rli->set_event_relay_log_pos(max(rli->get_event_relay_log_pos(), + BIN_LOG_HEADER_SIZE)); + my_b_seek(cur_log,rli->get_event_relay_log_pos()); + DBUG_RETURN(cur_log); + } + + static Log_event* fast_next_event(Relay_log_info* rli) + { + Log_event* ev; + IO_CACHE* cur_log = rli->cur_log; + mysql_mutex_t *log_lock = rli->relay_log.get_log_lock(); + const char* errmsg=0; + THD* thd = rli->info_thd; + DBUG_ENTER("next_event"); + + DBUG_ASSERT(thd != 0); + + #ifndef DBUG_OFF + if (abort_slave_event_count && !rli->events_until_exit--) + DBUG_RETURN(0); + #endif + + /* + For most operations we need to protect rli members with data_lock, + so we assume calling function acquired this mutex for us and we will + hold it for the most of the loop below However, we will release it + whenever it is worth the hassle, and in the cases when we go into a + mysql_cond_wait() with the non-data_lock mutex + */ + mysql_mutex_assert_owner(&rli->data_lock); + + while (!sql_slave_killed(thd,rli)) + { + /* + We can have two kinds of log reading: + hot_log: + rli->cur_log points at the IO_CACHE of relay_log, which + is actively being updated by the I/O thread. We need to be careful + in this case and make sure that we are not looking at a stale log that + has already been rotated. If it has been, we reopen the log. + + The other case is much simpler: + We just have a read only log that nobody else will be updating. + */ + + if (rli->cur_log_fd == -1) + { + mysql_mutex_lock(log_lock); + cur_log=reopen_relay_log2(rli, &errmsg); + mysql_mutex_unlock(log_lock); + if (!cur_log) // No more log files + goto err; + } + + /* + As there is no guarantee that the relay is open (for example, an I/O + error during a write by the slave I/O thread may have closed it), we + have to test it. + */ + if (!my_b_inited(cur_log)) + goto err; + #ifndef DBUG_OFF + { + DBUG_PRINT("info", ("assertion skip %lu file pos %lu event relay log pos %lu file %s\n", + (ulong) rli->slave_skip_counter, (ulong) my_b_tell(cur_log), + (ulong) rli->get_event_relay_log_pos(), + rli->get_event_relay_log_name())); + + /* This is an assertion which sometimes fails, let's try to track it */ + char llbuf1[22], llbuf2[22]; + DBUG_PRINT("info", ("my_b_tell(cur_log)=%s rli->event_relay_log_pos=%s", + llstr(my_b_tell(cur_log),llbuf1), + llstr(rli->get_event_relay_log_pos(),llbuf2))); + + DBUG_ASSERT(my_b_tell(cur_log) >= BIN_LOG_HEADER_SIZE); + DBUG_ASSERT(my_b_tell(cur_log) == rli->get_event_relay_log_pos() || rli->is_parallel_exec()); + + DBUG_PRINT("info", ("next_event group master %s %lu group relay %s %lu event %s %lu\n", + rli->get_group_master_log_name(), + (ulong) rli->get_group_master_log_pos(), + rli->get_group_relay_log_name(), + (ulong) rli->get_group_relay_log_pos(), + rli->get_event_relay_log_name(), + (ulong) rli->get_event_relay_log_pos())); + } + #endif + /* + Relay log is always in new format - if the master is 3.23, the + I/O thread will convert the format for us. + A problem: the description event may be in a previous relay log. So if + the slave has been shutdown meanwhile, we would have to look in old relay + logs, which may even have been deleted. So we need to write this + description event at the beginning of the relay log. + When the relay log is created when the I/O thread starts, easy: the + master will send the description event and we will queue it. + But if the relay log is created by new_file(): then the solution is: + MYSQL_BIN_LOG::open() will write the buffered description event. + */ + int has_more_relaylog = 0; + if (get_binary_log_index(rli->get_event_relay_log_name()) < rli->flushed_relay_log_id || + rli->get_event_relay_log_pos() < rli->flushed_relay_log_pos) + { + has_more_relaylog = 1; + } + else + { + mysql_mutex_lock(log_lock); + rli->flushed_relay_log_id = get_binary_log_index(rli->relay_log.get_log_fname()); + rli->flushed_relay_log_pos = rli->relay_log.get_log_file()->end_of_file; + mysql_mutex_unlock(log_lock); + if (get_binary_log_index(rli->get_event_relay_log_name()) < rli->flushed_relay_log_id || + rli->get_event_relay_log_pos() < rli->flushed_relay_log_pos) + { + has_more_relaylog = 1; + } + } + + if (has_more_relaylog && (ev= Log_event::read_log_event(cur_log, 0, + rli->get_rli_description_event(), + opt_slave_sql_verify_checksum))) + { + DBUG_ASSERT(thd==rli->info_thd); + /* + read it while we have a lock, to avoid a mutex lock in + inc_event_relay_log_pos() + */ + rli->set_future_event_relay_log_pos(my_b_tell(cur_log)); + ev->future_event_relay_log_pos= rli->get_future_event_relay_log_pos(); + + /* + MTS checkpoint in the successful read branch + */ + bool force= (rli->checkpoint_seqno > (rli->checkpoint_group - 1)); + if (rli->is_parallel_exec() && (opt_mts_checkpoint_period != 0 || force)) + { + ulonglong period= static_cast(opt_mts_checkpoint_period * 1000000ULL); + mysql_mutex_unlock(&rli->data_lock); + /* + At this point the coordinator has is delegating jobs to workers and + the checkpoint routine must be periodically invoked. + */ + (void) mts_checkpoint_routine(rli, period, force, true/*need_data_lock=true*/); // TODO: ALFRANIO ERROR + DBUG_ASSERT(!force || + (force && (rli->checkpoint_seqno <= (rli->checkpoint_group - 1))) || + sql_slave_killed(thd, rli)); + mysql_mutex_lock(&rli->data_lock); + } + DBUG_RETURN(ev); + } + DBUG_ASSERT(thd==rli->info_thd); + if (opt_reckless_slave) // For mysql-test + cur_log->error = 0; + if (cur_log->error < 0) + { + errmsg = "slave SQL thread aborted because of I/O error"; + if (rli->mts_group_status == Relay_log_info::MTS_IN_GROUP) + /* + MTS group status is set to MTS_KILLED_GROUP, whenever a read event + error happens and there was already a non-terminal event scheduled. + */ + rli->mts_group_status= Relay_log_info::MTS_KILLED_GROUP; + goto err; + } + if (!cur_log->error) /* EOF */ + { + /* + On a hot log, EOF means that there are no more updates to + process and we must block until I/O thread adds some and + signals us to continue + */ + if (has_more_relaylog == 0) + { + /* + We say in Seconds_Behind_Master that we have "caught up". Note that + for example if network link is broken but I/O slave thread hasn't + noticed it (slave_net_timeout not elapsed), then we'll say "caught + up" whereas we're not really caught up. Fixing that would require + internally cutting timeout in smaller pieces in network read, no + thanks. Another example: SQL has caught up on I/O, now I/O has read + a new event and is queuing it; the false "0" will exist until SQL + finishes executing the new event; it will be look abnormal only if + the events have old timestamps (then you get "many", 0, "many"). + + Transient phases like this can be fixed with implemeting + Heartbeat event which provides the slave the status of the + master at time the master does not have any new update to send. + Seconds_Behind_Master would be zero only when master has no + more updates in binlog for slave. The heartbeat can be sent + in a (small) fraction of slave_net_timeout. Until it's done + rli->last_master_timestamp is temporarely (for time of + waiting for the following event) reset whenever EOF is + reached. + */ + + /* shows zero while it is sleeping (and until the next event + is about to be executed). Note, in MTS case + Seconds_Behind_Master resetting follows slightly different + schema where reaching EOF is not enough. The status + parameter is updated per some number of processed group of + events. The number can't be greater than + @@global.slave_checkpoint_group and anyway SBM updating + rate does not exceed @@global.slave_checkpoint_period. + Notice that SBM is set to a new value after processing the + terminal event (e.g Commit) of a group. Coordinator resets + SBM when notices no more groups left neither to read from + Relay-log nor to process by Workers. + */ + if (!rli->is_parallel_exec()) + rli->last_master_timestamp= 0; + + if (rli->ign_master_log_name_end[0]) + { + /* We generate and return a Rotate, to make our positions advance */ + DBUG_PRINT("info",("seeing an ignored end segment")); + ev= new Rotate_log_event(rli->ign_master_log_name_end, + 0, rli->ign_master_log_pos_end, + Rotate_log_event::DUP_NAME); + rli->ign_master_log_name_end[0]= 0; + mysql_mutex_unlock(log_lock); + if (unlikely(!ev)) + { + errmsg= "Slave SQL thread failed to create a Rotate event " + "(out of memory?), SHOW SLAVE STATUS may be inaccurate"; + goto err; + } + ev->server_id= 0; // don't be ignored by slave SQL thread + DBUG_RETURN(ev); + } + + /* + We can, and should release data_lock while we are waiting for + update. If we do not, show slave status will block + */ + mysql_mutex_unlock(&rli->data_lock); + /* + Possible deadlock : + - the I/O thread has reached log_space_limit + - the SQL thread has read all relay logs, but cannot purge for some + reason: + * it has already purged all logs except the current one + * there are other logs than the current one but they're involved in + a transaction that finishes in the current one (or is not finished) + Solution : + Wake up the possibly waiting I/O thread, and set a boolean asking + the I/O thread to temporarily ignore the log_space_limit + constraint, because we do not want the I/O thread to block because of + space (it's ok if it blocks for any other reason (e.g. because the + master does not send anything). Then the I/O thread stops waiting + and reads one more event and starts honoring log_space_limit again. + + If the SQL thread needs more events to be able to rotate the log (it + might need to finish the current group first), then it can ask for one + more at a time. Thus we don't outgrow the relay log indefinitely, + but rather in a controlled manner, until the next rotate. + + When the SQL thread starts it sets ignore_log_space_limit to false. + We should also reset ignore_log_space_limit to 0 when the user does + RESET SLAVE, but in fact, no need as RESET SLAVE requires that the slave + be stopped, and the SQL thread sets ignore_log_space_limit to 0 when + it stops. + */ + mysql_mutex_lock(&rli->log_space_lock); + + /* + If we have reached the limit of the relay space and we + are going to sleep, waiting for more events: + + 1. If outside a group, SQL thread asks the IO thread + to force a rotation so that the SQL thread purges + logs next time it processes an event (thus space is + freed). + + 2. If in a group, SQL thread asks the IO thread to + ignore the limit and queues yet one more event + so that the SQL thread finishes the group and + is are able to rotate and purge sometime soon. + */ + if (rli->log_space_limit && + rli->log_space_limit < rli->log_space_total) + { + /* force rotation if not in an unfinished group */ + if (!rli->is_parallel_exec()) + { + rli->sql_force_rotate_relay= !rli->is_in_group(); + } + else + { + rli->sql_force_rotate_relay= + (rli->mts_group_status != Relay_log_info::MTS_IN_GROUP); + } + /* ask for one more event */ + rli->ignore_log_space_limit= true; + } + + /* + If the I/O thread is blocked, unblock it. Ok to broadcast + after unlock, because the mutex is only destroyed in + ~Relay_log_info(), i.e. when rli is destroyed, and rli will + not be destroyed before we exit the present function. + */ + mysql_mutex_unlock(&rli->log_space_lock); + mysql_cond_broadcast(&rli->log_space_cond); + // Note that wait_for_update_relay_log unlocks lock_log ! + + if (rli->is_parallel_exec() && (opt_mts_checkpoint_period != 0 || + DBUG_EVALUATE_IF("check_slave_debug_group", 1, 0))) + { + int ret= 0; + struct timespec waittime; + ulonglong period= static_cast(opt_mts_checkpoint_period * 1000000ULL); + ulong signal_cnt= rli->relay_log.signal_cnt; + + do + { + /* + At this point the coordinator has no job to delegate to workers. + However, workers are executing their assigned jobs and as such + the checkpoint routine must be periodically invoked. + */ + (void) mts_checkpoint_routine(rli, period, false, true/*need_data_lock=true*/); // TODO: ALFRANIO ERROR + mysql_mutex_lock(log_lock); + // More to the empty relay-log all assigned events done so reset it. + if (rli->gaq->empty()) + rli->last_master_timestamp= 0; + + if (DBUG_EVALUATE_IF("check_slave_debug_group", 1, 0)) + period= 10000000ULL; + + set_timespec_nsec(waittime, period); + if (get_binary_log_index(rli->get_event_relay_log_name()) < + get_binary_log_index(rli->relay_log.get_log_fname()) || + rli->get_event_relay_log_pos() < rli->relay_log.get_log_file()->end_of_file) + { + ret= rli->relay_log.wait_for_update_relay_log(thd, &waittime); + } + mysql_mutex_unlock(log_lock); + } while ((ret == ETIMEDOUT || ret == ETIME) /* todo:remove */ && + signal_cnt == rli->relay_log.signal_cnt && !thd->killed); + } + else + { + mysql_mutex_lock(log_lock); + if (get_binary_log_index(rli->get_event_relay_log_name()) < + get_binary_log_index(rli->relay_log.get_log_fname()) || + rli->get_event_relay_log_pos() < rli->relay_log.get_log_file()->end_of_file) + { + rli->relay_log.wait_for_update_relay_log(thd, NULL); + } + mysql_mutex_unlock(log_lock); + } + + // re-acquire data lock since we released it earlier + mysql_mutex_lock(&rli->data_lock); + continue; + } + /* + If the log was not hot, we need to move to the next log in + sequence. The next log could be hot or cold, we deal with both + cases separately after doing some common initialization + */ + end_io_cache(cur_log); + DBUG_ASSERT(rli->cur_log_fd >= 0); + mysql_file_close(rli->cur_log_fd, MYF(MY_WME)); + rli->cur_log_fd = -1; + + if (relay_log_purge) + { + /* + purge_first_log will properly set up relay log coordinates in rli. + If the group's coordinates are equal to the event's coordinates + (i.e. the relay log was not rotated in the middle of a group), + we can purge this relay log too. + We do ulonglong and string comparisons, this may be slow but + - purging the last relay log is nice (it can save 1GB of disk), so we + like to detect the case where we can do it, and given this, + - I see no better detection method + - purge_first_log is not called that often + */ + if (rli->relay_log.purge_first_log + (rli, + rli->get_group_relay_log_pos() == rli->get_event_relay_log_pos() + && !strcmp(rli->get_group_relay_log_name(),rli->get_event_relay_log_name()))) + { + errmsg = "Error purging processed logs"; + goto err; + } + DBUG_PRINT("info", ("next_event group master %s %lu group relay %s %lu event %s %lu\n", + rli->get_group_master_log_name(), + (ulong) rli->get_group_master_log_pos(), + rli->get_group_relay_log_name(), + (ulong) rli->get_group_relay_log_pos(), + rli->get_event_relay_log_name(), + (ulong) rli->get_event_relay_log_pos())); + } + else + { + /* + If hot_log is set, then we already have a lock on + LOCK_log. If not, we have to get the lock. + + According to Sasha, the only time this code will ever be executed + is if we are recovering from a bug. + */ + mysql_mutex_lock(log_lock); + if (rli->relay_log.find_next_log(&rli->linfo, 0)) + { + errmsg = "error switching to the next log"; + goto err; + } + rli->set_event_relay_log_pos(BIN_LOG_HEADER_SIZE); + rli->set_event_relay_log_name(rli->linfo.log_file_name); + /* + We may update the worker here but this is not extremlly + necessary. /Alfranio + */ + rli->flush_info(); + mysql_mutex_unlock(log_lock); + } + + /* Reset the relay-log-change-notified status of Slave Workers */ + if (rli->is_parallel_exec()) + { + DBUG_PRINT("info", ("next_event: MTS group relay log changes to %s %lu\n", + rli->get_group_relay_log_name(), + (ulong) rli->get_group_relay_log_pos())); + rli->reset_notified_relay_log_change(); + } + + continue; + } + else + { + /* + Read failed with a non-EOF error. + TODO: come up with something better to handle this error + */ + sql_print_error("Slave SQL thread: I/O error reading \ + event(errno: %d cur_log->error: %d)", + my_errno,cur_log->error); + // set read position to the beginning of the event + my_b_seek(cur_log,rli->get_event_relay_log_pos()); + /* otherwise, we have had a partial read */ + errmsg = "Aborting slave SQL thread because of partial event read"; + break; // To end of function + } + } + if (!errmsg && log_warnings) + { + sql_print_information("Error reading relay log event: %s", + "slave SQL thread was killed"); + DBUG_RETURN(0); + } + + err: + if (errmsg) + sql_print_error("Error reading relay log event: %s", errmsg); + DBUG_RETURN(0); + } + /* Rotate a relay log (this is used only by FLUSH LOGS; the automatic rotation because of size is simpler because when we do it we already have all relevant