From 41aa79a2374864df140527886284ffdd1c95f2c5 Mon Sep 17 00:00:00 2001 From: sensssz Date: Mon, 17 Oct 2016 14:07:43 -0400 Subject: [PATCH 1/5] Implement VATS based on 5.7 --- sql/rpl_rli_pdb.cc | 3 + sql/rpl_slave.cc | 1 + sql/sql_class.cc | 2 + sql/sql_class.h | 2 + storage/innobase/handler/ha_innodb.cc | 29 +++++ storage/innobase/include/lock0lock.h | 11 ++ storage/innobase/lock/lock0lock.cc | 198 +++++++++++++++++++++++++++++++--- 7 files changed, 233 insertions(+), 13 deletions(-) diff --git a/sql/rpl_rli_pdb.cc b/sql/rpl_rli_pdb.cc index 665925b..498fcc1 100644 --- a/sql/rpl_rli_pdb.cc +++ b/sql/rpl_rli_pdb.cc @@ -94,6 +94,7 @@ bool handle_slave_worker_stop(Slave_worker *worker, worker->running_status= Slave_worker::STOP_ACCEPTED; mysql_cond_signal(&worker->jobs_cond); mysql_mutex_unlock(&rli->exit_count_lock); + is_slave_replication = false; return(true); } else if (rli->exit_counter == rli->slave_parallel_workers) @@ -104,10 +105,12 @@ bool handle_slave_worker_stop(Slave_worker *worker, worker->running_status= Slave_worker::STOP_ACCEPTED; mysql_cond_signal(&worker->jobs_cond); mysql_mutex_unlock(&rli->exit_count_lock); + is_slave_replication = false; return(true); } } mysql_mutex_unlock(&rli->exit_count_lock); + is_slave_replication = false; return(false); } diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc index fd5601a..ad1a218 100644 --- a/sql/rpl_slave.cc +++ b/sql/rpl_slave.cc @@ -5893,6 +5893,7 @@ extern "C" void *handle_slave_worker(void *arg) struct PSI_thread *psi; #endif + is_slave_replication = true; my_thread_init(); DBUG_ENTER("handle_slave_worker"); diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 777f13a..6e412e9 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -62,6 +62,8 @@ using std::min; using std::max; +bool is_slave_replication = false; + /* The following is used to initialise Table_ident with a internal table name diff --git a/sql/sql_class.h b/sql/sql_class.h index d815a1c..28b0b40 100644 --- a/sql/sql_class.h +++ 
b/sql/sql_class.h @@ -84,6 +84,8 @@ void set_thd_stage_info(void *thd, #define THD_STAGE_INFO(thd, stage) \ (thd)->enter_stage(& stage, NULL, __func__, __FILE__, __LINE__) +extern bool is_slave_replication; + enum enum_delay_key_write { DELAY_KEY_WRITE_NONE, DELAY_KEY_WRITE_ON, DELAY_KEY_WRITE_ALL }; enum enum_rbr_exec_mode { RBR_EXEC_MODE_STRICT, diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 2b89c51..bda2a6f 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -299,6 +299,22 @@ static TYPELIB innodb_default_row_format_typelib = { NULL }; +/** Possible values of the parameter innodb_lock_schedule_algorithm */ +static const char* innodb_lock_schedule_algorithm_names[] = { + "fcfs", + "vats", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_lock_schedule_algorithm. */ +static TYPELIB innodb_lock_schedule_algorithm_typelib = { + array_elements(innodb_lock_schedule_algorithm_names) - 1, + "innodb_lock_schedule_algorithm_typelib", + innodb_lock_schedule_algorithm_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in case of normal DML ops it is not sensible to call srv_active_wake_master_thread after each @@ -18861,6 +18877,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size, NULL, NULL, 120, 1, 127, 0); #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ +static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm, + PLUGIN_VAR_RQCMDARG, + "The algorithm Innodb uses for deciding which locks to grant next when" + " a lock is released. 
Possible values are" + " FCFS" + " grant the locks in First-Come-First-Served order;" + " VATS" + " use the Variance-Aware-Transaction-Scheduling algorithm, which" + " uses an Eldest-Transaction-First heuristic.", + NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + &innodb_lock_schedule_algorithm_typelib); + static MYSQL_SYSVAR_ULONG(buffer_pool_instances, srv_buf_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", @@ -19457,6 +19485,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(ft_sort_pll_degree), MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(force_load_corrupted), + MYSQL_SYSVAR(lock_schedule_algorithm), MYSQL_SYSVAR(locks_unsafe_for_binlog), MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(page_size), diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 3b6722b..2bee7d4 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -40,6 +40,17 @@ Created 5/7/1996 Heikki Tuuri #include "gis0rtree.h" #include "lock0prdt.h" +/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by + setting innodb_lock_schedule_algorithm. 
*/ +enum innodb_lock_schedule_algorithm_t { + /** First Come First Served */ + INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + /** Variance-Aware-Transaction-Scheduling */ + INNODB_LOCK_SCHEDULE_ALGORITHM_VATS +}; + +extern ulong innodb_lock_schedule_algorithm; + // Forward declaration class ReadView; diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index e94ea02..3acfe6c 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -26,6 +26,7 @@ Created 5/7/1996 Heikki Tuuri #define LOCK_MODULE_IMPLEMENTATION #include +#include #include "ha_prototypes.h" #include "lock0lock.h" @@ -48,6 +49,9 @@ Created 5/7/1996 Heikki Tuuri #include +/** Lock scheduling algorithm */ +ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS; + /** Total number of cached record locks */ static const ulint REC_LOCK_CACHE = 8; @@ -60,6 +64,15 @@ static const ulint TABLE_LOCK_CACHE = 8; /** Size in bytes, of the table lock instance */ static const ulint TABLE_LOCK_SIZE = sizeof(ib_lock_t); +/*********************************************************************//** +Checks if a waiting record lock request still has to wait in a queue. +@return lock that is causing the wait */ +static +const lock_t* +lock_rec_has_to_wait_in_queue( +/*==========================*/ + const lock_t* wait_lock); /*!< in: waiting record lock */ + /** Deadlock checker. */ class DeadlockChecker { public: @@ -1545,6 +1558,81 @@ RecLock::create(trx_t* trx, bool owns_trx_mutex, const lock_prdt_t* prdt) return(lock); } +/*********************************************************************//** +Check if lock1 has higher priority than lock2. +NULL has lowest priority. +If either is a high priority transaction, the lock has higher priority. +If neither of them is a wait lock, the first one has higher priority. +If only one of them is a wait lock, it has lower priority. +Otherwise, the one with an older transaction has higher priority. 
+@return true if lock1 has higher priority, false otherwise. */ +bool +has_higher_priority( + lock_t *lock1, + lock_t *lock2) +{ + if (lock1 == NULL) { + return false; + } else if (lock2 == NULL) { + return true; + } + if (trx_is_high_priority(lock1->trx)) { + return true; + } + if (trx_is_high_priority(lock2->trx)) { + return false; + } + if (!lock_get_wait(lock1)) { + return true; + } else if (!lock_get_wait(lock2)) { + return false; + } + return lock1->trx->start_time < lock2->trx->start_time; +} + +/*********************************************************************//** +Insert a lock to the hash list according to the mode (whether it is a wait +lock) and the age of the transaction it is associated with. +If the lock is not a wait lock, insert it to the head of the hash list. +Otherwise, insert it to the middle of the wait locks according to the age of +the transaction. */ +static +void +lock_rec_insert_by_trx_age( + lock_t *in_lock, /*!< in: lock to be inserted */ + bool wait) /*!< in: whether it's a wait lock */ +{ + ulint space; + ulint page_no; + ulint rec_fold; + hash_table_t* hash; + hash_cell_t* cell; + lock_t* node; + lock_t* next; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); + hash = lock_hash_get(in_lock->type_mode); + cell = hash_get_nth_cell(hash, + hash_calc_hash(rec_fold, hash)); + + node = (lock_t *) cell->node; + // If in_lock is not a wait lock, we insert it to the head of the list. 
+ if (node == NULL || !wait || has_higher_priority(in_lock, node)) { + cell->node = in_lock; + in_lock->hash = node; + return; + } + while (node != NULL && has_higher_priority((lock_t *) node->hash, + in_lock)) { + node = (lock_t *) node->hash; + } + next = (lock_t *) node->hash; + node->hash = in_lock; + in_lock->hash = next; +} + /** Check the outcome of the deadlock check @param[in,out] victim_trx Transaction selected for rollback @@ -1567,7 +1655,22 @@ RecLock::check_deadlock_result(const trx_t* victim_trx, lock_t* lock) return(DB_DEADLOCK); - } else if (m_trx->lock.wait_lock == NULL) { + } + + // Move it only when it does not cause a deadlock. + if (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS && !is_slave_replication) { + + HASH_DELETE(lock_t, hash, lock_hash_get(lock->type_mode), + m_rec_id.fold(), lock); + lock_rec_insert_by_trx_age(lock, m_mode & LOCK_WAIT); + if (lock_get_wait(lock) && !lock_rec_has_to_wait_in_queue(lock)) { + lock_reset_lock_and_trx_wait(lock); + return DB_SUCCESS_LOCKED_REC; + } + } + + if (m_trx->lock.wait_lock == NULL) { /* If there was a deadlock but we chose another transaction as a victim, it is possible that we @@ -2478,6 +2581,34 @@ lock_rec_cancel( } /*************************************************************//** +Move the lock to the head of the hash list. 
*/ +static +void +lock_rec_move_to_front( + lock_t *lock_to_move, /*!< in: lock to be moved */ + ulint rec_fold) /*!< in: rec fold of the lock */ +{ + hash_table_t* lock_hash; + hash_cell_t* cell; + lock_t* next; + + if (lock_to_move != NULL) + { + lock_hash = lock_hash_get(lock_to_move->type_mode); + // Move the target lock to the head of the list + cell = hash_get_nth_cell(lock_hash, + hash_calc_hash(rec_fold, lock_hash)); + if (lock_to_move != cell->node) { + next = (lock_t *) cell->node; + cell->node = lock_to_move; + lock_to_move->hash = next; + } + } +} + + + +/*************************************************************//** Removes a record lock request, waiting or granted, from the queue and grants locks to other transactions in the queue if they now are entitled to a lock. NOTE: all record locks contained in in_lock are removed. */ @@ -2493,6 +2624,8 @@ lock_rec_dequeue_from_page( { ulint space; ulint page_no; + ulint rec_fold; + lock_t* previous = NULL; lock_t* lock; trx_lock_t* trx_lock; hash_table_t* lock_hash; @@ -2505,6 +2638,7 @@ lock_rec_dequeue_from_page( space = in_lock->un_member.rec_lock.space; page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); ut_ad(in_lock->index->table->n_rec_locks > 0); in_lock->index->table->n_rec_locks--; @@ -2519,20 +2653,55 @@ lock_rec_dequeue_from_page( MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_DEC(MONITOR_NUM_RECLOCK); - /* Check if waiting locks in the queue can now be granted: grant - locks if there are no conflicting locks ahead. Stop at the first - X lock that is waiting or has been granted. */ + if (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || is_slave_replication) { - for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no); - lock != NULL; - lock = lock_rec_get_next_on_page(lock)) { + /* Check if waiting locks in the queue can now be granted: + grant locks if there are no conflicting locks ahead. 
Stop at + the first X lock that is waiting or has been granted. */ - if (lock_get_wait(lock) - && !lock_rec_has_to_wait_in_queue(lock)) { + for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, + page_no); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { - /* Grant the lock */ - ut_ad(lock->trx != in_lock->trx); - lock_grant(lock); + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock); + } + } + } else { + /* Grant locks if there are no conflicting locks ahead. + Move granted locks to the head of the list. */ + for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, + page_no); + lock != NULL;) { + + /* If the lock is a wait lock on this page, and it does not need to wait. */ + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no) + && lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + lock_grant(lock); + + if (previous != NULL) { + /* Move the lock to the head of the list. */ + HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock); + lock_rec_move_to_front(lock, rec_fold); + } else { + /* Already at the head of the list. */ + previous = lock; + } + /* Move on to the next lock. */ + lock = static_cast(HASH_GET_NEXT(hash, previous)); + } else { + previous = lock; + lock = static_cast(HASH_GET_NEXT(hash, lock)); + } } } } @@ -7197,7 +7366,10 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const /* Must find at least two locks, otherwise there cannot be a waiting lock, secondly the first lock cannot be the wait_lock. */ ut_a(lock != NULL); - ut_a(lock != m_wait_lock); + ut_a(lock != m_wait_lock || + (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS + && !is_slave_replication)); /* Check that the lock type doesn't change. 
*/ ut_ad(lock_get_type_low(lock) == lock_get_type_low(m_wait_lock)); From 61c8c62ac5882788cf7e6daad635f8253b5bb0e5 Mon Sep 17 00:00:00 2001 From: sensssz Date: Mon, 17 Oct 2016 14:07:43 -0400 Subject: [PATCH 2/5] Implement VATS based on 5.7 --- sql/rpl_rli_pdb.cc | 3 + sql/rpl_slave.cc | 1 + sql/sql_class.cc | 2 + sql/sql_class.h | 2 + storage/innobase/handler/ha_innodb.cc | 29 +++++ storage/innobase/include/lock0lock.h | 11 ++ storage/innobase/lock/lock0lock.cc | 198 +++++++++++++++++++++++++++++++--- 7 files changed, 233 insertions(+), 13 deletions(-) diff --git a/sql/rpl_rli_pdb.cc b/sql/rpl_rli_pdb.cc index 3ba42b1..7fb1fec 100644 --- a/sql/rpl_rli_pdb.cc +++ b/sql/rpl_rli_pdb.cc @@ -94,6 +94,7 @@ bool handle_slave_worker_stop(Slave_worker *worker, worker->running_status= Slave_worker::STOP_ACCEPTED; mysql_cond_signal(&worker->jobs_cond); mysql_mutex_unlock(&rli->exit_count_lock); + is_slave_replication = false; return(true); } else if (rli->exit_counter == rli->slave_parallel_workers) @@ -104,10 +105,12 @@ bool handle_slave_worker_stop(Slave_worker *worker, worker->running_status= Slave_worker::STOP_ACCEPTED; mysql_cond_signal(&worker->jobs_cond); mysql_mutex_unlock(&rli->exit_count_lock); + is_slave_replication = false; return(true); } } mysql_mutex_unlock(&rli->exit_count_lock); + is_slave_replication = false; return(false); } diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc index fd00f1a..0f893a6 100644 --- a/sql/rpl_slave.cc +++ b/sql/rpl_slave.cc @@ -6056,6 +6056,7 @@ extern "C" void *handle_slave_worker(void *arg) struct PSI_thread *psi; #endif + is_slave_replication = true; my_thread_init(); DBUG_ENTER("handle_slave_worker"); diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 7a61fc0..68d4532 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -63,6 +63,8 @@ using std::min; using std::max; +bool is_slave_replication = false; + /* The following is used to initialise Table_ident with a internal table name diff --git a/sql/sql_class.h 
b/sql/sql_class.h index 01a19f2..0cb8e38 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -84,6 +84,8 @@ void set_thd_stage_info(void *thd, #define THD_STAGE_INFO(thd, stage) \ (thd)->enter_stage(& stage, NULL, __func__, __FILE__, __LINE__) +extern bool is_slave_replication; + enum enum_delay_key_write { DELAY_KEY_WRITE_NONE, DELAY_KEY_WRITE_ON, DELAY_KEY_WRITE_ALL }; enum enum_rbr_exec_mode { RBR_EXEC_MODE_STRICT, diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 1a65c5f..1181b61 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -300,6 +300,22 @@ static TYPELIB innodb_default_row_format_typelib = { NULL }; +/** Possible values of the parameter innodb_lock_schedule_algorithm */ +static const char* innodb_lock_schedule_algorithm_names[] = { + "fcfs", + "vats", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_lock_schedule_algorithm. */ +static TYPELIB innodb_lock_schedule_algorithm_typelib = { + array_elements(innodb_lock_schedule_algorithm_names) - 1, + "innodb_lock_schedule_algorithm_typelib", + innodb_lock_schedule_algorithm_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in case of normal DML ops it is not sensible to call srv_active_wake_master_thread after each @@ -19467,6 +19483,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size, NULL, NULL, 120, 1, 127, 0); #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ +static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm, + PLUGIN_VAR_RQCMDARG, + "The algorithm Innodb uses for deciding which locks to grant next when" + " a lock is released. 
Possible values are" + " FCFS" + " grant the locks in First-Come-First-Served order;" + " VATS" + " use the Variance-Aware-Transaction-Scheduling algorithm, which" + " uses an Eldest-Transaction-First heuristic.", + NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + &innodb_lock_schedule_algorithm_typelib); + static MYSQL_SYSVAR_ULONG(buffer_pool_instances, srv_buf_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", @@ -20070,6 +20098,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(ft_sort_pll_degree), MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(force_load_corrupted), + MYSQL_SYSVAR(lock_schedule_algorithm), MYSQL_SYSVAR(locks_unsafe_for_binlog), MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(deadlock_detect), diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 8db09e5..5770b8a 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -40,6 +40,17 @@ Created 5/7/1996 Heikki Tuuri #include "gis0rtree.h" #include "lock0prdt.h" +/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by + setting innodb_lock_schedule_algorithm. 
*/ +enum innodb_lock_schedule_algorithm_t { + /** First Come First Served */ + INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + /** Variance-Aware-Transaction-Scheduling */ + INNODB_LOCK_SCHEDULE_ALGORITHM_VATS +}; + +extern ulong innodb_lock_schedule_algorithm; + // Forward declaration class ReadView; diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 540bb61..18759eb 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -26,6 +26,7 @@ Created 5/7/1996 Heikki Tuuri #define LOCK_MODULE_IMPLEMENTATION #include +#include #include "ha_prototypes.h" #include "lock0lock.h" @@ -54,6 +55,9 @@ Created 5/7/1996 Heikki Tuuri /* Flag to enable/disable deadlock detector. */ my_bool innobase_deadlock_detect = TRUE; +/** Lock scheduling algorithm */ +ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS; + /** Total number of cached record locks */ static const ulint REC_LOCK_CACHE = 8; @@ -66,6 +70,15 @@ static const ulint TABLE_LOCK_CACHE = 8; /** Size in bytes, of the table lock instance */ static const ulint TABLE_LOCK_SIZE = sizeof(ib_lock_t); +/*********************************************************************//** +Checks if a waiting record lock request still has to wait in a queue. +@return lock that is causing the wait */ +static +const lock_t* +lock_rec_has_to_wait_in_queue( +/*==========================*/ + const lock_t* wait_lock); /*!< in: waiting record lock */ + /** Deadlock checker. */ class DeadlockChecker { public: @@ -1556,6 +1569,81 @@ RecLock::create( return(lock); } +/*********************************************************************//** +Check if lock1 has higher priority than lock2. +NULL has lowest priority. +If either is a high priority transaction, the lock has higher priority. +If neither of them is a wait lock, the first one has higher priority. +If only one of them is a wait lock, it has lower priority. 
+Otherwise, the one with an older transaction has higher priority. +@return true if lock1 has higher priority, false otherwise. */ +bool +has_higher_priority( + lock_t *lock1, + lock_t *lock2) +{ + if (lock1 == NULL) { + return false; + } else if (lock2 == NULL) { + return true; + } + if (trx_is_high_priority(lock1->trx)) { + return true; + } + if (trx_is_high_priority(lock2->trx)) { + return false; + } + if (!lock_get_wait(lock1)) { + return true; + } else if (!lock_get_wait(lock2)) { + return false; + } + return lock1->trx->start_time < lock2->trx->start_time; +} + +/*********************************************************************//** +Insert a lock to the hash list according to the mode (whether it is a wait +lock) and the age of the transaction it is associated with. +If the lock is not a wait lock, insert it to the head of the hash list. +Otherwise, insert it to the middle of the wait locks according to the age of +the transaction. */ +static +void +lock_rec_insert_by_trx_age( + lock_t *in_lock, /*!< in: lock to be inserted */ + bool wait) /*!< in: whether it's a wait lock */ +{ + ulint space; + ulint page_no; + ulint rec_fold; + hash_table_t* hash; + hash_cell_t* cell; + lock_t* node; + lock_t* next; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); + hash = lock_hash_get(in_lock->type_mode); + cell = hash_get_nth_cell(hash, + hash_calc_hash(rec_fold, hash)); + + node = (lock_t *) cell->node; + // If in_lock is not a wait lock, we insert it to the head of the list. 
+ if (node == NULL || !wait || has_higher_priority(in_lock, node)) { + cell->node = in_lock; + in_lock->hash = node; + return; + } + while (node != NULL && has_higher_priority((lock_t *) node->hash, + in_lock)) { + node = (lock_t *) node->hash; + } + next = (lock_t *) node->hash; + node->hash = in_lock; + in_lock->hash = next; +} + /** Check the outcome of the deadlock check @param[in,out] victim_trx Transaction selected for rollback @@ -1578,7 +1666,22 @@ RecLock::check_deadlock_result(const trx_t* victim_trx, lock_t* lock) return(DB_DEADLOCK); - } else if (m_trx->lock.wait_lock == NULL) { + } + + // Move it only when it does not cause a deadlock. + if (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS && !is_slave_replication) { + + HASH_DELETE(lock_t, hash, lock_hash_get(lock->type_mode), + m_rec_id.fold(), lock); + lock_rec_insert_by_trx_age(lock, m_mode & LOCK_WAIT); + if (lock_get_wait(lock) && !lock_rec_has_to_wait_in_queue(lock)) { + lock_reset_lock_and_trx_wait(lock); + return DB_SUCCESS_LOCKED_REC; + } + } + + if (m_trx->lock.wait_lock == NULL) { /* If there was a deadlock but we chose another transaction as a victim, it is possible that we @@ -2424,6 +2527,34 @@ lock_rec_cancel( } /*************************************************************//** +Move the lock to the head of the hash list. 
*/ +static +void +lock_rec_move_to_front( + lock_t *lock_to_move, /*!< in: lock to be moved */ + ulint rec_fold) /*!< in: rec fold of the lock */ +{ + hash_table_t* lock_hash; + hash_cell_t* cell; + lock_t* next; + + if (lock_to_move != NULL) + { + lock_hash = lock_hash_get(lock_to_move->type_mode); + // Move the target lock to the head of the list + cell = hash_get_nth_cell(lock_hash, + hash_calc_hash(rec_fold, lock_hash)); + if (lock_to_move != cell->node) { + next = (lock_t *) cell->node; + cell->node = lock_to_move; + lock_to_move->hash = next; + } + } +} + + + +/*************************************************************//** Removes a record lock request, waiting or granted, from the queue and grants locks to other transactions in the queue if they now are entitled to a lock. NOTE: all record locks contained in in_lock are removed. */ @@ -2439,6 +2570,8 @@ lock_rec_dequeue_from_page( { ulint space; ulint page_no; + ulint rec_fold; + lock_t* previous = NULL; lock_t* lock; trx_lock_t* trx_lock; hash_table_t* lock_hash; @@ -2451,6 +2584,7 @@ lock_rec_dequeue_from_page( space = in_lock->un_member.rec_lock.space; page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); ut_ad(in_lock->index->table->n_rec_locks > 0); in_lock->index->table->n_rec_locks--; @@ -2465,20 +2599,55 @@ lock_rec_dequeue_from_page( MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_DEC(MONITOR_NUM_RECLOCK); - /* Check if waiting locks in the queue can now be granted: grant - locks if there are no conflicting locks ahead. Stop at the first - X lock that is waiting or has been granted. */ + if (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || is_slave_replication) { - for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no); - lock != NULL; - lock = lock_rec_get_next_on_page(lock)) { + /* Check if waiting locks in the queue can now be granted: + grant locks if there are no conflicting locks ahead. 
Stop at + the first X lock that is waiting or has been granted. */ - if (lock_get_wait(lock) - && !lock_rec_has_to_wait_in_queue(lock)) { + for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, + page_no); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { - /* Grant the lock */ - ut_ad(lock->trx != in_lock->trx); - lock_grant(lock); + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock); + } + } + } else { + /* Grant locks if there are no conflicting locks ahead. + Move granted locks to the head of the list. */ + for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, + page_no); + lock != NULL;) { + + /* If the lock is a wait lock on this page, and it does not need to wait. */ + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no) + && lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + lock_grant(lock); + + if (previous != NULL) { + /* Move the lock to the head of the list. */ + HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock); + lock_rec_move_to_front(lock, rec_fold); + } else { + /* Already at the head of the list. */ + previous = lock; + } + /* Move on to the next lock. */ + lock = static_cast(HASH_GET_NEXT(hash, previous)); + } else { + previous = lock; + lock = static_cast(HASH_GET_NEXT(hash, lock)); + } } } } @@ -7221,7 +7390,10 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const /* Must find at least two locks, otherwise there cannot be a waiting lock, secondly the first lock cannot be the wait_lock. */ ut_a(lock != NULL); - ut_a(lock != m_wait_lock); + ut_a(lock != m_wait_lock || + (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS + && !is_slave_replication)); /* Check that the lock type doesn't change. 
*/ ut_ad(lock_get_type_low(lock) == lock_get_type_low(m_wait_lock)); From f66201c6d04541b032bdb85d81c76dcb7258fe3a Mon Sep 17 00:00:00 2001 From: sensssz Date: Mon, 17 Oct 2016 22:27:35 -0400 Subject: [PATCH 3/5] Do not move high priority transactions. --- sql/rpl_rli_pdb.cc | 2 +- storage/innobase/lock/lock0lock.cc | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sql/rpl_rli_pdb.cc b/sql/rpl_rli_pdb.cc index 7fb1fec..e769bfd 100644 --- a/sql/rpl_rli_pdb.cc +++ b/sql/rpl_rli_pdb.cc @@ -105,7 +105,7 @@ bool handle_slave_worker_stop(Slave_worker *worker, worker->running_status= Slave_worker::STOP_ACCEPTED; mysql_cond_signal(&worker->jobs_cond); mysql_mutex_unlock(&rli->exit_count_lock); - is_slave_replication = false; + is_slave_replication = false; return(true); } } diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 18759eb..666a5bd 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -1670,7 +1670,9 @@ RecLock::check_deadlock_result(const trx_t* victim_trx, lock_t* lock) // Move it only when it does not cause a deadlock. if (innodb_lock_schedule_algorithm - == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS && !is_slave_replication) { + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS + && !is_slave_replication + && !trx_is_high_priority(lock->trx)) { HASH_DELETE(lock_t, hash, lock_hash_get(lock->type_mode), m_rec_id.fold(), lock); From 4ce70c5e0833455dcb4a88e9336dcb13b198a2ac Mon Sep 17 00:00:00 2001 From: sensssz Date: Wed, 19 Oct 2016 15:34:30 -0400 Subject: [PATCH 4/5] Remove is_slave_replication and use thd_is_replication_slave_thread. 
--- sql/rpl_rli_pdb.cc | 3 --- sql/rpl_slave.cc | 1 - sql/sql_class.cc | 1 - sql/sql_class.h | 2 -- storage/innobase/handler/ha_innodb.cc | 2 +- storage/innobase/lock/lock0lock.cc | 6 +++--- 6 files changed, 4 insertions(+), 11 deletions(-) diff --git a/sql/rpl_rli_pdb.cc b/sql/rpl_rli_pdb.cc index e769bfd..3ba42b1 100644 --- a/sql/rpl_rli_pdb.cc +++ b/sql/rpl_rli_pdb.cc @@ -94,7 +94,6 @@ bool handle_slave_worker_stop(Slave_worker *worker, worker->running_status= Slave_worker::STOP_ACCEPTED; mysql_cond_signal(&worker->jobs_cond); mysql_mutex_unlock(&rli->exit_count_lock); - is_slave_replication = false; return(true); } else if (rli->exit_counter == rli->slave_parallel_workers) @@ -105,12 +104,10 @@ bool handle_slave_worker_stop(Slave_worker *worker, worker->running_status= Slave_worker::STOP_ACCEPTED; mysql_cond_signal(&worker->jobs_cond); mysql_mutex_unlock(&rli->exit_count_lock); - is_slave_replication = false; return(true); } } mysql_mutex_unlock(&rli->exit_count_lock); - is_slave_replication = false; return(false); } diff --git a/sql/rpl_slave.cc b/sql/rpl_slave.cc index 0f893a6..fd00f1a 100644 --- a/sql/rpl_slave.cc +++ b/sql/rpl_slave.cc @@ -6056,7 +6056,6 @@ extern "C" void *handle_slave_worker(void *arg) struct PSI_thread *psi; #endif - is_slave_replication = true; my_thread_init(); DBUG_ENTER("handle_slave_worker"); diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 68d4532..8d4d980 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -63,7 +63,6 @@ using std::min; using std::max; -bool is_slave_replication = false; /* The following is used to initialise Table_ident with a internal diff --git a/sql/sql_class.h b/sql/sql_class.h index 0cb8e38..01a19f2 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -84,8 +84,6 @@ void set_thd_stage_info(void *thd, #define THD_STAGE_INFO(thd, stage) \ (thd)->enter_stage(& stage, NULL, __func__, __FILE__, __LINE__) -extern bool is_slave_replication; - enum enum_delay_key_write { DELAY_KEY_WRITE_NONE, 
DELAY_KEY_WRITE_ON, DELAY_KEY_WRITE_ALL }; enum enum_rbr_exec_mode { RBR_EXEC_MODE_STRICT, diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 1181b61..31af630 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -1380,7 +1380,7 @@ thd_is_replication_slave_thread( /*============================*/ THD* thd) /*!< in: thread handle */ { - return((ibool) thd_slave_thread(thd)); + return thd && ((ibool) thd_slave_thread(thd)); } /******************************************************************//** diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 666a5bd..f125482 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -1671,7 +1671,7 @@ RecLock::check_deadlock_result(const trx_t* victim_trx, lock_t* lock) // Move it only when it does not cause a deadlock. if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS - && !is_slave_replication + && !thd_is_replication_slave_thread(lock->trx->mysql_thd) && !trx_is_high_priority(lock->trx)) { HASH_DELETE(lock_t, hash, lock_hash_get(lock->type_mode), @@ -2602,7 +2602,7 @@ lock_rec_dequeue_from_page( MONITOR_DEC(MONITOR_NUM_RECLOCK); if (innodb_lock_schedule_algorithm - == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || is_slave_replication) { + == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || thd_is_replication_slave_thread(in_lock->trx->mysql_thd)) { /* Check if waiting locks in the queue can now be granted: grant locks if there are no conflicting locks ahead. Stop at @@ -7395,7 +7395,7 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const ut_a(lock != m_wait_lock || (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS - && !is_slave_replication)); + && !thd_is_replication_slave_thread(lock->trx->mysql_thd))); /* Check that the lock type doesn't change. 
*/ ut_ad(lock_get_type_low(lock) == lock_get_type_low(m_wait_lock)); From 359d94ccf86604d3234cff35c46daf92d0ea5174 Mon Sep 17 00:00:00 2001 From: sensssz Date: Wed, 19 Oct 2016 15:48:13 -0400 Subject: [PATCH 5/5] Remove empty line in sql_class.cc --- sql/sql_class.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 8d4d980..7a61fc0 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -63,7 +63,6 @@ using std::min; using std::max; - /* The following is used to initialise Table_ident with a internal table name