From 6b5891a255075dd2db92a89d967a86b68fb3c8c8 Mon Sep 17 00:00:00 2001
From: sensssz
Date: Wed, 12 Oct 2016 00:47:24 -0400
Subject: [PATCH 1/4] Implement VATS. Add a configuration parameter for it.

---
 storage/innobase/handler/ha_innodb.cc |  29 ++++
 storage/innobase/include/lock0lock.h  |   9 +++
 storage/innobase/lock/lock0lock.cc    | 133 ++++++++++++++++++++++++++++++----
 3 files changed, 158 insertions(+), 13 deletions(-)

diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 22d51a4..aec74a4 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -227,6 +227,22 @@ static TYPELIB innodb_checksum_algorithm_typelib = {
 	NULL
 };
 
+/** Possible values of the parameter innodb_lock_schedule_algorithm */
+static const char* innodb_lock_schedule_algorithm_names[] = {
+	"fcfs",
+	"vats",
+	NullS
+};
+
+/** Used to define an enumerate type of the system variable
+innodb_lock_schedule_algorithm. */
+static TYPELIB innodb_lock_schedule_algorithm_typelib = {
+	array_elements(innodb_lock_schedule_algorithm_names) - 1,
+	"innodb_lock_schedule_algorithm_typelib",
+	innodb_lock_schedule_algorithm_names,
+	NULL
+};
+
 /* The following counter is used to convey information to InnoDB
 about server activity: in case of normal DML ops it is not
 sensible to call srv_active_wake_master_thread after each
@@ -16004,6 +16020,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
   NULL, NULL, 120, 1, 127, 0);
 #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
 
+static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
+  PLUGIN_VAR_RQCMDARG,
+  "The algorithm InnoDB uses for deciding which locks to grant next when"
+  " a lock is released. Possible values are"
+  " FCFS:"
+  " grant the locks in First-Come-First-Served order;"
+  " VATS:"
+  " use the Variance-Aware-Transaction-Scheduling algorithm, which"
+  " applies an Eldest-Transaction-First heuristic.",
+  NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+  &innodb_lock_schedule_algorithm_typelib);
+
 static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
@@ -16536,6 +16564,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(ft_sort_pll_degree),
   MYSQL_SYSVAR(large_prefix),
   MYSQL_SYSVAR(force_load_corrupted),
+  MYSQL_SYSVAR(lock_schedule_algorithm),
   MYSQL_SYSVAR(locks_unsafe_for_binlog),
   MYSQL_SYSVAR(lock_wait_timeout),
 #ifdef UNIV_LOG_ARCHIVE
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index 6d5ed35..fa8d5ad 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -43,6 +43,15 @@ Created 5/7/1996 Heikki Tuuri
 extern ibool lock_print_waits;
 #endif /* UNIV_DEBUG */
 
+/** Alternative lock scheduling algorithms, selected by setting
+innodb_lock_schedule_algorithm. */
+enum innodb_lock_schedule_algorithm_t {
+	INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,	/*!< First Come First Served */
+	INNODB_LOCK_SCHEDULE_ALGORITHM_VATS	/*!< Variance-Aware-Transaction-Scheduling */
+};
+
+extern ulong innodb_lock_schedule_algorithm;
+
 /*********************************************************************//**
 Gets the size of a lock struct.
 @return size in bytes */
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index b5e4df3..244d94a 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -69,6 +69,9 @@
 bitmap */
 #define LOCK_PAGE_BITMAP_MARGIN		64

+/** Lock scheduling algorithm */
+ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
+
 /* An explicit record lock affects both the record and the gap before it.
 An implicit x-lock does not affect the gap, it only locks the index
 record from read or update.
@@ -1774,6 +1777,72 @@ lock_number_of_rows_locked(
 /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/

 /*********************************************************************//**
+Check if lock1 has higher priority than lock2.
+NULL has lowest priority.
+If neither of them is a wait lock, the first one has higher priority.
+If only one of them is a wait lock, it has lower priority.
+Otherwise, the one with the older transaction has higher priority.
+@returns true if lock1 has higher priority, false otherwise. */
+bool
+has_higher_priority(
+	lock_t*	lock1,
+	lock_t*	lock2)
+{
+	if (lock1 == NULL) {
+		return false;
+	} else if (lock2 == NULL) {
+		return true;
+	}
+	if (!lock_get_wait(lock1)) {
+		return true;
+	} else if (!lock_get_wait(lock2)) {
+		return false;
+	}
+	return lock1->trx->start_time < lock2->trx->start_time;
+}
+
+/*********************************************************************//**
+Insert a lock into the hash list according to its mode (whether it is a wait lock)
+and the age of the transaction it is associated with.
+If the lock is not a wait lock, insert it at the head of the hash list.
+Otherwise, insert it among the wait locks according to the age of the
+transaction.
+*/
+static
+void
+lock_rec_insert_by_trx_age(
+	lock_t*	in_lock,	/*!< in: lock to be inserted */
+	bool	wait)		/*!< in: whether it is a wait lock */
+{
+	ulint		space;
+	ulint		page_no;
+	ulint		rec_fold;
+	hash_cell_t*	cell;
+	lock_t*		node;
+	lock_t*		next;
+
+	space = in_lock->un_member.rec_lock.space;
+	page_no = in_lock->un_member.rec_lock.page_no;
+	rec_fold = lock_rec_fold(space, page_no);
+	cell = hash_get_nth_cell(lock_sys->rec_hash,
+				 hash_calc_hash(rec_fold, lock_sys->rec_hash));
+
+	node = (lock_t *) cell->node;
+	// If in_lock is not a wait lock, insert it at the head of the list.
+	if (node == NULL || !wait || has_higher_priority(in_lock, node)) {
+		cell->node = in_lock;
+		in_lock->hash = node;
+		return;
+	}
+	while (node != NULL && has_higher_priority((lock_t *) node->hash, in_lock)) {
+		node = (lock_t *) node->hash;
+	}
+	next = (lock_t *) node->hash;
+	node->hash = in_lock;
+	in_lock->hash = next;
+}
+
+/*********************************************************************//**
 Creates a new record lock and inserts it to the lock queue. Does NOT check
 for deadlocks or lock compatibility!
 @return created lock */
@@ -1852,8 +1921,12 @@ lock_rec_create(
 	ut_ad(index->table->n_ref_count > 0
 	      || !index->table->can_be_evicted);

-	HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
-		    lock_rec_fold(space, page_no), lock);
+	if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) {
+		lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT);
+	} else {
+		HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
+			    lock_rec_fold(space, page_no), lock);
+	}

 	if (!caller_owns_trx_mutex) {
 		trx_mutex_enter(trx);
@@ -2471,7 +2544,9 @@ lock_rec_dequeue_from_page(
 {
 	ulint		space;
 	ulint		page_no;
+	ulint		rec_fold;
 	lock_t*		lock;
+	lock_t*		previous = NULL;
 	trx_lock_t*	trx_lock;

 	ut_ad(lock_mutex_own());
@@ -2482,6 +2557,7 @@ lock_rec_dequeue_from_page(

 	space = in_lock->un_member.rec_lock.space;
 	page_no = in_lock->un_member.rec_lock.page_no;
+	rec_fold = lock_rec_fold(space, page_no);

 	in_lock->index->table->n_rec_locks--;

@@ -2493,20 +2569,51 @@ lock_rec_dequeue_from_page(
 	MONITOR_INC(MONITOR_RECLOCK_REMOVED);
 	MONITOR_DEC(MONITOR_NUM_RECLOCK);

-	/* Check if waiting locks in the queue can now be granted: grant
-	locks if there are no conflicting locks ahead. Stop at the first
-	X lock that is waiting or has been granted. */
+	if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
+		/* Check if waiting locks in the queue can now be granted: grant
+		locks if there are no conflicting locks ahead. Stop at the first
+		X lock that is waiting or has been granted. */

-	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
-	     lock != NULL;
-	     lock = lock_rec_get_next_on_page(lock)) {
+		for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+		     lock != NULL;
+		     lock = lock_rec_get_next_on_page(lock)) {

-		if (lock_get_wait(lock)
-		    && !lock_rec_has_to_wait_in_queue(lock)) {
+			if (lock_get_wait(lock)
+			    && !lock_rec_has_to_wait_in_queue(lock)) {

-			/* Grant the lock */
-			ut_ad(lock->trx != in_lock->trx);
-			lock_grant(lock);
+				/* Grant the lock */
+				ut_ad(lock->trx != in_lock->trx);
+				lock_grant(lock);
+			}
+		}
+	} else {
+		/* Grant locks if there are no conflicting locks ahead.
+		Move granted locks to the head of the list. */
+		for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+		     lock != NULL;) {
+
+			/* If the lock is a wait lock on this page and it no longer needs to wait, grant it. */
+			if ((lock->un_member.rec_lock.space == space)
+			    && (lock->un_member.rec_lock.page_no == page_no)
+			    && lock_get_wait(lock)
+			    && !lock_rec_has_to_wait_in_queue(lock)) {
+
+				lock_grant(lock);
+
+				if (previous != NULL) {
+					/* Move the lock to the head of the list. */
+					HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
+					lock_rec_move_to_front(lock, rec_fold);
+				} else {
+					/* Already at the head of the list. */
+					previous = lock;
+				}
+				/* Move on to the next lock. */
+				lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
+			} else {
+				previous = lock;
+				lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
+			}
 		}
 	}
 }
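Note on the ordering introduced by PATCH 1/4 (not part of the patch itself): the sketch below is a minimal, self-contained illustration of the order that lock_rec_insert_by_trx_age() maintains on a record's lock list. The types here are toy stand-ins, not InnoDB code: "waiting" plays the role of lock_get_wait(), "trx_start" plays the role of lock->trx->start_time, and "next" plays the role of the lock_t::hash pointer. Granted locks stay at the front of the list; waiting locks follow in ascending transaction start time, i.e. eldest transaction first.

#include <cstdio>
#include <cstddef>

struct toy_lock {
	bool		waiting;
	long		trx_start;
	toy_lock*	next;
};

/* Same ordering rule as has_higher_priority() in the patch. */
static bool toy_higher_priority(const toy_lock* a, const toy_lock* b)
{
	if (a == NULL) {
		return false;
	} else if (b == NULL) {
		return true;
	}
	if (!a->waiting) {
		return true;			/* granted locks sort first */
	} else if (!b->waiting) {
		return false;
	}
	return a->trx_start < b->trx_start;	/* eldest transaction first */
}

/* Same insertion rule as lock_rec_insert_by_trx_age(), on a plain list. */
static toy_lock* toy_insert_by_age(toy_lock* head, toy_lock* in_lock)
{
	if (head == NULL || !in_lock->waiting
	    || toy_higher_priority(in_lock, head)) {
		in_lock->next = head;
		return in_lock;
	}
	toy_lock* node = head;
	while (node->next != NULL
	       && toy_higher_priority(node->next, in_lock)) {
		node = node->next;
	}
	in_lock->next = node->next;
	node->next = in_lock;
	return head;
}

int main()
{
	toy_lock granted = { false, 30, NULL };	/* already granted        */
	toy_lock young   = { true,  20, NULL };	/* waiter, started later  */
	toy_lock eldest  = { true,  10, NULL };	/* waiter, started first  */

	toy_lock* head = NULL;
	head = toy_insert_by_age(head, &granted);
	head = toy_insert_by_age(head, &young);
	head = toy_insert_by_age(head, &eldest);	/* ends up before "young" */

	/* Prints the granted lock first, then waiters by ascending start time. */
	for (toy_lock* l = head; l != NULL; l = l->next) {
		printf("waiting=%d trx_start=%ld\n", (int) l->waiting, l->trx_start);
	}
	return 0;
}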
From 54dd0b598a7ede348c6870c8a5ea3ed92d3577a3 Mon Sep 17 00:00:00 2001
From: sensssz
Date: Wed, 12 Oct 2016 01:39:10 -0400
Subject: [PATCH 2/4] Add function lock_rec_move_to_front

---
 storage/innobase/lock/lock0lock.cc | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 244d94a..ba42a5a 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -2528,6 +2528,27 @@ lock_rec_cancel(
 }

 /*************************************************************//**
+Move the lock to the head of the hash list. */
+static
+void
+lock_rec_move_to_front(
+	lock_t*	lock_to_move,	/*!< in: lock to be moved */
+	ulint	rec_fold)	/*!< in: rec fold of the lock */
+{
+	if (lock_to_move != NULL)
+	{
+		// Move the target lock to the head of the list.
+		hash_cell_t* cell = hash_get_nth_cell(lock_sys->rec_hash,
+			hash_calc_hash(rec_fold, lock_sys->rec_hash));
+		if (lock_to_move != cell->node) {
+			lock_t* next = (lock_t *) cell->node;
+			cell->node = lock_to_move;
+			lock_to_move->hash = next;
+		}
+	}
+}
+
+/*************************************************************//**
 Removes a record lock request, waiting or granted, from the queue and
 grants locks to other transactions in the queue if they now are entitled
 to a lock. NOTE: all record locks contained in in_lock are removed. */

From 2c0c71d23ae6f319405fbebaf7e0b1d5621892db Mon Sep 17 00:00:00 2001
From: sensssz
Date: Wed, 12 Oct 2016 23:38:05 -0400
Subject: [PATCH 3/4] Conditionalize an assert in lock_rec_queue_validate.

---
 storage/innobase/lock/lock0lock.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index ba42a5a..d7e8a9f 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -5753,8 +5753,10 @@ lock_rec_queue_validate(
 			ut_a(!lock_rec_other_has_expl_req(
 				     mode, 0, 0, block, heap_no, lock->trx));

-		} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
-
+		} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)
+			   && innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
+			// With VATS, a wait lock may be inserted at a position in the list
+			// where it no longer needs to wait.
 			ut_a(lock_rec_has_to_wait_in_queue(lock));
 		}
 	}

From fd7d45ef2c39362d88fd7b757500d52227e2f278 Mon Sep 17 00:00:00 2001
From: Jiamin Huang
Date: Thu, 13 Oct 2016 01:13:49 -0400
Subject: [PATCH 4/4] Move lock after resolving deadlock.

---
 storage/innobase/lock/lock0lock.cc | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index d7e8a9f..722c953 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -1921,12 +1921,8 @@ lock_rec_create(
 	ut_ad(index->table->n_ref_count > 0
 	      || !index->table->can_be_evicted);

-	if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) {
-		lock_rec_insert_by_trx_age(lock, type_mode & LOCK_WAIT);
-	} else {
-		HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
-			    lock_rec_fold(space, page_no), lock);
-	}
+	HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
+		    lock_rec_fold(space, page_no), lock);

 	if (!caller_owns_trx_mutex) {
 		trx_mutex_enter(trx);
@@ -2052,6 +2048,13 @@ lock_rec_enqueue_waiting(
 		return(DB_SUCCESS_LOCKED_REC);
 	}

+	// Re-insert the lock by transaction age only when it does not cause a deadlock.
+	if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) {
+		HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
+			lock_rec_fold(buf_block_get_space(block), buf_block_get_page_no(block)), lock);
+		lock_rec_insert_by_trx_age(lock, true);
+	}
+
 	trx->lock.que_state = TRX_QUE_LOCK_WAIT;

 	trx->lock.was_chosen_as_deadlock_victim = FALSE;
@@ -3798,7 +3801,8 @@ lock_get_first_lock(
 	}

 	ut_a(lock != NULL);
-	ut_a(lock != ctx->wait_lock);
+	ut_a(lock != ctx->wait_lock ||
+	     innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS);
 	ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));

 	return(lock);
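Note on the series as a whole (not part of the patches): with PATCH 4/4 applied, lock_rec_create() always goes through the plain HASH_INSERT path, and the age-based re-insertion happens only in lock_rec_enqueue_waiting(), after the deadlock check has passed. The release path is the other half of the scheme. The sketch below is a standalone illustration, using toy types and a placeholder conflict test rather than InnoDB code, of the grant pass that the VATS branch of lock_rec_dequeue_from_page() performs together with lock_rec_move_to_front(): walk the record's list, grant each waiter that no longer has to wait, and pull the newly granted lock to the head so granted locks stay in front of the remaining waiters.

#include <cstddef>

struct toy_lock {
	bool		waiting;
	toy_lock*	next;	/* plays the role of lock_t::hash */
};

/* Placeholder for lock_rec_has_to_wait_in_queue(): for this illustration,
   assume the released lock was the only conflict, so nobody has to wait. */
static bool toy_has_to_wait(const toy_lock*)
{
	return false;
}

/* Grant pass after a lock release; returns the (possibly new) list head. */
static toy_lock* toy_grant_pass(toy_lock* head)
{
	toy_lock*	previous = NULL;
	toy_lock*	lock = head;

	while (lock != NULL) {
		if (lock->waiting && !toy_has_to_wait(lock)) {
			lock->waiting = false;	/* grant, as lock_grant() would */

			if (previous != NULL) {
				/* Unlink and move to the front, as
				lock_rec_move_to_front() does via the hash cell. */
				previous->next = lock->next;
				lock->next = head;
				head = lock;
				lock = previous->next;
			} else {
				/* Already at the head of the list. */
				previous = lock;
				lock = lock->next;
			}
		} else {
			previous = lock;
			lock = lock->next;
		}
	}
	return head;
}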