diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 1a65c5f..31af630 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -300,6 +300,22 @@ static TYPELIB innodb_default_row_format_typelib = { NULL }; +/** Possible values of the parameter innodb_lock_schedule_algorithm */ +static const char* innodb_lock_schedule_algorithm_names[] = { + "fcfs", + "vats", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_lock_schedule_algorithm. */ +static TYPELIB innodb_lock_schedule_algorithm_typelib = { + array_elements(innodb_lock_schedule_algorithm_names) - 1, + "innodb_lock_schedule_algorithm_typelib", + innodb_lock_schedule_algorithm_names, + NULL +}; + /* The following counter is used to convey information to InnoDB about server activity: in case of normal DML ops it is not sensible to call srv_active_wake_master_thread after each @@ -1364,7 +1380,7 @@ thd_is_replication_slave_thread( /*============================*/ THD* thd) /*!< in: thread handle */ { - return((ibool) thd_slave_thread(thd)); + return thd && ((ibool) thd_slave_thread(thd)); } /******************************************************************//** @@ -19467,6 +19483,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size, NULL, NULL, 120, 1, 127, 0); #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ +static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm, + PLUGIN_VAR_RQCMDARG, + "The algorithm Innodb uses for deciding which locks to grant next when" + " a lock is released. Possible values are" + " FCFS" + " grant the locks in First-Come-First-Served order;" + " VATS" + " use the Variance-Aware-Transaction-Scheduling algorithm, which" + " uses an Eldest-Transaction-First heuristic.", + NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + &innodb_lock_schedule_algorithm_typelib); + static MYSQL_SYSVAR_ULONG(buffer_pool_instances, srv_buf_pool_instances, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of buffer pool instances, set to higher value on high-end machines to increase scalability", @@ -20070,6 +20098,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(ft_sort_pll_degree), MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(force_load_corrupted), + MYSQL_SYSVAR(lock_schedule_algorithm), MYSQL_SYSVAR(locks_unsafe_for_binlog), MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(deadlock_detect), diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 8db09e5..5770b8a 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -40,6 +40,17 @@ Created 5/7/1996 Heikki Tuuri #include "gis0rtree.h" #include "lock0prdt.h" +/** Alternatives for innodb_lock_schedule_algorithm, which can be changed by + setting innodb_lock_schedule_algorithm. */ +enum innodb_lock_schedule_algorithm_t { + /*!< First Come First Served */ + INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, + /*!< Variance-Aware-Transaction-Scheduling */ + INNODB_LOCK_SCHEDULE_ALGORITHM_VATS +}; + +extern ulong innodb_lock_schedule_algorithm; + // Forward declaration class ReadView; diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 45e567e..505e489 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -1089,6 +1089,8 @@ struct trx_t { time_t start_time; /*!< time the state last time became TRX_STATE_ACTIVE */ + clock_t start_time_micro; /*!< start time of the transaction + in microseconds. */ lsn_t commit_lsn; /*!< lsn at the time of the commit */ table_id_t table_id; /*!< Table to drop iff dict_operation == TRX_DICT_OP_TABLE, or 0. */ diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index 540bb61..7269433 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -26,6 +26,7 @@ Created 5/7/1996 Heikki Tuuri #define LOCK_MODULE_IMPLEMENTATION #include +#include #include "ha_prototypes.h" #include "lock0lock.h" @@ -54,6 +55,9 @@ Created 5/7/1996 Heikki Tuuri /* Flag to enable/disable deadlock detector. */ my_bool innobase_deadlock_detect = TRUE; +/** Lock scheduling algorithm */ +ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS; + /** Total number of cached record locks */ static const ulint REC_LOCK_CACHE = 8; @@ -66,6 +70,25 @@ static const ulint TABLE_LOCK_CACHE = 8; /** Size in bytes, of the table lock instance */ static const ulint TABLE_LOCK_SIZE = sizeof(ib_lock_t); +/*********************************************************************//** +Checks if a waiting record lock request still has to wait in a queue. +@return lock that is causing the wait */ +static +const lock_t* +lock_rec_has_to_wait_in_queue( +/*==========================*/ + const lock_t* wait_lock); /*!< in: waiting record lock */ + +/*************************************************************//** +Grants a lock to a waiting lock request and releases the waiting transaction. +The caller must hold lock_sys->mutex. */ +static +void +lock_grant( +/*=======*/ + lock_t* lock, /*!< in/out: waiting lock request */ + bool owns_trx_mutex); /*!< in: whether lock->trx->mutex is owned */ + /** Deadlock checker. */ class DeadlockChecker { public: @@ -1488,6 +1511,158 @@ RecLock::lock_alloc( return(lock); } +/*********************************************************************//** +Check if lock1 has higher priority than lock2. +NULL has lowest priority. +If either is a high priority transaction, the lock has higher priority. +If neither of them is wait lock, the first one has higher priority. +If only one of them is a wait lock, it has lower priority. +Otherwise, the one with an older transaction has higher priority. +@returns true if lock1 has higher priority, false otherwise. */ +bool +has_higher_priority( + lock_t *lock1, + lock_t *lock2) +{ + if (lock1 == NULL) { + return false; + } else if (lock2 == NULL) { + return true; + } + if (trx_is_high_priority(lock1->trx)) { + return true; + } + if (trx_is_high_priority(lock2->trx)) { + return false; + } + if (!lock_get_wait(lock1)) { + return true; + } else if (!lock_get_wait(lock2)) { + return false; + } + return lock1->trx->start_time_micro <= lock2->trx->start_time_micro; +} + +/*********************************************************************//** +Insert a lock to the hash list according to the mode (whether it is a wait +lock) and the age of the transaction the it is associated with. +If the lock is not a wait lock, insert it to the head of the hash list. +Otherwise, insert it to the middle of the wait locks according to the age of +the transaciton. */ +static +dberr_t +lock_rec_insert_by_trx_age( + lock_t *in_lock) /*!< in: lock to be insert */{ + ulint space; + ulint page_no; + ulint rec_fold; + hash_table_t* hash; + hash_cell_t* cell; + lock_t* node; + lock_t* next; + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); + hash = lock_hash_get(in_lock->type_mode); + cell = hash_get_nth_cell(hash, + hash_calc_hash(rec_fold, hash)); + + node = (lock_t *) cell->node; + // If in_lock is not a wait lock, we insert it to the head of the list. + if (node == NULL || !lock_get_wait(in_lock) || has_higher_priority(in_lock, node)) { + cell->node = in_lock; + in_lock->hash = node; + if (lock_get_wait(in_lock)) { + lock_grant(in_lock, true); + return DB_SUCCESS_LOCKED_REC; + } + return DB_SUCCESS; + } + while (node != NULL && has_higher_priority((lock_t *) node->hash, + in_lock)) { + node = (lock_t *) node->hash; + } + next = (lock_t *) node->hash; + node->hash = in_lock; + in_lock->hash = next; + + if (lock_get_wait(in_lock) && !lock_rec_has_to_wait_in_queue(in_lock)) { + lock_grant(in_lock, true); + if (cell->node != in_lock) { + // Move it to the front of the queue + node->hash = in_lock->hash; + next = (lock_t *) cell->node; + cell->node = in_lock; + in_lock->hash = next; + } + return DB_SUCCESS_LOCKED_REC; + } + + return DB_SUCCESS; +} + +static +bool +lock_queue_validate( + const lock_t *in_lock) /*!< in: lock whose hash list is to be validated */ +{ + ulint space; + ulint page_no; + ulint rec_fold; + hash_table_t* hash; + hash_cell_t* cell; + lock_t* next; + bool wait_lock = false; + + if (in_lock == NULL) { + return true; + } + + space = in_lock->un_member.rec_lock.space; + page_no = in_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); + hash = lock_hash_get(in_lock->type_mode); + cell = hash_get_nth_cell(hash, + hash_calc_hash(rec_fold, hash)); + next = (lock_t *) cell->node; + while (next != NULL) { + // If this is a granted lock, check that there's no wait lock before it. + if (!lock_get_wait(next)) { + ut_ad(!wait_lock); + } else { + wait_lock = true; + } + next = next->hash; + } + return true; +} + +static +void +lock_rec_insert_to_head( + lock_t *in_lock, /*!< in: lock to be insert */ + ulint rec_fold) /*!< in: rec_fold of the page */ +{ + hash_table_t* hash; + hash_cell_t* cell; + lock_t* node; + + if (in_lock == NULL) { + return; + } + + hash = lock_hash_get(in_lock->type_mode); + cell = hash_get_nth_cell(hash, + hash_calc_hash(rec_fold, hash)); + node = (lock_t *) cell->node; + if (node != in_lock) { + cell->node = in_lock; + in_lock->hash = node; + } +} + + /** Add the lock to the record lock hash and the transaction's lock list @param[in,out] lock Newly created record lock to add to the rec hash @@ -1497,16 +1672,28 @@ RecLock::lock_add(lock_t* lock, bool add_to_hash) { ut_ad(lock_mutex_own()); ut_ad(trx_mutex_own(lock->trx)); + + bool wait_lock = m_mode & LOCK_WAIT; if (add_to_hash) { ulint key = m_rec_id.fold(); + hash_table_t *lock_hash = lock_hash_get(m_mode); ++lock->index->table->n_rec_locks; - - HASH_INSERT(lock_t, hash, lock_hash_get(m_mode), key, lock); - } - - if (m_mode & LOCK_WAIT) { + + if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS + && !thd_is_replication_slave_thread(lock->trx->mysql_thd)) { + if (wait_lock) { + HASH_INSERT(lock_t, hash, lock_hash, key, lock); + } else { + lock_rec_insert_to_head(lock, m_rec_id.fold()); + } + } else { + HASH_INSERT(lock_t, hash, lock_hash, key, lock); + } + } + + if (wait_lock) { lock_set_lock_and_trx_wait(lock, lock->trx); } @@ -1732,6 +1919,21 @@ RecLock::add_to_waitq(const lock_t* wait_for, const lock_prdt_t* prdt) dberr_t err = deadlock_check(lock); ut_ad(trx_mutex_own(m_trx)); + + // Move it only when it does not cause a deadlock. + if (err != DB_DEADLOCK + && innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS + && !thd_is_replication_slave_thread(lock->trx->mysql_thd) + && !trx_is_high_priority(lock->trx)) { + + HASH_DELETE(lock_t, hash, lock_hash_get(lock->type_mode), + m_rec_id.fold(), lock); + dberr_t res = lock_rec_insert_by_trx_age(lock); + if (res != DB_SUCCESS) { + return res; + } + } /* m_trx->mysql_thd is NULL if it's an internal trx. So current_thd is used */ if (err == DB_LOCK_WAIT) { @@ -2120,13 +2322,16 @@ static void lock_grant( /*=======*/ - lock_t* lock) /*!< in/out: waiting lock request */ + lock_t* lock, /*!< in/out: waiting lock request */ + bool owns_trx_mutex) /*!< in: whether lock->trx->mutex is owned */ { ut_ad(lock_mutex_own()); - + ut_ad(trx_mutex_own(lock->trx) == owns_trx_mutex); lock_reset_lock_and_trx_wait(lock); - trx_mutex_enter(lock->trx); + if (!owns_trx_mutex) { + trx_mutex_enter(lock->trx); + } if (lock_get_mode(lock) == LOCK_AUTO_INC) { dict_table_t* table = lock->un_member.tab_lock.table; @@ -2159,7 +2364,9 @@ lock_grant( } } - trx_mutex_exit(lock->trx); + if (!owns_trx_mutex) { + trx_mutex_exit(lock->trx); + } } /** @@ -2423,6 +2630,65 @@ lock_rec_cancel( trx_mutex_exit(lock->trx); } +static +void +lock_grant_and_move_on_page( + hash_table_t *lock_hash, + ulint space, + ulint page_no) +{ + lock_t* lock; + lock_t* previous; + ulint rec_fold = lock_rec_fold(space, page_no); + + previous = (lock_t *) hash_get_nth_cell(lock_hash, + hash_calc_hash(rec_fold, lock_hash))->node; + if (previous == NULL) { + return; + } + if (previous->un_member.rec_lock.space == space && + previous->un_member.rec_lock.page_no == page_no) { + lock = previous; + } + else { + while (previous->hash && + (previous->hash->un_member.rec_lock.space != space || + previous->hash->un_member.rec_lock.page_no != page_no)) { + previous = previous->hash; + } + lock = previous->hash; + } + + ut_ad(previous->hash == lock || previous == lock); + /* Grant locks if there are no conflicting locks ahead. + Move granted locks to the head of the list. */ + for (;lock != NULL;) { + + /* If the lock is a wait lock on this page, and it does not need to wait. */ + if ((lock->un_member.rec_lock.space == space) + && (lock->un_member.rec_lock.page_no == page_no) + && lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + lock_grant(lock, false); + + if (previous != NULL) { + /* Move the lock to the head of the list. */ + HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock); + lock_rec_insert_to_head(lock, rec_fold); + } else { + /* Already at the head of the list. */ + previous = lock; + } + /* Move on to the next lock. */ + lock = static_cast(HASH_GET_NEXT(hash, previous)); + } else { + previous = lock; + lock = static_cast(HASH_GET_NEXT(hash, lock)); + } + } +} + /*************************************************************//** Removes a record lock request, waiting or granted, from the queue and grants locks to other transactions in the queue if they now are entitled @@ -2465,21 +2731,29 @@ lock_rec_dequeue_from_page( MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_DEC(MONITOR_NUM_RECLOCK); - /* Check if waiting locks in the queue can now be granted: grant - locks if there are no conflicting locks ahead. Stop at the first - X lock that is waiting or has been granted. */ + if (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || + thd_is_replication_slave_thread(in_lock->trx->mysql_thd)) { - for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no); - lock != NULL; - lock = lock_rec_get_next_on_page(lock)) { + /* Check if waiting locks in the queue can now be granted: + grant locks if there are no conflicting locks ahead. Stop at + the first X lock that is waiting or has been granted. */ - if (lock_get_wait(lock) - && !lock_rec_has_to_wait_in_queue(lock)) { + for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, + page_no); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { - /* Grant the lock */ - ut_ad(lock->trx != in_lock->trx); - lock_grant(lock); + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock, false); + } } + } else { + lock_grant_and_move_on_page(lock_hash, space, page_no); } } @@ -4124,7 +4398,7 @@ lock_table_dequeue( /* Grant the lock */ ut_ad(in_lock->trx != lock->trx); - lock_grant(lock); + lock_grant(lock, false); } } } @@ -4208,6 +4482,66 @@ run_again: } /*=========================== LOCK RELEASE ==============================*/ +static +void +lock_grant_and_move_on_rec( + hash_table_t *lock_hash, + lock_t *first_lock, + ulint heap_no) +{ + lock_t* lock; + lock_t* previous; + ulint space; + ulint page_no; + ulint rec_fold; + + space = first_lock->un_member.rec_lock.space; + page_no = first_lock->un_member.rec_lock.page_no; + rec_fold = lock_rec_fold(space, page_no); + + previous = (lock_t *) hash_get_nth_cell(lock_hash, + hash_calc_hash(rec_fold, lock_hash))->node; + if (previous == NULL) { + return; + } + if (previous == first_lock) { + lock = previous; + } else { + while (previous->hash && + previous->hash != first_lock) { + previous = previous->hash; + } + lock = previous->hash; + } + /* Grant locks if there are no conflicting locks ahead. + Move granted locks to the head of the list. */ + for (;lock != NULL;) { + + /* If the lock is a wait lock on this page, and it does not need to wait. */ + if (lock->un_member.rec_lock.space == space + && lock->un_member.rec_lock.page_no == page_no + && lock_rec_get_nth_bit(lock, heap_no) + && lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + lock_grant(lock, false); + + if (previous != NULL) { + /* Move the lock to the head of the list. */ + HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock); + lock_rec_insert_to_head(lock, rec_fold); + } else { + /* Already at the head of the list. */ + previous = lock; + } + /* Move on to the next lock. */ + lock = static_cast(HASH_GET_NEXT(hash, previous)); + } else { + previous = lock; + lock = static_cast(HASH_GET_NEXT(hash, lock)); + } + } +} /*************************************************************//** Removes a granted record lock of a transaction from the queue and grants @@ -4268,19 +4602,26 @@ lock_rec_unlock( released: ut_a(!lock_get_wait(lock)); lock_rec_reset_nth_bit(lock, heap_no); - - /* Check if we can now grant waiting lock requests */ - - for (lock = first_lock; lock != NULL; - lock = lock_rec_get_next(heap_no, lock)) { - if (lock_get_wait(lock) - && !lock_rec_has_to_wait_in_queue(lock)) { - - /* Grant the lock */ - ut_ad(trx != lock->trx); - lock_grant(lock); - } - } + + if (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || + thd_is_replication_slave_thread(lock->trx->mysql_thd)) { + + /* Check if we can now grant waiting lock requests */ + + for (lock = first_lock; lock != NULL; + lock = lock_rec_get_next(heap_no, lock)) { + if (lock_get_wait(lock) + && !lock_rec_has_to_wait_in_queue(lock)) { + + /* Grant the lock */ + ut_ad(trx != lock->trx); + lock_grant(lock, false); + } + } + } else { + lock_grant_and_move_on_rec(lock_sys->rec_hash, first_lock, heap_no); + } lock_mutex_exit(); trx_mutex_exit(trx); @@ -5481,6 +5822,9 @@ lock_rec_queue_validate( ut_a(lock_rec_has_to_wait_in_queue(lock)); } } + + ut_ad(innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS || + lock_queue_validate(lock)); func_exit: if (!locked_lock_trx_sys) { @@ -7221,7 +7565,10 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const /* Must find at least two locks, otherwise there cannot be a waiting lock, secondly the first lock cannot be the wait_lock. */ ut_a(lock != NULL); - ut_a(lock != m_wait_lock); + ut_a(lock != m_wait_lock || + (innodb_lock_schedule_algorithm + == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS + && !thd_is_replication_slave_thread(lock->trx->mysql_thd))); /* Check that the lock type doesn't change. */ ut_ad(lock_get_type_low(lock) == lock_get_type_low(m_wait_lock)); diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 7f89df7..021c221 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -1462,6 +1462,8 @@ trx_start_low( trx->start_time = ut_time(); } + trx->start_time_micro = clock(); + ut_a(trx->error_state == DB_SUCCESS); MONITOR_INC(MONITOR_TRX_ACTIVE);