------------------------------------------------------------
revno: 8781
fixes bugs: https://launchpad.net/bugs/1411692 https://launchpad.net/bugs/1411694
committer: Laurynas Biveinis
branch nick: mysql-5.7-percona-patches
timestamp: Fri 2015-01-16 21:30:41 +0200
message:
Fix http://bugs.mysql.com/bug.php?id=75534 (Solve buffer pool mutex contention by splitting it).

The patch:

- Removes the buffer pool mutex. Introduces several new list/hash-protecting mutexes, and access to several variables without any mutex; for those, atomic variables or os_rmb/os_wmb are used where deemed appropriate. volatile is not used.

  The new mutexes are
  - LRU_list_mutex for the LRU_list;
  - zip_free_mutex for the zip_free arrays;
  - zip_hash_mutex for the zip_hash hash and the in_zip_hash flag;
  - free_list_mutex for the free_list and the withdraw list. If desired, a separate withdraw_list_mutex may easily be split off in the future.

  Protection of buf_pool->watch[] and of all bpages has been moved to page_hash.

  The variables switched from buffer pool mutex protection to atomic operations and/or os_rmb/os_wmb (the uses of the latter, while I tried to make them correct, might be very debatable) are:
  - srv_buf_pool_old_size, srv_buf_pool_size, srv_buf_pool_curr_size, srv_buf_pool_base_size
  - buf_pool->buddy_stat[i].used
  - buf_pool->curr_size, n_chunks_new

- Reduces the critical section length, or removes it completely, for buf_block_buf_fix_inc/dec calls.

- Exploits the fact that freed pages must have no pointers to them from the buffer pool or from any thread other than the freeing one, in order to remove redundant locking. The same applies to freshly allocated pages before any pointers to them are published. This, however, necessitates removing some of the debug checks that scan buffer pool chunks directly (buf_block_align), as they have no way to freeze such blocks.

- Related to the above, adds more consistency asserts to buf_page_set_state, as well as some scalability asserts (!mutex_own).

- Rewrites buf_buddy_alloc so that it no longer requires the buffer pool mutex at the start (previously that mutex might be released mid-call, and this fact had to be propagated to the caller so that it could re-check its state). It is now called with mutexes unlocked, and the algorithm of its caller buf_page_init_for_read has been simplified: all its allocations now happen with mutexes unlocked.

- buf_flush_LRU_list_batch uses mutex_enter_nowait to skip over any currently-locked blocks.

- Removes some outdated buf0buf.cc comments.

Bugs fixed fully or partially, besides the current one:
- http://bugs.mysql.com/bug.php?id=64344: buf_page_init_for_read no longer holds mutexes while allocating memory. It should also be easier to fix buf_LRU_free_page now.
- http://bugs.mysql.com/bug.php?id=75503
- http://bugs.mysql.com/bug.php?id=75504

diff:
=== modified file 'storage/innobase/btr/btr0bulk.cc' --- storage/innobase/btr/btr0bulk.cc 2014-08-19 05:43:25 +0000 +++ storage/innobase/btr/btr0bulk.cc 2015-01-16 19:30:41 +0000 @@ -568,9 +568,7 @@ #endif /* UNIV_DEBUG */ /* We fix the block because we will re-pin it soon. 
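A minimal sketch of the locking scheme described in the commit message, under simplified assumptions: std::mutex and std::atomic stand in for InnoDB's ib_mutex_t, os_atomic_*() and os_rmb/os_wmb primitives, and every name below (buf_pool_sketch, take_from_free_list, note_pending_read, publish_pool_size, read_pool_size) is invented for illustration and does not appear in the patch.

    #include <atomic>
    #include <list>
    #include <mutex>

    struct page_t { bool is_old; };

    /* The single buf_pool->mutex is replaced by one mutex per data
    structure, so threads touching unrelated lists no longer contend. */
    struct buf_pool_sketch {
        std::mutex LRU_list_mutex;   /* protects the LRU list */
        std::mutex free_list_mutex;  /* protects free and withdraw lists */
        std::mutex zip_free_mutex;   /* protects the zip_free arrays */
        std::mutex zip_hash_mutex;   /* protects zip_hash / in_zip_hash */

        std::list<page_t*> LRU;
        std::list<page_t*> free_list;
        std::list<page_t*> withdraw;

        /* Counters formerly guarded by the buffer pool mutex become
        atomics (the patch uses os_atomic_increment_ulint() and friends). */
        std::atomic<unsigned long> n_pend_reads{0};
        std::atomic<unsigned long> buddy_used{0};
    };

    /* A free-list operation now takes only free_list_mutex, leaving the
    LRU list available to other threads in parallel. */
    page_t* take_from_free_list(buf_pool_sketch& pool)
    {
        std::lock_guard<std::mutex> guard(pool.free_list_mutex);
        if (pool.free_list.empty()) {
            return nullptr;
        }
        page_t* page = pool.free_list.front();
        pool.free_list.pop_front();
        return page;
    }

    /* Statistics updates need no list mutex at all any more. */
    void note_pending_read(buf_pool_sketch& pool)
    {
        pool.n_pend_reads.fetch_add(1, std::memory_order_relaxed);
    }

    /* Size variables such as srv_buf_pool_size are written before a write
    barrier (os_wmb in the patch) and read after a read barrier (os_rmb);
    release/acquire on an atomic is a close portable approximation. */
    std::atomic<unsigned long> pool_size_bytes{0};

    void publish_pool_size(unsigned long bytes)
    {
        pool_size_bytes.store(bytes, std::memory_order_release); /* ~ os_wmb */
    }

    unsigned long read_pool_size()
    {
        return pool_size_bytes.load(std::memory_order_acquire);  /* ~ os_rmb */
    }

The sketch only shows the shape of the split; the actual lock ordering (for example, LRU_list_mutex taken before a block mutex) is the one suggested by the hunks below.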
*/ - buf_page_mutex_enter(m_block); buf_block_buf_fix_inc(m_block, __FILE__, __LINE__); - buf_page_mutex_exit(m_block); mtr_commit(m_mtr); } @@ -598,9 +596,7 @@ ut_ad(m_block != NULL); } - buf_page_mutex_enter(m_block); buf_block_buf_fix_dec(m_block); - buf_page_mutex_exit(m_block); #ifdef UNIV_DEBUG page_header_set_ptr(m_page, NULL, PAGE_HEAP_TOP, === modified file 'storage/innobase/btr/btr0cur.cc' --- storage/innobase/btr/btr0cur.cc 2014-08-26 11:08:37 +0000 +++ storage/innobase/btr/btr0cur.cc 2015-01-16 19:30:41 +0000 @@ -467,9 +467,7 @@ if (btr_page_get_prev(buf_block_get_frame(block), mtr) == left_page_no) { /* adjust buf_fix_count */ - buf_page_mutex_enter(block); buf_block_buf_fix_dec(block); - buf_page_mutex_exit(block); *latch_mode = mode; return(true); @@ -486,9 +484,7 @@ } unpin_failed: /* unpin the block */ - buf_page_mutex_enter(block); buf_block_buf_fix_dec(block); - buf_page_mutex_exit(block); return(false); @@ -6260,33 +6256,40 @@ mtr_t* mtr) /*!< in: mini-transaction to commit */ { buf_pool_t* buf_pool = buf_pool_from_block(block); - ulint space = block->page.id.space(); - ulint page_no = block->page.id.page_no(); + page_id_t page_id(block->page.id.space(), + block->page.id.page_no()); + bool freed = false; ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table)); mtr_commit(mtr); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + buf_page_mutex_enter(block); /* Only free the block if it is still allocated to the same file page. */ - if (buf_block_get_state(block) - == BUF_BLOCK_FILE_PAGE - && block->page.id.space() == space - && block->page.id.page_no() == page_no) { - - if (!buf_LRU_free_page(&block->page, all) - && all && block->page.zip.data) { + if (buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE + && page_id.equals_to(block->page.id)) { + + freed = buf_LRU_free_page(&block->page, all); + + if (!freed && all && block->page.zip.data + && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE + && page_id.equals_to(block->page.id)) { + /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. */ - buf_LRU_free_page(&block->page, false); + freed = buf_LRU_free_page(&block->page, false); } } - buf_pool_mutex_exit(buf_pool); + if (!freed) { + mutex_exit(&buf_pool->LRU_list_mutex); + buf_page_mutex_exit(block); + } } /** Helper class used while writing blob pages, during insert or update. */ === modified file 'storage/innobase/btr/btr0sea.cc' --- storage/innobase/btr/btr0sea.cc 2014-08-26 11:08:37 +0000 +++ storage/innobase/btr/btr0sea.cc 2015-01-16 19:30:41 +0000 @@ -294,12 +294,9 @@ btr_search_enable(void) /*====================*/ { - buf_pool_mutex_enter_all(); - if (srv_buf_pool_old_size != srv_buf_pool_size) { - buf_pool_mutex_exit_all(); + os_rmb; + if (srv_buf_pool_old_size != srv_buf_pool_size) return; - } - buf_pool_mutex_exit_all(); rw_lock_x_lock(&btr_search_latch); @@ -1036,11 +1033,6 @@ #ifdef UNIV_SEARCH_PERF_STAT btr_search_n_succ++; #endif - if (!has_search_latch && buf_page_peek_if_too_old(&block->page)) { - - buf_page_make_young(&block->page); - } - /* Increment the page get statistics though we did not really fix the page: for user info only */ @@ -1883,7 +1875,6 @@ rec_offs_init(offsets_); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter_all(); cell_count = hash_get_n_cells(btr_search_sys->hash_index); @@ -1891,11 +1882,9 @@ /* We release btr_search_latch every once in a while to give other queries a chance to run. 
*/ if ((i != 0) && ((i % chunk_size) == 0)) { - buf_pool_mutex_exit_all(); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter_all(); if (cell_count != hash_get_n_cells( btr_search_sys->hash_index)) { @@ -1913,13 +1902,16 @@ hash_get_nth_cell(btr_search_sys->hash_index, i)->node; for (; node != NULL; node = node->next) { - const buf_block_t* block + buf_block_t* block = buf_block_align((byte*) node->data); const buf_block_t* hash_block; buf_pool_t* buf_pool; index_id_t page_index_id; buf_pool = buf_pool_from_bpage((buf_page_t*) block); + /* Prevent BUF_BLOCK_FILE_PAGE -> BUF_BLOCK_REMOVE_HASH + transition until we lock the block mutex */ + mutex_enter(&buf_pool->LRU_list_mutex); if (UNIV_LIKELY(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE)) { @@ -1953,6 +1945,9 @@ == BUF_BLOCK_REMOVE_HASH); } + mutex_enter(&block->mutex); + mutex_exit(&buf_pool->LRU_list_mutex); + ut_a(!dict_index_is_ibuf(block->index)); ut_ad(block->page.id.space() == block->index->space); @@ -2001,6 +1996,8 @@ n_page_dumps++; } } + + mutex_exit(&block->mutex); } } @@ -2008,11 +2005,9 @@ /* We release btr_search_latch every once in a while to give other queries a chance to run. */ if (i != 0) { - buf_pool_mutex_exit_all(); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter_all(); if (cell_count != hash_get_n_cells( btr_search_sys->hash_index)) { @@ -2033,7 +2028,6 @@ } } - buf_pool_mutex_exit_all(); rw_lock_x_unlock(&btr_search_latch); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); === modified file 'storage/innobase/buf/buf0buddy.cc' --- storage/innobase/buf/buf0buddy.cc 2014-07-10 10:46:02 +0000 +++ storage/innobase/buf/buf0buddy.cc 2015-01-16 19:30:41 +0000 @@ -196,6 +196,7 @@ ulint i) { CheckZipFree check(i); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_list_validate(buf_pool->zip_free[i], check); } @@ -213,7 +214,7 @@ { const ulint size = BUF_BUDDY_LOW << i; - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(!ut_align_offset(buf, size)); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); @@ -286,7 +287,7 @@ ulint i) /*!< in: index of buf_pool->zip_free[] */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(buf_pool->zip_free[i].start != buf); buf_buddy_stamp_free(buf, i); @@ -306,7 +307,7 @@ ulint i) /*!< in: index of buf_pool->zip_free[] */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(buf_buddy_check_free(buf_pool, buf, i)); UT_LIST_REMOVE(buf_pool->zip_free[i], buf); @@ -325,17 +326,15 @@ { buf_buddy_free_t* buf; - ut_ad(buf_pool_mutex_own(buf_pool)); ut_a(i < BUF_BUDDY_SIZES); ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + mutex_enter(&buf_pool->zip_free_mutex); ut_d(buf_buddy_list_validate(buf_pool, i)); buf = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); - if (buf_pool->curr_size < buf_pool->old_size - && UT_LIST_GET_LEN(buf_pool->withdraw) - < buf_pool->withdraw_target) { + if (buf_get_withdraw_depth(buf_pool)) { while (buf != NULL && buf_frame_will_withdrawn( @@ -347,7 +346,10 @@ if (buf) { buf_buddy_remove_from_free(buf_pool, buf, i); + mutex_exit(&buf_pool->zip_free_mutex); + } else if (i + 1 < BUF_BUDDY_SIZES) { + mutex_exit(&buf_pool->zip_free_mutex); /* Attempt to split. 
*/ buf = buf_buddy_alloc_zip(buf_pool, i + 1); @@ -357,9 +359,13 @@ buf->stamp.bytes + (BUF_BUDDY_LOW << i)); + mutex_enter(&buf_pool->zip_free_mutex); ut_ad(!buf_pool_contains_zip(buf_pool, buddy)); buf_buddy_add_to_free(buf_pool, buddy, i); + mutex_exit(&buf_pool->zip_free_mutex); } + } else { + mutex_exit(&buf_pool->zip_free_mutex); } if (buf) { @@ -388,12 +394,12 @@ { const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); buf_page_t* bpage; - buf_block_t* block; - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE)); + mutex_enter(&buf_pool->zip_hash_mutex); + HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage, ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY && bpage->in_zip_hash && !bpage->in_page_hash), @@ -405,16 +411,15 @@ ut_d(bpage->in_zip_hash = FALSE); HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage); + ut_ad(buf_pool->buddy_n_frames > 0); + ut_d(buf_pool->buddy_n_frames--); + + mutex_exit(&buf_pool->zip_hash_mutex); + ut_d(memset(buf, 0, UNIV_PAGE_SIZE)); UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE); - block = (buf_block_t*) bpage; - buf_page_mutex_enter(block); - buf_LRU_block_free_non_file_page(block); - buf_page_mutex_exit(block); - - ut_ad(buf_pool->buddy_n_frames > 0); - ut_d(buf_pool->buddy_n_frames--); + buf_LRU_block_free_non_file_page(reinterpret_cast(bpage)); } /**********************************************************************//** @@ -427,7 +432,6 @@ { buf_pool_t* buf_pool = buf_pool_from_block(block); const ulint fold = BUF_POOL_ZIP_FOLD(block); - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); @@ -439,9 +443,12 @@ ut_ad(!block->page.in_page_hash); ut_ad(!block->page.in_zip_hash); ut_d(block->page.in_zip_hash = TRUE); + + mutex_enter(&buf_pool->zip_hash_mutex); HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page); ut_d(buf_pool->buddy_n_frames++); + mutex_exit(&buf_pool->zip_hash_mutex); } /**********************************************************************//** @@ -459,6 +466,7 @@ of buf_pool->zip_free[] */ { ulint offs = BUF_BUDDY_LOW << j; + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(j <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); ut_ad(j >= i); @@ -481,27 +489,18 @@ } /**********************************************************************//** -Allocate a block. The thread calling this function must hold -buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex. -The buf_pool_mutex may be released and reacquired. +Allocate a block. @return allocated block, never NULL */ - void* buf_buddy_alloc_low( /*================*/ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint i, /*!< in: index of buf_pool->zip_free[], + ulint i) /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that - will be assigned TRUE if storage was - allocated from the LRU list and - buf_pool->mutex was temporarily - released */ { buf_block_t* block; - ut_ad(lru); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); @@ -523,24 +522,24 @@ } /* Try replacing an uncompressed page in the buffer pool. 
*/ - buf_pool_mutex_exit(buf_pool); block = buf_LRU_get_free_block(buf_pool); - *lru = TRUE; - buf_pool_mutex_enter(buf_pool); alloc_big: buf_buddy_block_register(block); + mutex_enter(&buf_pool->zip_free_mutex); block = (buf_block_t*) buf_buddy_alloc_from( buf_pool, block->frame, i, BUF_BUDDY_SIZES); + mutex_exit(&buf_pool->zip_free_mutex); func_exit: - buf_pool->buddy_stat[i].used++; + os_atomic_increment_ulint(&buf_pool->buddy_stat[i].used, 1); return(block); } /**********************************************************************//** -Try to relocate a block. +Try to relocate a block. The caller must hold zip_free_mutex, and this +function will release and lock it again. @return true if relocated */ static bool @@ -559,7 +558,7 @@ ulint space; ulint offset; - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); @@ -578,11 +577,15 @@ ut_ad(space != BUF_BUDDY_STAMP_FREE); + mutex_exit(&buf_pool->zip_free_mutex); + const page_id_t page_id(space, offset); /* If space,offset is bogus, then we know that the buf_page_hash_get_low() call below will return NULL. */ if (!force && buf_pool != buf_pool_get(page_id)) { + + mutex_enter(&buf_pool->zip_free_mutex); return(false); } @@ -601,6 +604,7 @@ rw_lock_x_unlock(hash_lock); if (!force || space != 0 || offset != 0) { + mutex_enter(&buf_pool->zip_free_mutex); return(false); } @@ -619,6 +623,7 @@ } if (bpage == NULL) { + mutex_enter(&buf_pool->zip_free_mutex); return(false); } } @@ -631,6 +636,7 @@ rw_lock_x_unlock(hash_lock); + mutex_enter(&buf_pool->zip_free_mutex); return(false); } @@ -642,6 +648,8 @@ mutex_enter(block_mutex); + mutex_enter(&buf_pool->zip_free_mutex); + if (buf_page_can_relocate(bpage)) { /* Relocate the compressed page. */ uintmax_t usec = ut_time_us(NULL); @@ -684,17 +692,19 @@ { buf_buddy_free_t* buddy; - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); + + mutex_enter(&buf_pool->zip_free_mutex); + ut_ad(buf_pool->buddy_stat[i].used > 0); - - buf_pool->buddy_stat[i].used--; + os_atomic_decrement_ulint(&buf_pool->buddy_stat[i].used, 1); recombine: UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i); if (i == BUF_BUDDY_SIZES) { + mutex_exit(&buf_pool->zip_free_mutex); buf_buddy_block_free(buf_pool, buf); return; } @@ -763,13 +773,15 @@ buf_buddy_add_to_free(buf_pool, reinterpret_cast(buf), i); + mutex_exit(&buf_pool->zip_free_mutex); } -/** Reallocate a block. +/** Try to reallocate a block. @param[in] buf_pool buffer pool instance @param[in] buf block to be reallocated, must be pointed to by the buffer pool @param[in] size block size, up to UNIV_PAGE_SIZE +@retval true if succeeded or if failed because the block was fixed @retval false if failed because of no free blocks. 
*/ bool @@ -781,7 +793,6 @@ buf_block_t* block = NULL; ulint i = buf_buddy_get_slot(size); - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(&buf_pool->zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); @@ -802,23 +813,29 @@ buf_buddy_block_register(block); + mutex_enter(&buf_pool->zip_free_mutex); block = reinterpret_cast( buf_buddy_alloc_from( buf_pool, block->frame, i, BUF_BUDDY_SIZES)); + } else { + mutex_enter(&buf_pool->zip_free_mutex); } - buf_pool->buddy_stat[i].used++; + os_atomic_increment_ulint(&buf_pool->buddy_stat[i].used, 1); /* Try to relocate the buddy of buf to the free block. */ if (buf_buddy_relocate(buf_pool, buf, block, i, true)) { + mutex_exit(&buf_pool->zip_free_mutex); /* succeeded */ buf_buddy_free_low(buf_pool, buf, i); - } else { - /* failed */ - buf_buddy_free_low(buf_pool, block, i); + return(true); } - return(true); /* free_list was enough */ + /* failed */ + mutex_exit(&buf_pool->zip_free_mutex); + buf_buddy_free_low(buf_pool, block, i); + + return(false); } /** Combine all pairs of free buddies. @@ -828,7 +845,7 @@ buf_buddy_condense_free( buf_pool_t* buf_pool) { - ut_ad(buf_pool_mutex_own(buf_pool)); + mutex_enter(&buf_pool->zip_free_mutex); ut_ad(buf_pool->curr_size < buf_pool->old_size); for (ulint i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) { @@ -873,7 +890,8 @@ /* Both buf and buddy are free. Try to combine them. */ buf_buddy_remove_from_free(buf_pool, buf, i); - buf_pool->buddy_stat[i].used++; + os_atomic_increment_ulint( + &buf_pool->buddy_stat[i].used, 1); buf_buddy_free_low(buf_pool, buf, i); } @@ -881,4 +899,5 @@ buf = next; } } + mutex_exit(&buf_pool->zip_free_mutex); } === modified file 'storage/innobase/buf/buf0buf.cc' --- storage/innobase/buf/buf0buf.cc 2014-11-25 10:24:36 +0000 +++ storage/innobase/buf/buf0buf.cc 2015-01-16 19:30:41 +0000 @@ -78,21 +78,6 @@ IMPLEMENTATION OF THE BUFFER POOL ================================= -Performance improvement: ------------------------- -Thread scheduling in NT may be so slow that the OS wait mechanism should -not be used even in waiting for disk reads to complete. -Rather, we should put waiting query threads to the queue of -waiting jobs, and let the OS thread do something useful while the i/o -is processed. In this way we could remove most OS thread switches in -an i/o-intensive benchmark like TPC-C. - -A possibility is to put a user space thread library between the database -and NT. User space thread libraries might be very fast. - -SQL Server 7.0 can be configured to use 'fibers' which are lightweight -threads in NT. These should be studied. - Buffer frames and blocks ------------------------ Following the terminology of Gray and Reuter, we call the memory @@ -103,24 +88,9 @@ Buffer pool struct ------------------ -The buffer buf_pool contains a single mutex which protects all the +The buffer buf_pool contains several mutexes which protects all the control data structures of the buf_pool. The content of a buffer frame is protected by a separate read-write lock in its control block, though. -These locks can be locked and unlocked without owning the buf_pool->mutex. -The OS events in the buf_pool struct can be waited for without owning the -buf_pool->mutex. - -The buf_pool->mutex is a hot-spot in main memory, causing a lot of -memory bus traffic on multiprocessor systems when processors -alternately access the mutex. On our Pentium, the mutex is accessed -maybe every 10 microseconds. 
We gave up the solution to have mutexes -for each control block, for instance, because it seemed to be -complicated. - -A solution to reduce mutex contention of the buf_pool->mutex is to -create a separate mutex for the page hash table. On Pentium, -accessing the hash table takes 2 microseconds, about half -of the total buf_pool->mutex hold time. Control blocks -------------- @@ -135,16 +105,6 @@ address of a frame is divisible by the universal page size, which is a power of two. -We intend to make the buffer buf_pool size on-line reconfigurable, -that is, the buf_pool size can be changed without closing the database. -Then the database administarator may adjust it to be bigger -at night, for example. The control block array must -contain enough control blocks for the maximum buffer buf_pool size -which is used in the particular database. -If the buf_pool size is cut, we exploit the virtual memory mechanism of -the OS, and just refrain from using frames at high addresses. Then the OS -can swap them to disk. - The control blocks containing file pages are put to a hash table according to the file address of the page. We could speed up the access to an individual page by using @@ -1224,7 +1184,8 @@ } /********************************************************************//** -Allocates a chunk of buffer frames. +Allocates a chunk of buffer frames. If called for an existing buf_pool, its +free_list_mutex must be locked. @return chunk, or NULL on failure */ static buf_chunk_t* @@ -1338,7 +1299,8 @@ /*********************************************************************//** Finds a block in the buffer pool that points to a -given compressed page. +given compressed page. Used only to confirm that buffer pool does not contain a +given pointer, thus protected by zip_free_mutex. @return buffer block pointing to the compressed page, or NULL */ buf_block_t* @@ -1351,7 +1313,7 @@ buf_chunk_t* chunk = buf_pool->chunks; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->zip_free_mutex)); for (n = buf_pool->n_chunks; n--; chunk++) { buf_block_t* block = buf_chunk_contains_zip(chunk, data); @@ -1424,8 +1386,6 @@ ulint i; ulint curr_size = 0; - buf_pool_mutex_enter_all(); - for (i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; @@ -1436,8 +1396,7 @@ srv_buf_pool_curr_size = curr_size; srv_buf_pool_old_size = srv_buf_pool_size; srv_buf_pool_base_size = srv_buf_pool_size; - - buf_pool_mutex_exit_all(); + os_wmb; } /********************************************************************//** @@ -1459,15 +1418,16 @@ /* 1. 
Initialize general fields ------------------------------- */ - mutex_create("buf_pool", &buf_pool->mutex); - + mutex_create("buf_pool_lru_list", &buf_pool->LRU_list_mutex); + mutex_create("buf_pool_free_list", &buf_pool->free_list_mutex); + mutex_create("buf_pool_zip_free", &buf_pool->zip_free_mutex); + mutex_create("buf_pool_zip_hash", &buf_pool->zip_hash_mutex); + mutex_create("buf_pool_flush_state", &buf_pool->flush_state_mutex); mutex_create("buf_pool_zip", &buf_pool->zip_mutex); new(&buf_pool->allocator) ut_allocator(mem_key_buf_buf_pool); - buf_pool_mutex_enter(buf_pool); - if (buf_pool_size > 0) { buf_pool->n_chunks = buf_pool_size / srv_buf_pool_chunk_unit; @@ -1514,7 +1474,6 @@ chunk->mem, &chunk->mem_pfx); } ut_free(buf_pool->chunks); - buf_pool_mutex_exit(buf_pool); return(DB_ERROR); } @@ -1573,15 +1532,15 @@ FlushHp(buf_pool, &buf_pool->flush_list_mutex); /* Initialize the hazard pointer for LRU batches */ - new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex); + new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->LRU_list_mutex); /* Initialize the iterator for LRU scan search */ - new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex); + new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, + &buf_pool->LRU_list_mutex); /* Initialize the iterator for single page scan search */ - new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex); - - buf_pool_mutex_exit(buf_pool); + new(&buf_pool->single_scan_itr) LRUItr(buf_pool, + &buf_pool->LRU_list_mutex); return(DB_SUCCESS); } @@ -1600,7 +1559,11 @@ buf_page_t* bpage; buf_page_t* prev_bpage = 0; - mutex_free(&buf_pool->mutex); + mutex_free(&buf_pool->LRU_list_mutex); + mutex_free(&buf_pool->free_list_mutex); + mutex_free(&buf_pool->zip_free_mutex); + mutex_free(&buf_pool->zip_hash_mutex); + mutex_free(&buf_pool->flush_state_mutex); mutex_free(&buf_pool->zip_mutex); mutex_free(&buf_pool->flush_list_mutex); @@ -1701,6 +1664,8 @@ btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64); + os_wmb; + return(DB_SUCCESS); } @@ -1727,6 +1692,7 @@ /** Reallocate a control block. @param[in] buf_pool buffer pool instance @param[in] block pointer to control block +@retval true if succeeded or if failed because the block was fixed @retval false if failed because of no free blocks. */ static bool @@ -1737,8 +1703,7 @@ buf_block_t* new_block; ut_ad(buf_pool_withdrawing); - ut_ad(buf_pool_mutex_own(buf_pool)); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); new_block = buf_LRU_get_free_only(buf_pool); @@ -1827,6 +1792,8 @@ buf_flush_relocate_on_flush_list( &block->page, &new_block->page); } + /* At this point no outside pointers to block should exist */ + mutex_exit(&block->mutex); /* set other flags of buf_block_t */ new_block->check_index_page_at_flush @@ -1850,15 +1817,12 @@ buf_block_set_state(block, BUF_BLOCK_MEMORY); buf_LRU_block_free_non_file_page(block); - mutex_exit(&block->mutex); } else { rw_lock_x_unlock(hash_lock); mutex_exit(&block->mutex); /* free new_block */ - mutex_enter(&new_block->mutex); buf_LRU_block_free_non_file_page(new_block); - mutex_exit(&new_block->mutex); } return(true); /* free_list was enough */ @@ -1887,7 +1851,8 @@ va_end(ap); } -/** Determines if a block is intended to be withdrawn. +/** Determines if a block is intended to be withdrawn. The caller must ensure +that there was a sufficient memory barrier to read curr_size and old_size. 
@param[in] buf_pool buffer pool instance @param[in] block pointer to control block @retval true if will be withdrawn */ @@ -1898,7 +1863,6 @@ const buf_block_t* block) { ut_ad(buf_pool->curr_size < buf_pool->old_size); - ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool)); const buf_chunk_t* chunk = buf_pool->chunks + buf_pool->n_chunks_new; @@ -1916,7 +1880,8 @@ return(false); } -/** Determines if a frame is intended to be withdrawn. +/** Determines if a frame is intended to be withdrawn. The caller must ensure +that there was a sufficient memory barrier to read curr_size and old_size. @param[in] buf_pool buffer pool instance @param[in] ptr pointer to a frame @retval true if will be withdrawn */ @@ -1927,7 +1892,6 @@ const byte* ptr) { ut_ad(buf_pool->curr_size < buf_pool->old_size); - ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool)); const buf_chunk_t* chunk = buf_pool->chunks + buf_pool->n_chunks_new; @@ -1958,23 +1922,26 @@ buf_block_t* block; ulint loop_count = 0; ulint i = buf_pool_index(buf_pool); + ulint lru_len; ib::info() << "buffer pool " << i << " : start to withdraw the last " << buf_pool->withdraw_target << " blocks."; /* Minimize buf_pool->zip_free[i] lists */ - buf_pool_mutex_enter(buf_pool); buf_buddy_condense_free(buf_pool); - buf_pool_mutex_exit(buf_pool); - + + mutex_enter(&buf_pool->LRU_list_mutex); + lru_len = UT_LIST_GET_LEN(buf_pool->LRU); + mutex_exit(&buf_pool->LRU_list_mutex); + + mutex_enter(&buf_pool->free_list_mutex); while (UT_LIST_GET_LEN(buf_pool->withdraw) < buf_pool->withdraw_target) { /* try to withdraw from free_list */ ulint count1 = 0; - buf_pool_mutex_enter(buf_pool); block = reinterpret_cast( UT_LIST_GET_FIRST(buf_pool->free)); while (block != NULL @@ -1999,12 +1966,12 @@ buf_pool->withdraw, &block->page); ut_d(block->in_withdraw_list = TRUE); + fprintf(stderr, "1 withdrawing block at %p\n", block); count1++; } block = next_block; } - buf_pool_mutex_exit(buf_pool); /* reserve free_list length */ if (UT_LIST_GET_LEN(buf_pool->withdraw) @@ -2013,15 +1980,12 @@ ulint n_flushed = 0; /* cap scan_depth with current LRU size. 
*/ - buf_pool_mutex_enter(buf_pool); - scan_depth = UT_LIST_GET_LEN(buf_pool->LRU); - buf_pool_mutex_exit(buf_pool); - scan_depth = ut_min( ut_max(buf_pool->withdraw_target - UT_LIST_GET_LEN(buf_pool->withdraw), static_cast(srv_LRU_scan_depth)), - scan_depth); + lru_len); + mutex_exit(&buf_pool->free_list_mutex); buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth, 0, &n_flushed); @@ -2034,12 +1998,15 @@ MONITOR_LRU_BATCH_FLUSH_PAGES, n_flushed); } + } else { + + mutex_exit(&buf_pool->free_list_mutex); } /* relocate blocks/buddies in withdrawn area */ ulint count2 = 0; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); buf_page_t* bpage; bpage = UT_LIST_GET_FIRST(buf_pool->LRU); while (bpage != NULL) { @@ -2058,18 +2025,14 @@ if (buf_page_can_relocate(bpage)) { mutex_exit(block_mutex); - buf_pool_mutex_exit_forbid(buf_pool); if(!buf_buddy_realloc( buf_pool, bpage->zip.data, page_zip_get_size( &bpage->zip))) { /* failed to allocate block */ - buf_pool_mutex_exit_allow( - buf_pool); break; } - buf_pool_mutex_exit_allow(buf_pool); mutex_enter(block_mutex); count2++; } @@ -2085,17 +2048,13 @@ if (buf_page_can_relocate(bpage)) { mutex_exit(block_mutex); - buf_pool_mutex_exit_forbid(buf_pool); if(!buf_page_realloc( buf_pool, reinterpret_cast( bpage))) { /* failed to allocate block */ - buf_pool_mutex_exit_allow( - buf_pool); break; } - buf_pool_mutex_exit_allow(buf_pool); count2++; } else { mutex_exit(block_mutex); @@ -2108,7 +2067,10 @@ bpage = next_bpage; } - buf_pool_mutex_exit(buf_pool); + + mutex_exit(&buf_pool->LRU_list_mutex); + + mutex_enter(&buf_pool->free_list_mutex); buf_resize_status( "buffer pool %lu : withdrawing blocks. (%lu/%lu)", @@ -2125,6 +2087,8 @@ /* give up for now. retried after user threads paused. */ + mutex_exit(&buf_pool->free_list_mutex); + ib::info() << "buffer pool " << i << " : will retry to withdraw later."; @@ -2132,6 +2096,7 @@ return(true); } } + mutex_exit(&buf_pool->free_list_mutex); /* confirm withdrawn enough */ const buf_chunk_t* chunk @@ -2145,6 +2110,7 @@ /* If !=BUF_BLOCK_NOT_USED block in the withdrawn area, it means corruption something */ + fprintf(stderr, "withdrawn block at %p\n", block); ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED); ut_ad(block->in_withdraw_list); @@ -2152,8 +2118,10 @@ ++chunk; } + mutex_enter(&buf_pool->free_list_mutex); ib::info() << "buffer pool " << i << " : withdrawn target " << UT_LIST_GET_LEN(buf_pool->withdraw) << " blocks."; + mutex_exit(&buf_pool->free_list_mutex); /* retry is not needed */ ++buf_withdraw_clock; @@ -2169,6 +2137,7 @@ { hash_table_t* new_hash_table; + ut_ad(mutex_own(&buf_pool->zip_hash_mutex)); ut_ad(buf_pool->page_hash_old == NULL); /* recreate page_hash */ @@ -2254,6 +2223,8 @@ ut_ad(!buf_pool_withdrawing); ut_ad(srv_buf_pool_chunk_unit > 0); + /* Assumes that buf_resize_thread has already issued the necessary + memory barrier to read srv_buf_pool_size and srv_buf_pool_old_size */ new_instance_size = srv_buf_pool_size / srv_buf_pool_instances; new_instance_size /= UNIV_PAGE_SIZE; @@ -2269,19 +2240,25 @@ /* set new limit for all buffer pool for resizing */ for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool = buf_pool_from_array(i); - buf_pool_mutex_enter(buf_pool); + // No locking needed to read, same thread updated those ut_ad(buf_pool->curr_size == buf_pool->old_size); ut_ad(buf_pool->n_chunks_new == buf_pool->n_chunks); + mutex_enter(&buf_pool->free_list_mutex); ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0); + mutex_exit(&buf_pool->free_list_mutex); 
+#ifdef UNIV_DEBUG + buf_flush_list_mutex_enter(buf_pool); ut_ad(buf_pool->flush_rbt == NULL); + buf_flush_list_mutex_exit(buf_pool); +#endif buf_pool->curr_size = new_instance_size; buf_pool->n_chunks_new = new_instance_size * UNIV_PAGE_SIZE / srv_buf_pool_chunk_unit; - buf_pool_mutex_exit(buf_pool); + os_wmb; } /* disable AHI if needed */ @@ -2431,16 +2408,19 @@ /* Indicate critical path */ buf_pool_resizing = true; - /* Acquire all buf_pool_mutex/hash_lock */ - for (ulint i = 0; i < srv_buf_pool_instances; ++i) { - buf_pool_t* buf_pool = buf_pool_from_array(i); - - buf_pool_mutex_enter(buf_pool); - } - for (ulint i = 0; i < srv_buf_pool_instances; ++i) { - buf_pool_t* buf_pool = buf_pool_from_array(i); - + /* Acquire all buffer pool mutexes and hash table locks */ + for (ulint i = 0; i < srv_buf_pool_instances; ++i) { + buf_pool_t* buf_pool = buf_pool_from_array(i); + + /* TODO: while we certainly lock a lot here, it does not + necessarily buy us enough correctness, see a comment at + buf_block_align. */ + mutex_enter(&buf_pool->LRU_list_mutex); hash_lock_x_all(buf_pool->page_hash); + mutex_enter(&buf_pool->zip_free_mutex); + mutex_enter(&buf_pool->free_list_mutex); + mutex_enter(&buf_pool->zip_hash_mutex); + mutex_enter(&buf_pool->flush_state_mutex); } buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t()); @@ -2617,8 +2597,12 @@ for (ulint i = 0; i < srv_buf_pool_instances; ++i) { buf_pool_t* buf_pool = buf_pool_from_array(i); + mutex_exit(&buf_pool->flush_state_mutex); + mutex_exit(&buf_pool->zip_hash_mutex); + mutex_exit(&buf_pool->free_list_mutex); + mutex_exit(&buf_pool->zip_free_mutex); hash_unlock_x_all(buf_pool->page_hash); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); ut_free(buf_pool->chunks_old); buf_pool->chunks_old = NULL; @@ -2663,6 +2647,7 @@ << srv_buf_pool_old_size << " to " << srv_buf_pool_size << "."; srv_buf_pool_old_size = srv_buf_pool_size; + os_wmb; } /* enable AHI if needed */ @@ -2713,13 +2698,11 @@ break; } - buf_pool_mutex_enter_all(); + os_rmb; if (srv_buf_pool_old_size == srv_buf_pool_size) { - buf_pool_mutex_exit_all(); /* nothing to do */ continue; } - buf_pool_mutex_exit_all(); buf_pool_resize(); } @@ -2793,7 +2776,7 @@ buf_page_t* b; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); @@ -2873,7 +2856,8 @@ { ut_ad(mutex_own(m_mutex)); ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool); - ut_ad(!bpage || buf_page_in_file(bpage)); + ut_ad(!bpage || buf_page_in_file(bpage) + || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); m_hp = bpage; } @@ -2951,7 +2935,7 @@ const buf_pool_t* buf_pool, const buf_page_t* bpage) { - /* We must also own the appropriate hash lock. */ + /* We must own the appropriate hash lock. */ ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage)); ut_ad(buf_page_in_file(bpage)); @@ -2972,8 +2956,9 @@ } /** Add watch for the given page to be read in. Caller must have -appropriate hash_lock for the bpage. This function may release the -hash_lock and reacquire it. +appropriate hash_lock for the bpage and hold the LRU list mutex to avoid a race +condition with buf_LRU_free_page inserting the same page into the page hash. +This function may release the hash_lock and reacquire it. 
@param[in] page_id page id @param[in,out] hash_lock hash_lock currently latched @return NULL if watch set, block if the page is in the buffer pool */ @@ -3007,32 +2992,26 @@ } /* From this point this function becomes fairly heavy in terms - of latching. We acquire the buf_pool mutex as well as all the - hash_locks. buf_pool mutex is needed because any changes to - the page_hash must be covered by it and hash_locks are needed + of latching. We acquire all the hash_locks. They are needed because we don't want to read any stale information in buf_pool->watch[]. However, it is not in the critical code path as this function will be called only by the purge thread. */ - /* To obey latching order first release the hash_lock. */ rw_lock_x_unlock(*hash_lock); - buf_pool_mutex_enter(buf_pool); hash_lock_x_all(buf_pool->page_hash); - /* If not own buf_pool_mutex, page_hash can be changed. */ - *hash_lock = buf_page_hash_lock_get(buf_pool, page_id); - /* We have to recheck that the page was not loaded or a watch set by some other purge thread. This is because of the small time window between when we release the - hash_lock to acquire buf_pool mutex above. */ + hash_lock to lock all the hash_locks. */ + + *hash_lock = buf_page_hash_lock_get(buf_pool, page_id); bpage = buf_page_hash_get_low(buf_pool, page_id); if (UNIV_LIKELY_NULL(bpage)) { - buf_pool_mutex_exit(buf_pool); hash_unlock_x_all_but(buf_pool->page_hash, *hash_lock); goto page_found; } @@ -3051,20 +3030,15 @@ ut_ad(!bpage->in_page_hash); ut_ad(bpage->buf_fix_count == 0); - /* bpage is pointing to buf_pool->watch[], - which is protected by buf_pool->mutex. - Normally, buf_page_t objects are protected by - buf_block_t::mutex or buf_pool->zip_mutex or both. */ - bpage->state = BUF_BLOCK_ZIP_PAGE; bpage->id.copy_from(page_id); bpage->buf_fix_count = 1; + bpage->buf_pool_index = buf_pool_index(buf_pool); ut_d(bpage->in_page_hash = TRUE); HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, page_id.fold(), bpage); - buf_pool_mutex_exit(buf_pool); /* Once the sentinel is in the page_hash we can safely release all locks except just the relevant hash_lock */ @@ -3075,7 +3049,7 @@ case BUF_BLOCK_ZIP_PAGE: ut_ad(bpage->in_page_hash); ut_ad(bpage->buf_fix_count > 0); - break; + continue; default: ut_error; } @@ -3092,7 +3066,7 @@ } /** Remove the sentinel block for the watch before replacing it with a -real block. buf_page_watch_clear() or buf_page_watch_occurred() will notice +real block. buf_page_watch_unset() or buf_page_watch_occurred() will notice that the block has been replaced with the real block. @param[in,out] buf_pool buffer pool instance @param[in,out] watch sentinel for watch @@ -3104,12 +3078,12 @@ buf_page_t* watch) { #ifdef UNIV_SYNC_DEBUG - /* We must also own the appropriate hash_bucket mutex. */ + /* We must own the appropriate hash_bucket mutex. */ rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, watch->id); ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(buf_page_get_state(watch) == BUF_BLOCK_ZIP_PAGE); HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, watch->id.fold(), watch); @@ -3128,13 +3102,6 @@ buf_page_t* bpage; buf_pool_t* buf_pool = buf_pool_get(page_id); - /* We only need to have buf_pool mutex in case where we end - up calling buf_pool_watch_remove but to obey latching order - we acquire it here before acquiring hash_lock. This should - not cause too much grief as this function is only ever - called from the purge thread. 
*/ - buf_pool_mutex_enter(buf_pool); - rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, page_id); rw_lock_x_lock(hash_lock); @@ -3147,7 +3114,6 @@ buf_pool_watch_remove(buf_pool, bpage); } - buf_pool_mutex_exit(buf_pool); rw_lock_x_unlock(hash_lock); } @@ -3192,19 +3158,59 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); ut_a(buf_page_in_file(bpage)); buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); +} + +/********************************************************************//** +Recommends a move of a block to the start of the LRU list if there is danger +of dropping from the buffer pool. NOTE: does not reserve the buffer pool +mutex. +@return TRUE if should be made younger */ +static +ibool +buf_page_peek_if_too_old( +/*=====================*/ + const buf_page_t* bpage) /*!< in: block to make younger */ +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + + if (buf_pool->freed_page_clock == 0) { + /* If eviction has not started yet, do not update the + statistics or move blocks in the LRU list. This is + either the warm-up phase or an in-memory workload. */ + return(FALSE); + } else if (buf_LRU_old_threshold_ms && bpage->old) { + unsigned access_time = buf_page_is_accessed(bpage); + + /* It is possible that the below comparison returns an + unexpected result. 2^32 milliseconds pass in about 50 days, + so if the difference between ut_time_ms() and access_time + is e.g. 50 days + 15 ms, then the below will behave as if + it is 15 ms. This is known and fixing it would require to + increase buf_page_t::access_time from 32 to 64 bits. */ + if (access_time > 0 + && ((ib_uint32_t) (ut_time_ms() - access_time)) + >= buf_LRU_old_threshold_ms) { + return(TRUE); + } + + buf_pool->stat.n_pages_not_made_young++; + return(FALSE); + } else { + return(!buf_page_peek_if_young(bpage)); + } } /********************************************************************//** Moves a page to the start of the buffer pool LRU list if it is too old. This high-level function can be used to prevent an important page from -slipping out of the buffer pool. */ +slipping out of the buffer pool. The page must be fixed to the buffer pool. */ static void buf_page_make_young_if_needed( @@ -3212,10 +3218,8 @@ buf_page_t* bpage) /*!< in/out: buffer block of a file page */ { -#ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(!buf_pool_mutex_own(buf_pool)); -#endif /* UNIV_DEBUG */ + ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex)); + ut_ad(bpage->buf_fix_count > 0); ut_a(buf_page_in_file(bpage)); if (buf_page_peek_if_too_old(bpage)) { @@ -3295,21 +3299,30 @@ buf_page_t* bpage; buf_pool_t* buf_pool = buf_pool_get(page_id); - /* Since we need to acquire buf_pool mutex to discard - the uncompressed frame and because page_hash mutex resides - below buf_pool mutex in sync ordering therefore we must - first release the page_hash mutex. This means that the - block in question can move out of page_hash. Therefore - we need to check again if the block is still in page_hash. */ - buf_pool_mutex_enter(buf_pool); + /* Since we need to acquire buf_pool->LRU_list_mutex to discard + the uncompressed frame and because page_hash mutex resides below + buf_pool->LRU_list_mutex in sync ordering therefore we must first + release the page_hash mutex. This means that the block in question + can move out of page_hash. 
Therefore we need to check again if the + block is still in page_hash. */ + mutex_enter(&buf_pool->LRU_list_mutex); bpage = buf_page_hash_get(buf_pool, page_id); if (bpage) { - buf_LRU_free_page(bpage, false); + + BPageMutex* block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + if (buf_LRU_free_page(bpage, false)) { + + return; + } + mutex_exit(block_mutex); } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } /** Get read access to a compressed page (usually of type @@ -3550,7 +3563,7 @@ ut_ad(srv_buf_pool_chunk_unit > 0); /* TODO: This might be still optimistic treatment. - buf_pool_resize() needs all buf_pool_mutex and all + buf_pool_resize() needs most of buffer pool mutexes and all buf_pool->page_hash x-latched until actual modification. It should block the other user threads and should take while which is enough to done the buf_pool_chunk_map access. */ @@ -3591,11 +3604,6 @@ block->frame + n * UNIV_PAGE_SIZE. Check it. */ ut_ad(block->frame == page_align(ptr)); #ifdef UNIV_DEBUG - /* A thread that updates these fields must - hold buf_pool->mutex and block->mutex. Acquire - only the latter. */ - buf_page_mutex_enter(block); - switch (buf_block_get_state(block)) { case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: @@ -3610,37 +3618,14 @@ case BUF_BLOCK_NOT_USED: case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + case BUF_BLOCK_FILE_PAGE: /* Some data structures contain "guess" pointers to file pages. The file pages may have been freed and reused. Do not complain. */ break; - case BUF_BLOCK_REMOVE_HASH: - /* buf_LRU_block_remove_hashed_page() - will overwrite the FIL_PAGE_OFFSET and - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with - 0xff and set the state to - BUF_BLOCK_REMOVE_HASH. */ -# ifndef UNIV_DEBUG_VALGRIND - /* In buf_LRU_block_remove_hashed() we - explicitly set those values to 0xff and - declare them uninitialized with - UNIV_MEM_INVALID() after that. */ - ut_ad(page_get_space_id(page_align(ptr)) - == 0xffffffff); - ut_ad(page_get_page_no(page_align(ptr)) - == 0xffffffff); -# endif /* UNIV_DEBUG_VALGRIND */ - break; - case BUF_BLOCK_FILE_PAGE: - ut_ad(block->page.id.space() - == page_get_space_id(page_align(ptr))); - ut_ad(block->page.id.page_no() - == page_get_page_no(page_align(ptr))); - break; } - - buf_page_mutex_exit(block); #endif /* UNIV_DEBUG */ return(block); @@ -3765,22 +3750,12 @@ access the block (and check for IO state) after the block has been added to the page hashtable. */ - if (buf_block_get_io_fix(block) == BUF_IO_READ) { + if (buf_block_get_io_fix_unlocked(block) == BUF_IO_READ) { /* Wait until the read operation completes */ - - BPageMutex* mutex = buf_page_get_mutex(&block->page); - for (;;) { - buf_io_fix io_fix; - - mutex_enter(mutex); - - io_fix = buf_block_get_io_fix(block); - - mutex_exit(mutex); - - if (io_fix == BUF_IO_READ) { + if (buf_block_get_io_fix_unlocked(block) + == BUF_IO_READ) { /* Wait by temporaly s-latch */ rw_lock_s_lock(&block->lock); rw_lock_s_unlock(&block->lock); @@ -3862,7 +3837,7 @@ rw_lock_s_lock(hash_lock); - /* If not own buf_pool_mutex, page_hash can be changed. */ + /* page_hash can be changed. */ hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id); if (block != NULL) { @@ -3896,10 +3871,10 @@ /* Page not in buf_pool: needs to be read from file */ if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + rw_lock_x_lock(hash_lock); - /* If not own buf_pool_mutex, - page_hash can be changed. */ + /* page_hash can be changed. 
*/ hash_lock = buf_page_hash_lock_x_confirm( hash_lock, buf_pool, page_id); @@ -3998,15 +3973,16 @@ rw_lock_s_unlock(hash_lock); got_block: +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + BPageMutex* fix_mutex = buf_page_get_mutex(&fix_block->page); +#endif if (mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL) { buf_page_t* fix_page = &fix_block->page; - BPageMutex* fix_mutex = buf_page_get_mutex(fix_page); - mutex_enter(fix_mutex); + os_rmb; const bool must_read - = (buf_page_get_io_fix(fix_page) == BUF_IO_READ); - mutex_exit(fix_mutex); + = (buf_page_get_io_fix_unlocked(fix_page) == BUF_IO_READ); if (must_read) { /* The page is being read to buffer pool, @@ -4022,9 +3998,10 @@ buf_page_t* bpage; case BUF_BLOCK_FILE_PAGE: + ut_ad(fix_mutex != &buf_pool->zip_mutex); bpage = &block->page; if (fsp_is_system_temporary(page_id.space()) - && buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + && buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) { /* This suggest that page is being flushed. Avoid returning reference to this page. Instead wait for flush action to complete. @@ -4049,10 +4026,11 @@ } bpage = &block->page; + ut_ad(fix_mutex == &buf_pool->zip_mutex); /* Note: We have already buffer fixed this block. */ if (bpage->buf_fix_count > 1 - || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) { /* This condition often occurs when the buffer is not buffer-fixed, but I/O-fixed by @@ -4072,9 +4050,8 @@ block = buf_LRU_get_free_block(buf_pool); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); - /* If not own buf_pool_mutex, page_hash can be changed. */ hash_lock = buf_page_hash_lock_get(buf_pool, page_id); rw_lock_x_lock(hash_lock); @@ -4099,10 +4076,10 @@ This should be extremely unlikely, for example, if buf_page_get_zip() was invoked. */ - buf_LRU_block_free_non_file_page(block); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); rw_lock_x_unlock(hash_lock); buf_page_mutex_exit(block); + buf_LRU_block_free_non_file_page(block); /* Try again */ goto loop; @@ -4145,24 +4122,26 @@ /* Insert at the front of unzip_LRU list */ buf_unzip_LRU_add_block(block, FALSE); + mutex_exit(&buf_pool->LRU_list_mutex); + buf_block_set_io_fix(block, BUF_IO_READ); rw_lock_x_lock_inline(&block->lock, 0, file, line); UNIV_MEM_INVALID(bpage, sizeof *bpage); rw_lock_x_unlock(hash_lock); - buf_pool->n_pend_unzip++; mutex_exit(&buf_pool->zip_mutex); - buf_pool_mutex_exit(buf_pool); access_time = buf_page_is_accessed(&block->page); buf_page_mutex_exit(block); + os_atomic_increment(&buf_pool->n_pend_unzip, 1); + buf_page_free_descriptor(bpage); /* Decompress the page while not holding - buf_pool->mutex or block->mutex. */ + any buf_pool or block->mutex. */ /* Page checksum verification is already done when the page is read from disk. Hence page checksum @@ -4183,17 +4162,13 @@ } } - buf_pool_mutex_enter(buf_pool); - buf_page_mutex_enter(fix_block); buf_block_set_io_fix(fix_block, BUF_IO_NONE); buf_page_mutex_exit(fix_block); - --buf_pool->n_pend_unzip; - - buf_pool_mutex_exit(buf_pool); + os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1); rw_lock_x_unlock(&block->lock); @@ -4226,27 +4201,26 @@ /* Try to evict the block from the buffer pool, to use the insert buffer (change buffer) as much as possible. 
*/ - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); buf_block_unfix(fix_block); - /* Now we are only holding the buf_pool->mutex, + /* Now we are only holding the buf_pool->LRU_list_mutex, not block->mutex or hash_lock. Blocks cannot be relocated or enter or exit the buf_pool while we - are holding the buf_pool->mutex. */ + are holding the buf_pool->LRU_list_mutex. */ + + fix_mutex = buf_page_get_mutex(&fix_block->page); + mutex_enter(fix_mutex); if (buf_LRU_free_page(&fix_block->page, true)) { - buf_pool_mutex_exit(buf_pool); + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) + mutex_enter(&buf_pool->LRU_list_mutex); - /* If not own buf_pool_mutex, - page_hash can be changed. */ + /* page_hash can be changed. */ hash_lock = buf_page_hash_lock_get(buf_pool, page_id); - rw_lock_x_lock(hash_lock); - - /* If not own buf_pool_mutex, - page_hash can be changed. */ hash_lock = buf_page_hash_lock_x_confirm( hash_lock, buf_pool, page_id); @@ -4256,6 +4230,7 @@ buffer pool in the first place. */ block = (buf_block_t*) buf_pool_watch_set( page_id, &hash_lock); + mutex_exit(&buf_pool->LRU_list_mutex); } else { block = (buf_block_t*) buf_page_hash_get_low( buf_pool, page_id); @@ -4280,8 +4255,6 @@ return(NULL); } - buf_page_mutex_enter(fix_block); - if (buf_flush_page_try(buf_pool, fix_block)) { ib::info() << "innodb_change_buffering_debug flush " @@ -4292,13 +4265,13 @@ goto loop; } + mutex_exit(&buf_pool->LRU_list_mutex); + buf_page_mutex_exit(fix_block); buf_block_fix(fix_block); /* Failed to evict the page; change it directly */ - - buf_pool_mutex_exit(buf_pool); } #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ @@ -4471,9 +4444,7 @@ } if (!success) { - buf_page_mutex_enter(block); buf_block_buf_fix_dec(block); - buf_page_mutex_exit(block); return(FALSE); } @@ -4488,9 +4459,7 @@ rw_lock_x_unlock(&block->lock); } - buf_page_mutex_enter(block); buf_block_buf_fix_dec(block); - buf_page_mutex_exit(block); return(FALSE); } @@ -4596,9 +4565,7 @@ } if (!success) { - buf_page_mutex_enter(block); buf_block_buf_fix_dec(block); - buf_page_mutex_exit(block); return(FALSE); } @@ -4667,17 +4634,17 @@ ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page)); - buf_page_mutex_enter(block); + buf_block_buf_fix_inc(block, file, line); + rw_lock_s_unlock(hash_lock); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + buf_page_mutex_enter(block); ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_a(page_id.equals_to(block->page.id)); + buf_page_mutex_exit(block); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_block_buf_fix_inc(block, file, line); - buf_page_mutex_exit(block); - mtr_memo_type_t fix_type = MTR_MEMO_PAGE_S_FIX; success = rw_lock_s_lock_nowait(&block->lock, file, line); @@ -4692,9 +4659,7 @@ } if (!success) { - buf_page_mutex_enter(block); buf_block_buf_fix_dec(block); - buf_page_mutex_exit(block); return(NULL); } @@ -4742,7 +4707,8 @@ #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ } -/** Inits a page to the buffer buf_pool. +/** Inits a page to the buffer buf_pool. The block pointer must be private to +the calling thread at the start of this function. 
@param[in,out] buf_pool buffer pool @param[in] page_id page id @param[in,out] block block to init */ @@ -4757,9 +4723,8 @@ buf_page_t* hash_page; ut_ad(buf_pool == buf_pool_get(page_id)); - ut_ad(buf_pool_mutex_own(buf_pool)); - ut_ad(buf_page_mutex_own(block)); + ut_ad(!mutex_own(buf_page_get_mutex(&block->page))); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); #ifdef UNIV_SYNC_DEBUG @@ -4809,8 +4774,6 @@ << hash_page << ", " << block; #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_page_mutex_exit(block); - buf_pool_mutex_exit(buf_pool); buf_print(); buf_LRU_print(); buf_validate(); @@ -4863,8 +4826,7 @@ buf_page_t* watch_page; rw_lock_t* hash_lock; mtr_t mtr; - ibool lru = FALSE; - void* data; + void* data = NULL; buf_pool_t* buf_pool = buf_pool_get(page_id); ut_ad(buf_pool); @@ -4897,7 +4859,13 @@ ut_ad(buf_pool_from_block(block) == buf_pool); } - buf_pool_mutex_enter(buf_pool); + if (!block) + bpage = buf_page_alloc_descriptor(); + + if ((block && page_size.is_compressed()) || !block) + data = buf_buddy_alloc(buf_pool, page_size.physical()); + + mutex_enter(&buf_pool->LRU_list_mutex); hash_lock = buf_page_hash_lock_get(buf_pool, page_id); rw_lock_x_lock(hash_lock); @@ -4907,12 +4875,17 @@ /* The page is already in the buffer pool. */ watch_page = NULL; err_exit: + mutex_exit(&buf_pool->LRU_list_mutex); rw_lock_x_unlock(hash_lock); - if (block) { - buf_page_mutex_enter(block); + + if (bpage) + buf_page_free_descriptor(bpage); + + if (data) + buf_buddy_free(buf_pool, data, page_size.physical()); + + if (block) buf_LRU_block_free_non_file_page(block); - buf_page_mutex_exit(block); - } bpage = NULL; goto func_exit; @@ -4928,25 +4901,39 @@ } if (block) { + + ut_ad(!bpage); bpage = &block->page; - buf_page_mutex_enter(block); - ut_ad(buf_pool_from_bpage(bpage) == buf_pool); buf_page_init(buf_pool, page_id, page_size, block); + buf_page_mutex_enter(block); + /* Note: We are using the hash_lock for protection. This is safe because no other thread can lookup the block from the page hashtable yet. */ buf_page_set_io_fix(bpage, BUF_IO_READ); - rw_lock_x_unlock(hash_lock); - /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); + if (page_size.is_compressed()) { + block->page.zip.data = (page_zip_t*) data; + + /* To maintain the invariant + block->in_unzip_LRU_list + == buf_page_belongs_to_unzip_LRU(&block->page) + we have to add this block to unzip_LRU + after block->page.zip.data is set. */ + ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); + buf_unzip_LRU_add_block(block, TRUE); + } + + mutex_exit(&buf_pool->LRU_list_mutex); + /* We set a pass-type x-lock on the frame because then the same thread which called for the read operation (and is running now at this point of code) can wait @@ -4958,71 +4945,17 @@ rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); - if (page_size.is_compressed()) { - /* buf_pool->mutex may be released and - reacquired by buf_buddy_alloc(). Thus, we - must release block->mutex in order not to - break the latching order in the reacquisition - of buf_pool->mutex. We also must defer this - operation until after the block descriptor has - been added to buf_pool->LRU and - buf_pool->page_hash. 
*/ - buf_page_mutex_exit(block); - data = buf_buddy_alloc(buf_pool, page_size.physical(), - &lru); - buf_page_mutex_enter(block); - block->page.zip.data = (page_zip_t*) data; - - /* To maintain the invariant - block->in_unzip_LRU_list - == buf_page_belongs_to_unzip_LRU(&block->page) - we have to add this block to unzip_LRU - after block->page.zip.data is set. */ - ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); - buf_unzip_LRU_add_block(block, TRUE); - } + rw_lock_x_unlock(hash_lock); buf_page_mutex_exit(block); } else { - rw_lock_x_unlock(hash_lock); - - /* The compressed page must be allocated before the - control block (bpage), in order to avoid the - invocation of buf_buddy_relocate_block() on - uninitialized data. */ - data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru); - - rw_lock_x_lock(hash_lock); - - /* If buf_buddy_alloc() allocated storage from the LRU list, - it released and reacquired buf_pool->mutex. Thus, we must - check the page_hash again, as it may have been modified. */ - if (UNIV_UNLIKELY(lru)) { - - watch_page = buf_page_hash_get_low(buf_pool, page_id); - - if (UNIV_UNLIKELY(watch_page - && !buf_pool_watch_is_sentinel(buf_pool, - watch_page))) { - - /* The block was added by some other thread. */ - rw_lock_x_unlock(hash_lock); - watch_page = NULL; - buf_buddy_free(buf_pool, data, - page_size.physical()); - - bpage = NULL; - goto func_exit; - } - } - - bpage = buf_page_alloc_descriptor(); /* Initialize the buf_pool pointer. */ bpage->buf_pool_index = buf_pool_index(buf_pool); page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, page_size.physical()); + ut_ad(data); bpage->zip.data = (page_zip_t*) data; bpage->size.copy_from(page_size); @@ -5072,15 +5005,15 @@ #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(bpage); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + mutex_exit(&buf_pool->LRU_list_mutex); buf_page_set_io_fix(bpage, BUF_IO_READ); mutex_exit(&buf_pool->zip_mutex); } - buf_pool->n_pend_reads++; + os_atomic_increment_ulint(&buf_pool->n_pend_reads, 1); func_exit: - buf_pool_mutex_exit(buf_pool); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -5122,7 +5055,7 @@ free_block = buf_LRU_get_free_block(buf_pool); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); hash_lock = buf_page_hash_lock_get(buf_pool, page_id); rw_lock_x_lock(hash_lock); @@ -5140,7 +5073,7 @@ #endif /* UNIV_DEBUG_FILE_ACCESSES || UNIV_DEBUG */ /* Page can be found in buf_pool */ - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); rw_lock_x_unlock(hash_lock); buf_block_free(free_block); @@ -5155,37 +5088,30 @@ block = free_block; - buf_page_mutex_enter(block); - buf_page_init(buf_pool, page_id, page_size, block); + buf_page_mutex_enter(block); + rw_lock_x_unlock(hash_lock); /* The block must be put to the LRU list */ buf_LRU_add_block(&block->page, FALSE); + mutex_exit(&buf_pool->LRU_list_mutex); buf_block_buf_fix_inc(block, __FILE__, __LINE__); - buf_pool->stat.n_pages_created++; + os_atomic_increment(&buf_pool->stat.n_pages_created, 1); if (page_size.is_compressed()) { void* data; - ibool lru; - /* Prevent race conditions during buf_buddy_alloc(), - which may release and reacquire buf_pool->mutex, + /* Prevent race conditions during buf_buddy_alloc() by IO-fixing and X-latching the block. */ - buf_page_set_io_fix(&block->page, BUF_IO_READ); rw_lock_x_lock(&block->lock); buf_page_mutex_exit(block); - /* buf_pool->mutex may be released and reacquired by - buf_buddy_alloc(). 
Thus, we must release block->mutex - in order not to break the latching order in - the reacquisition of buf_pool->mutex. We also must - defer this operation until after the block descriptor - has been added to buf_pool->LRU and buf_pool->page_hash. */ - data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru); + data = buf_buddy_alloc(buf_pool, page_size.physical()); + mutex_enter(&buf_pool->LRU_list_mutex); buf_page_mutex_enter(block); block->page.zip.data = (page_zip_t*) data; @@ -5196,13 +5122,12 @@ block->page.zip.data is set. */ ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); buf_unzip_LRU_add_block(block, FALSE); + mutex_exit(&buf_pool->LRU_list_mutex); buf_page_set_io_fix(&block->page, BUF_IO_NONE); rw_lock_x_unlock(&block->lock); } - buf_pool_mutex_exit(buf_pool); - mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); buf_page_set_accessed(&block->page); @@ -5246,6 +5171,8 @@ const byte* frame; monitor_id_t counter; + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + /* If the counter module is not turned on, just return */ if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) { return; @@ -5354,9 +5281,11 @@ == BUF_BLOCK_FILE_PAGE); ib_uint32_t space = bpage->id.space(); ibool ret = TRUE; + rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id); /* First unfix and release lock on the bpage */ - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(hash_lock); mutex_enter(buf_page_get_mutex(bpage)); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ); ut_ad(bpage->buf_fix_count == 0); @@ -5370,19 +5299,18 @@ BUF_IO_READ); } - mutex_exit(buf_page_get_mutex(bpage)); - /* Find the table with specified space id, and mark it corrupted */ if (dict_set_corrupted_by_space(space)) { buf_LRU_free_one_page(bpage); } else { + rw_lock_x_unlock(hash_lock); + mutex_exit(buf_page_get_mutex(bpage)); ret = FALSE; } + mutex_exit(&buf_pool->LRU_list_mutex); ut_ad(buf_pool->n_pend_reads > 0); - buf_pool->n_pend_reads--; - - buf_pool_mutex_exit(buf_pool); + os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1); return(ret); } @@ -5404,6 +5332,7 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); const ibool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + bool have_LRU_mutex = false; ut_a(buf_page_in_file(bpage)); @@ -5413,7 +5342,7 @@ ensures that this is the only thread that handles the i/o for this block. 
*/ - io_type = buf_page_get_io_fix(bpage); + io_type = buf_page_get_io_fix_unlocked(bpage); ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); if (io_type == BUF_IO_READ) { @@ -5423,15 +5352,16 @@ if (bpage->size.is_compressed()) { frame = bpage->zip.data; - buf_pool->n_pend_unzip++; + os_atomic_increment_ulint(&buf_pool->n_pend_unzip, 1); if (uncompressed && !buf_zip_decompress((buf_block_t*) bpage, FALSE)) { - buf_pool->n_pend_unzip--; + os_atomic_decrement_ulint( + &buf_pool->n_pend_unzip, 1); goto corrupt; } - buf_pool->n_pend_unzip--; + os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1); } else { ut_a(uncompressed); frame = ((buf_block_t*) bpage)->frame; @@ -5544,8 +5474,39 @@ } } - buf_pool_mutex_enter(buf_pool); - mutex_enter(buf_page_get_mutex(bpage)); + if (io_type == BUF_IO_WRITE + && ( +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + /* to keep consistency at buf_LRU_insert_zip_clean() */ + buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY || +#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU || + buf_page_get_flush_type(bpage) == BUF_FLUSH_SINGLE_PAGE)) { + + have_LRU_mutex = true; /* optimistic */ + } +retry_mutex: + if (have_LRU_mutex) + mutex_enter(&buf_pool->LRU_list_mutex); + + BPageMutex* page_mutex = buf_page_get_mutex(bpage); + mutex_enter(page_mutex); + + if (UNIV_UNLIKELY(io_type == BUF_IO_WRITE + && ( +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY + || +#endif + buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU + || buf_page_get_flush_type(bpage) + == BUF_FLUSH_SINGLE_PAGE) + && !have_LRU_mutex)) { + + mutex_exit(page_mutex); + have_LRU_mutex = true; + goto retry_mutex; + } #ifdef UNIV_IBUF_COUNT_DEBUG if (io_type == BUF_IO_WRITE || uncompressed) { @@ -5560,19 +5521,19 @@ removes the newest lock debug record, without checking the thread id. */ - buf_page_set_io_fix(bpage, BUF_IO_NONE); buf_page_monitor(bpage, io_type); switch (io_type) { case BUF_IO_READ: + + ut_ad(!have_LRU_mutex); + + buf_page_set_io_fix(bpage, BUF_IO_NONE); + /* NOTE that the call to ibuf may have moved the ownership of the x-latch to this OS thread: do not let this confuse you in debugging! */ - ut_ad(buf_pool->n_pend_reads > 0); - buf_pool->n_pend_reads--; - buf_pool->stat.n_pages_read++; - if (uncompressed) { rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ); @@ -5580,6 +5541,10 @@ mutex_exit(buf_page_get_mutex(bpage)); + ut_ad(buf_pool->n_pend_reads > 0); + os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1); + os_atomic_increment_ulint(&buf_pool->stat.n_pages_read, 1); + break; case BUF_IO_WRITE: @@ -5593,7 +5558,7 @@ BUF_IO_WRITE); } - buf_pool->stat.n_pages_written++; + os_atomic_increment_ulint(&buf_pool->stat.n_pages_written, 1); /* We decide whether or not to evict the page from the LRU list based on the flush_type. @@ -5603,14 +5568,17 @@ by the caller explicitly. */ if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) { evict = true; + ut_ad(have_LRU_mutex); } - if (evict) { - mutex_exit(buf_page_get_mutex(bpage)); - buf_LRU_free_page(bpage, true); + if (evict && buf_LRU_free_page(bpage, true)) { + have_LRU_mutex = false; } else { mutex_exit(buf_page_get_mutex(bpage)); } + if (have_LRU_mutex) { + mutex_exit(&buf_pool->LRU_list_mutex); + } break; @@ -5623,8 +5591,6 @@ io_type == BUF_IO_READ ? 
"read" : "wrote", bpage->id.space(), bpage->id.page_no())); - buf_pool_mutex_exit(buf_pool); - return(true); } @@ -5642,25 +5608,37 @@ ut_ad(buf_pool); - buf_pool_mutex_enter(buf_pool); - chunk = buf_pool->chunks; for (i = buf_pool->n_chunks; i--; chunk++) { + mutex_enter(&buf_pool->LRU_list_mutex); + const buf_block_t* block = buf_chunk_not_freed(chunk); + mutex_exit(&buf_pool->LRU_list_mutex); + if (UNIV_LIKELY_NULL(block)) { ib::fatal() << "Page " << block->page.id << " still fixed or dirty"; } } - buf_pool_mutex_exit(buf_pool); - return(TRUE); } +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +static +void +buf_refresh_io_stats( +/*=================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool instance */ +{ + buf_pool->last_printout_time = ut_time(); + buf_pool->old_stat = buf_pool->stat; +} + /*********************************************************************//** Invalidates file pages in one buffer pool instance */ static @@ -5671,7 +5649,9 @@ { ulint i; - buf_pool_mutex_enter(buf_pool); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + + mutex_enter(&buf_pool->flush_state_mutex); for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) { @@ -5689,21 +5669,21 @@ if (buf_pool->n_flush[i] > 0) { buf_flush_t type = static_cast(i); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); buf_flush_wait_batch_end(buf_pool, type); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->flush_state_mutex); } } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); ut_ad(buf_all_freed_instance(buf_pool)); - buf_pool_mutex_enter(buf_pool); - while (buf_LRU_scan_and_free_block(buf_pool, true)) { } + mutex_enter(&buf_pool->LRU_list_mutex); + ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0); @@ -5711,10 +5691,10 @@ buf_pool->LRU_old = NULL; buf_pool->LRU_old_len = 0; + mutex_exit(&buf_pool->LRU_list_mutex); + memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat)); buf_refresh_io_stats(buf_pool); - - buf_pool_mutex_exit(buf_pool); } /*********************************************************************//** @@ -5756,8 +5736,11 @@ ut_ad(buf_pool); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); hash_lock_x_all(buf_pool->page_hash); + mutex_enter(&buf_pool->zip_mutex); + mutex_enter(&buf_pool->free_list_mutex); + mutex_enter(&buf_pool->flush_state_mutex); chunk = buf_pool->chunks; @@ -5770,8 +5753,6 @@ for (j = chunk->size; j--; block++) { - buf_page_mutex_enter(block); - switch (buf_block_get_state(block)) { case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: @@ -5791,7 +5772,7 @@ == BUF_IO_READ || !ibuf_count_get(block->page.id)); #endif - switch (buf_page_get_io_fix(&block->page)) { + switch (buf_page_get_io_fix_unlocked(&block->page)) { case BUF_IO_NONE: break; @@ -5799,20 +5780,8 @@ switch (buf_page_get_flush_type( &block->page)) { case BUF_FLUSH_LRU: - n_lru_flush++; - goto assert_s_latched; case BUF_FLUSH_SINGLE_PAGE: - n_page_flush++; -assert_s_latched: - ut_a(rw_lock_is_locked( - &block->lock, - RW_LOCK_S) - || rw_lock_is_locked( - &block->lock, - RW_LOCK_SX)); - break; case BUF_FLUSH_LIST: - n_list_flush++; break; default: ut_error; @@ -5843,13 +5812,9 @@ /* do nothing */ break; } - - buf_page_mutex_exit(block); } } - mutex_enter(&buf_pool->zip_mutex); - /* Check clean compressed-only blocks. 
*/ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; @@ -5893,7 +5858,9 @@ case BUF_BLOCK_ZIP_DIRTY: n_lru++; n_zip++; - switch (buf_page_get_io_fix(b)) { + /* fallthrough */ + case BUF_BLOCK_FILE_PAGE: + switch (buf_page_get_io_fix_unlocked(b)) { case BUF_IO_NONE: case BUF_IO_READ: case BUF_IO_PIN: @@ -5915,9 +5882,6 @@ break; } break; - case BUF_BLOCK_FILE_PAGE: - /* uncompressed page */ - break; case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_NOT_USED: @@ -5946,19 +5910,24 @@ } ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + + mutex_exit(&buf_pool->LRU_list_mutex); + if (buf_pool->curr_size == buf_pool->old_size - && UT_LIST_GET_LEN(buf_pool->free) != n_free) { + && UT_LIST_GET_LEN(buf_pool->free) > n_free) { ib::fatal() << "Free list len " << UT_LIST_GET_LEN(buf_pool->free) << ", free blocks " << n_free << ". Aborting..."; } + mutex_exit(&buf_pool->free_list_mutex); + ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); ut_a(buf_LRU_validate()); ut_a(buf_flush_validate(buf_pool)); @@ -6016,12 +5985,16 @@ counts = static_cast(ut_malloc_nokey(sizeof(ulint) * size)); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); + mutex_enter(&buf_pool->free_list_mutex); + mutex_enter(&buf_pool->flush_state_mutex); buf_flush_list_mutex_enter(buf_pool); ib::info() << *buf_pool; buf_flush_list_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); + mutex_exit(&buf_pool->free_list_mutex); /* Count the number of blocks belonging to each index in the buffer */ @@ -6062,7 +6035,7 @@ } } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); for (i = 0; i < n_found; i++) { index = dict_index_get_if_in_cache(index_ids[i]); @@ -6109,7 +6082,7 @@ /*********************************************************************//** Returns the number of latched pages in the buffer pool. @return number of latched pages */ - +static ulint buf_get_latched_pages_number_instance( /*==================================*/ @@ -6120,7 +6093,7 @@ buf_chunk_t* chunk; ulint fixed_pages_number = 0; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); chunk = buf_pool->chunks; @@ -6137,18 +6110,16 @@ continue; } - buf_page_mutex_enter(block); - if (block->page.buf_fix_count != 0 - || buf_page_get_io_fix(&block->page) + || buf_page_get_io_fix_unlocked(&block->page) != BUF_IO_NONE) { fixed_pages_number++; } - - buf_page_mutex_exit(block); } } + mutex_exit(&buf_pool->LRU_list_mutex); + mutex_enter(&buf_pool->zip_mutex); /* Traverse the lists of clean and dirty compressed-only blocks. */ @@ -6179,12 +6150,16 @@ case BUF_BLOCK_FILE_PAGE: /* uncompressed page */ break; + case BUF_BLOCK_REMOVE_HASH: + /* We hold flush list but not LRU list mutex here. + Thus encountering BUF_BLOCK_REMOVE_HASH pages is + possible. 
*/ + break; case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_NOT_USED: case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: ut_error; break; } @@ -6192,7 +6167,6 @@ buf_flush_list_mutex_exit(buf_pool); mutex_exit(&buf_pool->zip_mutex); - buf_pool_mutex_exit(buf_pool); return(fixed_pages_number); } @@ -6232,6 +6206,7 @@ { ulint pend_ios = 0; + os_rmb; for (ulint i = 0; i < srv_buf_pool_instances; i++) { pend_ios += buf_pool_from_array(i)->n_pend_reads; } @@ -6339,9 +6314,6 @@ /* Find appropriate pool_info to store stats for this buffer pool */ pool_info = &all_pool_info[pool_id]; - buf_pool_mutex_enter(buf_pool); - buf_flush_list_mutex_enter(buf_pool); - pool_info->pool_unique_id = pool_id; pool_info->pool_size = buf_pool->curr_size; @@ -6370,8 +6342,6 @@ (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]); - buf_flush_list_mutex_exit(buf_pool); - current_time = time(NULL); time_elapsed = 0.001 + difftime(current_time, buf_pool->last_printout_time); @@ -6453,12 +6423,11 @@ pool_info->unzip_cur = buf_LRU_stat_cur.unzip; buf_refresh_io_stats(buf_pool); - buf_pool_mutex_exit(buf_pool); } /*********************************************************************//** Prints info of the buffer i/o. */ - +static void buf_print_io_instance( /*==================*/ @@ -6563,6 +6532,8 @@ ut_zalloc_nokey(sizeof *pool_info)); } + os_rmb; + for (i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; @@ -6603,18 +6574,6 @@ Refreshes the statistics used to print per-second averages. */ void -buf_refresh_io_stats( -/*=================*/ - buf_pool_t* buf_pool) /*!< in: buffer pool instance */ -{ - buf_pool->last_printout_time = ut_time(); - buf_pool->old_stat = buf_pool->stat; -} - -/**********************************************************************//** -Refreshes the statistics used to print per-second averages. 
*/ - -void buf_refresh_io_stats_all(void) /*==========================*/ { @@ -6660,22 +6619,22 @@ ulint i; ulint pending_io = 0; - buf_pool_mutex_enter_all(); - for (i = 0; i < srv_buf_pool_instances; i++) { - const buf_pool_t* buf_pool; + buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); - pending_io += buf_pool->n_pend_reads + pending_io += buf_pool->n_pend_reads; + + mutex_enter(&buf_pool->flush_state_mutex); + pending_io += + buf_pool->n_flush[BUF_FLUSH_LRU] + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] + buf_pool->n_flush[BUF_FLUSH_LIST]; + mutex_exit(&buf_pool->flush_state_mutex); } - buf_pool_mutex_exit_all(); - return(pending_io); } @@ -6691,11 +6650,11 @@ { ulint len; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->free_list_mutex); len = UT_LIST_GET_LEN(buf_pool->free); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->free_list_mutex); return(len); } @@ -6754,6 +6713,12 @@ std::ostream& out, const buf_pool_t& buf_pool) { + /* These locking requirements might be relaxed if desired */ + ut_ad(mutex_own(&buf_pool.LRU_list_mutex)); + ut_ad(mutex_own(&buf_pool.free_list_mutex)); + ut_ad(mutex_own(&buf_pool.flush_state_mutex)); + ut_ad(buf_flush_list_mutex_own(&buf_pool)); + out << "[buffer pool instance: " << "buf_pool size=" << buf_pool.curr_size << ", database pages=" << UT_LIST_GET_LEN(buf_pool.LRU) === modified file 'storage/innobase/buf/buf0dblwr.cc' --- storage/innobase/buf/buf0dblwr.cc 2014-08-26 11:08:37 +0000 +++ storage/innobase/buf/buf0dblwr.cc 2015-01-16 19:30:41 +0000 @@ -951,6 +951,7 @@ buf_page_t* bpage) /*!< in: buffer block to write */ { ut_a(buf_page_in_file(bpage)); + ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex)); try_again: mutex_enter(&buf_dblwr->mutex); === modified file 'storage/innobase/buf/buf0dump.cc' --- storage/innobase/buf/buf0dump.cc 2014-08-12 08:15:50 +0000 +++ storage/innobase/buf/buf0dump.cc 2015-01-16 19:30:41 +0000 @@ -53,8 +53,8 @@ static ibool buf_load_abort_flag = FALSE; /* Used to temporary store dump info in order to avoid IO while holding -buffer pool mutex during dump and also to sort the contents of the dump -before reading the pages from disk during load. +buffer pool LRU list mutex during dump and also to sort the contents of the +dump before reading the pages from disk during load. We store the space id in the high 32 bits and page no in low 32 bits. 
*/ typedef ib_uint64_t buf_dump_t; @@ -213,15 +213,15 @@ buf_pool = buf_pool_from_array(i); - /* obtain buf_pool mutex before allocate, since + /* obtain buf_pool LRU list mutex before allocate, since UT_LIST_GET_LEN(buf_pool->LRU) could change */ - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); n_pages = UT_LIST_GET_LEN(buf_pool->LRU); /* skip empty buffer pools */ if (n_pages == 0) { - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); continue; } @@ -239,7 +239,7 @@ n_pages * sizeof(*dump))); if (dump == NULL) { - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); fclose(f); buf_dump_status(STATUS_ERR, "Cannot allocate " ULINTPF " bytes: %s", @@ -261,7 +261,7 @@ ut_a(j == n_pages); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) { ret = fprintf(f, ULINTPF "," ULINTPF "\n", === modified file 'storage/innobase/buf/buf0flu.cc' --- storage/innobase/buf/buf0flu.cc 2014-11-04 13:44:51 +0000 +++ storage/innobase/buf/buf0flu.cc 2015-01-16 19:30:41 +0000 @@ -374,9 +374,8 @@ buf_block_t* block, /*!< in/out: block which is modified */ lsn_t lsn) /*!< in: oldest modification */ { - ut_ad(!buf_pool_mutex_own(buf_pool)); ut_ad(log_flush_order_mutex_own()); - ut_ad(buf_page_mutex_own(block)); + ut_ad(mutex_own(buf_page_get_mutex(&block->page))); buf_flush_list_mutex_enter(buf_pool); @@ -436,15 +435,14 @@ buf_page_t* prev_b; buf_page_t* b; - ut_ad(!buf_pool_mutex_own(buf_pool)); ut_ad(log_flush_order_mutex_own()); - ut_ad(buf_page_mutex_own(block)); + ut_ad(mutex_own(buf_page_get_mutex(&block->page))); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); buf_flush_list_mutex_enter(buf_pool); - /* The field in_LRU_list is protected by buf_pool->mutex, which - we are not holding. However, while a block is in the flush + /* The field in_LRU_list is protected by buf_pool->LRU_list_mutex, + which we are not holding. However, while a block is in the flush list, it is dirty and cannot be discarded, not from the page_hash or from the LRU list. At most, the uncompressed page frame of a compressed block may be discarded or created @@ -517,7 +515,8 @@ /********************************************************************//** Returns TRUE if the file page block is immediately suitable for replacement, -i.e., the transition FILE_PAGE => NOT_USED allowed. +i.e., the transition FILE_PAGE => NOT_USED allowed. The caller must hold the +LRU list and block mutexes. 
@return TRUE if can replace immediately */ ibool @@ -528,7 +527,7 @@ { #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); #endif /* UNIV_DEBUG */ ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(bpage->in_LRU_list); @@ -557,17 +556,18 @@ buf_page_in_file(bpage) */ buf_flush_t flush_type)/*!< in: type of flush */ { + ut_a(buf_page_in_file(bpage) + || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH #ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); -#endif /* UNIV_DEBUG */ - - ut_a(buf_page_in_file(bpage)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); + && !mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex) +#endif + )); + ut_ad(mutex_own(buf_page_get_mutex(bpage)) + || flush_type == BUF_FLUSH_LIST); ut_ad(flush_type < BUF_FLUSH_N_TYPES); if (bpage->oldest_modification == 0 - || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + || buf_page_get_io_fix_unlocked(bpage) != BUF_IO_NONE) { return(false); } @@ -575,6 +575,7 @@ switch (flush_type) { case BUF_FLUSH_LIST: + return(buf_page_get_state(bpage) != BUF_BLOCK_REMOVE_HASH); case BUF_FLUSH_LRU: case BUF_FLUSH_SINGLE_PAGE: return(true); @@ -597,8 +598,11 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); +#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_DIRTY + || mutex_own(&buf_pool->LRU_list_mutex)); +#endif ut_ad(bpage->in_flush_list); buf_flush_list_mutex_enter(buf_pool); @@ -671,7 +675,6 @@ buf_page_t* prev_b = NULL; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); /* Must reside in the same buffer pool. */ ut_ad(buf_pool == buf_pool_from_bpage(dpage)); @@ -679,13 +682,6 @@ buf_flush_list_mutex_enter(buf_pool); - /* FIXME: At this point we have both buf_pool and flush_list - mutexes. Theoretically removal of a block from flush list is - only covered by flush_list mutex but currently we do - have buf_pool mutex in buf_flush_remove() therefore this block - is guaranteed to be in the flush list. We need to check if - this will work without the assumption of block removing code - having the buf_pool mutex. */ ut_ad(bpage->in_flush_list); ut_ad(dpage->in_flush_list); @@ -733,14 +729,15 @@ /*=====================*/ buf_page_t* bpage) /*!< in: pointer to the block in question */ { - buf_flush_t flush_type; + buf_flush_t flush_type = buf_page_get_flush_type(bpage); buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(bpage); + mutex_enter(&buf_pool->flush_state_mutex); buf_flush_remove(bpage); - flush_type = buf_page_get_flush_type(bpage); + buf_page_set_io_fix(bpage, BUF_IO_NONE); + buf_pool->n_flush[flush_type]--; if (buf_pool->n_flush[flush_type] == 0 @@ -751,6 +748,8 @@ os_event_set(buf_pool->no_flush[flush_type]); } + mutex_exit(&buf_pool->flush_state_mutex); + buf_dblwr_update(bpage, flush_type); } #endif /* !UNIV_HOTBACKUP */ @@ -904,7 +903,7 @@ #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); #endif DBUG_PRINT("ib_buf", ("flush %s %u page " UINT32PF ":" UINT32PF, @@ -913,15 +912,13 @@ ut_ad(buf_page_in_file(bpage)); - /* We are not holding buf_pool->mutex or block_mutex here. - Nevertheless, it is safe to access bpage, because it is - io_fixed and oldest_modification != 0. 
Thus, it cannot be - relocated in the buffer pool or removed from flush_list or - LRU_list. */ - ut_ad(!buf_pool_mutex_own(buf_pool)); + /* We are not holding block_mutex here. Nevertheless, it is safe to + access bpage, because it is io_fixed and oldest_modification != 0. + Thus, it cannot be relocated in the buffer pool or removed from + flush_list or LRU_list. */ ut_ad(!buf_flush_list_mutex_own(buf_pool)); ut_ad(!buf_page_get_mutex(bpage)->is_owned()); - ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); + ut_ad(buf_page_get_io_fix_unlocked(bpage) == BUF_IO_WRITE); ut_ad(bpage->oldest_modification != 0); #ifdef UNIV_IBUF_COUNT_DEBUG @@ -1008,9 +1005,10 @@ Writes a flushable page asynchronously from the buffer pool to a file. NOTE: in simulated aio we must call os_aio_simulated_wake_handler_threads after we have posted a batch of -writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be -held upon entering this function, and they will be released by this -function if it returns true. +writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this +function. The LRU list mutex must be held iff flush_type +== BUF_FLUSH_SINGLE_PAGE. Both mutexes will be released by this function if it +returns true. @return TRUE if the page was flushed */ ibool @@ -1024,7 +1022,15 @@ BPageMutex* block_mutex; ut_ad(flush_type < BUF_FLUSH_N_TYPES); - ut_ad(buf_pool_mutex_own(buf_pool)); + /* Hold the LRU list mutex iff called for a single page LRU + flush. A single page LRU flush is already non-performant, and holding + the LRU list mutex allows us to avoid having to store the previous LRU + list page or to restart the LRU scan in + buf_flush_single_page_from_LRU(). */ + ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE || + !mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(flush_type != BUF_FLUSH_SINGLE_PAGE || + mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(buf_page_in_file(bpage)); ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE); @@ -1068,6 +1074,8 @@ /* We are committed to flushing by the time we get here */ + mutex_enter(&buf_pool->flush_state_mutex); + buf_page_set_io_fix(bpage, BUF_IO_WRITE); buf_page_set_flush_type(bpage, flush_type); @@ -1078,8 +1086,12 @@ ++buf_pool->n_flush[flush_type]; + mutex_exit(&buf_pool->flush_state_mutex); + mutex_exit(block_mutex); - buf_pool_mutex_exit(buf_pool); + + if (flush_type == BUF_FLUSH_SINGLE_PAGE) + mutex_exit(&buf_pool->LRU_list_mutex); if (flush_type == BUF_FLUSH_LIST && is_uncompressed @@ -1111,9 +1123,9 @@ # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG /********************************************************************//** Writes a flushable page asynchronously from the buffer pool to a file. -NOTE: buf_pool->mutex and block->mutex must be held upon entering this -function, and they will be released by this function after flushing. -This is loosely based on buf_flush_batch() and buf_flush_page(). +NOTE: block and LRU list mutexes must be held upon entering this function, and +they will be released by this function after flushing. This is loosely based on +buf_flush_batch() and buf_flush_page(). 
@return TRUE if the page was flushed and the mutexes released */ ibool @@ -1122,16 +1134,15 @@ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ buf_block_t* block) /*!< in/out: buffer control block */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(buf_page_mutex_own(block)); + ut_ad(mutex_own(buf_page_get_mutex(&block->page))); if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) { return(FALSE); } - /* The following call will release the buffer pool and - block mutex. */ + /* The following call will release the LRU list and block mutexes. */ return(buf_flush_page( buf_pool, &block->page, BUF_FLUSH_SINGLE_PAGE, true)); @@ -1151,21 +1162,26 @@ buf_page_t* bpage; buf_pool_t* buf_pool = buf_pool_get(page_id); bool ret; + rw_lock_t* hash_lock; + ib_mutex_t* block_mutex; ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - buf_pool_mutex_enter(buf_pool); - /* We only want to flush pages from this buffer pool. */ - bpage = buf_page_hash_get(buf_pool, page_id); + bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock); if (!bpage) { - buf_pool_mutex_exit(buf_pool); return(false); } + block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + rw_lock_s_unlock(hash_lock); + ut_a(buf_page_in_file(bpage)); /* We avoid flushing 'non-old' blocks in an LRU flush, @@ -1173,15 +1189,13 @@ ret = false; if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) { - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); if (buf_flush_ready_for_flush(bpage, flush_type)) { ret = true; } - mutex_exit(block_mutex); } - buf_pool_mutex_exit(buf_pool); + + mutex_exit(block_mutex); return(ret); } @@ -1207,6 +1221,8 @@ buf_pool_t* buf_pool = buf_pool_get(page_id); ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(!buf_flush_list_mutex_own(buf_pool)); if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || srv_flush_neighbors == 0) { @@ -1276,6 +1292,8 @@ for (ulint i = low; i < high; i++) { buf_page_t* bpage; + rw_lock_t* hash_lock; + ib_mutex_t* block_mutex; if ((count + n_flushed) >= n_to_flush) { @@ -1296,17 +1314,21 @@ buf_pool = buf_pool_get(cur_page_id); - buf_pool_mutex_enter(buf_pool); - /* We only want to flush pages from this buffer pool. */ - bpage = buf_page_hash_get(buf_pool, cur_page_id); + bpage = buf_page_hash_get_s_locked(buf_pool, cur_page_id, + &hash_lock); if (bpage == NULL) { - buf_pool_mutex_exit(buf_pool); continue; } + block_mutex = buf_page_get_mutex(bpage); + + mutex_enter(block_mutex); + + rw_lock_s_unlock(hash_lock); + ut_a(buf_page_in_file(bpage)); /* We avoid flushing 'non-old' blocks in an LRU flush, @@ -1316,10 +1338,6 @@ || i == page_id.page_no() || buf_page_is_old(bpage)) { - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - if (buf_flush_ready_for_flush(bpage, flush_type) && (i == page_id.page_no() || bpage->buf_fix_count == 0)) { @@ -1332,16 +1350,15 @@ ++count; } else { + mutex_exit(block_mutex); - buf_pool_mutex_exit(buf_pool); } continue; - } else { - mutex_exit(block_mutex); } } - buf_pool_mutex_exit(buf_pool); + + mutex_exit(block_mutex); } if (count > 1) { @@ -1357,10 +1374,11 @@ /********************************************************************//** Check if the block is modified and ready for flushing. 
If the the block -is ready to flush then flush the page and try o flush its neighbors. +is ready to flush then flush the page and try o flush its neighbors. The caller +must hold the buffer pool list mutex corresponding to the type of flush. -@return TRUE if buf_pool mutex was released during this function. -This does not guarantee that some pages were written as well. +@return TRUE if the list mutex was released during this function. This does +not guarantee that some pages were written as well. Number of pages written are incremented to the count. */ static bool @@ -1378,41 +1396,69 @@ { #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - - ut_ad(buf_pool_mutex_own(buf_pool)); #endif /* UNIV_DEBUG */ bool flushed; - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - ut_a(buf_page_in_file(bpage)); + BPageMutex* block_mutex = NULL; + + ut_ad(flush_type != BUF_FLUSH_SINGLE_PAGE); + + ut_ad((flush_type == BUF_FLUSH_LRU + && mutex_own(&buf_pool->LRU_list_mutex)) + || (flush_type == BUF_FLUSH_LIST + && buf_flush_list_mutex_own(buf_pool))); + + if (flush_type == BUF_FLUSH_LRU) { + block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); + } + + ut_a(buf_page_in_file(bpage) + || (buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH +#ifdef UNIV_DEBUG + && !mutex_own(&buf_pool->LRU_list_mutex) +#endif + )); if (buf_flush_ready_for_flush(bpage, flush_type)) { buf_pool_t* buf_pool; buf_pool = buf_pool_from_bpage(bpage); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&buf_pool->LRU_list_mutex); + } + const page_id_t page_id = bpage->id; - mutex_exit(block_mutex); - - buf_pool_mutex_exit(buf_pool); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(block_mutex); + } else { + buf_flush_list_mutex_exit(buf_pool); + } /* Try to flush also all the neighbors */ *count += buf_flush_try_neighbors( page_id, flush_type, *count, n_to_flush); - buf_pool_mutex_enter(buf_pool); - flushed = TRUE; + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&buf_pool->LRU_list_mutex); + } else { + buf_flush_list_mutex_enter(buf_pool); + } + flushed = true; + } else if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(block_mutex); + + flushed = false; } else { - mutex_exit(block_mutex); - flushed = false; } - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad((flush_type == BUF_FLUSH_LRU + && mutex_own(&buf_pool->LRU_list_mutex)) + || (flush_type == BUF_FLUSH_LIST + && buf_flush_list_mutex_own(buf_pool))); return(flushed); } @@ -1424,7 +1470,7 @@ tail of the unzip_LRU and puts those freed frames in the free list. Note that it is a best effort attempt and it is not guaranteed that after a call to this function there will be 'max' blocks in the free -list. +list. The caller must hold the LRU list mutex. @return number of blocks moved to the free list. */ static ulint @@ -1439,7 +1485,7 @@ ulint free_len = UT_LIST_GET_LEN(buf_pool->free); ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); @@ -1448,15 +1494,22 @@ && free_len < srv_LRU_scan_depth && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) { + BPageMutex* block_mutex = buf_page_get_mutex(&block->page); + ++scanned; + + mutex_enter(block_mutex); + if (buf_LRU_free_page(&block->page, false)) { - /* Block was freed. 
buf_pool->mutex potentially - released and reacquired */ + + /* Block was freed, all mutexes released */ ++count; + mutex_enter(&buf_pool->LRU_list_mutex); block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); } else { + mutex_exit(block_mutex); block = UT_LIST_GET_PREV(unzip_LRU, block); } @@ -1464,7 +1517,7 @@ lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); } - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); if (count) { MONITOR_INC_VALUE_CUMULATIVE( @@ -1508,15 +1561,11 @@ ulint count = 0; ulint free_len = UT_LIST_GET_LEN(buf_pool->free); ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU); - ulint withdraw_depth = 0; - - ut_ad(buf_pool_mutex_own(buf_pool)); - - if (buf_pool->curr_size < buf_pool->old_size - && buf_pool->withdraw_target > 0) { - withdraw_depth = buf_pool->withdraw_target - - UT_LIST_GET_LEN(buf_pool->withdraw); - } + ulint withdraw_depth; + + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + + withdraw_depth = buf_get_withdraw_depth(buf_pool); for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); bpage != NULL && count + evict_count < max @@ -1530,31 +1579,37 @@ BPageMutex* block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); + ulint failed_acquire = mutex_enter_nowait(block_mutex); - if (buf_flush_ready_for_replace(bpage)) { + if (!failed_acquire && buf_flush_ready_for_replace(bpage)) { /* block is ready for eviction i.e., it is clean and is not IO-fixed or buffer fixed. */ - mutex_exit(block_mutex); if (buf_LRU_free_page(bpage, true)) { ++evict_count; + mutex_enter(&buf_pool->LRU_list_mutex); + } else { + mutex_exit(block_mutex); } - } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) { + } else if (!failed_acquire + && buf_flush_ready_for_flush(bpage, + BUF_FLUSH_LRU)) { /* Block is ready for flush. Dispatch an IO request. The IO helper thread will put it on free list in IO completion routine. */ mutex_exit(block_mutex); buf_flush_page_and_try_neighbors( bpage, BUF_FLUSH_LRU, max, &count); + } else if (failed_acquire) { + ut_ad(buf_pool->lru_hp.is_hp(prev)); } else { /* Can't evict or dispatch this block. Go to previous. */ + mutex_exit(block_mutex); ut_ad(buf_pool->lru_hp.is_hp(prev)); - mutex_exit(block_mutex); } ut_ad(!mutex_own(block_mutex)); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); free_len = UT_LIST_GET_LEN(buf_pool->free); lru_len = UT_LIST_GET_LEN(buf_pool->LRU); @@ -1567,7 +1622,7 @@ should be flushed, we factor in this value. */ buf_lru_flush_page_count += count; - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); if (evict_count) { MONITOR_INC_VALUE_CUMULATIVE( @@ -1606,6 +1661,8 @@ ulint count = 0; std::pair res; + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { count = buf_free_from_unzip_LRU_list_batch(buf_pool, max); } @@ -1645,7 +1702,6 @@ ulint count = 0; ulint scanned = 0; - ut_ad(buf_pool_mutex_own(buf_pool)); /* Start from the end of the list looking for a suitable block to be flushed. 
*/ @@ -1670,7 +1726,6 @@ prev = UT_LIST_GET_PREV(list, bpage); buf_pool->flush_hp.set(prev); - buf_flush_list_mutex_exit(buf_pool); #ifdef UNIV_DEBUG bool flushed = @@ -1678,8 +1733,6 @@ buf_flush_page_and_try_neighbors( bpage, BUF_FLUSH_LIST, min_n, &count); - buf_flush_list_mutex_enter(buf_pool); - ut_ad(flushed || buf_pool->flush_hp.is_hp(prev)); --len; @@ -1704,8 +1757,6 @@ count); } - ut_ad(buf_pool_mutex_own(buf_pool)); - return(count); } @@ -1751,13 +1802,13 @@ || !sync_check_iterate(check)); } - buf_pool_mutex_enter(buf_pool); - - /* Note: The buffer pool mutex is released and reacquired within + /* Note: The buffer pool mutexes are released and reacquired within the flush functions. */ switch (flush_type) { case BUF_FLUSH_LRU: + mutex_enter(&buf_pool->LRU_list_mutex); res = buf_do_LRU_batch(buf_pool, min_n); + mutex_exit(&buf_pool->LRU_list_mutex); break; case BUF_FLUSH_LIST: res.first = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit); @@ -1767,8 +1818,6 @@ ut_error; } - buf_pool_mutex_exit(buf_pool); - DBUG_PRINT("ib_buf", ("flush %u completed, flushed %u pages, evicted %u pages", unsigned(flush_type), unsigned(res.first), @@ -1809,14 +1858,14 @@ { ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->flush_state_mutex); if (buf_pool->n_flush[flush_type] > 0 || buf_pool->init_flush[flush_type] == TRUE) { /* There is already a flush batch of the same type running */ - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); return(FALSE); } @@ -1825,7 +1874,7 @@ os_event_reset(buf_pool->no_flush[flush_type]); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); return(TRUE); } @@ -1842,7 +1891,7 @@ ulint flushed_page_count)/*!< in: flushed (not evicted!) page count */ { - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->flush_state_mutex); buf_pool->init_flush[flush_type] = FALSE; @@ -1855,7 +1904,7 @@ os_event_set(buf_pool->no_flush[flush_type]); } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); if (!srv_read_only_mode && flushed_page_count) { buf_dblwr_flush_buffered_writes(); @@ -2039,14 +2088,14 @@ buf_page_t* bpage; ibool freed; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); for (bpage = buf_pool->single_scan_itr.start(), scanned = 0, freed = false; bpage != NULL; ++scanned, bpage = buf_pool->single_scan_itr.get()) { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); @@ -2061,12 +2110,12 @@ if (buf_flush_ready_for_replace(bpage)) { /* block is ready for eviction i.e., it is clean and is not IO-fixed or buffer fixed. */ - mutex_exit(block_mutex); if (buf_LRU_free_page(bpage, true)) { - buf_pool_mutex_exit(buf_pool); freed = true; break; + } else { + mutex_exit(block_mutex); } } else if (buf_flush_ready_for_flush( @@ -2098,7 +2147,7 @@ if (!freed) { /* Can't find a single flushable page. */ ut_ad(!bpage); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } if (scanned) { @@ -2109,7 +2158,7 @@ scanned); } - ut_ad(!buf_pool_mutex_own(buf_pool)); + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); return(freed); } @@ -2134,16 +2183,8 @@ /* srv_LRU_scan_depth can be arbitrarily large value. We cap it with current LRU size. 
*/ - buf_pool_mutex_enter(buf_pool); scan_depth = UT_LIST_GET_LEN(buf_pool->LRU); - if (buf_pool->curr_size < buf_pool->old_size - && buf_pool->withdraw_target > 0) { - withdraw_depth = buf_pool->withdraw_target - - UT_LIST_GET_LEN(buf_pool->withdraw); - } else { - withdraw_depth = 0; - } - buf_pool_mutex_exit(buf_pool); + withdraw_depth = buf_get_withdraw_depth(buf_pool); if (withdraw_depth > srv_LRU_scan_depth) { scan_depth = ut_min(withdraw_depth, scan_depth); @@ -2174,15 +2215,15 @@ buf_pool = buf_pool_from_array(i); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->flush_state_mutex); if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0 || buf_pool->init_flush[BUF_FLUSH_LRU]) { - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); } else { - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->flush_state_mutex); } } } @@ -3087,7 +3128,6 @@ { ulint count = 0; - buf_pool_mutex_enter(buf_pool); buf_flush_list_mutex_enter(buf_pool); buf_page_t* bpage; @@ -3096,7 +3136,8 @@ bpage != 0; bpage = UT_LIST_GET_NEXT(list, bpage)) { - ut_ad(buf_page_in_file(bpage)); + ut_ad(buf_page_in_file(bpage) + || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); ut_ad(bpage->in_flush_list); ut_ad(bpage->oldest_modification > 0); @@ -3106,7 +3147,6 @@ } buf_flush_list_mutex_exit(buf_pool); - buf_pool_mutex_exit(buf_pool); return(count); } === modified file 'storage/innobase/buf/buf0lru.cc' --- storage/innobase/buf/buf0lru.cc 2014-11-04 14:08:41 +0000 +++ storage/innobase/buf/buf0lru.cc 2015-01-16 19:30:41 +0000 @@ -71,7 +71,7 @@ /** When dropping the search hash index entries before deleting an ibd file, we build a local array of pages belonging to that tablespace in the buffer pool. Following is the size of that array. -We also release buf_pool->mutex after scanning this many pages of the +We also release buf_pool->LRU_list_mutex after scanning this many pages of the flush_list when dropping a table. This is to ensure that other threads are not blocked for extended period of time when using very large buffer pools. */ @@ -133,7 +133,7 @@ If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), the object will be freed. -The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex +The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex and the appropriate hash_lock. This function will release the buf_page_get_mutex() and the hash_lock. @@ -169,7 +169,7 @@ buf_page_t* bpage, /*!< in: control block */ buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); buf_pool->stat.LRU_bytes += bpage->size.physical(); @@ -186,7 +186,7 @@ /*=========================*/ buf_pool_t* buf_pool) { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); /* If the unzip_LRU list is empty, we can only use the LRU. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) { @@ -270,7 +270,7 @@ ulint num_entries = 0; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); scan_again: for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->LRU); @@ -315,18 +315,18 @@ goto next_page; } - /* Array full. We release the buf_pool->mutex to obey + /* Array full. We release the LRU list mutex to obey the latching order. 
*/ - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); buf_LRU_drop_page_hash_batch( id, page_size, page_arr, num_entries); num_entries = 0; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); - /* Note that we released the buf_pool mutex above + /* Note that we released the buf_pool->LRU_list_mutex above after reading the prev_bpage during processing of a page_hash_batch (i.e.: when the array was full). Because prev_bpage could belong to a compressed-only @@ -340,8 +340,7 @@ guarantee that ALL such entries will be dropped. */ /* If, however, bpage has been removed from LRU list - to the free list then we should restart the scan. - bpage->state is protected by buf_pool mutex. */ + to the free list then we should restart the scan. */ if (bpage != NULL && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { @@ -349,7 +348,7 @@ } } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); /* Drop any remaining batch of search hashed pages. */ buf_LRU_drop_page_hash_batch(id, page_size, page_arr, num_entries); @@ -358,8 +357,8 @@ /******************************************************************//** While flushing (or removing dirty) pages from a tablespace we don't -want to hog the CPU and resources. Release the buffer pool and block -mutex and try to force a context switch. Then reacquire the same mutexes. +want to hog the CPU and resources. Release the LRU list and block +mutexes and try to force a context switch. Then reacquire the same mutexes. The current page is "fixed" before the release of the mutexes and then "unfixed" again once we have reacquired the mutexes. */ static @@ -369,40 +368,37 @@ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ buf_page_t* bpage) /*!< in/out: current page */ { - BPageMutex* block_mutex; + BPageMutex* block_mutex = buf_page_get_mutex(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(block_mutex)); ut_ad(buf_page_in_file(bpage)); - block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - /* "Fix" the block so that the position cannot be changed after we release the buffer pool and block mutexes. */ buf_page_set_sticky(bpage); - /* Now it is safe to release the buf_pool->mutex. */ - buf_pool_mutex_exit(buf_pool); + /* Now it is safe to release the LRU list mutex. */ + mutex_exit(&buf_pool->LRU_list_mutex); mutex_exit(block_mutex); /* Try and force a context switch. */ os_thread_yield(); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); mutex_enter(block_mutex); /* "Unfix" the block now that we have both the - buffer pool and block mutex again. */ + LRU list and block mutexes again. */ buf_page_unset_sticky(bpage); mutex_exit(block_mutex); } /******************************************************************//** -If we have hogged the resources for too long then release the buffer -pool and flush list mutex and do a thread yield. Set the current page +If we have hogged the resources for too long then release the LRU list and +flush list mutexes and do a thread yield. Set the current page to "sticky" so that it is not relocated during the yield. 
@return true if yielded */ static __attribute__((warn_unused_result)) @@ -411,21 +407,47 @@ /*================*/ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ buf_page_t* bpage, /*!< in/out: bpage to remove */ - ulint processed) /*!< in: number of pages processed */ + ulint processed, /*!< in: number of pages processed */ + bool* must_restart) /*!< in/out: if true, we have to + restart the flush list scan */ { /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the - loop we release buf_pool->mutex to let other threads + loop we release buf_pool->LRU_list_mutex to let other threads do their job but only if the block is not IO fixed. This ensures that the block stays in its position in the flush_list. */ if (bpage != NULL && processed >= BUF_LRU_DROP_SEARCH_SIZE - && buf_page_get_io_fix(bpage) == BUF_IO_NONE) { + && buf_page_get_io_fix_unlocked(bpage) == BUF_IO_NONE) { + + BPageMutex* block_mutex = buf_page_get_mutex(bpage); buf_flush_list_mutex_exit(buf_pool); - /* Release the buffer pool and block mutex + /* We don't have to worry about bpage becoming a dangling + pointer by a compressed page flush list relocation because + buf_page_get_gen() won't be called for pages from this + tablespace. */ + + mutex_enter(block_mutex); + /* Recheck the I/O fix and the flush list presence now that we + hold the right mutex */ + if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE + || bpage->oldest_modification == 0)) { + + mutex_exit(block_mutex); + + *must_restart = true; + + buf_flush_list_mutex_enter(buf_pool); + + return false; + } + + *must_restart = false; + + /* Release the LRU list and buf_page_get_mutex() mutex to give the other threads a go. */ buf_flush_yield(buf_pool, bpage); @@ -454,18 +476,20 @@ /*=====================*/ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ buf_page_t* bpage, /*!< in/out: bpage to remove */ - bool flush) /*!< in: flush to disk if true but + bool flush, /*!< in: flush to disk if true but don't remove else remove without flushing to disk */ + bool* must_restart) /*!< in/out: if true, must restart the + flush list scan */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(buf_flush_list_mutex_own(buf_pool)); - /* bpage->space and bpage->io_fix are protected by - buf_pool->mutex and block_mutex. It is safe to check - them while holding buf_pool->mutex only. */ + /* It is safe to check bpage->space and bpage->io_fix while holding + buf_pool->LRU_list_mutex only. */ - if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage) + != BUF_IO_NONE)) { /* We cannot remove this page during this scan yet; maybe the system is currently reading it @@ -474,24 +498,33 @@ } - BPageMutex* block_mutex; + BPageMutex* block_mutex = buf_page_get_mutex(bpage); bool processed = false; - block_mutex = buf_page_get_mutex(bpage); - - /* We have to release the flush_list_mutex to obey the - latching order. We are however guaranteed that the page - will stay in the flush_list and won't be relocated because - buf_flush_remove() and buf_flush_relocate_on_flush_list() - need buf_pool->mutex as well. */ + /* We don't have to worry about bpage becoming a dangling + pointer by a compressed page flush list relocation because + buf_page_get_gen() won't be called for pages from this + tablespace. 
*/ buf_flush_list_mutex_exit(buf_pool); mutex_enter(block_mutex); - ut_ad(bpage->oldest_modification != 0); - - if (!flush) { + /* Recheck the page I/O fix and the flush list presence now + that we hold the right mutex. */ + if (UNIV_UNLIKELY(buf_page_get_io_fix(bpage) != BUF_IO_NONE + || bpage->oldest_modification == 0)) { + + /* The page became I/O-fixed or is not on the flush + list anymore, this invalidates any flush-list-page + pointers we have. */ + + mutex_exit(block_mutex); + + *must_restart = true; + processed = false; + + } else if (!flush) { buf_flush_remove(bpage); @@ -501,8 +534,8 @@ } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) { - /* The following call will release the buffer pool - and block mutex. */ + /* The following call will release the LRU list + and block mutexes. */ processed = buf_flush_page( buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false); @@ -510,7 +543,7 @@ /* Wake possible simulated aio thread to actually post the writes to the operating system */ os_aio_simulated_wake_handler_threads(); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); } else { mutex_exit(block_mutex); } @@ -521,7 +554,7 @@ buf_flush_list_mutex_enter(buf_pool); ut_ad(!mutex_own(block_mutex)); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); return(processed); } @@ -551,9 +584,12 @@ buf_page_t* bpage; ulint processed = 0; + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + buf_flush_list_mutex_enter(buf_pool); rescan: + bool must_restart = false; bool all_freed = true; for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); @@ -572,15 +608,16 @@ /* Skip this block, as it does not belong to the target space. */ - } else if (!buf_flush_or_remove_page(buf_pool, bpage, flush)) { + } else if (!buf_flush_or_remove_page(buf_pool, bpage, flush, + &must_restart)) { /* Remove was unsuccessful, we have to try again by scanning the entire list from the end. This also means that we never released the - buf_pool mutex. Therefore we can trust the prev + flush list mutex. Therefore we can trust the prev pointer. buf_flush_or_remove_page() released the - flush list mutex but not the buf_pool mutex. + flush list mutex but not the LRU list mutex. Therefore it is possible that a new page was added to the flush list. For example, in case where we are at the head of the flush list and @@ -598,17 +635,23 @@ } else if (flush) { /* The processing was successful. And during the - processing we have released the buf_pool mutex + processing we have released all the buf_pool mutexes when calling buf_page_flush(). We cannot trust prev pointer. */ goto rescan; + } else if (UNIV_UNLIKELY(must_restart)) { + + ut_ad(!all_freed); + break; } ++processed; /* Yield if we have hogged the CPU and mutexes for too long. */ - if (buf_flush_try_yield(buf_pool, prev, processed)) { + if (buf_flush_try_yield(buf_pool, prev, processed, + &must_restart)) { + ut_ad(!must_restart); /* Reset the batch size counter if we had to yield. */ processed = 0; @@ -654,11 +697,13 @@ dberr_t err; do { - buf_pool_mutex_enter(buf_pool); + /* TODO: it should be possible to avoid locking the LRU list + mutex here. 
*/ + mutex_enter(&buf_pool->LRU_list_mutex); err = buf_flush_or_remove_pages(buf_pool, id, flush, trx); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); ut_ad(buf_flush_validate(buf_pool)); @@ -691,7 +736,7 @@ ibool all_freed; scan_again: - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); all_freed = TRUE; @@ -708,15 +753,16 @@ prev_bpage = UT_LIST_GET_PREV(LRU, bpage); - /* bpage->id.space() and bpage->io_fix are protected by - buf_pool->mutex and the block_mutex. It is safe to check - them while holding buf_pool->mutex only. */ + /* It is safe to check bpage->id.space() and bpage->io_fix + while holding buf_pool->LRU_list_mutex only and later recheck + while holding the buf_page_get_mutex() mutex. */ if (bpage->id.space() != id) { /* Skip this block, as it does not belong to the space that is being invalidated. */ goto next_page; - } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + } else if (UNIV_UNLIKELY(buf_page_get_io_fix_unlocked(bpage) + != BUF_IO_NONE)) { /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing the modifications to the file */ @@ -732,7 +778,11 @@ mutex_enter(block_mutex); - if (bpage->buf_fix_count > 0) { + if (UNIV_UNLIKELY( + bpage->id.space() != id + || bpage->buf_fix_count > 0 + || (buf_page_get_io_fix(bpage) + != BUF_IO_NONE))) { mutex_exit(block_mutex); @@ -761,7 +811,7 @@ /* Do nothing, because the adaptive hash index covers uncompressed pages only. */ } else if (((buf_block_t*) bpage)->index) { - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); rw_lock_x_unlock(hash_lock); @@ -802,7 +852,7 @@ bpage = prev_bpage; } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); if (!all_freed) { os_thread_sleep(20000); @@ -905,7 +955,8 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(&buf_pool->zip_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); /* Find the first successor of bpage in the LRU list @@ -942,7 +993,7 @@ if true, otherwise scan only srv_LRU_scan_depth / 2 blocks. 
*/ { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) { return(false); } @@ -961,12 +1012,17 @@ prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + mutex_enter(&block->mutex); + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); freed = buf_LRU_free_page(&block->page, false); + if (!freed) + mutex_exit(&block->mutex); + block = prev_block; } @@ -993,7 +1049,7 @@ if true, otherwise scan only up to BUF_LRU_SEARCH_SCAN_THRESHOLD */ { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ulint scanned = 0; bool freed = false; @@ -1009,19 +1065,20 @@ buf_pool->lru_scan_itr.set(prev); + ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_LRU_list); + + unsigned accessed = buf_page_is_accessed(bpage); + mutex_enter(mutex); - ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); - - unsigned accessed = buf_page_is_accessed(bpage); - if (buf_flush_ready_for_replace(bpage)) { - mutex_exit(mutex); + freed = buf_LRU_free_page(bpage, true); - } else { + } + + if (!freed) mutex_exit(mutex); - } if (freed && !accessed) { /* Keep track of pages that are evicted without @@ -1030,8 +1087,10 @@ ++buf_pool->stat.n_ra_pages_evicted; } - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(!mutex_own(mutex)); + + if (freed) + break; } if (scanned) { @@ -1058,10 +1117,24 @@ BUF_LRU_SEARCH_SCAN_THRESHOLD blocks. */ { - ut_ad(buf_pool_mutex_own(buf_pool)); - - return(buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all) - || buf_LRU_free_from_common_LRU_list(buf_pool, scan_all)); + bool freed = false; + bool use_unzip_list = UT_LIST_GET_LEN(buf_pool->unzip_LRU) > 0; + + mutex_enter(&buf_pool->LRU_list_mutex); + + if (use_unzip_list) { + freed = buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all); + } + + if (!freed) { + freed = buf_LRU_free_from_common_LRU_list(buf_pool, scan_all); + } + + if (!freed) { + mutex_exit(&buf_pool->LRU_list_mutex); + } + + return(freed); } /******************************************************************//** @@ -1106,7 +1179,7 @@ { buf_block_t* block; - ut_ad(buf_pool_mutex_own(buf_pool)); + mutex_enter(&buf_pool->free_list_mutex); block = reinterpret_cast<buf_block_t*>( UT_LIST_GET_FIRST(buf_pool->free)); @@ -1119,33 +1192,34 @@ ut_ad(!block->page.in_LRU_list); ut_a(!buf_page_in_file(&block->page)); UT_LIST_REMOVE(buf_pool->free, &block->page); + mutex_exit(&buf_pool->free_list_mutex); - if (buf_pool->curr_size >= buf_pool->old_size - || UT_LIST_GET_LEN(buf_pool->withdraw) - >= buf_pool->withdraw_target + if (!buf_get_withdraw_depth(buf_pool) || !buf_block_will_withdrawn(buf_pool, block)) { /* found valid free block */ - buf_page_mutex_enter(block); - buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); + UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); ut_ad(buf_pool_from_block(block) == buf_pool); - buf_page_mutex_exit(block); - break; + return(block); } /* This should be withdrawn */ + mutex_enter(&buf_pool->free_list_mutex); UT_LIST_ADD_LAST( buf_pool->withdraw, &block->page); ut_d(block->in_withdraw_list = TRUE); + fprintf(stderr, "2 withdrawing block at %p\n", block); block = reinterpret_cast<buf_block_t*>( UT_LIST_GET_FIRST(buf_pool->free)); } + mutex_exit(&buf_pool->free_list_mutex); + return(block); } @@ -1160,8 +1234,6 @@ /*===================================*/ const buf_pool_t* buf_pool) /*!< in: buffer pool instance */ { - ut_ad(buf_pool_mutex_own(buf_pool)); - if (!recv_recovery_is_on() && buf_pool->curr_size == 
buf_pool->old_size && UT_LIST_GET_LEN(buf_pool->free) @@ -1255,10 +1327,10 @@ bool mon_value_was = false; bool started_monitor = false; + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH); loop: - buf_pool_mutex_enter(buf_pool); - buf_LRU_check_size_of_non_data_objects(buf_pool); /* If there is a block in the free list, take it */ @@ -1266,7 +1338,6 @@ if (block != NULL) { - buf_pool_mutex_exit(buf_pool); ut_ad(buf_pool_from_block(block) == buf_pool); memset(&block->page.zip, 0, sizeof block->page.zip); @@ -1279,6 +1350,7 @@ } freed = false; + os_rmb; if (buf_pool->try_LRU_scan || n_iterations > 0) { /* If no block was in the free list, search from the end of the LRU list and try to free a block there. @@ -1294,11 +1366,10 @@ TRUE again when we flush a batch from this buffer pool. */ buf_pool->try_LRU_scan = FALSE; + os_wmb; } } - buf_pool_mutex_exit(buf_pool); - if (freed) { goto loop; } @@ -1380,7 +1451,7 @@ ulint new_len; ut_a(buf_pool->LRU_old); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); #if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5) @@ -1444,7 +1515,7 @@ /*=============*/ buf_pool_t* buf_pool) { - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); /* We first initialize all blocks in the LRU list as old and then use @@ -1480,7 +1551,7 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); if (buf_page_belongs_to_unzip_LRU(bpage)) { buf_block_t* block = reinterpret_cast(bpage); @@ -1516,7 +1587,7 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1599,7 +1670,7 @@ { buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); @@ -1625,7 +1696,7 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1678,7 +1749,7 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); ut_ad(!bpage->in_LRU_list); @@ -1762,7 +1833,7 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); if (bpage->old) { buf_pool->stat.n_pages_made_young++; @@ -1788,12 +1859,13 @@ Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns true, it will temporarily -release buf_pool->mutex. Furthermore, the page frame will no longer be -accessible via bpage. - -The caller must hold buf_pool->mutex and must not hold any -buf_page_get_mutex() when calling this function. +NOTE: If this function returns true, it will release the LRU list mutex, +and release the buf_page_get_mutex() mutex. Furthermore, the page frame will no +longer be accessible via bpage. 
If this function returns false, +the buf_page_get_mutex() might be temporarily released and relocked too. + +The caller must hold the LRU list and buf_page_get_mutex() mutexes. + @return true if freed, false otherwise. */ bool @@ -1810,17 +1882,15 @@ BPageMutex* block_mutex = buf_page_get_mutex(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(block_mutex)); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); - rw_lock_x_lock(hash_lock); - mutex_enter(block_mutex); - if (!buf_page_can_relocate(bpage)) { /* Do not free buffer fixed and I/O-fixed blocks. */ - goto func_exit; + return(false); } #ifdef UNIV_IBUF_COUNT_DEBUG @@ -1832,25 +1902,20 @@ /* Do not completely free dirty blocks. */ if (bpage->oldest_modification) { - goto func_exit; + return(false); } } else if (bpage->oldest_modification > 0 && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); -func_exit: - rw_lock_x_unlock(hash_lock); - mutex_exit(block_mutex); return(false); } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { b = buf_page_alloc_descriptor(); ut_a(b); - memcpy(b, bpage, sizeof *b); } - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); ut_ad(!bpage->in_flush_list == !bpage->oldest_modification); @@ -1858,12 +1923,43 @@ DBUG_PRINT("ib_buf", ("free page " UINT32PF ":" UINT32PF, bpage->id.space(), bpage->id.page_no())); -#ifdef UNIV_SYNC_DEBUG - ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); -#endif /* UNIV_SYNC_DEBUG */ - ut_ad(buf_page_can_relocate(bpage)); + mutex_exit(block_mutex); + + rw_lock_x_lock(hash_lock); + mutex_enter(block_mutex); + + if (UNIV_UNLIKELY(!buf_page_can_relocate(bpage) + || ((zip || !bpage->zip.data) + && bpage->oldest_modification))) { + +not_freed: + rw_lock_x_unlock(hash_lock); + if (b) { + buf_page_free_descriptor(b); + } + + return(false); + } else if (UNIV_UNLIKELY(bpage->oldest_modification + && (buf_page_get_state(bpage) + != BUF_BLOCK_FILE_PAGE))) { + + ut_ad(buf_page_get_state(bpage) + == BUF_BLOCK_ZIP_DIRTY); + goto not_freed; + } + + if (b) { + memcpy(b, bpage, sizeof *b); + } if (!buf_LRU_block_remove_hashed(bpage, zip)) { + + mutex_exit(&buf_pool->LRU_list_mutex); + + if (b) { + buf_page_free_descriptor(b); + } + return(true); } @@ -1961,6 +2057,8 @@ buf_LRU_add_block_low(b, buf_page_is_old(b)); } + mutex_enter(&buf_pool->zip_mutex); + rw_lock_x_unlock(hash_lock); if (b->state == BUF_BLOCK_ZIP_PAGE) { #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG buf_LRU_insert_zip_clean(b); @@ -1978,40 +2076,15 @@ bpage->size.logical(), false)); - mutex_exit(block_mutex); - /* Prevent buf_page_get_gen() from - decompressing the block while we release - buf_pool->mutex and block_mutex. */ - block_mutex = buf_page_get_mutex(b); - - mutex_enter(block_mutex); - + decompressing the block while we release block_mutex. */ buf_page_set_sticky(b); - - mutex_exit(block_mutex); - - rw_lock_x_unlock(hash_lock); - - } else { - - /* There can be multiple threads doing an LRU scan to - free a block. The page_cleaner thread can be doing an - LRU batch whereas user threads can potentially be doing - multiple single page flushes. As we release - buf_pool->mutex below we need to make sure that no one - else considers this block as a victim for page - replacement. This block is already out of page_hash - and we are about to remove it from the LRU list and put - it on the free list. 
*/ - mutex_enter(block_mutex); - - buf_page_set_sticky(bpage); - - mutex_exit(block_mutex); + mutex_exit(&buf_pool->zip_mutex); + mutex_exit(block_mutex); + } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); /* Remove possible adaptive hash index on the page. The page was declared uninitialized by @@ -2048,16 +2121,14 @@ checksum); } - buf_pool_mutex_enter(buf_pool); - - mutex_enter(block_mutex); - - buf_page_unset_sticky(b != NULL ? b : bpage); - - mutex_exit(block_mutex); + if (b) { + mutex_enter(&buf_pool->zip_mutex); + buf_page_unset_sticky(b); + mutex_exit(&buf_pool->zip_mutex); + } buf_LRU_block_free_hashed_page((buf_block_t*) bpage); - + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); return(true); } @@ -2072,9 +2143,6 @@ void* data; buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(buf_pool_mutex_own(buf_pool)); - ut_ad(buf_page_mutex_own(block)); - switch (buf_block_get_state(block)) { case BUF_BLOCK_MEMORY: case BUF_BLOCK_READY_FOR_USE: @@ -2090,8 +2158,6 @@ ut_ad(!block->page.in_flush_list); ut_ad(!block->page.in_LRU_list); - buf_block_set_state(block, BUF_BLOCK_NOT_USED); - UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); #ifdef UNIV_DEBUG /* Wipe contents of page to reveal possible stale pointers to it */ @@ -2105,16 +2171,11 @@ if (data != NULL) { block->page.zip.data = NULL; - buf_page_mutex_exit(block); - buf_pool_mutex_exit_forbid(buf_pool); ut_ad(block->page.size.is_compressed()); buf_buddy_free(buf_pool, data, block->page.size.physical()); - buf_pool_mutex_exit_allow(buf_pool); - buf_page_mutex_enter(block); - page_zip_set_size(&block->page.zip, 0); block->page.size.copy_from( @@ -2123,17 +2184,23 @@ false)); } - if (buf_pool->curr_size < buf_pool->old_size - && UT_LIST_GET_LEN(buf_pool->withdraw) < buf_pool->withdraw_target + if (buf_get_withdraw_depth(buf_pool) && buf_block_will_withdrawn(buf_pool, block)) { /* This should be withdrawn */ + buf_block_set_state(block, BUF_BLOCK_NOT_USED); + mutex_enter(&buf_pool->free_list_mutex); UT_LIST_ADD_LAST( buf_pool->withdraw, &block->page); ut_d(block->in_withdraw_list = TRUE); + fprintf(stderr, "3 withdrawing block at %p\n", block); + mutex_exit(&buf_pool->free_list_mutex); } else { + buf_block_set_state(block, BUF_BLOCK_NOT_USED); + mutex_enter(&buf_pool->free_list_mutex); UT_LIST_ADD_FIRST(buf_pool->free, &block->page); ut_d(block->page.in_free_list = TRUE); + mutex_exit(&buf_pool->free_list_mutex); } UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE); @@ -2144,7 +2211,7 @@ If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), the object will be freed. -The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex +The caller must hold buf_pool->LRU_list_mutex, the buf_page_get_mutex() mutex and the appropriate hash_lock. This function will release the buf_page_get_mutex() and the hash_lock. 
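A minimal caller-side sketch of the protocol stated in the NOTE above for buf_LRU_free_page(): the caller enters with both the LRU list mutex and the block mutex held, and the function releases both only when it returns true. Identifiers are those introduced by this patch; victim selection and retry logic are elided, so this is an illustration rather than code from the patch.

	/* Sketch only: driving buf_LRU_free_page() under the split-mutex scheme. */
	mutex_enter(&buf_pool->LRU_list_mutex);

	/* ... pick a replaceable victim bpage from the tail of buf_pool->LRU ... */

	BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
	mutex_enter(block_mutex);

	bool	freed = buf_LRU_free_page(bpage, true);

	if (!freed) {
		/* Failure: both mutexes are still owned by this thread
		and must be released here. */
		mutex_exit(block_mutex);
		mutex_exit(&buf_pool->LRU_list_mutex);
	}
	/* Success: buf_LRU_free_page() has already released the LRU list
	mutex and the block mutex, and bpage must not be dereferenced again. */
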
@@ -2167,7 +2234,7 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); rw_lock_t* hash_lock; - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id); @@ -2272,7 +2339,7 @@ #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(buf_page_get_mutex(bpage)); rw_lock_x_unlock(hash_lock); - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); buf_print(); buf_LRU_print(); buf_validate(); @@ -2302,12 +2369,10 @@ mutex_exit(&buf_pool->zip_mutex); rw_lock_x_unlock(hash_lock); - buf_pool_mutex_exit_forbid(buf_pool); buf_buddy_free(buf_pool, bpage->zip.data, bpage->size.physical()); - buf_pool_mutex_exit_allow(buf_pool); buf_page_free_descriptor(bpage); return(false); @@ -2331,14 +2396,15 @@ page_hash. Only possibility is when while invalidating a tablespace we buffer fix the prev_page in LRU to avoid relocation during the scan. But that is not - possible because we are holding buf_pool mutex. + possible because we are holding LRU list mutex. 2) Not possible because in buf_page_init_for_read() - we do a look up of page_hash while holding buf_pool - mutex and since we are holding buf_pool mutex here + we do a look up of page_hash while holding LRU list + mutex and since we are holding LRU list mutex here and by the time we'll release it in the caller we'd have inserted the compressed only descriptor in the page_hash. */ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); rw_lock_x_unlock(hash_lock); mutex_exit(&((buf_block_t*) bpage)->mutex); @@ -2350,12 +2416,9 @@ ut_ad(!bpage->in_free_list); ut_ad(!bpage->in_flush_list); ut_ad(!bpage->in_LRU_list); - buf_pool_mutex_exit_forbid(buf_pool); buf_buddy_free(buf_pool, data, bpage->size.physical()); - buf_pool_mutex_exit_allow(buf_pool); - page_zip_set_size(&bpage->zip, 0); bpage->size.copy_from( @@ -2389,9 +2452,6 @@ be in a state where it can be freed */ { buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(buf_pool_mutex_own(buf_pool)); - - buf_page_mutex_enter(block); if (buf_pool->flush_rbt == NULL) { block->page.id.reset(ULINT32_UNDEFINED, ULINT32_UNDEFINED); @@ -2400,34 +2460,41 @@ buf_block_set_state(block, BUF_BLOCK_MEMORY); buf_LRU_block_free_non_file_page(block); - buf_page_mutex_exit(block); } /******************************************************************//** -Remove one page from LRU list and put it to free list */ +Remove one page from LRU list and put it to free list. The caller must hold the +LRU list and block mutexes and have page hash latched in X. The latch and +the block mutexes will be released. 
*/ void buf_LRU_free_one_page( /*==================*/ - buf_page_t* bpage) /*!< in/out: block, must contain a file page and + buf_page_t* bpage, /*!< in/out: block, must contain a file page and be in a state where it can be freed; there may or may not be a hash index to the page */ + bool zip) /*!< in: true if should remove also the + compressed page of an uncompressed page */ { +#if defined(UNIV_DEBUG) || defined(UNIV_SYNC_DEBUG) buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + BPageMutex* block_mutex = buf_page_get_mutex(bpage); +#endif +#ifdef UNIV_SYNC_DEBUG rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id); - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - ut_ad(buf_pool_mutex_own(buf_pool)); - - rw_lock_x_lock(hash_lock); - mutex_enter(block_mutex); - - if (buf_LRU_block_remove_hashed(bpage, true)) { +#endif + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + ut_ad(mutex_own(block_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); +#endif + + if (buf_LRU_block_remove_hashed(bpage, zip)) { buf_LRU_block_free_hashed_page((buf_block_t*) bpage); } - /* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */ + /* buf_LRU_block_remove_hashed() releases hash_lock and block mutex */ #ifdef UNIV_SYNC_DEBUG ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X) && !rw_lock_own(hash_lock, RW_LOCK_S)); @@ -2459,7 +2526,7 @@ } if (adjust) { - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); if (ratio != buf_pool->LRU_old_ratio) { buf_pool->LRU_old_ratio = ratio; @@ -2471,7 +2538,7 @@ } } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } else { buf_pool->LRU_old_ratio = ratio; } @@ -2521,6 +2588,7 @@ buf_LRU_stat_t cur_stat; /* If we haven't started eviction yet then don't update stats. */ + os_rmb; for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool = buf_pool_from_array(i); @@ -2557,6 +2625,7 @@ func_exit: /* Clear the current entry. 
*/ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); + os_wmb; } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG @@ -2571,7 +2640,7 @@ ulint old_len; ulint new_len; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { @@ -2631,6 +2700,10 @@ ut_a(buf_pool->LRU_old_len == old_len); + mutex_exit(&buf_pool->LRU_list_mutex); + + mutex_enter(&buf_pool->free_list_mutex); + CheckInFreeList::validate(buf_pool); for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->free); @@ -2640,6 +2713,10 @@ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED); } + mutex_exit(&buf_pool->free_list_mutex); + + mutex_enter(&buf_pool->LRU_list_mutex); + CheckUnzipLRUAndLRUList::validate(buf_pool); for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU); @@ -2651,7 +2728,7 @@ ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } /**********************************************************************//** @@ -2682,7 +2759,7 @@ /*===================*/ buf_pool_t* buf_pool) { - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); for (const buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU); bpage != NULL; @@ -2738,7 +2815,7 @@ mutex_exit(buf_page_get_mutex(bpage)); } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } /**********************************************************************//** === modified file 'storage/innobase/buf/buf0rea.cc' --- storage/innobase/buf/buf0rea.cc 2014-08-29 01:31:40 +0000 +++ storage/innobase/buf/buf0rea.cc 2015-01-16 19:30:41 +0000 @@ -62,10 +62,14 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); const bool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, + bpage->id); + + mutex_enter(&buf_pool->LRU_list_mutex); + rw_lock_x_lock(hash_lock); + mutex_enter(buf_page_get_mutex(bpage)); /* First unfix and release lock on the bpage */ - buf_pool_mutex_enter(buf_pool); - mutex_enter(buf_page_get_mutex(bpage)); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ); ut_ad(bpage->buf_fix_count == 0); @@ -78,15 +82,13 @@ BUF_IO_READ); } - mutex_exit(buf_page_get_mutex(bpage)); - /* remove the block from LRU list */ buf_LRU_free_one_page(bpage); + mutex_exit(&buf_pool->LRU_list_mutex); + ut_ad(buf_pool->n_pend_reads > 0); - buf_pool->n_pend_reads--; - - buf_pool_mutex_exit(buf_pool); + os_atomic_decrement_ulint(&buf_pool->n_pend_reads, 1); } /** Low-level function which reads a page asynchronously from a file to the @@ -171,6 +173,7 @@ sync ? "sync" : "async")); ut_ad(buf_page_in_file(bpage)); + ut_ad(!mutex_own(&buf_pool_from_bpage(bpage)->LRU_list_mutex)); if (sync) { thd_wait_begin(NULL, THD_WAIT_DISKIO); @@ -296,11 +299,9 @@ high = space_size; } - buf_pool_mutex_enter(buf_pool); - + os_rmb; if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - buf_pool_mutex_exit(buf_pool); return(0); } @@ -309,8 +310,13 @@ that is, reside near the start of the LRU list. 
*/ for (i = low; i < high; i++) { - const buf_page_t* bpage = buf_page_hash_get( - buf_pool, page_id_t(page_id.space(), i)); + + rw_lock_t* hash_lock; + + const buf_page_t* bpage = + buf_page_hash_get_s_locked(buf_pool, + page_id_t(page_id.space(), + i), &hash_lock); if (bpage != NULL && buf_page_is_accessed(bpage) @@ -321,13 +327,16 @@ if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) { - buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(hash_lock); goto read_ahead; } } + + if (bpage) { + rw_lock_s_unlock(hash_lock); + } } - buf_pool_mutex_exit(buf_pool); /* Do nothing */ return(0); @@ -495,6 +504,7 @@ buf_page_t* bpage; buf_frame_t* frame; buf_page_t* pred_bpage = NULL; + unsigned pred_bpage_is_accessed = 0; ulint pred_offset; ulint succ_offset; ulint count; @@ -545,18 +555,15 @@ tablespace_version = fil_space_get_version(page_id.space()); - buf_pool_mutex_enter(buf_pool); - if (high > fil_space_get_size(page_id.space())) { - buf_pool_mutex_exit(buf_pool); /* The area is not whole, return */ return(0); } + os_rmb; if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - buf_pool_mutex_exit(buf_pool); return(0); } @@ -578,9 +585,13 @@ fail_count = 0; + rw_lock_t* hash_lock; + for (i = low; i < high; i++) { - bpage = buf_page_hash_get(buf_pool, - page_id_t(page_id.space(), i)); + + bpage = buf_page_hash_get_s_locked(buf_pool, + page_id_t(page_id.space(), + i), &hash_lock); if (bpage == NULL || !buf_page_is_accessed(bpage)) { /* Not accessed */ @@ -597,7 +608,7 @@ a little against this. */ int res = ut_ulint_cmp( buf_page_is_accessed(bpage), - buf_page_is_accessed(pred_bpage)); + pred_bpage_is_accessed); /* Accesses not in the right order */ if (res != 0 && res != asc_or_desc) { fail_count++; @@ -606,22 +617,29 @@ if (fail_count > threshold) { /* Too many failures: return */ - buf_pool_mutex_exit(buf_pool); + if (bpage) { + rw_lock_s_unlock(hash_lock); + } return(0); } - if (bpage && buf_page_is_accessed(bpage)) { - pred_bpage = bpage; + if (bpage) { + if (buf_page_is_accessed(bpage)) { + pred_bpage = bpage; + pred_bpage_is_accessed + = buf_page_is_accessed(bpage); + } + + rw_lock_s_unlock(hash_lock); } } /* If we got this far, we know that enough pages in the area have been accessed in the right order: linear read-ahead can be sensible */ - bpage = buf_page_hash_get(buf_pool, page_id); + bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock); if (bpage == NULL) { - buf_pool_mutex_exit(buf_pool); return(0); } @@ -647,7 +665,7 @@ pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); - buf_pool_mutex_exit(buf_pool); + rw_lock_s_unlock(hash_lock); if ((page_id.page_no() == low) && (succ_offset == page_id.page_no() + 1)) { @@ -790,6 +808,7 @@ continue; } + os_rmb; while (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { os_thread_sleep(500000); @@ -856,6 +875,7 @@ count = 0; buf_pool = buf_pool_get(cur_page_id); + os_rmb; while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); === modified file 'storage/innobase/fsp/fsp0fsp.cc' --- storage/innobase/fsp/fsp0fsp.cc 2014-11-06 18:59:50 +0000 +++ storage/innobase/fsp/fsp0fsp.cc 2015-01-16 19:30:41 +0000 @@ -1443,11 +1443,9 @@ } else { rw_lock_sx_lock(&block->lock); } - mutex_enter(&block->mutex); buf_block_buf_fix_inc(block, __FILE__, __LINE__); - mutex_exit(&block->mutex); mtr_memo_push(init_mtr, block, rw_latch == RW_X_LATCH ? 
MTR_MEMO_PAGE_X_FIX : MTR_MEMO_PAGE_SX_FIX); === modified file 'storage/innobase/handler/ha_innodb.cc' --- storage/innobase/handler/ha_innodb.cc 2014-09-02 07:56:28 +0000 +++ storage/innobase/handler/ha_innodb.cc 2015-01-16 19:30:41 +0000 @@ -283,7 +283,11 @@ # ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK PSI_KEY(buffer_block_mutex), # endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */ - PSI_KEY(buf_pool_mutex), + PSI_KEY(buf_pool_flush_state_mutex), + PSI_KEY(buf_pool_LRU_list_mutex), + PSI_KEY(buf_pool_free_list_mutex), + PSI_KEY(buf_pool_zip_free_mutex), + PSI_KEY(buf_pool_zip_hash_mutex), PSI_KEY(buf_pool_zip_mutex), PSI_KEY(cache_last_read_mutex), PSI_KEY(dict_foreign_err_mutex), @@ -14858,9 +14862,8 @@ return; } - buf_pool_mutex_enter_all(); + os_rmb; if (srv_buf_pool_old_size != srv_buf_pool_size) { - buf_pool_mutex_exit_all(); push_warning_printf(thd, Sql_condition::SL_WARNING, ER_WRONG_ARGUMENTS, @@ -14871,7 +14874,6 @@ if (srv_buf_pool_instances > 1 && in_val < BUF_POOL_SIZE_THRESHOLD) { - buf_pool_mutex_exit_all(); push_warning_printf(thd, Sql_condition::SL_WARNING, ER_WRONG_ARGUMENTS, @@ -14884,15 +14886,13 @@ srv_buf_pool_size = buf_pool_size_align(static_cast(in_val)); innobase_buffer_pool_size = static_cast(srv_buf_pool_size); + os_wmb; if (srv_buf_pool_old_size == srv_buf_pool_size) { - buf_pool_mutex_exit_all(); /* nothing to do */ return; } - buf_pool_mutex_exit_all(); - ut_snprintf(export_vars.innodb_buffer_pool_resize_status, sizeof(export_vars.innodb_buffer_pool_resize_status), "Requested to resize buffer pool."); @@ -15683,7 +15683,7 @@ for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool = &buf_pool_ptr[i]; - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); for (buf_block_t* block = UT_LIST_GET_LAST( buf_pool->unzip_LRU); @@ -15695,14 +15695,24 @@ ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); - if (!buf_LRU_free_page(&block->page, false)) { + rw_lock_t* hash_lock + = buf_page_hash_lock_get(buf_pool, + block->page.id); + rw_lock_x_lock(hash_lock); + mutex_enter(&block->mutex); + + if (buf_page_can_relocate(&block->page)) { + rw_lock_x_unlock(hash_lock); + mutex_exit(&block->mutex); all_evicted = false; + } else { + buf_LRU_free_one_page(&block->page, false); } block = prev_block; } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } return(all_evicted); === modified file 'storage/innobase/handler/i_s.cc' --- storage/innobase/handler/i_s.cc 2014-08-26 15:37:03 +0000 +++ storage/innobase/handler/i_s.cc 2015-01-16 19:30:41 +0000 @@ -2095,7 +2095,7 @@ buf_pool = buf_pool_from_array(i); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->zip_free_mutex); for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { buf_buddy_stat_t* buddy_stat; @@ -2104,6 +2104,7 @@ table->field[0]->store(BUF_BUDDY_LOW << x); table->field[1]->store(static_cast(i)); + os_rmb; table->field[2]->store(static_cast( buddy_stat->used)); table->field[3]->store(static_cast( @@ -2116,7 +2117,8 @@ static_cast(buddy_stat->relocated_usec / 1000000)); if (reset) { - /* This is protected by buf_pool->mutex. */ + /* This is protected by + buf_pool->zip_free_mutex. 
*/ buddy_stat->relocated = 0; buddy_stat->relocated_usec = 0; } @@ -2127,7 +2129,7 @@ } } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->zip_free_mutex); if (status) { break; @@ -5424,12 +5426,16 @@ out: structure filled with scanned info */ { + BPageMutex* mutex = buf_page_get_mutex(bpage); + ut_ad(pool_id < MAX_BUFFER_POOLS); page_info->pool_id = pool_id; page_info->block_id = pos; + mutex_enter(mutex); + page_info->page_state = buf_page_get_state(bpage); /* Only fetch information for buffers that map to a tablespace, @@ -5468,6 +5474,7 @@ break; case BUF_IO_READ: page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + mutex_exit(mutex); return; } @@ -5488,6 +5495,8 @@ } else { page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; } + + mutex_exit(mutex); } /*******************************************************************//** @@ -5537,16 +5546,10 @@ /* For each chunk, we'll pre-allocate information structures to cache the page information read from - the buffer pool. Doing so before obtain any mutex */ + the buffer pool */ info_buffer = (buf_page_info_t*) mem_heap_zalloc( heap, mem_size); - /* Obtain appropriate mutexes. Since this is diagnostic - buffer pool info printout, we are not required to - preserve the overall consistency, so we can - release mutex periodically */ - buf_pool_mutex_enter(buf_pool); - /* GO through each block in the chunk */ for (n_blocks = num_to_process; n_blocks--; block++) { i_s_innodb_buffer_page_get_info( @@ -5556,8 +5559,6 @@ num_page++; } - buf_pool_mutex_exit(buf_pool); - /* Fill in information schema table with information just collected from the buffer chunk scan */ status = i_s_innodb_buffer_page_fill( @@ -6084,9 +6085,9 @@ DBUG_ENTER("i_s_innodb_fill_buffer_lru"); - /* Obtain buf_pool mutex before allocate info_buffer, since + /* Obtain buf_pool->LRU_list_mutex before allocate info_buffer, since UT_LIST_GET_LEN(buf_pool->LRU) could change */ - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); lru_len = UT_LIST_GET_LEN(buf_pool->LRU); @@ -6120,7 +6121,7 @@ ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool->LRU)); exit: - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); if (info_buffer) { status = i_s_innodb_buf_page_lru_fill( === modified file 'storage/innobase/ibuf/ibuf0ibuf.cc' --- storage/innobase/ibuf/ibuf0ibuf.cc 2014-08-26 15:37:03 +0000 +++ storage/innobase/ibuf/ibuf0ibuf.cc 2015-01-16 19:30:41 +0000 @@ -4544,7 +4544,8 @@ ulint dops[IBUF_OP_COUNT]; ut_ad(block == NULL || page_id.equals_to(block->page.id)); - ut_ad(block == NULL || buf_block_get_io_fix(block) == BUF_IO_READ); + ut_ad(block == NULL + || buf_block_get_io_fix_unlocked(block) == BUF_IO_READ); if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE || trx_sys_hdr_page(page_id) === modified file 'storage/innobase/include/buf0buddy.h' --- storage/innobase/include/buf0buddy.h 2014-07-07 11:14:44 +0000 +++ storage/innobase/include/buf0buddy.h 2015-01-16 19:30:41 +0000 @@ -35,10 +35,10 @@ #include "buf0types.h" /**********************************************************************//** -Allocate a block. The thread calling this function must hold -buf_pool->mutex and must not hold buf_pool->zip_mutex or any -block->mutex. The buf_pool->mutex may be released and reacquired. -This function should only be used for allocating compressed page frames. +Allocate a block. This function should only be used for allocating compressed +page frames. 
The thread calling this function must hold +buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any +block->mutex. @return allocated block, never NULL */ UNIV_INLINE byte* @@ -46,14 +46,9 @@ /*============*/ buf_pool_t* buf_pool, /*!< in/out: buffer pool in which the page resides */ - ulint size, /*!< in: compressed page size + ulint size) /*!< in: compressed page size (between UNIV_ZIP_SIZE_MIN and UNIV_PAGE_SIZE) */ - ibool* lru) /*!< in: pointer to a variable - that will be assigned TRUE if - storage was allocated from the - LRU list and buf_pool->mutex was - temporarily released */ __attribute__((malloc, nonnull)); /**********************************************************************//** @@ -70,11 +65,12 @@ up to UNIV_PAGE_SIZE */ __attribute__((nonnull)); -/** Reallocate a block. +/** Try to reallocate a block. @param[in] buf_pool buffer pool instance @param[in] buf block to be reallocated, must be pointed to by the buffer pool @param[in] size block size, up to UNIV_PAGE_SIZE +@retval true if succeeded or if failed because the block was fixed @retval false if failed because of no free blocks. */ bool === modified file 'storage/innobase/include/buf0buddy.ic' --- storage/innobase/include/buf0buddy.ic 2013-09-09 13:50:47 +0000 +++ storage/innobase/include/buf0buddy.ic 2015-01-16 19:30:41 +0000 @@ -33,23 +33,16 @@ #include "sync0mutex.h" /**********************************************************************//** -Allocate a block. The thread calling this function must hold -buf_pool->mutex and must not hold buf_pool->zip_mutex or any block->mutex. -The buf_pool_mutex may be released and reacquired. +Allocate a block. @return allocated block, never NULL */ void* buf_buddy_alloc_low( /*================*/ buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint i, /*!< in: index of buf_pool->zip_free[], + ulint i) /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /*!< in: pointer to a variable that - will be assigned TRUE if storage was - allocated from the LRU list and - buf_pool->mutex was temporarily - released */ - __attribute__((malloc, nonnull)); + __attribute__((malloc, nonnull, warn_unused_result)); /**********************************************************************//** Deallocate a block. */ @@ -86,10 +79,10 @@ } /**********************************************************************//** -Allocate a block. The thread calling this function must hold -buf_pool->mutex and must not hold buf_pool->zip_mutex or any -block->mutex. The buf_pool->mutex may be released and reacquired. -This function should only be used for allocating compressed page frames. +Allocate a block. This function should only be used for allocating compressed +page frames. The thread calling this function must hold +buf_pool->LRU_list_mutex and must not hold buf_pool->zip_mutex or any +block->mutex. 
@return allocated block, never NULL */ UNIV_INLINE byte* @@ -97,22 +90,16 @@ /*============*/ buf_pool_t* buf_pool, /*!< in/out: buffer pool in which the page resides */ - ulint size, /*!< in: compressed page size + ulint size) /*!< in: compressed page size (between UNIV_ZIP_SIZE_MIN and UNIV_PAGE_SIZE) */ - ibool* lru) /*!< in: pointer to a variable - that will be assigned TRUE if - storage was allocated from the - LRU list and buf_pool->mutex was - temporarily released */ { - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(ut_is_2pow(size)); ut_ad(size >= UNIV_ZIP_SIZE_MIN); ut_ad(size <= UNIV_PAGE_SIZE); - return((byte*) buf_buddy_alloc_low(buf_pool, buf_buddy_get_slot(size), - lru)); + return(static_cast + (buf_buddy_alloc_low(buf_pool,buf_buddy_get_slot(size)))); } /**********************************************************************//** @@ -128,7 +115,6 @@ ulint size) /*!< in: block size, up to UNIV_PAGE_SIZE */ { - ut_ad(buf_pool_mutex_own(buf_pool)); ut_ad(ut_is_2pow(size)); ut_ad(size >= UNIV_ZIP_SIZE_MIN); ut_ad(size <= UNIV_PAGE_SIZE); === modified file 'storage/innobase/include/buf0buf.h' --- storage/innobase/include/buf0buf.h 2014-08-26 11:08:37 +0000 +++ storage/innobase/include/buf0buf.h 2015-01-16 19:30:41 +0000 @@ -338,20 +338,6 @@ #ifndef UNIV_HOTBACKUP /********************************************************************//** -Acquire mutex on all buffer pool instances */ -UNIV_INLINE -void -buf_pool_mutex_enter_all(void); -/*===========================*/ - -/********************************************************************//** -Release mutex on all buffer pool instances */ -UNIV_INLINE -void -buf_pool_mutex_exit_all(void); -/*==========================*/ - -/********************************************************************//** Creates the buffer pool. @return DB_SUCCESS if success, DB_ERROR if not enough memory or error */ @@ -698,11 +684,10 @@ __attribute__((pure)); /********************************************************************//** -Tells if a block is still close enough to the MRU end of the LRU list -meaning that it is not in danger of getting evicted and also implying +Tells, for heuristics, if a block is still close enough to the MRU end of the +LRU list meaning that it is not in danger of getting evicted and also implying that it has been accessed recently. -Note that this is for heuristics only and does not reserve buffer pool -mutex. +The page must be either buffer-fixed, either its page hash must be locked. @return TRUE if block is close to MRU end of LRU */ UNIV_INLINE ibool @@ -710,16 +695,6 @@ /*===================*/ const buf_page_t* bpage); /*!< in: block */ /********************************************************************//** -Recommends a move of a block to the start of the LRU list if there is danger -of dropping from the buffer pool. NOTE: does not reserve the buffer pool -mutex. -@return TRUE if should be made younger */ -UNIV_INLINE -ibool -buf_page_peek_if_too_old( -/*=====================*/ - const buf_page_t* bpage); /*!< in: block to make younger */ -/********************************************************************//** Gets the youngest modification log sequence number for a frame. Returns zero if not file page or no modification occurred yet. @return newest modification to page */ @@ -731,8 +706,8 @@ page frame */ /********************************************************************//** Increments the modify clock of a frame by 1. 
The caller must (1) own the -buf_pool->mutex and block bufferfix count has to be zero, (2) or own an x-lock -on the block. */ +buf_pool->LRU_list_mutex and block bufferfix count has to be zero, (2) or own +an x-lock on the block, (3) or the block must belong to an intrinsic table. */ UNIV_INLINE void buf_block_modify_clock_inc( @@ -980,13 +955,6 @@ Refreshes the statistics used to print per-second averages. */ void -buf_refresh_io_stats( -/*=================*/ - buf_pool_t* buf_pool); /*!< buffer pool instance */ -/**********************************************************************//** -Refreshes the statistics used to print per-second averages. */ - -void buf_refresh_io_stats_all(void); /*=================*/ /*********************************************************************//** @@ -1132,6 +1100,19 @@ /*================*/ const buf_page_t* bpage) /*!< in: pointer to the control block */ __attribute__((pure)); + +/** Gets the io_fix state of a buffer page. Does not assert that the +buf_page_get_mutex() mutex is held, to be used in the cases where it is safe +not to hold it. +@param[in] pointer to the buffer page +@return page io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix_unlocked( +/*=========================*/ + const buf_page_t* bpage) + __attribute__((warn_unused_result)); + /*********************************************************************//** Gets the io_fix state of a block. @return io_fix state */ @@ -1141,6 +1122,18 @@ /*================*/ const buf_block_t* block) /*!< in: pointer to the control block */ __attribute__((pure)); + +/** Gets the io_fix state of a buffer block. Does not assert that the +buf_page_get_mutex() mutex is held, to be used in the cases where it is safe +not to hold it. +@param[in] pointer to the buffer block +@return page io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix_unlocked( +/*=========================*/ + const buf_block_t* block) + __attribute__((warn_unused_result)); /*********************************************************************//** Sets the io_fix state of a block. */ UNIV_INLINE @@ -1224,8 +1217,10 @@ __attribute__((nonnull)); /*********************************************************************//** Gets the buf_block_t handle of a buffered file block if an uncompressed -page frame exists, or NULL. Note: even though bpage is not declared a -const we don't update its value. It is safe to make this pure. +page frame exists, or NULL. page frame exists, or NULL. The caller must hold +either the appropriate hash lock in any mode, either the LRU list mutex. Note: +even though bpage is not declared a const we don't update its value. It is safe +to make this pure. @return control block, or NULL */ UNIV_INLINE buf_block_t* @@ -1468,8 +1463,9 @@ __attribute__((nonnull, warn_unused_result)); /** Add watch for the given page to be read in. Caller must have -appropriate hash_lock for the bpage. This function may release the -hash_lock and reacquire it. +appropriate hash_lock for the bpage and hold the LRU list mutex to avoid a race +condition with buf_LRU_free_page inserting the same page into the page hash. +This function may release the hash_lock and reacquire it. 
@param[in] page_id page id @param[in,out] hash_lock hash_lock currently latched @return NULL if watch set, block if the page is in the buffer pool */ @@ -1561,6 +1557,16 @@ #endif /* !UNIV_HOTBACKUP */ +/** Return how many more pages must be added to the withdraw list to reach the +withdraw target of the currently ongoing buffer pool resize. +@param[in] buf_pool buffer pool instance +@return page count to be withdrawn or zero if the target is already achieved or +if the buffer pool is not currently being resized. */ +UNIV_INLINE +ulint +buf_get_withdraw_depth( + buf_pool_t* buf_pool); + /** The common buffer control block structure for compressed and uncompressed frames */ @@ -1573,21 +1579,19 @@ None of these bit-fields must be modified without holding buf_page_get_mutex() [buf_block_t::mutex or buf_pool->zip_mutex], since they can be stored in the same - machine word. Some of these fields are additionally protected - by buf_pool->mutex. */ + machine word. */ /* @{ */ - /** Page id. Protected by buf_pool mutex. */ + /** Page id. */ page_id_t id; - /** Page size. Protected by buf_pool mutex. */ + /** Page size. */ page_size_t size; /** Count of how manyfold this block is currently bufferfixed. */ ib_uint32_t buf_fix_count; - /** type of pending I/O operation; also protected by - buf_pool->mutex for writes only */ + /** type of pending I/O operation. */ buf_io_fix io_fix; /** Block state. @see buf_page_in_file */ @@ -1607,7 +1611,7 @@ #endif /* !UNIV_HOTBACKUP */ page_zip_des_t zip; /*!< compressed page; zip.data (but not the data it points to) is - also protected by buf_pool->mutex; + protected by buf_pool->zip_mutex; state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ @@ -1626,22 +1630,17 @@ UT_LIST_NODE_T(buf_page_t) list; /*!< based on state, this is a - list node, protected either by - buf_pool->mutex or by - buf_pool->flush_list_mutex, - in one of the following lists in - buf_pool: + list node, protected by the + corresponding list mutex, in one of the + following lists in buf_pool: - BUF_BLOCK_NOT_USED: free, withdraw - BUF_BLOCK_FILE_PAGE: flush_list - BUF_BLOCK_ZIP_DIRTY: flush_list - BUF_BLOCK_ZIP_PAGE: zip_clean - If bpage is part of flush_list - then the node pointers are - covered by buf_pool->flush_list_mutex. - Otherwise these pointers are - protected by buf_pool->mutex. + The node pointers are protected by the + corresponding list mutex. The contents of the list node is undefined if !in_flush_list @@ -1664,8 +1663,8 @@ reads can happen while holding any one of the two mutexes */ ibool in_free_list; /*!< TRUE if in buf_pool->free; when - buf_pool->mutex is free, the following - should hold: in_free_list + buf_pool->free_list_mutex is free, the + following should hold: in_free_list == (state == BUF_BLOCK_NOT_USED) */ #endif /* UNIV_DEBUG */ lsn_t newest_modification; @@ -1689,8 +1688,8 @@ any one of the two mutexes */ /* @} */ /** @name LRU replacement algorithm fields - These fields are protected by buf_pool->mutex only (not - buf_pool->zip_mutex or buf_block_t::mutex). */ + These fields are protected by both buf_pool->LRU_list_mutex and the + block mutex. */ /* @{ */ UT_LIST_NODE_T(buf_page_t) LRU; @@ -1745,26 +1744,23 @@ /*!< node of the decompressed LRU list; a block is in the unzip_LRU list if page.state == BUF_BLOCK_FILE_PAGE - and page.zip.data != NULL */ + and page.zip.data != NULL. Protected by + both LRU_list_mutex and the block + mutex. 
*/ #ifdef UNIV_DEBUG ibool in_unzip_LRU_list;/*!< TRUE if the page is in the decompressed LRU list; used in debugging */ ibool in_withdraw_list; #endif /* UNIV_DEBUG */ - BPageMutex mutex; /*!< mutex protecting this block: - state (also protected by the buffer - pool mutex), io_fix, buf_fix_count, - and accessed; we introduce this new - mutex in InnoDB-5.1 to relieve - contention on the buffer pool mutex */ + BPageMutex mutex; /*!< mutex protecting this block. */ rw_lock_t lock; /*!< read-write lock of the buffer frame */ unsigned lock_hash_val:32;/*!< hashed value of the page address in the record lock hash table; protected by buf_block_t::lock - (or buf_block_t::mutex, buf_pool->mutex - in buf_page_get_gen(), + (or buf_block_t::mutex in + buf_page_get_gen(), buf_page_init_for_read() and buf_page_create()) */ ibool check_index_page_at_flush; @@ -1787,10 +1783,11 @@ positioning: if the modify clock has not changed, we know that the pointer is still valid; this field may be - changed if the thread (1) owns the - pool mutex and the page is not + changed if the thread (1) owns the LRU + list mutex and the page is not bufferfixed, or (2) the thread has an - x-latch on the block */ + x-latch on the block, or (3) the block + must belong to an intrinsic table */ /* @} */ /** @name Hash search fields (unprotected) NOTE that these fields are NOT protected by any semaphore! */ @@ -2027,25 +2024,31 @@ counted as page gets; this field is NOT protected by the buffer pool mutex */ - ulint n_pages_read; /*!< number read operations */ - ulint n_pages_written;/*!< number write operations */ + ulint n_pages_read; /*!< number of read operations. Accessed + atomically. */ + ulint n_pages_written;/*!< number of write operations. Accessed + atomically. */ ulint n_pages_created;/*!< number of pages created - in the pool with no read */ + in the pool with no read. Accessed + atomically. */ ulint n_ra_pages_read_rnd;/*!< number of pages read in - as part of random read ahead */ + as part of random read ahead. Not protected. */ ulint n_ra_pages_read;/*!< number of pages read in - as part of read ahead */ + as part of read ahead. Not protected. */ ulint n_ra_pages_evicted;/*!< number of read ahead pages that are evicted without - being accessed */ + being accessed. Protected by LRU_list_mutex. */ ulint n_pages_made_young; /*!< number of pages made young, in - calls to buf_LRU_make_block_young() */ + calls to buf_LRU_make_block_young(). Protected + by LRU_list_mutex. */ ulint n_pages_not_made_young; /*!< number of pages not made young because the first access was not long enough ago, in - buf_page_peek_if_too_old() */ - ulint LRU_bytes; /*!< LRU size in bytes */ - ulint flush_list_bytes;/*!< flush_list size in bytes */ + buf_page_peek_if_too_old(). Not protected. */ + ulint LRU_bytes; /*!< LRU size in bytes. Protected by + LRU_list_mutex. */ + ulint flush_list_bytes;/*!< flush_list size in bytes. + Protected by flush_list_mutex */ }; /** Statistics of buddy blocks of a given size. 
*/ @@ -2067,8 +2070,12 @@ /** @name General fields */ /* @{ */ - BufPoolMutex mutex; /*!< Buffer pool mutex of this - instance */ + BufListMutex LRU_list_mutex; /*!< LRU list mutex */ + BufListMutex free_list_mutex;/*!< free and withdraw list mutex */ + BufListMutex zip_free_mutex; /*!< buddy allocator mutex */ + BufListMutex zip_hash_mutex; /*!< zip_hash mutex */ + ib_mutex_t flush_state_mutex;/*!< Flush state protection + mutex */ BPageMutex zip_mutex; /*!< Zip mutex of this buffer pool instance, protects compressed only pages (of type buf_page_t, not @@ -2080,10 +2087,8 @@ pool for "old" blocks */ #ifdef UNIV_DEBUG ulint buddy_n_frames; /*!< Number of frames allocated from - the buffer pool to the buddy system */ -#endif -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ulint mutex_exit_forbidden; /*!< Forbid release mutex */ + the buffer pool to the buddy system. + Protected by zip_hash_mutex. */ #endif ut_allocator allocator; /*!< Allocator used for allocating memory for the the "chunks" @@ -2103,12 +2108,7 @@ buf_page_in_file() == TRUE, indexed by (space_id, offset). page_hash is protected by an - array of mutexes. - Changes in page_hash are protected - by buf_pool->mutex and the relevant - page_hash mutex. Lookups can happen - while holding the buf_pool->mutex or - the relevant page_hash mutex. */ + array of mutexes. */ hash_table_t* page_hash_old; /*!< old pointer to page_hash to be freed after resizing buffer pool */ hash_table_t* zip_hash; /*!< hash table of buf_block_t blocks @@ -2116,15 +2116,19 @@ zip buddy system, indexed by block->frame */ ulint n_pend_reads; /*!< number of pending read - operations */ - ulint n_pend_unzip; /*!< number of pending decompressions */ + operations. Accessed atomically */ + ulint n_pend_unzip; /*!< number of pending decompressions. + Accessed atomically. */ time_t last_printout_time; /*!< when buf_print_io was last time - called */ + called. Accesses not protected. */ buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1]; /*!< Statistics of buddy system, - indexed by block size */ + indexed by block size. Protected by + zip_free mutex, except for the used + field, which is also accessed + atomically */ buf_pool_stat_t stat; /*!< current statistics */ buf_pool_stat_t old_stat; /*!< old statistics */ @@ -2134,7 +2138,7 @@ /* @{ */ - FlushListMutex flush_list_mutex;/*!< mutex protecting the + BufListMutex flush_list_mutex;/*!< mutex protecting the flush list access. This mutex protects flush_list, flush_rbt and bpage::list pointers when @@ -2151,14 +2155,17 @@ list */ ibool init_flush[BUF_FLUSH_N_TYPES]; /*!< this is TRUE when a flush of the - given type is being initialized */ + given type is being initialized. + Protected by flush_state_mutex. */ ulint n_flush[BUF_FLUSH_N_TYPES]; /*!< this is the number of pending - writes in the given flush type */ + writes in the given flush type. + Protected by flush_state_mutex. */ os_event_t no_flush[BUF_FLUSH_N_TYPES]; /*!< this is in the set state when there is no flush batch - of the given type running */ + of the given type running. Protected by + flush_state_mutex. */ ib_rbt_t* flush_rbt; /*!< a red-black tree is used exclusively during recovery to speed up insertions in the @@ -2181,7 +2188,8 @@ billion! A thread is allowed to read this for heuristic purposes without holding any - mutex or latch */ + mutex or latch. For non-heuristic + purposes protected by LRU_list_mutex */ ibool try_LRU_scan; /*!< Set to FALSE when an LRU scan for free block fails. 
This flag is used to avoid repeated @@ -2190,8 +2198,8 @@ available in the scan depth for eviction. Set to TRUE whenever we flush a batch from the - buffer pool. Protected by the - buf_pool->mutex */ + buffer pool. Accessed protected by + memory barriers. */ /* @} */ /** @name LRU replacement algorithm fields */ @@ -2205,21 +2213,22 @@ /*!< base node of the withdraw block list. It is only used during shrinking buffer pool size, not to - reuse the blocks will be removed */ + reuse the blocks will be removed. + Protected by free_list_mutex */ ulint withdraw_target;/*!< target length of withdraw block list, when withdrawing */ /** "hazard pointer" used during scan of LRU while doing - LRU list batch. Protected by buf_pool::mutex */ + LRU list batch. Protected by buf_pool::LRU_list_mutex */ LRUHp lru_hp; /** Iterator used to scan the LRU list when searching for - replacable victim. Protected by buf_pool::mutex. */ + replacable victim. Protected by buf_pool::LRU_list_mutex. */ LRUItr lru_scan_itr; /** Iterator used to scan the LRU list when searching for - single page flushing victim. Protected by buf_pool::mutex. */ + single page flushing victim. Protected by buf_pool::LRU_list_mutex. */ LRUItr single_scan_itr; UT_LIST_BASE_NODE_T(buf_page_t) LRU; @@ -2242,7 +2251,8 @@ UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU; /*!< base node of the - unzip_LRU list */ + unzip_LRU list. The list is protected + by LRU_list_mutex. */ /* @} */ /** @name Buddy allocator fields @@ -2259,8 +2269,12 @@ buf_page_t* watch; /*!< Sentinel records for buffer - pool watches. Protected by - buf_pool->mutex. */ + pool watches. Scanning the array is + protected by taking all page_hash + latches in X. Updating or reading an + individual watch page is protected by + a corresponding individual page_hash + latch. */ #if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN # error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN" @@ -2277,18 +2291,10 @@ std::ostream& out, const buf_pool_t& buf_pool); -/** @name Accessors for buf_pool->mutex. -Use these instead of accessing buf_pool->mutex directly. */ +/** @name Accessors for buffer pool mutexes +Use these instead of accessing buffer pool mutexes directly. */ /* @{ */ -/** Test if a buffer pool mutex is owned. */ -#define buf_pool_mutex_own(b) mutex_own(&b->mutex) -/** Acquire a buffer pool mutex. */ -#define buf_pool_mutex_enter(b) do { \ - ut_ad(!(b)->zip_mutex.is_owned()); \ - mutex_enter(&(b)->mutex); \ -} while (0) - /** Test if flush list mutex is owned. */ #define buf_flush_list_mutex_own(b) mutex_own(&(b)->flush_list_mutex) @@ -2310,7 +2316,7 @@ mutex_enter(&(b)->mutex); \ } while (0) -/** Release the trx->mutex. */ +/** Release the block->mutex. */ #define buf_page_mutex_exit(b) do { \ (b)->mutex.exit(); \ } while (0) @@ -2358,31 +2364,6 @@ # define buf_block_hash_lock_held_s_or_x(b, p) (TRUE) #endif /* UNIV_SYNC_DEBUG */ -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG -/** Forbid the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_forbid(b) do { \ - ut_ad(buf_pool_mutex_own(b)); \ - b->mutex_exit_forbidden++; \ -} while (0) -/** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow(b) do { \ - ut_ad(buf_pool_mutex_own(b)); \ - ut_a(b->mutex_exit_forbidden); \ - b->mutex_exit_forbidden--; \ -} while (0) -/** Release the buffer pool mutex. */ -# define buf_pool_mutex_exit(b) do { \ - ut_a(!b->mutex_exit_forbidden); \ - mutex_exit(&b->mutex); \ -} while (0) -#else -/** Forbid the release of the buffer pool mutex. 
*/ -# define buf_pool_mutex_exit_forbid(b) ((void) 0) -/** Allow the release of the buffer pool mutex. */ -# define buf_pool_mutex_exit_allow(b) ((void) 0) -/** Release the buffer pool mutex. */ -# define buf_pool_mutex_exit(b) mutex_exit(&b->mutex) -#endif #endif /* !UNIV_HOTBACKUP */ /* @} */ === modified file 'storage/innobase/include/buf0buf.ic' --- storage/innobase/include/buf0buf.ic 2014-08-26 17:48:07 +0000 +++ storage/innobase/include/buf0buf.ic 2015-01-16 19:30:41 +0000 @@ -124,7 +124,7 @@ /*==========================*/ const buf_page_t* bpage) /*!< in: block */ { - /* This is sometimes read without holding buf_pool->mutex. */ + /* This is sometimes read without holding any buffer pool mutex. */ return(bpage->freed_page_clock); } @@ -141,11 +141,10 @@ } /********************************************************************//** -Tells if a block is still close enough to the MRU end of the LRU list -meaning that it is not in danger of getting evicted and also implying +Tells, for heuristics, if a block is still close enough to the MRU end of the +LRU list meaning that it is not in danger of getting evicted and also implying that it has been accessed recently. -Note that this is for heuristics only and does not reserve buffer pool -mutex. +The page must be either buffer-fixed, either its page hash must be locked. @return TRUE if block is close to MRU end of LRU */ UNIV_INLINE ibool @@ -155,6 +154,9 @@ { buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(bpage->buf_fix_count > 0 + || buf_page_hash_lock_held_s_or_x(buf_pool, bpage)); + /* FIXME: bpage->freed_page_clock is 31 bits */ return((buf_pool->freed_page_clock & ((1UL << 31) - 1)) < ((ulint) bpage->freed_page_clock @@ -162,46 +164,6 @@ * (BUF_LRU_OLD_RATIO_DIV - buf_pool->LRU_old_ratio) / (BUF_LRU_OLD_RATIO_DIV * 4)))); } - -/********************************************************************//** -Recommends a move of a block to the start of the LRU list if there is danger -of dropping from the buffer pool. NOTE: does not reserve the buffer pool -mutex. -@return TRUE if should be made younger */ -UNIV_INLINE -ibool -buf_page_peek_if_too_old( -/*=====================*/ - const buf_page_t* bpage) /*!< in: block to make younger */ -{ - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - - if (buf_pool->freed_page_clock == 0) { - /* If eviction has not started yet, do not update the - statistics or move blocks in the LRU list. This is - either the warm-up phase or an in-memory workload. */ - return(FALSE); - } else if (buf_LRU_old_threshold_ms && bpage->old) { - unsigned access_time = buf_page_is_accessed(bpage); - - /* It is possible that the below comparison returns an - unexpected result. 2^32 milliseconds pass in about 50 days, - so if the difference between ut_time_ms() and access_time - is e.g. 50 days + 15 ms, then the below will behave as if - it is 15 ms. This is known and fixing it would require to - increase buf_page_t::access_time from 32 to 64 bits. 
*/ - if (access_time > 0 - && ((ib_uint32_t) (ut_time_ms() - access_time)) - >= buf_LRU_old_threshold_ms) { - return(TRUE); - } - - buf_pool->stat.n_pages_not_made_young++; - return(FALSE); - } else { - return(!buf_page_peek_if_young(bpage)); - } -} #endif /* !UNIV_HOTBACKUP */ /*********************************************************************//** @@ -244,6 +206,32 @@ { return(buf_page_get_state(&block->page)); } + +#ifdef UNIV_DEBUG +/** Assert that a given buffer pool page is private to the caller: no pointers +to it exist in any buffer pool list or hash table. Accessing pages by iterating +over buffer pool chunks is not considered here. Furthermore, assert that no +buffer pool locks except for LRU list mutex and page hash are held. +@param[in] bpage pointer to a buffer pool page */ +UNIV_INLINE +bool +buf_page_is_private( + const buf_page_t* bpage) +{ + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_a(!bpage->in_page_hash); + ut_a(!bpage->in_zip_hash); + ut_a(!bpage->in_flush_list); + ut_a(!bpage->in_free_list); + ut_a(!bpage->in_LRU_list); + ut_a(!mutex_own(buf_page_get_mutex(bpage))); + ut_a(!mutex_own(&buf_pool->free_list_mutex)); + ut_a(!mutex_own(&buf_pool->zip_free_mutex)); + ut_a(!mutex_own(&buf_pool->zip_hash_mutex)); + return(true); +} +#endif + /*********************************************************************//** Sets the state of a block. */ UNIV_INLINE @@ -255,6 +243,7 @@ { #ifdef UNIV_DEBUG enum buf_page_state old_state = buf_page_get_state(bpage); + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); switch (old_state) { case BUF_BLOCK_POOL_WATCH: @@ -265,21 +254,36 @@ break; case BUF_BLOCK_ZIP_DIRTY: ut_a(state == BUF_BLOCK_ZIP_PAGE); + ut_a(mutex_own(buf_page_get_mutex(bpage))); + ut_a(buf_flush_list_mutex_own(buf_pool)); + ut_a(bpage->in_flush_list); break; case BUF_BLOCK_NOT_USED: ut_a(state == BUF_BLOCK_READY_FOR_USE); + ut_a(buf_page_is_private(bpage)); break; case BUF_BLOCK_READY_FOR_USE: ut_a(state == BUF_BLOCK_MEMORY || state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_NOT_USED); + ut_a(buf_page_is_private(bpage)); break; case BUF_BLOCK_MEMORY: ut_a(state == BUF_BLOCK_NOT_USED); + ut_a(buf_page_is_private(bpage)); break; case BUF_BLOCK_FILE_PAGE: ut_a(state == BUF_BLOCK_NOT_USED || state == BUF_BLOCK_REMOVE_HASH); + if (state == BUF_BLOCK_REMOVE_HASH) { + ut_a(!bpage->in_page_hash); + ut_a(!bpage->in_zip_hash); + ut_a(!bpage->in_LRU_list); + ut_a(!bpage->in_free_list); + ut_a(mutex_own(buf_page_get_mutex(bpage))); + ut_a(mutex_own(&buf_pool->LRU_list_mutex)); + ut_a(buf_page_hash_lock_held_x(buf_pool, bpage)); + } break; case BUF_BLOCK_REMOVE_HASH: ut_a(state == BUF_BLOCK_MEMORY); @@ -427,6 +431,21 @@ /*================*/ const buf_page_t* bpage) /*!< in: pointer to the control block */ { + ut_ad(mutex_own(buf_page_get_mutex(bpage))); + return buf_page_get_io_fix_unlocked(bpage); +} + +/** Gets the io_fix state of a buffer page. Does not assert that the +buf_page_get_mutex() mutex is held, to be used in the cases where it is safe +not to hold it. +@param[in] pointer to the buffer page +@return page io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_page_get_io_fix_unlocked( +/*=========================*/ + const buf_page_t* bpage) +{ ut_ad(bpage != NULL); enum buf_io_fix io_fix = bpage->io_fix; @@ -456,6 +475,20 @@ return(buf_page_get_io_fix(&block->page)); } +/** Gets the io_fix state of a buffer block. Does not assert that the +buf_page_get_mutex() mutex is held, to be used in the cases where it is safe +not to hold it. 
+@param[in] pointer to the buffer block +@return page io_fix state */ +UNIV_INLINE +enum buf_io_fix +buf_block_get_io_fix_unlocked( +/*==========================*/ + const buf_block_t* block) +{ + return(buf_page_get_io_fix_unlocked(&block->page)); +} + /*********************************************************************//** Sets the io_fix state of a block. */ UNIV_INLINE @@ -465,10 +498,6 @@ buf_page_t* bpage, /*!< in/out: control block */ enum buf_io_fix io_fix) /*!< in: io_fix state */ { -#ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); -#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); bpage->io_fix = io_fix; @@ -489,7 +518,7 @@ /*********************************************************************//** Makes a block sticky. A sticky block implies that even after we release -the buf_pool->mutex and the block->mutex: +the buf_pool->LRU_list_mutex and the block->mutex: * it cannot be removed from the flush_list * the block descriptor cannot be relocated * it cannot be removed from the LRU list @@ -504,10 +533,11 @@ { #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); #endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE); + ut_ad(bpage->in_LRU_list); bpage->io_fix = BUF_IO_PIN; } @@ -520,10 +550,6 @@ /*==================*/ buf_page_t* bpage) /*!< in/out: control block */ { -#ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); -#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN); @@ -539,10 +565,6 @@ /*==================*/ const buf_page_t* bpage) /*!< control block being relocated */ { -#ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); -#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); @@ -562,7 +584,11 @@ { #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(buf_pool_mutex_own(buf_pool)); + /* Buffer page mutex is not strictly required here for heuristic + purposes even if LRU mutex is not being held. Keep the assertion + for now since all the callers hold it. */ + ut_ad(mutex_own(buf_page_get_mutex(bpage)) + || mutex_own(&buf_pool->LRU_list_mutex)); #endif ut_ad(buf_page_in_file(bpage)); @@ -582,7 +608,7 @@ buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); #endif /* UNIV_DEBUG */ ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); ut_ad(bpage->in_LRU_list); #ifdef UNIV_LRU_DEBUG @@ -627,11 +653,7 @@ /*==================*/ buf_page_t* bpage) /*!< in/out: control block */ { -#ifdef UNIV_DEBUG - buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); - ut_ad(!buf_pool_mutex_own(buf_pool)); ut_ad(mutex_own(buf_page_get_mutex(bpage))); -#endif /* UNIV_DEBUG */ ut_a(buf_page_in_file(bpage)); @@ -643,7 +665,10 @@ /*********************************************************************//** Gets the buf_block_t handle of a buffered file block if an uncompressed -page frame exists, or NULL. +page frame exists, or NULL. page frame exists, or NULL. The caller must hold +either the appropriate hash lock in any mode, either the LRU list mutex. Note: +even though bpage is not declared a const we don't update its value. It is safe +to make this pure. 
@return control block, or NULL */ UNIV_INLINE buf_block_t* @@ -652,6 +677,11 @@ buf_page_t* bpage) /*!< in: control block, or NULL */ { if (bpage != NULL) { +#ifdef UNIV_DEBUG + buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); + ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage) + || mutex_own(&buf_pool->LRU_list_mutex)); +#endif ut_ad(buf_page_in_file(bpage)); if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { @@ -809,19 +839,9 @@ /*===========*/ buf_block_t* block) /*!< in, own: block to be freed */ { - buf_pool_t* buf_pool = buf_pool_from_bpage((buf_page_t*) block); - - buf_pool_mutex_enter(buf_pool); - - buf_page_mutex_enter(block); - ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); buf_LRU_block_free_non_file_page(block); - - buf_page_mutex_exit(block); - - buf_pool_mutex_exit(buf_pool); } #endif /* !UNIV_HOTBACKUP */ @@ -872,8 +892,8 @@ /********************************************************************//** Increments the modify clock of a frame by 1. The caller must (1) own the -buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock -on the block. */ +buf_pool->LRU_list_mutex and block bufferfix count has to be zero, (2) or own +an x-lock on the block, (3) or the block must belong to an intrinsic table. */ UNIV_INLINE void buf_block_modify_clock_inc( @@ -885,7 +905,7 @@ /* No latch is acquired if block belongs to intrinsic table. */ if (!fsp_is_system_temporary(block->page.id.space())) { - ut_ad((buf_pool_mutex_own(buf_pool) + ut_ad((mutex_own(&buf_pool->LRU_list_mutex) && (block->page.buf_fix_count == 0)) || rw_lock_own_flagged(&block->lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); @@ -973,6 +993,7 @@ buf_block_unfix( buf_page_t* bpage) { + ut_ad(!mutex_own(buf_page_get_mutex(bpage))); ulint count = os_atomic_decrement_uint32(&bpage->buf_fix_count, 1); ut_ad(count + 1 != 0); return(count); @@ -1122,12 +1143,10 @@ if (mode == RW_LOCK_S) { rw_lock_s_lock(hash_lock); - /* If not own buf_pool_mutex, page_hash can be changed. */ hash_lock = hash_lock_s_confirm( hash_lock, buf_pool->page_hash, page_id.fold()); } else { rw_lock_x_lock(hash_lock); - /* If not own buf_pool_mutex, page_hash can be changed. */ hash_lock = hash_lock_x_confirm( hash_lock, buf_pool->page_hash, page_id.fold()); } @@ -1317,36 +1336,6 @@ } #endif /* UNIV_SYNC_DEBUG */ -/********************************************************************//** -Acquire mutex on all buffer pool instances. */ -UNIV_INLINE -void -buf_pool_mutex_enter_all(void) -/*==========================*/ -{ - for (ulint i = 0; i < srv_buf_pool_instances; ++i) { - buf_pool_t* buf_pool = buf_pool_from_array(i); - - buf_pool_mutex_enter(buf_pool); - } -} - -/********************************************************************//** -Release mutex on all buffer pool instances. */ -UNIV_INLINE -void -buf_pool_mutex_exit_all(void) -/*=========================*/ -{ - ulint i; - - for (i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool; - - buf_pool = buf_pool_from_array(i); - buf_pool_mutex_exit(buf_pool); - } -} /*********************************************************************//** Get the nth chunk's buffer block in the specified buffer pool. @return the nth chunk's buffer block. */ @@ -1396,4 +1385,24 @@ } } +/** Return how many more pages must be added to the withdraw list to reach the +withdraw target of the currently ongoing buffer pool resize. 
+@param[in] buf_pool buffer pool instance +@return page count to be withdrawn or zero if the target is already achieved or +if the buffer pool is not currently being resized. */ +UNIV_INLINE +ulint +buf_get_withdraw_depth( + buf_pool_t* buf_pool) +{ + os_rmb; + if (UNIV_LIKELY(buf_pool->curr_size >= buf_pool->old_size)) + return 0; + mutex_enter(&buf_pool->free_list_mutex); + ulint withdraw_len = UT_LIST_GET_LEN(buf_pool->withdraw); + mutex_exit(&buf_pool->free_list_mutex); + return(buf_pool->withdraw_target > withdraw_len + ? buf_pool->withdraw_target - withdraw_len : 0); +} + #endif /* !UNIV_HOTBACKUP */ === modified file 'storage/innobase/include/buf0flu.h' --- storage/innobase/include/buf0flu.h 2014-11-04 13:39:53 +0000 +++ storage/innobase/include/buf0flu.h 2015-01-16 19:30:41 +0000 @@ -79,10 +79,10 @@ # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG /********************************************************************//** Writes a flushable page asynchronously from the buffer pool to a file. -NOTE: buf_pool->mutex and block->mutex must be held upon entering this -function, and they will be released by this function after flushing. -This is loosely based on buf_flush_batch() and buf_flush_page(). -@return TRUE if the page was flushed and the mutexes released */ +NOTE: block and LRU list mutexes must be held upon entering this function, and +they will be released by this function after flushing. This is loosely based on +buf_flush_batch() and buf_flush_page(). +@return TRUE if the page was flushed and the mutex released */ ibool buf_flush_page_try( @@ -194,7 +194,8 @@ set of mtr's */ /********************************************************************//** Returns TRUE if the file page block is immediately suitable for replacement, -i.e., transition FILE_PAGE => NOT_USED allowed. +i.e., the transition FILE_PAGE => NOT_USED allowed. The caller must hold the +LRU list and block mutexes. @return TRUE if can replace immediately */ ibool @@ -265,9 +266,10 @@ Writes a flushable page asynchronously from the buffer pool to a file. NOTE: in simulated aio we must call os_aio_simulated_wake_handler_threads after we have posted a batch of -writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be -held upon entering this function, and they will be released by this -function. +writes! NOTE: buf_page_get_mutex(bpage) must be held upon entering this +function. The LRU list mutex must be held iff flush_type +== BUF_FLUSH_SINGLE_PAGE. Both mutexes will be released by this function if it +returns true. @return TRUE if page was flushed */ ibool === modified file 'storage/innobase/include/buf0flu.ic' --- storage/innobase/include/buf0flu.ic 2014-03-05 05:59:56 +0000 +++ storage/innobase/include/buf0flu.ic 2015-01-16 19:30:41 +0000 @@ -75,7 +75,6 @@ buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(!buf_pool_mutex_own(buf_pool)); ut_ad(!buf_flush_list_mutex_own(buf_pool)); } #endif /* UNIV_DEBUG */ @@ -118,7 +117,6 @@ buf_pool_t* buf_pool = buf_pool_from_block(block); - ut_ad(!buf_pool_mutex_own(buf_pool)); ut_ad(!buf_flush_list_mutex_own(buf_pool)); ut_ad(start_lsn != 0); === modified file 'storage/innobase/include/buf0lru.h' --- storage/innobase/include/buf0lru.h 2013-11-04 14:38:22 +0000 +++ storage/innobase/include/buf0lru.h 2015-01-16 19:30:41 +0000 @@ -79,12 +79,13 @@ Try to free a block. If bpage is a descriptor of a compressed-only page, the descriptor object will be freed as well. -NOTE: If this function returns true, it will temporarily -release buf_pool->mutex. 
Furthermore, the page frame will no longer be -accessible via bpage. - -The caller must hold buf_pool->mutex and must not hold any -buf_page_get_mutex() when calling this function. +NOTE: this function may temporarily release and relock the +buf_page_get_get_mutex(). Furthermore, the page frame will no longer be +accessible via bpage. If this function returns true, it will also release +the LRU list mutex. + +The caller must hold the LRU list and buf_page_get_mutex() mutexes. + @return true if freed, false otherwise. */ bool @@ -93,7 +94,7 @@ buf_page_t* bpage, /*!< in: block to be freed */ bool zip) /*!< in: true if should remove also the compressed page of an uncompressed page */ - __attribute__((nonnull)); + __attribute__((nonnull, warn_unused_result)); /******************************************************************//** Try to free a replaceable block. @return true if found and freed */ @@ -218,14 +219,18 @@ /*=====================*/ /******************************************************************//** -Remove one page from LRU list and put it to free list */ +Remove one page from LRU list and put it to free list. The caller must hold the +LRU list and block mutexes and have page hash latched in X. The latch and +the block mutexes will be released. */ void buf_LRU_free_one_page( /*==================*/ - buf_page_t* bpage) /*!< in/out: block, must contain a file page and + buf_page_t* bpage, /*!< in/out: block, must contain a file page and be in a state where it can be freed; there may or may not be a hash index to the page */ + bool zip = true)/*!< in: true if should remove also the + compressed page of an uncompressed page */ __attribute__((nonnull)); /******************************************************************//** @@ -297,7 +302,7 @@ extern buf_LRU_stat_t buf_LRU_stat_cur; /** Running sum of past values of buf_LRU_stat_cur. -Updated by buf_LRU_stat_update(). Protected by buf_pool->mutex. */ +Updated by buf_LRU_stat_update(). Accesses protected by memory barriers. */ extern buf_LRU_stat_t buf_LRU_stat_sum; /********************************************************************//** === modified file 'storage/innobase/include/buf0types.h' --- storage/innobase/include/buf0types.h 2014-07-04 03:01:03 +0000 +++ storage/innobase/include/buf0types.h 2015-01-16 19:30:41 +0000 @@ -119,8 +119,7 @@ #ifndef UNIV_INNOCHECKSUM typedef ib_mutex_t BPageMutex; -typedef ib_mutex_t BufPoolMutex; -typedef ib_mutex_t FlushListMutex; +typedef ib_mutex_t BufListMutex; #endif /* !UNIV_INNOCHECKSUM */ #endif /* buf0types.h */ === modified file 'storage/innobase/include/srv0srv.h' --- storage/innobase/include/srv0srv.h 2014-08-26 17:48:07 +0000 +++ storage/innobase/include/srv0srv.h 2015-01-16 19:30:41 +0000 @@ -289,7 +289,7 @@ extern ulong srv_LRU_scan_depth; /** Whether or not to flush neighbors of a block */ extern ulong srv_flush_neighbors; -/** Previously requested size */ +/** Previously requested size. Accesses protected by memory barriers. 
*/ extern ulint srv_buf_pool_old_size; /** Current size as scaling factor for the other components */ extern ulint srv_buf_pool_base_size; === modified file 'storage/innobase/include/sync0sync.h' --- storage/innobase/include/sync0sync.h 2014-06-24 13:49:46 +0000 +++ storage/innobase/include/sync0sync.h 2015-01-16 19:30:41 +0000 @@ -52,7 +52,11 @@ /* Key defines to register InnoDB mutexes with performance schema */ extern mysql_pfs_key_t autoinc_mutex_key; extern mysql_pfs_key_t buffer_block_mutex_key; -extern mysql_pfs_key_t buf_pool_mutex_key; +extern mysql_pfs_key_t buf_pool_flush_state_mutex_key; +extern mysql_pfs_key_t buf_pool_LRU_list_mutex_key; +extern mysql_pfs_key_t buf_pool_free_list_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_free_mutex_key; +extern mysql_pfs_key_t buf_pool_zip_hash_mutex_key; extern mysql_pfs_key_t buf_pool_zip_mutex_key; extern mysql_pfs_key_t cache_last_read_mutex_key; extern mysql_pfs_key_t dict_foreign_err_mutex_key; === modified file 'storage/innobase/include/sync0types.h' --- storage/innobase/include/sync0types.h 2014-08-14 05:00:58 +0000 +++ storage/innobase/include/sync0types.h 2015-01-16 19:30:41 +0000 @@ -186,7 +186,7 @@ Search system mutex | V -Buffer pool mutex +Buffer pool mutexes | V Log mutex @@ -217,11 +217,13 @@ SYNC_DOUBLEWRITE, SYNC_BUF_FLUSH_LIST, - + SYNC_BUF_FLUSH_STATE, + SYNC_BUF_ZIP_HASH, + SYNC_BUF_FREE_LIST, + SYNC_BUF_ZIP_FREE, SYNC_BUF_BLOCK, SYNC_BUF_PAGE_HASH, - - SYNC_BUF_POOL, + SYNC_BUF_LRU_LIST, SYNC_POOL, SYNC_POOL_MANAGER, === modified file 'storage/innobase/lock/lock0lock.cc' --- storage/innobase/lock/lock0lock.cc 2014-09-02 07:56:28 +0000 +++ storage/innobase/lock/lock0lock.cc 2015-01-16 19:30:41 +0000 @@ -504,7 +504,7 @@ for (ulint i = 0; i < srv_buf_pool_instances; ++i) { buf_pool_t* buf_pool = buf_pool_from_array(i); - buf_pool_mutex_enter(buf_pool); + mutex_enter(&buf_pool->LRU_list_mutex); buf_page_t* bpage; bpage = UT_LIST_GET_FIRST(buf_pool->LRU); @@ -522,7 +522,7 @@ } bpage = UT_LIST_GET_NEXT(LRU, bpage); } - buf_pool_mutex_exit(buf_pool); + mutex_exit(&buf_pool->LRU_list_mutex); } lock_mutex_exit(); === modified file 'storage/innobase/srv/srv0srv.cc' --- storage/innobase/srv/srv0srv.cc 2014-08-26 17:48:07 +0000 +++ storage/innobase/srv/srv0srv.cc 2015-01-16 19:30:41 +0000 @@ -208,7 +208,7 @@ ulong srv_LRU_scan_depth = 1024; /** Whether or not to flush neighbors of a block */ ulong srv_flush_neighbors = 1; -/** Previously requested size */ +/** Previously requested size. Accesses protected by memory barriers. */ ulint srv_buf_pool_old_size = 0; /** Current size as scaling factor for the other components */ ulint srv_buf_pool_base_size = 0; === modified file 'storage/innobase/sync/sync0debug.cc' --- storage/innobase/sync/sync0debug.cc 2014-08-19 05:43:25 +0000 +++ storage/innobase/sync/sync0debug.cc 2015-01-16 19:30:41 +0000 @@ -638,7 +638,11 @@ break; case SYNC_BUF_FLUSH_LIST: - case SYNC_BUF_POOL: + case SYNC_BUF_LRU_LIST: + case SYNC_BUF_FREE_LIST: + case SYNC_BUF_ZIP_FREE: + case SYNC_BUF_ZIP_HASH: + case SYNC_BUF_FLUSH_STATE: /* We can have multiple mutexes of this type therefore we can only check whether the greater than condition holds. */ @@ -647,22 +651,10 @@ break; case SYNC_BUF_PAGE_HASH: - - /* Multiple page_hash locks are only allowed during - buf_validate and that is where buf_pool mutex is already - held. 
*/ - - /* Fall through */ - case SYNC_BUF_BLOCK: - /* Either the thread must own the (buffer pool) buf_pool->mutex - or it is allowed to latch only ONE of (buffer block) - block->mutex or buf_pool->zip_mutex. */ - if (less(latches, latch->m_level) != 0) { basic_check(latches, latch->m_level - 1); - ut_a(find(latches, SYNC_BUF_POOL) != 0); } break; @@ -886,9 +878,25 @@ buffer_block_mutex_key); #endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ - LATCH_ADD(SrvLatches, "buf_pool", - SYNC_BUF_POOL, - buf_pool_mutex_key); + LATCH_ADD(SrvLatches, "buf_pool_lru_list", + SYNC_BUF_LRU_LIST, + buf_pool_LRU_list_mutex_key); + + LATCH_ADD(SrvLatches, "buf_pool_free_list", + SYNC_BUF_FREE_LIST, + buf_pool_free_list_mutex_key); + + LATCH_ADD(SrvLatches, "buf_pool_zip_free", + SYNC_BUF_ZIP_FREE, + buf_pool_zip_free_mutex_key); + + LATCH_ADD(SrvLatches, "buf_pool_zip_hash", + SYNC_BUF_ZIP_HASH, + buf_pool_zip_free_mutex_key); + + LATCH_ADD(SrvLatches, "buf_pool_flush_state", + SYNC_BUF_FLUSH_STATE, + buf_pool_flush_state_mutex_key); LATCH_ADD(SrvLatches, "buf_pool_zip", SYNC_BUF_BLOCK, === modified file 'storage/innobase/sync/sync0sync.cc' --- storage/innobase/sync/sync0sync.cc 2014-07-10 10:46:02 +0000 +++ storage/innobase/sync/sync0sync.cc 2015-01-16 19:30:41 +0000 @@ -38,7 +38,11 @@ /* Key to register autoinc_mutex with performance schema */ mysql_pfs_key_t autoinc_mutex_key; mysql_pfs_key_t buffer_block_mutex_key; -mysql_pfs_key_t buf_pool_mutex_key; +mysql_pfs_key_t buf_pool_flush_state_mutex_key; +mysql_pfs_key_t buf_pool_LRU_list_mutex_key; +mysql_pfs_key_t buf_pool_free_list_mutex_key; +mysql_pfs_key_t buf_pool_zip_free_mutex_key; +mysql_pfs_key_t buf_pool_zip_hash_mutex_key; mysql_pfs_key_t buf_pool_zip_mutex_key; mysql_pfs_key_t cache_last_read_mutex_key; mysql_pfs_key_t dict_foreign_err_mutex_key;
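
For readers following the locking changes above: the recurring pattern in the patch is that a call site which previously took the single buffer pool mutex now takes only the mutex protecting the list it actually touches (for example, the lock0lock.cc hunk switches the LRU scan from buf_pool_mutex_enter to mutex_enter(&buf_pool->LRU_list_mutex)). Below is a minimal standalone C++ model of that idea, not InnoDB code; buf_pool_model, count_lru_pages and free_list_push are illustrative stand-ins, and the real buf_pool_t fields and types differ.

#include <list>
#include <mutex>

/* Simplified stand-in for buf_pool_t: one mutex per list instead of a
   single pool-wide mutex (model only). */
struct buf_pool_model {
	std::mutex	LRU_list_mutex;   /* protects LRU */
	std::mutex	free_list_mutex;  /* protects free list / withdraw list */
	std::list<int>	LRU;              /* page ids, stand-in for buf_page_t */
	std::list<int>	free_list;
};

/* Scan the LRU list: only the LRU list mutex is needed, mirroring the
   lock0lock.cc hunk, where the old code took the pool-wide mutex. */
int count_lru_pages(buf_pool_model& pool)
{
	std::lock_guard<std::mutex> guard(pool.LRU_list_mutex);
	return static_cast<int>(pool.LRU.size());
}

/* A thread replenishing the free list takes only free_list_mutex and
   therefore no longer blocks concurrent LRU scans. */
void free_list_push(buf_pool_model& pool, int page_id)
{
	std::lock_guard<std::mutex> guard(pool.free_list_mutex);
	pool.free_list.push_back(page_id);
}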
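
The new buf_get_withdraw_depth() helper in the buf0buf.ic hunk first checks (after os_rmb) whether the pool is shrinking at all, then reads the withdraw list length under free_list_mutex and returns how many pages are still missing from the resize target. A rough standalone sketch of the same arithmetic follows; pool_resize_model and withdraw_depth are hypothetical names, and std::atomic/std::mutex stand in for the actual memory barriers and BufListMutex used in the patch.

#include <atomic>
#include <mutex>

struct pool_resize_model {
	std::atomic<unsigned long>	curr_size{0};
	std::atomic<unsigned long>	old_size{0};
	std::mutex			free_list_mutex;
	unsigned long			withdraw_len = 0;     /* protected by free_list_mutex */
	unsigned long			withdraw_target = 0;  /* set once per resize */
};

/* Pages still to be withdrawn, or zero if the pool is not shrinking or
   the target has already been reached. */
unsigned long withdraw_depth(pool_resize_model& p)
{
	if (p.curr_size.load() >= p.old_size.load()) {
		return 0;	/* not shrinking */
	}
	std::lock_guard<std::mutex> guard(p.free_list_mutex);
	return p.withdraw_target > p.withdraw_len
		? p.withdraw_target - p.withdraw_len : 0;
}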