Description:
The function buf_LRU_check_size_of_non_data_objects attempts to verify the size of the buffer pool without acquiring a mutex lock. This lack of synchronization may lead to a critical assertion failure if a concurrent resizing is in progress.
This issue is difficult to reproduce in release builds, but we did encounter it in our environment.
How to repeat:
Add some debug code:
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index d3eb0adbb8e..92d32299ce9 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -90,6 +90,8 @@ this program; if not, write to the Free Software Foundation, Inc.,
#ifdef UNIV_DEBUG
#include "ut0stateful_latching_rules.h"
+#include "current_thd.h"
+#include "debug_sync.h"
#endif /* UNIV_DEBUG */
#ifdef HAVE_LIBNUMA
@@ -2208,8 +2210,27 @@ static void buf_pool_resize() {
ut_ad(buf_pool->n_chunks_new == buf_pool->n_chunks);
ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
+ DBUG_EXECUTE_IF("buf_pool_resize_simulate_wait", {
+ const char act[] =
+ "now SIGNAL "
+ "buf_pool_resize_stopped_before_setting_new_size "
+ "WAIT_FOR "
+ "buf_pool_resize_update_new_size";
+ assert(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act)));
+ });
+
buf_pool->curr_size = new_instance_size;
+ DBUG_EXECUTE_IF("buf_pool_resize_simulate_wait", {
+ const char act[] =
+ "now SIGNAL "
+ "buf_pool_resize_stopped_after_setting_new_size "
+ "WAIT_FOR "
+ "buf_pool_resize_continue";
+ assert(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act)));
+ DBUG_SET("-d,buf_pool_resize_simulate_wait");
+ });
+
ut_ad(srv_buf_pool_chunk_unit % UNIV_PAGE_SIZE == 0);
buf_pool->n_chunks_new =
new_instance_size * UNIV_PAGE_SIZE / srv_buf_pool_chunk_unit;
@@ -2671,6 +2692,9 @@ withdraw_retry:
/** This is the thread for resizing buffer pool. It waits for an event and
when waked up either performs a resizing and sleeps again. */
void buf_resize_thread() {
+#ifdef UNIV_DEBUG
+ THD *thd = create_internal_thd();
+#endif
while (srv_shutdown_state.load() < SRV_SHUTDOWN_CLEANUP) {
os_event_wait(srv_buf_resize_event);
os_event_reset(srv_buf_resize_event);
@@ -2694,6 +2718,9 @@ void buf_resize_thread() {
buf_pool_resize();
}
+#ifdef UNIV_DEBUG
+ destroy_internal_thd(thd);
+#endif
}
/** Clears the adaptive hash index on all pages in the buffer pool. */
diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc
index 864eac435c7..1a184df1183 100644
--- a/storage/innobase/buf/buf0lru.cc
+++ b/storage/innobase/buf/buf0lru.cc
@@ -56,6 +56,10 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "trx0trx.h"
#include "ut0byte.h"
#include "ut0rnd.h"
+#ifdef UNIV_DEBUG
+#include "current_thd.h"
+#include "debug_sync.h"
+#endif
/** The number of blocks from the LRU_old pointer onward, including
the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
@@ -1256,7 +1260,22 @@ static void buf_LRU_check_size_of_non_data_objects(
{
const size_t mb = (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE));
+#ifdef UNIV_DEBUG
+ auto buf_check_size_debug_wait = []() {
+ const char act[] =
+ "now SIGNAL "
+ "buf_pool_resize_update_new_size "
+ "WAIT_FOR "
+ "buf_pool_resize_stopped_after_setting_new_size";
+ assert(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act)));
+ DBUG_SET("-d,buf_check_size_simulate_wait");
+ return true;
+ };
+#endif
+
if (!recv_recovery_is_on() && buf_pool->curr_size == buf_pool->old_size &&
+ DBUG_EVALUATE_IF("buf_check_size_simulate_wait",
+ buf_check_size_debug_wait(), true) &&
UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) <
buf_pool->curr_size / 20) {
const bool buf_pool_full = true;
Run test script:
source include/have_debug.inc;
source include/have_debug_sync.inc;
--let $restart_parameters="restart: --innodb-buffer-pool-size=5M --innodb-buffer_pool_instances=1"
--source include/restart_mysqld.inc
--connect(con1,localhost,root)
--connection con1
set global debug='+d,buf_pool_resize_simulate_wait';
send set global innodb_buffer_pool_size = 120*1024*1024;
connection default;
set debug_sync='now wait_for buf_pool_resize_stopped_before_setting_new_size';
set debug='+d,buf_check_size_simulate_wait';
create table t1(id int,c1 int);
insert into t1 values(1,1);
set debug_sync='now signal buf_pool_resize_continue';
connection con1;
reap;
--source include/wait_condition.inc
connection default;
disconnect con1;
--source include/restart_mysqld.inc
drop table t1;
Suggested fix:
1. Eliminate the unsafe assertion.
2. Before executing the assertion logic, immediately recheck the buffer pool size. If the size has changed (indicating an ongoing resize), skip the assertion to prevent failures.
Description: The function buf_LRU_check_size_of_non_data_objects attempts to verify the size of the buffer pool without acquiring a mutex lock. This lack of synchronization may lead to a critical assertion failure if a concurrent resizing is in progress. This issue is difficult to reproduce in release builds, but we did encounter it in our environment. How to repeat: Add some debug code: diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index d3eb0adbb8e..92d32299ce9 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -90,6 +90,8 @@ this program; if not, write to the Free Software Foundation, Inc., #ifdef UNIV_DEBUG #include "ut0stateful_latching_rules.h" +#include "current_thd.h" +#include "debug_sync.h" #endif /* UNIV_DEBUG */ #ifdef HAVE_LIBNUMA @@ -2208,8 +2210,27 @@ static void buf_pool_resize() { ut_ad(buf_pool->n_chunks_new == buf_pool->n_chunks); ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0); + DBUG_EXECUTE_IF("buf_pool_resize_simulate_wait", { + const char act[] = + "now SIGNAL " + "buf_pool_resize_stopped_before_setting_new_size " + "WAIT_FOR " + "buf_pool_resize_update_new_size"; + assert(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act))); + }); + buf_pool->curr_size = new_instance_size; + DBUG_EXECUTE_IF("buf_pool_resize_simulate_wait", { + const char act[] = + "now SIGNAL " + "buf_pool_resize_stopped_after_setting_new_size " + "WAIT_FOR " + "buf_pool_resize_continue"; + assert(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act))); + DBUG_SET("-d,buf_pool_resize_simulate_wait"); + }); + ut_ad(srv_buf_pool_chunk_unit % UNIV_PAGE_SIZE == 0); buf_pool->n_chunks_new = new_instance_size * UNIV_PAGE_SIZE / srv_buf_pool_chunk_unit; @@ -2671,6 +2692,9 @@ withdraw_retry: /** This is the thread for resizing buffer pool. It waits for an event and when waked up either performs a resizing and sleeps again. */ void buf_resize_thread() { +#ifdef UNIV_DEBUG + THD *thd = create_internal_thd(); +#endif while (srv_shutdown_state.load() < SRV_SHUTDOWN_CLEANUP) { os_event_wait(srv_buf_resize_event); os_event_reset(srv_buf_resize_event); @@ -2694,6 +2718,9 @@ void buf_resize_thread() { buf_pool_resize(); } +#ifdef UNIV_DEBUG + destroy_internal_thd(thd); +#endif } /** Clears the adaptive hash index on all pages in the buffer pool. */ diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 864eac435c7..1a184df1183 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -56,6 +56,10 @@ this program; if not, write to the Free Software Foundation, Inc., #include "trx0trx.h" #include "ut0byte.h" #include "ut0rnd.h" +#ifdef UNIV_DEBUG +#include "current_thd.h" +#include "debug_sync.h" +#endif /** The number of blocks from the LRU_old pointer onward, including the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV @@ -1256,7 +1260,22 @@ static void buf_LRU_check_size_of_non_data_objects( { const size_t mb = (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE)); +#ifdef UNIV_DEBUG + auto buf_check_size_debug_wait = []() { + const char act[] = + "now SIGNAL " + "buf_pool_resize_update_new_size " + "WAIT_FOR " + "buf_pool_resize_stopped_after_setting_new_size"; + assert(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act))); + DBUG_SET("-d,buf_check_size_simulate_wait"); + return true; + }; +#endif + if (!recv_recovery_is_on() && buf_pool->curr_size == buf_pool->old_size && + DBUG_EVALUATE_IF("buf_check_size_simulate_wait", + buf_check_size_debug_wait(), true) && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { const bool buf_pool_full = true; Run test script: source include/have_debug.inc; source include/have_debug_sync.inc; --let $restart_parameters="restart: --innodb-buffer-pool-size=5M --innodb-buffer_pool_instances=1" --source include/restart_mysqld.inc --connect(con1,localhost,root) --connection con1 set global debug='+d,buf_pool_resize_simulate_wait'; send set global innodb_buffer_pool_size = 120*1024*1024; connection default; set debug_sync='now wait_for buf_pool_resize_stopped_before_setting_new_size'; set debug='+d,buf_check_size_simulate_wait'; create table t1(id int,c1 int); insert into t1 values(1,1); set debug_sync='now signal buf_pool_resize_continue'; connection con1; reap; --source include/wait_condition.inc connection default; disconnect con1; --source include/restart_mysqld.inc drop table t1; Suggested fix: 1. Eliminate the unsafe assertion. 2. Before executing the assertion logic, immediately recheck the buffer pool size. If the size has changed (indicating an ongoing resize), skip the assertion to prevent failures.