Description:
The server hung while the buffer pool size was being changed from 16G to 4G in a production environment.
Stack 1: waiting on mutex &buf_pool->zip_free_mutex
312 Thread 155 (Thread 0x7f891e3f7700 (LWP 107754)):
313 #0 0x00007f8daa446945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
314 #1 0x00000000010421db in os_event::wait_low(long) ()
315 #2 0x0000000001108280 in sync_array_wait_event(sync_array_t*, sync_cell_t*&) ()
316 #3 0x00000000012b59b6 in buf_buddy_free_low(buf_pool_t*, void*, unsigned long) ()
317 #4 0x00000000012b61b2 in buf_buddy_condense_free(buf_pool_t*) ()
318 #5 0x00000000011a4998 in buf_pool_resize() ()
319 #6 0x00000000011a6093 in buf_resize_thread ()
320 #7 0x00007f8daa442e25 in start_thread () from /lib64/libpthread.so.0
321 #8 0x00007f8da861434d in clone () from /lib64/libc.so.6
Stack 2: also waiting on mutex &buf_pool->zip_free_mutex
199 Thread 172 (Thread 0x7f89307e9700 (LWP 107735)):
200 #0 0x00007f8daa446945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
201 #1 0x00000000010421db in os_event::wait_low(long) ()
202 #2 0x0000000001108280 in sync_array_wait_event(sync_array_t*, sync_cell_t*&) ()
203 #3 0x00000000012b59b6 in buf_buddy_free_low(buf_pool_t*, void*, unsigned long) ()
204 #4 0x00000000011c4544 in buf_LRU_block_remove_hashed(buf_page_t*, bool) ()
205 #5 0x00000000011c81b6 in buf_LRU_free_page(buf_page_t*, bool) ()
206 #6 0x00000000011bd349 in buf_flush_do_batch(buf_pool_t*, buf_flush_t, unsigned long, unsigned long, unsigned long*) ()
207 #7 0x00000000011be302 in buf_lru_manager ()
208 #8 0x00007f8daa442e25 in start_thread () from /lib64/libpthread.so.0
209 #9 0x00007f8da861434d in clone () from /lib64/libc.so.6
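Code analysis:
buf_pool_resize() calls buf_buddy_condense_free() to free buddy pages, and buf_buddy_condense_free() acquires &buf_pool->zip_free_mutex. While still holding that mutex, it calls buf_buddy_free_low(), which also does mutex_enter(&buf_pool->zip_free_mutex), so the resize thread deadlocks on itself (stack 1). Because the mutex is then never released, any other thread that needs it, such as the LRU manager thread in stack 2, blocks as well.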
void
buf_buddy_condense_free(
buf_pool_t* buf_pool)
{
//first owned zip_free_mutex
mutex_enter(&buf_pool->zip_free_mutex);
...
if (buf_buddy_is_free(buddy, i) == BUF_BUDDY_STATE_FREE) {
/* Both buf and buddy are free.
Try to combine them. */
buf_buddy_remove_from_free(buf_pool, buf, i);
os_atomic_increment_ulint(
&buf_pool->buddy_stat[i].used, 1);
buf_buddy_free_low(buf_pool, buf, i);
}
}
buf_buddy_free_low(...)
{
mutex_enter(&buf_pool->zip_free_mutex); //try enter buf_pool->zip_free_mutex again
}
How to repeat:
1. Run mysqld with innodb_buffer_pool_size=16G.
2. Create compressed tables with sysbench.
3. Run a stress (pressure) test.
4. Stop the test.
5. Resize the buffer pool to 4G.
The hang does not occur on every attempt; it reproduces with some probability.
Suggested fix:
//declare
void
buf_buddy_free_low(
/*===============*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
void* buf, /*!< in: block to be freed, must not be
pointed to by the buffer pool */
ulint i, /*!< in: index of buf_pool->zip_free[],
or BUF_BUDDY_SIZES */
bool need_mutex=true); /*!< in: do enter zip_free_mutex */
//func1
void
buf_buddy_free_low(
/*===============*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
void* buf, /*!< in: block to be freed, must not be
pointed to by the buffer pool */
ulint i, /*!< in: index of buf_pool->zip_free[],
or BUF_BUDDY_SIZES */
bool need_mutex) /*!< in: do enter zip_free_mutex */
{
...
if (need_mutex)
{
mutex_enter(&buf_pool->zip_free_mutex);
}
...
func_exit:
if (need_mutex)
{
mutex_exit(&buf_pool->zip_free_mutex);
}
...
}
//func2
void
buf_buddy_condense_free(
buf_pool_t* buf_pool)
{
//first owned zip_free_mutex
mutex_enter(&buf_pool->zip_free_mutex);
...
if (buf_buddy_is_free(buddy, i) == BUF_BUDDY_STATE_FREE) {
/* Both buf and buddy are free.
Try to combine them. */
buf_buddy_remove_from_free(buf_pool, buf, i);
os_atomic_increment_ulint(
&buf_pool->buddy_stat[i].used, 1);
buf_buddy_free_low(buf_pool, buf, i, false);
}
}
Description: The server was blocked when adjusting the buffer pool size from 16G to 4G in prod environment. stack1:wait mutex &buf_pool->zip_free_mutex 312 Thread 155 (Thread 0x7f891e3f7700 (LWP 107754)): 313 #0 0x00007f8daa446945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 314 #1 0x00000000010421db in os_event::wait_low(long) () 315 #2 0x0000000001108280 in sync_array_wait_event(sync_array_t*, sync_cell_t*&) () 316 #3 0x00000000012b59b6 in buf_buddy_free_low(buf_pool_t*, void*, unsigned long) () 317 #4 0x00000000012b61b2 in buf_buddy_condense_free(buf_pool_t*) () 318 #5 0x00000000011a4998 in buf_pool_resize() () 319 #6 0x00000000011a6093 in buf_resize_thread () 320 #7 0x00007f8daa442e25 in start_thread () from /lib64/libpthread.so.0 321 #8 0x00007f8da861434d in clone () from /lib64/libc.so.6 stack2: also wait &buf_pool->zip_free_mutex 199 Thread 172 (Thread 0x7f89307e9700 (LWP 107735)): 200 #0 0x00007f8daa446945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 201 #1 0x00000000010421db in os_event::wait_low(long) () 202 #2 0x0000000001108280 in sync_array_wait_event(sync_array_t*, sync_cell_t*&) () 203 #3 0x00000000012b59b6 in buf_buddy_free_low(buf_pool_t*, void*, unsigned long) () 204 #4 0x00000000011c4544 in buf_LRU_block_remove_hashed(buf_page_t*, bool) () 205 #5 0x00000000011c81b6 in buf_LRU_free_page(buf_page_t*, bool) () 206 #6 0x00000000011bd349 in buf_flush_do_batch(buf_pool_t*, buf_flush_t, unsigned long, unsigned long, unsigned long*) () 207 #7 0x00000000011be302 in buf_lru_manager () 208 #8 0x00007f8daa442e25 in start_thread () from /lib64/libpthread.so.0 209 #9 0x00007f8da861434d in clone () from /lib64/libc.so.6 code analyze: buf_pool_resize call buf_buddy_condense_free to free buddy page, owned &buf_pool->zip_free_mutex. And buf_buddy_condense_free call buf_buddy_free_low, also do mutex_enter(&buf_pool->zip_free_mutex), makes self deadlock. 
void buf_buddy_condense_free( buf_pool_t* buf_pool) { //first owned zip_free_mutex mutex_enter(&buf_pool->zip_free_mutex); ... if (buf_buddy_is_free(buddy, i) == BUF_BUDDY_STATE_FREE) { /* Both buf and buddy are free. Try to combine them. */ buf_buddy_remove_from_free(buf_pool, buf, i); os_atomic_increment_ulint( &buf_pool->buddy_stat[i].used, 1); buf_buddy_free_low(buf_pool, buf, i); } } buf_buddy_free_low(...) { mutex_enter(&buf_pool->zip_free_mutex); //try enter buf_pool->zip_free_mutex again } How to repeat: 1.run mysqld with innodb_buffer_pool_size=16G 2.create compressed table by sysbench 3.do compressure test 4.stop test 5.resize bp size to 4G Have probability to repeat. Suggested fix: //declare void buf_buddy_free_low( /*===============*/ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ bool need_mutex=true); /*!< in: do enter zip_free_mutex */ //func1 void buf_buddy_free_low( /*===============*/ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ void* buf, /*!< in: block to be freed, must not be pointed to by the buffer pool */ ulint i, /*!< in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ bool need_mutex) /*!< in: do enter zip_free_mutex */ { ... if (need_mutex) { mutex_enter(&buf_pool->zip_free_mutex); } ... func_exit: if (need_mutex) { mutex_exit(&buf_pool->zip_free_mutex); } ... } //func2 void buf_buddy_condense_free( buf_pool_t* buf_pool) { //first owned zip_free_mutex mutex_enter(&buf_pool->zip_free_mutex); ... if (buf_buddy_is_free(buddy, i) == BUF_BUDDY_STATE_FREE) { /* Both buf and buddy are free. Try to combine them. */ buf_buddy_remove_from_free(buf_pool, buf, i); os_atomic_increment_ulint( &buf_pool->buddy_stat[i].used, 1); buf_buddy_free_low(buf_pool, buf, i, false); } }