Description:
The server hung while the buffer pool was being resized from 16G down to 4G in a production environment.
Stack 1: waiting for mutex &buf_pool->zip_free_mutex
312 Thread 155 (Thread 0x7f891e3f7700 (LWP 107754)):
313 #0 0x00007f8daa446945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
314 #1 0x00000000010421db in os_event::wait_low(long) ()
315 #2 0x0000000001108280 in sync_array_wait_event(sync_array_t*, sync_cell_t*&) ()
316 #3 0x00000000012b59b6 in buf_buddy_free_low(buf_pool_t*, void*, unsigned long) ()
317 #4 0x00000000012b61b2 in buf_buddy_condense_free(buf_pool_t*) ()
318 #5 0x00000000011a4998 in buf_pool_resize() ()
319 #6 0x00000000011a6093 in buf_resize_thread ()
320 #7 0x00007f8daa442e25 in start_thread () from /lib64/libpthread.so.0
321 #8 0x00007f8da861434d in clone () from /lib64/libc.so.6
Stack 2: also waiting for mutex &buf_pool->zip_free_mutex
199 Thread 172 (Thread 0x7f89307e9700 (LWP 107735)):
200 #0 0x00007f8daa446945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
201 #1 0x00000000010421db in os_event::wait_low(long) ()
202 #2 0x0000000001108280 in sync_array_wait_event(sync_array_t*, sync_cell_t*&) ()
203 #3 0x00000000012b59b6 in buf_buddy_free_low(buf_pool_t*, void*, unsigned long) ()
204 #4 0x00000000011c4544 in buf_LRU_block_remove_hashed(buf_page_t*, bool) ()
205 #5 0x00000000011c81b6 in buf_LRU_free_page(buf_page_t*, bool) ()
206 #6 0x00000000011bd349 in buf_flush_do_batch(buf_pool_t*, buf_flush_t, unsigned long, unsigned long, unsigned long*) ()
207 #7 0x00000000011be302 in buf_lru_manager ()
208 #8 0x00007f8daa442e25 in start_thread () from /lib64/libpthread.so.0
209 #9 0x00007f8da861434d in clone () from /lib64/libc.so.6
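Code analysis:
buf_pool_resize() calls buf_buddy_condense_free() to free buddy pages, and buf_buddy_condense_free() acquires &buf_pool->zip_free_mutex. While still holding that mutex, buf_buddy_condense_free() calls buf_buddy_free_low(), which also does mutex_enter(&buf_pool->zip_free_mutex). The resize thread therefore deadlocks against itself, and every other thread that needs zip_free_mutex (such as the LRU manager in stack 2) blocks behind it.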
void
buf_buddy_condense_free(
buf_pool_t* buf_pool)
{
// zip_free_mutex is acquired here first
mutex_enter(&buf_pool->zip_free_mutex);
...
if (buf_buddy_is_free(buddy, i) == BUF_BUDDY_STATE_FREE) {
/* Both buf and buddy are free.
Try to combine them. */
buf_buddy_remove_from_free(buf_pool, buf, i);
os_atomic_increment_ulint(
&buf_pool->buddy_stat[i].used, 1);
buf_buddy_free_low(buf_pool, buf, i);
}
}
buf_buddy_free_low(...)
{
mutex_enter(&buf_pool->zip_free_mutex); // tries to acquire buf_pool->zip_free_mutex again while the caller already holds it
}
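How to repeat:
1. Run mysqld with innodb_buffer_pool_size=16G.
2. Create compressed tables with sysbench.
3. Run a stress test against the compressed tables.
4. Stop the test.
5. Resize the buffer pool to 4G.
The hang does not occur on every attempt; it reproduces only with some probability.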
Suggested fix:
// declaration of buf_buddy_free_low
void
buf_buddy_free_low(
/*===============*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
void* buf, /*!< in: block to be freed, must not be
pointed to by the buffer pool */
ulint i, /*!< in: index of buf_pool->zip_free[],
or BUF_BUDDY_SIZES */
bool need_mutex=true); /*!< in: if true, acquire zip_free_mutex inside this function; pass false when the caller already holds it */
// func1: definition of buf_buddy_free_low
void
buf_buddy_free_low(
/*===============*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
void* buf, /*!< in: block to be freed, must not be
pointed to by the buffer pool */
ulint i, /*!< in: index of buf_pool->zip_free[],
or BUF_BUDDY_SIZES */
bool need_mutex) /*!< in: if true, acquire zip_free_mutex inside this function; pass false when the caller already holds it */
{
...
if (need_mutex)
{
mutex_enter(&buf_pool->zip_free_mutex);
}
...
func_exit:
if (need_mutex)
{
mutex_exit(&buf_pool->zip_free_mutex);
}
...
}
// func2: the caller, buf_buddy_condense_free
void
buf_buddy_condense_free(
buf_pool_t* buf_pool)
{
// zip_free_mutex is acquired here first
mutex_enter(&buf_pool->zip_free_mutex);
...
if (buf_buddy_is_free(buddy, i) == BUF_BUDDY_STATE_FREE) {
/* Both buf and buddy are free.
Try to combine them. */
buf_buddy_remove_from_free(buf_pool, buf, i);
os_atomic_increment_ulint(
&buf_pool->buddy_stat[i].used, 1);
buf_buddy_free_low(buf_pool, buf, i, false);
}
}