From 65a5f4fcb24dfab71d5287855f7d6e6813902346 Mon Sep 17 00:00:00 2001 From: Daniel Black Date: Fri, 15 Mar 2019 19:47:14 +1100 Subject: [PATCH] innodb buffer pool size not consistent with large pages As highlighted in bug #90943 and in Fernando Laudares' FOSDEM 2019 talk, the innodb buffer pool is greedy when used with large pages. https://fosdem.org/2019/schedule/event/hugepages_databases/ To highlight how greedy: Before: $ gdb --args ./runtime_output_directory/mysqld --no-defaults --datadir=/tmp/mysqldata --innodb-buffer-pool-size=20M --innodb-buffer-pool-instances=2 --innodb-buffer-pool-chunk-size=2M --large-pages --large-page-size=2M (gdb) break buf_pool_init Breakpoint 1 at 0x1fb0fe0: file /home/dan/repos/mysql-server/storage/innobase/buf/buf0buf.cc, line 1432. (gdb) break os_mem_alloc_large(unsigned long*) Breakpoint 2 at 0x1e54a20: file /home/dan/repos/mysql-server/storage/innobase/os/os0proc.cc, line 83. (gdb) r Thread 2 "mysqld" hit Breakpoint 1, buf_pool_init (total_size=20971520, n_instances=1) at /home/dan/repos/mysql-server/storage/innobase/buf/buf0buf.cc:1436 1436 const ulint size = total_size / n_instances; (gdb) n 1442 NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; (gdb) p size $1 = 20971520 Thread 3 "mysqld" hit Breakpoint 2, os_mem_alloc_large (n=n@entry=0x7fffe5598918) at /home/dan/repos/mysql-server/storage/innobase/os/os0proc.cc:83 83 if (!os_use_large_pages || !os_large_page_size) { (gdb) f 83 if (!os_use_large_pages || !os_large_page_size) { (gdb) p *n $2 = 2162688 (gdb) n 89 size = ut_2pow_round(*n + (os_large_page_size - 1), os_large_page_size); (gdb) 91 shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W); (gdb) 92 if (shmid < 0) { (gdb) p size $3 = 4194304 (gdb) n 97 ptr = shmat(shmid, NULL, 0); (gdb) 98 if (ptr == (void *)-1) { (gdb) p ptr $4 = (void *) 0x7fffe4800000 Looking at OS allocation: $ cd /proc/$(pidof mysqld);egrep -A 20 '(SYSV|huge)' smaps 7fffe4800000-7fffe4c00000 rw-s 00000000 00:0f 37519390 /SYSV00000000 (deleted) Size: 4096 kB KernelPageSize: 
2048 kB MMUPageSize: 2048 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB VmFlags: rd wr sh mr mw me ms de ht sd So for a 2M chunk size, 4M is allocated, 49% of which isn't used. The same linear relationship holds with 1G or even 16G huge pages. That's a lot of waste. Asking sysadmins to set innodb_buffer_pool_chunk_size to 2% less than the large-page-size seems like a poor choice. After this commit, things get sane: Thread 3 "mysqld" hit Breakpoint 1, buf_chunk_init (buf_pool=0x7fffe0325618, chunk=0x7fffd80012d8, mem_size=2097152, mutex=0x7fffe624d350) at /home/dan/repos/mysql-server/storage/innobase/buf/buf0buf.cc:982 982 { (gdb) n 995 if (!buf_pool->allocate_chunk(mem_size, chunk)) { (gdb) c Continuing. Thread 3 "mysqld" hit Breakpoint 2, os_mem_alloc_large (n=n@entry=0x7fffe5598918) at /home/dan/repos/mysql-server/storage/innobase/os/os0proc.cc:83 83 if (!os_use_large_pages || !os_large_page_size) { (gdb) p *n $1 = 2097152 (gdb) n 89 size = ut_2pow_round(*n + (os_large_page_size - 1), os_large_page_size); (gdb) 91 shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W); (gdb) p size $2 = 2097152 (gdb) n 92 if (shmid < 0) { (gdb) 97 ptr = shmat(shmid, NULL, 0); (gdb) 98 if (ptr == (void *)-1) { (gdb) p ptr $3 = (void *) 0x7fffe4a00000 $ cd /proc/$(pidof mysqld);egrep -A 20 '(SYSV|huge)' smaps 7fffe4a00000-7fffe4c00000 rw-s 00000000 00:0f 37552220 /SYSV00000000 (deleted) Size: 2048 kB KernelPageSize: 2048 kB MMUPageSize: 2048 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB VmFlags: rd wr sh mr mw me ms de ht sd 
A 2M allocation when we specified innodb_buffer_pool_chunk_size=2M. Rather than add a small extra amount to the size of chunks, keep them at the specified size. The rest of the chunk initialization code adapts to this small size reduction. This has been made in the general case, not just large pages, to keep it simple. The chunk size is controlled by innodb-buffer-pool-chunk-size. In the code, increasing this by the size of the block descriptor table makes things difficult with large pages. With innodb-buffer-pool-chunk-size set to 2M, the code before this commit would have added a small extra amount to this value when it tried to allocate the chunk. While not normally a problem, it is with large pages: the allocation then requires additional space, a whole extra large page. With a number of pools, or with 1G or 16G large pages, this is quite significant. By removing this additional amount, DBAs can set innodb-buffer-pool-chunk-size to the large page size, or a multiple of it, and actually get that amount allocated. Previously they had to fudge a slightly smaller value. --- storage/innobase/buf/buf0buf.cc | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 79424ada8c5..556be92e631 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -989,10 +989,6 @@ static buf_chunk_t *buf_chunk_init( /* Round down to a multiple of page size, although it already should be. */ mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE); - /* Reserve space for the block descriptors. 
*/ - mem_size += ut_2pow_round( - (mem_size / UNIV_PAGE_SIZE) * (sizeof *block) + (UNIV_PAGE_SIZE - 1), - UNIV_PAGE_SIZE); DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return (NULL);); @@ -2022,11 +2018,11 @@ static void buf_pool_resize() { buf_flush_list_mutex_exit(buf_pool); #endif - buf_pool->curr_size = new_instance_size; - buf_pool->n_chunks_new = new_instance_size * UNIV_PAGE_SIZE / srv_buf_pool_chunk_unit; + buf_pool->curr_size = buf_pool->n_chunks_new * buf_pool->chunks->size; + os_wmb; } @@ -2353,7 +2349,7 @@ static void buf_pool_resize() { buf_pool->read_ahead_area = static_cast( ut_min(BUF_READ_AHEAD_PAGES, ut_2_power_up(buf_pool->curr_size / BUF_READ_AHEAD_PORTION))); - buf_pool->curr_pool_size = buf_pool->curr_size * UNIV_PAGE_SIZE; + buf_pool->curr_pool_size = buf_pool->n_chunks * srv_buf_pool_chunk_unit; curr_size += buf_pool->curr_pool_size; buf_pool->old_size = buf_pool->curr_size; }