From af85baec67bd6437526849cd938678bb323d58c2 Mon Sep 17 00:00:00 2001 From: Daniel Black Date: Fri, 15 Mar 2019 16:50:12 +1100 Subject: [PATCH] Allow innodb to use multiple large page sizes Linux has supported multiple large page sizes since kernel ~2.6.32. The Linux-3.8 kernel added an mmap call to retrieve specific large page sizes. Currently innodb falls back to conventional mmap if shmget SHM_HUGETLB fails, meaning the deallocation attempts to use shmdt on an mmapped segment. Using shared memory means that kernel limits of kernel.shmall or kernel.shmmax need to be adjusted and a hugetlbfs mount needs to occur. For all these reasons mmap is an easier-to-use function. The sysadmin, without rebooting, mounting filesystems, adjusting sysctls or changing large-page-size settings, can change the kernel's allocation of available huge pages like: echo 4 > /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/nr_hugepages The innodb large page allocator will choose a large page size smaller than or equal to the requested size and allocate a block of memory. Meaning 1G pages will be used for a large innodb buffer pool while log buffers can use 2M pages. If a large page size is unavailable it will fall back to a smaller page size before reverting to conventional memory. The meaning of the large-page-size system variable has changed for 3.8 kernels that support multiple page sizes. 0 means choose the most appropriate size for the location; otherwise it is the largest page size that will be used. This is only a compatibility issue if large-pages=1 and large-page-size=0 is a valid disabling mechanism. 
--- cmake/os/Linux.cmake | 3 +- config.h.cmake | 1 + share/errmsg-utf8.txt | 6 +- storage/innobase/include/os0proc.h | 5 -- storage/innobase/include/srv0srv.h | 4 + storage/innobase/os/os0proc.cc | 129 +++++++++++++++++++++-------- storage/innobase/srv/srv0srv.cc | 2 + storage/innobase/srv/srv0start.cc | 54 ++++++++++++ 8 files changed, 162 insertions(+), 42 deletions(-) diff --git a/cmake/os/Linux.cmake b/cmake/os/Linux.cmake index 52bf6b2d1af..c2e710c4a4d 100644 --- a/cmake/os/Linux.cmake +++ b/cmake/os/Linux.cmake @@ -79,4 +79,5 @@ IF(NOT WITH_ASAN AND NOT WITH_MSAN AND NOT WITH_UBSAN AND NOT WITH_TSAN) ENDIF() # Linux specific HUGETLB /large page support -CHECK_SYMBOL_EXISTS(SHM_HUGETLB sys/shm.h HAVE_LINUX_LARGE_PAGES) +CHECK_SYMBOL_EXISTS(MAP_HUGETLB sys/mman.h HAVE_LINUX_LARGE_PAGES) +CHECK_SYMBOL_EXISTS(MAP_HUGE_SHIFT sys/mman.h HAVE_LINUX_MULTIPLE_LARGE_PAGES) diff --git a/config.h.cmake b/config.h.cmake index cf6dc2da8b4..b7d8b5083e2 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -215,6 +215,7 @@ #cmakedefine LINUX_ALPINE 1 #cmakedefine LINUX_SUSE #cmakedefine HAVE_LINUX_LARGE_PAGES 1 +#cmakedefine HAVE_LINUX_MULTIPLE_LARGE_PAGES 1 #cmakedefine HAVE_SOLARIS_LARGE_PAGES 1 #cmakedefine HAVE_SOLARIS_ATOMIC 1 #define SYSTEM_TYPE "@SYSTEM_TYPE@" diff --git a/share/errmsg-utf8.txt b/share/errmsg-utf8.txt index 3d97ff7206b..b532ee415bc 100644 --- a/share/errmsg-utf8.txt +++ b/share/errmsg-utf8.txt @@ -16799,9 +16799,6 @@ ER_IB_MSG_851 ER_IB_MSG_852 eng "%s" -ER_IB_MSG_853 - eng "%s" - ER_IB_MSG_854 eng "%s" @@ -18515,6 +18512,9 @@ ER_IB_MSG_MADV_DONTDUMP_UNSUPPORTED ER_IB_MSG_MADVISE_FAILED eng "Disabling @@core_file because @@innodb_buffer_pool_in_core_file is disabled, yet madvise(%p,%zu,%s) failed with %s" +ER_IB_OS_LARGE_PAGE_SIZE + eng "Unexpected OS large page size %zu, not a power of 2, skipping" + ER_COLUMN_CHANGE_SIZE eng "Could not change column '%s' of table '%s'. The resulting size of index '%s' would exceed the max key length of %d bytes." 
diff --git a/storage/innobase/include/os0proc.h b/storage/innobase/include/os0proc.h index 13633bb12d3..3064d9494e2 100644 --- a/storage/innobase/include/os0proc.h +++ b/storage/innobase/include/os0proc.h @@ -36,11 +36,6 @@ this program; if not, write to the Free Software Foundation, Inc., #include "univ.i" -#ifdef UNIV_LINUX -#include -#include -#endif - typedef void *os_process_t; typedef unsigned long int os_process_id_t; diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 8206e1ea19f..9dfec8ff00c 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -438,6 +438,10 @@ even if they are marked as "corrupted". Mostly it is for DBA to process corrupted index and table */ extern bool srv_load_corrupted; +/* Available OS RAM large page sizes */ +#define srv_large_page_sizes_length 8 +extern size_t srv_large_page_sizes[srv_large_page_sizes_length]; + /** Dedicated server setting */ extern bool srv_dedicated_server; /** Requested size in bytes */ diff --git a/storage/innobase/os/os0proc.cc b/storage/innobase/os/os0proc.cc index ed466028590..a947ed34f66 100644 --- a/storage/innobase/os/os0proc.cc +++ b/storage/innobase/os/os0proc.cc @@ -36,6 +36,11 @@ this program; if not, write to the Free Software Foundation, Inc., #include #include #include + +#if defined HAVE_LINUX_LARGE_PAGES && defined HAVE_SYS_MMAN_H +#include +#endif + #include "ha_prototypes.h" #include "os0proc.h" #include "srv0srv.h" @@ -70,49 +75,114 @@ ulint os_proc_get_number(void) { #endif } +/* +Returns the next large page size smaller or equal to the passed in size. + +The search starts at srv_large_page_sizes[*start]. + +Assumes srv_get_large_page_sizes has been initialised + +For first use, have *start=0. There is no need to increment *start. + +@param sz size to be searched for. +@param start ptr to int representing offset in my_large_page_sizes to start from. 
+*start is updated during search and can be used to search again if 0 isn't returned. + +@returns the next size found. *start will be incremented to the next potential size. +@retval a large page size that is valid on this system or 0 if no large page size possible. +*/ +static size_t os_next_large_page_size(size_t sz, int *start) +{ +#if defined HAVE_LINUX_MULTIPLE_LARGE_PAGES + size_t cur; + + while (*start < srv_large_page_sizes_length + && srv_large_page_sizes[*start] > 0) + { + cur= *start; + (*start)++; + if (srv_large_page_sizes[cur] <= sz) + { + return srv_large_page_sizes[cur]; + } + } +#endif + return 0; +} + +static inline uint os_bit_size_t_log2(size_t value) +{ + uint bit; + for (bit=0 ; value > 1 ; value>>=1, bit++) ; + return bit; +} + /** Allocates large pages memory. @param[in,out] n Number of bytes to allocate @return allocated memory */ void *os_mem_alloc_large(ulint *n) { - void *ptr; + void *ptr = NULL; ulint size; #if defined HAVE_LINUX_LARGE_PAGES && defined UNIV_LINUX - int shmid; - struct shmid_ds buf; + int mapflag, i= 0; + size_t adjusted_size, large_page_size; - if (!os_use_large_pages || !os_large_page_size) { + if (!os_use_large_pages) { goto skip; } +#ifdef HAVE_LINUX_MULTIPLE_LARGE_PAGES + if (!os_large_page_size) { + /* advance i to be a smaller or equal to os_large_page_size */ + os_next_large_page_size(os_large_page_size, &i); + } + large_page_size = os_next_large_page_size(*n, &i); +#else + large_page_size = os_large_page_size; +#endif + if (!large_page_size) + goto skip; - /* Align block size to os_large_page_size */ - ut_ad(ut_is_2pow(os_large_page_size)); - size = ut_2pow_round(*n + (os_large_page_size - 1), os_large_page_size); + ut_ad(ut_is_2pow(large_page_size)); - shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W); - if (shmid < 0) { - ib::warn(ER_IB_MSG_852) - << "Failed to allocate " << size << " bytes. 
errno " << errno; - ptr = NULL; - } else { - ptr = shmat(shmid, NULL, 0); - if (ptr == (void *)-1) { - ib::warn(ER_IB_MSG_853) << "Failed to attach shared memory segment," - " errno " - << errno; +#if defined HAVE_LINUX_MULTIPLE_LARGE_PAGES + do +#endif + { + mapflag = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; +#if defined HAVE_LINUX_MULTIPLE_LARGE_PAGES + /* MAP_HUGE_SHIFT added linux-3.8. Take largest HUGEPAGE size */ + mapflag |= os_bit_size_t_log2(large_page_size) << MAP_HUGE_SHIFT; +#endif + /* Align block size to large_page_size */ + adjusted_size = ut_2pow_round(*n + (large_page_size - 1), large_page_size); + ptr = mmap(NULL, adjusted_size, PROT_READ | PROT_WRITE, mapflag, -1, 0); + if (ptr != (void*)-1) { +#if defined HAVE_LINUX_MULTIPLE_LARGE_PAGES + break; + } else { + ptr = NULL; + if (errno == ENOMEM) { + /* no memory at this size, try next size */ + continue; + } +#else + } else { +#endif ptr = NULL; + ib::warn(ER_IB_MSG_852) + << "Failed to allocate " << adjusted_size << " bytes. pagesize " << large_page_size + << " bytes. 
errno " << errno; } - - /* Remove the shared memory segment so that it will be - automatically freed after memory is detached or - process exits */ - shmctl(shmid, IPC_RMID, &buf); } +#if defined HAVE_LINUX_MULTIPLE_LARGE_PAGES + while ((large_page_size = os_next_large_page_size(*n, &i))); +#endif if (ptr) { - *n = size; - os_atomic_increment_ulint(&os_total_large_mem_allocated, size); + *n = adjusted_size; + os_atomic_increment_ulint(&os_total_large_mem_allocated, adjusted_size); - UNIV_MEM_ALLOC(ptr, size); + UNIV_MEM_ALLOC(ptr, adjusted_size); return (ptr); } @@ -167,13 +237,6 @@ void *os_mem_alloc_large(ulint *n) { void os_mem_free_large(void *ptr, ulint size) { ut_a(os_total_large_mem_allocated >= size); -#if defined HAVE_LINUX_LARGE_PAGES && defined UNIV_LINUX - if (os_use_large_pages && os_large_page_size && !shmdt(ptr)) { - os_atomic_decrement_ulint(&os_total_large_mem_allocated, size); - UNIV_MEM_FREE(ptr, size); - return; - } -#endif /* HAVE_LINUX_LARGE_PAGES && UNIV_LINUX */ #ifdef _WIN32 /* When RELEASE memory, the size parameter must be 0. Do not use MEM_RELEASE with MEM_DECOMMIT. */ diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 95db588c591..ac2458aa045 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -372,6 +372,8 @@ with mutex_enter(), which will wait until it gets the mutex. 
*/ /** Dedicated server setting */ bool srv_dedicated_server = true; +/** Operating system RAM sizes */ +size_t srv_large_page_sizes[srv_large_page_sizes_length]; /** Requested size in bytes */ ulint srv_buf_pool_size = ULINT_MAX; /** Minimum pool size in bytes */ diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index acd2f6ce92e..07db443b1d2 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -49,6 +49,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include #include #include +#include #include #include "btr0btr.h" @@ -318,6 +319,55 @@ static MY_ATTRIBUTE((warn_unused_result)) dberr_t return (DB_SUCCESS); } +/* Descending sort */ +static int size_t_cmp(const void *a, const void *b) +{ + const size_t *ia = (const size_t *)a; // casting pointer types + const size_t *ib = (const size_t *)b; + if (*ib > *ia) + { + return 1; + } + else if (*ib < *ia) + { + return -1; + } + return 0; +} + +/** Fetch large page sizes available from linux */ +static void srv_get_large_page_sizes(size_t sizes[srv_large_page_sizes_length]) +{ + DIR *dirp; + struct dirent *r; + int i= 0; + + dirp= opendir("/sys/kernel/mm/hugepages"); + if (dirp == NULL) + { + perror("Warning: failed to open /sys/kernel/mm/hugepages"); + } + else + { + while (i < srv_large_page_sizes_length && + (r= readdir(dirp))) + { + if (strncmp("hugepages-", r->d_name, 10) == 0) + { + sizes[i]= strtoull(r->d_name + 10, NULL, 10) * 1024ULL; + if (!ut_is_2pow(sizes[i])) + { + ib::warn(ER_IB_OS_LARGE_PAGE_SIZE, sizes[i]); + sizes[i] = 0; + continue; + } + ++i; + } + } + qsort(sizes, i, sizeof(size_t), size_t_cmp); + } +} + /** Initial number of the first redo log file */ #define INIT_LOG_FILE0 (SRV_N_LOG_FILES_MAX + 1) @@ -2025,6 +2075,10 @@ dberr_t srv_start(bool create_new_db, const std::string &scan_directories) { return (srv_init_abort(DB_ERROR)); } +#if defined(HAVE_LINUX_MULTIPLE_LARGE_PAGES) + 
srv_get_large_page_sizes(srv_large_page_sizes); +#endif /* HAVE_LINUX_MULTIPLE_LARGE_PAGES */ + double size; char unit;