diff --git a/mysql-test/r/mysqld--help-notwin.result b/mysql-test/r/mysqld--help-notwin.result
index ee922e63149..214b2f20edc 100644
--- a/mysql-test/r/mysqld--help-notwin.result
+++ b/mysql-test/r/mysqld--help-notwin.result
@@ -486,6 +486,8 @@ The following options may be given as the first argument:
 -L, --language=name Client error messages in given language. May be given as
 a full path. Deprecated. Use --lc-messages-dir instead.
 --large-pages       Enable support for large pages
+--large-pages-for-code[=name]
+                    Remap .text/.data to 2MB huge pages
 --lc-messages=name  Set the language used for the error messages.
 --lc-messages-dir=name
                     Directory where error messages are
@@ -1546,6 +1548,7 @@ keyring-migration-port 0
 keyring-migration-source (No default value)
 keyring-migration-user (No default value)
 large-pages FALSE
+large-pages-for-code OFF
 lc-messages en_US
 lc-time-names en_US
 local-infile FALSE
diff --git a/mysql-test/suite/sys_vars/r/large_pages_for_code_basic.result b/mysql-test/suite/sys_vars/r/large_pages_for_code_basic.result
new file mode 100644
index 00000000000..e3143e60b6d
--- /dev/null
+++ b/mysql-test/suite/sys_vars/r/large_pages_for_code_basic.result
@@ -0,0 +1,21 @@
+SELECT @@GLOBAL.large_pages_for_code;
+@@GLOBAL.large_pages_for_code
+OFF
+SET @@GLOBAL.large_pages_for_code=ON;
+ERROR HY000: Variable 'large_pages_for_code' is a read only variable
+SELECT @@GLOBAL.large_pages_for_code;
+@@GLOBAL.large_pages_for_code
+OFF
+SELECT @@large_pages_for_code = @@GLOBAL.large_pages_for_code;
+@@large_pages_for_code = @@GLOBAL.large_pages_for_code
+1
+SELECT @@large_pages_for_code;
+@@large_pages_for_code
+OFF
+SELECT @@local.large_pages_for_code;
+ERROR HY000: Variable 'large_pages_for_code' is a GLOBAL variable
+SELECT @@SESSION.large_pages_for_code;
+ERROR HY000: Variable 'large_pages_for_code' is a GLOBAL variable
+SELECT @@GLOBAL.large_pages_for_code;
+@@GLOBAL.large_pages_for_code
+OFF
diff --git a/mysql-test/suite/sys_vars/t/large_pages_for_code_basic.test b/mysql-test/suite/sys_vars/t/large_pages_for_code_basic.test
new file mode 100644
index 00000000000..458afe1d06c
--- /dev/null
+++ b/mysql-test/suite/sys_vars/t/large_pages_for_code_basic.test
@@ -0,0 +1,30 @@
+####################################################################
+#                   Displaying default value                       #
+####################################################################
+SELECT @@GLOBAL.large_pages_for_code;
+
+####################################################################
+#                   Check if value can be set                      #
+####################################################################
+
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SET @@GLOBAL.large_pages_for_code=ON;
+SELECT @@GLOBAL.large_pages_for_code;
+
+################################################################################
+# Check if accessing variable with and without GLOBAL points to same variable  #
+################################################################################
+SELECT @@large_pages_for_code = @@GLOBAL.large_pages_for_code;
+SELECT @@large_pages_for_code;
+
+################################################################################
+#    Check if the variable can be accessed via local and SESSION prefixes      #
+################################################################################
+
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@local.large_pages_for_code;
+
+--error ER_INCORRECT_GLOBAL_LOCAL_VAR
+SELECT @@SESSION.large_pages_for_code;
+
+SELECT @@GLOBAL.large_pages_for_code;
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index cb0d00402f8..89fb9c9b94c 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -707,6 +707,10 @@ SET(SQL_SOURCE
   srv_session_service.cc
 )
 
+IF(LINUX)
+  LIST(APPEND SQL_SOURCE huge.cc)
+ENDIF()
+
 IF(NOT WIN32)
   LIST(APPEND SQL_SOURCE mysqld_daemon.cc)
 ENDIF()
@@ -1035,6 +1039,22 @@ TARGET_LINK_LIBRARIES(mysqld sql_main sql_gis
   binlog rpl master slave sql_dd mysys minchassis binlogevents_static
   ${ICU_LIBRARIES})
 
+IF(LINUX)
+  # Needed for remapping the .text/.data sections
+  IF(USE_LD_LLD)
+    TARGET_LINK_LIBRARIES(mysqld
+      -Wl,-zmax-page-size=0x200000
+    )
+  ELSE()
+    # Assuming ld.gold and the default ld.bfd
+    TARGET_LINK_LIBRARIES(mysqld
+      -Wl,-zcommon-page-size=0x200000
+      -Wl,-zmax-page-size=0x200000
+      -Wl,-Ttext-segment=0x200000
+    )
+  ENDIF()
+ENDIF()
+
 # Add dependency on ldap library, to ensure it is not unloaded
 # when we shutdown the authentication plugin.
 # For Valgrind, we need it to enable our suppressions.
diff --git a/sql/huge.cc b/sql/huge.cc
new file mode 100644
index 00000000000..e76cfb77092
--- /dev/null
+++ b/sql/huge.cc
@@ -0,0 +1,1014 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Huawei and/or its affiliates. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+  @file
+
+  @brief
+  EXECUTIVE SUMMARY
+  =================
+  The code below remaps the .text and .data segments to huge pages. According
+  to our benchmarks this improves performance by up to 7% via a significant
+  reduction in iTLB/dTLB cache misses. Currently only 2MB huge pages are
+  supported. Since the extra alignment of the .text/.data sections is done
+  by the linker, the huge page size is a compile-time constant.
+  The code detects all the necessary preconditions; if anything goes wrong,
+  it falls back to the default pages.
+
+  USER STORIES
+  ============
+  The same approach is used by Google (Chromium) and Facebook (HHVM). It is
+  worth mentioning that these vendors presumably use transparent huge pages.
+  Either way, the technique works well and is widely accepted.
+
+  FUNCTIONAL REQUIREMENTS
+  =======================
+  1. Link mysqld with the following parameters:
+     -zcommon-page-size=0x200000 -zmax-page-size=0x200000
+     -Ttext-segment=0x200000
+  2. Mount the hugetlbfs filesystem with access permissions suitable for
+     mysqld.
+  3. Preallocate a number of huge pages sufficient for the .text and .data
+     segments (e.g. 50).
+
+  Since we make adjustments to the [heap] segment and call sbrk/brk directly,
+  the code below does not use malloc/operator new, directly or indirectly
+  (otherwise it could lead to complex, rare, hidden errors). This is the only
+  reason why a separate buffer is provided for error messages. If errors
+  occur, all messages are written to the user's buffer, messages are
+  separated by '\n', and the final message ends with '\0'.
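+  For illustration (a sketch of the layout built from this file's own
+  messages, not verbatim server output), after two failed checks the buffer
+  would hold:
+
+    "'/dev/hugepages': Not a hugetlbfs filesystem\n"
+    "hugetlbfs is not ready, continuing with default pages\0"
+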
+  A caller is free to provide NULL instead of a buffer, in which case no
+  messages are written at all.
+
+  Details:
+  1) the [heap] segment fuels the brk/sbrk system calls;
+  2) the glibc allocator uses brk/sbrk for small allocations;
+  3) libhugetlbfs unmaps the [heap] segment implicitly during .data
+     remapping because the [heap] segment isn't aligned to 2MB while .text
+     and .data really are. The code below preserves the [heap] segment: it
+     copies the part of it which overlaps with the last data segment (after
+     huge page alignment) and artificially makes the [heap] segment bigger
+     (in this case mmap(MAP_FIXED) leaves [heap] trimmed but alive).
+
+  NON-FUNCTIONAL REQUIREMENTS
+  ===========================
+  NF-1: Improves performance by up to 7%; any workload is affected
+        positively.
+  NF-2: No explicit impact on other MySQL components.
+  NF-3: The code below doesn't use any mysqld-specific pieces of code
+        (absolutely no dependencies), so it can easily be used anywhere
+        else.
+
+  INTERFACE SPECIFICATION
+  =======================
+  I-1: One new file: huge.cc (current)
+  I-2: No changes in existing syntax
+  I-3: No new tools
+  I-4: No impact on existing functionality
+  I-5: Introduces a new function:
+         uint64_t remap_text_and_data_segments_to_huge_pages(
+             char *err_log, size_t err_size);
+       And a convenient wrapper for it:
+         uint64_t remap_text_and_data_segments_to_huge_pages(
+             void (*error_hook)(const char *error_msg));
+
+  The function must be called before mysqld starts its threads; otherwise
+  any parallel code gets an immediate SIGSEGV during the .text/.data
+  remapping stage.
+
+  NOTES
+  =====
+  Since we already deal with the [heap] segment, it's possible to remap all
+  of .text, .data and part or all of the current [heap] segment to 2MB
+  pages even if the special alignment (-Ttext-segment=0x200000) isn't
+  specified. The only thing that changes in this approach is the memory
+  protection flags (which are specified per page). If a huge page includes
+  the 4k-aligned .text, .data and [heap] segments simultaneously, this page
+  must have 'rwx' memory protection to execute .text and read/write
+  .data/[heap] (or the program is terminated with SIGSEGV by the kernel).
+  In that case, the code should be rewritten from the 'check overlap' to a
+  'merge segments' approach.
+
+  Remapping the .text segment to huge pages breaks symbol resolution in
+  "perf record". The workaround is to use perf's JIT API
+  "/tmp/perf-PID.map":
+
+    nm --numeric-sort --print-size --demangle mysqld
+      | awk '$4{print $1" "$2" "$4}' | grep -Ee"^0" > /tmp/perf-2124679.map
+
+  Now perf sees the symbols as if they were JIT symbols:
+    perf report -i perf-2124679.data
+*/
+
+#ifndef __linux__
+#error The code below supports the Linux platform only
+#endif
+
+#include <elf.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <link.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <cstdarg>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+namespace Huge_page_remapping {
+
+#define HUGE_PAGE_SIZE 0x200000
+#define MAX_SEGMENTS 16
+#define DEBUG_LEVEL 0 /* 2 levels are supported */
+
+/* Only GNU GCC/Clang attribute syntax is supported for now */
+#if defined(__GNUC__)
+#define ATTRIBUTE(A) __attribute__(A)
+#else
+#define ATTRIBUTE(A)
+#endif
+
+ATTRIBUTE((format(printf, 3, 0)))
+inline size_t do_vsnprintf(char *buffer, size_t buffer_size,
+                           const char *format, va_list args) {
+  int ret = vsnprintf(buffer, buffer_size, format, args);
+  /* error? just skip it */
+  if (ret < 0) return 0;
+
+  size_t size = static_cast<size_t>(ret);
+  /* Was the output of vsnprintf() truncated? */
+  if (size >= buffer_size)
+    size = buffer_size - 1; /* leave room for the final '\0' */
+
+  /* Return the number of bytes written, excluding the final '\0' */
+  return size;
+}
+
+class Memory_logger {
+ private:
+  char *m_output;
+  size_t m_output_size;
+  /** Indicates that something was written to the logger at least once */
+  int m_filled;
+
+ private:
+  bool initialized() const { return m_output != NULL; }
+  /**
+    Is the logger initialized with room left for new messages?
+    @return true if we can write to the logger safely.
+  */
+  bool has_room() const { return m_output_size != 0; }
+  void clear() {
+    m_output = NULL;
+    m_output_size = 0;
+    m_filled = 0;
+  }
+
+ public:
+  Memory_logger() { clear(); }
+  void init(char *error_buffer, size_t error_buffer_size) {
+    if (error_buffer == NULL || error_buffer_size == 0) return;
+    m_output = error_buffer;
+    m_output_size = error_buffer_size;
+    m_filled = 0;
+  }
+  ATTRIBUTE((format(printf, 2, 3)))
+  Memory_logger &operator()(const char *format, ...) {
+    if (!has_room()) return *this;
+
+    va_list args;
+    va_start(args, format);
+    size_t size = do_vsnprintf(m_output, m_output_size, format, args);
+    va_end(args);
+    if (size == 0) return *this;
+
+    /*
+      Adjust the output buffer for the next message: write '\n' instead of
+      the '\0' which is written by do_vsnprintf.
+    */
+    m_output += size;
+    *m_output++ = '\n';
+    m_output_size -= size + 1;
+    m_filled = 1;
+    return *this;
+  }
+  void finalize() {
+    if (!initialized()) return;
+    /*
+      Put a final sentinel for C-like functions: write a '\0' instead of
+      the last '\n' written by operator().
+    */
+    m_output[-m_filled] = '\0';
+
+    /* Clean our internal state */
+    clear();
+  }
+};
+
+Memory_logger print_error;
+
+/** RAII class for initialization and finalization of the memory logger */
+struct Setup_memory_logger {
+  Setup_memory_logger(char *error_log, size_t error_size) {
+    print_error.init(error_log, error_size);
+  }
+  ~Setup_memory_logger() { print_error.finalize(); }
+};
+
+#if DEBUG_LEVEL == 0
+/* Debugging is turned off, errors are written to the Memory_logger */
+#define DEBUG(...)
+#define PRINT_ERROR(FORMAT, ...) print_error(FORMAT, ##__VA_ARGS__)
+#define PRINT_ERROR_ERRNO(FORMAT, ...)                   \
+  do {                                                   \
+    char errbuf[256];                                    \
+    const char *errmsg = strerror_r(errno, errbuf, 256); \
+    print_error(FORMAT ": %s", ##__VA_ARGS__, errmsg);   \
+  } while (0)
+#else
+/* Debugging is turned on, errors are written to STDERR */
+
+inline void dump_buffer_to_stderr(const char *ptr, size_t len) {
+  ssize_t written;
+  do {
+    written = write(STDERR_FILENO, ptr, len);
+    if (written == -1) {
+      if (errno == EINTR) continue;
+      return;
+    }
+    len -= written;
+    ptr += written;
+  } while (len && written);
+}
+
+ATTRIBUTE((format(printf, 1, 2)))
+inline void debug_impl(const char *format, ...) {
+  static const size_t s_buffer_size = 256;
+  char buffer[s_buffer_size];
+
+  va_list args;
+  va_start(args, format);
+  size_t size = do_vsnprintf(buffer, s_buffer_size, format, args);
+  va_end(args);
+  dump_buffer_to_stderr(buffer, size);
+}
+#define DEBUG(FORMAT, ...) debug_impl(FORMAT "\n", ##__VA_ARGS__)
+#define PRINT_ERROR(FORMAT, ...) debug_impl(FORMAT "\n", ##__VA_ARGS__)
+#define PRINT_ERROR_ERRNO(FORMAT, ...)                   \
+  do {                                                   \
+    char errbuf[256];                                    \
+    const char *errmsg = strerror_r(errno, errbuf, 256); \
+    debug_impl(FORMAT ": %s\n", ##__VA_ARGS__, errmsg);  \
+  } while (0)
+
+#endif /* IF DEBUG_LEVEL == 0 */
+
+inline size_t align_me_down(size_t addr) {
+  return addr & ~(HUGE_PAGE_SIZE - 1);
+}
+
+inline size_t align_me_up(size_t addr) {
+  return (addr + HUGE_PAGE_SIZE - 1) & ~(HUGE_PAGE_SIZE - 1);
+}
+
+struct String_buffer {
+  bool empty() const { return m_data == NULL; }
+
+  void split(String_buffer *cols, size_t count) const {
+    uint32_t coll = 0;
+    char *line = m_data, *const end = m_data + m_len;
+    for (char *start = line; start < end && coll < count; ++start) {
+      if (*start != ' ') continue;
+
+      String_buffer &curr = cols[coll++];
+      curr.m_data = line;
+      curr.m_len = start - line;
+      while (++start < end && *start == ' ')
+        ;
+      line = start;
+    }
+    if (coll < count) {
+      String_buffer &final_col = cols[coll];
+      final_col.m_data = line;
+      final_col.m_len = end - line;
+    }
+  }
+
+#if DEBUG_LEVEL > 0
+  void debug_print() const {
+    dump_buffer_to_stderr(m_data, m_len);
+    dump_buffer_to_stderr("\n", 1);
+  }
+#endif
+
+  char *m_data;
+  size_t m_len;
+};
+
+class File_reader {
+ public:
+  File_reader(const char *filename) : m_fd(open(filename, O_RDONLY)) {
+    if (m_fd == -1) {
+      PRINT_ERROR_ERRNO("Can't open file '%s'", filename);
+      return;
+    }
+  }
+
+  File_reader(int _fd) : m_fd(_fd) {}
+
+  ~File_reader() { destroy(); }
+
+  bool is_ok() const { return m_fd != -1; }
+
+  ssize_t read(char *buf, size_t len) {
+    ssize_t ret = 0, have = 0;
+    while (len && (ret = ::read(m_fd, buf, len))) {
+      if (ret == -1) {
+        if (errno == EINTR) continue;
+
+        PRINT_ERROR_ERRNO("Can't read from fd %d", m_fd);
+        destroy();
+        return 0;
+      }
+
+      buf += ret;
+      len -= ret;
+      have += ret;
+    }
+    return have;
+  }
+
+ private:
+  int m_fd;
+
+ private:
+  void destroy() {
+    if (m_fd != -1) {
+      close(m_fd);
+      m_fd = -1;
+    }
+  }
+};
+
+class Buffered_file_reader : public File_reader {
+ public:
+  Buffered_file_reader(const char *filename) : File_reader(filename) {
+    init();
+  }
+  Buffered_file_reader(const int fd) : File_reader(fd) { init(); }
+
+  bool read_line(String_buffer &strbuf) {
+    if (!is_ok()) return false;
+
+    while (true) {
+      char *end = m_buffer + m_have;
+      while (m_curr < end) {
+        if (*m_curr != '\n') {
+          ++m_curr;
+          continue;
+        }
+        strbuf.m_data = m_start;
+        strbuf.m_len = m_curr - m_start;
+        ++m_curr;
+        m_start = m_curr;
+        return true;
+      }
+      if (m_have < s_buffer_size) {
+        if (m_curr == m_start) return false;
+        strbuf.m_data = m_start;
+        strbuf.m_len = m_curr - m_start;
+        m_start = m_curr;
+        return true;
+      }
+      if (m_start == m_buffer) {
+        m_buffer[s_buffer_size - 1] = '\0';
+        PRINT_ERROR("Buffer (%lu bytes) is too small for the line '%s'",
+                    s_buffer_size, m_buffer);
+        return false;
+      }
+
+      m_have = end - m_start;
+      memmove(m_buffer, m_start, m_have); /* the copied ranges may overlap */
+      m_curr = m_buffer + m_have;
+      m_start = m_buffer;
+      m_have += File_reader::read(m_curr, s_buffer_size - m_have);
+    }
+  }
+
+ private:
+  static const size_t s_buffer_size = 4096;
+  char m_buffer[s_buffer_size];
+
+ private:
+  char *m_start;
+  char *m_curr;
+  size_t m_have;
+
+  void init() {
+    m_curr = m_start = m_buffer;
+    m_have = File_reader::read(m_buffer, s_buffer_size);
+  }
+};
+
+struct Elf_segment {
+  uint64_t m_vaddr;
+  uint64_t m_memsz;
+  uint64_t m_flags;
+};
+
+struct Elf_segments {
+  Elf_segments() : m_curr_size(0) {}
+
+  inline void push_back(uint64_t vaddr, uint64_t memsz, uint64_t flags) {
+    if (m_curr_size >= MAX_SEGMENTS) return;
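+    /*
+      NOTE: segments beyond MAX_SEGMENTS are silently dropped. A single ELF
+      executable normally carries only a handful of PT_LOAD entries, so 16
+      slots are ample.
+    */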
+    Elf_segment *seg = segments + m_curr_size;
+    seg->m_vaddr = vaddr;
+    seg->m_memsz = memsz;
+    seg->m_flags = flags;
+    ++m_curr_size;
+  }
+
+  bool is_overlap() const {
+    for (uint32_t i = 1; i < m_curr_size; ++i) {
+      const Elf_segment *prev = segments + i - 1;
+      const Elf_segment *curr = segments + i;
+      uint64_t prev_end = align_me_up(prev->m_vaddr + prev->m_memsz);
+      uint64_t curr_start = align_me_down(curr->m_vaddr);
+      if (prev_end > curr_start) {
+        DEBUG(
+            "[%#lx %#lx) --> [%#lx %#lx) overlaps with "
+            "[%#lx %#lx) --> [%#lx %#lx)",
+            prev->m_vaddr, prev->m_vaddr + prev->m_memsz,
+            align_me_down(prev->m_vaddr), prev_end, curr->m_vaddr,
+            curr->m_vaddr + curr->m_memsz, curr_start,
+            align_me_up(curr->m_vaddr + curr->m_memsz));
+        return true;
+      }
+    }
+    return false;
+  }
+
+  inline size_t size() const { return m_curr_size; }
+
+  inline bool empty() const { return m_curr_size == 0; }
+
+  size_t m_curr_size;
+  Elf_segment segments[MAX_SEGMENTS];
+};
+
+inline uint64_t convert_elf_flags_to_memory_protection_flags(
+    const uint64_t flags) {
+  uint64_t res = 0;
+  if (flags & PF_R) res |= PROT_READ;
+  if (flags & PF_W) res |= PROT_WRITE;
+  if (flags & PF_X) res |= PROT_EXEC;
+  return res;
+}
+
+static int parse_elf_file(struct dl_phdr_info *info, size_t, void *data) {
+  Elf_segments &segs = *reinterpret_cast<Elf_segments *>(data);
+
+  /*
+    From the man page DL_ITERATE_PHDR(3):
+    The first object visited by callback is the main program.
+  */
+  DEBUG("Detected start address %#lx", info->dlpi_addr);
+  for (Elf64_Half i = 0; i < info->dlpi_phnum; ++i) {
+    const ElfW(Phdr) *hdr = info->dlpi_phdr + i;
+    if (hdr->p_type != PT_LOAD) continue;
+
+    uint64_t vaddr = hdr->p_vaddr + info->dlpi_addr, memsz = hdr->p_memsz;
+    DEBUG("Detected load segment [%#lx, %#lx)", vaddr, vaddr + memsz);
+    segs.push_back(vaddr, memsz,
+                   convert_elf_flags_to_memory_protection_flags(hdr->p_flags));
+  }
+
+  /*
+    Shared library segments follow right after the main program segments;
+    they all usually have the default alignment (e.g. 4k), so skip them.
+  */
+  return 1; /* Do not call me again */
+}
+
+class Heap_rescue {
+ public:
+  Heap_rescue() {
+    /*
+      Even if sbrk returns (-1) the code below works correctly:
+      0xfff...fff doesn't overlap with anything inside the 48-bit
+      virtual address space.
+    */
+    if (!parse_mmap_file())
+      m_start_address = m_end_address = reinterpret_cast<uint64_t>(sbrk(0));
+    m_new_break = 0;
+    DEBUG("Detected heap segment [%#lx, %#lx)", m_start_address,
+          m_end_address);
+  }
+
+  void copy(char *haddr, uint64_t curr_seg_end, uint64_t &mem_protection,
+            uint64_t vaddr_start_aligned, uint64_t vaddr_end_aligned) {
+    /* Does [heap] overlap with the remapped range at all? */
+    if (m_start_address > vaddr_end_aligned) return;
+
+    /*
+      The .bss segment may reside at the starting addresses of the [heap]
+      segment (according to /proc/self/maps), so don't panic: just adjust
+      the pointers and go further.
+    */
+    if (m_start_address < curr_seg_end) {
+      DEBUG(
+          "Heap overlaps with current segment: "
+          "heap_start_address=%#lx, curr_seg_end=%#lx, adjusting ...",
+          m_start_address, curr_seg_end);
+      m_start_address = curr_seg_end;
+    }
+
+    char *haddr_shifted = haddr + (m_start_address - vaddr_start_aligned);
+    uint64_t heap_to_copy;
+    if (m_end_address < vaddr_end_aligned) {
+      /*
+        Advance the current break so that it moves past the current huge
+        page. In this case, the kernel does not unmap the entire heap
+        segment, it unmaps only the overlapping part (see man mmap /
+        MAP_FIXED).
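+
+        A sketch with hypothetical addresses, for a last huge page covering
+        [0x1c00000, 0x1e00000):
+
+          break before sbrk(): 0x1c20000  (inside the huge page)
+          break after sbrk():  0x2020000  (past the huge page)
+
+        mmap(MAP_FIXED) over [0x1c00000, 0x1e00000) then unmaps only that
+        range; the tail of [heap] survives, and set_new_break() later trims
+        the break back to 0x1e00000.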
+      */
+      if (sbrk(HUGE_PAGE_SIZE * 2) == reinterpret_cast<void *>(-1)) {
+        PRINT_ERROR_ERRNO("Can't advance the break: [heap] is lost");
+        return;
+      }
+      m_new_break = vaddr_end_aligned;
+      heap_to_copy = m_end_address - m_start_address;
+    } else
+      heap_to_copy = vaddr_end_aligned - m_start_address;
+
+    DEBUG("Copy [heap] segment [%#lx,%#lx) -> [%p, %p)", m_start_address,
+          m_start_address + heap_to_copy, haddr_shifted,
+          haddr_shifted + heap_to_copy);
+    memcpy(haddr_shifted, reinterpret_cast<void *>(m_start_address),
+           heap_to_copy);
+
+    /* [heap] must be read/write accessible */
+    mem_protection |= PROT_READ | PROT_WRITE;
+  }
+  void set_new_break() const {
+    /* No overlap was detected, so skip this step */
+    if (m_new_break == 0) return;
+
+    /* Adjust the break to the edge of the last 'hugely' aligned data
+       segment */
+    if (brk(reinterpret_cast<void *>(m_new_break))) {
+      PRINT_ERROR_ERRNO("Setting the new [heap] break has failed");
+      return;
+    }
+    DEBUG("New [heap] break is set: %#lx", m_new_break);
+  }
+
+ private:
+  uint64_t m_start_address;
+  uint64_t m_end_address;
+  uint64_t m_new_break;
+
+ private:
+  bool parse_mmap_file() {
+    static const char s_heapname[] = "[heap]";
+    enum {
+      START_STOP_ADDRESS = 0,
+      PERMISSIONS = 1,
+      OFFSET = 2,
+      DEVICE = 3,
+      INODE = 4,
+      PATHNAME = 5,
+      COUNT
+    };
+
+    Buffered_file_reader reader("/proc/self/maps");
+    String_buffer line;
+    String_buffer columns[COUNT] = {};
+    while (reader.read_line(line)) {
+      line.split(columns, COUNT);
+
+      String_buffer &pathname = columns[PATHNAME];
+      if (pathname.m_len != sizeof(s_heapname) - 1 ||
+          strncmp(pathname.m_data, s_heapname, pathname.m_len) != 0)
+        continue;
+
+      String_buffer &address = columns[START_STOP_ADDRESS];
+      /* There's only one [heap] line; if it's damaged, just return */
+      if (address.empty()) return false;
+
+      /* The 'address' column looks like "12345-67890 " */
+      char *next;
+      m_start_address = strtoull(address.m_data, &next, 16);
+      m_end_address = strtoull(next + 1, NULL, 16);
+      return true;
+    }
+    return false;
+  }
+};
+
+class Huge_page_remapper {
+ public:
+  inline bool setup() {
+    return find_hugetlbfs_mount_point() &&
+           generate_mountpoint_filename_template();
+  }
+
+  /**
+    Remaps the segment 'seg' to 2MB huge pages and adjusts the [heap]
+    segment if needed.
+
+    @param [in] seg   ELF segment. Must be set.
+    @param [in] heap  [heap] segment descriptor. May be NULL.
+    @return the number of consumed huge pages.
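+
+    A typical call sequence (a sketch mirroring do_remap() below); the last
+    LOAD segment is remapped together with the heap descriptor so that an
+    overlapping [heap] can be rescued:
+
+      Heap_rescue heap;
+      remapper.remap(segs.segments[last], &heap);
+      heap.set_new_break();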
+  */
+  uint64_t remap(const Elf_segment &seg, Heap_rescue *heap) const {
+    const uint64_t vaddr_start_aligned = align_me_down(seg.m_vaddr);
+    const uint64_t vaddr_end_aligned = align_me_up(seg.m_vaddr + seg.m_memsz);
+    const size_t hsize = vaddr_end_aligned - vaddr_start_aligned;
+
+    /* Each call must create a new file at the mount point => copy the
+       template */
+    char mp[PATH_MAX];
+    memcpy(mp, m_curr_mount_point, m_mp_size);
+
+    int fd = mkstemp64(mp);
+    if (fd < 0) {
+      PRINT_ERROR_ERRNO("Can't open '%s'", mp);
+      return 0;
+    }
+    unlink(mp);
+    DEBUG("Hugepage file created: '%s'", mp);
+
+    char *haddr = static_cast<char *>(
+        mmap(NULL, hsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+    if (haddr == MAP_FAILED) {
+      PRINT_ERROR_ERRNO("Can't mmap fd = %d, size = %lu", fd, hsize);
+      return 0;
+    }
+
+    /*
+      In the case of position-independent code, the start virtual address
+      isn't aligned to a 2MB boundary by the kernel, so copy our segment
+      accurately.
+    */
+    char *haddr_shifted = haddr + (seg.m_vaddr - vaddr_start_aligned);
+    uint64_t seg_end = seg.m_vaddr + seg.m_memsz;
+    DEBUG("Text segment copy [%#lx, %#lx) -> [%p [%p %p) %p)", seg.m_vaddr,
+          seg_end, haddr, haddr_shifted, haddr_shifted + seg.m_memsz,
+          haddr + hsize);
+    memcpy(haddr_shifted, reinterpret_cast<void *>(seg.m_vaddr), seg.m_memsz);
+
+    /* Does the current segment overlap with [heap]? */
+    uint64_t mem_protection = seg.m_flags;
+    if (heap)
+      heap->copy(haddr, seg_end, mem_protection, vaddr_start_aligned,
+                 vaddr_end_aligned);
+
+    DEBUG("Unmapping [%p %p)", haddr, haddr + hsize);
+    munmap(haddr, hsize);
+
+    /*
+      The final mmap with MAP_FIXED to the existing virtual address; any
+      already existing and overlapping virtual memory regions are silently
+      unmapped by the kernel.
+
+      Using MAP_PRIVATE is the correct, classic usage of code/data mappings
+      that the kernel expects. The side effect is an additional reservation
+      of huge pages made by the kernel. That's why the number of used pages
+      reported by the application log is smaller than the number of pages
+      reported by the kernel, by about 20% (depends on the exact Linux
+      kernel version). We could use MAP_PRIVATE | MAP_NORESERVE to prevent
+      the reservation; however, if the kernel can't find a free huge page
+      (e.g. after fork()), the application might get SIGSEGV.
+
+      Using MAP_SHARED prevents the additional huge page reservation too;
+      however, you must know that sharing the code/data mapping results in:
+      * fork() doesn't copy these mappings, so both child and parent
+        continue to use the same mappings, leading to sporadic SIGSEGV
+        crashes;
+      * gdb attach stops serving breakpoints;
+      * produced core dumps are corrupted, so gdb postmortem analysis is
+        broken.
+      These are the only effects we observed, but there could be more.
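+
+      The trade-off in short:
+        MAP_PRIVATE                 correct semantics, extra reservation
+                                    (the variant used here);
+        MAP_PRIVATE | MAP_NORESERVE no extra reservation, possible SIGSEGV
+                                    when no free huge page can be found;
+        MAP_SHARED                  no extra reservation, breaks fork(),
+                                    gdb breakpoints and core dumps.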
+    */
+    haddr = static_cast<char *>(
+        mmap(reinterpret_cast<void *>(vaddr_start_aligned), hsize,
+             mem_protection, MAP_PRIVATE | MAP_FIXED, fd, 0));
+    if (haddr == MAP_FAILED) {
+      PRINT_ERROR_ERRNO("Can't mmap '%s' -> [%#lx %#lx)", mp,
+                        vaddr_start_aligned, vaddr_end_aligned);
+      close(fd);
+      return 0;
+    }
+    /* After mmap has succeeded it's safe to close the file descriptor */
+    close(fd);
+    DEBUG("Remapping succeeded --> [%p %p)", haddr, haddr + hsize);
+    return hsize / HUGE_PAGE_SIZE;
+  }
+
+ private:
+  size_t m_mp_size;
+  char m_curr_mount_point[PATH_MAX];
+
+ private:
+  bool check_mount_point(String_buffer &mount_point) {
+#define HUGETLBFS_MAGIC 0x958458f6
+
+    /* Check that it's really hugetlbfs */
+    struct statfs64 sfs;
+    if (statfs64(mount_point.m_data, &sfs) == -1) return false;
+    if (sfs.f_type != HUGETLBFS_MAGIC) {
+      PRINT_ERROR("'%s': Not a hugetlbfs filesystem", mount_point.m_data);
+      return false;
+    }
+    if (sfs.f_bsize != HUGE_PAGE_SIZE) {
+      PRINT_ERROR("'%s': Huge page size isn't 2MB", mount_point.m_data);
+      return false;
+    }
+
+    /* Check that the process has appropriate access */
+    if (access(mount_point.m_data, (R_OK | W_OK | X_OK)) != 0) {
+      PRINT_ERROR("'%s': the process doesn't have 'rwx' access",
+                  mount_point.m_data);
+      return false;
+    }
+    return true;
+
+#undef HUGETLBFS_MAGIC
+  }
+  bool process_mount_line(String_buffer &line) {
+    /*
+      Example of a line:
+      hugetlbfs /dev/hugepages2M hugetlbfs rw,relatime,pagesize=2M 0 0
+
+      Six columns in total; the last two are dummies, so we parse only the
+      first three fields.
+    */
+    static const char hugetlbfs[] = "hugetlbfs";
+    enum { DEVICE = 0, MOUNT_POINT = 1, FILESYSTEM_TYPE = 2, COUNT };
+
+    String_buffer columns[COUNT] = {};
+    line.split(columns, COUNT);
+
+    /* Is it hugetlbfs? */
+    String_buffer &fstype = columns[FILESYSTEM_TYPE];
+    if (sizeof(hugetlbfs) - 1 != fstype.m_len ||
+        strncmp(hugetlbfs, fstype.m_data, fstype.m_len) != 0)
+      return false;
+
+    String_buffer &mp = columns[MOUNT_POINT];
+    /* There's a space between mount_point and filesystem, so it's safe */
+    mp.m_data[mp.m_len++] = '\0';
+    DEBUG("Detected hugetlbfs filesystem at '%s'", mp.m_data);
+
+    /* Double-check that it's a correct hugetlbfs */
+    if (!check_mount_point(mp)) return false;
+
+    /* Check that the hugetlbfs path fits into our buffer */
+    if (mp.m_len > PATH_MAX) {
+      DEBUG("Mount point '%s' is too long: %lu", mp.m_data, mp.m_len);
+      return false;
+    }
+
+    /* Copy the hugetlbfs mount point path and size (with the final '\0') */
+    m_mp_size = mp.m_len;
+    memcpy(m_curr_mount_point, mp.m_data, mp.m_len);
+    return true;
+  }
+
+  bool find_hugetlbfs_mount_point() {
+    /*
+      Parsing kernel-created files:
+      1) /proc/mounts -> /proc/self/mounts
+      2) /etc/mtab
+    */
+    const char *filename = "/proc/mounts";
+    int fd = open(filename, O_RDONLY);
+    if (fd == -1) {
+      PRINT_ERROR_ERRNO("Can't open file %s", filename);
+
+      filename = "/etc/mtab";
+      if ((fd = open(filename, O_RDONLY)) == -1) {
+        PRINT_ERROR_ERRNO("Can't open file %s either", filename);
+        return false;
+      }
+    }
+
+    /* Search for a mounted hugetlbfs filesystem line by line */
+
+    String_buffer line;
+    Buffered_file_reader reader(fd);
+    while (reader.read_line(line))
+      if (process_mount_line(line)) return true;
+    return false;
+  }
+
+  bool generate_mountpoint_filename_template() {
+    /* Make a template for mkstemp64(): curr_mount_point + '/' + "XXXXXX" */
+
+    char *dst = m_curr_mount_point + m_mp_size - 1;
+    *dst++ = '/'; /* instead of '\0' */
+
+    m_mp_size += 7; /* "XXXXXX" with the final '\0' */
+    if (m_mp_size > PATH_MAX) return false;
+    memcpy(dst, "XXXXXX", 7); /* including '\0' */
+
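+    /*
+      The buffer now holds "<mount point>/XXXXXX", e.g.
+      "/dev/hugepages2M/XXXXXX" (the path is illustrative); mkstemp64()
+      replaces the X's with a unique suffix.
+    */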
DEBUG("Mount point is generated: '%s'", m_curr_mount_point); + return true; + } +}; + +#if DEBUG_LEVEL > 1 +/* + For this verbose level some additional information is printed out: + 1) current segments (with addresses) due to ELF information + 2) current mappings of the process due to /proc/self/maps +*/ + +namespace Debug { + +/* buffer size must be at least 20 bytes */ +static const char *get_type(char *buffer, const Elf64_Word type) { +#ifndef PT_GNU_PROPERTY +#define PT_GNU_PROPERTY 0x6474e553 +#endif + + static const char *ptype[] = { + "null ", /* 0 */ + "load ", /* 1 */ + "dynamic ", /* 2 */ + "interp ", /* 3 */ + "note ", /* 4 */ + "shlib ", /* 5 */ + "phdr ", /* 6 */ + "tls ", /* 7 */ + }; + if (type < PT_NUM) return ptype[type]; + switch (type) { + case PT_GNU_EH_FRAME: + return "gnu_frame "; + case PT_GNU_STACK: + return "gnu_stack "; + case PT_GNU_RELRO: + return "gnu_relro "; + case PT_GNU_PROPERTY: + return "g_property"; + } + + sprintf(buffer, "0x%x", type); + return buffer; +} + +/* buffer size must be at least 4 bytes */ +static const char *get_flags(char *buffer, const Elf64_Word flags) { + strcpy(buffer, "---"); + if (flags & PF_R) buffer[0] = 'r'; + if (flags & PF_W) buffer[1] = 'w'; + if (flags & PF_X) buffer[2] = 'x'; + return buffer; +} + +static int print_elf_header(struct dl_phdr_info *info, size_t, void *) { + char flagsbuf[4], typebuf[20]; + DEBUG("\nObj: %0#14lx \"%s\"", info->dlpi_addr, info->dlpi_name); + for (Elf64_Half i = 0; i < info->dlpi_phnum; ++i) { + const ElfW(Phdr) *hdr = info->dlpi_phdr + i; + Elf64_Addr vaddr = info->dlpi_addr + hdr->p_vaddr; + DEBUG( + "%02x) type=%s vaddr=%0#14lx filesz=%-#10lxmemsz=%-#10lx" + "align=%-#10lxflags=%s", + i, get_type(typebuf, hdr->p_type), vaddr, hdr->p_filesz, hdr->p_memsz, + hdr->p_align, get_flags(flagsbuf, hdr->p_flags)); + } + return 0; /* please, call me again if next ELF segment is found */ +} + +static void print_file(const char *filename) { + DEBUG("\nPrinting file: \"%s\"", filename); + File_reader reader(filename); + if (!reader.is_ok()) return; + + static const size_t s_len = 4096; + char buffer[s_len]; + while (ssize_t r = reader.read(buffer, s_len)) + dump_buffer_to_stderr(buffer, r); + dump_buffer_to_stderr("\n", 1); +} + +} /* namespace Debug */ +#endif /* IF DEBUG_LEVEL > 1 */ + +static uint64_t do_remap() { + DEBUG("Remapping .text and .data ELF segments to huge pages"); + + Huge_page_remapper remapper; + if (!remapper.setup()) { + PRINT_ERROR("hugetlbfs is not ready, continue with default pages"); + return 0; + } + + Elf_segments segs; + dl_iterate_phdr(parse_elf_file, &segs); + + if (segs.empty()) { + PRINT_ERROR("Haven't found any ELF segments"); + return 0; + } + + if (segs.is_overlap()) { + PRINT_ERROR("huge pages overlap, continue with default pages"); + return 0; + } + +#if DEBUG_LEVEL > 1 + /* Print out all ELF segments and memory regions before remapping */ + dl_iterate_phdr(Debug::print_elf_header, NULL); + DEBUG("\nBEFORE REMAPPING"); + Debug::print_file("/proc/self/maps"); +#endif + + Heap_rescue heap; + size_t i = 0, segsize = segs.size() - 1; + uint64_t hp_count = 0, curr_hp = 0; + while (i < segsize) { + if (!(curr_hp = remapper.remap(segs.segments[i], NULL))) return 0; + hp_count += curr_hp; + ++i; + } + /* + In Linux [heap] always follows the LOAD segments, so check only the final + segment for overlapping. 
+  */
+  if (!(curr_hp = remapper.remap(segs.segments[i], &heap))) return 0;
+  hp_count += curr_hp;
+  heap.set_new_break();
+
+  DEBUG("Hugepages consumed: %lu\n", hp_count);
+
+#if DEBUG_LEVEL > 1
+  /* Print the memory regions after the remapping */
+  DEBUG("\nAFTER REMAPPING");
+  Debug::print_file("/proc/self/maps");
+#endif
+
+  /* Everything is fine */
+  return hp_count;
+}
+} /* namespace Huge_page_remapping */
+
+/**
+  Remaps the .text/.data segments of the host program to 2MB huge pages.
+
+  This function does the following things:
+  - finds the hugetlbfs mount point;
+  - checks that the mount point has 2MB huge pages;
+  - checks that the mount point has rwx rights;
+  - uses the hugetlbfs mount point to remap the .text/.data segments;
+  - if any segment overlaps with [heap], takes additional actions to save
+    [heap] from being unmapped by the kernel.
+
+  @param [in] err_log   A buffer for error messages. May be NULL.
+  @param [in] err_size  The size of the buffer for error messages.
+  @return the number of huge pages consumed on success, or 0 on failure.
+
+  If the worst comes to the worst, the function falls back to the default
+  OS pages.
+*/
+
+uint64_t remap_text_and_data_segments_to_huge_pages(char *err_log,
+                                                    size_t err_size) {
+  Huge_page_remapping::Setup_memory_logger setup(err_log, err_size);
+  return Huge_page_remapping::do_remap();
+}
+
+/**
+  Remaps the .text/.data segments of the host program to 2MB huge pages.
+
+  This is a special wrapper which feeds error messages to the user-provided
+  hook.
+  @param [in] error_hook  A callback invoked once per error message.
+  @return the number of huge pages consumed on success, or 0 on failure.
+*/
+
+uint64_t remap_text_and_data_segments_to_huge_pages(
+    void (*error_hook)(const char *error_msg)) {
+  char err_log[4096];
+
+  /* The first byte indicates whether the log is empty or not */
+  *err_log = 0;
+
+  /* No explicit or implicit heap allocations are allowed here */
+  uint64_t pages_consumed =
+      remap_text_and_data_segments_to_huge_pages(err_log, sizeof(err_log));
+  if (!err_log[0]) return pages_consumed;
+
+  /*
+    After the work is done we can use heap allocations again, so the
+    user-provided hook isn't dangerous now.
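+
+    A hypothetical caller (this mirrors the use in mysqld.cc):
+
+      remap_text_and_data_segments_to_huge_pages(
+          [](const char *msg) { sql_print_error("%s", msg); });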
+ */ + char *saveptr, *line = strtok_r(err_log, "\n", &saveptr); + do { + error_hook(line); + } while ((line = strtok_r(NULL, "\n", &saveptr))); + return pages_consumed; +} diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 682e8d5ae13..d53dacd904c 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -1314,6 +1314,9 @@ bool opt_large_pages = false; bool opt_super_large_pages = false; bool opt_myisam_use_mmap = false; std::atomic offline_mode; +#ifdef __linux__ +ulong opt_large_pages_for_code; +#endif uint opt_large_page_size = 0; uint default_password_lifetime = 0; bool password_require_current = false; @@ -1936,6 +1939,11 @@ ulong opt_ssl_fips_mode = SSL_FIPS_MODE_OFF; /* Function declarations */ +#ifdef __linux__ +extern uint64 remap_text_and_data_segments_to_huge_pages( + void (*error_hook)(const char *error_msg)); +#endif + static int mysql_init_variables(); static int get_options(int *argc_ptr, char ***argv_ptr); static void add_terminator(vector *options); @@ -4721,6 +4729,33 @@ int init_common_variables() { DBUG_PRINT("info", ("%s Ver %s for %s on %s\n", my_progname, server_version, SYSTEM_TYPE, MACHINE_TYPE)); +#ifdef __linux__ + /* + opt_large_parges_for_code values: + 0 = OFF (turned off) + 1 = ON (turned on, if fails, roll back to the default pages) + 2 = ENFORCE (turned on, if fails, gracefully exit) + */ + if (opt_large_pages_for_code) { + uint64 huge_pages_consumed = + remap_text_and_data_segments_to_huge_pages([](const char *error_msg) { + sql_print_error("%s", error_msg); + }); + + if (huge_pages_consumed) + sql_print_information( + "Successfully remapped mysqld code and data to " + "huge pages (pages consumed: %llu)", + huge_pages_consumed); + else { + sql_print_error("Failed to remap mysqld code and data to huge pages"); + if (opt_large_pages_for_code == 2) return 1; + /* If no success, notify user via system variable */ + opt_large_pages_for_code = 0; + } + } +#endif + #ifdef HAVE_LINUX_LARGE_PAGES /* Initialize large page size */ if (opt_large_pages && (opt_large_page_size = my_get_large_page_size())) { diff --git a/sql/mysqld.h b/sql/mysqld.h index d43bf6718d6..69fc36d89ca 100644 --- a/sql/mysqld.h +++ b/sql/mysqld.h @@ -628,6 +628,9 @@ extern struct st_VioSSLFd *ssl_acceptor_fd; extern bool opt_large_pages; extern uint opt_large_page_size; +#ifdef __linux__ +extern ulong opt_large_pages_for_code; +#endif extern char lc_messages_dir[FN_REFLEN]; extern char *lc_messages_dir_ptr; extern const char *log_error_dest; diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 5b1a82d2f3f..86e1941cc10 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -2206,6 +2206,14 @@ static Sys_var_bool Sys_large_pages("large_pages", IF_WIN(NO_CMD_LINE, CMD_LINE(OPT_ARG)), DEFAULT(false)); +#ifdef __linux__ +static const char *large_pages_for_code_names[] = {"OFF", "ON", "ENFORCE", 0}; +static Sys_var_enum Sys_large_pages_for_code( + "large_pages_for_code", "Remap .text/.data to 2mb huge pages", + READ_ONLY GLOBAL_VAR(opt_large_pages_for_code), CMD_LINE(OPT_ARG), + large_pages_for_code_names, DEFAULT(0)); +#endif + static Sys_var_charptr Sys_language( "lc_messages_dir", "Directory where error messages are", READ_ONLY NON_PERSIST GLOBAL_VAR(lc_messages_dir_ptr),