From 8c13a3347e98871d48874d716bd0d83a20114849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Gagn=C3=A9?= <8471576+jfg956@users.noreply.github.com> Date: Tue, 3 Sep 2024 11:09:08 -0400 Subject: [PATCH] Testing Patch for Bug#115988 - Too Much Disk Read on Startup, penalizing deployments with many tables (1M+). --- storage/innobase/fil/fil0fil.cc | 204 +++++++++++++++++++++----- storage/innobase/handler/ha_innodb.cc | 20 +++ storage/innobase/include/srv0srv.h | 6 + storage/innobase/srv/srv0srv.cc | 6 + 4 files changed, 200 insertions(+), 36 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index a4819965c3a8..4c84ccd62705 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1619,13 +1619,20 @@ class Fil_system { mapping table. */ dberr_t scan() { return m_dirs.scan(); } - /** Get the tablespace ID from an .ibd and/or an undo tablespace. If the ID is - 0 on the first page then try finding the ID with Datafile::find_space_id(). + /* Below, I removed part of the comment which was an implementation detail and IMHO added no value here. + * I left this comment for clarity of the patch, it can be removed when merging. */ + /** Get the tablespace ID from an .ibd and/or an undo tablespace. @param[in] filename File name to check @return s_invalid_space_id if not found, otherwise the space ID */ [[nodiscard]] static space_id_t get_tablespace_id( const std::string &filename); + /* Three functions below called by above. */ + [[nodiscard]] static space_id_t get_tablespace_id(FILE *fp); + [[nodiscard]] static space_id_t get_tablespace_id_light(FILE *fp); + [[nodiscard]] static space_id_t get_tablespace_id_heavy_duty( + const std::string &filename); + /** Fil_shard by space ID. @param[in] space_id Tablespace ID @return reference to the shard */ @@ -10897,8 +10904,9 @@ static bool fil_op_replay_rename(const page_id_t &page_id, return true; } -/** Get the tablespace ID from an .ibd and/or an undo tablespace. If the ID is 0 -on the first page then try finding the ID with Datafile::find_space_id(). +/* Below, I removed part of the comment which was an implementation detail and IMHO added no value here. + * I left this comment for clarity of the patch, it can be removed when merging. */ +/** Get the tablespace ID from an .ibd and/or an undo tablespace. @param[in] filename File name to check @return s_invalid_space_id if not found, otherwise the space ID */ space_id_t Fil_system::get_tablespace_id(const std::string &filename) { @@ -10909,37 +10917,151 @@ space_id_t Fil_system::get_tablespace_id(const std::string &filename) { return dict_sys_t::s_invalid_space_id; } - std::vector space_ids; - auto page_size = srv_page_size; + space_id_t space_id; + + /* Try the lighter methods first. + * Even if / when we remove srv_tablespace_startup_testing_light, we still + * need the non-light implementation in case MAX_PAGES_TO_READ is + * changed to something greater than 1. */ + if (MAX_PAGES_TO_READ == 1 && srv_tablespace_startup_testing_light) { + space_id = Fil_system::get_tablespace_id_light(fp); + } else { + space_id = Fil_system::get_tablespace_id(fp); + } + + fclose(fp); + + /* Try the more heavy duty method, as a last resort. */ + if (space_id == UINT32_UNDEFINED) { + /* If the first page cannot be read properly, then for compressed + tablespaces we don't know where the page boundary starts because + we don't know the page size. */ + space_id = Fil_system::get_tablespace_id_heavy_duty(filename); + } + + return space_id; +} + +space_id_t Fil_system::get_tablespace_id_light(FILE *fp) { + ut_a(MAX_PAGES_TO_READ == 1); + +#ifdef POSIX_FADV_RANDOM + /* Tell the kernel to not prefetch. + * If we do not do this, it might decide to read more + * and we definitely do not need more nor do we want to waste io resources + * because this function is called for each table on startup which can be a lot. + * Without this, my / JFG tests showed that reading 4 bytes below still fetches + * 16 kb from disk instead of 4k, which is the xfs block size. */ + posix_fadvise(fileno(fp), 0, srv_page_size, POSIX_FADV_RANDOM); +#endif /* POSIX_FADV_RANDOM */ + +#ifdef _IONBF + /* Without this, we will probably have BUFSIZ = 8192. + * Without this, my / JFG tests showed that we read a more than 4 kb. */ + setvbuf(fp, nullptr, _IONBF, 0); +#endif /* _IONBF */ + + /* For MAX_PAGES_TO_READ == 1, the "not-light" function reads srv_page_size. + * This "light" implementation reads less, but would not validate that + * the file size is at least one page. + * To keep the same behavior, we do below. + * There probably is a better way to get the size of a file than the + * hack below, but below allows doing it "just" with a FILE*, + * which is what we have at our disposal, so acceptable / good enough. */ + auto err1 = fseek(fp, 0, SEEK_END); + long file_size = ftell(fp); + if (err1 || file_size < (long)srv_page_size) { + return UINT32_UNDEFINED; + } + + byte buf[4]; + auto err2 = fseek(fp, FIL_PAGE_SPACE_ID, SEEK_SET); + auto nb_bytes = fread(buf, 1, 4, fp); + if (err2 || nb_bytes != 4) { + return UINT32_UNDEFINED; + } + +#ifdef POSIX_FADV_DONTNEED + posix_fadvise(fileno(fp), 0, srv_page_size, POSIX_FADV_DONTNEED); +#endif /* POSIX_FADV_DONTNEED */ + + DBUG_EXECUTE_IF("invalid_header", return UINT32_UNDEFINED;); + + space_id_t space_id = mach_read_from_4(buf); + + return space_id == 0 ? UINT32_UNDEFINED : space_id; +} - space_ids.reserve(MAX_PAGES_TO_READ); +space_id_t Fil_system::get_tablespace_id(FILE *fp) { + auto page_size = srv_page_size; const auto n_bytes = page_size * MAX_PAGES_TO_READ; + /* TODO... + * This buffer is allocated for each call of this function, + * which can be a lot as it is called for each table on startup. + * I / JFG tested allocating this buffer only once per thread doing duplicate scan, + * and I did not see a significant reduction in startup time, + * but I guess there might be some reduction in CPU usage. + * Leaving this to someone else to optimize as I think the light version + * of this function is good enough for now. */ std::unique_ptr buf(new byte[n_bytes]); if (!buf) { return dict_sys_t::s_invalid_space_id; } - auto pages_read = fread(buf.get(), page_size, MAX_PAGES_TO_READ, fp); + /* Let's avoid many calls to buf.get(). + * The compiler probably cannot do this, + * as it cannot know the result is the same for all calls. + * I left this comment for clarity of the patch, it can be removed when merging. */ + byte *pbuf = buf.get(); + +#ifdef POSIX_FADV_RANDOM + if (srv_tablespace_startup_testing_fadvise) { + /* Tell the kernel to not prefetch. + * If we do not do this, it might decide to read more + * and we definitely do not need more nor do we want to waste io resources + * because this function is called for each table on startup which can be a lot. */ + posix_fadvise(fileno(fp), 0, page_size * MAX_PAGES_TO_READ, POSIX_FADV_RANDOM); + } +#endif /* POSIX_FADV_RANDOM */ + + auto pages_read = fread(pbuf, page_size, MAX_PAGES_TO_READ, fp); DBUG_EXECUTE_IF("invalid_header", pages_read = 0;); /* Find the space id from the pages read if enough pages could be read. Fall back to the more heavier method of finding the space id from Datafile::find_space_id() if pages cannot be read properly. */ - if (pages_read >= MAX_PAGES_TO_READ) { + /* I / JFG had a visceral reaction to this large block and was compelled to improve things. + * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10932 + * It might have made sense to not early return for closing the FILE*, + * but with the close now being done in the caller and IMHO, + * a quick return makes the code easier to read and understand. + * I left this comment for clarity of the patch, it can be removed when merging. */ + if (pages_read < MAX_PAGES_TO_READ) { + return UINT32_UNDEFINED; + } + + /* Some of below are poorly indented to make the diff easier to understand. + * It can be reformatted when merging, maybe in a different merge commit. + * I left this comment for clarity of the patch, it can be removed when merging. */ + auto bytes_read = pages_read * page_size; #ifdef POSIX_FADV_DONTNEED posix_fadvise(fileno(fp), 0, bytes_read, POSIX_FADV_DONTNEED); #endif /* POSIX_FADV_DONTNEED */ - for (page_no_t i = 0; i < MAX_PAGES_TO_READ; ++i) { - const auto off = i * page_size + FIL_PAGE_SPACE_ID; + /* I / JFG had a visceral reaction to the "if" executed only in 1st iteration + * of the loop and was compelled to improve things. + * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10942 + * By doing this before the loop, the loop body is much smaller, so IMHO easier to understand. + * I left this comment for clarity of the patch, it can be removed when merging. */ + { + /* This is in a block to limit the scope of variables not needed later. */ - if (off == FIL_PAGE_SPACE_ID) { /* Find out the page size of the tablespace from the first page. In case of compressed pages, the subsequent pages can be of different sizes. If MAX_PAGES_TO_READ is changed to a different value, then the @@ -10950,44 +11072,55 @@ space_id_t Fil_system::get_tablespace_id(const std::string &filename) { ut_a(space_flags_offset + 4 < n_bytes); - const auto flags = mach_read_from_4(buf.get() + space_flags_offset); + const auto flags = mach_read_from_4(pbuf + space_flags_offset); page_size_t space_page_size(flags); page_size = space_page_size.physical(); } - space_ids.push_back(mach_read_from_4(buf.get() + off)); + /* I / JFG had a visceral reaction to the 2nd loop and was compelled to improve things. + * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10975 + * In addition to saving a loop, this allows removing space_ids (std::vector), + * saving the corresponding memory allocation. + * I left this comment for clarity of the patch, it can be removed when merging. */ + ut_a(FIL_PAGE_SPACE_ID + 4 < n_bytes); - if ((i + 1) * page_size >= bytes_read) { - break; - } - } + space_id_t space_id = mach_read_from_4(pbuf + FIL_PAGE_SPACE_ID); + if (space_id == 0) { + return UINT32_UNDEFINED; } - fclose(fp); - - space_id_t space_id; + /* I / JFG had a visceral reaction to if/break in the loop and was compelled to improve things. + * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10962 + * By putting it in the loop condition, the loop body is much smaller, so IMHO easier to understand. + * Also, the refactoring / unrolling of the 1st iteration of the loop in above needs this. + * If for troubleshooting / debugging with gdb, it is suitable to have this in the loop, feel free to put back, + * but at the beginning of the loop, not at the end, and with (i) instead of (i+1) because of unrolling. + * I left this comment for clarity of the patch, it can be removed when merging. */ + for (page_no_t i = 1; i < MAX_PAGES_TO_READ && i * page_size < bytes_read; ++i) { + const auto off = i * page_size + FIL_PAGE_SPACE_ID; - if (!space_ids.empty()) { - space_id = space_ids.front(); + /* This assertion was missing (in reference to the other about space_flags_offset). + * I left this comment for clarity of the patch, it can be removed when merging. */ + ut_a(off + 4 < n_bytes); - for (auto id : space_ids) { - if (id == 0 || space_id != id) { - space_id = UINT32_UNDEFINED; + space_id_t current_space_id = mach_read_from_4(pbuf + off); - break; - } + if (current_space_id != space_id) { + return UINT32_UNDEFINED; } - } else { - space_id = UINT32_UNDEFINED; } - /* Try the more heavy duty method, as a last resort. */ - if (space_id == UINT32_UNDEFINED) { - /* If the first page cannot be read properly, then for compressed - tablespaces we don't know where the page boundary starts because - we don't know the page size. */ + return space_id; +} + +space_id_t Fil_system::get_tablespace_id_heavy_duty(const std::string &filename) { + space_id_t space_id = UINT32_UNDEFINED; + + /* Below is poorly indented to make the diff easier to understand. + * It can be reformatted when merging, maybe in a different merge commit, + * and this comment should then be removed. */ Datafile file; @@ -11007,7 +11140,6 @@ space_id_t Fil_system::get_tablespace_id(const std::string &filename) { } file.close(); - } return space_id; } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 2ad58f17f251..0d35491f6db0 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -23367,6 +23367,24 @@ static MYSQL_SYSVAR_STR(directories, srv_innodb_directories, "'innodb-data-home-dir;innodb-undo-directory;datadir'", nullptr, nullptr, nullptr); +/* TODO: remove once testing done. */ +static MYSQL_SYSVAR_BOOL( + tablespace_startup_testing_fadvise, srv_tablespace_startup_testing_fadvise, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOPERSIST, + "For startup fadvise tests, will be removed eventually", + nullptr, /* check */ + nullptr, /* update */ + false /* def */); + +/* TODO: remove once testing done. */ +static MYSQL_SYSVAR_BOOL( + tablespace_startup_testing_light, srv_tablespace_startup_testing_light, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOPERSIST, + "For startup light tests, will be removed eventually", + nullptr, /* check */ + nullptr, /* update */ + false /* def */); + #ifdef UNIV_DEBUG /** Use this variable innodb_interpreter to execute debug code within InnoDB. The output is stored in the innodb_interpreter_output variable. */ @@ -23525,6 +23543,8 @@ static SYS_VAR *innobase_system_variables[] = { MYSQL_SYSVAR(sort_buffer_size), MYSQL_SYSVAR(online_alter_log_max_size), MYSQL_SYSVAR(directories), + MYSQL_SYSVAR(tablespace_startup_testing_fadvise), + MYSQL_SYSVAR(tablespace_startup_testing_light), MYSQL_SYSVAR(sync_spin_loops), MYSQL_SYSVAR(spin_wait_delay), MYSQL_SYSVAR(spin_wait_pause_multiplier), diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index b6bf10810e00..3b6e97b681df 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -400,6 +400,12 @@ extern bool srv_numa_interleave; deliminated by ';', i.e the FIL_PATH_SEPARATOR. */ extern char *srv_innodb_directories; +/* TODO: remove once testing done. */ +/* Documented in storage/innobase/handler/ha_innodb.cc. */ +/* It looks pointless to duplicate comments, if needed, feel free to do it when merging. */ +extern bool srv_tablespace_startup_testing_fadvise; +extern bool srv_tablespace_startup_testing_light; + /** Server undo tablespaces directory, can be absolute path. */ extern char *srv_undo_dir; diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index a614e8b64503..d4e16ac3638d 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -146,6 +146,12 @@ char *srv_doublewrite_dir = nullptr; deliminated by ';', i.e the FIL_PATH_SEPARATOR. */ char *srv_innodb_directories = nullptr; +/* TODO: remove once testing done. */ +/* Documented in storage/innobase/handler/ha_innodb.cc. */ +/* It looks pointless to duplicate comments, if needed, feel free to fo it when merging. */ +bool srv_tablespace_startup_testing_fadvise; +bool srv_tablespace_startup_testing_light; + /** Number of threads spawned for initializing rollback segments in parallel */ uint32_t srv_rseg_init_threads = 1;