From 8c13a3347e98871d48874d716bd0d83a20114849 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-Fran=C3=A7ois=20Gagn=C3=A9?=
 <8471576+jfg956@users.noreply.github.com>
Date: Tue, 3 Sep 2024 11:09:08 -0400
Subject: [PATCH] Testing Patch for Bug#115988 - Too Much Disk Read on Startup,
 penalizing deployments with many tables (1M+).

---
 storage/innobase/fil/fil0fil.cc       | 204 +++++++++++++++++++++-----
 storage/innobase/handler/ha_innodb.cc |  20 +++
 storage/innobase/include/srv0srv.h    |   6 +
 storage/innobase/srv/srv0srv.cc       |   6 +
 4 files changed, 200 insertions(+), 36 deletions(-)

diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index a4819965c3a8..4c84ccd62705 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -1619,13 +1619,20 @@ class Fil_system {
   mapping table. */
   dberr_t scan() { return m_dirs.scan(); }
 
-  /** Get the tablespace ID from an .ibd and/or an undo tablespace. If the ID is
-  0 on the first page then try finding the ID with Datafile::find_space_id().
+  /* Below, I removed part of the comment which was an implementation detail and IMHO added no value here.
+   * I left this comment for clarity of the patch, it can be removed when merging. */
+  /** Get the tablespace ID from an .ibd and/or an undo tablespace.
   @param[in]    filename        File name to check
   @return s_invalid_space_id if not found, otherwise the space ID */
   [[nodiscard]] static space_id_t get_tablespace_id(
       const std::string &filename);
 
+  /* Three functions below called by above. */
+  [[nodiscard]] static space_id_t get_tablespace_id(FILE *fp);
+  [[nodiscard]] static space_id_t get_tablespace_id_light(FILE *fp);
+  [[nodiscard]] static space_id_t get_tablespace_id_heavy_duty(
+    const std::string &filename);
+
   /** Fil_shard by space ID.
   @param[in]    space_id        Tablespace ID
   @return reference to the shard */
@@ -10897,8 +10904,9 @@ static bool fil_op_replay_rename(const page_id_t &page_id,
   return true;
 }
 
-/** Get the tablespace ID from an .ibd and/or an undo tablespace. If the ID is 0
-on the first page then try finding the ID with Datafile::find_space_id().
+/* Below, I removed part of the comment which was an implementation detail and IMHO added no value here.
+ * I left this comment for clarity of the patch, it can be removed when merging. */
+/** Get the tablespace ID from an .ibd and/or an undo tablespace.
 @param[in]      filename        File name to check
 @return s_invalid_space_id if not found, otherwise the space ID */
 space_id_t Fil_system::get_tablespace_id(const std::string &filename) {
@@ -10909,37 +10917,151 @@ space_id_t Fil_system::get_tablespace_id(const std::string &filename) {
     return dict_sys_t::s_invalid_space_id;
   }
 
-  std::vector<space_id_t> space_ids;
-  auto page_size = srv_page_size;
+  space_id_t space_id;
+
+  /* Try the lighter methods first.
+   * Even if / when we remove srv_tablespace_startup_testing_light, we still
+   *   need the non-light implementation in case MAX_PAGES_TO_READ is
+   *   changed to something greater than 1. */
+  if (MAX_PAGES_TO_READ == 1 && srv_tablespace_startup_testing_light) {
+    space_id = Fil_system::get_tablespace_id_light(fp);
+  } else {
+    space_id = Fil_system::get_tablespace_id(fp);
+  }
+
+  fclose(fp);
+
+  /* Try the more heavy duty method, as a last resort. */
+  if (space_id == UINT32_UNDEFINED) {
+    /* If the first page cannot be read properly, then for compressed
+    tablespaces we don't know where the page boundary starts because
+    we don't know the page size. */
+    space_id = Fil_system::get_tablespace_id_heavy_duty(filename);
+  }
+
+  return space_id;
+}
+
+space_id_t Fil_system::get_tablespace_id_light(FILE *fp) {
+  ut_a(MAX_PAGES_TO_READ == 1);
+
+#ifdef POSIX_FADV_RANDOM
+  /* Tell the kernel to not prefetch.
+   * If we do not do this, it might decide to read more
+   *   and we definitely do not need more nor do we want to waste io resources
+   *   because this function is called for each table on startup which can be a lot.
+   * Without this, my / JFG tests showed that reading 4 bytes below still fetches
+   *   16 kb from disk instead of 4k, which is the xfs block size. */
+  posix_fadvise(fileno(fp), 0, srv_page_size, POSIX_FADV_RANDOM);
+#endif /* POSIX_FADV_RANDOM */
+
+#ifdef _IONBF
+  /* Without this, we will probably have BUFSIZ = 8192.
+   * Without this, my / JFG tests showed that we read a more than 4 kb. */
+  setvbuf(fp, nullptr, _IONBF, 0);
+#endif /* _IONBF */
+
+  /* For MAX_PAGES_TO_READ == 1, the "not-light" function reads srv_page_size.
+   * This "light" implementation reads less, but would not validate that
+   *   the file size is at least one page.
+   * To keep the same behavior, we do below.
+   * There probably is a better way to get the size of a file than the
+   *   hack below, but below allows doing it "just" with a FILE*,
+   *   which is what we have at our disposal, so acceptable / good enough. */
+  auto err1 = fseek(fp, 0, SEEK_END);
+  long file_size = ftell(fp);
+  if (err1 || file_size < (long)srv_page_size) {
+    return UINT32_UNDEFINED;
+  }
+
+  byte buf[4];
+  auto err2 = fseek(fp, FIL_PAGE_SPACE_ID, SEEK_SET);
+  auto nb_bytes = fread(buf, 1, 4, fp);
+  if (err2 || nb_bytes != 4) {
+    return UINT32_UNDEFINED;
+  }
+
+#ifdef POSIX_FADV_DONTNEED
+  posix_fadvise(fileno(fp), 0, srv_page_size, POSIX_FADV_DONTNEED);
+#endif /* POSIX_FADV_DONTNEED */
+
+  DBUG_EXECUTE_IF("invalid_header", return UINT32_UNDEFINED;);
+
+  space_id_t space_id = mach_read_from_4(buf);
+
+  return space_id == 0 ? UINT32_UNDEFINED : space_id;
+}
 
-  space_ids.reserve(MAX_PAGES_TO_READ);
+space_id_t Fil_system::get_tablespace_id(FILE *fp) {
+  auto page_size = srv_page_size;
 
   const auto n_bytes = page_size * MAX_PAGES_TO_READ;
 
+  /* TODO...
+   * This buffer is allocated for each call of this function,
+   *   which can be a lot as it is called for each table on startup.
+   * I / JFG tested allocating this buffer only once per thread doing duplicate scan,
+   *   and I did not see a significant reduction in startup time,
+   *   but I guess there might be some reduction in CPU usage.
+   * Leaving this to someone else to optimize as I think the light version
+   *   of this function is good enough for now. */
   std::unique_ptr<byte[]> buf(new byte[n_bytes]);
 
   if (!buf) {
     return dict_sys_t::s_invalid_space_id;
   }
 
-  auto pages_read = fread(buf.get(), page_size, MAX_PAGES_TO_READ, fp);
+  /* Let's avoid many calls to buf.get().
+   * The compiler probably cannot do this,
+   *   as it cannot know the result is the same for all calls.
+   * I left this comment for clarity of the patch, it can be removed when merging. */
+  byte *pbuf = buf.get();
+
+#ifdef POSIX_FADV_RANDOM
+  if (srv_tablespace_startup_testing_fadvise) {
+    /* Tell the kernel to not prefetch.
+     * If we do not do this, it might decide to read more
+     *   and we definitely do not need more nor do we want to waste io resources
+     *   because this function is called for each table on startup which can be a lot. */
+    posix_fadvise(fileno(fp), 0, page_size * MAX_PAGES_TO_READ, POSIX_FADV_RANDOM);
+  }
+#endif /* POSIX_FADV_RANDOM */
+
+  auto pages_read = fread(pbuf, page_size, MAX_PAGES_TO_READ, fp);
 
   DBUG_EXECUTE_IF("invalid_header", pages_read = 0;);
 
   /* Find the space id from the pages read if enough pages could be read.
   Fall back to the more heavier method of finding the space id from
   Datafile::find_space_id() if pages cannot be read properly. */
-  if (pages_read >= MAX_PAGES_TO_READ) {
+  /* I / JFG had a visceral reaction to this large block and was compelled to improve things.
+   * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10932
+   * It might have made sense to not early return for closing the FILE*,
+   *   but with the close now being done in the caller and IMHO,
+   *   a quick return makes the code easier to read and understand.
+   * I left this comment for clarity of the patch, it can be removed when merging. */
+  if (pages_read < MAX_PAGES_TO_READ) {
+    return UINT32_UNDEFINED;
+  }
+
+  /* Some of below are poorly indented to make the diff easier to understand.
+   * It can be reformatted when merging, maybe in a different merge commit.
+   * I left this comment for clarity of the patch, it can be removed when merging. */
+
     auto bytes_read = pages_read * page_size;
 
 #ifdef POSIX_FADV_DONTNEED
     posix_fadvise(fileno(fp), 0, bytes_read, POSIX_FADV_DONTNEED);
 #endif /* POSIX_FADV_DONTNEED */
 
-    for (page_no_t i = 0; i < MAX_PAGES_TO_READ; ++i) {
-      const auto off = i * page_size + FIL_PAGE_SPACE_ID;
+  /* I / JFG had a visceral reaction to the "if" executed only in 1st iteration
+   *   of the loop and was compelled to improve things.
+   * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10942
+   * By doing this before the loop, the loop body is much smaller, so IMHO easier to understand.
+   * I left this comment for clarity of the patch, it can be removed when merging. */
+      {
+        /* This is in a block to limit the scope of variables not needed later.  */
 
-      if (off == FIL_PAGE_SPACE_ID) {
         /* Find out the page size of the tablespace from the first page.
         In case of compressed pages, the subsequent pages can be of different
         sizes. If MAX_PAGES_TO_READ is changed to a different value, then the
@@ -10950,44 +11072,55 @@ space_id_t Fil_system::get_tablespace_id(const std::string &filename) {
 
         ut_a(space_flags_offset + 4 < n_bytes);
 
-        const auto flags = mach_read_from_4(buf.get() + space_flags_offset);
+        const auto flags = mach_read_from_4(pbuf + space_flags_offset);
 
         page_size_t space_page_size(flags);
 
         page_size = space_page_size.physical();
       }
 
-      space_ids.push_back(mach_read_from_4(buf.get() + off));
+  /* I / JFG had a visceral reaction to the 2nd loop and was compelled to improve things.
+   * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10975
+   * In addition to saving a loop, this allows removing space_ids (std::vector<space_id_t>),
+   *   saving the corresponding memory allocation.
+   * I left this comment for clarity of the patch, it can be removed when merging. */
+  ut_a(FIL_PAGE_SPACE_ID + 4 < n_bytes);
 
-      if ((i + 1) * page_size >= bytes_read) {
-        break;
-      }
-    }
+  space_id_t space_id = mach_read_from_4(pbuf + FIL_PAGE_SPACE_ID);
+  if (space_id == 0) {
+    return UINT32_UNDEFINED;
   }
 
-  fclose(fp);
-
-  space_id_t space_id;
+  /* I / JFG had a visceral reaction to if/break in the loop and was compelled to improve things.
+   * Link to previous code: https://github.com/jfg956/mysql-server/blob/mysql-9.0.1/storage/innobase/fil/fil0fil.cc#L10962
+   * By putting it in the loop condition, the loop body is much smaller, so IMHO easier to understand.
+   * Also, the refactoring / unrolling of the 1st iteration of the loop in above needs this.
+   * If for troubleshooting / debugging with gdb, it is suitable to have this in the loop, feel free to put back,
+   *   but at the beginning of the loop, not at the end, and with (i) instead of (i+1) because of unrolling.
+   * I left this comment for clarity of the patch, it can be removed when merging. */
+  for (page_no_t i = 1; i < MAX_PAGES_TO_READ && i * page_size < bytes_read; ++i) {
+    const auto off = i * page_size + FIL_PAGE_SPACE_ID;
 
-  if (!space_ids.empty()) {
-    space_id = space_ids.front();
+    /* This assertion was missing (in reference to the other about space_flags_offset).
+     * I left this comment for clarity of the patch, it can be removed when merging. */
+    ut_a(off + 4 < n_bytes);
 
-    for (auto id : space_ids) {
-      if (id == 0 || space_id != id) {
-        space_id = UINT32_UNDEFINED;
+    space_id_t current_space_id = mach_read_from_4(pbuf + off);
 
-        break;
-      }
+    if (current_space_id != space_id) {
+      return UINT32_UNDEFINED;
     }
-  } else {
-    space_id = UINT32_UNDEFINED;
   }
 
-  /* Try the more heavy duty method, as a last resort. */
-  if (space_id == UINT32_UNDEFINED) {
-    /* If the first page cannot be read properly, then for compressed
-    tablespaces we don't know where the page boundary starts because
-    we don't know the page size. */
+  return space_id;
+}
+
+space_id_t Fil_system::get_tablespace_id_heavy_duty(const std::string &filename) {
+  space_id_t space_id = UINT32_UNDEFINED;
+
+  /* Below is poorly indented to make the diff easier to understand.
+   * It can be reformatted when merging, maybe in a different merge commit,
+   *   and this comment should then be removed. */
 
     Datafile file;
 
@@ -11007,7 +11140,6 @@ space_id_t Fil_system::get_tablespace_id(const std::string &filename) {
     }
 
     file.close();
-  }
 
   return space_id;
 }
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 2ad58f17f251..0d35491f6db0 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -23367,6 +23367,24 @@ static MYSQL_SYSVAR_STR(directories, srv_innodb_directories,
                         "'innodb-data-home-dir;innodb-undo-directory;datadir'",
                         nullptr, nullptr, nullptr);
 
+/* TODO: remove once testing done. */
+static MYSQL_SYSVAR_BOOL(
+    tablespace_startup_testing_fadvise, srv_tablespace_startup_testing_fadvise,
+    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOPERSIST,
+    "For startup fadvise tests, will be removed eventually",
+    nullptr, /* check */
+    nullptr, /* update */
+    false /* def */);
+
+/* TODO: remove once testing done. */
+static MYSQL_SYSVAR_BOOL(
+    tablespace_startup_testing_light, srv_tablespace_startup_testing_light,
+    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOPERSIST,
+    "For startup light tests, will be removed eventually",
+    nullptr, /* check */
+    nullptr, /* update */
+    false /* def */);
+
 #ifdef UNIV_DEBUG
 /** Use this variable innodb_interpreter to execute debug code within InnoDB.
 The output is stored in the innodb_interpreter_output variable. */
@@ -23525,6 +23543,8 @@ static SYS_VAR *innobase_system_variables[] = {
     MYSQL_SYSVAR(sort_buffer_size),
     MYSQL_SYSVAR(online_alter_log_max_size),
     MYSQL_SYSVAR(directories),
+    MYSQL_SYSVAR(tablespace_startup_testing_fadvise),
+    MYSQL_SYSVAR(tablespace_startup_testing_light),
     MYSQL_SYSVAR(sync_spin_loops),
     MYSQL_SYSVAR(spin_wait_delay),
     MYSQL_SYSVAR(spin_wait_pause_multiplier),
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index b6bf10810e00..3b6e97b681df 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -400,6 +400,12 @@ extern bool srv_numa_interleave;
 deliminated by ';', i.e the FIL_PATH_SEPARATOR. */
 extern char *srv_innodb_directories;
 
+/* TODO: remove once testing done. */
+/* Documented in storage/innobase/handler/ha_innodb.cc. */
+/* It looks pointless to duplicate comments, if needed, feel free to do it when merging. */
+extern bool srv_tablespace_startup_testing_fadvise;
+extern bool srv_tablespace_startup_testing_light;
+
 /** Server undo tablespaces directory, can be absolute path. */
 extern char *srv_undo_dir;
 
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index a614e8b64503..d4e16ac3638d 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -146,6 +146,12 @@ char *srv_doublewrite_dir = nullptr;
 deliminated by ';', i.e the FIL_PATH_SEPARATOR. */
 char *srv_innodb_directories = nullptr;
 
+/* TODO: remove once testing done. */
+/* Documented in storage/innobase/handler/ha_innodb.cc. */
+/* It looks pointless to duplicate comments, if needed, feel free to fo it when merging. */
+bool srv_tablespace_startup_testing_fadvise;
+bool srv_tablespace_startup_testing_light;
+
 /** Number of threads spawned for initializing rollback segments
 in parallel */
 uint32_t srv_rseg_init_threads = 1;