commit 686eade99651217aba8f406cedf06710713bcba2
Author: Dmitry Lenev <dmitry.lenev@percona.com>
Date:   Thu Jul 18 18:01:27 2024 +0200

    Fix for bug#115608 "Inplace ALTER TABLE might cause lost rows if concurrent purge".
    
    Problem:
    --------
    ALTER TABLE which rebuilds InnoDB table using INPLACE algorithm might sometimes
    lead to row loss if concurrent purge happens on the table being ALTERed.
    
    Analysis:
    ---------
    New implementation of parallel ALTER TABLE INPLACE in InnoDB was introduced in
    MySQL 8.0.27. Its code is used for online table rebuild even in a single-thread
    case.
    This implementation iterates over all the rows in the table, in general case,
    handling different subtrees of a B-tree in different threads. This iteration
    over table rows needs to be paused, from time to time, to commit InnoDB MTR/
    release page latches it holds. This is necessary to give a way to concurrent
    actions on the B-tree scanned or before flushing rows of new version of table
    from in-memory buffer to the B-tree. In order to resume iteration after such
    pause persistent cursor position saved before pause is restored.
    
    The cause of the problem described above lies in PCursor::restore_position()
    method. This method used for two purposes in the code:
    
    1)  To resume iteration after it was paused in the scenario described
        above.
    2)  To initialize cursor when a thread starts iteration through subtree
        it was assigned to process.
    
    In scenario 2) we restore cursor to the record which has not been
    processed yet. If the record, which position was saved originally when
    subtrees/ranges to process were assigned to threads, has been purged
    meanwhile, the cursor will be restored to preceding record. And the
    cursor needs to be moved to the next record, so we don't start processing
    our subtree from the record belonging to a different thread.
    PCursor::restore_position() handles this by detecting situation when saved
    record was purged and moving to the next record in this case.
    
    However, in scenario 1) we actually restore cursor to the record which
    has been processed already and from which will be doing step to the next
    record right after restore. So iterating to the next record if saved record
    was purged like it is done in PCursor::restore_position(), and which is
    necessary in case 2), leads to double step forward, resulting in our scan
    missing record!
    
    Fix:
    ----
    This patch solves the problem by using different logic for restore of
    cursor position in these two cases.
    
    For case 1) we simply restore position which was saved using
    btr_pcur_t::restore_position(). If the record to which cursor is
    supposed to point has been purged meanwhile, this method will point
    the cursor to preceding record. Then the calling code will iterate to
    next record (i.e. successor of purged record) after restore.
    
    For case 2) we keep pre-fix behavior and correct cursor position
    after restoring if the record which position has been saved originally
    has been purged by moving to the next record in subtree to be processed.
    PCursor::restore_position() method which implements handling of this
    case has been renamed to PCursor::restore_position_for_range() and
    greatly simplified.

diff --git a/mysql-test/suite/innodb/r/bug115608.result b/mysql-test/suite/innodb/r/bug115608.result
new file mode 100644
index 00000000000..d2425763d55
--- /dev/null
+++ b/mysql-test/suite/innodb/r/bug115608.result
@@ -0,0 +1,33 @@
+#
+# Test for bug#115608 "Inplace ALTER TABLE might cause lost rows if
+#                      concurrent purge".
+#
+CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY);
+INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('bbbcc'), ('ccccc'), ('ddddd'), ('eeeee');
+set global innodb_purge_stop_now=ON;
+DELETE FROM t1 WHERE pk = 'bbbcc';
+connect  con1, localhost, root,,;
+SET DEBUG='+d,ddl_buf_add_two';
+SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go';
+# Send ALTER TABLE INPLACE which rebuilds table.
+ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE;
+connection default;
+SET DEBUG_SYNC='now WAIT_FOR latches_released';
+SET GLOBAL innodb_purge_run_now=ON;
+SET DEBUG_SYNC='now SIGNAL go';
+connection con1;
+# Reap ALTER TABLE
+# Before the fix row 'ddddd' was missing from the table after ALTER.
+SELECT * FROM t1;
+pk
+aaaaa
+bbbbb
+ccccc
+ddddd
+eeeee
+SET DEBUG='-d,ddl_buf_add_two';
+disconnect con1;
+connection default;
+# Cleanup.
+SET DEBUG_SYNC= 'RESET';
+DROP TABLE t1;
diff --git a/mysql-test/suite/innodb/t/bug115608.test b/mysql-test/suite/innodb/t/bug115608.test
new file mode 100644
index 00000000000..61d01b436f8
--- /dev/null
+++ b/mysql-test/suite/innodb/t/bug115608.test
@@ -0,0 +1,44 @@
+# Tests in this file use both debug-only facilities as well as debug sync point.
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+
+--echo #
+--echo # Test for bug#115608 "Inplace ALTER TABLE might cause lost rows if
+--echo #                      concurrent purge".
+--echo #
+--enable_connect_log
+CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY);
+INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('bbbcc'), ('ccccc'), ('ddddd'), ('eeeee');
+
+set global innodb_purge_stop_now=ON;
+DELETE FROM t1 WHERE pk = 'bbbcc';
+
+--connect (con1, localhost, root,,)
+SET DEBUG='+d,ddl_buf_add_two';
+SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go';
+--echo # Send ALTER TABLE INPLACE which rebuilds table.
+--send ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE
+
+--connection default
+SET DEBUG_SYNC='now WAIT_FOR latches_released';
+SET GLOBAL innodb_purge_run_now=ON;
+--source include/wait_innodb_all_purged.inc
+SET DEBUG_SYNC='now SIGNAL go';
+
+--connection con1
+--echo # Reap ALTER TABLE
+--reap
+
+--echo # Before the fix row 'ddddd' was missing from the table after ALTER.
+SELECT * FROM t1;
+
+SET DEBUG='-d,ddl_buf_add_two';
+
+--disconnect con1
+--source include/wait_until_disconnected.inc
+
+--connection default
+--echo # Cleanup.
+SET DEBUG_SYNC= 'RESET';
+DROP TABLE t1;
+--disable_connect_log
diff --git a/storage/innobase/ddl/ddl0par-scan.cc b/storage/innobase/ddl/ddl0par-scan.cc
index f7f948a0f74..f334a7d2f5c 100644
--- a/storage/innobase/ddl/ddl0par-scan.cc
+++ b/storage/innobase/ddl/ddl0par-scan.cc
@@ -237,6 +237,7 @@ dberr_t Parallel_cursor::scan(Builders &builders) noexcept {
             thread_ctx->get_state() != Parallel_reader::State::THREAD) {
           thread_ctx->savepoint();
           latches_released = true;
+          DEBUG_SYNC_C("ddl_bulk_inserter_latches_released");
         }
         return DB_SUCCESS;
       });
diff --git a/storage/innobase/row/row0pread.cc b/storage/innobase/row/row0pread.cc
index 1b5766614f6..28408a9be12 100644
--- a/storage/innobase/row/row0pread.cc
+++ b/storage/innobase/row/row0pread.cc
@@ -214,6 +214,27 @@ class PCursor {
   PCursor(btr_pcur_t *pcur, mtr_t *mtr, size_t read_level)
       : m_mtr(mtr), m_pcur(pcur), m_read_level(read_level) {}
 
+  /**
+    Restore position of cursor created from Scan_ctx::Range object.
+    Adjust it if starting record of range has been purged.
+  */
+  void restore_position_for_range() noexcept {
+    /*
+      Restore position on the record or its predecessor if the record
+      was purged meanwhile. In the latter case we need to adjust position by
+      moving one step forward as predecessor belongs to another range.
+
+      We rely on return value from restore_position() to detect this scenario.
+      This can be done when saved position points to user record which is
+      always the case for saved start of Scan_ctx::Range.
+    */
+    ut_ad(m_pcur->m_rel_pos == BTR_PCUR_ON);
+
+    if (!m_pcur->restore_position(BTR_SEARCH_LEAF, m_mtr, UT_LOCATION_HERE)) {
+      page_cur_move_to_next(m_pcur->get_page_cur());
+    }
+  }
+
   /** Create a savepoint and commit the mini-transaction.*/
   void savepoint() noexcept;
 
@@ -225,65 +246,6 @@ class PCursor {
   @return DB_SUCCESS or error code. */
   [[nodiscard]] dberr_t move_to_next_block(dict_index_t *index);
 
-  /** Restore the cursor position. */
-  void restore_position() noexcept {
-    constexpr auto MODE = BTR_SEARCH_LEAF;
-    const auto relative = m_pcur->m_rel_pos;
-    auto equal = m_pcur->restore_position(MODE, m_mtr, UT_LOCATION_HERE);
-
-#ifdef UNIV_DEBUG
-    if (m_pcur->m_pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) {
-      ut_ad(m_pcur->m_rel_pos == BTR_PCUR_BEFORE ||
-            m_pcur->m_rel_pos == BTR_PCUR_AFTER);
-    } else {
-      ut_ad(m_pcur->m_pos_state == BTR_PCUR_IS_POSITIONED);
-      ut_ad((m_pcur->m_rel_pos == BTR_PCUR_ON) == m_pcur->is_on_user_rec());
-    }
-#endif /* UNIV_DEBUG */
-
-    switch (relative) {
-      case BTR_PCUR_ON:
-        if (!equal) {
-          page_cur_move_to_next(m_pcur->get_page_cur());
-        }
-        break;
-
-      case BTR_PCUR_UNSET:
-      case BTR_PCUR_BEFORE_FIRST_IN_TREE:
-        ut_error;
-        break;
-
-      case BTR_PCUR_AFTER:
-      case BTR_PCUR_AFTER_LAST_IN_TREE:
-        break;
-
-      case BTR_PCUR_BEFORE:
-        /* For non-optimistic restoration:
-        The position is now set to the record before pcur->old_rec.
-
-        For optimistic restoration:
-        The position also needs to take the previous search_mode into
-        consideration. */
-        switch (m_pcur->m_pos_state) {
-          case BTR_PCUR_IS_POSITIONED_OPTIMISTIC:
-            m_pcur->m_pos_state = BTR_PCUR_IS_POSITIONED;
-            /* The cursor always moves "up" ie. in ascending order. */
-            break;
-
-          case BTR_PCUR_IS_POSITIONED:
-            if (m_pcur->is_on_user_rec()) {
-              m_pcur->move_to_next(m_mtr);
-            }
-            break;
-
-          case BTR_PCUR_NOT_POSITIONED:
-          case BTR_PCUR_WAS_POSITIONED:
-            ut_error;
-        }
-        break;
-    }
-  }
-
   /** @return the current page cursor. */
   [[nodiscard]] page_cur_t *get_page_cursor() noexcept {
     return m_pcur->get_page_cur();
@@ -347,7 +309,7 @@ void PCursor::resume() noexcept {
   /* Restore position on the record, or its predecessor if the record
   was purged meanwhile. */
 
-  restore_position();
+  m_pcur->restore_position(BTR_SEARCH_LEAF, m_mtr, UT_LOCATION_HERE);
 
   if (!m_pcur->is_after_last_on_page()) {
     /* Move to the successor of the saved record. */
@@ -625,7 +587,7 @@ dberr_t Parallel_reader::Ctx::traverse() {
   auto &from = m_range.first;
 
   PCursor pcursor(from->m_pcur, &mtr, m_scan_ctx->m_config.m_read_level);
-  pcursor.restore_position();
+  pcursor.restore_position_for_range();
 
   dberr_t err{DB_SUCCESS};