commit 24e5cac267446db27233bbfeb56876e80be0e82d
Author: Dmitry Lenev
Date:   Thu Aug 1 17:00:17 2024 +0200

    Bug#115511 "Inplace ALTER TABLE might fail with duplicate key error
    if concurrent insertions"

    Problem:
    --------
    ALTER TABLE which rebuilds an InnoDB table using the INPLACE
    algorithm might occasionally fail with an unwarranted duplicate
    primary key error if there are concurrent insertions into the table,
    even though these insertions do not cause any PK conflict.

    Analysis:
    ---------
    A new implementation of parallel ALTER TABLE INPLACE in InnoDB was
    introduced in MySQL 8.0.27. Its code is used for online table rebuild
    even in the single-thread case. This implementation iterates over all
    the rows in the table, in the general case handling different
    subtrees of the B-tree in different threads.

    This iteration over table rows needs to be paused from time to time
    to commit the InnoDB mini-transaction and release the page latches it
    holds. This is necessary to give way to concurrent actions on the
    scanned B-tree, or before flushing rows of the new version of the
    table from the in-memory buffer to the B-tree. To resume iteration
    after such a pause, the persistent cursor position saved before the
    pause is restored.

    The cause of the problem lies in how the PCursor::savepoint() and
    PCursor::resume() methods save and restore the cursor position.
    Instead of storing the position of the current record pointed to by
    the cursor, savepoint() stores the position of the record that
    precedes it, and resume() then restores in two steps: 1) it restores
    the position of this preceding record (or its closest predecessor if
    it was purged meanwhile), and then 2) it moves one step forward,
    assuming this will arrive at the record at which the cursor pointed
    originally.
    Such an approach makes sense when we save/restore a cursor pointing
    to a page's supremum record before switching to a new page, as it
    avoids records being skipped when the next page is merged into the
    current one while latches are released (records which we have not yet
    scanned are moved over the supremum record, but not over the record
    which originally preceded the supremum).

    However, it is unnecessary and becomes problematic when we
    save/restore a cursor pointing to a user record from within the
    record processing callback. In this case the record whose position we
    are saving when savepoint() is called can be considered already
    processed, as the corresponding value will be inserted into the
    output buffer soon after the restore. When a concurrent insert adds a
    new record between the record whose position we intended to save by
    calling savepoint() and its predecessor whose position this call
    actually stored internally, the later call to resume() will position
    the cursor at this newly inserted record. This leads to the resumed
    scan revisiting the original record once again. As a result the code
    attempts to add this original record into the output buffer one more
    time and gets a duplicate key error.

    Fix:
    ----
    This patch solves the problem by adjusting the PCursor::savepoint()/
    resume() logic not to do the step back on save / step forward on
    restore when saving/restoring a cursor that points to a user record
    (in which case it is not necessary). This mechanism is still used
    when saving/restoring a cursor pointing to the page supremum (where
    it is useful). The patch also adds comments explaining how this code
    works and a few debug asserts enforcing its invariants.
diff --git a/mysql-test/suite/innodb/r/bug115511.result b/mysql-test/suite/innodb/r/bug115511.result
new file mode 100644
index 00000000000..93602bbd3a1
--- /dev/null
+++ b/mysql-test/suite/innodb/r/bug115511.result
@@ -0,0 +1,24 @@
+#
+# Test for bug#115511 "Inplace ALTER TABLE might fail with duplicate
+# key error if concurrent insertions".
+#
+CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY);
+INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('ccccc'), ('ddddd'), ('eeeee');
+connect con1, localhost, root,,;
+SET DEBUG='+d,ddl_buf_add_two';
+SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go';
+# Send ALTER TABLE INPLACE which rebuilds table.
+ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE;
+connection default;
+SET DEBUG_SYNC='now WAIT_FOR latches_released';
+INSERT INTO t1 VALUES ('ccaaa');
+SET DEBUG_SYNC='now SIGNAL go';
+connection con1;
+# Reap ALTER TABLE
+# Before fix it failed with duplicate key error.
+SET DEBUG='-d,ddl_buf_add_two';
+disconnect con1;
+connection default;
+# Cleanup.
+SET DEBUG_SYNC= 'RESET';
+DROP TABLE t1;
diff --git a/mysql-test/suite/innodb/t/bug115511.test b/mysql-test/suite/innodb/t/bug115511.test
new file mode 100644
index 00000000000..2869a380e54
--- /dev/null
+++ b/mysql-test/suite/innodb/t/bug115511.test
@@ -0,0 +1,39 @@
+# Tests in this file use both debug-only facilities and debug sync points.
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+
+--echo #
+--echo # Test for bug#115511 "Inplace ALTER TABLE might fail with duplicate
+--echo # key error if concurrent insertions".
+--echo #
+--enable_connect_log
+CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY);
+INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('ccccc'), ('ddddd'), ('eeeee');
+
+--connect (con1, localhost, root,,)
+SET DEBUG='+d,ddl_buf_add_two';
+SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go';
+--echo # Send ALTER TABLE INPLACE which rebuilds table.
+--send ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE
+
+--connection default
+SET DEBUG_SYNC='now WAIT_FOR latches_released';
+INSERT INTO t1 VALUES ('ccaaa');
+SET DEBUG_SYNC='now SIGNAL go';
+
+--connection con1
+--echo # Reap ALTER TABLE
+--echo # Before fix it failed with duplicate key error.
+--reap
+
+SET DEBUG='-d,ddl_buf_add_two';
+
+--disconnect con1
+--source include/wait_until_disconnected.inc
+
+--connection default
+--echo # Cleanup.
+SET DEBUG_SYNC= 'RESET';
+DROP TABLE t1;
+
+--disable_connect_log
diff --git a/storage/innobase/row/row0pread.cc b/storage/innobase/row/row0pread.cc
index 28408a9be12..97ac5731e62
--- a/storage/innobase/row/row0pread.cc
+++ b/storage/innobase/row/row0pread.cc
@@ -251,12 +251,13 @@ class PCursor {
     return m_pcur->get_page_cur();
   }

-  /** Restore from a saved position.
+  /** Restore from a saved position. Sets position to the next user record
+  in the tree if the saved cursor position pointed to the supremum.
   @return DB_SUCCESS or error code. */
   [[nodiscard]] dberr_t restore_from_savepoint() noexcept;

-  /** Move to the first user rec on the restored page. */
-  [[nodiscard]] dberr_t move_to_user_rec() noexcept;
+  /** Move to the first user rec on the next page. */
+  [[nodiscard]] dberr_t move_to_next_block_user_rec() noexcept;

   /** @return true if cursor is after last on page. */
   [[nodiscard]] bool is_after_last_on_page() const noexcept {
@@ -276,6 +277,9 @@ class PCursor {
   /** Level where the cursor is positioned or need to be positioned in
   case of restore. */
   size_t m_read_level{};
+
+  /** Indicates that savepoint created for cursor corresponds to supremum. */
+  bool m_supremum_saved{};
 };

 buf_block_t *Parallel_reader::Scan_ctx::block_get_s_latched(
@@ -293,9 +297,42 @@ buf_block_t *Parallel_reader::Scan_ctx::block_get_s_latched(
 }

 void PCursor::savepoint() noexcept {
-  /* Store the cursor position on the previous user record on the page.
-  */
-  m_pcur->move_to_prev_on_page();
+  /* The below code will not work correctly in all cases if we are to take
+  a savepoint of a cursor pointing to the infimum.
+
+  If a savepoint is taken for such a cursor, then in the optimistic
+  restore case we might not detect the situation when, after
+  mini-transaction commit/latch release, a concurrent merge from the
+  previous page, or a concurrent update of the last record on the
+  previous page which no longer fits into it, has moved records over the
+  infimum (see MySQL bugs #114248 and #115518). As a result the scan
+  will revisit these records, which can cause, for example, ALTER TABLE
+  failing with a duplicate key error.
+
+  Luckily, at the moment, a savepoint is taken only when the cursor
+  points to a user record (when we save/restore from the
+  Parallel_reader::Ctx::m_f callback which processes records) or to the
+  supremum (when we save/restore from the
+  Parallel_reader::m_finish_callback end-of-page callback or before
+  switching to the next page in move_to_next_block()). */
+  ut_ad(!m_pcur->is_before_first_on_page());
+
+  /* If we are taking a savepoint for a cursor pointing to the supremum
+  we take one step back. This is necessary to prevent the situation when
+  a concurrent merge from the next page, which happens after we commit
+  the mini-transaction/release latches, moves records over the supremum
+  to the current page. In that case the optimistic restore of a cursor
+  pointing to the supremum would again result in a cursor pointing to
+  the supremum, which means the moved records would be incorrectly
+  skipped by the scan. */
+  m_supremum_saved = m_pcur->is_after_last_on_page();
+  if (m_supremum_saved) {
+    m_pcur->move_to_prev_on_page();
+    // Only the root page can be empty and we are not supposed to get
+    // here in that case.
+    ut_ad(!m_pcur->is_before_first_on_page());
+  }
+  /* Thanks to the above we can say that we are saving the position of
+  the last user record which we have processed, i.e. the record which
+  corresponds to the last key value we have processed.
+  */
   m_pcur->store_position(m_mtr);

   m_mtr->commit();
@@ -307,17 +344,53 @@ void PCursor::resume() noexcept {
   m_mtr->set_log_mode(MTR_LOG_NO_REDO);

   /* Restore position on the record, or its predecessor if the record
-  was purged meanwhile. */
+  was purged meanwhile. In the extreme case this can be the infimum
+  record of the first page in the tree. However, it can never be a
+  supremum record on a page, since we always move the cursor one step
+  back when savepoint() is called for the supremum record.
+
+  We can be in either of two situations:
+
+  1) We save/restore from the Parallel_reader::Ctx::m_f callback for
+  record processing. In this case we saved the position of the user
+  record for which the callback was invoked originally, and which we
+  already consider processed. In theory, such a record is not supposed
+  to be purged as our transaction holds a read view on it. But in
+  practice it doesn't matter - even if the record was purged, our cursor
+  will still point to the last processed user record in the tree after
+  restoration. And Parallel_reader::Ctx::m_f callback execution is
+  always followed by a page_cur_move_to_next() call, which moves our
+  cursor to the next, not yet processed, record.
+
+  2) We save/restore from the Parallel_reader::m_finish_callback
+  end-of-page callback or from move_to_next_block() before switching
+  the page. In this case our cursor was positioned on the supremum
+  record before the savepoint() call.
+  The savepoint() call raised the m_supremum_saved flag and saved the
+  position of the user record which preceded the supremum, i.e. the last
+  user record which was processed.
+  After the below call to restore_position() the cursor will point to
+  that record, or, if it has been purged meanwhile, to its closest
+  non-purged predecessor (in the extreme case this can be the infimum of
+  the first page in the tree!).
+  By moving to the successor of the saved record we position the cursor
+  either at the supremum record (which means we have restored the
+  original cursor position and can continue the switch to the next page
+  as usual) or at some user record which our scan has not processed yet
+  (for example, this record might have been moved from the next page due
+  to a page merge, or simply inserted into our page concurrently).
+  The latter case is detected by the caller with the
+  !is_after_last_on_page() check, and instead of switching to the next
+  page we continue processing from the restored user record. */
   m_pcur->restore_position(BTR_SEARCH_LEAF, m_mtr, UT_LOCATION_HERE);

-  if (!m_pcur->is_after_last_on_page()) {
-    /* Move to the successor of the saved record. */
-    m_pcur->move_to_next_on_page();
-  }
+  ut_ad(!m_pcur->is_after_last_on_page());
+
+  if (m_supremum_saved) m_pcur->move_to_next_on_page();
 }

-dberr_t PCursor::move_to_user_rec() noexcept {
+dberr_t PCursor::move_to_next_block_user_rec() noexcept {
   auto cur = m_pcur->get_page_cur();

   const auto next_page_no = btr_page_get_next(page_cur_get_page(cur), m_mtr);
@@ -370,7 +443,28 @@ dberr_t PCursor::move_to_user_rec() noexcept {
 dberr_t PCursor::restore_from_savepoint() noexcept {
   resume();
-  return m_pcur->is_on_user_rec() ? DB_SUCCESS : move_to_user_rec();
+
+  /* We should not get a cursor pointing to the supremum when restoring
+  a cursor which pointed to a user record from the
+  Parallel_reader::Ctx::m_f callback. Otherwise the below call to
+  move_to_next_block_user_rec() would position the cursor at the first
+  user record on the next page, and then the calling code would do
+  page_cur_move_to_next() right after that, which means that one record
+  would be incorrectly skipped by our scan. Luckily, this always holds
+  with the current code. */
+  ut_ad(m_supremum_saved || !m_pcur->is_after_last_on_page());
+
+  /* Move to the next user record in the index tree if we are at the
+  supremum.
+  Even though this looks awkward from a proper layering point of view,
+  and ideally should be done at the same code level as the iteration
+  over records, doing it here a) avoids the scenario in which
+  savepoint/mini-transaction commit and latch release/restore happen
+  for the same page twice - first in the page-end callback and then in
+  the move_to_next_block() call, and b) handles the hypothetical
+  situation when the index was reduced to an empty root page while
+  latches were released, by detecting it and bailing out early (it is
+  important for us to bail out and not try to call the savepoint()
+  method in this case). */
+  return (!m_pcur->is_after_last_on_page()) ? DB_SUCCESS
+                                            : move_to_next_block_user_rec();
 }

 dberr_t Parallel_reader::Thread_ctx::restore_from_savepoint() noexcept {
@@ -402,7 +496,7 @@ dberr_t PCursor::move_to_next_block(dict_index_t *index) {
       err = restore_from_savepoint();
     } else {
-      err = move_to_user_rec();
+      err = move_to_next_block_user_rec();
     }

   int n_retries [[maybe_unused]] = 0;