diff --git a/mysql-test/suite/innodb/r/bug115511.result b/mysql-test/suite/innodb/r/bug115511.result new file mode 100644 index 00000000000..93602bbd3a1 --- /dev/null +++ b/mysql-test/suite/innodb/r/bug115511.result @@ -0,0 +1,24 @@ +# +# Test for bug#115511 "Inplace ALTER TABLE might fail with duplicate +# key error if concurrent insertions". +# +CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY); +INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('ccccc'), ('ddddd'), ('eeeee'); +connect con1, localhost, root,,; +SET DEBUG='+d,ddl_buf_add_two'; +SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go'; +# Send ALTER TABLE INPLACE which rebuilds table. +ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE; +connection default; +SET DEBUG_SYNC='now WAIT_FOR latches_released'; +INSERT INTO t1 VALUES ('ccaaa'); +SET DEBUG_SYNC='now SIGNAL go'; +connection con1; +# Reap ALTER TABLE +# Before fix it failed with duplicate key error. +SET DEBUG='-d,ddl_buf_add_two'; +disconnect con1; +connection default; +# Cleanup. +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/r/bug115608.result b/mysql-test/suite/innodb/r/bug115608.result new file mode 100644 index 00000000000..d2425763d55 --- /dev/null +++ b/mysql-test/suite/innodb/r/bug115608.result @@ -0,0 +1,33 @@ +# +# Test for bug#115608 "Inplace ALTER TABLE might cause lost rows if +# concurrent purge". +# +CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY); +INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('bbbcc'), ('ccccc'), ('ddddd'), ('eeeee'); +set global innodb_purge_stop_now=ON; +DELETE FROM t1 WHERE pk = 'bbbcc'; +connect con1, localhost, root,,; +SET DEBUG='+d,ddl_buf_add_two'; +SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go'; +# Send ALTER TABLE INPLACE which rebuilds table. +ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE; +connection default; +SET DEBUG_SYNC='now WAIT_FOR latches_released'; +SET GLOBAL innodb_purge_run_now=ON; +SET DEBUG_SYNC='now SIGNAL go'; +connection con1; +# Reap ALTER TABLE +# Before the fix row 'ddddd' was missing from the table after ALTER. +SELECT * FROM t1; +pk +aaaaa +bbbbb +ccccc +ddddd +eeeee +SET DEBUG='-d,ddl_buf_add_two'; +disconnect con1; +connection default; +# Cleanup. +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/bug115511.test b/mysql-test/suite/innodb/t/bug115511.test new file mode 100644 index 00000000000..2869a380e54 --- /dev/null +++ b/mysql-test/suite/innodb/t/bug115511.test @@ -0,0 +1,39 @@ +# Tests in this file use both debug-only facilities as well as debug sync point. +--source include/have_debug.inc +--source include/have_debug_sync.inc + +--echo # +--echo # Test for bug#115511 "Inplace ALTER TABLE might fail with duplicate +--echo # key error if concurrent insertions". +--echo # +--enable_connect_log +CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY); +INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('ccccc'), ('ddddd'), ('eeeee'); + +--connect (con1, localhost, root,,) +SET DEBUG='+d,ddl_buf_add_two'; +SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go'; +--echo # Send ALTER TABLE INPLACE which rebuilds table. +--send ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE + +--connection default +SET DEBUG_SYNC='now WAIT_FOR latches_released'; +INSERT INTO t1 VALUES ('ccaaa'); +SET DEBUG_SYNC='now SIGNAL go'; + +--connection con1 +--echo # Reap ALTER TABLE +--echo # Before fix it failed with duplicate key error. +--reap + +SET DEBUG='-d,ddl_buf_add_two'; + +--disconnect con1 +--source include/wait_until_disconnected.inc + +--connection default +--echo # Cleanup. +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; + +--disable_connect_log diff --git a/mysql-test/suite/innodb/t/bug115608.test b/mysql-test/suite/innodb/t/bug115608.test new file mode 100644 index 00000000000..61d01b436f8 --- /dev/null +++ b/mysql-test/suite/innodb/t/bug115608.test @@ -0,0 +1,44 @@ +# Tests in this file use both debug-only facilities as well as debug sync point. +--source include/have_debug.inc +--source include/have_debug_sync.inc + +--echo # +--echo # Test for bug#115608 "Inplace ALTER TABLE might cause lost rows if +--echo # concurrent purge". +--echo # +--enable_connect_log +CREATE TABLE t1 (pk CHAR(5) PRIMARY KEY); +INSERT INTO t1 VALUES ('aaaaa'), ('bbbbb'), ('bbbcc'), ('ccccc'), ('ddddd'), ('eeeee'); + +set global innodb_purge_stop_now=ON; +DELETE FROM t1 WHERE pk = 'bbbcc'; + +--connect (con1, localhost, root,,) +SET DEBUG='+d,ddl_buf_add_two'; +SET DEBUG_SYNC='ddl_bulk_inserter_latches_released SIGNAL latches_released WAIT_FOR go'; +--echo # Send ALTER TABLE INPLACE which rebuilds table. +--send ALTER TABLE t1 ENGINE=InnoDB, ALGORITHM=INPLACE + +--connection default +SET DEBUG_SYNC='now WAIT_FOR latches_released'; +SET GLOBAL innodb_purge_run_now=ON; +--source include/wait_innodb_all_purged.inc +SET DEBUG_SYNC='now SIGNAL go'; + +--connection con1 +--echo # Reap ALTER TABLE +--reap + +--echo # Before the fix row 'ddddd' was missing from the table after ALTER. +SELECT * FROM t1; + +SET DEBUG='-d,ddl_buf_add_two'; + +--disconnect con1 +--source include/wait_until_disconnected.inc + +--connection default +--echo # Cleanup. +SET DEBUG_SYNC= 'RESET'; +DROP TABLE t1; +--disable_connect_log diff --git a/storage/innobase/ddl/ddl0par-scan.cc b/storage/innobase/ddl/ddl0par-scan.cc index f7f948a0f74..f334a7d2f5c 100644 --- a/storage/innobase/ddl/ddl0par-scan.cc +++ b/storage/innobase/ddl/ddl0par-scan.cc @@ -237,6 +237,7 @@ dberr_t Parallel_cursor::scan(Builders &builders) noexcept { thread_ctx->get_state() != Parallel_reader::State::THREAD) { thread_ctx->savepoint(); latches_released = true; + DEBUG_SYNC_C("ddl_bulk_inserter_latches_released"); } return DB_SUCCESS; }); diff --git a/storage/innobase/row/row0pread.cc b/storage/innobase/row/row0pread.cc index 1b5766614f6..97ac5731e62 100644 --- a/storage/innobase/row/row0pread.cc +++ b/storage/innobase/row/row0pread.cc @@ -214,6 +214,27 @@ class PCursor { PCursor(btr_pcur_t *pcur, mtr_t *mtr, size_t read_level) : m_mtr(mtr), m_pcur(pcur), m_read_level(read_level) {} + /** + Restore position of cursor created from Scan_ctx::Range object. + Adjust it if starting record of range has been purged. + */ + void restore_position_for_range() noexcept { + /* + Restore position on the record or its predecessor if the record + was purged meanwhile. In the latter case we need to adjust position by + moving one step forward as predecessor belongs to another range. + + We rely on return value from restore_position() to detect this scenario. + This can be done when saved position points to user record which is + always the case for saved start of Scan_ctx::Range. + */ + ut_ad(m_pcur->m_rel_pos == BTR_PCUR_ON); + + if (!m_pcur->restore_position(BTR_SEARCH_LEAF, m_mtr, UT_LOCATION_HERE)) { + page_cur_move_to_next(m_pcur->get_page_cur()); + } + } + /** Create a savepoint and commit the mini-transaction.*/ void savepoint() noexcept; @@ -225,76 +246,18 @@ class PCursor { @return DB_SUCCESS or error code. */ [[nodiscard]] dberr_t move_to_next_block(dict_index_t *index); - /** Restore the cursor position. */ - void restore_position() noexcept { - constexpr auto MODE = BTR_SEARCH_LEAF; - const auto relative = m_pcur->m_rel_pos; - auto equal = m_pcur->restore_position(MODE, m_mtr, UT_LOCATION_HERE); - -#ifdef UNIV_DEBUG - if (m_pcur->m_pos_state == BTR_PCUR_IS_POSITIONED_OPTIMISTIC) { - ut_ad(m_pcur->m_rel_pos == BTR_PCUR_BEFORE || - m_pcur->m_rel_pos == BTR_PCUR_AFTER); - } else { - ut_ad(m_pcur->m_pos_state == BTR_PCUR_IS_POSITIONED); - ut_ad((m_pcur->m_rel_pos == BTR_PCUR_ON) == m_pcur->is_on_user_rec()); - } -#endif /* UNIV_DEBUG */ - - switch (relative) { - case BTR_PCUR_ON: - if (!equal) { - page_cur_move_to_next(m_pcur->get_page_cur()); - } - break; - - case BTR_PCUR_UNSET: - case BTR_PCUR_BEFORE_FIRST_IN_TREE: - ut_error; - break; - - case BTR_PCUR_AFTER: - case BTR_PCUR_AFTER_LAST_IN_TREE: - break; - - case BTR_PCUR_BEFORE: - /* For non-optimistic restoration: - The position is now set to the record before pcur->old_rec. - - For optimistic restoration: - The position also needs to take the previous search_mode into - consideration. */ - switch (m_pcur->m_pos_state) { - case BTR_PCUR_IS_POSITIONED_OPTIMISTIC: - m_pcur->m_pos_state = BTR_PCUR_IS_POSITIONED; - /* The cursor always moves "up" ie. in ascending order. */ - break; - - case BTR_PCUR_IS_POSITIONED: - if (m_pcur->is_on_user_rec()) { - m_pcur->move_to_next(m_mtr); - } - break; - - case BTR_PCUR_NOT_POSITIONED: - case BTR_PCUR_WAS_POSITIONED: - ut_error; - } - break; - } - } - /** @return the current page cursor. */ [[nodiscard]] page_cur_t *get_page_cursor() noexcept { return m_pcur->get_page_cur(); } - /** Restore from a saved position. + /** Restore from a saved position. Sets position the next user record + in the tree if it is cursor pointing to supremum that was saved. @return DB_SUCCESS or error code. */ [[nodiscard]] dberr_t restore_from_savepoint() noexcept; - /** Move to the first user rec on the restored page. */ - [[nodiscard]] dberr_t move_to_user_rec() noexcept; + /** Move to the first user rec on the next page. */ + [[nodiscard]] dberr_t move_to_next_block_user_rec() noexcept; /** @return true if cursor is after last on page. */ [[nodiscard]] bool is_after_last_on_page() const noexcept { @@ -314,6 +277,9 @@ class PCursor { /** Level where the cursor is positioned or need to be positioned in case of restore. */ size_t m_read_level{}; + + /** Indicates that savepoint created for cursor corresponds to supremum. */ + bool m_supremum_saved{}; }; buf_block_t *Parallel_reader::Scan_ctx::block_get_s_latched( @@ -331,9 +297,42 @@ buf_block_t *Parallel_reader::Scan_ctx::block_get_s_latched( } void PCursor::savepoint() noexcept { - /* Store the cursor position on the previous user record on the page. */ - m_pcur->move_to_prev_on_page(); + /* The below code will not work correctly in all cases if we are to take + savepoint of cursor pointing to infimum. + + If savepoint is taken for such a cursor, in optimistic restore case we + might not detect situation when after mini-trancation commit/latches + release concurrent merge from the previous page or concurrent update of + the last record on the previous page, which doesn't fit into it, has + moved records over infimum (see MySQL bugs #114248 and #115518). + As result scan will revisit these records, which can cause, for example, + ALTER TABLE failing with a duplicate key error. + + Luckily, at the moment, savepoint is taken only when cursor points + to a user record (when we do save/restore from Parallel_reader::Ctx::m_f + callback which processes records) or to the supremum (when we do save/ + restore from Parallel_reader::m_finish_callback end-of-page callback or + before switching to next page in move_to_next_block()). */ + ut_ad(!m_pcur->is_before_first_on_page()); + + /* If we are taking savepoint for cursor pointing to supremum we are doing + one step back. This is necessary to prevent situation when concurrent + merge from the next page, which happens after we commit mini-transaction/ + release latches moves records over supremum to the current page. + In this case the optimistic restore of cursor pointing to supremum will + result in cursor pointing to supremum, which means moved records will + be incorrectly skipped by the scan. */ + m_supremum_saved = m_pcur->is_after_last_on_page(); + if (m_supremum_saved) { + m_pcur->move_to_prev_on_page(); + // Only root page can be empty and we are not supposed to get here + // in this case. + ut_ad(!m_pcur->is_before_first_on_page()); + } + /* Thanks to the above we can say that we are saving position of last user + record which have processed/ which corresponds to last key value we have + processed. */ m_pcur->store_position(m_mtr); m_mtr->commit(); @@ -345,17 +344,53 @@ void PCursor::resume() noexcept { m_mtr->set_log_mode(MTR_LOG_NO_REDO); /* Restore position on the record, or its predecessor if the record - was purged meanwhile. */ - - restore_position(); - - if (!m_pcur->is_after_last_on_page()) { - /* Move to the successor of the saved record. */ - m_pcur->move_to_next_on_page(); - } + was purged meanwhile. In extreme case this can be infimum record of + the first page in the tree. However, this can never be a supremum + record on a page, since we always move cursor one step back when + savepoint() is called for infimum record. + + We can be in either of the two situations: + + 1) We do save/restore from Parallel_reader:::Ctx::m_f callback for + record processing. In this case we saved position of user-record for + which callback was invoked originally, and which we already consider + as processed. In theory, such record is not supposed be purged as our + transaction holds read view on it. But in practice, it doesn't matter + - even if the record was purged then our cursor will still point to + last processed user record in the tree after its restoration. + And Parallel_reader:::Ctx::m_f callback execution is always followed by + page_cur_move_to_next() call, which moves our cursor to the next record + and this record has not been processed yet. + + 2) We do save/restore from the Parallel_reader::m_finish_callback + end-of-page callback or from move_to_next_block() before switching + the page. In this case our cursor was positioned on supremum record + before savepoint() call. + Th savepoint() call raised m_supremum_saved flag and saved the position + of the user record which preceded the supremum, i.e. the last user record + which was processed. + After the below call to restore_position() the cursor will point to + the latter record, or, if it has been purged meanwhile, its closest + non-purged predecessor (in extreme case this can be infimum of the first + page in the tree!). + By moving to the successor of the saved record we position the cursor + either to supremum record (which means we restored original cursor + position and can continue switch to the next page as usual) or to + some user record which our scan have not processed yet (for example, + this record might have been moved from the next page due to page + merge or simply inserted to our page concurrently). + The latter case is detected by caller by doing !is_after_last_on_page() + check and instead of doing switch to the next page we continue processing + from the restored user record. */ + + m_pcur->restore_position(BTR_SEARCH_LEAF, m_mtr, UT_LOCATION_HERE); + + ut_ad(!m_pcur->is_after_last_on_page()); + + if (m_supremum_saved) m_pcur->move_to_next_on_page(); } -dberr_t PCursor::move_to_user_rec() noexcept { +dberr_t PCursor::move_to_next_block_user_rec() noexcept { auto cur = m_pcur->get_page_cur(); const auto next_page_no = btr_page_get_next(page_cur_get_page(cur), m_mtr); @@ -408,7 +443,28 @@ dberr_t PCursor::move_to_user_rec() noexcept { dberr_t PCursor::restore_from_savepoint() noexcept { resume(); - return m_pcur->is_on_user_rec() ? DB_SUCCESS : move_to_user_rec(); + + /* We should not get cursor pointing to supremum when restoring cursor + which has pointed to user record from the Parallel_reader:::Ctx::m_f + callback. Otherwise the below call to move_to_next_block_user_rec() + will position the cursor to the first user record on the next page + and then calling code will do page_cur_move_to_next() right after + that. Which means that one record will be incorrectly skipped by + our scan. Luckily, this always holds with current code. */ + ut_ad(m_supremum_saved || !m_pcur->is_after_last_on_page()); + + /* Move to the next user record in the index tree if we are at supremum. + Even though this looks awkward from the proper layering point of view, + and ideally should have been done on the same code level as iteration + over records, doing this here, actually, a) allows to avoid scenario in + which savepoint/mini-transaction commit and latch release/restore happen + for the same page twice - first time in page-end callback and then in + move_to_next_block() call, b) handles a hypotetical situation when + the index was reduced to the empty root page while latches were released + by detecting it and bailing out early (it is important for us to bail out + and not try to call savepoint() method in this case). */ + return (!m_pcur->is_after_last_on_page()) ? DB_SUCCESS + : move_to_next_block_user_rec(); } dberr_t Parallel_reader::Thread_ctx::restore_from_savepoint() noexcept { @@ -440,7 +496,7 @@ dberr_t PCursor::move_to_next_block(dict_index_t *index) { err = restore_from_savepoint(); } else { - err = move_to_user_rec(); + err = move_to_next_block_user_rec(); } int n_retries [[maybe_unused]] = 0; @@ -625,7 +681,7 @@ dberr_t Parallel_reader::Ctx::traverse() { auto &from = m_range.first; PCursor pcursor(from->m_pcur, &mtr, m_scan_ctx->m_config.m_read_level); - pcursor.restore_position(); + pcursor.restore_position_for_range(); dberr_t err{DB_SUCCESS};