Description:
A critical crash was observed in our environment. The failure occurred during transaction rollback operations, triggered by an assertion failure in ut_dbg_assertion_failed with the condition next_undo_offset > undo_offset. Call stack:
ut_dbg_assertion_failed (expr=expr@entry=0x1a29ce8 "next_undo_offset > undo_offset")
trx_undo_rec_copy
trx_undo_get_undo_rec_low
trx_undo_get_undo_rec out
trx_undo_prev_version_build
row_vers_old_has_index_entry
row_undo_mod_del_mark_or_remove_sec_low
row_undo_mod_del_mark_or_remove_sec
row_undo_mod_upd_exist_sec
row_undo_mod(undo_node_t*, que_thr_t*)
row_undo
row_undo_step(que_thr_t*)
que_thr_step (thr=<optimized out>)
que_run_threads_low
que_run_threads(que_thr_t*)
trx_rollback_to_savepoint_low(trx_t*, trx_savept_t*)
trx_rollback_for_mysql_low (trx=0x7f34a194d270)
trx_rollback_low(trx_t*)
trx_rollback_for_mysql(trx_t*)
innobase_rollback(handlerton*, THD*, bool)
ha_rollback_low
After analysing the coredump I found that the low_limit_no of purge_sys->view is less than the trx_no of purge_sys->iter. This indicates that some rollback operations may read the purged undo logs! After diving into the code, I think the root cause is that there is a race condition between MVCC::view_open and MVCC::clone_oldest_view:
thd1 thd2 purge
simple select
readview:
{
m_low_limit_id = 2475083946,
m_up_limit_id = 2475083946,
m_low_limit_no = 2475083946
}
dml and commit
purge all undo logs
purge_sys->view:
{
m_low_limit_id = 2475254583,
m_up_limit_id = 2475254583,
m_low_limit_no = 2475254583
}
purge_sys->iter.trx_no: 2475254583
simple select
check if readview can be reused:
{
m_low_limit_id = 2475083946,
m_up_limit_id = 2475083946,
m_low_limit_no = 2475083946
}
ReadView::m_closed = true
MVCC::clone_oldest_view
purge_sys->view:
{
m_low_limit_id = 2475083946,
m_up_limit_id = 2475083946,
m_low_limit_no = 2475083946
}
dml and rollback
may read the stale undo
How to repeat:
First we need to add some debug points:
diff --git a/storage/innobase/read/read0read.cc b/storage/innobase/read/read0read.cc
index a918b11b06f..6ee7ad8b1d1 100644
--- a/storage/innobase/read/read0read.cc
+++ b/storage/innobase/read/read0read.cc
@@ -36,6 +36,8 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "srv0srv.h"
#include "trx0sys.h"
+#include "current_thd.h"
+#include <debug_sync.h>
/*
-------------------------------------------------------------------------------
@@ -520,6 +522,11 @@ void MVCC::view_open(ReadView *&view, trx_t *trx) {
if (trx_is_autocommit_non_locking(trx) && view->empty()) {
view->m_closed = false;
+#ifdef UNIV_DEBUG
+ if (current_thd) {
+ DEBUG_SYNC(current_thd, "simulate_stop_after_setting_m_closed");
+ }
+#endif
if (view->m_low_limit_id == trx_sys_get_next_trx_id_or_no()) {
return;
diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc
index cbbac9eb59f..15fc8250ce7 100644
--- a/storage/innobase/trx/trx0purge.cc
+++ b/storage/innobase/trx/trx0purge.cc
@@ -2407,6 +2407,15 @@ ulint trx_purge(ulint n_purge_threads, /*!< in: number of purge tasks
rw_lock_x_unlock(&purge_sys->latch);
+ DBUG_EXECUTE_IF("simulate_stop_after_clone_oldest_view", {
+ bool end = false;
+ while (!end) {
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+ ib::info() << "[TXSQL] purge thread stopped after clone oldest view";
+ DBUG_EXECUTE_IF("continue_to_purge", { end = true; });
+ }
+ });
+
#ifdef UNIV_DEBUG
if (srv_purge_view_update_only_debug) {
return (0);
Then run the following test script and you will see the server has gone away:
source include/have_debug.inc;
source include/have_debug_sync.inc;
--connect(con1,localhost,root)
create table t1(id int not null primary key auto_increment, c1 int, c2 int, key k_c1(c1));
insert into t1(c1,c2) values(1,1),(2,2);
sleep 2;
connection con1;
select * from t1;
connection default;
update t1 set c2=c2+1;
update t1 set c1=c1+2;
connection default;
set global innodb_purge_stop_now=ON;
set global innodb_purge_run_now=ON;
--source include/wait_innodb_all_purged.inc
CREATE UNDO TABLESPACE undo_test_001 ADD DATAFILE 'undo_test_001.ibu';
ALTER UNDO TABLESPACE innodb_undo_001 SET INACTIVE;
--let $grep_file = $MYSQLTEST_VARDIR/log/mysqld.1.err
--let $grep_pattern = .*Completed truncate of undo tablespace innodb_undo_001.*
--source include/wait_for_pattern_in_file.inc
ALTER UNDO TABLESPACE innodb_undo_001 SET ACTIVE;
ALTER UNDO TABLESPACE innodb_undo_002 SET INACTIVE;
--let $grep_file = $MYSQLTEST_VARDIR/log/mysqld.1.err
--let $grep_pattern = .*Completed truncate of undo tablespace innodb_undo_002.*
--source include/wait_for_pattern_in_file.inc
ALTER UNDO TABLESPACE innodb_undo_002 SET ACTIVE;
ALTER UNDO TABLESPACE undo_test_001 SET INACTIVE;
--let $grep_file = $MYSQLTEST_VARDIR/log/mysqld.1.err
--let $grep_pattern = .*Completed truncate of undo tablespace undo_test_001.*
--source include/wait_for_pattern_in_file.inc
DROP UNDO TABLESPACE undo_test_001;
connection con1;
set debug_sync="simulate_stop_after_setting_m_closed signal clone_view_stopped wait_for clone_view_continue";
send select * from t1;
connection default;
set debug_sync="now wait_for clone_view_stopped";
set global debug='+d, simulate_stop_after_clone_oldest_view';
update t1 set c2=c2 +1;
--let $grep_file = $MYSQLTEST_VARDIR/log/mysqld.1.err
--let $grep_pattern = .*purge thread stopped after clone oldest.*
--source include/wait_for_pattern_in_file.inc
begin;
update t1 set c1=c1+2;
update t1 set c2 = c2 + 2;
rollback;
set debug_sync='now signal clone_view_continue';
connection con1;
reap;
connection default;
disconnect con1;
set global debug='-d, simulate_stop_after_clone_oldest_view';
set global debug='+d,continue_to_purge';
sleep 5;
set global debug='-d,continue_to_purge';
drop table t1;
Suggested fix:
The new readview of purge_sys should not be older than the old one.