commit 9bc448ef78aafac8f08a96aa71a3f48d4b3a587c Author: Shaohua Wang Date: Mon Sep 11 10:00:00 2023 +0800 Bugfix #48976130 trx hangs due to invalid in_innodb value Problem: ======== Transaction hangs and it cannot be killed. | 1428539 | scmp | 100.104.175.251:28332 | scmp_oms | Killed | 1863872 | updating | /* Query from DMS-SQL_JOB-14998049-SJob_53754720_40363538453911670v by user 479297 */ DELETE FROM sc Analysis: ========= trx->in_innodb is 4294967295 (0xFFFFFFFF). TRX_FORCE_ROLLBACK is set, so the trx should be rolled back by another thread. But it will not be rolled back because the whole value is invalid. There are no high priority trxs at all. There are many invalid values in the same core file. trx->in_innodb | trx->in_depth: 4294967295 | 1 1254150 | 4294967295 16879690 | 4294967294 58104809 | 4294967293 Related MySQL bugs: https://bugs.mysql.com/bug.php?id=110652 https://bugs.mysql.com/bug.php?id=99643 Root cause: a trx can be returned to the trx pool while trx->in_innodb or trx->in_depth is non-zero; there will then be concurrent access to trx->in_depth, which is not allowed by design. The concurrent access to the trx can explain why trx->in_innodb and trx->in_depth can be 4294967295, but cannot explain random values like 1254150. See functions: innobase_rollback_by_xid() and innobase_commit_by_xid(). Solution: ========= 1. Add assertions for trx->in_innodb and trx->in_depth before the trx is returned to the trx pool; 2. Fix two violations in xa commit/rollback. 
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index c853d5f7a36..08f14f255bb 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -20575,9 +20575,12 @@ static xa_status_code innobase_commit_by_xid( trx_t *trx = trx_get_trx_by_xid(xid); if (trx != nullptr) { - TrxInInnoDB trx_in_innodb(trx); + { + TrxInInnoDB trx_in_innodb(trx); + + innobase_commit_low(trx); + } - innobase_commit_low(trx); ut_ad(trx->mysql_thd == nullptr); /* use cases are: disconnected xa, slave xa, recovery */ trx_deregister_from_2pc(trx); @@ -20603,9 +20606,13 @@ static xa_status_code innobase_rollback_by_xid( trx_t *trx = trx_get_trx_by_xid(xid); if (trx != nullptr) { - TrxInInnoDB trx_in_innodb(trx); + int ret; + + { + TrxInInnoDB trx_in_innodb(trx); - int ret = innobase_rollback_trx(trx); + ret = innobase_rollback_trx(trx); + } trx_deregister_from_2pc(trx); ut_ad(!trx->will_lock); diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index dd670a7fb06..59f28f6f4be 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -578,6 +578,19 @@ static void trx_validate_state_before_free(trx_t *trx) { trx->dict_operation = TRX_DICT_OP_NONE; assert_trx_is_inactive(trx); + + trx->in_innodb &= ~TRX_FORCE_ROLLBACK_DISABLE; + if (trx->in_innodb != 0 || trx->in_depth != 0) { + ib::error(ER_IB_MSG_1202) + << "Freeing a trx though trx->in_innodb is " << trx->in_innodb + << " and trx->in_depth is " << trx->in_depth + << ", which is declared to be inside InnoDB"; + + trx_print(stderr, trx, 600); + putc('\n', stderr); + + ut_error; + } } /** Free and initialize a transaction object instantiated during recovery. @@ -628,6 +641,7 @@ void trx_free_prepared_or_active_recovered(trx_t *trx) { trx->state.store(TRX_STATE_NOT_STARTED, std::memory_order_relaxed); trx->will_lock = 0; + trx_validate_state_before_free(trx); trx_free(trx); }