Description:
When do FLUSH TABLE FOR EXPORT with concurrent DROP TABLE and dmls, it may lead deadlock.
First, one DML trx committed, and generate new gtid, then deadlock
[Gtid_table_persistor thread] try persist the gtid -> [FLUSH TABLE FOR EXPORT] acquire dict_sys_mytex and wait persist gtid finish -> [drop table] acquire table_cache lock, and wait dict_sys_mutex --> [Gtid_table_persistor] thread wait drop table release table_cache lock
How to repeat:
In 8.0.32, use below debug_sync diff and compile
$ git diff
diff --git a/sql/rpl_gtid_persist.cc b/sql/rpl_gtid_persist.cc
index 37f35a44156..581716a4b87 100644
--- a/sql/rpl_gtid_persist.cc
+++ b/sql/rpl_gtid_persist.cc
@@ -395,6 +395,11 @@ int Gtid_table_persistor::save(const Gtid_set *gtid_set, bool compress) {
goto end;
}
+#ifndef NDEBUG
+ DBUG_SIGNAL_WAIT_FOR(thd, "test_wait_gtid_save", "reach_gtid_save",
+ "end_gtid_save");
+#endif
+
ret = error = save(table, gtid_set);
end:
diff --git a/sql/sql_base.cc b/sql/sql_base.cc
index f93f0edaca3..5e54d08d237 100644
--- a/sql/sql_base.cc
+++ b/sql/sql_base.cc
@@ -10242,6 +10242,11 @@ void tdc_remove_table(THD *thd, enum_tdc_remove_table_type remove_type,
char key[MAX_DBKEY_LENGTH];
size_t key_length;
+#ifndef NDEBUG
+ DBUG_SIGNAL_WAIT_FOR(thd, "test_wait_rm_table", "reach_rm_table",
+ "end_rm_table");
+#endif
+
if (!has_lock)
table_cache_manager.lock_all_and_tdc();
else
1. use mtr to start one mysql-server with gtid and binlog on, mysql-test/mtr mysqldump_gtid --start
2. create conn1: mysql -uroot -P13000 -h127.0.0.1 test -A , and do below operation
mysql> create table t1 (c1 int, c2 int) engine=innodb; create table t2 (c1 int, c2 int) engine=innodb; create table t3 (c1 int, c2 int) engine=innodb;
Query OK, 0 rows affected (0.10 sec)
Query OK, 0 rows affected (0.06 sec)
Query OK, 0 rows affected (0.05 sec)
mysql> set global debug='+d,test_wait_gtid_save';insert into t3 values (2,2);set debug_sync='now wait_for reach_gtid_save';
Query OK, 0 rows affected (0.01 sec)
Query OK, 1 row affected (0.01 sec)
Query OK, 0 rows affected (0.10 sec)
3. create conn2, mysql -uroot -P13000 -h127.0.0.1 test, and try drop table, it will wait for signal
mysql> set debug = '+d,test_wait_rm_table';drop table t2;
4. in conn1, check processlist
mysql> set debug_sync='now wait_for reach_rm_table';
Query OK, 0 rows affected (0.00 sec)
mysql> show processlist;
+----+-----------------+-----------------+------+---------+------+------------------------+------------------+
| Id | User | Host | db | Command | Time | State | Info |
+----+-----------------+-----------------+------+---------+------+------------------------+------------------+
| 5 | event_scheduler | localhost | NULL | Daemon | 240 | Waiting on empty queue | NULL |
| 8 | root | localhost:46018 | test | Query | 0 | init | show processlist |
| 9 | root | localhost:58694 | test | Query | 61 | debug sync point: now | drop table t2 |
+----+-----------------+-----------------+------+---------+------+------------------------+------------------+
3 rows in set (0.00 sec)
Query OK, 0 rows affected (0.00 sec)
5. create conn3: mysql -uroot -P13000 -h127.0.0.1 test -A, and try execute FLUSH EXPORT, it will wait
mysql> flush table t1 for export;
6. back to conn1, send signal for drop table, and check processlist
mysql> SET debug_sync='now signal end_rm_table';
Query OK, 0 rows affected (0.00 sec)
mysql> show processlist;
+----+-----------------+-----------------+------+---------+------+------------------------+---------------------------+
| Id | User | Host | db | Command | Time | State | Info |
+----+-----------------+-----------------+------+---------+------+------------------------+---------------------------+
| 5 | event_scheduler | localhost | NULL | Daemon | 340 | Waiting on empty queue | NULL |
| 8 | root | localhost:46018 | test | Query | 0 | init | show processlist |
| 9 | root | localhost:58694 | test | Query | 161 | checking permissions | drop table t2 |
| 10 | root | localhost:39452 | test | Query | 65 | Opening tables | flush table t1 for export |
+----+-----------------+-----------------+------+---------+------+------------------------+---------------------------+
4 rows in set (0.00 sec)
7. in conn1, send signal for backend gtid_save (sometimes, maybe send twice), and check processlist
mysql> SET debug_sync='now signal end_gtid_save';
Query OK, 0 rows affected (0.01 sec)
mysql> show processlist;
+----+-----------------+-----------------+------+---------+------+------------------------+---------------------------+
| Id | User | Host | db | Command | Time | State | Info |
+----+-----------------+-----------------+------+---------+------+------------------------+---------------------------+
| 5 | event_scheduler | localhost | NULL | Daemon | 401 | Waiting on empty queue | NULL |
| 8 | root | localhost:46018 | test | Query | 0 | init | show processlist |
| 9 | root | localhost:58694 | test | Query | 222 | checking permissions | drop table t2 |
| 10 | root | localhost:39452 | test | Query | 126 | Opening tables | flush table t1 for export |
+----+-----------------+-----------------+------+---------+------+------------------------+---------------------------+
4 rows in set (0.00 sec)
8. then we will both drop table, and flush table for export cannot finish
9. use pstack tool to show all stacks, it will find
a. Gtid_table_persistor::save is now wait for Table_cache::lock
b. mysql_rm_table is now wait for dict_table_close
c. flush_tables_for_export is now wait for Clone_persist_gtid::wait_flush
10. in error log, it will print [InnoDB] Waiting for Clone GTID thread every 5s, and will abort after 4min. during deadlock, the server cannot provide normal service