Bug #93901 InnoDB purge thread don't release GLOBAL and BACKUP MDL lock after undo truncate
Submitted: 12 Jan 9:17 Modified: 23 Jan 23:42
Reporter: Fungo Wang Email Updates:
Status: Closed Impact on me:
None 
Category:MySQL Server: InnoDB storage engine Severity:S2 (Serious)
Version:8.0.13 OS:Any
Assigned to: CPU Architecture:Any
Tags: Backup, global, innodb, mdl, purge, undo truncate

[12 Jan 9:17] Fungo Wang
Description:
When InnoDB background purge thread truncate some undo logs, the GLOBAL and BACKUP MDL lock is held by it and never released. The user thread will get stuck if try to execute SQLs that need these two MDL locks, such as 
1. FLUSH TABLES WITH READ LOCK
2. LOCK INSTANCE FOR BACKUP
3. SET GLBOAL read_only = on

How to repeat:
Below is the recurrence testcase, actually I just copied the previous part of innodb_undo.truncate mtr testcase, and added 2 lines.

#
# WL#6965: Truncate UNDO logs.
#

--source include/not_valgrind.inc
--source include/have_debug.inc

# The test is shutting down the server to force undo truncation.
--source include/big_test.inc

SET GLOBAL innodb_undo_tablespaces=2;
SET GLOBAL innodb_rollback_segments=1;
SET GLOBAL innodb_undo_log_truncate=1;
SET GLOBAL innodb_max_undo_log_size=10485760;
SET GLOBAL innodb_purge_rseg_truncate_frequency=1;
SET GLOBAL innodb_purge_stop_now=ON;
SET GLOBAL innodb_stats_auto_recalc=OFF;
SHOW VARIABLES LIKE 'innodb_undo_tablespaces';
SHOW VARIABLES LIKE 'innodb_rollback_segments';

################################################################################
# Test-case will test following scenarios.
#
# 1. Perform enough DML action so that undo tablespace sizes grow beyond
#    set threshold and then wait and see if it is being truncated.
#
################################################################################

#-----------------------------------------------------------------------------
#
# create test-bed
#
let MYSQLD_DATADIR = `select @@datadir`;

#-----------------------------------------------------------------------------
#
# 1. Perform enough DML action so that undo tablespace sizes grow beyond
#    set threshold and then wait and see if it is being truncated.
#

# Storing the initial sizes of the tablespaces before DML action

let CHECKFILE = $MYSQLTEST_VARDIR/tmp/check.txt;

perl;
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$ini1)
        = stat("$ENV{MYSQLD_DATADIR}/undo_001");
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$ini2)
        = stat("$ENV{MYSQLD_DATADIR}/undo_002");
open(OUT, ">$ENV{CHECKFILE}") || die;
print OUT "let \$ini1= $ini1;\n";
print OUT "let \$ini2= $ini2;\n";
close(OUT);
EOF
--source $CHECKFILE

# Start of the first transaction that performs sufficient DML action
--connect(con1,localhost,root,,test)
--connection con1

SHOW VARIABLES LIKE 'innodb_rollback_segments';
create table t1(
  keyc int,
  c1 char(255),
  c2 char(255),
  c3 char(255),
  c4 char(255),
  c5 char(255),
  c6 char(255),
  primary key(keyc)) engine = innodb;

delimiter |;
CREATE PROCEDURE populate_t1()
BEGIN
        DECLARE i INT DEFAULT 1;
        while (i <= 20000) DO
                insert into t1 values (i, 'a', 'b', 'c', 'd', 'e', 'f' );
                SET i = i + 1;
        END WHILE;
END |
delimiter ;|

begin;
call populate_t1();
delete from t1 where keyc < 10000;
update t1 set c1 = 'mysql' where  keyc > 10000;
update t1 set c2 = 'mysql' where  keyc > 10000;
update t1 set c3= 'mysql' where keyc > 10000;
update t1 set c4= 'mysql' where keyc > 10000;
update t1 set c5= 'mysql' where keyc > 10000;
update t1 set c6= 'mysql' where keyc > 10000;

# Starting a new transaction for both tablespaces to get utilized
--connect(con2,localhost,root,,test)
--connection con2

create table t2 (
  keyc int,
  c1 char(255),
  c2 char(255),
  c3 char(255),
  c4 char(255),
  c5 char(255),
  c6 char(255),
  primary key(keyc)) engine = innodb;

delimiter |;
CREATE PROCEDURE populate_t2()
BEGIN
        DECLARE i INT DEFAULT 1;
        while (i <= 20000) DO
                insert into t2 values (i, 'a', 'b', 'c', 'd', 'e', 'f' );
                SET i = i + 1;
        END WHILE;
END |
delimiter ;|

begin;
call populate_t2();
delete from t2 where keyc < 10000;
update t2 set c1 = 'mysql' where  keyc > 10000;
update t2 set c2 = 'mysql' where  keyc > 10000;
update t2 set c3= 'mysql' where keyc > 10000;
update t2 set c4= 'mysql' where keyc > 10000;
update t2 set c5= 'mysql' where keyc > 10000;
update t2 set c6= 'mysql' where keyc > 10000;

--connection con2
commit;
--connection con1
commit;

--connection default

# Cleanup
drop PROCEDURE populate_t1;
drop PROCEDURE populate_t2;
drop table t1, t2;
--disconnect con1
--disconnect con2

# Creating a file check.txt to record information about tablespace sizes
let CHECKFILE = $MYSQLTEST_VARDIR/tmp/check.txt;

# Storing the sizes of both tablespaces before server shutdown
perl;
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size1)
        = stat("$ENV{MYSQLD_DATADIR}/undo_001");
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size2)
        = stat("$ENV{MYSQLD_DATADIR}/undo_002");
open(OUT, ">$ENV{CHECKFILE}") || die;
print OUT "let \$size1= $size1;\n";
print OUT "let \$size2= $size2;\n";
print OUT "expr \$before= \$size1 + \$size2;\n";
close(OUT);
EOF

SET GLOBAL innodb_purge_run_now=ON;

# Give the purge thread some time to do undo truncate
sleep 10;

FLUSH TABLES WITH READ LOCK; // will stuck here

When the testcase get stuck, we can get the MDL info easily from performance_schema.metadata_locks, and we can see that thread/innodb/srv_purge_thread hold the INTENTION_EXCLUSIVE type GLOAL and BACKUP MDL lock with TRANSACTION duration.

mysql> show processlist;
+----+-----------------+-----------------+------+---------+------+------------------------------+-----------------------------+
| Id | User            | Host            | db   | Command | Time | State                        | Info                        |
+----+-----------------+-----------------+------+---------+------+------------------------------+-----------------------------+
|  4 | event_scheduler | localhost       | NULL | Daemon  |  185 | Waiting on empty queue       | NULL                        |
|  9 | root            | localhost       | test | Query   |   46 | Waiting for global read lock | FLUSH TABLES WITH READ LOCK |
| 11 | root            | localhost:48654 | NULL | Query   |    0 | starting                     | show processlist            |
+----+-----------------+-----------------+------+---------+------+------------------------------+-----------------------------+
3 rows in set (0.00 sec)

mysql> select * from performance_schema.metadata_locks;
+-------------+--------------------+----------------+-------------+-----------------------+---------------------+---------------+-------------+------------------------+-----------------+----------------+
| OBJECT_TYPE | OBJECT_SCHEMA      | OBJECT_NAME    | COLUMN_NAME | OBJECT_INSTANCE_BEGIN | LOCK_TYPE           | LOCK_DURATION | LOCK_STATUS | SOURCE                 | OWNER_THREAD_ID | OWNER_EVENT_ID |
+-------------+--------------------+----------------+-------------+-----------------------+---------------------+---------------+-------------+------------------------+-----------------+----------------+
| GLOBAL      | NULL               | NULL           | NULL        |       140181295967536 | INTENTION_EXCLUSIVE | TRANSACTION   | GRANTED     | dictionary_impl.cc:416 |              34 |        1213371 |
| BACKUP LOCK | NULL               | NULL           | NULL        |       140181293968896 | INTENTION_EXCLUSIVE | TRANSACTION   | GRANTED     | dictionary_impl.cc:421 |              34 |        1213374 |
| GLOBAL      | NULL               | NULL           | NULL        |       140180690164416 | SHARED              | EXPLICIT      | PENDING     | lock.cc:1013           |              47 |          27691 |
| TABLE       | performance_schema | metadata_locks | NULL        |       140180619626736 | SHARED_READ         | TRANSACTION   | GRANTED     | sql_parse.cc:5744      |              49 |          10014 |
+-------------+--------------------+----------------+-------------+-----------------------+---------------------+---------------+-------------+------------------------+-----------------+----------------+
4 rows in set (0.00 sec)

mysql> select * from performance_schema.threads where thread_id = 34\G
*************************** 1. row ***************************
          THREAD_ID: 34
               NAME: thread/innodb/srv_purge_thread
               TYPE: BACKGROUND
     PROCESSLIST_ID: NULL
   PROCESSLIST_USER: NULL
   PROCESSLIST_HOST: NULL
     PROCESSLIST_DB: NULL
PROCESSLIST_COMMAND: NULL
   PROCESSLIST_TIME: 207
  PROCESSLIST_STATE: NULL
   PROCESSLIST_INFO: NULL
   PARENT_THREAD_ID: NULL
               ROLE: NULL
       INSTRUMENTED: YES
            HISTORY: YES
    CONNECTION_TYPE: NULL
       THREAD_OS_ID: 65003
     RESOURCE_GROUP: SYS_default
1 row in set (0.00 sec)

Suggested fix:
From the code level, in the `trx_undo_truncate_tablespace()` fucntion, the invoking of `dd::acquire_exclusive_tablespace_mdl()` will get the GLOBAL and BAKCUP mdl lock with MDL_TRANSACTION duration while try to get the EXPLICT mdl lock of undo log to be truncated. But after that, these two transaction duration MDL locks are not released (dd_release_mdl() only release EXPLICT MDL lock).

The proposed fix is to call `dd::commit_or_rollback_tablespace_change()` to release TRANSACTION duration MDL locks in `trx_undo_truncate_tablespace()`, just like what we did in `resume_alter_encrypt_tablespace()`.
[14 Jan 14:36] Sinisa Milivojevic
Hi,

I completely understand what you are writing about.

I think that the real bug here is that purge thread(s) hold these locks exclusively. I am not sure that your solution is the correct one, but it deserves a proper analysis.

Verified as reported.
[14 Jan 14:38] Sinisa Milivojevic
Hi,

This could be treated as "Feature request" or "Performance", but I prefer to leave it as a bug, since it deserves attention.

Unrelated to the entire matter, in MySQL 8.0, you do not need FTWRL in order to backup InnoDB tables.
[14 Jan 16:31] Fungo Wang
Hi Sinisa,

IMHO, I think it's definitely a serious bug, not "performance" or "feature request", some functionalities of mysql are broken because of this.

GLOBAL and BACKUP MDL lock are both held by the background purge thread.
As I mentioned at the beginning of this report, FTWRL is just one use case, there are other administration SQLs that could be blocked.
'RESET MASTER'  is another victim SQL;
LOCK INSTANCE FOR BACKUP is also blocked, so I doubt whether MEB can still work under this scenario :)
[14 Jan 18:01] Sinisa Milivojevic
Hi Fungo,

I think that you are right and I  am increasing the severity.
[23 Jan 23:42] Daniel Price
Posted by developer:
 
Fixed as of the upcoming 8.0.15 release, and here's the changelog entry:

Global and backup metadata locks were not released after the background
purge thread truncated undo logs.
[22 Mar 22:42] Marcelo Altmann
This issue was in fact fixed on 8.0.14. However, neither 8.0.14 nor 8.015 release notes mention it. Please update the release notes of 8.0.14.
[26 Mar 13:38] Sinisa Milivojevic
Hi Fungo,

I agree with you and I have notified our Documentation team.

Thanks very much.
[26 Mar 13:54] Sinisa Milivojevic
Hi,

Bug is mentioned in 8.0.16 Changelog, but the note will be moved to the correct release notes.
[26 Mar 14:03] Daniel Price
Posted by developer:
 
Corrected Version Fixed field. The issue was fixed in 8.0.14. Changelog entry was moved from 8.0.16 to 8.0.14 release notes.