Bug #111770 group replication lost transaction when import tablespace failed
Submitted: 17 Jul 2:59 Modified: 18 Jul 10:57
Reporter: gang liu Email Updates:
Status: Verified Impact on me:
None 
Category:MySQL Server: DDL Severity:S1 (Critical)
Version:8.0.30, 8.0.33 OS:Any
Assigned to: CPU Architecture:Any
Tags: group replication; import tablespace

[17 Jul 2:59] gang liu
Description:
a node crash in a mgr cluster during doing the "alter table ... import tablespace", when the node restart and join the mgr cluster, its  Gtid_executed & binlog is inconsistent.

How to repeat:
1.I add debug code:
```
--- a/sql/sql_table.cc
+++ b/sql/sql_table.cc
@@ -11302,6 +11302,9 @@ bool Sql_cmd_discard_import_tablespace::mysql_discard_or_import_tablespace(
           ++pa_id;
         }
       } else {
+        if (DBUG_EVALUATE_IF("import_tablespace_failed", true, false)) {
+          sleep(3600);
+        }
         assert(m_alter_info->partition_names.elements == 0);
         // We get here when we have imported a table tablespace, or all
         // partition tablespaces. In this case, we remove SDIs from all
```
2.the mtr script:

```
################################################################################
#
# steps:
# 0. Initialization:
#  - The test requires three servers: M1, M2, and M3.
# 1. ALTER TABLE IMPORT ... TABLESPACE ON M1
# 2. M3 APPLY THE BINLOG, when ALTER TABLE IMPORT ... TABLESPACE , the server M3 was killed
# 3. M3 start and join the group replicaton. then, M3 should have applied binlog (alter table import tablespace ...) agin.
# however, in step 2, the gtid(ALTER TABLE IMPORT ... TABLESPACE) was record in table mysql.gtid_executed, 
# the binlog (ALTER TABLE IMPORT ... TABLESPACE) was not write into binlog file.
# 4.Conlusion: the ddl (ALTER TABLE IMPORT ... TABLESPACE) was not atomicity lead some serious problem.
################################################################################

--source include/have_group_replication_plugin.inc
--source include/force_restart.inc
--let $rpl_skip_group_replication_start= 0
--let $rpl_server_count= 3
--source include/group_replication.inc

let $MYSQLD_DATADIR = `SELECT @@datadir`;
CREATE TABLE t1 (c1 VARCHAR(32) primary key, c2 VARCHAR(32), c3 VARCHAR(32));

INSERT INTO t1 VALUES ('Test Data -1', 'Test Data -2', 'Test Data -3');
--sleep 5

FLUSH table t1 for export;
--copy_file $MYSQLD_DATADIR/test/t1.ibd $MYSQLD_DATADIR/t1.ibd_back
--copy_file $MYSQLD_DATADIR/test/t1.cfg $MYSQLD_DATADIR/t1.cfg_back

--let $rpl_connection_name= server2
--source include/rpl_connection.inc
let $MYSQLD_DATADIR = `SELECT @@datadir`;
--copy_file $MYSQLD_DATADIR/test/t1.ibd $MYSQLD_DATADIR/t1.ibd_back

--let $rpl_connection_name= server3
--source include/rpl_connection.inc
let $MYSQLD_DATADIR = `SELECT @@datadir`;
--copy_file $MYSQLD_DATADIR/test/t1.ibd $MYSQLD_DATADIR/t1.ibd_back

--let $rpl_connection_name= server1
--source include/rpl_connection.inc
unlock tables;
DROP TABLE t1;

CREATE TABLE t2 (c1 VARCHAR(32) primary key, c2 VARCHAR(32), c3 VARCHAR(32));
ALTER TABLE t2 DISCARD TABLESPACE;

let $MYSQLD_DATADIR = `SELECT @@datadir`;
--move_file $MYSQLD_DATADIR/t1.ibd_back $MYSQLD_DATADIR/test/t2.ibd
--move_file $MYSQLD_DATADIR/t1.cfg_back $MYSQLD_DATADIR/test/t2.cfg

--let $rpl_connection_name= server2
--source include/rpl_connection.inc
let $MYSQLD_DATADIR = `SELECT @@datadir`;
--move_file $MYSQLD_DATADIR/t1.ibd_back $MYSQLD_DATADIR/test/t2.ibd

--let $rpl_connection_name= server3
--source include/rpl_connection.inc
let $MYSQLD_DATADIR = `SELECT @@datadir`;
--move_file $MYSQLD_DATADIR/t1.ibd_back $MYSQLD_DATADIR/test/t2.ibd

set global debug='+d,import_tablespace_failed';

--let $rpl_connection_name= server1
--source include/rpl_connection.inc
ALTER TABLE t2 IMPORT TABLESPACE;
--sleep 5

--let $rpl_connection_name= server3
--source include/rpl_connection.inc

--let $_server_id= `SELECT @@server_id`
--let $expect_file_name_s3= $MYSQLTEST_VARDIR/tmp/mysqld.$_server_id.expect
--let $mysqld_pid_file=`SELECT @@GLOBAL.pid_file`
--source include/expect_crash.inc
exec kill -9 `cat $mysqld_pid_file`;

--source include/start_mysqld.inc
set global debug='-d,import_tablespace_failed';

--let $rpl_connection_name= server1
--source include/rpl_connection.inc
show master status;
--let $s1_gtid= `SELECT @@gtid_executed`
select * from t2;

--let $rpl_connection_name= server3
--source include/rpl_connection.inc
eval SELECT WAIT_FOR_EXECUTED_GTID_SET('$s1_gtid', 60);
--error 1814
select * from t2;

--let $rpl_connection_name= server1
--source include/rpl_connection.inc
DROP TABLE t2;

--source include/group_replication_end.inc
```

3.the result script:

```
[connection server1]
CREATE TABLE t1 (c1 VARCHAR(32) primary key, c2 VARCHAR(32), c3 VARCHAR(32));
INSERT INTO t1 VALUES ('Test Data -1', 'Test Data -2', 'Test Data -3');
FLUSH table t1 for export;
[connection server2]
[connection server3]
[connection server1]
unlock tables;
DROP TABLE t1;
CREATE TABLE t2 (c1 VARCHAR(32) primary key, c2 VARCHAR(32), c3 VARCHAR(32));
ALTER TABLE t2 DISCARD TABLESPACE;
[connection server2]
[connection server3]
set global debug='+d,import_tablespace_failed';
[connection server1]
ALTER TABLE t2 IMPORT TABLESPACE;
[connection server3]
# restart
set global debug='-d,import_tablespace_failed';
[connection server1]
show master status;
File    Position        Binlog_Do_DB    Binlog_Ignore_DB        Executed_Gtid_Set
server-binary-log.000001        2705                    e8b953e5-2236-11ee-9ad1-2cf05daaf60d:1-9
select * from t2;
c1      c2      c3
Test Data -1    Test Data -2    Test Data -3
[connection server3]
SELECT WAIT_FOR_EXECUTED_GTID_SET('e8b953e5-2236-11ee-9ad1-2cf05daaf60d:1-9', 60);
WAIT_FOR_EXECUTED_GTID_SET('e8b953e5-2236-11ee-9ad1-2cf05daaf60d:1-9', 60)
0
select * from t2;
ERROR HY000: Tablespace has been discarded for table 't2'
[connection server1]
DROP TABLE t2;
[connection server3]
[connection server1]
include/group_replication_end.inc 
``` 

Suggested fix:
I do not have good suggestion.
[17 Jul 3:09] gang liu
I also found the sql "alter table ... import tablespace" was not atomic.

the mtr script:

```
let $MYSQLD_DATADIR = `SELECT @@datadir`;
CREATE TABLE t1 (c1 VARCHAR(32), c2 VARCHAR(32), c3 VARCHAR(32));

INSERT INTO t1 VALUES ('Test Data -1', 'Test Data -2', 'Test Data -3');

FLUSH table t1 for export;
--copy_file $MYSQLD_DATADIR/test/t1.cfg $MYSQLD_DATADIR/t1.cfg_back
--copy_file $MYSQLD_DATADIR/test/t1.ibd $MYSQLD_DATADIR/t1.ibd_back
unlock tables;
DROP TABLE t1;

CREATE TABLE t2 (c1 VARCHAR(32), c2 VARCHAR(32), c3 VARCHAR(32));
ALTER TABLE t2 DISCARD TABLESPACE;

--move_file $MYSQLD_DATADIR/t1.cfg_back $MYSQLD_DATADIR/test/t2.cfg
--move_file $MYSQLD_DATADIR/t1.ibd_back $MYSQLD_DATADIR/test/t2.ibd

set global debug='+d,import_tablespace_failed';
send ALTER TABLE t2 IMPORT TABLESPACE;
--sleep 5

--echo # In connection con1 - Running Insert
connect (con1,localhost,root,,);
--let $_server_id= `SELECT @@server_id`
--let $expect_file_name_s3= $MYSQLTEST_VARDIR/tmp/mysqld.$_server_id.expect
--let $mysqld_pid_file=`SELECT @@GLOBAL.pid_file`
--source include/expect_crash.inc
exec kill -9 `cat $mysqld_pid_file`;

--source include/start_mysqld.inc
set global debug='-d,import_tablespace_failed';

--error 1815
ALTER TABLE t2 IMPORT TABLESPACE;

DROP TABLE t2;

```

the result:

```
CREATE TABLE t1 (c1 VARCHAR(32), c2 VARCHAR(32), c3 VARCHAR(32));
INSERT INTO t1 VALUES ('Test Data -1', 'Test Data -2', 'Test Data -3');
FLUSH table t1 for export;
unlock tables;
DROP TABLE t1;
CREATE TABLE t2 (c1 VARCHAR(32), c2 VARCHAR(32), c3 VARCHAR(32));
ALTER TABLE t2 DISCARD TABLESPACE;
set global debug='+d,import_tablespace_failed';
ALTER TABLE t2 IMPORT TABLESPACE;
# In connection con1 - Running Insert
# restart
set global debug='-d,import_tablespace_failed';
ALTER TABLE t2 IMPORT TABLESPACE;
ERROR HY000: Internal error: Cannot reset LSNs in table `test`.`t2` : Data structure corruption
DROP TABLE t2;

```
[18 Jul 10:57] MySQL Verification Team
Hello gang liu,

Thank you for the report and test case.
Verified as described.

regards,
Umesh