Bug #83163 server crash after xa transaction loss transaction
Submitted: 27 Sep 2016 8:07 Modified: 7 Oct 2016 10:27
Reporter: Ji Zhang Email Updates:
Status: Verified Impact on me:
None 
Category:MySQL Server: Replication Severity:S2 (Serious)
Version:5.7.14, 5.7.15 OS:CentOS (6.7)
Assigned to: CPU Architecture:Any

[27 Sep 2016 8:07] Ji Zhang
Description:
When a xa transaction is commited and the server crashed, then restart the MySQL server, first, do xa start with the exist xid, says: XAER_DUPID: The XID already exists, but this xid has been submitted.
This is why?

slave replication info:
	mysql> show slave status\G
	*************************** 1. row ***************************
	               Slave_IO_State: 
	                  Master_Host: 172.16.90.233
	                  Master_User: repl
	                  Master_Port: 3390
	                Connect_Retry: 60
	              Master_Log_File: my1binlog.000001
	          Read_Master_Log_Pos: 12598508
	               Relay_Log_File: relay-log-23490.000010
	                Relay_Log_Pos: 9748
	        Relay_Master_Log_File: my1binlog.000001
	................
	................
	  Replicate_Wild_Ignore_Table: 
	                   Last_Errno: 1440
	                   Last_Error: Error 'XAER_DUPID: The XID already exists' on query. Default database: 'test'. Query: 'XA START X'30',X'',1'
	                 Skip_Counter: 0
	................
	................
	               Last_SQL_Errno: 1440
	               Last_SQL_Error: Error 'XAER_DUPID: The XID already exists' on query. Default database: 'test'. Query: 'XA START X'30',X'',1'
	  Replicate_Ignore_Server_Ids: 
	             Master_Server_Id: 23390
	                  Master_UUID: 5d94dc1a-8204-11e6-92ee-5254005428f4
	             Master_Info_File: mysql.slave_master_info
	                    SQL_Delay: 0
	................
	................
	           Master_SSL_Crlpath: 
	           Retrieved_Gtid_Set: 5d94dc1a-8204-11e6-92ee-5254005428f4:23380-29377
	            Executed_Gtid_Set: 5d94dc1a-8204-11e6-92ee-5254005428f4:1-23402
	1 row in set (0.00 sec)

master <my1binlog.000001> info:
	# at 10246088
	#160927  2:53:55 server id 23390  end_log_pos 10246153 CRC32 0x004cd64d         GTID    last_committed=23359    sequence_number=23369
	SET @@SESSION.GTID_NEXT= '5d94dc1a-8204-11e6-92ee-5254005428f4:23369'/*!*/;
	# at 10246153
	#160927  2:53:55 server id 23390  end_log_pos 10246240 CRC32 0x20efecf6         Query   thread_id=187   exec_time=0     error_code=0
	SET TIMESTAMP=1474916035/*!*/;
	XA START X'30',X'',1
	/*!*/;
	# at 10246240
	#160927  2:53:55 server id 23390  end_log_pos 10246297 CRC32 0xf4e0417e         Table_map: `test`.`part_table` mapped to number 119
	# at 10246297
	#160927  2:53:55 server id 23390  end_log_pos 10246341 CRC32 0xcdc10c06         Write_rows: table id 119 flags: STMT_END_F

	BINLOG '
	w27pVxNeWwAAOQAAAJlYnAAAAHcAAAAAAAEABHRlc3QACnBhcnRfdGFibGUAAwMDDwI8AAZ+QeD0
	w27pVx5eWwAALAAAAMVYnAAAAHcAAAAAAAEAAgAD//yaMwAAgwwAAAYMwc0=
	'/*!*/;
	### INSERT INTO `test`.`part_table`
	### SET
	###   @1=13210 /* INT meta=0 nullable=0 is_null=0 */
	###   @2=3203 /* INT meta=0 nullable=1 is_null=0 */
	###   @3=NULL /* VARSTRING(60) meta=60 nullable=1 is_null=1 */
	# at 10246341
	#160927  2:53:55 server id 23390  end_log_pos 10246398 CRC32 0xd8d0ee8d         Table_map: `test`.`norm_table` mapped to number 118
	# at 10246398
	#160927  2:53:55 server id 23390  end_log_pos 10246442 CRC32 0x252d14d7         Write_rows: table id 118 flags: STMT_END_F

	BINLOG '
	w27pVxNeWwAAOQAAAP5YnAAAAHYAAAAAAAEABHRlc3QACm5vcm1fdGFibGUAAwMDDwI8AAaN7tDY
	w27pVx5eWwAALAAAACpZnAAAAHYAAAAAAAEAAgAD//yaMwAAgwwAANcULSU=
	'/*!*/;
	### INSERT INTO `test`.`norm_table`
	### SET
	###   @1=13210 /* INT meta=0 nullable=0 is_null=0 */
	###   @2=3203 /* INT meta=0 nullable=1 is_null=0 */
	###   @3=NULL /* VARSTRING(60) meta=60 nullable=1 is_null=1 */
	# at 10246442
	#160927  2:53:55 server id 23390  end_log_pos 10246527 CRC32 0xbd0250ad         Query   thread_id=187   exec_time=0     error_code=0
	SET TIMESTAMP=1474916035/*!*/;
	XA END X'30',X'',1
	/*!*/;
	# at 10246527
	#160927  2:53:55 server id 23390  end_log_pos 10246564 CRC32 0xb2dd9b80         XA PREPARE X'30',X'',1
	XA PREPARE X'30',X'',1
	/*!*/;
	..........
	..........
	..........
	# at 10249978
	#160927  2:53:55 server id 23390  end_log_pos 10250043 CRC32 0x1386b346         GTID    last_committed=23369    sequence_number=23379
	SET @@SESSION.GTID_NEXT= '5d94dc1a-8204-11e6-92ee-5254005428f4:23379'/*!*/;
	# at 10250043
	#160927  2:53:55 server id 23390  end_log_pos 10250131 CRC32 0x8722aed0         Query   thread_id=187   exec_time=0     error_code=0
	SET TIMESTAMP=1474916035/*!*/;
	XA COMMIT X'30',X'',1
	/*!*/;
Here you can see the master has been submitted.
slave binary log:
	# at 1941042
	#160927  2:53:55 server id 23390  end_log_pos 1941107 CRC32 0x2bce1b66  GTID    last_committed=4771     sequence_number=4772
	SET @@SESSION.GTID_NEXT= '5d94dc1a-8204-11e6-92ee-5254005428f4:23369'/*!*/;
	# at 1941107
	#160927  2:53:55 server id 23390  end_log_pos 1941185 CRC32 0xf3bce965  Query   thread_id=187   exec_time=12    error_code=0
	SET TIMESTAMP=1474916035/*!*/;
	SET @@session.sql_mode=524288/*!*/;
	XA START X'30',X'',1
	/*!*/;
	# at 1941185
	#160927  2:53:55 server id 23390  end_log_pos 1941242 CRC32 0xeadf1e6a  Table_map: `test`.`part_table` mapped to number 108
	# at 1941242
	#160927  2:53:55 server id 23390  end_log_pos 1941286 CRC32 0xad5c60d7  Write_rows: table id 108 flags: STMT_END_F

	BINLOG '
	w27pVxNeWwAAOQAAAPqeHQAAAGwAAAAAAAEABHRlc3QACnBhcnRfdGFibGUAAwMDDwI8AAZqHt/q
	w27pVx5eWwAALAAAACafHQAAAGwAAAAAAAEAAgAD//yaMwAAgwwAANdgXK0=
	'/*!*/;
	### INSERT INTO `test`.`part_table`
	### SET
	###   @1=13210 /* INT meta=0 nullable=0 is_null=0 */
	###   @2=3203 /* INT meta=0 nullable=1 is_null=0 */
	###   @3=NULL /* VARSTRING(60) meta=60 nullable=1 is_null=1 */
	# at 1941286
	#160927  2:53:55 server id 23390  end_log_pos 1941343 CRC32 0xa21c9d71  Table_map: `test`.`norm_table` mapped to number 109
	# at 1941343
	#160927  2:53:55 server id 23390  end_log_pos 1941387 CRC32 0xdd164306  Write_rows: table id 109 flags: STMT_END_F

	BINLOG '
	w27pVxNeWwAAOQAAAF+fHQAAAG0AAAAAAAEABHRlc3QACm5vcm1fdGFibGUAAwMDDwI8AAZxnRyi
	w27pVx5eWwAALAAAAIufHQAAAG0AAAAAAAEAAgAD//yaMwAAgwwAAAZDFt0=
	'/*!*/;
	### INSERT INTO `test`.`norm_table`
	### SET
	###   @1=13210 /* INT meta=0 nullable=0 is_null=0 */
	###   @2=3203 /* INT meta=0 nullable=1 is_null=0 */
	###   @3=NULL /* VARSTRING(60) meta=60 nullable=1 is_null=1 */
	# at 1941387
	#160927  2:53:55 server id 23390  end_log_pos 1941463 CRC32 0x70cde740  Query   thread_id=187   exec_time=12    error_code=0
	SET TIMESTAMP=1474916035/*!*/;
	SET @@session.sql_mode=1436549152/*!*/;
	XA END X'30',X'',1
	/*!*/;
	# at 1941463
	#160927  2:53:55 server id 23390  end_log_pos 1941500 CRC32 0xf1c4290a  XA PREPARE X'30',X'',1
	XA PREPARE X'30',X'',1
	..........
	..........
	..........
	# at 1944806
	#160927  2:53:55 server id 23390  end_log_pos 1944871 CRC32 0x51aaf46a  GTID    last_committed=4781     sequence_number=4782
	SET @@SESSION.GTID_NEXT= '5d94dc1a-8204-11e6-92ee-5254005428f4:23379'/*!*/;
	# at 1944871
	#160927  2:53:55 server id 23390  end_log_pos 1944959 CRC32 0x91c3b6e4  Query   thread_id=187   exec_time=13    error_code=0
	SET TIMESTAMP=1474916035/*!*/;
	XA COMMIT X'30',X'',1
	/*!*/;
	ROLLBACK /* added by mysqlbinlog */ /*!*/;
	SET @@SESSION.GTID_NEXT= 'AUTOMATIC' /* added by mysqlbinlog */ /*!*/;
	DELIMITER ;
	# End of log file
	/*!50003 SET COMPLETION_TYPE=@OLD_COMPLETION_TYPE*/;
	/*!50530 SET @@SESSION.PSEUDO_SLAVE_MODE=0*/;
Here you can see the xa '30' in slave has been submitted.
Query table data from the slave:
	mysql> select * from part_table where c1=13210;
	Empty set (0.00 sec)

	mysql> select * from norm_table where c1=13210;    
	Empty set (0.00 sec)

	mysql> xa recover;
	Empty set (0.00 sec)
Here has been submitted, but no data,this is why?
Before the XID has been submitted, and then begin a the same XID's XA transaction should be OK,but there is a error,says:Error 'XAER_DUPID: The XID already exists' on query.
And the execution of "xa recover", did not return the result.
MySQL has been lost before the transaction has been submitted?

How to repeat:
Reproducing method:
	1.Create test table
		create table norm_table(c1 int auto_increment primary key, c2 int, c3 varchar(20));
		create table part_table(c1 int auto_increment primary key, c2 int, c3 varchar(20));
	2.Perform sysbench in master database
		sysbench --test=./tests/db/xa.lua --mysql-user=root --mysql-password=abc123 \
		--mysql-db=test --mysql-host=127.0.0.1 --mysql-port=3390 --oltp-read-only=off --report-interval=10 \
		--rand-type=uniform --max-time=60 --num-threads=5 --max-requests=0 --percentile=99 run
	xa.lua文件:
		pathtest = string.match(test, "(.*/)") or ""
		dofile(pathtest .. "common.lua")
		function thread_init(thread_id)
		   set_vars()
		   local ids
		   ids = sb_rand(0,10000)
		   local ids2
		   ids2 = sb_rand(10002,20000)
		   
		   if (db_driver == "mysql" and mysql_table_engine == "myisam") then
		      begin_query = "LOCK TABLES test WRITE"
		      commit_query = "UNLOCK TABLES"
		   else
		      begin_query = "xa start " .. "'" .. ids .. "'" 
		      commit_query1 = "xa end " .. "'" .. ids .. "'"
		      commit_query2 = "xa PREPARE " .. "'" .. ids .. "'"
		      commit_query3 = "xa commit " .. "'" .. ids .. "'"
		      begin_query2 = "xa start " .. "'" .. ids2 .. "'"
		      commit_query4 = "xa end " .. "'" .. ids2 .. "'"
		      commit_query5 = "xa commit " .. "'" .. ids2 .. "' ONE PHASE"
		   end
		end

		function event(thread_id)
		   local id
		   id = sb_rand(0,5000)
		   local ids3
		   ids3 = sb_rand(50001,10000)
		   if not oltp_skip_trx then
		      db_query(begin_query)
		   end
		   db_query("UPDATE test.norm_table SET c2 = c2 + 1 where c1 = " .. id )
		   db_query("UPDATE test.part_table SET c2 = c2 + 1 where c1 = " .. id )
		   db_query("delete from test.norm_table where c1 = " .. id )
		   db_query("delete from test.part_table where c1 = " .. id )
		   db_query("insert into test.part_table(c2) values(" .. id .. ")" )
		   db_query("insert into test.norm_table(c2) values(" .. id .. ")" )
		   db_query("delete from test.norm_table where c1 = " .. id )
		   db_query("delete from test.part_table where c1 = " .. id )
		   db_query("SELECT * FROM test.norm_table where sleep(0.3)=0 and c1 = " .. id )
		   if not oltp_skip_trx then
		      db_query(commit_query1)
		      db_query(commit_query2)
		      db_query(commit_query3)
		   end
		   
		   if not oltp_skip_trx then
		      db_query(begin_query2)
		   end
		   db_query("UPDATE test.norm_table SET c2 = c2 + 1 where c1 = " .. ids3 )
		   db_query("UPDATE test.part_table SET c2 = c2 + 1 where c1 = " .. ids3 )
		   db_query("delete from test.norm_table where c1 = " .. ids3 )
		   db_query("delete from test.part_table where c1 = " .. ids3 )
		   db_query("insert into test.part_table(c2) values(" .. ids3 .. ")" )
		   db_query("insert into test.norm_table(c2) values(" .. ids3 .. ")" )
		   db_query("delete from test.norm_table where c1 = " .. ids3 )
		   db_query("delete from test.part_table where c1 = " .. ids3 )
		   db_query("SELECT * FROM test.norm_table where sleep(0.3)=0 and c1 = " .. ids3 )
		   if not oltp_skip_trx then
		      db_query(commit_query4)
		      db_query(commit_query5)
		   end
		end
	3.When the sysbench is executed in master,and slave server power off(The power down).Neither init 0 nor shutdown.
	And then open slave server,slave database replication will be interrupted,Error occurred.
	This phenomenon is accidental,But the probability is high.
[27 Sep 2016 8:08] Ji Zhang
Master and slave configuration information is same:
	mysql> show variables like 'innodb_flush_log_at_trx_commit';
	+--------------------------------+-------+
	| Variable_name                  | Value |
	+--------------------------------+-------+
	| innodb_flush_log_at_trx_commit | 1     |
	+--------------------------------+-------+
	1 row in set (0.00 sec)

	mysql> show variables like 'sync_binlog';
	+---------------+-------+
	| Variable_name | Value |
	+---------------+-------+
	| sync_binlog   | 1     |
	+---------------+-------+
	1 row in set (0.00 sec)
[27 Sep 2016 8:16] Ji Zhang
Configuration file information:
#my.cnf
[client]
port            = 3390
socket          = /tmp/mysql3390.sock

[mysql]
prompt="\\u@\\h \\D \\R:\\m:\\s [\\d]> 
#pager="less -i -n -S"
#tee=/home/mysql/query.log
no-auto-rehash 

[mysqld_multi]
mysqld = /usr/local/mysql /bin/mysqld_safe
mysqladmin = /usr/local/mysql /bin/mysqladmin
log = /usr/local/mysql/mysqld_multi.log

[mysqld]
#misc
user = mysql
basedir = /usr/local/mysql
datadir = /data/mysql/data3390
port = 3390
socket = /tmp/mysql3390.sock
event_scheduler = 0

#timeout
interactive_timeout = 300
wait_timeout = 300

#character set
character-set-server = utf8

open_files_limit = 65535
max_connections = 100
max_connect_errors = 100000

#logs
log-output=file
slow_query_log = 1
slow_query_log_file = /data/mysql/data3390/slow.log
log-error =/data/mysql/data3390/error.log
log_warnings = 2
pid-file = /data/mysql/data3390/mysql.pid
long_query_time = 1
#log-slow-admin-statements = 1
#log-queries-not-using-indexes = 1
log-slow-slave-statements = 1

log-slave-updates=1
#GTID
gtid_mode=on
enforce-gtid-consistency=on

#binlog
binlog_format = row
server-id = 23490
log-bin = my1binlog
binlog_cache_size = 4M
max_binlog_size = 1G
max_binlog_cache_size = 2G
sync_binlog = 1
expire_logs_days = 10

#relay log
skip_slave_start = 0
max_relay_log_size = 1G
relay_log_purge = 1
relay_log_recovery = 1
log_slave_updates
relay_log=relay-log-23490
#slave-skip-errors=1032,1053,1062

#buffers & cache
table_open_cache = 2048
table_definition_cache = 2048
table_open_cache = 2048
max_heap_table_size = 96M
sort_buffer_size = 2M
join_buffer_size = 2M
thread_cache_size = 256
query_cache_size = 0
query_cache_type = 0
query_cache_limit = 256K
query_cache_min_res_unit = 512
thread_stack = 192K
tmp_table_size = 96M
key_buffer_size = 8M
read_buffer_size = 2M
read_rnd_buffer_size = 16M
bulk_insert_buffer_size = 32M

#myisam
myisam_sort_buffer_size = 128M
myisam_max_sort_file_size = 10G
myisam_repair_threads = 1

master-info-repository=TABLE
relay-log-info-repository=TABLE

#semi
#rpl_semi_sync_master_timeout=655350
#rpl_semi_sync_master_enabled=1
log_bin_trust_function_creators=1

#innodb
innodb_buffer_pool_size = 1G
innodb_buffer_pool_instances = 1
#innodb_data_file_path = ibdata1:3G:autoextend
#add 5.7
innodb_page_size=16384
innodb_flush_log_at_trx_commit = 1
innodb_log_buffer_size = 64M
innodb_log_file_size = 256M
innodb_log_files_in_group = 2
innodb_max_dirty_pages_pct = 50
innodb_file_per_table = 1
innodb_rollback_on_timeout
innodb_status_file = 1
innodb_io_capacity = 2000
transaction_isolation = READ-COMMITTED
innodb_flush_method = O_DIRECT

[mysqld3390]
basedir = /usr/local/mysql
datadir = /data/mysql/data3390
innodb_buffer_pool_size = 4G
transaction_isolation = REPEATABLE-READ
[7 Oct 2016 10:27] Umesh Shastry
Hello Ji Zhang,

Thank you for the report and test case.
Observed this with 5.7.15 build.

Thanks,
Umesh
[7 Oct 2016 10:29] Umesh Shastry
test results

Attachment: 83163_5.7.15.results (application/octet-stream, text), 11.06 KiB.

[7 Oct 2016 10:42] Umesh Shastry
Bug #83161 marked as duplicate of this