Bug #54220 Corruption or crash on SELECT FROM I_S + ALTER TABLE with innodb_file_per_table
Submitted: 3 Jun 2010 22:22 Modified: 4 Aug 2010 11:00
Reporter: Elena Stepanova Email Updates:
Status: Duplicate Impact on me:
None 
Category:MySQL Server: InnoDB Plugin storage engine Severity:S2 (Serious)
Version:mysql-trunk, mysql-next-mr, 5.5.4-m3 OS:Any
Assigned to: Assigned Account CPU Architecture:Any

[3 Jun 2010 22:22] Elena Stepanova
Description:
One connection executes ALTER TABLE ... TRUNCATE PARTITION, while another connection concurrently selects from I_S.partitions. 

With InnoDB 1.0.6, ALTER TABLE hangs, and server starts rapidly filling up error log with records.

InnoDB: Error: trying to access tablespace 5 page no. 3,
InnoDB: but the tablespace does not exist or is just being dropped.

or alike.

Although bug#38901 is marked as pushed into mysql-next-mr and 5.5.5, the latest binaries from mysql-next-mr and mysql-trunk still seem to have the problem.

With InnoDB 1.1 error log contains similar records, but instead of hanging up, server almost immediately assumes corruption and crashes. 

InnoDB: Error: Unable to read tablespace 6 page no 3 into the buffer pool after 100 attempts
InnoDB: The most probable cause of this error may be that the table has been corrupted.

#3  0x00002b9732b6abb5 in raise () from /lib64/libc.so.6
#4  0x00002b9732b6bfb0 in abort () from /lib64/libc.so.6
#5  0x000000000086f529 in buf_page_get_gen (space=6, zip_size=0, offset=3, rw_latch=2, guess=0x0, mode=10,
    file=0xa8e4cc "./include/btr0btr.ic", line=53, mtr=0x45085380) at buf/buf0buf.c:2745
#6  0x000000000084f303 in btr_root_block_get (index=0x1507938, mtr=<value optimized out>) at ./include/btr0btr.ic:53
#7  0x000000000084fa1b in btr_get_size (index=0x1507938, flag=2) at btr/btr0btr.c:166
#8  0x0000000000885e2d in dict_update_statistics (table=0x15036e8) at dict/dict0dict.c:4166
#9  0x00000000007ba1c1 in ha_innobase::info (this=0x15331b0, flag=30) at handler/ha_innodb.cc:7611
#10 0x00000000006f01b7 in ha_partition::get_dynamic_partition_info (this=<value optimized out>, stat_info=0x45086ae0,
    part_id=6) at ha_partition.cc:5289
#11 0x0000000000713c5d in store_schema_partitions_record (thd=0x14609c0, schema_table=0x1547090, showing_table=0x14e22c0,
    part_elem=0xffffffffffffffff, file=0xffffffff, part_id=11065856) at sql_show.cc:5298
#12 0x0000000000722c6f in get_schema_partitions_record (thd=0x14609c0, tables=<value optimized out>, table=0x1547090,
    res=<value optimized out>, db_name=<value optimized out>, table_name=0x450884c0) at sql_show.cc:5636
#13 0x0000000000725823 in get_all_tables (thd=0x14609c0, tables=0x14dfbc0, cond=0x0) at sql_show.cc:3560
#14 0x0000000000714890 in get_schema_tables_result (join=0x154f1d0, executed_place=PROCESSED_BY_JOIN_EXEC)
    at sql_show.cc:6646
#15 0x0000000000663085 in JOIN::exec (this=0x154f1d0) at sql_select.cc:1829
#16 0x000000000065f2da in mysql_select (thd=0x14609c0, rref_pointer_array=<value optimized out>,
    tables=<value optimized out>, wild_num=0, fields=<value optimized out>, conds=<value optimized out>, og_num=0,
    order=0x0, group=0x0, having=0x0, proc_param=0x0, select_options=0, result=0x14e1180, unit=0x1462238,
    select_lex=0x1462838) at sql_select.cc:2506
#17 0x0000000000664e44 in handle_select (thd=0x14609c0, lex=0x1462198, result=0x14e1180, setup_tables_done_option=0)
    at sql_select.cc:271
#18 0x00000000005f03ab in execute_sqlcom_select (thd=0x14609c0, all_tables=0x14dfbc0) at sql_parse.cc:4839
#19 0x00000000005f5cfb in mysql_execute_command (thd=0x14609c0) at sql_parse.cc:2324
#20 0x00000000005f9c45 in mysql_parse (thd=0x14609c0,
    inBuf=0x14df8c0 "SELECT COUNT(*) FROM information_schema.partitions", length=50, found_semicolon=0x4508b0f0)
    at sql_parse.cc:5871
#21 0x00000000005fa6b4 in dispatch_command (command=COM_QUERY, thd=0x14609c0,
    packet=0x14d7891 "SELECT COUNT(*) FROM information_schema.partitions", packet_length=<value optimized out>)
    at sql_parse.cc:1023
#22 0x00000000005ed9b8 in do_handle_one_connection (thd_arg=<value optimized out>) at sql_connect.cc:1173
#23 0x00000000005eddba in handle_one_connection (arg=<value optimized out>) at sql_connect.cc:1112
#24 0x000000000090d0a7 in pfs_spawn_thread (arg=<value optimized out>) at pfs.cc:1011
#25 0x00002b973257f143 in start_thread () from /lib64/libpthread.so.0
#26 0x00002b9732bfb8cd in clone () from /lib64/libc.so.6
#27 0x0000000000000000 in ?? ()

How to repeat:
# The test case below is not 100%-deterministic. 
# I am getting repeated error messages with 1.0.6 or a crash 
# with 1.1 in approx 90-95% runs, but sometimes you need to run it again.
# For 1.1 the failure is obvious. 
# For 1.0.6 if you see that test execution stopped on ALTER TABLE,
# you can check the error log to see if it is being filled up with the messages.
# The table contains only ~4K records, so normally ALTERs should finish
# almost instantly.

--source include/have_innodb.inc

let $part_num=6;

--disable_warnings
DROP DATABASE IF EXISTS celosia_features;
--enable_warnings

CREATE DATABASE celosia_features;

SET GLOBAL innodb_file_per_table = ON;

--connect(con2,localhost,root,,celosia_features)

eval CREATE TABLE celosia_features.t_celosia_ddl_partitions
(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
                user_num CHAR(10)
                ) ENGINE = innodb
PARTITION BY HASH(id) PARTITIONS $part_num;

--disable_query_log
--echo # Populating the table...
INSERT INTO celosia_features.t_celosia_ddl_partitions (user_num) VALUES ('1234567890');
let $n=12;
while ($n)
{
        INSERT INTO celosia_features.t_celosia_ddl_partitions (user_num)
                SELECT user_num FROM celosia_features.t_celosia_ddl_partitions;
        dec $n;
}
--echo # Finished populating the table

--connect(con1,localhost,root,,test,,)
USE celosia_features;

--enable_query_log

while ($part_num)
{
        dec $part_num;
        --connection con1
        send ;
        eval ALTER TABLE t_celosia_ddl_partitions TRUNCATE PARTITION p$part_num;
        --connection default

        SELECT COUNT(*) FROM information_schema.partitions
                                WHERE table_schema = 'celosia_features'
                                        AND table_name = 't_celosia_ddl_partitions'
                                        AND partition_name IS NOT NULL;
        --connection con1
        --reap
}

--exit
[4 Jun 2010 21:23] Elena Stepanova
Might be related to bug##49938
[29 Jul 2010 13:37] Jimmy Yang
seems to be a dup of #54678
[29 Jul 2010 14:12] Jimmy Yang
The root cause for this bug is the same as with 54678:

We are allowing ::info to peaking into a changing metadata as in the case of bug #54678. A proper solution would be MySQL block access to the table while we are dropping/recreating the table. 

However, MySQL now takes a more relaxed approach due to following reason:

our truncate table could take 2 approaches, 
1) with drop/recreate the table
2) delete row-by-row (if table has foreign keys)

And MySQL server layer has no idea which approach InnoDB is taking. To avoid locking metadata for long(as in option 2), it would allow ::info to access the metadata, which could see stale meatadata, as shown in this case.

Need to work with MySQL for a good solution
[30 Jul 2010 4:26] Jimmy Yang
Here is the step to reproduce the problem and confirm our theory. The steps are similar with 54678, the only difference is that in our case @@innodb_file_per_table is 1, in which case, the truncate table will truncate the whole table space (rather than index trees as in 54678).

mysql>  select @@innodb_file_per_table;
+-------------------------+
| @@innodb_file_per_table |
+-------------------------+
|                       1 |
+-------------------------+
1 row in set (0.00 sec)

CREATE TABLE t1
(id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
                user_num CHAR(10)
                ) ENGINE = innodb
PARTITION BY HASH(id) PARTITIONS 3;

Same as 54678, the problem is between:

1) truncate table t1; (or truncate partition)
2) select * from information_schema.table where table_name = "t1";

1. seesion 1:
mysql>  SELECT COUNT(*) FROM information_schema.partitions
    ->                                 WHERE table_schema = 'test'
    ->                                         AND table_name = 't1'
    ->                                         AND partition_name IS NOT NULL;

2. break at btr_get_size(), first we come with flag == 2, let the session looping

3. session 2:
mysql> ALTER TABLE t1 TRUNCATE PARTITION p0;

4. step through row_truncate_table_for_mysql(), let it go through fil_discard_tablespace(), notice space 15 is dropped, and the new space is 18. 

(gdb) p table->space
$11 = 15
(gdb) n
2816			if (flags != ULINT_UNDEFINED
(gdb) n
2817			    && fil_discard_tablespace(space)) {
(gdb) 
2816			if (flags != ULINT_UNDEFINED
(gdb) 
2821				dict_hdr_get_new_id(NULL, NULL, &space);
(gdb) n
2824				    || fil_create_new_single_table_tablespace(
(gdb) p space
$12 = 18

5) let session 2 looping

6)Let session 1 continue in btr_get_size(), it comes to buf_page_get_gen(), and still try to find page 3 or space 15, but space 15 is dropped, so it start to print messages:

Breakpoint 18, buf_page_get_gen (space=15, zip_size=0, offset=3, rw_latch=2, 
    guess=0x0, mode=10, 
    file=0x877e688 "/home/jy/work/mysql5.5_7/mysql-trunk-innodb/storage/innobase/include/btr0btr.ic", line=53, mtr=0xb1408ab8)
    at /home/jy/work/mysql5.5_7/mysql-trunk-innodb/storage/innobase/buf/buf0buf.c:2734
2734		ulint		retries = 0;
1: block = (buf_block_t *) 0x0
(gdb) c
Continuing.
[Switching to Thread 0xb13dbb70 (LWP 4715)]

Continuing.
100729 21:08:26  InnoDB: Error: trying to access tablespace 15 page no. 3,
InnoDB: but the tablespace does not exist or is just being dropped.
100729 21:08:28  InnoDB: Error: trying to access tablespace 15 page no. 3,
InnoDB: but the tablespace does not exist or is just being dropped.
100729 21:08:28  InnoDB: Error: trying to access tablespace 15 page no. 3,
....
InnoDB: but the tablespace does not exist or is just being dropped.
100729 21:08:28  InnoDB: Error: trying to access tablespace 15 page no. 3,
InnoDB: but the tablespace does not exist or is just being dropped.
100729 21:08:28  InnoDB: Error: trying to access tablespace 15 page no. 3,
[12 Sep 2011 16:32] Avleen Vig
Hi folks,

We tripped over this two days ago. Any update on when this might be fixed?

Thanks!
[12 Sep 2011 16:34] Avleen Vig
Hi folks,

We tripped over this two days ago. Any update on when this might be fixed?

Thanks!
[12 Sep 2011 16:34] Avleen Vig
Hi folks,

We tripped over this two days ago. Any update on when this might be fixed?

Thanks!
[12 Sep 2011 16:54] Calvin Sun
Hi Avleen,

Which version of MySQL you are using? The original bug (#54678) was fixed almost a year ago. The fix should fix this issue too,

Thanks,
Calvin