From 90bd2d039c3786eb2d18d0aca3fbeead3639071c Mon Sep 17 00:00:00 2001 From: casazhang Date: Fri, 15 Oct 2021 15:05:51 +0800 Subject: [PATCH] BUG#105224 dict_stats_analyze_index() may always disable optimal index-based access paths Problem: ======== Both foreground ANALYZE commands and the background statistics thread invoke dict_stats_analyze_index() to gather index statistics. These InnoDB persistent statistics are kept in shared dict objects and read without latching by the cost-based optimizer to calculate costs for access paths including table scan and index-based ones. There are two kinds of problems due to empty statistics: 1) When dict_stats_analyze_index() is in progress, empty index statistics may be read concurrently by the server. For a big enough table, REF access path could be disabled, even if it could be much more efficient than table scan. 2) When dict_stats_analyze_index() does an error return, empty index statistics lasts until the next call on the same index. There are two kinds of consequences. One is always table scan due to implied zero table rows. The other is disabled REF access path as above. In these cases, optimal index-based access paths could be always disabled until statistics are refreshed, and table scan is used even for very big tables which hurts the server badly. Analysis: ========= 1. Why the optimizer chooses a bad plan on empty statistics: The optimizer uses KEY::rec_per_key to estimate selectivity of equi-join conditions and equality ranges if in the latter case the count exceeds sysvar eq_range_index_dive_limit. When the statistic value is big enough, REF access path is considered useless. The rec_per_key statistic is converted from stat_n_diff_key_vals by innodb_rec_per_key(), and set to stat_n_rows if it is zero as in empty statistics. So REF access path is always considered useless for a big enough table. Note that BUG#99924 suggests a magic number instead of stat_n_rows as a workaround. 
The optimizer uses file->stats.records to estimate table scan cost. When it is small enough, table scan is considered the optimal access path, so index-based paths are disabled. The records statistic is from dict_table_t::stat_n_rows, which is in turn from index_t::stat_n_diff_key_vals of the primary index. Zero is small enough to always select table scan. 2. How empty statistics are generated: When statistics are being gathered, the server is free to fetch statistics by info() calls without lock for performance reasons. For example, opening a table on cache miss invokes info(HA_STATUS_CONST | HA_STATUS_NO_LOCK) to set rec_per_key and its float variant; optimizer initialization invokes info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK) to set file->stats. Note that dict_stats_analyze_index() performs in-place empty-analyze-fill operations on dict_index_t objects. The analyze step could last for a few seconds with default sampling setup, giving a good chance for empty statistics. Since the server caches rec_per_key with TABLE_SHARE, the problem occurs only in certain setups where a cache miss happens when analyze is in progress. There is no restore when the analyze step actually fails. This leaves empty statistics even beyond the scope of dict_stats_analyze_index(). If the index is the primary index, dict_table_t::stat_n_rows will be set to zero in dict_stats_update_persistent(). Note that "BUG#98132 Analyze table leads to empty statistics during online rebuild DDL" could be classified into the second kind. It is a serious production issue as Online DDL becomes widely used. A rare case with tree height change also falls into the second kind. Fix: ==== Analyze with a private copy and overwrite at the end, rather than in-place empty-analyze-fill in dict_stats_analyze_index(). It does not fix concurrent read, but prevents empty statistics. 
Overwrite only on a successful analyze; otherwise, the stats are leaved in the old state which is guaranteed to be initialized by a preceding dict_stats_init(). Retrying mechanism is introduced for the rare case, or ensured by the outer workflow in other situations. --- mysql-test/r/zero-n_diff-during-analyze.result | 597 +++++++++++++++++++++++++ mysql-test/t/zero-n_diff-during-analyze.test | 221 +++++++++ sql/table.cc | 8 +- storage/innobase/dict/dict0stats.cc | 202 ++++++++- storage/innobase/row/row0mysql.cc | 6 + 5 files changed, 1016 insertions(+), 18 deletions(-) create mode 100644 mysql-test/r/zero-n_diff-during-analyze.result create mode 100644 mysql-test/t/zero-n_diff-during-analyze.test diff --git a/mysql-test/r/zero-n_diff-during-analyze.result b/mysql-test/r/zero-n_diff-during-analyze.result new file mode 100644 index 0000000..327da28 --- /dev/null +++ b/mysql-test/r/zero-n_diff-during-analyze.result @@ -0,0 +1,597 @@ +# +# Verify error return: undefined tree size. +# +create table t1 (a int primary key auto_increment, b int); +SET SESSION DEBUG='+d,dict_stats_skip_stat_modified_counter'; +insert t1 values (1,1); +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +# old stats +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +n_rows clustered_index_size +1 1 +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +index_name stat_value sample_size +PRIMARY 1 1 +insert t1 values (2,2); +SET SESSION DEBUG='+d,dict_stats_simulate_undefined_size'; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +# Expect old stats because of simulated strong failure +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +n_rows clustered_index_size +1 1 +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; 
+index_name stat_value sample_size +PRIMARY 1 1 +SET SESSION DEBUG='-d,dict_stats_simulate_undefined_size'; +SET SESSION DEBUG='-d,dict_stats_skip_stat_modified_counter'; +drop table t1; +# +# Verify error return: tree change. +# +create table t1 (a int); +SET SESSION DEBUG='+d,dict_stats_skip_stat_modified_counter'; +insert t1 values (1); +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +# Prepare until 16384 rows to get a tree with more than one level ... +# old stats +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +n_rows clustered_index_size +1 1 +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +index_name stat_value sample_size +GEN_CLUST_INDEX 1 1 +SET SESSION DEBUG='+d,dict_stats_simulate_tree_change'; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +# Expect new stats because of the simulated failure is considered temporary +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +n_rows clustered_index_size +16569 97 +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +index_name stat_value sample_size +GEN_CLUST_INDEX 16569 20 +SET SESSION DEBUG='-d,dict_stats_simulate_tree_change'; +SET SESSION DEBUG='-d,dict_stats_skip_stat_modified_counter'; +drop table t1; +# +# Zero n_diff during analyze may disable REF access paths. +# +create table t1 (a int primary key auto_increment, b int); +create table t2 (a int primary key auto_increment, b int, c int, key(b)); +# Prepare data ... +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +analyze table t2; +Table Op Msg_type Msg_text +test.t2 analyze status OK +# Simulate HA_STATUS_CONST in fetch_number_of_rows(), because +# ha_innobase::open() is only invoked on table cache miss. 
+SET SESSION DEBUG='+d,fetch_number_of_rows_info_const'; +# With correct n_diff, ref access is used for t2. +SET eq_range_index_dive_limit = 1; +EXPLAIN select * from t2 where b = 1; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t2 NULL ref b b 5 const 1 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`c` AS `c` from `test`.`t2` where (`test`.`t2`.`b` = 1) +SET eq_range_index_dive_limit = DEFAULT; +EXPLAIN select * from t1 join t2 using (b); +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ALL NULL NULL NULL NULL 2 100.00 Using where +1 SIMPLE t2 NULL ref b b 5 test.t1.b 1 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`c` AS `c` from `test`.`t1` join `test`.`t2` where (`test`.`t2`.`b` = `test`.`t1`.`b`) +# Let ANALYZE pause right after dict_stats_empty_index() on the first secondary index. +SET DEBUG_SYNC='dict_stats_analyze_index_empty_sk wait_for continue_analyze execute 1'; +analyze table t2;; +# With temporary zero n_diff, table scan is used for t2. +# With correct n_diff, ref access is used for t2. 
+SET eq_range_index_dive_limit = 1; +EXPLAIN select * from t2 where b = 1; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t2 NULL ref b b 5 const 1 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t2`.`a` AS `a`,`test`.`t2`.`b` AS `b`,`test`.`t2`.`c` AS `c` from `test`.`t2` where (`test`.`t2`.`b` = 1) +SET eq_range_index_dive_limit = DEFAULT; +EXPLAIN select * from t1 join t2 using (b); +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL ALL NULL NULL NULL NULL 2 100.00 Using where +1 SIMPLE t2 NULL ref b b 5 test.t1.b 1 100.00 NULL +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`b` AS `b`,`test`.`t1`.`a` AS `a`,`test`.`t2`.`a` AS `a`,`test`.`t2`.`c` AS `c` from `test`.`t1` join `test`.`t2` where (`test`.`t2`.`b` = `test`.`t1`.`b`) +SET DEBUG_SYNC = "now signal continue_analyze"; +Table Op Msg_type Msg_text +test.t2 analyze status OK +SET SESSION DEBUG='-d,fetch_number_of_rows_info_const'; +drop table t1, t2; +SET DEBUG_SYNC = 'RESET'; +# +# BUG#98132 Zero dict_table_t::stat_n_rows during Online ALTER +# may lead to table scan thus bad performance. 
+SET optimizer_trace_max_mem_size=1048576; +SET optimizer_trace="enabled=on,one_line=off"; +create table t1 (a int primary key auto_increment, x int, b int, c char(64), key(b)); +insert t1 values (1,1,1,'x'),(2,2,2,'x'); +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +n_rows clustered_index_size +48 1 +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +index_name stat_value sample_size +PRIMARY 48 1 +b 2 1 +# With correct stat_n_rows, it is range access. 
+explain select * from t1 where b >= 2; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range b b 5 NULL 16 100.00 Using index condition +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`a` AS `a`,`test`.`t1`.`x` AS `x`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`b` >= 2) +SELECT TRACE FROM information_schema.OPTIMIZER_TRACE; +TRACE +{ + "steps": [ + { + "join_preparation": { + "select#": 1, + "steps": [ + { + "expanded_query": "/* select#1 */ select `t1`.`a` AS `a`,`t1`.`x` AS `x`,`t1`.`b` AS `b`,`t1`.`c` AS `c` from `t1` where (`t1`.`b` >= 2)" + } + ] + } + }, + { + "join_optimization": { + "select#": 1, + "steps": [ + { + "condition_processing": { + "condition": "WHERE", + "original_condition": "(`t1`.`b` >= 2)", + "steps": [ + { + "transformation": "equality_propagation", + "resulting_condition": "(`t1`.`b` >= 2)" + }, + { + "transformation": "constant_propagation", + "resulting_condition": "(`t1`.`b` >= 2)" + }, + { + "transformation": "trivial_condition_removal", + "resulting_condition": "(`t1`.`b` >= 2)" + } + ] + } + }, + { + "substitute_generated_columns": { + } + }, + { + "table_dependencies": [ + { + "table": "`t1`", + "row_may_be_null": false, + "map_bit": 0, + "depends_on_map_bits": [ + ] + } + ] + }, + { + "ref_optimizer_key_uses": [ + ] + }, + { + "rows_estimation": [ + { + "table": "`t1`", + "range_analysis": { + "table_scan": { + "rows": 48, + "cost": 7.15 + }, + "potential_range_indexes": [ + { + "index": "PRIMARY", + "usable": false, + "cause": "not_applicable" + }, + { + "index": "b", + "usable": true, + "key_parts": [ + "b", + "a" + ] + } + ], + "setup_range_conditions": [ + ], + "group_index_range": { + "chosen": false, + "cause": "not_group_by_or_distinct" + }, + "skip_scan_range": { + "potential_skip_scan_indexes": [ + { + "index": "b", + "usable": false, + "cause": "query_references_nonkey_column" + } + ] + }, + 
"analyzing_range_alternatives": { + "range_scan_alternatives": [ + { + "index": "b", + "ranges": [ + "2 <= b" + ], + "index_dives_for_eq_ranges": true, + "rowid_ordered": false, + "using_mrr": false, + "index_only": false, + "rows": 16, + "cost": 5.86, + "chosen": true + } + ], + "analyzing_roworder_intersect": { + "usable": false, + "cause": "too_few_roworder_scans" + } + }, + "chosen_range_access_summary": { + "range_access_plan": { + "type": "range_scan", + "index": "b", + "rows": 16, + "ranges": [ + "2 <= b" + ] + }, + "rows_for_plan": 16, + "cost_for_plan": 5.86, + "chosen": true + } + } + } + ] + }, + { + "considered_execution_plans": [ + { + "plan_prefix": [ + ], + "table": "`t1`", + "best_access_path": { + "considered_access_paths": [ + { + "rows_to_scan": 16, + "filtering_effect": [ + ], + "final_filtering_effect": 1, + "access_type": "range", + "range_details": { + "used_index": "b" + }, + "resulting_rows": 16, + "cost": 7.46, + "chosen": true + } + ] + }, + "condition_filtering_pct": 100, + "rows_for_plan": 16, + "cost_for_plan": 7.46, + "chosen": true + } + ] + }, + { + "attaching_conditions_to_tables": { + "original_condition": "(`t1`.`b` >= 2)", + "attached_conditions_computation": [ + ], + "attached_conditions_summary": [ + { + "table": "`t1`", + "attached": "(`t1`.`b` >= 2)" + } + ] + } + }, + { + "finalizing_table_conditions": [ + { + "table": "`t1`", + "original_table_condition": "(`t1`.`b` >= 2)", + "final_table_condition ": "(`t1`.`b` >= 2)" + } + ] + }, + { + "refine_plan": [ + { + "table": "`t1`", + "pushed_index_condition": "(`t1`.`b` >= 2)", + "table_condition_attached": null + } + ] + } + ] + } + }, + { + "join_explain": { + "select#": 1, + "steps": [ + ] + } + } + ] +} +SET DEBUG_SYNC = "innodb_inplace_alter_table_enter signal ddl_in_progress wait_for finish_ddl execute 1"; +alter table t1 drop column x;; +SET DEBUG_SYNC = "now wait_for ddl_in_progress"; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK +select 
n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +n_rows clustered_index_size +48 1 +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +index_name stat_value sample_size +PRIMARY 48 1 +b 2 1 +# With zero stat_n_rows, it is table scan (2.45) even if the real table might be very big. +# With correct stat_n_rows, it is range access. +explain select * from t1 where b >= 2; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range b b 5 NULL 16 100.00 Using index condition +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`a` AS `a`,`test`.`t1`.`x` AS `x`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`b` >= 2) +SELECT TRACE FROM information_schema.OPTIMIZER_TRACE; +TRACE +{ + "steps": [ + { + "join_preparation": { + "select#": 1, + "steps": [ + { + "expanded_query": "/* select#1 */ select `t1`.`a` AS `a`,`t1`.`x` AS `x`,`t1`.`b` AS `b`,`t1`.`c` AS `c` from `t1` where (`t1`.`b` >= 2)" + } + ] + } + }, + { + "join_optimization": { + "select#": 1, + "steps": [ + { + "condition_processing": { + "condition": "WHERE", + "original_condition": "(`t1`.`b` >= 2)", + "steps": [ + { + "transformation": "equality_propagation", + "resulting_condition": "(`t1`.`b` >= 2)" + }, + { + "transformation": "constant_propagation", + "resulting_condition": "(`t1`.`b` >= 2)" + }, + { + "transformation": "trivial_condition_removal", + "resulting_condition": "(`t1`.`b` >= 2)" + } + ] + } + }, + { + "substitute_generated_columns": { + } + }, + { + "table_dependencies": [ + { + "table": "`t1`", + "row_may_be_null": false, + "map_bit": 0, + "depends_on_map_bits": [ + ] + } + ] + }, + { + "ref_optimizer_key_uses": [ + ] + }, + { + "rows_estimation": [ + { + "table": "`t1`", + "range_analysis": { + "table_scan": { + "rows": 48, + "cost": 7.15 + }, + "potential_range_indexes": [ + { + "index": 
"PRIMARY", + "usable": false, + "cause": "not_applicable" + }, + { + "index": "b", + "usable": true, + "key_parts": [ + "b", + "a" + ] + } + ], + "setup_range_conditions": [ + ], + "group_index_range": { + "chosen": false, + "cause": "not_group_by_or_distinct" + }, + "skip_scan_range": { + "potential_skip_scan_indexes": [ + { + "index": "b", + "usable": false, + "cause": "query_references_nonkey_column" + } + ] + }, + "analyzing_range_alternatives": { + "range_scan_alternatives": [ + { + "index": "b", + "ranges": [ + "2 <= b" + ], + "index_dives_for_eq_ranges": true, + "rowid_ordered": false, + "using_mrr": false, + "index_only": false, + "rows": 16, + "cost": 5.86, + "chosen": true + } + ], + "analyzing_roworder_intersect": { + "usable": false, + "cause": "too_few_roworder_scans" + } + }, + "chosen_range_access_summary": { + "range_access_plan": { + "type": "range_scan", + "index": "b", + "rows": 16, + "ranges": [ + "2 <= b" + ] + }, + "rows_for_plan": 16, + "cost_for_plan": 5.86, + "chosen": true + } + } + } + ] + }, + { + "considered_execution_plans": [ + { + "plan_prefix": [ + ], + "table": "`t1`", + "best_access_path": { + "considered_access_paths": [ + { + "rows_to_scan": 16, + "filtering_effect": [ + ], + "final_filtering_effect": 1, + "access_type": "range", + "range_details": { + "used_index": "b" + }, + "resulting_rows": 16, + "cost": 7.46, + "chosen": true + } + ] + }, + "condition_filtering_pct": 100, + "rows_for_plan": 16, + "cost_for_plan": 7.46, + "chosen": true + } + ] + }, + { + "attaching_conditions_to_tables": { + "original_condition": "(`t1`.`b` >= 2)", + "attached_conditions_computation": [ + ], + "attached_conditions_summary": [ + { + "table": "`t1`", + "attached": "(`t1`.`b` >= 2)" + } + ] + } + }, + { + "finalizing_table_conditions": [ + { + "table": "`t1`", + "original_table_condition": "(`t1`.`b` >= 2)", + "final_table_condition ": "(`t1`.`b` >= 2)" + } + ] + }, + { + "refine_plan": [ + { + "table": "`t1`", + "pushed_index_condition": 
"(`t1`.`b` >= 2)", + "table_condition_attached": null + } + ] + } + ] + } + }, + { + "join_explain": { + "select#": 1, + "steps": [ + ] + } + } + ] +} +SET DEBUG_SYNC = "now signal finish_ddl"; +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +n_rows clustered_index_size +48 1 +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +index_name stat_value sample_size +PRIMARY 48 1 +b 2 1 +# With stat_n_rows corrected by the completion of the online alter, it is range access again. +explain select * from t1 where b >= 2; +id select_type table partitions type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 NULL range b b 5 NULL 16 100.00 Using index condition +Warnings: +Note 1003 /* select#1 */ select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b`,`test`.`t1`.`c` AS `c` from `test`.`t1` where (`test`.`t1`.`b` >= 2) +drop table t1; +SET DEBUG_SYNC = 'RESET'; diff --git a/mysql-test/t/zero-n_diff-during-analyze.test b/mysql-test/t/zero-n_diff-during-analyze.test new file mode 100644 index 0000000..800ffd5 --- /dev/null +++ b/mysql-test/t/zero-n_diff-during-analyze.test @@ -0,0 +1,221 @@ +# +# BUG# dict_stats_analyze_index() may always discard optimal index-based access paths +# + +--source include/have_debug.inc +--source include/have_debug_sync.inc + +--echo # +--echo # Verify error return: undefined tree size. 
+--echo # + +create table t1 (a int primary key auto_increment, b int); +SET SESSION DEBUG='+d,dict_stats_skip_stat_modified_counter'; +insert t1 values (1,1); +analyze table t1; + +--echo # old stats +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; + +insert t1 values (2,2); + +SET SESSION DEBUG='+d,dict_stats_simulate_undefined_size'; +analyze table t1; + +--echo # Expect old stats because of simulated strong failure +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; + +SET SESSION DEBUG='-d,dict_stats_simulate_undefined_size'; +SET SESSION DEBUG='-d,dict_stats_skip_stat_modified_counter'; +drop table t1; + +--echo # +--echo # Verify error return: tree change. +--echo # + +create table t1 (a int); + +SET SESSION DEBUG='+d,dict_stats_skip_stat_modified_counter'; +insert t1 values (1); +analyze table t1; +--echo # Prepare until 16384 rows to get a tree with more than one level ... 
+--disable_query_log +let $i=14; +while ($i) +{ + --eval insert t1 select * from t1; + dec $i; +} +--enable_query_log + +--echo # old stats +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; + +SET SESSION DEBUG='+d,dict_stats_simulate_tree_change'; + +analyze table t1; + +--echo # Expect new stats because of the simulated failure is considered temporary +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; + +SET SESSION DEBUG='-d,dict_stats_simulate_tree_change'; +SET SESSION DEBUG='-d,dict_stats_skip_stat_modified_counter'; +drop table t1; + +--echo # +--echo # Zero n_diff during analyze may disable REF access paths. +--echo # + +connect (other,localhost,root,,); + +create table t1 (a int primary key auto_increment, b int); +create table t2 (a int primary key auto_increment, b int, c int, key(b)); +--echo # Prepare data ... +--disable_query_log +insert t1 values (1,1), (2,2); +let $i=16; +while ($i) +{ + --eval insert t2 values ($i,$i,$i) + dec $i; +} +--enable_query_log +analyze table t1; +analyze table t2; + +--echo # Simulate HA_STATUS_CONST in fetch_number_of_rows(), because +--echo # ha_innobase::open() is only invoked on table cache miss. +SET SESSION DEBUG='+d,fetch_number_of_rows_info_const'; + +--echo # With correct n_diff, ref access is used for t2. + +# Favor index statistics over index dive. +SET eq_range_index_dive_limit = 1; +EXPLAIN select * from t2 where b = 1; +SET eq_range_index_dive_limit = DEFAULT; + +EXPLAIN select * from t1 join t2 using (b); + +connection other; +--echo # Let ANALYZE pause right after dict_stats_empty_index() on the first secondary index. 
+SET DEBUG_SYNC='dict_stats_analyze_index_empty_sk wait_for continue_analyze execute 1'; +--send analyze table t2; + +connection default; + +--echo # With temporary zero n_diff, table scan is used for t2. +--echo # With correct n_diff, ref access is used for t2. + +# Favor index statistics over index dive. +SET eq_range_index_dive_limit = 1; +EXPLAIN select * from t2 where b = 1; +SET eq_range_index_dive_limit = DEFAULT; + +EXPLAIN select * from t1 join t2 using (b); + +SET DEBUG_SYNC = "now signal continue_analyze"; + +connection other; +--reap + +connection default; +disconnect other; + +SET SESSION DEBUG='-d,fetch_number_of_rows_info_const'; + +drop table t1, t2; +SET DEBUG_SYNC = 'RESET'; + +--echo # +--echo # BUG#98132 Zero dict_table_t::stat_n_rows during Online ALTER +--echo # may lead to table scan thus bad performance. + +# +# Note: +# +# This test requires non-blocking ANALYZE. Otherwise, the ANALYZE command +# used to simulate async invocation of dict_stats_update() will not finish +# until TABLE_SHARE is released by the DDL after debug sync timeout. +# +# Use the non-blocking fix in MySQL 8.0.24: +# +# BUG#32224917: ANALYZE TABLE TAKES TABLE LOCK DURING INDEX +# STATS UPDATE, CAUSES QUERY PILEUP +# +# Or the fix from Percona: +# +# https://www.percona.com/blog/2018/03/27/analyze-table-is-no-longer-a-blocking-operation/ +# +# Otherwise, temporarily disable the following invocation in mysql_admin_table(), with +# +# SET DEBUG = '+d,skip_flush_for_analyze'; +# +# bool skip_flush = false; +# DBUG_EXECUTE_IF("skip_flush_for_analyze", { skip_flush = true; }); +# +# ... 
+# +# } else if ((!skip_flush && open_for_modify) || fatal_error) { +# tdc_remove_table(thd, TDC_RT_REMOVE_UNUSED, table->db, +# table->table_name, false); +# } else { +# + +connect (other,localhost,root,,); + +connection default; +SET optimizer_trace_max_mem_size=1048576; # 1MB +SET optimizer_trace="enabled=on,one_line=off"; +create table t1 (a int primary key auto_increment, x int, b int, c char(64), key(b)); +insert t1 values (1,1,1,'x'),(2,2,2,'x'); +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 1; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +insert t1(x,b,c) select x,b,c from t1 where b = 2; +analyze table t1; +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +--echo # With correct stat_n_rows, it is range access. +explain select * from t1 where b >= 2; +SELECT TRACE FROM information_schema.OPTIMIZER_TRACE; + +connection other; +SET DEBUG_SYNC = "innodb_inplace_alter_table_enter signal ddl_in_progress wait_for finish_ddl execute 1"; +--send alter table t1 drop column x; + +connection default; +SET DEBUG_SYNC = "now wait_for ddl_in_progress"; +# Use non-blocking ANALYZE to simulate async invocation of dict_stats_update() in the background stats thread. +# Otherwise, the next query waits the completion of the online alter which corrects the stats. 
+# SET DEBUG = '+d,skip_flush_for_analyze'; +analyze table t1; +# SET DEBUG = '-d,skip_flush_for_analyze'; +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +--echo # With zero stat_n_rows, it is table scan (2.45) even if the real table might be very big. +--echo # With correct stat_n_rows, it is range access. +explain select * from t1 where b >= 2; +SELECT TRACE FROM information_schema.OPTIMIZER_TRACE; +SET DEBUG_SYNC = "now signal finish_ddl"; + +connection other; +--reap + +connection default; +select n_rows,clustered_index_size from mysql.innodb_table_stats where table_name = 't1'; +select index_name,stat_value,sample_size from mysql.innodb_index_stats where table_name = 't1' and stat_name = 'n_diff_pfx01'; +--echo # With stat_n_rows corrected by the completion of the online alter, it is range access again. +explain select * from t1 where b >= 2; + +drop table t1; +disconnect other; +SET DEBUG_SYNC = 'RESET'; diff --git a/sql/table.cc b/sql/table.cc index f579ba0..eb5734e 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -6564,8 +6564,12 @@ int TABLE_LIST::fetch_number_of_rows() { ->estimated_rowcount, // Recursive reference is never a const table (ha_rows)PLACEHOLDER_TABLE_ROW_ESTIMATE); - } else - error = table->file->info(HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK); + } else { + uint flag = HA_STATUS_VARIABLE | HA_STATUS_NO_LOCK; + DBUG_EXECUTE_IF("fetch_number_of_rows_info_const", + { flag |= HA_STATUS_CONST; }); + error = table->file->info(flag); + } return error; } diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 970c098..f19fa68e5 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -616,6 +616,94 @@ static void dict_stats_snapshot_free( dict_stats_table_clone_free(t); } +/** This function creates a scratch 
dict_index_t object and initializes the + following index members: + dict_index_t::type (copied) + dict_index_t::cached (copied) + dict_index_t::n_uniq (copied) + dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized) + dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized) + dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized) + dict_index_t::magic_n + + The scratch index allows the following operations: + dict_stats_create_scratch_index() + dict_stats_empty_index() + dict_stats_index_set_n_diff() + dict_stats_copy_index() + dict_stats_free_scratch_index() + */ +static dict_index_t *dict_stats_create_scratch_index( + const dict_index_t *index) /*!< in: template index */ +{ + size_t heap_size; + + ulint n_uniq = dict_index_get_n_unique(index); + + heap_size = 0; + heap_size += sizeof(dict_index_t); + heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]); + heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]); + heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]); + + mem_heap_t *heap; + + heap = mem_heap_create(heap_size); + + dict_index_t *idx; + idx = (dict_index_t *)mem_heap_alloc(heap, sizeof(*idx)); + idx->heap = heap; + idx->type = index->type; + idx->cached = index->cached; + idx->n_uniq = n_uniq; + + idx->stat_n_diff_key_vals = (ib_uint64_t *)mem_heap_alloc( + heap, idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0])); + + idx->stat_n_sample_sizes = (ib_uint64_t *)mem_heap_alloc( + heap, idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0])); + + idx->stat_n_non_null_key_vals = (ib_uint64_t *)mem_heap_alloc( + heap, idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); + ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + + return idx; +} + +/** Free the resources occupied by an object returned by + dict_stats_create_scratch_index(). 
*/ +static void dict_stats_free_scratch_index( + dict_index_t *idx) /*!< in: scratchpad index object to free */ +{ + ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + mem_heap_free(idx->heap); +} + +/** Copy index statistics from one to another. */ +static void dict_stats_copy_index( + dict_index_t *dst_idx, /*!< in/out: destination index */ + const dict_index_t *src_idx) /*!< in: source index */ +{ + ulint n_copy_el; + + n_copy_el = dst_idx->n_uniq; + ut_a(dst_idx->n_uniq == src_idx->n_uniq); + + memmove(dst_idx->stat_n_diff_key_vals, src_idx->stat_n_diff_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0])); + + memmove(dst_idx->stat_n_sample_sizes, src_idx->stat_n_sample_sizes, + n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0])); + + memmove(dst_idx->stat_n_non_null_key_vals, + src_idx->stat_n_non_null_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0])); + + dst_idx->stat_index_size = src_idx->stat_index_size; + + dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; +} + /** Calculates new estimates for index statistics. This function is relatively quick and is used to calculate transient statistics that are not saved on disk. This was the only way to calculate statistics @@ -1664,11 +1752,32 @@ static inline void dict_stats_index_set_n_diff(const n_diff_data_t *n_diff_data, /** Calculates new statistics for a given index and saves them to the index members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and stat_n_leaf_pages, based on the specified n_sample_pages. + + A private scratch is used during analyze. Statistics for the given index will + be overwritten by the scratch at the end of this function. This is not an + atomic operation, but a tradeoff to prevent leaking empty state, because + latching is heavy and avoided in info_low(). + + Overwriting only takes place on a successful analyze; otherwise, the function + leaves statistics in old state and may request immediate retry. 
The old state + is guaranteed to be initialized by a preceding dict_stats_init(). + + Allowing only successful updates also prevents abnormal stats when the tree + has changed beyond recognition or BUG#98132 Analyze table leads to empty + statistics during online rebuild DDL. + + @param[in,out] n_sample_pages number of leaf pages to sample, and suggested next value to retry if aborted. +@param[in,out] n_tickets number of tickets to retry without delay +@param[out] no_delay retry without delay @param[in,out] index index to analyze. @return false if aborted */ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, + ib_uint64_t &n_tickets, + bool &no_delay, +#ifdef UNIV_DEBUG + bool &simulate_tree_change, +#endif /* UNIV_DEBUG */ dict_index_t *index) { ulint root_level; ulint level; @@ -1682,6 +1791,7 @@ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, bool succeeded = true; mtr_t mtr; ulint size; + dict_index_t *scratch; DBUG_TRACE; DBUG_PRINT("info", ("index: %s, online status: %d", index->name(), @@ -1694,7 +1804,15 @@ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name()); - dict_stats_empty_index(index); + /* Use a private scratch during analyze. 
*/ + scratch = dict_stats_create_scratch_index(index); + dict_stats_empty_index(scratch); + +#ifdef UNIV_DEBUG + if (!(index->type & DICT_CLUSTERED)) { + DEBUG_SYNC_C("dict_stats_analyze_index_empty_sk"); + } +#endif /* UNIV_DEBUG */ mtr_start(&mtr); @@ -1702,8 +1820,11 @@ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr); + DBUG_EXECUTE_IF("dict_stats_simulate_undefined_size", + {size = ULINT_UNDEFINED;}); + if (size != ULINT_UNDEFINED) { - index->stat_index_size = size; + scratch->stat_index_size = size; size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr); } @@ -1712,14 +1833,19 @@ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, switch (size) { case ULINT_UNDEFINED: + dict_stats_free_scratch_index(scratch); dict_stats_assert_initialized_index(index); + /* Consider ULINT_UNDEFINED as a strong failure, assuming it + to be handled by the outer workflow. For example, Online ALTER + rebuilds stats at the end of + ha_innobase::commit_inplace_alter_table_impl(). */ return true; case 0: /* The root node of the tree is a leaf */ size = 1; } - index->stat_n_leaf_pages = size; + scratch->stat_n_leaf_pages = size; mtr_start(&mtr); @@ -1744,8 +1870,17 @@ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, full scan without the SX_LOCK to a faster scan under SX_LOCK over 1e6+ pages. 
*/ - if (root_level == 0 || n_sample_pages * n_uniq > - std::min(index->stat_n_leaf_pages, 1e6)) { + DBUG_EXECUTE_IF("dict_stats_simulate_tree_change", { + ut_a(root_level > 0); + }); + + if (root_level == 0 || + ( +#ifdef UNIV_DEBUG + !simulate_tree_change && +#endif /* UNIV_DEBUG */ + n_sample_pages * n_uniq > + std::min(scratch->stat_n_leaf_pages, 1e6))) { if (root_level == 0) { DEBUG_PRINTF( " %s(): just one page," @@ -1762,16 +1897,18 @@ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, into the index */ (void)dict_stats_analyze_index_level( - index, 0 /* leaf level */, index->stat_n_diff_key_vals, &total_recs, + index, 0 /* leaf level */, scratch->stat_n_diff_key_vals, &total_recs, &total_pages, nullptr /* boundaries not needed */, wait_start_time, &mtr); for (ulint i = 0; i < n_uniq; i++) { - index->stat_n_sample_sizes[i] = total_pages; + scratch->stat_n_sample_sizes[i] = total_pages; } mtr_commit(&mtr); + dict_stats_copy_index(index, scratch); + dict_stats_free_scratch_index(scratch); dict_stats_assert_initialized_index(index); return true; } @@ -1819,7 +1956,14 @@ static bool dict_stats_analyze_index_low(ib_uint64_t &n_sample_pages, mtr_start(&mtr); mtr_sx_lock(dict_index_get_lock(index), &mtr); wait_start_time = ut_time_monotonic(); - if (root_level != btr_height_get(index, &mtr)) { + if ( +#ifdef UNIV_DEBUG + simulate_tree_change || +#endif /* UNIV_DEBUG */ + root_level != btr_height_get(index, &mtr)) { +#ifdef UNIV_DEBUG + simulate_tree_change = false; +#endif /* UNIV_DEBUG */ /* Just quit if the tree has changed beyond recognition here. The old stats from previous runs will remain in the values that we have @@ -1970,15 +2114,22 @@ end: /* n_prefix == 0 means that the above loop did not end up prematurely due to tree being changed and so n_diff_data[] is set up. 
*/ if (succeeded && n_prefix == 0) { - dict_stats_index_set_n_diff(n_diff_data, index); + dict_stats_index_set_n_diff(n_diff_data, scratch); + dict_stats_copy_index(index, scratch); + } else if (succeeded) { + /* Change of tree height is atomic and rare. An immediate retry + should succeed. So consider it as a temporary failure. */ + if (n_tickets > 0) { + n_tickets--; + no_delay = true; + succeeded = false; + } } UT_DELETE_ARRAY(n_diff_data); - if (succeeded) { - dict_stats_assert_initialized_index(index); - } - + dict_stats_free_scratch_index(scratch); + dict_stats_assert_initialized_index(index); return succeeded; } @@ -1989,8 +2140,28 @@ static void dict_stats_analyze_index( dict_index_t *index) /*!< in/out: index to analyze */ { ib_uint64_t n_sample_pages = N_SAMPLE_PAGES(index); + ib_uint64_t n_no_delay_tickets = 1; + bool no_delay = false; +#ifdef UNIV_DEBUG + bool simulate_tree_change = false; + + DBUG_EXECUTE_IF("dict_stats_simulate_tree_change", { + simulate_tree_change = true; + }); +#endif /* UNIV_DEBUG */ + while (n_sample_pages > 0 && - !dict_stats_analyze_index_low(n_sample_pages, index)) { + !dict_stats_analyze_index_low( + n_sample_pages, n_no_delay_tickets, no_delay, +#ifdef UNIV_DEBUG + simulate_tree_change, +#endif /* UNIV_DEBUG */ + index)) { + if (no_delay) { + no_delay = false; + continue; + } + /* aborted. retrying. */ ib::warn(ER_IB_MSG_STATS_SAMPLING_TOO_LARGE) << "Detected too long lock waiting around " << index->table->name << "." 
@@ -2050,9 +2221,8 @@ static dberr_t dict_stats_update_persistent( continue; } - dict_stats_empty_index(index); - if (dict_stats_should_ignore_index(index)) { + dict_stats_empty_index(index); continue; } diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index f601a07..4bbd74b 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -1099,6 +1099,12 @@ static inline void row_update_statistics_if_needed( ib_uint64_t counter; ib_uint64_t n_rows; +#ifdef UNIV_DEBUG + if (!table->is_system_table) { + DBUG_EXECUTE_IF("dict_stats_skip_stat_modified_counter", {return;};); + } +#endif /* UNIV_DEBUG */ + if (!table->stat_initialized) { DBUG_EXECUTE_IF("test_upd_stats_if_needed_not_inited", fprintf(stderr, -- 1.8.3.1