diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index 9c02e41ef06..7ba7a242dc6 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -495,6 +495,7 @@ SET(SQL_SHARED_SOURCES range_optimizer/partition_pruning.cc range_optimizer/range_analysis.cc range_optimizer/range_optimizer.cc + range_optimizer/range_estimator.cc range_optimizer/reverse_index_range_scan.cc range_optimizer/rowid_ordered_retrieval.cc range_optimizer/rowid_ordered_retrieval_plan.cc diff --git a/sql/handler.cc b/sql/handler.cc index c830aff5306..b665dba985e 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -6394,49 +6394,9 @@ ha_rows handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, } } - /* - Get the number of rows in the range. This is done by calling - records_in_range() unless: - - 1) The index is unique. - There cannot be more than one matching row, so 1 is - assumed. Note that it is possible that the correct number - is actually 0, so the row estimate may be too high in this - case. Also note: ranges of the form "x IS NULL" may have more - than 1 matching row so records_in_range() is called for these. - 2) SKIP_RECORDS_IN_RANGE is set. - There are supportive statistics and the user requested to use - statistics instead of records_in_range(); or the user is not - interested in cost calculation at all. - */ - int keyparts_used = 0; - if ((range.range_flag & UNIQUE_RANGE) && // 1) - !(range.range_flag & NULL_RANGE)) - rows = 1; /* there can be at most one row */ - else if (range.range_flag & SKIP_RECORDS_IN_RANGE) { // 2) - if ((range.range_flag & EQ_RANGE) && - !(range.range_flag & NULL_RANGE) && - (keyparts_used = std::popcount(range.start_key.keypart_map))) { - DBUG_EXECUTE_IF("crash_index_statistics_null_part", { - // It is an equality range, both ends are present, inclusive and same. - const uchar *p = range.start_key.key; - KEY *k = &table->key_info[keyno]; - for (uint kp = 0; kp < actual_key_parts(k); kp++) { - if (k->key_part[kp].field->is_nullable() && *p) { - DBUG_SUICIDE(); - } - p += k->key_part[kp].store_length; - } - }); - rows = static_cast( - table->key_info[keyno].records_per_key(keyparts_used - 1)); - } else { - /* - Since records_in_range has not been called, set the rows to 1. - FORCE INDEX has been used, cost model values will be ignored anyway. - */ - rows = 1; - } + // Get the number of rows in the range by statistics or by index dive. + if (table->range_estimator.use_statistics_only()) { + rows = table->range_estimator.records_in_range(range); } else { DBUG_EXECUTE_IF("crash_records_in_range", DBUG_SUICIDE();); assert(min_endp || max_endp); diff --git a/sql/range_optimizer/index_range_scan_plan.cc b/sql/range_optimizer/index_range_scan_plan.cc index d7ce9590522..956ec7d9fcd 100644 --- a/sql/range_optimizer/index_range_scan_plan.cc +++ b/sql/range_optimizer/index_range_scan_plan.cc @@ -219,6 +219,7 @@ static range_seq_t sel_arg_range_seq_init(void *init_param, uint, uint) { Sel_arg_range_sequence *seq = static_cast(init_param); seq->reset(); + seq->param->table->range_estimator.use_index(seq->keyno); return init_param; } @@ -578,27 +579,11 @@ static uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) { /* Use statistics instead of index dives for estimates of rows in - this range if: - - 1) The user has requested it. - 2) Index statistics is supportive for the range, which means - a) It is an equality range. - b) It contains no null part. - 3) Index statistics is available. - - Ranges of the form "x IS NULL" will not use index statistics - because the number of rows with this value are likely to be - very different than the values in the index statistics. + this range, if it is requested, supportive and available. */ - int keyparts_used = 0; - if (param->use_index_statistics && // 1) - (range->range_flag & EQ_RANGE) && // 2a) - !(range->range_flag & NULL_RANGE) && // 2b) - (keyparts_used = std::popcount(range->start_key.keypart_map)) && // 3) - param->table->key_info[seq->keyno].has_records_per_key(keyparts_used - 1)) - range->range_flag |= SKIP_RECORDS_IN_RANGE; - - if (seq->skip_records_in_range) range->range_flag |= SKIP_RECORDS_IN_RANGE; + Range_settings settings; + init_range_settings(settings, *param, seq->skip_records_in_range); + param->table->range_estimator.decide_estimation_method(settings, *range); return 0; } diff --git a/sql/range_optimizer/range_estimator.cc b/sql/range_optimizer/range_estimator.cc new file mode 100644 index 00000000000..23dc3b21d65 --- /dev/null +++ b/sql/range_optimizer/range_estimator.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2000, 2024, Oracle and/or its affiliates. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2.0, + as published by the Free Software Foundation. + + This program is designed to work with certain software (including + but not limited to OpenSSL) that is licensed under separate terms, + as designated in a particular file or component or in included license + documentation. The authors of MySQL hereby grant you an additional + permission to link the program and your derivative works with the + separately licensed software that they have either included with + the program or referenced in the documentation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License, version 2.0, for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include "my_base.h" +#include "my_dbug.h" +#include "sql/handler.h" +#include "sql/range_optimizer/range_opt_param.h" +#include "sql/sql_select.h" // actual_key_parts +#include "sql/table.h" + +#include "sql/range_optimizer/range_estimator.h" + +void init_range_settings(Range_settings &settings, const RANGE_OPT_PARAM ¶m, + bool force_skip) { + settings.skip = force_skip; + settings.eq_range = param.use_index_statistics; + settings.strict_range = false; + settings.use_index_statistics = param.use_index_statistics; +} + +void Range_estimator::init(TABLE *tbl) { + table = tbl; +} + +void Range_estimator::use_index(uint kr) { + keyno = kr; +} + +void Range_estimator::decide_estimation_method(const Range_settings &settings, + KEY_MULTI_RANGE &range) { + selected_estimator = check_statistics(settings, range); + + if (selected_estimator != NUM_ESTIMATORS) { + range.range_flag |= SKIP_RECORDS_IN_RANGE; + } +} + +enum enum_range_estimator +Range_estimator::check_statistics(const Range_settings &settings, + KEY_MULTI_RANGE &range) { + enum enum_range_estimator estimator = NUM_ESTIMATORS; + if ((range.range_flag & (UNIQUE_RANGE | NULL_RANGE)) == UNIQUE_RANGE) { + /* + A real unique range matching at most one row. + Note that it is possible that the correct number is actually 0, so the + row estimate may be too high in this case. Also note: ranges of the form + "x IS NULL" may have more than 1 matching row. + */ + estimator = ONE_ROW_ESTIMATOR; + } else if (settings.eq_range && settings.use_index_statistics && + check_index_statistics(range)) { + /* + Ranges of the form "x IS NULL" will not use index statistics + because the number of rows with this value are likely to be + very different than the values in the index statistics. + */ + DBUG_EXECUTE_IF("crash_index_statistics_null_part", { + // It is an equality range, both ends are present, inclusive and same. + const uchar *p = range.start_key.key; + KEY *k = &table->key_info[keyno]; + for (uint kp = 0; kp < actual_key_parts(k); kp++) { + if (k->key_part[kp].field->is_nullable() && *p) { + DBUG_SUICIDE(); + } + p += k->key_part[kp].store_length; + } + }); + estimator = EQ_RANGE_ESTIMATOR; + } else if (settings.skip) { + // FORCE INDEX has been used, cost model values will be ignored anyway. + estimator = ONE_ROW_ESTIMATOR; + } + + return estimator; +} + +ha_rows Range_estimator::records_in_range(KEY_MULTI_RANGE &range) { + assert(range.range_flag & SKIP_RECORDS_IN_RANGE); + + switch (selected_estimator) { + case EQ_RANGE_ESTIMATOR: + return estimate_by_index_statistics(range); + case ONE_ROW_ESTIMATOR: + return 1; + default: + assert(0); + return HA_POS_ERROR; + } +} + +bool Range_estimator::check_index_statistics(KEY_MULTI_RANGE &range) { + return + (((range.range_flag & (EQ_RANGE | NULL_RANGE)) == EQ_RANGE) && + (keyparts_used = std::popcount(range.start_key.keypart_map)) && + (table->key_info[keyno].has_records_per_key(keyparts_used - 1))); +} + +ha_rows Range_estimator::estimate_by_index_statistics(KEY_MULTI_RANGE &) { + return static_cast( + table->key_info[keyno].records_per_key(keyparts_used - 1)); +} diff --git a/sql/range_optimizer/range_estimator.h b/sql/range_optimizer/range_estimator.h new file mode 100644 index 00000000000..dbdb820f9c6 --- /dev/null +++ b/sql/range_optimizer/range_estimator.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2000, 2024, Oracle and/or its affiliates. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2.0, + as published by the Free Software Foundation. + + This program is designed to work with certain software (including + but not limited to OpenSSL) that is licensed under separate terms, + as designated in a particular file or component or in included license + documentation. The authors of MySQL hereby grant you an additional + permission to link the program and your derivative works with the + separately licensed software that they have either included with + the program or referenced in the documentation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License, version 2.0, for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#ifndef SQL_RANGE_OPTIMIZER_RANGE_ESTIMATOR_H_ +#define SQL_RANGE_OPTIMIZER_RANGE_ESTIMATOR_H_ + +#include "my_base.h" + +class RANGE_OPT_PARAM; +class TABLE; + +/// Range estimator types. +enum enum_range_estimator { + /** + An estimator that always estimates to 1 row. It can be used when there is + such knowledge, or when cost calculation is useless. + */ + ONE_ROW_ESTIMATOR, + /** + An estimator for equality non-null range. + */ + EQ_RANGE_ESTIMATOR, + NUM_ESTIMATORS +}; + +/// Settings that control the seletion of estimators. +struct Range_settings { + /// True: always uses an estimator. ONE_ROW_ESTIMATOR will be the catch. + bool skip; + + /// True: use index statistics for equality ranges. + bool eq_range; + /// True: use index statistics for non-equality ranges. + bool strict_range; + + /// True: use index statistics for range estimation. + bool use_index_statistics; +}; + +/** + Provide statistics-only estimation for key ranges. + + It is supposed to be used as an alternative to handler::records_in_range(). + */ +class Range_estimator { + public: + void init(TABLE *_table); + + /// Set the index to evaluate key ranges. + void use_index(uint keyno); + + /** + Try to select a statistics-only estimator for the range. + + @param settings Settings that control the seletion of estimators. + @param range The range. + */ + void decide_estimation_method(const Range_settings &settings, + KEY_MULTI_RANGE &range); + + /** + Check if any statistics-only estimator is selected. + + @param range The range. + + @retval True An estimator is selected. + @retval False No available estimator. + */ + bool use_statistics_only() const { + return selected_estimator != NUM_ESTIMATORS; + } + + /** + Estimate records in a single key range. + + Note that the range is const in essence. It is declared non-const just because of + the lower-level handler interface. + + @param range The range. + */ + ha_rows records_in_range(KEY_MULTI_RANGE &range); + + private: + /** + Check if estimation could be served by statistics. + + To estimate a range, + + 1) The settings allows statistics-only estimation. + 2) The data type has supportive statistics. + 3) Statistics are available. + + @param settings Settings that control the seletion of estimators. + @param range The range. + + @return The selected estimator, or MAX_ESTIMATORS if none. + */ + enum enum_range_estimator + check_statistics(const Range_settings &settings, KEY_MULTI_RANGE &range); + + bool check_index_statistics(KEY_MULTI_RANGE &range); + ha_rows estimate_by_index_statistics(KEY_MULTI_RANGE &range); + + // The TABLE as statistics provider. + const TABLE *table; + + // Backing index of the current range. + uint keyno; + + // Key parts used for the current range. + int keyparts_used; + + // Selected estimator. + enum enum_range_estimator selected_estimator; +}; + +void init_range_settings(Range_settings &settings, const RANGE_OPT_PARAM ¶m, + bool force_skip); + +#endif // SQL_RANGE_OPTIMIZER_RANGE_ESTIMATOR_H_ diff --git a/sql/range_optimizer/rowid_ordered_retrieval_plan.cc b/sql/range_optimizer/rowid_ordered_retrieval_plan.cc index e6dc5609ee4..f610bfe3462 100644 --- a/sql/range_optimizer/rowid_ordered_retrieval_plan.cc +++ b/sql/range_optimizer/rowid_ordered_retrieval_plan.cc @@ -44,6 +44,7 @@ #include "sql/range_optimizer/index_range_scan_plan.h" #include "sql/range_optimizer/internal.h" #include "sql/range_optimizer/path_helpers.h" +#include "sql/range_optimizer/range_estimator.h" #include "sql/range_optimizer/range_opt_param.h" #include "sql/range_optimizer/rowid_ordered_retrieval.h" #include "sql/range_optimizer/tree.h" @@ -370,7 +371,7 @@ ROR_intersect_plan &ROR_intersect_plan::operator=( double ROR_intersect_plan::get_scan_selectivity( const ROR_SCAN_INFO *scan) const { double selectivity_mult = 1.0; - const TABLE *const table = m_param->table; + TABLE *const table = m_param->table; const KEY_PART_INFO *const key_part = table->key_info[scan->keynr].key_part; /** key values tuple, used to store both min_range.key and @@ -385,13 +386,14 @@ double ROR_intersect_plan::get_scan_selectivity( key_part_map keypart_map = 0; bool cur_covered; bool prev_covered = IsBitSet(key_part->fieldnr - 1, m_covered_fields); - key_range min_range; - key_range max_range; - min_range.key = key_val; - min_range.flag = HA_READ_KEY_EXACT; - max_range.key = key_val; - max_range.flag = HA_READ_AFTER_KEY; + KEY_MULTI_RANGE range; + range.start_key.key = key_val; + range.start_key.flag = HA_READ_KEY_EXACT; + range.end_key.key = key_val; + range.end_key.flag = HA_READ_AFTER_KEY; + range.range_flag = EQ_RANGE; ha_rows prev_records = table->file->stats.records; + table->range_estimator.use_index(scan->keynr); DBUG_TRACE; for (SEL_ROOT *sel_root = scan->sel_root; sel_root; @@ -401,52 +403,39 @@ double ROR_intersect_plan::get_scan_selectivity( IsBitSet(key_part[sel_root->root->part].fieldnr - 1, m_covered_fields); if (cur_covered != prev_covered) { /* create (part1val, ..., part{n-1}val) tuple. */ - bool is_null_range = false; + range.range_flag &= ~NULL_RANGE; ha_rows records; if (!tuple_arg) { tuple_arg = scan->sel_root->root; /* Here we use the length of the first key part */ tuple_arg->store_min_value(key_part[0].store_length, &key_ptr, 0); - is_null_range |= tuple_arg->is_null_interval(); + if (tuple_arg->is_null_interval()) range.range_flag |= NULL_RANGE; keypart_map = 1; } while (tuple_arg->next_key_part != sel_root) { tuple_arg = tuple_arg->next_key_part->root; tuple_arg->store_min_value(key_part[tuple_arg->part].store_length, &key_ptr, 0); - is_null_range |= tuple_arg->is_null_interval(); + if (tuple_arg->is_null_interval()) range.range_flag |= NULL_RANGE; keypart_map = (keypart_map << 1) | 1; } - min_range.length = max_range.length = (size_t)(key_ptr - key_val); - min_range.keypart_map = max_range.keypart_map = keypart_map; - - /* - Get the number of rows in this range. This is done by calling - records_in_range() unless all these are true: - 1) The user has requested that index statistics should be used - for equality ranges to avoid the incurred overhead of - index dives in records_in_range() - 2) The range is not on the form "x IS NULL". The reason is - that the number of rows with this value are likely to be - very different than the values in the index statistics - 3) Index statistics is available. - @see key_val - */ - if (!m_param->use_index_statistics || // (1) - is_null_range || // (2) - !table->key_info[scan->keynr].has_records_per_key( - tuple_arg->part)) // (3) - { - DBUG_EXECUTE_IF("crash_records_in_range", DBUG_SUICIDE();); - assert(min_range.length > 0); - assert( - !table->pos_in_table_list->is_derived_unfinished_materialization()); - records = - table->file->records_in_range(scan->keynr, &min_range, &max_range); + range.start_key.length = range.end_key.length = (size_t)(key_ptr - key_val); + range.start_key.keypart_map = range.end_key.keypart_map = keypart_map; + + assert(range.start_key.length > 0); + assert( + !table->pos_in_table_list->is_derived_unfinished_materialization()); + + Range_settings settings; + init_range_settings(settings, *m_param, /*force_skip=*/false); + table->range_estimator.decide_estimation_method(settings, range); + + if (table->range_estimator.use_statistics_only()) { + records = table->range_estimator.records_in_range(range); } else { - // Use index statistics - records = static_cast( - table->key_info[scan->keynr].records_per_key(tuple_arg->part)); + records = table->file->records_in_range(scan->keynr, + &range.start_key, + &range.end_key); } if (cur_covered) { diff --git a/sql/table.cc b/sql/table.cc index 7a6a33c04e9..748fd2c27ff 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -4164,6 +4164,8 @@ void TABLE::init(THD *thd, Table_ref *tl) { pos_in_table_list = tl; tl->table = this; + range_estimator.init(this); + clear_column_bitmaps(); /* Tables may be reused in a sub statement. */ diff --git a/sql/table.h b/sql/table.h index 0d201ce76b4..0f9adf2fe0a 100644 --- a/sql/table.h +++ b/sql/table.h @@ -54,6 +54,7 @@ #include "sql/mem_root_array.h" #include "sql/mysqld_cs.h" #include "sql/opt_costmodel.h" // Cost_model_table +#include "sql/range_optimizer/range_estimator.h" // Range_estimator #include "sql/partition_info.h" #include "sql/record_buffer.h" // Record_buffer #include "sql/sql_bitmap.h" // Bitmap @@ -1961,6 +1962,8 @@ struct TABLE { bool all_partitions_pruned_away{false}; MDL_ticket *mdl_ticket{nullptr}; + Range_estimator range_estimator; + private: /// Cost model object for operations on this table Cost_model_table m_cost_model;