diff --git a/sql/range_optimizer/index_range_scan_plan.cc b/sql/range_optimizer/index_range_scan_plan.cc
index 785093d8..088399c3 100644
--- a/sql/range_optimizer/index_range_scan_plan.cc
+++ b/sql/range_optimizer/index_range_scan_plan.cc
@@ -53,6 +53,11 @@
 #include "sql/table.h"
 #include "sql/thr_malloc.h"
 #include "sql_string.h"
+
+#include <algorithm>  // std::min, std::sort, std::swap
+#include <vector>
+#include "my_rnd.h"                // randominit(), my_rnd()
+#include "sql/system_variables.h"  // enum_index_dive_sample_method
 
 using opt_range::null_element;
 using std::max;
@@ -61,6 +66,8 @@ using std::min;
 static bool is_key_scan_ror(RANGE_OPT_PARAM *param, uint keynr, uint nparts);
 static bool eq_ranges_exceeds_limit(const SEL_ROOT *keypart, uint *count,
                                     uint limit);
+bool qualifies_for_heuristic_sampling(THD *thd, const Query_block *qb,
+                                      const TABLE *table);
 static bool get_ranges_from_tree_given_base(
     THD *thd, MEM_ROOT *return_mem_root, const KEY *table_key, KEY_PART *key,
     SEL_ROOT *key_tree, uchar *const base_min_key, uchar *min_key,
@@ -565,6 +572,158 @@ static uint sel_arg_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range) {
   return 0;
 }
 
+// -----------------------------------------------------------------------
+// Heuristic sampling helpers
+// -----------------------------------------------------------------------
+
+/// Build a key_range pair from a SEL_ARG leaf node, suitable for passing
+/// to handler::records_in_range().
+///
+/// @param arg       Leaf node describing one (min,max) interval.
+/// @param key_part  First key part of the index; only its store_length is
+///                  used.  NOTE(review): assumes the interval is confined to
+///                  a single key part — confirm for multi-part IN lists.
+/// @param min_buf   Caller-owned buffer receiving the min key image.
+/// @param max_buf   Caller-owned buffer receiving the max key image.
+/// @param[out] min_kr  Filled-in lower bound (key == nullptr if unbounded).
+/// @param[out] max_kr  Filled-in upper bound (key == nullptr if unbounded).
+static void setup_key_range_from_sel_arg(const SEL_ARG *arg,
+                                         const KEY_PART *key_part,
+                                         uchar *min_buf, uchar *max_buf,
+                                         key_range *min_kr,
+                                         key_range *max_kr) {
+  const uint store_length = key_part->store_length;
+  // MIN key
+  if (arg->min_flag & NO_MIN_RANGE) {
+    min_kr->key = nullptr;
+    min_kr->length = 0;
+    min_kr->flag = HA_READ_KEY_OR_NEXT;  // flag unused when key == nullptr
+  } else {
+    memcpy(min_buf, arg->min_value, store_length);
+    min_kr->key = min_buf;
+    min_kr->length = store_length;
+    min_kr->flag =
+        (arg->min_flag & NEAR_MIN) ? HA_READ_AFTER_KEY : HA_READ_KEY_EXACT;
+  }
+  // MAX key
+  if (arg->max_flag & NO_MAX_RANGE) {
+    max_kr->key = nullptr;
+    max_kr->length = 0;
+    max_kr->flag = HA_READ_AFTER_KEY;  // flag unused when key == nullptr
+  } else {
+    memcpy(max_buf, arg->max_value, store_length);
+    max_kr->key = max_buf;
+    max_kr->length = store_length;
+    max_kr->flag =
+        (arg->max_flag & NEAR_MAX) ? HA_READ_BEFORE_KEY : HA_READ_AFTER_KEY;
+  }
+}
+
+/// Collect all leaf SEL_ARG nodes of a SEL_ROOT (for an equality range list).
+/// Appends to *leaves; leaves empty trees untouched.
+static void collect_sel_arg_leaves(const SEL_ROOT *sel_root,
+                                   std::vector<const SEL_ARG *> *leaves) {
+  if (sel_root == nullptr) return;
+  // Only KEY_RANGE trees have a proper inorder linked list.
+  // MAYBE_KEY and IMPOSSIBLE nodes do not use the ->next chain.
+  if (sel_root->type != SEL_ROOT::Type::KEY_RANGE) return;
+  if (sel_root->root == null_element) return;
+  // Guard against both the null_element sentinel AND nullptr (defensive).
+  for (const SEL_ARG *arg = sel_root->root->first();
+       arg != nullptr && arg != null_element; arg = arg->next)
+    leaves->push_back(arg);
+}
+
+/// Select up to K random indices from [0, N) using a Fisher-Yates partial
+/// shuffle seeded from @p seed (deterministic for a given seed).
+/// @return a sorted vector of min(K, N) distinct indices.
+static std::vector<uint> random_sample_indices(uint N, uint K, uint64_t seed) {
+  std::vector<uint> indices(N);
+  for (uint i = 0; i < N; ++i) indices[i] = i;
+  struct rand_struct rng;
+  randominit(&rng, static_cast<ulong>(seed & 0xFFFFFFFF),
+             static_cast<ulong>(seed >> 32));
+  const uint take = std::min(K, N);
+  for (uint i = 0; i < take; ++i) {
+    // my_rnd() returns a double in [0, 1), so j is in [i, N).
+    const uint j = i + static_cast<uint>(my_rnd(&rng) * (N - i));
+    std::swap(indices[i], indices[j]);
+  }
+  std::vector<uint> result(indices.begin(), indices.begin() + take);
+  std::sort(result.begin(), result.end());
+  return result;
+}
+
+/// Select min(K, N) uniformly-spaced indices from [0, N):
+/// positions 0, N/K, 2N/K, ... (64-bit intermediate avoids overflow).
+static std::vector<uint> uniform_sample_indices(uint N, uint K) {
+  std::vector<uint> result;
+  const uint take = std::min(K, N);
+  result.reserve(take);
+  for (uint i = 0; i < take; ++i)
+    result.push_back(static_cast<uint>(static_cast<uint64_t>(i) * N / take));
+  return result;
+}
+
+/// Estimate the row count of an IN-list range by index-diving only a sample
+/// of the equality ranges and extrapolating: avg_rows_per_sample * N.
+///
+/// @param thd       Session; supplies sample count/method and query_id seed.
+/// @param param     Range optimizer context; used_heuristic_sampling and
+///                  heuristic_sample_rows are set on success.
+/// @param idx       Index (into param->key / param->real_keynr) being costed.
+/// @param sel_root  Equality range tree for that index.
+/// @param N         Total number of equality ranges (extrapolation factor).
+/// @param trace     Reserved for future optimizer-trace output.
+/// @return estimated rows, or HA_POS_ERROR to fall back to static statistics.
+ha_rows heuristic_sample_records_in_range(THD *thd, RANGE_OPT_PARAM *param,
+                                          uint idx, SEL_ROOT *sel_root, uint N,
+                                          Opt_trace_context *trace) {
+  (void)trace;  // reserved for future optimizer trace output
+  const uint K = thd->variables.eq_range_index_dive_sample_count;
+  if (K == 0) return HA_POS_ERROR;
+
+  std::vector<const SEL_ARG *> leaves;
+  collect_sel_arg_leaves(sel_root, &leaves);
+  if (leaves.empty()) return HA_POS_ERROR;
+
+  const uint total = static_cast<uint>(leaves.size());
+  const auto method = static_cast<enum_index_dive_sample_method>(
+      thd->variables.eq_range_index_dive_sample_method);
+
+  std::vector<uint> sample_idx;
+  if (method == INDEX_DIVE_SAMPLE_RANDOM) {
+    // Deterministic within one statement (query_id), varies per index (idx).
+    const uint64_t seed = static_cast<uint64_t>(thd->query_id) ^
+                          (static_cast<uint64_t>(idx) << 32);
+    sample_idx = random_sample_indices(total, K, seed);
+  } else if (method == INDEX_DIVE_SAMPLE_UNIFORM) {
+    sample_idx = uniform_sample_indices(total, K);
+  } else {
+    // INDEX_DIVE_SAMPLE_FIRST: take the first K ranges in index order.
+    const uint take = std::min(K, total);
+    sample_idx.resize(take);
+    for (uint i = 0; i < take; ++i) sample_idx[i] = i;
+  }
+
+  // real_keynr is an array member, so no null check is needed here (same
+  // indexing as check_quick_select itself).
+  const uint keynr = param->real_keynr[idx];
+  handler *file = param->table->file;
+  const KEY_PART *key_part = param->key[idx];
+  uchar min_buf[MAX_KEY_LENGTH + MAX_FIELD_WIDTH];
+  uchar max_buf[MAX_KEY_LENGTH + MAX_FIELD_WIDTH];
+
+  ha_rows total_rows = 0;
+  uint sampled = 0;
+  for (uint si : sample_idx) {
+    const SEL_ARG *arg = leaves[si];
+    key_range min_kr, max_kr;
+    setup_key_range_from_sel_arg(arg, key_part, min_buf, max_buf, &min_kr,
+                                 &max_kr);
+    const ha_rows rows = file->records_in_range(
+        keynr, (min_kr.key != nullptr) ? &min_kr : nullptr,
+        (max_kr.key != nullptr) ? &max_kr : nullptr);
+    // Engine could not estimate: abandon sampling entirely so the caller
+    // falls back to static statistics rather than extrapolating garbage.
+    if (rows == HA_POS_ERROR) return HA_POS_ERROR;
+    total_rows += rows;
+    ++sampled;
+  }
+
+  if (sampled == 0) return HA_POS_ERROR;
+
+  // Extrapolate: avg_rows * N (double intermediate avoids ha_rows overflow
+  // during the division).
+  const ha_rows estimated =
+      static_cast<ha_rows>(static_cast<double>(total_rows) / sampled * N);
+
+  param->used_heuristic_sampling = true;
+  param->heuristic_sample_rows = estimated;
+  return estimated;
+}
+
 ha_rows check_quick_select(THD *thd, RANGE_OPT_PARAM *param, uint idx,
                            bool index_only, SEL_ROOT *tree,
                            bool update_tbl_stats,
                            enum_order order_direction,
@@ -603,6 +762,62 @@ ha_rows check_quick_select(THD *thd, RANGE_OPT_PARAM *param, uint idx,
   uint range_count = 0;
   param->use_index_statistics = eq_ranges_exceeds_limit(
       tree, &range_count, thd->variables.eq_range_index_dive_limit);
+
+  /* --- BEGIN: Heuristic sampling intercept --- */
+  if (param->use_index_statistics &&
+      thd->variables.eq_range_index_dive_sample_count > 0 &&
+      qualifies_for_heuristic_sampling(thd, param->query_block,
+                                       param->table)) {
+    const ha_rows h = heuristic_sample_records_in_range(
+        thd, param, idx, tree, range_count, &thd->opt_trace);
+    if (h != HA_POS_ERROR) {
+      param->use_index_statistics = false;  // We did real dives (sampled)
+
+      /*
+        Early return: must set all output parameters that the rest of this
+        function would have set, because callers (IndexRangeScanIterator,
+        DsMrr_impl) rely on them being properly initialized.
+      */
+      *is_imerge_scan = true;
+      // Conservative: ROR requires detailed key-algorithm checks; skip here.
+      *is_ror_scan = false;
+
+      *mrr_flags =
+          (order_direction == ORDER_DESC) ? HA_MRR_USE_DEFAULT_IMPL : 0;
+      *mrr_flags |= HA_MRR_NO_ASSOCIATION;
+      if (order_direction != ORDER_NOT_RELEVANT) *mrr_flags |= HA_MRR_SORTED;
+      if (index_only &&
+          (file->index_flags(keynr, 0, true) & HA_KEYREAD_ONLY) &&
+          !(file->primary_key_is_clustered() &&
+            keynr == param->table->s->primary_key))
+        *mrr_flags |= HA_MRR_INDEX_ONLY;
+      if (thd->lex->sql_command != SQLCOM_SELECT)
+        *mrr_flags |= HA_MRR_SORTED;
+
+      // bufsize must be non-zero; use the session's read-buffer setting.
+      *bufsize = static_cast<uint>(thd->variables.read_rnd_buff_size);
+
+      // Simplified cost estimate (IO proportional to sampled row count).
+      cost->reset();
+      cost->add_io(static_cast<double>(h));
+
+      // Update table-level statistics the same way the normal path does.
+      param->table->quick_rows[keynr] = h;
+      if (update_tbl_stats) {
+        param->table->quick_keys.set_bit(keynr);
+        // NOTE(review): seq has not traversed any ranges at this point, so
+        // seq.max_key_part may still hold its initial value — confirm, or
+        // derive the used key-part count from the SEL_ROOT instead.
+        param->table->quick_key_parts[keynr] = seq.max_key_part + 1;
+        param->table->quick_n_ranges[keynr] = range_count;
+        param->table->quick_condition_rows =
+            min(param->table->quick_condition_rows, h);
+      }
+      param->table->possible_quick_keys.set_bit(keynr);
+
+      return h;
+    }
+    // h == HA_POS_ERROR: fall through to existing static statistics path
+  }
+  /* --- END: Heuristic sampling intercept --- */
+
   *is_imerge_scan = true;
   *is_ror_scan = !(file->index_flags(keynr, 0, true) & HA_KEY_SCAN_NOT_ROR);
diff --git a/sql/range_optimizer/index_range_scan_plan.h b/sql/range_optimizer/index_range_scan_plan.h
index 555484d8..792518a1 100644
--- a/sql/range_optimizer/index_range_scan_plan.h
+++ b/sql/range_optimizer/index_range_scan_plan.h
@@ -136,4 +136,13 @@ void trace_basic_info_index_range_scan(THD *thd, const AccessPath *path,
                                        const RANGE_OPT_PARAM *param,
                                        Opt_trace_object *trace_object);
 
+/// Estimate row count for an IN-list range using heuristic sampling.
+/// Returns HA_POS_ERROR if sampling is disabled or should fall back.
+ha_rows heuristic_sample_records_in_range(THD *thd,
+                                          RANGE_OPT_PARAM *param,
+                                          uint idx,
+                                          SEL_ROOT *sel_root,
+                                          uint N,
+                                          Opt_trace_context *trace);
+
 #endif  // SQL_RANGE_OPTIMIZER_INDEX_RANGE_SCAN_PLAN_H_
diff --git a/sql/range_optimizer/range_opt_param.h b/sql/range_optimizer/range_opt_param.h
index d47988e0..8974c25e 100644
--- a/sql/range_optimizer/range_opt_param.h
+++ b/sql/range_optimizer/range_opt_param.h
@@ -82,6 +82,16 @@ class RANGE_OPT_PARAM {
   bool has_errors() const { return (error_handler.has_errors()); }
 
   KEY_PART **key = nullptr; /* First key parts of keys used in the query */
+
+  /// Set to true when check_quick_select() used heuristic sampling
+  /// (instead of full index dives or static statistics) to estimate
+  /// the row count for an IN-list range.
+  bool used_heuristic_sampling{false};
+
+  /// The total estimated row count produced by heuristic sampling
+  /// (avg_rows_per_sampled_value x N).
+  /// Populated only when used_heuristic_sampling == true.
+  ha_rows heuristic_sample_rows{0};
 };
 
 #endif  // SQL_RANGE_OPTIMIZER_RANGE_OPT_PARAM_H_
diff --git a/sql/range_optimizer/range_optimizer.cc b/sql/range_optimizer/range_optimizer.cc
index e34860ba..10b71a8c 100644
--- a/sql/range_optimizer/range_optimizer.cc
+++ b/sql/range_optimizer/range_optimizer.cc
@@ -1797,6 +1797,52 @@ void print_tree(String *out, const char *tree_name, SEL_TREE *tree,
   }
 }
 
+// -----------------------------------------------------------------------
+// Heuristic sampling qualifier helpers
+// -----------------------------------------------------------------------
+
+/// True if the query block's WHERE condition contains a subquery.
+static bool where_has_subquery(const Query_block *qb) {
+  if (qb == nullptr) return false;
+  const Item *where = qb->where_cond();
+  return (where != nullptr && where->has_subquery());
+}
+
+/// True if any index on the table is a FULLTEXT index.
+static bool has_fulltext_index(const TABLE *table) {
+  if (table == nullptr) return false;
+  for (uint i = 0; i < table->s->keys; ++i) {
+    if (table->key_info[i].flags & HA_FULLTEXT) return true;
+  }
+  return false;
+}
+
+/// True if the query block references exactly one table.
+static bool is_single_table_query(const Query_block *qb) {
+  if (qb == nullptr) return false;
+  uint count = 0;
+  for (const Table_ref *tl = qb->get_table_list(); tl != nullptr;
+       tl = tl->next_local) {
+    if (++count > 1) return false;
+  }
+  return count == 1;
+}
+
+bool qualifies_for_heuristic_sampling(THD *thd, const Query_block *qb,
+                                      const TABLE *table) {
+  if (!is_single_table_query(qb)) return false;
+  if (has_fulltext_index(table)) return false;
+  if (where_has_subquery(qb)) return false;
+
+  // Cap total dives across partitions: each sampled value may be probed in
+  // every partition, so partitions x K must stay within the threshold.
+  // NOTE(review): requires partition_info to be a complete type in this TU.
+  static constexpr uint HEURISTIC_PARTITION_THRESHOLD = 500;
+  if (table->part_info != nullptr) {
+    const uint part_count = static_cast<uint>(table->part_info->num_parts);
+    const uint K = thd->variables.eq_range_index_dive_sample_count;
+    if (part_count * K > HEURISTIC_PARTITION_THRESHOLD) return false;
+  }
+  return true;
+}
+
 /*****************************************************************************
 ** Print a quick range for debugging
 ** TODO:
diff --git a/sql/range_optimizer/range_optimizer.h b/sql/range_optimizer/range_optimizer.h
index ddbbcdb9..d2a31351 100644
--- a/sql/range_optimizer/range_optimizer.h
+++ b/sql/range_optimizer/range_optimizer.h
@@ -217,4 +217,12 @@ bool comparable_in_index(Item *cond_func, const Field *field,
 
 void trace_quick_description(const AccessPath *path, Opt_trace_context *trace);
 
+/**
+ * Check whether a query qualifies for heuristic IN-list sampling.
+ * Returns true when all conditions are met (single-table, no FULLTEXT,
+ * no subquery in WHERE, partitions x K within threshold).
+ */
+bool qualifies_for_heuristic_sampling(THD *thd, const Query_block *qb,
+                                      const TABLE *table);
+
 #endif  // SQL_RANGE_OPTIMIZER_RANGE_OPTIMIZER_H_
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index f5625324..d5286940 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -3732,6 +3732,55 @@ static Sys_var_uint Sys_eq_range_index_dive_limit(
     CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, UINT_MAX32), DEFAULT(200),
     BLOCK_SIZE(1));
 
+/* -----------------------------------------------------------------------
+   NEW: eq_range_index_dive_sample_count
+   ----------------------------------------------------------------------- */
+
+static Sys_var_uint Sys_eq_range_index_dive_sample_count(
+    "eq_range_index_dive_sample_count",
+    "Number of IN-list values to sample when the list length exceeds "
+    "eq_range_index_dive_limit. The sampled values are used with "
+    "records_in_range() and the result is extrapolated to estimate the "
+    "total row count. Set to 0 to disable heuristic sampling and fall "
+    "back to static index statistics.",
+    HINT_UPDATEABLE SESSION_VAR(eq_range_index_dive_sample_count),
+    CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 200), DEFAULT(10), BLOCK_SIZE(1),
+    NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(nullptr), ON_UPDATE(nullptr));
+
+/* -----------------------------------------------------------------------
+   NEW: eq_range_index_dive_sample_method
+   ----------------------------------------------------------------------- */
+
+/** Allowed values for eq_range_index_dive_sample_method.
+    Order must match enum_index_dive_sample_method in system_variables.h. */
+static const char *index_dive_sample_method_names[] = {
+    "RANDOM",  /* Fisher-Yates partial shuffle */
+    "UNIFORM", /* evenly-spaced positions */
+    "FIRST",   /* first K values in index order */
+    NullS};
+
+static Sys_var_enum Sys_eq_range_index_dive_sample_method(
+    "eq_range_index_dive_sample_method",
+    "Sampling strategy used by heuristic index dive estimation when "
+    "eq_range_index_dive_sample_count > 0. "
+    "RANDOM: Fisher-Yates partial shuffle (my_rnd); "
+    "UNIFORM: evenly-spaced positions 0, N/K, 2N/K, ...; "
+    "FIRST: first K values in index order.",
+    SESSION_VAR(eq_range_index_dive_sample_method), CMD_LINE(REQUIRED_ARG),
+    index_dive_sample_method_names, DEFAULT(INDEX_DIVE_SAMPLE_RANDOM),
+    NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(nullptr), ON_UPDATE(nullptr));
+
 static Sys_var_ulong Sys_range_alloc_block_size(
     "range_alloc_block_size",
     "Allocation block size for storing ranges during optimization",
diff --git a/sql/system_variables.h b/sql/system_variables.h
index 1ff1d599..524f7ddf 100644
--- a/sql/system_variables.h
+++ b/sql/system_variables.h
@@ -235,6 +235,17 @@ struct System_variables {
   ulong auto_increment_increment, auto_increment_offset;
   ulong bulk_insert_buff_size;
   uint eq_range_index_dive_limit;
+  /**
+   * Number of IN-list values to sample for heuristic range estimation.
+   * SESSION scope, range [0, 200], default 10.
+   * 0 = disabled; fall back to static index statistics.
+   */
+  uint eq_range_index_dive_sample_count;
+  /**
+   * Sampling strategy for heuristic range estimation.
+   * SESSION scope, ENUM {RANDOM=0, UNIFORM=1, FIRST=2}, default RANDOM.
+   */
+  ulong eq_range_index_dive_sample_method;
   uint cte_max_recursion_depth;
   ulonglong histogram_generation_max_mem_size;
   ulong join_buff_size;
@@ -626,4 +637,15 @@ void add_to_status(System_status_var *to_var, System_status_var *from_var);
 
 void reset_system_status_vars(System_status_var *status_vars);
 
+/* -----------------------------------------------------------------------
+   Enum constants for eq_range_index_dive_sample_method.
+   Order must match index_dive_sample_method_names[] in sys_vars.cc.
+   ----------------------------------------------------------------------- */
+
+enum enum_index_dive_sample_method {
+  INDEX_DIVE_SAMPLE_RANDOM = 0,
+  INDEX_DIVE_SAMPLE_UNIFORM = 1,
+  INDEX_DIVE_SAMPLE_FIRST = 2
+};
+
 #endif  // SYSTEM_VARIABLES_INCLUDED