Description:
If there are non-deterministic expressions in in-subquery, and semi-join is chosen, the query may produce wrong result if materialization strategy is not chosen.
For example,
create table t1 (a int, b int, c int);
insert into t1 values(1,1,1),(2,2,2),(3,3,3);
insert into t1 select a, b * 2, c from t1;
insert into t1 select a, b * 2, c from t1;
insert into t1 select a, b * 2, c from t1;
create index idx_b on t1(b);
create table t2 (a int, b int, c int);
insert into t2 values(1,1,1),(2,2,2),(3,3,3);
create table inner_t (a int, start_time datetime, expires_time datetime);
insert into inner_t values(3, date_add(sysdate(), interval 1 hour), "2023-09-01 00:00:00"), (2, date_add(sysdate(), interval 1 hour), "2023-09-01 00:00:00"), (3, date_sub(sysdate(), interval 1 hour), date_add(sysdate(), interval 1 hour));
insert into inner_t select a, start_time, expires_time from inner_t;
insert into inner_t select a, start_time, expires_time from inner_t;
insert into inner_t select a, start_time, expires_time from inner_t;
set optimizer_switch='materialization=off';
mysql> select * from t1 left join t2 on t1.a = t2.a where t1.b in (select a from inner_t where sysdate() between start_time and expires_time);
Empty set (0.00 sec)
mysql> set optimizer_switch='semijoin=off';
Query OK, 0 rows affected (0.00 sec)
mysql> select * from t1 left join t2 on t1.a = t2.a where t1.b in (select a from inner_t where sysdate() between start_time and expires_time);
+------+------+------+------+------+------+
| a | b | c | a | b | c |
+------+------+------+------+------+------+
| 3 | 3 | 3 | 3 | 3 | 3 |
+------+------+------+------+------+------+
1 row in set (0.00 sec)
the query "select * from t1 left join t2 on t1.a = t2.a where t1.b in (select a from inner_t where sysdate() between start_time and expires_time)" should output one row, but it output empty result if semijoin is chosen but materialization is not used.
How to repeat:
create table t1 (a int, b int, c int);
insert into t1 values(1,1,1),(2,2,2),(3,3,3);
insert into t1 select a, b * 2, c from t1;
insert into t1 select a, b * 2, c from t1;
insert into t1 select a, b * 2, c from t1;
create index idx_b on t1(b);
create table t2 (a int, b int, c int);
insert into t2 values(1,1,1),(2,2,2),(3,3,3);
create table inner_t (a int, start_time datetime, expires_time datetime);
insert into inner_t values(3, date_add(sysdate(), interval 1 hour), "2023-09-01 00:00:00"), (2, date_add(sysdate(), interval 1 hour), "2023-09-01 00:00:00"), (3, date_sub(sysdate(), interval 1 hour), date_add(sysdate(), interval 1 hour));
insert into inner_t select a, start_time, expires_time from inner_t;
insert into inner_t select a, start_time, expires_time from inner_t;
insert into inner_t select a, start_time, expires_time from inner_t;
set optimizer_switch='materialization=off';
select * from t1 left join t2 on t1.a = t2.a where t1.b in (select a from inner_t where sysdate() between start_time and expires_time);
Suggested fix:
The plan of the query is:
mysql> explain format=tree select * from t1 left join t2 on t1.a = t2.a where t1.b in (select a from inner_t where sysdate() between start_time and expires_time);
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| EXPLAIN |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| -> Filter: (sysdate() between inner_t.start_time and inner_t.expires_time) (cost=2.46 rows=21.3)
-> Left hash join (t2.a = t1.a) (cost=2.46 rows=21.3)
-> Remove duplicate t1 rows using temporary table (weedout) (cost=2.83 rows=7.11)
-> Nested loop inner join (cost=2.83 rows=7.11)
-> Filter: (inner_t.a is not null) (cost=0.517 rows=2.67)
-> Table scan on inner_t (cost=0.517 rows=24)
-> Index lookup on t1 using idx_b (b=inner_t.a) (cost=0.7 rows=2.67)
-> Hash
-> Table scan on t2 (cost=0.0774 rows=3)
|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
1 row in set (0.01 sec)
From the plan, we can see that the non-deterministic expression "sysdate() between inner_t.start_time and inner_t.expires_time" has been attached to the top of plan, but it should be placed on above of Table scan on inner_t(just along with inner_t.a is not null). Through debugging, we found the prefix_tables of t1 doesn't have RAND_TABLE_BIT, and so the non-deterministic expression cann't be attach to t1, in order to solve the problem, we need to add RAND_TABLE_BIT to the prefix_tables of the last table of sj-nest like sjm in JOIN::set_prefix_tables.
```
diff --git a/sql/sql_optimizer.cc b/sql/sql_optimizer.cc
index 03f6d3f5625..5265045c9ac 100644
--- a/sql/sql_optimizer.cc
+++ b/sql/sql_optimizer.cc
@@ -5178,6 +5178,8 @@ void JOIN::set_prefix_tables() {
table_map saved_tables_map = (table_map)0;
JOIN_TAB *last_non_sjm_tab = nullptr; // Track the last non-sjm table
+ // Track the last table of sj_nest, key is the idx of first sj inner tables.
+ std::unordered_map<uint, JOIN_TAB *> last_sj_tab;
for (uint i = const_tables; i < tables; i++) {
JOIN_TAB *const tab = best_ref[i];
@@ -5219,6 +5221,10 @@ void JOIN::set_prefix_tables() {
current_tables_map |= tab->table_ref->map();
tab->set_prefix_tables(current_tables_map, prev_tables_map);
prev_tables_map = current_tables_map;
+
+ // If this is a sj_inner tab, update the last_sj_tab.
+ if (tab->get_sj_strategy() != SJ_OPT_NONE)
+ last_sj_tab[tab->first_sj_inner()] = tab;
}
}
/*
@@ -5227,6 +5233,14 @@ void JOIN::set_prefix_tables() {
*/
if (last_non_sjm_tab != nullptr)
last_non_sjm_tab->add_prefix_tables(RAND_TABLE_BIT);
+
+ /*
+ At the end of each semi-join nest,
+ add non-deterministic expressions to the last table of the nest:
+ */
+ for (auto iter : last_sj_tab) {
+ iter.second->add_prefix_tables(RAND_TABLE_BIT);
+ }
}
```