Description:
WeedoutIterator use file->position to distinguish duplicated rows. For TempTable engine, ::position return the cursor, but not the unique value. So if a temptable is rematerialized, all rows' cursor changed, and weedout will not work correctly.
# expect 2 rows, but get 8:
mysql> select *
-> from t_city_list as t1
-> JOIN LATERAL (
-> select country from t_city_list as t2
-> where country = t1.country
-> group by country
-> ) t3 ON t1.country = t3.country
-> WHERE
-> t3.country in (
-> select country_name
-> from t_countries
-> where country_name in (
-> select t4.country_filter
-> from t_filter t4
-> where t4.filter_id > 1
-> )
-> )
-> ;
+----+---------+-------+---------+
| id | country | city | country |
+----+---------+-------+---------+
| 6 | Japan | Tokyo | Japan |
| 7 | Japan | Osaka | Japan |
| 6 | Japan | Tokyo | Japan |
| 7 | Japan | Osaka | Japan |
| 6 | Japan | Tokyo | Japan |
| 7 | Japan | Osaka | Japan |
| 6 | Japan | Tokyo | Japan |
| 7 | Japan | Osaka | Japan |
+----+---------+-------+---------+
8 rows in set (0.01 sec)
# semijoin off, so duplicateweedout not used, get 2 rows correctly.
mysql> SET SESSION optimizer_switch='semijoin=off';
Query OK, 0 rows affected (0.00 sec)
mysql> select *
-> from t_city_list as t1
-> JOIN LATERAL (
-> select country from t_city_list as t2
-> where country = t1.country
-> group by country
-> ) t3 ON t1.country = t3.country
-> WHERE
-> t3.country in (
-> select country_name
-> from t_countries
-> where country_name in (
-> select t4.country_filter
-> from t_filter t4
-> where t4.filter_id > 1
-> )
-> )
-> ;
+----+---------+-------+---------+
| id | country | city | country |
+----+---------+-------+---------+
| 6 | Japan | Tokyo | Japan |
| 7 | Japan | Osaka | Japan |
+----+---------+-------+---------+
2 rows in set (0.01 sec)
How to repeat:
# Test for weedout duplicate detection issue with temptable rematerialization
# Bug: Weedout fails to eliminate duplicates when using temptable engine with rematerialization
# because truncate() invalidates position pointers that weedout relies on
#
# Create test tables with data that will trigger both rematerialization and weedout
CREATE TABLE t_city_list(
id bigint auto_increment primary key,
country varchar(64),
city varchar(64)
);
INSERT INTO t_city_list(country,city) VALUES
('China','Beijing'),('China','Guangzhou'),('China','Shenzhen'),('China','Hong Kong'),('China','Shanghai'),
('Japan','Tokyo'),('Japan','Osaka'),('USA','New York'),('USA','Los Angeles'),('USA','Chicago');
CREATE TABLE t_countries (country_name varchar(64));
CREATE TABLE t_filter (filter_id int, country_filter varchar(64));
INSERT INTO t_countries VALUES ('China'), ('Japan'), ('USA');
INSERT INTO t_countries VALUES ('China'), ('Japan'), ('USA');
INSERT INTO t_filter VALUES (1, 'China'), (2, 'Japan');
INSERT INTO t_filter VALUES (1, 'China'), (2, 'Japan');
# Force use of temptable engine for temporary operations
SET SESSION internal_tmp_mem_storage_engine=TempTable;
# Configure optimizer to use weedout strategy
SET SESSION optimizer_switch='semijoin=on,materialization=off,firstmatch=off,loosescan=off,duplicateweedout=on,block_nested_loop=off';
# expect 2 rows, but get 8 rows
select *
from t_city_list as t1
JOIN LATERAL (
select country from t_city_list as t2
where country = t1.country
group by country
) t3 ON t1.country = t3.country
WHERE
t3.country in (
select country_name
from t_countries
where country_name in (
select t4.country_filter
from t_filter t4
where t4.filter_id > 1
)
)
SET SESSION optimizer_switch='semijoin=off';
# get 2 rows if no semi join(duplicateweedout)
select *
from t_city_list as t1
JOIN LATERAL (
select country from t_city_list as t2
where country = t1.country
group by country
) t3 ON t1.country = t3.country
WHERE
t3.country in (
select country_name
from t_countries
where country_name in (
select t4.country_filter
from t_filter t4
where t4.filter_id > 1
)
)
# Clean up
DROP TABLE t_city_list, t_countries, t_filter;
# Reset settings
SET SESSION internal_tmp_mem_storage_engine=DEFAULT;
SET SESSION optimizer_switch=DEFAULT;
Description: WeedoutIterator use file->position to distinguish duplicated rows. For TempTable engine, ::position return the cursor, but not the unique value. So if a temptable is rematerialized, all rows' cursor changed, and weedout will not work correctly. # expect 2 rows, but get 8: mysql> select * -> from t_city_list as t1 -> JOIN LATERAL ( -> select country from t_city_list as t2 -> where country = t1.country -> group by country -> ) t3 ON t1.country = t3.country -> WHERE -> t3.country in ( -> select country_name -> from t_countries -> where country_name in ( -> select t4.country_filter -> from t_filter t4 -> where t4.filter_id > 1 -> ) -> ) -> ; +----+---------+-------+---------+ | id | country | city | country | +----+---------+-------+---------+ | 6 | Japan | Tokyo | Japan | | 7 | Japan | Osaka | Japan | | 6 | Japan | Tokyo | Japan | | 7 | Japan | Osaka | Japan | | 6 | Japan | Tokyo | Japan | | 7 | Japan | Osaka | Japan | | 6 | Japan | Tokyo | Japan | | 7 | Japan | Osaka | Japan | +----+---------+-------+---------+ 8 rows in set (0.01 sec) # semijoin off, so duplicateweedout not used, get 2 rows correctly. mysql> SET SESSION optimizer_switch='semijoin=off'; Query OK, 0 rows affected (0.00 sec) mysql> select * -> from t_city_list as t1 -> JOIN LATERAL ( -> select country from t_city_list as t2 -> where country = t1.country -> group by country -> ) t3 ON t1.country = t3.country -> WHERE -> t3.country in ( -> select country_name -> from t_countries -> where country_name in ( -> select t4.country_filter -> from t_filter t4 -> where t4.filter_id > 1 -> ) -> ) -> ; +----+---------+-------+---------+ | id | country | city | country | +----+---------+-------+---------+ | 6 | Japan | Tokyo | Japan | | 7 | Japan | Osaka | Japan | +----+---------+-------+---------+ 2 rows in set (0.01 sec) How to repeat: # Test for weedout duplicate detection issue with temptable rematerialization # Bug: Weedout fails to eliminate duplicates when using temptable engine with rematerialization # because truncate() invalidates position pointers that weedout relies on # # Create test tables with data that will trigger both rematerialization and weedout CREATE TABLE t_city_list( id bigint auto_increment primary key, country varchar(64), city varchar(64) ); INSERT INTO t_city_list(country,city) VALUES ('China','Beijing'),('China','Guangzhou'),('China','Shenzhen'),('China','Hong Kong'),('China','Shanghai'), ('Japan','Tokyo'),('Japan','Osaka'),('USA','New York'),('USA','Los Angeles'),('USA','Chicago'); CREATE TABLE t_countries (country_name varchar(64)); CREATE TABLE t_filter (filter_id int, country_filter varchar(64)); INSERT INTO t_countries VALUES ('China'), ('Japan'), ('USA'); INSERT INTO t_countries VALUES ('China'), ('Japan'), ('USA'); INSERT INTO t_filter VALUES (1, 'China'), (2, 'Japan'); INSERT INTO t_filter VALUES (1, 'China'), (2, 'Japan'); # Force use of temptable engine for temporary operations SET SESSION internal_tmp_mem_storage_engine=TempTable; # Configure optimizer to use weedout strategy SET SESSION optimizer_switch='semijoin=on,materialization=off,firstmatch=off,loosescan=off,duplicateweedout=on,block_nested_loop=off'; # expect 2 rows, but get 8 rows select * from t_city_list as t1 JOIN LATERAL ( select country from t_city_list as t2 where country = t1.country group by country ) t3 ON t1.country = t3.country WHERE t3.country in ( select country_name from t_countries where country_name in ( select t4.country_filter from t_filter t4 where t4.filter_id > 1 ) ) SET SESSION optimizer_switch='semijoin=off'; # get 2 rows if no semi join(duplicateweedout) select * from t_city_list as t1 JOIN LATERAL ( select country from t_city_list as t2 where country = t1.country group by country ) t3 ON t1.country = t3.country WHERE t3.country in ( select country_name from t_countries where country_name in ( select t4.country_filter from t_filter t4 where t4.filter_id > 1 ) ) # Clean up DROP TABLE t_city_list, t_countries, t_filter; # Reset settings SET SESSION internal_tmp_mem_storage_engine=DEFAULT; SET SESSION optimizer_switch=DEFAULT;