Bug #119225 weedout not work for rematerialized temptable
Submitted: 24 Oct 9:11 Modified: 31 Oct 5:20
Reporter: jump mason Email Updates:
Status: Verified Impact on me:
None 
Category:MySQL Server: Optimizer Severity:S3 (Non-critical)
Version:8.0.43 OS:Any
Assigned to: CPU Architecture:Any
Tags: duplicateweedout, temptable

[24 Oct 9:11] jump mason
Description:
WeedoutIterator use file->position to distinguish duplicated rows. For TempTable engine, ::position return the cursor, but not the unique value. So if a temptable is rematerialized, all rows' cursor changed, and weedout will not work correctly.

# expect 2 rows, but get 8:
mysql> select *
    -> from t_city_list as t1
    -> JOIN LATERAL (
    ->   select country from t_city_list as t2
    ->   where country = t1.country
    ->   group by country
    -> ) t3 ON t1.country = t3.country
    -> WHERE
    -> t3.country in (
    ->   select country_name
    ->   from t_countries
    ->   where country_name in (
    ->     select t4.country_filter
    ->     from t_filter t4
    ->     where t4.filter_id > 1
    ->   )
    -> )
    -> ;
+----+---------+-------+---------+
| id | country | city  | country |
+----+---------+-------+---------+
|  6 | Japan   | Tokyo | Japan   |
|  7 | Japan   | Osaka | Japan   |
|  6 | Japan   | Tokyo | Japan   |
|  7 | Japan   | Osaka | Japan   |
|  6 | Japan   | Tokyo | Japan   |
|  7 | Japan   | Osaka | Japan   |
|  6 | Japan   | Tokyo | Japan   |
|  7 | Japan   | Osaka | Japan   |
+----+---------+-------+---------+
8 rows in set (0.01 sec)

# semijoin off, so duplicateweedout not used, get 2 rows correctly.

mysql> SET SESSION optimizer_switch='semijoin=off';
Query OK, 0 rows affected (0.00 sec)

mysql> select *
    -> from t_city_list as t1
    -> JOIN LATERAL (
    ->   select country from t_city_list as t2
    ->   where country = t1.country
    ->   group by country
    -> ) t3 ON t1.country = t3.country
    -> WHERE
    -> t3.country in (
    ->   select country_name
    ->   from t_countries
    ->   where country_name in (
    ->     select t4.country_filter
    ->     from t_filter t4
    ->     where t4.filter_id > 1
    ->   )
    -> )
    -> ;
+----+---------+-------+---------+
| id | country | city  | country |
+----+---------+-------+---------+
|  6 | Japan   | Tokyo | Japan   |
|  7 | Japan   | Osaka | Japan   |
+----+---------+-------+---------+
2 rows in set (0.01 sec)

How to repeat:

# Test for weedout duplicate detection issue with temptable rematerialization
# Bug: Weedout fails to eliminate duplicates when using temptable engine with rematerialization
# because truncate() invalidates position pointers that weedout relies on
#

# Create test tables with data that will trigger both rematerialization and weedout
CREATE TABLE t_city_list(
  id bigint auto_increment primary key,
  country  varchar(64),
  city varchar(64)
);

INSERT INTO t_city_list(country,city) VALUES 
  ('China','Beijing'),('China','Guangzhou'),('China','Shenzhen'),('China','Hong Kong'),('China','Shanghai'),
  ('Japan','Tokyo'),('Japan','Osaka'),('USA','New York'),('USA','Los Angeles'),('USA','Chicago');

CREATE TABLE t_countries (country_name varchar(64));
CREATE TABLE t_filter (filter_id int, country_filter varchar(64));

INSERT INTO t_countries VALUES ('China'), ('Japan'), ('USA');
INSERT INTO t_countries VALUES ('China'), ('Japan'), ('USA');
INSERT INTO t_filter VALUES (1, 'China'), (2, 'Japan');
INSERT INTO t_filter VALUES (1, 'China'), (2, 'Japan');

# Force use of temptable engine for temporary operations
SET SESSION internal_tmp_mem_storage_engine=TempTable;

# Configure optimizer to use weedout strategy
SET SESSION optimizer_switch='semijoin=on,materialization=off,firstmatch=off,loosescan=off,duplicateweedout=on,block_nested_loop=off';

# expect 2 rows, but get 8 rows
select * 
from t_city_list as t1 
JOIN LATERAL (
  select country from t_city_list as t2
  where country = t1.country
  group by country
) t3 ON t1.country = t3.country
WHERE
t3.country in (
  select country_name
  from t_countries
  where country_name in (
    select t4.country_filter
    from t_filter t4
    where t4.filter_id > 1
  )
)

SET SESSION optimizer_switch='semijoin=off';
# get 2 rows if no semi join(duplicateweedout)
select * 
from t_city_list as t1 
JOIN LATERAL (
  select country from t_city_list as t2
  where country = t1.country
  group by country
) t3 ON t1.country = t3.country
WHERE
t3.country in (
  select country_name
  from t_countries
  where country_name in (
    select t4.country_filter
    from t_filter t4
    where t4.filter_id > 1
  )
)

# Clean up
DROP TABLE t_city_list, t_countries, t_filter;

# Reset settings
SET SESSION internal_tmp_mem_storage_engine=DEFAULT;
SET SESSION optimizer_switch=DEFAULT;
[31 Oct 5:20] Chaithra Marsur Gopala Reddy
Hi jump mason,

Thank you for the test case. Verified as described.