Bug #76314 Semi-join duplicate elimination gives lower cost than no duplicate elimination
Submitted: 13 Mar 2015 13:17 Modified: 30 Sep 2015 15:29
Reporter: Øystein Grøvlen Email Updates:
Status: Closed Impact on me:
None 
Category:MySQL Server: Optimizer Severity:S3 (Non-critical)
Version:5.7 OS:Any
Assigned to: CPU Architecture:Any

[13 Mar 2015 13:17] Øystein Grøvlen
Description:
Optimizer trace for the following query (found in subquery_sj.inc):

explain select t21.* from t21,t22 where t21.a = t22.a and t22.a in 
  (select t12.a from t11, t12 where t11.a in(255,256) and t11.a = t12.a and t11.c is null) and t22.c is null order by t21.a;

shows that optimizer calculates lower cost with duplicate elimination than without for MaterializedScan and DuplicateWeedout.
That should not be possible since the duplicate elimination will have a cost.  Here is the relevant part of optimizer trace for the query:

      "rest_of_plan": [\
        {\
          "plan_prefix": [\
            "`t11`",\
            "`t12`",\
            "`t22`"\
          ],\
          "table": "`t21`",\
          "best_access_path": {\
            "considered_access_paths": [\
              {\
                "rows_to_scan": 26,\
                "access_type": "scan",\
                "using_join_cache": true,\
                "buffers_needed": 1,\
                "resulting_rows": 26,\
                "cost": 2.352,\
                "chosen": true\
              }\
            ]\
          },\
          "condition_filtering_pct": 10,\
          "rows_for_plan": 0.676,\
          "cost_for_plan": 14.227,\
          "semijoin_strategy_choice": [\
            {\
              "strategy": "MaterializeScan",\
              "recalculate_access_paths_and_cost": {\
                "tables": [\
                  {\
                    "table": "`t22`",\
                    "best_access_path": {\
                      "considered_access_paths": [\
                        {\
                          "rows_to_scan": 26,\
                          "access_type": "scan",\
                          "using_join_cache": true,\
                          "buffers_needed": 1,\
                          "resulting_rows": 2.6,\
                          "cost": 5.8101,\
                          "chosen": true\
                        }\
                      ]\
                    }\
                  },\
                  {\
                    "table": "`t21`",\
                    "best_access_path": {\
                      "considered_access_paths": [\
                        {\
                          "rows_to_scan": 26,\
                          "access_type": "scan",\
                          "using_join_cache": true,\
                          "buffers_needed": 1,\
                          "resulting_rows": 26,\
                          "cost": 1.338,\
                          "chosen": true\
                        }\
                      ]\
                    }\
                  }\
                ]\
              },\
              "cost": 13.248,\
              "rows": 0.676,\
              "duplicate_tables_left": false,\
              "chosen": true\
            },\
            {\
              "strategy": "DuplicatesWeedout",\
              "cost": 13.317,\
              "rows": 0.676,\
              "duplicate_tables_left": false,\
              "chosen": false\
            }\
          ],\
          "sort_cost": 0.676,\
          "new_cost_for_plan": 13.924,\
          "chosen": true\
        }\
      ]\
    }\
  ]\
},\

As we can see, the cost of an ordinary join is 14.227 while the cost with MaterializedScan is 13.248 and DuplicateWeedout 13.317.

How to repeat:
CREATE TABLE t1 (
  a int(11) NOT NULL,
  b int(11) NOT NULL,
  c datetime default NULL,
  PRIMARY KEY  (a),
  KEY idx_bc (b,c)
);

INSERT INTO t1 VALUES 
(406989,67,'2006-02-23 17:08:46'), (150078,67,'2005-10-26 11:17:45'),
(406993,67,'2006-02-27 11:20:57'), (245655,67,'2005-12-08 15:59:08'),
(406994,67,'2006-02-27 11:26:46'), (256,67,NULL),
(398341,67,'2006-02-20 04:48:44'), (254,67,NULL),(1120,67,NULL),
(406988,67,'2006-02-23 17:07:22'), (255,67,NULL),
(398340,67,'2006-02-20 04:38:53'),(406631,67,'2006-02-23 10:49:42'),
(245653,67,'2005-12-08 15:59:07'),(406992,67,'2006-02-24 16:47:18'),
(245654,67,'2005-12-08 15:59:08'),(406995,67,'2006-02-28 11:55:00'),
(127261,67,'2005-10-13 12:17:58'),(406991,67,'2006-02-24 16:42:32'),
(245652,67,'2005-12-08 15:58:27'),(398545,67,'2006-02-20 04:53:13'),
(154504,67,'2005-10-28 11:53:01'),(9199,67,NULL),(1,67,'2006-02-23 15:01:35'),
(223456,67,NULL),(4101,67,NULL),(1133,67,NULL),
(406990,67,'2006-02-23 18:01:45'),(148815,67,'2005-10-25 15:34:17'),
(148812,67,'2005-10-25 15:30:01'),(245651,67,'2005-12-08 15:58:27'),
(154503,67,'2005-10-28 11:52:38');

create table t11 select * from t1 where b = 67 AND (c IS NULL OR c > NOW()) order by 3 asc;
create table t12 select * from t1 where b = 67 AND (c IS NULL OR c > NOW()) order by 3 desc;
create table t21 select * from t1 where b = 67 AND (c IS NULL OR c > '2005-12-08') order by 3 asc;
create table t22 select * from t1 where b = 67 AND (c IS NULL OR c > '2005-12-08') order by 3 desc;

update t22 set c = '2005-12-08 15:58:27' where a = 255;

SET optimizer_trace="enabled=on";
SET optimizer_trace_max_mem_size=1000000;

explain select t21.* from t21,t22 where t21.a = t22.a and 
t22.a in (select t12.a from t11, t12 where t11.a in(255,256) and t11.a = t12.a and t11.c is null) and t22.c is null order by t21.a;

select * from information_schema.optimizer_trace into outfile "tmp.out";

drop table t1, t11, t12, t21, t22;
[30 Sep 2015 15:29] Paul DuBois
Noted in 5.7.9, 5.8.0 changelogs.

Optimizer estimates for filtering conditions could lead to suboptimal
execution plans if the expected number of rows selected from a table
was between 0 and 1. The estimate is now made to be at least 1.