Description:
The query performs unnecessary sorting step:
```
select name, salary from (
SELECT name, salary,
row_number() over (partition by name order by salary) as rn
FROM users
) t where rn <= 3;
```
There's index on (name, salary) from users.
The plan is:
```
| -> Filter: (t.rn <= 3) (cost=0.338..112256 rows=332569)
-> Table scan on t (cost=2.5..2.5 rows=0)
-> Materialize CTE t (cost=0..0 rows=0)
-> Window aggregate: row_number() OVER (PARTITION BY users.`name` ORDER BY users.salary )
-> Sort: users.`name`, users.salary (cost=100414 rows=997807)
-> Index scan on users using name_salary (cost=100414 rows=997807)
```
The index (name, salary) is already sorted, so there's no need for one more sorting step.
The EXPLAIN FORMAT=JSON additionally confirms that the sort happens, here's an excerpt:
```
"windowing": {
"windows": [
{
"name": "<unnamed window>",
"using_filesort": true,
"filesort_key": [
"`name`",
"`salary`"
],
"functions": [
"row_number"
]
}
],
"cost_info": {
"sort_cost": "997807.00"
}
```
Expected: single index-scan without sorting, preferably with jumps to the next name after the current TOP3 one is figured out (the way Oracle RDBMS does it).
Is this an overlooked optimization or a bug?
How to repeat:
DROP TABLE IF EXISTS users;
CREATE TABLE users(
id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(64) NOT NULL,
salary MEDIUMINT UNSIGNED,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
deleted_at DATETIME
);
DROP TABLE IF EXISTS names;
CREATE TABLE names (
name varchar(300) not null
);
INSERT INTO names (name) VALUES
('Emma'), ('Liam'), ('Noah'), ('Olivia'), ('William'), ('James'), ('Isabella'), ('Oliver'), ('Sophia'),
('Benjamin'), ('Elijah'), ('Charlotte'), ('Lucas'), ('Mia'), ('Mason'), ('Amelia'), ('Ethan'), ('Harper'),
('Logan'), ('Ava'), ('Alexander'), ('Abigail'), ('Sebastian'), ('Emily'), ('Michael'), ('Evelyn'), ('Daniel'),
('Madison'), ('Matthew'), ('Avery'), ('Henry'), ('Sofia'), ('Jackson'), ('Ella'), ('Samuel'), ('Scarlett'),
('David'), ('Victoria'), ('Joseph'), ('Aria'), ('Carter'), ('Grace'), ('Owen'), ('Chloe'), ('Wyatt'),
('Penelope'), ('John'), ('Riley'), ('Jack'), ('Lily'), ('Luke'), ('Aubrey'), ('Jayden'), ('Zoey'), ('Dylan'),
('Lila'), ('Levi'), ('Ellie'), ('Gabriel'), ('Aaliyah'), ('Nicholas'), ('Addison'), ('Isaac'), ('Natalie'),
('Lincoln'), ('Luna'), ('Christopher'), ('Savannah'), ('Joshua'), ('Brooklyn'), ('Andrew'), ('Hazel'),
('Julian'), ('Aurora'), ('Grayson'), ('Audrey'), ('Leah'), ('Bella'), ('Mateo'), ('Claire'), ('Ryan'),
('Stella'), ('Jaxon'), ('Paisley'), ('Leo'), ('Skylar'), ('Jonathan'), ('Samantha'), ('Charles'), ('Nora'),
('Adam'), ('Eleanor'), ('Thomas'), ('Caroline'), ('Xavier'), ('Nova'), ('Eli'), ('Genesis'), ('Pete'), ('Ann');
-- insert 1m users
INSERT INTO users (name, salary, created_at, deleted_at)
WITH RECURSIVE seq AS (SELECT 1 as n UNION ALL SELECT 1 + n FROM seq LIMIT 1000000)
SELECT
-- random name
(SELECT name FROM names ORDER BY RAND() LIMIT 1),
-- random salary up to 10000
rand()*10000,
DATE_ADD('2020-01-01', INTERVAL n MINUTE),
-- 99% null, 1% deleted_at = created_at + random within 30 days
IF(rand()>0.01, null, DATE_ADD(DATE_ADD('2020-01-01', INTERVAL n MINUTE), INTERVAL rand()*30 DAY))
FROM seq;
create index name_salary on users(name, salary);
explain select name, salary from (
SELECT name, salary,
row_number() over (partition by name order by salary) as rn
FROM users
) t where rn <= 3;