Bug #84400 FullText can not found some word including ', for example: "Xi'an" and "Yan'an"
Submitted: 4 Jan 2017 6:26 Modified: 4 Jan 2017 13:10
Reporter: Daniel Kou Email Updates:
Status: Verified Impact on me:
None 
Category:MySQL Server: FULLTEXT search Severity:S3 (Non-critical)
Version:5.7 OS:Any
Assigned to: CPU Architecture:Any

[4 Jan 2017 6:26] Daniel Kou
Description:
when we use a word in a column which is "Xi'an", "Yan'an",  both are the names of China City.
we can not found it using the following sql:
select * from tbl_district_en where match(name,name_en) against('+Xi\'an*' in boolean mode);

but we can found another city named "Xi'ao"  using the following sql:
select * from tbl_district_en where match(name,name_en) against('+Xi\'ao*' in boolean mode);

as I investgate,  the reason is "Xi'an" and "Yan'an" include a stopword "an",  but as MySQL document described,  any word can include ONE ',  like "aren't"  "you'll" , it should be used as ONE word(only example, ignore both are stop words).

so this should be a bug,  MySQL can not reject a word including stopwords.

How to repeat:
Create a table as the following:
CREATE TABLE `tbl_district_en` (
  `name` varchar(255) CHARACTER SET utf8mb4 NOT NULL DEFAULT '',
  `name_EN` varchar(128) CHARACTER SET utf8mb4 DEFAULT NULL,
  FULLTEXT KEY `full_name` (`name`,`name_EN`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

insert into tbl_district_en values('', 'Xi\'an'),('', 'Yan\'an'),('', 'Xi\'ao');

select * from tbl_district_en where match(name,name_en) against('+Xi\'an*' in boolean mode);
select * from tbl_district_en where match(name,name_en) against('+Xi\'ao*' in boolean mode);

Suggested fix:
MySQL fulltext search should support a word including "'" and stopwords.
[4 Jan 2017 13:10] MySQL Verification Team
Hello Daniel Kou,

Thank you for the report and feedback.
Verifying this as FR to support single quote(').

Thanks,
Umesh
[4 Jan 2017 13:11] MySQL Verification Team
-- 5.7.17

## "Xi'an" would tokenize to 2 character "Xi" and "an"
-- "an" is stop word

root@localhost [test]> CREATE TABLE `tbl_district_en` (
    ->   `name` varchar(255) CHARACTER SET utf8mb4 NOT NULL DEFAULT '',
    ->   `name_EN` varchar(128) CHARACTER SET utf8mb4 DEFAULT NULL,
    ->   FULLTEXT KEY `full_name` (`name`,`name_EN`)
    -> ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
Query OK, 0 rows affected (0.02 sec)

root@localhost [test]> select @@innodb_ft_min_token_size;
+----------------------------+
| @@innodb_ft_min_token_size |
+----------------------------+
|                          3 |
+----------------------------+
1 row in set (0.00 sec)

root@localhost [test]> set global innodb_ft_aux_table="test/tbl_district_en";
Query OK, 0 rows affected (0.00 sec)

root@localhost [test]> insert into tbl_district_en values('', 'Xi\'an'),('', 'Yan\'an'),('', 'Xi\'ao');
Query OK, 3 rows affected (0.00 sec)
Records: 3  Duplicates: 0  Warnings: 0

root@localhost [test]> select * from information_schema.innodb_ft_index_cache;
+------+--------------+-------------+-----------+--------+----------+
| WORD | FIRST_DOC_ID | LAST_DOC_ID | DOC_COUNT | DOC_ID | POSITION |
+------+--------------+-------------+-----------+--------+----------+
| yan  |            3 |           3 |         1 |      3 |        0 |
+------+--------------+-------------+-----------+--------+----------+
1 row in set (0.00 sec)

root@localhost [test]> select * from tbl_district_en where match(name,name_en) against('+Xi\'an*' in boolean mode);
Empty set (0.01 sec)

root@localhost [test]> select * from tbl_district_en where match(name,name_en) against('+Xi\'ao*' in boolean mode);
Empty set (0.00 sec)

root@localhost [test]> select * from tbl_district_en;
+------+---------+
| name | name_EN |
+------+---------+
|      | Xi'an   |
|      | Yan'an  |
|      | Xi'ao   |
+------+---------+
3 rows in set (0.00 sec)

-- restart with --innodb_ft_min_token_size=1

root@localhost [test]> select @@innodb_ft_min_token_size;
+----------------------------+
| @@innodb_ft_min_token_size |
+----------------------------+
|                          1 |
+----------------------------+
1 row in set (0.00 sec)

root@localhost [test]> set global innodb_ft_aux_table="test/tbl_district_en";
Query OK, 0 rows affected (0.00 sec)

root@localhost [test]> select * from information_schema.innodb_ft_index_cache;
Empty set (0.00 sec)

root@localhost [test]> optimize table tbl_district_en;
+----------------------+----------+----------+-------------------------------------------------------------------+
| Table                | Op       | Msg_type | Msg_text                                                          |
+----------------------+----------+----------+-------------------------------------------------------------------+
| test.tbl_district_en | optimize | note     | Table does not support optimize, doing recreate + analyze instead |
| test.tbl_district_en | optimize | status   | OK                                                                |
+----------------------+----------+----------+-------------------------------------------------------------------+
2 rows in set (0.03 sec)

root@localhost [test]> select * from information_schema.innodb_ft_index_cache;
+------+--------------+-------------+-----------+--------+----------+
| WORD | FIRST_DOC_ID | LAST_DOC_ID | DOC_COUNT | DOC_ID | POSITION |
+------+--------------+-------------+-----------+--------+----------+
| ao   |            4 |           4 |         1 |      4 |        3 |
| xi   |            2 |           4 |         2 |      2 |        0 |
| xi   |            2 |           4 |         2 |      4 |        0 |
| yan  |            3 |           3 |         1 |      3 |        0 |
+------+--------------+-------------+-----------+--------+----------+
4 rows in set (0.00 sec)

root@localhost [test]> select * from tbl_district_en where match(name,name_en) against('+Xi\'an*' in boolean mode);
Empty set (0.00 sec)

root@localhost [test]> select * from tbl_district_en where match(name,name_en) against('+Xi\'ao*' in boolean mode);
+------+---------+
| name | name_EN |
+------+---------+
|      | Xi'ao   |
+------+---------+
1 row in set (0.00 sec)

root@localhost [test]> select * from tbl_district_en where match(name,name_en) against('+Xi*' in boolean mode);
+------+---------+
| name | name_EN |
+------+---------+
|      | Xi'an   |
|      | Xi'ao   |
+------+---------+
2 rows in set (0.00 sec)