Description:
An InnoDB fulltext query (with the ngram parser) needs to search all possible ngram tokens if the search word is longer than ngram_token_size.
In that case each token matches a lot of documents, so the query takes a lot of time.
For example (with ngram_token_size=2),
SELECT * FROM ft_test WHERE MATCH(contents) AGAINST('중국가을' IN BOOLEAN MODE);
18 rows in set (7.55 sec)
The above query needs to search for "중국" and "국가" and "가을". Unfortunately, all three words are meaningful in Korean, so each token matches a lot of documents.
mysql> SELECT COUNT(*) FROM ft_test WHERE MATCH(contents) AGAINST('중국' IN BOOLEAN MODE);
220239 row in set
mysql> SELECT COUNT(*) FROM ft_test WHERE MATCH(contents) AGAINST('국가' IN BOOLEAN MODE);
59202 row in set
mysql> SELECT COUNT(*) FROM ft_test WHERE MATCH(contents) AGAINST('가을' IN BOOLEAN MODE);
44640 row in set
And "storage/innobase/fts/fts0que.cc" puts these results into a vector and looks them up via serial array iteration, so it takes a lot of time.
An even worse problem is that we cannot simply increase ngram_token_size to 4 or 5, because tokens
shorter than ngram_token_size are not indexed at all (i.e. we would no longer be able to find words shorter than ngram_token_size).
How to repeat:
for(idx=0; idx<300000; idx++){
INSERT INTO ft_test VALUES (NULL, '중국');
}
for(idx=0; idx<50000; idx++){
INSERT INTO ft_test VALUES (NULL, '국가');
}
for(idx=0; idx<60000; idx++){
INSERT INTO ft_test VALUES (NULL, '가을');
}
for(idx=0; idx<20; idx++){
INSERT INTO ft_test VALUES (NULL, '중국가을');
}
And run the query
SELECT * FROM ft_test WHERE MATCH(contents) AGAINST('중국가을' IN BOOLEAN MODE);
Suggested fix:
Use hashmap data structure rather than vector for internal memory matching
(I am not sure this will help to make query faster)
OR add multi-gram index support.
Multi-gram means this. And I have implemented simple multi-gram parser (See bottom).
With this feature, we can increase ngram_token_size beyond 2 or 3. Searches for rather long words become faster, and we can still search for short tokens. The example query above takes less than 0.01 ms with the multi-gram implementation.
mysql> show variables like 'ngram_token_size';
+------------------+-------+
| Variable_name | Value |
+------------------+-------+
| ngram_token_size | 5 |
+------------------+-------+
mysql> set global innodb_ft_enable_stopword=off;
mysql> set innodb_ft_enable_stopword=off;
mysql> CREATE TABLE ft_test (
id int NOT NULL,
contents text,
PRIMARY KEY (id),
FULLTEXT KEY fx_contents (contents) WITH PARSER ngram
) ENGINE=InnoDB;
mysql> insert into ft_test values (1, 'MySQL support UTF8');
mysql> SELECT * FROM INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE order by position;
+-------+--------------+-------------+-----------+--------+----------+
| WORD | FIRST_DOC_ID | LAST_DOC_ID | DOC_COUNT | DOC_ID | POSITION |
+-------+--------------+-------------+-----------+--------+----------+
| mysql | 2 | 2 | 1 | 2 | 0 |
| ysql | 2 | 2 | 1 | 2 | 1 |
| sql | 2 | 2 | 1 | 2 | 2 |
| ql | 2 | 2 | 1 | 2 | 3 |
| suppo | 2 | 2 | 1 | 2 | 6 |
| uppor | 2 | 2 | 1 | 2 | 7 |
| pport | 2 | 2 | 1 | 2 | 8 |
| port | 2 | 2 | 1 | 2 | 9 |
| ort | 2 | 2 | 1 | 2 | 10 |
| rt | 2 | 2 | 1 | 2 | 11 |
| utf8 | 2 | 2 | 1 | 2 | 14 |
| tf8 | 2 | 2 | 1 | 2 | 15 |
| f8 | 2 | 2 | 1 | 2 | 16 |
+-------+--------------+-------------+-----------+--------+----------+
13 rows in set (0.00 sec)
>> Simple implementation for multi-gram parser.
$pwd
/mysql-5.7.17/plugin/fulltext/ngram_parser
$ cat patch.txt
--- plugin_ngram.cc.orig 2017-04-10 18:20:55.143305623 +0900
+++ plugin_ngram.cc 2017-04-10 18:16:24.669248384 +0900
@@ -15,6 +15,12 @@
#include <fts0tokenize.h>
+/**
+ * Indexing all tokens whose length is greater than (NGRAM_TOKEN_SIZE_MIN-1) and less than (NGRAM_TOKEN_SIZE_MAX+1)
+ * use ngram_token_size as NGRAM_TOKEN_SIZE_MAX
+ */
+#define NGRAM_TOKEN_SIZE_MIN 2
+
/* We are following InnoDB coding guidelines. */
/** Ngram token size, by default bigram. */
@@ -54,6 +60,9 @@
end = start + len;
n_chars = 0;
+ bool for_search = (param->mode==MYSQL_FTPARSER_FULL_BOOLEAN_INFO || param->mode==MYSQL_FTPARSER_WITH_STOPWORDS);
+ bool for_indexing = !for_search;
+
while (next < end) {
char_len = my_mbcharlen_ptr(cs, next, end);
@@ -63,6 +72,21 @@
} else {
/* Skip SPACE */
if (char_len == 1 && *next == ' ') {
+ // Add ngram token of which length is between ngram_token_size_min and ngram_token_size(ngram_token_max)
+ if(for_indexing && n_chars>=NGRAM_TOKEN_SIZE_MIN){
+ /* Add a ngram */
+ bool_info->position = start - doc;
+ ret = param->mysql_add_word(
+ param, start, next - start, bool_info);
+ RETURN_IF_ERROR(ret);
+ is_first = false;
+
+ /* Move a char forward */
+ start += my_mbcharlen_ptr(cs, start, end);
+ n_chars--;
+ continue;
+ }
+
start = next + 1;
next = start;
n_chars = 0;
@@ -88,6 +112,22 @@
}
}
+ // For the last token, we need to add recursively until (n_chars<NGRAM_TOKEN_SIZE_MIN)
+ while(for_indexing && start < end && n_chars>=NGRAM_TOKEN_SIZE_MIN && n_chars<ngram_token_size){
+ bool_info->position = start - doc;
+ ret = param->mysql_add_word(param, start, next - start, bool_info);
+ RETURN_IF_ERROR(ret);
+
+ /* Move a char forward */
+ char_len = my_mbcharlen_ptr(cs, start, end);
+ if(char_len<=0){
+ break;
+ }
+ start += char_len;
+ n_chars--;
+ is_first = false;
+ }
+
/* We handle unigram in cases below:
1. BOOLEAN MODE: suppose we have phrase search like ('"a bc"');
2. STOPWORD MODE: we should handle unigram when matching phrase.
@@ -300,3 +340,4 @@
0,
}
mysql_declare_plugin_end;
+