Description:
A text is split into several words, and each word may be contained in many documents.
In the FTS cache, each word uses fts_tokenizer_word_t::nodes to store doc ids. However, the doc ids in fts_tokenizer_word_t::nodes are not guaranteed to be ordered, whereas the function 'fts_phrase_or_proximity_search' expects fts_query_t::matched_array[i] to be sorted by doc id in ascending order. An unordered fts_tokenizer_word_t::nodes may therefore cause incorrect full-text search results.
How to repeat:
# Reproduction script in mysqltest format. It relies on the debug_sync
# facility, hence the have_debug_sync.inc include below.
source include/have_debug_sync.inc;
# Table with an explicit FTS_DOC_ID column and a FULLTEXT index on
# opening_line that uses the ngram parser.
CREATE TABLE opening_lines (
FTS_DOC_ID BIGINT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
opening_line varchar(100),
author VARCHAR(200),
title VARCHAR(200),
FULLTEXT (opening_line) WITH PARSER ngram
) ENGINE=InnoDB;
# Two connections are used so that the two INSERTs can be interleaved.
connect (con1,localhost,root,,);
connect (con2,localhost,root,,);
connection con1;
# con1: on reaching sync point bgc_before_flush_stage, emit signal
# session1_before_flush and then block until signal
# session2_finish_add_fts_cache arrives.
set session debug_sync='bgc_before_flush_stage signal session1_before_flush wait_for session2_finish_add_fts_cache';
# Issue the INSERT asynchronously (send) so the script can switch to con2
# while con1 is blocked at the sync point.
send INSERT INTO opening_lines(opening_line,author,title) VALUES ('TestOK','author1','title1');
connection con2;
# con2: wait until con1 has signalled that it reached its sync point.
set session debug_sync='now wait_for session1_before_flush';
# con2: emit session2_finish_add_fts_cache (which unblocks con1) when this
# session reaches sync point fts_instrument_sync_request.
set session debug_sync='fts_instrument_sync_request signal session2_finish_add_fts_cache';
# NOTE(review): presumably this row receives the higher FTS_DOC_ID yet is added
# to the FTS cache before the row from con1, which would yield the unordered
# doc ids described in the report -- confirm against the server code.
INSERT INTO opening_lines(opening_line,author,title) VALUES ('OKTest','author2','title2');
connection con1;
# Collect the result of the INSERT that con1 issued with send.
reap;
# Lookup with LIKE on opening_line, the echo line states the expected row count.
echo has one row in result;
select * from opening_lines where opening_line like 'TestOK';
# The same lookup through MATCH ... AGAINST in boolean mode. According to the
# report this returns no row, which is the incorrect result being demonstrated.
echo has no row in result;
select * from opening_lines where MATCH(opening_line) AGAINST('TestOK' in boolean mode);
# Cleanup: close both extra connections and remove the test table.
disconnect con1;
disconnect con2;
drop table opening_lines;
Run this test script and you can see the two queries return different results.
Suggested fix:
The simplest fix is to keep fts_query_t::matched_array[i] sorted by doc id in ascending order, by moving each new entry to its correct position when it is pushed.
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index ff360ea7857..607768f554a 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -3000,6 +3000,20 @@ static dberr_t fts_query_filter_doc_ids(
match =
static_cast<fts_match_t *>(ib_vector_push(query->matched, nullptr));
+ /* find the correct slot of new match */
+ ut_ad(ib_vector_size(query->matched) >= 1);
+ ulint slot = ib_vector_size(query->matched) - 1;
+ while (slot >= 1) {
+ fts_match_t *temp_match =
+ static_cast<fts_match_t *>(ib_vector_get(query->matched, slot - 1));
+ if (doc_id >= temp_match->doc_id) {
+ break;
+ }
+ ib_vector_set(query->matched, slot, temp_match);
+ slot--;
+ }
+ match = static_cast<fts_match_t *>(ib_vector_get(query->matched, slot));
+
Description: A text is split into several words, each word may be contained in many documents. In fts cache, each word use fts_tokenizer_word_t::nodes to store doc ids. However, doc ids in fts_tokenizer_word_t::nodes are not guaranteed to be ordered, but in function 'fts_phrase_or_proximity_search', fts_query_t::matched_array[i] is expected to be sorted by doc id in ascending order. An unordered fts_tokenizer_word_t::nodes may cause incorrect result of fulltext search. How to repeat: source include/have_debug_sync.inc; CREATE TABLE opening_lines ( FTS_DOC_ID BIGINT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY, opening_line varchar(100), author VARCHAR(200), title VARCHAR(200), FULLTEXT (opening_line) WITH PARSER ngram ) ENGINE=InnoDB; connect (con1,localhost,root,,); connect (con2,localhost,root,,); connection con1; set session debug_sync='bgc_before_flush_stage signal session1_before_flush wait_for session2_finish_add_fts_cache'; send INSERT INTO opening_lines(opening_line,author,title) VALUES ('TestOK','author1','title1'); connection con2; set session debug_sync='now wait_for session1_before_flush'; set session debug_sync='fts_instrument_sync_request signal session2_finish_add_fts_cache'; INSERT INTO opening_lines(opening_line,author,title) VALUES ('OKTest','author2','title2'); connection con1; reap; echo has one row in result; select * from opening_lines where opening_line like 'TestOK'; echo has no row in result; select * from opening_lines where MATCH(opening_line) AGAINST('TestOK' in boolean mode); disconnect con1; disconnect con2; drop table opening_lines; Run this test script and you can see the two queries return different results. Suggested fix: The simplest way is sorting fts_query_t::matched_array[i] by doc id in ascending order when pushing data into it. 
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index ff360ea7857..607768f554a 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -3000,6 +3000,20 @@ static dberr_t fts_query_filter_doc_ids( match = static_cast<fts_match_t *>(ib_vector_push(query->matched, nullptr)); + /* find the correct slot of new match */ + ut_ad(ib_vector_size(query->matched) >= 1); + ulint slot = ib_vector_size(query->matched) - 1; + while (slot >= 1) { + fts_match_t *temp_match = + static_cast<fts_match_t *>(ib_vector_get(query->matched, slot - 1)); + if (doc_id >= temp_match->doc_id) { + break; + } + ib_vector_set(query->matched, slot, temp_match); + slot--; + } + match = static_cast<fts_match_t *>(ib_vector_get(query->matched, slot)); +