Description:
A text is split into several words, and each word may be contained in many documents.
In the FTS cache, each word uses fts_tokenizer_word_t::nodes to store doc ids. However, the doc ids in fts_tokenizer_word_t::nodes are not guaranteed to be ordered, whereas the function 'fts_phrase_or_proximity_search' expects fts_query_t::matched_array[i] to be sorted by doc id in ascending order. An unordered fts_tokenizer_word_t::nodes may therefore cause incorrect full-text search results.
How to repeat:
source include/have_debug_sync.inc;
CREATE TABLE opening_lines (
FTS_DOC_ID BIGINT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
opening_line varchar(100),
author VARCHAR(200),
title VARCHAR(200),
FULLTEXT (opening_line) WITH PARSER ngram
) ENGINE=InnoDB;
connect (con1,localhost,root,,);
connect (con2,localhost,root,,);
connection con1;
set session debug_sync='bgc_before_flush_stage signal session1_before_flush wait_for session2_finish_add_fts_cache';
send INSERT INTO opening_lines(opening_line,author,title) VALUES ('TestOK','author1','title1');
connection con2;
set session debug_sync='now wait_for session1_before_flush';
set session debug_sync='fts_instrument_sync_request signal session2_finish_add_fts_cache';
INSERT INTO opening_lines(opening_line,author,title) VALUES ('OKTest','author2','title2');
connection con1;
reap;
echo has one row in result;
select * from opening_lines where opening_line like 'TestOK';
echo has no row in result;
select * from opening_lines where MATCH(opening_line) AGAINST('TestOK' in boolean mode);
disconnect con1;
disconnect con2;
drop table opening_lines;
Run this test script and you will see that the two queries return different results: the LIKE query returns one row, while the MATCH ... AGAINST query returns no rows.
Suggested fix:
The simplest fix is to keep fts_query_t::matched_array[i] sorted by doc id in ascending order as data is pushed into it.
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index ff360ea7857..607768f554a 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -3000,6 +3000,20 @@ static dberr_t fts_query_filter_doc_ids(
match =
static_cast<fts_match_t *>(ib_vector_push(query->matched, nullptr));
+ /* find the correct slot of new match */
+ ut_ad(ib_vector_size(query->matched) >= 1);
+ ulint slot = ib_vector_size(query->matched) - 1;
+ while (slot >= 1) {
+ fts_match_t *temp_match =
+ static_cast<fts_match_t *>(ib_vector_get(query->matched, slot - 1));
+ if (doc_id >= temp_match->doc_id) {
+ break;
+ }
+ ib_vector_set(query->matched, slot, temp_match);
+ slot--;
+ }
+ match = static_cast<fts_match_t *>(ib_vector_get(query->matched, slot));
+