Description:
A text is split into several words, and each word may be contained in many documents.
In the FTS cache, each word uses fts_tokenizer_word_t::nodes to store the ids of the documents that contain it. However, the doc ids in fts_tokenizer_word_t::nodes are not guaranteed to be ordered, whereas the function 'fts_phrase_or_proximity_search' expects fts_query_t::matched_array[i] to be sorted by doc id in ascending order. An unordered fts_tokenizer_word_t::nodes may therefore cause full-text searches to return incorrect results.
How to repeat:
# Reproduction script (mysqltest syntax). It relies on debug_sync points,
# hence the have_debug_sync include.
source include/have_debug_sync.inc;
# Table with an explicit FTS_DOC_ID primary key and a full-text index on
# opening_line that uses the ngram parser.
CREATE TABLE opening_lines (
       FTS_DOC_ID BIGINT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
       opening_line varchar(100),
       author VARCHAR(200),
       title VARCHAR(200),
       FULLTEXT (opening_line) WITH PARSER ngram
       ) ENGINE=InnoDB;
connect (con1,localhost,root,,);
connect (con2,localhost,root,,);
connection con1;
# Arm a sync point for con1: on reaching bgc_before_flush_stage it emits
# session1_before_flush and then blocks until session2_finish_add_fts_cache
# is signalled.
set session debug_sync='bgc_before_flush_stage signal session1_before_flush wait_for session2_finish_add_fts_cache';
# 'send' issues the INSERT without waiting for its result, so the script can
# switch to con2 while con1 is parked at the sync point.
send INSERT INTO opening_lines(opening_line,author,title) VALUES ('TestOK','author1','title1');
connection con2;
# Wait until con1 has reached its sync point.
set session debug_sync='now wait_for session1_before_flush';
# Arm con2 so that hitting fts_instrument_sync_request emits the signal con1
# is waiting for.
# NOTE(review): presumably this lets the second row enter the FTS cache before
# the first one, producing out-of-order doc ids; confirm against the server
# source where these sync points are defined.
set session debug_sync='fts_instrument_sync_request signal session2_finish_add_fts_cache';
INSERT INTO opening_lines(opening_line,author,title) VALUES ('OKTest','author2','title2');
connection con1;
# Collect the result of the INSERT issued earlier with 'send'.
reap;
# Both queries below look for the 'TestOK' row. The LIKE query is reported to
# return it, while the boolean-mode full-text query is reported to return
# nothing; that mismatch is the bug being demonstrated.
echo has one row in result;
select * from opening_lines where opening_line like 'TestOK';
echo has no row in result;
select * from opening_lines where MATCH(opening_line) AGAINST('TestOK' in boolean mode);
# Cleanup: close the extra connections and drop the test table.
disconnect con1;
disconnect con2;
drop table opening_lines;
Run this test script and you can see that the two queries return different results: the LIKE query returns one row, while the MATCH ... AGAINST query returns no rows.
Suggested fix:
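The simplest fix is to keep fts_query_t::matched_array[i] sorted by doc id in ascending order as data is pushed into it, i.e. to move each newly pushed match to its correct sorted position instead of leaving it at the end.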
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index ff360ea7857..607768f554a 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -3000,6 +3000,20 @@ static dberr_t fts_query_filter_doc_ids(
       match =
           static_cast<fts_match_t *>(ib_vector_push(query->matched, nullptr));
 
+      /* find the correct slot of new match */
+      ut_ad(ib_vector_size(query->matched) >= 1);
+      ulint slot = ib_vector_size(query->matched) - 1;
+      while (slot >= 1) {
+        fts_match_t *temp_match =
+            static_cast<fts_match_t *>(ib_vector_get(query->matched, slot - 1));
+        if (doc_id >= temp_match->doc_id) {
+          break;
+        }
+        ib_vector_set(query->matched, slot, temp_match);
+        slot--;
+      }
+      match = static_cast<fts_match_t *>(ib_vector_get(query->matched, slot));
+
  
 
 
 
 
Description: A text is split into several words, each word may be contained in many documents. In fts cache, each word use fts_tokenizer_word_t::nodes to store doc ids. However, doc ids in fts_tokenizer_word_t::nodes are not guaranteed to be ordered, but in function 'fts_phrase_or_proximity_search', fts_query_t::matched_array[i] is expected to be sorted by doc id in ascending order. An unordered fts_tokenizer_word_t::nodes may cause incorrect result of fulltext search. How to repeat: source include/have_debug_sync.inc; CREATE TABLE opening_lines ( FTS_DOC_ID BIGINT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY, opening_line varchar(100), author VARCHAR(200), title VARCHAR(200), FULLTEXT (opening_line) WITH PARSER ngram ) ENGINE=InnoDB; connect (con1,localhost,root,,); connect (con2,localhost,root,,); connection con1; set session debug_sync='bgc_before_flush_stage signal session1_before_flush wait_for session2_finish_add_fts_cache'; send INSERT INTO opening_lines(opening_line,author,title) VALUES ('TestOK','author1','title1'); connection con2; set session debug_sync='now wait_for session1_before_flush'; set session debug_sync='fts_instrument_sync_request signal session2_finish_add_fts_cache'; INSERT INTO opening_lines(opening_line,author,title) VALUES ('OKTest','author2','title2'); connection con1; reap; echo has one row in result; select * from opening_lines where opening_line like 'TestOK'; echo has no row in result; select * from opening_lines where MATCH(opening_line) AGAINST('TestOK' in boolean mode); disconnect con1; disconnect con2; drop table opening_lines; Run this test script and you can see the two queries return different results. Suggested fix: The simplest way is sorting fts_query_t::matched_array[i] by doc id in ascending order when pushing data into it. 
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc index ff360ea7857..607768f554a 100644 --- a/storage/innobase/fts/fts0que.cc +++ b/storage/innobase/fts/fts0que.cc @@ -3000,6 +3000,20 @@ static dberr_t fts_query_filter_doc_ids( match = static_cast<fts_match_t *>(ib_vector_push(query->matched, nullptr)); + /* find the correct slot of new match */ + ut_ad(ib_vector_size(query->matched) >= 1); + ulint slot = ib_vector_size(query->matched) - 1; + while (slot >= 1) { + fts_match_t *temp_match = + static_cast<fts_match_t *>(ib_vector_get(query->matched, slot - 1)); + if (doc_id >= temp_match->doc_id) { + break; + } + ib_vector_set(query->matched, slot, temp_match); + slot--; + } + match = static_cast<fts_match_t *>(ib_vector_get(query->matched, slot)); +