Description:
buf_pool->flush_rbt is used to help order flush list during recovery, as dirty pages added by `buf_flush_recv_note_modification()` are not in increasing order of their oldest_lsn.
But the flush_rbt could be disordered, the root cause is a buggy cmp function used when initializing flush_rbt.
```
327 /** Compare two modified blocks in the buffer pool. The key for comparison
328 is:
329 key = <oldest_modification, space, offset>
330 This comparison is used to maintian ordering of blocks in the
331 buf_pool->flush_rbt.
332 Note that for the purpose of flush_rbt, we only need to order blocks
333 on the oldest_modification. The other two fields are used to uniquely
334 identify the blocks.
335 @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
336 static int buf_flush_block_cmp(const void *p1, /*!< in: block1 */
337 const void *p2) /*!< in: block2 */
338 {
339 int ret;
...
360
361 /* If oldest_modification is same then decide on the space. */
362 ret = (int)(b2->id.space() - b1->id.space());
363
364 /* Or else decide ordering on the page number. */
365 return (ret ? ret : (int)(b2->id.page_no() - b1->id.page_no()));
366 }
```
As space id and page no are both uint32_t type, the `(int)` cast could overflow if diff between the two pages is bigger than max(int)(2147483647).
This could be triggered with a innodb tablespace larger than 32TB, which is the case we catched in our pro environment.
This could lead to data corruption as flush list could be disordered, and checkpoint/recovery will not work correctly.
How to repeat:
Read the code is enough.
Or run updates with a innodb tablespace > 32TB, and do crash recovery.
Suggested fix:
Besides `buf_flush_block_cmp()`, there are also several other cmp functions need be fixed.
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
index 53471d2b0db..e2ba47acdc0 100644
--- a/storage/innobase/buf/buf0flu.cc
+++ b/storage/innobase/buf/buf0flu.cc
@@ -336,7 +336,6 @@ static void buf_flush_delete_from_flush_rbt(
static int buf_flush_block_cmp(const void *p1, /*!< in: block1 */
const void *p2) /*!< in: block2 */
{
- int ret;
const buf_page_t *b1 = *(const buf_page_t **)p1;
const buf_page_t *b2 = *(const buf_page_t **)p2;
@@ -359,10 +358,20 @@ static int buf_flush_block_cmp(const void *p1, /*!< in: block1 */
}
/* If oldest_modification is same then decide on the space. */
- ret = (int)(b2->id.space() - b1->id.space());
+ if (b2->id.space() > b1->id.space()) {
+ return (1);
+ } else if (b2->id.space() < b1->id.space()) {
+ return (-1);
+ }
/* Or else decide ordering on the page number. */
- return (ret ? ret : (int)(b2->id.page_no() - b1->id.page_no()));
+ if (b2->id.page_no() > b1->id.page_no()) {
+ return (1);
+ } else if (b2->id.page_no() < b1->id.page_no()) {
+ return (-1);
+ }
+
+ return (0);
}
/** Initialize the red-black tree to speed up insertions into the flush_list
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
index 366e2578b60..5ab2dc01f6d 100644
--- a/storage/innobase/fts/fts0que.cc
+++ b/storage/innobase/fts/fts0que.cc
@@ -391,7 +391,13 @@ static inline int fts_freq_doc_id_cmp(const void *p1, /*!< in: id1 */
const fts_doc_freq_t *fq1 = (const fts_doc_freq_t *)p1;
const fts_doc_freq_t *fq2 = (const fts_doc_freq_t *)p2;
- return ((int)(fq1->doc_id - fq2->doc_id));
+ if (fq1->doc_id > fq2->doc_id) {
+ return (1);
+ } else if (fq1->doc_id < fq2->doc_id) {
+ return (-1);
+ }
+
+ return (0);
}
#if 0
diff --git a/storage/innobase/include/fts0types.ic b/storage/innobase/include/fts0types.ic
index 6ef3a1254b4..06f1d092c95 100644
--- a/storage/innobase/include/fts0types.ic
+++ b/storage/innobase/include/fts0types.ic
@@ -58,7 +58,13 @@ static inline int fts_trx_row_doc_id_cmp(const void *p1, /*!< in: id1 */
const fts_trx_row_t *tr1 = (const fts_trx_row_t *)p1;
const fts_trx_row_t *tr2 = (const fts_trx_row_t *)p2;
- return ((int)(tr1->doc_id - tr2->doc_id));
+ if (tr1->doc_id > tr2->doc_id) {
+ return (1);
+ } else if (tr1->doc_id < tr2->doc_id) {
+ return (-1);
+ }
+
+ return (0);
}
/** Compare two fts_ranking_t doc_ids.
@@ -69,7 +75,13 @@ static inline int fts_ranking_doc_id_cmp(const void *p1, /*!< in: id1 */
const fts_ranking_t *rk1 = (const fts_ranking_t *)p1;
const fts_ranking_t *rk2 = (const fts_ranking_t *)p2;
- return ((int)(rk1->doc_id - rk2->doc_id));
+ if (rk1->doc_id > rk2->doc_id) {
+ return (1);
+ } else if (rk1->doc_id < rk2->doc_id) {
+ return (-1);
+ }
+
+ return (0);
}
/** Compare two fts_update_t doc_ids.
@@ -80,7 +92,13 @@ static inline int fts_update_doc_id_cmp(const void *p1, /*!< in: id1 */
const fts_update_t *up1 = (const fts_update_t *)p1;
const fts_update_t *up2 = (const fts_update_t *)p2;
- return ((int)(up1->doc_id - up2->doc_id));
+ if (up1->doc_id > up2->doc_id) {
+ return (1);
+ } else if (up1->doc_id < up2->doc_id) {
+ return (-1);
+ }
+
+ return (0);
}
/** Get the first character's code position for FTS index partition */