commit 55468726b5e61da927f003f89a47ca62b1c6f8ce
Author: Dmitry Lenev <dmitry.lenev@percona.com>
Date:   Mon Jul 24 19:34:30 2023 +0200

    Bug #96236 "Big intrinsic tmp InnoDB table make InnoDB engine get totally stuck".
    
    Some queries that used a big intrinsic temporary InnoDB table might have
    led to InnoDB SE stall (especially when InnoDB buffer pool was small).
    Also concurrent execution of those queries could have led to the same
    problem or degraded performance.
    
    The problem occured for queries that scanned through such intrinsic tables
    using auto-created secondary key rather than clustered key.
    
    What happened was that in such cases some of the pages belonging to
    intrinsic table and accessed by the query stayed fixed in buffer pool
    until the end of statement (and not until the end of operation on
    specific row as it happens for normal InnoDB tables or only while they
    are inspected as it happens for scans of intrinsic tables over clustered
    key). As result these buffer fixed pages might have consumed the whole
    buffer pool at some point and further statement progress was blocked.
    
    Such behavior was result of combination of the following factors:
    1) operations on intrinsic temporary tables are optimized and use single
       mini-transaction for all row operations in the statement (while
       for normal tables commit of mini-transaction happens after
       processing of each record.
    2) normally, all buffer fixed pages are unfixed from memory only at
       commit of mini-transaction.
    3) process of scanning over a key is optimized to immediately unfix pages
       once the cursor leaves them. However, when scan happens over a secondary
       key additional lookups on the clustered key are performed. And buffer
       pages which are fixed by these additional lookups stay fixed until
       the end of mini-transaction.
    
    This fix solves the problem by ensuring that we commit mini-transaction
    which is used for access to initrinsic temporary table after performing
    operations on each 100 rows. This unfixes buffer pages touched by
    mini-transaction making them available for further use by other or this
    statements.
    
    This patch might lead to 1-3% performance reduction in some of scenarios
    which heavily use intrinsic temporary tables. OTOH we also have seen
    significant performance improvements in other scenarios where big
    intrinsic tables were used. So we consider this acceptable.

diff --git a/mysql-test/suite/innodb/r/optimizer_temporary_table_2.result b/mysql-test/suite/innodb/r/optimizer_temporary_table_2.result
new file mode 100644
index 00000000000..8e0db301901
--- /dev/null
+++ b/mysql-test/suite/innodb/r/optimizer_temporary_table_2.result
@@ -0,0 +1,55 @@
+#
+# Test for bug#96236 "Big intrinsic tmp InnoDB table make InnoDB
+# engine get totally stuck".
+#
+# Check that InnoDB buffer pool size is set to minimum/5Mb and thus is
+# small enough to trigger original problem. If it ever changes the size
+# of source table below needs to be changed proportionately.
+include/assert.inc [InnoDB Buffer Pool has minimal size (5Mb).]
+CREATE TABLE source (id INT PRIMARY KEY AUTO_INCREMENT, a VARCHAR(100), b VARCHAR(100), c VARCHAR(100), d VARCHAR(100), e VARCHAR(100), hdl_source_id INT, hdl_created_date DATETIME);
+CREATE TABLE dest LIKE source;
+# Populate source table with 64K records.
+SELECT count(*) FROM source;
+count(*)
+65536
+# Run query that creates big intrinsic temporary table. Prior to the
+# fix it has exhausted buffer pool by keeping all its pages buffer
+# fixed in memory, so query stalled.
+#
+# Note that to trigger problematic scenario query must perform scan
+# over <auto_key0> secondary key on intrinsic temporary table
+# (and auxiliary lookups to confirm record status on clustered key).
+# In this case the secondary key scan fixes and unfixes pages
+# through which it iterates, while pages accessed during auxiliary
+# lookups on clustered key stay buffer fixed until mtr commit.
+#
+# In cases when intrinsic temporary table is read using clustered
+# key only the problem didn't occur, as similary to previous case
+# key scan fixes and unfixes pages through which it iterates, but
+# there are no auxiliary lookups to buffer fix additional pages.
+EXPLAIN INSERT INTO dest SELECT id,a,b,c,d,e,hdl_source_id,hdl_created_date FROM (SELECT *, ROW_NUMBER() OVER(PARTITION BY id ORDER BY hdl_source_id DESC, hdl_created_date DESC) rn FROM source) a WHERE rn=1;
+id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	INSERT	dest	NULL	ALL	NULL	NULL	NULL	NULL	#	NULL	NULL
+1	PRIMARY	<derived2>	NULL	ref	<auto_key0>	<auto_key0>	8	const	#	100.00	NULL
+2	DERIVED	source	NULL	ALL	NULL	NULL	NULL	NULL	#	100.00	Using filesort
+Warnings:
+Note	3598	To get information about window functions use EXPLAIN FORMAT=JSON
+Note	1003	insert into `test`.`dest` /* select#1 */ select `a`.`id` AS `id`,`a`.`a` AS `a`,`a`.`b` AS `b`,`a`.`c` AS `c`,`a`.`d` AS `d`,`a`.`e` AS `e`,`a`.`hdl_source_id` AS `hdl_source_id`,`a`.`hdl_created_date` AS `hdl_created_date` from (/* select#2 */ select `test`.`source`.`id` AS `id`,`test`.`source`.`a` AS `a`,`test`.`source`.`b` AS `b`,`test`.`source`.`c` AS `c`,`test`.`source`.`d` AS `d`,`test`.`source`.`e` AS `e`,`test`.`source`.`hdl_source_id` AS `hdl_source_id`,`test`.`source`.`hdl_created_date` AS `hdl_created_date`,row_number() OVER (PARTITION BY `test`.`source`.`id` ORDER BY `test`.`source`.`hdl_source_id` desc,`test`.`source`.`hdl_created_date` desc )  AS `rn` from `test`.`source`) `a` where (`a`.`rn` = 1)
+INSERT INTO dest SELECT id,a,b,c,d,e,hdl_source_id,hdl_created_date FROM (SELECT *, ROW_NUMBER() OVER(PARTITION BY id ORDER BY hdl_source_id DESC, hdl_created_date DESC) rn FROM source) a WHERE rn=1;
+# Queries that used several cursors over intrinsic temporary tables
+# were also affected by the same problem. Again problem was visible
+# only if scan over secondary key was used.
+EXPLAIN WITH qn AS (SELECT *, ROW_NUMBER() OVER(PARTITION BY id ORDER BY hdl_source_id DESC, hdl_created_date DESC) rn FROM source) SELECT (SELECT COUNT(*) FROM qn WHERE rn=1), (SELECT COUNT(*) FROM qn WHERE rn=2);
+id	select_type	table	partitions	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	PRIMARY	NULL	NULL	NULL	NULL	NULL	NULL	NULL	#	NULL	No tables used
+4	SUBQUERY	<derived3>	NULL	ref	<auto_key0>	<auto_key0>	8	const	#	100.00	NULL
+2	SUBQUERY	<derived3>	NULL	ref	<auto_key0>	<auto_key0>	8	const	#	100.00	NULL
+3	DERIVED	source	NULL	ALL	NULL	NULL	NULL	NULL	#	100.00	Using filesort
+Warnings:
+Note	3598	To get information about window functions use EXPLAIN FORMAT=JSON
+Note	1003	with `qn` as (/* select#3 */ select `test`.`source`.`id` AS `id`,`test`.`source`.`a` AS `a`,`test`.`source`.`b` AS `b`,`test`.`source`.`c` AS `c`,`test`.`source`.`d` AS `d`,`test`.`source`.`e` AS `e`,`test`.`source`.`hdl_source_id` AS `hdl_source_id`,`test`.`source`.`hdl_created_date` AS `hdl_created_date`,row_number() OVER (PARTITION BY `test`.`source`.`id` ORDER BY `test`.`source`.`hdl_source_id` desc,`test`.`source`.`hdl_created_date` desc )  AS `rn` from `test`.`source`) /* select#1 */ select (/* select#2 */ select count(0) from `qn` where (`qn`.`rn` = 1)) AS `(SELECT COUNT(*) FROM qn WHERE rn=1)`,(/* select#4 */ select count(0) from `qn` where (`qn`.`rn` = 2)) AS `(SELECT COUNT(*) FROM qn WHERE rn=2)`
+WITH qn AS (SELECT *, ROW_NUMBER() OVER(PARTITION BY id ORDER BY hdl_source_id DESC, hdl_created_date DESC) rn FROM source) SELECT (SELECT COUNT(*) FROM qn WHERE rn=1), (SELECT COUNT(*) FROM qn WHERE rn=2);
+(SELECT COUNT(*) FROM qn WHERE rn=1)	(SELECT COUNT(*) FROM qn WHERE rn=2)
+65536	0
+# Clean-up.
+DROP TABLES dest, source;
diff --git a/mysql-test/suite/innodb/t/optimizer_temporary_table_2-master.opt b/mysql-test/suite/innodb/t/optimizer_temporary_table_2-master.opt
new file mode 100644
index 00000000000..67c8423cf2a
--- /dev/null
+++ b/mysql-test/suite/innodb/t/optimizer_temporary_table_2-master.opt
@@ -0,0 +1 @@
+--innodb_buffer_pool_size=5M
diff --git a/mysql-test/suite/innodb/t/optimizer_temporary_table_2.test b/mysql-test/suite/innodb/t/optimizer_temporary_table_2.test
new file mode 100644
index 00000000000..f333cd4a24c
--- /dev/null
+++ b/mysql-test/suite/innodb/t/optimizer_temporary_table_2.test
@@ -0,0 +1,58 @@
+--echo #
+--echo # Test for bug#96236 "Big intrinsic tmp InnoDB table make InnoDB
+--echo # engine get totally stuck".
+--echo #
+
+--echo # Check that InnoDB buffer pool size is set to minimum/5Mb and thus is
+--echo # small enough to trigger original problem. If it ever changes the size
+--echo # of source table below needs to be changed proportionately.
+--let $innodb_buffer_pool_size = `SELECT @@global.innodb_buffer_pool_size`
+--let $assert_text = InnoDB Buffer Pool has minimal size (5Mb).
+--let $assert_cond = $innodb_buffer_pool_size = 5242880
+--source include/assert.inc
+
+CREATE TABLE source (id INT PRIMARY KEY AUTO_INCREMENT, a VARCHAR(100), b VARCHAR(100), c VARCHAR(100), d VARCHAR(100), e VARCHAR(100), hdl_source_id INT, hdl_created_date DATETIME);
+CREATE TABLE dest LIKE source;
+
+--echo # Populate source table with 64K records.
+--disable_query_log
+INSERT INTO source (a, b, c, d, e, hdl_source_id, hdl_created_date) values ('aaaaaaaaa', 'bbbbbbbbbb', 'cccccccccccccccc', 'dddddddddddddddddddddddddddd', 'eeeeeeeeeeeeeeeeee', 1, NOW());
+--let $i= 0
+while ($i < 16)
+{
+  INSERT INTO source (a, b, c, d, e, hdl_source_id, hdl_created_date) SELECT a, b, c, d, e, hdl_source_id + 1, NOW() FROM source;
+  --inc $i
+}
+--enable_query_log
+SELECT count(*) FROM source;
+
+--echo # Run query that creates big intrinsic temporary table. Prior to the
+--echo # fix it has exhausted buffer pool by keeping all its pages buffer
+--echo # fixed in memory, so query stalled.
+--echo #
+--echo # Note that to trigger problematic scenario query must perform scan
+--echo # over <auto_key0> secondary key on intrinsic temporary table
+--echo # (and auxiliary lookups to confirm record status on clustered key).
+--echo # In this case the secondary key scan fixes and unfixes pages
+--echo # through which it iterates, while pages accessed during auxiliary
+--echo # lookups on clustered key stay buffer fixed until mtr commit.
+--echo #
+--echo # In cases when intrinsic temporary table is read using clustered
+--echo # key only the problem didn't occur, as similary to previous case
+--echo # key scan fixes and unfixes pages through which it iterates, but
+--echo # there are no auxiliary lookups to buffer fix additional pages.
+--let $query = INSERT INTO dest SELECT id,a,b,c,d,e,hdl_source_id,hdl_created_date FROM (SELECT *, ROW_NUMBER() OVER(PARTITION BY id ORDER BY hdl_source_id DESC, hdl_created_date DESC) rn FROM source) a WHERE rn=1
+--replace_column 10 #
+--eval EXPLAIN $query
+--eval $query
+
+--echo # Queries that used several cursors over intrinsic temporary tables
+--echo # were also affected by the same problem. Again problem was visible
+--echo # only if scan over secondary key was used.
+--let $query = WITH qn AS (SELECT *, ROW_NUMBER() OVER(PARTITION BY id ORDER BY hdl_source_id DESC, hdl_created_date DESC) rn FROM source) SELECT (SELECT COUNT(*) FROM qn WHERE rn=1), (SELECT COUNT(*) FROM qn WHERE rn=2)
+--replace_column 10 #
+--eval EXPLAIN $query
+--eval $query
+
+--echo # Clean-up.
+DROP TABLES dest, source;
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index 02502a1cdae..6d8e83fd4b5 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -984,14 +984,20 @@ struct rec_cache_t {
   size_t nullable_cols{0};
 };
 
+/** Maximum number of records we insert or select from intrinsic table
+before committing mtr. */
+constexpr uint32_t MAX_INTRINSIC_MTR_RECORDS = 100;
+
 /** Cache position of last inserted or selected record by caching record
 and holding reference to the block where record resides.
-Note: We don't commit mtr and hold it beyond a transaction lifetime as this is
-a special case (intrinsic table) that are not shared across connection. */
+Note: We don't commit mtr (unless mtr_records reaches MAX_INTRINSIC_MTR_RECORDS
+limit) and hold it beyond a transaction lifetime as this is a  special case
+(intrinsic table) that are not shared across connection. */
 class last_ops_cur_t {
  public:
   /** Constructor */
-  last_ops_cur_t() : rec(), block(), mtr(), disable_caching(), invalid() {
+  last_ops_cur_t()
+      : rec(), block(), mtr(), disable_caching(), invalid(), mtr_records(0) {
     /* Do Nothing. */
   }
 
@@ -1003,6 +1009,7 @@ class last_ops_cur_t {
     rec = nullptr;
     block = nullptr;
     invalid = false;
+    mtr_records = 0;
   }
 
  public:
@@ -1022,6 +1029,11 @@ class last_ops_cur_t {
   split then invalidate the cached position as it would be no more
   remain valid. Will be re-cached on post-split insert. */
   bool invalid;
+
+  /** Number of records which were inserted or selected into from
+  intrinsic table within this mtr. Needed to limit number of intrinsic
+  table records inserted/selected within single mtr. */
+  uint32_t mtr_records;
 };
 
 /** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc
index d4d8dcb0507..fe97bb9f2aa 100644
--- a/storage/innobase/row/row0ins.cc
+++ b/storage/innobase/row/row0ins.cc
@@ -2626,9 +2626,15 @@ static dberr_t row_ins_sorted_clust_index_entry(ulint mode, dict_index_t *index,
   mtr = &index->last_ins_cur->mtr;
 
   /* Search for position if tree needs to be split or if last position
-  is not cached. */
+  is not cached. Also commit mtr and do the search if we have inserted
+  more than MAX_INTRINSIC_MTR_RECORDS within this mtr. The latter is
+  done to unfix buffer pages used by the mtr. Otherwise statements
+  (even single statement in extreme case) using intrinsic tables can
+  consume the whole buffer pool by their buffer fixed pages causing
+  performance problems or even stalls. */
   if (mode == BTR_MODIFY_TREE || index->last_ins_cur->rec == nullptr ||
-      index->last_ins_cur->disable_caching) {
+      index->last_ins_cur->disable_caching ||
+      index->last_ins_cur->mtr_records > MAX_INTRINSIC_MTR_RECORDS) {
     /* Commit the previous mtr. */
     index->last_ins_cur->release();
 
@@ -2700,6 +2706,7 @@ static dberr_t row_ins_sorted_clust_index_entry(ulint mode, dict_index_t *index,
         index->last_ins_cur->rec = insert_rec;
 
         index->last_ins_cur->block = cursor.page_cur.block;
+        index->last_ins_cur->mtr_records++;
       } else {
         index->last_ins_cur->release();
       }
diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc
index b3917453c6f..f7701e5a5c1 100644
--- a/storage/innobase/row/row0sel.cc
+++ b/storage/innobase/row/row0sel.cc
@@ -3966,7 +3966,8 @@ dberr_t row_search_no_mvcc(byte *buf, page_cur_mode_t mode,
   If search key is specified, cursor is open using the key else
   cursor is open to return all the records. */
   if (direction != 0) {
-    if (prebuilt->m_temp_read_shared && !prebuilt->m_temp_tree_modified) {
+    if (prebuilt->m_temp_read_shared && !prebuilt->m_temp_tree_modified &&
+        !(index->last_sel_cur->mtr_records > MAX_INTRINSIC_MTR_RECORDS)) {
       if (!mtr->is_active()) {
         mtr_start(mtr);
 
@@ -3992,10 +3993,17 @@ dberr_t row_search_no_mvcc(byte *buf, page_cur_mode_t mode,
         return (err); /* purecov: inspected */
       }
 
-    } else if (index->last_sel_cur->invalid || prebuilt->m_temp_tree_modified) {
+    } else if (index->last_sel_cur->invalid || prebuilt->m_temp_tree_modified ||
+               index->last_sel_cur->mtr_records > MAX_INTRINSIC_MTR_RECORDS) {
     block_relocated:
       /* Index tree has changed and so active cached cursor is no more valid.
-      Re-set it based on the last selected position. */
+      Re-set it based on the last selected position.
+      Also commit mtr and reset the cursor if we have selected more than
+      MAX_INTRINSIC_MTR_RECORDS within this mtr. The latter is done to unfix
+      buffer pages used by the mtr. Otherwise statements (even single
+      statement in extreme case) using intrinsic tables can consume the whole
+      buffer pool by their buffer fixed pages causing performance problems
+      or even stalls. */
       index->last_sel_cur->release();
       prebuilt->m_temp_tree_modified = false;
 
@@ -4143,6 +4151,7 @@ dberr_t row_search_no_mvcc(byte *buf, page_cur_mode_t mode,
     ut_ad(err == DB_SUCCESS);
     index->last_sel_cur->rec = pcur->get_rec();
     index->last_sel_cur->block = pcur->get_block();
+    index->last_sel_cur->mtr_records++;
 
     /* This is needed in order to restore the cursor if index
     structure changes while SELECT is still active. */