From 783bd8d2c8934b21c9dcd7312826277f615bfcef Mon Sep 17 00:00:00 2001
From: casazhang <casazhang@tencent.com>
Date: Fri, 17 Sep 2021 14:05:33 +0800
Subject: [PATCH] Bug#104108 CUMULATIVE FREQUENCY MAY GROW OVER 1.0 DUE TO
 FLOAT ERRORS

When singleton histogram is constructed, cumulative frequency
is calculated by adding frequencies of previous buckets and the
current bucket. This may lead to accumulated float errors, and
the last cumulative frequency may be over 1.0 (cf > 1.0) and
cause abnormal behaviors.

This patch accumulates with integer type to avoid float errors.
---
 mysql-test/r/histograms.result                   | 11 +++++++++++
 mysql-test/suite/innodb/r/histogram-debug.result | 10 +++++-----
 mysql-test/t/histograms.test                     | 10 ++++++++++
 sql/histograms/singleton.cc                      |  7 ++++---
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/mysql-test/r/histograms.result b/mysql-test/r/histograms.result
index 757464b2f23..5a5ba1f0a75 100644
--- a/mysql-test/r/histograms.result
+++ b/mysql-test/r/histograms.result
@@ -1969,6 +1969,17 @@ AND OUTR . col_varchar_key IS NULL
 HAVING x <= 3
 ORDER BY OUTR . pk , OUTR . pk;
 x
+#
+# Bug#104108 CUMULATIVE FREQUENCY MAY GROW OVER 1.0 DUE TO FLOAT ERRORS
+#
+ANALYZE TABLE t2 UPDATE HISTOGRAM ON col_datetime WITH 100 BUCKETS;
+Table	Op	Msg_type	Msg_text
+test.t2	histogram	status	Histogram statistics created for column 'col_datetime'.
+SELECT schema_name, table_name, column_name,
+JSON_REMOVE(histogram, '$."last-updated"')
+FROM information_schema.COLUMN_STATISTICS WHERE table_name = 't2' and column_name = 'col_datetime';
+SCHEMA_NAME	TABLE_NAME	COLUMN_NAME	JSON_REMOVE(histogram, '$."last-updated"')
+test	t2	col_datetime	{"buckets": [["1900-01-01 00:00:00.000000", 0.3], ["2001-01-20 12:47:23.000000", 0.35], ["2001-07-25 08:40:24.000000", 0.4], ["2001-10-22 11:13:24.000000", 0.45], ["2002-08-25 20:35:06.000000", 0.5], ["2002-12-08 11:34:58.000000", 0.55], ["2003-03-12 02:00:34.000000", 0.6], ["2003-12-04 11:14:26.000000", 0.65], ["2005-08-15 00:00:00.000000", 0.7], ["2006-09-11 18:25:21.000000", 0.75], ["2007-04-10 12:16:04.000000", 0.8], ["2008-05-16 08:09:06.000000", 0.85], ["2008-07-02 00:00:00.000000", 0.9], ["2009-06-07 13:48:58.000000", 0.95], ["2009-11-07 00:00:00.000000", 1.0]], "data-type": "datetime", "null-values": 0.0, "collation-id": 8, "sampling-rate": 1.0, "histogram-type": "singleton", "number-of-buckets-specified": 100}
 DROP TABLE t1, t2;
 SET @@SESSION.sql_mode=DEFAULT;
 #
diff --git a/mysql-test/suite/innodb/r/histogram-debug.result b/mysql-test/suite/innodb/r/histogram-debug.result
index 4515dc26286..4bee1432db6 100644
--- a/mysql-test/suite/innodb/r/histogram-debug.result
+++ b/mysql-test/suite/innodb/r/histogram-debug.result
@@ -32,7 +32,7 @@ Table	Op	Msg_type	Msg_text
 test.t1	histogram	status	Histogram statistics created for column 'a'.
 SELECT SCHEMA_NAME, TABLE_NAME, COLUMN_NAME, JSON_REMOVE(histogram, '$."last-updated"') FROM information_schema.column_statistics WHERE COLUMN_NAME = 'a';
 SCHEMA_NAME	TABLE_NAME	COLUMN_NAME	JSON_REMOVE(histogram, '$."last-updated"')
-test	t1	a	{"buckets": [[10, 0.1], [20, 0.2], [30, 0.30000000000000004], [40, 0.4], [50, 0.5], [60, 0.6], [70, 0.7], [80, 0.7999999999999999], [90, 0.8999999999999999], [100, 0.9999999999999999]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 0.5, "histogram-type": "singleton", "number-of-buckets-specified": 50}
+test	t1	a	{"buckets": [[10, 0.1], [20, 0.2], [30, 0.3], [40, 0.4], [50, 0.5], [60, 0.6], [70, 0.7], [80, 0.8], [90, 0.9], [100, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 0.5, "histogram-type": "singleton", "number-of-buckets-specified": 50}
 DROP TABLE t1;
 SELECT NAME,COUNT FROM INFORMATION_SCHEMA.INNODB_METRICS WHERE SUBSYSTEM='sampling';
 NAME	COUNT
@@ -97,7 +97,7 @@ Table	Op	Msg_type	Msg_text
 test.t1	histogram	status	Histogram statistics created for column 'a'.
 SELECT SCHEMA_NAME, TABLE_NAME, COLUMN_NAME, JSON_REMOVE(histogram, '$."last-updated"') FROM information_schema.column_statistics WHERE COLUMN_NAME = 'a';
 SCHEMA_NAME	TABLE_NAME	COLUMN_NAME	JSON_REMOVE(histogram, '$."last-updated"')
-test	t1	a	{"buckets": [[10, 0.2], [20, 0.4], [30, 0.6000000000000001], [40, 0.8], [50, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 1.0, "histogram-type": "singleton", "number-of-buckets-specified": 50}
+test	t1	a	{"buckets": [[10, 0.2], [20, 0.4], [30, 0.6], [40, 0.8], [50, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 1.0, "histogram-type": "singleton", "number-of-buckets-specified": 50}
 set global innodb_limit_optimistic_insert_debug = 0;
 DROP TABLE t1;
 SELECT NAME,COUNT FROM INFORMATION_SCHEMA.INNODB_METRICS WHERE SUBSYSTEM='sampling';
@@ -167,7 +167,7 @@ Table	Op	Msg_type	Msg_text
 test.t1	histogram	status	Histogram statistics created for column 'a'.
 SELECT SCHEMA_NAME, TABLE_NAME, COLUMN_NAME, JSON_REMOVE(histogram, '$."last-updated"') FROM information_schema.column_statistics WHERE COLUMN_NAME = 'a';
 SCHEMA_NAME	TABLE_NAME	COLUMN_NAME	JSON_REMOVE(histogram, '$."last-updated"')
-test	t1	a	{"buckets": [[40, 0.2], [50, 0.4], [80, 0.6000000000000001], [90, 0.8], [100, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 0.5, "histogram-type": "singleton", "number-of-buckets-specified": 50}
+test	t1	a	{"buckets": [[40, 0.2], [50, 0.4], [80, 0.6], [90, 0.8], [100, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 0.5, "histogram-type": "singleton", "number-of-buckets-specified": 50}
 SELECT NAME,COUNT FROM INFORMATION_SCHEMA.INNODB_METRICS WHERE SUBSYSTEM='sampling';
 NAME	COUNT
 sampled_pages_read	3
@@ -183,7 +183,7 @@ Table	Op	Msg_type	Msg_text
 test.t1	histogram	status	Histogram statistics created for column 'a'.
 SELECT SCHEMA_NAME, TABLE_NAME, COLUMN_NAME, JSON_REMOVE(histogram, '$."last-updated"') FROM information_schema.column_statistics WHERE COLUMN_NAME = 'a';
 SCHEMA_NAME	TABLE_NAME	COLUMN_NAME	JSON_REMOVE(histogram, '$."last-updated"')
-test	t1	a	{"buckets": [[40, 0.2], [50, 0.4], [80, 0.6000000000000001], [90, 0.8], [100, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 0.5, "histogram-type": "singleton", "number-of-buckets-specified": 50}
+test	t1	a	{"buckets": [[40, 0.2], [50, 0.4], [80, 0.6], [90, 0.8], [100, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 0.5, "histogram-type": "singleton", "number-of-buckets-specified": 50}
 DROP TABLE t1;
 set global innodb_limit_optimistic_insert_debug = 0;
 SET DEBUG='-d,histogram_force_sampling';
@@ -273,7 +273,7 @@ Table	Op	Msg_type	Msg_text
 test.t1	histogram	status	Histogram statistics created for column 'id'.
 SELECT SCHEMA_NAME, TABLE_NAME, COLUMN_NAME, JSON_REMOVE(histogram, '$."last-updated"') FROM information_schema.column_statistics WHERE COLUMN_NAME = 'id';
 SCHEMA_NAME	TABLE_NAME	COLUMN_NAME	JSON_REMOVE(histogram, '$."last-updated"')
-test	t1	id	{"buckets": [[1, 0.2], [2, 0.4], [3, 0.6000000000000001], [4, 0.8], [5, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 1.0, "histogram-type": "singleton", "number-of-buckets-specified": 50}
+test	t1	id	{"buckets": [[1, 0.2], [2, 0.4], [3, 0.6], [4, 0.8], [5, 1.0]], "data-type": "int", "null-values": 0.0, "collation-id": 8, "sampling-rate": 1.0, "histogram-type": "singleton", "number-of-buckets-specified": 50}
 SET DEBUG = "-d, sample_read_sample_half";
 DROP TABLE t1;
 #
diff --git a/mysql-test/t/histograms.test b/mysql-test/t/histograms.test
index 6d6254ae4a5..6087be23bd9 100644
--- a/mysql-test/t/histograms.test
+++ b/mysql-test/t/histograms.test
@@ -1558,6 +1558,16 @@ AND OUTR . col_varchar_key IS NULL
 HAVING x <= 3
 ORDER BY OUTR . pk , OUTR . pk;
 
+
+--echo #
+--echo # Bug#104108 CUMULATIVE FREQUENCY MAY GROW OVER 1.0 DUE TO FLOAT ERRORS
+--echo #
+
+ANALYZE TABLE t2 UPDATE HISTOGRAM ON col_datetime WITH 100 BUCKETS;
+SELECT schema_name, table_name, column_name,
+       JSON_REMOVE(histogram, '$."last-updated"')
+FROM information_schema.COLUMN_STATISTICS WHERE table_name = 't2' and column_name = 'col_datetime';
+
 DROP TABLE t1, t2;
 SET @@SESSION.sql_mode=DEFAULT;
 
diff --git a/sql/histograms/singleton.cc b/sql/histograms/singleton.cc
index d80abb4c2ff..e7bd1092b70 100644
--- a/sql/histograms/singleton.cc
+++ b/sql/histograms/singleton.cc
@@ -117,7 +117,7 @@ bool Singleton<T>::build_histogram(const Value_map<T> &value_map,
       value_map.get_num_null_values() / static_cast<double>(total_count);
 
   // Create buckets with relative frequency, and not absolute frequency.
-  double cumulative_frequency = 0.0;
+  ha_rows cumulative_sum = 0;
 
   /*
     Since we are using a std::map with Mem_root_allocator, we are forced to wrap
@@ -126,8 +126,9 @@ bool Singleton<T>::build_histogram(const Value_map<T> &value_map,
   */
   try {
     for (const auto &node : value_map) {
-      const double frequency = node.second / static_cast<double>(total_count);
-      cumulative_frequency += frequency;
+      cumulative_sum += node.second;
+      const double cumulative_frequency =
+          cumulative_sum / static_cast<double>(total_count);
 
       m_buckets.emplace(node.first, cumulative_frequency);
     }
-- 
2.26.2