4554 Guilhem Bichot 2013-12-02 Bug#16539979 - BASIC SELECT COUNT(DISTINCT ID) IS BROKEN Bug#17867117 - ERROR RESULT WHEN "COUNT + DISTINCT + CASE WHEN" NEED MERGE_WALK Problem: COUNT DISTINCT gives incorrect result when it uses a Unique Tree and its last inserted record has null value. Here is how COUNT DISTINCT is processed, given that this query is not using loose index scan. When a row is produced as a result of joining tables (there is only one table here), we store the SELECTed value in a Unique tree. This allows elimination of any duplicates, and thus implements DISTINCT. When we have processed all rows like this, we walk the Unique tree, counting its elements, in Aggregator_distinct::endup() (tree->walk()); for each element we call Item_sum_count::add(). Such function wants to ignore any NULL value, for that it checks item_sum -> args[0] -> null_value. It is a mistake: when walking the Unique tree, the value to be aggregated is not item_sum ->args[0] but rather table -> field[0]. Solution: instead of item_sum -> args[0] -> null_value, use arg_is_null(), which knows where to look (like in fix for bug 57932). As a consequence of this solution, we have to make arg_is_null() a little more general: 1) Because it was so far only used for AVG() (which always has a single argument), this function was looking at a single argument; now that it has to work with COUNT(DISTINCT expression1,expression2), it must look at all arguments. 2) Because we start using arg_is_null () for COUNT(DISTINCT), i.e. in Item_sum_count::add (), it implies that we are also using it for COUNT(no DISTINCT) (same add ()). For COUNT(no DISTINCT), the nullness to check is that of item_sum -> args[0]. But the null_value of such item is reliable only if val_*() has been called on it. So far arg_is_null() was always used after a call to arg_val*(), so could rely on null_value; but for COUNT, there is no call to arg_val*(), so arg_is_null() has to call is_null() instead. Note: comments starting with "Reviewers" will not be pushed. Testcase for 16539979 by Neeraj. Testcase for 17867117 contributed by Xiaobin Lin from Taobao. modified: mysql-test/r/distinct.result mysql-test/t/distinct.test sql/item_sum.cc sql/item_sum.h === modified file 'mysql-test/r/distinct.result' --- a/mysql-test/r/distinct.result +++ b/mysql-test/r/distinct.result @@ -804,4 +804,44 @@ c 11112222 33334444 DROP TABLE t1; +# +# Bug#16539979 BASIC SELECT COUNT(DISTINCT ID) IS BROKEN. +# Bug#17867117 ERROR RESULT WHEN "COUNT + DISTINCT + CASE WHEN" NEED MERGE_WALK +# +SET @tmp_table_size_save= @@tmp_table_size; +SET @@tmp_table_size= 1024; +CREATE TABLE t1 (a INT); +INSERT INTO t1 VALUES (1),(2),(3),(4),(5),(6),(7),(8); +INSERT INTO t1 SELECT a+8 FROM t1; +INSERT INTO t1 SELECT a+16 FROM t1; +INSERT INTO t1 SELECT a+32 FROM t1; +INSERT INTO t1 SELECT a+64 FROM t1; +INSERT INTO t1 VALUE(NULL); +SELECT COUNT(DISTINCT a) FROM t1; +COUNT(DISTINCT a) +128 +SELECT COUNT(DISTINCT (a+0)) FROM t1; +COUNT(DISTINCT (a+0)) +128 +DROP TABLE t1; +create table tb( +id int auto_increment primary key, +v varchar(32)) +engine=myisam charset=gbk; +insert into tb(v) values("aaa"); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +update tb set v=concat(v, id); +select count(distinct case when id<=64 then id end) from tb; +count(distinct case when id<=64 then id end) +64 +select count(distinct case when id<=63 then id end) from tb; +count(distinct case when id<=63 then id end) +63 +drop table tb; +SET @@tmp_table_size= @tmp_table_size_save; End of 5.5 tests === modified file 'mysql-test/t/distinct.test' --- a/mysql-test/t/distinct.test +++ b/mysql-test/t/distinct.test @@ -625,5 +625,42 @@ INSERT INTO t1 VALUES (1111, 2222), (333 SELECT DISTINCT CONCAT(a,b) AS c FROM t1 ORDER BY 1; DROP TABLE t1; +--echo # +--echo # Bug#16539979 BASIC SELECT COUNT(DISTINCT ID) IS BROKEN. +--echo # Bug#17867117 ERROR RESULT WHEN "COUNT + DISTINCT + CASE WHEN" NEED MERGE_WALK +--echo # + +SET @tmp_table_size_save= @@tmp_table_size; +SET @@tmp_table_size= 1024; + +CREATE TABLE t1 (a INT); +INSERT INTO t1 VALUES (1),(2),(3),(4),(5),(6),(7),(8); +INSERT INTO t1 SELECT a+8 FROM t1; +INSERT INTO t1 SELECT a+16 FROM t1; +INSERT INTO t1 SELECT a+32 FROM t1; +INSERT INTO t1 SELECT a+64 FROM t1; +INSERT INTO t1 VALUE(NULL); +SELECT COUNT(DISTINCT a) FROM t1; +SELECT COUNT(DISTINCT (a+0)) FROM t1; +DROP TABLE t1; + +create table tb( +id int auto_increment primary key, +v varchar(32)) +engine=myisam charset=gbk; +insert into tb(v) values("aaa"); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); +insert into tb(v) (select v from tb); + +update tb set v=concat(v, id); +select count(distinct case when id<=64 then id end) from tb; +select count(distinct case when id<=63 then id end) from tb; +drop table tb; + +SET @@tmp_table_size= @tmp_table_size_save; --echo End of 5.5 tests === modified file 'sql/item_sum.cc' --- a/sql/item_sum.cc +++ b/sql/item_sum.cc @@ -1052,18 +1052,25 @@ void Aggregator_distinct::endup() endup_done= TRUE; } } - else - { - /* - We don't have a tree only if 'setup()' hasn't been called; - this is the case of sql_select.cc:return_zero_rows. - */ - if (tree) - table->field[0]->set_notnull(); - } + /* + Reviewers: please start by reading change in Item_sum_count::add(), the + rest is consequences. + */ + /* + We don't have a tree only if 'setup()' hasn't been called; + this is the case of sql_executor.cc:return_zero_rows. + */ if (tree && !endup_done) { + /* + All tree's values are not NULL. + Note that value of field is changed as we walk the tree, in + Aggregator_distinct::unique_walk_function, but it's always not NULL. + Reviewers: with this change, a call to + arg_is_null() (which looks at table->field[0]) is reliable. + */ + table->field[0]->set_notnull(); /* go over the tree of distinct keys and calculate the aggregate value */ use_distinct_values= TRUE; tree->walk(item_sum_distinct_walk, (void*) this); @@ -1334,7 +1341,7 @@ bool Item_sum_sum::add() { my_decimal value; const my_decimal *val= aggr->arg_val_decimal(&value); - if (!aggr->arg_is_null()) + if (!aggr->arg_is_null(true)) { my_decimal_add(E_DEC_FATAL_ERROR, dec_buffs + (curr_dec_buff^1), val, dec_buffs + curr_dec_buff); @@ -1345,7 +1352,7 @@ bool Item_sum_sum::add() else { sum+= aggr->arg_val_real(); - if (!aggr->arg_is_null()) + if (!aggr->arg_is_null(true)) null_value= 0; } DBUG_RETURN(0); @@ -1455,9 +1462,35 @@ double Aggregator_simple::arg_val_real() } -bool Aggregator_simple::arg_is_null() +bool Aggregator_simple::arg_is_null(bool use_null_value) { - return item_sum->args[0]->null_value; + /* + Reviewers: Before the patch, arg_is_null() was used only by AVG, which always + has a single argument. Now it's also used for COUNT, which can have + several arguments, like + COUNT(DISTINCT two expressions). + So we look at _all_ args, like the code removed from item_sum_count::add() + did. + */ + Item **item= item_sum->args; + const uint item_count= item_sum->arg_count; + if (use_null_value) + { + for (uint i= 0; i < item_count; i++) + { + if (item[i]->null_value) + return true; + } + } + else + { + for (uint i= 0; i < item_count; i++) + { + if (item[i]->maybe_null && item[i]->is_null()) + return true; + } + } + return false; } @@ -1475,10 +1508,34 @@ double Aggregator_distinct::arg_val_real } -bool Aggregator_distinct::arg_is_null() +bool Aggregator_distinct::arg_is_null(bool use_null_value) { - return use_distinct_values ? table->field[0]->is_null() : - item_sum->args[0]->null_value; + if (use_distinct_values) + { + const bool rc= table->field[0]->is_null(); + DBUG_ASSERT(!rc); // NULLs are never stored in 'tree' + return rc; + } + /* + Reviewers: + + As usual, item_sum->args[0]->null_value is out-of-date unless + it has been set by side-effect of arg_val*() or val_*() before. + Before this patch, arg_is_null() was always called after arg_val*() so + null_value was correct. But now, in item_sum_count::add(), arg_val*() + is not called. So item_sum->args[0]->null_value may be out-of-date. + I could simply always use is_null (), but for some calls to arg_is_null() + (the ones which existed before this patch - Item_sum_avg for example), looking + at null_value is good enough (because arg_val*() was already called and + has set null_value); in that case I prefer + to keep reading null_value, it is more efficient than calling + is_null () (which, if you look at its implementations, may call val_*() + again, which is really a waste of time, if val_*() has already been + called by arg_val*()!). That's the reason for the new argument "use_null_value". + */ + return use_null_value ? + item_sum->args[0]->null_value : + (item_sum->args[0]->maybe_null && item_sum->args[0]->is_null()); } @@ -1496,11 +1553,11 @@ void Item_sum_count::clear() bool Item_sum_count::add() { - for (uint i=0; imaybe_null && args[i]->is_null()) - return 0; - } + // Reviewers: same idea as fix for http://bugs.mysql.com/bug.php?id=57932 + // i.e. use a call which looks at the right place (which is table->field[0] + // if DISTINCT). + if (aggr->arg_is_null(false)) + return 0; count++; return 0; } @@ -1591,7 +1648,7 @@ bool Item_sum_avg::add() { if (Item_sum_sum::add()) return TRUE; - if (!aggr->arg_is_null()) + if (!aggr->arg_is_null(true)) count++; return FALSE; } === modified file 'sql/item_sum.h' --- a/sql/item_sum.h +++ b/sql/item_sum.h @@ -58,19 +58,8 @@ protected: /* the aggregate function class to act on */ Item_sum *item_sum; - /** - When feeding back the data in endup() from Unique/temp table back to - Item_sum::add() methods we must read the data from Unique (and not - recalculate the functions that are given as arguments to the aggregate - function. - This flag is to tell the add() methods to take the data from the Unique - instead by calling the relevant val_..() method - */ - - bool use_distinct_values; - public: - Aggregator (Item_sum *arg): item_sum(arg), use_distinct_values(FALSE) {} + Aggregator (Item_sum *arg): item_sum(arg) {} virtual ~Aggregator () {} /* Keep gcc happy */ enum Aggregator_type { SIMPLE_AGGREGATOR, DISTINCT_AGGREGATOR }; @@ -107,10 +96,16 @@ public: /** Floating point value of being-aggregated argument */ virtual double arg_val_real() = 0; /** - NULLness of being-aggregated argument; can be called only after - arg_val_decimal() or arg_val_real(). + NULLness of being-aggregated argument. + + @param use_null_value Optimization: to determine if the argument is NULL + we must, in the general case, call is_null() on it, which itself might + call val_*() on it, which might be costly. If you just have called + arg_val*(), you can pass use_null_value=true; this way, arg_is_null() + might avoid is_null() and instead do a cheap read of the Item's null_value + (updated by arg_val*()). */ - virtual bool arg_is_null() = 0; + virtual bool arg_is_null(bool use_null_value) = 0; }; @@ -480,7 +475,7 @@ public: Item *get_arg(uint i) { return args[i]; } Item *set_arg(uint i, THD *thd, Item *new_val); - uint get_arg_count() { return arg_count; } + uint get_arg_count() const { return arg_count; } /* Initialization of distinct related members */ void init_aggregator() @@ -607,10 +602,21 @@ class Aggregator_distinct : public Aggre */ bool always_null; + /** + When feeding back the data in endup() from Unique/temp table back to + Item_sum::add() methods we must read the data from Unique (and not + recalculate the functions that are given as arguments to the aggregate + function. + This flag is to tell the arg_*() methods to take the data from the Unique + instead of calling the relevant val_..() method. + */ + // Reviewers: I move it here, because only this subclass used it. + bool use_distinct_values; + public: Aggregator_distinct (Item_sum *sum) : Aggregator(sum), table(NULL), tmp_table_param(NULL), tree(NULL), - always_null(FALSE) {} + always_null(false), use_distinct_values(false) {} virtual ~Aggregator_distinct (); Aggregator_type Aggrtype() { return DISTINCT_AGGREGATOR; } @@ -620,7 +626,7 @@ public: void endup(); virtual my_decimal *arg_val_decimal(my_decimal * value); virtual double arg_val_real(); - virtual bool arg_is_null(); + virtual bool arg_is_null(bool use_null_value); bool unique_walk_function(void *element); static int composite_key_cmp(void* arg, uchar* key1, uchar* key2); @@ -646,7 +652,7 @@ public: void endup() {}; virtual my_decimal *arg_val_decimal(my_decimal * value); virtual double arg_val_real(); - virtual bool arg_is_null(); + virtual bool arg_is_null(bool use_null_value); };