From 50726b2322edac8e93489fcb5ef75a5634222d44 Mon Sep 17 00:00:00 2001 From: "cmiller@zippy.cornsilk.net" <> Date: Fri, 22 Dec 2006 15:37:37 -0500 Subject: Bug#22555: STDDEV yields positive result for groups with only one row When only one row was present, the subtraction of nearly the same number resulted in catastrophic cancellation, introducing an error in the VARIANCE calculation near 1e-15. When that was sqrt()ed to get STDDEV, the error was escalated to near 1e-8. The simple fix of testing for a row count of 1 and forcing that to yield 0.0 is insufficient, as two rows of the same value should also have a variance of 0.0, yet the error would be about the same. So, this patch changes the formula that computes the VARIANCE to be one that is not subject to catastrophic cancellation. In addition, it now uses only (faster-than-decimal) floating point numbers to calculate, and renders that to other types on demand. --- sql/item_sum.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'sql/item_sum.h') diff --git a/sql/item_sum.h b/sql/item_sum.h index fe7edd76ecf..989e72654fe 100644 --- a/sql/item_sum.h +++ b/sql/item_sum.h @@ -665,8 +665,10 @@ public: double val_real(); longlong val_int() { /* can't be fix_fields()ed */ return (longlong) rint(val_real()); } - String *val_str(String*); - my_decimal *val_decimal(my_decimal *); + String *val_str(String *str) + { return val_string_from_real(str); } + my_decimal *val_decimal(my_decimal *dec_buf) + { return val_decimal_from_real(dec_buf); } bool is_null() { (void) val_int(); return null_value; } enum_field_types field_type() const { @@ -688,6 +690,14 @@ public: = (sum(ai^2) - 2*sum(a)*sum(a)/count(a) + count(a)*sum(a)^2/count(a)^2 )/count(a) = = (sum(ai^2) - 2*sum(a)^2/count(a) + sum(a)^2/count(a) )/count(a) = = (sum(ai^2) - sum(a)^2/count(a))/count(a) + +But, this falls prey to catastrophic cancellation. 
Instead, use the recurrence formulas + + M_{1} = x_{1}, ~ M_{k} = M_{k-1} + (x_{k} - M_{k-1}) / k newline + S_{1} = 0, ~ S_{k} = S_{k-1} + (x_{k} - M_{k-1}) times (x_{k} - M_{k}) newline + for 2 <= k <= n newline + ital variance = S_{n} / (n-1) + */ class Item_sum_variance : public Item_sum_num @@ -696,9 +706,8 @@ class Item_sum_variance : public Item_sum_num public: Item_result hybrid_type; - double sum, sum_sqr; - my_decimal dec_sum[2], dec_sqr[2]; int cur_dec; + double recurrence_m, recurrence_s; /* Used in recurrence relation. */ ulonglong count; uint f_precision0, f_scale0; uint f_precision1, f_scale1; @@ -707,7 +716,7 @@ public: uint prec_increment; Item_sum_variance(Item *item_par, uint sample_arg) :Item_sum_num(item_par), - hybrid_type(REAL_RESULT), cur_dec(0), count(0), sample(sample_arg) + hybrid_type(REAL_RESULT), count(0), sample(sample_arg) {} Item_sum_variance(THD *thd, Item_sum_variance *item); enum Sumfunctype sum_func () const { return VARIANCE_FUNC; } @@ -727,7 +736,6 @@ public: enum Item_result result_type () const { return REAL_RESULT; } void cleanup() { - cur_dec= 0; count= 0; Item_sum_num::cleanup(); } -- cgit v1.2.1