diff options
| -rw-r--r-- | mysql-test/main/statistics.result | 15 | ||||
| -rw-r--r-- | mysql-test/main/statistics.test | 5 | ||||
| -rw-r--r-- | sql/sql_statistics.cc | 107 | 
3 files changed, 96 insertions, 31 deletions
| diff --git a/mysql-test/main/statistics.result b/mysql-test/main/statistics.result index ee97ec05391..787f0194532 100644 --- a/mysql-test/main/statistics.result +++ b/mysql-test/main/statistics.result @@ -1768,6 +1768,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;  #  set @save_use_stat_tables=@@use_stat_tables;  set @save_analyze_sample_percentage=@@analyze_sample_percentage; +set @save_hist_size=@@histogram_size;  set session rand_seed1=42;  set session rand_seed2=62;  set use_stat_tables=PREFERABLY; @@ -1800,7 +1801,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a  DECODE_HISTOGRAM(hist_type, histogram)  from mysql.column_stats;  table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram) -t1	id	1	17384	0.0000	4.0000	14.0000	0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086 +t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15706  set analyze_sample_percentage=0.1;  #  # This query will show an innacurate avg_frequency value. @@ -1813,7 +1814,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a  DECODE_HISTOGRAM(hist_type, histogram)  from mysql.column_stats;  table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram) -t1	id	111	17026	0.0000	4.0000	1.0047	0.039,0.098,0.055,0.118,0.078,0.157,0.082,0.118,0.094,0.063,0.098 +t1	id	111	17026	0.0000	4.0000	10.4739	0.13649,0.14922,0.16921,0.21141,0.18355,0.15012  #  # This query will show a better avg_frequency value.  # @@ -1826,7 +1827,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a  DECODE_HISTOGRAM(hist_type, histogram)  from mysql.column_stats;  table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram) -t1	id	1	17384	0.0000	4.0000	3.5736	0.082,0.086,0.086,0.082,0.086,0.145,0.086,0.086,0.082,0.086,0.090 +t1	id	1	17384	0.0000	4.0000	14.0401	0.15566,0.15590,0.15729,0.21538,0.15790,0.15787  set analyze_sample_percentage=0;  #  # Test self adjusting sampling level. @@ -1839,7 +1840,7 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a  DECODE_HISTOGRAM(hist_type, histogram)  from mysql.column_stats;  table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram) -t1	id	1	17384	0.0000	4.0000	7.4523	0.082,0.090,0.086,0.082,0.086,0.145,0.086,0.082,0.086,0.086,0.086 +t1	id	1	17384	0.0000	4.0000	13.9812	0.15860,0.15767,0.21515,0.15573,0.15630,0.15654  #  # Test record estimation is working properly.  # @@ -1858,9 +1859,11 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a  DECODE_HISTOGRAM(hist_type, histogram)  from mysql.column_stats;  table_name	column_name	min_value	max_value	nulls_ratio	avg_length	avg_frequency	DECODE_HISTOGRAM(hist_type, histogram) -t1	id	1	17384	0.0000	4.0000	14.0000	0.082,0.086,0.086,0.086,0.086,0.141,0.086,0.086,0.086,0.086,0.086 +t1	id	1	17384	0.0000	4.0000	14.0000	0.15705,0.15711,0.21463,0.15705,0.15711,0.15706  explain select * from t1;  id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra  1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	229376	 -set use_stat_tables=@save_use_stat_tables;  drop table t1; +set analyze_sample_percentage=@save_analyze_sample_percentage; +set histogram_size=@save_hist_size; +set use_stat_tables=@save_use_stat_tables; diff --git a/mysql-test/main/statistics.test b/mysql-test/main/statistics.test index 84bb7c10059..880f2987b50 100644 --- a/mysql-test/main/statistics.test +++ b/mysql-test/main/statistics.test @@ -912,6 +912,7 @@ set histogram_size=@save_hist_size, histogram_type=@save_hist_type;  --echo #  set @save_use_stat_tables=@@use_stat_tables;  set @save_analyze_sample_percentage=@@analyze_sample_percentage; +set @save_hist_size=@@histogram_size;  set session rand_seed1=42;  set session rand_seed2=62; @@ -986,6 +987,8 @@ select table_name, column_name, min_value, max_value, nulls_ratio, avg_length, a  from mysql.column_stats;  explain select * from t1; -set use_stat_tables=@save_use_stat_tables;  drop table t1; +set analyze_sample_percentage=@save_analyze_sample_percentage; +set histogram_size=@save_hist_size; +set use_stat_tables=@save_use_stat_tables; diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index 27fab974441..8683368e818 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -325,7 +325,7 @@ public:    inline void init(THD *thd, Field * table_field);    inline bool add(); -  inline void finish(ha_rows rows);  +  inline void finish(ha_rows rows, double sample_fraction);    inline void cleanup();  }; @@ -1540,6 +1540,8 @@ class Histogram_builder    uint curr_bucket;        /* number of the current bucket to be built     */    ulonglong count;         /* number of values retrieved                   */    ulonglong count_distinct;    /* number of distinct values retrieved      */ +  /* number of distinct values that occured only once  */ +  ulonglong count_distinct_single_occurence;  public:     Histogram_builder(Field *col, uint col_len, ha_rows rows) @@ -1553,14 +1555,21 @@ public:      bucket_capacity= (double) records / (hist_width + 1);      curr_bucket= 0;      count= 0; -    count_distinct= 0;     +    count_distinct= 0; +    count_distinct_single_occurence= 0;    } -  ulonglong get_count_distinct() { return count_distinct; } +  ulonglong get_count_distinct() const { return count_distinct; } +  ulonglong get_count_single_occurence() const +  { +    return count_distinct_single_occurence; +  }    int next(void *elem, element_count elem_cnt)    {      count_distinct++; +    if (elem_cnt == 1) +      count_distinct_single_occurence++;      count+= elem_cnt;      if (curr_bucket == hist_width)        return 0; @@ -1574,7 +1583,7 @@ public:               count > bucket_capacity * (curr_bucket + 1))        {          histogram->set_prev_value(curr_bucket); -	curr_bucket++; +        curr_bucket++;        }      }      return 0; @@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)    return hist_builder->next(elem, elem_cnt);  } -C_MODE_END +static int count_distinct_single_occurence_walk(void *elem, +                                                element_count count, void *arg) +{ +  ((ulonglong*)arg)[0]+= 1; +  if (count == 1) +    ((ulonglong*)arg)[1]+= 1; +  return 0; +} + +C_MODE_END  /*    The class Count_distinct_field is a helper class used to calculate    the number of distinct values for a column. The class employs the @@ -1611,6 +1629,9 @@ protected:    Unique *tree;       /* The helper object to contain distinct values */    uint tree_key_length; /* The length of the keys for the elements of 'tree */ +  ulonglong distincts; +  ulonglong distincts_single_occurence; +  public:    Count_distinct_field() {} @@ -1662,30 +1683,40 @@ public:    {      return tree->unique_add(table_field->ptr);    } -   +    /*      @brief      Calculate the number of elements accumulated in the container of 'tree'    */ -  ulonglong get_value() -  { -    ulonglong count; -    if (tree->elements == 0) -      return (ulonglong) tree->elements_in_tree(); -    count= 0;   -    tree->walk(table_field->table, count_distinct_walk, (void*) &count); -    return count; +  void walk_tree() +  { +    ulonglong counts[2] = {0, 0}; +    tree->walk(table_field->table, +               count_distinct_single_occurence_walk, counts); +    distincts= counts[0]; +    distincts_single_occurence= counts[1];    }    /*      @brief -    Build the histogram for the elements accumulated in the container of 'tree' +    Calculate a histogram of the tree    */ -  ulonglong get_value_with_histogram(ha_rows rows) +   void walk_tree_with_histogram(ha_rows rows)    {      Histogram_builder hist_builder(table_field, tree_key_length, rows);      tree->walk(table_field->table,  histogram_build_walk, (void *) &hist_builder); -    return hist_builder.get_count_distinct(); +    distincts= hist_builder.get_count_distinct(); +    distincts_single_occurence= hist_builder.get_count_single_occurence(); +  } + +  ulonglong get_count_distinct() +  { +    return distincts; +  } + +  ulonglong get_count_distinct_single_occurence() +  { +    return distincts_single_occurence;    }    /* @@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()  */  inline -void Column_statistics_collected::finish(ha_rows rows) +void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)  {    double val; @@ -2532,16 +2563,44 @@ void Column_statistics_collected::finish(ha_rows rows)    }    if (count_distinct)    { -    ulonglong distincts;      uint hist_size= count_distinct->get_hist_size(); + +    /* Compute cardinality statistics and optionally histogram. */      if (hist_size == 0) -      distincts= count_distinct->get_value(); +      count_distinct->walk_tree();      else -      distincts= count_distinct->get_value_with_histogram(rows - nulls); +      count_distinct->walk_tree_with_histogram(rows - nulls); + +    ulonglong distincts= count_distinct->get_count_distinct(); +    ulonglong distincts_single_occurence= +      count_distinct->get_count_distinct_single_occurence(); +      if (distincts)      { -      val= (double) (rows - nulls) / distincts; -      set_avg_frequency(val);  +      /* +       We use the unsmoothed first-order jackknife estimator" to estimate +       the number of distinct values. +       With a sufficient large percentage of rows sampled (80%), we revert back +       to computing the avg_frequency off of the raw data. +      */ +      if (sample_fraction > 0.8) +        val= (double) (rows - nulls) / distincts; +      else +      { +        if (nulls == 1) +          distincts_single_occurence+= 1; +        if (nulls) +          distincts+= 1; +        double fraction_single_occurence= +          static_cast<double>(distincts_single_occurence) / rows; +        double total_number_of_rows= rows / sample_fraction; +        double estimate_total_distincts= total_number_of_rows / +                (distincts / +                 (1.0 - (1.0 - sample_fraction) * fraction_single_occurence)); +        val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0); +      } + +      set_avg_frequency(val);        set_not_null(COLUMN_STAT_AVG_FREQUENCY);      }      else @@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)        continue;      bitmap_set_bit(table->write_set, table_field->field_index);       if (!rc) -      table_field->collected_stats->finish(rows); +      table_field->collected_stats->finish(rows, sample_fraction);      else        table_field->collected_stats->cleanup();    } | 
