diff options
| author | Vicențiu Ciorbaru <vicentiu@mariadb.org> | 2019-02-17 00:44:58 +0200 | 
|---|---|---|
| committer | Vicențiu Ciorbaru <vicentiu@mariadb.org> | 2019-02-19 12:01:21 +0200 | 
| commit | 764429271dc94e5af08706dd0ac6fb962c15726d (patch) | |
| tree | 23f2a60bd86d0b5b68476a98d8db5ad0136dfac3 /sql/sql_statistics.cc | |
| parent | f0773b7842fcfd2032b630b4cfc7404a29d12a8f (diff) | |
| download | mariadb-git-764429271dc94e5af08706dd0ac6fb962c15726d.tar.gz | |
Implement avg_frequency unsmoothed jacknife estimator
When sampling data through ANALYZE TABLE, use the estimator to get a
better estimation of avg_frequency instead of just using the raw sampled data.
Diffstat (limited to 'sql/sql_statistics.cc')
| -rw-r--r-- | sql/sql_statistics.cc | 107 | 
1 files changed, 83 insertions, 24 deletions
| diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index 27fab974441..8683368e818 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -325,7 +325,7 @@ public:    inline void init(THD *thd, Field * table_field);    inline bool add(); -  inline void finish(ha_rows rows);  +  inline void finish(ha_rows rows, double sample_fraction);    inline void cleanup();  }; @@ -1540,6 +1540,8 @@ class Histogram_builder    uint curr_bucket;        /* number of the current bucket to be built     */    ulonglong count;         /* number of values retrieved                   */    ulonglong count_distinct;    /* number of distinct values retrieved      */ +  /* number of distinct values that occured only once  */ +  ulonglong count_distinct_single_occurence;  public:     Histogram_builder(Field *col, uint col_len, ha_rows rows) @@ -1553,14 +1555,21 @@ public:      bucket_capacity= (double) records / (hist_width + 1);      curr_bucket= 0;      count= 0; -    count_distinct= 0;     +    count_distinct= 0; +    count_distinct_single_occurence= 0;    } -  ulonglong get_count_distinct() { return count_distinct; } +  ulonglong get_count_distinct() const { return count_distinct; } +  ulonglong get_count_single_occurence() const +  { +    return count_distinct_single_occurence; +  }    int next(void *elem, element_count elem_cnt)    {      count_distinct++; +    if (elem_cnt == 1) +      count_distinct_single_occurence++;      count+= elem_cnt;      if (curr_bucket == hist_width)        return 0; @@ -1574,7 +1583,7 @@ public:               count > bucket_capacity * (curr_bucket + 1))        {          histogram->set_prev_value(curr_bucket); -	curr_bucket++; +        curr_bucket++;        }      }      return 0; @@ -1590,9 +1599,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)    return hist_builder->next(elem, elem_cnt);  } -C_MODE_END +static int count_distinct_single_occurence_walk(void *elem, +                                                element_count count, void *arg) +{ +  ((ulonglong*)arg)[0]+= 1; +  if (count == 1) +    ((ulonglong*)arg)[1]+= 1; +  return 0; +} + +C_MODE_END  /*    The class Count_distinct_field is a helper class used to calculate    the number of distinct values for a column. The class employs the @@ -1611,6 +1629,9 @@ protected:    Unique *tree;       /* The helper object to contain distinct values */    uint tree_key_length; /* The length of the keys for the elements of 'tree */ +  ulonglong distincts; +  ulonglong distincts_single_occurence; +  public:    Count_distinct_field() {} @@ -1662,30 +1683,40 @@ public:    {      return tree->unique_add(table_field->ptr);    } -   +    /*      @brief      Calculate the number of elements accumulated in the container of 'tree'    */ -  ulonglong get_value() -  { -    ulonglong count; -    if (tree->elements == 0) -      return (ulonglong) tree->elements_in_tree(); -    count= 0;   -    tree->walk(table_field->table, count_distinct_walk, (void*) &count); -    return count; +  void walk_tree() +  { +    ulonglong counts[2] = {0, 0}; +    tree->walk(table_field->table, +               count_distinct_single_occurence_walk, counts); +    distincts= counts[0]; +    distincts_single_occurence= counts[1];    }    /*      @brief -    Build the histogram for the elements accumulated in the container of 'tree' +    Calculate a histogram of the tree    */ -  ulonglong get_value_with_histogram(ha_rows rows) +   void walk_tree_with_histogram(ha_rows rows)    {      Histogram_builder hist_builder(table_field, tree_key_length, rows);      tree->walk(table_field->table,  histogram_build_walk, (void *) &hist_builder); -    return hist_builder.get_count_distinct(); +    distincts= hist_builder.get_count_distinct(); +    distincts_single_occurence= hist_builder.get_count_single_occurence(); +  } + +  ulonglong get_count_distinct() +  { +    return distincts; +  } + +  ulonglong get_count_distinct_single_occurence() +  { +    return distincts_single_occurence;    }    /* @@ -2514,7 +2545,7 @@ bool Column_statistics_collected::add()  */  inline -void Column_statistics_collected::finish(ha_rows rows) +void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)  {    double val; @@ -2532,16 +2563,44 @@ void Column_statistics_collected::finish(ha_rows rows)    }    if (count_distinct)    { -    ulonglong distincts;      uint hist_size= count_distinct->get_hist_size(); + +    /* Compute cardinality statistics and optionally histogram. */      if (hist_size == 0) -      distincts= count_distinct->get_value(); +      count_distinct->walk_tree();      else -      distincts= count_distinct->get_value_with_histogram(rows - nulls); +      count_distinct->walk_tree_with_histogram(rows - nulls); + +    ulonglong distincts= count_distinct->get_count_distinct(); +    ulonglong distincts_single_occurence= +      count_distinct->get_count_distinct_single_occurence(); +      if (distincts)      { -      val= (double) (rows - nulls) / distincts; -      set_avg_frequency(val);  +      /* +       We use the unsmoothed first-order jackknife estimator" to estimate +       the number of distinct values. +       With a sufficient large percentage of rows sampled (80%), we revert back +       to computing the avg_frequency off of the raw data. +      */ +      if (sample_fraction > 0.8) +        val= (double) (rows - nulls) / distincts; +      else +      { +        if (nulls == 1) +          distincts_single_occurence+= 1; +        if (nulls) +          distincts+= 1; +        double fraction_single_occurence= +          static_cast<double>(distincts_single_occurence) / rows; +        double total_number_of_rows= rows / sample_fraction; +        double estimate_total_distincts= total_number_of_rows / +                (distincts / +                 (1.0 - (1.0 - sample_fraction) * fraction_single_occurence)); +        val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0); +      } + +      set_avg_frequency(val);        set_not_null(COLUMN_STAT_AVG_FREQUENCY);      }      else @@ -2813,7 +2872,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)        continue;      bitmap_set_bit(table->write_set, table_field->field_index);       if (!rc) -      table_field->collected_stats->finish(rows); +      table_field->collected_stats->finish(rows, sample_fraction);      else        table_field->collected_stats->cleanup();    } | 
