diff options
| author | Vicențiu Ciorbaru <vicentiu@mariadb.org> | 2019-02-15 01:23:00 +0200 | 
|---|---|---|
| committer | Vicențiu Ciorbaru <vicentiu@mariadb.org> | 2019-02-19 12:01:21 +0200 | 
| commit | f0773b7842fcfd2032b630b4cfc7404a29d12a8f (patch) | |
| tree | 3b00628835a73575036e3488e2613d39bc8544e0 /sql/sql_statistics.cc | |
| parent | 47f15ea73c49e90b16a4a4adf5414f51bdbf97a4 (diff) | |
| download | mariadb-git-f0773b7842fcfd2032b630b4cfc7404a29d12a8f.tar.gz | |
Introduce analyze_sample_percentage variable
The variable controls the amount of sampling analyze table performs.
If ANALYZE TABLE with histogram collection is too slow, one can reduce the
time taken by setting analyze_sample_percentage to a lower percentage of the
total number of rows.
Setting it to 0 will use a formula to compute how many rows to sample:
The number of rows collected has a minimum of 50000 and
increases logarithmically with a coefficient of 4096. The coefficient is
chosen so that we expect an error of less than 3% in our estimations
according to the paper:
"Random Sampling for Histogram Construction: How much is enough?"
– Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998.
The drawback of sampling is that the avg_frequency number is computed
imprecisely and will yield a smaller number than the real one.
Diffstat (limited to 'sql/sql_statistics.cc')
| -rw-r--r-- | sql/sql_statistics.cc | 40 | 
1 files changed, 30 insertions, 10 deletions
| diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index f903ce143a4..27fab974441 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)    Field *table_field;    ha_rows rows= 0;    handler *file=table->file; +  double sample_fraction= thd->variables.sample_percentage / 100; +  const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;    DBUG_ENTER("collect_statistics_for_table");    table->collected_stats->cardinality_is_null= TRUE;    table->collected_stats->cardinality= 0; +  if (thd->variables.sample_percentage == 0) +  { +    if (file->records() < MIN_THRESHOLD_FOR_SAMPLING) +    { +      sample_fraction= 1; +    } +    else +    { +      sample_fraction= std::fmin( +                  (MIN_THRESHOLD_FOR_SAMPLING + 4096 * +                   log(200 * file->records())) / file->records(), 1); +    } +  } +    for (field_ptr= table->field; *field_ptr; field_ptr++)    {      table_field= *field_ptr;    @@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)    /* Perform a full table scan to collect statistics on 'table's columns */    if (!(rc= file->ha_rnd_init(TRUE))) -  {   +  {      DEBUG_SYNC(table->in_use, "statistics_collection_start");      while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE) @@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table)        if (rc)          break; -      for (field_ptr= table->field; *field_ptr; field_ptr++) +      if (thd_rnd(thd) <= sample_fraction)        { -        table_field= *field_ptr; -        if (!bitmap_is_set(table->read_set, table_field->field_index)) -          continue;   -        if ((rc= table_field->collected_stats->add())) +        for (field_ptr= table->field; *field_ptr; field_ptr++) +        { +          table_field= *field_ptr; +          if (!bitmap_is_set(table->read_set, table_field->field_index)) +            continue; +          if ((rc= 
table_field->collected_stats->add())) +            break; +        } +        if (rc)            break; +        rows++;        } -      if (rc) -        break; -      rows++;      }      file->ha_rnd_end();    } @@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)    if (!rc)    {      table->collected_stats->cardinality_is_null= FALSE; -    table->collected_stats->cardinality= rows; +    table->collected_stats->cardinality= +      static_cast<ha_rows>(rows / sample_fraction);    }    bitmap_clear_all(table->write_set); | 
