author | Vicențiu Ciorbaru <vicentiu@mariadb.org> | 2019-02-15 01:23:00 +0200 |
---|---|---|
committer | Vicențiu Ciorbaru <vicentiu@mariadb.org> | 2019-02-19 12:01:21 +0200 |
commit | f0773b7842fcfd2032b630b4cfc7404a29d12a8f (patch) | |
tree | 3b00628835a73575036e3488e2613d39bc8544e0 /sql/sql_statistics.cc | |
parent | 47f15ea73c49e90b16a4a4adf5414f51bdbf97a4 (diff) | |
download | mariadb-git-f0773b7842fcfd2032b630b4cfc7404a29d12a8f.tar.gz | |
Introduce analyze_sample_percentage variable
The variable controls the amount of sampling ANALYZE TABLE performs when
collecting statistics. If ANALYZE TABLE with histogram collection is too slow,
one can reduce the time taken by setting analyze_sample_percentage to a lower
percentage of the total number of rows.
Setting it to 0 makes the server use a formula to compute how many rows to
sample: the number of rows collected has a lower bound of 50000 and increases
logarithmically with the table size, with a coefficient of 4096. The
coefficient is chosen so that we expect an estimation error of less than 3%,
according to the paper:
"Random Sampling for Histogram Construction: How much is enough?"
– Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998.
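
As an illustration (not part of the commit), here is a minimal standalone
sketch of that formula. Only the constants 50000 and 4096 and the expression
min((50000 + 4096 * ln(200 * n)) / n, 1) come from the patch below; the
function and variable names are hypothetical.

```cpp
// Hypothetical sketch of the automatic sampling formula used when
// analyze_sample_percentage is 0. Names are illustrative, not server code.
#include <cmath>
#include <cstdint>
#include <cstdio>

static const uint64_t MIN_THRESHOLD_FOR_SAMPLING= 50000;

// Fraction of rows to sample for a table with n rows.
static double auto_sample_fraction(uint64_t n)
{
  if (n < MIN_THRESHOLD_FOR_SAMPLING)
    return 1.0;                          // small tables are read in full
  // Sampled row count grows as 50000 + 4096 * ln(200 * n), never above n.
  return std::fmin((MIN_THRESHOLD_FOR_SAMPLING +
                    4096.0 * std::log(200.0 * (double) n)) / (double) n, 1.0);
}

int main()
{
  // For a 10M-row table: ln(200 * 1e7) ~= 21.4, so the fraction is roughly
  // (50000 + 4096 * 21.4) / 1e7 ~= 1.4%, i.e. about 138K sampled rows.
  const uint64_t sizes[]= {10000, 100000, 1000000, 10000000};
  for (uint64_t n : sizes)
    std::printf("rows=%-10llu fraction=%.4f sampled~=%.0f\n",
                (unsigned long long) n, auto_sample_fraction(n),
                auto_sample_fraction(n) * n);
  return 0;
}
```

With these constants the sampled row count stays in the low hundreds of
thousands even for very large tables, which is what keeps ANALYZE time bounded.
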
The drawback of sampling is that the avg_frequency statistic is computed
imprecisely and will yield a smaller number than the real one.
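
To illustrate the direction of that bias (again, not part of the commit), a
toy simulation follows. It assumes avg_frequency is taken as the ratio of
sampled rows to distinct values seen in the sample, without scaling; the real
collector differs in details, so only the sign of the error is meant to carry
over.

```cpp
// Toy simulation (hypothetical, not server code) of why sampling biases
// avg_frequency downwards: the sampled row count shrinks roughly by the
// sampling fraction, while the set of distinct values seen shrinks much more
// slowly, so rows_seen / distinct_seen lands below the true rows / distinct.
#include <cstdio>
#include <random>
#include <unordered_set>

int main()
{
  const unsigned long long n_rows= 1000000;  // rows in the table
  const int n_distinct= 1000;                // distinct values in the column
  const double sample_fraction= 0.01;        // 1% Bernoulli sample

  std::mt19937_64 rng(42);
  std::uniform_int_distribution<int> value(0, n_distinct - 1);
  std::uniform_real_distribution<double> coin(0.0, 1.0);

  unsigned long long rows_seen= 0;
  std::unordered_set<int> distinct_seen;

  for (unsigned long long i= 0; i < n_rows; i++)
  {
    int v= value(rng);                 // this row's column value
    if (coin(rng) <= sample_fraction)  // per-row coin flip, as in the patch
    {
      rows_seen++;
      distinct_seen.insert(v);
    }
  }

  // True avg_frequency is 1000000 / 1000 = 1000; the sampled ratio is much
  // smaller because nearly all 1000 values still show up in a 1% sample.
  std::printf("true    avg_frequency = %.1f\n", (double) n_rows / n_distinct);
  std::printf("sampled avg_frequency = %.1f\n",
              (double) rows_seen / (double) distinct_seen.size());
  return 0;
}
```
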
Diffstat (limited to 'sql/sql_statistics.cc')
-rw-r--r-- | sql/sql_statistics.cc | 40 |
1 file changed, 30 insertions, 10 deletions
```diff
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index f903ce143a4..27fab974441 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
   Field *table_field;
   ha_rows rows= 0;
   handler *file=table->file;
+  double sample_fraction= thd->variables.sample_percentage / 100;
+  const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
 
   DBUG_ENTER("collect_statistics_for_table");
 
   table->collected_stats->cardinality_is_null= TRUE;
   table->collected_stats->cardinality= 0;
 
+  if (thd->variables.sample_percentage == 0)
+  {
+    if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
+    {
+      sample_fraction= 1;
+    }
+    else
+    {
+      sample_fraction= std::fmin(
+                 (MIN_THRESHOLD_FOR_SAMPLING + 4096 *
+                  log(200 * file->records())) / file->records(), 1);
+    }
+  }
+
   for (field_ptr= table->field; *field_ptr; field_ptr++)
   {
     table_field= *field_ptr;
@@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
 
   /* Perform a full table scan to collect statistics on 'table's columns */
   if (!(rc= file->ha_rnd_init(TRUE)))
-  {  
+  {
     DEBUG_SYNC(table->in_use, "statistics_collection_start");
 
     while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE)
@@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
       if (rc)
         break;
 
-      for (field_ptr= table->field; *field_ptr; field_ptr++)
+      if (thd_rnd(thd) <= sample_fraction)
       {
-        table_field= *field_ptr;
-        if (!bitmap_is_set(table->read_set, table_field->field_index))
-          continue;
-        if ((rc= table_field->collected_stats->add()))
+        for (field_ptr= table->field; *field_ptr; field_ptr++)
+        {
+          table_field= *field_ptr;
+          if (!bitmap_is_set(table->read_set, table_field->field_index))
+            continue;
+          if ((rc= table_field->collected_stats->add()))
+            break;
+        }
+        if (rc)
           break;
+        rows++;
       }
-      if (rc)
-        break;
-      rows++;
     }
     file->ha_rnd_end();
   }
@@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
   if (!rc)
   {
     table->collected_stats->cardinality_is_null= FALSE;
-    table->collected_stats->cardinality= rows;
+    table->collected_stats->cardinality=
+      static_cast<ha_rows>(rows / sample_fraction);
   }
 
   bitmap_clear_all(table->write_set);
```
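
Stripped of the storage-engine plumbing, the control flow added above amounts
to the following hypothetical sketch: flip a coin per row, collect statistics
only for sampled rows, then divide the counted rows by the sampling fraction
to estimate the full cardinality. The names and numbers here are placeholders,
not server code.

```cpp
// Condensed, hypothetical view of the sampling logic in the patch: per-row
// Bernoulli sampling with probability sample_fraction, then scaling the
// sampled row count back up to estimate the table's cardinality.
#include <cstdint>
#include <cstdio>
#include <random>

int main()
{
  const uint64_t table_rows= 2000000;   // pretend full-scan row count
  const double sample_fraction= 0.014;  // e.g. what the auto formula yields

  std::mt19937_64 rng(7);
  std::uniform_real_distribution<double> coin(0.0, 1.0);

  uint64_t rows= 0;                     // rows actually fed to the collectors
  for (uint64_t i= 0; i < table_rows; i++)
  {
    // In the patch this is: if (thd_rnd(thd) <= sample_fraction) { ... add() ... }
    if (coin(rng) <= sample_fraction)
      rows++;                           // statistics would be collected here
  }

  // Mirrors: cardinality= static_cast<ha_rows>(rows / sample_fraction);
  uint64_t estimated_cardinality= (uint64_t)(rows / sample_fraction);

  std::printf("sampled rows          = %llu\n", (unsigned long long) rows);
  std::printf("estimated cardinality = %llu (true %llu)\n",
              (unsigned long long) estimated_cardinality,
              (unsigned long long) table_rows);
  return 0;
}
```

The scaled estimate is unbiased for the row count, which is why cardinality can
be corrected this way while avg_frequency, as noted above, cannot.
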