summaryrefslogtreecommitdiff
path: root/sql/sql_statistics.cc
diff options
context:
space:
mode:
authorVicențiu Ciorbaru <vicentiu@mariadb.org>2019-02-15 01:23:00 +0200
committerVicențiu Ciorbaru <vicentiu@mariadb.org>2019-02-19 12:01:21 +0200
commitf0773b7842fcfd2032b630b4cfc7404a29d12a8f (patch)
tree3b00628835a73575036e3488e2613d39bc8544e0 /sql/sql_statistics.cc
parent47f15ea73c49e90b16a4a4adf5414f51bdbf97a4 (diff)
downloadmariadb-git-f0773b7842fcfd2032b630b4cfc7404a29d12a8f.tar.gz
Introduce analyze_sample_percentage variable
The variable controls the amount of sampling analyze table performs. If ANALYZE table with histogram collection is too slow, one can reduce the time taken by setting analyze_sample_percentage to a lower value of the total number of rows. Setting it to 0 will use a formula to compute how many rows to sample: The number of rows collected is capped to a minimum of 50000 and increases logarithmically with a coffecient of 4096. The coffecient is chosen so that we expect an error of less than 3% in our estimations according to the paper: "Random Sampling for Histogram Construction: How much is enough?” – Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998. The drawback of sampling is that avg_frequency number is computed imprecisely and will yeild a smaller number than the real one.
Diffstat (limited to 'sql/sql_statistics.cc')
-rw-r--r--sql/sql_statistics.cc40
1 files changed, 30 insertions, 10 deletions
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index f903ce143a4..27fab974441 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
Field *table_field;
ha_rows rows= 0;
handler *file=table->file;
+ double sample_fraction= thd->variables.sample_percentage / 100;
+ const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
DBUG_ENTER("collect_statistics_for_table");
table->collected_stats->cardinality_is_null= TRUE;
table->collected_stats->cardinality= 0;
+ if (thd->variables.sample_percentage == 0)
+ {
+ if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
+ {
+ sample_fraction= 1;
+ }
+ else
+ {
+ sample_fraction= std::fmin(
+ (MIN_THRESHOLD_FOR_SAMPLING + 4096 *
+ log(200 * file->records())) / file->records(), 1);
+ }
+ }
+
for (field_ptr= table->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
@@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
/* Perform a full table scan to collect statistics on 'table's columns */
if (!(rc= file->ha_rnd_init(TRUE)))
- {
+ {
DEBUG_SYNC(table->in_use, "statistics_collection_start");
while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE)
@@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (rc)
break;
- for (field_ptr= table->field; *field_ptr; field_ptr++)
+ if (thd_rnd(thd) <= sample_fraction)
{
- table_field= *field_ptr;
- if (!bitmap_is_set(table->read_set, table_field->field_index))
- continue;
- if ((rc= table_field->collected_stats->add()))
+ for (field_ptr= table->field; *field_ptr; field_ptr++)
+ {
+ table_field= *field_ptr;
+ if (!bitmap_is_set(table->read_set, table_field->field_index))
+ continue;
+ if ((rc= table_field->collected_stats->add()))
+ break;
+ }
+ if (rc)
break;
+ rows++;
}
- if (rc)
- break;
- rows++;
}
file->ha_rnd_end();
}
@@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (!rc)
{
table->collected_stats->cardinality_is_null= FALSE;
- table->collected_stats->cardinality= rows;
+ table->collected_stats->cardinality=
+ static_cast<ha_rows>(rows / sample_fraction);
}
bitmap_clear_all(table->write_set);