From f0773b7842fcfd2032b630b4cfc7404a29d12a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vicen=C8=9Biu=20Ciorbaru?= Date: Fri, 15 Feb 2019 01:23:00 +0200 Subject: Introduce analyze_sample_percentage variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable controls the amount of sampling analyze table performs. If ANALYZE table with histogram collection is too slow, one can reduce the time taken by setting analyze_sample_percentage to a lower value of the total number of rows. Setting it to 0 will use a formula to compute how many rows to sample: The number of rows collected is capped to a minimum of 50000 and increases logarithmically with a coffecient of 4096. The coffecient is chosen so that we expect an error of less than 3% in our estimations according to the paper: "Random Sampling for Histogram Construction: How much is enough?” – Surajit Chaudhuri, Rajeev Motwani, Vivek Narasayya, ACM SIGMOD, 1998. The drawback of sampling is that avg_frequency number is computed imprecisely and will yeild a smaller number than the real one. --- sql/sql_statistics.cc | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'sql/sql_statistics.cc') diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index f903ce143a4..27fab974441 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -2729,12 +2729,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table) Field *table_field; ha_rows rows= 0; handler *file=table->file; + double sample_fraction= thd->variables.sample_percentage / 100; + const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000; DBUG_ENTER("collect_statistics_for_table"); table->collected_stats->cardinality_is_null= TRUE; table->collected_stats->cardinality= 0; + if (thd->variables.sample_percentage == 0) + { + if (file->records() < MIN_THRESHOLD_FOR_SAMPLING) + { + sample_fraction= 1; + } + else + { + sample_fraction= std::fmin( + (MIN_THRESHOLD_FOR_SAMPLING + 4096 * + log(200 * file->records())) / file->records(), 1); + } + } + for (field_ptr= table->field; *field_ptr; field_ptr++) { table_field= *field_ptr; @@ -2747,7 +2763,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table) /* Perform a full table scan to collect statistics on 'table's columns */ if (!(rc= file->ha_rnd_init(TRUE))) - { + { DEBUG_SYNC(table->in_use, "statistics_collection_start"); while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE) @@ -2758,17 +2774,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table) if (rc) break; - for (field_ptr= table->field; *field_ptr; field_ptr++) + if (thd_rnd(thd) <= sample_fraction) { - table_field= *field_ptr; - if (!bitmap_is_set(table->read_set, table_field->field_index)) - continue; - if ((rc= table_field->collected_stats->add())) + for (field_ptr= table->field; *field_ptr; field_ptr++) + { + table_field= *field_ptr; + if (!bitmap_is_set(table->read_set, table_field->field_index)) + continue; + if ((rc= table_field->collected_stats->add())) + break; + } + if (rc) break; + rows++; } - if (rc) - break; - rows++; } file->ha_rnd_end(); } @@ -2782,7 +2801,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table) if (!rc) { table->collected_stats->cardinality_is_null= FALSE; - table->collected_stats->cardinality= rows; + table->collected_stats->cardinality= + static_cast(rows / sample_fraction); } bitmap_clear_all(table->write_set); -- cgit v1.2.1