summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergei Petrunia <sergey@mariadb.com>2023-04-27 18:07:55 +0300
committerSergei Petrunia <sergey@mariadb.com>2023-04-27 20:08:16 +0300
commit5285832b7810a9bee4f1c36ac0a4caff113ae949 (patch)
tree02b4c144ba2792b5f0cc1f0bd8cda081fe8e6785
parent8f87023d3f3fbaad4e33991713db884cbe052fbc (diff)
downloadmariadb-git-bb-10.6-mdev31067-variant3.tar.gz
MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogrambb-10.6-mdev31067-variant3
Variant #3. When Histogram::point_selectivity() sees that the point value of interest falls into one bucket, it tries to guess whether the bucket has many different (unpopular) values or a few popular values. (The number of rows is fixed, as it's a Height-balanced histogram). The basis for this guess is the "width" of the value range the bucket covers. Buckets covering wider value ranges are assumed to contain values with proportionally lower frequencies. This is just a [brave] guesswork. For a very narrow bucket, it may produce an estimate that's larger than total #rows in the bucket or even in the whole table. Remove it and replace with another approach: compute the point selectivity by assuming that the number of rows that matches a point column=val condition is the average #rows for non-common value. A value is non-common if it takes less than one whole histogram bucket.
-rw-r--r--mysql-test/main/selectivity.result75
-rw-r--r--mysql-test/main/selectivity.test83
-rw-r--r--mysql-test/main/selectivity_innodb.result75
-rw-r--r--mysql-test/main/selectivity_no_engine.result6
-rw-r--r--sql/sql_statistics.cc127
-rw-r--r--sql/sql_statistics.h19
6 files changed, 330 insertions, 55 deletions
diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result
index b5a48d341ae..6159186644b 100644
--- a/mysql-test/main/selectivity.result
+++ b/mysql-test/main/selectivity.result
@@ -834,7 +834,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.79 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
drop table t1;
@@ -1649,7 +1649,7 @@ test.t1 analyze status Table is already up to date
# Check what info the optimizer has about selectivities
explain extended select * from t1 use index () where a in (17,51,5);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
explain extended select * from t1 use index () where b=2;
@@ -1941,9 +1941,76 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
+DROP TABLE t1;
+# End of 10.2 tests
+#
+# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+#
+create table t0(a int);
+insert into t0 select 1 from seq_1_to_78;
+create table t1(a int);
+insert into t1 select 1 from seq_1_to_26;
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+insert into t10 select 34785 from t0, seq_1_to_8;
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+insert into t10 select 65216 from t0, seq_1_to_52;
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+insert into t10 select 65219 from t0;
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+insert into t10 select 73915 from t0, seq_1_to_40;
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+insert into t10 select 91302 from t0, seq_1_to_6;
+insert into t10 select 91303 from t1;
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+insert into t10 select 91305 from t0, seq_1_to_8;
+insert into t10 select 99998 from t1;
+insert into t10 select 99999 from t1;
+insert into t10 select 100000 from t1;
+set use_stat_tables=preferably;
+analyze table t10 persistent for all;
+Table Op Msg_type Msg_text
+test.t10 analyze status Engine-independent statistics collected
+test.t10 analyze status OK
+flush tables;
+set statement optimizer_trace=1 for
+explain select * from t10 where a in (91303);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where
+# Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+from information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+[
+ [
+ {
+ "column_name": "a",
+ "ranges":
+ ["91303 <= a <= 91303"],
+ "selectivity_from_histogram": 0.035714283
+ }
+ ]
+]
+drop table t0,t1,t10;
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
set histogram_size=@save_histogram_size;
set use_stat_tables= @save_use_stat_tables;
-DROP TABLE t1;
-# End of 10.2 tests
set @@global.histogram_size=@save_histogram_size;
diff --git a/mysql-test/main/selectivity.test b/mysql-test/main/selectivity.test
index 4e4513d09d6..556ffd76a22 100644
--- a/mysql-test/main/selectivity.test
+++ b/mysql-test/main/selectivity.test
@@ -1321,14 +1321,91 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
FLUSH TABLES;
EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
-set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
-set histogram_size=@save_histogram_size;
-set use_stat_tables= @save_use_stat_tables;
DROP TABLE t1;
--echo # End of 10.2 tests
+--echo #
+--echo # MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+--echo #
+create table t0(a int); # This holds how many rows we hold in a bucket.
+insert into t0 select 1 from seq_1_to_78;
+
+create table t1(a int); # one-third of a bucket
+insert into t1 select 1 from seq_1_to_26;
+
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+
+
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+
+
+insert into t10 select 34785 from t0, seq_1_to_8;
+
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+
+insert into t10 select 65216 from t0, seq_1_to_52;
+
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+
+insert into t10 select 65219 from t0;
+
+
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+
+insert into t10 select 73915 from t0, seq_1_to_40;
+
+
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+
+insert into t10 select 91302 from t0, seq_1_to_6;
+
+insert into t10 select 91303 from t1; # Only 1/3rd of bucket matches the search tuple
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+
+insert into t10 select 91305 from t0, seq_1_to_8;
+
+insert into t10 select 99998 from t1;
+insert into t10 select 99999 from t1;
+insert into t10 select 100000 from t1;
+
+set use_stat_tables=preferably;
+analyze table t10 persistent for all;
+flush tables;
+set statement optimizer_trace=1 for
+explain select * from t10 where a in (91303);
+
+#select * from information_schema.optimizer_trace;
+--echo # Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+from information_schema.optimizer_trace;
+
+drop table t0,t1,t10;
+
+set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
+set histogram_size=@save_histogram_size;
+set use_stat_tables= @save_use_stat_tables;
#
# Clean up
#
diff --git a/mysql-test/main/selectivity_innodb.result b/mysql-test/main/selectivity_innodb.result
index 3edf6d8e929..fd0cea27a81 100644
--- a/mysql-test/main/selectivity_innodb.result
+++ b/mysql-test/main/selectivity_innodb.result
@@ -845,7 +845,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.79 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
drop table t1;
@@ -1661,7 +1661,7 @@ test.t1 analyze status OK
# Check what info the optimizer has about selectivities
explain extended select * from t1 use index () where a in (17,51,5);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
explain extended select * from t1 use index () where b=2;
@@ -1953,11 +1953,78 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
+DROP TABLE t1;
+# End of 10.2 tests
+#
+# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+#
+create table t0(a int);
+insert into t0 select 1 from seq_1_to_78;
+create table t1(a int);
+insert into t1 select 1 from seq_1_to_26;
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+insert into t10 select 34785 from t0, seq_1_to_8;
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+insert into t10 select 65216 from t0, seq_1_to_52;
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+insert into t10 select 65219 from t0;
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+insert into t10 select 73915 from t0, seq_1_to_40;
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+insert into t10 select 91302 from t0, seq_1_to_6;
+insert into t10 select 91303 from t1;
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+insert into t10 select 91305 from t0, seq_1_to_8;
+insert into t10 select 99998 from t1;
+insert into t10 select 99999 from t1;
+insert into t10 select 100000 from t1;
+set use_stat_tables=preferably;
+analyze table t10 persistent for all;
+Table Op Msg_type Msg_text
+test.t10 analyze status Engine-independent statistics collected
+test.t10 analyze status OK
+flush tables;
+set statement optimizer_trace=1 for
+explain select * from t10 where a in (91303);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where
+# Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+from information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+[
+ [
+ {
+ "column_name": "a",
+ "ranges":
+ ["91303 <= a <= 91303"],
+ "selectivity_from_histogram": 0.035714283
+ }
+ ]
+]
+drop table t0,t1,t10;
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
set histogram_size=@save_histogram_size;
set use_stat_tables= @save_use_stat_tables;
-DROP TABLE t1;
-# End of 10.2 tests
set @@global.histogram_size=@save_histogram_size;
set optimizer_switch=@save_optimizer_switch_for_selectivity_test;
set @tmp_ust= @@use_stat_tables;
diff --git a/mysql-test/main/selectivity_no_engine.result b/mysql-test/main/selectivity_no_engine.result
index 3811b12a1be..9cb1a54e0b1 100644
--- a/mysql-test/main/selectivity_no_engine.result
+++ b/mysql-test/main/selectivity_no_engine.result
@@ -36,12 +36,12 @@ test.t2 analyze status OK
# The following two must have the same in 'Extra' column:
explain extended select * from t2 where col1 IN (20, 180);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 2.00 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (20,180)
explain extended select * from t2 where col1 IN (180, 20);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 2.00 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (180,20)
drop table t1, t2;
@@ -102,7 +102,7 @@ test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
explain extended select * from t1 where col1 in (1,2,3);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.00 Using where
Warnings:
Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where `test`.`t1`.`col1` in (1,2,3)
# Must not cause fp division by zero, or produce nonsense numbers:
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index be011adb60c..7481d3fd6ae 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -3780,9 +3780,9 @@ double get_column_range_cardinality(Field *field,
field->key_length());
double pos= field->pos_in_interval(col_stats->min_value,
col_stats->max_value);
+ double n_distinct= col_non_nulls / avg_frequency;
res= col_non_nulls *
- hist->point_selectivity(pos,
- avg_frequency / col_non_nulls);
+ hist->point_selectivity(pos, col_non_nulls, n_distinct);
}
}
else if (avg_frequency == 0.0)
@@ -3840,8 +3840,10 @@ double get_column_range_cardinality(Field *field,
@param pos Position of the "const" between column's min_value and
max_value. This is a number in [0..1] range.
- @param avg_sel Average selectivity of condition "col=const" in this table.
- It is calcuated as (#non_null_values / #distinct_values).
+ @param n_rows Number of rows covered by the histogram. This is number
+ of rows in the table that have non-NULL values for this
+ column.
+ @param n_distinct Number of distinct values for the histogram column.
@return
Expected condition selectivity (a number between 0 and 1)
@@ -3868,7 +3870,8 @@ double get_column_range_cardinality(Field *field,
value.
*/
-double Histogram::point_selectivity(double pos, double avg_sel)
+double Histogram::point_selectivity(double pos, double n_rows,
+ double n_distinct)
{
double sel;
/* Find the bucket that contains the value 'pos'. */
@@ -3901,54 +3904,100 @@ double Histogram::point_selectivity(double pos, double avg_sel)
}
else
{
- /*
+ double n_distinct_regular_values, regular_value_rows, rows_for_regular_val;
+ /*
The value 'pos' fits within one single histogram bucket.
- Histogram buckets have the same numbers of rows, but they cover
- different ranges of values.
-
- We assume that values are uniformly distributed across the [0..1] value
- range.
- */
-
- /*
- If all buckets covered value ranges of the same size, the width of
- value range would be:
+ We consider values that do not fit into one histogram values "popular".
+ The value we're looking for is "regular", not popular.
+ Compute the average selectivity for regular values.
*/
- double avg_bucket_width= 1.0 / (get_width() + 1);
+ /* Number of distinct regular values in the table: */
+ n_distinct_regular_values= n_distinct - n_popular_values;
+
+ if (n_distinct_regular_values < 1.0)
+ goto handle_edge_case;
+
/*
- Let's see what is the width of value range that our bucket is covering.
- (min==max currently. they are kept in the formula just in case we
- will want to extend it to handle multi-bucket case)
+ How many rows have regular values? Look at the fraction of histogram
+ buckets that are not occupied by popular values.
*/
- double inv_prec_factor= (double) 1.0 / prec_factor();
- double current_bucket_width=
- (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
- (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));
+ regular_value_rows=
+ n_rows * (get_width() - n_popular_values_buckets) / get_width();
- DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
+ if (regular_value_rows < 1.0)
+ goto handle_edge_case;
- /*
- So:
- - each bucket has the same #rows
- - values are unformly distributed across the [min_value,max_value] domain.
+ /* Average #rows that a regular value has: */
+ rows_for_regular_val= regular_value_rows / n_distinct_regular_values;
- If a bucket has value range that's N times bigger then average, than
- each value will have to have N times fewer rows than average.
- */
- sel= avg_sel * avg_bucket_width / current_bucket_width;
+ /* Selectiving this many rows out of total table rows gives: */
+ sel= rows_for_regular_val / n_rows;
- /*
- (Q: if we just follow this proportion we may end up in a situation
- where number of different values we expect to find in this bucket
- exceeds the number of rows that this histogram has in a bucket. Are
- we ok with this or we would want to have certain caps?)
- */
+ /* Handle possible errors: */
+ if (sel <= 0.0 || sel > 1.0 / (get_width()))
+ {
+ DBUG_ASSERT(0);
+ handle_edge_case:
+ sel= 1.0 / (get_width());
+ }
}
return sel;
}
+
+/*
+ @brief
+ Compute n_popular_values and n_popular_values_buckets
+
+ @detail
+ They are used in point_selectivity().
+*/
+
+void Histogram::update_popular_value_counts()
+{
+ n_popular_values=0;
+ n_popular_values_buckets=0;
+
+ // This variable means "previous bucket was occupied by a popular value":
+ bool cur_value_popular= false;
+ uint prev_value= 0;
+
+ for (uint i=0; i < get_width(); i++)
+ {
+ uint left_endpoint= prev_value;
+ uint right_endpoint= get_value(i);
+
+ if (left_endpoint == right_endpoint)
+ {
+ /* This bucket is occupied by one value. It's a popular value */
+ n_popular_values_buckets++;
+ cur_value_popular= true;
+ }
+ else
+ {
+ if (cur_value_popular)
+ {
+ /*
+ The previous bucket (and maybe also buckets before it) was fully
+ occupied by a popular value.
+ */
+ n_popular_values++;
+
+ /* But current bucket is not (as left_endpoint!=right_endpoint) */
+ cur_value_popular= false;
+ }
+ }
+ prev_value= right_endpoint;
+ }
+
+ /* Check if the last bucket was occupied by a popular value */
+ if (cur_value_popular)
+ n_popular_values++;
+}
+
+
/*
Check whether the table is one of the persistent statistical tables.
*/
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 35b3aa33acc..8bd00437e67 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -148,6 +148,15 @@ private:
uint8 size; /* Size of values array, in bytes */
uchar *values;
+ /*
+ Number of popular values in the histogram. A value is considered popular if
+ it occupies one whole bucket or more than that.
+ */
+ uint n_popular_value;
+
+ /* Number of buckets that are fully occupied by popular values. */
+ uint n_popular_values_buckets;
+
uint prec_factor()
{
switch (type) {
@@ -223,6 +232,8 @@ private:
return i;
}
+ /* Re-compute n_popular_values and n_popular_values_buckets */
+ void update_popular_value_counts();
public:
uint get_size() { return (uint) size; }
@@ -235,7 +246,11 @@ public:
void set_type (Histogram_type t) { type= t; }
- void set_values (uchar *vals) { values= (uchar *) vals; }
+ void set_values(uchar *vals)
+ {
+ values= (uchar *) vals;
+ update_popular_value_counts();
+ }
bool is_available() { return get_size() > 0 && get_values(); }
@@ -287,7 +302,7 @@ public:
/*
Estimate selectivity of "col=const" using a histogram
*/
- double point_selectivity(double pos, double avg_sel);
+ double point_selectivity(double pos, double n_rows, double n_distinct);
};