summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergei Petrunia <sergey@mariadb.com>2023-04-18 13:25:27 +0300
committerSergei Petrunia <sergey@mariadb.com>2023-04-18 13:25:27 +0300
commit3b74aa2be872ab72abc5ffe1d9f4a3bf4ef6a1df (patch)
tree6e7780a1fd4d53a2bcc0fba422520f0266f64f80
parent8f87023d3f3fbaad4e33991713db884cbe052fbc (diff)
downloadmariadb-git-bb-10.6-mdev31067.tar.gz
MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogrambb-10.6-mdev31067
When Histogram::point_selectivity() sees that the point value of interest falls into one bucket, it tries to guess whether the bucket has many different (unpopular) values or a few popular values. (The number of rows is fixed, as it's a Height-balanced histogram). The basis for this guess is the "width" of the value range the bucket covers. Buckets covering wider value ranges are assumed to contain values with proportionally lower frequencies. Note that this is just a [brave] guesswork. For a very narrow bucket, it may produce an estimate that's larger than total #rows in the table or even in the whole table. Make a conservative fix: make sure the produced estimate doesn't exceed the number of rows in the histogram's bucket.
-rw-r--r--mysql-test/main/selectivity.result66
-rw-r--r--mysql-test/main/selectivity.test74
-rw-r--r--mysql-test/main/selectivity_no_engine.result6
-rw-r--r--sql/sql_statistics.cc13
4 files changed, 151 insertions, 8 deletions
diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result
index b5a48d341ae..9836c832ca7 100644
--- a/mysql-test/main/selectivity.result
+++ b/mysql-test/main/selectivity.result
@@ -1946,4 +1946,70 @@ set histogram_size=@save_histogram_size;
set use_stat_tables= @save_use_stat_tables;
DROP TABLE t1;
# End of 10.2 tests
+#
+# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+#
+create table t0(a int);
+insert into t0 select 1 from seq_1_to_78;
+create table t1(a int);
+insert into t1 select 1 from seq_1_to_26;
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+insert into t10 select 34785 from t0, seq_1_to_8;
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+insert into t10 select 65216 from t0, seq_1_to_52;
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+insert into t10 select 65219 from t0;
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+insert into t10 select 73915 from t0, seq_1_to_40;
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+insert into t10 select 91302 from t0, seq_1_to_6;
+insert into t10 select 91303 from t1;
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+insert into t10 select 91305 from t0, seq_1_to_8;
+insert into t10 select 99998 from t1;
+insert into t10 select 99999 from t1;
+insert into t10 select 100000 from t1;
+analyze table t10 persistent for all;
+Table Op Msg_type Msg_text
+test.t10 analyze status Engine-independent statistics collected
+test.t10 analyze status OK
+flush tables;
+set optimizer_trace=1;
+explain select * from t10 where a in (91303);
+id select_type table type possible_keys key key_len ref rows Extra
+1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where
+# Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+from information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+[
+ [
+ {
+ "column_name": "a",
+ "ranges":
+ ["91303 <= a <= 91303"],
+ "selectivity_from_histogram": 0.0078125
+ }
+ ]
+]
+drop table t0,t1,t10;
set @@global.histogram_size=@save_histogram_size;
diff --git a/mysql-test/main/selectivity.test b/mysql-test/main/selectivity.test
index 4e4513d09d6..e8f265ca51b 100644
--- a/mysql-test/main/selectivity.test
+++ b/mysql-test/main/selectivity.test
@@ -1329,6 +1329,80 @@ DROP TABLE t1;
--echo # End of 10.2 tests
+--echo #
+--echo # MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+--echo #
+create table t0(a int); # This holds how many rows we hold in a bucket.
+insert into t0 select 1 from seq_1_to_78;
+
+create table t1(a int); # one-third of a bucket
+insert into t1 select 1 from seq_1_to_26;
+
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+
+
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+
+
+insert into t10 select 34785 from t0, seq_1_to_8;
+
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+
+insert into t10 select 65216 from t0, seq_1_to_52;
+
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+
+insert into t10 select 65219 from t0;
+
+
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+
+insert into t10 select 73915 from t0, seq_1_to_40;
+
+
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+
+insert into t10 select 91302 from t0, seq_1_to_6;
+
+insert into t10 select 91303 from t1; # Only 1/3rd of bucket matches the search tuple
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+
+insert into t10 select 91305 from t0, seq_1_to_8;
+
+insert into t10 select 99998 from t1;
+insert into t10 select 99999 from t1;
+insert into t10 select 100000 from t1;
+
+analyze table t10 persistent for all;
+flush tables;
+set optimizer_trace=1;
+explain select * from t10 where a in (91303);
+--echo # Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+from information_schema.optimizer_trace;
+
+drop table t0,t1,t10;
+
#
# Clean up
#
diff --git a/mysql-test/main/selectivity_no_engine.result b/mysql-test/main/selectivity_no_engine.result
index 3811b12a1be..fe8646ac5bf 100644
--- a/mysql-test/main/selectivity_no_engine.result
+++ b/mysql-test/main/selectivity_no_engine.result
@@ -36,12 +36,12 @@ test.t2 analyze status OK
# The following two must have the same in 'Extra' column:
explain extended select * from t2 where col1 IN (20, 180);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.08 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (20,180)
explain extended select * from t2 where col1 IN (180, 20);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.08 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (180,20)
drop table t1, t2;
@@ -102,7 +102,7 @@ test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
explain extended select * from t1 where col1 in (1,2,3);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 2.82 Using where
Warnings:
Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where `test`.`t1`.`col1` in (1,2,3)
# Must not cause fp division by zero, or produce nonsense numbers:
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index be011adb60c..5aca25d7a1b 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -3932,7 +3932,8 @@ double Histogram::point_selectivity(double pos, double avg_sel)
/*
So:
- each bucket has the same #rows
- - values are unformly distributed across the [min_value,max_value] domain.
+ - We assume that values are unformly distributed across the
+ [min_value,max_value] domain.
If a bucket has value range that's N times bigger then average, than
each value will have to have N times fewer rows than average.
@@ -3940,11 +3941,13 @@ double Histogram::point_selectivity(double pos, double avg_sel)
sel= avg_sel * avg_bucket_width / current_bucket_width;
/*
- (Q: if we just follow this proportion we may end up in a situation
- where number of different values we expect to find in this bucket
- exceeds the number of rows that this histogram has in a bucket. Are
- we ok with this or we would want to have certain caps?)
+ Note that this adjustment is just a (brave?) heuristic. What we know for
+ certain is that the searched value fits into one histogram bucket. Do not
+ return an estimate larger than that.
*/
+ double bucket_sel= 1.0/(get_width() + 1);
+ if (sel >= bucket_sel)
+ sel= bucket_sel;
}
return sel;
}