summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--mysql-test/r/selectivity.result78
-rw-r--r--mysql-test/r/selectivity_innodb.result78
-rw-r--r--mysql-test/t/selectivity.test41
-rw-r--r--sql/sql_statistics.h123
4 files changed, 301 insertions, 19 deletions
diff --git a/mysql-test/r/selectivity.result b/mysql-test/r/selectivity.result
index 10d39eb174e..becfc02d6c4 100644
--- a/mysql-test/r/selectivity.result
+++ b/mysql-test/r/selectivity.result
@@ -1,4 +1,4 @@
-drop table if exists t1,t2,t3;
+drop table if exists t0,t1,t2,t3;
select @@global.use_stat_tables;
@@global.use_stat_tables
COMPLEMENTARY
@@ -826,7 +826,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 49.61 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
drop table t1;
@@ -1308,15 +1308,85 @@ test.t2 analyze status OK
# The following two must have the same in 'Extra' column:
explain extended select * from t2 where col1 IN (20, 180);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180))
explain extended select * from t2 where col1 IN (180, 20);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20))
drop table t1, t2;
+#
+# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select A.a from t0 A, t0 B, t0 C;
+set histogram_size=20;
+set histogram_type='single_prec_hb';
+analyze table t1 persistent for all;
+Table Op Msg_type Msg_text
+test.t1 analyze status OK
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+# Should select about 10%:
+explain extended select * from t1 where a=2;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2)
+# Should select about 10%:
+explain extended select * from t1 where a=1;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1)
+# Must not have filtered=100%:
+explain extended select * from t1 where a=0;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
+# Again, must not have filtered=100%:
+explain extended select * from t1 where a=-1;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = <cache>(-(1)))
+drop table t0, t1;
+#
+# MDEV-4362: Selectivity estimates for IN (...) do not depend on whether the values are in range
+#
+create table t1 (col1 int);
+set @a=-1;
+create table t2 (a int) select (@a:=@a+1) as a from information_schema.session_variables A limit 100;
+insert into t1 select A.a from t2 A, t2 B where A.a < 100 and B.a < 100;
+select min(col1), max(col1), count(*) from t1;
+min(col1) max(col1) count(*)
+0 99 10000
+set histogram_size=100;
+analyze table t1 persistent for all;
+Table Op Msg_type Msg_text
+test.t1 analyze status OK
+explain extended select * from t1 where col1 in (1,2,3);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (1,2,3))
+# Must not cause fp division by zero, or produce nonsense numbers:
+explain extended select * from t1 where col1 in (-1,-2,-3);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 5.94 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (<cache>(-(1)),<cache>(-(2)),<cache>(-(3))))
+explain extended select * from t1 where col1<=-1;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 1.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= <cache>(-(1)))
+drop table t1, t2;
set histogram_type=@save_histogram_type;
set histogram_size=@save_histogram_size;
set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;
diff --git a/mysql-test/r/selectivity_innodb.result b/mysql-test/r/selectivity_innodb.result
index 1af01855952..1761aa491d2 100644
--- a/mysql-test/r/selectivity_innodb.result
+++ b/mysql-test/r/selectivity_innodb.result
@@ -1,7 +1,7 @@
SET SESSION STORAGE_ENGINE='InnoDB';
set @save_optimizer_switch_for_selectivity_test=@@optimizer_switch;
set optimizer_switch='extended_keys=on';
-drop table if exists t1,t2,t3;
+drop table if exists t0,t1,t2,t3;
select @@global.use_stat_tables;
@@global.use_stat_tables
COMPLEMENTARY
@@ -835,7 +835,7 @@ flush table t1;
set optimizer_use_condition_selectivity=4;
explain extended select * from t1 where a=0;
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 49.61 Using where
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
Warnings:
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
drop table t1;
@@ -1318,15 +1318,85 @@ test.t2 analyze status OK
# The following two must have the same in 'Extra' column:
explain extended select * from t2 where col1 IN (20, 180);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (20,180))
explain extended select * from t2 where col1 IN (180, 20);
id select_type table type possible_keys key key_len ref rows filtered Extra
-1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.37 Using where
+1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
Warnings:
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where (`test`.`t2`.`col1` in (180,20))
drop table t1, t2;
+#
+# MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
+#
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select A.a from t0 A, t0 B, t0 C;
+set histogram_size=20;
+set histogram_type='single_prec_hb';
+analyze table t1 persistent for all;
+Table Op Msg_type Msg_text
+test.t1 analyze status OK
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+# Should select about 10%:
+explain extended select * from t1 where a=2;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 2)
+# Should select about 10%:
+explain extended select * from t1 where a=1;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 1)
+# Must not have filtered=100%:
+explain extended select * from t1 where a=0;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = 0)
+# Again, must not have filtered=100%:
+explain extended select * from t1 where a=-1;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 9.52 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where (`test`.`t1`.`a` = <cache>(-(1)))
+drop table t0, t1;
+#
+# MDEV-4362: Selectivity estimates for IN (...) do not depend on whether the values are in range
+#
+create table t1 (col1 int);
+set @a=-1;
+create table t2 (a int) select (@a:=@a+1) as a from information_schema.session_variables A limit 100;
+insert into t1 select A.a from t2 A, t2 B where A.a < 100 and B.a < 100;
+select min(col1), max(col1), count(*) from t1;
+min(col1) max(col1) count(*)
+0 99 10000
+set histogram_size=100;
+analyze table t1 persistent for all;
+Table Op Msg_type Msg_text
+test.t1 analyze status OK
+explain extended select * from t1 where col1 in (1,2,3);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (1,2,3))
+# Must not cause fp division by zero, or produce nonsense numbers:
+explain extended select * from t1 where col1 in (-1,-2,-3);
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 5.94 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` in (<cache>(-(1)),<cache>(-(2)),<cache>(-(3))))
+explain extended select * from t1 where col1<=-1;
+id select_type table type possible_keys key key_len ref rows filtered Extra
+1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 1.00 Using where
+Warnings:
+Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where (`test`.`t1`.`col1` <= <cache>(-(1)))
+drop table t1, t2;
set histogram_type=@save_histogram_type;
set histogram_size=@save_histogram_size;
set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;
diff --git a/mysql-test/t/selectivity.test b/mysql-test/t/selectivity.test
index 8774afdc554..6403c770287 100644
--- a/mysql-test/t/selectivity.test
+++ b/mysql-test/t/selectivity.test
@@ -1,7 +1,7 @@
--source include/have_stat_tables.inc
--disable_warnings
-drop table if exists t1,t2,t3;
+drop table if exists t0,t1,t2,t3;
--enable_warnings
select @@global.use_stat_tables;
@@ -885,6 +885,45 @@ explain extended select * from t2 where col1 IN (180, 20);
drop table t1, t2;
+--echo #
+--echo # MDEV-5926: EITS: Histogram estimates for column=least_possible_value are wrong
+--echo #
+create table t0(a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table t1(a int);
+insert into t1 select A.a from t0 A, t0 B, t0 C;
+set histogram_size=20;
+set histogram_type='single_prec_hb';
+analyze table t1 persistent for all;
+set use_stat_tables='preferably';
+set optimizer_use_condition_selectivity=4;
+--echo # Should select about 10%:
+explain extended select * from t1 where a=2;
+--echo # Should select about 10%:
+explain extended select * from t1 where a=1;
+--echo # Must not have filtered=100%:
+explain extended select * from t1 where a=0;
+--echo # Again, must not have filtered=100%:
+explain extended select * from t1 where a=-1;
+
+drop table t0, t1;
+
+--echo #
+--echo # MDEV-4362: Selectivity estimates for IN (...) do not depend on whether the values are in range
+--echo #
+create table t1 (col1 int);
+set @a=-1;
+create table t2 (a int) select (@a:=@a+1) as a from information_schema.session_variables A limit 100;
+insert into t1 select A.a from t2 A, t2 B where A.a < 100 and B.a < 100;
+select min(col1), max(col1), count(*) from t1;
+set histogram_size=100;
+analyze table t1 persistent for all;
+explain extended select * from t1 where col1 in (1,2,3);
+--echo # Must not cause fp division by zero, or produce nonsense numbers:
+explain extended select * from t1 where col1 in (-1,-2,-3);
+explain extended select * from t1 where col1<=-1;
+drop table t1, t2;
+
set histogram_type=@save_histogram_type;
set histogram_size=@save_histogram_size;
set optimizer_use_condition_selectivity=@save_optimizer_use_condition_selectivity;
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 68aacd69d98..d0db0a3bf33 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -113,7 +113,7 @@ class Histogram
private:
Histogram_type type;
- uint8 size;
+ uint8 size; /* Size of values array, in bytes */
uchar *values;
uint prec_factor()
@@ -142,6 +142,7 @@ public:
private:
uint get_value(uint i)
{
+ DBUG_ASSERT(i < get_width());
switch (type) {
case SINGLE_PREC_HB:
return (uint) (((uint8 *) values)[i]);
@@ -151,6 +152,7 @@ private:
return 0;
}
+ /* Find the bucket which value 'pos' falls into. */
uint find_bucket(double pos, bool first)
{
uint val= (uint) (pos * prec_factor());
@@ -169,6 +171,10 @@ private:
else
break;
}
+
+ if (val > get_value(i) && i < (get_width() - 1))
+ i++;
+
if (val == get_value(i))
{
if (first)
@@ -234,24 +240,121 @@ public:
sel= bucket_sel * (max - min + 1);
return sel;
}
+
+
+ /*
+ Estimate selectivity of "col=const" using a histogram
+
+ @param pos Position of the "const" between column's min_value and
+ max_value. This is a number in [0..1] range.
+ @param avg_sel Average selectivity of condition "col=const" in this table.
+ It is calcuated as (#non_null_values / #distinct_values).
+
+ @return
+ Expected condition selectivity (a number between 0 and 1)
+
+ @notes
+ [re_zero_length_buckets] If a bucket with zero value-length is in the
+ middle of the histogram, we will not have min==max. Example: suppose,
+ pos_value=0x12, and the histogram is:
+
+ #n #n+1 #n+2
+ ... 0x10 0x12 0x12 0x14 ...
+ |
+ +------------- bucket with zero value-length
+
+ Here, we will get min=#n+1, max=#n+2, and use the multi-bucket formula.
+
+ The problem happens at the histogram ends. if pos_value=0, and the
+ histogram is:
+
+ 0x00 0x10 ...
+
+ then min=0, max=0. This means pos_value is contained within bucket #0,
+ but on the other hand, histogram data says that the bucket has only one
+ value.
+ */
double point_selectivity(double pos, double avg_sel)
{
double sel;
- double bucket_sel= 1.0/(get_width() + 1);
+ /* Find the bucket that contains the value 'pos'. */
uint min= find_bucket(pos, TRUE);
+ uint pos_value= (uint) (pos * prec_factor());
+
+ /* Find how many buckets this value occupies */
uint max= min;
- while (max + 1 < get_width() && get_value(max + 1) == get_value(max))
+ while (max + 1 < get_width() && get_value(max + 1) == pos_value)
max++;
- double inv_prec_factor= (double) 1.0 / prec_factor();
- double width= (max + 1 == get_width() ?
- 1.0 : get_value(max) * inv_prec_factor) -
- (min == 0 ?
- 0.0 : get_value(min-1) * inv_prec_factor);
- sel= avg_sel * (bucket_sel * (max + 1 - min)) / width;
+
+ /*
+ A special case: we're looking at a single bucket, and that bucket has
+ zero value-length. Use the multi-bucket formula (attempt to use
+ single-bucket formula will cause divison by zero).
+
+ For more details see [re_zero_length_buckets] above.
+ */
+ if (max == min && get_value(max) == ((max==0)? 0 : get_value(max-1)))
+ max++;
+
+ if (max > min)
+ {
+ /*
+ The value occupies multiple buckets. Use start_bucket ... end_bucket as
+ selectivity.
+ */
+ double bucket_sel= 1.0/(get_width() + 1);
+ sel= bucket_sel * (max - min + 1);
+ }
+ else
+ {
+ /*
+ The value 'pos' fits within one single histogram bucket.
+
+ Histogram buckets have the same numbers of rows, but they cover
+ different ranges of values.
+
+ We assume that values are uniformly distributed across the [0..1] value
+ range.
+ */
+
+ /*
+ If all buckets covered value ranges of the same size, the width of
+ value range would be:
+ */
+ double avg_bucket_width= 1.0 / (get_width() + 1);
+
+ /*
+ Let's see what is the width of value range that our bucket is covering.
+ (min==max currently. they are kept in the formula just in case we
+ will want to extend it to handle multi-bucket case)
+ */
+ double inv_prec_factor= (double) 1.0 / prec_factor();
+ double current_bucket_width=
+ (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
+ (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));
+
+ DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
+
+ /*
+ So:
+ - each bucket has the same #rows
+ - values are unformly distributed across the [min_value,max_value] domain.
+
+ If a bucket has value range that's N times bigger then average, than
+ each value will have to have N times fewer rows than average.
+ */
+ sel= avg_sel * avg_bucket_width / current_bucket_width;
+
+ /*
+ (Q: if we just follow this proportion we may end up in a situation
+ where number of different values we expect to find in this bucket
+ exceeds the number of rows that this histogram has in a bucket. Are
+ we ok with this or we would want to have certain caps?)
+ */
+ }
return sel;
}
-
};