diff options
-rw-r--r-- | mysql-test/main/selectivity.result | 75 | ||||
-rw-r--r-- | mysql-test/main/selectivity.test | 83 | ||||
-rw-r--r-- | mysql-test/main/selectivity_innodb.result | 75 | ||||
-rw-r--r-- | mysql-test/main/selectivity_no_engine.result | 6 | ||||
-rw-r--r-- | sql/sql_statistics.cc | 127 | ||||
-rw-r--r-- | sql/sql_statistics.h | 19 |
6 files changed, 330 insertions, 55 deletions
diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result index b5a48d341ae..6159186644b 100644 --- a/mysql-test/main/selectivity.result +++ b/mysql-test/main/selectivity.result @@ -834,7 +834,7 @@ flush table t1; set optimizer_use_condition_selectivity=4; explain extended select * from t1 where a=0; id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.79 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0 drop table t1; @@ -1649,7 +1649,7 @@ test.t1 analyze status Table is already up to date # Check what info the optimizer has about selectivities explain extended select * from t1 use index () where a in (17,51,5); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5) explain extended select * from t1 use index () where b=2; @@ -1941,9 +1941,76 @@ id select_type table type possible_keys key key_len ref rows filtered Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2 +DROP TABLE t1; +# End of 10.2 tests +# +# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram +# +create table t0(a int); +insert into t0 select 1 from seq_1_to_78; +create table t1(a int); +insert into t1 select 1 from seq_1_to_26; +create table t10 (a int); +insert into t10 select 0 from t0, seq_1_to_4; +insert into t10 select 8693 from t1; +insert into t10 select 8694 from t1; +insert into t10 select 8695 from t1; +insert into t10 select 34783 from t1; +insert into t10 select 34784 from t1; +insert into t10 select 34785 from t1; +insert into t10 select 34785 from t0, seq_1_to_8; +insert into t10 select 65214 from t1; +insert into t10 select 65215 from t1; +insert into t10 select 65216 from t1; +insert into t10 select 65216 from t0, seq_1_to_52; +insert into t10 select 65217 from t1; +insert into t10 select 65218 from t1; +insert into t10 select 65219 from t1; +insert into t10 select 65219 from t0; +insert into t10 select 73913 from t1; +insert into t10 select 73914 from t1; +insert into t10 select 73915 from t1; +insert into t10 select 73915 from t0, seq_1_to_40; +insert into t10 select 78257 from t1; +insert into t10 select 78258 from t1; +insert into t10 select 78259 from t1; +insert into t10 select 91300 from t1; +insert into t10 select 91301 from t1; +insert into t10 select 91302 from t1; +insert into t10 select 91302 from t0, seq_1_to_6; +insert into t10 select 91303 from t1; +insert into t10 select 91304 from t1; +insert into t10 select 91305 from t1; +insert into t10 select 91305 from t0, seq_1_to_8; +insert into t10 select 99998 from t1; +insert into t10 select 99999 from t1; +insert into t10 select 100000 from t1; +set use_stat_tables=preferably; +analyze table t10 persistent for all; +Table Op Msg_type Msg_text +test.t10 analyze status Engine-independent statistics collected +test.t10 analyze status OK +flush tables; +set statement optimizer_trace=1 for +explain select * from t10 where a in (91303); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where +# Must have selectivity_from_histogram <= 1.0: +select json_detailed(json_extract(trace, '$**.selectivity_for_columns')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.selectivity_for_columns')) +[ + [ + { + "column_name": "a", + "ranges": + ["91303 <= a <= 91303"], + "selectivity_from_histogram": 0.035714283 + } + ] +] +drop table t0,t1,t10; set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity; set histogram_size=@save_histogram_size; set use_stat_tables= @save_use_stat_tables; -DROP TABLE t1; -# End of 10.2 tests set @@global.histogram_size=@save_histogram_size; diff --git a/mysql-test/main/selectivity.test b/mysql-test/main/selectivity.test index 4e4513d09d6..556ffd76a22 100644 --- a/mysql-test/main/selectivity.test +++ b/mysql-test/main/selectivity.test @@ -1321,14 +1321,91 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2; FLUSH TABLES; EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2; -set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity; -set histogram_size=@save_histogram_size; -set use_stat_tables= @save_use_stat_tables; DROP TABLE t1; --echo # End of 10.2 tests +--echo # +--echo # MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram +--echo # +create table t0(a int); # This holds how many rows we hold in a bucket. +insert into t0 select 1 from seq_1_to_78; + +create table t1(a int); # one-third of a bucket +insert into t1 select 1 from seq_1_to_26; + +create table t10 (a int); +insert into t10 select 0 from t0, seq_1_to_4; + +insert into t10 select 8693 from t1; +insert into t10 select 8694 from t1; +insert into t10 select 8695 from t1; + + +insert into t10 select 34783 from t1; +insert into t10 select 34784 from t1; +insert into t10 select 34785 from t1; + + +insert into t10 select 34785 from t0, seq_1_to_8; + +insert into t10 select 65214 from t1; +insert into t10 select 65215 from t1; +insert into t10 select 65216 from t1; + +insert into t10 select 65216 from t0, seq_1_to_52; + +insert into t10 select 65217 from t1; +insert into t10 select 65218 from t1; +insert into t10 select 65219 from t1; + +insert into t10 select 65219 from t0; + + +insert into t10 select 73913 from t1; +insert into t10 select 73914 from t1; +insert into t10 select 73915 from t1; + +insert into t10 select 73915 from t0, seq_1_to_40; + + +insert into t10 select 78257 from t1; +insert into t10 select 78258 from t1; +insert into t10 select 78259 from t1; + +insert into t10 select 91300 from t1; +insert into t10 select 91301 from t1; +insert into t10 select 91302 from t1; + +insert into t10 select 91302 from t0, seq_1_to_6; + +insert into t10 select 91303 from t1; # Only 1/3rd of bucket matches the search tuple +insert into t10 select 91304 from t1; +insert into t10 select 91305 from t1; + +insert into t10 select 91305 from t0, seq_1_to_8; + +insert into t10 select 99998 from t1; +insert into t10 select 99999 from t1; +insert into t10 select 100000 from t1; + +set use_stat_tables=preferably; +analyze table t10 persistent for all; +flush tables; +set statement optimizer_trace=1 for +explain select * from t10 where a in (91303); + +#select * from information_schema.optimizer_trace; +--echo # Must have selectivity_from_histogram <= 1.0: +select json_detailed(json_extract(trace, '$**.selectivity_for_columns')) +from information_schema.optimizer_trace; + +drop table t0,t1,t10; + +set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity; +set histogram_size=@save_histogram_size; +set use_stat_tables= @save_use_stat_tables; # # Clean up # diff --git a/mysql-test/main/selectivity_innodb.result b/mysql-test/main/selectivity_innodb.result index 3edf6d8e929..fd0cea27a81 100644 --- a/mysql-test/main/selectivity_innodb.result +++ b/mysql-test/main/selectivity_innodb.result @@ -845,7 +845,7 @@ flush table t1; set optimizer_use_condition_selectivity=4; explain extended select * from t1 where a=0; id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.79 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0 drop table t1; @@ -1661,7 +1661,7 @@ test.t1 analyze status OK # Check what info the optimizer has about selectivities explain extended select * from t1 use index () where a in (17,51,5); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5) explain extended select * from t1 use index () where b=2; @@ -1953,11 +1953,78 @@ id select_type table type possible_keys key key_len ref rows filtered Extra 1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where Warnings: Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2 +DROP TABLE t1; +# End of 10.2 tests +# +# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram +# +create table t0(a int); +insert into t0 select 1 from seq_1_to_78; +create table t1(a int); +insert into t1 select 1 from seq_1_to_26; +create table t10 (a int); +insert into t10 select 0 from t0, seq_1_to_4; +insert into t10 select 8693 from t1; +insert into t10 select 8694 from t1; +insert into t10 select 8695 from t1; +insert into t10 select 34783 from t1; +insert into t10 select 34784 from t1; +insert into t10 select 34785 from t1; +insert into t10 select 34785 from t0, seq_1_to_8; +insert into t10 select 65214 from t1; +insert into t10 select 65215 from t1; +insert into t10 select 65216 from t1; +insert into t10 select 65216 from t0, seq_1_to_52; +insert into t10 select 65217 from t1; +insert into t10 select 65218 from t1; +insert into t10 select 65219 from t1; +insert into t10 select 65219 from t0; +insert into t10 select 73913 from t1; +insert into t10 select 73914 from t1; +insert into t10 select 73915 from t1; +insert into t10 select 73915 from t0, seq_1_to_40; +insert into t10 select 78257 from t1; +insert into t10 select 78258 from t1; +insert into t10 select 78259 from t1; +insert into t10 select 91300 from t1; +insert into t10 select 91301 from t1; +insert into t10 select 91302 from t1; +insert into t10 select 91302 from t0, seq_1_to_6; +insert into t10 select 91303 from t1; +insert into t10 select 91304 from t1; +insert into t10 select 91305 from t1; +insert into t10 select 91305 from t0, seq_1_to_8; +insert into t10 select 99998 from t1; +insert into t10 select 99999 from t1; +insert into t10 select 100000 from t1; +set use_stat_tables=preferably; +analyze table t10 persistent for all; +Table Op Msg_type Msg_text +test.t10 analyze status Engine-independent statistics collected +test.t10 analyze status OK +flush tables; +set statement optimizer_trace=1 for +explain select * from t10 where a in (91303); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where +# Must have selectivity_from_histogram <= 1.0: +select json_detailed(json_extract(trace, '$**.selectivity_for_columns')) +from information_schema.optimizer_trace; +json_detailed(json_extract(trace, '$**.selectivity_for_columns')) +[ + [ + { + "column_name": "a", + "ranges": + ["91303 <= a <= 91303"], + "selectivity_from_histogram": 0.035714283 + } + ] +] +drop table t0,t1,t10; set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity; set histogram_size=@save_histogram_size; set use_stat_tables= @save_use_stat_tables; -DROP TABLE t1; -# End of 10.2 tests set @@global.histogram_size=@save_histogram_size; set optimizer_switch=@save_optimizer_switch_for_selectivity_test; set @tmp_ust= @@use_stat_tables; diff --git a/mysql-test/main/selectivity_no_engine.result b/mysql-test/main/selectivity_no_engine.result index 3811b12a1be..9cb1a54e0b1 100644 --- a/mysql-test/main/selectivity_no_engine.result +++ b/mysql-test/main/selectivity_no_engine.result @@ -36,12 +36,12 @@ test.t2 analyze status OK # The following two must have the same in 'Extra' column: explain extended select * from t2 where col1 IN (20, 180); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where +1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 2.00 Using where Warnings: Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (20,180) explain extended select * from t2 where col1 IN (180, 20); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where +1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 2.00 Using where Warnings: Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (180,20) drop table t1, t2; @@ -102,7 +102,7 @@ test.t1 analyze status Engine-independent statistics collected test.t1 analyze status OK explain extended select * from t1 where col1 in (1,2,3); id select_type table type possible_keys key key_len ref rows filtered Extra -1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where +1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.00 Using where Warnings: Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where `test`.`t1`.`col1` in (1,2,3) # Must not cause fp division by zero, or produce nonsense numbers: diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index be011adb60c..7481d3fd6ae 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -3780,9 +3780,9 @@ double get_column_range_cardinality(Field *field, field->key_length()); double pos= field->pos_in_interval(col_stats->min_value, col_stats->max_value); + double n_distinct= col_non_nulls / avg_frequency; res= col_non_nulls * - hist->point_selectivity(pos, - avg_frequency / col_non_nulls); + hist->point_selectivity(pos, col_non_nulls, n_distinct); } } else if (avg_frequency == 0.0) @@ -3840,8 +3840,10 @@ double get_column_range_cardinality(Field *field, @param pos Position of the "const" between column's min_value and max_value. This is a number in [0..1] range. - @param avg_sel Average selectivity of condition "col=const" in this table. - It is calcuated as (#non_null_values / #distinct_values). + @param n_rows Number of rows covered by the histogram. This is number + of rows in the table that have non-NULL values for this + column. + @param n_distinct Number of distinct values for the histogram column. @return Expected condition selectivity (a number between 0 and 1) @@ -3868,7 +3870,8 @@ double get_column_range_cardinality(Field *field, value. */ -double Histogram::point_selectivity(double pos, double avg_sel) +double Histogram::point_selectivity(double pos, double n_rows, + double n_distinct) { double sel; /* Find the bucket that contains the value 'pos'. */ @@ -3901,54 +3904,100 @@ double Histogram::point_selectivity(double pos, double avg_sel) } else { - /* + double n_distinct_regular_values, regular_value_rows, rows_for_regular_val; + /* The value 'pos' fits within one single histogram bucket. - Histogram buckets have the same numbers of rows, but they cover - different ranges of values. - - We assume that values are uniformly distributed across the [0..1] value - range. - */ - - /* - If all buckets covered value ranges of the same size, the width of - value range would be: + We consider values that do not fit into one histogram values "popular". + The value we're looking for is "regular", not popular. + Compute the average selectivity for regular values. */ - double avg_bucket_width= 1.0 / (get_width() + 1); + /* Number of distinct regular values in the table: */ + n_distinct_regular_values= n_distinct - n_popular_values; + + if (n_distinct_regular_values < 1.0) + goto handle_edge_case; + /* - Let's see what is the width of value range that our bucket is covering. - (min==max currently. they are kept in the formula just in case we - will want to extend it to handle multi-bucket case) + How many rows have regular values? Look at the fraction of histogram + buckets that are not occupied by popular values. */ - double inv_prec_factor= (double) 1.0 / prec_factor(); - double current_bucket_width= - (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) - - (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor)); + regular_value_rows= + n_rows * (get_width() - n_popular_values_buckets) / get_width(); - DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */ + if (regular_value_rows < 1.0) + goto handle_edge_case; - /* - So: - - each bucket has the same #rows - - values are unformly distributed across the [min_value,max_value] domain. + /* Average #rows that a regular value has: */ + rows_for_regular_val= regular_value_rows / n_distinct_regular_values; - If a bucket has value range that's N times bigger then average, than - each value will have to have N times fewer rows than average. - */ - sel= avg_sel * avg_bucket_width / current_bucket_width; + /* Selectiving this many rows out of total table rows gives: */ + sel= rows_for_regular_val / n_rows; - /* - (Q: if we just follow this proportion we may end up in a situation - where number of different values we expect to find in this bucket - exceeds the number of rows that this histogram has in a bucket. Are - we ok with this or we would want to have certain caps?) - */ + /* Handle possible errors: */ + if (sel <= 0.0 || sel > 1.0 / (get_width())) + { + DBUG_ASSERT(0); + handle_edge_case: + sel= 1.0 / (get_width()); + } } return sel; } + +/* + @brief + Compute n_popular_values and n_popular_values_buckets + + @detail + They are used in point_selectivity(). +*/ + +void Histogram::update_popular_value_counts() +{ + n_popular_values=0; + n_popular_values_buckets=0; + + // This variable means "previous bucket was occupied by a popular value": + bool cur_value_popular= false; + uint prev_value= 0; + + for (uint i=0; i < get_width(); i++) + { + uint left_endpoint= prev_value; + uint right_endpoint= get_value(i); + + if (left_endpoint == right_endpoint) + { + /* This bucket is occupied by one value. It's a popular value */ + n_popular_values_buckets++; + cur_value_popular= true; + } + else + { + if (cur_value_popular) + { + /* + The previous bucket (and maybe also buckets before it) was fully + occupied by a popular value. + */ + n_popular_values++; + + /* But current bucket is not (as left_endpoint!=right_endpoint) */ + cur_value_popular= false; + } + } + prev_value= right_endpoint; + } + + /* Check if the last bucket was occupied by a popular value */ + if (cur_value_popular) + n_popular_values++; +} + + /* Check whether the table is one of the persistent statistical tables. */ diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 35b3aa33acc..8bd00437e67 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -148,6 +148,15 @@ private: uint8 size; /* Size of values array, in bytes */ uchar *values; + /* + Number of popular values in the histogram. A value is considered popular if + it occupies one whole bucket or more than that. + */ + uint n_popular_value; + + /* Number of buckets that are fully occupied by popular values. */ + uint n_popular_values_buckets; + uint prec_factor() { switch (type) { @@ -223,6 +232,8 @@ private: return i; } + /* Re-compute n_popular_values and n_popular_values_buckets */ + void update_popular_value_counts(); public: uint get_size() { return (uint) size; } @@ -235,7 +246,11 @@ public: void set_type (Histogram_type t) { type= t; } - void set_values (uchar *vals) { values= (uchar *) vals; } + void set_values(uchar *vals) + { + values= (uchar *) vals; + update_popular_value_counts(); + } bool is_available() { return get_size() > 0 && get_values(); } @@ -287,7 +302,7 @@ public: /* Estimate selectivity of "col=const" using a histogram */ - double point_selectivity(double pos, double avg_sel); + double point_selectivity(double pos, double n_rows, double n_distinct); }; |