6 files changed, 330 insertions, 55 deletions
diff --git a/mysql-test/main/selectivity.result b/mysql-test/main/selectivity.result
index b5a48d341ae..6159186644b 100644
--- a/mysql-test/main/selectivity.result
+++ b/mysql-test/main/selectivity.result
@@ -834,7 +834,7 @@ flush table t1;
 set optimizer_use_condition_selectivity=4;
 explain extended select * from t1 where a=0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	0.39	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	0.79	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
 drop table t1;
@@ -1649,7 +1649,7 @@ test.t1	analyze	status	Table is already up to date
 # Check what info the optimizer has about selectivities
 explain extended select * from t1 use index () where a in (17,51,5);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	3.90	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	3.91	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
 explain extended select * from t1 use index () where b=2;
@@ -1941,9 +1941,76 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	5	25.00	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
+DROP TABLE t1;
+# End of 10.2 tests
+#
+# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+#
+create table t0(a int);
+insert into t0 select 1 from seq_1_to_78;
+create table t1(a int);
+insert into t1 select 1 from seq_1_to_26;
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+insert into t10 select 34785 from t0, seq_1_to_8;
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+insert into t10 select 65216 from t0, seq_1_to_52;
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+insert into t10 select 65219 from t0;
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+insert into t10 select 73915 from t0, seq_1_to_40;
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+insert into t10 select 91302 from t0, seq_1_to_6;
+insert into t10 select 91303 from t1;
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+insert into t10 select 91305 from t0, seq_1_to_8;
+insert into t10 select  99998 from t1;
+insert into t10 select  99999 from t1;
+insert into t10 select 100000 from t1;
+set use_stat_tables=preferably;
+analyze table t10 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t10	analyze	status	Engine-independent statistics collected
+test.t10	analyze	status	OK
+flush tables;
+set statement optimizer_trace=1 for
+explain select * from t10  where a in (91303);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t10	ALL	NULL	NULL	NULL	NULL	9984	Using where
+# Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns')) 
+from information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+[
+    [
+        {
+            "column_name": "a",
+            "ranges": 
+            ["91303 <= a <= 91303"],
+            "selectivity_from_histogram": 0.035714283
+        }
+    ]
+]
+drop table t0,t1,t10;
 set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
 set histogram_size=@save_histogram_size;
 set use_stat_tables= @save_use_stat_tables;
-DROP TABLE t1;
-# End of 10.2 tests
 set @@global.histogram_size=@save_histogram_size;
diff --git a/mysql-test/main/selectivity.test b/mysql-test/main/selectivity.test
index 4e4513d09d6..556ffd76a22 100644
--- a/mysql-test/main/selectivity.test
+++ b/mysql-test/main/selectivity.test
@@ -1321,14 +1321,91 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
 FLUSH TABLES;
 
 EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
-set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
-set histogram_size=@save_histogram_size;
-set use_stat_tables= @save_use_stat_tables;
 
 DROP TABLE t1;
 
 --echo # End of 10.2 tests
 
+--echo #
+--echo # MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+--echo #
+create table t0(a int);  # This holds how many rows we hold in a bucket.
+insert into t0 select 1 from seq_1_to_78;
+
+create table t1(a int);  # one-third of a bucket
+insert into t1 select 1 from seq_1_to_26;
+
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+
+
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+
+
+insert into t10 select 34785 from t0, seq_1_to_8;
+
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+
+insert into t10 select 65216 from t0, seq_1_to_52;
+
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+
+insert into t10 select 65219 from t0;
+
+
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+
+insert into t10 select 73915 from t0, seq_1_to_40;
+
+
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+
+insert into t10 select 91302 from t0, seq_1_to_6;
+
+insert into t10 select 91303 from t1; # Only 1/3rd of bucket matches the search tuple
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+
+insert into t10 select 91305 from t0, seq_1_to_8;
+
+insert into t10 select  99998 from t1;
+insert into t10 select  99999 from t1;
+insert into t10 select 100000 from t1;
+
+set use_stat_tables=preferably;
+analyze table t10 persistent for all;
+flush tables;
+set statement optimizer_trace=1 for
+explain select * from t10  where a in (91303);
+
+#select * from information_schema.optimizer_trace;
+--echo # Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns')) 
+from information_schema.optimizer_trace;
+
+drop table t0,t1,t10;
+
+set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
+set histogram_size=@save_histogram_size;
+set use_stat_tables= @save_use_stat_tables;
 #
 # Clean up
 #
diff --git a/mysql-test/main/selectivity_innodb.result b/mysql-test/main/selectivity_innodb.result
index 3edf6d8e929..fd0cea27a81 100644
--- a/mysql-test/main/selectivity_innodb.result
+++ b/mysql-test/main/selectivity_innodb.result
@@ -845,7 +845,7 @@ flush table t1;
 set optimizer_use_condition_selectivity=4;
 explain extended select * from t1 where a=0;
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	0.39	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1025	0.79	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
 drop table t1;
@@ -1661,7 +1661,7 @@ test.t1	analyze	status	OK
 # Check what info the optimizer has about selectivities
 explain extended select * from t1 use index () where a in (17,51,5);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	3.90	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	3.91	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
 explain extended select * from t1 use index () where b=2;
@@ -1953,11 +1953,78 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
 1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	5	25.00	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
+DROP TABLE t1;
+# End of 10.2 tests
+#
+# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
+#
+create table t0(a int);
+insert into t0 select 1 from seq_1_to_78;
+create table t1(a int);
+insert into t1 select 1 from seq_1_to_26;
+create table t10 (a int);
+insert into t10 select 0 from t0, seq_1_to_4;
+insert into t10 select 8693 from t1;
+insert into t10 select 8694 from t1;
+insert into t10 select 8695 from t1;
+insert into t10 select 34783 from t1;
+insert into t10 select 34784 from t1;
+insert into t10 select 34785 from t1;
+insert into t10 select 34785 from t0, seq_1_to_8;
+insert into t10 select 65214 from t1;
+insert into t10 select 65215 from t1;
+insert into t10 select 65216 from t1;
+insert into t10 select 65216 from t0, seq_1_to_52;
+insert into t10 select 65217 from t1;
+insert into t10 select 65218 from t1;
+insert into t10 select 65219 from t1;
+insert into t10 select 65219 from t0;
+insert into t10 select 73913 from t1;
+insert into t10 select 73914 from t1;
+insert into t10 select 73915 from t1;
+insert into t10 select 73915 from t0, seq_1_to_40;
+insert into t10 select 78257 from t1;
+insert into t10 select 78258 from t1;
+insert into t10 select 78259 from t1;
+insert into t10 select 91300 from t1;
+insert into t10 select 91301 from t1;
+insert into t10 select 91302 from t1;
+insert into t10 select 91302 from t0, seq_1_to_6;
+insert into t10 select 91303 from t1;
+insert into t10 select 91304 from t1;
+insert into t10 select 91305 from t1;
+insert into t10 select 91305 from t0, seq_1_to_8;
+insert into t10 select  99998 from t1;
+insert into t10 select  99999 from t1;
+insert into t10 select 100000 from t1;
+set use_stat_tables=preferably;
+analyze table t10 persistent for all;
+Table	Op	Msg_type	Msg_text
+test.t10	analyze	status	Engine-independent statistics collected
+test.t10	analyze	status	OK
+flush tables;
+set statement optimizer_trace=1 for
+explain select * from t10  where a in (91303);
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t10	ALL	NULL	NULL	NULL	NULL	9984	Using where
+# Must have selectivity_from_histogram <= 1.0:
+select json_detailed(json_extract(trace, '$**.selectivity_for_columns')) 
+from information_schema.optimizer_trace;
+json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
+[
+    [
+        {
+            "column_name": "a",
+            "ranges": 
+            ["91303 <= a <= 91303"],
+            "selectivity_from_histogram": 0.035714283
+        }
+    ]
+]
+drop table t0,t1,t10;
 set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
 set histogram_size=@save_histogram_size;
 set use_stat_tables= @save_use_stat_tables;
-DROP TABLE t1;
-# End of 10.2 tests
 set @@global.histogram_size=@save_histogram_size;
 set optimizer_switch=@save_optimizer_switch_for_selectivity_test;
 set @tmp_ust= @@use_stat_tables;
diff --git a/mysql-test/main/selectivity_no_engine.result b/mysql-test/main/selectivity_no_engine.result
index 3811b12a1be..9cb1a54e0b1 100644
--- a/mysql-test/main/selectivity_no_engine.result
+++ b/mysql-test/main/selectivity_no_engine.result
@@ -36,12 +36,12 @@ test.t2	analyze	status	OK
 # The following two must have the same in 'Extra' column:
 explain extended select * from t2 where col1 IN (20, 180);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.35	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	2.00	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (20,180)
 explain extended select * from t2 where col1 IN (180, 20);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	1.35	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	1100	2.00	Using where
 Warnings:
 Note	1003	select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (180,20)
 drop table t1, t2;
@@ -102,7 +102,7 @@ test.t1	analyze	status	Engine-independent statistics collected
 test.t1	analyze	status	OK
 explain extended select * from t1 where col1 in (1,2,3);
 id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
-1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	10000	3.37	Using where
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	10000	3.00	Using where
 Warnings:
 Note	1003	select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where `test`.`t1`.`col1` in (1,2,3)
 # Must not cause fp division by zero, or produce nonsense numbers:
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index be011adb60c..7481d3fd6ae 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -3780,9 +3780,9 @@ double get_column_range_cardinality(Field *field,
                                  field->key_length());
           double pos= field->pos_in_interval(col_stats->min_value,
                                              col_stats->max_value);
+          double n_distinct= col_non_nulls / avg_frequency;
           res= col_non_nulls * 
-	       hist->point_selectivity(pos,
-                                       avg_frequency / col_non_nulls);
+               hist->point_selectivity(pos, col_non_nulls, n_distinct);
         }
       }
       else if (avg_frequency == 0.0)
@@ -3840,8 +3840,10 @@ double get_column_range_cardinality(Field *field,
   
   @param pos      Position of the "const" between column's min_value and 
                   max_value.  This is a number in [0..1] range.
-  @param avg_sel  Average selectivity of condition "col=const" in this table.
-                  It is calcuated as (#non_null_values / #distinct_values).
+  @param n_rows   Number of rows covered by the histogram. This is number 
+                  of rows in the table that have non-NULL values for this
+                  column.
+  @param n_distinct  Number of distinct values for the histogram column.
   
   @return
      Expected condition selectivity (a number between 0 and 1)
@@ -3868,7 +3870,8 @@ double get_column_range_cardinality(Field *field,
       value.
 */
 
-double Histogram::point_selectivity(double pos, double avg_sel)
+double Histogram::point_selectivity(double pos, double n_rows,
+                                    double n_distinct)
 {
   double sel;
   /* Find the bucket that contains the value 'pos'. */
@@ -3901,54 +3904,100 @@ double Histogram::point_selectivity(double pos, double avg_sel)
   }
   else
   {
-    /* 
+    double n_distinct_regular_values, regular_value_rows, rows_for_regular_val;
+    /*
       The value 'pos' fits within one single histogram bucket.
 
-      Histogram buckets have the same numbers of rows, but they cover
-      different ranges of values.
-
-      We assume that values are uniformly distributed across the [0..1] value
-      range.
-    */
-
-    /* 
-      If all buckets covered value ranges of the same size, the width of
-      value range would be:
+      We consider values that do not fit into one histogram values "popular".
+      The value we're looking for is "regular", not popular.
+      Compute the average selectivity for regular values.
     */
-    double avg_bucket_width= 1.0 / (get_width() + 1);
     
+    /* Number of distinct regular values in the table: */
+    n_distinct_regular_values= n_distinct - n_popular_values;
+
+    if (n_distinct_regular_values < 1.0)
+      goto handle_edge_case;
+
     /*
-      Let's see what is the width of value range that our bucket is covering.
-        (min==max currently. they are kept in the formula just in case we 
-         will want to extend it to handle multi-bucket case)
+      How many rows have regular values? Look at the fraction of histogram
+      buckets that are not occupied by popular values.
     */
-    double inv_prec_factor= (double) 1.0 / prec_factor(); 
-    double current_bucket_width= 
-        (max + 1 == get_width() ?  1.0 : (get_value(max) * inv_prec_factor)) -
-        (min == 0 ?  0.0 : (get_value(min-1) * inv_prec_factor));
+    regular_value_rows=
+      n_rows * (get_width() - n_popular_values_buckets) / get_width();
 
-    DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
+    if (regular_value_rows < 1.0)
+      goto handle_edge_case;
 
-    /*
-      So:
-      - each bucket has the same #rows 
-      - values are unformly distributed across the [min_value,max_value] domain.
+    /* Average #rows that a regular value has: */
+    rows_for_regular_val= regular_value_rows / n_distinct_regular_values;
 
-      If a bucket has value range that's N times bigger then average, than
-      each value will have to have N times fewer rows than average.
-    */
-    sel= avg_sel * avg_bucket_width / current_bucket_width;
+    /* Selectiving this many rows out of total table rows gives: */
+    sel= rows_for_regular_val / n_rows;
 
-    /*
-      (Q: if we just follow this proportion we may end up in a situation
-      where number of different values we expect to find in this bucket
-      exceeds the number of rows that this histogram has in a bucket. Are 
-      we ok with this or we would want to have certain caps?)
-    */
+    /* Handle possible errors: */
+    if (sel <= 0.0 || sel > 1.0 / (get_width()))
+    {
+      DBUG_ASSERT(0);
+    handle_edge_case:
+      sel= 1.0 / (get_width());
+    }
   }
   return sel;
 }
 
+
+/*
+  @brief
+    Compute n_popular_values and n_popular_values_buckets
+
+  @detail
+    They are used in point_selectivity().
+*/
+
+void Histogram::update_popular_value_counts()
+{
+  n_popular_values=0;
+  n_popular_values_buckets=0;
+
+  // This variable means "previous bucket was occupied by a popular value":
+  bool cur_value_popular= false;
+  uint prev_value= 0;
+
+  for (uint i=0; i < get_width(); i++)
+  {
+    uint left_endpoint= prev_value;
+    uint right_endpoint= get_value(i);
+
+    if (left_endpoint == right_endpoint)
+    {
+      /* This bucket is occupied by one value. It's a popular value */
+      n_popular_values_buckets++;
+      cur_value_popular= true;
+    }
+    else
+    {
+      if (cur_value_popular)
+      {
+        /*
+          The previous bucket (and maybe also buckets before it) was fully
+          occupied by a popular value.
+        */
+        n_popular_values++;
+
+        /* But current bucket is not (as left_endpoint!=right_endpoint) */
+        cur_value_popular= false;
+      }
+    }
+    prev_value= right_endpoint;
+  }
+
+  /* Check if the last bucket was occupied by a popular value */
+  if (cur_value_popular)
+    n_popular_values++;
+}
+
+
 /*
   Check whether the table is one of the persistent statistical tables.
 */
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 35b3aa33acc..8bd00437e67 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -148,6 +148,15 @@ private:
   uint8 size; /* Size of values array, in bytes */
   uchar *values;
 
+  /*
+    Number of popular values in the histogram. A value is considered popular if
+    it occupies one whole bucket or more than that.
+  */
+  uint n_popular_value;
+
+  /* Number of buckets that are fully occupied by popular values.  */
+  uint n_popular_values_buckets;
+
   uint prec_factor()
   {
     switch (type) {
@@ -223,6 +232,8 @@ private:
     return i;
   }
 
+  /* Re-compute n_popular_values and n_popular_values_buckets */
+  void update_popular_value_counts();
 public:
 
   uint get_size() { return (uint) size; }
@@ -235,7 +246,11 @@ public:
 
   void set_type (Histogram_type t) { type= t; }
 
-  void set_values (uchar *vals) { values= (uchar *) vals; }
+  void set_values(uchar *vals)
+  {
+    values= (uchar *) vals;
+    update_popular_value_counts();
+  }
 
   bool is_available() { return get_size() > 0 && get_values(); }
 
@@ -287,7 +302,7 @@ public:
   /*
     Estimate selectivity of "col=const" using a histogram
   */
-  double point_selectivity(double pos, double avg_sel);
+  double point_selectivity(double pos, double n_rows, double n_distinct);
 };