MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogrambb-10.6-mdev31067-variant3

Variant #3. When Histogram::point_selectivity() sees that the point value of interest falls into one bucket, it tries to guess whether the bucket has many different (unpopular) values or a few popular values. (The number of rows is fixed, as it's a Height-balanced histogram). The basis for this guess is the "width" of the value range the bucket covers. Buckets covering wider value ranges are assumed to contain values with proportionally lower frequencies. This is just a [brave] guesswork. For a very narrow bucket, it may produce an estimate that's larger than total #rows in the bucket or even in the whole table. Remove it and replace with another approach: compute the point selectivity by assuming that the number of rows that matches a point column=val condition is the average #rows for non-common value. A value is non-common if it takes less than one whole histogram bucket.
author: Sergei Petrunia <sergey@mariadb.com> 2023-04-27 18:07:55 +0300
committer: Sergei Petrunia <sergey@mariadb.com> 2023-04-27 20:08:16 +0300
commit: 5285832b7810a9bee4f1c36ac0a4caff113ae949 (patch)
tree: 02b4c144ba2792b5f0cc1f0bd8cda081fe8e6785 /sql/sql_statistics.cc
parent: 8f87023d3f3fbaad4e33991713db884cbe052fbc (diff)
download: mariadb-git-bb-10.6-mdev31067-variant3.tar.gz
1 files changed, 88 insertions, 39 deletions
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index be011adb60c..7481d3fd6ae 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -3780,9 +3780,9 @@ double get_column_range_cardinality(Field *field,
                                  field->key_length());
           double pos= field->pos_in_interval(col_stats->min_value,
                                              col_stats->max_value);
+          double n_distinct= col_non_nulls / avg_frequency;
           res= col_non_nulls * 
-	       hist->point_selectivity(pos,
-                                       avg_frequency / col_non_nulls);
+               hist->point_selectivity(pos, col_non_nulls, n_distinct);
         }
       }
       else if (avg_frequency == 0.0)
@@ -3840,8 +3840,10 @@ double get_column_range_cardinality(Field *field,
   
   @param pos      Position of the "const" between column's min_value and 
                   max_value.  This is a number in [0..1] range.
-  @param avg_sel  Average selectivity of condition "col=const" in this table.
-                  It is calcuated as (#non_null_values / #distinct_values).
+  @param n_rows   Number of rows covered by the histogram. This is number 
+                  of rows in the table that have non-NULL values for this
+                  column.
+  @param n_distinct  Number of distinct values for the histogram column.
   
   @return
      Expected condition selectivity (a number between 0 and 1)
@@ -3868,7 +3870,8 @@ double get_column_range_cardinality(Field *field,
       value.
 */
 
-double Histogram::point_selectivity(double pos, double avg_sel)
+double Histogram::point_selectivity(double pos, double n_rows,
+                                    double n_distinct)
 {
   double sel;
   /* Find the bucket that contains the value 'pos'. */
@@ -3901,54 +3904,100 @@ double Histogram::point_selectivity(double pos, double avg_sel)
   }
   else
   {
-    /* 
+    double n_distinct_regular_values, regular_value_rows, rows_for_regular_val;
+    /*
       The value 'pos' fits within one single histogram bucket.
 
-      Histogram buckets have the same numbers of rows, but they cover
-      different ranges of values.
-
-      We assume that values are uniformly distributed across the [0..1] value
-      range.
-    */
-
-    /* 
-      If all buckets covered value ranges of the same size, the width of
-      value range would be:
+      We consider values that do not fit into one histogram values "popular".
+      The value we're looking for is "regular", not popular.
+      Compute the average selectivity for regular values.
     */
-    double avg_bucket_width= 1.0 / (get_width() + 1);
     
+    /* Number of distinct regular values in the table: */
+    n_distinct_regular_values= n_distinct - n_popular_values;
+
+    if (n_distinct_regular_values < 1.0)
+      goto handle_edge_case;
+
     /*
-      Let's see what is the width of value range that our bucket is covering.
-        (min==max currently. they are kept in the formula just in case we 
-         will want to extend it to handle multi-bucket case)
+      How many rows have regular values? Look at the fraction of histogram
+      buckets that are not occupied by popular values.
     */
-    double inv_prec_factor= (double) 1.0 / prec_factor(); 
-    double current_bucket_width= 
-        (max + 1 == get_width() ?  1.0 : (get_value(max) * inv_prec_factor)) -
-        (min == 0 ?  0.0 : (get_value(min-1) * inv_prec_factor));
+    regular_value_rows=
+      n_rows * (get_width() - n_popular_values_buckets) / get_width();
 
-    DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
+    if (regular_value_rows < 1.0)
+      goto handle_edge_case;
 
-    /*
-      So:
-      - each bucket has the same #rows 
-      - values are unformly distributed across the [min_value,max_value] domain.
+    /* Average #rows that a regular value has: */
+    rows_for_regular_val= regular_value_rows / n_distinct_regular_values;
 
-      If a bucket has value range that's N times bigger then average, than
-      each value will have to have N times fewer rows than average.
-    */
-    sel= avg_sel * avg_bucket_width / current_bucket_width;
+    /* Selectiving this many rows out of total table rows gives: */
+    sel= rows_for_regular_val / n_rows;
 
-    /*
-      (Q: if we just follow this proportion we may end up in a situation
-      where number of different values we expect to find in this bucket
-      exceeds the number of rows that this histogram has in a bucket. Are 
-      we ok with this or we would want to have certain caps?)
-    */
+    /* Handle possible errors: */
+    if (sel <= 0.0 || sel > 1.0 / (get_width()))
+    {
+      DBUG_ASSERT(0);
+    handle_edge_case:
+      sel= 1.0 / (get_width());
+    }
   }
   return sel;
 }
 
+
+/*
+  @brief
+    Compute n_popular_values and n_popular_values_buckets
+
+  @detail
+    They are used in point_selectivity().
+*/
+
+void Histogram::update_popular_value_counts()
+{
+  n_popular_values=0;
+  n_popular_values_buckets=0;
+
+  // This variable means "previous bucket was occupied by a popular value":
+  bool cur_value_popular= false;
+  uint prev_value= 0;
+
+  for (uint i=0; i < get_width(); i++)
+  {
+    uint left_endpoint= prev_value;
+    uint right_endpoint= get_value(i);
+
+    if (left_endpoint == right_endpoint)
+    {
+      /* This bucket is occupied by one value. It's a popular value */
+      n_popular_values_buckets++;
+      cur_value_popular= true;
+    }
+    else
+    {
+      if (cur_value_popular)
+      {
+        /*
+          The previous bucket (and maybe also buckets before it) was fully
+          occupied by a popular value.
+        */
+        n_popular_values++;
+
+        /* But current bucket is not (as left_endpoint!=right_endpoint) */
+        cur_value_popular= false;
+      }
+    }
+    prev_value= right_endpoint;
+  }
+
+  /* Check if the last bucket was occupied by a popular value */
+  if (cur_value_popular)
+    n_popular_values++;
+}
+
+
 /*
   Check whether the table is one of the persistent statistical tables.
 */
author	Sergei Petrunia <sergey@mariadb.com>	2023-04-27 18:07:55 +0300
committer	Sergei Petrunia <sergey@mariadb.com>	2023-04-27 20:08:16 +0300
commit	5285832b7810a9bee4f1c36ac0a4caff113ae949 (patch)
tree	02b4c144ba2792b5f0cc1f0bd8cda081fe8e6785 /sql/sql_statistics.cc
parent	8f87023d3f3fbaad4e33991713db884cbe052fbc (diff)
download	mariadb-git-bb-10.6-mdev31067-variant3.tar.gz