summaryrefslogtreecommitdiff
path: root/sql/sql_statistics.h
diff options
context:
space:
mode:
authorSergei Petrunia <sergey@mariadb.com>2023-04-27 18:07:55 +0300
committerSergei Petrunia <sergey@mariadb.com>2023-04-27 20:08:16 +0300
commit5285832b7810a9bee4f1c36ac0a4caff113ae949 (patch)
tree02b4c144ba2792b5f0cc1f0bd8cda081fe8e6785 /sql/sql_statistics.h
parent8f87023d3f3fbaad4e33991713db884cbe052fbc (diff)
downloadmariadb-git-bb-10.6-mdev31067-variant3.tar.gz
MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram (branch: bb-10.6-mdev31067-variant3)
Variant #3. When Histogram::point_selectivity() sees that the point value of interest falls into one bucket, it tries to guess whether the bucket has many different (unpopular) values or a few popular values. (The number of rows in the bucket is fixed, as it's a height-balanced histogram.) The basis for this guess is the "width" of the value range the bucket covers: buckets covering wider value ranges are assumed to contain values with proportionally lower frequencies. This is just [brave] guesswork. For a very narrow bucket, it may produce an estimate that's larger than the total #rows in the bucket or even in the whole table. Remove it and replace it with another approach: compute the point selectivity by assuming that the number of rows that match a point column=val condition is the average #rows for a non-common value. A value is non-common if it takes up less than one whole histogram bucket.
Diffstat (limited to 'sql/sql_statistics.h')
-rw-r--r--sql/sql_statistics.h19
1 files changed, 17 insertions, 2 deletions
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 35b3aa33acc..8bd00437e67 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -148,6 +148,15 @@ private:
uint8 size; /* Size of values array, in bytes */
uchar *values;
+ /*
+ Number of popular values in the histogram. A value is considered popular if
+ it occupies one whole bucket or more than that.
+ */
+ uint n_popular_value;
+
+ /* Number of buckets that are fully occupied by popular values. */
+ uint n_popular_values_buckets;
+
uint prec_factor()
{
switch (type) {
@@ -223,6 +232,8 @@ private:
return i;
}
+ /* Re-compute n_popular_value and n_popular_values_buckets */
+ void update_popular_value_counts();
public:
uint get_size() { return (uint) size; }
@@ -235,7 +246,11 @@ public:
void set_type (Histogram_type t) { type= t; }
- void set_values (uchar *vals) { values= (uchar *) vals; }
+ void set_values(uchar *vals)
+ {
+ values= (uchar *) vals;
+ update_popular_value_counts();
+ }
bool is_available() { return get_size() > 0 && get_values(); }
@@ -287,7 +302,7 @@ public:
/*
Estimate selectivity of "col=const" using a histogram
*/
- double point_selectivity(double pos, double avg_sel);
+ double point_selectivity(double pos, double n_rows, double n_distinct);
};