diff options
author | Sergei Petrunia <sergey@mariadb.com> | 2023-04-27 18:07:55 +0300 |
---|---|---|
committer | Sergei Petrunia <sergey@mariadb.com> | 2023-04-27 20:08:16 +0300 |
commit | 5285832b7810a9bee4f1c36ac0a4caff113ae949 (patch) | |
tree | 02b4c144ba2792b5f0cc1f0bd8cda081fe8e6785 /sql/sql_statistics.h | |
parent | 8f87023d3f3fbaad4e33991713db884cbe052fbc (diff) | |
download | mariadb-git-bb-10.6-mdev31067-variant3.tar.gz |
MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram (branch: bb-10.6-mdev31067-variant3)
Variant #3.
When Histogram::point_selectivity() sees that the point value of interest
falls into one bucket, it tries to guess whether the bucket has many
different (unpopular) values or a few popular values. (The number of
rows per bucket is fixed, as it's a height-balanced histogram.)
The basis for this guess is the "width" of the value range the bucket
covers. Buckets covering wider value ranges are assumed to contain
values with proportionally lower frequencies.
This is just [brave] guesswork. For a very narrow bucket, it may
produce an estimate that's larger than the total #rows in the bucket
or even in the whole table.
Remove it and replace it with another approach:
compute the point selectivity by assuming that the number of rows that
match a point column=val condition is the average #rows for a non-common
value. A value is non-common if it takes up less than one whole histogram
bucket.
Diffstat (limited to 'sql/sql_statistics.h')
-rw-r--r-- | sql/sql_statistics.h | 19 |
1 files changed, 17 insertions, 2 deletions
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h index 35b3aa33acc..8bd00437e67 100644 --- a/sql/sql_statistics.h +++ b/sql/sql_statistics.h @@ -148,6 +148,15 @@ private: uint8 size; /* Size of values array, in bytes */ uchar *values; + /* + Number of popular values in the histogram. A value is considered popular if + it occupies one whole bucket or more than that. + */ + uint n_popular_value; + + /* Number of buckets that are fully occupied by popular values. */ + uint n_popular_values_buckets; + uint prec_factor() { switch (type) { @@ -223,6 +232,8 @@ private: return i; } + /* Re-compute n_popular_values and n_popular_values_buckets */ + void update_popular_value_counts(); public: uint get_size() { return (uint) size; } @@ -235,7 +246,11 @@ public: void set_type (Histogram_type t) { type= t; } - void set_values (uchar *vals) { values= (uchar *) vals; } + void set_values(uchar *vals) + { + values= (uchar *) vals; + update_popular_value_counts(); + } bool is_available() { return get_size() > 0 && get_values(); } @@ -287,7 +302,7 @@ public: /* Estimate selectivity of "col=const" using a histogram */ - double point_selectivity(double pos, double avg_sel); + double point_selectivity(double pos, double n_rows, double n_distinct); }; |