diff options
author | Sergey Petrunya <psergey@askmonty.org> | 2014-03-27 13:08:00 +0400 |
---|---|---|
committer | Sergey Petrunya <psergey@askmonty.org> | 2014-03-27 13:08:00 +0400 |
commit | 79a8a6130b0c43e98a64a1fde8f277e0df06da5d (patch) | |
tree | 2f1f0f604942584382be90140695865f890d88c9 /sql/sql_statistics.cc | |
parent | 0d67aafaa2c383b4d2d76f8621109f8adbfb2532 (diff) | |
download | mariadb-git-79a8a6130b0c43e98a64a1fde8f277e0df06da5d.tar.gz |
Code cleanup:
- Move [some] engine-agnostic tests from t/selectivity.test to t/selectivity_no_engine.test
- Move Histogram::point_selectivity to sql_statistics.cc
Diffstat (limited to 'sql/sql_statistics.cc')
-rw-r--r-- | sql/sql_statistics.cc | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc index fd521f7a2ec..67e7a9c304b 100644 --- a/sql/sql_statistics.cc +++ b/sql/sql_statistics.cc @@ -3565,3 +3565,120 @@ double get_column_range_cardinality(Field *field, } return res; } + + + +/* + Estimate selectivity of "col=const" using a histogram + + @param pos Position of the "const" between column's min_value and + max_value. This is a number in [0..1] range. + @param avg_sel Average selectivity of condition "col=const" in this table. + It is calcuated as (#non_null_values / #distinct_values). + + @return + Expected condition selectivity (a number between 0 and 1) + + @notes + [re_zero_length_buckets] If a bucket with zero value-length is in the + middle of the histogram, we will not have min==max. Example: suppose, + pos_value=0x12, and the histogram is: + + #n #n+1 #n+2 + ... 0x10 0x12 0x12 0x14 ... + | + +------------- bucket with zero value-length + + Here, we will get min=#n+1, max=#n+2, and use the multi-bucket formula. + + The problem happens at the histogram ends. if pos_value=0, and the + histogram is: + + 0x00 0x10 ... + + then min=0, max=0. This means pos_value is contained within bucket #0, + but on the other hand, histogram data says that the bucket has only one + value. +*/ + +double Histogram::point_selectivity(double pos, double avg_sel) +{ + double sel; + /* Find the bucket that contains the value 'pos'. */ + uint min= find_bucket(pos, TRUE); + uint pos_value= (uint) (pos * prec_factor()); + + /* Find how many buckets this value occupies */ + uint max= min; + while (max + 1 < get_width() && get_value(max + 1) == pos_value) + max++; + + /* + A special case: we're looking at a single bucket, and that bucket has + zero value-length. Use the multi-bucket formula (attempt to use + single-bucket formula will cause divison by zero). + + For more details see [re_zero_length_buckets] above. + */ + if (max == min && get_value(max) == ((max==0)? 0 : get_value(max-1))) + max++; + + if (max > min) + { + /* + The value occupies multiple buckets. Use start_bucket ... end_bucket as + selectivity. + */ + double bucket_sel= 1.0/(get_width() + 1); + sel= bucket_sel * (max - min + 1); + } + else + { + /* + The value 'pos' fits within one single histogram bucket. + + Histogram buckets have the same numbers of rows, but they cover + different ranges of values. + + We assume that values are uniformly distributed across the [0..1] value + range. + */ + + /* + If all buckets covered value ranges of the same size, the width of + value range would be: + */ + double avg_bucket_width= 1.0 / (get_width() + 1); + + /* + Let's see what is the width of value range that our bucket is covering. + (min==max currently. they are kept in the formula just in case we + will want to extend it to handle multi-bucket case) + */ + double inv_prec_factor= (double) 1.0 / prec_factor(); + double current_bucket_width= + (max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) - + (min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor)); + + DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */ + + /* + So: + - each bucket has the same #rows + - values are unformly distributed across the [min_value,max_value] domain. + + If a bucket has value range that's N times bigger then average, than + each value will have to have N times fewer rows than average. + */ + sel= avg_sel * avg_bucket_width / current_bucket_width; + + /* + (Q: if we just follow this proportion we may end up in a situation + where number of different values we expect to find in this bucket + exceeds the number of rows that this histogram has in a bucket. Are + we ok with this or we would want to have certain caps?) + */ + } + return sel; +} + |