summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergei Petrunia <psergey@askmonty.org>2021-08-06 20:08:16 +0300
committerSergei Petrunia <psergey@askmonty.org>2021-08-06 20:08:16 +0300
commit4d3c434028649a0f14471ed4a1a4c10b97b78eb8 (patch)
tree4d90f38900b7c148e91ddee9ce29beaaf9dfe727
parent44f88fc96104365877b9ad8cc7106d48fc1d5448 (diff)
downloadmariadb-git-10.6-MDEV-21130-M4-notes.tar.gz
MDEV-21130: Histograms: use JSON as on-disk format10.6-MDEV-21130-M4-notes
A demo of how to use in-memory data structure for histogram. The patch shows how to * convert string form of data to binary form * compare two values in binary form * compute a fraction for val in [X, Y] range. grep for GSOC-TODO for notes.
-rw-r--r--sql/field.h3
-rw-r--r--sql/sql_statistics.cc256
-rw-r--r--sql/sql_statistics.h40
3 files changed, 289 insertions, 10 deletions
diff --git a/sql/field.h b/sql/field.h
index 4b64742b7b3..a24c6e8a300 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -1852,6 +1852,7 @@ public:
{
return (double) 0.5;
}
+ virtual bool pos_through_val_str() { return false;}
/*
Check if comparison between the field and an item unambiguously
@@ -2137,6 +2138,8 @@ public:
{
return pos_in_interval_val_str(min, max, length_size());
}
+ bool pos_through_val_str() override {return true;}
+
bool test_if_equality_guarantees_uniqueness(const Item *const_item) const
override;
SEL_ARG *get_mm_leaf(RANGE_OPT_PARAM *param, KEY_PART *key_part,
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 818d2b8d492..eed22d7ed77 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -1240,7 +1240,8 @@ public:
default:
return NULL;
}
- if (!hist->parse(mem_root, table_field->read_stats->histogram_type_on_disk,
+ if (!hist->parse(mem_root, table_field,
+ table_field->read_stats->histogram_type_on_disk,
(const uchar*)val.ptr(), val.length()))
{
table_field->read_stats->histogram_= hist;
@@ -1253,7 +1254,7 @@ public:
}
};
-bool Histogram_binary::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg)
+bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg)
{
// Just copy the data
size = (uint8) size_arg;
@@ -1288,21 +1289,260 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root, Histogram_type htyp
size = (uint8) size_arg;
}
-bool Histogram_json::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr, uint size_arg)
+bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg, const uchar *ptr, uint size_arg)
{
DBUG_ENTER("Histogram_json::parse");
type = type_arg;
const char *json = (char *)ptr;
int vt;
- bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets);
+ bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets_text);
if (!result)
{
my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
DBUG_RETURN(true);
}
+ size= hist_buckets_text.size();
+
+ /*
+ Convert the text based array into a data structure that allows lookups and
+ estimates
+ */
+ for (auto &s : hist_buckets_text)
+ {
+ field->store_text(s.data(), s.size(), &my_charset_bin);
+
+ // Get the value in "truncated key tuple format" here:
+ uchar buf[MAX_KEY_LENGTH];
+ uint len_to_copy= field->key_length();
+ uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
+ histogram_bounds.push_back(std::string((char*)buf, bytes));
+ }
+
DBUG_RETURN(false);
}
+
+static
+void store_key_image_to_rec_no_null(Field *field, uchar *ptr) {
+ MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
+ &field->table->write_set);
+ field->set_key_image(ptr, field->key_length());
+ dbug_tmp_restore_column_map(&field->table->write_set, old_map);
+}
+
+/*
+ GSOC-TODO:
+ This is our replacement for Field::pos_in_interval_val_real
+
+ We take midpoint_val and an interval [min_val, max_val], and return
+ a number between 0.0 and 1.0 which specifies how close midpoint_val is
+ to one of the bounds.
+
+ @param field Field object. We don't care about the field's current value
+ (actually, we overwrite it). We need it for its virtual
+ functions.
+
+*/
+double pos_in_interval_through_val_real(Field *field,
+ uchar* min_val,
+ uchar *max_val,
+ uchar *midpoint_val)
+{
+
+ // For each passed value: unpack it into Field's current value. Then, we can
+ // get the value as double.
+
+ store_key_image_to_rec_no_null(field, min_val);
+ double min_val_real= field->val_real();
+
+ store_key_image_to_rec_no_null(field, max_val);
+ double max_val_real= field->val_real();
+
+ store_key_image_to_rec_no_null(field, midpoint_val);
+ double midpoint_val_real= field->val_real();
+
+ // The code below is a copy of logic from Field::pos_in_interval_val_real:
+ double n, d;
+ n= midpoint_val_real - min_val_real;
+ if (n < 0)
+ return 0.0;
+ d= max_val_real - min_val_real;
+ if (d <= 0)
+ return 1.0;
+ return MY_MIN(n/d, 1.0);
+}
+
+// Copy-paste:
+static
+inline ulonglong char_prefix_to_ulonglong(uchar *src)
+{
+ uint sz= sizeof(ulonglong);
+ for (uint i= 0; i < sz/2; i++)
+ {
+ uchar tmp= src[i];
+ src[i]= src[sz-1-i];
+ src[sz-1-i]= tmp;
+ }
+ return uint8korr(src);
+}
+
+// copy-paste:
+static inline double safe_substract(ulonglong a, ulonglong b)
+{
+ return (a > b)? double(a - b) : -double(b - a);
+}
+
+/*
+ GSOC-TODO:
+ This is our replacement for Field::pos_in_interval_val_str
+
+ We take midpoint_val and an interval [min_val, max_val], and return
+ a number between 0.0 and 1.0 which specifies how close midpoint_val is
+ to one of the bounds.
+
+ @param field Field object. We don't care about the field's current value
+ (actually, we overwrite it). We need it for its virtual
+ functions.
+
+ @TODO
+ Instead of copying the pos_in_interval_val_str(), we should do better:
+ if all three passed values have a common prefix, skip it.
+ This will make the returned value more precise.
+
+*/
+
+double pos_in_interval_through_strxfrm(Field *field,
+ uchar *min_val,
+ uchar *max_val,
+ uchar *midpoint_val)
+{
+ // The code below is a copy of logic from Field::pos_in_interval_val_str
+ uchar mp_prefix[sizeof(ulonglong)];
+ uchar minp_prefix[sizeof(ulonglong)];
+ uchar maxp_prefix[sizeof(ulonglong)];
+ ulonglong mp, minp, maxp;
+
+ uint min_len= uint2korr(min_val);
+ uint max_len= uint2korr(max_val);
+ uint midpoint_len= uint2korr(midpoint_val);
+
+ auto cset= field->charset();
+
+ cset->strnxfrm(mp_prefix, sizeof(mp),
+ midpoint_val + HA_KEY_BLOB_LENGTH,
+ midpoint_len);
+ cset->strnxfrm(minp_prefix, sizeof(minp),
+ min_val + HA_KEY_BLOB_LENGTH,
+ min_len);
+ cset->strnxfrm(maxp_prefix, sizeof(maxp),
+ max_val + HA_KEY_BLOB_LENGTH,
+ max_len);
+ mp= char_prefix_to_ulonglong(mp_prefix);
+ minp= char_prefix_to_ulonglong(minp_prefix);
+ maxp= char_prefix_to_ulonglong(maxp_prefix);
+ double n, d;
+ n= safe_substract(mp, minp);
+ if (n < 0)
+ return 0.0;
+ d= safe_substract(maxp, minp);
+ if (d <= 0)
+ return 1.0;
+ return MY_MIN(n/d, 1.0);
+}
+
+
+/*
+ GSOC-TODO:
+ This is how range selectivity function should look like.
+
+ @param field The table field histogram is for. We don't care about the
+ field's current value, we only need its virtual functions to
+ perform various operations
+
+ @param min_endp, max_endp - this specifies the range.
+*/
+double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
+ key_range *max_endp)
+{
+ fprintf(stderr, "Histogram_json::range_selectivity_new\n");
+
+
+ /*
+ GSOC-TODO:
+ The code below is NOT what this function have.
+
+ == WHAT THIS CODE DOES ==
+ At the moment it does a linear walk through histogram_bounds and compares
+ min_endp to each of histogram bucket's min and max.
+ ATTENTION: This is a demo of how key_cmp() is used to compare the values.
+
+ When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END,
+ it computes a position of min_endp within the bucket.
+ ATTENTION: calls to pos_in_interval_.... are a demo of how to compute
+ position of a value within a [min,max] range.
+
+ == WHAT THIS CODE SHOULD DO ==
+ * Use binary search to locate the range [MIN_BUCKET; MAX_BUCKET] - the
+ set of buckets that overlaps with the search interval {min_endp, max_endp}.
+
+ * If the search interval covers MIN_BUCKET only partially, compute a
+ position of min_endp within the bucket.
+
+ * The same for max_endp.
+
+ * Compute the final selectivity and return it.
+ */
+ std::string prev_s;
+ bool have_prev_s=false;
+ for (auto &s : histogram_bounds)
+ {
+ if (!have_prev_s)
+ {
+ prev_s = s;
+ have_prev_s= true;
+ continue;
+ }
+
+ // It's a test code, so we only process min_endp.
+ if (min_endp)
+ {
+ const uchar *min_key= min_endp->key;
+ // TODO: also, properly handle SQL NULLs.
+ // in this test patch, we just assume the values are not SQL NULLs.
+ if (field->real_maybe_null())
+ min_key++;
+
+ int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
+ const char *str1="<";
+ if (res1>0) str1=">";
+ if (res1==0) str1="=";
+
+ int res2= field->key_cmp(min_key, (uchar*)s.data());
+ const char *str2="<";
+ if (res2>0) str2=">";
+ if (res2==0) str2="=";
+ fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
+
+ if (res1<0 && res2 < 0)
+ {
+ double sel;
+ if (field->pos_through_val_str())
+ sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(),
+ (uchar*)s.data(), (uchar*)min_key);
+ else
+ sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(),
+ (uchar*)s.data(), (uchar*)min_key);
+
+ fprintf(stderr, " pos_in_interval=%g\n", sel);
+ }
+
+ prev_s= s;
+ }
+ }
+ fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
+ return 0.5;
+}
+
void Histogram_json::serialize(Field *field)
{
field->store((char*)values, strlen((char*)values),
@@ -4107,8 +4347,14 @@ double get_column_range_cardinality(Field *field,
max_mp_pos= 1.0;
Histogram_base *hist = col_stats->histogram_;
- if (hist && hist->is_usable(thd))
+ if (hist && hist->is_usable(thd)) {
+ /*
+ GSOC-TODO: for now, we just call range_selectivity_new here.
+ */
+ sel= hist->range_selectivity_new(field, min_endp, max_endp);
+
sel= hist->range_selectivity(min_mp_pos, max_mp_pos);
+ }
else
sel= (max_mp_pos - min_mp_pos);
res= col_non_nulls * sel;
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 2673b36d050..6c498e17ac6 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -149,7 +149,7 @@ bool is_eits_usable(Field* field);
class Histogram_base : public Sql_alloc
{
public:
- virtual bool parse(MEM_ROOT *mem_root, Histogram_type type_arg,
+ virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
const uchar *ptr, uint size)= 0;
virtual void serialize(Field *to_field)= 0;
@@ -173,13 +173,19 @@ public:
virtual double point_selectivity(double pos, double avg_selection)=0;
+ virtual double range_selectivity_new(Field *field, key_range *min_endp,
+ key_range *max_endp)
+ {
+ return 1.0;
+ };
+
virtual ~Histogram_base(){}
};
class Histogram_binary : public Histogram_base
{
public:
- bool parse(MEM_ROOT *mem_root, Histogram_type type_arg,
+ bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg,
const uchar *ptr_arg, uint size_arg) override;
void serialize(Field *to_field) override;
@@ -341,11 +347,28 @@ class Histogram_json : public Histogram_base
private:
Histogram_type type;
uint8 size; /* Number of elements in the histogram*/
+
+ /*
+ GSOC-TODO: This is used for storing collected JSON text. Rename it
+ accordingly.
+ */
uchar *values;
- std::vector<std::string> hist_buckets;
+
+ // List of values in string form.
+ /*
+ GSOC-TODO: We don't need to save this. It can be a local variable in
+ parse().
+ Eventually we should get rid of this at all, as we can convert the
+ endpoints and add them to histogram_bounds as soon as we've read them.
+ */
+ std::vector<std::string> hist_buckets_text;
+
+ // Array of histogram bucket endpoints in KeyTupleFormat.
+ std::vector<std::string> histogram_bounds;
public:
- bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr, uint size) override;
+ bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
+ const uchar *ptr, uint size) override;
void serialize(Field *field) override;
@@ -364,7 +387,7 @@ public:
void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size) override;
- bool is_available() override {return get_width() > 0 && get_values(); }
+ bool is_available() override {return get_width() > 0 /*&& get_values()*/; }
bool is_usable(THD *thd) override
{
@@ -379,6 +402,13 @@ public:
double range_selectivity(double min_pos, double max_pos) override {return 0.1;}
double point_selectivity(double pos, double avg_selection) override {return 0.5;}
+
+ /*
+ GSOC-TODO: This function should eventually replace both range_selectivity()
+ and point_selectivity(). See its code for more details.
+ */
+ double range_selectivity_new(Field *field, key_range *min_endp,
+ key_range *max_endp) override;
};
class Columns_statistics;