MDEV-21130: Histograms: use JSON as on-disk format10.6-MDEV-21130-M4-notes

A demo of how to use in-memory data structure for histogram. The patch shows how to * convert string form of data to binary form * compare two values in binary form * compute a fraction for val in [X, Y] range. grep for GSOC-TODO for notes.
author: Sergei Petrunia <psergey@askmonty.org> 2021-08-06 20:08:16 +0300
committer: Sergei Petrunia <psergey@askmonty.org> 2021-08-06 20:08:16 +0300
commit: 4d3c434028649a0f14471ed4a1a4c10b97b78eb8 (patch)
tree: 4d90f38900b7c148e91ddee9ce29beaaf9dfe727
parent: 44f88fc96104365877b9ad8cc7106d48fc1d5448 (diff)
download: mariadb-git-10.6-MDEV-21130-M4-notes.tar.gz
3 files changed, 289 insertions, 10 deletions
diff --git a/sql/field.h b/sql/field.h
index 4b64742b7b3..a24c6e8a300 100644
--- a/sql/field.h
+++ b/sql/field.h
@@ -1852,6 +1852,7 @@ public:
   {
     return (double) 0.5; 
   }
+  virtual bool pos_through_val_str() { return false;}
 
   /*
     Check if comparison between the field and an item unambiguously
@@ -2137,6 +2138,8 @@ public:
   {
     return pos_in_interval_val_str(min, max, length_size());
   }
+  bool pos_through_val_str() override {return true;}
+
   bool test_if_equality_guarantees_uniqueness(const Item *const_item) const
     override;
   SEL_ARG *get_mm_leaf(RANGE_OPT_PARAM *param, KEY_PART *key_part,
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 818d2b8d492..eed22d7ed77 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -1240,7 +1240,8 @@ public:
       default:
         return NULL;
       }
-      if (!hist->parse(mem_root, table_field->read_stats->histogram_type_on_disk, 
+      if (!hist->parse(mem_root, table_field,
+                       table_field->read_stats->histogram_type_on_disk, 
                        (const uchar*)val.ptr(), val.length()))
       {
         table_field->read_stats->histogram_= hist;
@@ -1253,7 +1254,7 @@ public:
   }
 };
 
-bool Histogram_binary::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg)
+bool Histogram_binary::parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg, const uchar *ptr_arg, uint size_arg)
 {
   // Just copy the data
   size = (uint8) size_arg;
@@ -1288,21 +1289,260 @@ void Histogram_json::init_for_collection(MEM_ROOT *mem_root, Histogram_type htyp
   size = (uint8) size_arg;
 }
 
-bool Histogram_json::parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr, uint size_arg)
+bool Histogram_json::parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg, const uchar *ptr, uint size_arg)
 {
   DBUG_ENTER("Histogram_json::parse");
   type = type_arg;
   const char *json = (char *)ptr;
   int vt;
-  bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets);
+  bool result = json_get_array_items(json, json + strlen(json), &vt, hist_buckets_text);
   if (!result)
   {
     my_error(ER_JSON_HISTOGRAM_PARSE_FAILED, MYF(0), vt);
     DBUG_RETURN(true);
   }
+  size= hist_buckets_text.size();
+
+  /*
+    Convert the text based array into a data structure that allows lookups and
+    estimates
+  */
+  for (auto &s : hist_buckets_text)
+  {
+    field->store_text(s.data(), s.size(), &my_charset_bin);
+
+    // Get the value in "truncated key tuple format" here:
+    uchar buf[MAX_KEY_LENGTH];
+    uint len_to_copy= field->key_length();
+    uint bytes= field->get_key_image(buf, len_to_copy, Field::itRAW);
+    histogram_bounds.push_back(std::string((char*)buf, bytes));
+  }
+
   DBUG_RETURN(false);
 }
 
+
+static
+void store_key_image_to_rec_no_null(Field *field, uchar *ptr) {
+  MY_BITMAP *old_map= dbug_tmp_use_all_columns(field->table,
+                                    &field->table->write_set);
+  field->set_key_image(ptr, field->key_length()); 
+  dbug_tmp_restore_column_map(&field->table->write_set, old_map);
+}
+
+/*
+  GSOC-TODO:
+  This is our replacement for Field::pos_in_interval_val_real
+
+  We take midpoint_val and an interval [min_val, max_val], and return 
+  a number between 0.0 and 1.0 which specifies how close midpoint_val is 
+  to one of the bounds.
+  
+  @param field  Field object. We don't care about the field's current value
+                (actually, we overwrite it). We need it for its virtual
+                functions.
+                
+*/  
+double pos_in_interval_through_val_real(Field *field, 
+                                      uchar* min_val, 
+                                      uchar *max_val, 
+                                      uchar *midpoint_val)
+{
+
+  // For each passed value: unpack it into Field's current value. Then, we can
+  // get the value as double.
+
+  store_key_image_to_rec_no_null(field, min_val);
+  double min_val_real= field->val_real();
+  
+  store_key_image_to_rec_no_null(field, max_val);
+  double max_val_real= field->val_real();
+
+  store_key_image_to_rec_no_null(field, midpoint_val);
+  double midpoint_val_real= field->val_real();
+
+   // The code below is a copy of logic from Field::pos_in_interval_val_real:
+  double n, d;
+  n= midpoint_val_real - min_val_real;
+  if (n < 0)
+    return 0.0;
+  d= max_val_real - min_val_real;
+  if (d <= 0)
+    return 1.0;
+  return MY_MIN(n/d, 1.0);
+}
+
+// Copy-paste:
+static
+inline ulonglong char_prefix_to_ulonglong(uchar *src)
+{
+  uint sz= sizeof(ulonglong);
+  for (uint i= 0; i < sz/2; i++)
+  {
+    uchar tmp= src[i];
+    src[i]= src[sz-1-i];
+    src[sz-1-i]= tmp;
+  }
+  return uint8korr(src); 
+}
+
+// copy-paste:
+static inline double safe_substract(ulonglong a, ulonglong b)
+{
+  return (a > b)? double(a - b) : -double(b - a);
+}
+
+/*
+  GSOC-TODO: 
+  This is our replacement for Field::pos_in_interval_val_str
+
+  We take midpoint_val and an interval [min_val, max_val], and return 
+  a number between 0.0 and 1.0 which specifies how close midpoint_val is 
+  to one of the bounds.
+
+  @param field  Field object. We don't care about the field's current value
+                (actually, we overwrite it). We need it for its virtual
+                functions.
+
+  @TODO
+  Instead of copying the pos_in_interval_val_str(), we should do better:
+  if all three passed values have a common prefix, skip it.
+  This will make the returned value more precise.
+
+*/
+
+double pos_in_interval_through_strxfrm(Field *field, 
+                                       uchar *min_val, 
+                                       uchar *max_val, 
+                                       uchar *midpoint_val)
+{
+  // The code below is a copy of logic from Field::pos_in_interval_val_str
+  uchar mp_prefix[sizeof(ulonglong)];
+  uchar minp_prefix[sizeof(ulonglong)];
+  uchar maxp_prefix[sizeof(ulonglong)];
+  ulonglong mp, minp, maxp;
+
+  uint min_len= uint2korr(min_val);
+  uint max_len= uint2korr(max_val);
+  uint midpoint_len= uint2korr(midpoint_val);
+
+  auto cset= field->charset();
+
+  cset->strnxfrm(mp_prefix, sizeof(mp),
+                 midpoint_val + HA_KEY_BLOB_LENGTH,
+                 midpoint_len);
+  cset->strnxfrm(minp_prefix, sizeof(minp),
+                 min_val + HA_KEY_BLOB_LENGTH,
+                 min_len);
+  cset->strnxfrm(maxp_prefix, sizeof(maxp),
+                 max_val + HA_KEY_BLOB_LENGTH,
+                 max_len);
+  mp= char_prefix_to_ulonglong(mp_prefix);
+  minp= char_prefix_to_ulonglong(minp_prefix);
+  maxp= char_prefix_to_ulonglong(maxp_prefix);
+  double n, d;
+  n= safe_substract(mp, minp);
+  if (n < 0)
+    return 0.0;
+  d= safe_substract(maxp, minp);
+  if (d <= 0)
+    return 1.0;
+  return MY_MIN(n/d, 1.0);
+}
+
+
+/*
+  GSOC-TODO:
+  This is how range selectivity function should look like.
+
+  @param field  The table field histogram is for.  We don't care about the
+                 field's current value, we only need its virtual functions to 
+                 perform various operations
+  
+  @param min_endp, max_endp - this specifies the range.
+*/
+double Histogram_json::range_selectivity_new(Field *field, key_range *min_endp,
+                                             key_range *max_endp)
+{
+  fprintf(stderr, "Histogram_json::range_selectivity_new\n");
+
+
+  /*
+    GSOC-TODO: 
+    The code below is NOT what this function have. 
+
+    == WHAT THIS CODE DOES ==
+    At the moment it does a linear walk through histogram_bounds and compares 
+    min_endp to each of histogram bucket's min and max. 
+    ATTENTION:  This is a demo of how key_cmp() is used to compare the values.
+    
+    When it finds the bucket such that BUCKET_START < min_endp < BUCKET_END, 
+    it computes a position of min_endp within the bucket.
+    ATTENTION: calls to pos_in_interval_.... are a demo of how to compute 
+    position of a value within a [min,max] range.
+
+    == WHAT THIS CODE SHOULD DO ==
+    * Use binary search to locate the range  [MIN_BUCKET; MAX_BUCKET] - the
+      set of buckets that overlaps with the search interval {min_endp, max_endp}.
+
+    * If the search interval covers MIN_BUCKET only partially, compute a
+      position of min_endp within the bucket.
+
+    * The same for max_endp.
+
+    * Compute the final selectivity and return it.
+  */
+  std::string prev_s;
+  bool have_prev_s=false;
+  for (auto &s : histogram_bounds)
+  {
+    if (!have_prev_s)
+    {
+      prev_s = s;
+      have_prev_s= true;
+      continue;
+    }
+
+    // It's a test code, so we only process min_endp.
+    if (min_endp)
+    {
+      const uchar *min_key= min_endp->key;
+      // TODO: also, properly handle SQL NULLs.
+      // in this test patch, we just assume the values are not SQL NULLs.
+      if (field->real_maybe_null())
+        min_key++;
+
+      int res1= field->key_cmp((uchar*)prev_s.data(), min_key);
+      const char *str1="<";
+      if (res1>0) str1=">";
+      if (res1==0) str1="=";
+
+      int res2= field->key_cmp(min_key, (uchar*)s.data());
+      const char *str2="<";
+      if (res2>0) str2=">";
+      if (res2==0) str2="=";
+      fprintf(stderr, "prev_bound %s min_key %s bound\n", str1, str2);
+
+      if (res1<0 && res2 < 0)
+      {
+        double sel;
+        if (field->pos_through_val_str())
+          sel= pos_in_interval_through_strxfrm(field, (uchar*)prev_s.data(), 
+                                               (uchar*)s.data(), (uchar*)min_key);
+        else
+          sel= pos_in_interval_through_val_real(field, (uchar*)prev_s.data(), 
+                                                (uchar*)s.data(), (uchar*)min_key);
+
+        fprintf(stderr, "  pos_in_interval=%g\n", sel);
+      }
+
+      prev_s= s;
+    }
+  }
+  fprintf(stderr, "Histogram_json::range_selectivity_new ends\n");
+  return 0.5;
+}
+
 void Histogram_json::serialize(Field *field)
 {
   field->store((char*)values, strlen((char*)values),
@@ -4107,8 +4347,14 @@ double get_column_range_cardinality(Field *field,
         max_mp_pos= 1.0;
 
       Histogram_base *hist = col_stats->histogram_;
-      if (hist && hist->is_usable(thd))
+      if (hist && hist->is_usable(thd)) {
+        /*
+          GSOC-TODO: for now, we just call range_selectivity_new here.
+        */
+        sel= hist->range_selectivity_new(field, min_endp, max_endp);
+
         sel= hist->range_selectivity(min_mp_pos, max_mp_pos);
+      }
       else
         sel= (max_mp_pos - min_mp_pos);
       res= col_non_nulls * sel;
diff --git a/sql/sql_statistics.h b/sql/sql_statistics.h
index 2673b36d050..6c498e17ac6 100644
--- a/sql/sql_statistics.h
+++ b/sql/sql_statistics.h
@@ -149,7 +149,7 @@ bool is_eits_usable(Field* field);
 class Histogram_base : public Sql_alloc
 {
 public:
-  virtual bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, 
+  virtual bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
                      const uchar *ptr, uint size)= 0;
   virtual void serialize(Field *to_field)= 0;
 
@@ -173,13 +173,19 @@ public:
 
   virtual double point_selectivity(double pos, double avg_selection)=0;
 
+  virtual double range_selectivity_new(Field *field, key_range *min_endp,
+                                       key_range *max_endp)
+  {
+    return 1.0;
+  };
+
   virtual ~Histogram_base(){}
 };
 
 class Histogram_binary : public Histogram_base
 {
 public:
-  bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, 
+  bool parse(MEM_ROOT *mem_root, Field *, Histogram_type type_arg, 
              const uchar *ptr_arg, uint size_arg) override;
   void serialize(Field *to_field) override;
 
@@ -341,11 +347,28 @@ class Histogram_json : public Histogram_base
 private:
   Histogram_type type;
   uint8 size; /* Number of elements in the histogram*/
+
+  /*
+    GSOC-TODO: This is used for storing collected JSON text. Rename it
+    accordingly.
+  */
   uchar *values;
-  std::vector<std::string> hist_buckets;
+
+  // List of values in string form.
+  /* 
+    GSOC-TODO: We don't need to save this. It can be a local variable in
+    parse().
+    Eventually we should get rid of this at all, as we can convert the
+    endpoints and add them to histogram_bounds as soon as we've read them.
+  */
+  std::vector<std::string> hist_buckets_text;
+  
+  // Array of histogram bucket endpoints in KeyTupleFormat.
+  std::vector<std::string> histogram_bounds;
 
 public:
-  bool parse(MEM_ROOT *mem_root, Histogram_type type_arg, const uchar *ptr, uint size) override;
+  bool parse(MEM_ROOT *mem_root, Field *field, Histogram_type type_arg,
+            const uchar *ptr, uint size) override;
 
   void serialize(Field *field) override;
 
@@ -364,7 +387,7 @@ public:
 
   void init_for_collection(MEM_ROOT *mem_root, Histogram_type htype_arg, ulonglong size) override;
 
-  bool is_available() override {return get_width() > 0 && get_values(); }
+  bool is_available() override {return get_width() > 0 /*&& get_values()*/; }
 
   bool is_usable(THD *thd) override
   {
@@ -379,6 +402,13 @@ public:
   double range_selectivity(double min_pos, double max_pos) override {return 0.1;}
 
   double point_selectivity(double pos, double avg_selection) override {return 0.5;}
+
+  /*
+    GSOC-TODO: This function should eventually replace both range_selectivity()
+    and point_selectivity(). See its code for more details.
+  */
+  double range_selectivity_new(Field *field, key_range *min_endp,
+                                       key_range *max_endp) override;
 };
 
 class Columns_statistics;
author	Sergei Petrunia <psergey@askmonty.org>	2021-08-06 20:08:16 +0300
committer	Sergei Petrunia <psergey@askmonty.org>	2021-08-06 20:08:16 +0300
commit	4d3c434028649a0f14471ed4a1a4c10b97b78eb8 (patch)
tree	4d90f38900b7c148e91ddee9ce29beaaf9dfe727
parent	44f88fc96104365877b9ad8cc7106d48fc1d5448 (diff)
download	mariadb-git-10.6-MDEV-21130-M4-notes.tar.gz