author     Mattias Jonsson <mattias.jonsson@oracle.com>   2012-02-22 23:13:36 +0100
committer  Mattias Jonsson <mattias.jonsson@oracle.com>   2012-02-22 23:13:36 +0100
commit     8325fe02b3f2cb94fb99ac72dd837d9db0c037ba (patch)
tree       0c7483648c83a2186a53487994a5de5ffcccd422 /sql
parent     b566d9a512a9671a423e1ff944d1e4ede2df7f7e (diff)
download   mariadb-git-8325fe02b3f2cb94fb99ac72dd837d9db0c037ba.tar.gz
Bug#13694811: THE OPTIMIZER WRONGLY USES THE FIRST INNODB
PARTITION STATISTICS
The problem was introduced by the fix for bug#11756867: it always used
the first partitions, and stopped after it had checked 10
[sub]partitions (or until it found a partition which would contain a
match). This results in bad statistics for tables where the first 10
partitions don't represent the majority of the data, for example when
the first 10 partitions contain only a few rows in total.
The solution is to take statistics from the partitions containing the
most rows instead:
Added an array of partition ids, sorted by number of records in
descending order. This array is used in records_in_range to cover as
many records as possible in as few calls as possible, as illustrated
by the sketch below.
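
For illustration only, the ordering step is roughly equivalent to the
following standalone sketch (std::sort and all names here are
stand-ins; the patch itself uses my_qsort2 with the
compare_number_of_records callback shown in the diff below):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    /* Sketch: order partition ids so that the partitions holding the
       most rows come first. rec_per_part stands in for the
       per-partition stats.records values. */
    std::vector<uint32_t>
    sort_parts_by_records(const std::vector<uint64_t> &rec_per_part)
    {
      std::vector<uint32_t> ids(rec_per_part.size());
      for (uint32_t i= 0; i < ids.size(); i++)
        ids[i]= i;
      /* Note: compare with '>' to sort in descending order. */
      std::sort(ids.begin(), ids.end(),
                [&rec_per_part](uint32_t a, uint32_t b)
                { return rec_per_part[a] > rec_per_part[b]; });
      return ids;
    }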
Also changed the limit on how many partitions to use for the statistics
from a static maximum of 10 partitions to a dynamic model:
the maximum number of partitions is now log2(total number of
partitions), taken from the ordered array.
It will continue calling the partitions' records_in_range until the
checked partitions together contain at least this many rows
(a worked example follows below):
(total rows in matching partitions) * (maximum number of partitions)
/ (number of used partitions)
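
As a worked example (all numbers assumed for illustration): with 128
total partitions the limit is log2(128) = 7 partitions; if the matching
partitions hold 1,000,000 rows spread over 100 used partitions, sampling
stops once the checked partitions together hold at least
1,000,000 * 7 / 100 = 70,000 rows. A condensed standalone sketch of the
threshold computation, with plain integer types standing in for ha_rows
and the handler's stats.records (the patch implements this in
min_rows_for_estimate(), shown in the diff below):

    #include <cstdint>

    /* Sketch of the dynamic sampling threshold. total_rows stands in
       for stats.records, i.e. rows in all matching partitions. */
    static uint64_t min_rows_for_estimate(uint64_t tot_parts,
                                          uint64_t used_parts,
                                          uint64_t total_rows)
    {
      /* max_parts = floor(log2(tot_parts)), computed by doubling. */
      uint64_t max_parts= 1;
      for (uint64_t i= 2; i < tot_parts; i<<= 1)
        max_parts++;
      /* Never more partitions than are actually used. */
      if (max_parts > used_parts)
        max_parts= used_parts;
      /* Keep sampling (biggest partitions first) until the checked
         partitions cover at least this many rows. */
      return total_rows * max_parts / used_parts;
    }

The estimate is then extrapolated as (matches found in the checked
partitions) * (total rows) / (rows in the checked partitions), which is
what the new records_in_range() returns.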
Also reverted ha_partition::scan_time() and
ha_partition::estimate_rows_upper_bound() to their implementations from
before the fix of bug#11756867, since they are not as slow as
records_in_range.
Diffstat (limited to 'sql')
-rw-r--r--  sql/ha_partition.cc  300
-rw-r--r--  sql/ha_partition.h    24
2 files changed, 207 insertions, 117 deletions
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 7c7cf5a4302..ddfe271f6a3 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -286,6 +286,7 @@ void ha_partition::init_handler_variables()
   m_is_sub_partitioned= 0;
   m_is_clone_of= NULL;
   m_clone_mem_root= NULL;
+  m_part_ids_sorted_by_num_of_records= NULL;
 
 #ifdef DONT_HAVE_TO_BE_INITALIZED
   m_start_key.flag= 0;
@@ -321,6 +322,8 @@ ha_partition::~ha_partition()
       delete m_file[i];
   }
   my_free((char*) m_ordered_rec_buffer, MYF(MY_ALLOW_ZERO_PTR));
+  my_free((char*) m_part_ids_sorted_by_num_of_records,
+          MYF(MY_ALLOW_ZERO_PTR));
 
   clear_handler_file();
   DBUG_VOID_RETURN;
@@ -2638,6 +2641,16 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
       m_start_key.key= (const uchar*)ptr;
     }
   }
+  if (!m_part_ids_sorted_by_num_of_records)
+  {
+    if (!(m_part_ids_sorted_by_num_of_records=
+            (uint32*) my_malloc(m_tot_parts * sizeof(uint32), MYF(MY_WME))))
+      DBUG_RETURN(error);
+    uint32 i;
+    /* Initialize it with all partition ids. */
+    for (i= 0; i < m_tot_parts; i++)
+      m_part_ids_sorted_by_num_of_records[i]= i;
+  }
 
   /* Initialize the bitmap we use to minimize ha_start_bulk_insert calls */
   if (bitmap_init(&m_bulk_insert_started, NULL, m_tot_parts + 1, FALSE))
@@ -5146,6 +5159,24 @@ int ha_partition::handle_ordered_prev(uchar *buf)
   and read_time calls
 */
 
+/**
+  Helper function for sorting according to number of rows in descending order.
+*/
+
+int ha_partition::compare_number_of_records(ha_partition *me,
+                                            const uint32 *a,
+                                            const uint32 *b)
+{
+  handler **file= me->m_file;
+  /* Note: sorting in descending order! */
+  if (file[*a]->stats.records > file[*b]->stats.records)
+    return -1;
+  if (file[*a]->stats.records < file[*b]->stats.records)
+    return 1;
+  return 0;
+}
+
+
 /*
   General method to gather info from handler
 
@@ -5387,6 +5418,15 @@ int ha_partition::info(uint flag)
       }
       i++;
     } while (*(++file_array));
+    /*
+      Sort the array of part_ids by number of records in
+      in descending order.
+    */
+    my_qsort2((void*) m_part_ids_sorted_by_num_of_records,
+              m_tot_parts,
+              sizeof(uint32),
+              (qsort2_cmp) compare_number_of_records,
+              this);
 
     file= m_file[handler_instance];
     file->info(HA_STATUS_CONST);
@@ -6124,116 +6164,113 @@ const key_map *ha_partition::keys_to_use_for_scanning()
   DBUG_RETURN(m_file[0]->keys_to_use_for_scanning());
 }
 
-#define MAX_PARTS_FOR_OPTIMIZER_CALLS 10
-/*
-  Prepare start variables for estimating optimizer costs.
-
-  @param[out] num_used_parts  Number of partitions after pruning.
-  @param[out] check_min_num   Number of partitions to call.
-  @param[out] first           first used partition.
+/**
+  Minimum number of rows to base optimizer estimate on.
 */
-void ha_partition::partitions_optimizer_call_preparations(uint *first,
-                                                          uint *num_used_parts,
-                                                          uint *check_min_num)
+
+ha_rows ha_partition::min_rows_for_estimate()
 {
-  *first= bitmap_get_first_set(&(m_part_info->used_partitions));
-  *num_used_parts= bitmap_bits_set(&(m_part_info->used_partitions));
-  *check_min_num= min(MAX_PARTS_FOR_OPTIMIZER_CALLS, *num_used_parts);
+  uint i, max_used_partitions, tot_used_partitions;
+  DBUG_ENTER("ha_partition::partitions_optimizer_call_preparations");
+
+  tot_used_partitions= bitmap_bits_set(&m_part_info->used_partitions);
+  DBUG_ASSERT(tot_used_partitions);
+
+  /*
+    Allow O(log2(tot_partitions)) increase in number of used partitions.
+    This gives O(1/log2(tot_partitions)) of rows to base the estimate on.
+    I.e when the total number of partitions doubles, allow one more
+    partition to be checked.
+  */
+  i= 2;
+  max_used_partitions= 1;
+  while (i < m_tot_parts)
+  {
+    max_used_partitions++;
+    i= i << 1;
+  }
+  if (max_used_partitions > tot_used_partitions)
+    max_used_partitions= tot_used_partitions;
+
+  /* stats.records is already updated by the info(HA_STATUS_VARIABLE) call. */
+  DBUG_PRINT("info", ("max_used_partitions: %u tot_rows: %lu",
+                      max_used_partitions,
+                      (ulong) stats.records));
+  DBUG_PRINT("info", ("tot_used_partitions: %u min_rows_to_check: %lu",
+                      tot_used_partitions,
+                      (ulong) stats.records * max_used_partitions
+                              / tot_used_partitions));
+  DBUG_RETURN(stats.records * max_used_partitions / tot_used_partitions);
 }
 
 
-/*
-  Return time for a scan of the table
+/**
+  Get the biggest used partition.
 
-  SYNOPSIS
-    scan_time()
+  Starting at the N:th biggest partition and skips all non used
+  partitions, returning the biggest used partition found
 
-  RETURN VALUE
-    time for scan
+  @param[in,out] part_index  Skip the *part_index biggest partitions
+
+  @return The biggest used partition with index not lower than *part_index.
+    @retval NO_CURRENT_PART_ID     No more partition used.
+    @retval != NO_CURRENT_PART_ID  partition id of biggest used partition with
+                                   index >= *part_index supplied. Note that
+                                   *part_index will be updated to the next
+                                   partition index to use.
 */
 
-double ha_partition::scan_time()
+uint ha_partition::get_biggest_used_partition(uint *part_index)
 {
-  double scan_time= 0.0;
-  uint first, part_id, num_used_parts, check_min_num, partitions_called= 0;
-  DBUG_ENTER("ha_partition::scan_time");
-
-  partitions_optimizer_call_preparations(&first, &num_used_parts, &check_min_num);
-  for (part_id= first; partitions_called < num_used_parts ; part_id++)
+  uint part_id;
+  while ((*part_index) < m_tot_parts)
   {
-    if (!bitmap_is_set(&(m_part_info->used_partitions), part_id))
-      continue;
-    scan_time+= m_file[part_id]->scan_time();
-    partitions_called++;
-    if (partitions_called >= check_min_num && scan_time != 0.0)
-    {
-      DBUG_RETURN(scan_time *
-                      (double) num_used_parts / (double) partitions_called);
-    }
+    part_id= m_part_ids_sorted_by_num_of_records[(*part_index)++];
+    if (bitmap_is_set(&m_part_info->used_partitions, part_id))
+      return part_id;
   }
-  DBUG_RETURN(scan_time);
+  return NO_CURRENT_PART_ID;
 }
 
 
 /*
-  Estimate rows for records_in_range or estimate_rows_upper_bound.
+  Return time for a scan of the table
 
-  @param is_records_in_range call records_in_range instead of
-                             estimate_rows_upper_bound.
-  @param inx                 (only for records_in_range) index to use.
-  @param min_key             (only for records_in_range) start of range.
-  @param max_key             (only for records_in_range) end of range.
+  SYNOPSIS
+    scan_time()
 
-  @return Number of rows or HA_POS_ERROR.
+  RETURN VALUE
+    time for scan
 */
-ha_rows ha_partition::estimate_rows(bool is_records_in_range, uint inx,
-                                    key_range *min_key, key_range *max_key)
+
+double ha_partition::scan_time()
 {
-  ha_rows rows, estimated_rows= 0;
-  uint first, part_id, num_used_parts, check_min_num, partitions_called= 0;
-  DBUG_ENTER("ha_partition::records_in_range");
+  double scan_time= 0;
+  handler **file;
+  DBUG_ENTER("ha_partition::scan_time");
 
-  partitions_optimizer_call_preparations(&first, &num_used_parts, &check_min_num);
-  for (part_id= first; partitions_called < num_used_parts ; part_id++)
-  {
-    if (!bitmap_is_set(&(m_part_info->used_partitions), part_id))
-      continue;
-    if (is_records_in_range)
-      rows= m_file[part_id]->records_in_range(inx, min_key, max_key);
-    else
-      rows= m_file[part_id]->estimate_rows_upper_bound();
-    if (rows == HA_POS_ERROR)
-      DBUG_RETURN(HA_POS_ERROR);
-    estimated_rows+= rows;
-    partitions_called++;
-    if (partitions_called >= check_min_num && estimated_rows)
-    {
-      DBUG_RETURN(estimated_rows * num_used_parts / partitions_called);
-    }
-  }
-  DBUG_RETURN(estimated_rows);
+  for (file= m_file; *file; file++)
+    if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+      scan_time+= (*file)->scan_time();
+  DBUG_RETURN(scan_time);
 }
 
 
-/*
-  Find number of records in a range
-
-  SYNOPSIS
-    records_in_range()
-    inx                  Index number
-    min_key              Start of range
-    max_key              End of range
+/**
+  Find number of records in a range.
+  @param inx      Index number
+  @param min_key  Start of range
+  @param max_key  End of range
 
-  RETURN VALUE
-    Number of rows in range
+  @return Number of rows in range.
 
-  DESCRIPTION
-    Given a starting key, and an ending key estimate the number of rows that
-    will exist between the two. end_key may be empty which in case determine
-    if start_key matches any rows.
+  Given a starting key, and an ending key estimate the number of rows that
+  will exist between the two. end_key may be empty which in case determine
+  if start_key matches any rows.
 
-    Called from opt_range.cc by check_quick_keys().
+  Called from opt_range.cc by check_quick_keys().
 
-    monty: MUST be called for each range and added.
-          Note that MySQL will assume that if this returns 0 there is no
-          matching rows for the range!
+  @note monty: MUST be called for each range and added.
        Note that MySQL will assume that if this returns 0 there is no
        matching rows for the range!
 */
@@ -6242,27 +6279,80 @@ ha_rows ha_partition::estimate_rows(bool is_records_in_range, uint inx,
 ha_rows ha_partition::records_in_range(uint inx, key_range *min_key,
                                        key_range *max_key)
 {
+  ha_rows min_rows_to_check, rows, estimated_rows=0, checked_rows= 0;
+  uint partition_index= 0, part_id;
   DBUG_ENTER("ha_partition::records_in_range");
 
-  DBUG_RETURN(estimate_rows(TRUE, inx, min_key, max_key));
-}
+  min_rows_to_check= min_rows_for_estimate();
+  while ((part_id= get_biggest_used_partition(&partition_index))
+         != NO_CURRENT_PART_ID)
+  {
+    rows= m_file[part_id]->records_in_range(inx, min_key, max_key);
+
+    DBUG_PRINT("info", ("part %u match %lu rows of %lu", part_id, (ulong) rows,
+                        (ulong) m_file[part_id]->stats.records));
 
-/*
-  Estimate upper bound of number of rows
+    if (rows == HA_POS_ERROR)
+      DBUG_RETURN(HA_POS_ERROR);
+    estimated_rows+= rows;
+    checked_rows+= m_file[part_id]->stats.records;
+    /*
+      Returning 0 means no rows can be found, so we must continue
+      this loop as long as we have estimated_rows == 0.
+      Also many engines return 1 to indicate that there may exist
+      a matching row, we do not normalize this by dividing by number of
+      used partitions, but leave it to be returned as a sum, which will
+      reflect that we will need to scan each partition's index.
+
+      Note that this statistics may not always be correct, so we must
+      continue even if the current partition has 0 rows, since we might have
+      deleted rows from the current partition, or inserted to the next
+      partition.
+    */
+    if (estimated_rows && checked_rows &&
+        checked_rows >= min_rows_to_check)
+    {
+      DBUG_PRINT("info",
+                 ("records_in_range(inx %u): %lu (%lu * %lu / %lu)",
+                  inx,
+                  (ulong) (estimated_rows * stats.records / checked_rows),
+                  (ulong) estimated_rows,
+                  (ulong) stats.records,
+                  (ulong) checked_rows));
+      DBUG_RETURN(estimated_rows * stats.records / checked_rows);
+    }
+  }
+  DBUG_PRINT("info", ("records_in_range(inx %u): %lu",
+                      inx,
+                      (ulong) estimated_rows));
+  DBUG_RETURN(estimated_rows);
+}
 
-  SYNOPSIS
-    estimate_rows_upper_bound()
 
-  RETURN VALUE
-    Number of rows
+/**
+  Estimate upper bound of number of rows.
+
+  @return Number of rows.
 */
 
 ha_rows ha_partition::estimate_rows_upper_bound()
 {
+  ha_rows rows, tot_rows= 0;
+  handler **file= m_file;
   DBUG_ENTER("ha_partition::estimate_rows_upper_bound");
 
-  DBUG_RETURN(estimate_rows(FALSE, 0, NULL, NULL));
+  do
+  {
+    if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+    {
+      rows= (*file)->estimate_rows_upper_bound();
+      if (rows == HA_POS_ERROR)
+        DBUG_RETURN(HA_POS_ERROR);
+      tot_rows+= rows;
+    }
+  } while (*(++file));
+  DBUG_RETURN(tot_rows);
 }
 
 
@@ -6494,20 +6584,20 @@ int ha_partition::add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys)
   return ret;
 err:
   if (file > m_file)
-  {
-    uint *key_numbers= (uint*) ha_thd()->alloc(sizeof(uint) * num_of_keys);
-    KEY *old_key_info= table_arg->key_info;
-    uint i;
-    /* Use the newly added key_info as table->key_info to remove them. */
-    for (i= 0; i < num_of_keys; i++)
-      key_numbers[i]= i;
-    table_arg->key_info= key_info;
-    while (--file >= m_file)
-    {
-      (void) (*file)->prepare_drop_index(table_arg, key_numbers, num_of_keys);
-      (void) (*file)->final_drop_index(table_arg);
-    }
-    table_arg->key_info= old_key_info;
+  {
+    uint *key_numbers= (uint*) ha_thd()->alloc(sizeof(uint) * num_of_keys);
+    KEY *old_key_info= table_arg->key_info;
+    uint i;
+    /* Use the newly added key_info as table->key_info to remove them. */
+    for (i= 0; i < num_of_keys; i++)
+      key_numbers[i]= i;
+    table_arg->key_info= key_info;
+    while (--file >= m_file)
+    {
+      (void) (*file)->prepare_drop_index(table_arg, key_numbers, num_of_keys);
+      (void) (*file)->final_drop_index(table_arg);
+    }
+    table_arg->key_info= old_key_info;
   }
   return ret;
 }
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
index 46e2f447a47..49131518f8c 100644
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -199,6 +199,12 @@ private:
   ha_rows m_bulk_inserted_rows;
   /** used for prediction of start_bulk_insert rows */
   enum_monotonicity_info m_part_func_monotonicity_info;
+  /** Sorted array of partition ids in descending order of number of rows. */
+  uint32 *m_part_ids_sorted_by_num_of_records;
+  /* Compare function for my_qsort2, for reversed order. */
+  static int compare_number_of_records(ha_partition *me,
+                                       const uint32 *a,
+                                       const uint32 *b);
 public:
   handler *clone(const char *name, MEM_ROOT *mem_root);
   virtual void set_part_info(partition_info *part_info)
@@ -219,9 +225,9 @@ public:
   */
   ha_partition(handlerton *hton, TABLE_SHARE * table);
   ha_partition(handlerton *hton, partition_info * part_info);
-  ha_partition(handlerton *hton, TABLE_SHARE *share,
-               partition_info *part_info_arg,
-               ha_partition *clone_arg,
+  ha_partition(handlerton *hton, TABLE_SHARE *share,
+               partition_info *part_info_arg,
+               ha_partition *clone_arg,
                MEM_ROOT *clone_mem_root_arg);
   ~ha_partition();
   /*
@@ -582,15 +588,9 @@ public:
   */
 
 private:
-  /*
-    Helper function to get the minimum number of partitions to use for
-    the optimizer hints/cost calls.
-  */
-  void partitions_optimizer_call_preparations(uint *num_used_parts,
-                                              uint *check_min_num,
-                                              uint *first);
-  ha_rows estimate_rows(bool is_records_in_range, uint inx,
-                        key_range *min_key, key_range *max_key);
+  /* Helper functions for optimizer hints. */
+  ha_rows min_rows_for_estimate();
+  uint get_biggest_used_partition(uint *part_index);
 public:
 
   /*