author     Mattias Jonsson <mattias.jonsson@oracle.com>   2012-02-22 23:13:36 +0100
committer  Mattias Jonsson <mattias.jonsson@oracle.com>   2012-02-22 23:13:36 +0100
commit     8325fe02b3f2cb94fb99ac72dd837d9db0c037ba (patch)
tree       0c7483648c83a2186a53487994a5de5ffcccd422 /sql
parent     b566d9a512a9671a423e1ff944d1e4ede2df7f7e (diff)
download   mariadb-git-8325fe02b3f2cb94fb99ac72dd837d9db0c037ba.tar.gz
Bug#13694811: THE OPTIMIZER WRONGLY USES THE FIRST INNODB
PARTITION STATISTICS
The problem was introduced by the fix for bug#11756867: it always used
the first partitions, and stopped after it had checked 10
[sub]partitions (or until it found a partition which would contain a
match). This results in bad statistics for tables where the first 10
partitions don't represent the majority of the data, for example when
the first 10 partitions contain only a few rows in total.
The solution is to take statistics from the partitions containing the
most rows instead:
Added an array of partition ids, sorted by number of records in
descending order. This array is used in records_in_range to cover as
many records as possible in as few calls as possible, as illustrated
by the sketch below.
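
For illustration only, the ordering step is roughly equivalent to the
following standalone sketch (std::sort and all names here are
stand-ins; the patch itself uses my_qsort2 with the
compare_number_of_records callback shown in the diff below):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    /* Sketch: order partition ids so that the partitions holding the
       most rows come first. rec_per_part stands in for the
       per-partition stats.records values. */
    std::vector<uint32_t>
    sort_parts_by_records(const std::vector<uint64_t> &rec_per_part)
    {
      std::vector<uint32_t> ids(rec_per_part.size());
      for (uint32_t i= 0; i < ids.size(); i++)
        ids[i]= i;
      /* Note: compare with '>' to sort in descending order. */
      std::sort(ids.begin(), ids.end(),
                [&rec_per_part](uint32_t a, uint32_t b)
                { return rec_per_part[a] > rec_per_part[b]; });
      return ids;
    }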
Also changed the limit on how many partitions to use for the statistics
from a static maximum of 10 partitions to a dynamic model:
the maximum number of partitions is now log2(total number of
partitions), taken from the ordered array.
It will continue calling the partitions' records_in_range until the
checked partitions together contain at least this many rows
(a worked example follows below):
(total rows in matching partitions) * (maximum number of partitions)
/ (number of used partitions)
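
As a worked example (all numbers assumed for illustration): with 128
total partitions the limit is log2(128) = 7 partitions; if the matching
partitions hold 1,000,000 rows spread over 100 used partitions, sampling
stops once the checked partitions together hold at least
1,000,000 * 7 / 100 = 70,000 rows. A condensed standalone sketch of the
threshold computation, with plain integer types standing in for ha_rows
and the handler's stats.records (the patch implements this in
min_rows_for_estimate(), shown in the diff below):

    #include <cstdint>

    /* Sketch of the dynamic sampling threshold. total_rows stands in
       for stats.records, i.e. rows in all matching partitions. */
    static uint64_t min_rows_for_estimate(uint64_t tot_parts,
                                          uint64_t used_parts,
                                          uint64_t total_rows)
    {
      /* max_parts = floor(log2(tot_parts)), computed by doubling. */
      uint64_t max_parts= 1;
      for (uint64_t i= 2; i < tot_parts; i<<= 1)
        max_parts++;
      /* Never more partitions than are actually used. */
      if (max_parts > used_parts)
        max_parts= used_parts;
      /* Keep sampling (biggest partitions first) until the checked
         partitions cover at least this many rows. */
      return total_rows * max_parts / used_parts;
    }

The estimate is then extrapolated as (matches found in the checked
partitions) * (total rows) / (rows in the checked partitions), which is
what the new records_in_range() returns.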
Also reverted ha_partition::scan_time() and
ha_partition::estimate_rows_upper_bound() to their implementations from
before the fix of bug#11756867, since they are not as slow as
records_in_range.
Diffstat (limited to 'sql')
-rw-r--r--  sql/ha_partition.cc  300
-rw-r--r--  sql/ha_partition.h    24
2 files changed, 207 insertions, 117 deletions
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 7c7cf5a4302..ddfe271f6a3 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -286,6 +286,7 @@ void ha_partition::init_handler_variables()
   m_is_sub_partitioned= 0;
   m_is_clone_of= NULL;
   m_clone_mem_root= NULL;
+  m_part_ids_sorted_by_num_of_records= NULL;
 
 #ifdef DONT_HAVE_TO_BE_INITALIZED
   m_start_key.flag= 0;
@@ -321,6 +322,8 @@ ha_partition::~ha_partition()
       delete m_file[i];
   }
   my_free((char*) m_ordered_rec_buffer, MYF(MY_ALLOW_ZERO_PTR));
+  my_free((char*) m_part_ids_sorted_by_num_of_records,
+          MYF(MY_ALLOW_ZERO_PTR));
 
   clear_handler_file();
   DBUG_VOID_RETURN;
@@ -2638,6 +2641,16 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
       m_start_key.key= (const uchar*)ptr;
     }
   }
+  if (!m_part_ids_sorted_by_num_of_records)
+  {
+    if (!(m_part_ids_sorted_by_num_of_records=
+            (uint32*) my_malloc(m_tot_parts * sizeof(uint32), MYF(MY_WME))))
+      DBUG_RETURN(error);
+    uint32 i;
+    /* Initialize it with all partition ids. */
+    for (i= 0; i < m_tot_parts; i++)
+      m_part_ids_sorted_by_num_of_records[i]= i;
+  }
 
   /* Initialize the bitmap we use to minimize ha_start_bulk_insert calls */
   if (bitmap_init(&m_bulk_insert_started, NULL, m_tot_parts + 1, FALSE))
@@ -5146,6 +5159,24 @@ int ha_partition::handle_ordered_prev(uchar *buf)
   and read_time calls
 */
 
+/**
+  Helper function for sorting according to number of rows in descending order.
+*/
+
+int ha_partition::compare_number_of_records(ha_partition *me,
+                                            const uint32 *a,
+                                            const uint32 *b)
+{
+  handler **file= me->m_file;
+  /* Note: sorting in descending order! */
+  if (file[*a]->stats.records > file[*b]->stats.records)
+    return -1;
+  if (file[*a]->stats.records < file[*b]->stats.records)
+    return 1;
+  return 0;
+}
+
+
 /*
   General method to gather info from handler
 
@@ -5387,6 +5418,15 @@ int ha_partition::info(uint flag)
       }
       i++;
     } while (*(++file_array));
+    /*
+      Sort the array of part_ids by number of records in
+      in descending order.
+    */
+    my_qsort2((void*) m_part_ids_sorted_by_num_of_records,
+              m_tot_parts,
+              sizeof(uint32),
+              (qsort2_cmp) compare_number_of_records,
+              this);
 
     file= m_file[handler_instance];
     file->info(HA_STATUS_CONST);
@@ -6124,116 +6164,113 @@ const key_map *ha_partition::keys_to_use_for_scanning()
   DBUG_RETURN(m_file[0]->keys_to_use_for_scanning());
 }
 
-#define MAX_PARTS_FOR_OPTIMIZER_CALLS 10
-/*
-  Prepare start variables for estimating optimizer costs.
-
-  @param[out] num_used_parts  Number of partitions after pruning.
-  @param[out] check_min_num   Number of partitions to call.
-  @param[out] first           first used partition.
+/**
+  Minimum number of rows to base optimizer estimate on.
 */
-void ha_partition::partitions_optimizer_call_preparations(uint *first,
-                                                          uint *num_used_parts,
-                                                          uint *check_min_num)
+
+ha_rows ha_partition::min_rows_for_estimate()
 {
-  *first= bitmap_get_first_set(&(m_part_info->used_partitions));
-  *num_used_parts= bitmap_bits_set(&(m_part_info->used_partitions));
-  *check_min_num= min(MAX_PARTS_FOR_OPTIMIZER_CALLS, *num_used_parts);
+  uint i, max_used_partitions, tot_used_partitions;
+  DBUG_ENTER("ha_partition::partitions_optimizer_call_preparations");
+
+  tot_used_partitions= bitmap_bits_set(&m_part_info->used_partitions);
+  DBUG_ASSERT(tot_used_partitions);
+
+  /*
+    Allow O(log2(tot_partitions)) increase in number of used partitions.
+    This gives O(1/log2(tot_partitions)) of rows to base the estimate on.
+    I.e when the total number of partitions doubles, allow one more
+    partition to be checked.
+  */
+  i= 2;
+  max_used_partitions= 1;
+  while (i < m_tot_parts)
+  {
+    max_used_partitions++;
+    i= i << 1;
+  }
+  if (max_used_partitions > tot_used_partitions)
+    max_used_partitions= tot_used_partitions;
+
+  /* stats.records is already updated by the info(HA_STATUS_VARIABLE) call. */
+  DBUG_PRINT("info", ("max_used_partitions: %u tot_rows: %lu",
+                      max_used_partitions,
+                      (ulong) stats.records));
+  DBUG_PRINT("info", ("tot_used_partitions: %u min_rows_to_check: %lu",
+                      tot_used_partitions,
+                      (ulong) stats.records * max_used_partitions
+                              / tot_used_partitions));
+  DBUG_RETURN(stats.records * max_used_partitions / tot_used_partitions);
 }
 
 
-/*
-  Return time for a scan of the table
+/**
+  Get the biggest used partition.
 
-  SYNOPSIS
-    scan_time()
+  Starting at the N:th biggest partition and skips all non used
+  partitions, returning the biggest used partition found
 
-  RETURN VALUE
-    time for scan
+  @param[in,out] part_index  Skip the *part_index biggest partitions
+
+  @return The biggest used partition with index not lower than *part_index.
+    @retval NO_CURRENT_PART_ID     No more partition used.
+    @retval != NO_CURRENT_PART_ID  partition id of biggest used partition with
+                                   index >= *part_index supplied. Note that
+                                   *part_index will be updated to the next
+                                   partition index to use.
 */
 
-double ha_partition::scan_time()
+uint ha_partition::get_biggest_used_partition(uint *part_index)
 {
-  double scan_time= 0.0;
-  uint first, part_id, num_used_parts, check_min_num, partitions_called= 0;
-  DBUG_ENTER("ha_partition::scan_time");
-
-  partitions_optimizer_call_preparations(&first, &num_used_parts, &check_min_num);
-  for (part_id= first; partitions_called < num_used_parts ; part_id++)
+  uint part_id;
+  while ((*part_index) < m_tot_parts)
   {
-    if (!bitmap_is_set(&(m_part_info->used_partitions), part_id))
-      continue;
-    scan_time+= m_file[part_id]->scan_time();
-    partitions_called++;
-    if (partitions_called >= check_min_num && scan_time != 0.0)
-    {
-      DBUG_RETURN(scan_time *
-                      (double) num_used_parts / (double) partitions_called);
-    }
+    part_id= m_part_ids_sorted_by_num_of_records[(*part_index)++];
+    if (bitmap_is_set(&m_part_info->used_partitions, part_id))
+      return part_id;
   }
-  DBUG_RETURN(scan_time);
+  return NO_CURRENT_PART_ID;
 }
 
 
 /*
-  Estimate rows for records_in_range or estimate_rows_upper_bound.
+  Return time for a scan of the table
 
-  @param is_records_in_range call records_in_range instead of
-                             estimate_rows_upper_bound.
-  @param inx                 (only for records_in_range) index to use.
-  @param min_key             (only for records_in_range) start of range.
-  @param max_key             (only for records_in_range) end of range.
+  SYNOPSIS
+    scan_time()
 
-  @return Number of rows or HA_POS_ERROR.
+  RETURN VALUE
+    time for scan
 */
-ha_rows ha_partition::estimate_rows(bool is_records_in_range, uint inx,
-                                    key_range *min_key, key_range *max_key)
+
+double ha_partition::scan_time()
 {
-  ha_rows rows, estimated_rows= 0;
-  uint first, part_id, num_used_parts, check_min_num, partitions_called= 0;
-  DBUG_ENTER("ha_partition::records_in_range");
+  double scan_time= 0;
+  handler **file;
+  DBUG_ENTER("ha_partition::scan_time");
 
-  partitions_optimizer_call_preparations(&first, &num_used_parts, &check_min_num);
-  for (part_id= first; partitions_called < num_used_parts ; part_id++)
-  {
-    if (!bitmap_is_set(&(m_part_info->used_partitions), part_id))
-      continue;
-    if (is_records_in_range)
-      rows= m_file[part_id]->records_in_range(inx, min_key, max_key);
-    else
-      rows= m_file[part_id]->estimate_rows_upper_bound();
-    if (rows == HA_POS_ERROR)
-      DBUG_RETURN(HA_POS_ERROR);
-    estimated_rows+= rows;
-    partitions_called++;
-    if (partitions_called >= check_min_num && estimated_rows)
-    {
-      DBUG_RETURN(estimated_rows * num_used_parts / partitions_called);
-    }
-  }
-  DBUG_RETURN(estimated_rows);
+  for (file= m_file; *file; file++)
+    if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+      scan_time+= (*file)->scan_time();
+  DBUG_RETURN(scan_time);
 }
 
 
-/*
-  Find number of records in a range
-
-  SYNOPSIS
-    records_in_range()
-    inx                  Index number
-    min_key              Start of range
-    max_key              End of range
+/**
+  Find number of records in a range.
+  @param inx      Index number
+  @param min_key  Start of range
+  @param max_key  End of range
 
-  RETURN VALUE
-    Number of rows in range
+  @return Number of rows in range.
 
-  DESCRIPTION
-    Given a starting key, and an ending key estimate the number of rows that
-    will exist between the two. end_key may be empty which in case determine
-    if start_key matches any rows.
+  Given a starting key, and an ending key estimate the number of rows that
+  will exist between the two. end_key may be empty which in case determine
+  if start_key matches any rows.
 
-    Called from opt_range.cc by check_quick_keys().
+  Called from opt_range.cc by check_quick_keys().
 
-    monty: MUST be called for each range and added.
-          Note that MySQL will assume that if this returns 0 there is no
-          matching rows for the range!
+  @note monty: MUST be called for each range and added.
        Note that MySQL will assume that if this returns 0 there is no
        matching rows for the range!
 */
@@ -6242,27 +6279,80 @@ ha_rows ha_partition::estimate_rows(bool is_records_in_range, uint inx,
 ha_rows ha_partition::records_in_range(uint inx, key_range *min_key,
                                        key_range *max_key)
 {
+  ha_rows min_rows_to_check, rows, estimated_rows=0, checked_rows= 0;
+  uint partition_index= 0, part_id;
   DBUG_ENTER("ha_partition::records_in_range");
 
-  DBUG_RETURN(estimate_rows(TRUE, inx, min_key, max_key));
-}
+  min_rows_to_check= min_rows_for_estimate();
+  while ((part_id= get_biggest_used_partition(&partition_index))
+         != NO_CURRENT_PART_ID)
+  {
+    rows= m_file[part_id]->records_in_range(inx, min_key, max_key);
+
+    DBUG_PRINT("info", ("part %u match %lu rows of %lu", part_id, (ulong) rows,
+                        (ulong) m_file[part_id]->stats.records));
 
-/*
-  Estimate upper bound of number of rows
+    if (rows == HA_POS_ERROR)
+      DBUG_RETURN(HA_POS_ERROR);
+    estimated_rows+= rows;
+    checked_rows+= m_file[part_id]->stats.records;
+    /*
+      Returning 0 means no rows can be found, so we must continue
+      this loop as long as we have estimated_rows == 0.
+      Also many engines return 1 to indicate that there may exist
+      a matching row, we do not normalize this by dividing by number of
+      used partitions, but leave it to be returned as a sum, which will
+      reflect that we will need to scan each partition's index.
+
+      Note that this statistics may not always be correct, so we must
+      continue even if the current partition has 0 rows, since we might have
+      deleted rows from the current partition, or inserted to the next
+      partition.
+    */
+    if (estimated_rows && checked_rows &&
+        checked_rows >= min_rows_to_check)
+    {
+      DBUG_PRINT("info",
+                 ("records_in_range(inx %u): %lu (%lu * %lu / %lu)",
+                  inx,
+                  (ulong) (estimated_rows * stats.records / checked_rows),
+                  (ulong) estimated_rows,
+                  (ulong) stats.records,
+                  (ulong) checked_rows));
+      DBUG_RETURN(estimated_rows * stats.records / checked_rows);
+    }
+  }
+  DBUG_PRINT("info", ("records_in_range(inx %u): %lu",
+                      inx,
+                      (ulong) estimated_rows));
+  DBUG_RETURN(estimated_rows);
+}
 
-  SYNOPSIS
-    estimate_rows_upper_bound()
 
-  RETURN VALUE
-    Number of rows
+/**
+  Estimate upper bound of number of rows.
+
+  @return Number of rows.
 */
 
 ha_rows ha_partition::estimate_rows_upper_bound()
 {
+  ha_rows rows, tot_rows= 0;
+  handler **file= m_file;
   DBUG_ENTER("ha_partition::estimate_rows_upper_bound");
 
-  DBUG_RETURN(estimate_rows(FALSE, 0, NULL, NULL));
+  do
+  {
+    if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+    {
+      rows= (*file)->estimate_rows_upper_bound();
+      if (rows == HA_POS_ERROR)
+        DBUG_RETURN(HA_POS_ERROR);
+      tot_rows+= rows;
+    }
+  } while (*(++file));
+  DBUG_RETURN(tot_rows);
 }
 
 
@@ -6494,20 +6584,20 @@ int ha_partition::add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys)
   return ret;
 err:
   if (file > m_file)
-  {
-    uint *key_numbers= (uint*) ha_thd()->alloc(sizeof(uint) * num_of_keys);
-    KEY *old_key_info= table_arg->key_info;
-    uint i;
-    /* Use the newly added key_info as table->key_info to remove them. */
-    for (i= 0; i < num_of_keys; i++)
-      key_numbers[i]= i;
-    table_arg->key_info= key_info;
-    while (--file >= m_file)
-    {
-      (void) (*file)->prepare_drop_index(table_arg, key_numbers, num_of_keys);
-      (void) (*file)->final_drop_index(table_arg);
-    }
-    table_arg->key_info= old_key_info;
+  {
+    uint *key_numbers= (uint*) ha_thd()->alloc(sizeof(uint) * num_of_keys);
+    KEY *old_key_info= table_arg->key_info;
+    uint i;
+    /* Use the newly added key_info as table->key_info to remove them. */
+    for (i= 0; i < num_of_keys; i++)
+      key_numbers[i]= i;
+    table_arg->key_info= key_info;
+    while (--file >= m_file)
+    {
+      (void) (*file)->prepare_drop_index(table_arg, key_numbers, num_of_keys);
+      (void) (*file)->final_drop_index(table_arg);
+    }
+    table_arg->key_info= old_key_info;
   }
   return ret;
 }
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
index 46e2f447a47..49131518f8c 100644
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -199,6 +199,12 @@ private:
   ha_rows m_bulk_inserted_rows;
   /** used for prediction of start_bulk_insert rows */
   enum_monotonicity_info m_part_func_monotonicity_info;
+  /** Sorted array of partition ids in descending order of number of rows. */
+  uint32 *m_part_ids_sorted_by_num_of_records;
+  /* Compare function for my_qsort2, for reversed order. */
+  static int compare_number_of_records(ha_partition *me,
+                                       const uint32 *a,
+                                       const uint32 *b);
 public:
   handler *clone(const char *name, MEM_ROOT *mem_root);
   virtual void set_part_info(partition_info *part_info)
@@ -219,9 +225,9 @@ public:
   */
   ha_partition(handlerton *hton, TABLE_SHARE * table);
   ha_partition(handlerton *hton, partition_info * part_info);
-  ha_partition(handlerton *hton, TABLE_SHARE *share,
-               partition_info *part_info_arg,
-               ha_partition *clone_arg,
+  ha_partition(handlerton *hton, TABLE_SHARE *share,
+               partition_info *part_info_arg,
+               ha_partition *clone_arg,
                MEM_ROOT *clone_mem_root_arg);
   ~ha_partition();
   /*
@@ -582,15 +588,9 @@ public:
   */
 
 private:
-  /*
-    Helper function to get the minimum number of partitions to use for
-    the optimizer hints/cost calls.
-  */
-  void partitions_optimizer_call_preparations(uint *num_used_parts,
-                                              uint *check_min_num,
-                                              uint *first);
-  ha_rows estimate_rows(bool is_records_in_range, uint inx,
-                        key_range *min_key, key_range *max_key);
+  /* Helper functions for optimizer hints. */
+  ha_rows min_rows_for_estimate();
+  uint get_biggest_used_partition(uint *part_index);
 public:
 
   /*