author     sergefp@mysql.com <>  2003-12-18 06:08:00 +0300
committer  sergefp@mysql.com <>  2003-12-18 06:08:00 +0300
commit     67c6d5113c76900cb3b985bc798fd1bf77133884 (patch)
tree       e88bcd81a11ac086bf267b0856389876bb760587 /sql
parent     20295cf182faf27690add217edfba12924857bf6 (diff)
Precise read time estimates for index_merge/Unique
Diffstat (limited to 'sql')
-rw-r--r--  sql/filesort.cc     8
-rw-r--r--  sql/ha_berkeley.h   2
-rw-r--r--  sql/ha_innodb.cc    3
-rw-r--r--  sql/ha_innodb.h     2
-rw-r--r--  sql/handler.h       4
-rw-r--r--  sql/mysql_priv.h   20
-rw-r--r--  sql/opt_range.cc  489
-rw-r--r--  sql/opt_range.h    96
-rw-r--r--  sql/records.cc      1
-rw-r--r--  sql/sql_class.h     3
-rw-r--r--  sql/uniques.cc    182
11 files changed, 602 insertions(+), 208 deletions(-)
diff --git a/sql/filesort.cc b/sql/filesort.cc
index 356afdf748c..27c59a05941 100644
--- a/sql/filesort.cc
+++ b/sql/filesort.cc
@@ -88,9 +88,9 @@ ha_rows filesort(THD *thd, TABLE *table, SORT_FIELD *sortorder, uint s_length,
#endif
FILESORT_INFO table_sort;
/*
- don't use table->sort in filesort as it is also used by
- QUICK_INDEX_MERGE_SELECT. work with a copy of it and put it back at the
- end when index_merge select has finished with it.
+ Don't use table->sort in filesort as it is also used by
+ QUICK_INDEX_MERGE_SELECT. Work with a copy and put it back at the end
+ when index_merge select has finished with it.
*/
memcpy(&table_sort, &table->sort, sizeof(FILESORT_INFO));
table->sort.io_cache= NULL;
@@ -452,7 +452,7 @@ static ha_rows find_all_keys(SORTPARAM *param, SQL_SELECT *select,
if (quick_select)
{
/*
- index_merge quick select uses table->sort when retrieving rows, so free
+ index_merge quick select uses table->sort when retrieving rows, so free
      resources it has allocated.
*/
end_read_record(&read_record_info);
diff --git a/sql/ha_berkeley.h b/sql/ha_berkeley.h
index 582a79906a7..f225c24eaf7 100644
--- a/sql/ha_berkeley.h
+++ b/sql/ha_berkeley.h
@@ -167,7 +167,7 @@ class ha_berkeley: public handler
longlong get_auto_increment();
void print_error(int error, myf errflag);
uint8 table_cache_type() { return HA_CACHE_TBL_TRANSACT; }
- bool primary_key_is_clustered_covering() { return true; }
+ bool primary_key_is_clustered() { return true; }
};
extern bool berkeley_skip, berkeley_shared_data;
diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc
index b92a5ff8c3f..d949f8bcf9c 100644
--- a/sql/ha_innodb.cc
+++ b/sql/ha_innodb.cc
@@ -2003,7 +2003,8 @@ build_template(
update field->query_id so that the formula
thd->query_id == field->query_id did not work. */
- ibool index_contains_field = dict_index_contains_col_or_prefix(index, i);
+ ibool index_contains_field=
+ dict_index_contains_col_or_prefix(index, i);
if (templ_type == ROW_MYSQL_REC_FIELDS &&
((prebuilt->read_just_key && !index_contains_field) ||
diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h
index 6fa66377cd6..c305a019fcd 100644
--- a/sql/ha_innodb.h
+++ b/sql/ha_innodb.h
@@ -187,7 +187,7 @@ class ha_innobase: public handler
void init_table_handle_for_HANDLER();
longlong get_auto_increment();
uint8 table_cache_type() { return HA_CACHE_TBL_ASKTRANSACT; }
- bool primary_key_is_clustered_covering() { return true; }
+ bool primary_key_is_clustered() { return true; }
};
extern bool innodb_skip;
diff --git a/sql/handler.h b/sql/handler.h
index 0bbaba81f96..2ad37233c9e 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -378,10 +378,10 @@ public:
/*
RETURN
- true primary key (if there is one) is clustered key covering all fields
+ true Primary key (if there is one) is clustered key covering all fields
false otherwise
*/
- virtual bool primary_key_is_clustered_covering() { return false; }
+ virtual bool primary_key_is_clustered() { return false; }
};
/* Some extern variables used with handlers */
diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h
index 3ace72ea24c..2b25f501a37 100644
--- a/sql/mysql_priv.h
+++ b/sql/mysql_priv.h
@@ -119,6 +119,26 @@ extern CHARSET_INFO *national_charset_info, *table_alias_charset;
#define TIME_FOR_COMPARE 5 // 5 compares == one read
/*
+ Number of comparisons of table rowids equivalent to reading one row from a
+ table.
+*/
+#define TIME_FOR_COMPARE_ROWID (TIME_FOR_COMPARE*2)
+
+/*
+ For sequential disk seeks the cost formula is:
+ DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST * #blocks_to_skip
+
+  The cost of an average seek is:
+    DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK = 1.0.
+*/
+#define DISK_SEEK_BASE_COST ((double)0.5)
+
+#define BLOCKS_IN_AVG_SEEK 128
+
+#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK)
+
+
+/*
  Number of rows in a reference table when referred to through a non-unique key.
This value is only used when we don't know anything about the key
distribution.
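
For intuition, here is a minimal standalone sketch (not part of the patch; the
constant values are copied from the hunk above) of how the new seek-cost
constants combine, showing that skipping BLOCKS_IN_AVG_SEEK blocks costs
exactly one average seek:

#include <cstdio>

/* Cost constants as introduced in sql/mysql_priv.h above. */
#define DISK_SEEK_BASE_COST ((double)0.5)
#define BLOCKS_IN_AVG_SEEK  128
#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK)

/* Cost of one sequential seek that skips 'blocks_to_skip' blocks. */
static double sequential_seek_cost(double blocks_to_skip)
{
  return DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST * blocks_to_skip;
}

int main()
{
  printf("average seek: %g\n", sequential_seek_cost(BLOCKS_IN_AVG_SEEK)); /* 1.0 */
  printf("short skip:   %g\n", sequential_seek_cost(8));                  /* ~0.53 */
  return 0;
}
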
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index 98002cc5b7a..e3d0f8624b9 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -307,12 +307,18 @@ static ha_rows check_quick_keys(PARAM *param,uint index,SEL_ARG *key_tree,
QUICK_RANGE_SELECT *get_quick_select(PARAM *param,uint index,
SEL_ARG *key_tree, MEM_ROOT *alloc = NULL);
-static int get_quick_select_params(SEL_TREE *tree, PARAM& param,
- key_map& needed_reg, TABLE *head,
+static int get_quick_select_params(SEL_TREE *tree, PARAM *param,
+ key_map& needed_reg,
bool index_read_can_be_used,
- double* read_time,
- ha_rows* records,
+ double *read_time,
+ ha_rows *records,
SEL_ARG*** key_to_read);
+static int get_index_merge_params(PARAM *param, key_map& needed_reg,
+ SEL_IMERGE *imerge, double *read_time,
+ ha_rows* imerge_rows);
+inline double get_index_only_read_time(PARAM* param, ha_rows records,
+ int keynr);
+
#ifndef DBUG_OFF
static void print_quick_sel_imerge(QUICK_INDEX_MERGE_SELECT *quick,
const key_map *needed_reg);
@@ -453,7 +459,7 @@ int SEL_IMERGE::or_sel_tree_with_checks(PARAM *param, SEL_TREE *new_tree)
}
}
- /* new tree cannot be combined with any of existing trees */
+ /* New tree cannot be combined with any of existing trees. */
return or_sel_tree(param, new_tree);
}
@@ -483,7 +489,6 @@ int SEL_IMERGE::or_sel_imerge_with_checks(PARAM *param, SEL_IMERGE* imerge)
/*
Perform AND operation on two index_merge lists and store result in *im1.
-
*/
inline void imerge_list_and_list(List<SEL_IMERGE> *im1, List<SEL_IMERGE> *im2)
@@ -503,18 +508,16 @@ inline void imerge_list_and_list(List<SEL_IMERGE> *im1, List<SEL_IMERGE> *im2)
i.e. all conjuncts except the first one are currently dropped.
This is done to avoid producing N*K ways to do index_merge.
- If (a_1||b_1) produce a condition that is always true, NULL is
- returned and index_merge is discarded. (while it is actually
- possible to try harder).
+    If (a_1||b_1) produces a condition that is always true, NULL is returned
+ and index_merge is discarded (while it is actually possible to try
+ harder).
- As a consequence of this, choice of keys to do index_merge
- read may depend on the order of conditions in WHERE part of
- the query.
+ As a consequence of this, choice of keys to do index_merge read may depend
+ on the order of conditions in WHERE part of the query.
RETURN
- 0 OK, result is stored in *im1
+ 0 OK, result is stored in *im1
other Error, both passed lists are unusable
-
*/
int imerge_list_or_list(PARAM *param,
@@ -533,7 +536,7 @@ int imerge_list_or_list(PARAM *param,
Perform OR operation on index_merge list and key tree.
RETURN
- 0 OK, result is stored in *im1
+ 0 OK, result is stored in *im1.
other Error
*/
@@ -685,10 +688,10 @@ bool
QUICK_INDEX_MERGE_SELECT::push_quick_back(QUICK_RANGE_SELECT *quick_sel_range)
{
/*
- Save quick_select that does scan on clustered covering primary key as
- it will be processed separately
+ Save quick_select that does scan on clustered primary key as it will be
+ processed separately.
*/
- if (head->file->primary_key_is_clustered_covering() &&
+ if (head->file->primary_key_is_clustered() &&
quick_sel_range->index == head->primary_key)
pk_quick_select= quick_sel_range;
else
@@ -1001,7 +1004,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
ha_rows found_records;
double found_read_time= read_time;
- if (!get_quick_select_params(tree, param, needed_reg, head, true,
+ if (!get_quick_select_params(tree, &param, needed_reg, true,
&found_read_time, &found_records,
&best_key))
{
@@ -1021,120 +1024,57 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
}
/*
- btw, tree type SEL_TREE::INDEX_MERGE was not introduced
- intentionally
+      Btw, tree type SEL_TREE::INDEX_MERGE was intentionally not
+      introduced.
*/
- /* if no range select could be built, try using index_merge */
+ /* If no range select could be built, try using index_merge. */
if (!quick && !tree->merges.is_empty())
{
DBUG_PRINT("info",("No range reads possible,"
" trying to construct index_merge"));
SEL_IMERGE *imerge;
SEL_IMERGE *min_imerge= NULL;
- double min_imerge_cost= DBL_MAX;
+ double min_imerge_read_time;
ha_rows min_imerge_records;
- List_iterator_fast<SEL_IMERGE> it(tree->merges);
- /* find index_merge with minimal cost */
- while ((imerge= it++))
- {
- bool imerge_failed= false;
- double imerge_cost= 0;
- ha_rows imerge_total_records= 0;
- double tree_read_time;
- ha_rows tree_records;
- imerge->best_keys=
- (SEL_ARG***)alloc_root(&alloc,
- (imerge->trees_next - imerge->trees)*
- sizeof(void*));
- /*
- It may be possible to use different keys for index_merge, e.g for
- queries like
- ...WHERE (key1 < c2 AND key2 < c2) OR (key3 < c3 AND key4 < c4)
- We assume we get the best index_merge if we choose the best key
- read inside each of the conjuncts.
- */
- for (SEL_TREE **ptree= imerge->trees;
- ptree != imerge->trees_next;
- ptree++)
- {
- tree_read_time= read_time;
- if (get_quick_select_params(*ptree, param, needed_reg, head,
- false,
- &tree_read_time, &tree_records,
- &(imerge->best_keys[ptree -
- imerge->trees])))
- imerge_failed= true;
- imerge_cost += tree_read_time;
- imerge_total_records += tree_records;
- }
-
- if (!imerge_failed)
- {
- imerge_total_records= min(imerge_total_records,
- head->file->records);
- imerge_cost += imerge_total_records / TIME_FOR_COMPARE;
- if (imerge_cost < min_imerge_cost)
- {
- min_imerge= imerge;
- min_imerge_cost= imerge_cost;
- min_imerge_records= imerge_total_records;
- }
- }
- }
-
- if (!min_imerge)
- goto end_free;
-
- records= min_imerge_records;
- /*
- Ok, got minimal index merge, *min_imerge, with cost min_imerge_cost
- Compare its cost with "all" scan cost (or "all+using index" if
- it is possible) and choose the best.
- */
-
if (!head->used_keys.is_clear_all())
- {
- /* check if "ALL" +"using index" read would be faster */
+ {
int key_for_use= find_shortest_key(head, &head->used_keys);
ha_rows total_table_records= (0 == head->file->records)? 1 :
head->file->records;
- uint keys_per_block= (head->file->block_size/2/
- (head->key_info[key_for_use].key_length+
- head->file->ref_length) + 1);
- double all_index_scan_read_time= ((double)(total_table_records+
- keys_per_block-1)/
- (double) keys_per_block);
-
+ read_time = get_index_only_read_time(&param, total_table_records,
+ key_for_use);
DBUG_PRINT("info",
("'all' scan will be using key %d, read time %g",
- key_for_use, all_index_scan_read_time));
- if (all_index_scan_read_time < min_imerge_cost)
- {
- DBUG_PRINT("info",
- ("index merge would be slower, "
- "will do full 'index' scan"));
- goto end_free;
- }
+ key_for_use, read_time));
}
- else
+
+ min_imerge_read_time=read_time;
+ /*
+ Ok, read_time contains best 'all' read time.
+ Now look for index_merge with cost < read_time
+ */
+ List_iterator_fast<SEL_IMERGE> it(tree->merges);
+ while ((imerge= it++))
{
- /* check if "ALL" would be faster */
- if (read_time < min_imerge_cost)
- {
- DBUG_PRINT("info",
- ("index merge would be slower, "
- "will do full table scan"));
- goto end_free;
- }
+ if (!get_index_merge_params(&param, needed_reg, imerge,
+ &min_imerge_read_time,
+ &min_imerge_records))
+ min_imerge= imerge;
}
-
+
+ if (!min_imerge)
+ goto end_free;
+
+ records= min_imerge_records;
+
+ /* Ok, using index_merge is the best option, so construct it. */
if (!(quick= quick_imerge= new QUICK_INDEX_MERGE_SELECT(thd, head)))
goto end_free;
quick->records= min_imerge_records;
- quick->read_time= min_imerge_cost;
+ quick->read_time= min_imerge_read_time;
my_pthread_setspecific_ptr(THR_MALLOC, &quick_imerge->alloc);
@@ -1152,10 +1092,10 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use,
&quick_imerge->alloc)))
{
new_quick->records= min_imerge_records;
- new_quick->read_time= min_imerge_cost;
+ new_quick->read_time= min_imerge_read_time;
/*
QUICK_RANGE_SELECT::QUICK_RANGE_SELECT leaves THR_MALLOC
- pointing to its allocator, restore it back
+ pointing to its allocator, restore it back.
*/
quick_imerge->last_quick_select= new_quick;
@@ -1214,15 +1154,265 @@ end:
/*
+ Calculate index merge cost and save parameters for its construction.
+
+ SYNOPSIS
+ get_index_merge_params()
+      param       in     parameters structure
+ needed_reg in/out needed_reg from this SQL_SELECT
+ imerge in index_merge description structure
+ read_time in/out in: cost of an existing way to read a table
+ out: cost of index merge
+ imerge_rows out pessimistic estimate of # of rows to be retrieved
+
+ RETURN
+ 0 Cost of this index_merge is less than passed *read_time,
+ *imerge_rows and *read_time contain new index_merge parameters.
+ 1 Cost of this index_merge is more than *read_time,
+ *imerge_rows and *read_time are not modified.
+ -1 error
+
+ NOTES
+ index_merge_cost =
+ cost(index_reads) + (see #1)
+ cost(rowid_to_row_scan) + (see #2)
+ cost(unique_use) (see #3)
+
+ 1. cost(index_reads) =SUM_i(cost(index_read_i))
+ For non-CPK scans,
+ cost(index_read_i) = {cost of ordinary 'index only' scan}
+ For CPK scan,
+ cost(index_read_i) = {cost of non-'index only' scan}
+
+ 2. cost(rowid_to_row_scan)
+ If table PK is clustered then
+ cost(rowid_to_row_scan) =
+ {cost of ordinary clustered PK scan with n_ranges=n_rows}
+
+ Otherwise, we use the following model to calculate costs:
+    We need to retrieve n_rows rows from a file that occupies n_blocks blocks.
+ We assume that offsets of rows we need are independent variates with
+ uniform distribution in [0..max_file_offset] range.
+
+    We'll denote a block as "busy" if it contains row(s) we need to retrieve
+    and "empty" if it doesn't contain rows we need.
+
+ Probability that a block is empty is (1 - 1/n_blocks)^n_rows (this
+ applies to any block in file). Let x_i be a variate taking value 1 if
+ block #i is empty and 0 otherwise.
+
+ Then E(x_i) = (1 - 1/n_blocks)^n_rows;
+
+ E(n_empty_blocks) = E(sum(x_i)) = sum(E(x_i)) =
+ = n_blocks * ((1 - 1/n_blocks)^n_rows) =
+ ~= n_blocks * exp(-n_rows/n_blocks).
+
+ E(n_busy_blocks) = n_blocks*(1 - (1 - 1/n_blocks)^n_rows) =
+ ~= n_blocks * (1 - exp(-n_rows/n_blocks)).
+
+ Average size of "hole" between neighbor non-empty blocks is
+ E(hole_size) = n_blocks/E(n_busy_blocks).
+
+ The total cost of reading all needed blocks in one "sweep" is:
+
+ E(n_busy_blocks)*
+ (DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*n_blocks/E(n_busy_blocks)).
+
+ 3. Cost of Unique use is calculated in Unique::get_use_cost function.
+*/
+
+static int get_index_merge_params(PARAM *param, key_map& needed_reg,
+ SEL_IMERGE *imerge, double *read_time,
+ ha_rows* imerge_rows)
+{
+ double imerge_cost= 0.0; /* cost of this index_merge */
+ bool imerge_too_expensive= false;
+ double tree_read_time;
+ ha_rows tree_records;
+ bool pk_is_clustered= param->table->file->primary_key_is_clustered();
+  bool have_cpk_scan= false;
+ ha_rows records_for_unique= 0;
+ ha_rows cpk_records= 0;
+
+ DBUG_ENTER("get_index_merge_params");
+
+ /* allocate structs to save construction info */
+ imerge->best_keys=
+ (SEL_ARG***)alloc_root(param->mem_root,
+ (imerge->trees_next - imerge->trees)*
+ sizeof(void*));
+ /*
+ PHASE 1: get the best keys to use for this index_merge
+ */
+
+ /*
+ It may be possible to use different keys for index_merge scans,
+ e.g. for query like
+ ...WHERE (key1 < c2 AND key2 < c2) OR (key3 < c3 AND key4 < c4)
+ we have to make choice between key1 and key2 for one scan and
+ between key3,key4 for another.
+ We assume we'll get the best way if we choose the best key read
+ inside each of the conjuncts. Comparison is done without 'using index'.
+ */
+ for (SEL_TREE **ptree= imerge->trees;
+ ptree != imerge->trees_next;
+ ptree++)
+ {
+ SEL_ARG **tree_best_key;
+ uint keynr;
+
+ tree_read_time= *read_time;
+ if (get_quick_select_params(*ptree, param, needed_reg, false,
+ &tree_read_time, &tree_records,
+ &tree_best_key))
+ {
+ /*
+        A non-'index only' range scan on one of the keys in this index_merge
+        is more expensive than the other available options. The entire
+        index_merge will then be more expensive, too. We continue here only
+        to update SQL_SELECT members.
+ */
+ imerge_too_expensive= true;
+ }
+
+ if (imerge_too_expensive)
+ continue;
+
+ imerge->best_keys[ptree - imerge->trees]= tree_best_key;
+ keynr= param->real_keynr[(tree_best_key-(*ptree)->keys)];
+
+ if (pk_is_clustered && keynr == param->table->primary_key)
+ {
+ /* This is a Clustered PK scan, it will be done without 'index only' */
+ imerge_cost += tree_read_time;
+ have_cpk_scan= true;
+ cpk_records= tree_records;
+ }
+ else
+ {
+ /* Non-CPK scan, calculate time to do it using 'index only' */
+ imerge_cost += get_index_only_read_time(param, tree_records,keynr);
+ records_for_unique += tree_records;
+ }
+ }
+
+ if (imerge_too_expensive)
+ DBUG_RETURN(1);
+
+ if ((imerge_cost > *read_time) ||
+ ((records_for_unique + cpk_records) >= param->table->file->records) &&
+ *read_time != DBL_MAX)
+ {
+ /* Bail out if it is obvious that index_merge would be more expensive */
+ DBUG_RETURN(1);
+ }
+
+ if (have_cpk_scan)
+ {
+ /*
+ Add one ROWID comparison for each row retrieved on non-CPK scan.
+ (it is done in QUICK_RANGE_SELECT::row_in_ranges)
+ */
+ imerge_cost += records_for_unique / TIME_FOR_COMPARE_ROWID;
+ }
+
+ /* PHASE 2: Calculate cost(rowid_to_row_scan) */
+ if (pk_is_clustered)
+ {
+ imerge_cost +=
+ param->table->file->read_time(param->table->primary_key,
+ records_for_unique,
+ records_for_unique)
+ + rows2double(records_for_unique) / TIME_FOR_COMPARE;
+ }
+ else
+ {
+ double n_blocks=
+ ceil((double)(longlong)param->table->file->data_file_length / IO_SIZE);
+ /* Don't ceil the following as it is an estimate */
+ double busy_blocks=
+ n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, records_for_unique));
+
+ JOIN *join= param->thd->lex->select_lex.join;
+ if (!join || join->tables == 1)
+ {
+ imerge_cost += busy_blocks*(DISK_SEEK_BASE_COST +
+ DISK_SEEK_PROP_COST*n_blocks/busy_blocks);
+ }
+ else
+ {
+ /*
+        It can be a join in which the source table is not the last table, so
+        assume that disk seeks are random here.
+        (TODO: it is possible to determine if this *is* the last table in a
+        'range checked for each record'-type join)
+ */
+ imerge_cost += busy_blocks;
+ }
+ }
+
+ /* PHASE 3: Add Unique operations cost */
+ imerge_cost += Unique::get_use_cost(param->mem_root, records_for_unique,
+ param->table->file->ref_length,
+ param->thd->variables.sortbuff_size);
+ if (imerge_cost < *read_time)
+ {
+ *read_time= imerge_cost;
+ records_for_unique += cpk_records;
+ *imerge_rows= min(records_for_unique, param->table->file->records);
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(1);
+}
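
The PHASE 2 "sweep" model from the NOTES above can be exercised on its own.
The following is an illustrative sketch (not server code; the constants are
copied from sql/mysql_priv.h above) of the busy-blocks estimate and the
resulting seek cost:

#include <cmath>
#include <cstdio>

#define DISK_SEEK_BASE_COST ((double)0.5)
#define BLOCKS_IN_AVG_SEEK  128
#define DISK_SEEK_PROP_COST ((double)0.5/BLOCKS_IN_AVG_SEEK)

/*
  Expected cost (in seeks) of fetching n_rows rows scattered uniformly over a
  file of n_blocks blocks in one ordered sweep, as in PHASE 2 above.
*/
static double sweep_read_cost(double n_rows, double n_blocks)
{
  /* E(n_busy_blocks) = n_blocks * (1 - (1 - 1/n_blocks)^n_rows) */
  double busy_blocks= n_blocks * (1.0 - pow(1.0 - 1.0/n_blocks, n_rows));
  /* one base seek per busy block plus the cost of skipping the average gap */
  return busy_blocks * (DISK_SEEK_BASE_COST +
                        DISK_SEEK_PROP_COST * n_blocks / busy_blocks);
}

int main()
{
  /* e.g. 1000 rowids to resolve against a table occupying 10000 blocks */
  printf("sweep cost: %g seeks\n", sweep_read_cost(1000.0, 10000.0));
  return 0;
}
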
+
+
+/*
+ Calculate cost of 'index only' scan for given index and number of records.
+ (We can resolve this by only reading through this key.)
+
+ SYNOPSIS
+    get_index_only_read_time()
+ param parameters structure
+ records #of records to read
+ keynr key to read
+
+ NOTES
+    It is assumed that we will read through the whole key range and that all
+ key blocks are half full (normally things are much better).
+*/
+
+inline double get_index_only_read_time(PARAM* param, ha_rows records, int keynr)
+{
+ double read_time;
+ uint keys_per_block= (param->table->file->block_size/2/
+ (param->table->key_info[keynr].key_length+
+ param->table->file->ref_length) + 1);
+ read_time=((double) (records+keys_per_block-1)/
+ (double) keys_per_block);
+ return read_time;
+}
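
For intuition, with hypothetical values block_size=16384, key_length=12 and
ref_length=6, keys_per_block becomes 16384/2/(12+6) + 1 = 456, so an
'index only' scan of 100000 records is estimated at (100000+455)/456, roughly
220 block reads.
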
+
+
+/*
Calculate quick range select read time, # of records, and best key to use
without constructing QUICK_RANGE_SELECT object.
+ SYNOPSIS
+ get_quick_select_params
+ tree in make range select for this SEL_TREE
+ param in parameters from test_quick_select
+ needed_reg in/out other table data needed by this quick_select
+ index_read_can_be_used if false, assume that 'index only' option is not
+ available.
+ read_time out read time estimate
+ records out # of records estimate
+ key_to_read out SEL_ARG to be used for creating quick select
*/
-static int get_quick_select_params(SEL_TREE *tree, PARAM& param,
- key_map& needed_reg, TABLE *head,
+static int get_quick_select_params(SEL_TREE *tree, PARAM *param,
+ key_map& needed_reg,
bool index_read_can_be_used,
- double* read_time, ha_rows* records,
- SEL_ARG*** key_to_read)
+ double *read_time, ha_rows *records,
+ SEL_ARG ***key_to_read)
{
int idx;
int result = 1;
@@ -1233,7 +1423,7 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param,
*/
SEL_ARG **key,**end;
- for (idx= 0,key=tree->keys, end=key+param.keys ;
+ for (idx= 0,key=tree->keys, end=key+param->keys ;
key != end ;
key++,idx++)
{
@@ -1241,16 +1431,18 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param,
double found_read_time;
if (*key)
{
- uint keynr= param.real_keynr[idx];
+ uint keynr= param->real_keynr[idx];
if ((*key)->type == SEL_ARG::MAYBE_KEY ||
(*key)->maybe_flag)
needed_reg.set_bit(keynr);
- bool read_index_only= index_read_can_be_used? head->used_keys.is_set(keynr): false;
- found_records=check_quick_select(&param, idx, *key);
+ bool read_index_only= index_read_can_be_used?
+ param->table->used_keys.is_set(keynr): false;
+ found_records=check_quick_select(param, idx, *key);
+
if (found_records != HA_POS_ERROR && found_records > 2 &&
read_index_only &&
- (head->file->index_flags(keynr) & HA_KEY_READ_ONLY))
+ (param->table->file->index_flags(keynr) & HA_KEY_READ_ONLY))
{
/*
We can resolve this by only reading through this key.
@@ -1258,21 +1450,17 @@ static int get_quick_select_params(SEL_TREE *tree, PARAM& param,
and that all key blocks are half full (normally things are
much better).
*/
- uint keys_per_block= (head->file->block_size/2/
- (head->key_info[keynr].key_length+
- head->file->ref_length) + 1);
- found_read_time=((double) (found_records+keys_per_block-1)/
- (double) keys_per_block);
+ found_read_time=get_index_only_read_time(param, found_records, keynr);
}
else
- found_read_time= (head->file->read_time(keynr,
- param.range_count,
+ found_read_time= (param->table->file->read_time(keynr,
+ param->range_count,
found_records)+
(double) found_records / TIME_FOR_COMPARE);
if (*read_time > found_read_time && found_records != HA_POS_ERROR)
{
*read_time= found_read_time;
- *records= found_records;
+ *records= found_records;
*key_to_read= key;
result = 0;
}
@@ -3118,8 +3306,8 @@ err:
/*
Fetch all row ids into unique.
- If table has a clustered primary key(PK) that contains all rows (bdb and
- innodb currently) and one of the index_merge scans is a scan on primary key,
+ If table has a clustered primary key that covers all rows (true for bdb
+ and innodb currently) and one of the index_merge scans is a scan on PK,
then
primary key scan rowids are not put into Unique and also
rows that will be retrieved by PK scan are not put into Unique
@@ -3134,15 +3322,15 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique()
int result;
DBUG_ENTER("QUICK_INDEX_MERGE_SELECT::prepare_unique");
- /* we're going to just read rowids. */
+ /* We're going to just read rowids. */
head->file->extra(HA_EXTRA_KEYREAD);
/*
Make innodb retrieve all PK member fields, so
* ha_innobase::position (which uses them) call works.
- * we filter out rows retrieved by CCPK.
+ * We can filter out rows that will be retrieved by clustered PK.
(This also creates a deficiency - it is possible that we will retrieve
- parts of key that are not used by current query at all)
+ parts of key that are not used by current query at all.)
*/
head->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS);
@@ -3170,22 +3358,15 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique()
if (result)
{
- /*
- table read error (including HA_ERR_END_OF_FILE on last quick select
- in index_merge)
- */
if (result != HA_ERR_END_OF_FILE)
- {
DBUG_RETURN(result);
- }
- else
- break;
+ break;
}
if (thd->killed)
DBUG_RETURN(1);
- /* skip row if it will be retrieved by clustered covering PK scan */
+ /* skip row if it will be retrieved by clustered PK scan */
if (pk_quick_select && pk_quick_select->row_in_ranges())
continue;
@@ -3207,14 +3388,16 @@ int QUICK_INDEX_MERGE_SELECT::prepare_unique()
DBUG_RETURN(result);
}
+
/*
Get next row for index_merge.
NOTES
- The rows are read from
- 1. rowids stored in Unique.
- 2. QUICK_RANGE_SELECT with clustered primary key (if any).
- the sets of rows retrieved in 1) and 2) are guaranteed to be disjoint.
+ The rows are read from
+ 1. rowids stored in Unique.
+ 2. QUICK_RANGE_SELECT with clustered primary key (if any).
+ The sets of rows retrieved in 1) and 2) are guaranteed to be disjoint.
*/
+
int QUICK_INDEX_MERGE_SELECT::get_next()
{
int result;
@@ -3228,8 +3411,8 @@ int QUICK_INDEX_MERGE_SELECT::get_next()
if (result == -1)
{
result= HA_ERR_END_OF_FILE;
- /* All rows from Unique have been retrieved, do a CCPK scan */
end_read_record(&read_record);
+ /* All rows from Unique have been retrieved, do a clustered PK scan */
if(pk_quick_select)
{
doing_pk_scan= true;
@@ -3275,8 +3458,9 @@ int QUICK_RANGE_SELECT::get_next()
if (!cur_range)
range= *(cur_range= (QUICK_RANGE**)ranges.buffer);
else
- range= (cur_range == ((QUICK_RANGE**)ranges.buffer + ranges.elements - 1))?
- NULL: *(++cur_range);
+ range=
+ (cur_range == ((QUICK_RANGE**)ranges.buffer + ranges.elements - 1))?
+ NULL: *(++cur_range);
if (!range)
DBUG_RETURN(HA_ERR_END_OF_FILE); // All ranges used
@@ -3371,16 +3555,17 @@ int QUICK_RANGE_SELECT::cmp_next(QUICK_RANGE *range_arg)
/*
Check if current row will be retrieved by this QUICK_RANGE_SELECT
- (this is used to filter out CCPK scan rows in index_merge).
NOTES
It is assumed that currently a scan is being done on another index
which reads all necessary parts of the index that is scanned by this
quick select.
-
The implementation does a binary search on sorted array of disjoint
ranges, without taking size of range into account.
+ This function is used to filter out clustered PK scan rows in
+ index_merge quick select.
+
RETURN
true if current row will be retrieved by this quick select
false if not
diff --git a/sql/opt_range.h b/sql/opt_range.h
index 35a0cb5df88..7c2981795a2 100644
--- a/sql/opt_range.h
+++ b/sql/opt_range.h
@@ -118,11 +118,13 @@ public:
protected:
friend void print_quick_sel_range(QUICK_RANGE_SELECT *quick,
const key_map* needed_reg);
- friend QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
- struct st_table_ref *ref);
+ friend
+ QUICK_RANGE_SELECT *get_quick_select_for_ref(THD *thd, TABLE *table,
+ struct st_table_ref *ref);
friend bool get_quick_keys(struct st_qsel_param *param,
QUICK_RANGE_SELECT *quick,KEY_PART *key,
- SEL_ARG *key_tree,char *min_key,uint min_key_flag,
+ SEL_ARG *key_tree,
+ char *min_key, uint min_key_flag,
char *max_key, uint max_key_flag);
friend QUICK_RANGE_SELECT *get_quick_select(struct st_qsel_param*,uint idx,
SEL_ARG *key_tree,
@@ -160,58 +162,62 @@ public:
/*
-QUICK_INDEX_MERGE_SELECT - index_merge acces method quick select.
+ QUICK_INDEX_MERGE_SELECT - index_merge access method quick select.
- QUICK_INDEX_MERGE_SELECT uses
- * QUICK_RANGE_SELECTs to get rows
- * Unique class to remove duplicate rows
+ QUICK_INDEX_MERGE_SELECT uses
+ * QUICK_RANGE_SELECTs to get rows
+ * Unique class to remove duplicate rows
-INDEX MERGE OPTIMIZER
- Current implementation doesn't detect all cases where index_merge could be
- used, in particular:
- * index_merge will never be used if range scan is possible (even if range
- scan is more expensive)
+ INDEX MERGE OPTIMIZER
+ Current implementation doesn't detect all cases where index_merge could
+ be used, in particular:
+ * index_merge will never be used if range scan is possible (even if
+ range scan is more expensive)
- * index_merge+'using index' is not supported (this the consequence of the
- above restriction)
+    * index_merge+'using index' is not supported (this is a consequence of
+ the above restriction)
- * If WHERE part contains complex nested AND and OR conditions, some ways to
- retrieve rows using index_merge will not be considered. The choice of
- read plan may depend on the order of conjuncts/disjuncts in WHERE part of
- the query, see comments near SEL_IMERGE::or_sel_tree_with_checks and
- imerge_list_or_list function for details.
+ * If WHERE part contains complex nested AND and OR conditions, some ways
+ to retrieve rows using index_merge will not be considered. The choice
+ of read plan may depend on the order of conjuncts/disjuncts in WHERE
+ part of the query, see comments near imerge_list_or_list and
+ SEL_IMERGE::or_sel_tree_with_checks functions for details.
- * there is no "index_merge_ref" method (but index_merge on non-first table
- in join is possible with 'range checked for each record').
+ * There is no "index_merge_ref" method (but index_merge on non-first
+ table in join is possible with 'range checked for each record').
- See comments around SEL_IMERGE class and test_quick_select for more details.
+ See comments around SEL_IMERGE class and test_quick_select for more
+ details.
-ROW RETRIEVAL ALGORITHM
+ ROW RETRIEVAL ALGORITHM
- index_merge uses Unique class for duplicates removal. Index merge takes
- advantage of clustered covering primary key (CCPK) if the table has one.
- The algorithm is as follows:
+ index_merge uses Unique class for duplicates removal. index_merge takes
+ advantage of Clustered Primary Key (CPK) if the table has one.
+ The index_merge algorithm consists of two phases:
- prepare() //implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique
- {
- activate 'index only';
- while(retrieve next row for non-CCPK scan)
+ Phase 1 (implemented in QUICK_INDEX_MERGE_SELECT::prepare_unique):
+ prepare()
{
- if (there is a CCPK scan and row will be retrieved by it)
- skip this row;
- else
- put rowid into Unique;
+ activate 'index only';
+ while(retrieve next row for non-CPK scan)
+ {
+ if (there is a CPK scan and row will be retrieved by it)
+ skip this row;
+ else
+ put its rowid into Unique;
+ }
+ deactivate 'index only';
}
- deactivate 'index only';
- }
-
- fetch() //implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next calls
- {
- retrieve all rows from row pointers stored in Unique;
- free Unique;
- retrieve all rows for CCPK scan;
- }
+
+ Phase 2 (implemented as sequence of QUICK_INDEX_MERGE_SELECT::get_next
+ calls):
+ fetch()
+ {
+ retrieve all rows from row pointers stored in Unique;
+ free Unique;
+ retrieve all rows for CPK scan;
+ }
*/
class QUICK_INDEX_MERGE_SELECT : public QUICK_SELECT_I
@@ -239,10 +245,10 @@ public:
/* last element in quick_selects list */
QUICK_RANGE_SELECT* last_quick_select;
- /* quick select that uses Covering Clustered Primary Key (NULL if none) */
+ /* quick select that uses clustered primary key (NULL if none) */
QUICK_RANGE_SELECT* pk_quick_select;
- /* true if this select is currently doing a CCPK scan */
+ /* true if this select is currently doing a clustered PK scan */
bool doing_pk_scan;
Unique *unique;
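
To make the two-phase algorithm in the class comment above concrete, here is a
toy sketch (not server code): a std::set stands in for Unique, strings stand
in for rowids, and plain vectors stand in for the scans.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

int main()
{
  /* rowids produced by the non-CPK range scans (with a duplicate) */
  std::vector<std::string> non_cpk_scan= {"r1", "r2", "r3", "r2"};
  /* rows that the clustered PK scan will retrieve anyway */
  std::set<std::string> cpk_rows= {"r3", "r9"};

  /* Phase 1: collect rowids, skipping rows covered by the CPK scan;
     the set removes duplicates, as Unique does. */
  std::set<std::string> unique;
  for (const std::string &rowid : non_cpk_scan)
    if (!cpk_rows.count(rowid))
      unique.insert(rowid);

  /* Phase 2: fetch rows by the de-duplicated rowids, then run the CPK scan;
     the two result sets are disjoint by construction. */
  for (const std::string &rowid : unique)
    printf("fetch by rowid: %s\n", rowid.c_str());
  for (const std::string &rowid : cpk_rows)
    printf("fetch by CPK:   %s\n", rowid.c_str());
  return 0;
}
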
diff --git a/sql/records.cc b/sql/records.cc
index f8fbfe62187..b29b113a1bc 100644
--- a/sql/records.cc
+++ b/sql/records.cc
@@ -98,7 +98,6 @@ void init_read_record(READ_RECORD *info,THD *thd, TABLE *table,
}
}
else if (select && select->quick)
- //&& (select->quick->get_type() != QUICK_SELECT_I::QS_TYPE_INDEX_MERGE))
{
DBUG_PRINT("info",("using rr_quick"));
info->read_record=rr_quick;
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 8263789a2a2..76d2eae1bb5 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1233,7 +1233,8 @@ public:
}
bool get(TABLE *table);
-
+ static double get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size,
+ ulong max_in_memory_size);
friend int unique_write_to_file(gptr key, element_count count, Unique *unique);
friend int unique_write_to_ptrs(gptr key, element_count count, Unique *unique);
};
diff --git a/sql/uniques.cc b/sql/uniques.cc
index 02146426782..48233f29bee 100644
--- a/sql/uniques.cc
+++ b/sql/uniques.cc
@@ -63,12 +63,194 @@ Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
comp_func_fixed_arg);
  /* If the following fails the next add will also fail */
my_init_dynamic_array(&file_ptrs, sizeof(BUFFPEK), 16, 16);
+ /*
+ If you change the following, change it in get_max_elements function, too.
+ */
max_elements= max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+size);
open_cached_file(&file, mysql_tmpdir,TEMP_PREFIX, DISK_BUFFER_SIZE,
MYF(MY_WME));
}
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+#define M_E (exp(1))
+
+inline double log2_n_fact(double x)
+{
+ return (2 * ( ((x)+1) * log(((x)+1)/M_E) + log(2*M_PI*((x)+1))/2 ) / log(2));
+}
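
As a standalone sanity check (not in the patch), the Stirling-based
approximation used by log2_n_fact can be compared against the exact value
2*log2((n+1)!):

#include <cmath>
#include <cstdio>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Same approximation as log2_n_fact above: 2*log2((x+1)!) via Stirling. */
static double log2_n_fact_approx(double x)
{
  return 2 * ( (x+1) * log((x+1)/exp(1.0)) + log(2*M_PI*(x+1))/2 ) / log(2.0);
}

int main()
{
  const int ns[]= {10, 100, 1000};
  for (int n : ns)
  {
    double exact= 0.0;
    for (int i= 2; i <= n + 1; i++)     /* 2*log2((n+1)!) computed directly */
      exact+= log((double)i);
    exact= 2.0 * exact / log(2.0);
    printf("n=%4d  exact=%10.2f  stirling=%10.2f\n",
           n, exact, log2_n_fact_approx(n));
  }
  return 0;
}
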
+
+/*
+ Calculate cost of merge_buffers call.
+
+ NOTE
+ See comment near Unique::get_use_cost for cost formula derivation.
+*/
+static double get_merge_buffers_cost(uint* buff_sizes, uint elem_size,
+ int last, int f,int t)
+{
+ uint sum= 0;
+ for (int i=f; i <= t; i++)
+ sum+= buff_sizes[i];
+ buff_sizes[last]= sum;
+
+ int n_buffers= t - f + 1;
+ double buf_length= sum*elem_size;
+
+ return (((double)buf_length/(n_buffers+1)) / IO_SIZE) * 2 * n_buffers +
+ buf_length * log(n_buffers) / (TIME_FOR_COMPARE_ROWID * log(2.0));
+}
+
+/*
+ Calculate cost of merging buffers into one in Unique::get, i.e. calculate
+  how long (in terms of disk seeks) the two calls
+ merge_many_buffs(...);
+ merge_buffers(...);
+ will take.
+
+ SYNOPSIS
+ get_merge_many_buffs_cost()
+ alloc memory pool to use
+ maxbuffer # of full buffers.
+ max_n_elems # of elements in first maxbuffer buffers.
+ last_n_elems # of elements in last buffer.
+ elem_size size of buffer element.
+
+ NOTES
+ It is assumed that maxbuffer+1 buffers are merged, first maxbuffer buffers
+ contain max_n_elems each, last buffer contains last_n_elems elements.
+
+ The current implementation does a dumb simulation of merge_many_buffs
+ actions.
+
+ RETURN
+ >=0 Cost of merge in disk seeks.
+ <0 Out of memory.
+*/
+static double get_merge_many_buffs_cost(MEM_ROOT *alloc,
+ uint maxbuffer, uint max_n_elems,
+ uint last_n_elems, int elem_size)
+{
+ register int i;
+ double total_cost= 0.0;
+ int lastbuff;
+ uint* buff_sizes;
+
+ if (!(buff_sizes= (uint*)alloc_root(alloc, sizeof(uint) * (maxbuffer + 1))))
+ return -1.0;
+ for(i = 0; i < (int)maxbuffer; i++)
+ buff_sizes[i]= max_n_elems;
+
+ buff_sizes[maxbuffer]= last_n_elems;
+
+ if (maxbuffer >= MERGEBUFF2)
+ {
+ /* Simulate merge_many_buff */
+ while (maxbuffer >= MERGEBUFF2)
+ {
+ lastbuff=0;
+ for (i = 0; i <= (int) maxbuffer - MERGEBUFF*3/2; i += MERGEBUFF)
+ total_cost += get_merge_buffers_cost(buff_sizes, elem_size,
+ lastbuff++, i, i+MERGEBUFF-1);
+
+ total_cost += get_merge_buffers_cost(buff_sizes, elem_size,
+ lastbuff++, i, maxbuffer);
+ maxbuffer= (uint)lastbuff-1;
+ }
+ }
+
+ /* Simulate final merge_buff call. */
+ total_cost += get_merge_buffers_cost(buff_sizes, elem_size, 0, 0,
+ maxbuffer);
+ return total_cost;
+}
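
The "dumb simulation" above can also be reproduced outside the server. The
sketch below is an illustration only: it mirrors get_merge_buffers_cost and
the merge loop using a std::vector instead of a MEM_ROOT allocation, and it
assumes IO_SIZE=4096, MERGEBUFF=7 and MERGEBUFF2=15 (the values I believe the
sources used at the time).

#include <cmath>
#include <cstdio>
#include <vector>

#define IO_SIZE                 4096
#define TIME_FOR_COMPARE_ROWID  10
#define MERGEBUFF               7
#define MERGEBUFF2              15

/* Same cost formula as get_merge_buffers_cost above, on a std::vector. */
static double merge_buffers_cost(std::vector<unsigned> &sizes,
                                 unsigned elem_size, int last, int f, int t)
{
  unsigned sum= 0;
  for (int i= f; i <= t; i++)
    sum+= sizes[i];
  sizes[last]= sum;

  int n_buffers= t - f + 1;
  double buf_length= (double)sum * elem_size;
  return ((buf_length/(n_buffers+1)) / IO_SIZE) * 2 * n_buffers +
         buf_length * log((double)n_buffers) /
         (TIME_FOR_COMPARE_ROWID * log(2.0));
}

int main()
{
  /* e.g. 20 full trees of 10000 rowids, a last tree of 1234, 6-byte rowids */
  unsigned maxbuffer= 20, max_n_elems= 10000, last_n_elems= 1234, elem_size= 6;
  std::vector<unsigned> sizes(maxbuffer + 1, max_n_elems);
  sizes[maxbuffer]= last_n_elems;

  double total= 0.0;
  while (maxbuffer >= MERGEBUFF2)             /* simulate merge_many_buff */
  {
    int lastbuff= 0, i;
    for (i= 0; i <= (int)maxbuffer - MERGEBUFF*3/2; i+= MERGEBUFF)
      total+= merge_buffers_cost(sizes, elem_size, lastbuff++,
                                 i, i+MERGEBUFF-1);
    total+= merge_buffers_cost(sizes, elem_size, lastbuff++, i, maxbuffer);
    maxbuffer= lastbuff - 1;
  }
  /* final merge_buffers call */
  total+= merge_buffers_cost(sizes, elem_size, 0, 0, maxbuffer);
  printf("estimated merge cost: %g seeks\n", total);
  return 0;
}
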
+
+
+/*
+  Calculate cost of using Unique for processing nkeys elements of size
+ key_size using max_in_memory_size memory.
+
+ RETURN
+ Use cost as # of disk seeks.
+
+ NOTES
+    cost(using_unique) =
+ cost(create_trees) + (see #1)
+ cost(merge) + (see #2)
+ cost(read_result) (see #3)
+
+ 1. Cost of trees creation
+ For each Unique::put operation there will be 2*log2(n+1) elements
+    comparisons, where n runs from 1 to tree_size (we assume that all added
+ elements are different). Together this gives:
+
+ n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!) =
+
+ = 2*ln((N+1)!) / ln(2) = {using Stirling formula} =
+
+      = 2*( (N+1)*ln((N+1)/e) + (1/2)*ln(2*pi*(N+1)) ) / ln(2).
+
+    then cost(tree_creation) = n_compares/TIME_FOR_COMPARE_ROWID;
+
+ Total cost of creating trees:
+ (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
+
+ 2. Cost of merging.
+ If only one tree is created by Unique no merging will be necessary.
+ Otherwise, we model execution of merge_many_buff function and count
+    # of merges. (The reason behind this is that the number of buffers is
+    small, while the size of the buffers is big, and we don't want to lose
+    precision with an O(x)-style formula.)
+
+ 3. If only one tree is created by Unique no disk io will happen.
+    Otherwise, ceil(key_size*n_keys/IO_SIZE) disk seeks are necessary. We assume
+ these will be random seeks.
+*/
+
+double Unique::get_use_cost(MEM_ROOT *alloc, uint nkeys, uint key_size,
+ ulong max_in_memory_size)
+{
+ ulong max_elements_in_tree;
+ ulong last_tree_elems;
+ int n_full_trees; /* number of trees in unique - 1 */
+ double result;
+
+ max_elements_in_tree= max_in_memory_size /
+ ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size);
+ n_full_trees= nkeys / max_elements_in_tree;
+ last_tree_elems= nkeys % max_elements_in_tree;
+
+ /* Calculate cost of creating trees */
+ result= log2_n_fact(last_tree_elems);
+ if (n_full_trees)
+ result+= n_full_trees * log2_n_fact(max_elements_in_tree);
+ result /= TIME_FOR_COMPARE_ROWID;
+
+ /* Calculate cost of merging */
+ if (!n_full_trees)
+ return result;
+
+  /* There is more than one tree and merging is necessary. */
+ /* Add cost of writing all trees to disk. */
+ result += n_full_trees * ceil(key_size*max_elements_in_tree / IO_SIZE);
+ result += ceil(key_size*last_tree_elems / IO_SIZE);
+
+ /* Cost of merge */
+ result += get_merge_many_buffs_cost(alloc, n_full_trees,
+ max_elements_in_tree,
+ last_tree_elems, key_size);
+ /*
+ Add cost of reading the resulting sequence, assuming there were no
+ duplicate elements.
+ */
+ result += ceil((double)key_size*nkeys/IO_SIZE);
+
+ return result;
+}
+
Unique::~Unique()
{
close_cached_file(&file);