summaryrefslogtreecommitdiff
path: root/sql/multi_range_read.cc
diff options
context:
space:
mode:
Diffstat (limited to 'sql/multi_range_read.cc')
-rw-r--r--sql/multi_range_read.cc309
1 files changed, 235 insertions, 74 deletions
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc
index 601268ec2f6..ec039962be9 100644
--- a/sql/multi_range_read.cc
+++ b/sql/multi_range_read.cc
@@ -56,28 +56,67 @@
contain scan parameters.
*/
-ha_rows
+ha_rows
handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
void *seq_init_param, uint n_ranges_arg,
- uint *bufsz, uint *flags, Cost_estimate *cost)
+ uint *bufsz, uint *flags,
+ Cost_estimate *cost)
{
KEY_MULTI_RANGE range;
range_seq_t seq_it;
ha_rows total_rows= 0;
uint n_ranges=0;
- uint n_eq_ranges= 0;
- ulonglong total_touched_blocks= 0;
ha_rows max_rows= stats.records;
THD *thd= table->in_use;
+ ulonglong io_blocks;
+
+ /*
+ Counter of blocks that contain range edges for those ranges
+ for which records_in_range() is called
+ */
+ ulonglong edge_blocks_cnt= 0;
+ /*
+ Counter of blocks that contain index tuples for those ranges
+ for which records_in_range() is called
+ */
+ ulonglong range_blocks_cnt= 0;
+ /*
+ The position of the block containing the last record of the previous range
+ for which the info about range position is provided
+ */
+ ulonglong prev_range_last_block= UNUSED_PAGE_NO;
+  /* The counter of records starting from prev_range_last_block */
+ ulonglong prev_range_last_block_records= 0;
+ /*
+ The counter of single point ranges.
+ (For single point ranges we do not call records_in_range())
+ */
+ ulonglong single_point_ranges= 0;
+ /*
+    The counter of single point ranges that we succeeded to assign
+ to some blocks
+ */
+ ulonglong assigned_single_point_ranges= 0;
+ /*
+    Counter of single point ranges for which records_in_range is not
+    called and that are encountered between two ranges without such property.
+ For example, let's have a subsequence of ranges
+ R1,r1,....rk,R2
+ where r1,...,rk are single point ranges for which records_in_range is
+ called while R1 and R2 are not such ranges.
+ Then single_point_ranges_delta will count ranges r1,...,rk.
+ */
+ ulonglong unassigned_single_point_ranges= 0;
+
+ uint len= table->key_info[keyno].key_length + table->file->ref_length;
+ if (table->file->is_clustering_key(keyno))
+ len= table->s->stored_rec_length;
+ /* Assume block is 75 % full */
+ uint avg_block_records= ((uint) (stats.block_size*3/4))/len + 1;
uint limit= thd->variables.eq_range_index_dive_limit;
bool use_statistics_for_eq_range= eq_ranges_exceeds_limit(seq,
seq_init_param,
limit);
- uint len= table->key_info[keyno].key_length + table->file->ref_length;
- if (keyno == table->s->primary_key && table->file->primary_key_is_clustered())
- len= table->s->stored_rec_length;
- /* Assume block is 75 % full */
- uint avg_block_records= ((uint) (table->file->stats.block_size*3/4))/len + 1;
DBUG_ENTER("multi_range_read_info_const");
/* Default MRR implementation doesn't need buffer */
@@ -90,10 +129,8 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
if (unlikely(thd->killed != 0))
DBUG_RETURN(HA_POS_ERROR);
-
+
n_ranges++;
- if (range.range_flag & EQ_RANGE)
- n_eq_ranges++;
key_range *min_endp, *max_endp;
if (range.range_flag & GEOM_FLAG)
{
@@ -108,67 +145,192 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
max_endp= range.end_key.length? &range.end_key : NULL;
}
int keyparts_used= my_count_bits(range.start_key.keypart_map);
- if (use_statistics_for_eq_range &&
- !(range.range_flag & NULL_RANGE) &&
- (range.range_flag & EQ_RANGE) &&
- table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5)
+
+ if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
{
- if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
- rows= 1; /* there can be at most one row */
- else
- rows=
- (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used-1);
+ rows= 1;
+ /*
+ In this case we do not call records_in_range() and as a result
+ do not get any info on the edge blocks for this range. However if it
+ happens that the range for which we have such info uses the same block
+ for its first record as the last range for which such info is
+ provided uses for its last record then this range can be assigned
+ later to one of the blocks used by other ranges.
+
+ Note that we don't have to increment edge_blocks_cnt or
+ range_blocks_cnt here.
+ */
+ single_point_ranges++;
+ }
+ else if (use_statistics_for_eq_range &&
+ !(range.range_flag & NULL_RANGE) &&
+ (range.range_flag & EQ_RANGE) &&
+ table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5)
+ {
+ rows= ((ha_rows) table->key_info[keyno].
+ actual_rec_per_key(keyparts_used-1));
+ range_blocks_cnt+= ((MY_MAX(rows, 1) - 1) / avg_block_records + 1);
}
else
{
- if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE))
- rows= 1; /* there can be at most one row */
- else if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp,
- max_endp)))
+ page_range pages= unused_page_range;
+ if ((rows= this->records_in_range(keyno, min_endp, max_endp, &pages)) ==
+ HA_POS_ERROR)
{
/* Can't scan one range => can't do MRR scan at all */
total_rows= HA_POS_ERROR;
break;
}
+ if (pages.first_page == UNUSED_PAGE_NO)
+ {
+ /*
+ The engine does not provide info on the range position.
+ Place the range in a new block. Note that in this case
+ any new range will be placed in a new block.
+ */
+ ulonglong additional_blocks= ((MY_MAX(rows,1) - 1) / avg_block_records +
+ 1);
+ edge_blocks_cnt+= additional_blocks == 1 ? 1 : 2;
+ range_blocks_cnt+= additional_blocks;
+ }
+ else
+ {
+ /* The info on the range position is provided */
+ if (pages.first_page == prev_range_last_block)
+ {
+ /*
+ The new range starts in the same block that the last range
+ for which the position of the range was provided.
+ */
+ /*
+ First add records of single point ranges that can be placed
+ between these two ranges.
+ */
+ prev_range_last_block_records+= (single_point_ranges -
+ assigned_single_point_ranges);
+ assigned_single_point_ranges= single_point_ranges;
+ if (pages.first_page == pages.last_page)
+ {
+ /*
+ All records of the current range are in the same block
+ Note that the prev_range_last_block_records can be much larger
+ than max_records_in_block as the rows can be compressed!
+ */
+ prev_range_last_block_records+= rows;
+ DBUG_ASSERT(prev_range_last_block_records <
+ stats.block_size);
+ }
+ else
+ {
+ /*
+ The current range spans more than one block
+
+ Place part of the range records in 'prev_range_last_block'
+ and the remaining records in additional blocks.
+
+ We don't know where the first key was positioned in the
+ block, so we assume the range started in the middle of the
+ block.
+
+ Note that prev_range_last_block_records > avg_block_records
+ can be true in case of compressed rows.
+ */
+ ha_rows rem_rows= rows;
+
+ if (avg_block_records > prev_range_last_block_records)
+ {
+ ha_rows space_left_in_prev_block=
+ (avg_block_records - prev_range_last_block_records)/2;
+ rem_rows= 0;
+ if (rows > space_left_in_prev_block)
+ rem_rows= rows - space_left_in_prev_block;
+ }
+ /* Calculate how many additional blocks we need for rem_rows */
+ ulonglong additional_blocks= ((MY_MAX(rem_rows, 1) - 1) /
+ avg_block_records + 1);
+ edge_blocks_cnt++;
+ range_blocks_cnt+= additional_blocks;
+ prev_range_last_block= pages.last_page;
+ /* There is at least one row on last page */
+ prev_range_last_block_records= 1;
+ }
+ }
+ else
+ {
+ /*
+ The new range does not start in the same block that the last range
+ for which the position of the range was provided.
+ Note that rows may be 0!
+ */
+ ulonglong additional_blocks= ((MY_MAX(rows, 1) - 1) /
+ avg_block_records + 1);
+ edge_blocks_cnt+= additional_blocks == 1 ? 1 : 2;
+ range_blocks_cnt+= additional_blocks;
+ unassigned_single_point_ranges+= (single_point_ranges -
+ assigned_single_point_ranges);
+ assigned_single_point_ranges= single_point_ranges;
+ prev_range_last_block= pages.last_page;
+ /* There is at least one row on last page */
+ prev_range_last_block_records= 1;
+ }
+ }
}
- total_rows += rows;
- total_touched_blocks+= (rows / avg_block_records +1);
+ total_rows+= rows;
}
-
+ /*
+    Count the number of io_blocks that were not yet read and thus not cached.
+    The number of equal read blocks that were not read are:
+
+ (single_point_ranges - assigned_single_point_ranges).
+
+ We don't add these to io_blocks as we don't want to penalize equal
+    reads (if we did, a range that would read 5 rows would be
+ regarded as better than one equal read).
+
+ Better to assume we have done a records_in_range() for the equal
+ range and it's also cached.
+ */
+ io_blocks= (range_blocks_cnt - edge_blocks_cnt);
+ unassigned_single_point_ranges+= (single_point_ranges -
+ assigned_single_point_ranges);
+
if (total_rows != HA_POS_ERROR)
{
set_if_smaller(total_rows, max_rows);
+
/* The following calculation is the same as in multi_range_read_info(): */
*flags |= HA_MRR_USE_DEFAULT_IMPL;
cost->reset();
- cost->avg_io_cost= 1; /* assume random seeks */
- cost->idx_avg_io_cost= 1;
- if (!((keyno == table->s->primary_key && primary_key_is_clustered()) ||
- is_clustering_key(keyno)))
+ cost->avg_io_cost= cost->idx_avg_io_cost= avg_io_cost();
+
+ if (!is_clustering_key(keyno))
{
- cost->idx_io_count= total_touched_blocks +
- keyread_time(keyno, 0, total_rows);
- cost->cpu_cost= cost->idx_cpu_cost=
- (double) total_rows / TIME_FOR_COMPARE_IDX +
- (2 * n_ranges - n_eq_ranges) * IDX_LOOKUP_COST;
+ cost->idx_io_count= (double) io_blocks;
+ cost->idx_cpu_cost= (keyread_time(keyno, 0, total_rows) +
+ n_ranges * IDX_LOOKUP_COST);
if (!(*flags & HA_MRR_INDEX_ONLY))
- {
- cost->io_count= read_time(keyno, 0, total_rows);
- cost->cpu_cost+= (double) total_rows / TIME_FOR_COMPARE;
- }
+ cost->cpu_cost= read_time(keyno, 0, total_rows);
}
else
{
- cost->io_count= read_time(keyno, n_ranges, (uint) total_rows);
- cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01;
+ /*
+ Clustered index
+ If all index dives are to a few blocks, then limit the
+ ranges used by read_time to the number of dives.
+ */
+ io_blocks+= unassigned_single_point_ranges;
+ cost->idx_cpu_cost= n_ranges * IDX_LOOKUP_COST;
+ uint limited_ranges= (uint) MY_MIN((ulonglong) n_ranges, io_blocks);
+ cost->cpu_cost= read_time(keyno, limited_ranges, total_rows);
}
+ cost->cpu_cost+= (rows2double(total_rows) / TIME_FOR_COMPARE +
+ MULTI_RANGE_READ_SETUP_COST);
}
DBUG_PRINT("statistics",
("key: %s rows: %llu total_cost: %.3f io_blocks: %llu "
"idx_io_count: %.3f cpu_cost: %.3f io_count: %.3f",
table->s->keynames.type_names[keyno],
- (ulonglong) total_rows, cost->total_cost(),
- (ulonglong) total_touched_blocks,
+ (ulonglong) total_rows, cost->total_cost(), (ulonglong) io_blocks,
cost->idx_io_count, cost->cpu_cost, cost->io_count));
DBUG_RETURN(total_rows);
}
@@ -209,7 +371,7 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
*/
ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
- uint key_parts, uint *bufsz,
+ uint key_parts, uint *bufsz,
uint *flags, Cost_estimate *cost)
{
/*
@@ -222,25 +384,26 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows,
*flags |= HA_MRR_USE_DEFAULT_IMPL;
cost->reset();
- cost->avg_io_cost= 1; /* assume random seeks */
-
/* Produce the same cost as non-MRR code does */
- if (!(keyno == table->s->primary_key && primary_key_is_clustered()))
+ if (!is_clustering_key(keyno))
{
- cost->idx_io_count= n_ranges + keyread_time(keyno, 0, n_rows);
- cost->cpu_cost= cost->idx_cpu_cost=
- (double) n_rows / TIME_FOR_COMPARE_IDX + n_ranges * IDX_LOOKUP_COST;
+ /*
+ idx_io_count could potentially be increased with the number of
+ index leaf blocks we have to read for finding n_rows.
+ */
+ cost->idx_io_count= n_ranges;
+ cost->idx_cpu_cost= (keyread_time(keyno, 0, n_rows) +
+ n_ranges * IDX_LOOKUP_COST);
if (!(*flags & HA_MRR_INDEX_ONLY))
{
- cost->io_count= read_time(keyno, 0, n_rows);
- cost->cpu_cost+= (double) n_rows / TIME_FOR_COMPARE;
+ cost->cpu_cost= read_time(keyno, 0, n_rows);
}
}
else
{
- cost->io_count= read_time(keyno, n_ranges, (uint)n_rows);
- cost->cpu_cost= (double) n_rows / TIME_FOR_COMPARE + 0.01;
+ cost->cpu_cost= read_time(keyno, n_ranges, (uint)n_rows);
}
+ cost->cpu_cost+= rows2double(n_rows) / TIME_FOR_COMPARE;
return 0;
}
@@ -939,7 +1102,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
h_idx= (primary_file->inited == handler::INDEX)? primary_file: secondary_file;
keyno= h_idx->active_index;
- if (!(keyno == table->s->primary_key && h_idx->primary_key_is_clustered()))
+ if (! h_idx->is_clustering_key(keyno))
{
strategy= disk_strategy= &reader_factory.ordered_rndpos_reader;
if (h_arg->pushed_rowid_filter)
@@ -976,11 +1139,10 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs,
if (strategy != index_strategy)
{
uint saved_pk_length=0;
- if (h_idx->primary_key_is_clustered())
+ uint pk= h_idx->get_table()->s->primary_key;
+ if (h_idx->pk_is_clustering_key(pk))
{
- uint pk= h_idx->get_table()->s->primary_key;
- if (pk != MAX_KEY)
- saved_pk_length= h_idx->get_table()->key_info[pk].key_length;
+ saved_pk_length= h_idx->get_table()->key_info[pk].key_length;
}
KEY *used_index= &h_idx->get_table()->key_info[h_idx->active_index];
@@ -1185,9 +1347,9 @@ int DsMrr_impl::setup_two_handlers()
We get here when the access alternates betwen MRR scan(s) and non-MRR
scans.
- Calling primary_file->index_end() will invoke dsmrr_close() for this object,
- which will delete secondary_file. We need to keep it, so put it away and dont
- let it be deleted:
+ Calling primary_file->index_end() will invoke dsmrr_close() for this
+ object, which will delete secondary_file. We need to keep it, so put it
+ away and don't let it be deleted:
*/
if (primary_file->inited == handler::INDEX)
{
@@ -1218,7 +1380,7 @@ void DsMrr_impl::close_second_handler()
{
secondary_file->extra(HA_EXTRA_NO_KEYREAD);
secondary_file->ha_index_or_rnd_end();
- secondary_file->ha_external_lock(current_thd, F_UNLCK);
+ secondary_file->ha_external_unlock(current_thd);
secondary_file->ha_close();
delete secondary_file;
secondary_file= NULL;
@@ -1621,8 +1783,7 @@ bool DsMrr_impl::check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno,
uint mrr_flags)
{
return MY_TEST((mrr_flags & HA_MRR_SINGLE_POINT) &&
- keyno == share->primary_key &&
- primary_file->primary_key_is_clustered() &&
+ primary_file->is_clustering_key(keyno) &&
optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS));
}
@@ -1660,8 +1821,7 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags,
TABLE_SHARE *share= primary_file->get_table_share();
bool doing_cpk_scan= check_cpk_scan(thd, share, keyno, *flags);
- bool using_cpk= MY_TEST(keyno == share->primary_key &&
- primary_file->primary_key_is_clustered());
+ bool using_cpk= primary_file->is_clustering_key(keyno);
*flags &= ~HA_MRR_IMPLEMENTATION_FLAGS;
if (!optimizer_flag(thd, OPTIMIZER_SWITCH_MRR) ||
*flags & HA_MRR_INDEX_ONLY ||
@@ -1915,6 +2075,9 @@ void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *cost)
We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
+ If handler::avg_io_cost() < 1.0, then we will trust the handler
+ when it comes to the average cost (this is for example true for HEAP).
+
@param table Table to be accessed
@param nrows Number of rows to retrieve
@param interrupted TRUE <=> Assume that the disk sweep will be
@@ -1928,12 +2091,12 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
DBUG_ENTER("get_sweep_read_cost");
cost->reset();
- if (table->file->primary_key_is_clustered())
+ if (table->file->pk_is_clustering_key(table->s->primary_key))
{
- cost->io_count= table->file->read_time(table->s->primary_key,
+ cost->cpu_cost= table->file->read_time(table->s->primary_key,
(uint) nrows, nrows);
}
- else
+ else if ((cost->avg_io_cost= table->file->avg_io_cost()) >= 0.999)
{
double n_blocks=
ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE);
@@ -1961,5 +2124,3 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
/* **************************************************************************
* DS-MRR implementation ends
***************************************************************************/
-
-