diff options
Diffstat (limited to 'sql/multi_range_read.cc')
-rw-r--r-- | sql/multi_range_read.cc | 309 |
1 files changed, 235 insertions, 74 deletions
diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc index 601268ec2f6..ec039962be9 100644 --- a/sql/multi_range_read.cc +++ b/sql/multi_range_read.cc @@ -56,28 +56,67 @@ contain scan parameters. */ -ha_rows +ha_rows handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param, uint n_ranges_arg, - uint *bufsz, uint *flags, Cost_estimate *cost) + uint *bufsz, uint *flags, + Cost_estimate *cost) { KEY_MULTI_RANGE range; range_seq_t seq_it; ha_rows total_rows= 0; uint n_ranges=0; - uint n_eq_ranges= 0; - ulonglong total_touched_blocks= 0; ha_rows max_rows= stats.records; THD *thd= table->in_use; + ulonglong io_blocks; + + /* + Counter of blocks that contain range edges for those ranges + for which records_in_range() is called + */ + ulonglong edge_blocks_cnt= 0; + /* + Counter of blocks that contain index tuples for those ranges + for which records_in_range() is called + */ + ulonglong range_blocks_cnt= 0; + /* + The position of the block containing the last record of the previous range + for which the info about range position is provided + */ + ulonglong prev_range_last_block= UNUSED_PAGE_NO; + /* The counter of records the staring from prev_range_last_block */ + ulonglong prev_range_last_block_records= 0; + /* + The counter of single point ranges. + (For single point ranges we do not call records_in_range()) + */ + ulonglong single_point_ranges= 0; + /* + The counter of of single point ranges that we succeded to assign + to some blocks + */ + ulonglong assigned_single_point_ranges= 0; + /* + Counter of single point ranges for which records_in_range in not + called and that are encountered between two ranges without such property + For example, let's have a subsequence of ranges + R1,r1,....rk,R2 + where r1,...,rk are single point ranges for which records_in_range is + called while R1 and R2 are not such ranges. + Then single_point_ranges_delta will count ranges r1,...,rk. + */ + ulonglong unassigned_single_point_ranges= 0; + + uint len= table->key_info[keyno].key_length + table->file->ref_length; + if (table->file->is_clustering_key(keyno)) + len= table->s->stored_rec_length; + /* Assume block is 75 % full */ + uint avg_block_records= ((uint) (stats.block_size*3/4))/len + 1; uint limit= thd->variables.eq_range_index_dive_limit; bool use_statistics_for_eq_range= eq_ranges_exceeds_limit(seq, seq_init_param, limit); - uint len= table->key_info[keyno].key_length + table->file->ref_length; - if (keyno == table->s->primary_key && table->file->primary_key_is_clustered()) - len= table->s->stored_rec_length; - /* Assume block is 75 % full */ - uint avg_block_records= ((uint) (table->file->stats.block_size*3/4))/len + 1; DBUG_ENTER("multi_range_read_info_const"); /* Default MRR implementation doesn't need buffer */ @@ -90,10 +129,8 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, if (unlikely(thd->killed != 0)) DBUG_RETURN(HA_POS_ERROR); - + n_ranges++; - if (range.range_flag & EQ_RANGE) - n_eq_ranges++; key_range *min_endp, *max_endp; if (range.range_flag & GEOM_FLAG) { @@ -108,67 +145,192 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, max_endp= range.end_key.length? &range.end_key : NULL; } int keyparts_used= my_count_bits(range.start_key.keypart_map); - if (use_statistics_for_eq_range && - !(range.range_flag & NULL_RANGE) && - (range.range_flag & EQ_RANGE) && - table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5) + + if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) { - if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) - rows= 1; /* there can be at most one row */ - else - rows= - (ha_rows) table->key_info[keyno].actual_rec_per_key(keyparts_used-1); + rows= 1; + /* + In this case we do not call records_in_range() and as a result + do not get any info on the edge blocks for this range. However if it + happens that the range for which we have such info uses the same block + for its first record as the last range for which such info is + provided uses for its last record then this range can be assigned + later to one of the blocks used by other ranges. + + Note that we don't have to increment edge_blocks_cnt or + range_blocks_cnt here. + */ + single_point_ranges++; + } + else if (use_statistics_for_eq_range && + !(range.range_flag & NULL_RANGE) && + (range.range_flag & EQ_RANGE) && + table->key_info[keyno].actual_rec_per_key(keyparts_used - 1) > 0.5) + { + rows= ((ha_rows) table->key_info[keyno]. + actual_rec_per_key(keyparts_used-1)); + range_blocks_cnt+= ((MY_MAX(rows, 1) - 1) / avg_block_records + 1); } else { - if ((range.range_flag & UNIQUE_RANGE) && !(range.range_flag & NULL_RANGE)) - rows= 1; /* there can be at most one row */ - else if (HA_POS_ERROR == (rows= this->records_in_range(keyno, min_endp, - max_endp))) + page_range pages= unused_page_range; + if ((rows= this->records_in_range(keyno, min_endp, max_endp, &pages)) == + HA_POS_ERROR) { /* Can't scan one range => can't do MRR scan at all */ total_rows= HA_POS_ERROR; break; } + if (pages.first_page == UNUSED_PAGE_NO) + { + /* + The engine does not provide info on the range position. + Place the range in a new block. Note that in this case + any new range will be placed in a new block. + */ + ulonglong additional_blocks= ((MY_MAX(rows,1) - 1) / avg_block_records + + 1); + edge_blocks_cnt+= additional_blocks == 1 ? 1 : 2; + range_blocks_cnt+= additional_blocks; + } + else + { + /* The info on the range position is provided */ + if (pages.first_page == prev_range_last_block) + { + /* + The new range starts in the same block that the last range + for which the position of the range was provided. + */ + /* + First add records of single point ranges that can be placed + between these two ranges. + */ + prev_range_last_block_records+= (single_point_ranges - + assigned_single_point_ranges); + assigned_single_point_ranges= single_point_ranges; + if (pages.first_page == pages.last_page) + { + /* + All records of the current range are in the same block + Note that the prev_range_last_block_records can be much larger + than max_records_in_block as the rows can be compressed! + */ + prev_range_last_block_records+= rows; + DBUG_ASSERT(prev_range_last_block_records < + stats.block_size); + } + else + { + /* + The current range spans more than one block + + Place part of the range records in 'prev_range_last_block' + and the remaining records in additional blocks. + + We don't know where the first key was positioned in the + block, so we assume the range started in the middle of the + block. + + Note that prev_range_last_block_records > avg_block_records + can be true in case of compressed rows. + */ + ha_rows rem_rows= rows; + + if (avg_block_records > prev_range_last_block_records) + { + ha_rows space_left_in_prev_block= + (avg_block_records - prev_range_last_block_records)/2; + rem_rows= 0; + if (rows > space_left_in_prev_block) + rem_rows= rows - space_left_in_prev_block; + } + /* Calculate how many additional blocks we need for rem_rows */ + ulonglong additional_blocks= ((MY_MAX(rem_rows, 1) - 1) / + avg_block_records + 1); + edge_blocks_cnt++; + range_blocks_cnt+= additional_blocks; + prev_range_last_block= pages.last_page; + /* There is at least one row on last page */ + prev_range_last_block_records= 1; + } + } + else + { + /* + The new range does not start in the same block that the last range + for which the position of the range was provided. + Note that rows may be 0! + */ + ulonglong additional_blocks= ((MY_MAX(rows, 1) - 1) / + avg_block_records + 1); + edge_blocks_cnt+= additional_blocks == 1 ? 1 : 2; + range_blocks_cnt+= additional_blocks; + unassigned_single_point_ranges+= (single_point_ranges - + assigned_single_point_ranges); + assigned_single_point_ranges= single_point_ranges; + prev_range_last_block= pages.last_page; + /* There is at least one row on last page */ + prev_range_last_block_records= 1; + } + } } - total_rows += rows; - total_touched_blocks+= (rows / avg_block_records +1); + total_rows+= rows; } - + /* + Count the number of io_blocks that where not yet read and thus not cached. + The number of equal read blocks that where not read are: + + (single_point_ranges - assigned_single_point_ranges). + + We don't add these to io_blocks as we don't want to penalize equal + readss (if we did, a range that would read 5 rows would be + regarded as better than one equal read). + + Better to assume we have done a records_in_range() for the equal + range and it's also cached. + */ + io_blocks= (range_blocks_cnt - edge_blocks_cnt); + unassigned_single_point_ranges+= (single_point_ranges - + assigned_single_point_ranges); + if (total_rows != HA_POS_ERROR) { set_if_smaller(total_rows, max_rows); + /* The following calculation is the same as in multi_range_read_info(): */ *flags |= HA_MRR_USE_DEFAULT_IMPL; cost->reset(); - cost->avg_io_cost= 1; /* assume random seeks */ - cost->idx_avg_io_cost= 1; - if (!((keyno == table->s->primary_key && primary_key_is_clustered()) || - is_clustering_key(keyno))) + cost->avg_io_cost= cost->idx_avg_io_cost= avg_io_cost(); + + if (!is_clustering_key(keyno)) { - cost->idx_io_count= total_touched_blocks + - keyread_time(keyno, 0, total_rows); - cost->cpu_cost= cost->idx_cpu_cost= - (double) total_rows / TIME_FOR_COMPARE_IDX + - (2 * n_ranges - n_eq_ranges) * IDX_LOOKUP_COST; + cost->idx_io_count= (double) io_blocks; + cost->idx_cpu_cost= (keyread_time(keyno, 0, total_rows) + + n_ranges * IDX_LOOKUP_COST); if (!(*flags & HA_MRR_INDEX_ONLY)) - { - cost->io_count= read_time(keyno, 0, total_rows); - cost->cpu_cost+= (double) total_rows / TIME_FOR_COMPARE; - } + cost->cpu_cost= read_time(keyno, 0, total_rows); } else { - cost->io_count= read_time(keyno, n_ranges, (uint) total_rows); - cost->cpu_cost= (double) total_rows / TIME_FOR_COMPARE + 0.01; + /* + Clustered index + If all index dives are to a few blocks, then limit the + ranges used by read_time to the number of dives. + */ + io_blocks+= unassigned_single_point_ranges; + cost->idx_cpu_cost= n_ranges * IDX_LOOKUP_COST; + uint limited_ranges= (uint) MY_MIN((ulonglong) n_ranges, io_blocks); + cost->cpu_cost= read_time(keyno, limited_ranges, total_rows); } + cost->cpu_cost+= (rows2double(total_rows) / TIME_FOR_COMPARE + + MULTI_RANGE_READ_SETUP_COST); } DBUG_PRINT("statistics", ("key: %s rows: %llu total_cost: %.3f io_blocks: %llu " "idx_io_count: %.3f cpu_cost: %.3f io_count: %.3f", table->s->keynames.type_names[keyno], - (ulonglong) total_rows, cost->total_cost(), - (ulonglong) total_touched_blocks, + (ulonglong) total_rows, cost->total_cost(), (ulonglong) io_blocks, cost->idx_io_count, cost->cpu_cost, cost->io_count)); DBUG_RETURN(total_rows); } @@ -209,7 +371,7 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, */ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, - uint key_parts, uint *bufsz, + uint key_parts, uint *bufsz, uint *flags, Cost_estimate *cost) { /* @@ -222,25 +384,26 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, *flags |= HA_MRR_USE_DEFAULT_IMPL; cost->reset(); - cost->avg_io_cost= 1; /* assume random seeks */ - /* Produce the same cost as non-MRR code does */ - if (!(keyno == table->s->primary_key && primary_key_is_clustered())) + if (!is_clustering_key(keyno)) { - cost->idx_io_count= n_ranges + keyread_time(keyno, 0, n_rows); - cost->cpu_cost= cost->idx_cpu_cost= - (double) n_rows / TIME_FOR_COMPARE_IDX + n_ranges * IDX_LOOKUP_COST; + /* + idx_io_count could potentially be increased with the number of + index leaf blocks we have to read for finding n_rows. + */ + cost->idx_io_count= n_ranges; + cost->idx_cpu_cost= (keyread_time(keyno, 0, n_rows) + + n_ranges * IDX_LOOKUP_COST); if (!(*flags & HA_MRR_INDEX_ONLY)) { - cost->io_count= read_time(keyno, 0, n_rows); - cost->cpu_cost+= (double) n_rows / TIME_FOR_COMPARE; + cost->cpu_cost= read_time(keyno, 0, n_rows); } } else { - cost->io_count= read_time(keyno, n_ranges, (uint)n_rows); - cost->cpu_cost= (double) n_rows / TIME_FOR_COMPARE + 0.01; + cost->cpu_cost= read_time(keyno, n_ranges, (uint)n_rows); } + cost->cpu_cost+= rows2double(n_rows) / TIME_FOR_COMPARE; return 0; } @@ -939,7 +1102,7 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, h_idx= (primary_file->inited == handler::INDEX)? primary_file: secondary_file; keyno= h_idx->active_index; - if (!(keyno == table->s->primary_key && h_idx->primary_key_is_clustered())) + if (! h_idx->is_clustering_key(keyno)) { strategy= disk_strategy= &reader_factory.ordered_rndpos_reader; if (h_arg->pushed_rowid_filter) @@ -976,11 +1139,10 @@ int DsMrr_impl::dsmrr_init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, if (strategy != index_strategy) { uint saved_pk_length=0; - if (h_idx->primary_key_is_clustered()) + uint pk= h_idx->get_table()->s->primary_key; + if (h_idx->pk_is_clustering_key(pk)) { - uint pk= h_idx->get_table()->s->primary_key; - if (pk != MAX_KEY) - saved_pk_length= h_idx->get_table()->key_info[pk].key_length; + saved_pk_length= h_idx->get_table()->key_info[pk].key_length; } KEY *used_index= &h_idx->get_table()->key_info[h_idx->active_index]; @@ -1185,9 +1347,9 @@ int DsMrr_impl::setup_two_handlers() We get here when the access alternates betwen MRR scan(s) and non-MRR scans. - Calling primary_file->index_end() will invoke dsmrr_close() for this object, - which will delete secondary_file. We need to keep it, so put it away and dont - let it be deleted: + Calling primary_file->index_end() will invoke dsmrr_close() for this + object, which will delete secondary_file. We need to keep it, so put it + away and don't let it be deleted: */ if (primary_file->inited == handler::INDEX) { @@ -1218,7 +1380,7 @@ void DsMrr_impl::close_second_handler() { secondary_file->extra(HA_EXTRA_NO_KEYREAD); secondary_file->ha_index_or_rnd_end(); - secondary_file->ha_external_lock(current_thd, F_UNLCK); + secondary_file->ha_external_unlock(current_thd); secondary_file->ha_close(); delete secondary_file; secondary_file= NULL; @@ -1621,8 +1783,7 @@ bool DsMrr_impl::check_cpk_scan(THD *thd, TABLE_SHARE *share, uint keyno, uint mrr_flags) { return MY_TEST((mrr_flags & HA_MRR_SINGLE_POINT) && - keyno == share->primary_key && - primary_file->primary_key_is_clustered() && + primary_file->is_clustering_key(keyno) && optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)); } @@ -1660,8 +1821,7 @@ bool DsMrr_impl::choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, TABLE_SHARE *share= primary_file->get_table_share(); bool doing_cpk_scan= check_cpk_scan(thd, share, keyno, *flags); - bool using_cpk= MY_TEST(keyno == share->primary_key && - primary_file->primary_key_is_clustered()); + bool using_cpk= primary_file->is_clustering_key(keyno); *flags &= ~HA_MRR_IMPLEMENTATION_FLAGS; if (!optimizer_flag(thd, OPTIMIZER_SWITCH_MRR) || *flags & HA_MRR_INDEX_ONLY || @@ -1915,6 +2075,9 @@ void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *cost) We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9. + If handler::avg_io_cost() < 1.0, then we will trust the handler + when it comes to the average cost (this is for example true for HEAP). + @param table Table to be accessed @param nrows Number of rows to retrieve @param interrupted TRUE <=> Assume that the disk sweep will be @@ -1928,12 +2091,12 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, DBUG_ENTER("get_sweep_read_cost"); cost->reset(); - if (table->file->primary_key_is_clustered()) + if (table->file->pk_is_clustering_key(table->s->primary_key)) { - cost->io_count= table->file->read_time(table->s->primary_key, + cost->cpu_cost= table->file->read_time(table->s->primary_key, (uint) nrows, nrows); } - else + else if ((cost->avg_io_cost= table->file->avg_io_cost()) >= 0.999) { double n_blocks= ceil(ulonglong2double(table->file->stats.data_file_length) / IO_SIZE); @@ -1961,5 +2124,3 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted, /* ************************************************************************** * DS-MRR implementation ends ***************************************************************************/ - - |