diff options
author | Sergey Petrunya <psergey@askmonty.org> | 2010-11-01 18:49:59 +0300 |
---|---|---|
committer | Sergey Petrunya <psergey@askmonty.org> | 2010-11-01 18:49:59 +0300 |
commit | 6c15806b68a668b7039d8e5b5f2c6623c50bff5f (patch) | |
tree | d3cc5d6d70ecf2e90336b414c931c403a3461249 /sql/multi_range_read.h | |
parent | d48a8b60345c5b5f95c2ce590f7032a7c9f87c4b (diff) | |
parent | b76a8595c611bedf512b19a7c4ccc260f0d0a8f6 (diff) | |
download | mariadb-git-6c15806b68a668b7039d8e5b5f2c6623c50bff5f.tar.gz |
MWL#121-124 DS-MRR support for key-ordered retrieval, etc
- Merge into 5.3-main
Diffstat (limited to 'sql/multi_range_read.h')
-rw-r--r-- | sql/multi_range_read.h | 522 |
1 files changed, 493 insertions, 29 deletions
diff --git a/sql/multi_range_read.h b/sql/multi_range_read.h index 90e2e4c93d6..14bcdd5de6c 100644 --- a/sql/multi_range_read.h +++ b/sql/multi_range_read.h @@ -1,49 +1,430 @@ -/* - This file contains declarations for - - Disk-Sweep MultiRangeRead (DS-MRR) implementation +/** + @defgroup DS-MRR declarations + @{ */ /** - A Disk-Sweep MRR interface implementation + A Disk-Sweep implementation of MRR Interface (DS-MRR for short) - This implementation makes range (and, in the future, 'ref') scans to read - table rows in disk sweeps. - - Currently it is used by MyISAM and InnoDB. Potentially it can be used with - any table handler that has non-clustered indexes and on-disk rows. + This is a "plugin"(*) for storage engines that allows to + 1. When doing index scans, read table rows in rowid order; + 2. when making many index lookups, do them in key order and don't + lookup the same key value multiple times; + 3. Do both #1 and #2, when applicable. + These changes are expected to speed up query execution for disk-based + storage engines running io-bound loads and "big" queries (ie. queries that + do joins and enumerate lots of records). + + (*) - only conceptually. No dynamic loading or binary compatibility of any + kind. + + General scheme of things: + + SQL Layer code + | | | + v v v + -|---|---|---- handler->multi_range_read_XXX() function calls + | | | + _____________________________________ + / DS-MRR module \ + | (order/de-duplicate lookup keys, | + | scan indexes in key order, | + | order/de-duplicate rowids, | + | retrieve full record reads in rowid | + | order) | + \_____________________________________/ + | | | + -|---|---|----- handler->read_range_first()/read_range_next(), + | | | handler->index_read(), handler->rnd_pos() calls. + | | | + v v v + Storage engine internals + + + Currently DS-MRR is used by MyISAM, InnoDB/XtraDB and Maria storage engines. + Potentially it can be used with any table handler that has disk-based data + storage and has better performance when reading data in rowid order. */ -class DsMrr_impl +#include "sql_lifo_buffer.h" + +class DsMrr_impl; + +class Key_parameters { public: - typedef void (handler::*range_check_toggle_func_t)(bool on); + /* TRUE <=> We can get at most one index tuple for a lookup key */ + bool index_ranges_unique; - DsMrr_impl() - : h2(NULL) {}; + uint key_tuple_length; /* Length of index lookup tuple, in bytes */ + key_part_map key_tuple_map; /* keyparts used in index lookup tuples */ + + /* + This is + = key_tuple_length if we copy keys to buffer + = sizeof(void*) if we're using pointers to materialized keys. + */ + uint key_size_in_keybuf; + + /* TRUE <=> don't copy key values, use pointers to them instead. */ + bool use_key_pointers; +}; + +/** + Iterator over (record, range_id) pairs that match given key value. + + We may need to scan multiple (key_val, range_id) pairs with the same + key value. A key value may have multiple matching records, so we'll need to + produce a cross-product of sets of matching records and range_id-s. +*/ +class Mrr_ordered_index_reader; +class Key_value_records_iterator +{ + /* Scan parameters */ + Key_parameters *param; + Lifo_buffer_iterator identical_key_it; + uchar *last_identical_key_ptr; + bool get_next_row; + //handler *h; + /* TRUE <=> We can get at most one index tuple for a lookup key */ + //bool index_ranges_unique; + Mrr_ordered_index_reader *owner; + /* key_buffer.read() reads to here */ + uchar *cur_index_tuple; +public: + bool init(Mrr_ordered_index_reader *owner_arg); + /* - The "owner" handler object (the one that calls dsmrr_XXX functions. - It is used to retrieve full table rows by calling rnd_pos(). + Get next (key_val, range_id) pair. */ + int get_next(); + + void close(); + friend class Mrr_ordered_index_reader; +}; + + +/* + Something that will manage buffers for those that call it +*/ +class Buffer_manager +{ +public: + virtual void reset_buffer_sizes()= 0; + virtual void setup_buffer_sizes(uint key_size_in_keybuf, + key_part_map key_tuple_map)=0; + virtual Lifo_buffer* get_key_buffer()= 0; + virtual ~Buffer_manager(){} +}; + + +/* + Abstract MRR execution strategy + + An object of this class produces (R, range_info) pairs where R can be an + index tuple or a table record. + + Getting HA_ERR_END_OF_FILE from get_next() means that the source should be + re-filled. + + Was: + if eof() returns true after refill attempt, then the end of + stream has been reached and get_next() must not be called anymore. + + Now: + if refill_buffer() returns HA_ERR_END_OF_FILE that means the stream is + really exhausted. + +*/ + +class Mrr_reader +{ +public: + virtual int get_next(char **range_info) = 0; + virtual int refill_buffer()=0; + + virtual ~Mrr_reader() {}; /* just to remove compiler warning */ +}; + + +/* A common base for strategies that do index scans and produce index tuples */ +class Mrr_index_reader : public Mrr_reader +{ +public: handler *h; - TABLE *table; /* Always equal to h->table */ + + virtual int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg) = 0; + virtual bool eof() = 0; + virtual uchar *get_rowid_ptr()= 0; + virtual bool skip_record(char *range_id, uchar *rowid)=0; +}; + + +/* + A "bypass" strategy that uses default MRR implementation (i.e. + handler::multi_range_read_XXX() calls) to produce rows. +*/ + +class Mrr_simple_index_reader : public Mrr_index_reader +{ + int res; +public: + int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg); + int get_next(char **range_info); + int refill_buffer() { return HA_ERR_END_OF_FILE; } + bool eof() { return test(res); } + uchar *get_rowid_ptr() { return h->ref; } + bool skip_record(char *range_id, uchar *rowid) + { + return (h->mrr_funcs.skip_record && + h->mrr_funcs.skip_record(h->mrr_iter, range_id, rowid)); + } +}; + + + +/* + A strategy that sorts index lookup keys before scanning the index +*/ + +class Mrr_ordered_index_reader : public Mrr_index_reader +{ +public: + int init(handler *h_arg, RANGE_SEQ_IF *seq_funcs, + void *seq_init_param, uint n_ranges, + uint mode, Buffer_manager *buf_manager_arg); + int get_next(char **range_info); + int refill_buffer(); + bool eof() { return index_scan_eof; } + uchar *get_rowid_ptr() { return h->ref; } + + bool skip_record(char *range_info, uchar *rowid) + { + return (mrr_funcs.skip_record && + mrr_funcs.skip_record(mrr_iter, range_info, rowid)); + } private: - /* Secondary handler object. It is used for scanning the index */ - handler *h2; + Key_value_records_iterator kv_it; + + bool scanning_key_val_iter; + + char *cur_range_info; - /* Buffer to store rowids, or (rowid, range_id) pairs */ - uchar *rowids_buf; - uchar *rowids_buf_cur; /* Current position when reading/writing */ - uchar *rowids_buf_last; /* When reading: end of used buffer space */ - uchar *rowids_buf_end; /* End of the buffer */ + /* Buffer to store (key, range_id) pairs */ + Lifo_buffer *key_buffer; - bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */ + Buffer_manager *buf_manager; - /* TRUE <=> need range association, buffer holds {rowid, range_id} pairs */ + /* Initially FALSE, becomes TRUE when we've set key_tuple_xxx members */ + bool know_key_tuple_params; + + // bool use_key_pointers; + + Key_parameters keypar; + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ bool is_mrr_assoc; - bool use_default_impl; /* TRUE <=> shortcut all calls to default MRR impl */ + bool no_more_keys; + RANGE_SEQ_IF mrr_funcs; + range_seq_t mrr_iter; + + //bool auto_refill; + + bool index_scan_eof; + + static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2); + static int key_tuple_cmp_reverse(void* arg, uchar* key1, uchar* key2); + //void cleanup(); + + friend class Key_value_records_iterator; + friend class DsMrr_impl; + friend class Mrr_ordered_rndpos_reader; +}; + + +/* MRR strategy that fetches rowids */ + +class Mrr_ordered_rndpos_reader : public Mrr_reader +{ +public: + int init(handler *h, Mrr_index_reader *index_reader, uint mode, + Lifo_buffer *buf); + int get_next(char **range_info); + int refill_buffer(); + int refill2(); + void cleanup(); +private: + handler *h; + + DsMrr_impl *dsmrr; + /* This what we get (rowid, range_info) pairs from */ + Mrr_index_reader *index_reader; + uchar *index_rowid; + bool index_reader_exhausted; + + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ + bool is_mrr_assoc; + + uchar *last_identical_rowid; + Lifo_buffer *rowid_buffer; + + /* = h->ref_length [ + sizeof(range_assoc_info) ] */ + //uint rowid_buff_elem_size; + + /* rowid_buffer.read() will set the following: */ + uchar *rowid; + uchar *rowids_range_id; +}; + +class Mrr_reader_factory +{ +public: + Mrr_ordered_rndpos_reader ordered_rndpos_reader; + Mrr_ordered_index_reader ordered_index_reader; + Mrr_simple_index_reader simple_index_reader; +}; + +/* + DS-MRR implementation for one table. Create/use one object of this class for + each ha_{myisam/innobase/etc} object. That object will be further referred to + as "the handler" + + DsMrr_impl supports has the following execution strategies: + + - Bypass DS-MRR, pass all calls to default MRR implementation, which is + an MRR-to-non-MRR call converter. + - Key-Ordered Retrieval + - Rowid-Ordered Retrieval + + DsMrr_impl will use one of the above strategies, or a combination of them, + according to the following diagram: + + (mrr function calls) + | + +----------------->-----------------+ + | | + ___________v______________ _______________v________________ + / default: use lookup keys \ / KEY-ORDERED RETRIEVAL: \ + | (or ranges) in whatever | | sort lookup keys and then make | + | order they are supplied | | index lookups in index order | + \__________________________/ \________________________________/ + | | | | | + +---<---+ | +--------------->-----------|----+ + | | | | + | | +---------------+ | + | ______v___ ______ | _______________v_______________ + | / default: read \ | / ROWID-ORDERED RETRIEVAL: \ + | | table records | | | Before reading table records, | + v | in random order | v | sort their rowids and then | + | \_________________/ | | read them in rowid order | + | | | \_______________________________/ + | | | | + | | | | + +-->---+ | +----<------+-----------<--------+ + | | | + v v v + (table records and range_ids) + + The choice of strategy depends on MRR scan properties, table properties + (whether we're scanning clustered primary key), and @@optimizer_switch + settings. + + Key-Ordered Retrieval + --------------------- + The idea is: if MRR scan is essentially a series of lookups on + + tbl.key=value1 OR tbl.key=value2 OR ... OR tbl.key=valueN + + then it makes sense to collect and order the set of lookup values, i.e. + + sort(value1, value2, .. valueN) + + and then do index lookups in index order. This results in fewer index page + fetch operations, and we also can avoid making multiple index lookups for the + same value. That is, if value1=valueN we can easily discover that after + sorting and make one index lookup for them instead of two. + + Rowid-Ordered Retrieval + ----------------------- + If we do a regular index scan or a series of index lookups, we'll be hitting + table records at random. For disk-based engines, this is much slower than + reading the same records in disk order. We assume that disk ordering of + rows is the same as ordering of their rowids (which is provided by + handler::cmp_ref()) + In order to retrieve records in different order, we must separate index + scanning and record fetching, that is, MRR scan uses the following steps: + + 1. Scan the index (and only index, that is, with HA_EXTRA_KEYREAD on) and + fill a buffer with {rowid, range_id} pairs + 2. Sort the buffer by rowid value + 3. for each {rowid, range_id} pair in the buffer + get record by rowid and return the {record, range_id} pair + 4. Repeat the above steps until we've exhausted the list of ranges we're + scanning. + + Buffer space management considerations + -------------------------------------- + With regards to buffer/memory management, MRR interface specifies that + - SQL layer provides multi_range_read_init() with buffer of certain size. + - MRR implementation may use (i.e. have at its disposal till the end of + the MRR scan) all of the buffer, or return the unused end of the buffer + to SQL layer. + + DS-MRR needs buffer in order to accumulate and sort rowids and/or keys. When + we need to accumulate/sort only keys (or only rowids), it is fairly trivial. + + When we need to accumulate/sort both keys and rowids, efficient buffer use + gets complicated. We need to: + - First, accumulate keys and sort them + - Then use the keys (smaller values go first) to obtain rowids. A key is not + needed after we've got matching rowids for it. + - Make sure that rowids are accumulated at the front of the buffer, so that we + can return the end part of the buffer to SQL layer, should there be too + few rowid values to occupy the buffer. + + All of these goals are achieved by using the following scheme: + + | | We get an empty buffer from SQL layer. + + | *-| + | *----| First, we fill the buffer with keys. Key_buffer + | *-------| part grows from end of the buffer space to start + | *----------| (In this picture, the buffer is big enough to + | *-------------| accomodate all keys and even have some space left) + + | *=============| We want to do key-ordered index scan, so we sort + the keys + + |-x *===========| Then we use the keys get rowids. Rowids are + |----x *========| stored from start of buffer space towards the end. + |--------x *=====| The part of the buffer occupied with keys + |------------x *===| gradually frees up space for rowids. In this + |--------------x *=| picture we run out of keys before we've ran out + |----------------x | of buffer space (it can be other way as well). + + |================x | Then we sort the rowids. + + | |~~~| The unused part of the buffer is at the end, so + we can return it to the SQL layer. + + |================* Sorted rowids are then used to read table records + in disk order + +*/ + +class DsMrr_impl : public Buffer_manager +{ public: + typedef void (handler::*range_check_toggle_func_t)(bool on); + + DsMrr_impl() + : h2(NULL) {}; + void init(handler *h_arg, TABLE *table_arg) { h= h_arg; @@ -52,19 +433,102 @@ public: int dsmrr_init(handler *h, RANGE_SEQ_IF *seq_funcs, void *seq_init_param, uint n_ranges, uint mode, HANDLER_BUFFER *buf); void dsmrr_close(); - int dsmrr_fill_buffer(); int dsmrr_next(char **range_info); - ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint *bufsz, - uint *flags, COST_VECT *cost); + ha_rows dsmrr_info(uint keyno, uint n_ranges, uint keys, uint key_parts, + uint *bufsz, uint *flags, COST_VECT *cost); ha_rows dsmrr_info_const(uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param, uint n_ranges, uint *bufsz, uint *flags, COST_VECT *cost); private: + /* Buffer to store (key, range_id) pairs */ + Lifo_buffer *key_buffer; + + /* + The "owner" handler object (the one that is expected to "own" this object + and call its functions). + */ + handler *h; + TABLE *table; /* Always equal to h->table */ + + /* + Secondary handler object. (created when needed, we need it when we need + to run both index scan and rnd_pos() scan at the same time) + */ + handler *h2; + + /** Properties of current MRR scan **/ + + uint keyno; /* index we're running the scan on */ + /* TRUE <=> need range association, buffers hold {rowid, range_id} pairs */ + bool is_mrr_assoc; + /* TRUE <=> sort the keys before making index lookups */ + //bool do_sort_keys; + /* TRUE <=> sort rowids and use rnd_pos() to get and return full records */ + //bool do_rndpos_scan; + + Mrr_reader_factory strategy_factory; + Mrr_reader *strategy; + Mrr_index_reader *index_strategy; + + /* The whole buffer space that we're using */ + uchar *full_buf; + uchar *full_buf_end; + + /* + When using both rowid and key buffers: the boundary between key and rowid + parts of the buffer. This is the "original" value, actual memory ranges + used by key and rowid parts may be different because of dynamic space + reallocation between them. + */ + uchar *rowid_buffer_end; + + /** Index scaning and key buffer-related members **/ + + /* + One of the following two is used for key buffer: forward is used when + we only need key buffer, backward is used when we need both key and rowid + buffers. + */ + Forward_lifo_buffer forward_key_buf; + Backward_lifo_buffer backward_key_buf; + + Forward_lifo_buffer rowid_buffer; + + /* = key_size_in_keybuf [ + sizeof(range_assoc_info) ] */ + //uint key_buff_elem_size_; + + /** rnd_pos() scan and rowid buffer-related members **/ + + /* + Buffer to store (rowid, range_id) pairs, or just rowids if + is_mrr_assoc==FALSE + */ + //Forward_lifo_buffer rowid_buffer; + bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, COST_VECT *cost); bool get_disk_sweep_mrr_cost(uint keynr, ha_rows rows, uint flags, uint *buffer_size, COST_VECT *cost); + bool check_cpk_scan(THD *thd, uint keyno, uint mrr_flags); + + void reallocate_buffer_space(); + + /* Buffer_manager implementation */ + void setup_buffer_sizes(uint key_size_in_keybuf, key_part_map key_tuple_map); + void reset_buffer_sizes(); + Lifo_buffer* get_key_buffer() { return key_buffer; } + + friend class Key_value_records_iterator; + friend class Mrr_ordered_index_reader; + friend class Mrr_ordered_rndpos_reader; + + int setup_two_handlers(); + void close_second_handler(); }; +/** + @} (end of group DS-MRR declarations) +*/ + |