| field | value | date |
|---|---|---|
| author | Sergei Golubchik <sergii@pisem.net> | 2010-11-25 18:17:28 +0100 |
| committer | Sergei Golubchik <sergii@pisem.net> | 2010-11-25 18:17:28 +0100 |
| commit | 65ca700def99289cc31a7040537f5aa6e12bf485 (patch) | |
| tree | 97b3a07299b626c519da0e80c122b5b79b933914 /sql/sql_join_cache.cc | |
| parent | 2ab57de38d13d927ddff2d51aed4af34e13998f5 (diff) | |
| parent | 6e5bcca7935d3c62f84bb640e5357664a210ee12 (diff) | |
| download | mariadb-git-65ca700def99289cc31a7040537f5aa6e12bf485.tar.gz | |
merge.
checkpoint.
does not compile.
Diffstat (limited to 'sql/sql_join_cache.cc')
-rw-r--r-- | sql/sql_join_cache.cc | 3294 |
1 file changed, 3294 insertions, 0 deletions
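The patch below adds the whole join cache module (block nested-loop and batched key access buffering) in one go. As a rough orientation before the 3294-line diff, here is a minimal sketch of the blocked nested-loop idea the file is built around; the names and in-memory tables are purely illustrative and are not the patch's JOIN_CACHE classes. The real code additionally stores flag fields, null bitmaps and rowids per buffered record, and for BKA builds index lookup keys directly from the buffer.

```c++
// Minimal, self-contained sketch of blocked nested-loop (BNL) joining.
// All identifiers are hypothetical; this is not MariaDB's JOIN_CACHE API.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct Row { int key; int payload; };

// Join `outer` with `inner` on Row::key using a fixed-size "join buffer":
// buffer a block of outer rows, then scan the inner table once per block
// instead of once per outer row.
static void block_nested_loop_join(const std::vector<Row> &outer,
                                   const std::vector<Row> &inner,
                                   std::size_t buffer_rows)
{
  for (std::size_t start= 0; start < outer.size(); start+= buffer_rows)
  {
    /* "Fill the join buffer" with the next block of outer rows. */
    std::size_t end= std::min(start + buffer_rows, outer.size());

    /* One pass over the inner table serves the whole buffered block;
       this is the scan saving that BNL (and, with batched key lookups,
       BKA) is after. */
    for (const Row &irow : inner)
      for (std::size_t i= start; i < end; i++)
        if (outer[i].key == irow.key)
          std::cout << outer[i].payload << " joins " << irow.payload << "\n";
  }
}

int main()
{
  std::vector<Row> t1= {{1, 10}, {2, 20}, {3, 30}};
  std::vector<Row> t2= {{2, 200}, {3, 300}, {4, 400}};
  block_nested_loop_join(t1, t2, /* buffer_rows= */ 2);
  return 0;
}
```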
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc new file mode 100644 index 00000000000..10064590a75 --- /dev/null +++ b/sql/sql_join_cache.cc @@ -0,0 +1,3294 @@ +/* Copyright (C) 2000-2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/** + @file + + @brief + join cache optimizations + + @defgroup Query_Optimizer Query Optimizer + @{ +*/ + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#include "mysql_priv.h" +#include "sql_select.h" +#include "opt_subselect.h" + +#define NO_MORE_RECORDS_IN_BUFFER (uint)(-1) + + +/***************************************************************************** + * Join cache module +******************************************************************************/ + +/* + Fill in the descriptor of a flag field associated with a join cache + + SYNOPSIS + add_field_flag_to_join_cache() + str position in a record buffer to copy the field from/to + length length of the field + field IN/OUT pointer to the field descriptor to fill in + + DESCRIPTION + The function fill in the descriptor of a cache flag field to which + the parameter 'field' points to. The function uses the first two + parameters to set the position in the record buffer from/to which + the field value is to be copied and the length of the copied fragment. + Before returning the result the function increments the value of + *field by 1. + The function ignores the fields 'blob_length' and 'ofset' of the + descriptor. + + RETURN + the length of the field +*/ + +static +uint add_flag_field_to_join_cache(uchar *str, uint length, CACHE_FIELD **field) +{ + CACHE_FIELD *copy= *field; + copy->str= str; + copy->length= length; + copy->type= 0; + copy->field= 0; + copy->referenced_field_no= 0; + (*field)++; + return length; +} + + +/* + Fill in the descriptors of table data fields associated with a join cache + + SYNOPSIS + add_table_data_fields_to_join_cache() + tab descriptors of fields from this table are to be filled + field_set descriptors for only these fields are to be created + field_cnt IN/OUT counter of data fields + descr IN/OUT pointer to the first descriptor to be filled + field_ptr_cnt IN/OUT counter of pointers to the data fields + descr_ptr IN/OUT pointer to the first pointer to blob descriptors + + DESCRIPTION + The function fills in the descriptors of cache data fields from the table + 'tab'. The descriptors are filled only for the fields marked in the + bitmap 'field_set'. + The function fills the descriptors starting from the position pointed + by 'descr'. If an added field is of a BLOB type then a pointer to the + its descriptor is added to the array descr_ptr. + At the return 'descr' points to the position after the last added + descriptor while 'descr_ptr' points to the position right after the + last added pointer. 
+ + RETURN + the total length of the added fields +*/ + +static +uint add_table_data_fields_to_join_cache(JOIN_TAB *tab, + MY_BITMAP *field_set, + uint *field_cnt, + CACHE_FIELD **descr, + uint *field_ptr_cnt, + CACHE_FIELD ***descr_ptr) +{ + Field **fld_ptr; + uint len= 0; + CACHE_FIELD *copy= *descr; + CACHE_FIELD **copy_ptr= *descr_ptr; + uint used_fields= bitmap_bits_set(field_set); + for (fld_ptr= tab->table->field; used_fields; fld_ptr++) + { + if (bitmap_is_set(field_set, (*fld_ptr)->field_index)) + { + len+= (*fld_ptr)->fill_cache_field(copy); + if (copy->type == CACHE_BLOB) + { + *copy_ptr= copy; + copy_ptr++; + (*field_ptr_cnt)++; + } + copy->field= *fld_ptr; + copy->referenced_field_no= 0; + copy++; + (*field_cnt)++; + used_fields--; + } + } + *descr= copy; + *descr_ptr= copy_ptr; + return len; +} + + +/* + Determine different counters of fields associated with a record in the cache + + SYNOPSIS + calc_record_fields() + + DESCRIPTION + The function counts the number of total fields stored in a record + of the cache and saves this number in the 'fields' member. It also + determines the number of flag fields and the number of blobs. + The function sets 'with_match_flag' on if 'join_tab' needs a match flag + i.e. if it is the first inner table of an outer join or a semi-join. + + RETURN + none +*/ + +void JOIN_CACHE::calc_record_fields() +{ + JOIN_TAB *tab = prev_cache ? prev_cache->join_tab : + join->join_tab+join->const_tables; + tables= join_tab-tab; + + fields= 0; + blobs= 0; + flag_fields= 0; + data_field_count= 0; + data_field_ptr_count= 0; + referenced_fields= 0; + + for ( ; tab < join_tab ; tab++) + { + calc_used_field_length(join->thd, tab); + flag_fields+= test(tab->used_null_fields || tab->used_uneven_bit_fields); + flag_fields+= test(tab->table->maybe_null); + fields+= tab->used_fields; + blobs+= tab->used_blobs; + + fields+= tab->check_rowid_field(); + } + if ((with_match_flag= join_tab->use_match_flag())) + flag_fields++; + fields+= flag_fields; +} + +/* + Allocate memory for descriptors and pointers to them associated with the cache + + SYNOPSIS + alloc_fields() + + DESCRIPTION + The function allocates memory for the array of fields descriptors + and the array of pointers to the field descriptors used to copy + join record data from record buffers into the join buffer and + backward. Some pointers refer to the field descriptor associated + with previous caches. They are placed at the beginning of the + array of pointers and its total number is specified by the parameter + 'external fields'. + The pointer of the first array is assigned to field_descr and the + number of elements is precalculated by the function calc_record_fields. + The allocated arrays are adjacent. + + NOTES + The memory is allocated in join->thd->memroot + + RETURN + pointer to the first array +*/ + +int JOIN_CACHE::alloc_fields(uint external_fields) +{ + uint ptr_cnt= external_fields+blobs+1; + uint fields_size= sizeof(CACHE_FIELD)*fields; + field_descr= (CACHE_FIELD*) sql_alloc(fields_size + + sizeof(CACHE_FIELD*)*ptr_cnt); + blob_ptr= (CACHE_FIELD **) ((uchar *) field_descr + fields_size); + return (field_descr == NULL); +} + +/* + Create descriptors of the record flag fields stored in the join buffer + + SYNOPSIS + create_flag_fields() + + DESCRIPTION + The function creates descriptors of the record flag fields stored + in the join buffer. These are descriptors for: + - an optional match flag field, + - table null bitmap fields, + - table null row fields. 
+ The match flag field is created when 'join_tab' is the first inner + table of an outer join our a semi-join. A null bitmap field is + created for any table whose fields are to be stored in the join + buffer if at least one of these fields is nullable or is a BIT field + whose bits are partially stored with null bits. A null row flag + is created for any table assigned to the cache if it is an inner + table of an outer join. + The descriptor for flag fields are placed one after another at the + beginning of the array of field descriptors 'field_descr' that + contains 'fields' elements. If there is a match flag field the + descriptor for it is always first in the sequence of flag fields. + The descriptors for other flag fields can follow in an arbitrary + order. + The flag field values follow in a record stored in the join buffer + in the same order as field descriptors, with the match flag always + following first. + The function sets the value of 'flag_fields' to the total number + of the descriptors created for the flag fields. + The function sets the value of 'length' to the total length of the + flag fields. + + RETURN + none +*/ + +void JOIN_CACHE::create_flag_fields() +{ + CACHE_FIELD *copy; + JOIN_TAB *tab; + + copy= field_descr; + + length=0; + + /* If there is a match flag the first field is always used for this flag */ + if (with_match_flag) + length+= add_flag_field_to_join_cache((uchar*) &join_tab->found, + sizeof(join_tab->found), + ©); + + /* Create fields for all null bitmaps and null row flags that are needed */ + for (tab= join_tab-tables; tab < join_tab; tab++) + { + TABLE *table= tab->table; + + /* Create a field for the null bitmap from table if needed */ + if (tab->used_null_fields || tab->used_uneven_bit_fields) + length+= add_flag_field_to_join_cache(table->null_flags, + table->s->null_bytes, + ©); + + /* Create table for the null row flag if needed */ + if (table->maybe_null) + length+= add_flag_field_to_join_cache((uchar*) &table->null_row, + sizeof(table->null_row), + ©); + } + + /* Theoretically the new value of flag_fields can be less than the old one */ + flag_fields= copy-field_descr; +} + + +/* + Create descriptors of all remaining data fields stored in the join buffer + + SYNOPSIS + create_remaining_fields() + all_read_fields indicates that descriptors for all read data fields + are to be created + + DESCRIPTION + The function creates descriptors for all remaining data fields of a + record from the join buffer. If the parameter 'all_read_fields' is + true the function creates fields for all read record fields that + comprise the partial join record joined with join_tab. Otherwise, + for each table tab, the set of the read fields for which the descriptors + have to be added is determined as the difference between all read fields + and and those for which the descriptors have been already created. + The latter are supposed to be marked in the bitmap tab->table->tmp_set. + The function increases the value of 'length' to the the total length of + the added fields. + + NOTES + If 'all_read_fields' is false the function modifies the value of + tab->table->tmp_set for a each table whose fields are stored in the cache. + The function calls the method Field::fill_cache_field to figure out + the type of the cache field and the maximal length of its representation + in the join buffer. If this is a blob field then additionally a pointer + to this field is added as an element of the array blob_ptr. 
For a blob + field only the size of the length of the blob data is taken into account. + It is assumed that 'data_field_count' contains the number of descriptors + for data fields that have been already created and 'data_field_ptr_count' + contains the number of the pointers to such descriptors having been + stored up to the moment. + + RETURN + none +*/ + +void JOIN_CACHE:: create_remaining_fields(bool all_read_fields) +{ + JOIN_TAB *tab; + CACHE_FIELD *copy= field_descr+flag_fields+data_field_count; + CACHE_FIELD **copy_ptr= blob_ptr+data_field_ptr_count; + + for (tab= join_tab-tables; tab < join_tab; tab++) + { + MY_BITMAP *rem_field_set; + TABLE *table= tab->table; + + if (all_read_fields) + rem_field_set= table->read_set; + else + { + bitmap_invert(&table->tmp_set); + bitmap_intersect(&table->tmp_set, table->read_set); + rem_field_set= &table->tmp_set; + } + + length+= add_table_data_fields_to_join_cache(tab, rem_field_set, + &data_field_count, ©, + &data_field_ptr_count, + ©_ptr); + + /* SemiJoinDuplicateElimination: allocate space for rowid if needed */ + if (tab->keep_current_rowid) + { + copy->str= table->file->ref; + copy->length= table->file->ref_length; + copy->type= 0; + copy->field= 0; + copy->referenced_field_no= 0; + length+= copy->length; + data_field_count++; + copy++; + } + } +} + + +/* + Calculate and set all cache constants + + SYNOPSIS + set_constants() + + DESCRIPTION + The function calculates and set all precomputed constants that are used + when writing records into the join buffer and reading them from it. + It calculates the size of offsets of a record within the join buffer + and of a field within a record. It also calculates the number of bytes + used to store record lengths. + The function also calculates the maximal length of the representation + of record in the cache excluding blob_data. This value is used when + making a dicision whether more records should be added into the join + buffer or not. + + RETURN + none +*/ + +void JOIN_CACHE::set_constants() +{ + /* + Any record from a BKA cache is prepended with the record length. + We use the record length when reading the buffer and building key values + for each record. The length allows us not to read the fields that are + not needed for keys. + If a record has match flag it also may be skipped when the match flag + is on. It happens if the cache is used for a semi-join operation or + for outer join when the 'not exist' optimization can be applied. + If some of the fields are referenced from other caches then + the record length allows us to easily reach the saved offsets for + these fields since the offsets are stored at the very end of the record. + However at this moment we don't know whether we have referenced fields for + the cache or not. Later when a referenced field is registered for the cache + we adjust the value of the flag 'with_length'. + */ + with_length= is_key_access() || + join_tab->is_inner_table_of_semi_join_with_first_match() || + join_tab->is_inner_table_of_outer_join(); + /* + At this moment we don't know yet the value of 'referenced_fields', + but in any case it can't be greater than the value of 'fields'. + */ + uint len= length + fields*sizeof(uint)+blobs*sizeof(uchar *) + + (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) + + sizeof(ulong); + buff_size= max(join->thd->variables.join_buff_size, 2*len); + size_of_rec_ofs= offset_size(buff_size); + size_of_rec_len= blobs ? 
size_of_rec_ofs : offset_size(len); + size_of_fld_ofs= size_of_rec_len; + /* + The size of the offsets for referenced fields will be added later. + The values of 'pack_length' and 'pack_length_with_blob_ptrs' are adjusted + every time when the first reference to the referenced field is registered. + */ + pack_length= (with_length ? size_of_rec_len : 0) + + (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) + + length; + pack_length_with_blob_ptrs= pack_length + blobs*sizeof(uchar *); +} + + +/* + Allocate memory for a join buffer + + SYNOPSIS + alloc_buffer() + + DESCRIPTION + The function allocates a lump of memory for the cache join buffer. The + size of the allocated memory is 'buff_size' bytes. + + RETURN + 0 - if the memory has been successfully allocated + 1 - otherwise +*/ + +int JOIN_CACHE::alloc_buffer() +{ + buff= (uchar*) my_malloc(buff_size, MYF(0)); + return buff == NULL; +} + + +/* + Initialize a BNL cache + + SYNOPSIS + init() + + DESCRIPTION + The function initializes the cache structure. It supposed to be called + right after a constructor for the JOIN_CACHE_BNL. + The function allocates memory for the join buffer and for descriptors of + the record fields stored in the buffer. + + NOTES + The code of this function should have been included into the constructor + code itself. However the new operator for the class JOIN_CACHE_BNL would + never fail while memory allocation for the join buffer is not absolutely + unlikely to fail. That's why this memory allocation has to be placed in a + separate function that is called in a couple with a cache constructor. + It is quite natural to put almost all other constructor actions into + this function. + + RETURN + 0 initialization with buffer allocations has been succeeded + 1 otherwise +*/ + +int JOIN_CACHE_BNL::init() +{ + DBUG_ENTER("JOIN_CACHE::init"); + + calc_record_fields(); + + if (alloc_fields(0)) + DBUG_RETURN(1); + + create_flag_fields(); + + create_remaining_fields(TRUE); + + set_constants(); + + if (alloc_buffer()) + DBUG_RETURN(1); + + reset(TRUE); + + DBUG_RETURN(0); +} + + +/* + Initialize a BKA cache + + SYNOPSIS + init() + + DESCRIPTION + The function initializes the cache structure. It supposed to be called + right after a constructor for the JOIN_CACHE_BKA. + The function allocates memory for the join buffer and for descriptors of + the record fields stored in the buffer. + + NOTES + The code of this function should have been included into the constructor + code itself. However the new operator for the class JOIN_CACHE_BKA would + never fail while memory allocation for the join buffer is not absolutely + unlikely to fail. That's why this memory allocation has to be placed in a + separate function that is called in a couple with a cache constructor. + It is quite natural to put almost all other constructor actions into + this function. + + RETURN + 0 initialization with buffer allocations has been succeeded + 1 otherwise +*/ + +int JOIN_CACHE_BKA::init() +{ + JOIN_TAB *tab; + JOIN_CACHE *cache; + local_key_arg_fields= 0; + external_key_arg_fields= 0; + DBUG_ENTER("JOIN_CACHE_BKA::init"); + + calc_record_fields(); + + /* Mark all fields that can be used as arguments for this key access */ + TABLE_REF *ref= &join_tab->ref; + cache= this; + do + { + /* + Traverse the ref expressions and find the occurrences of fields in them for + each table 'tab' whose fields are to be stored in the 'cache' join buffer. + Mark these fields in the bitmap tab->table->tmp_set. 
+ For these fields count the number of them stored in this cache and the + total number of them stored in the previous caches. Save the result + of the counting 'in local_key_arg_fields' and 'external_key_arg_fields' + respectively. + */ + for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++) + { + uint key_args; + bitmap_clear_all(&tab->table->tmp_set); + for (uint i= 0; i < ref->key_parts; i++) + { + Item *ref_item= ref->items[i]; + if (!(tab->table->map & ref_item->used_tables())) + continue; + ref_item->walk(&Item::add_field_to_set_processor, 1, + (uchar *) tab->table); + } + if ((key_args= bitmap_bits_set(&tab->table->tmp_set))) + { + if (cache == this) + local_key_arg_fields+= key_args; + else + external_key_arg_fields+= key_args; + } + } + cache= cache->prev_cache; + } + while (cache); + + if (alloc_fields(external_key_arg_fields)) + DBUG_RETURN(1); + + create_flag_fields(); + + /* + Save pointers to the cache fields in previous caches + that are used to build keys for this key access. + */ + cache= this; + uint ext_key_arg_cnt= external_key_arg_fields; + CACHE_FIELD *copy; + CACHE_FIELD **copy_ptr= blob_ptr; + while (ext_key_arg_cnt) + { + cache= cache->prev_cache; + for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++) + { + CACHE_FIELD *copy_end; + MY_BITMAP *key_read_set= &tab->table->tmp_set; + /* key_read_set contains the bitmap of tab's fields referenced by ref */ + if (bitmap_is_clear_all(key_read_set)) + continue; + copy_end= cache->field_descr+cache->fields; + for (copy= cache->field_descr+cache->flag_fields; copy < copy_end; copy++) + { + /* + (1) - when we store rowids for DuplicateWeedout, they have + copy->field==NULL + */ + if (copy->field && // (1) + copy->field->table == tab->table && + bitmap_is_set(key_read_set, copy->field->field_index)) + { + *copy_ptr++= copy; + ext_key_arg_cnt--; + if (!copy->referenced_field_no) + { + /* + Register the referenced field 'copy': + - set the offset number in copy->referenced_field_no, + - adjust the value of the flag 'with_length', + - adjust the values of 'pack_length' and + of 'pack_length_with_blob_ptrs'. + */ + copy->referenced_field_no= ++cache->referenced_fields; + cache->with_length= TRUE; + cache->pack_length+= cache->get_size_of_fld_offset(); + cache->pack_length_with_blob_ptrs+= cache->get_size_of_fld_offset(); + } + } + } + } + } + /* After this 'blob_ptr' shall not be be changed */ + blob_ptr= copy_ptr; + + /* Now create local fields that are used to build ref for this key access */ + copy= field_descr+flag_fields; + for (tab= join_tab-tables; tab < join_tab ; tab++) + { + length+= add_table_data_fields_to_join_cache(tab, &tab->table->tmp_set, + &data_field_count, ©, + &data_field_ptr_count, + ©_ptr); + } + + use_emb_key= check_emb_key_usage(); + + create_remaining_fields(FALSE); + + set_constants(); + + if (alloc_buffer()) + DBUG_RETURN(1); + + reset(TRUE); + + DBUG_RETURN(0); +} + + +/* + Check the possibility to read the access keys directly from the join buffer + + SYNOPSIS + check_emb_key_usage() + + DESCRIPTION + The function checks some conditions at which the key values can be read + directly from the join buffer. This is possible when the key values can be + composed by concatenation of the record fields stored in the join buffer. + Sometimes when the access key is multi-component the function has to re-order + the fields written into the join buffer to make keys embedded. If key + values for the key access are detected as embedded then 'use_emb_key' + is set to TRUE. 
+ + EXAMPLE + Let table t2 has an index defined on the columns a,b . Let's assume also + that the columns t2.a, t2.b as well as the columns t1.a, t1.b are all + of the integer type. Then if the query + SELECT COUNT(*) FROM t1, t2 WHERE t1.a=t2.a and t1.b=t2.b + is executed with a join cache in such a way that t1 is the driving + table then the key values to access table t2 can be read directly + from the join buffer. + + NOTES + In some cases key values could be read directly from the join buffer but + we still do not consider them embedded. In the future we'll expand the + the class of keys which we identify as embedded. + + RETURN + TRUE - key values will be considered as embedded, + FALSE - otherwise. +*/ + +bool JOIN_CACHE_BKA::check_emb_key_usage() +{ + uint i; + Item *item; + KEY_PART_INFO *key_part; + CACHE_FIELD *copy; + CACHE_FIELD *copy_end; + uint len= 0; + TABLE *table= join_tab->table; + TABLE_REF *ref= &join_tab->ref; + KEY *keyinfo= table->key_info+ref->key; + + /* + If some of the key arguments are not from the local cache the key + is not considered as embedded. + TODO: + Expand it to the case when ref->key_parts=1 and local_key_arg_fields=0. + */ + if (external_key_arg_fields != 0) + return FALSE; + /* + If the number of the local key arguments is not equal to the number + of key parts the key value cannot be read directly from the join buffer. + */ + if (local_key_arg_fields != ref->key_parts) + return FALSE; + + /* + A key is not considered embedded if one of the following is true: + - one of its key parts is not equal to a field + - it is a partial key + - definition of the argument field does not coincide with the + definition of the corresponding key component + - some of the key components are nullable + */ + for (i=0; i < ref->key_parts; i++) + { + item= ref->items[i]->real_item(); + if (item->type() != Item::FIELD_ITEM) + return FALSE; + key_part= keyinfo->key_part+i; + if (key_part->key_part_flag & HA_PART_KEY_SEG) + return FALSE; + if (!key_part->field->eq_def(((Item_field *) item)->field)) + return FALSE; + if (key_part->field->maybe_null()) + return FALSE; + } + + copy= field_descr+flag_fields; + copy_end= copy+local_key_arg_fields; + for ( ; copy < copy_end; copy++) + { + /* + If some of the key arguments are of variable length the key + is not considered as embedded. + */ + if (copy->type != 0) + return FALSE; + /* + If some of the key arguments are bit fields whose bits are partially + stored with null bits the key is not considered as embedded. + */ + if (copy->field->type() == MYSQL_TYPE_BIT && + ((Field_bit*) (copy->field))->bit_len) + return FALSE; + len+= copy->length; + } + + emb_key_length= len; + + /* + Make sure that key fields follow the order of the corresponding + key components these fields are equal to. For this the descriptors + of the fields that comprise the key might be re-ordered. 
+ */ + for (i= 0; i < ref->key_parts; i++) + { + uint j; + Item *item= ref->items[i]->real_item(); + Field *fld= ((Item_field *) item)->field; + CACHE_FIELD *init_copy= field_descr+flag_fields+i; + for (j= i, copy= init_copy; i < local_key_arg_fields; i++, copy++) + { + if (fld->eq(copy->field)) + { + if (j != i) + { + CACHE_FIELD key_part_copy= *copy; + *copy= *init_copy; + *init_copy= key_part_copy; + } + break; + } + } + } + + return TRUE; +} + + +/* + Calculate the increment of the MRR buffer for a record write + + SYNOPSIS + aux_buffer_incr() + + DESCRIPTION + This implementation of the virtual function aux_buffer_incr determines + for how much the size of the MRR buffer should be increased when another + record is added to the cache. + + RETURN + the increment of the size of the MRR buffer for the next record +*/ + +uint JOIN_CACHE_BKA::aux_buffer_incr() +{ + uint incr= 0; + TABLE_REF *ref= &join_tab->ref; + TABLE *tab= join_tab->table; + uint rec_per_key= tab->key_info[ref->key].rec_per_key[ref->key_parts-1]; + set_if_bigger(rec_per_key, 1); + if (records == 1) + incr= ref->key_length + tab->file->ref_length; + incr+= tab->file->stats.mrr_length_per_rec * rec_per_key; + return incr; +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + JOIN_CACHE_BKA::skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + This function is invoked from MRR implementation to check if an index + tuple matches the index condition. It is used in the case where the index + condition actually depends on both columns of the used index and columns + from previous tables. + + Accessing columns of the previous tables requires special handling with + BKA. The idea of BKA is to collect record combinations in a buffer and + then do a batch of ref access lookups, i.e. by the time we're doing a + lookup its previous-records-combination is not in prev_table->record[0] + but somewhere in the join buffer. + + We need to get it from there back into prev_table(s)->record[0] before we + can evaluate the index condition, and that's why we need this function + instead of regular IndexConditionPushdown. + + NOTE + Possible optimization: + Before we unpack the record from a previous table + check if this table is used in the condition. + If so then unpack the record otherwise skip the unpacking. + This should be done by a special virtual method + get_partial_record_by_pos(). + + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise +*/ + +bool JOIN_CACHE_BKA::skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("JOIN_CACHE_BKA::skip_index_tuple"); + JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq; + cache->get_record_by_pos((uchar*)range_info); + DBUG_RETURN(!join_tab->cache_idx_cond->val_int()); +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + bka_skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + This is wrapper for JOIN_CACHE_BKA::skip_index_tuple method, + see comments there. + + NOTE + This function is used as a RANGE_SEQ_IF::skip_index_tuple callback. 
+ + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise +*/ + +static +bool bka_skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("bka_skip_index_tuple"); + JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq; + DBUG_RETURN(cache->skip_index_tuple(rseq, range_info)); +} + + +/* + Write record fields and their required offsets into the join cache buffer + + SYNOPSIS + write_record_data() + link a reference to the associated info in the previous cache + is_full OUT true if it has been decided that no more records will be + added to the join buffer + + DESCRIPTION + This function put into the cache buffer the following info that it reads + from the join record buffers or computes somehow: + (1) the length of all fields written for the record (optional) + (2) an offset to the associated info in the previous cache (if there is any) + determined by the link parameter + (3) all flag fields of the tables whose data field are put into the cache: + - match flag (optional), + - null bitmaps for all tables, + - null row flags for all tables + (4) values of all data fields including + - full images of those fixed legth data fields that cannot have + trailing spaces + - significant part of fixed length fields that can have trailing spaces + with the prepanded length + - data of non-blob variable length fields with the prepanded data length + - blob data from blob fields with the prepanded data length + (5) record offset values for the data fields that are referred to from + other caches + + The record is written at the current position stored in the field 'pos'. + At the end of the function 'pos' points at the position right after the + written record data. + The function increments the number of records in the cache that is stored + in the 'records' field by 1. The function also modifies the values of + 'curr_rec_pos' and 'last_rec_pos' to point to the written record. + The 'end_pos' cursor is modified accordingly. + The 'last_rec_blob_data_is_in_rec_buff' is set on if the blob data + remains in the record buffers and not copied to the join buffer. It may + happen only to the blob data from the last record added into the cache. + + + RETURN + length of the written record data +*/ + +uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full) +{ + uint len; + bool last_record; + CACHE_FIELD *copy; + CACHE_FIELD *copy_end; + uchar *cp= pos; + uchar *init_pos= cp; + uchar *rec_len_ptr= 0; + + records++; /* Increment the counter of records in the cache */ + + len= pack_length; + + /* Make an adjustment for the size of the auxiliary buffer if there is any */ + uint incr= aux_buffer_incr(); + ulong rem= rem_space(); + aux_buff_size+= len+incr < rem ? incr : rem; + + /* + For each blob to be put into cache save its length and a pointer + to the value in the corresponding element of the blob_ptr array. + Blobs with null values are skipped. + Increment 'len' by the total length of all these blobs. + */ + if (blobs) + { + CACHE_FIELD **copy_ptr= blob_ptr; + CACHE_FIELD **copy_ptr_end= copy_ptr+blobs; + for ( ; copy_ptr < copy_ptr_end; copy_ptr++) + { + Field_blob *blob_field= (Field_blob *) (*copy_ptr)->field; + if (!blob_field->is_null()) + { + uint blob_len= blob_field->get_length(); + (*copy_ptr)->blob_length= blob_len; + len+= blob_len; + blob_field->get_ptr(&(*copy_ptr)->str); + } + } + } + + /* + Check whether we won't be able to add any new record into the cache after + this one because the cache will be full. Set last_record to TRUE if it's so. 
+ The assume that the cache will be full after the record has been written + into it if either the remaining space of the cache is not big enough for the + record's blob values or if there is a chance that not all non-blob fields + of the next record can be placed there. + This function is called only in the case when there is enough space left in + the cache to store at least non-blob parts of the current record. + */ + last_record= (len+pack_length_with_blob_ptrs) > rem_space(); + + /* + Save the position for the length of the record in the cache if it's needed. + The length of the record will be inserted here when all fields of the record + are put into the cache. + */ + if (with_length) + { + rec_len_ptr= cp; + cp+= size_of_rec_len; + } + + /* + Put a reference to the fields of the record that are stored in the previous + cache if there is any. This reference is passed by the 'link' parameter. + */ + if (prev_cache) + { + cp+= prev_cache->get_size_of_rec_offset(); + prev_cache->store_rec_ref(cp, link); + } + + curr_rec_pos= cp; + + /* If the there is a match flag set its value to 0 */ + copy= field_descr; + if (with_match_flag) + *copy[0].str= 0; + + /* First put into the cache the values of all flag fields */ + copy_end= field_descr+flag_fields; + for ( ; copy < copy_end; copy++) + { + memcpy(cp, copy->str, copy->length); + cp+= copy->length; + } + + /* Now put the values of the remaining fields as soon as they are not nulls */ + copy_end= field_descr+fields; + for ( ; copy < copy_end; copy++) + { + Field *field= copy->field; + if (field && field->maybe_null() && field->is_null()) + { + /* Do not copy a field if its value is null */ + if (copy->referenced_field_no) + copy->offset= 0; + continue; + } + /* Save the offset of the field to put it later at the end of the record */ + if (copy->referenced_field_no) + copy->offset= cp-curr_rec_pos; + + if (copy->type == CACHE_BLOB) + { + Field_blob *blob_field= (Field_blob *) copy->field; + if (last_record) + { + last_rec_blob_data_is_in_rec_buff= 1; + /* Put down the length of the blob and the pointer to the data */ + blob_field->get_image(cp, copy->length+sizeof(char*), + blob_field->charset()); + cp+= copy->length+sizeof(char*); + } + else + { + /* First put down the length of the blob and then copy the data */ + blob_field->get_image(cp, copy->length, + blob_field->charset()); + memcpy(cp+copy->length, copy->str, copy->blob_length); + cp+= copy->length+copy->blob_length; + } + } + else + { + switch (copy->type) { + case CACHE_VARSTR1: + /* Copy the significant part of the short varstring field */ + len= (uint) copy->str[0] + 1; + memcpy(cp, copy->str, len); + cp+= len; + break; + case CACHE_VARSTR2: + /* Copy the significant part of the long varstring field */ + len= uint2korr(copy->str) + 2; + memcpy(cp, copy->str, len); + cp+= len; + break; + case CACHE_STRIPPED: + { + /* + Put down the field value stripping all trailing spaces off. + After this insert the length of the written sequence of bytes. 
+ */ + uchar *str, *end; + for (str= copy->str, end= str+copy->length; + end > str && end[-1] == ' '; + end--) ; + len=(uint) (end-str); + int2store(cp, len); + memcpy(cp+2, str, len); + cp+= len+2; + break; + } + default: + /* Copy the entire image of the field from the record buffer */ + memcpy(cp, copy->str, copy->length); + cp+= copy->length; + } + } + } + + /* Add the offsets of the fields that are referenced from other caches */ + if (referenced_fields) + { + uint cnt= 0; + for (copy= field_descr+flag_fields; copy < copy_end ; copy++) + { + if (copy->referenced_field_no) + { + store_fld_offset(cp+size_of_fld_ofs*(copy->referenced_field_no-1), + copy->offset); + cnt++; + } + } + cp+= size_of_fld_ofs*cnt; + } + + if (rec_len_ptr) + store_rec_length(rec_len_ptr, (ulong) (cp-rec_len_ptr-size_of_rec_len)); + last_rec_pos= curr_rec_pos; + end_pos= pos= cp; + *is_full= last_record; + return (uint) (cp-init_pos); +} + + +/* + Reset the join buffer for reading/writing: default implementation + + SYNOPSIS + reset() + for_writing if it's TRUE the function reset the buffer for writing + + DESCRIPTION + This default implementation of the virtual function reset() resets + the join buffer for reading or writing. + If the buffer is reset for reading only the 'pos' value is reset + to point to the very beginning of the join buffer. If the buffer is + reset for writing additionally: + - the counter of the records in the buffer is set to 0, + - the the value of 'last_rec_pos' gets pointing at the position just + before the buffer, + - 'end_pos' is set to point to the beginning of the join buffer, + - the size of the auxiliary buffer is reset to 0, + - the flag 'last_rec_blob_data_is_in_rec_buff' is set to 0. + + RETURN + none +*/ + +void JOIN_CACHE::reset(bool for_writing) +{ + pos= buff; + curr_rec_link= 0; + if (for_writing) + { + records= 0; + last_rec_pos= buff; + aux_buff_size= 0; + end_pos= pos; + last_rec_blob_data_is_in_rec_buff= 0; + } +} + +/* + Add a record into the join buffer: the default implementation + + SYNOPSIS + put_record() + + DESCRIPTION + This default implementation of the virtual function put_record writes + the next matching record into the join buffer. + It also links the record having been written into the join buffer with + the matched record in the previous cache if there is any. + The implementation assumes that the function get_curr_link() + will return exactly the pointer to this matched record. + + RETURN + TRUE if it has been decided that it should be the last record + in the join buffer, + FALSE otherwise +*/ + +bool JOIN_CACHE::put_record() +{ + bool is_full; + uchar *link= 0; + if (prev_cache) + link= prev_cache->get_curr_rec_link(); + write_record_data(link, &is_full); + return is_full; +} + + +/* + Read the next record from the join buffer: the default implementation + + SYNOPSIS + get_record() + + DESCRIPTION + This default implementation of the virtual function get_record + reads fields of the next record from the join buffer of this cache. + The function also reads all other fields associated with this record + from the the join buffers of the previous caches. The fields are read + into the corresponding record buffers. + It is supposed that 'pos' points to the position in the buffer + right after the previous record when the function is called. + When the function returns the 'pos' values is updated to point + to the position after the read record. 
+ The value of 'curr_rec_pos' is also updated by the function to + point to the beginning of the first field of the record in the + join buffer. + + RETURN + TRUE - there are no more records to read from the join buffer + FALSE - otherwise +*/ + +bool JOIN_CACHE::get_record() +{ + bool res; + uchar *prev_rec_ptr= 0; + if (with_length) + pos+= size_of_rec_len; + if (prev_cache) + { + pos+= prev_cache->get_size_of_rec_offset(); + prev_rec_ptr= prev_cache->get_rec_ref(pos); + } + curr_rec_pos= pos; + if (!(res= read_all_record_fields() == NO_MORE_RECORDS_IN_BUFFER)) + { + pos+= referenced_fields*size_of_fld_ofs; + if (prev_cache) + prev_cache->get_record_by_pos(prev_rec_ptr); + } + return res; +} + + +/* + Read a positioned record from the join buffer: the default implementation + + SYNOPSIS + get_record_by_pos() + rec_ptr position of the first field of the record in the join buffer + + DESCRIPTION + This default implementation of the virtual function get_record_pos + reads the fields of the record positioned at 'rec_ptr' from the join buffer. + The function also reads all other fields associated with this record + from the the join buffers of the previous caches. The fields are read + into the corresponding record buffers. + + RETURN + none +*/ + +void JOIN_CACHE::get_record_by_pos(uchar *rec_ptr) +{ + uchar *save_pos= pos; + pos= rec_ptr; + read_all_record_fields(); + pos= save_pos; + if (prev_cache) + { + uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr); + prev_cache->get_record_by_pos(prev_rec_ptr); + } +} + + +/* + Test the match flag from the referenced record: the default implementation + + SYNOPSIS + get_match_flag_by_pos() + rec_ptr position of the first field of the record in the join buffer + + DESCRIPTION + This default implementation of the virtual function get_match_flag_by_pos + test the match flag for the record pointed by the reference at the position + rec_ptr. If the match flag in placed one of the previous buffers the function + first reaches the linked record fields in this buffer. + + RETURN + TRUE if the match flag is set on + FALSE otherwise +*/ + +bool JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr) +{ + if (with_match_flag) + return test(*rec_ptr); + if (prev_cache) + { + uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr); + return prev_cache->get_match_flag_by_pos(prev_rec_ptr); + } + DBUG_ASSERT(0); + return FALSE; +} + + +/* + Read all flag and data fields of a record from the join buffer + + SYNOPSIS + read_all_record_fields() + + DESCRIPTION + The function reads all flag and data fields of a record from the join + buffer into the corresponding record buffers. + The fields are read starting from the position 'pos' which is + supposed to point to the beginning og the first record field. + The function increments the value of 'pos' by the length of the + read data. 
+ + RETURN + (-1) - if there is no more records in the join buffer + length of the data read from the join buffer - otherwise +*/ + +uint JOIN_CACHE::read_all_record_fields() +{ + uchar *init_pos= pos; + + if (pos > last_rec_pos || !records) + return NO_MORE_RECORDS_IN_BUFFER; + + /* First match flag, read null bitmaps and null_row flag for each table */ + read_flag_fields(); + + /* Now read the remaining table fields if needed */ + CACHE_FIELD *copy= field_descr+flag_fields; + CACHE_FIELD *copy_end= field_descr+fields; + bool blob_in_rec_buff= blob_data_is_in_rec_buff(init_pos); + for ( ; copy < copy_end; copy++) + read_record_field(copy, blob_in_rec_buff); + + return (uint) (pos-init_pos); +} + + +/* + Read all flag fields of a record from the join buffer + + SYNOPSIS + read_flag_fields() + + DESCRIPTION + The function reads all flag fields of a record from the join + buffer into the corresponding record buffers. + The fields are read starting from the position 'pos'. + The function increments the value of 'pos' by the length of the + read data. + + RETURN + length of the data read from the join buffer +*/ + +uint JOIN_CACHE::read_flag_fields() +{ + uchar *init_pos= pos; + CACHE_FIELD *copy= field_descr; + CACHE_FIELD *copy_end= copy+flag_fields; + for ( ; copy < copy_end; copy++) + { + memcpy(copy->str, pos, copy->length); + pos+= copy->length; + } + return (pos-init_pos); +} + + +/* + Read a data record field from the join buffer + + SYNOPSIS + read_record_field() + copy the descriptor of the data field to be read + blob_in_rec_buff indicates whether this is the field from the record + whose blob data are in record buffers + + DESCRIPTION + The function reads the data field specified by the parameter copy + from the join buffer into the corresponding record buffer. + The field is read starting from the position 'pos'. + The data of blob values is not copied from the join buffer. + The function increments the value of 'pos' by the length of the + read data. 
+ + RETURN + length of the data read from the join buffer +*/ + +uint JOIN_CACHE::read_record_field(CACHE_FIELD *copy, bool blob_in_rec_buff) +{ + uint len; + /* Do not copy the field if its value is null */ + if (copy->field && copy->field->maybe_null() && copy->field->is_null()) + return 0; + if (copy->type == CACHE_BLOB) + { + Field_blob *blob_field= (Field_blob *) copy->field; + /* + Copy the length and the pointer to data but not the blob data + itself to the record buffer + */ + if (blob_in_rec_buff) + { + blob_field->set_image(pos, copy->length+sizeof(char*), + blob_field->charset()); + len= copy->length+sizeof(char*); + } + else + { + blob_field->set_ptr(pos, pos+copy->length); + len= copy->length+blob_field->get_length(); + } + } + else + { + switch (copy->type) { + case CACHE_VARSTR1: + /* Copy the significant part of the short varstring field */ + len= (uint) pos[0] + 1; + memcpy(copy->str, pos, len); + break; + case CACHE_VARSTR2: + /* Copy the significant part of the long varstring field */ + len= uint2korr(pos) + 2; + memcpy(copy->str, pos, len); + break; + case CACHE_STRIPPED: + /* Pad the value by spaces that has been stripped off */ + len= uint2korr(pos); + memcpy(copy->str, pos+2, len); + memset(copy->str+len, ' ', copy->length-len); + len+= 2; + break; + default: + /* Copy the entire image of the field from the record buffer */ + len= copy->length; + memcpy(copy->str, pos, len); + } + } + pos+= len; + return len; +} + + +/* + Read a referenced field from the join buffer + + SYNOPSIS + read_referenced_field() + copy pointer to the descriptor of the referenced field + rec_ptr pointer to the record that may contain this field + len IN/OUT total length of the record fields + + DESCRIPTION + The function checks whether copy points to a data field descriptor + for this cache object. If it does not then the function returns + FALSE. Otherwise the function reads the field of the record in + the join buffer pointed by 'rec_ptr' into the corresponding record + buffer and returns TRUE. + If the value of *len is 0 then the function sets it to the total + length of the record fields including possible trailing offset + values. Otherwise *len is supposed to provide this value that + has been obtained earlier. + + RETURN + TRUE 'copy' points to a data descriptor of this join cache + FALSE otherwise +*/ + +bool JOIN_CACHE::read_referenced_field(CACHE_FIELD *copy, + uchar *rec_ptr, + uint *len) +{ + uchar *ptr; + uint offset; + if (copy < field_descr || copy >= field_descr+fields) + return FALSE; + if (!*len) + { + /* Get the total length of the record fields */ + uchar *len_ptr= rec_ptr; + if (prev_cache) + len_ptr-= prev_cache->get_size_of_rec_offset(); + *len= get_rec_length(len_ptr-size_of_rec_len); + } + + ptr= rec_ptr-(prev_cache ? prev_cache->get_size_of_rec_offset() : 0); + offset= get_fld_offset(ptr+ *len - + size_of_fld_ofs* + (referenced_fields+1-copy->referenced_field_no)); + bool is_null= FALSE; + if (offset == 0 && flag_fields) + is_null= TRUE; + if (is_null) + copy->field->set_null(); + else + { + uchar *save_pos= pos; + copy->field->set_notnull(); + pos= rec_ptr+offset; + read_record_field(copy, blob_data_is_in_rec_buff(rec_ptr)); + pos= save_pos; + } + return TRUE; +} + + +/* + Skip record from join buffer if its match flag is on: default implementation + + SYNOPSIS + skip_record_if_match() + + DESCRIPTION + This default implementation of the virtual function skip_record_if_match + skips the next record from the join buffer if its match flag is set on. 
+ If the record is skipped the value of 'pos' is set to points to the position + right after the record. + + RETURN + TRUE - the match flag is on and the record has been skipped + FALSE - the match flag is off +*/ + +bool JOIN_CACHE::skip_record_if_match() +{ + DBUG_ASSERT(with_length); + uint offset= size_of_rec_len; + if (prev_cache) + offset+= prev_cache->get_size_of_rec_offset(); + /* Check whether the match flag is on */ + if (get_match_flag_by_pos(pos+offset)) + { + pos+= size_of_rec_len + get_rec_length(pos); + return TRUE; + } + return FALSE; +} + + +/* + Restore the fields of the last record from the join buffer + + SYNOPSIS + restore_last_record() + + DESCRIPTION + This function restore the values of the fields of the last record put + into join buffer in record buffers. The values most probably have been + overwritten by the field values from other records when they were read + from the join buffer into the record buffer in order to check pushdown + predicates. + + RETURN + none +*/ + +void JOIN_CACHE::restore_last_record() +{ + if (records) + get_record_by_pos(last_rec_pos); +} + + +/* + Join records from the join buffer with records from the next join table + + SYNOPSIS + join_records() + skip_last do not find matches for the last record from the buffer + + DESCRIPTION + The functions extends all records from the join buffer by the matched + records from join_tab. In the case of outer join operation it also + adds null complementing extensions for the records from the join buffer + that have no match. + No extensions are generated for the last record from the buffer if + skip_last is true. + + NOTES + The function must make sure that if linked join buffers are used then + a join buffer cannot be refilled again until all extensions in the + buffers chained to this one are generated. + Currently an outer join operation with several inner tables always uses + at least two linked buffers with the match join flags placed in the + first buffer. Any record composed of rows of the inner tables that + matches a record in this buffer must refer to the position of the + corresponding match flag. + + IMPLEMENTATION + When generating extensions for outer tables of an outer join operation + first we generate all extensions for those records from the join buffer + that have matches, after which null complementing extension for all + unmatched records from the join buffer are generated. + + RETURN + return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS. +*/ + +enum_nested_loop_state JOIN_CACHE::join_records(bool skip_last) +{ + JOIN_TAB *tab; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join(); + + if (outer_join_first_inner && !join_tab->first_unmatched) + join_tab->not_null_compl= TRUE; + + if (!join_tab->first_unmatched) + { + /* Find all records from join_tab that match records from join buffer */ + rc= join_matching_records(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + if (outer_join_first_inner) + { + if (next_cache) + { + /* + Ensure that all matches for outer records from join buffer are to be + found. Now we ensure that all full records are found for records from + join buffer. Generally this is an overkill. + TODO: Ensure that only matches of the inner table records have to be + found for the records from join buffer. 
+ */ + rc= next_cache->join_records(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + join_tab->not_null_compl= FALSE; + /* Prepare for generation of null complementing extensions */ + for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++) + tab->first_unmatched= join_tab->first_inner; + } + } + if (join_tab->first_unmatched) + { + if (is_key_access()) + restore_last_record(); + + /* + Generate all null complementing extensions for the records from + join buffer that don't have any matching rows from the inner tables. + */ + reset(FALSE); + rc= join_null_complements(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + if(next_cache) + { + /* + When using linked caches we must ensure the records in the next caches + that refer to the records in the join buffer are fully extended. + Otherwise we could have references to the records that have been + already erased from the join buffer and replaced for new records. + */ + rc= next_cache->join_records(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + if (outer_join_first_inner) + { + /* + All null complemented rows have been already generated for all + outer records from join buffer. Restore the state of the + first_unmatched values to 0 to avoid another null complementing. + */ + for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++) + tab->first_unmatched= 0; + } + + if (skip_last) + { + DBUG_ASSERT(!is_key_access()); + /* + Restore the last record from the join buffer to generate + all extentions for it. + */ + get_record(); + } + +finish: + restore_last_record(); + reset(TRUE); + return rc; +} + + +/* + Using BNL find matches from the next table for records from the join buffer + + SYNOPSIS + join_matching_records() + skip_last do not look for matches for the last partial join record + + DESCRIPTION + The function retrieves all rows of the join_tab table and check whether + they match partial join records from the join buffer. If a match is found + the function will call the sub_select function trying to look for matches + for the remaining join operations. + This function currently is called only from the function join_records. + If the value of skip_last is true the function writes the partial join + record from the record buffer into the join buffer to save its value for + the future processing in the caller function. + + NOTES + The function produces all matching extensions for the records in the + join buffer following the path of the Blocked Nested Loops algorithm. + When an outer join operation is performed all unmatched records from + the join buffer must be extended by null values. The function + 'join_null_complements' serves this purpose. + + RETURN + return one of enum_nested_loop_state. +*/ + +enum_nested_loop_state JOIN_CACHE_BNL::join_matching_records(bool skip_last) +{ + uint cnt; + int error; + JOIN_TAB *tab; + READ_RECORD *info; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool check_only_first_match= join_tab->check_only_first_match(); + SQL_SELECT *select= join_tab->cache_select; + + join_tab->table->null_row= 0; + + /* Return at once if there are no records in the join buffer */ + if (!records) + return NESTED_LOOP_OK; + + /* + When joining we read records from the join buffer back into record buffers. 
+ If matches for the last partial join record are found through a call to + the sub_select function then this partial join record must be saved in the + join buffer in order to be restored just before the sub_select call. + */ + if (skip_last) + put_record(); + + if (join_tab->use_quick == 2 && join_tab->select->quick) + { + /* A dynamic range access was used last. Clean up after it */ + delete join_tab->select->quick; + join_tab->select->quick= 0; + } + + for (tab= join->join_tab; tab != join_tab ; tab++) + { + tab->status= tab->table->status; + tab->table->status= 0; + } + + /* Start retrieving all records of the joined table */ + if ((error= join_init_read_record(join_tab))) + { + rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR; + goto finish; + } + + info= &join_tab->read_record; + do + { + if (join_tab->keep_current_rowid) + join_tab->table->file->position(join_tab->table->record[0]); + + if (join->thd->killed) + { + /* The user has aborted the execution of the query */ + join->thd->send_kill_message(); + rc= NESTED_LOOP_KILLED; + goto finish; + } + int err= 0; + + if (rc == NESTED_LOOP_OK) + update_virtual_fields(join->thd, join_tab->table); + + /* + Do not look for matches if the last read record of the joined table + does not meet the conditions that have been pushed to this table + */ + if (rc == NESTED_LOOP_OK && + (!select || (err= select->skip_record(join->thd)) != 0)) + { + if (err < 0) + return NESTED_LOOP_ERROR; + rc= NESTED_LOOP_OK; + + /* Prepare to read records from the join buffer */ + reset(FALSE); + + /* Read each record from the join buffer and look for matches */ + for (cnt= records - test(skip_last) ; cnt; cnt--) + { + /* + If only the first match is needed and it has been already found for + the next record read from the join buffer then the record is skipped. + */ + if (!check_only_first_match || !skip_record_if_match()) + { + get_record(); + rc= generate_full_extensions(get_curr_rec()); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + } + } + } while (!(error= info->read_record(info))); + + if (error > 0) // Fatal error + rc= NESTED_LOOP_ERROR; +finish: + for (tab= join->join_tab; tab != join_tab ; tab++) + tab->table->status= tab->status; + return rc; +} + + +/* + Set match flag for a record in join buffer if it has not been set yet + + SYNOPSIS + set_match_flag_if_none() + first_inner the join table to which this flag is attached to + rec_ptr pointer to the record in the join buffer + + DESCRIPTION + If the records of the table are accumulated in a join buffer the function + sets the match flag for the record in the buffer that is referred to by + the record from this cache positioned at 'rec_ptr'. + The function also sets the match flag 'found' of the table first inner + if it has not been set before. + + NOTES + The function assumes that the match flag for any record in any cache + is placed in the first byte occupied by the record fields. + + RETURN + TRUE the match flag is set by this call for the first time + FALSE the match flag has been set before this call +*/ + +bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner, + uchar *rec_ptr) +{ + if (!first_inner->cache) + { + /* + Records of the first inner table to which the flag is attached to + are not accumulated in a join buffer. 
+ */ + if (first_inner->found) + return FALSE; + else + { + first_inner->found= 1; + return TRUE; + } + } + JOIN_CACHE *cache= this; + while (cache->join_tab != first_inner) + { + cache= cache->prev_cache; + DBUG_ASSERT(cache); + rec_ptr= cache->get_rec_ref(rec_ptr); + } + if (rec_ptr[0] == 0) + { + rec_ptr[0]= 1; + first_inner->found= 1; + return TRUE; + } + return FALSE; +} + + +/* + Generate all full extensions for a partial join record in the buffer + + SYNOPSIS + generate_full_extensions() + rec_ptr pointer to the record from join buffer to generate extensions + + DESCRIPTION + The function first checks whether the current record of 'join_tab' matches + the partial join record from join buffer located at 'rec_ptr'. If it is the + case the function calls the join_tab->next_select method to generate + all full extension for this partial join match. + + RETURN + return one of enum_nested_loop_state. +*/ + +enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr) +{ + enum_nested_loop_state rc= NESTED_LOOP_OK; + + /* + Check whether the extended partial join record meets + the pushdown conditions. + */ + if (check_match(rec_ptr)) + { + int res= 0; + + if (!join_tab->check_weed_out_table || + !(res= do_sj_dups_weedout(join->thd, join_tab->check_weed_out_table))) + { + set_curr_rec_link(rec_ptr); + rc= (join_tab->next_select)(join, join_tab+1, 0); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + { + reset(TRUE); + return rc; + } + } + if (res == -1) + { + rc= NESTED_LOOP_ERROR; + return rc; + } + } + return rc; +} + + +/* + Check matching to a partial join record from the join buffer + + SYNOPSIS + check_match() + rec_ptr pointer to the record from join buffer to check matching to + + DESCRIPTION + The function checks whether the current record of 'join_tab' matches + the partial join record from join buffer located at 'rec_ptr'. If this is + the case and 'join_tab' is the last inner table of a semi-join or an outer + join the function turns on the match flag for the 'rec_ptr' record unless + it has been already set. + + NOTES + Setting the match flag on can trigger re-evaluation of pushdown conditions + for the record when join_tab is the last inner table of an outer join. + + RETURN + TRUE there is a match + FALSE there is no match +*/ + +inline bool JOIN_CACHE::check_match(uchar *rec_ptr) +{ + /* Check whether pushdown conditions are satisfied */ + if (join_tab->select && join_tab->select->skip_record(join->thd) < 1) + return FALSE; + + if (!join_tab->is_last_inner_table()) + return TRUE; + + /* + This is the last inner table of an outer join, + and maybe of other embedding outer joins, or + this is the last inner table of a semi-join. + */ + JOIN_TAB *first_inner= join_tab->get_first_inner_table(); + do + { + set_match_flag_if_none(first_inner, rec_ptr); + if (first_inner->check_only_first_match() && + !join_tab->first_inner) + return TRUE; + /* + This is the first match for the outer table row. + The function set_match_flag_if_none has turned the flag + first_inner->found on. The pushdown predicates for + inner tables must be re-evaluated with this flag on. + Note that, if first_inner is the first inner table + of a semi-join, but is not an inner table of an outer join + such that 'not exists' optimization can be applied to it, + the re-evaluation of the pushdown predicates is not needed. 
+ */ + for (JOIN_TAB *tab= first_inner; tab <= join_tab; tab++) + { + if (tab->select && tab->select->skip_record(join->thd) < 1) + return FALSE; + } + } + while ((first_inner= first_inner->first_upper) && + first_inner->last_inner == join_tab); + + return TRUE; +} + + +/* + Add null complements for unmatched outer records from join buffer + + SYNOPSIS + join_null_complements() + skip_last do not add null complements for the last record + + DESCRIPTION + This function is called only for inner tables of outer joins. + The function retrieves all rows from the join buffer and adds null + complements for those of them that do not have matches for outer + table records. + If the 'join_tab' is the last inner table of the embedding outer + join and the null complemented record satisfies the outer join + condition then the the corresponding match flag is turned on + unless it has been set earlier. This setting may trigger + re-evaluation of pushdown conditions for the record. + + NOTES + The same implementation of the virtual method join_null_complements + is used for JOIN_CACHE_BNL and JOIN_CACHE_BKA. + + RETURN + return one of enum_nested_loop_state. +*/ + +enum_nested_loop_state JOIN_CACHE::join_null_complements(bool skip_last) +{ + uint cnt; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool is_first_inner= join_tab == join_tab->first_unmatched; + bool is_last_inner= join_tab == join_tab->first_unmatched->last_inner; + + /* Return at once if there are no records in the join buffer */ + if (!records) + return NESTED_LOOP_OK; + + cnt= records - (is_key_access() ? 0 : test(skip_last)); + + /* This function may be called only for inner tables of outer joins */ + DBUG_ASSERT(join_tab->first_inner); + + for ( ; cnt; cnt--) + { + if (join->thd->killed) + { + /* The user has aborted the execution of the query */ + join->thd->send_kill_message(); + rc= NESTED_LOOP_KILLED; + goto finish; + } + /* Just skip the whole record if a match for it has been already found */ + if (!is_first_inner || !skip_record_if_match()) + { + get_record(); + /* The outer row is complemented by nulls for each inner table */ + restore_record(join_tab->table, s->default_values); + mark_as_null_row(join_tab->table); + /* Check all pushdown conditions attached to the inner table */ + join_tab->first_unmatched->found= 1; + if (join_tab->select && join_tab->select->skip_record(join->thd) < 1) + continue; + if (is_last_inner) + { + JOIN_TAB *first_upper= join_tab->first_unmatched->first_upper; + while (first_upper && first_upper->last_inner == join_tab) + { + set_match_flag_if_none(first_upper, get_curr_rec()); + for (JOIN_TAB* tab= first_upper; tab <= join_tab; tab++) + { + if (tab->select && tab->select->skip_record(join->thd) < 1) + goto next; + } + first_upper= first_upper->first_upper; + } + } + /* Find all matches for the remaining join tables */ + rc= (*join_tab->next_select)(join, join_tab+1, 0); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + { + reset(TRUE); + goto finish; + } + } + next: + ; + } + +finish: + return rc; +} + + +/* + Initialize retrieval of range sequence for BKA algorithm + + SYNOPSIS + bka_range_seq_init() + init_params pointer to the BKA join cache object + n_ranges the number of ranges obtained + flags combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY + + DESCRIPTION + The function interprets init_param as a pointer to a JOIN_CACHE_BKA + object. The function prepares for an iteration over the join keys + built for all records from the cache join buffer. 
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ init_param value that is to be used as a parameter of bka_range_seq_next()
+*/
+
+static
+range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+ DBUG_ENTER("bka_range_seq_init");
+ JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) init_param;
+ cache->reset(0);
+ DBUG_RETURN((range_seq_t) init_param);
+}
+
+
+/*
+ Get the key over the next record from the join buffer used by BKA
+
+ SYNOPSIS
+ bka_range_seq_next()
+ seq the value returned by bka_range_seq_init
+ range OUT reference to the next range
+
+ DESCRIPTION
+ The function interprets seq as a pointer to a JOIN_CACHE_BKA
+ object. The function returns a pointer to the range descriptor
+ for the key built over the next record from the join buffer.
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ 0 ok, the range structure filled with info about the next key
+ 1 no more ranges
+*/
+
+static
+uint bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+ DBUG_ENTER("bka_range_seq_next");
+ JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+ TABLE_REF *ref= &cache->join_tab->ref;
+ key_range *start_key= &range->start_key;
+ if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
+ {
+ start_key->keypart_map= (1 << ref->key_parts) - 1;
+ start_key->flag= HA_READ_KEY_EXACT;
+ range->end_key= *start_key;
+ range->end_key.flag= HA_READ_AFTER_KEY;
+ range->ptr= (char *) cache->get_curr_rec();
+ range->range_flag= EQ_RANGE;
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Check whether range_info orders to skip the next record from BKA buffer
+
+ SYNOPSIS
+ bka_range_seq_skip_record()
+ seq value returned by bka_range_seq_init()
+ range_info information about the next range
+ rowid [NOT USED] rowid of the record to be checked
+
+
+ DESCRIPTION
+ The function interprets seq as a pointer to a JOIN_CACHE_BKA object.
+ The function returns TRUE if the record with this range_info
+ is to be filtered out from the stream of records returned by
+ multi_range_read_next().
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ 1 record with this range_info is to be filtered out from the stream
+ of records returned by multi_range_read_next()
+ 0 the record is to be left in the stream
+*/
+
+static
+bool bka_range_seq_skip_record(range_seq_t rseq, char *range_info, uchar *rowid)
+{
+ DBUG_ENTER("bka_range_seq_skip_record");
+ JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+ bool res= cache->get_match_flag_by_pos((uchar *) range_info);
+ DBUG_RETURN(res);
+}
+
+/*
+ Using BKA find matches from the next table for records from the join buffer
+
+ SYNOPSIS
+ join_matching_records()
+ skip_last do not look for matches for the last partial join record
+
+ DESCRIPTION
+ This function can be used only when the table join_tab can be accessed
+ by keys built over the fields of previous join tables.
+ The function retrieves all partial join records from the join buffer and
+ for each of them builds the key value to access join_tab, performs an index
+ look-up with this key and selects matching records yielded by this look-up.
+ If a match is found the function will call the sub_select function trying
+ to look for matches for the remaining join operations.
+ This function currently is called only from the function join_records.
+ It's assumed that this function is always called with the skip_last
+ parameter equal to false.
+
+ NOTES
+ The function produces all matching extensions for the records in the
+ join buffer following the path of the Batched Key Access algorithm.
+ When an outer join operation is performed all unmatched records from
+ the join buffer must be extended by null values. The function
+ join_null_complements serves this purpose.
+ The Batched Key Access algorithm assumes that key accesses are batched.
+ In other words it assumes that, first, either keys themselves or the
+ corresponding rowids (primary keys) are accumulated in a buffer, then
+ data rows from join_tab are fetched for all of them. When a row is
+ fetched it is always returned with a reference to the key by which it
+ has been accessed.
+ When key values are batched we can save on the number of server
+ requests for index lookups. For remote engines, like NDB cluster, it
+ essentially reduces the number of round trips between the server and
+ the engine when performing a join operation.
+ When the rowids for the keys are batched we can optimize the order
+ in which we fetch the data for these rowids. The performance benefits of
+ this optimization can be significant for such engines as MyISAM and InnoDB.
+ What exactly is batched is hidden behind the implementations of the
+ MRR handler interface that is supposed to be appropriately chosen
+ for each engine. If for an engine no specific implementation of the MRR
+ interface is supplied then the default implementation is used. This
+ implementation actually follows the path of the Nested Loops Join algorithm.
+ In this case the BKA join will surely demonstrate worse performance than
+ the NL join.
+
+ RETURN
+ return one of enum_nested_loop_state
+*/
+
+enum_nested_loop_state JOIN_CACHE_BKA::join_matching_records(bool skip_last)
+{
+ int error;
+ handler *file= join_tab->table->file;
+ enum_nested_loop_state rc= NESTED_LOOP_OK;
+ uchar *rec_ptr= 0;
+ bool check_only_first_match= join_tab->check_only_first_match();
+
+ /* Set functions to iterate over keys in the join buffer */
+
+ RANGE_SEQ_IF seq_funcs= { bka_range_seq_init,
+ bka_range_seq_next,
+ check_only_first_match ?
+ bka_range_seq_skip_record : 0,
+ join_tab->cache_idx_cond ?
+ bka_skip_index_tuple : 0 };
+
+ /* The value of skip_last must be always FALSE when this function is called */
+ DBUG_ASSERT(!skip_last);
+
+ /* Return at once if there are no records in the join buffer */
+ if (!records)
+ return NESTED_LOOP_OK;
+
+ rc= init_join_matching_records(&seq_funcs, records);
+ if (rc != NESTED_LOOP_OK)
+ goto finish;
+
+ while (!(error= file->multi_range_read_next((char **) &rec_ptr)))
+ {
+ if (join->thd->killed)
+ {
+ /* The user has aborted the execution of the query */
+ join->thd->send_kill_message();
+ rc= NESTED_LOOP_KILLED;
+ goto finish;
+ }
+ if (join_tab->keep_current_rowid)
+ join_tab->table->file->position(join_tab->table->record[0]);
+ /*
+ If only the first match is needed and it has been already found
+ for the associated partial join record then the returned candidate
+ is discarded.
+ */ + if (rc == NESTED_LOOP_OK && + (!check_only_first_match || !get_match_flag_by_pos(rec_ptr))) + { + get_record_by_pos(rec_ptr); + update_virtual_fields(join->thd, join_tab->table); + rc= generate_full_extensions(rec_ptr); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + } + + if (error > 0 && error != HA_ERR_END_OF_FILE) + return NESTED_LOOP_ERROR; +finish: + return end_join_matching_records(rc); +} + + + +/* + Prepare to search for records that match records from the join buffer + + SYNOPSIS + init_join_matching_records() + seq_funcs structure of range sequence interface + ranges number of keys/ranges in the sequence + + DESCRIPTION + This function calls the multi_range_read_init function to set up + the BKA process of generating the keys from the records in the join + buffer and looking for matching records from the table to be joined. + The function passes as a parameter a structure of functions that + implement the range sequence interface. This interface is used to + enumerate all generated keys and optionally to filter the matching + records returned by the multi_range_read_next calls from the + intended invocation of the join_matching_records method. The + multi_range_read_init function also receives the parameters for + MRR buffer to be used and flags specifying the mode in which + this buffer will be functioning. + The number of keys in the sequence expected by multi_range_read_init + is passed through the parameter ranges. + + RETURN + return one of enum_nested_loop_state +*/ + +enum_nested_loop_state +JOIN_CACHE_BKA::init_join_matching_records(RANGE_SEQ_IF *seq_funcs, uint ranges) +{ + int error; + handler *file= join_tab->table->file; + enum_nested_loop_state rc= NESTED_LOOP_OK; + + join_tab->table->null_row= 0; + + + /* Dynamic range access is never used with BKA */ + DBUG_ASSERT(join_tab->use_quick != 2); + + for (JOIN_TAB *tab =join->join_tab; tab != join_tab ; tab++) + { + tab->status= tab->table->status; + tab->table->status= 0; + } + + init_mrr_buff(); + + /* + Prepare to iterate over keys from the join buffer and to get + matching candidates obtained with MMR handler functions. + */ + if (!file->inited) + file->ha_index_init(join_tab->ref.key, 1); + if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges, + mrr_mode, &mrr_buff))) + rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR; + + return rc; +} + + +/* + Finish searching for records that match records from the join buffer + + SYNOPSIS + end_join_matching_records() + rc return code passed by the join_matching_records function + + DESCRIPTION + This function perform final actions on searching for all matches for + the records from the join buffer and building all full join extensions + of the records with these matches. + + RETURN + return code rc passed to the function as a parameter +*/ + +enum_nested_loop_state +JOIN_CACHE_BKA::end_join_matching_records(enum_nested_loop_state rc) +{ + for (JOIN_TAB *tab=join->join_tab; tab != join_tab ; tab++) + tab->table->status= tab->status; + return rc; +} + + +/* + Get the key built over the next record from BKA join buffer + + SYNOPSIS + get_next_key() + key pointer to the buffer where the key value is to be placed + + DESCRIPTION + The function reads key fields from the current record in the join buffer. + and builds the key value out of these fields that will be used to access + the 'join_tab' table. Some of key fields may belong to previous caches. 
+ They are accessed via record references to the record parts stored in the + previous join buffers. The other key fields always are placed right after + the flag fields of the record. + If the key is embedded, which means that its value can be read directly + from the join buffer, then *key is set to the beginning of the key in + this buffer. Otherwise the key is built in the join_tab->ref->key_buff. + The function returns the length of the key if it succeeds ro read it. + If is assumed that the functions starts reading at the position of + the record length which is provided for each records in a BKA cache. + After the key is built the 'pos' value points to the first position after + the current record. + The function returns 0 if the initial position is after the beginning + of the record fields for last record from the join buffer. + + RETURN + length of the key value - if the starting value of 'pos' points to + the position before the fields for the last record, + 0 - otherwise. +*/ + +uint JOIN_CACHE_BKA::get_next_key(uchar ** key) +{ + uint len; + uint32 rec_len; + uchar *init_pos; + JOIN_CACHE *cache; + + if (pos > last_rec_pos || !records) + return 0; + + /* Any record in a BKA cache is prepended with its length */ + DBUG_ASSERT(with_length); + + /* Read the length of the record */ + rec_len= get_rec_length(pos); + pos+= size_of_rec_len; + init_pos= pos; + + /* Read a reference to the previous cache if any */ + if (prev_cache) + pos+= prev_cache->get_size_of_rec_offset(); + + curr_rec_pos= pos; + + /* Read all flag fields of the record */ + read_flag_fields(); + + if (use_emb_key) + { + /* An embedded key is taken directly from the join buffer */ + *key= pos; + len= emb_key_length; + } + else + { + /* Read key arguments from previous caches if there are any such fields */ + if (external_key_arg_fields) + { + uchar *rec_ptr= curr_rec_pos; + uint key_arg_count= external_key_arg_fields; + CACHE_FIELD **copy_ptr= blob_ptr-key_arg_count; + for (cache= prev_cache; key_arg_count; cache= cache->prev_cache) + { + uint len= 0; + DBUG_ASSERT(cache); + rec_ptr= cache->get_rec_ref(rec_ptr); + while (!cache->referenced_fields) + { + cache= cache->prev_cache; + DBUG_ASSERT(cache); + rec_ptr= cache->get_rec_ref(rec_ptr); + } + while (key_arg_count && + cache->read_referenced_field(*copy_ptr, rec_ptr, &len)) + { + copy_ptr++; + --key_arg_count; + } + } + } + + /* + Read the other key arguments from the current record. The fields for + these arguments are always first in the sequence of the record's fields. + */ + CACHE_FIELD *copy= field_descr+flag_fields; + CACHE_FIELD *copy_end= copy+local_key_arg_fields; + bool blob_in_rec_buff= blob_data_is_in_rec_buff(curr_rec_pos); + for ( ; copy < copy_end; copy++) + read_record_field(copy, blob_in_rec_buff); + + /* Build the key over the fields read into the record buffers */ + TABLE_REF *ref= &join_tab->ref; + cp_buffer_from_ref(join->thd, join_tab->table, ref); + *key= ref->key_buff; + len= ref->key_length; + } + + pos= init_pos+rec_len; + + return len; +} + + +/* + Initialize a BKA_UNIQUE cache + + SYNOPSIS + init() + + DESCRIPTION + The function initializes the cache structure. It supposed to be called + right after a constructor for the JOIN_CACHE_BKA_UNIQUE. + The function allocates memory for the join buffer and for descriptors of + the record fields stored in the buffer. + The function also estimates the number of hash table entries in the hash + table to be used and initializes this hash table. 
+ + NOTES + The code of this function should have been included into the constructor + code itself. However the new operator for the class JOIN_CACHE_BKA_UNIQUE + would never fail while memory allocation for the join buffer is not + absolutely unlikely to fail. That's why this memory allocation has to be + placed in a separate function that is called in a couple with a cache + constructor. + It is quite natural to put almost all other constructor actions into + this function. + + RETURN + 0 initialization with buffer allocations has been succeeded + 1 otherwise +*/ + +int JOIN_CACHE_BKA_UNIQUE::init() +{ + int rc= 0; + TABLE_REF *ref= &join_tab->ref; + + DBUG_ENTER("JOIN_CACHE_BKA_UNIQUE::init"); + + hash_table= 0; + key_entries= 0; + + if ((rc= JOIN_CACHE_BKA::init())) + DBUG_RETURN (rc); + + key_length= ref->key_length; + + /* Take into account a reference to the next record in the key chain */ + pack_length+= get_size_of_rec_offset(); + + /* Calculate the minimal possible value of size_of_key_ofs greater than 1 */ + uint max_size_of_key_ofs= max(2, get_size_of_rec_offset()); + for (size_of_key_ofs= 2; + size_of_key_ofs <= max_size_of_key_ofs; + size_of_key_ofs+= 2) + { + key_entry_length= get_size_of_rec_offset() + // key chain header + size_of_key_ofs + // reference to the next key + (use_emb_key ? get_size_of_rec_offset() : key_length); + + uint n= buff_size / (pack_length+key_entry_length+size_of_key_ofs); + + /* + TODO: Make a better estimate for this upper bound of + the number of records in in the join buffer. + */ + uint max_n= buff_size / (pack_length-length+ + key_entry_length+size_of_key_ofs); + + hash_entries= (uint) (n / 0.7); + + if (offset_size(max_n*key_entry_length) <= + size_of_key_ofs) + break; + } + + /* Initialize the hash table */ + hash_table= buff + (buff_size-hash_entries*size_of_key_ofs); + cleanup_hash_table(); + curr_key_entry= hash_table; + + pack_length+= key_entry_length; + pack_length_with_blob_ptrs+= get_size_of_rec_offset() + key_entry_length; + + rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+ + (prev_cache ? prev_cache->get_size_of_rec_offset() : 0); + + data_fields_offset= 0; + if (use_emb_key) + { + CACHE_FIELD *copy= field_descr; + CACHE_FIELD *copy_end= copy+flag_fields; + for ( ; copy < copy_end; copy++) + data_fields_offset+= copy->length; + } + + DBUG_RETURN(rc); +} + + +/* + Reset the JOIN_CACHE_BKA_UNIQUE buffer for reading/writing + + SYNOPSIS + reset() + for_writing if it's TRUE the function reset the buffer for writing + + DESCRIPTION + This implementation of the virtual function reset() resets the join buffer + of the JOIN_CACHE_BKA_UNIQUE class for reading or writing. + Additionally to what the default implementation does this function + cleans up the hash table allocated within the buffer. + + RETURN + none +*/ + +void JOIN_CACHE_BKA_UNIQUE::reset(bool for_writing) +{ + this->JOIN_CACHE::reset(for_writing); + if (for_writing && hash_table) + cleanup_hash_table(); + curr_key_entry= hash_table; +} + +/* + Add a record into the JOIN_CACHE_BKA_UNIQUE buffer + + SYNOPSIS + put_record() + + DESCRIPTION + This implementation of the virtual function put_record writes the next + matching record into the join buffer of the JOIN_CACHE_BKA_UNIQUE class. + Additionally to what the default implementation does this function + performs the following. + It extracts from the record the key value used in lookups for matching + records and searches for this key in the hash tables from the join cache. 
+ If it finds the key in the hash table it joins the record to the chain + of records with this key. If the key is not found in the hash table the + key is placed into it and a chain containing only the newly added record + is attached to the key entry. The key value is either placed in the hash + element added for the key or, if the use_emb_key flag is set, remains in + the record from the partial join. + + RETURN + TRUE if it has been decided that it should be the last record + in the join buffer, + FALSE otherwise +*/ + +bool JOIN_CACHE_BKA_UNIQUE::put_record() +{ + bool is_full; + uchar *key; + uint key_len= key_length; + uchar *key_ref_ptr; + uchar *link= 0; + TABLE_REF *ref= &join_tab->ref; + uchar *next_ref_ptr= pos; + + pos+= get_size_of_rec_offset(); + /* Write the record into the join buffer */ + if (prev_cache) + link= prev_cache->get_curr_rec_link(); + write_record_data(link, &is_full); + + if (use_emb_key) + key= get_curr_emb_key(); + else + { + /* Build the key over the fields read into the record buffers */ + cp_buffer_from_ref(join->thd, join_tab->table, ref); + key= ref->key_buff; + } + + /* Look for the key in the hash table */ + if (key_search(key, key_len, &key_ref_ptr)) + { + uchar *last_next_ref_ptr; + /* + The key is found in the hash table. + Add the record to the circular list of the records attached to this key. + Below 'rec' is the record to be added into the record chain for the found + key, 'key_ref' points to a flatten representation of the st_key_entry + structure that contains the key and the head of the record chain. + */ + last_next_ref_ptr= get_next_rec_ref(key_ref_ptr+get_size_of_key_offset()); + /* rec->next_rec= key_entry->last_rec->next_rec */ + memcpy(next_ref_ptr, last_next_ref_ptr, get_size_of_rec_offset()); + /* key_entry->last_rec->next_rec= rec */ + store_next_rec_ref(last_next_ref_ptr, next_ref_ptr); + /* key_entry->last_rec= rec */ + store_next_rec_ref(key_ref_ptr+get_size_of_key_offset(), next_ref_ptr); + } + else + { + /* + The key is not found in the hash table. + Put the key into the join buffer linking it with the keys for the + corresponding hash entry. Create a circular list with one element + referencing the record and attach the list to the key in the buffer. + */ + uchar *cp= last_key_entry; + cp-= get_size_of_rec_offset()+get_size_of_key_offset(); + store_next_key_ref(key_ref_ptr, cp); + store_null_key_ref(cp); + store_next_rec_ref(next_ref_ptr, next_ref_ptr); + store_next_rec_ref(cp+get_size_of_key_offset(), next_ref_ptr); + if (use_emb_key) + { + cp-= get_size_of_rec_offset(); + store_emb_key_ref(cp, key); + } + else + { + cp-= key_len; + memcpy(cp, key, key_len); + } + last_key_entry= cp; + /* Increment the counter of key_entries in the hash table */ + key_entries++; + } + return is_full; +} + + +/* + Read the next record from the JOIN_CACHE_BKA_UNIQUE buffer + + SYNOPSIS + get_record() + + DESCRIPTION + Additionally to what the default implementation of the virtual + function get_record does this implementation skips the link element + used to connect the records with the same key into a chain. 
+
+ RETURN
+ TRUE - there are no more records to read from the join buffer
+ FALSE - otherwise
+*/
+
+bool JOIN_CACHE_BKA_UNIQUE::get_record()
+{
+ pos+= get_size_of_rec_offset();
+ return this->JOIN_CACHE::get_record();
+}
+
+
+/*
+ Skip record from the JOIN_CACHE_BKA_UNIQUE join buffer if its match flag is on
+
+ SYNOPSIS
+ skip_record_if_match()
+
+ DESCRIPTION
+ This implementation of the virtual function skip_record_if_match does
+ the same as the default implementation does, but it takes into account
+ the link element used to connect the records with the same key into a chain.
+
+ RETURN
+ TRUE - the match flag is on and the record has been skipped
+ FALSE - the match flag is off
+*/
+
+bool JOIN_CACHE_BKA_UNIQUE::skip_record_if_match()
+{
+ uchar *save_pos= pos;
+ pos+= get_size_of_rec_offset();
+ if (!this->JOIN_CACHE::skip_record_if_match())
+ {
+ pos= save_pos;
+ return FALSE;
+ }
+ return TRUE;
+}
+
+
+/*
+ Search for a key in the hash table of the join buffer
+
+ SYNOPSIS
+ key_search()
+ key pointer to the key value
+ key_len key value length
+ key_ref_ptr OUT position of the reference to the next key from
+ the hash element for the found key, or
+ a position where the reference to the hash
+ element for the key is to be added in the
+ case when the key has not been found
+
+ DESCRIPTION
+ The function looks for a key in the hash table of the join buffer.
+ If the key is found the function returns the position of the reference
+ to the next key from the hash element for the given key.
+ Otherwise the function returns the position where the reference to the
+ newly created hash element for the given key is to be added.
+
+ RETURN
+ TRUE - the key is found in the hash table
+ FALSE - otherwise
+*/
+
+bool JOIN_CACHE_BKA_UNIQUE::key_search(uchar *key, uint key_len,
+ uchar **key_ref_ptr)
+{
+ bool is_found= FALSE;
+ uint idx= get_hash_idx(key, key_length);
+ uchar *ref_ptr= hash_table+size_of_key_ofs*idx;
+ while (!is_null_key_ref(ref_ptr))
+ {
+ uchar *next_key;
+ ref_ptr= get_next_key_ref(ref_ptr);
+ next_key= use_emb_key ? get_emb_key(ref_ptr-get_size_of_rec_offset()) :
+ ref_ptr-key_length;
+
+ if (memcmp(next_key, key, key_len) == 0)
+ {
+ is_found= TRUE;
+ break;
+ }
+ }
+ *key_ref_ptr= ref_ptr;
+ return is_found;
+}
+
+
+/*
+ Calculate the hash value for a key in the hash table of the join buffer
+
+ SYNOPSIS
+ get_hash_idx()
+ key pointer to the key value
+ key_len key value length
+
+ DESCRIPTION
+ The function calculates an index of the hash entry in the hash table
+ of the join buffer for the given key.
+
+ RETURN
+ the calculated index of the hash entry for the given key.
+*/
+
+uint JOIN_CACHE_BKA_UNIQUE::get_hash_idx(uchar* key, uint key_len)
+{
+ ulong nr= 1;
+ ulong nr2= 4;
+ uchar *pos= key;
+ uchar *end= key+key_len;
+ for (; pos < end ; pos++)
+ {
+ nr^= (ulong) ((((uint) nr & 63)+nr2)*((uint) *pos))+ (nr << 8);
+ nr2+= 3;
+ }
+ return nr % hash_entries;
+}
+
+
+/*
+ Clean up the hash table of the join buffer
+
+ SYNOPSIS
+ cleanup_hash_table()
+
+ DESCRIPTION
+ The function cleans up the hash table in the join buffer removing all
+ hash elements from the table.
+
+ RETURN
+ none
+*/
+
+void JOIN_CACHE_BKA_UNIQUE::cleanup_hash_table()
+{
+ last_key_entry= hash_table;
+ bzero(hash_table, (buff+buff_size)-hash_table);
+ key_entries= 0;
+}
+
+
+/*
+ Initialize retrieval of range sequence for BKA_UNIQUE algorithm
+
+ SYNOPSIS
+ bka_unique_range_seq_init()
+ init_param pointer to the BKA_UNIQUE join cache object
+ n_ranges the number of ranges obtained
+ flags combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+
+ DESCRIPTION
+ The function interprets init_param as a pointer to a JOIN_CACHE_BKA_UNIQUE
+ object. The function prepares for an iteration over the unique join keys
+ built over the records from the cache join buffer.
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ init_param value that is to be used as a parameter of
+ bka_unique_range_seq_next()
+*/
+
+static
+range_seq_t bka_unique_range_seq_init(void *init_param, uint n_ranges,
+ uint flags)
+{
+ DBUG_ENTER("bka_unique_range_seq_init");
+ JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) init_param;
+ cache->reset(0);
+ DBUG_RETURN((range_seq_t) init_param);
+}
+
+
+/*
+ Get the key over the next record from the join buffer used by BKA_UNIQUE
+
+ SYNOPSIS
+ bka_unique_range_seq_next()
+ seq value returned by bka_unique_range_seq_init()
+ range OUT reference to the next range
+
+ DESCRIPTION
+ The function interprets seq as a pointer to the JOIN_CACHE_BKA_UNIQUE
+ object. The function returns a pointer to the range descriptor
+ for the next unique key built over records from the join buffer.
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ 0 ok, the range structure filled with info about the next key
+ 1 no more ranges
+*/
+
+static
+uint bka_unique_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+ DBUG_ENTER("bka_unique_range_seq_next");
+ JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq;
+ TABLE_REF *ref= &cache->join_tab->ref;
+ key_range *start_key= &range->start_key;
+ if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
+ {
+ start_key->keypart_map= (1 << ref->key_parts) - 1;
+ start_key->flag= HA_READ_KEY_EXACT;
+ range->end_key= *start_key;
+ range->end_key.flag= HA_READ_AFTER_KEY;
+ range->ptr= (char *) cache->get_curr_key_chain();
+ range->range_flag= EQ_RANGE;
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Check whether range_info orders to skip the next record from BKA_UNIQUE buffer
+
+ SYNOPSIS
+ bka_unique_range_seq_skip_record()
+ seq value returned by bka_unique_range_seq_init()
+ range_info information about the next range
+ rowid [NOT USED] rowid of the record to be checked
+
+ DESCRIPTION
+ The function interprets seq as a pointer to the JOIN_CACHE_BKA_UNIQUE
+ object. The function returns TRUE if the record with this range_info
+ is to be filtered out from the stream of records returned by
+ multi_range_read_next().
+
+ NOTE
+ This function is used only as a callback function.
+ + RETURN + 1 record with this range_info is to be filtered out from the stream + of records returned by multi_range_read_next() + 0 the record is to be left in the stream +*/ + +static +bool bka_unique_range_seq_skip_record(range_seq_t rseq, char *range_info, + uchar *rowid) +{ + DBUG_ENTER("bka_unique_range_seq_skip_record"); + JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq; + bool res= cache->check_all_match_flags_for_key((uchar *) range_info); + DBUG_RETURN(res); +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + JOIN_CACHE_BKA_UNIQUE::skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + See JOIN_CACHE_BKA::skip_index_tuple(). + This function is the variant for use with + JOIN_CACHE_BKA_UNIQUE. The difference from JOIN_CACHE_BKA case is that + there may be multiple previous table record combinations that share the + same key, i.e. they map to the same MRR range. + As a consequence, we need to loop through all previous table record + combinations that match the given MRR range key range_info until we find + one that satisfies the index condition. + + NOTE + Possible optimization: + Before we unpack the record from a previous table + check if this table is used in the condition. + If so then unpack the record otherwise skip the unpacking. + This should be done by a special virtual method + get_partial_record_by_pos(). + + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise + + +*/ + +bool JOIN_CACHE_BKA_UNIQUE::skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("JOIN_CACHE_BKA_UNIQUE::skip_index_tuple"); + JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq; + uchar *last_rec_ref_ptr= cache->get_next_rec_ref((uchar*) range_info); + uchar *next_rec_ref_ptr= last_rec_ref_ptr; + do + { + next_rec_ref_ptr= cache->get_next_rec_ref(next_rec_ref_ptr); + uchar *rec_ptr= next_rec_ref_ptr + cache->rec_fields_offset; + cache->get_record_by_pos(rec_ptr); + if (join_tab->cache_idx_cond->val_int()) + DBUG_RETURN(FALSE); + } while(next_rec_ref_ptr != last_rec_ref_ptr); + DBUG_RETURN(TRUE); +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + bka_unique_skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + This is wrapper for JOIN_CACHE_BKA_UNIQUE::skip_index_tuple method, + see comments there. + + NOTE + This function is used as a RANGE_SEQ_IF::skip_index_tuple callback. + + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise +*/ + +static +bool bka_unique_skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("bka_unique_skip_index_tuple"); + JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq; + DBUG_RETURN(cache->skip_index_tuple(rseq, range_info)); +} + + +/* + Using BKA_UNIQUE find matches from the next table for records from join buffer + + SYNOPSIS + join_matching_records() + skip_last do not look for matches for the last partial join record + + DESCRIPTION + This function can be used only when the table join_tab can be accessed + by keys built over the fields of previous join tables. + The function retrieves all keys from the hash table of the join buffer + built for partial join records from the buffer. For each of these keys + the function performs an index lookup and tries to match records yielded + by this lookup with records from the join buffer attached to the key. 
+ If a match is found the function will call the sub_select function trying + to look for matches for the remaining join operations. + This function does not assume that matching records are necessarily + returned with references to the keys by which they were found. If the call + of the function multi_range_read_init returns flags with + HA_MRR_NO_ASSOCIATION then a search for the key built from the returned + record is carried on. The search is performed by probing in in the hash + table of the join buffer. + This function currently is called only from the function join_records. + It's assumed that this function is always called with the skip_last + parameter equal to false. + + RETURN + return one of enum_nested_loop_state +*/ + +enum_nested_loop_state +JOIN_CACHE_BKA_UNIQUE::join_matching_records(bool skip_last) +{ + int error; + uchar *key_chain_ptr; + handler *file= join_tab->table->file; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool check_only_first_match= join_tab->check_only_first_match(); + bool no_association= test(mrr_mode & HA_MRR_NO_ASSOCIATION); + + /* Set functions to iterate over keys in the join buffer */ + RANGE_SEQ_IF seq_funcs= { bka_unique_range_seq_init, + bka_unique_range_seq_next, + check_only_first_match && !no_association ? + bka_unique_range_seq_skip_record : 0, + join_tab->cache_idx_cond ? + bka_unique_skip_index_tuple : 0 }; + + /* The value of skip_last must be always FALSE when this function is called */ + DBUG_ASSERT(!skip_last); + + /* Return at once if there are no records in the join buffer */ + if (!records) + return NESTED_LOOP_OK; + + rc= init_join_matching_records(&seq_funcs, key_entries); + if (rc != NESTED_LOOP_OK) + goto finish; + + while (!(error= file->multi_range_read_next((char **) &key_chain_ptr))) + { + if (no_association) + { + uchar *key_ref_ptr; + TABLE *table= join_tab->table; + TABLE_REF *ref= &join_tab->ref; + KEY *keyinfo= table->key_info+ref->key; + /* + Build the key value out of the record returned by the call of + multi_range_read_next in the record buffer + */ + key_copy(ref->key_buff, table->record[0], keyinfo, ref->key_length); + /* Look for this key in the join buffer */ + if (!key_search(ref->key_buff, ref->key_length, &key_ref_ptr)) + continue; + key_chain_ptr= key_ref_ptr+get_size_of_key_offset(); + } + + uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr); + uchar *next_rec_ref_ptr= last_rec_ref_ptr; + do + { + next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr); + uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset; + + if (join->thd->killed) + { + /* The user has aborted the execution of the query */ + join->thd->send_kill_message(); + rc= NESTED_LOOP_KILLED; + goto finish; + } + /* + If only the first match is needed and it has been already found + for the associated partial join record then the returned candidate + is discarded. 
+ */ + if (rc == NESTED_LOOP_OK && + (!check_only_first_match || !get_match_flag_by_pos(rec_ptr))) + { + get_record_by_pos(rec_ptr); + update_virtual_fields(join->thd, join_tab->table); + rc= generate_full_extensions(rec_ptr); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + } + while (next_rec_ref_ptr != last_rec_ref_ptr); + } + + if (error > 0 && error != HA_ERR_END_OF_FILE) + return NESTED_LOOP_ERROR; +finish: + return end_join_matching_records(rc); +} + + +/* + Check whether all records in a key chain have their match flags set on + + SYNOPSIS + check_all_match_flags_for_key() + key_chain_ptr + + DESCRIPTION + This function retrieves records in the given circular chain and checks + whether their match flags are set on. The parameter key_chain_ptr shall + point to the position in the join buffer storing the reference to the + last element of this chain. + + RETURN + TRUE if each retrieved record has its match flag set on + FALSE otherwise +*/ + +bool JOIN_CACHE_BKA_UNIQUE::check_all_match_flags_for_key(uchar *key_chain_ptr) +{ + uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr); + uchar *next_rec_ref_ptr= last_rec_ref_ptr; + do + { + next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr); + uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset; + if (!get_match_flag_by_pos(rec_ptr)) + return FALSE; + } + while (next_rec_ref_ptr != last_rec_ref_ptr); + return TRUE; +} + + +/* + Get the next key built for the records from BKA_UNIQUE join buffer + + SYNOPSIS + get_next_key() + key pointer to the buffer where the key value is to be placed + + DESCRIPTION + The function reads the next key value stored in the hash table of the + join buffer. Depending on the value of the use_emb_key flag of the + join cache the value is read either from the table itself or from + the record field where it occurs. + + RETURN + length of the key value - if the starting value of 'cur_key_entry' refers + to the position after that referred by the the value of 'last_key_entry' + 0 - otherwise. +*/ + +uint JOIN_CACHE_BKA_UNIQUE::get_next_key(uchar ** key) +{ + if (curr_key_entry == last_key_entry) + return 0; + + curr_key_entry-= key_entry_length; + + *key = use_emb_key ? get_emb_key(curr_key_entry) : curr_key_entry; + + DBUG_ASSERT(*key >= buff && *key < hash_table); + + return key_length; +} + + +/**************************************************************************** + * Join cache module end + ****************************************************************************/ |
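The hashing scheme used by JOIN_CACHE_BKA_UNIQUE above can be modelled outside the server. The following standalone C++ sketch is illustrative only and is not part of this patch: the names toy_hash_idx and ToyKeyTable are assumptions, and STL containers stand in for the real in-buffer key entries and record chains. It reuses the byte-mixing loop from get_hash_idx and shows how records that share a key value end up chained under a single key entry, which is what allows one MRR lookup per distinct key to be fanned out to all matching partial join records.

// Standalone illustration of the BKA_UNIQUE hashing/chaining idea (toy types,
// not the server's join buffer layout).
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Same byte-mixing function as JOIN_CACHE_BKA_UNIQUE::get_hash_idx, reduced
// modulo the number of hash entries.
static unsigned toy_hash_idx(const unsigned char *key, size_t key_len,
                             unsigned hash_entries)
{
  unsigned long nr= 1, nr2= 4;
  for (const unsigned char *p= key, *end= key + key_len; p < end; p++)
  {
    nr^= (unsigned long) ((((unsigned) nr & 63) + nr2) * ((unsigned) *p)) + (nr << 8);
    nr2+= 3;
  }
  return (unsigned) (nr % hash_entries);
}

// Toy model of "one key entry -> chain of record references": records sharing
// a key are grouped so that one lookup per distinct key covers all of them.
struct ToyKeyTable
{
  unsigned buckets;
  // bucket index -> list of (key value, chained record numbers)
  std::vector<std::vector<std::pair<std::string, std::vector<int> > > > table;

  explicit ToyKeyTable(unsigned n) : buckets(n), table(n) {}

  void put_record(const std::string &key, int rec_no)
  {
    unsigned idx= toy_hash_idx((const unsigned char *) key.data(),
                               key.size(), buckets);
    for (auto &entry : table[idx])
      if (entry.first == key)              // key already present: extend chain
      {
        entry.second.push_back(rec_no);
        return;
      }
    table[idx].push_back({key, {rec_no}}); // new key entry, one-record chain
  }
};

int main()
{
  ToyKeyTable cache(11);
  cache.put_record("key-A", 0);
  cache.put_record("key-B", 1);
  cache.put_record("key-A", 2);            // joins the existing chain for key-A
  for (unsigned b= 0; b < cache.buckets; b++)
    for (auto &entry : cache.table[b])
    {
      std::cout << entry.first << " -> records:";
      for (int r : entry.second)
        std::cout << ' ' << r;
      std::cout << '\n';
    }
  return 0;
}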