| field | value | date |
|---|---|---|
| author | Sergei Golubchik <sergii@pisem.net> | 2010-11-25 18:17:28 +0100 |
| committer | Sergei Golubchik <sergii@pisem.net> | 2010-11-25 18:17:28 +0100 |
| commit | 65ca700def99289cc31a7040537f5aa6e12bf485 (patch) | |
| tree | 97b3a07299b626c519da0e80c122b5b79b933914 /sql/sql_join_cache.cc | |
| parent | 2ab57de38d13d927ddff2d51aed4af34e13998f5 (diff) | |
| parent | 6e5bcca7935d3c62f84bb640e5357664a210ee12 (diff) | |
| download | mariadb-git-65ca700def99289cc31a7040537f5aa6e12bf485.tar.gz | |
merge.
checkpoint.
does not compile.
Diffstat (limited to 'sql/sql_join_cache.cc')
-rw-r--r-- | sql/sql_join_cache.cc | 3294 |
1 file changed, 3294 insertions, 0 deletions
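The patch below adds the whole join cache module (block nested-loop and batched key access buffering) in one go. As a rough orientation before the 3294-line diff, here is a minimal sketch of the blocked nested-loop idea the file is built around; the names and in-memory tables are purely illustrative and are not the patch's JOIN_CACHE classes. The real code additionally stores flag fields, null bitmaps and rowids per buffered record, and for BKA builds index lookup keys directly from the buffer.

```c++
// Minimal, self-contained sketch of blocked nested-loop (BNL) joining.
// All identifiers are hypothetical; this is not MariaDB's JOIN_CACHE API.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct Row { int key; int payload; };

// Join `outer` with `inner` on Row::key using a fixed-size "join buffer":
// buffer a block of outer rows, then scan the inner table once per block
// instead of once per outer row.
static void block_nested_loop_join(const std::vector<Row> &outer,
                                   const std::vector<Row> &inner,
                                   std::size_t buffer_rows)
{
  for (std::size_t start= 0; start < outer.size(); start+= buffer_rows)
  {
    /* "Fill the join buffer" with the next block of outer rows. */
    std::size_t end= std::min(start + buffer_rows, outer.size());

    /* One pass over the inner table serves the whole buffered block;
       this is the scan saving that BNL (and, with batched key lookups,
       BKA) is after. */
    for (const Row &irow : inner)
      for (std::size_t i= start; i < end; i++)
        if (outer[i].key == irow.key)
          std::cout << outer[i].payload << " joins " << irow.payload << "\n";
  }
}

int main()
{
  std::vector<Row> t1= {{1, 10}, {2, 20}, {3, 30}};
  std::vector<Row> t2= {{2, 200}, {3, 300}, {4, 400}};
  block_nested_loop_join(t1, t2, /* buffer_rows= */ 2);
  return 0;
}
```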
diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc new file mode 100644 index 00000000000..10064590a75 --- /dev/null +++ b/sql/sql_join_cache.cc @@ -0,0 +1,3294 @@ +/* Copyright (C) 2000-2006 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/** + @file + + @brief + join cache optimizations + + @defgroup Query_Optimizer Query Optimizer + @{ +*/ + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#include "mysql_priv.h" +#include "sql_select.h" +#include "opt_subselect.h" + +#define NO_MORE_RECORDS_IN_BUFFER (uint)(-1) + + +/***************************************************************************** + * Join cache module +******************************************************************************/ + +/* + Fill in the descriptor of a flag field associated with a join cache + + SYNOPSIS + add_field_flag_to_join_cache() + str position in a record buffer to copy the field from/to + length length of the field + field IN/OUT pointer to the field descriptor to fill in + + DESCRIPTION + The function fill in the descriptor of a cache flag field to which + the parameter 'field' points to. The function uses the first two + parameters to set the position in the record buffer from/to which + the field value is to be copied and the length of the copied fragment. + Before returning the result the function increments the value of + *field by 1. + The function ignores the fields 'blob_length' and 'ofset' of the + descriptor. + + RETURN + the length of the field +*/ + +static +uint add_flag_field_to_join_cache(uchar *str, uint length, CACHE_FIELD **field) +{ + CACHE_FIELD *copy= *field; + copy->str= str; + copy->length= length; + copy->type= 0; + copy->field= 0; + copy->referenced_field_no= 0; + (*field)++; + return length; +} + + +/* + Fill in the descriptors of table data fields associated with a join cache + + SYNOPSIS + add_table_data_fields_to_join_cache() + tab descriptors of fields from this table are to be filled + field_set descriptors for only these fields are to be created + field_cnt IN/OUT counter of data fields + descr IN/OUT pointer to the first descriptor to be filled + field_ptr_cnt IN/OUT counter of pointers to the data fields + descr_ptr IN/OUT pointer to the first pointer to blob descriptors + + DESCRIPTION + The function fills in the descriptors of cache data fields from the table + 'tab'. The descriptors are filled only for the fields marked in the + bitmap 'field_set'. + The function fills the descriptors starting from the position pointed + by 'descr'. If an added field is of a BLOB type then a pointer to the + its descriptor is added to the array descr_ptr. + At the return 'descr' points to the position after the last added + descriptor while 'descr_ptr' points to the position right after the + last added pointer. 
+ + RETURN + the total length of the added fields +*/ + +static +uint add_table_data_fields_to_join_cache(JOIN_TAB *tab, + MY_BITMAP *field_set, + uint *field_cnt, + CACHE_FIELD **descr, + uint *field_ptr_cnt, + CACHE_FIELD ***descr_ptr) +{ + Field **fld_ptr; + uint len= 0; + CACHE_FIELD *copy= *descr; + CACHE_FIELD **copy_ptr= *descr_ptr; + uint used_fields= bitmap_bits_set(field_set); + for (fld_ptr= tab->table->field; used_fields; fld_ptr++) + { + if (bitmap_is_set(field_set, (*fld_ptr)->field_index)) + { + len+= (*fld_ptr)->fill_cache_field(copy); + if (copy->type == CACHE_BLOB) + { + *copy_ptr= copy; + copy_ptr++; + (*field_ptr_cnt)++; + } + copy->field= *fld_ptr; + copy->referenced_field_no= 0; + copy++; + (*field_cnt)++; + used_fields--; + } + } + *descr= copy; + *descr_ptr= copy_ptr; + return len; +} + + +/* + Determine different counters of fields associated with a record in the cache + + SYNOPSIS + calc_record_fields() + + DESCRIPTION + The function counts the number of total fields stored in a record + of the cache and saves this number in the 'fields' member. It also + determines the number of flag fields and the number of blobs. + The function sets 'with_match_flag' on if 'join_tab' needs a match flag + i.e. if it is the first inner table of an outer join or a semi-join. + + RETURN + none +*/ + +void JOIN_CACHE::calc_record_fields() +{ + JOIN_TAB *tab = prev_cache ? prev_cache->join_tab : + join->join_tab+join->const_tables; + tables= join_tab-tab; + + fields= 0; + blobs= 0; + flag_fields= 0; + data_field_count= 0; + data_field_ptr_count= 0; + referenced_fields= 0; + + for ( ; tab < join_tab ; tab++) + { + calc_used_field_length(join->thd, tab); + flag_fields+= test(tab->used_null_fields || tab->used_uneven_bit_fields); + flag_fields+= test(tab->table->maybe_null); + fields+= tab->used_fields; + blobs+= tab->used_blobs; + + fields+= tab->check_rowid_field(); + } + if ((with_match_flag= join_tab->use_match_flag())) + flag_fields++; + fields+= flag_fields; +} + +/* + Allocate memory for descriptors and pointers to them associated with the cache + + SYNOPSIS + alloc_fields() + + DESCRIPTION + The function allocates memory for the array of fields descriptors + and the array of pointers to the field descriptors used to copy + join record data from record buffers into the join buffer and + backward. Some pointers refer to the field descriptor associated + with previous caches. They are placed at the beginning of the + array of pointers and its total number is specified by the parameter + 'external fields'. + The pointer of the first array is assigned to field_descr and the + number of elements is precalculated by the function calc_record_fields. + The allocated arrays are adjacent. + + NOTES + The memory is allocated in join->thd->memroot + + RETURN + pointer to the first array +*/ + +int JOIN_CACHE::alloc_fields(uint external_fields) +{ + uint ptr_cnt= external_fields+blobs+1; + uint fields_size= sizeof(CACHE_FIELD)*fields; + field_descr= (CACHE_FIELD*) sql_alloc(fields_size + + sizeof(CACHE_FIELD*)*ptr_cnt); + blob_ptr= (CACHE_FIELD **) ((uchar *) field_descr + fields_size); + return (field_descr == NULL); +} + +/* + Create descriptors of the record flag fields stored in the join buffer + + SYNOPSIS + create_flag_fields() + + DESCRIPTION + The function creates descriptors of the record flag fields stored + in the join buffer. These are descriptors for: + - an optional match flag field, + - table null bitmap fields, + - table null row fields. 
+ The match flag field is created when 'join_tab' is the first inner + table of an outer join our a semi-join. A null bitmap field is + created for any table whose fields are to be stored in the join + buffer if at least one of these fields is nullable or is a BIT field + whose bits are partially stored with null bits. A null row flag + is created for any table assigned to the cache if it is an inner + table of an outer join. + The descriptor for flag fields are placed one after another at the + beginning of the array of field descriptors 'field_descr' that + contains 'fields' elements. If there is a match flag field the + descriptor for it is always first in the sequence of flag fields. + The descriptors for other flag fields can follow in an arbitrary + order. + The flag field values follow in a record stored in the join buffer + in the same order as field descriptors, with the match flag always + following first. + The function sets the value of 'flag_fields' to the total number + of the descriptors created for the flag fields. + The function sets the value of 'length' to the total length of the + flag fields. + + RETURN + none +*/ + +void JOIN_CACHE::create_flag_fields() +{ + CACHE_FIELD *copy; + JOIN_TAB *tab; + + copy= field_descr; + + length=0; + + /* If there is a match flag the first field is always used for this flag */ + if (with_match_flag) + length+= add_flag_field_to_join_cache((uchar*) &join_tab->found, + sizeof(join_tab->found), + ©); + + /* Create fields for all null bitmaps and null row flags that are needed */ + for (tab= join_tab-tables; tab < join_tab; tab++) + { + TABLE *table= tab->table; + + /* Create a field for the null bitmap from table if needed */ + if (tab->used_null_fields || tab->used_uneven_bit_fields) + length+= add_flag_field_to_join_cache(table->null_flags, + table->s->null_bytes, + ©); + + /* Create table for the null row flag if needed */ + if (table->maybe_null) + length+= add_flag_field_to_join_cache((uchar*) &table->null_row, + sizeof(table->null_row), + ©); + } + + /* Theoretically the new value of flag_fields can be less than the old one */ + flag_fields= copy-field_descr; +} + + +/* + Create descriptors of all remaining data fields stored in the join buffer + + SYNOPSIS + create_remaining_fields() + all_read_fields indicates that descriptors for all read data fields + are to be created + + DESCRIPTION + The function creates descriptors for all remaining data fields of a + record from the join buffer. If the parameter 'all_read_fields' is + true the function creates fields for all read record fields that + comprise the partial join record joined with join_tab. Otherwise, + for each table tab, the set of the read fields for which the descriptors + have to be added is determined as the difference between all read fields + and and those for which the descriptors have been already created. + The latter are supposed to be marked in the bitmap tab->table->tmp_set. + The function increases the value of 'length' to the the total length of + the added fields. + + NOTES + If 'all_read_fields' is false the function modifies the value of + tab->table->tmp_set for a each table whose fields are stored in the cache. + The function calls the method Field::fill_cache_field to figure out + the type of the cache field and the maximal length of its representation + in the join buffer. If this is a blob field then additionally a pointer + to this field is added as an element of the array blob_ptr. 
For a blob + field only the size of the length of the blob data is taken into account. + It is assumed that 'data_field_count' contains the number of descriptors + for data fields that have been already created and 'data_field_ptr_count' + contains the number of the pointers to such descriptors having been + stored up to the moment. + + RETURN + none +*/ + +void JOIN_CACHE:: create_remaining_fields(bool all_read_fields) +{ + JOIN_TAB *tab; + CACHE_FIELD *copy= field_descr+flag_fields+data_field_count; + CACHE_FIELD **copy_ptr= blob_ptr+data_field_ptr_count; + + for (tab= join_tab-tables; tab < join_tab; tab++) + { + MY_BITMAP *rem_field_set; + TABLE *table= tab->table; + + if (all_read_fields) + rem_field_set= table->read_set; + else + { + bitmap_invert(&table->tmp_set); + bitmap_intersect(&table->tmp_set, table->read_set); + rem_field_set= &table->tmp_set; + } + + length+= add_table_data_fields_to_join_cache(tab, rem_field_set, + &data_field_count, ©, + &data_field_ptr_count, + ©_ptr); + + /* SemiJoinDuplicateElimination: allocate space for rowid if needed */ + if (tab->keep_current_rowid) + { + copy->str= table->file->ref; + copy->length= table->file->ref_length; + copy->type= 0; + copy->field= 0; + copy->referenced_field_no= 0; + length+= copy->length; + data_field_count++; + copy++; + } + } +} + + +/* + Calculate and set all cache constants + + SYNOPSIS + set_constants() + + DESCRIPTION + The function calculates and set all precomputed constants that are used + when writing records into the join buffer and reading them from it. + It calculates the size of offsets of a record within the join buffer + and of a field within a record. It also calculates the number of bytes + used to store record lengths. + The function also calculates the maximal length of the representation + of record in the cache excluding blob_data. This value is used when + making a dicision whether more records should be added into the join + buffer or not. + + RETURN + none +*/ + +void JOIN_CACHE::set_constants() +{ + /* + Any record from a BKA cache is prepended with the record length. + We use the record length when reading the buffer and building key values + for each record. The length allows us not to read the fields that are + not needed for keys. + If a record has match flag it also may be skipped when the match flag + is on. It happens if the cache is used for a semi-join operation or + for outer join when the 'not exist' optimization can be applied. + If some of the fields are referenced from other caches then + the record length allows us to easily reach the saved offsets for + these fields since the offsets are stored at the very end of the record. + However at this moment we don't know whether we have referenced fields for + the cache or not. Later when a referenced field is registered for the cache + we adjust the value of the flag 'with_length'. + */ + with_length= is_key_access() || + join_tab->is_inner_table_of_semi_join_with_first_match() || + join_tab->is_inner_table_of_outer_join(); + /* + At this moment we don't know yet the value of 'referenced_fields', + but in any case it can't be greater than the value of 'fields'. + */ + uint len= length + fields*sizeof(uint)+blobs*sizeof(uchar *) + + (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) + + sizeof(ulong); + buff_size= max(join->thd->variables.join_buff_size, 2*len); + size_of_rec_ofs= offset_size(buff_size); + size_of_rec_len= blobs ? 
size_of_rec_ofs : offset_size(len); + size_of_fld_ofs= size_of_rec_len; + /* + The size of the offsets for referenced fields will be added later. + The values of 'pack_length' and 'pack_length_with_blob_ptrs' are adjusted + every time when the first reference to the referenced field is registered. + */ + pack_length= (with_length ? size_of_rec_len : 0) + + (prev_cache ? prev_cache->get_size_of_rec_offset() : 0) + + length; + pack_length_with_blob_ptrs= pack_length + blobs*sizeof(uchar *); +} + + +/* + Allocate memory for a join buffer + + SYNOPSIS + alloc_buffer() + + DESCRIPTION + The function allocates a lump of memory for the cache join buffer. The + size of the allocated memory is 'buff_size' bytes. + + RETURN + 0 - if the memory has been successfully allocated + 1 - otherwise +*/ + +int JOIN_CACHE::alloc_buffer() +{ + buff= (uchar*) my_malloc(buff_size, MYF(0)); + return buff == NULL; +} + + +/* + Initialize a BNL cache + + SYNOPSIS + init() + + DESCRIPTION + The function initializes the cache structure. It supposed to be called + right after a constructor for the JOIN_CACHE_BNL. + The function allocates memory for the join buffer and for descriptors of + the record fields stored in the buffer. + + NOTES + The code of this function should have been included into the constructor + code itself. However the new operator for the class JOIN_CACHE_BNL would + never fail while memory allocation for the join buffer is not absolutely + unlikely to fail. That's why this memory allocation has to be placed in a + separate function that is called in a couple with a cache constructor. + It is quite natural to put almost all other constructor actions into + this function. + + RETURN + 0 initialization with buffer allocations has been succeeded + 1 otherwise +*/ + +int JOIN_CACHE_BNL::init() +{ + DBUG_ENTER("JOIN_CACHE::init"); + + calc_record_fields(); + + if (alloc_fields(0)) + DBUG_RETURN(1); + + create_flag_fields(); + + create_remaining_fields(TRUE); + + set_constants(); + + if (alloc_buffer()) + DBUG_RETURN(1); + + reset(TRUE); + + DBUG_RETURN(0); +} + + +/* + Initialize a BKA cache + + SYNOPSIS + init() + + DESCRIPTION + The function initializes the cache structure. It supposed to be called + right after a constructor for the JOIN_CACHE_BKA. + The function allocates memory for the join buffer and for descriptors of + the record fields stored in the buffer. + + NOTES + The code of this function should have been included into the constructor + code itself. However the new operator for the class JOIN_CACHE_BKA would + never fail while memory allocation for the join buffer is not absolutely + unlikely to fail. That's why this memory allocation has to be placed in a + separate function that is called in a couple with a cache constructor. + It is quite natural to put almost all other constructor actions into + this function. + + RETURN + 0 initialization with buffer allocations has been succeeded + 1 otherwise +*/ + +int JOIN_CACHE_BKA::init() +{ + JOIN_TAB *tab; + JOIN_CACHE *cache; + local_key_arg_fields= 0; + external_key_arg_fields= 0; + DBUG_ENTER("JOIN_CACHE_BKA::init"); + + calc_record_fields(); + + /* Mark all fields that can be used as arguments for this key access */ + TABLE_REF *ref= &join_tab->ref; + cache= this; + do + { + /* + Traverse the ref expressions and find the occurrences of fields in them for + each table 'tab' whose fields are to be stored in the 'cache' join buffer. + Mark these fields in the bitmap tab->table->tmp_set. 
+ For these fields count the number of them stored in this cache and the + total number of them stored in the previous caches. Save the result + of the counting 'in local_key_arg_fields' and 'external_key_arg_fields' + respectively. + */ + for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++) + { + uint key_args; + bitmap_clear_all(&tab->table->tmp_set); + for (uint i= 0; i < ref->key_parts; i++) + { + Item *ref_item= ref->items[i]; + if (!(tab->table->map & ref_item->used_tables())) + continue; + ref_item->walk(&Item::add_field_to_set_processor, 1, + (uchar *) tab->table); + } + if ((key_args= bitmap_bits_set(&tab->table->tmp_set))) + { + if (cache == this) + local_key_arg_fields+= key_args; + else + external_key_arg_fields+= key_args; + } + } + cache= cache->prev_cache; + } + while (cache); + + if (alloc_fields(external_key_arg_fields)) + DBUG_RETURN(1); + + create_flag_fields(); + + /* + Save pointers to the cache fields in previous caches + that are used to build keys for this key access. + */ + cache= this; + uint ext_key_arg_cnt= external_key_arg_fields; + CACHE_FIELD *copy; + CACHE_FIELD **copy_ptr= blob_ptr; + while (ext_key_arg_cnt) + { + cache= cache->prev_cache; + for (tab= cache->join_tab-cache->tables; tab < cache->join_tab ; tab++) + { + CACHE_FIELD *copy_end; + MY_BITMAP *key_read_set= &tab->table->tmp_set; + /* key_read_set contains the bitmap of tab's fields referenced by ref */ + if (bitmap_is_clear_all(key_read_set)) + continue; + copy_end= cache->field_descr+cache->fields; + for (copy= cache->field_descr+cache->flag_fields; copy < copy_end; copy++) + { + /* + (1) - when we store rowids for DuplicateWeedout, they have + copy->field==NULL + */ + if (copy->field && // (1) + copy->field->table == tab->table && + bitmap_is_set(key_read_set, copy->field->field_index)) + { + *copy_ptr++= copy; + ext_key_arg_cnt--; + if (!copy->referenced_field_no) + { + /* + Register the referenced field 'copy': + - set the offset number in copy->referenced_field_no, + - adjust the value of the flag 'with_length', + - adjust the values of 'pack_length' and + of 'pack_length_with_blob_ptrs'. + */ + copy->referenced_field_no= ++cache->referenced_fields; + cache->with_length= TRUE; + cache->pack_length+= cache->get_size_of_fld_offset(); + cache->pack_length_with_blob_ptrs+= cache->get_size_of_fld_offset(); + } + } + } + } + } + /* After this 'blob_ptr' shall not be be changed */ + blob_ptr= copy_ptr; + + /* Now create local fields that are used to build ref for this key access */ + copy= field_descr+flag_fields; + for (tab= join_tab-tables; tab < join_tab ; tab++) + { + length+= add_table_data_fields_to_join_cache(tab, &tab->table->tmp_set, + &data_field_count, ©, + &data_field_ptr_count, + ©_ptr); + } + + use_emb_key= check_emb_key_usage(); + + create_remaining_fields(FALSE); + + set_constants(); + + if (alloc_buffer()) + DBUG_RETURN(1); + + reset(TRUE); + + DBUG_RETURN(0); +} + + +/* + Check the possibility to read the access keys directly from the join buffer + + SYNOPSIS + check_emb_key_usage() + + DESCRIPTION + The function checks some conditions at which the key values can be read + directly from the join buffer. This is possible when the key values can be + composed by concatenation of the record fields stored in the join buffer. + Sometimes when the access key is multi-component the function has to re-order + the fields written into the join buffer to make keys embedded. If key + values for the key access are detected as embedded then 'use_emb_key' + is set to TRUE. 
+ + EXAMPLE + Let table t2 has an index defined on the columns a,b . Let's assume also + that the columns t2.a, t2.b as well as the columns t1.a, t1.b are all + of the integer type. Then if the query + SELECT COUNT(*) FROM t1, t2 WHERE t1.a=t2.a and t1.b=t2.b + is executed with a join cache in such a way that t1 is the driving + table then the key values to access table t2 can be read directly + from the join buffer. + + NOTES + In some cases key values could be read directly from the join buffer but + we still do not consider them embedded. In the future we'll expand the + the class of keys which we identify as embedded. + + RETURN + TRUE - key values will be considered as embedded, + FALSE - otherwise. +*/ + +bool JOIN_CACHE_BKA::check_emb_key_usage() +{ + uint i; + Item *item; + KEY_PART_INFO *key_part; + CACHE_FIELD *copy; + CACHE_FIELD *copy_end; + uint len= 0; + TABLE *table= join_tab->table; + TABLE_REF *ref= &join_tab->ref; + KEY *keyinfo= table->key_info+ref->key; + + /* + If some of the key arguments are not from the local cache the key + is not considered as embedded. + TODO: + Expand it to the case when ref->key_parts=1 and local_key_arg_fields=0. + */ + if (external_key_arg_fields != 0) + return FALSE; + /* + If the number of the local key arguments is not equal to the number + of key parts the key value cannot be read directly from the join buffer. + */ + if (local_key_arg_fields != ref->key_parts) + return FALSE; + + /* + A key is not considered embedded if one of the following is true: + - one of its key parts is not equal to a field + - it is a partial key + - definition of the argument field does not coincide with the + definition of the corresponding key component + - some of the key components are nullable + */ + for (i=0; i < ref->key_parts; i++) + { + item= ref->items[i]->real_item(); + if (item->type() != Item::FIELD_ITEM) + return FALSE; + key_part= keyinfo->key_part+i; + if (key_part->key_part_flag & HA_PART_KEY_SEG) + return FALSE; + if (!key_part->field->eq_def(((Item_field *) item)->field)) + return FALSE; + if (key_part->field->maybe_null()) + return FALSE; + } + + copy= field_descr+flag_fields; + copy_end= copy+local_key_arg_fields; + for ( ; copy < copy_end; copy++) + { + /* + If some of the key arguments are of variable length the key + is not considered as embedded. + */ + if (copy->type != 0) + return FALSE; + /* + If some of the key arguments are bit fields whose bits are partially + stored with null bits the key is not considered as embedded. + */ + if (copy->field->type() == MYSQL_TYPE_BIT && + ((Field_bit*) (copy->field))->bit_len) + return FALSE; + len+= copy->length; + } + + emb_key_length= len; + + /* + Make sure that key fields follow the order of the corresponding + key components these fields are equal to. For this the descriptors + of the fields that comprise the key might be re-ordered. 
+ */ + for (i= 0; i < ref->key_parts; i++) + { + uint j; + Item *item= ref->items[i]->real_item(); + Field *fld= ((Item_field *) item)->field; + CACHE_FIELD *init_copy= field_descr+flag_fields+i; + for (j= i, copy= init_copy; i < local_key_arg_fields; i++, copy++) + { + if (fld->eq(copy->field)) + { + if (j != i) + { + CACHE_FIELD key_part_copy= *copy; + *copy= *init_copy; + *init_copy= key_part_copy; + } + break; + } + } + } + + return TRUE; +} + + +/* + Calculate the increment of the MRR buffer for a record write + + SYNOPSIS + aux_buffer_incr() + + DESCRIPTION + This implementation of the virtual function aux_buffer_incr determines + for how much the size of the MRR buffer should be increased when another + record is added to the cache. + + RETURN + the increment of the size of the MRR buffer for the next record +*/ + +uint JOIN_CACHE_BKA::aux_buffer_incr() +{ + uint incr= 0; + TABLE_REF *ref= &join_tab->ref; + TABLE *tab= join_tab->table; + uint rec_per_key= tab->key_info[ref->key].rec_per_key[ref->key_parts-1]; + set_if_bigger(rec_per_key, 1); + if (records == 1) + incr= ref->key_length + tab->file->ref_length; + incr+= tab->file->stats.mrr_length_per_rec * rec_per_key; + return incr; +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + JOIN_CACHE_BKA::skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + This function is invoked from MRR implementation to check if an index + tuple matches the index condition. It is used in the case where the index + condition actually depends on both columns of the used index and columns + from previous tables. + + Accessing columns of the previous tables requires special handling with + BKA. The idea of BKA is to collect record combinations in a buffer and + then do a batch of ref access lookups, i.e. by the time we're doing a + lookup its previous-records-combination is not in prev_table->record[0] + but somewhere in the join buffer. + + We need to get it from there back into prev_table(s)->record[0] before we + can evaluate the index condition, and that's why we need this function + instead of regular IndexConditionPushdown. + + NOTE + Possible optimization: + Before we unpack the record from a previous table + check if this table is used in the condition. + If so then unpack the record otherwise skip the unpacking. + This should be done by a special virtual method + get_partial_record_by_pos(). + + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise +*/ + +bool JOIN_CACHE_BKA::skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("JOIN_CACHE_BKA::skip_index_tuple"); + JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq; + cache->get_record_by_pos((uchar*)range_info); + DBUG_RETURN(!join_tab->cache_idx_cond->val_int()); +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + bka_skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + This is wrapper for JOIN_CACHE_BKA::skip_index_tuple method, + see comments there. + + NOTE + This function is used as a RANGE_SEQ_IF::skip_index_tuple callback. 
+ + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise +*/ + +static +bool bka_skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("bka_skip_index_tuple"); + JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq; + DBUG_RETURN(cache->skip_index_tuple(rseq, range_info)); +} + + +/* + Write record fields and their required offsets into the join cache buffer + + SYNOPSIS + write_record_data() + link a reference to the associated info in the previous cache + is_full OUT true if it has been decided that no more records will be + added to the join buffer + + DESCRIPTION + This function put into the cache buffer the following info that it reads + from the join record buffers or computes somehow: + (1) the length of all fields written for the record (optional) + (2) an offset to the associated info in the previous cache (if there is any) + determined by the link parameter + (3) all flag fields of the tables whose data field are put into the cache: + - match flag (optional), + - null bitmaps for all tables, + - null row flags for all tables + (4) values of all data fields including + - full images of those fixed legth data fields that cannot have + trailing spaces + - significant part of fixed length fields that can have trailing spaces + with the prepanded length + - data of non-blob variable length fields with the prepanded data length + - blob data from blob fields with the prepanded data length + (5) record offset values for the data fields that are referred to from + other caches + + The record is written at the current position stored in the field 'pos'. + At the end of the function 'pos' points at the position right after the + written record data. + The function increments the number of records in the cache that is stored + in the 'records' field by 1. The function also modifies the values of + 'curr_rec_pos' and 'last_rec_pos' to point to the written record. + The 'end_pos' cursor is modified accordingly. + The 'last_rec_blob_data_is_in_rec_buff' is set on if the blob data + remains in the record buffers and not copied to the join buffer. It may + happen only to the blob data from the last record added into the cache. + + + RETURN + length of the written record data +*/ + +uint JOIN_CACHE::write_record_data(uchar * link, bool *is_full) +{ + uint len; + bool last_record; + CACHE_FIELD *copy; + CACHE_FIELD *copy_end; + uchar *cp= pos; + uchar *init_pos= cp; + uchar *rec_len_ptr= 0; + + records++; /* Increment the counter of records in the cache */ + + len= pack_length; + + /* Make an adjustment for the size of the auxiliary buffer if there is any */ + uint incr= aux_buffer_incr(); + ulong rem= rem_space(); + aux_buff_size+= len+incr < rem ? incr : rem; + + /* + For each blob to be put into cache save its length and a pointer + to the value in the corresponding element of the blob_ptr array. + Blobs with null values are skipped. + Increment 'len' by the total length of all these blobs. + */ + if (blobs) + { + CACHE_FIELD **copy_ptr= blob_ptr; + CACHE_FIELD **copy_ptr_end= copy_ptr+blobs; + for ( ; copy_ptr < copy_ptr_end; copy_ptr++) + { + Field_blob *blob_field= (Field_blob *) (*copy_ptr)->field; + if (!blob_field->is_null()) + { + uint blob_len= blob_field->get_length(); + (*copy_ptr)->blob_length= blob_len; + len+= blob_len; + blob_field->get_ptr(&(*copy_ptr)->str); + } + } + } + + /* + Check whether we won't be able to add any new record into the cache after + this one because the cache will be full. Set last_record to TRUE if it's so. 
+ The assume that the cache will be full after the record has been written + into it if either the remaining space of the cache is not big enough for the + record's blob values or if there is a chance that not all non-blob fields + of the next record can be placed there. + This function is called only in the case when there is enough space left in + the cache to store at least non-blob parts of the current record. + */ + last_record= (len+pack_length_with_blob_ptrs) > rem_space(); + + /* + Save the position for the length of the record in the cache if it's needed. + The length of the record will be inserted here when all fields of the record + are put into the cache. + */ + if (with_length) + { + rec_len_ptr= cp; + cp+= size_of_rec_len; + } + + /* + Put a reference to the fields of the record that are stored in the previous + cache if there is any. This reference is passed by the 'link' parameter. + */ + if (prev_cache) + { + cp+= prev_cache->get_size_of_rec_offset(); + prev_cache->store_rec_ref(cp, link); + } + + curr_rec_pos= cp; + + /* If the there is a match flag set its value to 0 */ + copy= field_descr; + if (with_match_flag) + *copy[0].str= 0; + + /* First put into the cache the values of all flag fields */ + copy_end= field_descr+flag_fields; + for ( ; copy < copy_end; copy++) + { + memcpy(cp, copy->str, copy->length); + cp+= copy->length; + } + + /* Now put the values of the remaining fields as soon as they are not nulls */ + copy_end= field_descr+fields; + for ( ; copy < copy_end; copy++) + { + Field *field= copy->field; + if (field && field->maybe_null() && field->is_null()) + { + /* Do not copy a field if its value is null */ + if (copy->referenced_field_no) + copy->offset= 0; + continue; + } + /* Save the offset of the field to put it later at the end of the record */ + if (copy->referenced_field_no) + copy->offset= cp-curr_rec_pos; + + if (copy->type == CACHE_BLOB) + { + Field_blob *blob_field= (Field_blob *) copy->field; + if (last_record) + { + last_rec_blob_data_is_in_rec_buff= 1; + /* Put down the length of the blob and the pointer to the data */ + blob_field->get_image(cp, copy->length+sizeof(char*), + blob_field->charset()); + cp+= copy->length+sizeof(char*); + } + else + { + /* First put down the length of the blob and then copy the data */ + blob_field->get_image(cp, copy->length, + blob_field->charset()); + memcpy(cp+copy->length, copy->str, copy->blob_length); + cp+= copy->length+copy->blob_length; + } + } + else + { + switch (copy->type) { + case CACHE_VARSTR1: + /* Copy the significant part of the short varstring field */ + len= (uint) copy->str[0] + 1; + memcpy(cp, copy->str, len); + cp+= len; + break; + case CACHE_VARSTR2: + /* Copy the significant part of the long varstring field */ + len= uint2korr(copy->str) + 2; + memcpy(cp, copy->str, len); + cp+= len; + break; + case CACHE_STRIPPED: + { + /* + Put down the field value stripping all trailing spaces off. + After this insert the length of the written sequence of bytes. 
+ */ + uchar *str, *end; + for (str= copy->str, end= str+copy->length; + end > str && end[-1] == ' '; + end--) ; + len=(uint) (end-str); + int2store(cp, len); + memcpy(cp+2, str, len); + cp+= len+2; + break; + } + default: + /* Copy the entire image of the field from the record buffer */ + memcpy(cp, copy->str, copy->length); + cp+= copy->length; + } + } + } + + /* Add the offsets of the fields that are referenced from other caches */ + if (referenced_fields) + { + uint cnt= 0; + for (copy= field_descr+flag_fields; copy < copy_end ; copy++) + { + if (copy->referenced_field_no) + { + store_fld_offset(cp+size_of_fld_ofs*(copy->referenced_field_no-1), + copy->offset); + cnt++; + } + } + cp+= size_of_fld_ofs*cnt; + } + + if (rec_len_ptr) + store_rec_length(rec_len_ptr, (ulong) (cp-rec_len_ptr-size_of_rec_len)); + last_rec_pos= curr_rec_pos; + end_pos= pos= cp; + *is_full= last_record; + return (uint) (cp-init_pos); +} + + +/* + Reset the join buffer for reading/writing: default implementation + + SYNOPSIS + reset() + for_writing if it's TRUE the function reset the buffer for writing + + DESCRIPTION + This default implementation of the virtual function reset() resets + the join buffer for reading or writing. + If the buffer is reset for reading only the 'pos' value is reset + to point to the very beginning of the join buffer. If the buffer is + reset for writing additionally: + - the counter of the records in the buffer is set to 0, + - the the value of 'last_rec_pos' gets pointing at the position just + before the buffer, + - 'end_pos' is set to point to the beginning of the join buffer, + - the size of the auxiliary buffer is reset to 0, + - the flag 'last_rec_blob_data_is_in_rec_buff' is set to 0. + + RETURN + none +*/ + +void JOIN_CACHE::reset(bool for_writing) +{ + pos= buff; + curr_rec_link= 0; + if (for_writing) + { + records= 0; + last_rec_pos= buff; + aux_buff_size= 0; + end_pos= pos; + last_rec_blob_data_is_in_rec_buff= 0; + } +} + +/* + Add a record into the join buffer: the default implementation + + SYNOPSIS + put_record() + + DESCRIPTION + This default implementation of the virtual function put_record writes + the next matching record into the join buffer. + It also links the record having been written into the join buffer with + the matched record in the previous cache if there is any. + The implementation assumes that the function get_curr_link() + will return exactly the pointer to this matched record. + + RETURN + TRUE if it has been decided that it should be the last record + in the join buffer, + FALSE otherwise +*/ + +bool JOIN_CACHE::put_record() +{ + bool is_full; + uchar *link= 0; + if (prev_cache) + link= prev_cache->get_curr_rec_link(); + write_record_data(link, &is_full); + return is_full; +} + + +/* + Read the next record from the join buffer: the default implementation + + SYNOPSIS + get_record() + + DESCRIPTION + This default implementation of the virtual function get_record + reads fields of the next record from the join buffer of this cache. + The function also reads all other fields associated with this record + from the the join buffers of the previous caches. The fields are read + into the corresponding record buffers. + It is supposed that 'pos' points to the position in the buffer + right after the previous record when the function is called. + When the function returns the 'pos' values is updated to point + to the position after the read record. 
+ The value of 'curr_rec_pos' is also updated by the function to + point to the beginning of the first field of the record in the + join buffer. + + RETURN + TRUE - there are no more records to read from the join buffer + FALSE - otherwise +*/ + +bool JOIN_CACHE::get_record() +{ + bool res; + uchar *prev_rec_ptr= 0; + if (with_length) + pos+= size_of_rec_len; + if (prev_cache) + { + pos+= prev_cache->get_size_of_rec_offset(); + prev_rec_ptr= prev_cache->get_rec_ref(pos); + } + curr_rec_pos= pos; + if (!(res= read_all_record_fields() == NO_MORE_RECORDS_IN_BUFFER)) + { + pos+= referenced_fields*size_of_fld_ofs; + if (prev_cache) + prev_cache->get_record_by_pos(prev_rec_ptr); + } + return res; +} + + +/* + Read a positioned record from the join buffer: the default implementation + + SYNOPSIS + get_record_by_pos() + rec_ptr position of the first field of the record in the join buffer + + DESCRIPTION + This default implementation of the virtual function get_record_pos + reads the fields of the record positioned at 'rec_ptr' from the join buffer. + The function also reads all other fields associated with this record + from the the join buffers of the previous caches. The fields are read + into the corresponding record buffers. + + RETURN + none +*/ + +void JOIN_CACHE::get_record_by_pos(uchar *rec_ptr) +{ + uchar *save_pos= pos; + pos= rec_ptr; + read_all_record_fields(); + pos= save_pos; + if (prev_cache) + { + uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr); + prev_cache->get_record_by_pos(prev_rec_ptr); + } +} + + +/* + Test the match flag from the referenced record: the default implementation + + SYNOPSIS + get_match_flag_by_pos() + rec_ptr position of the first field of the record in the join buffer + + DESCRIPTION + This default implementation of the virtual function get_match_flag_by_pos + test the match flag for the record pointed by the reference at the position + rec_ptr. If the match flag in placed one of the previous buffers the function + first reaches the linked record fields in this buffer. + + RETURN + TRUE if the match flag is set on + FALSE otherwise +*/ + +bool JOIN_CACHE::get_match_flag_by_pos(uchar *rec_ptr) +{ + if (with_match_flag) + return test(*rec_ptr); + if (prev_cache) + { + uchar *prev_rec_ptr= prev_cache->get_rec_ref(rec_ptr); + return prev_cache->get_match_flag_by_pos(prev_rec_ptr); + } + DBUG_ASSERT(0); + return FALSE; +} + + +/* + Read all flag and data fields of a record from the join buffer + + SYNOPSIS + read_all_record_fields() + + DESCRIPTION + The function reads all flag and data fields of a record from the join + buffer into the corresponding record buffers. + The fields are read starting from the position 'pos' which is + supposed to point to the beginning og the first record field. + The function increments the value of 'pos' by the length of the + read data. 
+ + RETURN + (-1) - if there is no more records in the join buffer + length of the data read from the join buffer - otherwise +*/ + +uint JOIN_CACHE::read_all_record_fields() +{ + uchar *init_pos= pos; + + if (pos > last_rec_pos || !records) + return NO_MORE_RECORDS_IN_BUFFER; + + /* First match flag, read null bitmaps and null_row flag for each table */ + read_flag_fields(); + + /* Now read the remaining table fields if needed */ + CACHE_FIELD *copy= field_descr+flag_fields; + CACHE_FIELD *copy_end= field_descr+fields; + bool blob_in_rec_buff= blob_data_is_in_rec_buff(init_pos); + for ( ; copy < copy_end; copy++) + read_record_field(copy, blob_in_rec_buff); + + return (uint) (pos-init_pos); +} + + +/* + Read all flag fields of a record from the join buffer + + SYNOPSIS + read_flag_fields() + + DESCRIPTION + The function reads all flag fields of a record from the join + buffer into the corresponding record buffers. + The fields are read starting from the position 'pos'. + The function increments the value of 'pos' by the length of the + read data. + + RETURN + length of the data read from the join buffer +*/ + +uint JOIN_CACHE::read_flag_fields() +{ + uchar *init_pos= pos; + CACHE_FIELD *copy= field_descr; + CACHE_FIELD *copy_end= copy+flag_fields; + for ( ; copy < copy_end; copy++) + { + memcpy(copy->str, pos, copy->length); + pos+= copy->length; + } + return (pos-init_pos); +} + + +/* + Read a data record field from the join buffer + + SYNOPSIS + read_record_field() + copy the descriptor of the data field to be read + blob_in_rec_buff indicates whether this is the field from the record + whose blob data are in record buffers + + DESCRIPTION + The function reads the data field specified by the parameter copy + from the join buffer into the corresponding record buffer. + The field is read starting from the position 'pos'. + The data of blob values is not copied from the join buffer. + The function increments the value of 'pos' by the length of the + read data. 
+ + RETURN + length of the data read from the join buffer +*/ + +uint JOIN_CACHE::read_record_field(CACHE_FIELD *copy, bool blob_in_rec_buff) +{ + uint len; + /* Do not copy the field if its value is null */ + if (copy->field && copy->field->maybe_null() && copy->field->is_null()) + return 0; + if (copy->type == CACHE_BLOB) + { + Field_blob *blob_field= (Field_blob *) copy->field; + /* + Copy the length and the pointer to data but not the blob data + itself to the record buffer + */ + if (blob_in_rec_buff) + { + blob_field->set_image(pos, copy->length+sizeof(char*), + blob_field->charset()); + len= copy->length+sizeof(char*); + } + else + { + blob_field->set_ptr(pos, pos+copy->length); + len= copy->length+blob_field->get_length(); + } + } + else + { + switch (copy->type) { + case CACHE_VARSTR1: + /* Copy the significant part of the short varstring field */ + len= (uint) pos[0] + 1; + memcpy(copy->str, pos, len); + break; + case CACHE_VARSTR2: + /* Copy the significant part of the long varstring field */ + len= uint2korr(pos) + 2; + memcpy(copy->str, pos, len); + break; + case CACHE_STRIPPED: + /* Pad the value by spaces that has been stripped off */ + len= uint2korr(pos); + memcpy(copy->str, pos+2, len); + memset(copy->str+len, ' ', copy->length-len); + len+= 2; + break; + default: + /* Copy the entire image of the field from the record buffer */ + len= copy->length; + memcpy(copy->str, pos, len); + } + } + pos+= len; + return len; +} + + +/* + Read a referenced field from the join buffer + + SYNOPSIS + read_referenced_field() + copy pointer to the descriptor of the referenced field + rec_ptr pointer to the record that may contain this field + len IN/OUT total length of the record fields + + DESCRIPTION + The function checks whether copy points to a data field descriptor + for this cache object. If it does not then the function returns + FALSE. Otherwise the function reads the field of the record in + the join buffer pointed by 'rec_ptr' into the corresponding record + buffer and returns TRUE. + If the value of *len is 0 then the function sets it to the total + length of the record fields including possible trailing offset + values. Otherwise *len is supposed to provide this value that + has been obtained earlier. + + RETURN + TRUE 'copy' points to a data descriptor of this join cache + FALSE otherwise +*/ + +bool JOIN_CACHE::read_referenced_field(CACHE_FIELD *copy, + uchar *rec_ptr, + uint *len) +{ + uchar *ptr; + uint offset; + if (copy < field_descr || copy >= field_descr+fields) + return FALSE; + if (!*len) + { + /* Get the total length of the record fields */ + uchar *len_ptr= rec_ptr; + if (prev_cache) + len_ptr-= prev_cache->get_size_of_rec_offset(); + *len= get_rec_length(len_ptr-size_of_rec_len); + } + + ptr= rec_ptr-(prev_cache ? prev_cache->get_size_of_rec_offset() : 0); + offset= get_fld_offset(ptr+ *len - + size_of_fld_ofs* + (referenced_fields+1-copy->referenced_field_no)); + bool is_null= FALSE; + if (offset == 0 && flag_fields) + is_null= TRUE; + if (is_null) + copy->field->set_null(); + else + { + uchar *save_pos= pos; + copy->field->set_notnull(); + pos= rec_ptr+offset; + read_record_field(copy, blob_data_is_in_rec_buff(rec_ptr)); + pos= save_pos; + } + return TRUE; +} + + +/* + Skip record from join buffer if its match flag is on: default implementation + + SYNOPSIS + skip_record_if_match() + + DESCRIPTION + This default implementation of the virtual function skip_record_if_match + skips the next record from the join buffer if its match flag is set on. 
+ If the record is skipped the value of 'pos' is set to points to the position + right after the record. + + RETURN + TRUE - the match flag is on and the record has been skipped + FALSE - the match flag is off +*/ + +bool JOIN_CACHE::skip_record_if_match() +{ + DBUG_ASSERT(with_length); + uint offset= size_of_rec_len; + if (prev_cache) + offset+= prev_cache->get_size_of_rec_offset(); + /* Check whether the match flag is on */ + if (get_match_flag_by_pos(pos+offset)) + { + pos+= size_of_rec_len + get_rec_length(pos); + return TRUE; + } + return FALSE; +} + + +/* + Restore the fields of the last record from the join buffer + + SYNOPSIS + restore_last_record() + + DESCRIPTION + This function restore the values of the fields of the last record put + into join buffer in record buffers. The values most probably have been + overwritten by the field values from other records when they were read + from the join buffer into the record buffer in order to check pushdown + predicates. + + RETURN + none +*/ + +void JOIN_CACHE::restore_last_record() +{ + if (records) + get_record_by_pos(last_rec_pos); +} + + +/* + Join records from the join buffer with records from the next join table + + SYNOPSIS + join_records() + skip_last do not find matches for the last record from the buffer + + DESCRIPTION + The functions extends all records from the join buffer by the matched + records from join_tab. In the case of outer join operation it also + adds null complementing extensions for the records from the join buffer + that have no match. + No extensions are generated for the last record from the buffer if + skip_last is true. + + NOTES + The function must make sure that if linked join buffers are used then + a join buffer cannot be refilled again until all extensions in the + buffers chained to this one are generated. + Currently an outer join operation with several inner tables always uses + at least two linked buffers with the match join flags placed in the + first buffer. Any record composed of rows of the inner tables that + matches a record in this buffer must refer to the position of the + corresponding match flag. + + IMPLEMENTATION + When generating extensions for outer tables of an outer join operation + first we generate all extensions for those records from the join buffer + that have matches, after which null complementing extension for all + unmatched records from the join buffer are generated. + + RETURN + return one of enum_nested_loop_state, except NESTED_LOOP_NO_MORE_ROWS. +*/ + +enum_nested_loop_state JOIN_CACHE::join_records(bool skip_last) +{ + JOIN_TAB *tab; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool outer_join_first_inner= join_tab->is_first_inner_for_outer_join(); + + if (outer_join_first_inner && !join_tab->first_unmatched) + join_tab->not_null_compl= TRUE; + + if (!join_tab->first_unmatched) + { + /* Find all records from join_tab that match records from join buffer */ + rc= join_matching_records(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + if (outer_join_first_inner) + { + if (next_cache) + { + /* + Ensure that all matches for outer records from join buffer are to be + found. Now we ensure that all full records are found for records from + join buffer. Generally this is an overkill. + TODO: Ensure that only matches of the inner table records have to be + found for the records from join buffer. 
+ */ + rc= next_cache->join_records(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + join_tab->not_null_compl= FALSE; + /* Prepare for generation of null complementing extensions */ + for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++) + tab->first_unmatched= join_tab->first_inner; + } + } + if (join_tab->first_unmatched) + { + if (is_key_access()) + restore_last_record(); + + /* + Generate all null complementing extensions for the records from + join buffer that don't have any matching rows from the inner tables. + */ + reset(FALSE); + rc= join_null_complements(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + if(next_cache) + { + /* + When using linked caches we must ensure the records in the next caches + that refer to the records in the join buffer are fully extended. + Otherwise we could have references to the records that have been + already erased from the join buffer and replaced for new records. + */ + rc= next_cache->join_records(skip_last); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + if (outer_join_first_inner) + { + /* + All null complemented rows have been already generated for all + outer records from join buffer. Restore the state of the + first_unmatched values to 0 to avoid another null complementing. + */ + for (tab= join_tab->first_inner; tab <= join_tab->last_inner; tab++) + tab->first_unmatched= 0; + } + + if (skip_last) + { + DBUG_ASSERT(!is_key_access()); + /* + Restore the last record from the join buffer to generate + all extentions for it. + */ + get_record(); + } + +finish: + restore_last_record(); + reset(TRUE); + return rc; +} + + +/* + Using BNL find matches from the next table for records from the join buffer + + SYNOPSIS + join_matching_records() + skip_last do not look for matches for the last partial join record + + DESCRIPTION + The function retrieves all rows of the join_tab table and check whether + they match partial join records from the join buffer. If a match is found + the function will call the sub_select function trying to look for matches + for the remaining join operations. + This function currently is called only from the function join_records. + If the value of skip_last is true the function writes the partial join + record from the record buffer into the join buffer to save its value for + the future processing in the caller function. + + NOTES + The function produces all matching extensions for the records in the + join buffer following the path of the Blocked Nested Loops algorithm. + When an outer join operation is performed all unmatched records from + the join buffer must be extended by null values. The function + 'join_null_complements' serves this purpose. + + RETURN + return one of enum_nested_loop_state. +*/ + +enum_nested_loop_state JOIN_CACHE_BNL::join_matching_records(bool skip_last) +{ + uint cnt; + int error; + JOIN_TAB *tab; + READ_RECORD *info; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool check_only_first_match= join_tab->check_only_first_match(); + SQL_SELECT *select= join_tab->cache_select; + + join_tab->table->null_row= 0; + + /* Return at once if there are no records in the join buffer */ + if (!records) + return NESTED_LOOP_OK; + + /* + When joining we read records from the join buffer back into record buffers. 
+ If matches for the last partial join record are found through a call to + the sub_select function then this partial join record must be saved in the + join buffer in order to be restored just before the sub_select call. + */ + if (skip_last) + put_record(); + + if (join_tab->use_quick == 2 && join_tab->select->quick) + { + /* A dynamic range access was used last. Clean up after it */ + delete join_tab->select->quick; + join_tab->select->quick= 0; + } + + for (tab= join->join_tab; tab != join_tab ; tab++) + { + tab->status= tab->table->status; + tab->table->status= 0; + } + + /* Start retrieving all records of the joined table */ + if ((error= join_init_read_record(join_tab))) + { + rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR; + goto finish; + } + + info= &join_tab->read_record; + do + { + if (join_tab->keep_current_rowid) + join_tab->table->file->position(join_tab->table->record[0]); + + if (join->thd->killed) + { + /* The user has aborted the execution of the query */ + join->thd->send_kill_message(); + rc= NESTED_LOOP_KILLED; + goto finish; + } + int err= 0; + + if (rc == NESTED_LOOP_OK) + update_virtual_fields(join->thd, join_tab->table); + + /* + Do not look for matches if the last read record of the joined table + does not meet the conditions that have been pushed to this table + */ + if (rc == NESTED_LOOP_OK && + (!select || (err= select->skip_record(join->thd)) != 0)) + { + if (err < 0) + return NESTED_LOOP_ERROR; + rc= NESTED_LOOP_OK; + + /* Prepare to read records from the join buffer */ + reset(FALSE); + + /* Read each record from the join buffer and look for matches */ + for (cnt= records - test(skip_last) ; cnt; cnt--) + { + /* + If only the first match is needed and it has been already found for + the next record read from the join buffer then the record is skipped. + */ + if (!check_only_first_match || !skip_record_if_match()) + { + get_record(); + rc= generate_full_extensions(get_curr_rec()); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + } + } + } while (!(error= info->read_record(info))); + + if (error > 0) // Fatal error + rc= NESTED_LOOP_ERROR; +finish: + for (tab= join->join_tab; tab != join_tab ; tab++) + tab->table->status= tab->status; + return rc; +} + + +/* + Set match flag for a record in join buffer if it has not been set yet + + SYNOPSIS + set_match_flag_if_none() + first_inner the join table to which this flag is attached to + rec_ptr pointer to the record in the join buffer + + DESCRIPTION + If the records of the table are accumulated in a join buffer the function + sets the match flag for the record in the buffer that is referred to by + the record from this cache positioned at 'rec_ptr'. + The function also sets the match flag 'found' of the table first inner + if it has not been set before. + + NOTES + The function assumes that the match flag for any record in any cache + is placed in the first byte occupied by the record fields. + + RETURN + TRUE the match flag is set by this call for the first time + FALSE the match flag has been set before this call +*/ + +bool JOIN_CACHE::set_match_flag_if_none(JOIN_TAB *first_inner, + uchar *rec_ptr) +{ + if (!first_inner->cache) + { + /* + Records of the first inner table to which the flag is attached to + are not accumulated in a join buffer. 
+ */ + if (first_inner->found) + return FALSE; + else + { + first_inner->found= 1; + return TRUE; + } + } + JOIN_CACHE *cache= this; + while (cache->join_tab != first_inner) + { + cache= cache->prev_cache; + DBUG_ASSERT(cache); + rec_ptr= cache->get_rec_ref(rec_ptr); + } + if (rec_ptr[0] == 0) + { + rec_ptr[0]= 1; + first_inner->found= 1; + return TRUE; + } + return FALSE; +} + + +/* + Generate all full extensions for a partial join record in the buffer + + SYNOPSIS + generate_full_extensions() + rec_ptr pointer to the record from join buffer to generate extensions + + DESCRIPTION + The function first checks whether the current record of 'join_tab' matches + the partial join record from join buffer located at 'rec_ptr'. If it is the + case the function calls the join_tab->next_select method to generate + all full extension for this partial join match. + + RETURN + return one of enum_nested_loop_state. +*/ + +enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr) +{ + enum_nested_loop_state rc= NESTED_LOOP_OK; + + /* + Check whether the extended partial join record meets + the pushdown conditions. + */ + if (check_match(rec_ptr)) + { + int res= 0; + + if (!join_tab->check_weed_out_table || + !(res= do_sj_dups_weedout(join->thd, join_tab->check_weed_out_table))) + { + set_curr_rec_link(rec_ptr); + rc= (join_tab->next_select)(join, join_tab+1, 0); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + { + reset(TRUE); + return rc; + } + } + if (res == -1) + { + rc= NESTED_LOOP_ERROR; + return rc; + } + } + return rc; +} + + +/* + Check matching to a partial join record from the join buffer + + SYNOPSIS + check_match() + rec_ptr pointer to the record from join buffer to check matching to + + DESCRIPTION + The function checks whether the current record of 'join_tab' matches + the partial join record from join buffer located at 'rec_ptr'. If this is + the case and 'join_tab' is the last inner table of a semi-join or an outer + join the function turns on the match flag for the 'rec_ptr' record unless + it has been already set. + + NOTES + Setting the match flag on can trigger re-evaluation of pushdown conditions + for the record when join_tab is the last inner table of an outer join. + + RETURN + TRUE there is a match + FALSE there is no match +*/ + +inline bool JOIN_CACHE::check_match(uchar *rec_ptr) +{ + /* Check whether pushdown conditions are satisfied */ + if (join_tab->select && join_tab->select->skip_record(join->thd) < 1) + return FALSE; + + if (!join_tab->is_last_inner_table()) + return TRUE; + + /* + This is the last inner table of an outer join, + and maybe of other embedding outer joins, or + this is the last inner table of a semi-join. + */ + JOIN_TAB *first_inner= join_tab->get_first_inner_table(); + do + { + set_match_flag_if_none(first_inner, rec_ptr); + if (first_inner->check_only_first_match() && + !join_tab->first_inner) + return TRUE; + /* + This is the first match for the outer table row. + The function set_match_flag_if_none has turned the flag + first_inner->found on. The pushdown predicates for + inner tables must be re-evaluated with this flag on. + Note that, if first_inner is the first inner table + of a semi-join, but is not an inner table of an outer join + such that 'not exists' optimization can be applied to it, + the re-evaluation of the pushdown predicates is not needed. 
+ */ + for (JOIN_TAB *tab= first_inner; tab <= join_tab; tab++) + { + if (tab->select && tab->select->skip_record(join->thd) < 1) + return FALSE; + } + } + while ((first_inner= first_inner->first_upper) && + first_inner->last_inner == join_tab); + + return TRUE; +} + + +/* + Add null complements for unmatched outer records from join buffer + + SYNOPSIS + join_null_complements() + skip_last do not add null complements for the last record + + DESCRIPTION + This function is called only for inner tables of outer joins. + The function retrieves all rows from the join buffer and adds null + complements for those of them that do not have matches for outer + table records. + If the 'join_tab' is the last inner table of the embedding outer + join and the null complemented record satisfies the outer join + condition then the the corresponding match flag is turned on + unless it has been set earlier. This setting may trigger + re-evaluation of pushdown conditions for the record. + + NOTES + The same implementation of the virtual method join_null_complements + is used for JOIN_CACHE_BNL and JOIN_CACHE_BKA. + + RETURN + return one of enum_nested_loop_state. +*/ + +enum_nested_loop_state JOIN_CACHE::join_null_complements(bool skip_last) +{ + uint cnt; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool is_first_inner= join_tab == join_tab->first_unmatched; + bool is_last_inner= join_tab == join_tab->first_unmatched->last_inner; + + /* Return at once if there are no records in the join buffer */ + if (!records) + return NESTED_LOOP_OK; + + cnt= records - (is_key_access() ? 0 : test(skip_last)); + + /* This function may be called only for inner tables of outer joins */ + DBUG_ASSERT(join_tab->first_inner); + + for ( ; cnt; cnt--) + { + if (join->thd->killed) + { + /* The user has aborted the execution of the query */ + join->thd->send_kill_message(); + rc= NESTED_LOOP_KILLED; + goto finish; + } + /* Just skip the whole record if a match for it has been already found */ + if (!is_first_inner || !skip_record_if_match()) + { + get_record(); + /* The outer row is complemented by nulls for each inner table */ + restore_record(join_tab->table, s->default_values); + mark_as_null_row(join_tab->table); + /* Check all pushdown conditions attached to the inner table */ + join_tab->first_unmatched->found= 1; + if (join_tab->select && join_tab->select->skip_record(join->thd) < 1) + continue; + if (is_last_inner) + { + JOIN_TAB *first_upper= join_tab->first_unmatched->first_upper; + while (first_upper && first_upper->last_inner == join_tab) + { + set_match_flag_if_none(first_upper, get_curr_rec()); + for (JOIN_TAB* tab= first_upper; tab <= join_tab; tab++) + { + if (tab->select && tab->select->skip_record(join->thd) < 1) + goto next; + } + first_upper= first_upper->first_upper; + } + } + /* Find all matches for the remaining join tables */ + rc= (*join_tab->next_select)(join, join_tab+1, 0); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + { + reset(TRUE); + goto finish; + } + } + next: + ; + } + +finish: + return rc; +} + + +/* + Initialize retrieval of range sequence for BKA algorithm + + SYNOPSIS + bka_range_seq_init() + init_params pointer to the BKA join cache object + n_ranges the number of ranges obtained + flags combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY + + DESCRIPTION + The function interprets init_param as a pointer to a JOIN_CACHE_BKA + object. The function prepares for an iteration over the join keys + built for all records from the cache join buffer. 
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ init_param value that is to be used as a parameter of bka_range_seq_next()
+*/
+
+static
+range_seq_t bka_range_seq_init(void *init_param, uint n_ranges, uint flags)
+{
+ DBUG_ENTER("bka_range_seq_init");
+ JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) init_param;
+ cache->reset(0);
+ DBUG_RETURN((range_seq_t) init_param);
+}
+
+
+/*
+ Get the key over the next record from the join buffer used by BKA
+
+ SYNOPSIS
+ bka_range_seq_next()
+ seq the value returned by bka_range_seq_init
+ range OUT reference to the next range
+
+ DESCRIPTION
+ The function interprets seq as a pointer to a JOIN_CACHE_BKA
+ object. The function returns a pointer to the range descriptor
+ for the key built over the next record from the join buffer.
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ 0 ok, the range structure filled with info about the next key
+ 1 no more ranges
+*/
+
+static
+uint bka_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+ DBUG_ENTER("bka_range_seq_next");
+ JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+ TABLE_REF *ref= &cache->join_tab->ref;
+ key_range *start_key= &range->start_key;
+ if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
+ {
+ start_key->keypart_map= (1 << ref->key_parts) - 1;
+ start_key->flag= HA_READ_KEY_EXACT;
+ range->end_key= *start_key;
+ range->end_key.flag= HA_READ_AFTER_KEY;
+ range->ptr= (char *) cache->get_curr_rec();
+ range->range_flag= EQ_RANGE;
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Check whether range_info orders to skip the next record from BKA buffer
+
+ SYNOPSIS
+ bka_range_seq_skip_record()
+ seq value returned by bka_range_seq_init()
+ range_info information about the next range
+ rowid [NOT USED] rowid of the record to be checked
+
+
+ DESCRIPTION
+ The function interprets seq as a pointer to a JOIN_CACHE_BKA object.
+ The function returns TRUE if the record with this range_info
+ is to be filtered out from the stream of records returned by
+ multi_range_read_next().
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ 1 record with this range_info is to be filtered out from the stream
+ of records returned by multi_range_read_next()
+ 0 the record is to be left in the stream
+*/
+
+static
+bool bka_range_seq_skip_record(range_seq_t rseq, char *range_info, uchar *rowid)
+{
+ DBUG_ENTER("bka_range_seq_skip_record");
+ JOIN_CACHE_BKA *cache= (JOIN_CACHE_BKA *) rseq;
+ bool res= cache->get_match_flag_by_pos((uchar *) range_info);
+ DBUG_RETURN(res);
+}
+
+/*
+ Using BKA find matches from the next table for records from the join buffer
+
+ SYNOPSIS
+ join_matching_records()
+ skip_last do not look for matches for the last partial join record
+
+ DESCRIPTION
+ This function can be used only when the table join_tab can be accessed
+ by keys built over the fields of previous join tables.
+ The function retrieves all partial join records from the join buffer and
+ for each of them builds the key value to access join_tab, performs an index
+ look-up with this key and selects matching records yielded by this look-up.
+ If a match is found the function will call the sub_select function trying
+ to look for matches for the remaining join operations.
+ This function currently is called only from the function join_records.
+ It's assumed that this function is always called with the skip_last
+ parameter equal to false.
+
+ NOTES
+ The function produces all matching extensions for the records in the
+ join buffer following the path of the Batched Key Access algorithm.
+ When an outer join operation is performed all unmatched records from
+ the join buffer must be extended by null values. The function
+ join_null_complements serves this purpose.
+ The Batched Key Access algorithm assumes that key accesses are batched.
+ In other words it assumes that, first, either keys themselves or the
+ corresponding rowids (primary keys) are accumulated in a buffer, then
+ data rows from join_tab are fetched for all of them. When a row is
+ fetched it is always returned with a reference to the key by which it
+ has been accessed.
+ When key values are batched we can save on the number of server
+ requests for index lookups. For remote engines, like NDB cluster, it
+ essentially reduces the number of round trips between the server and
+ the engine when performing a join operation.
+ When the rowids for the keys are batched we can optimize the order
+ in which we fetch the data for these rowids. The performance benefits of
+ this optimization can be significant for such engines as MyISAM and InnoDB.
+ What exactly is batched is hidden behind the implementations of the
+ MRR handler interface that is supposed to be appropriately chosen
+ for each engine. If for an engine no specific implementation of the MRR
+ interface is supplied then the default implementation is used. This
+ implementation actually follows the path of the Nested Loops Join algorithm.
+ In this case the BKA join will surely demonstrate worse performance than
+ the NL join.
+
+ RETURN
+ return one of enum_nested_loop_state
+*/
+
+enum_nested_loop_state JOIN_CACHE_BKA::join_matching_records(bool skip_last)
+{
+ int error;
+ handler *file= join_tab->table->file;
+ enum_nested_loop_state rc= NESTED_LOOP_OK;
+ uchar *rec_ptr= 0;
+ bool check_only_first_match= join_tab->check_only_first_match();
+
+ /* Set functions to iterate over keys in the join buffer */
+
+ RANGE_SEQ_IF seq_funcs= { bka_range_seq_init,
+ bka_range_seq_next,
+ check_only_first_match ?
+ bka_range_seq_skip_record : 0,
+ join_tab->cache_idx_cond ?
+ bka_skip_index_tuple : 0 };
+
+ /* The value of skip_last must be always FALSE when this function is called */
+ DBUG_ASSERT(!skip_last);
+
+ /* Return at once if there are no records in the join buffer */
+ if (!records)
+ return NESTED_LOOP_OK;
+
+ rc= init_join_matching_records(&seq_funcs, records);
+ if (rc != NESTED_LOOP_OK)
+ goto finish;
+
+ while (!(error= file->multi_range_read_next((char **) &rec_ptr)))
+ {
+ if (join->thd->killed)
+ {
+ /* The user has aborted the execution of the query */
+ join->thd->send_kill_message();
+ rc= NESTED_LOOP_KILLED;
+ goto finish;
+ }
+ if (join_tab->keep_current_rowid)
+ join_tab->table->file->position(join_tab->table->record[0]);
+ /*
+ If only the first match is needed and it has been already found
+ for the associated partial join record then the returned candidate
+ is discarded.
+ */ + if (rc == NESTED_LOOP_OK && + (!check_only_first_match || !get_match_flag_by_pos(rec_ptr))) + { + get_record_by_pos(rec_ptr); + update_virtual_fields(join->thd, join_tab->table); + rc= generate_full_extensions(rec_ptr); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + } + + if (error > 0 && error != HA_ERR_END_OF_FILE) + return NESTED_LOOP_ERROR; +finish: + return end_join_matching_records(rc); +} + + + +/* + Prepare to search for records that match records from the join buffer + + SYNOPSIS + init_join_matching_records() + seq_funcs structure of range sequence interface + ranges number of keys/ranges in the sequence + + DESCRIPTION + This function calls the multi_range_read_init function to set up + the BKA process of generating the keys from the records in the join + buffer and looking for matching records from the table to be joined. + The function passes as a parameter a structure of functions that + implement the range sequence interface. This interface is used to + enumerate all generated keys and optionally to filter the matching + records returned by the multi_range_read_next calls from the + intended invocation of the join_matching_records method. The + multi_range_read_init function also receives the parameters for + MRR buffer to be used and flags specifying the mode in which + this buffer will be functioning. + The number of keys in the sequence expected by multi_range_read_init + is passed through the parameter ranges. + + RETURN + return one of enum_nested_loop_state +*/ + +enum_nested_loop_state +JOIN_CACHE_BKA::init_join_matching_records(RANGE_SEQ_IF *seq_funcs, uint ranges) +{ + int error; + handler *file= join_tab->table->file; + enum_nested_loop_state rc= NESTED_LOOP_OK; + + join_tab->table->null_row= 0; + + + /* Dynamic range access is never used with BKA */ + DBUG_ASSERT(join_tab->use_quick != 2); + + for (JOIN_TAB *tab =join->join_tab; tab != join_tab ; tab++) + { + tab->status= tab->table->status; + tab->table->status= 0; + } + + init_mrr_buff(); + + /* + Prepare to iterate over keys from the join buffer and to get + matching candidates obtained with MMR handler functions. + */ + if (!file->inited) + file->ha_index_init(join_tab->ref.key, 1); + if ((error= file->multi_range_read_init(seq_funcs, (void*) this, ranges, + mrr_mode, &mrr_buff))) + rc= error < 0 ? NESTED_LOOP_NO_MORE_ROWS: NESTED_LOOP_ERROR; + + return rc; +} + + +/* + Finish searching for records that match records from the join buffer + + SYNOPSIS + end_join_matching_records() + rc return code passed by the join_matching_records function + + DESCRIPTION + This function perform final actions on searching for all matches for + the records from the join buffer and building all full join extensions + of the records with these matches. + + RETURN + return code rc passed to the function as a parameter +*/ + +enum_nested_loop_state +JOIN_CACHE_BKA::end_join_matching_records(enum_nested_loop_state rc) +{ + for (JOIN_TAB *tab=join->join_tab; tab != join_tab ; tab++) + tab->table->status= tab->status; + return rc; +} + + +/* + Get the key built over the next record from BKA join buffer + + SYNOPSIS + get_next_key() + key pointer to the buffer where the key value is to be placed + + DESCRIPTION + The function reads key fields from the current record in the join buffer. + and builds the key value out of these fields that will be used to access + the 'join_tab' table. Some of key fields may belong to previous caches. 
+ They are accessed via record references to the record parts stored in the + previous join buffers. The other key fields always are placed right after + the flag fields of the record. + If the key is embedded, which means that its value can be read directly + from the join buffer, then *key is set to the beginning of the key in + this buffer. Otherwise the key is built in the join_tab->ref->key_buff. + The function returns the length of the key if it succeeds ro read it. + If is assumed that the functions starts reading at the position of + the record length which is provided for each records in a BKA cache. + After the key is built the 'pos' value points to the first position after + the current record. + The function returns 0 if the initial position is after the beginning + of the record fields for last record from the join buffer. + + RETURN + length of the key value - if the starting value of 'pos' points to + the position before the fields for the last record, + 0 - otherwise. +*/ + +uint JOIN_CACHE_BKA::get_next_key(uchar ** key) +{ + uint len; + uint32 rec_len; + uchar *init_pos; + JOIN_CACHE *cache; + + if (pos > last_rec_pos || !records) + return 0; + + /* Any record in a BKA cache is prepended with its length */ + DBUG_ASSERT(with_length); + + /* Read the length of the record */ + rec_len= get_rec_length(pos); + pos+= size_of_rec_len; + init_pos= pos; + + /* Read a reference to the previous cache if any */ + if (prev_cache) + pos+= prev_cache->get_size_of_rec_offset(); + + curr_rec_pos= pos; + + /* Read all flag fields of the record */ + read_flag_fields(); + + if (use_emb_key) + { + /* An embedded key is taken directly from the join buffer */ + *key= pos; + len= emb_key_length; + } + else + { + /* Read key arguments from previous caches if there are any such fields */ + if (external_key_arg_fields) + { + uchar *rec_ptr= curr_rec_pos; + uint key_arg_count= external_key_arg_fields; + CACHE_FIELD **copy_ptr= blob_ptr-key_arg_count; + for (cache= prev_cache; key_arg_count; cache= cache->prev_cache) + { + uint len= 0; + DBUG_ASSERT(cache); + rec_ptr= cache->get_rec_ref(rec_ptr); + while (!cache->referenced_fields) + { + cache= cache->prev_cache; + DBUG_ASSERT(cache); + rec_ptr= cache->get_rec_ref(rec_ptr); + } + while (key_arg_count && + cache->read_referenced_field(*copy_ptr, rec_ptr, &len)) + { + copy_ptr++; + --key_arg_count; + } + } + } + + /* + Read the other key arguments from the current record. The fields for + these arguments are always first in the sequence of the record's fields. + */ + CACHE_FIELD *copy= field_descr+flag_fields; + CACHE_FIELD *copy_end= copy+local_key_arg_fields; + bool blob_in_rec_buff= blob_data_is_in_rec_buff(curr_rec_pos); + for ( ; copy < copy_end; copy++) + read_record_field(copy, blob_in_rec_buff); + + /* Build the key over the fields read into the record buffers */ + TABLE_REF *ref= &join_tab->ref; + cp_buffer_from_ref(join->thd, join_tab->table, ref); + *key= ref->key_buff; + len= ref->key_length; + } + + pos= init_pos+rec_len; + + return len; +} + + +/* + Initialize a BKA_UNIQUE cache + + SYNOPSIS + init() + + DESCRIPTION + The function initializes the cache structure. It supposed to be called + right after a constructor for the JOIN_CACHE_BKA_UNIQUE. + The function allocates memory for the join buffer and for descriptors of + the record fields stored in the buffer. + The function also estimates the number of hash table entries in the hash + table to be used and initializes this hash table. 
+ + NOTES + The code of this function should have been included into the constructor + code itself. However the new operator for the class JOIN_CACHE_BKA_UNIQUE + would never fail while memory allocation for the join buffer is not + absolutely unlikely to fail. That's why this memory allocation has to be + placed in a separate function that is called in a couple with a cache + constructor. + It is quite natural to put almost all other constructor actions into + this function. + + RETURN + 0 initialization with buffer allocations has been succeeded + 1 otherwise +*/ + +int JOIN_CACHE_BKA_UNIQUE::init() +{ + int rc= 0; + TABLE_REF *ref= &join_tab->ref; + + DBUG_ENTER("JOIN_CACHE_BKA_UNIQUE::init"); + + hash_table= 0; + key_entries= 0; + + if ((rc= JOIN_CACHE_BKA::init())) + DBUG_RETURN (rc); + + key_length= ref->key_length; + + /* Take into account a reference to the next record in the key chain */ + pack_length+= get_size_of_rec_offset(); + + /* Calculate the minimal possible value of size_of_key_ofs greater than 1 */ + uint max_size_of_key_ofs= max(2, get_size_of_rec_offset()); + for (size_of_key_ofs= 2; + size_of_key_ofs <= max_size_of_key_ofs; + size_of_key_ofs+= 2) + { + key_entry_length= get_size_of_rec_offset() + // key chain header + size_of_key_ofs + // reference to the next key + (use_emb_key ? get_size_of_rec_offset() : key_length); + + uint n= buff_size / (pack_length+key_entry_length+size_of_key_ofs); + + /* + TODO: Make a better estimate for this upper bound of + the number of records in in the join buffer. + */ + uint max_n= buff_size / (pack_length-length+ + key_entry_length+size_of_key_ofs); + + hash_entries= (uint) (n / 0.7); + + if (offset_size(max_n*key_entry_length) <= + size_of_key_ofs) + break; + } + + /* Initialize the hash table */ + hash_table= buff + (buff_size-hash_entries*size_of_key_ofs); + cleanup_hash_table(); + curr_key_entry= hash_table; + + pack_length+= key_entry_length; + pack_length_with_blob_ptrs+= get_size_of_rec_offset() + key_entry_length; + + rec_fields_offset= get_size_of_rec_offset()+get_size_of_rec_length()+ + (prev_cache ? prev_cache->get_size_of_rec_offset() : 0); + + data_fields_offset= 0; + if (use_emb_key) + { + CACHE_FIELD *copy= field_descr; + CACHE_FIELD *copy_end= copy+flag_fields; + for ( ; copy < copy_end; copy++) + data_fields_offset+= copy->length; + } + + DBUG_RETURN(rc); +} + + +/* + Reset the JOIN_CACHE_BKA_UNIQUE buffer for reading/writing + + SYNOPSIS + reset() + for_writing if it's TRUE the function reset the buffer for writing + + DESCRIPTION + This implementation of the virtual function reset() resets the join buffer + of the JOIN_CACHE_BKA_UNIQUE class for reading or writing. + Additionally to what the default implementation does this function + cleans up the hash table allocated within the buffer. + + RETURN + none +*/ + +void JOIN_CACHE_BKA_UNIQUE::reset(bool for_writing) +{ + this->JOIN_CACHE::reset(for_writing); + if (for_writing && hash_table) + cleanup_hash_table(); + curr_key_entry= hash_table; +} + +/* + Add a record into the JOIN_CACHE_BKA_UNIQUE buffer + + SYNOPSIS + put_record() + + DESCRIPTION + This implementation of the virtual function put_record writes the next + matching record into the join buffer of the JOIN_CACHE_BKA_UNIQUE class. + Additionally to what the default implementation does this function + performs the following. + It extracts from the record the key value used in lookups for matching + records and searches for this key in the hash tables from the join cache. 
+ If it finds the key in the hash table it joins the record to the chain + of records with this key. If the key is not found in the hash table the + key is placed into it and a chain containing only the newly added record + is attached to the key entry. The key value is either placed in the hash + element added for the key or, if the use_emb_key flag is set, remains in + the record from the partial join. + + RETURN + TRUE if it has been decided that it should be the last record + in the join buffer, + FALSE otherwise +*/ + +bool JOIN_CACHE_BKA_UNIQUE::put_record() +{ + bool is_full; + uchar *key; + uint key_len= key_length; + uchar *key_ref_ptr; + uchar *link= 0; + TABLE_REF *ref= &join_tab->ref; + uchar *next_ref_ptr= pos; + + pos+= get_size_of_rec_offset(); + /* Write the record into the join buffer */ + if (prev_cache) + link= prev_cache->get_curr_rec_link(); + write_record_data(link, &is_full); + + if (use_emb_key) + key= get_curr_emb_key(); + else + { + /* Build the key over the fields read into the record buffers */ + cp_buffer_from_ref(join->thd, join_tab->table, ref); + key= ref->key_buff; + } + + /* Look for the key in the hash table */ + if (key_search(key, key_len, &key_ref_ptr)) + { + uchar *last_next_ref_ptr; + /* + The key is found in the hash table. + Add the record to the circular list of the records attached to this key. + Below 'rec' is the record to be added into the record chain for the found + key, 'key_ref' points to a flatten representation of the st_key_entry + structure that contains the key and the head of the record chain. + */ + last_next_ref_ptr= get_next_rec_ref(key_ref_ptr+get_size_of_key_offset()); + /* rec->next_rec= key_entry->last_rec->next_rec */ + memcpy(next_ref_ptr, last_next_ref_ptr, get_size_of_rec_offset()); + /* key_entry->last_rec->next_rec= rec */ + store_next_rec_ref(last_next_ref_ptr, next_ref_ptr); + /* key_entry->last_rec= rec */ + store_next_rec_ref(key_ref_ptr+get_size_of_key_offset(), next_ref_ptr); + } + else + { + /* + The key is not found in the hash table. + Put the key into the join buffer linking it with the keys for the + corresponding hash entry. Create a circular list with one element + referencing the record and attach the list to the key in the buffer. + */ + uchar *cp= last_key_entry; + cp-= get_size_of_rec_offset()+get_size_of_key_offset(); + store_next_key_ref(key_ref_ptr, cp); + store_null_key_ref(cp); + store_next_rec_ref(next_ref_ptr, next_ref_ptr); + store_next_rec_ref(cp+get_size_of_key_offset(), next_ref_ptr); + if (use_emb_key) + { + cp-= get_size_of_rec_offset(); + store_emb_key_ref(cp, key); + } + else + { + cp-= key_len; + memcpy(cp, key, key_len); + } + last_key_entry= cp; + /* Increment the counter of key_entries in the hash table */ + key_entries++; + } + return is_full; +} + + +/* + Read the next record from the JOIN_CACHE_BKA_UNIQUE buffer + + SYNOPSIS + get_record() + + DESCRIPTION + Additionally to what the default implementation of the virtual + function get_record does this implementation skips the link element + used to connect the records with the same key into a chain. 
+
+ RETURN
+ TRUE - there are no more records to read from the join buffer
+ FALSE - otherwise
+*/
+
+bool JOIN_CACHE_BKA_UNIQUE::get_record()
+{
+ pos+= get_size_of_rec_offset();
+ return this->JOIN_CACHE::get_record();
+}
+
+
+/*
+ Skip record from the JOIN_CACHE_BKA_UNIQUE join buffer if its match flag is on
+
+ SYNOPSIS
+ skip_record_if_match()
+
+ DESCRIPTION
+ This implementation of the virtual function skip_record_if_match does
+ the same as the default implementation does, but it takes into account
+ the link element used to connect the records with the same key into a chain.
+
+ RETURN
+ TRUE - the match flag is on and the record has been skipped
+ FALSE - the match flag is off
+*/
+
+bool JOIN_CACHE_BKA_UNIQUE::skip_record_if_match()
+{
+ uchar *save_pos= pos;
+ pos+= get_size_of_rec_offset();
+ if (!this->JOIN_CACHE::skip_record_if_match())
+ {
+ pos= save_pos;
+ return FALSE;
+ }
+ return TRUE;
+}
+
+
+/*
+ Search for a key in the hash table of the join buffer
+
+ SYNOPSIS
+ key_search()
+ key pointer to the key value
+ key_len key value length
+ key_ref_ptr OUT position of the reference to the next key from
+ the hash element for the found key, or
+ a position where the reference to the hash
+ element for the key is to be added in the
+ case when the key has not been found
+
+ DESCRIPTION
+ The function looks for a key in the hash table of the join buffer.
+ If the key is found the function returns the position of the reference
+ to the next key from the hash element for the given key.
+ Otherwise the function returns the position where the reference to the
+ newly created hash element for the given key is to be added.
+
+ RETURN
+ TRUE - the key is found in the hash table
+ FALSE - otherwise
+*/
+
+bool JOIN_CACHE_BKA_UNIQUE::key_search(uchar *key, uint key_len,
+ uchar **key_ref_ptr)
+{
+ bool is_found= FALSE;
+ uint idx= get_hash_idx(key, key_length);
+ uchar *ref_ptr= hash_table+size_of_key_ofs*idx;
+ while (!is_null_key_ref(ref_ptr))
+ {
+ uchar *next_key;
+ ref_ptr= get_next_key_ref(ref_ptr);
+ next_key= use_emb_key ? get_emb_key(ref_ptr-get_size_of_rec_offset()) :
+ ref_ptr-key_length;
+
+ if (memcmp(next_key, key, key_len) == 0)
+ {
+ is_found= TRUE;
+ break;
+ }
+ }
+ *key_ref_ptr= ref_ptr;
+ return is_found;
+}
+
+
+/*
+ Calculate the hash value for a key in the hash table of the join buffer
+
+ SYNOPSIS
+ get_hash_idx()
+ key pointer to the key value
+ key_len key value length
+
+ DESCRIPTION
+ The function calculates an index of the hash entry in the hash table
+ of the join buffer for the given key.
+
+ RETURN
+ the calculated index of the hash entry for the given key.
+*/
+
+uint JOIN_CACHE_BKA_UNIQUE::get_hash_idx(uchar* key, uint key_len)
+{
+ ulong nr= 1;
+ ulong nr2= 4;
+ uchar *pos= key;
+ uchar *end= key+key_len;
+ for (; pos < end ; pos++)
+ {
+ nr^= (ulong) ((((uint) nr & 63)+nr2)*((uint) *pos))+ (nr << 8);
+ nr2+= 3;
+ }
+ return nr % hash_entries;
+}
+
+
+/*
+ Clean up the hash table of the join buffer
+
+ SYNOPSIS
+ cleanup_hash_table()
+
+ DESCRIPTION
+ The function cleans up the hash table in the join buffer removing all
+ hash elements from the table.
+
+ RETURN
+ none
+*/
+
+void JOIN_CACHE_BKA_UNIQUE::cleanup_hash_table()
+{
+ last_key_entry= hash_table;
+ bzero(hash_table, (buff+buff_size)-hash_table);
+ key_entries= 0;
+}
+
+
+/*
+ Initialize retrieval of range sequence for BKA_UNIQUE algorithm
+
+ SYNOPSIS
+ bka_unique_range_seq_init()
+ init_param pointer to the BKA_UNIQUE join cache object
+ n_ranges the number of ranges obtained
+ flags combination of HA_MRR_SINGLE_POINT, HA_MRR_FIXED_KEY
+
+ DESCRIPTION
+ The function interprets init_param as a pointer to a JOIN_CACHE_BKA_UNIQUE
+ object. The function prepares for an iteration over the unique join keys
+ built over the records from the cache join buffer.
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ init_param value that is to be used as a parameter of
+ bka_unique_range_seq_next()
+*/
+
+static
+range_seq_t bka_unique_range_seq_init(void *init_param, uint n_ranges,
+ uint flags)
+{
+ DBUG_ENTER("bka_unique_range_seq_init");
+ JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) init_param;
+ cache->reset(0);
+ DBUG_RETURN((range_seq_t) init_param);
+}
+
+
+/*
+ Get the key over the next record from the join buffer used by BKA_UNIQUE
+
+ SYNOPSIS
+ bka_unique_range_seq_next()
+ seq value returned by bka_unique_range_seq_init()
+ range OUT reference to the next range
+
+ DESCRIPTION
+ The function interprets seq as a pointer to the JOIN_CACHE_BKA_UNIQUE
+ object. The function returns a pointer to the range descriptor
+ for the next unique key built over records from the join buffer.
+
+ NOTE
+ This function is used only as a callback function.
+
+ RETURN
+ 0 ok, the range structure filled with info about the next key
+ 1 no more ranges
+*/
+
+static
+uint bka_unique_range_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range)
+{
+ DBUG_ENTER("bka_unique_range_seq_next");
+ JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq;
+ TABLE_REF *ref= &cache->join_tab->ref;
+ key_range *start_key= &range->start_key;
+ if ((start_key->length= cache->get_next_key((uchar **) &start_key->key)))
+ {
+ start_key->keypart_map= (1 << ref->key_parts) - 1;
+ start_key->flag= HA_READ_KEY_EXACT;
+ range->end_key= *start_key;
+ range->end_key.flag= HA_READ_AFTER_KEY;
+ range->ptr= (char *) cache->get_curr_key_chain();
+ range->range_flag= EQ_RANGE;
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(1);
+}
+
+
+/*
+ Check whether range_info orders to skip the next record from BKA_UNIQUE buffer
+
+ SYNOPSIS
+ bka_unique_range_seq_skip_record()
+ seq value returned by bka_unique_range_seq_init()
+ range_info information about the next range
+ rowid [NOT USED] rowid of the record to be checked
+
+ DESCRIPTION
+ The function interprets seq as a pointer to the JOIN_CACHE_BKA_UNIQUE
+ object. The function returns TRUE if the record with this range_info
+ is to be filtered out from the stream of records returned by
+ multi_range_read_next().
+
+ NOTE
+ This function is used only as a callback function.
+ + RETURN + 1 record with this range_info is to be filtered out from the stream + of records returned by multi_range_read_next() + 0 the record is to be left in the stream +*/ + +static +bool bka_unique_range_seq_skip_record(range_seq_t rseq, char *range_info, + uchar *rowid) +{ + DBUG_ENTER("bka_unique_range_seq_skip_record"); + JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq; + bool res= cache->check_all_match_flags_for_key((uchar *) range_info); + DBUG_RETURN(res); +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + JOIN_CACHE_BKA_UNIQUE::skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + See JOIN_CACHE_BKA::skip_index_tuple(). + This function is the variant for use with + JOIN_CACHE_BKA_UNIQUE. The difference from JOIN_CACHE_BKA case is that + there may be multiple previous table record combinations that share the + same key, i.e. they map to the same MRR range. + As a consequence, we need to loop through all previous table record + combinations that match the given MRR range key range_info until we find + one that satisfies the index condition. + + NOTE + Possible optimization: + Before we unpack the record from a previous table + check if this table is used in the condition. + If so then unpack the record otherwise skip the unpacking. + This should be done by a special virtual method + get_partial_record_by_pos(). + + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise + + +*/ + +bool JOIN_CACHE_BKA_UNIQUE::skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("JOIN_CACHE_BKA_UNIQUE::skip_index_tuple"); + JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq; + uchar *last_rec_ref_ptr= cache->get_next_rec_ref((uchar*) range_info); + uchar *next_rec_ref_ptr= last_rec_ref_ptr; + do + { + next_rec_ref_ptr= cache->get_next_rec_ref(next_rec_ref_ptr); + uchar *rec_ptr= next_rec_ref_ptr + cache->rec_fields_offset; + cache->get_record_by_pos(rec_ptr); + if (join_tab->cache_idx_cond->val_int()) + DBUG_RETURN(FALSE); + } while(next_rec_ref_ptr != last_rec_ref_ptr); + DBUG_RETURN(TRUE); +} + + +/* + Check if the record combination matches the index condition + + SYNOPSIS + bka_unique_skip_index_tuple() + rseq Value returned by bka_range_seq_init() + range_info MRR range association data + + DESCRIPTION + This is wrapper for JOIN_CACHE_BKA_UNIQUE::skip_index_tuple method, + see comments there. + + NOTE + This function is used as a RANGE_SEQ_IF::skip_index_tuple callback. + + RETURN + 0 The record combination satisfies the index condition + 1 Otherwise +*/ + +static +bool bka_unique_skip_index_tuple(range_seq_t rseq, char *range_info) +{ + DBUG_ENTER("bka_unique_skip_index_tuple"); + JOIN_CACHE_BKA_UNIQUE *cache= (JOIN_CACHE_BKA_UNIQUE *) rseq; + DBUG_RETURN(cache->skip_index_tuple(rseq, range_info)); +} + + +/* + Using BKA_UNIQUE find matches from the next table for records from join buffer + + SYNOPSIS + join_matching_records() + skip_last do not look for matches for the last partial join record + + DESCRIPTION + This function can be used only when the table join_tab can be accessed + by keys built over the fields of previous join tables. + The function retrieves all keys from the hash table of the join buffer + built for partial join records from the buffer. For each of these keys + the function performs an index lookup and tries to match records yielded + by this lookup with records from the join buffer attached to the key. 
+ If a match is found the function will call the sub_select function trying + to look for matches for the remaining join operations. + This function does not assume that matching records are necessarily + returned with references to the keys by which they were found. If the call + of the function multi_range_read_init returns flags with + HA_MRR_NO_ASSOCIATION then a search for the key built from the returned + record is carried on. The search is performed by probing in in the hash + table of the join buffer. + This function currently is called only from the function join_records. + It's assumed that this function is always called with the skip_last + parameter equal to false. + + RETURN + return one of enum_nested_loop_state +*/ + +enum_nested_loop_state +JOIN_CACHE_BKA_UNIQUE::join_matching_records(bool skip_last) +{ + int error; + uchar *key_chain_ptr; + handler *file= join_tab->table->file; + enum_nested_loop_state rc= NESTED_LOOP_OK; + bool check_only_first_match= join_tab->check_only_first_match(); + bool no_association= test(mrr_mode & HA_MRR_NO_ASSOCIATION); + + /* Set functions to iterate over keys in the join buffer */ + RANGE_SEQ_IF seq_funcs= { bka_unique_range_seq_init, + bka_unique_range_seq_next, + check_only_first_match && !no_association ? + bka_unique_range_seq_skip_record : 0, + join_tab->cache_idx_cond ? + bka_unique_skip_index_tuple : 0 }; + + /* The value of skip_last must be always FALSE when this function is called */ + DBUG_ASSERT(!skip_last); + + /* Return at once if there are no records in the join buffer */ + if (!records) + return NESTED_LOOP_OK; + + rc= init_join_matching_records(&seq_funcs, key_entries); + if (rc != NESTED_LOOP_OK) + goto finish; + + while (!(error= file->multi_range_read_next((char **) &key_chain_ptr))) + { + if (no_association) + { + uchar *key_ref_ptr; + TABLE *table= join_tab->table; + TABLE_REF *ref= &join_tab->ref; + KEY *keyinfo= table->key_info+ref->key; + /* + Build the key value out of the record returned by the call of + multi_range_read_next in the record buffer + */ + key_copy(ref->key_buff, table->record[0], keyinfo, ref->key_length); + /* Look for this key in the join buffer */ + if (!key_search(ref->key_buff, ref->key_length, &key_ref_ptr)) + continue; + key_chain_ptr= key_ref_ptr+get_size_of_key_offset(); + } + + uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr); + uchar *next_rec_ref_ptr= last_rec_ref_ptr; + do + { + next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr); + uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset; + + if (join->thd->killed) + { + /* The user has aborted the execution of the query */ + join->thd->send_kill_message(); + rc= NESTED_LOOP_KILLED; + goto finish; + } + /* + If only the first match is needed and it has been already found + for the associated partial join record then the returned candidate + is discarded. 
+ */ + if (rc == NESTED_LOOP_OK && + (!check_only_first_match || !get_match_flag_by_pos(rec_ptr))) + { + get_record_by_pos(rec_ptr); + update_virtual_fields(join->thd, join_tab->table); + rc= generate_full_extensions(rec_ptr); + if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS) + goto finish; + } + } + while (next_rec_ref_ptr != last_rec_ref_ptr); + } + + if (error > 0 && error != HA_ERR_END_OF_FILE) + return NESTED_LOOP_ERROR; +finish: + return end_join_matching_records(rc); +} + + +/* + Check whether all records in a key chain have their match flags set on + + SYNOPSIS + check_all_match_flags_for_key() + key_chain_ptr + + DESCRIPTION + This function retrieves records in the given circular chain and checks + whether their match flags are set on. The parameter key_chain_ptr shall + point to the position in the join buffer storing the reference to the + last element of this chain. + + RETURN + TRUE if each retrieved record has its match flag set on + FALSE otherwise +*/ + +bool JOIN_CACHE_BKA_UNIQUE::check_all_match_flags_for_key(uchar *key_chain_ptr) +{ + uchar *last_rec_ref_ptr= get_next_rec_ref(key_chain_ptr); + uchar *next_rec_ref_ptr= last_rec_ref_ptr; + do + { + next_rec_ref_ptr= get_next_rec_ref(next_rec_ref_ptr); + uchar *rec_ptr= next_rec_ref_ptr+rec_fields_offset; + if (!get_match_flag_by_pos(rec_ptr)) + return FALSE; + } + while (next_rec_ref_ptr != last_rec_ref_ptr); + return TRUE; +} + + +/* + Get the next key built for the records from BKA_UNIQUE join buffer + + SYNOPSIS + get_next_key() + key pointer to the buffer where the key value is to be placed + + DESCRIPTION + The function reads the next key value stored in the hash table of the + join buffer. Depending on the value of the use_emb_key flag of the + join cache the value is read either from the table itself or from + the record field where it occurs. + + RETURN + length of the key value - if the starting value of 'cur_key_entry' refers + to the position after that referred by the the value of 'last_key_entry' + 0 - otherwise. +*/ + +uint JOIN_CACHE_BKA_UNIQUE::get_next_key(uchar ** key) +{ + if (curr_key_entry == last_key_entry) + return 0; + + curr_key_entry-= key_entry_length; + + *key = use_emb_key ? get_emb_key(curr_key_entry) : curr_key_entry; + + DBUG_ASSERT(*key >= buff && *key < hash_table); + + return key_length; +} + + +/**************************************************************************** + * Join cache module end + ****************************************************************************/ |
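The hashing scheme used by JOIN_CACHE_BKA_UNIQUE above can be modelled outside the server. The following standalone C++ sketch is illustrative only and is not part of this patch: the names toy_hash_idx and ToyKeyTable are assumptions, and STL containers stand in for the real in-buffer key entries and record chains. It reuses the byte-mixing loop from get_hash_idx and shows how records that share a key value end up chained under a single key entry, which is what allows one MRR lookup per distinct key to be fanned out to all matching partial join records.

// Standalone illustration of the BKA_UNIQUE hashing/chaining idea (toy types,
// not the server's join buffer layout).
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Same byte-mixing function as JOIN_CACHE_BKA_UNIQUE::get_hash_idx, reduced
// modulo the number of hash entries.
static unsigned toy_hash_idx(const unsigned char *key, size_t key_len,
                             unsigned hash_entries)
{
  unsigned long nr= 1, nr2= 4;
  for (const unsigned char *p= key, *end= key + key_len; p < end; p++)
  {
    nr^= (unsigned long) ((((unsigned) nr & 63) + nr2) * ((unsigned) *p)) + (nr << 8);
    nr2+= 3;
  }
  return (unsigned) (nr % hash_entries);
}

// Toy model of "one key entry -> chain of record references": records sharing
// a key are grouped so that one lookup per distinct key covers all of them.
struct ToyKeyTable
{
  unsigned buckets;
  // bucket index -> list of (key value, chained record numbers)
  std::vector<std::vector<std::pair<std::string, std::vector<int> > > > table;

  explicit ToyKeyTable(unsigned n) : buckets(n), table(n) {}

  void put_record(const std::string &key, int rec_no)
  {
    unsigned idx= toy_hash_idx((const unsigned char *) key.data(),
                               key.size(), buckets);
    for (auto &entry : table[idx])
      if (entry.first == key)              // key already present: extend chain
      {
        entry.second.push_back(rec_no);
        return;
      }
    table[idx].push_back({key, {rec_no}}); // new key entry, one-record chain
  }
};

int main()
{
  ToyKeyTable cache(11);
  cache.put_record("key-A", 0);
  cache.put_record("key-B", 1);
  cache.put_record("key-A", 2);            // joins the existing chain for key-A
  for (unsigned b= 0; b < cache.buckets; b++)
    for (auto &entry : cache.table[b])
    {
      std::cout << entry.first << " -> records:";
      for (int r : entry.second)
        std::cout << ' ' << r;
      std::cout << '\n';
    }
  return 0;
}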