diff options
author | Jacob Mathew <jacob.mathew@mariadb.com> | 2018-02-19 19:19:03 -0800 |
---|---|---|
committer | Jacob Mathew <jacob.mathew@mariadb.com> | 2018-02-19 21:36:19 -0800 |
commit | 884b83e28fed3d6de2593d5b4121dc23fce7f921 (patch) | |
tree | f56e51de430a099966b3a99cefd9a78ea6057ce8 | |
parent | d23fcc427cb4010b33defc69547089afeb9af811 (diff) | |
download | mariadb-git-bb-10.3-MDEV-14500.tar.gz |
MDEV-14500: Support engines without rnd_pos() and engines with inefficient rnd_pos()
(branch: bb-10.3-MDEV-14500)
Some engines have not implemented rnd_pos(). There are other engines whose
implementation of rnd_pos() is inherently inefficient. Spider is such an
engine, whose implementation of rnd_pos() needs to access a table on a remote
data node to retrieve a single table row.
To address these limitations, a new temporary table has been added to filesort.
When filesort sequentially reads the table being sorted, each row is written to
the filesort temp table in addition to being copied to the sort buffer.
Subsequent calls to rnd_pos() will then access the table row in the filesort
temp table instead of in the table being sorted.
The following logic changes incorporate the new filesort temp table into the
server:
- A new handler method to determine whether a call to the engine's
rnd_pos() is expensive. The default return value is FALSE. Engines without
rnd_pos() or with an inefficient rnd_pos() should return TRUE.
- Create the filesort temp table only if:
  - There are no add-on columns for filesort; and
  - The engine's implementation of rnd_pos() is expensive.
- Write to the temp table each row that is read from the table being sorted.
- Do subsequent row retrievals that use rnd_pos() on the temp table instead of
on the table being sorted. Upon retrieving a row from the temp table, copy
its column values to the record of the table being sorted.
- Upon completion of retrieval of the sorted result rows, delete the filesort
temp table and free the memory allocated for using it.
The logic changes are in the following areas:
- Table handler.
- Partition engine.
- Spider engine.
- Filesort.
- Read record manager.
Note that these changes only address the use of rnd_pos() by filesort. They do
not address the use of rnd_pos() in other areas such as:
- Quick select.
- Insert.
- Update.
- Window functions.
- Multi Range Read.
Author:
Jacob Mathew.
Reviewer:
Sergei Golubchik.
-rw-r--r-- | sql/filesort.cc | 355 | ||||
-rw-r--r-- | sql/filesort.h | 12 | ||||
-rw-r--r-- | sql/ha_partition.cc | 30 | ||||
-rw-r--r-- | sql/ha_partition.h | 1 | ||||
-rw-r--r-- | sql/handler.h | 15 | ||||
-rw-r--r-- | sql/records.cc | 275 | ||||
-rw-r--r-- | sql/records.h | 31 | ||||
-rw-r--r-- | sql/sql_class.h | 1 | ||||
-rw-r--r-- | sql/sql_sort.h | 19 | ||||
-rw-r--r-- | storage/spider/ha_spider.h | 9 |
10 files changed, 668 insertions, 80 deletions
diff --git a/sql/filesort.cc b/sql/filesort.cc index 00dfa08bba8..86626c85ebf 100644 --- a/sql/filesort.cc +++ b/sql/filesort.cc @@ -55,6 +55,11 @@ static bool write_keys(Sort_param *param, SORT_INFO *fs_info, uint count, IO_CACHE *buffer_file, IO_CACHE *tempfile); static void make_sortkey(Sort_param *param, uchar *to, uchar *ref_pos); static void register_used_fields(Sort_param *param); +static void register_tmp_table_fields(SORT_INFO *fs_info); +static int create_fs_tmp_table_if_needed(THD *thd, Sort_param *param, + SORT_INFO *fs_info); +static int write_fs_tmp_table_row(THD *thd, SORT_INFO *fs_info); +static void free_fs_tmp_table(THD *thd, SORT_INFO *fs_info); static bool save_index(Sort_param *param, uint count, SORT_INFO *table_sort); static uint suffix_length(ulong string_length); @@ -63,7 +68,8 @@ static uint sortlength(THD *thd, SORT_FIELD *sortorder, uint s_length, static SORT_ADDON_FIELD *get_addon_fields(ulong max_length_for_sort_data, Field **ptabfield, uint sortlength, - LEX_STRING *addon_buf); + LEX_STRING *addon_buf, + uint *ptmp_fields); static void unpack_addon_fields(struct st_sort_addon_field *addon_field, uchar *buff, uchar *buff_end); static bool check_if_pq_applicable(Sort_param *param, SORT_INFO *info, @@ -72,7 +78,8 @@ static bool check_if_pq_applicable(Sort_param *param, SORT_INFO *info, void Sort_param::init_for_filesort(uint sortlen, TABLE *table, ulong max_length_for_sort_data, - ha_rows maxrows, bool sort_positions) + ha_rows maxrows, bool sort_positions, + uint *tmp_fields) { DBUG_ASSERT(addon_field == 0 && addon_buf.length == 0); @@ -86,7 +93,8 @@ void Sort_param::init_for_filesort(uint sortlen, TABLE *table, to sorted fields and get its total length in addon_buf.length */ addon_field= get_addon_fields(max_length_for_sort_data, - table->field, sort_length, &addon_buf); + table->field, sort_length, &addon_buf, + tmp_fields); } if (addon_field) { @@ -189,10 +197,11 @@ SORT_INFO *filesort(THD *thd, TABLE *table, Filesort *filesort, 
sort->found_rows= HA_POS_ERROR; param.init_for_filesort(sortlength(thd, filesort->sortorder, s_length, - &multi_byte_charset), + &multi_byte_charset), table, thd->variables.max_length_for_sort_data, - max_rows, filesort->sort_positions); + max_rows, filesort->sort_positions, + &sort->tmp_fields); sort->addon_buf= param.addon_buf; sort->addon_field= param.addon_field; @@ -273,7 +282,7 @@ SORT_INFO *filesort(THD *thd, TABLE *table, Filesort *filesort, num_rows= find_all_keys(thd, ¶m, select, sort, &buffpek_pointers, - &tempfile, + &tempfile, pq.is_initialized() ? &pq : NULL, &sort->found_rows); if (num_rows == HA_POS_ERROR) @@ -345,7 +354,7 @@ SORT_INFO *filesort(THD *thd, TABLE *table, Filesort *filesort, } error= 0; - err: +err: my_free(param.tmp_buffer); if (!subselect || !subselect->is_uncacheable()) { @@ -700,7 +709,7 @@ static void dbug_print_record(TABLE *table, bool print_rowid) static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, SORT_INFO *fs_info, - IO_CACHE *buffpek_pointers, + IO_CACHE *buffpek_pointers, IO_CACHE *tempfile, Bounded_queue<uchar, uchar> *pq, ha_rows *found_rows) @@ -709,8 +718,10 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, uint idx,indexpos,ref_length; uchar *ref_pos,*next_pos,ref_buff[MAX_REFLENGTH]; my_off_t record; + TABLE *fs_tmp_table; TABLE *sort_form; handler *file; + handler *ref_file; MY_BITMAP *save_read_set, *save_write_set, *save_vcol_set; Item *sort_cond; ha_rows retval; @@ -728,9 +739,24 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, quick_select=select && select->quick; record=0; *found_rows= 0; + + if (!quick_select) + { + /* + Potentially create a temp table to avoid rnd_pos() calls on the + table to be sorted + */ + if (create_fs_tmp_table_if_needed(thd, param, fs_info)) + goto err; + fs_tmp_table= fs_info->fs_tmp_table; + } + else + fs_tmp_table= NULL; + ref_file= (fs_tmp_table ? 
fs_tmp_table->file : file); + flag= ((file->ha_table_flags() & HA_REC_NOT_IN_SEQ) || quick_select); if (flag) - ref_pos= &file->ref[0]; + ref_pos= &ref_file->ref[0]; next_pos=ref_pos; DBUG_EXECUTE_IF("show_explain_in_find_all_keys", @@ -760,6 +786,8 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, register_used_fields(param); if (quick_select) select->quick->add_used_key_part_to_set(); + else + register_tmp_table_fields(fs_info); sort_cond= (!select ? 0 : (!select->pre_idx_push_select_cond ? @@ -786,18 +814,25 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, } else /* Not quick-select */ { + error= file->ha_rnd_next(sort_form->record[0]); + if (!flag) { - error= file->ha_rnd_next(sort_form->record[0]); - if (!flag) - { - my_store_ptr(ref_pos,ref_length,record); // Position to row - record+= sort_form->s->db_record_offset; - } - else if (!error) - file->position(sort_form->record[0]); + my_store_ptr(ref_pos,ref_length,record); // Position to row + record+= sort_form->s->db_record_offset; + } + else if (!error) + { + /* + If filesort is using a temp table, write the row to the temp table, + and save its row position + */ + if (fs_tmp_table) + error= write_fs_tmp_table_row(thd, fs_info); + else + file->position(sort_form->record[0]); } if (error && error != HA_ERR_RECORD_DELETED) - break; + break; } if (thd->check_killed()) @@ -904,7 +939,19 @@ static ha_rows find_all_keys(THD *thd, Sort_param *param, SQL_SELECT *select, DBUG_RETURN(retval); err: + if (!quick_select) + { + if (file->inited) + { + (void)file->extra(HA_EXTRA_NO_CACHE); /* End cacheing of records */ + if (!next_pos) + file->ha_rnd_end(); + } + if (fs_tmp_table && fs_tmp_table->file->inited) + fs_tmp_table->file->ha_rnd_end(); + } sort_form->column_bitmaps_set(save_read_set, save_write_set, save_vcol_set); + free_fs_tmp_table(thd, fs_info); DBUG_RETURN(HA_POS_ERROR); } /* find_all_keys */ @@ -1295,6 +1342,199 @@ static void 
register_used_fields(Sort_param *param) } +/** + Register the filesort temp table fields in the sorted table's read set + + @param fs_info Filesort information that includes the filesort + temp table and an array of its fields. +*/ + +static void register_tmp_table_fields(SORT_INFO *fs_info) +{ + DBUG_ENTER("register_tmp_table_fields"); + + if (fs_info->fs_tmp_table) + { + Copy_field *tmp_field= fs_info->tmp_field; + + for (; tmp_field->from_field; tmp_field++) + { + /* Register the corresponding field in the original table */ + bitmap_fast_test_and_set(tmp_field->from_field->table->read_set, + tmp_field->from_field->field_index); + } + } + + DBUG_VOID_RETURN; +} + + +/** + Potentially create a filesort temp table to avoid rnd_pos() calls on the + table to be sorted + + @param param Sort information and parameters. + @param fs_info Filesort information that includes the filesort + temp table and an array of its fields. + + @retval + 0 Temp table creation succeeded or temp table is + unnecessary. + @retval + 1 Memory allocation failed or temp table creation failed. 
+*/ + +static int create_fs_tmp_table_if_needed(THD *thd, Sort_param *param, + SORT_INFO *fs_info) +{ + TABLE *table= param->sort_form; + DBUG_ENTER("create_fs_tmp_table_if_needed"); + + if (fs_info->tmp_fields && table->file->ha_is_rnd_pos_expensive()) + { + /* + Create a filesort temp table to avoid expensive rnd_pos() calls + on the table to be sorted + */ + Copy_field *tmp_field; + List<Item> tmp_field_list; + Item_field *item_field; + Field **pfield; + Field *field; + + /* Allocate memory for the temp table field array */ + tmp_field= (Copy_field *) + my_malloc(sizeof(Copy_field) * (fs_info->tmp_fields + 1), + MYF(MY_WME | MY_THREAD_SPECIFIC)); + if (!tmp_field) + DBUG_RETURN(1); + fs_info->tmp_field= tmp_field; + + /* Initialize the field array elements */ + for (pfield= table->field; (field= *pfield); pfield++) + { + if (!bitmap_is_set(table->read_set, field->field_index)) + continue; + /* + All fields referenced in the query are to be written + to the temp table + */ + tmp_field->from_field= field; + tmp_field++; + } + tmp_field->from_field= 0; // Put end marker + + /* Create the temp table field list */ + for (tmp_field= fs_info->tmp_field; tmp_field->from_field; tmp_field++) + { + item_field= new (thd->mem_root) Item_field(thd, tmp_field->from_field); + if (tmp_field_list.push_back(item_field, thd->mem_root)) + { + free_fs_tmp_table(thd, fs_info); + DBUG_RETURN(1); + } + } + + TMP_TABLE_PARAM tmp_table_param; + tmp_table_param.init(); + tmp_table_param.field_count= fs_info->tmp_fields; + tmp_table_param.table_charset= table->s->table_charset; + tmp_table_param.skip_create_table= TRUE; + + /* Create the filesort temp table */ + TABLE *fs_tmp_table= create_tmp_table(thd, &tmp_table_param, + tmp_field_list, + NULL, + FALSE, + FALSE, + thd->variables.option_bits | + TMP_TABLE_ALL_COLUMNS, + param->max_rows, &empty_clex_str, + FALSE, FALSE); + + if (!fs_tmp_table) + { + free_fs_tmp_table(thd, fs_info); + DBUG_RETURN(1); + } + + /* Fill in the pointers to the 
temp table fields in the field array */ + for (tmp_field= fs_info->tmp_field, pfield= fs_tmp_table->field; + (field= *pfield); + tmp_field++, pfield++) + tmp_field->set(field, tmp_field->from_field, FALSE); + fs_info->fs_tmp_table= fs_tmp_table; + + /* Fix up the sort buffer parameters */ + param->update_ref_length(fs_tmp_table->file->ref_length); + + fs_tmp_table->prepare_for_position(); + } + + DBUG_RETURN(0); +} + + +/** + Copy column values from the current row of the table being sorted + to the current filesort temp table row. Write the row to the + filesort temp table. + + @param fs_info Filesort information that includes the filesort + temp table and an array of its fields. + + @retval + 0 Temp table row was created and successfully written. + @retval + <> 0 Temp table write failed. +*/ + +static int write_fs_tmp_table_row(THD *thd, SORT_INFO *fs_info) +{ + TABLE *fs_tmp_table= fs_info->fs_tmp_table; + Copy_field *tmp_field; + int error; + DBUG_ENTER("write_fs_tmp_table_row"); + + /* + Copy each column value present in the temp table + from the table being sorted + */ + for (tmp_field= fs_info->tmp_field; tmp_field->from_field; tmp_field++) + tmp_field->do_copy(tmp_field); + + /* Write the temp table row */ + error= fs_tmp_table->file->ha_write_tmp_row(fs_tmp_table->record[0]); + if (error) + DBUG_RETURN(error); + + /* Save the written row's position in the temp table */ + fs_tmp_table->file->position(fs_tmp_table->record[0]); + DBUG_RETURN(0); +} + + +/** + Free the filesort temp table and its information structures. + + @param thd Thread handle. + @param fs_info Filesort information that includes the filesort + temp table and an array of its fields. 
+*/ + +static void free_fs_tmp_table(THD *thd, SORT_INFO *fs_info) +{ + if (fs_info->fs_tmp_table) + { + free_tmp_table(thd, fs_info->fs_tmp_table); + fs_info->fs_tmp_table= NULL; + } + my_free(fs_info->tmp_field); + fs_info->tmp_field= NULL; + fs_info->tmp_fields= 0; +} + + static bool save_index(Sort_param *param, uint count, SORT_INFO *table_sort) { @@ -2010,6 +2250,8 @@ sortlength(THD *thd, SORT_FIELD *sortorder, uint s_length, @param ptabfield Array of references to the table fields @param sortlength Total length of sorted fields @param [out] addon_buf Buffer to us for appended fields + @param [out] ptmp_fields Pointer to the number of temp table fields, + if any @note The null bits for the appended values are supposed to be put together @@ -2023,20 +2265,25 @@ sortlength(THD *thd, SORT_FIELD *sortorder, uint s_length, static SORT_ADDON_FIELD * get_addon_fields(ulong max_length_for_sort_data, - Field **ptabfield, uint sortlength, LEX_STRING *addon_buf) + Field **ptabfield, uint sortlength, LEX_STRING *addon_buf, + uint *ptmp_fields) { Field **pfield; Field *field; - SORT_ADDON_FIELD *addonf; - uint length= 0; - uint fields= 0; + SORT_ADDON_FIELD *addonf= NULL; + uint addon_length= 0; + uint addon_fields= 0; uint null_fields= 0; + uint nonaddon_fields= 0; + bool has_blob_field= FALSE; MY_BITMAP *read_set= (*ptabfield)->table->read_set; DBUG_ENTER("get_addon_fields"); /* - If there is a reference to a field in the query add it - to the the set of appended fields. + If there is a reference to a field in the query that is not a blob/text + field, add it to the the set of appended fields. + We cannot use addons if there is a blob/text field. + All referenced fields are written to the temp table. Note for future refinement: This this a too strong condition. 
Actually we need only the fields referred in the @@ -2051,34 +2298,52 @@ get_addon_fields(ulong max_length_for_sort_data, { if (!bitmap_is_set(read_set, field->field_index)) continue; - if (field->flags & BLOB_FLAG) - DBUG_RETURN(0); - length+= field->max_packed_col_length(field->pack_length()); - if (field->maybe_null()) - null_fields++; - fields++; - } - if (!fields) - DBUG_RETURN(0); - length+= (null_fields+7)/8; + if (has_blob_field) + nonaddon_fields++; + else if (field->flags & BLOB_FLAG) + { + has_blob_field= TRUE; + nonaddon_fields= (addon_fields + 1); + null_fields= 0; + addon_fields= 0; + addon_length= 0; + } + else + { + addon_length+= field->max_packed_col_length(field->pack_length()); + addon_fields++; + if (field->maybe_null()) + null_fields++; + } + } + if (nonaddon_fields) + *ptmp_fields= addon_fields + nonaddon_fields; // Total number of fields + else + *ptmp_fields= 0; // Temp table is unnecessary + if (!addon_fields) + DBUG_RETURN(NULL); - if (length+sortlength > max_length_for_sort_data || + addon_length+= (null_fields+7)/8; + + if (addon_length+sortlength > max_length_for_sort_data || !my_multi_malloc(MYF(MY_WME | MY_THREAD_SPECIFIC), - &addonf, sizeof(SORT_ADDON_FIELD) * (fields+1), - &addon_buf->str, length, + &addonf, sizeof(SORT_ADDON_FIELD) * (addon_fields+1), + &addon_buf->str, addon_length, NullS)) + { + *ptmp_fields= addon_fields + nonaddon_fields; // Total number of fields + DBUG_RETURN(NULL); + } - DBUG_RETURN(0); - - addon_buf->length= length; - length= (null_fields+7)/8; + addon_buf->length= addon_length; + addon_length= (null_fields+7)/8; null_fields= 0; for (pfield= ptabfield; (field= *pfield) ; pfield++) { if (!bitmap_is_set(read_set, field->field_index)) continue; addonf->field= field; - addonf->offset= length; + addonf->offset= addon_length; if (field->maybe_null()) { addonf->null_offset= null_fields/8; @@ -2091,13 +2356,13 @@ get_addon_fields(ulong max_length_for_sort_data, addonf->null_bit= 0; } addonf->length= 
field->max_packed_col_length(field->pack_length()); - length+= addonf->length; + addon_length+= addonf->length; addonf++; } addonf->field= 0; // Put end marker - DBUG_PRINT("info",("addon_length: %d",length)); - DBUG_RETURN(addonf-fields); + DBUG_PRINT("info",("addon_length: %d",addon_length)); + DBUG_RETURN(addonf-addon_fields); } diff --git a/sql/filesort.h b/sql/filesort.h index bd1d81f91ef..60b21ffab47 100644 --- a/sql/filesort.h +++ b/sql/filesort.h @@ -27,6 +27,7 @@ class Filesort_tracker; struct SORT_FIELD; typedef struct st_order ORDER; class JOIN; +class Copy_field; /** @@ -87,7 +88,8 @@ class SORT_INFO public: SORT_INFO() - :addon_field(0), record_pointers(0) + :addon_field(0), record_pointers(0), + fs_tmp_table(NULL), tmp_field(NULL), tmp_fields(0) { buffpek.str= 0; my_b_clear(&io_cache); @@ -101,6 +103,9 @@ public: my_free(record_pointers); my_free(buffpek.str); my_free(addon_field); + fs_tmp_table= NULL; // Freed in end_read_record() + tmp_field= NULL; // Freed in end_read_record() + tmp_fields= 0; } void reset() @@ -119,6 +124,11 @@ public: /* To unpack back */ void (*unpack)(struct st_sort_addon_field *, uchar *, uchar *); uchar *record_pointers; /* If sorted in memory */ + TABLE *fs_tmp_table; /* Optional temp table used by filesort to */ + /* eliminate rnd_pos() calls to the table + /* being sorted */ + Copy_field *tmp_field; /* Filesort temp table field array */ + uint tmp_fields; /* Number of filesort temp table fields */ /* How many rows in final result. 
Also how many rows in record_pointers, if used diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc index 4ec6f3dfa38..852bcefed3c 100644 --- a/sql/ha_partition.cc +++ b/sql/ha_partition.cc @@ -5185,6 +5185,36 @@ int ha_partition::rnd_pos_by_record(uchar *record) } +/* + Determine whether a call to rnd_pos() is expensive + + SYNOPSIS + is_rnd_pos_expensive() + + RETURN VALUE + FALSE No inherent inefficiencies in rnd_pos() + TRUE rnd_pos() call is inefficient + + DESCRIPTION + Some engines, such as Spider, have an inefficient implementation of + rnd_pos(), because they need to do a remote access to fetch the + single table row. Determine whether the rnd_pos() implementation + for any of the partitions is expensive. +*/ + +bool ha_partition::is_rnd_pos_expensive() +{ + DBUG_ENTER("ha_partition::is_rnd_pos_expensive"); + uint i; + + for (i= 0; i < m_tot_parts; i++) + if (m_file[i]->ha_is_rnd_pos_expensive()) + DBUG_RETURN(TRUE); + + DBUG_RETURN(FALSE); +} + + /**************************************************************************** MODULE index scan ****************************************************************************/ diff --git a/sql/ha_partition.h b/sql/ha_partition.h index 30dd24b6014..1d3f9c14c79 100644 --- a/sql/ha_partition.h +++ b/sql/ha_partition.h @@ -691,6 +691,7 @@ public: virtual int rnd_next(uchar * buf); virtual int rnd_pos(uchar * buf, uchar * pos); virtual int rnd_pos_by_record(uchar *record); + virtual bool is_rnd_pos_expensive(); virtual void position(const uchar * record); /* diff --git a/sql/handler.h b/sql/handler.h index a96e98c2f84..98e02eaa244 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -3536,6 +3536,17 @@ public: position(record); return rnd_pos(record, ref); } + /** + Some engines are unable to provide an efficient implementation + for rnd_pos(). Spider is such an engine, as a call to rnd_pos() + needs to access a table on a remote data node to retrieve the + single table row. 
+ */ + virtual bool is_rnd_pos_expensive() + { + /* Engine's rnd_pos() implementation has no inherent inefficiencies */ + return FALSE; + } virtual int read_first_row(uchar *buf, uint primary_key); public: @@ -3545,6 +3556,10 @@ public: int ha_rnd_next(uchar *buf); int ha_rnd_pos(uchar *buf, uchar *pos); inline int ha_rnd_pos_by_record(uchar *buf); + inline bool ha_is_rnd_pos_expensive() + { + return is_rnd_pos_expensive(); + } inline int ha_read_first_row(uchar *buf, uint primary_key); /** diff --git a/sql/records.cc b/sql/records.cc index ac84ca84ab6..b016cd97783 100644 --- a/sql/records.cc +++ b/sql/records.cc @@ -38,16 +38,21 @@ static int rr_quick(READ_RECORD *info); int rr_sequential(READ_RECORD *info); static int rr_from_tempfile(READ_RECORD *info); +static int rr_from_tempfile_and_copy(READ_RECORD *info); static int rr_unpack_from_tempfile(READ_RECORD *info); static int rr_unpack_from_buffer(READ_RECORD *info); int rr_from_pointers(READ_RECORD *info); +int rr_from_pointers_and_copy(READ_RECORD *info); static int rr_from_cache(READ_RECORD *info); +static int rr_from_cache_and_copy(READ_RECORD *info); static int init_rr_cache(THD *thd, READ_RECORD *info); static int rr_cmp(uchar *a,uchar *b); static int rr_index_first(READ_RECORD *info); static int rr_index_last(READ_RECORD *info); static int rr_index(READ_RECORD *info); static int rr_index_desc(READ_RECORD *info); +static int init_copy(READ_RECORD *info); +static void end_copy(READ_RECORD *info); /** @@ -77,6 +82,11 @@ bool init_read_record_idx(READ_RECORD *info, THD *thd, TABLE *table, bzero((char*) info,sizeof(*info)); info->thd= thd; info->table= table; + info->copy_table= NULL; + info->tmp_field= NULL; + info->tmp_fields= 0; + info->free_tmp_table= FALSE; + info->addon_field= NULL; info->record= table->record[0]; info->print_error= print_error; info->unlock_row= rr_unlock_row; @@ -188,13 +198,39 @@ bool init_read_record(READ_RECORD *info,THD *thd, TABLE *table, bool disable_rr_cache) { IO_CACHE 
*tempfile; - SORT_ADDON_FIELD *addon_field= filesort ? filesort->addon_field : 0; + SORT_ADDON_FIELD *addon_field; + bool has_fs_tmp_table; DBUG_ENTER("init_read_record"); bzero((char*) info,sizeof(*info)); info->thd=thd; + if (filesort) + { + if (filesort->fs_tmp_table) + { + has_fs_tmp_table= TRUE; + info->copy_table= table; + table= filesort->fs_tmp_table; + } + else + { + has_fs_tmp_table= FALSE; + info->copy_table= NULL; + } + info->tmp_field= filesort->tmp_field; + info->tmp_fields= filesort->tmp_fields; + addon_field= filesort->addon_field; + } + else + { + has_fs_tmp_table= FALSE; + info->copy_table= NULL; + info->tmp_field= NULL; + info->tmp_fields= 0; + addon_field= NULL; + } + info->free_tmp_table= has_fs_tmp_table; info->table=table; - info->forms= &info->table; /* Only one table */ info->addon_field= addon_field; if ((table->s->tmp_table == INTERNAL_TMP_TABLE || @@ -230,13 +266,18 @@ bool init_read_record(READ_RECORD *info,THD *thd, TABLE *table, { DBUG_PRINT("info",("using rr_from_tempfile")); info->read_record_func= - addon_field ? rr_unpack_from_tempfile : rr_from_tempfile; + addon_field ? rr_unpack_from_tempfile : + has_fs_tmp_table ? 
rr_from_tempfile_and_copy : + rr_from_tempfile; info->io_cache= tempfile; reinit_io_cache(info->io_cache,READ_CACHE,0L,0,0); - info->ref_pos=table->file->ref; + info->ref_pos= table->file->ref; if (!table->file->inited) if (table->file->ha_rnd_init_with_error(0)) DBUG_RETURN(1); + if (has_fs_tmp_table) + if (init_copy(info)) + DBUG_RETURN(1); /* addon_field is checked because if we use addon fields, @@ -245,22 +286,26 @@ bool init_read_record(READ_RECORD *info,THD *thd, TABLE *table, */ if (!disable_rr_cache && !addon_field && - thd->variables.read_rnd_buff_size && - !(table->file->ha_table_flags() & HA_FAST_KEY_READ) && - (table->db_stat & HA_READ_ONLY || - table->reginfo.lock_type <= TL_READ_NO_INSERT) && - (ulonglong) table->s->reclength* (table->file->stats.records+ - table->file->stats.deleted) > - (ulonglong) MIN_FILE_LENGTH_TO_USE_ROW_CACHE && - info->io_cache->end_of_file/info->ref_length * table->s->reclength > - (my_off_t) MIN_ROWS_TO_USE_TABLE_CACHE && - !table->s->blob_fields && + thd->variables.read_rnd_buff_size && + !(table->file->ha_table_flags() & HA_FAST_KEY_READ) && + (table->db_stat & HA_READ_ONLY || + table->reginfo.lock_type <= TL_READ_NO_INSERT) && + (ulonglong) table->s->reclength* + (table->file->stats.records+ + table->file->stats.deleted) > + (ulonglong) MIN_FILE_LENGTH_TO_USE_ROW_CACHE && + info->io_cache->end_of_file/info->ref_length * + table->s->reclength > + (my_off_t) MIN_ROWS_TO_USE_TABLE_CACHE && + !table->s->blob_fields && info->ref_length <= MAX_REFLENGTH) { if (! init_rr_cache(thd, info)) { - DBUG_PRINT("info",("using rr_from_cache")); - info->read_record_func= rr_from_cache; + info->read_record_func= + has_fs_tmp_table ? 
rr_from_cache_and_copy : + rr_from_cache; + DBUG_PRINT("info",("using rr_from_cache")); } } } @@ -272,13 +317,19 @@ bool init_read_record(READ_RECORD *info,THD *thd, TABLE *table, else if (filesort && filesort->record_pointers) { DBUG_PRINT("info",("using record_pointers")); - if (table->file->ha_rnd_init_with_error(0)) - DBUG_RETURN(1); + if (!table->file->inited) + if (table->file->ha_rnd_init_with_error(0)) + DBUG_RETURN(1); info->cache_pos= filesort->record_pointers; info->cache_end= (info->cache_pos+ filesort->return_rows * info->ref_length); info->read_record_func= - addon_field ? rr_unpack_from_buffer : rr_from_pointers; + addon_field ? rr_unpack_from_buffer : + has_fs_tmp_table ? rr_from_pointers_and_copy : + rr_from_pointers; + if (has_fs_tmp_table) + if (init_copy(info)) + DBUG_RETURN(1); } else if (table->file->keyread_enabled()) { @@ -300,11 +351,11 @@ bool init_read_record(READ_RECORD *info,THD *thd, TABLE *table, DBUG_RETURN(1); /* We can use record cache if we don't update dynamic length tables */ if (!table->no_cache && - (use_record_cache > 0 || - (int) table->reginfo.lock_type <= (int) TL_READ_HIGH_PRIORITY || - !(table->s->db_options_in_use & HA_OPTION_PACK_RECORD) || - (use_record_cache < 0 && - !(table->file->ha_table_flags() & HA_NOT_DELETE_WITH_CACHE)))) + (use_record_cache > 0 || + (int) table->reginfo.lock_type <= (int) TL_READ_HIGH_PRIORITY || + !(table->s->db_options_in_use & HA_OPTION_PACK_RECORD) || + (use_record_cache < 0 && + !(table->file->ha_table_flags() & HA_NOT_DELETE_WITH_CACHE)))) (void) table->file->extra_opt(HA_EXTRA_CACHE, thd->variables.read_buff_size); } @@ -333,6 +384,15 @@ void end_read_record(READ_RECORD *info) (void) info->table->file->extra(HA_EXTRA_NO_CACHE); if (info->read_record_func != rr_quick) // otherwise quick_range does it (void) info->table->file->ha_index_or_rnd_end(); + if (info->free_tmp_table) + { + free_tmp_table(info->thd, info->table); + end_copy(info); + my_free(info->tmp_field); + info->tmp_field= 
NULL; + info->tmp_fields= 0; + info->free_tmp_table= FALSE; + } info->table=0; } } @@ -521,7 +581,35 @@ static int rr_from_tempfile(READ_RECORD *info) /** Read a result set record from a temporary file after sorting. - The function first reads the next sorted record from the temporary file. + The function first reads the next sorted record from the temporary file + into a buffer. If successful, it copies the fields to the + table being sorted. + + @param info Reference to the context including record descriptors + + @retval + 0 Record successfully read. + @retval + -1 No more records to read or record read failed. +*/ + +int rr_from_tempfile_and_copy(READ_RECORD *info) +{ + int error; + if ((error= rr_from_tempfile(info))) + return error; + + for (Copy_field *cp= info->copy_field; cp != info->copy_field_end; cp++) + (*cp->do_copy)(cp); + + return error; +} + + +/** + Read a result set record from a temporary file after sorting. + + The function first reads the next sorted record from the temporary file into a buffer. If a success it calls a callback function that unpacks the fields values use in the result set from this buffer into their positions in the regular record buffer. @@ -569,6 +657,35 @@ int rr_from_pointers(READ_RECORD *info) return tmp; } + +/** + Read a result set record from a temporary file after sorting. + + The function first reads the next sorted record from the temporary file + into a buffer. If successful, it copies the fields to the + table being sorted. + + @param info Reference to the context including record descriptors + + @retval + 0 Record successfully read. + @retval + -1 No more records to read or record read failed. +*/ + +int rr_from_pointers_and_copy(READ_RECORD *info) +{ + int error; + if ((error= rr_from_pointers(info))) + return error; + + for (Copy_field *cp= info->copy_field; cp != info->copy_field_end; cp++) + (*cp->do_copy)(cp); + + return error; +} + + /** Read a result set record from a buffer after sorting. 
@@ -703,6 +820,114 @@ static int rr_from_cache(READ_RECORD *info) } /* rr_from_cache */ +/** + Read a result set record from cache after sorting. + + The function first reads the next sorted record from cache. + If successful, it copies the fields to the table being sorted. + + @param info Reference to the context including record descriptors + + @retval + 0 Record successfully read. + @retval + -1 No more records to read or record read failed. +*/ + +int rr_from_cache_and_copy(READ_RECORD *info) +{ + int error; + if ((error= rr_from_cache(info))) + return error; + + for (Copy_field *cp= info->copy_field; cp != info->copy_field_end; cp++) + (*cp->do_copy)(cp); + + return error; +} + + +/** + Set up for copying the fields of the current row + from the filesort temp table to the table being sorted. + + @param info Reference to the context including record descriptors + + @retval + 0 Success. + @retval + 1 Memory allocation failure. +*/ + +static int init_copy(READ_RECORD *info) +{ + TABLE *table= info->copy_table; + Copy_field *tmp_field; + Copy_field *copy_field; + MY_BITMAP *write_set; + my_bitmap_map *column_bitmap= NULL; + + /* Allocate the memory for the copy_field descriptors */ + copy_field= (Copy_field *) my_malloc(sizeof(Copy_field) * info->tmp_fields, + MYF(MY_WME | MY_THREAD_SPECIFIC)); + if (!copy_field) + return 1; + info->copy_field= copy_field; + + /* Allocate the memory for the updated table write set */ + if (!(write_set= (MY_BITMAP *) + my_malloc(sizeof(MY_BITMAP), + MYF(MY_WME | MY_THREAD_SPECIFIC)))) + { + my_free(copy_field); + return 1; + } + /* Initialize the column bitmap for the updated table write set */ + my_bitmap_init(write_set, column_bitmap, table->s->fields, FALSE); + info->save_write_set= table->write_set; + table->column_bitmaps_set_no_signal(table->read_set, write_set); + + /* + Each column value present in the temp table needs to be copied + to the table being sorted + */ + for (tmp_field= info->tmp_field; tmp_field->from_field; 
tmp_field++) + { + bitmap_fast_test_and_set(table->write_set, + tmp_field->from_field->field_index); + copy_field->set(tmp_field->from_field, tmp_field->to_field, FALSE); + copy_field++; + } + table->file->column_bitmaps_signal(); + info->copy_field_end= copy_field; + + return 0; +} + + +/** + Do cleanup at the completion of copying field values from the + filesort temp table to the table being sorted. + + @param info Reference to the context including record descriptors +*/ + +static void end_copy(READ_RECORD *info) +{ + TABLE *table= info->copy_table; + MY_BITMAP *write_set= table->write_set; + + table->column_bitmaps_set(table->read_set, info->save_write_set); + + my_bitmap_free(write_set); + my_free(write_set); + my_free(info->copy_field); + info->copy_table= NULL; + info->save_write_set= NULL; + info->copy_field= info->copy_field_end= NULL; +} + + static int rr_cmp(uchar *a,uchar *b) { if (a[0] != b[0]) diff --git a/sql/records.h b/sql/records.h index 940c88ca0c7..0ffd08e320c 100644 --- a/sql/records.h +++ b/sql/records.h @@ -27,6 +27,8 @@ class SQL_SELECT; class Copy_field; class SORT_INFO; +#include "my_bitmap.h" + struct READ_RECORD; void end_read_record(READ_RECORD *info); @@ -53,7 +55,9 @@ struct READ_RECORD TABLE *table; /* Head-form */ //handler *file; - TABLE **forms; /* head and ref forms */ + TABLE *copy_table; /* Original table that a */ + /* filesort temp table */ + /* row is copied to */ Unlock_row_func unlock_row; Read_func read_record_func; THD *thd; @@ -61,25 +65,38 @@ struct READ_RECORD uint cache_records; uint ref_length,struct_length,reclength,rec_cache_size,error_offset; uint index; - uchar *ref_pos; /* pointer to form->refpos */ + uint tmp_fields; /* Number of filesort temp + table fields */ + MY_BITMAP *save_write_set; /* Original write set when */ + /* records are read from */ + /* the filesort temp table */ + /* and copied to the */ + /* original table */ + uchar *ref_pos; /* Pointer to form->refpos */ uchar *record; uchar *rec_buf; /* 
to read field values after filesort */ uchar *cache,*cache_pos,*cache_end,*read_positions; - struct st_sort_addon_field *addon_field; /* Pointer to the fields info */ + struct st_sort_addon_field *addon_field; /* Pointer to the fields info */ + Copy_field *tmp_field; /* Filesort temp table */ + /* field array */ struct st_io_cache *io_cache; - bool print_error, ignore_not_found_rows; + bool print_error, ignore_not_found_rows, free_tmp_table; void (*unpack)(struct st_sort_addon_field *, uchar *, uchar *); int read_record() { return read_record_func(this); } - /* + /* SJ-Materialization runtime may need to read fields from the materialized - table and unpack them into original table fields: + table and unpack them into original table fields. + Read following a filesort may need to read fields from its temp table + and unpack them into the corresponding original table fields. */ Copy_field *copy_field; Copy_field *copy_field_end; + public: - READ_RECORD() : table(NULL), cache(NULL) {} + READ_RECORD() + : table(NULL), cache(NULL), copy_field(NULL), copy_field_end(NULL) {} ~READ_RECORD() { end_read_record(this); } }; diff --git a/sql/sql_class.h b/sql/sql_class.h index 4722f3f5989..df333ae87cb 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -5767,7 +5767,6 @@ class user_var_entry user_var_entry *get_variable(HASH *hash, LEX_CSTRING *name, bool create_if_not_exists); -class SORT_INFO; class multi_delete :public select_result_interceptor { TABLE_LIST *delete_tables, *table_being_deleted; diff --git a/sql/sql_sort.h b/sql/sql_sort.h index d57239671a8..2ca9c383035 100644 --- a/sql/sql_sort.h +++ b/sql/sql_sort.h @@ -93,7 +93,24 @@ public: } void init_for_filesort(uint sortlen, TABLE *table, ulong max_length_for_sort_data, - ha_rows maxrows, bool sort_positions); + ha_rows maxrows, bool sort_positions, + uint *tmp_fields); + void update_ref_length(uint new_ref_length) + { + if (!addon_field) + { + if (ref_length) + { + res_length-= ref_length; + sort_length-= 
ref_length; + rec_length-= ref_length; + } + res_length+= new_ref_length; + sort_length+= new_ref_length; + rec_length+= new_ref_length; + ref_length= new_ref_length; + } + } }; diff --git a/storage/spider/ha_spider.h b/storage/spider/ha_spider.h index b79e1b89fbf..5b200b31d0c 100644 --- a/storage/spider/ha_spider.h +++ b/storage/spider/ha_spider.h @@ -429,6 +429,15 @@ public: KEY_MULTI_RANGE **found_range_p ); #endif + /** + Spider's implementation of rnd_pos() is inherently inefficient. + A call to rnd_pos() needs to access a table on a remote data node + to retrieve the single table row. + */ + virtual bool is_rnd_pos_expensive() + { + return TRUE; + } int rnd_init( bool scan ); |