summaryrefslogtreecommitdiff
path: root/sql/sql_select.h
diff options
context:
space:
mode:
authorIgor Babaev <igor@askmonty.org>2016-02-09 12:35:59 -0800
committerIgor Babaev <igor@askmonty.org>2016-02-09 12:35:59 -0800
commit2cfc450bf78c2d951729d1a0e8f731c0d987b1d5 (patch)
tree6c15f411927c9da723265d5f9891592390e97cea /sql/sql_select.h
parent7b50447aa6d051b8d14bb01ef14802cb8ffee223 (diff)
downloadmariadb-git-2cfc450bf78c2d951729d1a0e8f731c0d987b1d5.tar.gz
This is the consolidated patch for mdev-8646:bb-10.2-mdev8646
"Re-factor the code for post-join operations". The patch mainly contains the code ported from mysql-5.6 and created for two essential architectural changes: 1. WL#5558: Resolve ORDER BY execution method at the optimization stage 2. WL#6071: Inline tmp tables into the nested loops algorithm The first task was implemented for mysql-5.6 by Ole John Aske. It allows to make all decisions on ORDER BY operation at the optimization stage. The second task implemented for mysql-5.6 by Evgeny Potemkin adds JOIN_TAB nodes for post-join operations that require temporary tables. It allows to execute these operations within the nested loops algorithm that used to be used before this task only for join queries. Besides these task moves all planning on the execution of these operations from the execution phase to the optimization phase. Some other re-factoring changes of mysql-5.6 were pulled in, mainly because it was easier to pull them in than roll them back. In particular all changes concerning Ref_ptr_array were incorporated. The port required some changes in the MariaDB code that concerned the functionality of EXPLAIN and ANALYZE. This was done mainly by Sergey Petrunia.
Diffstat (limited to 'sql/sql_select.h')
-rw-r--r--sql/sql_select.h330
1 files changed, 234 insertions, 96 deletions
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 1b1bb6ded71..59f29239f5e 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -32,7 +32,9 @@
#include "sql_array.h" /* Array */
#include "records.h" /* READ_RECORD */
#include "opt_range.h" /* SQL_SELECT, QUICK_SELECT_I */
+#include "filesort.h"
+typedef struct st_join_table JOIN_TAB;
/* Values in optimize */
#define KEY_OPTIMIZE_EXISTS 1
@@ -184,7 +186,7 @@ enum sj_strategy_enum
typedef enum_nested_loop_state
(*Next_select_func)(JOIN *, struct st_join_table *, bool);
-Next_select_func setup_end_select_func(JOIN *join);
+Next_select_func setup_end_select_func(JOIN *join, JOIN_TAB *tab);
int rr_sequential(READ_RECORD *info);
int rr_sequential_and_unpack(READ_RECORD *info);
@@ -198,9 +200,11 @@ int rr_sequential_and_unpack(READ_RECORD *info);
class JOIN_CACHE;
class SJ_TMP_TABLE;
class JOIN_TAB_RANGE;
+class AGGR_OP;
+class Filesort;
typedef struct st_join_table {
- st_join_table() {} /* Remove gcc warning */
+ st_join_table() {}
TABLE *table;
KEYUSE *keyuse; /**< pointer to first used key */
KEY *hj_key; /**< descriptor of the used best hash join key
@@ -260,6 +264,7 @@ typedef struct st_join_table {
*/
uint packed_info;
+ // READ_RECORD::Setup_func materialize_table;
READ_RECORD::Setup_func read_first_record;
Next_select_func next_select;
READ_RECORD read_record;
@@ -346,6 +351,7 @@ typedef struct st_join_table {
*/
Item *cache_idx_cond;
SQL_SELECT *cache_select;
+ AGGR_OP *aggr;
JOIN *join;
/*
Embedding SJ-nest (may be not the direct parent), or NULL if none.
@@ -412,6 +418,39 @@ typedef struct st_join_table {
/* NestedOuterJoins: Bitmap of nested joins this table is part of */
nested_join_map embedding_map;
+ /* Tmp table info */
+ TMP_TABLE_PARAM *tmp_table_param;
+
+ /* Sorting related info */
+ Filesort *filesort;
+
+ /**
+ List of topmost expressions in the select list. The *next* JOIN TAB
+ in the plan should use it to obtain correct values. Same applicable to
+ all_fields. These lists are needed because after tmp tables functions
+ will be turned to fields. These variables are pointing to
+ tmp_fields_list[123]. Valid only for tmp tables and the last non-tmp
+ table in the query plan.
+ @see JOIN::make_tmp_tables_info()
+ */
+ List<Item> *fields;
+ /** List of all expressions in the select list */
+ List<Item> *all_fields;
+ /*
+ Pointer to the ref array slice which to switch to before sending
+ records. Valid only for tmp tables.
+ */
+ Ref_ptr_array *ref_array;
+
+ /** Number of records saved in tmp table */
+ ha_rows send_records;
+
+ /** HAVING condition for checking prior saving a record into tmp table*/
+ Item *having;
+
+ /** TRUE <=> remove duplicates on this table. */
+ bool distinct;
+
/*
Semi-join strategy to be used for this join table. This is a copy of
POSITION::sj_strategy field. This field is set up by the
@@ -426,9 +465,9 @@ typedef struct st_join_table {
void cleanup();
inline bool is_using_loose_index_scan()
{
- return (select && select->quick &&
- (select->quick->get_type() ==
- QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX));
+ const SQL_SELECT *sel= filesort ? filesort->select : select;
+ return (sel && sel->quick &&
+ (sel->quick->get_type() == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX));
}
bool is_using_agg_loose_index_scan ()
{
@@ -563,16 +602,22 @@ typedef struct st_join_table {
void save_explain_data(Explain_table_access *eta, table_map prefix_tables,
bool distinct, struct st_join_table *first_top_tab);
- void update_explain_data(uint idx);
+ bool use_order() const; ///< Use ordering provided by chosen index?
+ bool sort_table();
+ bool remove_duplicates();
+
} JOIN_TAB;
#include "sql_join_cache.h"
-enum_nested_loop_state sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool
- end_of_records);
-enum_nested_loop_state sub_select(JOIN *join,JOIN_TAB *join_tab, bool
- end_of_records);
+enum_nested_loop_state
+sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+enum_nested_loop_state
+sub_select(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+enum_nested_loop_state
+sub_select_postjoin_aggr(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+
enum_nested_loop_state
end_send_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
bool end_of_records);
@@ -867,12 +912,14 @@ typedef struct st_position
Sj_materialization_picker sjmat_picker;
} POSITION;
+typedef Bounds_checked_array<Item_null_result*> Item_null_array;
+
typedef struct st_rollup
{
enum State { STATE_NONE, STATE_INITED, STATE_READY };
State state;
- Item_null_result **null_items;
- Item ***ref_pointer_arrays;
+ Item_null_array null_items;
+ Ref_ptr_array *ref_pointer_arrays;
List<Item> *fields;
} ROLLUP;
@@ -886,6 +933,56 @@ public:
class Pushdown_query;
+/**
+ @brief
+ Class to perform postjoin aggregation operations
+
+ @details
+ The result records are obtained on the put_record() call.
+ The aggrgation process is determined by the write_func, it could be:
+ end_write Simply store all records in tmp table.
+ end_write_group Perform grouping using join->group_fields,
+ records are expected to be sorted.
+ end_update Perform grouping using the key generated on tmp
+ table. Input records aren't expected to be sorted.
+ Tmp table uses the heap engine
+ end_update_unique Same as above, but the engine is myisam.
+
+ Lazy table initialization is used - the table will be instantiated and
+ rnd/index scan started on the first put_record() call.
+
+*/
+
+class AGGR_OP :public Sql_alloc
+{
+public:
+ JOIN_TAB *join_tab;
+
+ AGGR_OP(JOIN_TAB *tab) : join_tab(tab), write_func(NULL)
+ {};
+
+ enum_nested_loop_state put_record() { return put_record(false); };
+ /*
+ Send the result of operation further (to a next operation/client)
+ This function is called after all records were put into tmp table.
+
+ @return return one of enum_nested_loop_state values.
+ */
+ enum_nested_loop_state end_send();
+ /** write_func setter */
+ void set_write_func(Next_select_func new_write_func)
+ {
+ write_func= new_write_func;
+ }
+
+private:
+ /** Write function that would be used for saving records in tmp table. */
+ Next_select_func write_func;
+ enum_nested_loop_state put_record(bool end_of_records);
+ bool prepare_tmp_table();
+};
+
+
class JOIN :public Sql_alloc
{
private:
@@ -954,14 +1051,6 @@ protected:
public:
JOIN_TAB *join_tab, **best_ref;
-
- /*
- Saved join_tab for pre_sorting. create_sort_index() will save here..
- */
- JOIN_TAB *pre_sort_join_tab;
- uint pre_sort_index;
- Item *pre_sort_idx_pushed_cond;
- void clean_pre_sort_join_tab();
/* List of fields that aren't under an aggregate function */
List<Item_field> non_agg_fields;
@@ -979,8 +1068,6 @@ public:
uint top_table_access_tabs_count;
JOIN_TAB **map2table; ///< mapping between table indexes and JOIN_TABs
- JOIN_TAB *join_tab_save; ///< saved join_tab for subquery reexecution
-
List<JOIN_TAB_RANGE> join_tab_ranges;
/*
@@ -1011,6 +1098,7 @@ public:
We keep it here so that it is saved/restored with JOIN::restore_tmp.
*/
uint top_join_tab_count;
+ uint aggr_tables; ///< Number of post-join tmp tables
uint send_group_parts;
/*
This counts how many times do_select() was invoked for this JOIN.
@@ -1123,6 +1211,7 @@ public:
*/
table_map complex_firstmatch_tables;
+ Next_select_func first_select;
/*
The cost of best complete join plan found so far during optimization,
after optimization phase - cost of picked join order (not taking into
@@ -1138,9 +1227,6 @@ public:
double join_record_count;
List<Item> *fields;
List<Cached_item> group_fields, group_fields_cache;
- TABLE *tmp_table;
- /// used to store 2 possible tmp table of SELECT
- TABLE *exec_tmp_table1, *exec_tmp_table2;
THD *thd;
Item_sum **sum_funcs, ***sum_funcs_end;
/** second copy of sumfuncs (for queries with 2 temporary tables */
@@ -1149,6 +1235,8 @@ public:
Item *having;
Item *tmp_having; ///< To store having when processed temporary table
Item *having_history; ///< Store having for explain
+ ORDER *group_list_for_estimates;
+ bool having_is_correlated;
ulonglong select_options;
/*
Bitmap of allowed types of the join caches that
@@ -1187,26 +1275,6 @@ public:
*/
bool filesort_found_rows;
- /**
- Copy of this JOIN to be used with temporary tables.
-
- tmp_join is used when the JOIN needs to be "reusable" (e.g. in a
- subquery that gets re-executed several times) and we know will use
- temporary tables for materialization. The materialization to a
- temporary table overwrites the JOIN structure to point to the
- temporary table after the materialization is done. This is where
- tmp_join is used : it's a copy of the JOIN before the
- materialization and is used in restoring before re-execution by
- overwriting the current JOIN structure with the saved copy.
- Because of this we should pay extra care of not freeing up helper
- structures that are referenced by the original contents of the
- JOIN. We can check for this by making sure the "current" join is
- not the temporary copy, e.g. !tmp_join || tmp_join != join
-
- We should free these sub-structures at JOIN::destroy() if the
- "current" join has a copy is not that copy.
- */
- JOIN *tmp_join;
ROLLUP rollup; ///< Used with rollup
bool mixed_implicit_grouping;
@@ -1228,6 +1296,19 @@ public:
GROUP/ORDER BY.
*/
bool simple_order, simple_group;
+
+ /*
+ ordered_index_usage is set if an ordered index access
+ should be used instead of a filesort when computing
+ ORDER/GROUP BY.
+ */
+ enum
+ {
+ ordered_index_void, // No ordered index avail.
+ ordered_index_group_by, // Use index for GROUP BY
+ ordered_index_order_by // Use index for ORDER BY
+ } ordered_index_usage;
+
/**
Is set only in case if we have a GROUP BY clause
and no ORDER BY after constant elimination of 'order'.
@@ -1280,10 +1361,19 @@ public:
List<Item> exec_const_order_group_cond;
SQL_SELECT *select; ///<created in optimisation phase
JOIN_TAB *return_tab; ///<used only for outer joins
- Item **ref_pointer_array; ///<used pointer reference for this select
- // Copy of above to be used with different lists
- Item **items0, **items1, **items2, **items3, **current_ref_pointer_array;
- uint ref_pointer_array_size; ///< size of above in bytes
+
+ /*
+ Used pointer reference for this select.
+ select_lex->ref_pointer_array contains five "slices" of the same length:
+ |========|========|========|========|========|
+ ref_ptrs items0 items1 items2 items3
+ */
+ Ref_ptr_array ref_ptrs;
+ // Copy of the initial slice above, to be used with different lists
+ Ref_ptr_array items0, items1, items2, items3;
+ // Used by rollup, to restore ref_ptrs after overwriting it.
+ Ref_ptr_array current_ref_ptrs;
+
const char *zero_result_cause; ///< not 0 if exec must return zero result
bool union_part; ///< this subselect is part of union
@@ -1310,20 +1400,12 @@ public:
/* SJM nests that are executed with SJ-Materialization strategy */
List<SJ_MATERIALIZATION_INFO> sjm_info_list;
- /*
- storage for caching buffers allocated during query execution.
- These buffers allocations need to be cached as the thread memory pool is
- cleared only at the end of the execution of the whole query and not caching
- allocations that occur in repetition at execution time will result in
- excessive memory usage.
- Note: make_simple_join always creates an execution plan that accesses
- a single table, thus it is sufficient to have a one-element array for
- table_reexec.
- */
- SORT_FIELD *sortorder; // make_unireg_sortorder()
- TABLE *table_reexec[1]; // make_simple_join()
- JOIN_TAB *join_tab_reexec; // make_simple_join()
- /* end of allocation caching storage */
+ /** TRUE <=> ref_pointer_array is set to items3. */
+ bool set_group_rpa;
+ /** Exec time only: TRUE <=> current group has been sent */
+ bool group_sent;
+
+ JOIN_TAB *sort_and_group_aggr_tab;
JOIN(THD *thd_arg, List<Item> &fields_arg, ulonglong select_options_arg,
select_result *result_arg)
@@ -1335,12 +1417,13 @@ public:
void init(THD *thd_arg, List<Item> &fields_arg, ulonglong select_options_arg,
select_result *result_arg)
{
- join_tab= join_tab_save= 0;
+ join_tab= 0;
table= 0;
table_count= 0;
top_join_tab_count= 0;
const_tables= 0;
const_table_map= 0;
+ aggr_tables= 0;
eliminated_tables= 0;
join_list= 0;
implicit_grouping= FALSE;
@@ -1350,25 +1433,21 @@ public:
send_records= 0;
found_records= 0;
fetch_limit= HA_POS_ERROR;
- join_examined_rows= 0;
- exec_tmp_table1= 0;
- exec_tmp_table2= 0;
- sortorder= 0;
- table_reexec[0]= 0;
- join_tab_reexec= 0;
thd= thd_arg;
sum_funcs= sum_funcs2= 0;
procedure= 0;
having= tmp_having= having_history= 0;
+ having_is_correlated= false;
+ group_list_for_estimates= 0;
select_options= select_options_arg;
result= result_arg;
lock= thd_arg->lock;
select_lex= 0; //for safety
- tmp_join= 0;
select_distinct= MY_TEST(select_options & SELECT_DISTINCT);
no_order= 0;
simple_order= 0;
simple_group= 0;
+ ordered_index_usage= ordered_index_void;
need_distinct= 0;
skip_sort_order= 0;
need_tmp= 0;
@@ -1376,8 +1455,11 @@ public:
error= 0;
select= 0;
return_tab= 0;
- ref_pointer_array= items0= items1= items2= items3= 0;
- ref_pointer_array_size= 0;
+ ref_ptrs.reset();
+ items0.reset();
+ items1.reset();
+ items2.reset();
+ items3.reset();
zero_result_cause= 0;
optimized= 0;
have_query_plan= QEP_NOT_PRESENT_YET;
@@ -1405,10 +1487,13 @@ public:
rollup.state= ROLLUP::STATE_NONE;
no_const_tables= FALSE;
+ first_select= sub_select;
+ set_group_rpa= false;
+ group_sent= 0;
+
outer_ref_cond= pseudo_bits_cond= NULL;
in_to_exists_where= NULL;
in_to_exists_having= NULL;
- pre_sort_join_tab= NULL;
emb_sjm_nest= NULL;
sjm_lookup_tables= 0;
@@ -1420,7 +1505,10 @@ public:
table_access_tabs= NULL;
}
- int prepare(Item ***rref_pointer_array, TABLE_LIST *tables, uint wind_num,
+ /* True if the plan guarantees that it will be returned zero or one row */
+ bool only_const_tables() { return const_tables == table_count; }
+
+ int prepare(TABLE_LIST *tables, uint wind_num,
COND *conds, uint og_num, ORDER *order, bool skip_order_by,
ORDER *group, Item *having, ORDER *proc_param, SELECT_LEX *select,
SELECT_LEX_UNIT *unit);
@@ -1431,6 +1519,7 @@ public:
int init_execution();
void exec();
void exec_inner();
+ bool prepare_result(List<Item> **columns_list);
int destroy();
void restore_tmp();
bool alloc_func_list();
@@ -1440,16 +1529,42 @@ public:
bool make_sum_func_list(List<Item> &all_fields, List<Item> &send_fields,
bool before_group_by, bool recompute= FALSE);
- inline void set_items_ref_array(Item **ptr)
+ /// Initialzes a slice, see comments for ref_ptrs above.
+ Ref_ptr_array ref_ptr_array_slice(size_t slice_num)
+ {
+ size_t slice_sz= select_lex->ref_pointer_array.size() / 5U;
+ DBUG_ASSERT(select_lex->ref_pointer_array.size() % 5 == 0);
+ DBUG_ASSERT(slice_num < 5U);
+ return Ref_ptr_array(&select_lex->ref_pointer_array[slice_num * slice_sz],
+ slice_sz);
+ }
+
+ /**
+ Overwrites one slice with the contents of another slice.
+ In the normal case, dst and src have the same size().
+ However: the rollup slices may have smaller size than slice_sz.
+ */
+ void copy_ref_ptr_array(Ref_ptr_array dst_arr, Ref_ptr_array src_arr)
+ {
+ DBUG_ASSERT(dst_arr.size() >= src_arr.size());
+ void *dest= dst_arr.array();
+ const void *src= src_arr.array();
+ memcpy(dest, src, src_arr.size() * src_arr.element_size());
+ }
+
+ /// Overwrites 'ref_ptrs' and remembers the the source as 'current'.
+ void set_items_ref_array(Ref_ptr_array src_arr)
{
- memcpy((char*) ref_pointer_array, (char*) ptr, ref_pointer_array_size);
- current_ref_pointer_array= ptr;
+ copy_ref_ptr_array(ref_ptrs, src_arr);
+ current_ref_ptrs= src_arr;
}
- inline void init_items_ref_array()
+
+ /// Initializes 'items0' and remembers that it is 'current'.
+ void init_items_ref_array()
{
- items0= ref_pointer_array + all_fields.elements;
- memcpy(items0, ref_pointer_array, ref_pointer_array_size);
- current_ref_pointer_array= items0;
+ items0= ref_ptr_array_slice(1);
+ copy_ref_ptr_array(items0, ref_ptrs);
+ current_ref_ptrs= items0;
}
bool rollup_init();
@@ -1458,18 +1573,10 @@ public:
Item_sum ***func);
int rollup_send_data(uint idx);
int rollup_write_data(uint idx, TABLE *table);
- /**
- Release memory and, if possible, the open tables held by this execution
- plan (and nested plans). It's used to release some tables before
- the end of execution in order to increase concurrency and reduce
- memory consumption.
- */
void join_free();
/** Cleanup this JOIN, possibly for reuse */
void cleanup(bool full);
void clear();
- bool save_join_tab();
- bool init_save_join_tab();
bool send_row_on_empty_set()
{
return (do_send_rows && implicit_grouping && !group_optimized_away &&
@@ -1488,6 +1595,8 @@ public:
return (table_map(1) << table_count) - 1;
}
void drop_unused_derived_keys();
+ bool get_best_combination();
+ bool add_sorting_to_table(JOIN_TAB *tab, ORDER *order);
inline void eval_select_list_used_tables();
/*
Return the table for which an index scan can be used to satisfy
@@ -1553,12 +1662,41 @@ public:
JOIN_TAB *first_breadth_first_execution_tab() { return join_tab; }
private:
/**
+ Create a temporary table to be used for processing DISTINCT/ORDER
+ BY/GROUP BY.
+
+ @note Will modify JOIN object wrt sort/group attributes
+
+ @param tab the JOIN_TAB object to attach created table to
+ @param tmp_table_fields List of items that will be used to define
+ column types of the table.
+ @param tmp_table_group Group key to use for temporary table, NULL if none.
+ @param save_sum_fields If true, do not replace Item_sum items in
+ @c tmp_fields list with Item_field items referring
+ to fields in temporary table.
+
+ @returns false on success, true on failure
+ */
+ bool create_postjoin_aggr_table(JOIN_TAB *tab, List<Item> *tmp_table_fields,
+ ORDER *tmp_table_group,
+ bool save_sum_fields,
+ bool distinct,
+ bool keep_row_ordermake);
+ /**
+ Optimize distinct when used on a subset of the tables.
+
+ E.g.,: SELECT DISTINCT t1.a FROM t1,t2 WHERE t1.b=t2.b
+ In this case we can stop scanning t2 when we have found one t1.a
+ */
+ void optimize_distinct();
+
+ /**
TRUE if the query contains an aggregate function but has no GROUP
BY clause.
*/
bool implicit_grouping;
- bool make_simple_join(JOIN *join, TABLE *tmp_table);
void cleanup_item_list(List<Item> &items) const;
+ bool make_aggr_tables_info();
};
enum enum_with_bush_roots { WITH_BUSH_ROOTS, WITHOUT_BUSH_ROOTS};
@@ -1583,7 +1721,7 @@ extern const char *join_type_str[];
void count_field_types(SELECT_LEX *select_lex, TMP_TABLE_PARAM *param,
List<Item> &fields, bool reset_with_sum_func);
bool setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param,
- Item **ref_pointer_array,
+ Ref_ptr_array ref_pointer_array,
List<Item> &new_list1, List<Item> &new_list2,
uint elements, List<Item> &fields);
void copy_fields(TMP_TABLE_PARAM *param);
@@ -1824,19 +1962,19 @@ int safe_index_read(JOIN_TAB *tab);
int get_quick_record(SQL_SELECT *select);
SORT_FIELD * make_unireg_sortorder(THD *thd, ORDER *order, uint *length,
SORT_FIELD *sortorder);
-int setup_order(THD *thd, Item **ref_pointer_array, TABLE_LIST *tables,
+int setup_order(THD *thd, Ref_ptr_array ref_pointer_array, TABLE_LIST *tables,
List<Item> &fields, List <Item> &all_fields, ORDER *order);
-int setup_group(THD *thd, Item **ref_pointer_array, TABLE_LIST *tables,
+int setup_group(THD *thd, Ref_ptr_array ref_pointer_array, TABLE_LIST *tables,
List<Item> &fields, List<Item> &all_fields, ORDER *order,
bool *hidden_group_fields);
bool fix_inner_refs(THD *thd, List<Item> &all_fields, SELECT_LEX *select,
- Item **ref_pointer_array);
+ Ref_ptr_array ref_pointer_array);
int join_read_key2(THD *thd, struct st_join_table *tab, TABLE *table,
struct st_table_ref *table_ref);
bool handle_select(THD *thd, LEX *lex, select_result *result,
ulong setup_tables_done_option);
-bool mysql_select(THD *thd, Item ***rref_pointer_array,
+bool mysql_select(THD *thd,
TABLE_LIST *tables, uint wild_num, List<Item> &list,
COND *conds, uint og_num, ORDER *order, ORDER *group,
Item *having, ORDER *proc_param, ulonglong select_type,