1 files changed, 234 insertions, 96 deletions
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 1b1bb6ded71..59f29239f5e 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -32,7 +32,9 @@
 #include "sql_array.h"                        /* Array */
 #include "records.h"                          /* READ_RECORD */
 #include "opt_range.h"                /* SQL_SELECT, QUICK_SELECT_I */
+#include "filesort.h"
 
+typedef struct st_join_table JOIN_TAB;
 
 /* Values in optimize */
 #define KEY_OPTIMIZE_EXISTS		1
@@ -184,7 +186,7 @@ enum sj_strategy_enum
 
 typedef enum_nested_loop_state
 (*Next_select_func)(JOIN *, struct st_join_table *, bool);
-Next_select_func setup_end_select_func(JOIN *join);
+Next_select_func setup_end_select_func(JOIN *join, JOIN_TAB *tab);
 int rr_sequential(READ_RECORD *info);
 int rr_sequential_and_unpack(READ_RECORD *info);
 
@@ -198,9 +200,11 @@ int rr_sequential_and_unpack(READ_RECORD *info);
 class JOIN_CACHE;
 class SJ_TMP_TABLE;
 class JOIN_TAB_RANGE;
+class AGGR_OP;
+class Filesort;
 
 typedef struct st_join_table {
-  st_join_table() {}                          /* Remove gcc warning */
+  st_join_table() {}
   TABLE		*table;
   KEYUSE	*keyuse;			/**< pointer to first used key */
   KEY           *hj_key;       /**< descriptor of the used best hash join key
@@ -260,6 +264,7 @@ typedef struct st_join_table {
   */
   uint          packed_info;
 
+  //  READ_RECORD::Setup_func materialize_table;
   READ_RECORD::Setup_func read_first_record;
   Next_select_func next_select;
   READ_RECORD	read_record;
@@ -346,6 +351,7 @@ typedef struct st_join_table {
   */
   Item          *cache_idx_cond;
   SQL_SELECT    *cache_select;
+  AGGR_OP       *aggr;
   JOIN		*join;
   /*
     Embedding SJ-nest (may be not the direct parent), or NULL if none.
@@ -412,6 +418,39 @@ typedef struct st_join_table {
   /* NestedOuterJoins: Bitmap of nested joins this table is part of */
   nested_join_map embedding_map;
 
+  /* Tmp table info */
+  TMP_TABLE_PARAM *tmp_table_param;
+
+  /* Sorting related info */
+  Filesort *filesort;
+
+  /**
+    List of topmost expressions in the select list. The *next* JOIN_TAB
+    in the plan should use it to obtain correct values. The same applies to
+    all_fields. These lists are needed because, after tmp tables, functions
+    are turned into fields. These variables point to
+    tmp_fields_list[123]. Valid only for tmp tables and the last non-tmp
+    table in the query plan.
+    @see JOIN::make_tmp_tables_info()
+  */
+  List<Item> *fields;
+  /** List of all expressions in the select list */
+  List<Item> *all_fields;
+  /*
+    Pointer to the ref array slice to switch to before sending
+    records. Valid only for tmp tables.
+  */
+  Ref_ptr_array *ref_array;
+
+  /** Number of records saved in tmp table */
+  ha_rows send_records;
+
+  /** HAVING condition to check before saving a record into the tmp table */
+  Item *having;
+
+  /** TRUE <=> remove duplicates on this table. */
+  bool distinct;
+
   /*
     Semi-join strategy to be used for this join table. This is a copy of
     POSITION::sj_strategy field. This field is set up by the
@@ -426,9 +465,9 @@ typedef struct st_join_table {
   void cleanup();
   inline bool is_using_loose_index_scan()
   {
-    return (select && select->quick &&
-            (select->quick->get_type() ==
-             QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX));
+    const SQL_SELECT *sel= filesort ? filesort->select : select;
+    return (sel && sel->quick &&
+            (sel->quick->get_type() == QUICK_SELECT_I::QS_TYPE_GROUP_MIN_MAX));
   }
   bool is_using_agg_loose_index_scan ()
   {
@@ -563,16 +602,22 @@ typedef struct st_join_table {
   void save_explain_data(Explain_table_access *eta, table_map prefix_tables, 
                          bool distinct, struct st_join_table *first_top_tab);
 
-  void update_explain_data(uint idx);
+  bool use_order() const; ///< Use ordering provided by chosen index?
+  bool sort_table();
+  bool remove_duplicates();
+
 } JOIN_TAB;
 
 
 #include "sql_join_cache.h"
 
-enum_nested_loop_state sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool
-                                        end_of_records);
-enum_nested_loop_state sub_select(JOIN *join,JOIN_TAB *join_tab, bool
-                                  end_of_records);
+enum_nested_loop_state
+sub_select_cache(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+enum_nested_loop_state 
+sub_select(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+enum_nested_loop_state
+sub_select_postjoin_aggr(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+
 enum_nested_loop_state
 end_send_group(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 	       bool end_of_records);
@@ -867,12 +912,14 @@ typedef struct st_position
   Sj_materialization_picker sjmat_picker;
 } POSITION;
 
+typedef Bounds_checked_array<Item_null_result*> Item_null_array;
+
 typedef struct st_rollup
 {
   enum State { STATE_NONE, STATE_INITED, STATE_READY };
   State state;
-  Item_null_result **null_items;
-  Item ***ref_pointer_arrays;
+  Item_null_array null_items;
+  Ref_ptr_array *ref_pointer_arrays;
   List<Item> *fields;
 } ROLLUP;
 
@@ -886,6 +933,56 @@ public:
 
 class Pushdown_query;
 
+/**
+  @brief
+    Class to perform postjoin aggregation operations
+
+  @details
+    The result records are obtained on the put_record() call.
+    The aggregation process is determined by the write_func, which can be:
+      end_write          Simply store all records in tmp table.
+      end_write_group    Perform grouping using join->group_fields,
+                         records are expected to be sorted.
+      end_update         Perform grouping using the key generated on tmp
+                         table. Input records aren't expected to be sorted.
+                         Tmp table uses the heap engine.
+      end_update_unique  Same as above, but the engine is myisam.
+
+    Lazy table initialization is used - the table will be instantiated and
+    rnd/index scan started on the first put_record() call.
+
+*/
+
+class AGGR_OP :public Sql_alloc
+{
+public:
+  JOIN_TAB *join_tab;
+  /** Binds to @c tab; write_func stays NULL until set_write_func(). */
+  AGGR_OP(JOIN_TAB *tab) : join_tab(tab), write_func(NULL)
+  {};
+  /** Shortcut for put_record(false), i.e. end_of_records is false. */
+  enum_nested_loop_state put_record() { return put_record(false); };
+  /*
+    Send the result of the operation further (to the next operation/client).
+    This function is called after all records were put into the tmp table.
+
+    @return one of the enum_nested_loop_state values.
+  */
+  enum_nested_loop_state end_send();
+  /** Setter for write_func, the function used to save records. */
+  void set_write_func(Next_select_func new_write_func)
+  {
+    write_func= new_write_func;
+  }
+
+private:
+  /** Write function that is used for saving records in the tmp table. */
+  Next_select_func write_func;
+  enum_nested_loop_state put_record(bool end_of_records);
+  bool prepare_tmp_table();
+};
+
+
 class JOIN :public Sql_alloc
 {
 private:
@@ -954,14 +1051,6 @@ protected:
 
 public:
   JOIN_TAB *join_tab, **best_ref;
-  
-  /* 
-    Saved join_tab for pre_sorting. create_sort_index() will save here.. 
-  */
-  JOIN_TAB *pre_sort_join_tab;
-  uint pre_sort_index;
-  Item *pre_sort_idx_pushed_cond;
-  void clean_pre_sort_join_tab();
 
   /* List of fields that aren't under an aggregate function */
   List<Item_field> non_agg_fields;
@@ -979,8 +1068,6 @@ public:
   uint     top_table_access_tabs_count;
   
   JOIN_TAB **map2table;    ///< mapping between table indexes and JOIN_TABs
-  JOIN_TAB *join_tab_save; ///< saved join_tab for subquery reexecution
-
   List<JOIN_TAB_RANGE> join_tab_ranges;
   
   /*
@@ -1011,6 +1098,7 @@ public:
     We keep it here so that it is saved/restored with JOIN::restore_tmp.
   */
   uint     top_join_tab_count;
+  uint     aggr_tables;     ///< Number of post-join tmp tables 
   uint	   send_group_parts;
   /*
     This counts how many times do_select() was invoked for this JOIN.
@@ -1123,6 +1211,7 @@ public:
   */
   table_map complex_firstmatch_tables;
 
+  Next_select_func first_select;
   /*
     The cost of best complete join plan found so far during optimization,
     after optimization phase - cost of picked join order (not taking into
@@ -1138,9 +1227,6 @@ public:
   double   join_record_count;
   List<Item> *fields;
   List<Cached_item> group_fields, group_fields_cache;
-  TABLE    *tmp_table;
-  /// used to store 2 possible tmp table of SELECT
-  TABLE    *exec_tmp_table1, *exec_tmp_table2;
   THD	   *thd;
   Item_sum  **sum_funcs, ***sum_funcs_end;
   /** second copy of sumfuncs (for queries with 2 temporary tables */
@@ -1149,6 +1235,8 @@ public:
   Item	    *having;
   Item      *tmp_having; ///< To store having when processed temporary table
   Item      *having_history; ///< Store having for explain
+  ORDER     *group_list_for_estimates;
+  bool      having_is_correlated;
   ulonglong  select_options;
   /* 
     Bitmap of allowed types of the join caches that
@@ -1187,26 +1275,6 @@ public:
   */
   bool filesort_found_rows;
   
-  /**
-    Copy of this JOIN to be used with temporary tables.
-
-    tmp_join is used when the JOIN needs to be "reusable" (e.g. in a
-    subquery that gets re-executed several times) and we know will use
-    temporary tables for materialization. The materialization to a
-    temporary table overwrites the JOIN structure to point to the
-    temporary table after the materialization is done. This is where
-    tmp_join is used : it's a copy of the JOIN before the
-    materialization and is used in restoring before re-execution by
-    overwriting the current JOIN structure with the saved copy.
-    Because of this we should pay extra care of not freeing up helper
-    structures that are referenced by the original contents of the
-    JOIN. We can check for this by making sure the "current" join is
-    not the temporary copy, e.g.  !tmp_join || tmp_join != join
- 
-    We should free these sub-structures at JOIN::destroy() if the
-    "current" join has a copy is not that copy.
-  */
-  JOIN *tmp_join;
   ROLLUP rollup;				///< Used with rollup
   
   bool mixed_implicit_grouping;
@@ -1228,6 +1296,19 @@ public:
     GROUP/ORDER BY.
   */
   bool simple_order, simple_group;
+
+  /*
+    ordered_index_usage is set if an ordered index access
+    should be used instead of a filesort when computing 
+    ORDER/GROUP BY.
+  */
+  enum
+  {
+    ordered_index_void,       // No ordered index avail.
+    ordered_index_group_by,   // Use index for GROUP BY
+    ordered_index_order_by    // Use index for ORDER BY
+  } ordered_index_usage;
+
   /**
     Is set only in case if we have a GROUP BY clause
     and no ORDER BY after constant elimination of 'order'.
@@ -1280,10 +1361,19 @@ public:
   List<Item> exec_const_order_group_cond;
   SQL_SELECT *select;                ///<created in optimisation phase
   JOIN_TAB *return_tab;              ///<used only for outer joins
-  Item **ref_pointer_array; ///<used pointer reference for this select
-  // Copy of above to be used with different lists
-  Item **items0, **items1, **items2, **items3, **current_ref_pointer_array;
-  uint ref_pointer_array_size; ///< size of above in bytes
+
+  /*
+    Used pointer reference for this select.
+    select_lex->ref_pointer_array contains five "slices" of the same length:
+    |========|========|========|========|========|
+     ref_ptrs items0   items1   items2   items3
+   */
+  Ref_ptr_array ref_ptrs;
+  // Copy of the initial slice above, to be used with different lists
+  Ref_ptr_array items0, items1, items2, items3;
+  // Used by rollup, to restore ref_ptrs after overwriting it.
+  Ref_ptr_array current_ref_ptrs;
+
   const char *zero_result_cause; ///< not 0 if exec must return zero result
   
   bool union_part; ///< this subselect is part of union 
@@ -1310,20 +1400,12 @@ public:
   /* SJM nests that are executed with SJ-Materialization strategy */
   List<SJ_MATERIALIZATION_INFO> sjm_info_list;
 
-  /* 
-    storage for caching buffers allocated during query execution. 
-    These buffers allocations need to be cached as the thread memory pool is
-    cleared only at the end of the execution of the whole query and not caching
-    allocations that occur in repetition at execution time will result in 
-    excessive memory usage.
-    Note: make_simple_join always creates an execution plan that accesses
-    a single table, thus it is sufficient to have a one-element array for
-    table_reexec.
-  */  
-  SORT_FIELD *sortorder;                        // make_unireg_sortorder()
-  TABLE *table_reexec[1];                       // make_simple_join()
-  JOIN_TAB *join_tab_reexec;                    // make_simple_join()
-  /* end of allocation caching storage */
+  /** TRUE <=> ref_pointer_array is set to items3. */
+  bool set_group_rpa;
+  /** Exec time only: TRUE <=> current group has been sent */
+  bool group_sent;
+
+  JOIN_TAB *sort_and_group_aggr_tab;
 
   JOIN(THD *thd_arg, List<Item> &fields_arg, ulonglong select_options_arg,
        select_result *result_arg)
@@ -1335,12 +1417,13 @@ public:
   void init(THD *thd_arg, List<Item> &fields_arg, ulonglong select_options_arg,
        select_result *result_arg)
   {
-    join_tab= join_tab_save= 0;
+    join_tab= 0;
     table= 0;
     table_count= 0;
     top_join_tab_count= 0;
     const_tables= 0;
     const_table_map= 0;
+    aggr_tables= 0;
     eliminated_tables= 0;
     join_list= 0;
     implicit_grouping= FALSE;
@@ -1350,25 +1433,21 @@ public:
     send_records= 0;
     found_records= 0;
     fetch_limit= HA_POS_ERROR;
-    join_examined_rows= 0;
-    exec_tmp_table1= 0;
-    exec_tmp_table2= 0;
-    sortorder= 0;
-    table_reexec[0]= 0;
-    join_tab_reexec= 0;
     thd= thd_arg;
     sum_funcs= sum_funcs2= 0;
     procedure= 0;
     having= tmp_having= having_history= 0;
+    having_is_correlated= false;
+    group_list_for_estimates= 0;
     select_options= select_options_arg;
     result= result_arg;
     lock= thd_arg->lock;
     select_lex= 0; //for safety
-    tmp_join= 0;
     select_distinct= MY_TEST(select_options & SELECT_DISTINCT);
     no_order= 0;
     simple_order= 0;
     simple_group= 0;
+    ordered_index_usage= ordered_index_void;
     need_distinct= 0;
     skip_sort_order= 0;
     need_tmp= 0;
@@ -1376,8 +1455,11 @@ public:
     error= 0;
     select= 0;
     return_tab= 0;
-    ref_pointer_array= items0= items1= items2= items3= 0;
-    ref_pointer_array_size= 0;
+    ref_ptrs.reset();
+    items0.reset();
+    items1.reset();
+    items2.reset();
+    items3.reset();
     zero_result_cause= 0;
     optimized= 0;
     have_query_plan= QEP_NOT_PRESENT_YET;
@@ -1405,10 +1487,13 @@ public:
     rollup.state= ROLLUP::STATE_NONE;
 
     no_const_tables= FALSE;
+    first_select= sub_select;
+    set_group_rpa= false;
+    group_sent= 0;
+
     outer_ref_cond= pseudo_bits_cond= NULL;
     in_to_exists_where= NULL;
     in_to_exists_having= NULL;
-    pre_sort_join_tab= NULL;
     emb_sjm_nest= NULL;
     sjm_lookup_tables= 0;
 
@@ -1420,7 +1505,10 @@ public:
     table_access_tabs= NULL; 
   }
 
-  int prepare(Item ***rref_pointer_array, TABLE_LIST *tables, uint wind_num,
+  /* True if the plan guarantees that zero or one row will be returned */
+  bool only_const_tables()  { return const_tables == table_count; }
+
+  int prepare(TABLE_LIST *tables, uint wind_num,
 	      COND *conds, uint og_num, ORDER *order, bool skip_order_by,
               ORDER *group, Item *having, ORDER *proc_param, SELECT_LEX *select,
 	      SELECT_LEX_UNIT *unit);
@@ -1431,6 +1519,7 @@ public:
   int init_execution();
   void exec();
   void exec_inner();
+  bool prepare_result(List<Item> **columns_list);
   int destroy();
   void restore_tmp();
   bool alloc_func_list();
@@ -1440,16 +1529,42 @@ public:
   bool make_sum_func_list(List<Item> &all_fields, List<Item> &send_fields,
 			  bool before_group_by, bool recompute= FALSE);
 
-  inline void set_items_ref_array(Item **ptr)
+  /// Returns slice number slice_num (0..4), see comments for ref_ptrs above.
+  Ref_ptr_array ref_ptr_array_slice(size_t slice_num)
+  {
+    size_t slice_sz= select_lex->ref_pointer_array.size() / 5U;
+    DBUG_ASSERT(select_lex->ref_pointer_array.size() % 5 == 0);
+    DBUG_ASSERT(slice_num < 5U);
+    return Ref_ptr_array(&select_lex->ref_pointer_array[slice_num * slice_sz],
+                         slice_sz);
+  }
+
+  /**
+     Overwrites the start of dst_arr with all src_arr.size() elements of
+     src_arr. In the normal case, dst and src have the same size().
+     However: the rollup slices may have smaller size than slice_sz.
+   */
+  void copy_ref_ptr_array(Ref_ptr_array dst_arr, Ref_ptr_array src_arr)
+  {
+    DBUG_ASSERT(dst_arr.size() >= src_arr.size());
+    void *dest= dst_arr.array();
+    const void *src= src_arr.array();
+    memcpy(dest, src, src_arr.size() * src_arr.element_size());
+  }
+
+  /// Overwrites 'ref_ptrs' and remembers the source as 'current'.
+  void set_items_ref_array(Ref_ptr_array src_arr)
   {
-    memcpy((char*) ref_pointer_array, (char*) ptr, ref_pointer_array_size);
-    current_ref_pointer_array= ptr;
+    copy_ref_ptr_array(ref_ptrs, src_arr);
+    current_ref_ptrs= src_arr;
   }
-  inline void init_items_ref_array()
+
+  /// Makes 'items0' a copy of 'ref_ptrs' and remembers it as 'current'.
+  void init_items_ref_array()
   {
-    items0= ref_pointer_array + all_fields.elements;
-    memcpy(items0, ref_pointer_array, ref_pointer_array_size);
-    current_ref_pointer_array= items0;
+    items0= ref_ptr_array_slice(1);
+    copy_ref_ptr_array(items0, ref_ptrs);
+    current_ref_ptrs= items0;
   }
 
   bool rollup_init();
@@ -1458,18 +1573,10 @@ public:
 			  Item_sum ***func);
   int rollup_send_data(uint idx);
   int rollup_write_data(uint idx, TABLE *table);
-  /**
-    Release memory and, if possible, the open tables held by this execution
-    plan (and nested plans). It's used to release some tables before
-    the end of execution in order to increase concurrency and reduce
-    memory consumption.
-  */
   void join_free();
   /** Cleanup this JOIN, possibly for reuse */
   void cleanup(bool full);
   void clear();
-  bool save_join_tab();
-  bool init_save_join_tab();
   bool send_row_on_empty_set()
   {
     return (do_send_rows && implicit_grouping && !group_optimized_away &&
@@ -1488,6 +1595,8 @@ public:
     return (table_map(1) << table_count) - 1;
   }
   void drop_unused_derived_keys();
+  bool get_best_combination();
+  bool add_sorting_to_table(JOIN_TAB *tab, ORDER *order);
   inline void eval_select_list_used_tables();
   /* 
     Return the table for which an index scan can be used to satisfy 
@@ -1553,12 +1662,41 @@ public:
   JOIN_TAB *first_breadth_first_execution_tab() { return join_tab; }
 private:
   /**
+    Create a temporary table to be used for processing DISTINCT/ORDER
+    BY/GROUP BY.
+
+    @note Will modify JOIN object wrt sort/group attributes
+
+    @param tab              the JOIN_TAB object to attach created table to
+    @param tmp_table_fields List of items that will be used to define
+                            column types of the table.
+    @param tmp_table_group  Group key to use for temporary table, NULL if none.
+    @param save_sum_fields  If true, do not replace Item_sum items in 
+                            @c tmp_fields list with Item_field items referring 
+                            to fields in temporary table.
+
+    @returns false on success, true on failure
+  */
+  bool create_postjoin_aggr_table(JOIN_TAB *tab, List<Item> *tmp_table_fields,
+                                  ORDER *tmp_table_group,
+                                  bool save_sum_fields,
+                                  bool distinct,
+                                  bool keep_row_ordermake);
+  /**
+    Optimize distinct when used on a subset of the tables.
+
+    E.g.,: SELECT DISTINCT t1.a FROM t1,t2 WHERE t1.b=t2.b
+    In this case we can stop scanning t2 when we have found one t1.a
+  */
+  void optimize_distinct();
+
+  /**
     TRUE if the query contains an aggregate function but has no GROUP
     BY clause. 
   */
   bool implicit_grouping; 
-  bool make_simple_join(JOIN *join, TABLE *tmp_table);
   void cleanup_item_list(List<Item> &items) const;
+  bool make_aggr_tables_info();
 };
 
 enum enum_with_bush_roots { WITH_BUSH_ROOTS, WITHOUT_BUSH_ROOTS};
@@ -1583,7 +1721,7 @@ extern const char *join_type_str[];
 void count_field_types(SELECT_LEX *select_lex, TMP_TABLE_PARAM *param, 
                        List<Item> &fields, bool reset_with_sum_func);
 bool setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param,
-		       Item **ref_pointer_array,
+		       Ref_ptr_array ref_pointer_array,
 		       List<Item> &new_list1, List<Item> &new_list2,
 		       uint elements, List<Item> &fields);
 void copy_fields(TMP_TABLE_PARAM *param);
@@ -1824,19 +1962,19 @@ int safe_index_read(JOIN_TAB *tab);
 int get_quick_record(SQL_SELECT *select);
 SORT_FIELD * make_unireg_sortorder(THD *thd, ORDER *order, uint *length,
                                   SORT_FIELD *sortorder);
-int setup_order(THD *thd, Item **ref_pointer_array, TABLE_LIST *tables,
+int setup_order(THD *thd, Ref_ptr_array ref_pointer_array, TABLE_LIST *tables,
 		List<Item> &fields, List <Item> &all_fields, ORDER *order);
-int setup_group(THD *thd, Item **ref_pointer_array, TABLE_LIST *tables,
+int setup_group(THD *thd,  Ref_ptr_array ref_pointer_array, TABLE_LIST *tables,
 		List<Item> &fields, List<Item> &all_fields, ORDER *order,
 		bool *hidden_group_fields);
 bool fix_inner_refs(THD *thd, List<Item> &all_fields, SELECT_LEX *select,
-                   Item **ref_pointer_array);
+                    Ref_ptr_array ref_pointer_array);
 int join_read_key2(THD *thd, struct st_join_table *tab, TABLE *table,
                    struct st_table_ref *table_ref);
 
 bool handle_select(THD *thd, LEX *lex, select_result *result,
                    ulong setup_tables_done_option);
-bool mysql_select(THD *thd, Item ***rref_pointer_array,
+bool mysql_select(THD *thd,
                   TABLE_LIST *tables, uint wild_num,  List<Item> &list,
                   COND *conds, uint og_num, ORDER *order, ORDER *group,
                   Item *having, ORDER *proc_param, ulonglong select_type,