Diffstat (limited to 'sql/sql_select.cc')
-rw-r--r-- | sql/sql_select.cc | 94
1 file changed, 73 insertions, 21 deletions
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 5d69f1d5f87..9ab87f8134e 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -3728,15 +3728,26 @@ bool JOIN::make_aggr_tables_info()
   /*
     If we have different sort & group then we must sort the data by group
-    and copy it to another tmp table
+    and copy it to another tmp table.
+    This code is also used if we are using DISTINCT on something we haven't
+    been able to store in the temporary table yet, like SEC_TO_TIME(SUM(...)).
+
+    3. Also, this is used when
+    - the query has Window functions,
+    - the GROUP BY operation is done with the OrderedGroupBy algorithm.
+    In this case, the first temptable will contain pre-GROUP-BY data. Force
+    the creation of the second temporary table. The post-GROUP-BY dataset
+    will be written there, and then the Window Function processing code will
+    be able to process it.
   */
   if ((group_list &&
        (!test_if_subpart(group_list, order) || select_distinct)) ||
-      (select_distinct && tmp_table_param.using_outer_summary_function))
-  {                                     /* Must copy to another table */
+      (select_distinct && tmp_table_param.using_outer_summary_function) ||
+      (group_list && !tmp_table_param.quick_group &&      // (3)
+       select_lex->have_window_funcs()))                  // (3)
+  {                                     /* Must copy to another table */
     DBUG_PRINT("info",("Creating group table"));
 
     calc_group_buffer(this, group_list);
@@ -7889,6 +7900,7 @@ best_access_path(JOIN *join,
     rec= MATCHING_ROWS_IN_OTHER_TABLE;  // Fix for small tables
 
   Json_writer_object trace_access_idx(thd);
+  double eq_ref_rows= 0.0, eq_ref_cost= 0.0;
 
   /*
     full text keys require special treatment
@@ -7933,7 +7945,10 @@ best_access_path(JOIN *join,
           tmp= adjust_quick_cost(table->opt_range[key].cost, 1);
         else
           tmp= table->file->avg_io_cost();
-        tmp*= prev_record_reads(join_positions, idx, found_ref);
+        eq_ref_rows= prev_record_reads(join_positions, idx,
+                                       found_ref);
+        tmp*= eq_ref_rows;
+        eq_ref_cost= tmp;
         records=1.0;
       }
       else
@@ -8231,7 +8246,27 @@ best_access_path(JOIN *join,
           (table->file->index_flags(start_key->key,0,1) &
            HA_DO_RANGE_FILTER_PUSHDOWN))
       {
-        double rows= record_count * records;
+        double rows;
+        if (type == JT_EQ_REF)
+        {
+          /*
+            Treat EQ_REF access in a special way:
+            1. We have no cost for an index-only read. Assume its cost is
+               50% of the cost of the full read.
+
+            2. A regular ref access will do #record_count lookups, but
+               eq_ref has a "lookup cache" which reduces the number of
+               lookups made. The estimation code uses a prev_record_reads()
+               call to estimate:
+
+                 tmp = prev_record_reads(join_positions, idx, found_ref);
+
+               Set the effective number of rows from "tmp" here.
+          */
+          keyread_tmp= COST_ADD(eq_ref_cost / 2, s->startup_cost);
+          rows= eq_ref_rows;
+        }
+        else
+          rows= record_count * records;
 
         /*
           If we use filter F with selectivity s then the cost of fetching data
@@ -8274,10 +8309,6 @@ best_access_path(JOIN *join,
           we cannot use filters as the cost calculation below would cause
           tmp to become negative. The future resolution is to not limit
           the cost with worst_seeks.
-
-          We cannot use filter with JT_EQ_REF as in this case 'tmp' is
-          number of rows from prev_record_read() and keyread_tmp is 0. These
-          numbers are not usable with rowid filter code.
         */
         double access_cost_factor= MY_MIN((rows - keyread_tmp) / rows, 1.0);
         if (!(records < s->worst_seeks &&
@@ -8285,7 +8316,7 @@ best_access_path(JOIN *join,
           trace_access_idx.add("rowid_filter_skipped", "worst/max seeks clipping");
         else if (access_cost_factor <= 0.0)
           trace_access_idx.add("rowid_filter_skipped", "cost_factor <= 0");
-        else if (type != JT_EQ_REF)
+        else
        {
          filter=
           table->best_range_rowid_filter_for_partial_join(start_key->key,
@@ -22729,11 +22760,17 @@ end_send(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
 
 /*
   @brief
-    Perform a GROUP BY operation over a stream of rows ordered by their group.
-    The result is sent into join->result.
+    Perform the OrderedGroupBy operation and write the output into
+    join->result.
 
   @detail
-    Also applies HAVING, etc.
+    The input stream is ordered by the GROUP BY expression, so groups come
+    one after another. We only need to accumulate the aggregate value; when
+    a GROUP BY group ends, check the HAVING clause and send the group.
+
+    Note that the output comes in the GROUP BY order, which is required by
+    MySQL's GROUP BY semantics. No further sorting is needed.
+
+    @seealso end_write_group() also implements OrderedGroupBy
 */
 
 enum_nested_loop_state
@@ -22933,13 +22970,26 @@ end:
 
 /*
   @brief
-    Perform a GROUP BY operation over rows coming in arbitrary order.
-
-    This is done by looking up the group in a temp.table and updating group
-    values.
+    Perform a GROUP BY operation over rows coming in arbitrary order: use
+    the TemporaryTableWithPartialSums algorithm.
+
+  @detail
+    The TemporaryTableWithPartialSums algorithm is:
+
+      CREATE TEMPORARY TABLE tmp (
+        group_by_columns PRIMARY KEY,
+        partial_sum
+      );
+
+      for each row R in join output {
+        INSERT INTO tmp (R.group_by_columns, R.sum_value)
+          ON DUPLICATE KEY UPDATE partial_sum= partial_sum + R.sum_value;
+      }
 
   @detail
     Also applies HAVING, etc.
+
+    @seealso end_unique_update()
 */
 
 static enum_nested_loop_state
@@ -23092,13 +23142,15 @@ end_unique_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)),
 
 /*
   @brief
-    Perform a GROUP BY operation over a stream of rows ordered by their group.
-    Write the result into a temporary table.
+    Perform the OrderedGroupBy operation and write the output into the
+    temporary table (join_tab->table).
 
   @detail
-    Also applies HAVING, etc.
+    The input stream is ordered by the GROUP BY expression, so groups come
+    one after another. We only need to accumulate the aggregate value; when
+    a GROUP BY group ends, check the HAVING clause and write the group.
 
-    The rows are written into temptable so e.g. filesort can read them.
+    @seealso end_send_group() also implements OrderedGroupBy
 */
 
 enum_nested_loop_state
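A note on the make_aggr_tables_info() change: case (3) covers queries that combine GROUP BY with window functions while the grouping itself is done with the OrderedGroupBy algorithm (tmp_table_param.quick_group is false). A hypothetical query of the shape SELECT a, SUM(b), ROW_NUMBER() OVER (ORDER BY SUM(b)) FROM t1 GROUP BY a would take this path if the optimizer groups by sorting: the first temporary table holds pre-GROUP-BY rows, the forced second table receives the grouped rows, and the window function is then evaluated over it.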
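To make the EQ_REF costing arithmetic in the best_access_path() hunks concrete, here is a minimal standalone sketch. It is not the MariaDB code: COST_ADD() is approximated as plain addition, and eq_ref_rows, eq_ref_cost and startup_cost are plain inputs rather than optimizer state.

    #include <algorithm>
    #include <cstdio>

    /*
      Sketch of the EQ_REF special case: the index-only read cost is assumed
      to be 50% of the full read cost, and the effective number of rows is
      the prev_record_reads() estimate, not record_count * records.
    */
    static double eq_ref_access_cost_factor(double eq_ref_rows,
                                            double eq_ref_cost,
                                            double startup_cost)
    {
      double keyread_tmp= eq_ref_cost / 2 + startup_cost;  /* ~ COST_ADD() */
      double rows= eq_ref_rows;
      /* Same clamping as the patch; a value <= 0 disables the rowid filter. */
      return std::min((rows - keyread_tmp) / rows, 1.0);
    }

    int main()
    {
      /* Hypothetical numbers: 100 effective lookups, full read cost 120. */
      std::printf("access_cost_factor= %g\n",
                  eq_ref_access_cost_factor(100.0, 120.0, 0.0));
      return 0;
    }

With these numbers keyread_tmp is 60, so the factor is (100 - 60) / 100 = 0.4 and the rowid filter path stays enabled.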
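The OrderedGroupBy scheme described in the end_send_group() and end_write_group() comments can be sketched in a few lines. Row, group_key and the running sum are illustrative stand-ins; HAVING evaluation is only marked by a comment.

    #include <cstdio>
    #include <vector>

    struct Row { int group_key; double value; };

    /* Input rows arrive ordered by group_key, so a group ends exactly when
       the key changes; no hashing or lookup is needed. */
    static void ordered_group_by(const std::vector<Row> &input)
    {
      bool have_group= false;
      int cur_key= 0;
      double sum= 0.0;                      /* the accumulated aggregate */

      for (const Row &r : input)
      {
        if (have_group && r.group_key != cur_key)
        {
          /* A group ended: this is where HAVING would be checked and the
             group sent (or written), already in GROUP BY order. */
          std::printf("group=%d sum=%g\n", cur_key, sum);
          sum= 0.0;
        }
        cur_key= r.group_key;
        have_group= true;
        sum+= r.value;
      }
      if (have_group)                       /* emit the last group */
        std::printf("group=%d sum=%g\n", cur_key, sum);
    }

    int main()
    {
      ordered_group_by({{1, 2.0}, {1, 3.0}, {2, 1.0}, {3, 4.0}, {3, 1.0}});
      return 0;
    }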
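Likewise, the TemporaryTableWithPartialSums pseudocode in the end_update() comment maps onto any unique-key lookup structure; this sketch uses std::map in place of the temporary table's unique index.

    #include <cstdio>
    #include <map>
    #include <vector>

    struct Row { int group_key; double value; };

    static void group_by_with_partial_sums(const std::vector<Row> &input)
    {
      /* The map key plays the role of the temp table's PRIMARY KEY over
         the GROUP BY columns; the mapped value is the partial sum. */
      std::map<int, double> tmp;

      for (const Row &r : input)            /* rows in arbitrary order */
        tmp[r.group_key]+= r.value;         /* INSERT ... ON DUPLICATE
                                               KEY UPDATE */

      /* Unlike OrderedGroupBy, the output order is whatever the index
         yields, so ORDER BY may still require a separate sort pass. */
      for (const auto &g : tmp)
        std::printf("group=%d sum=%g\n", g.first, g.second);
    }

    int main()
    {
      group_by_with_partial_sums({{3, 4.0}, {1, 2.0}, {3, 1.0}, {1, 3.0}});
      return 0;
    }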