sql/sql_sort_nest.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593

/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
   Copyright (c) 2008, 2017, MariaDB Corporation.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */


#include "mariadb.h"
#include "sql_select.h"
#include "opt_trace.h"

/**

INTRODUCTION

This file contains the functions to support the cost based ORDER BY with LIMIT
optimization.

The motivation behind this optimization is to shortcut the join execution
for queries having ORDER BY with LIMIT clause. In other words we would like to
avoid computing the entire join for queries having ORDER BY with LIMIT.

The main idea behind this optimization is to push the LIMIT to a partial join.
For pushing the LIMIT there is one pre-requisite and that is the partial
join MUST resolve the ORDER BY clause.

What does PUSHING THE LIMIT mean?

Pushing the limit to a partial join means that one would only read a fraction
of records of the prefix that are sorted in accordance with the ORDER BY
clause.


Let's say we have tables
  t1, t2, t3, t4 .............tk,tk+1.........................tn
  |<---------prefix------------>|<-------suffix--------------->|

and lets assume the prefix can resolve the ORDER BY clause and we can push
the LIMIT.

So considering the fraction of output we get in a general case with LIMIT is

            +-------------------------------------+
            |                                     |
fraction  = |  LIMIT / cardinality(t1,t2....tn)   |
            |                                     |
            +-------------------------------------+

We assume that the same fraction would be read for the prefix also, so the
records read for the prefix that can resolve the ORDER BY clause is:

              +--------------------------------------+
              |                                      |
records_read= | fraction * (cardinality(t1,t2....tk) |
              |                                      |
              +--------------------------------------+

              +--------------------------------------------------------------+
              |                                                              |
            = | LIMIT * (cardinality(t1,t2....tk) / cardinality(t1,t2....tn))|
              |                                                              |
              +--------------------------------------------------------------+

The LIMIT is pushed to all partial join orders enumerated by the join
planner that can resolve the ORDER BY clause. This is how we achieve a complete
cost based solution for ORDER BY with LIMIT optimization.


IMPLEMENTATION DETAILS

Let us divide the implementation details in 3 stages:

OPTIMIZATION STAGE

- We invoke the join planner to get an estimate of the cardinality of the
  join. This is needed for pushing the LIMIT in different partial plans
  which can resolve the ORDER BY clause.

- Join planner is invoked again to find the best join order for the tables
  inside the join. The join planner enumerated various join orders.
  For each partial plan we try to find out if it can resolve the ORDER BY
  clause or not.
  To resolve the ORDER BY clause, equalities from the WHERE clause are also
  considered.

- After a partial plan that can resolve ORDER BY clause is found, we push
  the LIMIT to the partial plan.

- Access methods that ensure pre-existing ordering are also taken into account
  inside the join planner. There can be indexes on the first non-const table
  that can resolve the ORDER BY clause. So we push the LIMIT to the first
  non-const table also.

- For each partial plan that can resolve the ORDER BY clause,
  we consider 2 cases
     1) Push the LIMIT at the current partial plan
     2) Push the LIMIT later

  This helps us to enumerate all plans where on can push LIMIT at different
  partial plans. Finally the plan with the lowest cost is picked by the join
  planner


COMPILATION STAGE

Preparation of Sort Nest

Let's say we have the best join order as:

  t1, t2, t3, t4 .............tk,tk+1.........................tn
  |<---------prefix------------>|<-------suffix--------------->|


The array of join_tab structures would look like

  t1, t2, t3, t4 .............tk, <sort nest>, tk+1.........................tn

After the best execution plan is picked by the join planner which requires
a nest for a prefix of tables that can resolve the ORDER BY clause, we want
to prepare the temporary table that would hold the result of materialization
of the tables in the prefix.

t1, t2, t3, t4..............tk ======> inner tables of the nest

To create the temporary table we need a list of Items which we want to store
inside the temporary table of the nest. Currently this list contains all
fields of the inner tables of the nest that have their bitmap read_set set.
With this list of Items we create the temporary table for the nest.
Also we create a list of Items for all the fields of the temporary table.
This list is needed for substitution of items that will be evaluated in the
POST ORDER BY context.


After the nest for the prefix is prepared, we extract a sub-condition which is
dependent on the inner tables of the nest from the WHERE clause. This
condition is then attached to the inner tables of the nest. This condition
would be evaluated before the ORDER BY clause is applied to the temporary
table of the nest.

We need to make substitution for items belonging to the inner tables of the
nest which will be evaluated in the POST ORDER BY context. These items need
to be substituted with the corresponding items of the temporary table
of the nest.


EXECUTION STAGE

Let's say we have the best join order as:

  t1, t2, t3, t4 .............tk,tk+1.........................tn
  |<---------prefix------------>|<-------suffix--------------->|

  The prefix are the inner table of the sort nest while the suffix are the
  tables outside the sort nest.

  As soon as the join execution starts, we compute the partial join for the
  tables in the prefix and store the result inside the temporary table
  for the sort nest.
  Then we sort the temporary table in accordance with the ORDER BY clause.
  After the sort is performed we read the records from the temporary
  table of the sort nest one by one and continue the join with the
  tables in the suffix.

  The join execution for this optimization can be split in 3 parts

  a) Materialize the prefix
                                     materialize
    t1, t2, t3, t4 .............tk  ============>  <sort nest>
    |<---------prefix------------>|

  b) Sort the <sort nest> in accordance with the ORDER BY clause

  c) Read records from the Filesort buffer one by one and continue join
     execution with the tables in the suffix

     <sort nest>, tk+1.........................tn
                  |<----------suffix----------->|

     The execution stops as soon as we get LIMIT records in the output.

*/

int test_if_order_by_key(JOIN *join, ORDER *order, TABLE *table, uint idx,
                         uint *used_key_parts= NULL);
COND* substitute_for_best_equal_field(THD *thd, JOIN_TAB *context_tab,
                                      COND *cond,
                                      COND_EQUAL *cond_equal,
                                      void *table_join_idx,
                                      bool do_substitution);
enum_nested_loop_state
end_nest_materialization(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
bool get_range_limit_read_cost(const JOIN_TAB *tab, const TABLE *table,
                               ha_rows table_records, uint keynr,
                               ha_rows rows_limit, double *read_time);
Item **get_sargable_cond(JOIN *join, TABLE *table);
void find_cost_of_index_with_ordering(THD *thd, const JOIN_TAB *tab,
                                      TABLE *table,
                                      ha_rows *select_limit_arg,
                                      double fanout, double est_best_records,
                                      uint nr, double *index_scan_time,
                                      Json_writer_object *trace_possible_key);


/*
  @brief
    Substitute field items of tables inside the nest with nest's field items.

  @param nest_info          Structure holding information about the nest

  @details
    Substitute field items of tables inside the sort-nest with sort-nest's
    field items. This is needed for expressions which would be
    evaluated in the post ORDER BY context.

    Example:

      SELECT * FROM t1, t2, t3
      WHERE t1.a = t2.a AND t2.b = t3.b AND t1.c > t3.c
      ORDER BY t1.a,t2.c
      LIMIT 5;

    Let's say in this case the join order is t1,t2,t3 and there is a sort-nest
    on the prefix t1,t2.

    Now looking at the WHERE clause, splitting it into 2 parts:
      (1) t2.b = t3.b AND t1.c > t3.c   ---> condition external to the nest
      (2) t1.a = t2.a                   ---> condition internal to the nest

    Now look at the condition in (1), this would be evaluated in the post
    ORDER BY context.

    So t2.b and t1.c should actually refer to the sort-nest's field items
    instead of field items of the tables inside the sort-nest.
    This is why we need to substitute field items of the tables inside the
    sort-nest with sort-nest's field items.

    For the condition in (2) there is no need for substitution as this
    condition is internal to the nest and would be evaluated before we
    do the sorting for the sort-nest.

    This function does the substitution for
      - WHERE clause
      - SELECT LIST
      - ORDER BY clause
      - ON expression
      - REF access items
*/

void
JOIN::substitute_base_with_nest_field_items(Mat_join_tab_nest_info *nest_info)
{
  List_iterator<Item> it(fields_list);
  Item *item, *new_item;

  /* Substituting SELECT list field items with sort-nest's field items */
  while ((item= it++))
  {
    if ((new_item= item->transform(thd,
                                   &Item::replace_with_nest_items, TRUE,
                                   (uchar *) nest_info)) != item)
    {
      new_item->name= item->name;
      thd->change_item_tree(it.ref(), new_item);
    }
    new_item->update_used_tables();
  }

  /* Substituting ORDER BY field items with sort-nest's field items */
  ORDER *ord;
  for (ord= order; ord ; ord=ord->next)
  {
    (*ord->item)= (*ord->item)->transform(thd,
                                          &Item::replace_with_nest_items,
                                          TRUE, (uchar *) nest_info);
    (*ord->item)->update_used_tables();
  }

  JOIN_TAB *end_tab= nest_info->nest_tab;
  uint i, j;
  for (i= const_tables + nest_info->number_of_tables(), j=0;
       i < top_join_tab_count; i++, j++)
  {
    JOIN_TAB *tab= end_tab + j;

    if (tab->type == JT_REF || tab->type == JT_EQ_REF ||
        tab->type == JT_REF_OR_NULL)
      substitute_ref_items(tab, nest_info);

    /* Substituting ON-EXPR field items with sort-nest's field items */
    if (*tab->on_expr_ref)
    {
      item= (*tab->on_expr_ref)->transform(thd,
                                           &Item::replace_with_nest_items,
                                           TRUE, (uchar *) nest_info);
      *tab->on_expr_ref= item;
      (*tab->on_expr_ref)->update_used_tables();
    }

    /*
      Substituting REF field items for SJM lookup with sort-nest's field items
    */
    if (tab->bush_children)
      substitutions_for_sjm_lookup(tab, nest_info);
  }

  /*
    This needs a pointer to the nest, so that this could be used in general
  */
  extract_condition_for_the_nest(nest_info);

  /* Substituting WHERE clause's field items with sort-nest's field items */
  if (conds)
  {
    conds= conds->transform(thd, &Item::replace_with_nest_items, TRUE,
                            (uchar *) nest_info);
    conds->update_used_tables();
  }
}


/*
  @brief
    Substitute ref access field items with nest's field items.

  @param tab                join tab structure having ref access
  @param nest_info          Structure holding information about the nest

*/

void JOIN::substitute_ref_items(JOIN_TAB *tab,
                                Mat_join_tab_nest_info* nest_info)
{
  Item *item;
  /* Substituting REF field items with sort-nest's field items */
  for (uint keypart= 0; keypart < tab->ref.key_parts; keypart++)
  {
    item= tab->ref.items[keypart]->transform(thd,
                                             &Item::replace_with_nest_items,
                                             TRUE, (uchar *) nest_info);
    if (item != tab->ref.items[keypart])
    {
      tab->ref.items[keypart]= item;
      Item *real_item= item->real_item();
      store_key *key_copy= tab->ref.key_copy[keypart];
      if (key_copy->type() == store_key::FIELD_STORE_KEY)
      {
        store_key_field *field_copy= ((store_key_field *)key_copy);
        DBUG_ASSERT(real_item->type() == Item::FIELD_ITEM);
        field_copy->change_source_field((Item_field *) real_item);
      }
    }
  }
}


/*
  @brief
    Substitute the left expression of the IN subquery with nest's field items.

  @param sjm_tab           SJM lookup join tab
  @param nest_info         Structure holding information about the nest

  @details
    This substitution is needed for SJM lookup when the SJM materialized
    table is outside the nest.

    For example:
      SELECT t1.a, t2.a
      FROM  t1, t2
      WHERE ot1.a in (SELECT it.b FROM it) AND ot1.b = t1.b
      ORDER BY t1.a desc, ot1.a desc
      LIMIT 5;

    Lets consider the join order here is t1, t2, <subquery2> and there is a
    nest on t1, t2. For <subquery2> we do SJM lookup.
    So for the SJM table there would be a ref access created on the condition
    t2.a=it.b. But as one can see table t2 is inside the nest and the
    condition t2.a=it.b can only be evaluated in the post nest creation
    context, so we need to substitute t2.a with the corresponding field item
    of the nest.

    For example:
      If we had a sort nest on t1,t2 the the condition t2.a = it.b will be
      evaluated in the POST ORDER BY context, so t2.a should refer to the
      field item of the sort nest.

*/

void JOIN::substitutions_for_sjm_lookup(JOIN_TAB *sjm_tab,
                                        Mat_join_tab_nest_info* nest_info)
{
  JOIN_TAB *tab= sjm_tab->bush_children->start;
  TABLE_LIST *emb_sj_nest= tab->table->pos_in_table_list->embedding;

  /*
    @see setup_sj_materialization_part1
  */
  while (!emb_sj_nest->sj_mat_info)
    emb_sj_nest= emb_sj_nest->embedding;
  SJ_MATERIALIZATION_INFO *sjm= emb_sj_nest->sj_mat_info;

  if (!sjm->is_sj_scan)
  {
    Item *left_expr= emb_sj_nest->sj_subq_pred->left_expr;
    left_expr= left_expr->transform(thd, &Item::replace_with_nest_items,
                                    TRUE, (uchar *) nest_info);
    left_expr->update_used_tables();
    emb_sj_nest->sj_subq_pred->left_expr= left_expr;
  }
}


/*
  @brief
    Extract from the WHERE clause the sub-condition for tables inside the nest.

  @param  nest_info               Structure holding information about the nest

  @details
    Extract the sub-condition from the WHERE clause that can be added to the
    tables inside the nest.

  Example
    SELECT * from t1,t2,t3
    WHERE t1.a > t2.a        (1)
     AND  t2.b = t3.b        (2)
    ORDER BY t1.a,t2.a
    LIMIT 5;

    let's say in this case the join order is t1,t2,t3 and there is a nest
    on t1,t2

    From the WHERE clause we would like to extract the condition that depends
    only on the inner tables of the nest. The condition (1) here satisfies
    this criteria so it would be extracted from the WHERE clause.
    The extracted condition here would be t1.a > t2.a.

    The extracted condition is stored inside the Mat_join_tab_nest_info
    structure.

    Also we remove the top level conjuncts of the WHERE clause that were
    present in the extracted condition.

    So after removal the final results would be:
      WHERE clause:    t2.b = t3.b         ----> condition external to the nest
      extracted cond:  t1.a > t2.a         ----> condition internal to the nest

    @note
      For the sort nest the sub-condition will be evaluated before the
      ORDER BY clause is applied.
*/

void JOIN::extract_condition_for_the_nest(Mat_join_tab_nest_info* nest_info)
{
  DBUG_ASSERT(nest_info);
  Item *orig_cond= conds;
  Item *extracted_cond;

  /*
    check_pushable_cond_extraction would set the flag NO_EXTRACTION_FL for
    all the predicates that cannot be added to the inner tables of the nest.
  */
  table_map nest_tables_map= nest_info->get_tables_map();
  conds->check_pushable_cond_extraction(
                                       &Item::pushable_cond_checker_for_tables,
                                        (uchar*)&nest_tables_map);

  /*
    build_pushable_condition would create a sub-condition that would be
    added to the inner tables of the nest. This may clone some predicates too.
  */
  extracted_cond= orig_cond->build_pushable_condition(thd, TRUE);

  if (extracted_cond)
  {
    if (extracted_cond->fix_fields_if_needed(thd, 0))
      return;
    extracted_cond->update_used_tables();
    /*
      Remove from the WHERE clause  the top level conjuncts that were
      extracted for the inner tables of the nest
    */
    orig_cond= remove_pushed_top_conjuncts(thd, orig_cond);
    nest_info->set_nest_cond(extracted_cond);
  }
  conds= orig_cond;
}


/*
  @brief
    Propagate the Multiple Equalities for all the ORDER BY items.

  @details
    Propagate the multiple equalities for the ORDER BY items.
    This is needed so that we can generate different join orders
    that would satisfy ordering after taking equality propagation
    into consideration.

    Example
      SELECT * FROM t1, t2, t3
      WHERE t1.a = t2.a AND t2.b = t3.a
      ORDER BY t2.a, t3.a
      LIMIT 10;

    Possible join orders which satisfy the ORDER BY clause and
    which we can get after equality propagation are:
        - t2, sort(t2), t3, t1          ---> substitute t3.a with t2.b
        - t2, sort(t2), t1, t3          ---> substitute t3.a with t2.b
        - t1, t3, sort(t1,t3), t2       ---> substitute t2.a with t1.a
        - t1, t2, sort(t1,t2), t3       ---> substitute t3.a with t2.b

    So with equality propagation for ORDER BY items, we can get more
    join orders that could satisfy the ORDER BY clause.
*/

void JOIN::propagate_equal_field_for_orderby()
{
  if (!sort_nest_possible)
    return;
  ORDER *ord;
  for (ord= order; ord; ord= ord->next)
  {
    if (optimizer_flag(thd, OPTIMIZER_SWITCH_ORDERBY_EQ_PROP) && cond_equal)
    {
      Item *item= ord->item[0];
      /*
        TODO: equality substitution in the context of ORDER BY is
        sometimes allowed when it is not allowed in the general case.
        We make the below call for its side effect: it will locate the
        multiple equality the item belongs to and set item->item_equal
        accordingly.
      */
      (void)item->propagate_equal_fields(thd,
                                         Value_source::
                                         Context_identity(),
                                         cond_equal);
    }
  }
}


/*
  @brief
    Check whether ORDER BY items can be evaluated for a given prefix

  @param previous_tables  table_map of all the tables in the prefix
                          of the current partial plan

  @details
    Here we walk through the ORDER BY items and check if the prefix of the
    join resolves the ordering.
    Also we look at the multiple equalities for each item in the ORDER BY list
    to see if the ORDER BY items can be resolved by the given prefix.

    Example
      SELECT * FROM t1, t2, t3
      WHERE t1.a = t2.a AND t2.b = t3.a
      ORDER BY t2.a, t3.a
      LIMIT 10;

    Let's say the given prefix is table {t1,t3}, then this function would
    return TRUE because there is an equality condition t2.a=t1.a ,
    so t2.a can be resolved with t1.a. Hence the given prefix {t1,t3} would
    resolve the ORDER BY clause.

  @retval
   TRUE   ordering can be evaluated by the given prefix
   FALSE  otherwise

*/

bool JOIN::check_join_prefix_resolves_ordering(table_map previous_tables)
{
  DBUG_ASSERT(order);
  ORDER *ord;
  for (ord= order; ord; ord= ord->next)
  {
    Item *order_item= ord->item[0];
    table_map order_tables=order_item->used_tables();
    if (!(order_tables & ~previous_tables) ||
         (order_item->excl_dep_on_tables(previous_tables, FALSE)))
      continue;
    else
      return FALSE;
  }
  return TRUE;
}


/*
  @brief
    Check if the best plan has a sort-nest or not.

  @param
    n_tables[out]             set to the number of tables inside the sort-nest
    nest_tables_map[out]      map of tables inside the sort-nest
  @details
    This function walks through the JOIN::best_positions array
    which holds the best plan and checks if there is prefix for
    which the join planner had picked a sort-nest.

    Also this function computes a table map for tables that are inside the
    sort-nest

  @retval
    TRUE  sort-nest present
    FALSE no sort-nest present
*/

bool JOIN::check_if_sort_nest_present(uint* n_tables,
                                      table_map *nest_tables_map)
{
  if (!sort_nest_possible)
    return FALSE;

  uint tablenr;
  table_map nest_tables= 0;
  uint tables= 0;
  for (tablenr=const_tables ; tablenr < table_count ; tablenr++)
  {
    tables++;
    POSITION *pos= &best_positions[tablenr];
    if (pos->sj_strategy == SJ_OPT_MATERIALIZE ||
        pos->sj_strategy == SJ_OPT_MATERIALIZE_SCAN)
    {
      SJ_MATERIALIZATION_INFO *sjm= pos->table->emb_sj_nest->sj_mat_info;
      for (uint j= 0; j < sjm->tables; j++)
      {
        JOIN_TAB *tab= (pos+j)->table;
        nest_tables|= tab->table->map;
      }
      tablenr+= (sjm->tables-1);
    }
    else
      nest_tables|= pos->table->table->map;

    if (pos->sort_nest_operation_here)
    {
      *n_tables= tables;
      *nest_tables_map= nest_tables;
      return TRUE;
    }
  }
  return FALSE;
}


/*
  @brief
    Create a sort nest info structure

  @param n_tables          number of tables inside the sort-nest
  @param nest_tables_map   map of top level tables inside the sort-nest

  @details
    This sort-nest structure would hold all the information about the
    sort-nest.

  @retval
    FALSE     successful in creating the sort-nest info structure
    TRUE      error
*/

bool JOIN::create_sort_nest_info(uint n_tables, table_map nest_tables_map)
{
  if (!(sort_nest_info= new Sort_nest_info(this, n_tables, nest_tables_map)))
    return TRUE;
  return FALSE;
}


void JOIN::substitute_best_fields_for_order_by_items()
{
  ORDER *ord;
  /*
    Substitute the ORDER by items with the best field so that equality
    propagation considered during best_access_path can be used.
  */
  for (ord= order; ord; ord=ord->next)
  {
    Item *item= ord->item[0];
    item= substitute_for_best_equal_field(thd, NO_PARTICULAR_TAB, item,
                                          cond_equal,
                                          map2table, true);
    item->update_used_tables();
    ord->item[0]= item;
  }
}


/*
  @brief
    Make the sort-nest.

  @param
    nest_info               Structure holding information about the nest

  @details
    Setup execution structures for sort-nest materialization:
      - Create the list of Items of the inner tables of the sort-nest
        that are needed for the post ORDER BY computations
      - Create the materialization temporary table for the sort-nest

    This function fills up the Sort_nest_info structure

  @retval
    TRUE   : In case of error
    FALSE  : Nest creation successful
*/

bool JOIN::make_sort_nest(Mat_join_tab_nest_info *nest_info)
{
  Field_iterator_table field_iterator;

  JOIN_TAB *j;
  JOIN_TAB *tab;

  if (unlikely(thd->trace_started()))
    add_sort_nest_tables_to_trace(this, nest_info);

  /*
    List of field items of the tables inside the sort-nest is created for
    the field items that are needed to be stored inside the temporary table
    of the sort-nest. Currently Item_field objects are created for the tables
    inside the sort-nest for all the  fields which have bitmap read_set
    set for them.

    TODO varun:
      An improvement would be if to remove the fields from this
      list that are completely internal to the nest because such
      fields would not be used in computing expression in the post
      ORDER BY context
  */

  for (j= join_tab + const_tables; j < nest_info->nest_tab; j++)
  {
    if (!j->bush_children)
    {
      TABLE *table= j->table;
      field_iterator.set_table(table);
      for (; !field_iterator.end_of_fields(); field_iterator.next())
      {
        Field *field= field_iterator.field();
        if (!bitmap_is_set(table->read_set, field->field_index))
          continue;
        Item *item;
        if (!(item= field_iterator.create_item(thd)))
          return TRUE;
        nest_info->nest_base_table_cols.push_back(item, thd->mem_root);
      }
    }
    else
    {
      TABLE_LIST *emb_sj_nest;
      JOIN_TAB *child_tab= j->bush_children->start;
      emb_sj_nest= child_tab->table->pos_in_table_list->embedding;
      /*
        @see setup_sj_materialization_part1
      */
      while (!emb_sj_nest->sj_mat_info)
        emb_sj_nest= emb_sj_nest->embedding;
      Item_in_subselect *item_sub= emb_sj_nest->sj_subq_pred;
      SELECT_LEX *subq_select= item_sub->unit->first_select();
      List_iterator_fast<Item> li(subq_select->item_list);
      Item *item;
      while((item= li++))
        nest_info->nest_base_table_cols.push_back(item, thd->mem_root);
    }
  }

  tab= nest_info->nest_tab;
  DBUG_ASSERT(!tab->table);

  uint sort_nest_elements= nest_info->nest_base_table_cols.elements;
  nest_info->tmp_table_param.init();
  nest_info->tmp_table_param.bit_fields_as_long= TRUE;
  nest_info->tmp_table_param.field_count= sort_nest_elements;
  nest_info->tmp_table_param.force_not_null_cols= FALSE;

  const LEX_CSTRING order_nest_name= { STRING_WITH_LEN("sort-nest") };
  if (!(tab->table= create_tmp_table(thd, &nest_info->tmp_table_param,
                                     nest_info->nest_base_table_cols,
                                     (ORDER*) 0,
                                     FALSE /* distinct */,
                                     0, /*save_sum_fields*/
                                     thd->variables.option_bits |
                                     TMP_TABLE_ALL_COLUMNS,
                                     HA_POS_ERROR /*rows_limit */,
                                     &order_nest_name)))
    return TRUE; /* purecov: inspected */

  tab->table->map= nest_info->get_tables_map();
  nest_info->table= tab->table;
  tab->type= JT_ALL;
  tab->table->reginfo.join_tab= tab;

  /*
    The list of temp table items created here, these are needed for the
    substitution for items that would be evaluated in POST SORT NEST context
  */
  field_iterator.set_table(tab->table);
  List_iterator_fast<Item> li(nest_info->nest_base_table_cols);
  Item *item;
  for (; !field_iterator.end_of_fields() && (item= li++);
       field_iterator.next())
  {
    Field *field= field_iterator.field();
    Item *nest_item;
    if (!(nest_item= new (thd->mem_root)Item_temptable_field(thd, field)))
      return TRUE;
    Item_pair *tmp_field= new Item_pair(item, nest_item);
    nest_info->mapping_of_items.push_back(tmp_field, thd->mem_root);
  }

  /* Setting up the scan on the temp table */
  tab->read_first_record= join_init_read_record;
  tab->read_record.read_record_func= rr_sequential;
  tab[-1].next_select= end_nest_materialization;
  DBUG_ASSERT(!nest_info->is_materialized());

  return FALSE;
}


/*
  @brief
    Calculate the cost of adding a sort-nest.

  @param
    join_record_count   the cardinality of the partial join
    idx                 position of the joined table in the partial plan
    rec_len             estimate of length of the record in the sort-nest table

  @details
    The calculation for the cost of the sort-nest is done here, the cost
    includes three components
      1) Filling the sort-nest table
      2) Sorting the sort-nest table
      3) Reading from the sort-nest table
*/

double JOIN::sort_nest_oper_cost(double join_record_count, uint idx,
                                 ulong rec_len)
{
  double cost= 0;
  set_if_bigger(join_record_count, 1);
  /*
    The sort-nest table is not created for sorting when one does sorting
    on the first non-const table. So for this case we don't need to add
    the cost of filling the table.
  */
  if (idx != const_tables)
    cost=  get_tmp_table_write_cost(thd, join_record_count, rec_len) *
           join_record_count; // cost to fill temp table

  // cost to perform  sorting
  cost+= get_tmp_table_lookup_cost(thd, join_record_count, rec_len) +
         (join_record_count == 0 ? 0 :
          join_record_count * log2 (join_record_count)) *
         SORT_INDEX_CMP_COST;

  /*
    cost for scanning the temp table.
    Picked this cost from get_delayed_table_estimates()
  */
  double data_size= COST_MULT(join_record_count * fraction_output_for_nest,
                              rec_len);
  cost+= data_size/IO_SIZE + 2;

  return cost;
}


/*
  @brief
    Calculate the number of records that would be read from the sort-nest.

  @param n_tables          number of tables in the sort-nest

  @details
    The number of records read from the sort-nest would be:

      cardinality(join of inner table of nest) * selectivity_of_limit;

    Here selectivity of limit is how many records we would expect in the
    output.
      selectivity_of_limit= limit / cardinality(join of all tables)

    This number of records is what we would also see in the EXPLAIN output
    for the sort-nest in the columns "rows".

  @retval Number of records that the optimizer expects to be read from the
          sort-nest
*/

double JOIN::calculate_record_count_for_sort_nest(uint n_tables)
{
  double sort_nest_records= 1, record_count;
  JOIN_TAB *tab= join_tab + const_tables;
  for (uint j= 0; j < n_tables ; j++, tab++)
  {
    record_count= tab->records_read * tab->cond_selectivity;
    sort_nest_records= COST_MULT(sort_nest_records, record_count);
  }
  sort_nest_records= sort_nest_records * fraction_output_for_nest;
  set_if_bigger(sort_nest_records, 1);
  return sort_nest_records;
}


/*
  @brief
    Find all keys that can resolve the ORDER BY clause for a table

  @details
    This function sets the flag TABLE::keys_with_ordering with all the
    indexes of a table that can resolve the ORDER BY clause.
*/

void JOIN_TAB::find_keys_that_can_achieve_ordering()
{
  if (!join->sort_nest_possible)
    return;

  table->keys_with_ordering.clear_all();
  for (uint index= 0; index < table->s->keys; index++)
  {
    if (table->keys_in_use_for_query.is_set(index) &&
        test_if_order_by_key(join, join->order, table, index))
      table->keys_with_ordering.set_bit(index);
  }
  table->keys_with_ordering.intersect(table->keys_in_use_for_order_by);
}


/*
  @brief
    Checks if the given prefix needs Filesort for ordering.

  @param
    idx            position of the joined table in the partial plan
    index_used     >=0 number of the index that is picked as best access
                   -1  no index access chosen

  @details
    Here we check if a given prefix requires Filesort or index on the
    first non-const table to resolve the ORDER BY clause.

  @retval
    TRUE   Filesort is needed
    FALSE  index present that satisfies the ordering
*/

bool JOIN_TAB::needs_filesort(uint idx, int index_used)
{
  if (idx != join->const_tables)
    return TRUE;

  return !check_if_index_satisfies_ordering(index_used);
}


/*
  @brief
    Find a cheaper index that resolves ordering on the first non-const table.

  @param
    tab                       joined table
    read_time [out]           cost for the best index picked if cheaper
    records   [out]           estimate of records going to be accessed by the
                              index
    index_used                >=0 number of index used for best access
                              -1  no index used for best access
    idx                       position of the joined table in the partial plan

  @details
    Here we try to walk through all the indexes for the first non-const table
    of a given prefix.
    From these indexes we are only interested in the indexes that can resolve
    the ORDER BY clause as we want to shortcut the join execution for ORDER BY
    LIMIT optimization.

    For each index we are interested in we try to estimate the records we
    have to read to ensure #limit records in the join output.

    Then with this estimate of records we calculate the cost of using an index
    and try to find the best index for access.
    If the best index found from here has a lower cost than the best access
    found in best_access_path, we switch the access to use the index found
    here.

  @retval
    -1     no cheaper index found for ordering
    >=0    cheaper index found for ordering
*/

int get_best_index_for_order_by_limit(JOIN_TAB *tab,
                                      ha_rows select_limit_arg,
                                      double *read_time,
                                      double *records,
                                      int index_used,
                                      uint idx)
{
  double cardinality;
  JOIN *join= tab->join;
  cardinality= join->cardinality_estimate;
  /**
    Cases when there is no need to consider indexes that can resolve the
    ORDER BY clause

    1) Table in consideration should be the first non-const table.
    2) Query does not use the ORDER BY LIMIT optimization with sort_nest
       @see sort_nest_allowed
    3) Join planner is run to get an estimate of cardinality for a join
    4) No index present that can resolve the ORDER BY clause
  */

  if (idx != join->const_tables ||                             // (1)
      !join->sort_nest_possible ||                             // (2)
      join->get_cardinality_estimate ||                        // (3)
      tab->table->keys_with_ordering.is_clear_all())           // (4)
    return -1;

  THD *thd= join->thd;
  Json_writer_object trace_index_for_ordering(thd);
  TABLE *table= tab->table;
  double save_read_time= *read_time;
  double save_records= *records;
  double est_records= *records;
  double fanout= MY_MAX(1.0, cardinality / est_records);
  int best_index=-1;
  trace_index_for_ordering.add("rows_estimation", est_records);
  Json_writer_array considered_indexes(thd, "considered_indexes");

  for (uint idx= 0 ; idx < table->s->keys; idx++)
  {
    ha_rows select_limit= select_limit_arg;
    if (!table->keys_with_ordering.is_set(idx))
      continue;
    Json_writer_object possible_key(thd);
    double index_scan_time;
    possible_key.add("index", table->key_info[idx].name);
    find_cost_of_index_with_ordering(thd, tab, table, &select_limit,
                                     fanout, est_records,
                                     idx, &index_scan_time,
                                     &possible_key);

    if (index_scan_time < *read_time)
    {
      best_index= idx;
      *read_time= index_scan_time;
      *records= select_limit;
    }
  }
  considered_indexes.end();

  if (unlikely(thd->trace_started()))
  {
    trace_index_for_ordering.add("best_index",
                                 static_cast<ulonglong>(best_index));
    trace_index_for_ordering.add("records", *records);
    trace_index_for_ordering.add("best_cost", *read_time);
  }

  /*
    If an index already found satisfied the ordering and we picked an index
    for which we choose to do index scan then revert the cost and stick
    with the access picked first.
    Index scan would not help in comparison with ref access.
  */
  if (tab->check_if_index_satisfies_ordering(index_used))
  {
    if (!table->quick_keys.is_set(static_cast<uint>(index_used)))
    {
      best_index= -1;
      *records= save_records;
      *read_time= save_read_time;
    }
  }
  return best_index;
}


/*
  @brief
    Disallow join buffering for tables that are read after sorting is done.

  @param
    tab                      table to check if join buffering is allowed or not

  @details
    Disallow join buffering for all the tables at the top level that are read
    after sorting is done.
    There are 2 cases
      1) Sorting on the first non-const table
         For all the tables join buffering is not allowed
      2) Sorting on a prefix of the join with a sort-nest
         For the tables inside the sort-nest join buffering is allowed but
         for tables outside the sort-nest join buffering is not allowed

    Also for SJM table that come after the sort-nest, join buffering is allowed
    for the inner tables of the SJM.

  @retval
    TRUE   Join buffering is allowed
    FALSE  Otherwise
*/

bool JOIN::is_join_buffering_allowed(JOIN_TAB *tab)
{
  if (!sort_nest_info)
    return TRUE;

  // no need to disable join buffering for the inner tables of SJM
  if (tab->bush_root_tab)
    return TRUE;

  if (tab->table->map & sort_nest_info->get_tables_map())
    return TRUE;
  return FALSE;
}


/*
  @brief
    Check if an index on a table resolves the ORDER BY clause.

  @param
    index_used                  index to be checked

  @retval
    TRUE  index resolves the ORDER BY clause
    FALSE otherwise
*/

bool JOIN_TAB::check_if_index_satisfies_ordering(int index_used)
{
  /*
    index_used is set to
    -1          for Table Scan
    MAX_KEY     for HASH JOIN
    >=0         for ref/range/index access
  */
  if (index_used < 0 || index_used == MAX_KEY)
    return FALSE;

  if (table->keys_with_ordering.is_set(static_cast<uint>(index_used)))
    return TRUE;
  return FALSE;
}


/*
  @brief
    Set up range scan for the table.

  @param
    tab                    table for which range scan needs to be setup
    idx                    index for which range scan needs to created
    records                estimate of records to be read with range scan

  @details
    Range scan is setup here for an index that can resolve the ORDER BY clause.
    There are 2 cases here:
      1) If the range scan is on the same index for which we created
         QUICK_SELECT when we ran the range optimizer earlier, then we try
         to reuse it.
      2) The range scan is on a different index then we need to create
         QUICK_SELECT for the new key. This is done by running the range
         optimizer again.

    Also here we take into account if the ordering is in reverse direction.
    For DESCENDING we try to reverse the QUICK_SELECT.

  @note
    This is done for the ORDER BY LIMIT optimization. We try to force creation
    of range scan for an index that the join planner picked for us. Also here
    we reverse the range scan if the ordering is in reverse direction.
*/

void JOIN::setup_range_scan(JOIN_TAB *tab, uint idx, double records)
{
  SQL_SELECT *sel= NULL;
  Item **sargable_cond= get_sargable_cond(this, tab->table);
  int err, rc, direction;
  uint used_key_parts;
  key_map keymap_for_range;
  Json_writer_array forcing_range(thd, "range_scan_for_order_by_limit");

  sel= make_select(tab->table, const_table_map, const_table_map,
                   *sargable_cond, (SORT_INFO*) 0, 1, &err);
  if (!sel)
    goto use_filesort;

  /*
    If the table already had a range access, check if it is the same as the
    one we wanted to create range scan for, if yes don't run the range
    optimizer again.
  */

  if (!(tab->quick && tab->quick->index == idx))
  {
    /* Free the QUICK_SELECT that was built earlier. */
    delete tab->quick;
    tab->quick= NULL;

    keymap_for_range.clear_all();  // Force the creation of quick select
    keymap_for_range.set_bit(idx); // only for index using range access.

    rc= sel->test_quick_select(thd, keymap_for_range,
                               (table_map) 0,
                               (ha_rows) HA_POS_ERROR,
                               true, false, true, true);
    if (rc <= 0)
      goto use_filesort;
  }
  else
    sel->quick= tab->quick;

  direction= test_if_order_by_key(this, order, tab->table, idx,
                                  &used_key_parts);

  if (direction == -1)
  {
    /*
      QUICK structure is reversed here as the ordering is in DESC order
    */
    QUICK_SELECT_I *reverse_quick;
    if (sel && sel->quick)
    {
      reverse_quick= sel->quick->make_reverse(used_key_parts);
      if (!reverse_quick)
        goto use_filesort;
      sel->set_quick(reverse_quick);
    }
  }

  tab->quick= sel->quick;

  /*
    Fix for explain, the records here should be set to the value
    which was stored in the JOIN::best_positions object. This is needed
    because the estimate of rows to be read for the first non-const table had
    taken selectivity of limit into account.
  */
  if (sort_nest_possible && records < tab->quick->records)
    tab->quick->records= records;

  sel->quick= NULL;

use_filesort:
  delete sel;
}


/*
  @brief
    Setup range/index scan to resolve ordering on the first non-const table.

  @param
    index        index for which index scan or range scan needs to be setup

  @details
    Here we try to prepare range scan or index scan for an index that can be
    used to resolve the ORDER BY clause. This is used only for the first
    non-const table of the join.

    For range scan
      There is a separate call to setup_range_scan, where the QUICK_SELECT is
      created for range access. In case we are not able to create a range
      access, we switch back to use Filesort on the first table.
      see @setup_range_scan
    For index scan
      We just store the index in Sort_nest_info::index_used.
*/

void JOIN::setup_index_use_for_ordering(int index)
{
  DBUG_ASSERT(sort_nest_info->index_used == -1);

  sort_nest_info->nest_tab= join_tab + const_tables;
  POSITION *cur_pos= &best_positions[const_tables];
  JOIN_TAB *tab= cur_pos->table;

  if (cur_pos->key)
    return;

  index= (index == -1) ?
         (cur_pos->table->quick ? cur_pos->table->quick->index : -1) :
          index;

  if (tab->check_if_index_satisfies_ordering(index))
  {
    if (tab->table->quick_keys.is_set(index))
    {
      // Range scan
      setup_range_scan(tab, index, cur_pos->records_read);
      sort_nest_info->index_used= -1;
    }
    else
    {
       // Index scan
      if (tab->quick)
      {
        delete tab->quick;
        tab->quick= NULL;
      }
      sort_nest_info->index_used= index;
    }
  }
}


/*
  @brief
    Get index used to access the table, if present

  @retval
    >=0 index used to access the table
    -1  no index used to access table, probably table scan is done
*/

int JOIN_TAB::get_index_on_table()
{
  int idx= -1;

  if (type == JT_REF  || type == JT_EQ_REF || type == JT_REF_OR_NULL)
    idx= ref.key;
  else if (type == JT_NEXT)
    idx= index;
  else if (type == JT_ALL && select && select->quick)
    idx= select->quick->index;
  return idx;
}


/*
  @brief
    Calculate the selectivity of limit.

  @details
    The selectivity of limit is calculated as
        selecitivity_of_limit=  rows_in_limit / cardinality_of_join

  @note
    The selectivity that we get is used to make an estimate of rows
    that we would read from the partial join of the tables inside the
    sort-nest.
*/

void JOIN::set_fraction_output_for_nest()
{
  if (sort_nest_possible && !get_cardinality_estimate)
  {
    fraction_output_for_nest= select_limit < cardinality_estimate ?
                              select_limit / cardinality_estimate :
                              1.0;
    if (unlikely(thd->trace_started()))
    {
      Json_writer_object trace_limit(thd);
      trace_limit.add("cardinality", cardinality_estimate);
      trace_limit.add("selectivity_of_limit", fraction_output_for_nest*100);
    }
  }
}


/*
  @brief
    Sort nest is allowed when one can shortcut the join execution.

  @details
    For all the operations where one requires entire join computation to be
    done first and then apply the operation on the join output,
    such operations can't make use of the sort-nest.
    So this function disables the use of sort-nest for such operations.

    Sort nest is not allowed for
    1) No ORDER BY clause
    2) Only constant tables in the join
    3) DISTINCT CLAUSE
    4) GROUP BY CLAUSE
    5) HAVING clause
    6) Aggregate Functions
    7) Window Functions
    8) Using ROLLUP
    9) Using SQL_BUFFER_RESULT
    10) LIMIT is absent
    11) Only SELECT queries can use the sort nest

  @retval
   TRUE     Sort-nest is allowed
   FALSE    Otherwise

*/

bool JOIN::sort_nest_allowed()
{
  return optimizer_flag(thd, OPTIMIZER_SWITCH_COST_BASED_ORDER_BY_LIMIT) &&
         order &&
         !(const_tables == table_count ||
           (select_distinct || group_list) ||
           having  ||
           MY_TEST(select_options & OPTION_BUFFER_RESULT) ||
           (rollup.state != ROLLUP::STATE_NONE && select_distinct) ||
           select_lex->window_specs.elements > 0 ||
           select_lex->agg_func_used() ||
           select_limit == HA_POS_ERROR ||
           thd->lex->sql_command != SQLCOM_SELECT ||
           select_lex->uncacheable & UNCACHEABLE_DEPENDENT ||
           MY_TEST(select_options & SELECT_STRAIGHT_JOIN));
}


/*
  @brief
    Consider adding a sort-nest on a prefix of the join

  @param prefix_tables           map of all the tables in the prefix

  @details
    This function is used during the join planning stage, where the join
    planner decides if it can add a sort-nest on a prefix of a join.
    The join planner does not add the sort-nest in the following cases:
      1) Queries where adding a sort-nest is not possible.
         see @sort_nest_allowed
      2) Join planner is run to get the cardinality of the join
      3) All inner tables of an outer join are inside the nest or outside
      4) All inner tables of a semi-join are inside the nest or outside
      5) Given prefix cannot resolve the ORDER BY clause

  @retval
    TRUE   sort-nest can be added on a prefix of a join
    FALSE  otherwise
*/

bool JOIN::consider_adding_sort_nest(table_map prefix_tables)
{
  if (!sort_nest_possible ||                        // (1)
      get_cardinality_estimate ||                   // (2)
      cur_embedding_map ||                          // (3)
      cur_sj_inner_tables)                          // (4)
    return FALSE;

  return check_join_prefix_resolves_ordering(prefix_tables);  // (5)
}


/*
  @brief
    Check if indexes on a table are allowed to resolve the ORDER BY clause

  @param
    idx                       position of the table in the partial plan

  @retval
    TRUE    Indexes are allowed to resolve ORDER BY clause
    FALSE   Otherwise

*/

bool JOIN::is_index_with_ordering_allowed(uint idx)
{
  /*
    An index on a table can allowed to resolve ordering in these cases:
      1) Table should be the first non-const table
      2) Query that allows the ORDER BY LIMIT optimization.
         @see sort_nest_allowed
      3) Join planner is not run to get the estimate of cardinality
  */
  return  idx == const_tables &&                                   // (1)
          sort_nest_possible &&                                    // (2)
          !get_cardinality_estimate;                               // (3)
}

/*
  @brief
    Find the cost to access a table with an index that can resolve ORDER BY.

  @param
    THD                                  thread structure
    tab                                  join_tab structure for joined table
    table                                first non-const table
    select_limit_arg                     limit for the query
    fanout                               fanout of the join
    est_best_records                     estimate of records for best access
    nr                                   index number
    index_scan_time[out]                 cost to access the table with the
                                         the index
*/

void find_cost_of_index_with_ordering(THD *thd, const JOIN_TAB *tab,
                                      TABLE *table,
                                      ha_rows *select_limit_arg,
                                      double fanout, double est_best_records,
                                      uint nr, double *index_scan_time,
                                      Json_writer_object *trace_possible_key)
{
  KEY *keyinfo= table->key_info + nr;
  ha_rows select_limit= *select_limit_arg;
  double rec_per_key;
  double table_records= table->stat_records();
  /*
    If tab=tk is not the last joined table tn then to get first
    L records from the result set we can expect to retrieve
    only L/fanout(tk,tn) where fanout(tk,tn) says how many
    rows in the record set on average will match each row tk.
    Usually our estimates for fanouts are too pessimistic.
    So the estimate for L/fanout(tk,tn) will be too optimistic
    and as result we'll choose an index scan when using ref/range
    access + filesort will be cheaper.
  */
  select_limit= (ha_rows) (select_limit < fanout ?
                           1 : select_limit/fanout);

  /*
    refkey_rows_estimate is E(#rows) produced by the table access
    strategy that was picked without regard to ORDER BY ... LIMIT.

    It will be used as the source of selectivity data.
    Use table->cond_selectivity as a better estimate which includes
    condition selectivity too.
  */
  {
    // we use MIN(...), because "Using LooseScan" queries have
    // cond_selectivity=1 while refkey_rows_estimate has a better
    // estimate.
    est_best_records= MY_MIN(est_best_records,
                             ha_rows(table_records * table->cond_selectivity));
  }

  /*
    We assume that each of the tested indexes is not correlated
    with ref_key. Thus, to select first N records we have to scan
    N/selectivity(ref_key) index entries.
    selectivity(ref_key) = #scanned_records/#table_records =
    refkey_rows_estimate/table_records.
    In any case we can't select more than #table_records.
    N/(refkey_rows_estimate/table_records) > table_records
    <=> N > refkey_rows_estimate.
   */

  if (select_limit > est_best_records)
    select_limit= table_records;
  else
    select_limit= (ha_rows) (select_limit *
                             (double) table_records /
                              est_best_records);

  rec_per_key= keyinfo->actual_rec_per_key(keyinfo->user_defined_key_parts-1);
  set_if_bigger(rec_per_key, 1);
  /*
    Here we take into account the fact that rows are
    accessed in sequences rec_per_key records in each.
    Rows in such a sequence are supposed to be ordered
    by rowid/primary key. When reading the data
    in a sequence we'll touch not more pages than the
    table file contains.
    TODO. Use the formula for a disk sweep sequential access
    to calculate the cost of accessing data rows for one
    index entry.
  */
  *index_scan_time= select_limit/rec_per_key *
                    MY_MIN(rec_per_key, table->file->scan_time());

  if (unlikely(thd->trace_started()))
  {
    trace_possible_key->add("updated_limit", select_limit);
    trace_possible_key->add("index_scan_time", *index_scan_time);
  }

  double range_scan_time;
  if (get_range_limit_read_cost(tab, table, table_records, nr,
                                select_limit, &range_scan_time))
  {
    trace_possible_key->add("range_scan_time", range_scan_time);
    if (range_scan_time < *index_scan_time)
      *index_scan_time= range_scan_time;
  }
  *select_limit_arg= select_limit;
}