1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
|
/*-------------------------------------------------------------------------
*
* network_selfuncs.c
* Functions for selectivity estimation of inet/cidr operators
*
* This module provides estimators for the subnet inclusion and overlap
* operators. Estimates are based on null fraction, most common values,
* and histogram of inet/cidr columns.
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/network_selfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <math.h>
#include "access/htup_details.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_statistic.h"
#include "utils/builtins.h"
#include "utils/inet.h"
#include "utils/lsyscache.h"
#include "utils/selfuncs.h"
/* Default selectivity for the inet overlap operator */
#define DEFAULT_OVERLAP_SEL 0.01
/* Default selectivity for the various inclusion operators */
#define DEFAULT_INCLUSION_SEL 0.005
/* Default selectivity for specified operator */
#define DEFAULT_SEL(operator) \
((operator) == OID_INET_OVERLAP_OP ? \
DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL)
/* Maximum number of items to consider in join selectivity calculations */
#define MAX_CONSIDERED_ELEMS 1024
static Selectivity networkjoinsel_inner(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2);
static Selectivity networkjoinsel_semi(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2);
static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues);
static Selectivity inet_hist_value_sel(Datum *values, int nvalues,
Datum constvalue, int opr_codenum);
static Selectivity inet_mcv_join_sel(Datum *mcv1_values,
float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values,
float4 *mcv2_numbers, int mcv2_nvalues, Oid operator);
static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers,
int mcv_nvalues, Datum *hist_values, int hist_nvalues,
int opr_codenum);
static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values,
int hist1_nvalues,
Datum *hist2_values, int hist2_nvalues,
int opr_codenum);
static Selectivity inet_semi_join_sel(Datum lhs_value,
bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
bool hist_exists, Datum *hist_values, int hist_nvalues,
double hist_weight,
FmgrInfo *proc, int opr_codenum);
static int inet_opr_codenum(Oid operator);
static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum);
static int inet_masklen_inclusion_cmp(inet *left, inet *right,
int opr_codenum);
static int inet_hist_match_divider(inet *boundary, inet *query,
int opr_codenum);
/*
* Selectivity estimation for the subnet inclusion/overlap operators
*/
Datum
networksel(PG_FUNCTION_ARGS)
{
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
int varRelid = PG_GETARG_INT32(3);
VariableStatData vardata;
Node *other;
bool varonleft;
Selectivity selec,
mcv_selec,
non_mcv_selec;
Datum constvalue;
Form_pg_statistic stats;
AttStatsSlot hslot;
double sumcommon,
nullfrac;
FmgrInfo proc;
/*
* If expression is not (variable op something) or (something op
* variable), then punt and return a default estimate.
*/
if (!get_restriction_variable(root, args, varRelid,
&vardata, &other, &varonleft))
PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
/*
* Can't do anything useful if the something is not a constant, either.
*/
if (!IsA(other, Const))
{
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
}
/* All of the operators handled here are strict. */
if (((Const *) other)->constisnull)
{
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(0.0);
}
constvalue = ((Const *) other)->constvalue;
/* Otherwise, we need stats in order to produce a non-default estimate. */
if (!HeapTupleIsValid(vardata.statsTuple))
{
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
}
stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
nullfrac = stats->stanullfrac;
/*
* If we have most-common-values info, add up the fractions of the MCV
* entries that satisfy MCV OP CONST. These fractions contribute directly
* to the result selectivity. Also add up the total fraction represented
* by MCV entries.
*/
fmgr_info(get_opcode(operator), &proc);
mcv_selec = mcv_selectivity(&vardata, &proc, constvalue, varonleft,
&sumcommon);
/*
* If we have a histogram, use it to estimate the proportion of the
* non-MCV population that satisfies the clause. If we don't, apply the
* default selectivity to that population.
*/
if (get_attstatsslot(&hslot, vardata.statsTuple,
STATISTIC_KIND_HISTOGRAM, InvalidOid,
ATTSTATSSLOT_VALUES))
{
int opr_codenum = inet_opr_codenum(operator);
/* Commute if needed, so we can consider histogram to be on the left */
if (!varonleft)
opr_codenum = -opr_codenum;
non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues,
constvalue, opr_codenum);
free_attstatsslot(&hslot);
}
else
non_mcv_selec = DEFAULT_SEL(operator);
/* Combine selectivities for MCV and non-MCV populations */
selec = mcv_selec + (1.0 - nullfrac - sumcommon) * non_mcv_selec;
/* Result should be in range, but make sure... */
CLAMP_PROBABILITY(selec);
ReleaseVariableStats(vardata);
PG_RETURN_FLOAT8(selec);
}
/*
* Join selectivity estimation for the subnet inclusion/overlap operators
*
* This function has the same structure as eqjoinsel() in selfuncs.c.
*
* Throughout networkjoinsel and its subroutines, we have a performance issue
* in that the amount of work to be done is O(N^2) in the length of the MCV
* and histogram arrays. To keep the runtime from getting out of hand when
* large statistics targets have been set, we arbitrarily limit the number of
* values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this
* is easy: just consider at most the first N elements. (Since the MCVs are
* sorted by decreasing frequency, this correctly gets us the first N MCVs.)
* For the histogram arrays, we decimate; that is consider only every k'th
* element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS
* elements are considered. This should still give us a good random sample of
* the non-MCV population. Decimation is done on-the-fly in the loops that
* iterate over the histogram arrays.
*/
Datum
networkjoinsel(PG_FUNCTION_ARGS)
{
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
Oid operator = PG_GETARG_OID(1);
List *args = (List *) PG_GETARG_POINTER(2);
#ifdef NOT_USED
JoinType jointype = (JoinType) PG_GETARG_INT16(3);
#endif
SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
double selec;
VariableStatData vardata1;
VariableStatData vardata2;
bool join_is_reversed;
get_join_variables(root, args, sjinfo,
&vardata1, &vardata2, &join_is_reversed);
switch (sjinfo->jointype)
{
case JOIN_INNER:
case JOIN_LEFT:
case JOIN_FULL:
/*
* Selectivity for left/full join is not exactly the same as inner
* join, but we neglect the difference, as eqjoinsel does.
*/
selec = networkjoinsel_inner(operator, &vardata1, &vardata2);
break;
case JOIN_SEMI:
case JOIN_ANTI:
/* Here, it's important that we pass the outer var on the left. */
if (!join_is_reversed)
selec = networkjoinsel_semi(operator, &vardata1, &vardata2);
else
selec = networkjoinsel_semi(get_commutator(operator),
&vardata2, &vardata1);
break;
default:
/* other values not expected here */
elog(ERROR, "unrecognized join type: %d",
(int) sjinfo->jointype);
selec = 0; /* keep compiler quiet */
break;
}
ReleaseVariableStats(vardata1);
ReleaseVariableStats(vardata2);
CLAMP_PROBABILITY(selec);
PG_RETURN_FLOAT8((float8) selec);
}
/*
* Inner join selectivity estimation for subnet inclusion/overlap operators
*
* Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram
* selectivity for join using the subnet inclusion operators. Unlike the
* join selectivity function for the equality operator, eqjoinsel_inner(),
* one to one matching of the values is not enough. Network inclusion
* operators are likely to match many to many, so we must check all pairs.
* (Note: it might be possible to exploit understanding of the histogram's
* btree ordering to reduce the work needed, but we don't currently try.)
* Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
*/
static Selectivity
networkjoinsel_inner(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2)
{
Form_pg_statistic stats;
double nullfrac1 = 0.0,
nullfrac2 = 0.0;
Selectivity selec = 0.0,
sumcommon1 = 0.0,
sumcommon2 = 0.0;
bool mcv1_exists = false,
mcv2_exists = false,
hist1_exists = false,
hist2_exists = false;
int opr_codenum;
int mcv1_length = 0,
mcv2_length = 0;
AttStatsSlot mcv1_slot;
AttStatsSlot mcv2_slot;
AttStatsSlot hist1_slot;
AttStatsSlot hist2_slot;
if (HeapTupleIsValid(vardata1->statsTuple))
{
stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
nullfrac1 = stats->stanullfrac;
mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
STATISTIC_KIND_HISTOGRAM, InvalidOid,
ATTSTATSSLOT_VALUES);
/* Arbitrarily limit number of MCVs considered */
mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
if (mcv1_exists)
sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
}
else
{
memset(&mcv1_slot, 0, sizeof(mcv1_slot));
memset(&hist1_slot, 0, sizeof(hist1_slot));
}
if (HeapTupleIsValid(vardata2->statsTuple))
{
stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
nullfrac2 = stats->stanullfrac;
mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
STATISTIC_KIND_HISTOGRAM, InvalidOid,
ATTSTATSSLOT_VALUES);
/* Arbitrarily limit number of MCVs considered */
mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
if (mcv2_exists)
sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
}
else
{
memset(&mcv2_slot, 0, sizeof(mcv2_slot));
memset(&hist2_slot, 0, sizeof(hist2_slot));
}
opr_codenum = inet_opr_codenum(operator);
/*
* Calculate selectivity for MCV vs MCV matches.
*/
if (mcv1_exists && mcv2_exists)
selec += inet_mcv_join_sel(mcv1_slot.values, mcv1_slot.numbers,
mcv1_length,
mcv2_slot.values, mcv2_slot.numbers,
mcv2_length,
operator);
/*
* Add in selectivities for MCV vs histogram matches, scaling according to
* the fractions of the populations represented by the histograms. Note
* that the second case needs to commute the operator.
*/
if (mcv1_exists && hist2_exists)
selec += (1.0 - nullfrac2 - sumcommon2) *
inet_mcv_hist_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length,
hist2_slot.values, hist2_slot.nvalues,
opr_codenum);
if (mcv2_exists && hist1_exists)
selec += (1.0 - nullfrac1 - sumcommon1) *
inet_mcv_hist_sel(mcv2_slot.values, mcv2_slot.numbers, mcv2_length,
hist1_slot.values, hist1_slot.nvalues,
-opr_codenum);
/*
* Add in selectivity for histogram vs histogram matches, again scaling
* appropriately.
*/
if (hist1_exists && hist2_exists)
selec += (1.0 - nullfrac1 - sumcommon1) *
(1.0 - nullfrac2 - sumcommon2) *
inet_hist_inclusion_join_sel(hist1_slot.values, hist1_slot.nvalues,
hist2_slot.values, hist2_slot.nvalues,
opr_codenum);
/*
* If useful statistics are not available then use the default estimate.
* We can apply null fractions if known, though.
*/
if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
/* Release stats. */
free_attstatsslot(&mcv1_slot);
free_attstatsslot(&mcv2_slot);
free_attstatsslot(&hist1_slot);
free_attstatsslot(&hist2_slot);
return selec;
}
/*
* Semi join selectivity estimation for subnet inclusion/overlap operators
*
* Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs
* histogram selectivity for semi/anti join cases.
*/
static Selectivity
networkjoinsel_semi(Oid operator,
VariableStatData *vardata1, VariableStatData *vardata2)
{
Form_pg_statistic stats;
Selectivity selec = 0.0,
sumcommon1 = 0.0,
sumcommon2 = 0.0;
double nullfrac1 = 0.0,
nullfrac2 = 0.0,
hist2_weight = 0.0;
bool mcv1_exists = false,
mcv2_exists = false,
hist1_exists = false,
hist2_exists = false;
int opr_codenum;
FmgrInfo proc;
int i,
mcv1_length = 0,
mcv2_length = 0;
AttStatsSlot mcv1_slot;
AttStatsSlot mcv2_slot;
AttStatsSlot hist1_slot;
AttStatsSlot hist2_slot;
if (HeapTupleIsValid(vardata1->statsTuple))
{
stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
nullfrac1 = stats->stanullfrac;
mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
STATISTIC_KIND_HISTOGRAM, InvalidOid,
ATTSTATSSLOT_VALUES);
/* Arbitrarily limit number of MCVs considered */
mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
if (mcv1_exists)
sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
}
else
{
memset(&mcv1_slot, 0, sizeof(mcv1_slot));
memset(&hist1_slot, 0, sizeof(hist1_slot));
}
if (HeapTupleIsValid(vardata2->statsTuple))
{
stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
nullfrac2 = stats->stanullfrac;
mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
STATISTIC_KIND_MCV, InvalidOid,
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
STATISTIC_KIND_HISTOGRAM, InvalidOid,
ATTSTATSSLOT_VALUES);
/* Arbitrarily limit number of MCVs considered */
mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
if (mcv2_exists)
sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
}
else
{
memset(&mcv2_slot, 0, sizeof(mcv2_slot));
memset(&hist2_slot, 0, sizeof(hist2_slot));
}
opr_codenum = inet_opr_codenum(operator);
fmgr_info(get_opcode(operator), &proc);
/* Estimate number of input rows represented by RHS histogram. */
if (hist2_exists && vardata2->rel)
hist2_weight = (1.0 - nullfrac2 - sumcommon2) * vardata2->rel->rows;
/*
* Consider each element of the LHS MCV list, matching it to whatever RHS
* stats we have. Scale according to the known frequency of the MCV.
*/
if (mcv1_exists && (mcv2_exists || hist2_exists))
{
for (i = 0; i < mcv1_length; i++)
{
selec += mcv1_slot.numbers[i] *
inet_semi_join_sel(mcv1_slot.values[i],
mcv2_exists, mcv2_slot.values, mcv2_length,
hist2_exists,
hist2_slot.values, hist2_slot.nvalues,
hist2_weight,
&proc, opr_codenum);
}
}
/*
* Consider each element of the LHS histogram, except for the first and
* last elements, which we exclude on the grounds that they're outliers
* and thus not very representative. Scale on the assumption that each
* such histogram element represents an equal share of the LHS histogram
* population (which is a bit bogus, because the members of its bucket may
* not all act the same with respect to the join clause, but it's hard to
* do better).
*
* If there are too many histogram elements, decimate to limit runtime.
*/
if (hist1_exists && hist1_slot.nvalues > 2 && (mcv2_exists || hist2_exists))
{
double hist_selec_sum = 0.0;
int k,
n;
k = (hist1_slot.nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
n = 0;
for (i = 1; i < hist1_slot.nvalues - 1; i += k)
{
hist_selec_sum +=
inet_semi_join_sel(hist1_slot.values[i],
mcv2_exists, mcv2_slot.values, mcv2_length,
hist2_exists,
hist2_slot.values, hist2_slot.nvalues,
hist2_weight,
&proc, opr_codenum);
n++;
}
selec += (1.0 - nullfrac1 - sumcommon1) * hist_selec_sum / n;
}
/*
* If useful statistics are not available then use the default estimate.
* We can apply null fractions if known, though.
*/
if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
/* Release stats. */
free_attstatsslot(&mcv1_slot);
free_attstatsslot(&mcv2_slot);
free_attstatsslot(&hist1_slot);
free_attstatsslot(&hist2_slot);
return selec;
}
/*
* Compute the fraction of a relation's population that is represented
* by the MCV list.
*/
static Selectivity
mcv_population(float4 *mcv_numbers, int mcv_nvalues)
{
Selectivity sumcommon = 0.0;
int i;
for (i = 0; i < mcv_nvalues; i++)
{
sumcommon += mcv_numbers[i];
}
return sumcommon;
}
/*
* Inet histogram vs single value selectivity estimation
*
* Estimate the fraction of the histogram population that satisfies
* "value OPR CONST". (The result needs to be scaled to reflect the
* proportion of the total population represented by the histogram.)
*
* The histogram is originally for the inet btree comparison operators.
* Only the common bits of the network part and the length of the network part
* (masklen) are interesting for the subnet inclusion operators. Fortunately,
* btree comparison treats the network part as the major sort key. Even so,
* the length of the network part would not really be significant in the
* histogram. This would lead to big mistakes for data sets with uneven
* masklen distribution. To reduce this problem, comparisons with the left
* and the right sides of the buckets are used together.
*
* Histogram bucket matches are calculated in two forms. If the constant
* matches both bucket endpoints the bucket is considered as fully matched.
* The second form is to match the bucket partially; we recognize this when
* the constant matches just one endpoint, or the two endpoints fall on
* opposite sides of the constant. (Note that when the constant matches an
* interior histogram element, it gets credit for partial matches to the
* buckets on both sides, while a match to a histogram endpoint gets credit
* for only one partial match. This is desirable.)
*
* The divider in the partial bucket match is imagined as the distance
* between the decisive bits and the common bits of the addresses. It will
* be used as a power of two as it is the natural scale for the IP network
* inclusion. This partial bucket match divider calculation is an empirical
* formula and subject to change with more experiment.
*
* For a partial match, we try to calculate dividers for both of the
* boundaries. If the address family of a boundary value does not match the
* constant or comparison of the length of the network parts is not correct
* for the operator, the divider for that boundary will not be taken into
* account. If both of the dividers are valid, the greater one will be used
* to minimize the mistake in buckets that have disparate masklens. This
* calculation is unfair when dividers can be calculated for both of the
* boundaries but they are far from each other; but it is not a common
* situation as the boundaries are expected to share most of their significant
* bits of their masklens. The mistake would be greater, if we would use the
* minimum instead of the maximum, and we don't know a sensible way to combine
* them.
*
* For partial match in buckets that have different address families on the
* left and right sides, only the boundary with the same address family is
* taken into consideration. This can cause more mistakes for these buckets
* if the masklens of their boundaries are also disparate. But this can only
* happen in one bucket, since only two address families exist. It seems a
* better option than not considering these buckets at all.
*/
static Selectivity
inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue,
int opr_codenum)
{
Selectivity match = 0.0;
inet *query,
*left,
*right;
int i,
k,
n;
int left_order,
right_order,
left_divider,
right_divider;
/* guard against zero-divide below */
if (nvalues <= 1)
return 0.0;
/* if there are too many histogram elements, decimate to limit runtime */
k = (nvalues - 2) / MAX_CONSIDERED_ELEMS + 1;
query = DatumGetInetPP(constvalue);
/* "left" is the left boundary value of the current bucket ... */
left = DatumGetInetPP(values[0]);
left_order = inet_inclusion_cmp(left, query, opr_codenum);
n = 0;
for (i = k; i < nvalues; i += k)
{
/* ... and "right" is the right boundary value */
right = DatumGetInetPP(values[i]);
right_order = inet_inclusion_cmp(right, query, opr_codenum);
if (left_order == 0 && right_order == 0)
{
/* The whole bucket matches, since both endpoints do. */
match += 1.0;
}
else if ((left_order <= 0 && right_order >= 0) ||
(left_order >= 0 && right_order <= 0))
{
/* Partial bucket match. */
left_divider = inet_hist_match_divider(left, query, opr_codenum);
right_divider = inet_hist_match_divider(right, query, opr_codenum);
if (left_divider >= 0 || right_divider >= 0)
match += 1.0 / pow(2.0, Max(left_divider, right_divider));
}
/* Shift the variables. */
left = right;
left_order = right_order;
/* Count the number of buckets considered. */
n++;
}
return match / n;
}
/*
* Inet MCV vs MCV join selectivity estimation
*
* We simply add up the fractions of the populations that satisfy the clause.
* The result is exact and does not need to be scaled further.
*/
static Selectivity
inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues,
Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues,
Oid operator)
{
Selectivity selec = 0.0;
FmgrInfo proc;
int i,
j;
fmgr_info(get_opcode(operator), &proc);
for (i = 0; i < mcv1_nvalues; i++)
{
for (j = 0; j < mcv2_nvalues; j++)
if (DatumGetBool(FunctionCall2(&proc,
mcv1_values[i],
mcv2_values[j])))
selec += mcv1_numbers[i] * mcv2_numbers[j];
}
return selec;
}
/*
* Inet MCV vs histogram join selectivity estimation
*
* For each MCV on the lefthand side, estimate the fraction of the righthand's
* histogram population that satisfies the join clause, and add those up,
* scaling by the MCV's frequency. The result still needs to be scaled
* according to the fraction of the righthand's population represented by
* the histogram.
*/
static Selectivity
inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues,
Datum *hist_values, int hist_nvalues,
int opr_codenum)
{
Selectivity selec = 0.0;
int i;
/*
* We'll call inet_hist_value_selec with the histogram on the left, so we
* must commute the operator.
*/
opr_codenum = -opr_codenum;
for (i = 0; i < mcv_nvalues; i++)
{
selec += mcv_numbers[i] *
inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i],
opr_codenum);
}
return selec;
}
/*
* Inet histogram vs histogram join selectivity estimation
*
* Here, we take all values listed in the second histogram (except for the
* first and last elements, which are excluded on the grounds of possibly
* not being very representative) and treat them as a uniform sample of
* the non-MCV population for that relation. For each one, we apply
* inet_hist_value_selec to see what fraction of the first histogram
* it matches.
*
* We could alternatively do this the other way around using the operator's
* commutator. XXX would it be worthwhile to do it both ways and take the
* average? That would at least avoid non-commutative estimation results.
*/
static Selectivity
inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues,
Datum *hist2_values, int hist2_nvalues,
int opr_codenum)
{
double match = 0.0;
int i,
k,
n;
if (hist2_nvalues <= 2)
return 0.0; /* no interior histogram elements */
/* if there are too many histogram elements, decimate to limit runtime */
k = (hist2_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
n = 0;
for (i = 1; i < hist2_nvalues - 1; i += k)
{
match += inet_hist_value_sel(hist1_values, hist1_nvalues,
hist2_values[i], opr_codenum);
n++;
}
return match / n;
}
/*
* Inet semi join selectivity estimation for one value
*
* The function calculates the probability that there is at least one row
* in the RHS table that satisfies the "lhs_value op column" condition.
* It is used in semi join estimation to check a sample from the left hand
* side table.
*
* The MCV and histogram from the right hand side table should be provided as
* arguments with the lhs_value from the left hand side table for the join.
* hist_weight is the total number of rows represented by the histogram.
* For example, if the table has 1000 rows, and 10% of the rows are in the MCV
* list, and another 10% are NULLs, hist_weight would be 800.
*
* First, the lhs_value will be matched to the most common values. If it
* matches any of them, 1.0 will be returned, because then there is surely
* a match.
*
* Otherwise, the histogram will be used to estimate the number of rows in
* the second table that match the condition. If the estimate is greater
* than 1.0, 1.0 will be returned, because it means there is a greater chance
* that the lhs_value will match more than one row in the table. If it is
* between 0.0 and 1.0, it will be returned as the probability.
*/
static Selectivity
inet_semi_join_sel(Datum lhs_value,
bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
bool hist_exists, Datum *hist_values, int hist_nvalues,
double hist_weight,
FmgrInfo *proc, int opr_codenum)
{
if (mcv_exists)
{
int i;
for (i = 0; i < mcv_nvalues; i++)
{
if (DatumGetBool(FunctionCall2(proc,
lhs_value,
mcv_values[i])))
return 1.0;
}
}
if (hist_exists && hist_weight > 0)
{
Selectivity hist_selec;
/* Commute operator, since we're passing lhs_value on the right */
hist_selec = inet_hist_value_sel(hist_values, hist_nvalues,
lhs_value, -opr_codenum);
if (hist_selec > 0)
return Min(1.0, hist_weight * hist_selec);
}
return 0.0;
}
/*
* Assign useful code numbers for the subnet inclusion/overlap operators
*
* Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
* on the exact codes assigned here; but many other places in this file
* know that they can negate a code to obtain the code for the commutator
* operator.
*/
static int
inet_opr_codenum(Oid operator)
{
switch (operator)
{
case OID_INET_SUP_OP:
return -2;
case OID_INET_SUPEQ_OP:
return -1;
case OID_INET_OVERLAP_OP:
return 0;
case OID_INET_SUBEQ_OP:
return 1;
case OID_INET_SUB_OP:
return 2;
default:
elog(ERROR, "unrecognized operator %u for inet selectivity",
operator);
}
return 0; /* unreached, but keep compiler quiet */
}
/*
* Comparison function for the subnet inclusion/overlap operators
*
* If the comparison is okay for the specified inclusion operator, the return
* value will be 0. Otherwise the return value will be less than or greater
* than 0 as appropriate for the operator.
*
* Comparison is compatible with the basic comparison function for the inet
* type. See network_cmp_internal() in network.c for the original. Basic
* comparison operators are implemented with the network_cmp_internal()
* function. It is possible to implement the subnet inclusion operators with
* this function.
*
* Comparison is first on the common bits of the network part, then on the
* length of the network part (masklen) as in the network_cmp_internal()
* function. Only the first part is in this function. The second part is
* separated to another function for reusability. The difference between the
* second part and the original network_cmp_internal() is that the inclusion
* operator is considered while comparing the lengths of the network parts.
* See the inet_masklen_inclusion_cmp() function below.
*/
static int
inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
{
if (ip_family(left) == ip_family(right))
{
int order;
order = bitncmp(ip_addr(left), ip_addr(right),
Min(ip_bits(left), ip_bits(right)));
if (order != 0)
return order;
return inet_masklen_inclusion_cmp(left, right, opr_codenum);
}
return ip_family(left) - ip_family(right);
}
/*
* Masklen comparison function for the subnet inclusion/overlap operators
*
* Compares the lengths of the network parts of the inputs. If the comparison
* is okay for the specified inclusion operator, the return value will be 0.
* Otherwise the return value will be less than or greater than 0 as
* appropriate for the operator.
*/
static int
inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
{
int order;
order = (int) ip_bits(left) - (int) ip_bits(right);
/*
* Return 0 if the operator would accept this combination of masklens.
* Note that opr_codenum zero (overlaps) will accept all cases.
*/
if ((order > 0 && opr_codenum >= 0) ||
(order == 0 && opr_codenum >= -1 && opr_codenum <= 1) ||
(order < 0 && opr_codenum <= 0))
return 0;
/*
* Otherwise, return a negative value for sup/supeq (notionally, the RHS
* needs to have a larger masklen than it has, which would make it sort
* later), or a positive value for sub/subeq (vice versa).
*/
return opr_codenum;
}
/*
* Inet histogram partial match divider calculation
*
* First the families and the lengths of the network parts are compared using
* the subnet inclusion operator. If those are acceptable for the operator,
* the divider will be calculated using the masklens and the common bits of
* the addresses. -1 will be returned if it cannot be calculated.
*
* See commentary for inet_hist_value_sel() for some rationale for this.
*/
static int
inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
{
if (ip_family(boundary) == ip_family(query) &&
inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == 0)
{
int min_bits,
decisive_bits;
min_bits = Min(ip_bits(boundary), ip_bits(query));
/*
* Set decisive_bits to the masklen of the one that should contain the
* other according to the operator.
*/
if (opr_codenum < 0)
decisive_bits = ip_bits(boundary);
else if (opr_codenum > 0)
decisive_bits = ip_bits(query);
else
decisive_bits = min_bits;
/*
* Now return the number of non-common decisive bits. (This will be
* zero if the boundary and query in fact match, else positive.)
*/
if (min_bits > 0)
return decisive_bits - bitncommon(ip_addr(boundary),
ip_addr(query),
min_bits);
return decisive_bits;
}
return -1;
}
|