summaryrefslogtreecommitdiff
path: root/src/backend/utils
diff options
context:
space:
mode:
authorSimon Riggs <simon@2ndQuadrant.com>2015-05-15 04:02:54 +0200
committerSimon Riggs <simon@2ndQuadrant.com>2015-05-15 04:02:54 +0200
commit83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c (patch)
tree6c92ec7402b4ac14a585c41d960e35bd283a878b /src/backend/utils
parent5a3022fde018aca9b23b7f7506233b437d943de2 (diff)
downloadpostgresql-83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c.tar.gz
Separate block sampling functions
Refactoring ahead of tablesample patch Requested and reviewed by Michael Paquier Petr Jelinek
Diffstat (limited to 'src/backend/utils')
-rw-r--r--src/backend/utils/misc/Makefile2
-rw-r--r--src/backend/utils/misc/sampling.c226
2 files changed, 227 insertions, 1 deletions
diff --git a/src/backend/utils/misc/Makefile b/src/backend/utils/misc/Makefile
index 378b77e0bc..788910157d 100644
--- a/src/backend/utils/misc/Makefile
+++ b/src/backend/utils/misc/Makefile
@@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global
override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
OBJS = guc.o help_config.o pg_rusage.o ps_status.o rls.o \
- superuser.o timeout.o tzparser.o
+ sampling.o superuser.o timeout.o tzparser.o
# This location might depend on the installation directories. Therefore
# we can't subsitute it into pg_config.h.
diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c
new file mode 100644
index 0000000000..1eeabaf158
--- /dev/null
+++ b/src/backend/utils/misc/sampling.c
@@ -0,0 +1,226 @@
+/*-------------------------------------------------------------------------
+ *
+ * sampling.c
+ * Relation block sampling routines.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/misc/sampling.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <math.h>
+
+#include "utils/sampling.h"
+
+
+/*
+ * BlockSampler_Init -- prepare for random sampling of blocknumbers
+ *
+ * BlockSampler provides algorithm for block level sampling of a relation
+ * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
+ * It selects a random sample of samplesize blocks out of
+ * the nblocks blocks in the table. If the table has less than
+ * samplesize blocks, all blocks are selected.
+ *
+ * Since we know the total number of blocks in advance, we can use the
+ * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
+ * algorithm.
+ */
+void
+BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
+ long randseed)
+{
+ bs->N = nblocks; /* measured table size */
+
+ /*
+ * If we decide to reduce samplesize for tables that have less or not much
+ * more than samplesize blocks, here is the place to do it.
+ */
+ bs->n = samplesize;
+ bs->t = 0; /* blocks scanned so far */
+ bs->m = 0; /* blocks selected so far */
+}
+
+bool
+BlockSampler_HasMore(BlockSampler bs)
+{
+ return (bs->t < bs->N) && (bs->m < bs->n);
+}
+
+BlockNumber
+BlockSampler_Next(BlockSampler bs)
+{
+ BlockNumber K = bs->N - bs->t; /* remaining blocks */
+ int k = bs->n - bs->m; /* blocks still to sample */
+ double p; /* probability to skip block */
+ double V; /* random */
+
+ Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
+
+ if ((BlockNumber) k >= K)
+ {
+ /* need all the rest */
+ bs->m++;
+ return bs->t++;
+ }
+
+ /*----------
+ * It is not obvious that this code matches Knuth's Algorithm S.
+ * Knuth says to skip the current block with probability 1 - k/K.
+ * If we are to skip, we should advance t (hence decrease K), and
+ * repeat the same probabilistic test for the next block. The naive
+ * implementation thus requires an sampler_random_fract() call for each
+ * block number. But we can reduce this to one sampler_random_fract()
+ * call per selected block, by noting that each time the while-test
+ * succeeds, we can reinterpret V as a uniform random number in the range
+ * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
+ * the appropriate fraction of its former value, and our next loop
+ * makes the appropriate probabilistic test.
+ *
+ * We have initially K > k > 0. If the loop reduces K to equal k,
+ * the next while-test must fail since p will become exactly zero
+ * (we assume there will not be roundoff error in the division).
+ * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
+ * to be doubly sure about roundoff error.) Therefore K cannot become
+ * less than k, which means that we cannot fail to select enough blocks.
+ *----------
+ */
+ V = sampler_random_fract();
+ p = 1.0 - (double) k / (double) K;
+ while (V < p)
+ {
+ /* skip */
+ bs->t++;
+ K--; /* keep K == N - t */
+
+ /* adjust p to be new cutoff point in reduced range */
+ p *= 1.0 - (double) k / (double) K;
+ }
+
+ /* select */
+ bs->m++;
+ return bs->t++;
+}
+
+/*
+ * These two routines embody Algorithm Z from "Random sampling with a
+ * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
+ * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
+ * of the count S of records to skip before processing another record.
+ * It is computed primarily based on t, the number of records already read.
+ * The only extra state needed between calls is W, a random state variable.
+ *
+ * reservoir_init_selection_state computes the initial W value.
+ *
+ * Given that we've already read t records (t >= n), reservoir_get_next_S
+ * determines the number of records to skip before the next record is
+ * processed.
+ */
+void
+reservoir_init_selection_state(ReservoirState rs, int n)
+{
+ /* Initial value of W (for use when Algorithm Z is first applied) */
+ *rs = exp(-log(sampler_random_fract()) / n);
+}
+
+double
+reservoir_get_next_S(ReservoirState rs, double t, int n)
+{
+ double S;
+
+ /* The magic constant here is T from Vitter's paper */
+ if (t <= (22.0 * n))
+ {
+ /* Process records using Algorithm X until t is large enough */
+ double V,
+ quot;
+
+ V = sampler_random_fract(); /* Generate V */
+ S = 0;
+ t += 1;
+ /* Note: "num" in Vitter's code is always equal to t - n */
+ quot = (t - (double) n) / t;
+ /* Find min S satisfying (4.1) */
+ while (quot > V)
+ {
+ S += 1;
+ t += 1;
+ quot *= (t - (double) n) / t;
+ }
+ }
+ else
+ {
+ /* Now apply Algorithm Z */
+ double W = *rs;
+ double term = t - (double) n + 1;
+
+ for (;;)
+ {
+ double numer,
+ numer_lim,
+ denom;
+ double U,
+ X,
+ lhs,
+ rhs,
+ y,
+ tmp;
+
+ /* Generate U and X */
+ U = sampler_random_fract();
+ X = t * (W - 1.0);
+ S = floor(X); /* S is tentatively set to floor(X) */
+ /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
+ tmp = (t + 1) / term;
+ lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
+ rhs = (((t + X) / (term + S)) * term) / t;
+ if (lhs <= rhs)
+ {
+ W = rhs / lhs;
+ break;
+ }
+ /* Test if U <= f(S)/cg(X) */
+ y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
+ if ((double) n < S)
+ {
+ denom = t;
+ numer_lim = term + S;
+ }
+ else
+ {
+ denom = t - (double) n + S;
+ numer_lim = t + 1;
+ }
+ for (numer = t + S; numer >= numer_lim; numer -= 1)
+ {
+ y *= numer / denom;
+ denom -= 1;
+ }
+ W = exp(-log(sampler_random_fract()) / n); /* Generate W in advance */
+ if (exp(log(y) / n) <= (t + X) / t)
+ break;
+ }
+ *rs = W;
+ }
+ return S;
+}
+
+
+/*----------
+ * Random number generator used by sampling
+ *----------
+ */
+
+/* Select a random value R uniformly distributed in (0 - 1) */
+double
+sampler_random_fract()
+{
+ return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2);
+}