diff options
author | Simon Riggs <simon@2ndQuadrant.com> | 2015-05-15 04:02:54 +0200 |
---|---|---|
committer | Simon Riggs <simon@2ndQuadrant.com> | 2015-05-15 04:02:54 +0200 |
commit | 83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c (patch) | |
tree | 6c92ec7402b4ac14a585c41d960e35bd283a878b /src/backend/utils | |
parent | 5a3022fde018aca9b23b7f7506233b437d943de2 (diff) | |
download | postgresql-83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c.tar.gz |
Separate block sampling functions
Refactoring ahead of tablesample patch
Requested and reviewed by Michael Paquier
Petr Jelinek
Diffstat (limited to 'src/backend/utils')
-rw-r--r-- | src/backend/utils/misc/Makefile | 2 | ||||
-rw-r--r-- | src/backend/utils/misc/sampling.c | 226 |
2 files changed, 227 insertions, 1 deletions
diff --git a/src/backend/utils/misc/Makefile b/src/backend/utils/misc/Makefile index 378b77e0bc..788910157d 100644 --- a/src/backend/utils/misc/Makefile +++ b/src/backend/utils/misc/Makefile @@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS) OBJS = guc.o help_config.o pg_rusage.o ps_status.o rls.o \ - superuser.o timeout.o tzparser.o + sampling.o superuser.o timeout.o tzparser.o # This location might depend on the installation directories. Therefore # we can't subsitute it into pg_config.h. diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c new file mode 100644 index 0000000000..1eeabaf158 --- /dev/null +++ b/src/backend/utils/misc/sampling.c @@ -0,0 +1,226 @@ +/*------------------------------------------------------------------------- + * + * sampling.c + * Relation block sampling routines. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/misc/sampling.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <math.h> + +#include "utils/sampling.h" + + +/* + * BlockSampler_Init -- prepare for random sampling of blocknumbers + * + * BlockSampler provides algorithm for block level sampling of a relation + * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB") + * It selects a random sample of samplesize blocks out of + * the nblocks blocks in the table. If the table has less than + * samplesize blocks, all blocks are selected. + * + * Since we know the total number of blocks in advance, we can use the + * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's + * algorithm. + */ +void +BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, + long randseed) +{ + bs->N = nblocks; /* measured table size */ + + /* + * If we decide to reduce samplesize for tables that have less or not much + * more than samplesize blocks, here is the place to do it. + */ + bs->n = samplesize; + bs->t = 0; /* blocks scanned so far */ + bs->m = 0; /* blocks selected so far */ +} + +bool +BlockSampler_HasMore(BlockSampler bs) +{ + return (bs->t < bs->N) && (bs->m < bs->n); +} + +BlockNumber +BlockSampler_Next(BlockSampler bs) +{ + BlockNumber K = bs->N - bs->t; /* remaining blocks */ + int k = bs->n - bs->m; /* blocks still to sample */ + double p; /* probability to skip block */ + double V; /* random */ + + Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */ + + if ((BlockNumber) k >= K) + { + /* need all the rest */ + bs->m++; + return bs->t++; + } + + /*---------- + * It is not obvious that this code matches Knuth's Algorithm S. + * Knuth says to skip the current block with probability 1 - k/K. + * If we are to skip, we should advance t (hence decrease K), and + * repeat the same probabilistic test for the next block. The naive + * implementation thus requires an sampler_random_fract() call for each + * block number. But we can reduce this to one sampler_random_fract() + * call per selected block, by noting that each time the while-test + * succeeds, we can reinterpret V as a uniform random number in the range + * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be + * the appropriate fraction of its former value, and our next loop + * makes the appropriate probabilistic test. + * + * We have initially K > k > 0. If the loop reduces K to equal k, + * the next while-test must fail since p will become exactly zero + * (we assume there will not be roundoff error in the division). + * (Note: Knuth suggests a "<=" loop condition, but we use "<" just + * to be doubly sure about roundoff error.) Therefore K cannot become + * less than k, which means that we cannot fail to select enough blocks. + *---------- + */ + V = sampler_random_fract(); + p = 1.0 - (double) k / (double) K; + while (V < p) + { + /* skip */ + bs->t++; + K--; /* keep K == N - t */ + + /* adjust p to be new cutoff point in reduced range */ + p *= 1.0 - (double) k / (double) K; + } + + /* select */ + bs->m++; + return bs->t++; +} + +/* + * These two routines embody Algorithm Z from "Random sampling with a + * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1 + * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms + * of the count S of records to skip before processing another record. + * It is computed primarily based on t, the number of records already read. + * The only extra state needed between calls is W, a random state variable. + * + * reservoir_init_selection_state computes the initial W value. + * + * Given that we've already read t records (t >= n), reservoir_get_next_S + * determines the number of records to skip before the next record is + * processed. + */ +void +reservoir_init_selection_state(ReservoirState rs, int n) +{ + /* Initial value of W (for use when Algorithm Z is first applied) */ + *rs = exp(-log(sampler_random_fract()) / n); +} + +double +reservoir_get_next_S(ReservoirState rs, double t, int n) +{ + double S; + + /* The magic constant here is T from Vitter's paper */ + if (t <= (22.0 * n)) + { + /* Process records using Algorithm X until t is large enough */ + double V, + quot; + + V = sampler_random_fract(); /* Generate V */ + S = 0; + t += 1; + /* Note: "num" in Vitter's code is always equal to t - n */ + quot = (t - (double) n) / t; + /* Find min S satisfying (4.1) */ + while (quot > V) + { + S += 1; + t += 1; + quot *= (t - (double) n) / t; + } + } + else + { + /* Now apply Algorithm Z */ + double W = *rs; + double term = t - (double) n + 1; + + for (;;) + { + double numer, + numer_lim, + denom; + double U, + X, + lhs, + rhs, + y, + tmp; + + /* Generate U and X */ + U = sampler_random_fract(); + X = t * (W - 1.0); + S = floor(X); /* S is tentatively set to floor(X) */ + /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ + tmp = (t + 1) / term; + lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n); + rhs = (((t + X) / (term + S)) * term) / t; + if (lhs <= rhs) + { + W = rhs / lhs; + break; + } + /* Test if U <= f(S)/cg(X) */ + y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X); + if ((double) n < S) + { + denom = t; + numer_lim = term + S; + } + else + { + denom = t - (double) n + S; + numer_lim = t + 1; + } + for (numer = t + S; numer >= numer_lim; numer -= 1) + { + y *= numer / denom; + denom -= 1; + } + W = exp(-log(sampler_random_fract()) / n); /* Generate W in advance */ + if (exp(log(y) / n) <= (t + X) / t) + break; + } + *rs = W; + } + return S; +} + + +/*---------- + * Random number generator used by sampling + *---------- + */ + +/* Select a random value R uniformly distributed in (0 - 1) */ +double +sampler_random_fract() +{ + return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2); +} |