summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTomas Vondra <tomas.vondra@postgresql.org>2021-03-26 23:22:01 +0100
committerTomas Vondra <tomas.vondra@postgresql.org>2021-03-27 00:01:11 +0100
commita4d75c86bf15220df22de0a92c819ecef9db3849 (patch)
treea736a68b1c3f022590a886b7bac45276f1f490a6 /src
parent98376c18f12e562421b5c77e619248e8b7aae3c6 (diff)
downloadpostgresql-a4d75c86bf15220df22de0a92c819ecef9db3849.tar.gz
Extended statistics on expressions
Allow defining extended statistics on expressions, not just just on simple column references. With this commit, expressions are supported by all existing extended statistics kinds, improving the same types of estimates. A simple example may look like this: CREATE TABLE t (a int); CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t; ANALYZE t; The collected statistics are useful e.g. to estimate queries with those expressions in WHERE or GROUP BY clauses: SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0; SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20); This introduces new internal statistics kind 'e' (expressions) which is built automatically when the statistics object definition includes any expressions. This represents single-expression statistics, as if there was an expression index (but without the index maintenance overhead). The statistics is stored in pg_statistics_ext_data as an array of composite types, which is possible thanks to 79f6a942bd. CREATE STATISTICS allows building statistics on a single expression, in which case in which case it's not possible to specify statistics kinds. A new system view pg_stats_ext_exprs can be used to display expression statistics, similarly to pg_stats and pg_stats_ext views. ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it treats indexes, i.e. it drops and recreates the statistics. This means all statistics are reset, and we no longer try to preserve at least the functional dependencies. This should not be a major issue in practice, as the functional dependencies actually rely on per-column statistics, which were always reset anyway. Author: Tomas Vondra Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
Diffstat (limited to 'src')
-rw-r--r--src/backend/catalog/Makefile8
-rw-r--r--src/backend/catalog/system_views.sql69
-rw-r--r--src/backend/commands/statscmds.c437
-rw-r--r--src/backend/commands/tablecmds.c104
-rw-r--r--src/backend/nodes/copyfuncs.c14
-rw-r--r--src/backend/nodes/equalfuncs.c13
-rw-r--r--src/backend/nodes/outfuncs.c12
-rw-r--r--src/backend/optimizer/util/plancat.c62
-rw-r--r--src/backend/parser/gram.y38
-rw-r--r--src/backend/parser/parse_agg.c10
-rw-r--r--src/backend/parser/parse_expr.c6
-rw-r--r--src/backend/parser/parse_func.c3
-rw-r--r--src/backend/parser/parse_utilcmd.c125
-rw-r--r--src/backend/statistics/dependencies.c616
-rw-r--r--src/backend/statistics/extended_stats.c1253
-rw-r--r--src/backend/statistics/mcv.c369
-rw-r--r--src/backend/statistics/mvdistinct.c96
-rw-r--r--src/backend/tcop/utility.c29
-rw-r--r--src/backend/utils/adt/ruleutils.c281
-rw-r--r--src/backend/utils/adt/selfuncs.c413
-rw-r--r--src/bin/pg_dump/t/002_pg_dump.pl12
-rw-r--r--src/bin/psql/describe.c130
-rw-r--r--src/include/catalog/catversion.h2
-rw-r--r--src/include/catalog/pg_proc.dat8
-rw-r--r--src/include/catalog/pg_statistic_ext.h4
-rw-r--r--src/include/catalog/pg_statistic_ext_data.h1
-rw-r--r--src/include/commands/defrem.h4
-rw-r--r--src/include/nodes/nodes.h1
-rw-r--r--src/include/nodes/parsenodes.h19
-rw-r--r--src/include/nodes/pathnodes.h1
-rw-r--r--src/include/parser/parse_node.h1
-rw-r--r--src/include/parser/parse_utilcmd.h2
-rw-r--r--src/include/statistics/extended_stats_internal.h32
-rw-r--r--src/include/statistics/statistics.h5
-rw-r--r--src/include/utils/ruleutils.h2
-rw-r--r--src/test/regress/expected/create_table_like.out20
-rw-r--r--src/test/regress/expected/oidjoins.out10
-rw-r--r--src/test/regress/expected/rules.out73
-rw-r--r--src/test/regress/expected/stats_ext.out1340
-rw-r--r--src/test/regress/sql/create_table_like.sql2
-rw-r--r--src/test/regress/sql/stats_ext.sql589
41 files changed, 5425 insertions, 791 deletions
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index 70bc2123df..e36a9602c1 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -49,15 +49,15 @@ include $(top_srcdir)/src/backend/common.mk
# Note: the order of this list determines the order in which the catalog
# header files are assembled into postgres.bki. BKI_BOOTSTRAP catalogs
-# must appear first, and there are reputedly other, undocumented ordering
-# dependencies.
+# must appear first, and pg_statistic before pg_statistic_ext_data, and
+# there are reputedly other, undocumented ordering dependencies.
CATALOG_HEADERS := \
pg_proc.h pg_type.h pg_attribute.h pg_class.h \
pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
- pg_statistic_ext.h pg_statistic_ext_data.h \
- pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
+ pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \
+ pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
pg_database.h pg_db_role_setting.h pg_tablespace.h \
pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 0dca65dc7b..6483563204 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -264,6 +264,7 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
JOIN pg_attribute a
ON (a.attrelid = s.stxrelid AND a.attnum = k)
) AS attnames,
+ pg_get_statisticsobjdef_expressions(s.oid) as exprs,
s.stxkind AS kinds,
sd.stxdndistinct AS n_distinct,
sd.stxddependencies AS dependencies,
@@ -290,6 +291,74 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
WHERE NOT has_column_privilege(c.oid, a.attnum, 'select') )
AND (c.relrowsecurity = false OR NOT row_security_active(c.oid));
+CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS
+ SELECT cn.nspname AS schemaname,
+ c.relname AS tablename,
+ sn.nspname AS statistics_schemaname,
+ s.stxname AS statistics_name,
+ pg_get_userbyid(s.stxowner) AS statistics_owner,
+ stat.expr,
+ (stat.a).stanullfrac AS null_frac,
+ (stat.a).stawidth AS avg_width,
+ (stat.a).stadistinct AS n_distinct,
+ (CASE
+ WHEN (stat.a).stakind1 = 1 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 1 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 1 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 1 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 1 THEN (stat.a).stavalues5
+ END) AS most_common_vals,
+ (CASE
+ WHEN (stat.a).stakind1 = 1 THEN (stat.a).stanumbers1
+ WHEN (stat.a).stakind2 = 1 THEN (stat.a).stanumbers2
+ WHEN (stat.a).stakind3 = 1 THEN (stat.a).stanumbers3
+ WHEN (stat.a).stakind4 = 1 THEN (stat.a).stanumbers4
+ WHEN (stat.a).stakind5 = 1 THEN (stat.a).stanumbers5
+ END) AS most_common_freqs,
+ (CASE
+ WHEN (stat.a).stakind1 = 2 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 2 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 2 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 2 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 2 THEN (stat.a).stavalues5
+ END) AS histogram_bounds,
+ (CASE
+ WHEN (stat.a).stakind1 = 3 THEN (stat.a).stanumbers1[1]
+ WHEN (stat.a).stakind2 = 3 THEN (stat.a).stanumbers2[1]
+ WHEN (stat.a).stakind3 = 3 THEN (stat.a).stanumbers3[1]
+ WHEN (stat.a).stakind4 = 3 THEN (stat.a).stanumbers4[1]
+ WHEN (stat.a).stakind5 = 3 THEN (stat.a).stanumbers5[1]
+ END) correlation,
+ (CASE
+ WHEN (stat.a).stakind1 = 4 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 4 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 4 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 4 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 4 THEN (stat.a).stavalues5
+ END) AS most_common_elems,
+ (CASE
+ WHEN (stat.a).stakind1 = 4 THEN (stat.a).stanumbers1
+ WHEN (stat.a).stakind2 = 4 THEN (stat.a).stanumbers2
+ WHEN (stat.a).stakind3 = 4 THEN (stat.a).stanumbers3
+ WHEN (stat.a).stakind4 = 4 THEN (stat.a).stanumbers4
+ WHEN (stat.a).stakind5 = 4 THEN (stat.a).stanumbers5
+ END) AS most_common_elem_freqs,
+ (CASE
+ WHEN (stat.a).stakind1 = 5 THEN (stat.a).stanumbers1
+ WHEN (stat.a).stakind2 = 5 THEN (stat.a).stanumbers2
+ WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3
+ WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4
+ WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5
+ END) AS elem_count_histogram
+ FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid)
+ LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid)
+ LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace)
+ LEFT JOIN pg_namespace sn ON (sn.oid = s.stxnamespace)
+ JOIN LATERAL (
+ SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+ unnest(sd.stxdexpr)::pg_statistic AS a
+ ) stat ON (stat.expr IS NOT NULL);
+
-- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data
REVOKE ALL on pg_statistic_ext_data FROM public;
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
index 2bae205845..df4768952d 100644
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -29,6 +29,8 @@
#include "commands/comment.h"
#include "commands/defrem.h"
#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
#include "statistics/statistics.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
@@ -62,7 +64,8 @@ ObjectAddress
CreateStatistics(CreateStatsStmt *stmt)
{
int16 attnums[STATS_MAX_DIMENSIONS];
- int numcols = 0;
+ int nattnums = 0;
+ int numcols;
char *namestr;
NameData stxname;
Oid statoid;
@@ -74,21 +77,25 @@ CreateStatistics(CreateStatsStmt *stmt)
Datum datavalues[Natts_pg_statistic_ext_data];
bool datanulls[Natts_pg_statistic_ext_data];
int2vector *stxkeys;
+ List *stxexprs = NIL;
+ Datum exprsDatum;
Relation statrel;
Relation datarel;
Relation rel = NULL;
Oid relid;
ObjectAddress parentobject,
myself;
- Datum types[3]; /* one for each possible type of statistic */
+ Datum types[4]; /* one for each possible type of statistic */
int ntypes;
ArrayType *stxkind;
bool build_ndistinct;
bool build_dependencies;
bool build_mcv;
+ bool build_expressions;
bool requested_type = false;
int i;
ListCell *cell;
+ ListCell *cell2;
Assert(IsA(stmt, CreateStatsStmt));
@@ -190,101 +197,124 @@ CreateStatistics(CreateStatsStmt *stmt)
}
/*
- * Currently, we only allow simple column references in the expression
- * list. That will change someday, and again the grammar already supports
- * it so we have to enforce restrictions here. For now, we can convert
- * the expression list to a simple array of attnums. While at it, enforce
- * some constraints.
+ * Make sure no more than STATS_MAX_DIMENSIONS columns are used. There
+ * might be duplicates and so on, but we'll deal with those later.
+ */
+ numcols = list_length(stmt->exprs);
+ if (numcols > STATS_MAX_DIMENSIONS)
+ ereport(ERROR,
+ (errcode(ERRCODE_TOO_MANY_COLUMNS),
+ errmsg("cannot have more than %d columns in statistics",
+ STATS_MAX_DIMENSIONS)));
+
+ /*
+ * Convert the expression list to a simple array of attnums, but also keep
+ * a list of more complex expressions. While at it, enforce some
+ * constraints.
+ *
+ * XXX We do only the bare minimum to separate simple attribute and
+ * complex expressions - for example "(a)" will be treated as a complex
+ * expression. No matter how elaborate the check is, there'll always be a
+ * way around it, if the user is determined (consider e.g. "(a+0)"), so
+ * it's not worth protecting against it.
*/
foreach(cell, stmt->exprs)
{
Node *expr = (Node *) lfirst(cell);
- ColumnRef *cref;
- char *attname;
+ StatsElem *selem;
HeapTuple atttuple;
Form_pg_attribute attForm;
TypeCacheEntry *type;
- if (!IsA(expr, ColumnRef))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("only simple column references are allowed in CREATE STATISTICS")));
- cref = (ColumnRef *) expr;
-
- if (list_length(cref->fields) != 1)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("only simple column references are allowed in CREATE STATISTICS")));
- attname = strVal((Value *) linitial(cref->fields));
-
- atttuple = SearchSysCacheAttName(relid, attname);
- if (!HeapTupleIsValid(atttuple))
- ereport(ERROR,
- (errcode(ERRCODE_UNDEFINED_COLUMN),
- errmsg("column \"%s\" does not exist",
- attname)));
- attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
-
- /* Disallow use of system attributes in extended stats */
- if (attForm->attnum <= 0)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("statistics creation on system columns is not supported")));
-
- /* Disallow data types without a less-than operator */
- type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
- if (type->lt_opr == InvalidOid)
+ /*
+ * We should not get anything else than StatsElem, given the grammar.
+ * But let's keep it as a safety.
+ */
+ if (!IsA(expr, StatsElem))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
- attname, format_type_be(attForm->atttypid))));
+ errmsg("only simple column references and expressions are allowed in CREATE STATISTICS")));
- /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */
- if (numcols >= STATS_MAX_DIMENSIONS)
- ereport(ERROR,
- (errcode(ERRCODE_TOO_MANY_COLUMNS),
- errmsg("cannot have more than %d columns in statistics",
- STATS_MAX_DIMENSIONS)));
+ selem = (StatsElem *) expr;
- attnums[numcols] = attForm->attnum;
- numcols++;
- ReleaseSysCache(atttuple);
+ if (selem->name) /* column reference */
+ {
+ char *attname;
+
+ attname = selem->name;
+
+ atttuple = SearchSysCacheAttName(relid, attname);
+ if (!HeapTupleIsValid(atttuple))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("column \"%s\" does not exist",
+ attname)));
+ attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
+
+ /* Disallow use of system attributes in extended stats */
+ if (attForm->attnum <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("statistics creation on system columns is not supported")));
+
+ /* Disallow data types without a less-than operator */
+ type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
+ if (type->lt_opr == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
+ attname, format_type_be(attForm->atttypid))));
+
+ attnums[nattnums] = attForm->attnum;
+ nattnums++;
+ ReleaseSysCache(atttuple);
+ }
+ else /* expression */
+ {
+ Node *expr = selem->expr;
+ Oid atttype;
+
+ Assert(expr != NULL);
+
+ /*
+ * Disallow data types without a less-than operator.
+ *
+ * We ignore this for statistics on a single expression, in which
+ * case we'll build the regular statistics only (and that code can
+ * deal with such data types).
+ */
+ if (list_length(stmt->exprs) > 1)
+ {
+ atttype = exprType(expr);
+ type = lookup_type_cache(atttype, TYPECACHE_LT_OPR);
+ if (type->lt_opr == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class",
+ format_type_be(atttype))));
+ }
+
+ stxexprs = lappend(stxexprs, expr);
+ }
}
/*
- * Check that at least two columns were specified in the statement. The
- * upper bound was already checked in the loop above.
- */
- if (numcols < 2)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
- errmsg("extended statistics require at least 2 columns")));
-
- /*
- * Sort the attnums, which makes detecting duplicates somewhat easier, and
- * it does not hurt (it does not affect the efficiency, unlike for
- * indexes, for example).
- */
- qsort(attnums, numcols, sizeof(int16), compare_int16);
-
- /*
- * Check for duplicates in the list of columns. The attnums are sorted so
- * just check consecutive elements.
+ * Parse the statistics kinds.
+ *
+ * First check that if this is the case with a single expression, there
+ * are no statistics kinds specified (we don't allow that for the simple
+ * CREATE STATISTICS form).
*/
- for (i = 1; i < numcols; i++)
+ if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1))
{
- if (attnums[i] == attnums[i - 1])
+ /* statistics kinds not specified */
+ if (list_length(stmt->stat_types) > 0)
ereport(ERROR,
- (errcode(ERRCODE_DUPLICATE_COLUMN),
- errmsg("duplicate column name in statistics definition")));
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("when building statistics on a single expression, statistics kinds may not be specified")));
}
- /* Form an int2vector representation of the sorted column list */
- stxkeys = buildint2vector(attnums, numcols);
-
- /*
- * Parse the statistics kinds.
- */
+ /* OK, let's check that we recognize the statistics kinds. */
build_ndistinct = false;
build_dependencies = false;
build_mcv = false;
@@ -313,14 +343,91 @@ CreateStatistics(CreateStatsStmt *stmt)
errmsg("unrecognized statistics kind \"%s\"",
type)));
}
- /* If no statistic type was specified, build them all. */
- if (!requested_type)
+
+ /*
+ * If no statistic type was specified, build them all (but only when the
+ * statistics is defined on more than one column/expression).
+ */
+ if ((!requested_type) && (numcols >= 2))
{
build_ndistinct = true;
build_dependencies = true;
build_mcv = true;
}
+ /*
+ * When there are non-trivial expressions, build the expression stats
+ * automatically. This allows calculating good estimates for stats that
+ * consider per-clause estimates (e.g. functional dependencies).
+ */
+ build_expressions = (list_length(stxexprs) > 0);
+
+ /*
+ * Check that at least two columns were specified in the statement, or
+ * that we're building statistics on a single expression.
+ */
+ if ((numcols < 2) && (list_length(stxexprs) != 1))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("extended statistics require at least 2 columns")));
+
+ /*
+ * Sort the attnums, which makes detecting duplicates somewhat easier, and
+ * it does not hurt (it does not matter for the contents, unlike for
+ * indexes, for example).
+ */
+ qsort(attnums, nattnums, sizeof(int16), compare_int16);
+
+ /*
+ * Check for duplicates in the list of columns. The attnums are sorted so
+ * just check consecutive elements.
+ */
+ for (i = 1; i < nattnums; i++)
+ {
+ if (attnums[i] == attnums[i - 1])
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_COLUMN),
+ errmsg("duplicate column name in statistics definition")));
+ }
+
+ /*
+ * Check for duplicate expressions. We do two loops, counting the
+ * occurrences of each expression. This is O(N^2) but we only allow small
+ * number of expressions and it's not executed often.
+ *
+ * XXX We don't cross-check attributes and expressions, because it does
+ * not seem worth it. In principle we could check that expressions don't
+ * contain trivial attribute references like "(a)", but the reasoning is
+ * similar to why we don't bother with extracting columns from
+ * expressions. It's either expensive or very easy to defeat for
+ * determined user, and there's no risk if we allow such statistics (the
+ * statistics is useless, but harmless).
+ */
+ foreach(cell, stxexprs)
+ {
+ Node *expr1 = (Node *) lfirst(cell);
+ int cnt = 0;
+
+ foreach(cell2, stxexprs)
+ {
+ Node *expr2 = (Node *) lfirst(cell2);
+
+ if (equal(expr1, expr2))
+ cnt += 1;
+ }
+
+ /* every expression should find at least itself */
+ Assert(cnt >= 1);
+
+ if (cnt > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_COLUMN),
+ errmsg("duplicate expression in statistics definition")));
+ }
+
+ /* Form an int2vector representation of the sorted column list */
+ stxkeys = buildint2vector(attnums, nattnums);
+
/* construct the char array of enabled statistic types */
ntypes = 0;
if (build_ndistinct)
@@ -329,9 +436,23 @@ CreateStatistics(CreateStatsStmt *stmt)
types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
if (build_mcv)
types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
+ if (build_expressions)
+ types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS);
Assert(ntypes > 0 && ntypes <= lengthof(types));
stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR);
+ /* convert the expressions (if any) to a text datum */
+ if (stxexprs != NIL)
+ {
+ char *exprsString;
+
+ exprsString = nodeToString(stxexprs);
+ exprsDatum = CStringGetTextDatum(exprsString);
+ pfree(exprsString);
+ }
+ else
+ exprsDatum = (Datum) 0;
+
statrel = table_open(StatisticExtRelationId, RowExclusiveLock);
/*
@@ -351,6 +472,10 @@ CreateStatistics(CreateStatsStmt *stmt)
values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys);
values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind);
+ values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum;
+ if (exprsDatum == (Datum) 0)
+ nulls[Anum_pg_statistic_ext_stxexprs - 1] = true;
+
/* insert it into pg_statistic_ext */
htup = heap_form_tuple(statrel->rd_att, values, nulls);
CatalogTupleInsert(statrel, htup);
@@ -373,6 +498,7 @@ CreateStatistics(CreateStatsStmt *stmt)
datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+ datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
/* insert it into pg_statistic_ext_data */
htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls);
@@ -396,13 +522,42 @@ CreateStatistics(CreateStatsStmt *stmt)
*/
ObjectAddressSet(myself, StatisticExtRelationId, statoid);
- for (i = 0; i < numcols; i++)
+ /* add dependencies for plain column references */
+ for (i = 0; i < nattnums; i++)
{
ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]);
recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
}
/*
+ * If there are no dependencies on a column, give the statistics an auto
+ * dependency on the whole table. In most cases, this will be redundant,
+ * but it might not be if the statistics expressions contain no Vars
+ * (which might seem strange but possible). This is consistent with what
+ * we do for indexes in index_create.
+ *
+ * XXX We intentionally don't consider the expressions before adding this
+ * dependency, because recordDependencyOnSingleRelExpr may not create any
+ * dependencies for whole-row Vars.
+ */
+ if (!nattnums)
+ {
+ ObjectAddressSet(parentobject, RelationRelationId, relid);
+ recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
+ }
+
+ /*
+ * Store dependencies on anything mentioned in statistics expressions,
+ * just like we do for index expressions.
+ */
+ if (stxexprs)
+ recordDependencyOnSingleRelExpr(&myself,
+ (Node *) stxexprs,
+ relid,
+ DEPENDENCY_NORMAL,
+ DEPENDENCY_AUTO, false, true);
+
+ /*
* Also add dependencies on namespace and owner. These are required
* because the stats object might have a different namespace and/or owner
* than the underlying table(s).
@@ -583,87 +738,6 @@ RemoveStatisticsById(Oid statsOid)
}
/*
- * Update a statistics object for ALTER COLUMN TYPE on a source column.
- *
- * This could throw an error if the type change can't be supported.
- * If it can be supported, but the stats must be recomputed, a likely choice
- * would be to set the relevant column(s) of the pg_statistic_ext_data tuple
- * to null until the next ANALYZE. (Note that the type change hasn't actually
- * happened yet, so one option that's *not* on the table is to recompute
- * immediately.)
- *
- * For both ndistinct and functional-dependencies stats, the on-disk
- * representation is independent of the source column data types, and it is
- * plausible to assume that the old statistic values will still be good for
- * the new column contents. (Obviously, if the ALTER COLUMN TYPE has a USING
- * expression that substantially alters the semantic meaning of the column
- * values, this assumption could fail. But that seems like a corner case
- * that doesn't justify zapping the stats in common cases.)
- *
- * For MCV lists that's not the case, as those statistics store the datums
- * internally. In this case we simply reset the statistics value to NULL.
- *
- * Note that "type change" includes collation change, which means we can rely
- * on the MCV list being consistent with the collation info in pg_attribute
- * during estimation.
- */
-void
-UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
- Oid oldColumnType, Oid newColumnType)
-{
- HeapTuple stup,
- oldtup;
-
- Relation rel;
-
- Datum values[Natts_pg_statistic_ext_data];
- bool nulls[Natts_pg_statistic_ext_data];
- bool replaces[Natts_pg_statistic_ext_data];
-
- oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid));
- if (!HeapTupleIsValid(oldtup))
- elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
-
- /*
- * When none of the defined statistics types contain datum values from the
- * table's columns then there's no need to reset the stats. Functional
- * dependencies and ndistinct stats should still hold true.
- */
- if (!statext_is_kind_built(oldtup, STATS_EXT_MCV))
- {
- ReleaseSysCache(oldtup);
- return;
- }
-
- /*
- * OK, we need to reset some statistics. So let's build the new tuple,
- * replacing the affected statistics types with NULL.
- */
- memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool));
- memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool));
- memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum));
-
- replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
- nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-
- rel = table_open(StatisticExtDataRelationId, RowExclusiveLock);
-
- /* replace the old tuple */
- stup = heap_modify_tuple(oldtup,
- RelationGetDescr(rel),
- values,
- nulls,
- replaces);
-
- ReleaseSysCache(oldtup);
- CatalogTupleUpdate(rel, &stup->t_self, stup);
-
- heap_freetuple(stup);
-
- table_close(rel, RowExclusiveLock);
-}
-
-/*
* Select a nonconflicting name for a new statistics.
*
* name1, name2, and label are used the same way as for makeObjectName(),
@@ -731,19 +805,28 @@ ChooseExtendedStatisticNameAddition(List *exprs)
buf[0] = '\0';
foreach(lc, exprs)
{
- ColumnRef *cref = (ColumnRef *) lfirst(lc);
+ StatsElem *selem = (StatsElem *) lfirst(lc);
const char *name;
/* It should be one of these, but just skip if it happens not to be */
- if (!IsA(cref, ColumnRef))
+ if (!IsA(selem, StatsElem))
continue;
- name = strVal((Value *) linitial(cref->fields));
+ name = selem->name;
if (buflen > 0)
buf[buflen++] = '_'; /* insert _ between names */
/*
+ * We use fixed 'expr' for expressions, which have empty column names.
+ * For indexes this is handled in ChooseIndexColumnNames, but we have
+ * no such function for stats and it does not seem worth adding. If a
+ * better name is needed, the user can specify it explicitly.
+ */
+ if (!name)
+ name = "expr";
+
+ /*
* At this point we have buflen <= NAMEDATALEN. name should be less
* than NAMEDATALEN already, but use strlcpy for paranoia.
*/
@@ -754,3 +837,29 @@ ChooseExtendedStatisticNameAddition(List *exprs)
}
return pstrdup(buf);
}
+
+/*
+ * StatisticsGetRelation: given a statistics's relation OID, get the OID of
+ * the relation it is an statistics on. Uses the system cache.
+ */
+Oid
+StatisticsGetRelation(Oid statId, bool missing_ok)
+{
+ HeapTuple tuple;
+ Form_pg_statistic_ext stx;
+ Oid result;
+
+ tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId));
+ if (!HeapTupleIsValid(tuple))
+ {
+ if (missing_ok)
+ return InvalidOid;
+ elog(ERROR, "cache lookup failed for statistics object %u", statId);
+ }
+ stx = (Form_pg_statistic_ext) GETSTRUCT(tuple);
+ Assert(stx->oid == statId);
+
+ result = stx->stxrelid;
+ ReleaseSysCache(tuple);
+ return result;
+}
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index efac06f72c..88a68a4697 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -41,6 +41,7 @@
#include "catalog/pg_namespace.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_tablespace.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_trigger.h"
#include "catalog/pg_type.h"
#include "catalog/storage.h"
@@ -188,6 +189,8 @@ typedef struct AlteredTableInfo
List *changedIndexDefs; /* string definitions of same */
char *replicaIdentityIndex; /* index to reset as REPLICA IDENTITY */
char *clusterOnIndex; /* index to use for CLUSTER */
+ List *changedStatisticsOids; /* OIDs of statistics to rebuild */
+ List *changedStatisticsDefs; /* string definitions of same */
} AlteredTableInfo;
/* Struct describing one new constraint to check in Phase 3 scan */
@@ -440,6 +443,8 @@ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *c
ObjectAddresses *addrs);
static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
+static ObjectAddress ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+ CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
static ObjectAddress ATExecAddConstraint(List **wqueue,
AlteredTableInfo *tab, Relation rel,
Constraint *newConstraint, bool recurse, bool is_readd,
@@ -496,6 +501,7 @@ static ObjectAddress ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
AlterTableCmd *cmd, LOCKMODE lockmode);
static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab);
static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab);
+static void RememberStatisticsForRebuilding(Oid indoid, AlteredTableInfo *tab);
static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab,
LOCKMODE lockmode);
static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId,
@@ -4756,6 +4762,10 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab,
address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, true,
lockmode);
break;
+ case AT_ReAddStatistics: /* ADD STATISTICS */
+ address = ATExecAddStatistics(tab, rel, (CreateStatsStmt *) cmd->def,
+ true, lockmode);
+ break;
case AT_AddConstraint: /* ADD CONSTRAINT */
/* Transform the command only during initial examination */
if (cur_pass == AT_PASS_ADD_CONSTR)
@@ -8284,6 +8294,29 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
}
/*
+ * ALTER TABLE ADD STATISTICS
+ *
+ * This is no such command in the grammar, but we use this internally to add
+ * AT_ReAddStatistics subcommands to rebuild extended statistics after a table
+ * column type change.
+ */
+static ObjectAddress
+ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+ CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode)
+{
+ ObjectAddress address;
+
+ Assert(IsA(stmt, CreateStatsStmt));
+
+ /* The CreateStatsStmt has already been through transformStatsStmt */
+ Assert(stmt->transformed);
+
+ address = CreateStatistics(stmt);
+
+ return address;
+}
+
+/*
* ALTER TABLE ADD CONSTRAINT USING INDEX
*
* Returns the address of the new constraint.
@@ -11830,9 +11863,7 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
* Give the extended-stats machinery a chance to fix anything
* that this column type change would break.
*/
- UpdateStatisticsForTypeChange(foundObject.objectId,
- RelationGetRelid(rel), attnum,
- attTup->atttypid, targettype);
+ RememberStatisticsForRebuilding(foundObject.objectId, tab);
break;
case OCLASS_PROC:
@@ -12203,6 +12234,32 @@ RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab)
}
/*
+ * Subroutine for ATExecAlterColumnType: remember that a statistics object
+ * needs to be rebuilt (which we might already know).
+ */
+static void
+RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab)
+{
+ /*
+ * This de-duplication check is critical for two independent reasons: we
+ * mustn't try to recreate the same statistics object twice, and if the
+ * statistics depends on more than one column whose type is to be altered,
+ * we must capture its definition string before applying any of the type
+ * changes. ruleutils.c will get confused if we ask again later.
+ */
+ if (!list_member_oid(tab->changedStatisticsOids, stxoid))
+ {
+ /* OK, capture the index's existing definition string */
+ char *defstring = pg_get_statisticsobjdef_string(stxoid);
+
+ tab->changedStatisticsOids = lappend_oid(tab->changedStatisticsOids,
+ stxoid);
+ tab->changedStatisticsDefs = lappend(tab->changedStatisticsDefs,
+ defstring);
+ }
+}
+
+/*
* Cleanup after we've finished all the ALTER TYPE operations for a
* particular relation. We have to drop and recreate all the indexes
* and constraints that depend on the altered columns. We do the
@@ -12306,6 +12363,22 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
add_exact_object_address(&obj, objects);
}
+ /* add dependencies for new statistics */
+ forboth(oid_item, tab->changedStatisticsOids,
+ def_item, tab->changedStatisticsDefs)
+ {
+ Oid oldId = lfirst_oid(oid_item);
+ Oid relid;
+
+ relid = StatisticsGetRelation(oldId, false);
+ ATPostAlterTypeParse(oldId, relid, InvalidOid,
+ (char *) lfirst(def_item),
+ wqueue, lockmode, tab->rewrite);
+
+ ObjectAddressSet(obj, StatisticExtRelationId, oldId);
+ add_exact_object_address(&obj, objects);
+ }
+
/*
* Queue up command to restore replica identity index marking
*/
@@ -12354,9 +12427,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
}
/*
- * Parse the previously-saved definition string for a constraint or index
- * against the newly-established column data type(s), and queue up the
- * resulting command parsetrees for execution.
+ * Parse the previously-saved definition string for a constraint, index or
+ * statistics object against the newly-established column data type(s), and
+ * queue up the resulting command parsetrees for execution.
*
* This might fail if, for example, you have a WHERE clause that uses an
* operator that's not available for the new column type.
@@ -12402,6 +12475,11 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
querytree_list = lappend(querytree_list, stmt);
querytree_list = list_concat(querytree_list, afterStmts);
}
+ else if (IsA(stmt, CreateStatsStmt))
+ querytree_list = lappend(querytree_list,
+ transformStatsStmt(oldRelId,
+ (CreateStatsStmt *) stmt,
+ cmd));
else
querytree_list = lappend(querytree_list, stmt);
}
@@ -12540,6 +12618,20 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
elog(ERROR, "unexpected statement subtype: %d",
(int) stmt->subtype);
}
+ else if (IsA(stm, CreateStatsStmt))
+ {
+ CreateStatsStmt *stmt = (CreateStatsStmt *) stm;
+ AlterTableCmd *newcmd;
+
+ /* keep the statistics object's comment */
+ stmt->stxcomment = GetComment(oldId, StatisticExtRelationId, 0);
+
+ newcmd = makeNode(AlterTableCmd);
+ newcmd->subtype = AT_ReAddStatistics;
+ newcmd->def = (Node *) stmt;
+ tab->subcmds[AT_PASS_MISC] =
+ lappend(tab->subcmds[AT_PASS_MISC], newcmd);
+ }
else
elog(ERROR, "unexpected statement type: %d",
(int) nodeTag(stm));
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 38b56231b7..d5b1ad4567 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -2980,6 +2980,17 @@ _copyIndexElem(const IndexElem *from)
return newnode;
}
+static StatsElem *
+_copyStatsElem(const StatsElem *from)
+{
+ StatsElem *newnode = makeNode(StatsElem);
+
+ COPY_STRING_FIELD(name);
+ COPY_NODE_FIELD(expr);
+
+ return newnode;
+}
+
static ColumnDef *
_copyColumnDef(const ColumnDef *from)
{
@@ -5699,6 +5710,9 @@ copyObjectImpl(const void *from)
case T_IndexElem:
retval = _copyIndexElem(from);
break;
+ case T_StatsElem:
+ retval = _copyStatsElem(from);
+ break;
case T_ColumnDef:
retval = _copyColumnDef(from);
break;
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index 3292dda342..d46909bbc4 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -2596,6 +2596,16 @@ _equalIndexElem(const IndexElem *a, const IndexElem *b)
return true;
}
+
+static bool
+_equalStatsElem(const StatsElem *a, const StatsElem *b)
+{
+ COMPARE_STRING_FIELD(name);
+ COMPARE_NODE_FIELD(expr);
+
+ return true;
+}
+
static bool
_equalColumnDef(const ColumnDef *a, const ColumnDef *b)
{
@@ -3724,6 +3734,9 @@ equal(const void *a, const void *b)
case T_IndexElem:
retval = _equalIndexElem(a, b);
break;
+ case T_StatsElem:
+ retval = _equalStatsElem(a, b);
+ break;
case T_ColumnDef:
retval = _equalColumnDef(a, b);
break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 9f7918c7e9..12561c4757 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -2944,6 +2944,15 @@ _outIndexElem(StringInfo str, const IndexElem *node)
}
static void
+_outStatsElem(StringInfo str, const StatsElem *node)
+{
+ WRITE_NODE_TYPE("STATSELEM");
+
+ WRITE_STRING_FIELD(name);
+ WRITE_NODE_FIELD(expr);
+}
+
+static void
_outQuery(StringInfo str, const Query *node)
{
WRITE_NODE_TYPE("QUERY");
@@ -4286,6 +4295,9 @@ outNode(StringInfo str, const void *obj)
case T_IndexElem:
_outIndexElem(str, obj);
break;
+ case T_StatsElem:
+ _outStatsElem(str, obj);
+ break;
case T_Query:
_outQuery(str, obj);
break;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 6c39bf893f..0fa8875f09 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -34,6 +34,7 @@
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
@@ -1308,6 +1309,7 @@ get_relation_constraints(PlannerInfo *root,
static List *
get_relation_statistics(RelOptInfo *rel, Relation relation)
{
+ Index varno = rel->relid;
List *statoidlist;
List *stainfos = NIL;
ListCell *l;
@@ -1321,6 +1323,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
HeapTuple htup;
HeapTuple dtup;
Bitmapset *keys = NULL;
+ List *exprs = NIL;
int i;
htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@@ -1340,6 +1343,49 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
for (i = 0; i < staForm->stxkeys.dim1; i++)
keys = bms_add_member(keys, staForm->stxkeys.values[i]);
+ /*
+ * Preprocess expressions (if any). We read the expressions, run them
+ * through eval_const_expressions, and fix the varnos.
+ */
+ {
+ bool isnull;
+ Datum datum;
+
+ /* decode expression (if any) */
+ datum = SysCacheGetAttr(STATEXTOID, htup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ if (!isnull)
+ {
+ char *exprsString;
+
+ exprsString = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(exprsString);
+ pfree(exprsString);
+
+ /*
+ * Run the expressions through eval_const_expressions. This is
+ * not just an optimization, but is necessary, because the
+ * planner will be comparing them to similarly-processed qual
+ * clauses, and may fail to detect valid matches without this.
+ * We must not use canonicalize_qual, however, since these
+ * aren't qual expressions.
+ */
+ exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+ /* May as well fix opfuncids too */
+ fix_opfuncids((Node *) exprs);
+
+ /*
+ * Modify the copies we obtain from the relcache to have the
+ * correct varno for the parent relation, so that they match
+ * up correctly against qual clauses.
+ */
+ if (varno != 1)
+ ChangeVarNodes((Node *) exprs, 1, varno, 0);
+ }
+ }
+
/* add one StatisticExtInfo for each kind built */
if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT))
{
@@ -1349,6 +1395,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
info->rel = rel;
info->kind = STATS_EXT_NDISTINCT;
info->keys = bms_copy(keys);
+ info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
@@ -1361,6 +1408,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
info->rel = rel;
info->kind = STATS_EXT_DEPENDENCIES;
info->keys = bms_copy(keys);
+ info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
@@ -1373,6 +1421,20 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
info->rel = rel;
info->kind = STATS_EXT_MCV;
info->keys = bms_copy(keys);
+ info->exprs = exprs;
+
+ stainfos = lappend(stainfos, info);
+ }
+
+ if (statext_is_kind_built(dtup, STATS_EXT_EXPRESSIONS))
+ {
+ StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+ info->statOid = statOid;
+ info->rel = rel;
+ info->kind = STATS_EXT_EXPRESSIONS;
+ info->keys = bms_copy(keys);
+ info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 2132cf4d82..7ff36bc842 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -239,6 +239,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
WindowDef *windef;
JoinExpr *jexpr;
IndexElem *ielem;
+ StatsElem *selem;
Alias *alias;
RangeVar *range;
IntoClause *into;
@@ -405,7 +406,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
old_aggr_definition old_aggr_list
oper_argtypes RuleActionList RuleActionMulti
opt_column_list columnList opt_name_list
- sort_clause opt_sort_clause sortby_list index_params
+ sort_clause opt_sort_clause sortby_list index_params stats_params
opt_include opt_c_include index_including_params
name_list role_list from_clause from_list opt_array_bounds
qualified_name_list any_name any_name_list type_name_list
@@ -512,6 +513,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
%type <list> func_alias_clause
%type <sortby> sortby
%type <ielem> index_elem index_elem_options
+%type <selem> stats_param
%type <node> table_ref
%type <jexpr> joined_table
%type <range> relation_expr
@@ -4097,7 +4099,7 @@ ExistingIndex: USING INDEX name { $$ = $3; }
CreateStatsStmt:
CREATE STATISTICS any_name
- opt_name_list ON expr_list FROM from_list
+ opt_name_list ON stats_params FROM from_list
{
CreateStatsStmt *n = makeNode(CreateStatsStmt);
n->defnames = $3;
@@ -4109,7 +4111,7 @@ CreateStatsStmt:
$$ = (Node *)n;
}
| CREATE STATISTICS IF_P NOT EXISTS any_name
- opt_name_list ON expr_list FROM from_list
+ opt_name_list ON stats_params FROM from_list
{
CreateStatsStmt *n = makeNode(CreateStatsStmt);
n->defnames = $6;
@@ -4122,6 +4124,36 @@ CreateStatsStmt:
}
;
+/*
+ * Statistics attributes can be either simple column references, or arbitrary
+ * expressions in parens. For compatibility with index attributes permitted
+ * in CREATE INDEX, we allow an expression that's just a function call to be
+ * written without parens.
+ */
+
+stats_params: stats_param { $$ = list_make1($1); }
+ | stats_params ',' stats_param { $$ = lappend($1, $3); }
+ ;
+
+stats_param: ColId
+ {
+ $$ = makeNode(StatsElem);
+ $$->name = $1;
+ $$->expr = NULL;
+ }
+ | func_expr_windowless
+ {
+ $$ = makeNode(StatsElem);
+ $$->name = NULL;
+ $$->expr = $1;
+ }
+ | '(' a_expr ')'
+ {
+ $$ = makeNode(StatsElem);
+ $$->name = NULL;
+ $$->expr = $2;
+ }
+ ;
/*****************************************************************************
*
diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c
index 7c3e01aa22..ceb0bf597d 100644
--- a/src/backend/parser/parse_agg.c
+++ b/src/backend/parser/parse_agg.c
@@ -485,6 +485,13 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr)
err = _("grouping operations are not allowed in index predicates");
break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ if (isAgg)
+ err = _("aggregate functions are not allowed in statistics expressions");
+ else
+ err = _("grouping operations are not allowed in statistics expressions");
+
+ break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
if (isAgg)
err = _("aggregate functions are not allowed in transform expressions");
@@ -910,6 +917,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc,
case EXPR_KIND_INDEX_EXPRESSION:
err = _("window functions are not allowed in index expressions");
break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ err = _("window functions are not allowed in statistics expressions");
+ break;
case EXPR_KIND_INDEX_PREDICATE:
err = _("window functions are not allowed in index predicates");
break;
diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c
index f869e159d6..03373d551f 100644
--- a/src/backend/parser/parse_expr.c
+++ b/src/backend/parser/parse_expr.c
@@ -500,6 +500,7 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
case EXPR_KIND_FUNCTION_DEFAULT:
case EXPR_KIND_INDEX_EXPRESSION:
case EXPR_KIND_INDEX_PREDICATE:
+ case EXPR_KIND_STATS_EXPRESSION:
case EXPR_KIND_ALTER_COL_TRANSFORM:
case EXPR_KIND_EXECUTE_PARAMETER:
case EXPR_KIND_TRIGGER_WHEN:
@@ -1741,6 +1742,9 @@ transformSubLink(ParseState *pstate, SubLink *sublink)
case EXPR_KIND_INDEX_PREDICATE:
err = _("cannot use subquery in index predicate");
break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ err = _("cannot use subquery in statistics expression");
+ break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
err = _("cannot use subquery in transform expression");
break;
@@ -3030,6 +3034,8 @@ ParseExprKindName(ParseExprKind exprKind)
return "index expression";
case EXPR_KIND_INDEX_PREDICATE:
return "index predicate";
+ case EXPR_KIND_STATS_EXPRESSION:
+ return "statistics expression";
case EXPR_KIND_ALTER_COL_TRANSFORM:
return "USING";
case EXPR_KIND_EXECUTE_PARAMETER:
diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c
index 37cebc7d82..debef1d14f 100644
--- a/src/backend/parser/parse_func.c
+++ b/src/backend/parser/parse_func.c
@@ -2503,6 +2503,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location)
case EXPR_KIND_INDEX_PREDICATE:
err = _("set-returning functions are not allowed in index predicates");
break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ err = _("set-returning functions are not allowed in statistics expressions");
+ break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
err = _("set-returning functions are not allowed in transform expressions");
break;
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index aa6c19adad..b968c25dd6 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -1917,6 +1917,9 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
stat_types = lappend(stat_types, makeString("dependencies"));
else if (enabled[i] == STATS_EXT_MCV)
stat_types = lappend(stat_types, makeString("mcv"));
+ else if (enabled[i] == STATS_EXT_EXPRESSIONS)
+ /* expression stats are not exposed to users */
+ continue;
else
elog(ERROR, "unrecognized statistics kind %c", enabled[i]);
}
@@ -1924,14 +1927,47 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
/* Determine which columns the statistics are on */
for (i = 0; i < statsrec->stxkeys.dim1; i++)
{
- ColumnRef *cref = makeNode(ColumnRef);
+ StatsElem *selem = makeNode(StatsElem);
AttrNumber attnum = statsrec->stxkeys.values[i];
- cref->fields = list_make1(makeString(get_attname(heapRelid,
- attnum, false)));
- cref->location = -1;
+ selem->name = get_attname(heapRelid, attnum, false);
+ selem->expr = NULL;
- def_names = lappend(def_names, cref);
+ def_names = lappend(def_names, selem);
+ }
+
+ /*
+ * Now handle expressions, if there are any. The order (with respect to
+ * regular attributes) does not really matter for extended stats, so we
+ * simply append them after simple column references.
+ *
+ * XXX Some places during build/estimation treat expressions as if they
+ * are before atttibutes, but for the CREATE command that's entirely
+ * irrelevant.
+ */
+ datum = SysCacheGetAttr(STATEXTOID, ht_stats,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ if (!isnull)
+ {
+ ListCell *lc;
+ List *exprs = NIL;
+ char *exprsString;
+
+ exprsString = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(exprsString);
+
+ foreach(lc, exprs)
+ {
+ StatsElem *selem = makeNode(StatsElem);
+
+ selem->name = NULL;
+ selem->expr = (Node *) lfirst(lc);
+
+ def_names = lappend(def_names, selem);
+ }
+
+ pfree(exprsString);
}
/* finally, build the output node */
@@ -1942,6 +1978,7 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
stats->relations = list_make1(heapRel);
stats->stxcomment = NULL;
stats->if_not_exists = false;
+ stats->transformed = true; /* don't need transformStatsStmt again */
/* Clean up */
ReleaseSysCache(ht_stats);
@@ -2866,6 +2903,84 @@ transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString)
return stmt;
}
+/*
+ * transformStatsStmt - parse analysis for CREATE STATISTICS
+ *
+ * To avoid race conditions, it's important that this function rely only on
+ * the passed-in relid (and not on stmt->relation) to determine the target
+ * relation.
+ */
+CreateStatsStmt *
+transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString)
+{
+ ParseState *pstate;
+ ParseNamespaceItem *nsitem;
+ ListCell *l;
+ Relation rel;
+
+ /* Nothing to do if statement already transformed. */
+ if (stmt->transformed)
+ return stmt;
+
+ /*
+ * We must not scribble on the passed-in CreateStatsStmt, so copy it.
+ * (This is overkill, but easy.)
+ */
+ stmt = copyObject(stmt);
+
+ /* Set up pstate */
+ pstate = make_parsestate(NULL);
+ pstate->p_sourcetext = queryString;
+
+ /*
+ * Put the parent table into the rtable so that the expressions can refer
+ * to its fields without qualification. Caller is responsible for locking
+ * relation, but we still need to open it.
+ */
+ rel = relation_open(relid, NoLock);
+ nsitem = addRangeTableEntryForRelation(pstate, rel,
+ AccessShareLock,
+ NULL, false, true);
+
+ /* no to join list, yes to namespaces */
+ addNSItemToQuery(pstate, nsitem, false, true, true);
+
+ /* take care of any expressions */
+ foreach(l, stmt->exprs)
+ {
+ StatsElem *selem = (StatsElem *) lfirst(l);
+
+ if (selem->expr)
+ {
+ /* Now do parse transformation of the expression */
+ selem->expr = transformExpr(pstate, selem->expr,
+ EXPR_KIND_STATS_EXPRESSION);
+
+ /* We have to fix its collations too */
+ assign_expr_collations(pstate, selem->expr);
+ }
+ }
+
+ /*
+ * Check that only the base rel is mentioned. (This should be dead code
+ * now that add_missing_from is history.)
+ */
+ if (list_length(pstate->p_rtable) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("statistics expressions can refer only to the table being indexed")));
+
+ free_parsestate(pstate);
+
+ /* Close relation */
+ table_close(rel, NoLock);
+
+ /* Mark statement as successfully transformed */
+ stmt->transformed = true;
+
+ return stmt;
+}
+
/*
* transformRuleStmt -
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index eac9285165..cf8a6d5f68 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -70,15 +70,15 @@ static void generate_dependencies(DependencyGenerator state);
static DependencyGenerator DependencyGenerator_init(int n, int k);
static void DependencyGenerator_free(DependencyGenerator state);
static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
-static double dependency_degree(int numrows, HeapTuple *rows, int k,
- AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
+static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
static bool dependency_is_fully_matched(MVDependency *dependency,
Bitmapset *attnums);
static bool dependency_is_compatible_clause(Node *clause, Index relid,
AttrNumber *attnum);
+static bool dependency_is_compatible_expression(Node *clause, Index relid,
+ List *statlist, Node **expr);
static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
- int ndependencies,
- Bitmapset *attnums);
+ int ndependencies, Bitmapset *attnums);
static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
int varRelid, JoinType jointype,
SpecialJoinInfo *sjinfo,
@@ -219,16 +219,13 @@ DependencyGenerator_next(DependencyGenerator state)
* the last one.
*/
static double
-dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
- VacAttrStats **stats, Bitmapset *attrs)
+dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
{
int i,
nitems;
MultiSortSupport mss;
SortItem *items;
- AttrNumber *attnums;
AttrNumber *attnums_dep;
- int numattrs;
/* counters valid within a group */
int group_size = 0;
@@ -244,15 +241,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
mss = multi_sort_init(k);
/*
- * Transform the attrs from bitmap to an array to make accessing the i-th
- * member easier, and then construct a filtered version with only attnums
- * referenced by the dependency we validate.
+ * Translate the array of indexes to regular attnums for the dependency (we
+ * will need this to identify the columns in StatsBuildData).
*/
- attnums = build_attnums_array(attrs, &numattrs);
-
attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
for (i = 0; i < k; i++)
- attnums_dep[i] = attnums[dependency[i]];
+ attnums_dep[i] = data->attnums[dependency[i]];
/*
* Verify the dependency (a,b,...)->z, using a rather simple algorithm:
@@ -270,7 +264,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
/* prepare the sort function for the dimensions */
for (i = 0; i < k; i++)
{
- VacAttrStats *colstat = stats[dependency[i]];
+ VacAttrStats *colstat = data->stats[dependency[i]];
TypeCacheEntry *type;
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@@ -289,8 +283,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
* descriptor. For now that assumption holds, but it might change in the
* future for example if we support statistics on multiple tables.
*/
- items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
- mss, k, attnums_dep);
+ items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
/*
* Walk through the sorted array, split it into rows according to the
@@ -336,11 +329,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
pfree(items);
pfree(mss);
- pfree(attnums);
pfree(attnums_dep);
/* Compute the 'degree of validity' as (supporting/total). */
- return (n_supporting_rows * 1.0 / numrows);
+ return (n_supporting_rows * 1.0 / data->numrows);
}
/*
@@ -360,23 +352,15 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
* (c) -> b
*/
MVDependencies *
-statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
- VacAttrStats **stats)
+statext_dependencies_build(StatsBuildData *data)
{
int i,
k;
- int numattrs;
- AttrNumber *attnums;
/* result */
MVDependencies *dependencies = NULL;
- /*
- * Transform the bms into an array, to make accessing i-th member easier.
- */
- attnums = build_attnums_array(attrs, &numattrs);
-
- Assert(numattrs >= 2);
+ Assert(data->nattnums >= 2);
/*
* We'll try build functional dependencies starting from the smallest ones
@@ -384,12 +368,12 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
* included in the statistics object. We start from the smallest ones
* because we want to be able to skip already implied ones.
*/
- for (k = 2; k <= numattrs; k++)
+ for (k = 2; k <= data->nattnums; k++)
{
AttrNumber *dependency; /* array with k elements */
/* prepare a DependencyGenerator of variation */
- DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
+ DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
/* generate all possible variations of k values (out of n) */
while ((dependency = DependencyGenerator_next(DependencyGenerator)))
@@ -398,7 +382,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
MVDependency *d;
/* compute how valid the dependency seems */
- degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
+ degree = dependency_degree(data, k, dependency);
/*
* if the dependency seems entirely invalid, don't store it
@@ -413,7 +397,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
d->degree = degree;
d->nattributes = k;
for (i = 0; i < k; i++)
- d->attributes[i] = attnums[dependency[i]];
+ d->attributes[i] = data->attnums[dependency[i]];
/* initialize the list of dependencies */
if (dependencies == NULL)
@@ -747,6 +731,7 @@ static bool
dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
{
Var *var;
+ Node *clause_expr;
if (IsA(clause, RestrictInfo))
{
@@ -774,9 +759,9 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
/* Make sure non-selected argument is a pseudoconstant. */
if (is_pseudo_constant_clause(lsecond(expr->args)))
- var = linitial(expr->args);
+ clause_expr = linitial(expr->args);
else if (is_pseudo_constant_clause(linitial(expr->args)))
- var = lsecond(expr->args);
+ clause_expr = lsecond(expr->args);
else
return false;
@@ -805,8 +790,8 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
/*
* Reject ALL() variant, we only care about ANY/IN.
*
- * FIXME Maybe we should check if all the values are the same, and
- * allow ALL in that case? Doesn't seem very practical, though.
+ * XXX Maybe we should check if all the values are the same, and allow
+ * ALL in that case? Doesn't seem very practical, though.
*/
if (!expr->useOr)
return false;
@@ -822,7 +807,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
if (!is_pseudo_constant_clause(lsecond(expr->args)))
return false;
- var = linitial(expr->args);
+ clause_expr = linitial(expr->args);
/*
* If it's not an "=" operator, just ignore the clause, as it's not
@@ -838,13 +823,13 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
}
else if (is_orclause(clause))
{
- BoolExpr *expr = (BoolExpr *) clause;
+ BoolExpr *bool_expr = (BoolExpr *) clause;
ListCell *lc;
/* start with no attribute number */
*attnum = InvalidAttrNumber;
- foreach(lc, expr->args)
+ foreach(lc, bool_expr->args)
{
AttrNumber clause_attnum;
@@ -859,6 +844,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
if (*attnum == InvalidAttrNumber)
*attnum = clause_attnum;
+ /* ensure all the variables are the same (same attnum) */
if (*attnum != clause_attnum)
return false;
}
@@ -872,7 +858,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
* "NOT x" can be interpreted as "x = false", so get the argument and
* proceed with seeing if it's a suitable Var.
*/
- var = (Var *) get_notclausearg(clause);
+ clause_expr = (Node *) get_notclausearg(clause);
}
else
{
@@ -880,20 +866,23 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
* A boolean expression "x" can be interpreted as "x = true", so
* proceed with seeing if it's a suitable Var.
*/
- var = (Var *) clause;
+ clause_expr = (Node *) clause;
}
/*
* We may ignore any RelabelType node above the operand. (There won't be
* more than one, since eval_const_expressions has been applied already.)
*/
- if (IsA(var, RelabelType))
- var = (Var *) ((RelabelType *) var)->arg;
+ if (IsA(clause_expr, RelabelType))
+ clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
/* We only support plain Vars for now */
- if (!IsA(var, Var))
+ if (!IsA(clause_expr, Var))
return false;
+ /* OK, we know we have a Var */
+ var = (Var *) clause_expr;
+
/* Ensure Var is from the correct relation */
if (var->varno != relid)
return false;
@@ -1158,6 +1147,212 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
}
/*
+ * dependency_is_compatible_expression
+ * Determines if the expression is compatible with functional dependencies
+ *
+ * Similar to dependency_is_compatible_clause, but doesn't enforce that the
+ * expression is a simple Var. OTOH we check that there's at least one
+ * statistics object matching the expression.
+ */
+static bool
+dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
+{
+ List *vars;
+ ListCell *lc,
+ *lc2;
+ Node *clause_expr;
+
+ if (IsA(clause, RestrictInfo))
+ {
+ RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+ /* Pseudoconstants are not interesting (they couldn't contain a Var) */
+ if (rinfo->pseudoconstant)
+ return false;
+
+ /* Clauses referencing multiple, or no, varnos are incompatible */
+ if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+ return false;
+
+ clause = (Node *) rinfo->clause;
+ }
+
+ if (is_opclause(clause))
+ {
+ /* If it's an opclause, check for Var = Const or Const = Var. */
+ OpExpr *expr = (OpExpr *) clause;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /* Make sure non-selected argument is a pseudoconstant. */
+ if (is_pseudo_constant_clause(lsecond(expr->args)))
+ clause_expr = linitial(expr->args);
+ else if (is_pseudo_constant_clause(linitial(expr->args)))
+ clause_expr = lsecond(expr->args);
+ else
+ return false;
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies.
+ *
+ * This uses the function for estimating selectivity, not the operator
+ * directly (a bit awkward, but well ...).
+ *
+ * XXX this is pretty dubious; probably it'd be better to check btree
+ * or hash opclass membership, so as not to be fooled by custom
+ * selectivity functions, and to be more consistent with decisions
+ * elsewhere in the planner.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (IsA(clause, ScalarArrayOpExpr))
+ {
+ /* If it's an scalar array operator, check for Var IN Const. */
+ ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
+
+ /*
+ * Reject ALL() variant, we only care about ANY/IN.
+ *
+ * FIXME Maybe we should check if all the values are the same, and
+ * allow ALL in that case? Doesn't seem very practical, though.
+ */
+ if (!expr->useOr)
+ return false;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /*
+ * We know it's always (Var IN Const), so we assume the var is the
+ * first argument, and pseudoconstant is the second one.
+ */
+ if (!is_pseudo_constant_clause(lsecond(expr->args)))
+ return false;
+
+ clause_expr = linitial(expr->args);
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies. The operator is identified
+ * simply by looking at which function it uses to estimate
+ * selectivity. That's a bit strange, but it's what other similar
+ * places do.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (is_orclause(clause))
+ {
+ BoolExpr *bool_expr = (BoolExpr *) clause;
+ ListCell *lc;
+
+ /* start with no expression (we'll use the first match) */
+ *expr = NULL;
+
+ foreach(lc, bool_expr->args)
+ {
+ Node *or_expr = NULL;
+
+ /*
+ * Had we found incompatible expression in the arguments, treat
+ * the whole expression as incompatible.
+ */
+ if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
+ statlist, &or_expr))
+ return false;
+
+ if (*expr == NULL)
+ *expr = or_expr;
+
+ /* ensure all the expressions are the same */
+ if (!equal(or_expr, *expr))
+ return false;
+ }
+
+ /* the expression is already checked by the recursive call */
+ return true;
+ }
+ else if (is_notclause(clause))
+ {
+ /*
+ * "NOT x" can be interpreted as "x = false", so get the argument and
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) get_notclausearg(clause);
+ }
+ else
+ {
+ /*
+ * A boolean expression "x" can be interpreted as "x = true", so
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) clause;
+ }
+
+ /*
+ * We may ignore any RelabelType node above the operand. (There won't be
+ * more than one, since eval_const_expressions has been applied already.)
+ */
+ if (IsA(clause_expr, RelabelType))
+ clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
+
+ vars = pull_var_clause(clause_expr, 0);
+
+ foreach(lc, vars)
+ {
+ Var *var = (Var *) lfirst(lc);
+
+ /* Ensure Var is from the correct relation */
+ if (var->varno != relid)
+ return false;
+
+ /* We also better ensure the Var is from the current level */
+ if (var->varlevelsup != 0)
+ return false;
+
+ /* Also ignore system attributes (we don't allow stats on those) */
+ if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+ return false;
+ }
+
+ /*
+ * Check if we actually have a matching statistics for the expression.
+ *
+ * XXX Maybe this is an overkill. We'll eliminate the expressions later.
+ */
+ foreach(lc, statlist)
+ {
+ StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+
+ /* ignore stats without dependencies */
+ if (info->kind != STATS_EXT_DEPENDENCIES)
+ continue;
+
+ foreach(lc2, info->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc2);
+
+ if (equal(clause_expr, stat_expr))
+ {
+ *expr = stat_expr;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
* dependencies_clauselist_selectivity
* Return the estimated selectivity of (a subset of) the given clauses
* using functional dependency statistics, or 1.0 if no useful functional
@@ -1204,6 +1399,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
MVDependency **dependencies;
int ndependencies;
int i;
+ AttrNumber attnum_offset;
+
+ /* unique expressions */
+ Node **unique_exprs;
+ int unique_exprs_cnt;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
@@ -1213,6 +1413,15 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
list_length(clauses));
/*
+ * We allocate space as if every clause was a unique expression, although
+ * that's probably overkill. Some will be simple column references that
+ * we'll translate to attnums, and there might be duplicates. But it's
+ * easier and cheaper to just do one allocation than repalloc later.
+ */
+ unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
+ unique_exprs_cnt = 0;
+
+ /*
* Pre-process the clauses list to extract the attnums seen in each item.
* We need to determine if there's any clauses which will be useful for
* dependency selectivity estimations. Along the way we'll record all of
@@ -1222,29 +1431,127 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
*
* We also skip clauses that we already estimated using different types of
* statistics (we treat them as incompatible).
+ *
+ * To handle expressions, we assign them negative attnums, as if it was a
+ * system attribute (this is fine, as we only allow extended stats on user
+ * attributes). And then we offset everything by the number of
+ * expressions, so that we can store the values in a bitmapset.
*/
listidx = 0;
foreach(l, clauses)
{
Node *clause = (Node *) lfirst(l);
AttrNumber attnum;
+ Node *expr = NULL;
+
+ /* ignore clause by default */
+ list_attnums[listidx] = InvalidAttrNumber;
- if (!bms_is_member(listidx, *estimatedclauses) &&
- dependency_is_compatible_clause(clause, rel->relid, &attnum))
+ if (!bms_is_member(listidx, *estimatedclauses))
{
- list_attnums[listidx] = attnum;
- clauses_attnums = bms_add_member(clauses_attnums, attnum);
+ /*
+ * If it's a simple column refrence, just extract the attnum. If
+ * it's an expression, assign a negative attnum as if it was a
+ * system attribute.
+ */
+ if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+ {
+ list_attnums[listidx] = attnum;
+ }
+ else if (dependency_is_compatible_expression(clause, rel->relid,
+ rel->statlist,
+ &expr))
+ {
+ /* special attnum assigned to this expression */
+ attnum = InvalidAttrNumber;
+
+ Assert(expr != NULL);
+
+ /* If the expression is duplicate, use the same attnum. */
+ for (i = 0; i < unique_exprs_cnt; i++)
+ {
+ if (equal(unique_exprs[i], expr))
+ {
+ /* negative attribute number to expression */
+ attnum = -(i + 1);
+ break;
+ }
+ }
+
+ /* not found in the list, so add it */
+ if (attnum == InvalidAttrNumber)
+ {
+ unique_exprs[unique_exprs_cnt++] = expr;
+
+ /* after incrementing the value, to get -1, -2, ... */
+ attnum = (-unique_exprs_cnt);
+ }
+
+ /* remember which attnum was assigned to this clause */
+ list_attnums[listidx] = attnum;
+ }
}
- else
- list_attnums[listidx] = InvalidAttrNumber;
listidx++;
}
+ Assert(listidx == list_length(clauses));
+
/*
- * If there's not at least two distinct attnums then reject the whole list
- * of clauses. We must return 1.0 so the calling function's selectivity is
- * unaffected.
+ * How much we need to offset the attnums? If there are no expressions,
+ * then no offset is needed. Otherwise we need to offset enough for the
+ * lowest value (-unique_exprs_cnt) to become 1.
+ */
+ if (unique_exprs_cnt > 0)
+ attnum_offset = (unique_exprs_cnt + 1);
+ else
+ attnum_offset = 0;
+
+ /*
+ * Now that we know how many expressions there are, we can offset the
+ * values just enough to build the bitmapset.
+ */
+ for (i = 0; i < list_length(clauses); i++)
+ {
+ AttrNumber attnum;
+
+ /* ignore incompatible or already estimated clauses */
+ if (list_attnums[i] == InvalidAttrNumber)
+ continue;
+
+ /* make sure the attnum is in the expected range */
+ Assert(list_attnums[i] >= (-unique_exprs_cnt));
+ Assert(list_attnums[i] <= MaxHeapAttributeNumber);
+
+ /* make sure the attnum is positive (valid AttrNumber) */
+ attnum = list_attnums[i] + attnum_offset;
+
+ /*
+ * Either it's a regular attribute, or it's an expression, in which
+ * case we must not have seen it before (expressions are unique).
+ *
+ * XXX Check whether it's a regular attribute has to be done using the
+ * original attnum, while the second check has to use the value with
+ * an offset.
+ */
+ Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
+ !bms_is_member(attnum, clauses_attnums));
+
+ /*
+ * Remember the offset attnum, both for attributes and expressions.
+ * We'll pass list_attnums to clauselist_apply_dependencies, which
+ * uses it to identify clauses in a bitmap. We could also pass the
+ * offset, but this is more convenient.
+ */
+ list_attnums[i] = attnum;
+
+ clauses_attnums = bms_add_member(clauses_attnums, attnum);
+ }
+
+ /*
+ * If there's not at least two distinct attnums and expressions, then
+ * reject the whole list of clauses. We must return 1.0 so the calling
+ * function's selectivity is unaffected.
*/
if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
{
@@ -1272,26 +1579,203 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
foreach(l, rel->statlist)
{
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
- Bitmapset *matched;
- BMS_Membership membership;
+ int nmatched;
+ int nexprs;
+ int k;
+ MVDependencies *deps;
/* skip statistics that are not of the correct type */
if (stat->kind != STATS_EXT_DEPENDENCIES)
continue;
- matched = bms_intersect(clauses_attnums, stat->keys);
- membership = bms_membership(matched);
- bms_free(matched);
+ /*
+ * Count matching attributes - we have to undo the attnum offsets. The
+ * input attribute numbers are not offset (expressions are not
+ * included in stat->keys, so it's not necessary). But we need to
+ * offset it before checking against clauses_attnums.
+ */
+ nmatched = 0;
+ k = -1;
+ while ((k = bms_next_member(stat->keys, k)) >= 0)
+ {
+ AttrNumber attnum = (AttrNumber) k;
- /* skip objects matching fewer than two attributes from clauses */
- if (membership != BMS_MULTIPLE)
+ /* skip expressions */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ continue;
+
+ /* apply the same offset as above */
+ attnum += attnum_offset;
+
+ if (bms_is_member(attnum, clauses_attnums))
+ nmatched++;
+ }
+
+ /* count matching expressions */
+ nexprs = 0;
+ for (i = 0; i < unique_exprs_cnt; i++)
+ {
+ ListCell *lc;
+
+ foreach(lc, stat->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ /* try to match it */
+ if (equal(stat_expr, unique_exprs[i]))
+ nexprs++;
+ }
+ }
+
+ /*
+ * Skip objects matching fewer than two attributes/expressions from
+ * clauses.
+ */
+ if (nmatched + nexprs < 2)
continue;
- func_dependencies[nfunc_dependencies]
- = statext_dependencies_load(stat->statOid);
+ deps = statext_dependencies_load(stat->statOid);
+
+ /*
+ * The expressions may be represented by different attnums in the
+ * stats, we need to remap them to be consistent with the clauses.
+ * That will make the later steps (e.g. picking the strongest item and
+ * so on) much simpler and cheaper, because it won't need to care
+ * about the offset at all.
+ *
+ * When we're at it, we can ignore dependencies that are not fully
+ * matched by clauses (i.e. referencing attributes or expressions that
+ * are not in the clauses).
+ *
+ * We have to do this for all statistics, as long as there are any
+ * expressions - we need to shift the attnums in all dependencies.
+ *
+ * XXX Maybe we should do this always, because it also eliminates some
+ * of the dependencies early. It might be cheaper than having to walk
+ * the longer list in find_strongest_dependency later, especially as
+ * we need to do that repeatedly?
+ *
+ * XXX We have to do this even when there are no expressions in
+ * clauses, otherwise find_strongest_dependency may fail for stats
+ * with expressions (due to lookup of negative value in bitmap). So we
+ * need to at least filter out those dependencies. Maybe we could do
+ * it in a cheaper way (if there are no expr clauses, we can just
+ * discard all negative attnums without any lookups).
+ */
+ if (unique_exprs_cnt > 0 || stat->exprs != NIL)
+ {
+ int ndeps = 0;
+
+ for (i = 0; i < deps->ndeps; i++)
+ {
+ bool skip = false;
+ MVDependency *dep = deps->deps[i];
+ int j;
+
+ for (j = 0; j < dep->nattributes; j++)
+ {
+ int idx;
+ Node *expr;
+ int k;
+ AttrNumber unique_attnum = InvalidAttrNumber;
+ AttrNumber attnum;
+
+ /* undo the per-statistics offset */
+ attnum = dep->attributes[j];
+
+ /*
+ * For regular attributes we can simply check if it
+ * matches any clause. If there's no matching clause, we
+ * can just ignore it. We need to offset the attnum
+ * though.
+ */
+ if (AttrNumberIsForUserDefinedAttr(attnum))
+ {
+ dep->attributes[j] = attnum + attnum_offset;
+
+ if (!bms_is_member(dep->attributes[j], clauses_attnums))
+ {
+ skip = true;
+ break;
+ }
+
+ continue;
+ }
+
+ /*
+ * the attnum should be a valid system attnum (-1, -2,
+ * ...)
+ */
+ Assert(AttributeNumberIsValid(attnum));
+
+ /*
+ * For expressions, we need to do two translations. First
+ * we have to translate the negative attnum to index in
+ * the list of expressions (in the statistics object).
+ * Then we need to see if there's a matching clause. The
+ * index of the unique expression determines the attnum
+ * (and we offset it).
+ */
+ idx = -(1 + attnum);
+
+ /* Is the expression index is valid? */
+ Assert((idx >= 0) && (idx < list_length(stat->exprs)));
+
+ expr = (Node *) list_nth(stat->exprs, idx);
+
+ /* try to find the expression in the unique list */
+ for (k = 0; k < unique_exprs_cnt; k++)
+ {
+ /*
+ * found a matching unique expression, use the attnum
+ * (derived from index of the unique expression)
+ */
+ if (equal(unique_exprs[k], expr))
+ {
+ unique_attnum = -(k + 1) + attnum_offset;
+ break;
+ }
+ }
+
+ /*
+ * Found no matching expression, so we can simply skip
+ * this dependency, because there's no chance it will be
+ * fully covered.
+ */
+ if (unique_attnum == InvalidAttrNumber)
+ {
+ skip = true;
+ break;
+ }
+
+ /* otherwise remap it to the new attnum */
+ dep->attributes[j] = unique_attnum;
+ }
- total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
- nfunc_dependencies++;
+ /* if found a matching dependency, keep it */
+ if (!skip)
+ {
+ /* maybe we've skipped something earlier, so move it */
+ if (ndeps != i)
+ deps->deps[ndeps] = deps->deps[i];
+
+ ndeps++;
+ }
+ }
+
+ deps->ndeps = ndeps;
+ }
+
+ /*
+ * It's possible we've removed all dependencies, in which case we
+ * don't bother adding it to the list.
+ */
+ if (deps->ndeps > 0)
+ {
+ func_dependencies[nfunc_dependencies] = deps;
+ total_ndeps += deps->ndeps;
+ nfunc_dependencies++;
+ }
}
/* if no matching stats could be found then we've nothing to do */
@@ -1300,6 +1784,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
pfree(func_dependencies);
bms_free(clauses_attnums);
pfree(list_attnums);
+ pfree(unique_exprs);
return 1.0;
}
@@ -1347,6 +1832,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
pfree(func_dependencies);
bms_free(clauses_attnums);
pfree(list_attnums);
+ pfree(unique_exprs);
return s1;
}
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 7808c6a09c..8c75690fce 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -24,6 +24,7 @@
#include "catalog/pg_collation.h"
#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_statistic_ext_data.h"
+#include "executor/executor.h"
#include "commands/progress.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
@@ -35,13 +36,16 @@
#include "statistics/statistics.h"
#include "utils/acl.h"
#include "utils/array.h"
+#include "utils/attoptcache.h"
#include "utils/builtins.h"
+#include "utils/datum.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/selfuncs.h"
#include "utils/syscache.h"
+#include "utils/typcache.h"
/*
* To avoid consuming too much memory during analysis and/or too much space
@@ -66,18 +70,38 @@ typedef struct StatExtEntry
Bitmapset *columns; /* attribute numbers covered by the object */
List *types; /* 'char' list of enabled statistics kinds */
int stattarget; /* statistics target (-1 for default) */
+ List *exprs; /* expressions */
} StatExtEntry;
static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
-static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
int nvacatts, VacAttrStats **vacatts);
-static void statext_store(Oid relid,
+static void statext_store(Oid statOid,
MVNDistinct *ndistinct, MVDependencies *dependencies,
- MCVList *mcv, VacAttrStats **stats);
+ MCVList *mcv, Datum exprs, VacAttrStats **stats);
static int statext_compute_stattarget(int stattarget,
int natts, VacAttrStats **stats);
+/* Information needed to analyze a single simple expression. */
+typedef struct AnlExprData
+{
+ Node *expr; /* expression to analyze */
+ VacAttrStats *vacattrstat; /* statistics attrs to analyze */
+} AnlExprData;
+
+static void compute_expr_stats(Relation onerel, double totalrows,
+ AnlExprData * exprdata, int nexprs,
+ HeapTuple *rows, int numrows);
+static Datum serialize_expr_stats(AnlExprData * exprdata, int nexprs);
+static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
+static AnlExprData *build_expr_data(List *exprs, int stattarget);
+
+static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat,
+ int numrows, HeapTuple *rows,
+ VacAttrStats **stats, int stattarget);
+
+
/*
* Compute requested extended stats, using the rows sampled for the plain
* (single-column) stats.
@@ -92,21 +116,25 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
{
Relation pg_stext;
ListCell *lc;
- List *stats;
+ List *statslist;
MemoryContext cxt;
MemoryContext oldcxt;
int64 ext_cnt;
+ /* Do nothing if there are no columns to analyze. */
+ if (!natts)
+ return;
+
cxt = AllocSetContextCreate(CurrentMemoryContext,
"BuildRelationExtStatistics",
ALLOCSET_DEFAULT_SIZES);
oldcxt = MemoryContextSwitchTo(cxt);
pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
- stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
+ statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
/* report this phase */
- if (stats != NIL)
+ if (statslist != NIL)
{
const int index[] = {
PROGRESS_ANALYZE_PHASE,
@@ -114,28 +142,30 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
};
const int64 val[] = {
PROGRESS_ANALYZE_PHASE_COMPUTE_EXT_STATS,
- list_length(stats)
+ list_length(statslist)
};
pgstat_progress_update_multi_param(2, index, val);
}
ext_cnt = 0;
- foreach(lc, stats)
+ foreach(lc, statslist)
{
StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
MVNDistinct *ndistinct = NULL;
MVDependencies *dependencies = NULL;
MCVList *mcv = NULL;
+ Datum exprstats = (Datum) 0;
VacAttrStats **stats;
ListCell *lc2;
int stattarget;
+ StatsBuildData *data;
/*
* Check if we can build these stats based on the column analyzed. If
* not, report this fact (except in autovacuum) and move on.
*/
- stats = lookup_var_attr_stats(onerel, stat->columns,
+ stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
natts, vacattrstats);
if (!stats)
{
@@ -150,10 +180,6 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
continue;
}
- /* check allowed number of dimensions */
- Assert(bms_num_members(stat->columns) >= 2 &&
- bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
-
/* compute statistics target for this statistics */
stattarget = statext_compute_stattarget(stat->stattarget,
bms_num_members(stat->columns),
@@ -167,28 +193,49 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
if (stattarget == 0)
continue;
+ /* evaluate expressions (if the statistics has any) */
+ data = make_build_data(onerel, stat, numrows, rows, stats, stattarget);
+
/* compute statistic of each requested type */
foreach(lc2, stat->types)
{
char t = (char) lfirst_int(lc2);
if (t == STATS_EXT_NDISTINCT)
- ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
- stat->columns, stats);
+ ndistinct = statext_ndistinct_build(totalrows, data);
else if (t == STATS_EXT_DEPENDENCIES)
- dependencies = statext_dependencies_build(numrows, rows,
- stat->columns, stats);
+ dependencies = statext_dependencies_build(data);
else if (t == STATS_EXT_MCV)
- mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
- totalrows, stattarget);
+ mcv = statext_mcv_build(data, totalrows, stattarget);
+ else if (t == STATS_EXT_EXPRESSIONS)
+ {
+ AnlExprData *exprdata;
+ int nexprs;
+
+ /* should not happen, thanks to checks when defining stats */
+ if (!stat->exprs)
+ elog(ERROR, "requested expression stats, but there are no expressions");
+
+ exprdata = build_expr_data(stat->exprs, stattarget);
+ nexprs = list_length(stat->exprs);
+
+ compute_expr_stats(onerel, totalrows,
+ exprdata, nexprs,
+ rows, numrows);
+
+ exprstats = serialize_expr_stats(exprdata, nexprs);
+ }
}
/* store the statistics in the catalog */
- statext_store(stat->statOid, ndistinct, dependencies, mcv, stats);
+ statext_store(stat->statOid, ndistinct, dependencies, mcv, exprstats, stats);
/* for reporting progress */
pgstat_progress_update_param(PROGRESS_ANALYZE_EXT_STATS_COMPUTED,
++ext_cnt);
+
+ /* free the build data (allocated as a single chunk) */
+ pfree(data);
}
table_close(pg_stext, RowExclusiveLock);
@@ -221,6 +268,10 @@ ComputeExtStatisticsRows(Relation onerel,
MemoryContext oldcxt;
int result = 0;
+ /* If there are no columns to analyze, just return 0. */
+ if (!natts)
+ return 0;
+
cxt = AllocSetContextCreate(CurrentMemoryContext,
"ComputeExtStatisticsRows",
ALLOCSET_DEFAULT_SIZES);
@@ -241,7 +292,7 @@ ComputeExtStatisticsRows(Relation onerel,
* analyzed. If not, ignore it (don't report anything, we'll do that
* during the actual build BuildRelationExtStatistics).
*/
- stats = lookup_var_attr_stats(onerel, stat->columns,
+ stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
natts, vacattrstats);
if (!stats)
@@ -349,6 +400,10 @@ statext_is_kind_built(HeapTuple htup, char type)
attnum = Anum_pg_statistic_ext_data_stxdmcv;
break;
+ case STATS_EXT_EXPRESSIONS:
+ attnum = Anum_pg_statistic_ext_data_stxdexpr;
+ break;
+
default:
elog(ERROR, "unexpected statistics type requested: %d", type);
}
@@ -388,6 +443,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
ArrayType *arr;
char *enabled;
Form_pg_statistic_ext staForm;
+ List *exprs = NIL;
entry = palloc0(sizeof(StatExtEntry));
staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
@@ -415,10 +471,40 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
{
Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
(enabled[i] == STATS_EXT_DEPENDENCIES) ||
- (enabled[i] == STATS_EXT_MCV));
+ (enabled[i] == STATS_EXT_MCV) ||
+ (enabled[i] == STATS_EXT_EXPRESSIONS));
entry->types = lappend_int(entry->types, (int) enabled[i]);
}
+ /* decode expression (if any) */
+ datum = SysCacheGetAttr(STATEXTOID, htup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ if (!isnull)
+ {
+ char *exprsString;
+
+ exprsString = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(exprsString);
+
+ pfree(exprsString);
+
+ /*
+ * Run the expressions through eval_const_expressions. This is not
+ * just an optimization, but is necessary, because the planner
+ * will be comparing them to similarly-processed qual clauses, and
+ * may fail to detect valid matches without this. We must not use
+ * canonicalize_qual, however, since these aren't qual
+ * expressions.
+ */
+ exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+ /* May as well fix opfuncids too */
+ fix_opfuncids((Node *) exprs);
+ }
+
+ entry->exprs = exprs;
+
result = lappend(result, entry);
}
@@ -428,6 +514,187 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
}
/*
+ * examine_attribute -- pre-analysis of a single column
+ *
+ * Determine whether the column is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it. If not, return NULL.
+ */
+static VacAttrStats *
+examine_attribute(Node *expr)
+{
+ HeapTuple typtuple;
+ VacAttrStats *stats;
+ int i;
+ bool ok;
+
+ /*
+ * Create the VacAttrStats struct. Note that we only have a copy of the
+ * fixed fields of the pg_attribute tuple.
+ */
+ stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+ /* fake the attribute */
+ stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE);
+ stats->attr->attstattarget = -1;
+
+ /*
+ * When analyzing an expression, believe the expression tree's type not
+ * the column datatype --- the latter might be the opckeytype storage
+ * type of the opclass, which is not interesting for our purposes. (Note:
+ * if we did anything with non-expression statistics columns, we'd need to
+ * figure out where to get the correct type info from, but for now that's
+ * not a problem.) It's not clear whether anyone will care about the
+ * typmod, but we store that too just in case.
+ */
+ stats->attrtypid = exprType(expr);
+ stats->attrtypmod = exprTypmod(expr);
+ stats->attrcollid = exprCollation(expr);
+
+ typtuple = SearchSysCacheCopy1(TYPEOID,
+ ObjectIdGetDatum(stats->attrtypid));
+ if (!HeapTupleIsValid(typtuple))
+ elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+ stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+
+ /*
+ * We don't actually analyze individual attributes, so no need to set the
+ * memory context.
+ */
+ stats->anl_context = NULL;
+ stats->tupattnum = InvalidAttrNumber;
+
+ /*
+ * The fields describing the stats->stavalues[n] element types default to
+ * the type of the data being analyzed, but the type-specific typanalyze
+ * function can change them if it wants to store something else.
+ */
+ for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+ {
+ stats->statypid[i] = stats->attrtypid;
+ stats->statyplen[i] = stats->attrtype->typlen;
+ stats->statypbyval[i] = stats->attrtype->typbyval;
+ stats->statypalign[i] = stats->attrtype->typalign;
+ }
+
+ /*
+ * Call the type-specific typanalyze function. If none is specified, use
+ * std_typanalyze().
+ */
+ if (OidIsValid(stats->attrtype->typanalyze))
+ ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+ PointerGetDatum(stats)));
+ else
+ ok = std_typanalyze(stats);
+
+ if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+ {
+ heap_freetuple(typtuple);
+ pfree(stats->attr);
+ pfree(stats);
+ return NULL;
+ }
+
+ return stats;
+}
+
+/*
+ * examine_expression -- pre-analysis of a single expression
+ *
+ * Determine whether the expression is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it. If not, return NULL.
+ */
+static VacAttrStats *
+examine_expression(Node *expr, int stattarget)
+{
+ HeapTuple typtuple;
+ VacAttrStats *stats;
+ int i;
+ bool ok;
+
+ Assert(expr != NULL);
+
+ /*
+ * Create the VacAttrStats struct.
+ */
+ stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+ /*
+ * When analyzing an expression, believe the expression tree's type.
+ */
+ stats->attrtypid = exprType(expr);
+ stats->attrtypmod = exprTypmod(expr);
+
+ /*
+ * We don't allow collation to be specified in CREATE STATISTICS, so we
+ * have to use the collation specified for the expression. It's possible
+ * to specify the collation in the expression "(col COLLATE "en_US")" in
+ * which case exprCollation() does the right thing.
+ */
+ stats->attrcollid = exprCollation(expr);
+
+ /*
+ * We don't have any pg_attribute for expressions, so let's fake something
+ * reasonable into attstattarget, which is the only thing std_typanalyze
+ * needs.
+ */
+ stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_FIXED_PART_SIZE);
+
+ /*
+ * We can't have statistics target specified for the expression, so we
+ * could use either the default_statistics_target, or the target computed
+ * for the extended statistics. The second option seems more reasonable.
+ */
+ stats->attr->attstattarget = stattarget;
+
+ /* initialize some basic fields */
+ stats->attr->attrelid = InvalidOid;
+ stats->attr->attnum = InvalidAttrNumber;
+ stats->attr->atttypid = stats->attrtypid;
+
+ typtuple = SearchSysCacheCopy1(TYPEOID,
+ ObjectIdGetDatum(stats->attrtypid));
+ if (!HeapTupleIsValid(typtuple))
+ elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+
+ stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+ stats->anl_context = CurrentMemoryContext; /* XXX should be using
+ * something else? */
+ stats->tupattnum = InvalidAttrNumber;
+
+ /*
+ * The fields describing the stats->stavalues[n] element types default to
+ * the type of the data being analyzed, but the type-specific typanalyze
+ * function can change them if it wants to store something else.
+ */
+ for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+ {
+ stats->statypid[i] = stats->attrtypid;
+ stats->statyplen[i] = stats->attrtype->typlen;
+ stats->statypbyval[i] = stats->attrtype->typbyval;
+ stats->statypalign[i] = stats->attrtype->typalign;
+ }
+
+ /*
+ * Call the type-specific typanalyze function. If none is specified, use
+ * std_typanalyze().
+ */
+ if (OidIsValid(stats->attrtype->typanalyze))
+ ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+ PointerGetDatum(stats)));
+ else
+ ok = std_typanalyze(stats);
+
+ if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+ {
+ heap_freetuple(typtuple);
+ pfree(stats);
+ return NULL;
+ }
+
+ return stats;
+}
+
+/*
* Using 'vacatts' of size 'nvacatts' as input data, return a newly built
* VacAttrStats array which includes only the items corresponding to
* attributes indicated by 'stxkeys'. If we don't have all of the per column
@@ -435,15 +702,18 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
* to the caller that the stats should not be built.
*/
static VacAttrStats **
-lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
int nvacatts, VacAttrStats **vacatts)
{
int i = 0;
int x = -1;
+ int natts;
VacAttrStats **stats;
+ ListCell *lc;
+
+ natts = bms_num_members(attrs) + list_length(exprs);
- stats = (VacAttrStats **)
- palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
+ stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *));
/* lookup VacAttrStats info for the requested columns (same attnum) */
while ((x = bms_next_member(attrs, x)) >= 0)
@@ -480,6 +750,24 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
i++;
}
+ /* also add info for expressions */
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+
+ stats[i] = examine_attribute(expr);
+
+ /*
+ * XXX We need tuple descriptor later, and we just grab it from
+ * stats[0]->tupDesc (see e.g. statext_mcv_build). But as coded
+ * examine_attribute does not set that, so just grab it from the first
+ * vacatts element.
+ */
+ stats[i]->tupDesc = vacatts[0]->tupDesc;
+
+ i++;
+ }
+
return stats;
}
@@ -491,7 +779,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
static void
statext_store(Oid statOid,
MVNDistinct *ndistinct, MVDependencies *dependencies,
- MCVList *mcv, VacAttrStats **stats)
+ MCVList *mcv, Datum exprs, VacAttrStats **stats)
{
Relation pg_stextdata;
HeapTuple stup,
@@ -532,11 +820,17 @@ statext_store(Oid statOid,
nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL);
values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data);
}
+ if (exprs != (Datum) 0)
+ {
+ nulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = false;
+ values[Anum_pg_statistic_ext_data_stxdexpr - 1] = exprs;
+ }
/* always replace the value (either by bytea or NULL) */
replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+ replaces[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
/* there should already be a pg_statistic_ext_data tuple */
oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid));
@@ -668,7 +962,7 @@ compare_datums_simple(Datum a, Datum b, SortSupport ssup)
* is not necessary here (and when querying the bitmap).
*/
AttrNumber *
-build_attnums_array(Bitmapset *attrs, int *numattrs)
+build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs)
{
int i,
j;
@@ -684,16 +978,19 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
j = -1;
while ((j = bms_next_member(attrs, j)) >= 0)
{
+ AttrNumber attnum = (j - nexprs);
+
/*
* Make sure the bitmap contains only user-defined attributes. As
* bitmaps can't contain negative values, this can be violated in two
* ways. Firstly, the bitmap might contain 0 as a member, and secondly
* the integer value might be larger than MaxAttrNumber.
*/
- Assert(AttrNumberIsForUserDefinedAttr(j));
- Assert(j <= MaxAttrNumber);
+ Assert(AttributeNumberIsValid(attnum));
+ Assert(attnum <= MaxAttrNumber);
+ Assert(attnum >= (-nexprs));
- attnums[i++] = (AttrNumber) j;
+ attnums[i++] = (AttrNumber) attnum;
/* protect against overflows */
Assert(i <= num);
@@ -710,29 +1007,31 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
* can simply pfree the return value to release all of it.
*/
SortItem *
-build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
- MultiSortSupport mss, int numattrs, AttrNumber *attnums)
+build_sorted_items(StatsBuildData *data, int *nitems,
+ MultiSortSupport mss,
+ int numattrs, AttrNumber *attnums)
{
int i,
j,
len,
- idx;
- int nvalues = numrows * numattrs;
+ nrows;
+ int nvalues = data->numrows * numattrs;
SortItem *items;
Datum *values;
bool *isnull;
char *ptr;
+ int *typlen;
/* Compute the total amount of memory we need (both items and values). */
- len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
+ len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
/* Allocate the memory and split it into the pieces. */
ptr = palloc0(len);
/* items to sort */
items = (SortItem *) ptr;
- ptr += numrows * sizeof(SortItem);
+ ptr += data->numrows * sizeof(SortItem);
/* values and null flags */
values = (Datum *) ptr;
@@ -745,21 +1044,47 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
Assert((ptr - (char *) items) == len);
/* fix the pointers to Datum and bool arrays */
- idx = 0;
- for (i = 0; i < numrows; i++)
+ nrows = 0;
+ for (i = 0; i < data->numrows; i++)
{
- bool toowide = false;
+ items[nrows].values = &values[nrows * numattrs];
+ items[nrows].isnull = &isnull[nrows * numattrs];
- items[idx].values = &values[idx * numattrs];
- items[idx].isnull = &isnull[idx * numattrs];
+ nrows++;
+ }
+
+ /* build a local cache of typlen for all attributes */
+ typlen = (int *) palloc(sizeof(int) * data->nattnums);
+ for (i = 0; i < data->nattnums; i++)
+ typlen[i] = get_typlen(data->stats[i]->attrtypid);
+
+ nrows = 0;
+ for (i = 0; i < data->numrows; i++)
+ {
+ bool toowide = false;
/* load the values/null flags from sample rows */
for (j = 0; j < numattrs; j++)
{
Datum value;
bool isnull;
+ int attlen;
+ AttrNumber attnum = attnums[j];
+
+ int idx;
+
+ /* match attnum to the pre-calculated data */
+ for (idx = 0; idx < data->nattnums; idx++)
+ {
+ if (attnum == data->attnums[idx])
+ break;
+ }
- value = heap_getattr(rows[i], attnums[j], tdesc, &isnull);
+ Assert(idx < data->nattnums);
+
+ value = data->values[idx][i];
+ isnull = data->nulls[idx][i];
+ attlen = typlen[idx];
/*
* If this is a varlena value, check if it's too wide and if yes
@@ -770,8 +1095,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
* on the assumption that those are small (below WIDTH_THRESHOLD)
* and will be discarded at the end of analyze.
*/
- if ((!isnull) &&
- (TupleDescAttr(tdesc, attnums[j] - 1)->attlen == -1))
+ if ((!isnull) && (attlen == -1))
{
if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
{
@@ -782,21 +1106,21 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
value = PointerGetDatum(PG_DETOAST_DATUM(value));
}
- items[idx].values[j] = value;
- items[idx].isnull[j] = isnull;
+ items[nrows].values[j] = value;
+ items[nrows].isnull[j] = isnull;
}
if (toowide)
continue;
- idx++;
+ nrows++;
}
/* store the actual number of items (ignoring the too-wide ones) */
- *nitems = idx;
+ *nitems = nrows;
/* all items were too wide */
- if (idx == 0)
+ if (nrows == 0)
{
/* everything is allocated as a single chunk */
pfree(items);
@@ -804,7 +1128,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
}
/* do the sort, using the multi-sort */
- qsort_arg((void *) items, idx, sizeof(SortItem),
+ qsort_arg((void *) items, nrows, sizeof(SortItem),
multi_sort_compare, mss);
return items;
@@ -831,6 +1155,63 @@ has_stats_of_kind(List *stats, char requiredkind)
}
/*
+ * stat_find_expression
+ * Search for an expression in statistics object's list of expressions.
+ *
+ * Returns the index of the expression in the statistics object's list of
+ * expressions, or -1 if not found.
+ */
+static int
+stat_find_expression(StatisticExtInfo *stat, Node *expr)
+{
+ ListCell *lc;
+ int idx;
+
+ idx = 0;
+ foreach(lc, stat->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ if (equal(stat_expr, expr))
+ return idx;
+ idx++;
+ }
+
+ /* Expression not found */
+ return -1;
+}
+
+/*
+ * stat_covers_expressions
+ * Test whether a statistics object covers all expressions in a list.
+ *
+ * Returns true if all expressions are covered. If expr_idxs is non-NULL, it
+ * is populated with the indexes of the expressions found.
+ */
+static bool
+stat_covers_expressions(StatisticExtInfo *stat, List *exprs,
+ Bitmapset **expr_idxs)
+{
+ ListCell *lc;
+
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ int expr_idx;
+
+ expr_idx = stat_find_expression(stat, expr);
+ if (expr_idx == -1)
+ return false;
+
+ if (expr_idxs != NULL)
+ *expr_idxs = bms_add_member(*expr_idxs, expr_idx);
+ }
+
+ /* If we reach here, all expressions are covered */
+ return true;
+}
+
+/*
* choose_best_statistics
* Look for and return statistics with the specified 'requiredkind' which
* have keys that match at least two of the given attnums. Return NULL if
@@ -850,7 +1231,8 @@ has_stats_of_kind(List *stats, char requiredkind)
*/
StatisticExtInfo *
choose_best_statistics(List *stats, char requiredkind,
- Bitmapset **clause_attnums, int nclauses)
+ Bitmapset **clause_attnums, List **clause_exprs,
+ int nclauses)
{
ListCell *lc;
StatisticExtInfo *best_match = NULL;
@@ -861,7 +1243,8 @@ choose_best_statistics(List *stats, char requiredkind,
{
int i;
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
- Bitmapset *matched = NULL;
+ Bitmapset *matched_attnums = NULL;
+ Bitmapset *matched_exprs = NULL;
int num_matched;
int numkeys;
@@ -870,35 +1253,43 @@ choose_best_statistics(List *stats, char requiredkind,
continue;
/*
- * Collect attributes in remaining (unestimated) clauses fully covered
- * by this statistic object.
+ * Collect attributes and expressions in remaining (unestimated)
+ * clauses fully covered by this statistic object.
*/
for (i = 0; i < nclauses; i++)
{
+ Bitmapset *expr_idxs = NULL;
+
/* ignore incompatible/estimated clauses */
- if (!clause_attnums[i])
+ if (!clause_attnums[i] && !clause_exprs[i])
continue;
/* ignore clauses that are not covered by this object */
- if (!bms_is_subset(clause_attnums[i], info->keys))
+ if (!bms_is_subset(clause_attnums[i], info->keys) ||
+ !stat_covers_expressions(info, clause_exprs[i], &expr_idxs))
continue;
- matched = bms_add_members(matched, clause_attnums[i]);
+ /* record attnums and indexes of expressions covered */
+ matched_attnums = bms_add_members(matched_attnums, clause_attnums[i]);
+ matched_exprs = bms_add_members(matched_exprs, expr_idxs);
}
- num_matched = bms_num_members(matched);
- bms_free(matched);
+ num_matched = bms_num_members(matched_attnums) + bms_num_members(matched_exprs);
+
+ bms_free(matched_attnums);
+ bms_free(matched_exprs);
/*
* save the actual number of keys in the stats so that we can choose
* the narrowest stats with the most matching keys.
*/
- numkeys = bms_num_members(info->keys);
+ numkeys = bms_num_members(info->keys) + list_length(info->exprs);
/*
- * Use this object when it increases the number of matched clauses or
- * when it matches the same number of attributes but these stats have
- * fewer keys than any previous match.
+ * Use this object when it increases the number of matched attributes
+ * and expressions or when it matches the same number of attributes
+ * and expressions but these stats have fewer keys than any previous
+ * match.
*/
if (num_matched > best_num_matched ||
(num_matched == best_num_matched && numkeys < best_match_keys))
@@ -923,7 +1314,8 @@ choose_best_statistics(List *stats, char requiredkind,
*/
static bool
statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
- Index relid, Bitmapset **attnums)
+ Index relid, Bitmapset **attnums,
+ List **exprs)
{
/* Look inside any binary-compatible relabeling (as in examine_variable) */
if (IsA(clause, RelabelType))
@@ -951,19 +1343,19 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
return true;
}
- /* (Var op Const) or (Const op Var) */
+ /* (Var/Expr op Const) or (Const op Var/Expr) */
if (is_opclause(clause))
{
RangeTblEntry *rte = root->simple_rte_array[relid];
OpExpr *expr = (OpExpr *) clause;
- Var *var;
+ Node *clause_expr;
/* Only expressions with two arguments are considered compatible. */
if (list_length(expr->args) != 2)
return false;
- /* Check if the expression has the right shape (one Var, one Const) */
- if (!examine_clause_args(expr->args, &var, NULL, NULL))
+ /* Check if the expression has the right shape */
+ if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL))
return false;
/*
@@ -981,7 +1373,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
case F_SCALARLESEL:
case F_SCALARGTSEL:
case F_SCALARGESEL:
- /* supported, will continue with inspection of the Var */
+ /* supported, will continue with inspection of the Var/Expr */
break;
default:
@@ -1003,23 +1395,29 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
!get_func_leakproof(get_opcode(expr->opno)))
return false;
- return statext_is_compatible_clause_internal(root, (Node *) var,
- relid, attnums);
+ /* Check (Var op Const) or (Const op Var) clauses by recursing. */
+ if (IsA(clause_expr, Var))
+ return statext_is_compatible_clause_internal(root, clause_expr,
+ relid, attnums, exprs);
+
+ /* Otherwise we have (Expr op Const) or (Const op Expr). */
+ *exprs = lappend(*exprs, clause_expr);
+ return true;
}
- /* Var IN Array */
+ /* Var/Expr IN Array */
if (IsA(clause, ScalarArrayOpExpr))
{
RangeTblEntry *rte = root->simple_rte_array[relid];
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
- Var *var;
+ Node *clause_expr;
/* Only expressions with two arguments are considered compatible. */
if (list_length(expr->args) != 2)
return false;
/* Check if the expression has the right shape (one Var, one Const) */
- if (!examine_clause_args(expr->args, &var, NULL, NULL))
+ if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL))
return false;
/*
@@ -1037,7 +1435,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
case F_SCALARLESEL:
case F_SCALARGTSEL:
case F_SCALARGESEL:
- /* supported, will continue with inspection of the Var */
+ /* supported, will continue with inspection of the Var/Expr */
break;
default:
@@ -1059,8 +1457,14 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
!get_func_leakproof(get_opcode(expr->opno)))
return false;
- return statext_is_compatible_clause_internal(root, (Node *) var,
- relid, attnums);
+ /* Check Var IN Array clauses by recursing. */
+ if (IsA(clause_expr, Var))
+ return statext_is_compatible_clause_internal(root, clause_expr,
+ relid, attnums, exprs);
+
+ /* Otherwise we have Expr IN Array. */
+ *exprs = lappend(*exprs, clause_expr);
+ return true;
}
/* AND/OR/NOT clause */
@@ -1093,54 +1497,62 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
*/
if (!statext_is_compatible_clause_internal(root,
(Node *) lfirst(lc),
- relid, attnums))
+ relid, attnums, exprs))
return false;
}
return true;
}
- /* Var IS NULL */
+ /* Var/Expr IS NULL */
if (IsA(clause, NullTest))
{
NullTest *nt = (NullTest *) clause;
- /*
- * Only simple (Var IS NULL) expressions supported for now. Maybe we
- * could use examine_variable to fix this?
- */
- if (!IsA(nt->arg, Var))
- return false;
+ /* Check Var IS NULL clauses by recursing. */
+ if (IsA(nt->arg, Var))
+ return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
+ relid, attnums, exprs);
- return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
- relid, attnums);
+ /* Otherwise we have Expr IS NULL. */
+ *exprs = lappend(*exprs, nt->arg);
+ return true;
}
- return false;
+ /*
+ * Treat any other expressions as bare expressions to be matched against
+ * expressions in statistics objects.
+ */
+ *exprs = lappend(*exprs, clause);
+ return true;
}
/*
* statext_is_compatible_clause
* Determines if the clause is compatible with MCV lists.
*
- * Currently, we only support three types of clauses:
+ * Currently, we only support the following types of clauses:
*
- * (a) OpExprs of the form (Var op Const), or (Const op Var), where the op
- * is one of ("=", "<", ">", ">=", "<=")
+ * (a) OpExprs of the form (Var/Expr op Const), or (Const op Var/Expr), where
+ * the op is one of ("=", "<", ">", ">=", "<=")
*
- * (b) (Var IS [NOT] NULL)
+ * (b) (Var/Expr IS [NOT] NULL)
*
* (c) combinations using AND/OR/NOT
*
+ * (d) ScalarArrayOpExprs of the form (Var/Expr op ANY (array)) or (Var/Expr
+ * op ALL (array))
+ *
* In the future, the range of supported clauses may be expanded to more
* complex cases, for example (Var op Var).
*/
static bool
statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
- Bitmapset **attnums)
+ Bitmapset **attnums, List **exprs)
{
RangeTblEntry *rte = root->simple_rte_array[relid];
RestrictInfo *rinfo = (RestrictInfo *) clause;
+ int clause_relid;
Oid userid;
/*
@@ -1160,7 +1572,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
foreach(lc, expr->args)
{
if (!statext_is_compatible_clause(root, (Node *) lfirst(lc),
- relid, attnums))
+ relid, attnums, exprs))
return false;
}
@@ -1175,25 +1587,36 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
if (rinfo->pseudoconstant)
return false;
- /* clauses referencing multiple varnos are incompatible */
- if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+ /* Clauses referencing other varnos are incompatible. */
+ if (!bms_get_singleton_member(rinfo->clause_relids, &clause_relid) ||
+ clause_relid != relid)
return false;
/* Check the clause and determine what attributes it references. */
if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause,
- relid, attnums))
+ relid, attnums, exprs))
return false;
/*
- * Check that the user has permission to read all these attributes. Use
+ * Check that the user has permission to read all required attributes. Use
* checkAsUser if it's set, in case we're accessing the table via a view.
*/
userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK)
{
+ Bitmapset *clause_attnums;
+
/* Don't have table privilege, must check individual columns */
- if (bms_is_member(InvalidAttrNumber, *attnums))
+ if (*exprs != NIL)
+ {
+ pull_varattnos((Node *) exprs, relid, &clause_attnums);
+ clause_attnums = bms_add_members(clause_attnums, *attnums);
+ }
+ else
+ clause_attnums = *attnums;
+
+ if (bms_is_member(InvalidAttrNumber, clause_attnums))
{
/* Have a whole-row reference, must have access to all columns */
if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT,
@@ -1205,7 +1628,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
/* Check the columns referenced by the clause */
int attnum = -1;
- while ((attnum = bms_next_member(*attnums, attnum)) >= 0)
+ while ((attnum = bms_next_member(clause_attnums, attnum)) >= 0)
{
if (pg_attribute_aclcheck(rte->relid, attnum, userid,
ACL_SELECT) != ACLCHECK_OK)
@@ -1259,7 +1682,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
bool is_or)
{
ListCell *l;
- Bitmapset **list_attnums;
+ Bitmapset **list_attnums; /* attnums extracted from the clause */
+ List **list_exprs; /* expressions matched to any statistic */
int listidx;
Selectivity sel = (is_or) ? 0.0 : 1.0;
@@ -1270,13 +1694,16 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
list_length(clauses));
+ /* expressions extracted from complex expressions */
+ list_exprs = (List **) palloc(sizeof(Node *) * list_length(clauses));
+
/*
- * Pre-process the clauses list to extract the attnums seen in each item.
- * We need to determine if there's any clauses which will be useful for
- * selectivity estimations with extended stats. Along the way we'll record
- * all of the attnums for each clause in a list which we'll reference
- * later so we don't need to repeat the same work again. We'll also keep
- * track of all attnums seen.
+ * Pre-process the clauses list to extract the attnums and expressions
+ * seen in each item. We need to determine if there are any clauses which
+ * will be useful for selectivity estimations with extended stats. Along
+ * the way we'll record all of the attnums and expressions for each clause
+ * in lists which we'll reference later so we don't need to repeat the
+ * same work again.
*
* We also skip clauses that we already estimated using different types of
* statistics (we treat them as incompatible).
@@ -1286,12 +1713,19 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
{
Node *clause = (Node *) lfirst(l);
Bitmapset *attnums = NULL;
+ List *exprs = NIL;
if (!bms_is_member(listidx, *estimatedclauses) &&
- statext_is_compatible_clause(root, clause, rel->relid, &attnums))
+ statext_is_compatible_clause(root, clause, rel->relid, &attnums, &exprs))
+ {
list_attnums[listidx] = attnums;
+ list_exprs[listidx] = exprs;
+ }
else
+ {
list_attnums[listidx] = NULL;
+ list_exprs[listidx] = NIL;
+ }
listidx++;
}
@@ -1305,7 +1739,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
/* find the best suited statistics object for these attnums */
stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV,
- list_attnums, list_length(clauses));
+ list_attnums, list_exprs,
+ list_length(clauses));
/*
* if no (additional) matching stats could be found then we've nothing
@@ -1320,28 +1755,39 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
/* now filter the clauses to be estimated using the selected MCV */
stat_clauses = NIL;
- /* record which clauses are simple (single column) */
+ /* record which clauses are simple (single column or expression) */
simple_clauses = NULL;
listidx = 0;
foreach(l, clauses)
{
/*
- * If the clause is compatible with the selected statistics, mark
- * it as estimated and add it to the list to estimate.
+ * If the clause is not already estimated and is compatible with
+ * the selected statistics object (all attributes and expressions
+ * covered), mark it as estimated and add it to the list to
+ * estimate.
*/
- if (list_attnums[listidx] != NULL &&
- bms_is_subset(list_attnums[listidx], stat->keys))
+ if (!bms_is_member(listidx, *estimatedclauses) &&
+ bms_is_subset(list_attnums[listidx], stat->keys) &&
+ stat_covers_expressions(stat, list_exprs[listidx], NULL))
{
- if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON)
+ /* record simple clauses (single column or expression) */
+ if ((list_attnums[listidx] == NULL &&
+ list_length(list_exprs[listidx]) == 1) ||
+ (list_exprs[listidx] == NIL &&
+ bms_membership(list_attnums[listidx]) == BMS_SINGLETON))
simple_clauses = bms_add_member(simple_clauses,
list_length(stat_clauses));
+ /* add clause to list and mark as estimated */
stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
bms_free(list_attnums[listidx]);
list_attnums[listidx] = NULL;
+
+ list_free(list_exprs[listidx]);
+ list_exprs[listidx] = NULL;
}
listidx++;
@@ -1530,23 +1976,24 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
}
/*
- * examine_opclause_expression
- * Split expression into Var and Const parts.
+ * examine_opclause_args
+ * Split an operator expression's arguments into Expr and Const parts.
*
- * Attempts to match the arguments to either (Var op Const) or (Const op Var),
- * possibly with a RelabelType on top. When the expression matches this form,
- * returns true, otherwise returns false.
+ * Attempts to match the arguments to either (Expr op Const) or (Const op
+ * Expr), possibly with a RelabelType on top. When the expression matches this
+ * form, returns true, otherwise returns false.
*
- * Optionally returns pointers to the extracted Var/Const nodes, when passed
- * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies
- * on which side of the operator we found the Var node.
+ * Optionally returns pointers to the extracted Expr/Const nodes, when passed
+ * non-null pointers (exprp, cstp and expronleftp). The expronleftp flag
+ * specifies on which side of the operator we found the expression node.
*/
bool
-examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp)
+examine_opclause_args(List *args, Node **exprp, Const **cstp,
+ bool *expronleftp)
{
- Var *var;
+ Node *expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
Node *leftop,
*rightop;
@@ -1563,30 +2010,564 @@ examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp)
if (IsA(rightop, RelabelType))
rightop = (Node *) ((RelabelType *) rightop)->arg;
- if (IsA(leftop, Var) && IsA(rightop, Const))
+ if (IsA(rightop, Const))
{
- var = (Var *) leftop;
+ expr = (Node *) leftop;
cst = (Const *) rightop;
- varonleft = true;
+ expronleft = true;
}
- else if (IsA(leftop, Const) && IsA(rightop, Var))
+ else if (IsA(leftop, Const))
{
- var = (Var *) rightop;
+ expr = (Node *) rightop;
cst = (Const *) leftop;
- varonleft = false;
+ expronleft = false;
}
else
return false;
/* return pointers to the extracted parts if requested */
- if (varp)
- *varp = var;
+ if (exprp)
+ *exprp = expr;
if (cstp)
*cstp = cst;
- if (varonleftp)
- *varonleftp = varonleft;
+ if (expronleftp)
+ *expronleftp = expronleft;
return true;
}
+
+
+/*
+ * Compute statistics about expressions of a relation.
+ */
+static void
+compute_expr_stats(Relation onerel, double totalrows,
+ AnlExprData *exprdata, int nexprs,
+ HeapTuple *rows, int numrows)
+{
+ MemoryContext expr_context,
+ old_context;
+ int ind,
+ i;
+
+ expr_context = AllocSetContextCreate(CurrentMemoryContext,
+ "Analyze Expression",
+ ALLOCSET_DEFAULT_SIZES);
+ old_context = MemoryContextSwitchTo(expr_context);
+
+ for (ind = 0; ind < nexprs; ind++)
+ {
+ AnlExprData *thisdata = &exprdata[ind];
+ VacAttrStats *stats = thisdata->vacattrstat;
+ Node *expr = thisdata->expr;
+ TupleTableSlot *slot;
+ EState *estate;
+ ExprContext *econtext;
+ Datum *exprvals;
+ bool *exprnulls;
+ ExprState *exprstate;
+ int tcnt;
+
+ /* Are we still in the main context? */
+ Assert(CurrentMemoryContext == expr_context);
+
+ /*
+ * Need an EState for evaluation of expressions. Create it in the
+ * per-expression context to be sure it gets cleaned up at the bottom
+ * of the loop.
+ */
+ estate = CreateExecutorState();
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Set up expression evaluation state */
+ exprstate = ExecPrepareExpr((Expr *) expr, estate);
+
+ /* Need a slot to hold the current heap tuple, too */
+ slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel),
+ &TTSOpsHeapTuple);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* Compute and save expression values */
+ exprvals = (Datum *) palloc(numrows * sizeof(Datum));
+ exprnulls = (bool *) palloc(numrows * sizeof(bool));
+
+ tcnt = 0;
+ for (i = 0; i < numrows; i++)
+ {
+ Datum datum;
+ bool isnull;
+
+ /*
+ * Reset the per-tuple context each time, to reclaim any cruft
+ * left behind by evaluating the statistics expressions.
+ */
+ ResetExprContext(econtext);
+
+ /* Set up for expression evaluation */
+ ExecStoreHeapTuple(rows[i], slot, false);
+
+ /*
+ * Evaluate the expression. We do this in the per-tuple context so
+ * as not to leak memory, and then copy the result into the
+ * context created at the beginning of this function.
+ */
+ datum = ExecEvalExprSwitchContext(exprstate,
+ GetPerTupleExprContext(estate),
+ &isnull);
+ if (isnull)
+ {
+ exprvals[tcnt] = (Datum) 0;
+ exprnulls[tcnt] = true;
+ }
+ else
+ {
+ /* Make sure we copy the data into the context. */
+ Assert(CurrentMemoryContext == expr_context);
+
+ exprvals[tcnt] = datumCopy(datum,
+ stats->attrtype->typbyval,
+ stats->attrtype->typlen);
+ exprnulls[tcnt] = false;
+ }
+
+ tcnt++;
+ }
+
+ /*
+ * Now we can compute the statistics for the expression columns.
+ *
+ * XXX Unlike compute_index_stats we don't need to switch and reset
+ * memory contexts here, because we're only computing stats for a
+ * single expression (and not iterating over many indexes), so we just
+ * do it in expr_context. Note that compute_stats copies the result
+ * into stats->anl_context, so it does not disappear.
+ */
+ if (tcnt > 0)
+ {
+ AttributeOpts *aopt =
+ get_attribute_options(stats->attr->attrelid,
+ stats->attr->attnum);
+
+ stats->exprvals = exprvals;
+ stats->exprnulls = exprnulls;
+ stats->rowstride = 1;
+ stats->compute_stats(stats,
+ expr_fetch_func,
+ tcnt,
+ tcnt);
+
+ /*
+ * If the n_distinct option is specified, it overrides the above
+ * computation.
+ */
+ if (aopt != NULL && aopt->n_distinct != 0.0)
+ stats->stadistinct = aopt->n_distinct;
+ }
+
+ /* And clean up */
+ MemoryContextSwitchTo(expr_context);
+
+ ExecDropSingleTupleTableSlot(slot);
+ FreeExecutorState(estate);
+ MemoryContextResetAndDeleteChildren(expr_context);
+ }
+
+ MemoryContextSwitchTo(old_context);
+ MemoryContextDelete(expr_context);
+}
+
+
+/*
+ * Fetch function for analyzing statistics object expressions.
+ *
+ * We have not bothered to construct tuples from the data, instead the data
+ * is just in Datum arrays.
+ */
+static Datum
+expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
+{
+ int i;
+
+ /* exprvals and exprnulls are already offset for proper column */
+ i = rownum * stats->rowstride;
+ *isNull = stats->exprnulls[i];
+ return stats->exprvals[i];
+}
+
+/*
+ * Build analyze data for a list of expressions. As this is not tied
+ * directly to a relation (table or index), we have to fake some of
+ * the fields in examine_expression().
+ */
+static AnlExprData *
+build_expr_data(List *exprs, int stattarget)
+{
+ int idx;
+ int nexprs = list_length(exprs);
+ AnlExprData *exprdata;
+ ListCell *lc;
+
+ exprdata = (AnlExprData *) palloc0(nexprs * sizeof(AnlExprData));
+
+ idx = 0;
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ AnlExprData *thisdata = &exprdata[idx];
+
+ thisdata->expr = expr;
+ thisdata->vacattrstat = examine_expression(expr, stattarget);
+ idx++;
+ }
+
+ return exprdata;
+}
+
+/* form an array of pg_statistic rows (per update_attstats) */
+static Datum
+serialize_expr_stats(AnlExprData *exprdata, int nexprs)
+{
+ int exprno;
+ Oid typOid;
+ Relation sd;
+
+ ArrayBuildState *astate = NULL;
+
+ sd = table_open(StatisticRelationId, RowExclusiveLock);
+
+ /* lookup OID of composite type for pg_statistic */
+ typOid = get_rel_type_id(StatisticRelationId);
+ if (!OidIsValid(typOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("relation \"pg_statistic\" does not have a composite type")));
+
+ for (exprno = 0; exprno < nexprs; exprno++)
+ {
+ int i,
+ k;
+ VacAttrStats *stats = exprdata[exprno].vacattrstat;
+
+ Datum values[Natts_pg_statistic];
+ bool nulls[Natts_pg_statistic];
+ HeapTuple stup;
+
+ if (!stats->stats_valid)
+ {
+ astate = accumArrayResult(astate,
+ (Datum) 0,
+ true,
+ typOid,
+ CurrentMemoryContext);
+ continue;
+ }
+
+ /*
+ * Construct a new pg_statistic tuple
+ */
+ for (i = 0; i < Natts_pg_statistic; ++i)
+ {
+ nulls[i] = false;
+ }
+
+ values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(InvalidOid);
+ values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(InvalidAttrNumber);
+ values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(false);
+ values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
+ values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth);
+ values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
+ i = Anum_pg_statistic_stakind1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ values[i++] = Int16GetDatum(stats->stakind[k]); /* stakindN */
+ }
+ i = Anum_pg_statistic_staop1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ values[i++] = ObjectIdGetDatum(stats->staop[k]); /* staopN */
+ }
+ i = Anum_pg_statistic_stacoll1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ values[i++] = ObjectIdGetDatum(stats->stacoll[k]); /* stacollN */
+ }
+ i = Anum_pg_statistic_stanumbers1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ int nnum = stats->numnumbers[k];
+
+ if (nnum > 0)
+ {
+ int n;
+ Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
+ ArrayType *arry;
+
+ for (n = 0; n < nnum; n++)
+ numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
+ /* XXX knows more than it should about type float4: */
+ arry = construct_array(numdatums, nnum,
+ FLOAT4OID,
+ sizeof(float4), true, TYPALIGN_INT);
+ values[i++] = PointerGetDatum(arry); /* stanumbersN */
+ }
+ else
+ {
+ nulls[i] = true;
+ values[i++] = (Datum) 0;
+ }
+ }
+ i = Anum_pg_statistic_stavalues1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ if (stats->numvalues[k] > 0)
+ {
+ ArrayType *arry;
+
+ arry = construct_array(stats->stavalues[k],
+ stats->numvalues[k],
+ stats->statypid[k],
+ stats->statyplen[k],
+ stats->statypbyval[k],
+ stats->statypalign[k]);
+ values[i++] = PointerGetDatum(arry); /* stavaluesN */
+ }
+ else
+ {
+ nulls[i] = true;
+ values[i++] = (Datum) 0;
+ }
+ }
+
+ stup = heap_form_tuple(RelationGetDescr(sd), values, nulls);
+
+ astate = accumArrayResult(astate,
+ heap_copy_tuple_as_datum(stup, RelationGetDescr(sd)),
+ false,
+ typOid,
+ CurrentMemoryContext);
+ }
+
+ table_close(sd, RowExclusiveLock);
+
+ return makeArrayResult(astate, CurrentMemoryContext);
+}
+
+/*
+ * Loads pg_statistic record from expression statistics for expression
+ * identified by the supplied index.
+ */
+HeapTuple
+statext_expressions_load(Oid stxoid, int idx)
+{
+ bool isnull;
+ Datum value;
+ HeapTuple htup;
+ ExpandedArrayHeader *eah;
+ HeapTupleHeader td;
+ HeapTupleData tmptup;
+ HeapTuple tup;
+
+ htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(stxoid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", stxoid);
+
+ value = SysCacheGetAttr(STATEXTDATASTXOID, htup,
+ Anum_pg_statistic_ext_data_stxdexpr, &isnull);
+ if (isnull)
+ elog(ERROR,
+ "requested statistics kind \"%c\" is not yet built for statistics object %u",
+ STATS_EXT_DEPENDENCIES, stxoid);
+
+ eah = DatumGetExpandedArray(value);
+
+ deconstruct_expanded_array(eah);
+
+ td = DatumGetHeapTupleHeader(eah->dvalues[idx]);
+
+ /* Build a temporary HeapTuple control structure */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(td);
+ tmptup.t_data = td;
+
+ tup = heap_copytuple(&tmptup);
+
+ ReleaseSysCache(htup);
+
+ return tup;
+}
+
+/*
+ * Evaluate the expressions, so that we can use the results to build
+ * all the requested statistics types. This matters especially for
+ * expensive expressions, of course.
+ */
+static StatsBuildData *
+make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows,
+ VacAttrStats **stats, int stattarget)
+{
+ /* evaluated expressions */
+ StatsBuildData *result;
+ char *ptr;
+ Size len;
+
+ int i;
+ int k;
+ int idx;
+ TupleTableSlot *slot;
+ EState *estate;
+ ExprContext *econtext;
+ List *exprstates = NIL;
+ int nkeys = bms_num_members(stat->columns) + list_length(stat->exprs);
+ ListCell *lc;
+
+ /* allocate everything as a single chunk, so we can free it easily */
+ len = MAXALIGN(sizeof(StatsBuildData));
+ len += MAXALIGN(sizeof(AttrNumber) * nkeys); /* attnums */
+ len += MAXALIGN(sizeof(VacAttrStats *) * nkeys); /* stats */
+
+ /* values */
+ len += MAXALIGN(sizeof(Datum *) * nkeys);
+ len += nkeys * MAXALIGN(sizeof(Datum) * numrows);
+
+ /* nulls */
+ len += MAXALIGN(sizeof(bool *) * nkeys);
+ len += nkeys * MAXALIGN(sizeof(bool) * numrows);
+
+ ptr = palloc(len);
+
+ /* set the pointers */
+ result = (StatsBuildData *) ptr;
+ ptr += MAXALIGN(sizeof(StatsBuildData));
+
+ /* attnums */
+ result->attnums = (AttrNumber *) ptr;
+ ptr += MAXALIGN(sizeof(AttrNumber) * nkeys);
+
+ /* stats */
+ result->stats = (VacAttrStats **) ptr;
+ ptr += MAXALIGN(sizeof(VacAttrStats *) * nkeys);
+
+ /* values */
+ result->values = (Datum **) ptr;
+ ptr += MAXALIGN(sizeof(Datum *) * nkeys);
+
+ /* nulls */
+ result->nulls = (bool **) ptr;
+ ptr += MAXALIGN(sizeof(bool *) * nkeys);
+
+ for (i = 0; i < nkeys; i++)
+ {
+ result->values[i] = (Datum *) ptr;
+ ptr += MAXALIGN(sizeof(Datum) * numrows);
+
+ result->nulls[i] = (bool *) ptr;
+ ptr += MAXALIGN(sizeof(bool) * numrows);
+ }
+
+ Assert((ptr - (char *) result) == len);
+
+ /* we have it allocated, so let's fill the values */
+ result->nattnums = nkeys;
+ result->numrows = numrows;
+
+ /* fill the attribute info - first attributes, then expressions */
+ idx = 0;
+ k = -1;
+ while ((k = bms_next_member(stat->columns, k)) >= 0)
+ {
+ result->attnums[idx] = k;
+ result->stats[idx] = stats[idx];
+
+ idx++;
+ }
+
+ k = -1;
+ foreach(lc, stat->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+
+ result->attnums[idx] = k;
+ result->stats[idx] = examine_expression(expr, stattarget);
+
+ idx++;
+ k--;
+ }
+
+ /* first extract values for all the regular attributes */
+ for (i = 0; i < numrows; i++)
+ {
+ idx = 0;
+ k = -1;
+ while ((k = bms_next_member(stat->columns, k)) >= 0)
+ {
+ result->values[idx][i] = heap_getattr(rows[i], k,
+ result->stats[idx]->tupDesc,
+ &result->nulls[idx][i]);
+
+ idx++;
+ }
+ }
+
+ /* Need an EState for evaluation expressions. */
+ estate = CreateExecutorState();
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Need a slot to hold the current heap tuple, too */
+ slot = MakeSingleTupleTableSlot(RelationGetDescr(rel),
+ &TTSOpsHeapTuple);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* Set up expression evaluation state */
+ exprstates = ExecPrepareExprList(stat->exprs, estate);
+
+ for (i = 0; i < numrows; i++)
+ {
+ /*
+ * Reset the per-tuple context each time, to reclaim any cruft left
+ * behind by evaluating the statistics object expressions.
+ */
+ ResetExprContext(econtext);
+
+ /* Set up for expression evaluation */
+ ExecStoreHeapTuple(rows[i], slot, false);
+
+ idx = bms_num_members(stat->columns);
+ foreach(lc, exprstates)
+ {
+ Datum datum;
+ bool isnull;
+ ExprState *exprstate = (ExprState *) lfirst(lc);
+
+ /*
+ * XXX This probably leaks memory. Maybe we should use
+ * ExecEvalExprSwitchContext but then we need to copy the result
+ * somewhere else.
+ */
+ datum = ExecEvalExpr(exprstate,
+ GetPerTupleExprContext(estate),
+ &isnull);
+ if (isnull)
+ {
+ result->values[idx][i] = (Datum) 0;
+ result->nulls[idx][i] = true;
+ }
+ else
+ {
+ result->values[idx][i] = (Datum) datum;
+ result->nulls[idx][i] = false;
+ }
+
+ idx++;
+ }
+ }
+
+ ExecDropSingleTupleTableSlot(slot);
+ FreeExecutorState(estate);
+
+ return result;
+}
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 8335dff241..2a00fb4848 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -74,7 +74,7 @@
((ndims) * sizeof(DimensionInfo)) + \
((nitems) * ITEM_SIZE(ndims)))
-static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
+static MultiSortSupport build_mss(StatsBuildData *data);
static SortItem *build_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss, int *ndistinct);
@@ -181,32 +181,33 @@ get_mincount_for_mcv_list(int samplerows, double totalrows)
*
*/
MCVList *
-statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
- VacAttrStats **stats, double totalrows, int stattarget)
+statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget)
{
int i,
numattrs,
+ numrows,
ngroups,
nitems;
- AttrNumber *attnums;
double mincount;
SortItem *items;
SortItem *groups;
MCVList *mcvlist = NULL;
MultiSortSupport mss;
- attnums = build_attnums_array(attrs, &numattrs);
-
/* comparator for all the columns */
- mss = build_mss(stats, numattrs);
+ mss = build_mss(data);
/* sort the rows */
- items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
- mss, numattrs, attnums);
+ items = build_sorted_items(data, &nitems, mss,
+ data->nattnums, data->attnums);
if (!items)
return NULL;
+ /* for convenience */
+ numattrs = data->nattnums;
+ numrows = data->numrows;
+
/* transform the sorted rows into groups (sorted by frequency) */
groups = build_distinct_groups(nitems, items, mss, &ngroups);
@@ -289,7 +290,7 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
/* store info about data type OIDs */
for (i = 0; i < numattrs; i++)
- mcvlist->types[i] = stats[i]->attrtypid;
+ mcvlist->types[i] = data->stats[i]->attrtypid;
/* Copy the first chunk of groups into the result. */
for (i = 0; i < nitems; i++)
@@ -347,9 +348,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
* build MultiSortSupport for the attributes passed in attrs
*/
static MultiSortSupport
-build_mss(VacAttrStats **stats, int numattrs)
+build_mss(StatsBuildData *data)
{
int i;
+ int numattrs = data->nattnums;
/* Sort by multiple columns (using array of SortSupport) */
MultiSortSupport mss = multi_sort_init(numattrs);
@@ -357,7 +359,7 @@ build_mss(VacAttrStats **stats, int numattrs)
/* prepare the sort functions for all the attributes */
for (i = 0; i < numattrs; i++)
{
- VacAttrStats *colstat = stats[i];
+ VacAttrStats *colstat = data->stats[i];
TypeCacheEntry *type;
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@@ -1524,6 +1526,59 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
}
/*
+ * match the attribute/expression to a dimension of the statistic
+ *
+ * Match the attribute/expression to statistics dimension. Optionally
+ * determine the collation.
+ */
+static int
+mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
+{
+ int idx = -1;
+
+ if (IsA(expr, Var))
+ {
+ /* simple Var, so just lookup using varattno */
+ Var *var = (Var *) expr;
+
+ if (collid)
+ *collid = var->varcollid;
+
+ idx = bms_member_index(keys, var->varattno);
+
+ /* make sure the index is valid */
+ Assert((idx >= 0) && (idx <= bms_num_members(keys)));
+ }
+ else
+ {
+ ListCell *lc;
+
+ /* expressions are stored after the simple columns */
+ idx = bms_num_members(keys);
+
+ if (collid)
+ *collid = exprCollation(expr);
+
+ /* expression - lookup in stats expressions */
+ foreach(lc, exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ if (equal(expr, stat_expr))
+ break;
+
+ idx++;
+ }
+
+ /* make sure the index is valid */
+ Assert((idx >= bms_num_members(keys)) &&
+ (idx <= bms_num_members(keys) + list_length(exprs)));
+ }
+
+ return idx;
+}
+
+/*
* mcv_get_match_bitmap
* Evaluate clauses using the MCV list, and update the match bitmap.
*
@@ -1544,7 +1599,8 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
*/
static bool *
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
- Bitmapset *keys, MCVList *mcvlist, bool is_or)
+ Bitmapset *keys, List *exprs,
+ MCVList *mcvlist, bool is_or)
{
int i;
ListCell *l;
@@ -1582,77 +1638,78 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
OpExpr *expr = (OpExpr *) clause;
FmgrInfo opproc;
- /* valid only after examine_clause_args returns true */
- Var *var;
+ /* valid only after examine_opclause_args returns true */
+ Node *clause_expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
+ int idx;
+ Oid collid;
fmgr_info(get_opcode(expr->opno), &opproc);
- /* extract the var and const from the expression */
- if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+ /* extract the var/expr and const from the expression */
+ if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+ elog(ERROR, "incompatible clause");
+
+ /* match the attribute/expression to a dimension of the statistic */
+ idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
+
+ /*
+ * Walk through the MCV items and evaluate the current clause. We
+ * can skip items that were already ruled out, and terminate if
+ * there are no remaining MCV items that might possibly match.
+ */
+ for (i = 0; i < mcvlist->nitems; i++)
{
- int idx;
+ bool match = true;
+ MCVItem *item = &mcvlist->items[i];
- /* match the attribute to a dimension of the statistic */
- idx = bms_member_index(keys, var->varattno);
+ Assert(idx >= 0);
/*
- * Walk through the MCV items and evaluate the current clause.
- * We can skip items that were already ruled out, and
- * terminate if there are no remaining MCV items that might
- * possibly match.
+ * When the MCV item or the Const value is NULL we can treat
+ * this as a mismatch. We must not call the operator because
+ * of strictness.
*/
- for (i = 0; i < mcvlist->nitems; i++)
+ if (item->isnull[idx] || cst->constisnull)
{
- bool match = true;
- MCVItem *item = &mcvlist->items[i];
-
- /*
- * When the MCV item or the Const value is NULL we can
- * treat this as a mismatch. We must not call the operator
- * because of strictness.
- */
- if (item->isnull[idx] || cst->constisnull)
- {
- matches[i] = RESULT_MERGE(matches[i], is_or, false);
- continue;
- }
+ matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ continue;
+ }
- /*
- * Skip MCV items that can't change result in the bitmap.
- * Once the value gets false for AND-lists, or true for
- * OR-lists, we don't need to look at more clauses.
- */
- if (RESULT_IS_FINAL(matches[i], is_or))
- continue;
+ /*
+ * Skip MCV items that can't change result in the bitmap. Once
+ * the value gets false for AND-lists, or true for OR-lists,
+ * we don't need to look at more clauses.
+ */
+ if (RESULT_IS_FINAL(matches[i], is_or))
+ continue;
- /*
- * First check whether the constant is below the lower
- * boundary (in that case we can skip the bucket, because
- * there's no overlap).
- *
- * We don't store collations used to build the statistics,
- * but we can use the collation for the attribute itself,
- * as stored in varcollid. We do reset the statistics
- * after a type change (including collation change), so
- * this is OK. We may need to relax this after allowing
- * extended statistics on expressions.
- */
- if (varonleft)
- match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- item->values[idx],
- cst->constvalue));
- else
- match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- cst->constvalue,
- item->values[idx]));
-
- /* update the match bitmap with the result */
- matches[i] = RESULT_MERGE(matches[i], is_or, match);
- }
+ /*
+ * First check whether the constant is below the lower
+ * boundary (in that case we can skip the bucket, because
+ * there's no overlap).
+ *
+ * We don't store collations used to build the statistics, but
+ * we can use the collation for the attribute itself, as
+ * stored in varcollid. We do reset the statistics after a
+ * type change (including collation change), so this is OK.
+ * For expressions we use the collation extracted from the
+ * expression itself.
+ */
+ if (expronleft)
+ match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ item->values[idx],
+ cst->constvalue));
+ else
+ match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ cst->constvalue,
+ item->values[idx]));
+
+ /* update the match bitmap with the result */
+ matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, ScalarArrayOpExpr))
@@ -1660,115 +1717,116 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
FmgrInfo opproc;
- /* valid only after examine_clause_args returns true */
- Var *var;
+ /* valid only after examine_opclause_args returns true */
+ Node *clause_expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
+ Oid collid;
+ int idx;
+
+ /* array evaluation */
+ ArrayType *arrayval;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ int num_elems;
+ Datum *elem_values;
+ bool *elem_nulls;
fmgr_info(get_opcode(expr->opno), &opproc);
- /* extract the var and const from the expression */
- if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+ /* extract the var/expr and const from the expression */
+ if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+ elog(ERROR, "incompatible clause");
+
+ /* ScalarArrayOpExpr has the Var always on the left */
+ Assert(expronleft);
+
+ /* XXX what if (cst->constisnull == NULL)? */
+ if (!cst->constisnull)
{
- int idx;
+ arrayval = DatumGetArrayTypeP(cst->constvalue);
+ get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
+ &elmlen, &elmbyval, &elmalign);
+ deconstruct_array(arrayval,
+ ARR_ELEMTYPE(arrayval),
+ elmlen, elmbyval, elmalign,
+ &elem_values, &elem_nulls, &num_elems);
+ }
- ArrayType *arrayval;
- int16 elmlen;
- bool elmbyval;
- char elmalign;
- int num_elems;
- Datum *elem_values;
- bool *elem_nulls;
+ /* match the attribute/expression to a dimension of the statistic */
+ idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
- /* ScalarArrayOpExpr has the Var always on the left */
- Assert(varonleft);
+ /*
+ * Walk through the MCV items and evaluate the current clause. We
+ * can skip items that were already ruled out, and terminate if
+ * there are no remaining MCV items that might possibly match.
+ */
+ for (i = 0; i < mcvlist->nitems; i++)
+ {
+ int j;
+ bool match = (expr->useOr ? false : true);
+ MCVItem *item = &mcvlist->items[i];
- if (!cst->constisnull)
+ /*
+ * When the MCV item or the Const value is NULL we can treat
+ * this as a mismatch. We must not call the operator because
+ * of strictness.
+ */
+ if (item->isnull[idx] || cst->constisnull)
{
- arrayval = DatumGetArrayTypeP(cst->constvalue);
- get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
- &elmlen, &elmbyval, &elmalign);
- deconstruct_array(arrayval,
- ARR_ELEMTYPE(arrayval),
- elmlen, elmbyval, elmalign,
- &elem_values, &elem_nulls, &num_elems);
+ matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ continue;
}
- /* match the attribute to a dimension of the statistic */
- idx = bms_member_index(keys, var->varattno);
-
/*
- * Walk through the MCV items and evaluate the current clause.
- * We can skip items that were already ruled out, and
- * terminate if there are no remaining MCV items that might
- * possibly match.
+ * Skip MCV items that can't change result in the bitmap. Once
+ * the value gets false for AND-lists, or true for OR-lists,
+ * we don't need to look at more clauses.
*/
- for (i = 0; i < mcvlist->nitems; i++)
+ if (RESULT_IS_FINAL(matches[i], is_or))
+ continue;
+
+ for (j = 0; j < num_elems; j++)
{
- int j;
- bool match = (expr->useOr ? false : true);
- MCVItem *item = &mcvlist->items[i];
+ Datum elem_value = elem_values[j];
+ bool elem_isnull = elem_nulls[j];
+ bool elem_match;
- /*
- * When the MCV item or the Const value is NULL we can
- * treat this as a mismatch. We must not call the operator
- * because of strictness.
- */
- if (item->isnull[idx] || cst->constisnull)
+ /* NULL values always evaluate as not matching. */
+ if (elem_isnull)
{
- matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ match = RESULT_MERGE(match, expr->useOr, false);
continue;
}
/*
- * Skip MCV items that can't change result in the bitmap.
- * Once the value gets false for AND-lists, or true for
- * OR-lists, we don't need to look at more clauses.
+ * Stop evaluating the array elements once we reach match
+ * value that can't change - ALL() is the same as
+ * AND-list, ANY() is the same as OR-list.
*/
- if (RESULT_IS_FINAL(matches[i], is_or))
- continue;
+ if (RESULT_IS_FINAL(match, expr->useOr))
+ break;
- for (j = 0; j < num_elems; j++)
- {
- Datum elem_value = elem_values[j];
- bool elem_isnull = elem_nulls[j];
- bool elem_match;
-
- /* NULL values always evaluate as not matching. */
- if (elem_isnull)
- {
- match = RESULT_MERGE(match, expr->useOr, false);
- continue;
- }
-
- /*
- * Stop evaluating the array elements once we reach
- * match value that can't change - ALL() is the same
- * as AND-list, ANY() is the same as OR-list.
- */
- if (RESULT_IS_FINAL(match, expr->useOr))
- break;
-
- elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- item->values[idx],
- elem_value));
-
- match = RESULT_MERGE(match, expr->useOr, elem_match);
- }
+ elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ item->values[idx],
+ elem_value));
- /* update the match bitmap with the result */
- matches[i] = RESULT_MERGE(matches[i], is_or, match);
+ match = RESULT_MERGE(match, expr->useOr, elem_match);
}
+
+ /* update the match bitmap with the result */
+ matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, NullTest))
{
NullTest *expr = (NullTest *) clause;
- Var *var = (Var *) (expr->arg);
+ Node *clause_expr = (Node *) (expr->arg);
- /* match the attribute to a dimension of the statistic */
- int idx = bms_member_index(keys, var->varattno);
+ /* match the attribute/expression to a dimension of the statistic */
+ int idx = mcv_match_expression(clause_expr, keys, exprs, NULL);
/*
* Walk through the MCV items and evaluate the current clause. We
@@ -1811,7 +1869,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Assert(list_length(bool_clauses) >= 2);
/* build the match bitmap for the OR-clauses */
- bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
+ bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs,
mcvlist, is_orclause(clause));
/*
@@ -1839,7 +1897,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Assert(list_length(not_args) == 1);
/* build the match bitmap for the NOT-clause */
- not_matches = mcv_get_match_bitmap(root, not_args, keys,
+ not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs,
mcvlist, false);
/*
@@ -1982,7 +2040,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
mcv = statext_mcv_load(stat->statOid);
/* build a match bitmap for the clauses */
- matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
+ matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
+ mcv, false);
/* sum frequencies for all the matching MCV items */
*basesel = 0.0;
@@ -2056,7 +2115,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
/* build the match bitmap for the new clause */
new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
- mcv, false);
+ stat->exprs, mcv, false);
/*
* Sum the frequencies for all the MCV items matching this clause and also
diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c
index e08c001e3f..4481312d61 100644
--- a/src/backend/statistics/mvdistinct.c
+++ b/src/backend/statistics/mvdistinct.c
@@ -36,8 +36,7 @@
#include "utils/syscache.h"
#include "utils/typcache.h"
-static double ndistinct_for_combination(double totalrows, int numrows,
- HeapTuple *rows, VacAttrStats **stats,
+static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
int k, int *combination);
static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
static int n_choose_k(int n, int k);
@@ -81,15 +80,18 @@ static void generate_combinations(CombinationGenerator *state);
*
* This computes the ndistinct estimate using the same estimator used
* in analyze.c and then computes the coefficient.
+ *
+ * To handle expressions easily, we treat them as system attributes with
+ * negative attnums, and offset everything by number of expressions to
+ * allow using Bitmapsets.
*/
MVNDistinct *
-statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats)
+statext_ndistinct_build(double totalrows, StatsBuildData *data)
{
MVNDistinct *result;
int k;
int itemcnt;
- int numattrs = bms_num_members(attrs);
+ int numattrs = data->nattnums;
int numcombs = num_combinations(numattrs);
result = palloc(offsetof(MVNDistinct, items) +
@@ -112,13 +114,19 @@ statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
MVNDistinctItem *item = &result->items[itemcnt];
int j;
- item->attrs = NULL;
+ item->attributes = palloc(sizeof(AttrNumber) * k);
+ item->nattributes = k;
+
+ /* translate the indexes to attnums */
for (j = 0; j < k; j++)
- item->attrs = bms_add_member(item->attrs,
- stats[combination[j]]->attr->attnum);
+ {
+ item->attributes[j] = data->attnums[combination[j]];
+
+ Assert(AttributeNumberIsValid(item->attributes[j]));
+ }
+
item->ndistinct =
- ndistinct_for_combination(totalrows, numrows, rows,
- stats, k, combination);
+ ndistinct_for_combination(totalrows, data, k, combination);
itemcnt++;
Assert(itemcnt <= result->nitems);
@@ -189,7 +197,7 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct)
{
int nmembers;
- nmembers = bms_num_members(ndistinct->items[i].attrs);
+ nmembers = ndistinct->items[i].nattributes;
Assert(nmembers >= 2);
len += SizeOfItem(nmembers);
@@ -214,22 +222,15 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct)
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem item = ndistinct->items[i];
- int nmembers = bms_num_members(item.attrs);
- int x;
+ int nmembers = item.nattributes;
memcpy(tmp, &item.ndistinct, sizeof(double));
tmp += sizeof(double);
memcpy(tmp, &nmembers, sizeof(int));
tmp += sizeof(int);
- x = -1;
- while ((x = bms_next_member(item.attrs, x)) >= 0)
- {
- AttrNumber value = (AttrNumber) x;
-
- memcpy(tmp, &value, sizeof(AttrNumber));
- tmp += sizeof(AttrNumber);
- }
+ memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
+ tmp += nmembers * sizeof(AttrNumber);
/* protect against overflows */
Assert(tmp <= ((char *) output + len));
@@ -301,27 +302,21 @@ statext_ndistinct_deserialize(bytea *data)
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem *item = &ndistinct->items[i];
- int nelems;
-
- item->attrs = NULL;
/* ndistinct value */
memcpy(&item->ndistinct, tmp, sizeof(double));
tmp += sizeof(double);
/* number of attributes */
- memcpy(&nelems, tmp, sizeof(int));
+ memcpy(&item->nattributes, tmp, sizeof(int));
tmp += sizeof(int);
- Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS));
+ Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
- while (nelems-- > 0)
- {
- AttrNumber attno;
+ item->attributes
+ = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
- memcpy(&attno, tmp, sizeof(AttrNumber));
- tmp += sizeof(AttrNumber);
- item->attrs = bms_add_member(item->attrs, attno);
- }
+ memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
+ tmp += sizeof(AttrNumber) * item->nattributes;
/* still within the bytea */
Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
@@ -369,17 +364,17 @@ pg_ndistinct_out(PG_FUNCTION_ARGS)
for (i = 0; i < ndist->nitems; i++)
{
+ int j;
MVNDistinctItem item = ndist->items[i];
- int x = -1;
- bool first = true;
if (i > 0)
appendStringInfoString(&str, ", ");
- while ((x = bms_next_member(item.attrs, x)) >= 0)
+ for (j = 0; j < item.nattributes; j++)
{
- appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x);
- first = false;
+ AttrNumber attnum = item.attributes[j];
+
+ appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
}
appendStringInfo(&str, "\": %d", (int) item.ndistinct);
}
@@ -427,8 +422,8 @@ pg_ndistinct_send(PG_FUNCTION_ARGS)
* combination of multiple columns.
*/
static double
-ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
- VacAttrStats **stats, int k, int *combination)
+ndistinct_for_combination(double totalrows, StatsBuildData *data,
+ int k, int *combination)
{
int i,
j;
@@ -439,6 +434,7 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
Datum *values;
SortItem *items;
MultiSortSupport mss;
+ int numrows = data->numrows;
mss = multi_sort_init(k);
@@ -467,25 +463,27 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
*/
for (i = 0; i < k; i++)
{
- VacAttrStats *colstat = stats[combination[i]];
+ Oid typid;
TypeCacheEntry *type;
+ Oid collid = InvalidOid;
+ VacAttrStats *colstat = data->stats[combination[i]];
+
+ typid = colstat->attrtypid;
+ collid = colstat->attrcollid;
- type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
+ type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
if (type->lt_opr == InvalidOid) /* shouldn't happen */
elog(ERROR, "cache lookup failed for ordering operator for type %u",
- colstat->attrtypid);
+ typid);
/* prepare the sort function for this dimension */
- multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
+ multi_sort_add_dimension(mss, i, type->lt_opr, collid);
/* accumulate all the data for this dimension into the arrays */
for (j = 0; j < numrows; j++)
{
- items[j].values[i] =
- heap_getattr(rows[j],
- colstat->attr->attnum,
- colstat->tupDesc,
- &items[j].isnull[i]);
+ items[j].values[i] = data->values[combination[i]][j];
+ items[j].isnull[i] = data->nulls[combination[i]][j];
}
}
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 729274b330..16c6f17e23 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -1816,7 +1816,34 @@ ProcessUtilitySlow(ParseState *pstate,
break;
case T_CreateStatsStmt:
- address = CreateStatistics((CreateStatsStmt *) parsetree);
+ {
+ Oid relid;
+ CreateStatsStmt *stmt = (CreateStatsStmt *) parsetree;
+ RangeVar *rel = (RangeVar *) linitial(stmt->relations);
+
+ if (!IsA(rel, RangeVar))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("only a single relation is allowed in CREATE STATISTICS")));
+
+ /*
+ * CREATE STATISTICS will influence future execution plans
+ * but does not interfere with currently executing plans.
+ * So it should be enough to take ShareUpdateExclusiveLock
+ * on relation, conflicting with ANALYZE and other DDL
+ * that sets statistical information, but not with normal
+ * queries.
+ *
+ * XXX RangeVarCallbackOwnsRelation not needed here, to
+ * keep the same behavior as before.
+ */
+ relid = RangeVarGetRelid(rel, ShareUpdateExclusiveLock, false);
+
+ /* Run parse analysis ... */
+ stmt = transformStatsStmt(relid, stmt, queryString);
+
+ address = CreateStatistics(stmt);
+ }
break;
case T_AlterStatsStmt:
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index f0de2a25c9..3de98d2333 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -336,7 +336,8 @@ static char *pg_get_indexdef_worker(Oid indexrelid, int colno,
bool attrsOnly, bool keysOnly,
bool showTblSpc, bool inherits,
int prettyFlags, bool missing_ok);
-static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok);
+static char *pg_get_statisticsobj_worker(Oid statextid, bool columns_only,
+ bool missing_ok);
static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags,
bool attrsOnly, bool missing_ok);
static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand,
@@ -1507,7 +1508,36 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS)
Oid statextid = PG_GETARG_OID(0);
char *res;
- res = pg_get_statisticsobj_worker(statextid, true);
+ res = pg_get_statisticsobj_worker(statextid, false, true);
+
+ if (res == NULL)
+ PG_RETURN_NULL();
+
+ PG_RETURN_TEXT_P(string_to_text(res));
+}
+
+/*
+ * Internal version for use by ALTER TABLE.
+ * Includes a tablespace clause in the result.
+ * Returns a palloc'd C string; no pretty-printing.
+ */
+char *
+pg_get_statisticsobjdef_string(Oid statextid)
+{
+ return pg_get_statisticsobj_worker(statextid, false, false);
+}
+
+/*
+ * pg_get_statisticsobjdef_columns
+ * Get columns and expressions for an extended statistics object
+ */
+Datum
+pg_get_statisticsobjdef_columns(PG_FUNCTION_ARGS)
+{
+ Oid statextid = PG_GETARG_OID(0);
+ char *res;
+
+ res = pg_get_statisticsobj_worker(statextid, true, true);
if (res == NULL)
PG_RETURN_NULL();
@@ -1519,7 +1549,7 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS)
* Internal workhorse to decompile an extended statistics object.
*/
static char *
-pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
+pg_get_statisticsobj_worker(Oid statextid, bool columns_only, bool missing_ok)
{
Form_pg_statistic_ext statextrec;
HeapTuple statexttup;
@@ -1534,6 +1564,11 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
bool dependencies_enabled;
bool mcv_enabled;
int i;
+ List *context;
+ ListCell *lc;
+ List *exprs = NIL;
+ bool has_exprs;
+ int ncolumns;
statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
@@ -1544,75 +1579,114 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
elog(ERROR, "cache lookup failed for statistics object %u", statextid);
}
- statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
+ /* has the statistics expressions? */
+ has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
- initStringInfo(&buf);
-
- nsp = get_namespace_name(statextrec->stxnamespace);
- appendStringInfo(&buf, "CREATE STATISTICS %s",
- quote_qualified_identifier(nsp,
- NameStr(statextrec->stxname)));
+ statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
/*
- * Decode the stxkind column so that we know which stats types to print.
+ * Get the statistics expressions, if any. (NOTE: we do not use the
+ * relcache versions of the expressions, because we want to display
+ * non-const-folded expressions.)
*/
- datum = SysCacheGetAttr(STATEXTOID, statexttup,
- Anum_pg_statistic_ext_stxkind, &isnull);
- Assert(!isnull);
- arr = DatumGetArrayTypeP(datum);
- if (ARR_NDIM(arr) != 1 ||
- ARR_HASNULL(arr) ||
- ARR_ELEMTYPE(arr) != CHAROID)
- elog(ERROR, "stxkind is not a 1-D char array");
- enabled = (char *) ARR_DATA_PTR(arr);
-
- ndistinct_enabled = false;
- dependencies_enabled = false;
- mcv_enabled = false;
-
- for (i = 0; i < ARR_DIMS(arr)[0]; i++)
+ if (has_exprs)
{
- if (enabled[i] == STATS_EXT_NDISTINCT)
- ndistinct_enabled = true;
- if (enabled[i] == STATS_EXT_DEPENDENCIES)
- dependencies_enabled = true;
- if (enabled[i] == STATS_EXT_MCV)
- mcv_enabled = true;
+ Datum exprsDatum;
+ bool isnull;
+ char *exprsString;
+
+ exprsDatum = SysCacheGetAttr(STATEXTOID, statexttup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+ Assert(!isnull);
+ exprsString = TextDatumGetCString(exprsDatum);
+ exprs = (List *) stringToNode(exprsString);
+ pfree(exprsString);
}
+ else
+ exprs = NIL;
- /*
- * If any option is disabled, then we'll need to append the types clause
- * to show which options are enabled. We omit the types clause on purpose
- * when all options are enabled, so a pg_dump/pg_restore will create all
- * statistics types on a newer postgres version, if the statistics had all
- * options enabled on the original version.
- */
- if (!ndistinct_enabled || !dependencies_enabled || !mcv_enabled)
+ /* count the number of columns (attributes and expressions) */
+ ncolumns = statextrec->stxkeys.dim1 + list_length(exprs);
+
+ initStringInfo(&buf);
+
+ if (!columns_only)
{
- bool gotone = false;
+ nsp = get_namespace_name(statextrec->stxnamespace);
+ appendStringInfo(&buf, "CREATE STATISTICS %s",
+ quote_qualified_identifier(nsp,
+ NameStr(statextrec->stxname)));
- appendStringInfoString(&buf, " (");
+ /*
+ * Decode the stxkind column so that we know which stats types to
+ * print.
+ */
+ datum = SysCacheGetAttr(STATEXTOID, statexttup,
+ Anum_pg_statistic_ext_stxkind, &isnull);
+ Assert(!isnull);
+ arr = DatumGetArrayTypeP(datum);
+ if (ARR_NDIM(arr) != 1 ||
+ ARR_HASNULL(arr) ||
+ ARR_ELEMTYPE(arr) != CHAROID)
+ elog(ERROR, "stxkind is not a 1-D char array");
+ enabled = (char *) ARR_DATA_PTR(arr);
- if (ndistinct_enabled)
+ ndistinct_enabled = false;
+ dependencies_enabled = false;
+ mcv_enabled = false;
+
+ for (i = 0; i < ARR_DIMS(arr)[0]; i++)
{
- appendStringInfoString(&buf, "ndistinct");
- gotone = true;
+ if (enabled[i] == STATS_EXT_NDISTINCT)
+ ndistinct_enabled = true;
+ else if (enabled[i] == STATS_EXT_DEPENDENCIES)
+ dependencies_enabled = true;
+ else if (enabled[i] == STATS_EXT_MCV)
+ mcv_enabled = true;
+
+ /* ignore STATS_EXT_EXPRESSIONS (it's built automatically) */
}
- if (dependencies_enabled)
+ /*
+ * If any option is disabled, then we'll need to append the types
+ * clause to show which options are enabled. We omit the types clause
+ * on purpose when all options are enabled, so a pg_dump/pg_restore
+ * will create all statistics types on a newer postgres version, if
+ * the statistics had all options enabled on the original version.
+ *
+ * But if the statistics is defined on just a single column, it has to
+ * be an expression statistics. In that case we don't need to specify
+ * kinds.
+ */
+ if ((!ndistinct_enabled || !dependencies_enabled || !mcv_enabled) &&
+ (ncolumns > 1))
{
- appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
- gotone = true;
- }
+ bool gotone = false;
- if (mcv_enabled)
- appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
+ appendStringInfoString(&buf, " (");
- appendStringInfoChar(&buf, ')');
- }
+ if (ndistinct_enabled)
+ {
+ appendStringInfoString(&buf, "ndistinct");
+ gotone = true;
+ }
+
+ if (dependencies_enabled)
+ {
+ appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
+ gotone = true;
+ }
- appendStringInfoString(&buf, " ON ");
+ if (mcv_enabled)
+ appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
+
+ appendStringInfoChar(&buf, ')');
+ }
+
+ appendStringInfoString(&buf, " ON ");
+ }
+ /* decode simple column references */
for (colno = 0; colno < statextrec->stxkeys.dim1; colno++)
{
AttrNumber attnum = statextrec->stxkeys.values[colno];
@@ -1626,8 +1700,33 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
appendStringInfoString(&buf, quote_identifier(attname));
}
- appendStringInfo(&buf, " FROM %s",
- generate_relation_name(statextrec->stxrelid, NIL));
+ context = deparse_context_for(get_relation_name(statextrec->stxrelid),
+ statextrec->stxrelid);
+
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ char *str;
+ int prettyFlags = PRETTYFLAG_INDENT;
+
+ str = deparse_expression_pretty(expr, context, false, false,
+ prettyFlags, 0);
+
+ if (colno > 0)
+ appendStringInfoString(&buf, ", ");
+
+ /* Need parens if it's not a bare function call */
+ if (looks_like_function(expr))
+ appendStringInfoString(&buf, str);
+ else
+ appendStringInfo(&buf, "(%s)", str);
+
+ colno++;
+ }
+
+ if (!columns_only)
+ appendStringInfo(&buf, " FROM %s",
+ generate_relation_name(statextrec->stxrelid, NIL));
ReleaseSysCache(statexttup);
@@ -1635,6 +1734,76 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
}
/*
+ * Generate text array of expressions for statistics object.
+ */
+Datum
+pg_get_statisticsobjdef_expressions(PG_FUNCTION_ARGS)
+{
+ Oid statextid = PG_GETARG_OID(0);
+ Form_pg_statistic_ext statextrec;
+ HeapTuple statexttup;
+ Datum datum;
+ bool isnull;
+ List *context;
+ ListCell *lc;
+ List *exprs = NIL;
+ bool has_exprs;
+ char *tmp;
+ ArrayBuildState *astate = NULL;
+
+ statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
+
+ if (!HeapTupleIsValid(statexttup))
+ elog(ERROR, "cache lookup failed for statistics object %u", statextid);
+
+ /* has the statistics expressions? */
+ has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
+
+ /* no expressions? we're done */
+ if (!has_exprs)
+ {
+ ReleaseSysCache(statexttup);
+ PG_RETURN_NULL();
+ }
+
+ statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
+
+ /*
+ * Get the statistics expressions, and deparse them into text values.
+ */
+ datum = SysCacheGetAttr(STATEXTOID, statexttup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ Assert(!isnull);
+ tmp = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(tmp);
+ pfree(tmp);
+
+ context = deparse_context_for(get_relation_name(statextrec->stxrelid),
+ statextrec->stxrelid);
+
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ char *str;
+ int prettyFlags = PRETTYFLAG_INDENT;
+
+ str = deparse_expression_pretty(expr, context, false, false,
+ prettyFlags, 0);
+
+ astate = accumArrayResult(astate,
+ PointerGetDatum(cstring_to_text(str)),
+ false,
+ TEXTOID,
+ CurrentMemoryContext);
+ }
+
+ ReleaseSysCache(statexttup);
+
+ PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
+}
+
+/*
* pg_get_partkeydef
*
* Returns the partition key specification, ie, the following:
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 2348d4a772..7e41bc5641 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3430,6 +3430,14 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
* If examine_variable is able to deduce anything about the GROUP BY
* expression, treat it as a single variable even if it's really more
* complicated.
+ *
+ * XXX This has the consequence that if there's a statistics on the
+ * expression, we don't split it into individual Vars. This affects
+ * our selection of statistics in estimate_multivariate_ndistinct,
+ * because it's probably better to use more accurate estimate for
+ * each expression and treat them as independent, than to combine
+ * estimates for the extracted variables when we don't know how that
+ * relates to the expressions.
*/
examine_variable(root, groupexpr, 0, &vardata);
if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
@@ -3880,50 +3888,77 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
List **varinfos, double *ndistinct)
{
ListCell *lc;
- Bitmapset *attnums = NULL;
- int nmatches;
+ int nmatches_vars;
+ int nmatches_exprs;
Oid statOid = InvalidOid;
MVNDistinct *stats;
- Bitmapset *matched = NULL;
+ StatisticExtInfo *matched_info = NULL;
/* bail out immediately if the table has no extended statistics */
if (!rel->statlist)
return false;
- /* Determine the attnums we're looking for */
- foreach(lc, *varinfos)
- {
- GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
- AttrNumber attnum;
-
- Assert(varinfo->rel == rel);
-
- if (!IsA(varinfo->var, Var))
- continue;
-
- attnum = ((Var *) varinfo->var)->varattno;
-
- if (!AttrNumberIsForUserDefinedAttr(attnum))
- continue;
-
- attnums = bms_add_member(attnums, attnum);
- }
-
/* look for the ndistinct statistics matching the most vars */
- nmatches = 1; /* we require at least two matches */
+ nmatches_vars = 0; /* we require at least two matches */
+ nmatches_exprs = 0;
foreach(lc, rel->statlist)
{
+ ListCell *lc2;
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
- Bitmapset *shared;
- int nshared;
+ int nshared_vars = 0;
+ int nshared_exprs = 0;
/* skip statistics of other kinds */
if (info->kind != STATS_EXT_NDISTINCT)
continue;
- /* compute attnums shared by the vars and the statistics object */
- shared = bms_intersect(info->keys, attnums);
- nshared = bms_num_members(shared);
+ /*
+ * Determine how many expressions (and variables in non-matched
+ * expressions) match. We'll then use these numbers to pick the
+ * statistics object that best matches the clauses.
+ */
+ foreach(lc2, *varinfos)
+ {
+ ListCell *lc3;
+ GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
+ AttrNumber attnum;
+
+ Assert(varinfo->rel == rel);
+
+ /* simple Var, search in statistics keys directly */
+ if (IsA(varinfo->var, Var))
+ {
+ attnum = ((Var *) varinfo->var)->varattno;
+
+ /*
+ * Ignore system attributes - we don't support statistics on
+ * them, so can't match them (and it'd fail as the values are
+ * negative).
+ */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ continue;
+
+ if (bms_is_member(attnum, info->keys))
+ nshared_vars++;
+
+ continue;
+ }
+
+ /* expression - see if it's in the statistics */
+ foreach(lc3, info->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc3);
+
+ if (equal(varinfo->var, expr))
+ {
+ nshared_exprs++;
+ break;
+ }
+ }
+ }
+
+ if (nshared_vars + nshared_exprs < 2)
+ continue;
/*
* Does this statistics object match more columns than the currently
@@ -3932,18 +3967,21 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
* XXX This should break ties using name of the object, or something
* like that, to make the outcome stable.
*/
- if (nshared > nmatches)
+ if ((nshared_exprs > nmatches_exprs) ||
+ (((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars)))
{
statOid = info->statOid;
- nmatches = nshared;
- matched = shared;
+ nmatches_vars = nshared_vars;
+ nmatches_exprs = nshared_exprs;
+ matched_info = info;
}
}
/* No match? */
if (statOid == InvalidOid)
return false;
- Assert(nmatches > 1 && matched != NULL);
+
+ Assert(nmatches_vars + nmatches_exprs > 1);
stats = statext_ndistinct_load(statOid);
@@ -3956,20 +3994,135 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
int i;
List *newlist = NIL;
MVNDistinctItem *item = NULL;
+ ListCell *lc2;
+ Bitmapset *matched = NULL;
+ AttrNumber attnum_offset;
+
+ /*
+ * How much we need to offset the attnums? If there are no
+ * expressions, no offset is needed. Otherwise offset enough to move
+ * the lowest one (which is equal to number of expressions) to 1.
+ */
+ if (matched_info->exprs)
+ attnum_offset = (list_length(matched_info->exprs) + 1);
+ else
+ attnum_offset = 0;
+
+ /* see what actually matched */
+ foreach(lc2, *varinfos)
+ {
+ ListCell *lc3;
+ int idx;
+ bool found = false;
+
+ GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
+
+ /*
+ * Process a simple Var expression, by matching it to keys
+ * directly. If there's a matchine expression, we'll try
+ * matching it later.
+ */
+ if (IsA(varinfo->var, Var))
+ {
+ AttrNumber attnum = ((Var *) varinfo->var)->varattno;
+
+ /*
+ * Ignore expressions on system attributes. Can't rely on
+ * the bms check for negative values.
+ */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ continue;
+
+ /* Is the variable covered by the statistics? */
+ if (!bms_is_member(attnum, matched_info->keys))
+ continue;
+
+ attnum = attnum + attnum_offset;
+
+ /* ensure sufficient offset */
+ Assert(AttrNumberIsForUserDefinedAttr(attnum));
+
+ matched = bms_add_member(matched, attnum);
+
+ found = true;
+ }
+
+ /*
+ * XXX Maybe we should allow searching the expressions even if we
+ * found an attribute matching the expression? That would handle
+ * trivial expressions like "(a)" but it seems fairly useless.
+ */
+ if (found)
+ continue;
+
+ /* expression - see if it's in the statistics */
+ idx = 0;
+ foreach(lc3, matched_info->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc3);
+
+ if (equal(varinfo->var, expr))
+ {
+ AttrNumber attnum = -(idx + 1);
+
+ attnum = attnum + attnum_offset;
+
+ /* ensure sufficient offset */
+ Assert(AttrNumberIsForUserDefinedAttr(attnum));
+
+ matched = bms_add_member(matched, attnum);
+
+ /* there should be just one matching expression */
+ break;
+ }
+
+ idx++;
+ }
+ }
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
+ int j;
MVNDistinctItem *tmpitem = &stats->items[i];
- if (bms_subset_compare(tmpitem->attrs, matched) == BMS_EQUAL)
+ if (tmpitem->nattributes != bms_num_members(matched))
+ continue;
+
+ /* assume it's the right item */
+ item = tmpitem;
+
+ /* check that all item attributes/expressions fit the match */
+ for (j = 0; j < tmpitem->nattributes; j++)
{
- item = tmpitem;
- break;
+ AttrNumber attnum = tmpitem->attributes[j];
+
+ /*
+ * Thanks to how we constructed the matched bitmap above, we
+ * can just offset all attnums the same way.
+ */
+ attnum = attnum + attnum_offset;
+
+ if (!bms_is_member(attnum, matched))
+ {
+ /* nah, it's not this item */
+ item = NULL;
+ break;
+ }
}
+
+ /*
+ * If the item has all the matched attributes, we know it's the
+ * right one - there can't be a better one. matching more.
+ */
+ if (item)
+ break;
}
- /* make sure we found an item */
+ /*
+ * Make sure we found an item. There has to be one, because ndistinct
+ * statistics includes all combinations of attributes.
+ */
if (!item)
elog(ERROR, "corrupt MVNDistinct entry");
@@ -3977,18 +4130,63 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
foreach(lc, *varinfos)
{
GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
- AttrNumber attnum;
+ ListCell *lc3;
+ bool found = false;
- if (!IsA(varinfo->var, Var))
+ /*
+ * Let's look at plain variables first, because it's the most
+ * common case and the check is quite cheap. We can simply get the
+ * attnum and check (with an offset) matched bitmap.
+ */
+ if (IsA(varinfo->var, Var))
{
- newlist = lappend(newlist, varinfo);
+ AttrNumber attnum = ((Var *) varinfo->var)->varattno;
+
+ /*
+ * If it's a system attribute, we're done. We don't support
+ * extended statistics on system attributes, so it's clearly
+ * not matched. Just keep the expression and continue.
+ */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ {
+ newlist = lappend(newlist, varinfo);
+ continue;
+ }
+
+ /* apply the same offset as above */
+ attnum += attnum_offset;
+
+ /* if it's not matched, keep the varinfo */
+ if (!bms_is_member(attnum, matched))
+ newlist = lappend(newlist, varinfo);
+
+ /* The rest of the loop deals with complex expressions. */
continue;
}
- attnum = ((Var *) varinfo->var)->varattno;
+ /*
+ * Process complex expressions, not just simple Vars.
+ *
+ * First, we search for an exact match of an expression. If we
+ * find one, we can just discard the whole GroupExprInfo, with all
+ * the variables we extracted from it.
+ *
+ * Otherwise we inspect the individual vars, and try matching it
+ * to variables in the item.
+ */
+ foreach(lc3, matched_info->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc3);
+
+ if (equal(varinfo->var, expr))
+ {
+ found = true;
+ break;
+ }
+ }
- if (AttrNumberIsForUserDefinedAttr(attnum) &&
- bms_is_member(attnum, matched))
+ /* found exact match, skip */
+ if (found)
continue;
newlist = lappend(newlist, varinfo);
@@ -4690,6 +4888,13 @@ get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo,
*join_is_reversed = false;
}
+/* statext_expressions_load copies the tuple, so just pfree it. */
+static void
+ReleaseDummy(HeapTuple tuple)
+{
+ pfree(tuple);
+}
+
/*
* examine_variable
* Try to look up statistical data about an expression.
@@ -4830,6 +5035,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
* operator we are estimating for. FIXME later.
*/
ListCell *ilist;
+ ListCell *slist;
foreach(ilist, onerel->indexlist)
{
@@ -4986,6 +5192,129 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
if (vardata->statsTuple)
break;
}
+
+ /*
+ * Search extended statistics for one with a matching expression.
+ * There might be multiple ones, so just grab the first one. In the
+ * future, we might consider the statistics target (and pick the most
+ * accurate statistics) and maybe some other parameters.
+ */
+ foreach(slist, onerel->statlist)
+ {
+ StatisticExtInfo *info = (StatisticExtInfo *) lfirst(slist);
+ ListCell *expr_item;
+ int pos;
+
+ /*
+ * Stop once we've found statistics for the expression (either
+ * from extended stats, or for an index in the preceding loop).
+ */
+ if (vardata->statsTuple)
+ break;
+
+ /* skip stats without per-expression stats */
+ if (info->kind != STATS_EXT_EXPRESSIONS)
+ continue;
+
+ pos = 0;
+ foreach(expr_item, info->exprs)
+ {
+ Node *expr = (Node *) lfirst(expr_item);
+
+ Assert(expr);
+
+ /* strip RelabelType before comparing it */
+ if (expr && IsA(expr, RelabelType))
+ expr = (Node *) ((RelabelType *) expr)->arg;
+
+ /* found a match, see if we can extract pg_statistic row */
+ if (equal(node, expr))
+ {
+ HeapTuple t = statext_expressions_load(info->statOid, pos);
+
+ /* Get index's table for permission check */
+ RangeTblEntry *rte;
+ Oid userid;
+
+ vardata->statsTuple = t;
+
+ /*
+ * XXX Not sure if we should cache the tuple somewhere.
+ * Now we just create a new copy every time.
+ */
+ vardata->freefunc = ReleaseDummy;
+
+ rte = planner_rt_fetch(onerel->relid, root);
+ Assert(rte->rtekind == RTE_RELATION);
+
+ /*
+ * Use checkAsUser if it's set, in case we're accessing
+ * the table via a view.
+ */
+ userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
+
+ /*
+ * For simplicity, we insist on the whole table being
+ * selectable, rather than trying to identify which
+ * column(s) the statistics depends on. Also require all
+ * rows to be selectable --- there must be no
+ * securityQuals from security barrier views or RLS
+ * policies.
+ */
+ vardata->acl_ok =
+ rte->securityQuals == NIL &&
+ (pg_class_aclcheck(rte->relid, userid,
+ ACL_SELECT) == ACLCHECK_OK);
+
+ /*
+ * If the user doesn't have permissions to access an
+ * inheritance child relation, check the permissions of
+ * the table actually mentioned in the query, since most
+ * likely the user does have that permission. Note that
+ * whole-table select privilege on the parent doesn't
+ * quite guarantee that the user could read all columns of
+ * the child. But in practice it's unlikely that any
+ * interesting security violation could result from
+ * allowing access to the expression stats, so we allow it
+ * anyway. See similar code in examine_simple_variable()
+ * for additional comments.
+ */
+ if (!vardata->acl_ok &&
+ root->append_rel_array != NULL)
+ {
+ AppendRelInfo *appinfo;
+ Index varno = onerel->relid;
+
+ appinfo = root->append_rel_array[varno];
+ while (appinfo &&
+ planner_rt_fetch(appinfo->parent_relid,
+ root)->rtekind == RTE_RELATION)
+ {
+ varno = appinfo->parent_relid;
+ appinfo = root->append_rel_array[varno];
+ }
+ if (varno != onerel->relid)
+ {
+ /* Repeat access check on this rel */
+ rte = planner_rt_fetch(varno, root);
+ Assert(rte->rtekind == RTE_RELATION);
+
+ userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
+
+ vardata->acl_ok =
+ rte->securityQuals == NIL &&
+ (pg_class_aclcheck(rte->relid,
+ userid,
+ ACL_SELECT) == ACLCHECK_OK);
+ }
+ }
+
+ break;
+ }
+
+ pos++;
+ }
+ }
}
}
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 737e46464a..86113df29c 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -2637,6 +2637,18 @@ my %tests = (
unlike => { exclude_dump_test_schema => 1, },
},
+ 'CREATE STATISTICS extended_stats_expression' => {
+ create_order => 99,
+ create_sql => 'CREATE STATISTICS dump_test.test_ext_stats_expr
+ ON (2 * col1) FROM dump_test.test_fifth_table',
+ regexp => qr/^
+ \QCREATE STATISTICS dump_test.test_ext_stats_expr ON ((2 * col1)) FROM dump_test.test_fifth_table;\E
+ /xms,
+ like =>
+ { %full_runs, %dump_test_schema_runs, section_post_data => 1, },
+ unlike => { exclude_dump_test_schema => 1, },
+ },
+
'CREATE SEQUENCE test_table_col1_seq' => {
regexp => qr/^
\QCREATE SEQUENCE dump_test.test_table_col1_seq\E
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c
index e56cc43e11..440249ff69 100644
--- a/src/bin/psql/describe.c
+++ b/src/bin/psql/describe.c
@@ -2712,7 +2712,104 @@ describeOneTableDetails(const char *schemaname,
}
/* print any extended statistics */
- if (pset.sversion >= 100000)
+ if (pset.sversion >= 140000)
+ {
+ printfPQExpBuffer(&buf,
+ "SELECT oid, "
+ "stxrelid::pg_catalog.regclass, "
+ "stxnamespace::pg_catalog.regnamespace AS nsp, "
+ "stxname,\n"
+ "pg_get_statisticsobjdef_columns(oid) AS columns,\n"
+ " 'd' = any(stxkind) AS ndist_enabled,\n"
+ " 'f' = any(stxkind) AS deps_enabled,\n"
+ " 'm' = any(stxkind) AS mcv_enabled,\n"
+ "stxstattarget\n"
+ "FROM pg_catalog.pg_statistic_ext stat\n"
+ "WHERE stxrelid = '%s'\n"
+ "ORDER BY 1;",
+ oid);
+
+ result = PSQLexec(buf.data);
+ if (!result)
+ goto error_return;
+ else
+ tuples = PQntuples(result);
+
+ if (tuples > 0)
+ {
+ printTableAddFooter(&cont, _("Statistics objects:"));
+
+ for (i = 0; i < tuples; i++)
+ {
+ bool gotone = false;
+ bool has_ndistinct;
+ bool has_dependencies;
+ bool has_mcv;
+ bool has_all;
+ bool has_some;
+
+ has_ndistinct = (strcmp(PQgetvalue(result, i, 5), "t") == 0);
+ has_dependencies = (strcmp(PQgetvalue(result, i, 6), "t") == 0);
+ has_mcv = (strcmp(PQgetvalue(result, i, 7), "t") == 0);
+
+ printfPQExpBuffer(&buf, " ");
+
+ /* statistics object name (qualified with namespace) */
+ appendPQExpBuffer(&buf, "\"%s\".\"%s\"",
+ PQgetvalue(result, i, 2),
+ PQgetvalue(result, i, 3));
+
+ /*
+ * When printing kinds we ignore expression statistics,
+ * which is used only internally and can't be specified by
+ * user. We don't print the kinds when either none are
+ * specified (in which case it has to be statistics on a
+ * single expr) or when all are specified (in which case
+ * we assume it's expanded by CREATE STATISTICS).
+ */
+ has_all = (has_ndistinct && has_dependencies && has_mcv);
+ has_some = (has_ndistinct || has_dependencies || has_mcv);
+
+ if (has_some && !has_all)
+ {
+ appendPQExpBuffer(&buf, " (");
+
+ /* options */
+ if (has_ndistinct)
+ {
+ appendPQExpBufferStr(&buf, "ndistinct");
+ gotone = true;
+ }
+
+ if (has_dependencies)
+ {
+ appendPQExpBuffer(&buf, "%sdependencies", gotone ? ", " : "");
+ gotone = true;
+ }
+
+ if (has_mcv)
+ {
+ appendPQExpBuffer(&buf, "%smcv", gotone ? ", " : "");
+ }
+
+ appendPQExpBuffer(&buf, ")");
+ }
+
+ appendPQExpBuffer(&buf, " ON %s FROM %s",
+ PQgetvalue(result, i, 4),
+ PQgetvalue(result, i, 1));
+
+ /* Show the stats target if it's not default */
+ if (strcmp(PQgetvalue(result, i, 8), "-1") != 0)
+ appendPQExpBuffer(&buf, "; STATISTICS %s",
+ PQgetvalue(result, i, 8));
+
+ printTableAddFooter(&cont, buf.data);
+ }
+ }
+ PQclear(result);
+ }
+ else if (pset.sversion >= 100000)
{
printfPQExpBuffer(&buf,
"SELECT oid, "
@@ -4468,18 +4565,27 @@ listExtendedStats(const char *pattern)
printfPQExpBuffer(&buf,
"SELECT \n"
"es.stxnamespace::pg_catalog.regnamespace::text AS \"%s\", \n"
- "es.stxname AS \"%s\", \n"
- "pg_catalog.format('%%s FROM %%s', \n"
- " (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
- " FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
- " JOIN pg_catalog.pg_attribute a \n"
- " ON (es.stxrelid = a.attrelid \n"
- " AND a.attnum = s.attnum \n"
- " AND NOT a.attisdropped)), \n"
- "es.stxrelid::regclass) AS \"%s\"",
+ "es.stxname AS \"%s\", \n",
gettext_noop("Schema"),
- gettext_noop("Name"),
- gettext_noop("Definition"));
+ gettext_noop("Name"));
+
+ if (pset.sversion >= 140000)
+ appendPQExpBuffer(&buf,
+ "pg_catalog.format('%%s FROM %%s', \n"
+ " pg_get_statisticsobjdef_columns(es.oid), \n"
+ " es.stxrelid::regclass) AS \"%s\"",
+ gettext_noop("Definition"));
+ else
+ appendPQExpBuffer(&buf,
+ "pg_catalog.format('%%s FROM %%s', \n"
+ " (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
+ " FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
+ " JOIN pg_catalog.pg_attribute a \n"
+ " ON (es.stxrelid = a.attrelid \n"
+ " AND a.attnum = s.attnum \n"
+ " AND NOT a.attisdropped)), \n"
+ "es.stxrelid::regclass) AS \"%s\"",
+ gettext_noop("Definition"));
appendPQExpBuffer(&buf,
",\nCASE WHEN 'd' = any(es.stxkind) THEN 'defined' \n"
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 474ee2982b..4a39da3c9d 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202103265
+#define CATALOG_VERSION_NO 202103266
#endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index ecf12f4639..cc7d90d2b0 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3658,6 +3658,14 @@
proname => 'pg_get_statisticsobjdef', provolatile => 's',
prorettype => 'text', proargtypes => 'oid',
prosrc => 'pg_get_statisticsobjdef' },
+{ oid => '8887', descr => 'extended statistics columns',
+ proname => 'pg_get_statisticsobjdef_columns', provolatile => 's',
+ prorettype => 'text', proargtypes => 'oid',
+ prosrc => 'pg_get_statisticsobjdef_columns' },
+{ oid => '8886', descr => 'extended statistics expressions',
+ proname => 'pg_get_statisticsobjdef_expressions', provolatile => 's',
+ prorettype => '_text', proargtypes => 'oid',
+ prosrc => 'pg_get_statisticsobjdef_expressions' },
{ oid => '3352', descr => 'partition key description',
proname => 'pg_get_partkeydef', provolatile => 's', prorettype => 'text',
proargtypes => 'oid', prosrc => 'pg_get_partkeydef' },
diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h
index 29649f5814..36912ce528 100644
--- a/src/include/catalog/pg_statistic_ext.h
+++ b/src/include/catalog/pg_statistic_ext.h
@@ -54,6 +54,9 @@ CATALOG(pg_statistic_ext,3381,StatisticExtRelationId)
#ifdef CATALOG_VARLEN
char stxkind[1] BKI_FORCE_NOT_NULL; /* statistics kinds requested
* to build */
+ pg_node_tree stxexprs; /* A list of expression trees for stats
+ * attributes that are not simple column
+ * references. */
#endif
} FormData_pg_statistic_ext;
@@ -81,6 +84,7 @@ DECLARE_ARRAY_FOREIGN_KEY((stxrelid, stxkeys), pg_attribute, (attrelid, attnum))
#define STATS_EXT_NDISTINCT 'd'
#define STATS_EXT_DEPENDENCIES 'f'
#define STATS_EXT_MCV 'm'
+#define STATS_EXT_EXPRESSIONS 'e'
#endif /* EXPOSE_TO_CLIENT_CODE */
diff --git a/src/include/catalog/pg_statistic_ext_data.h b/src/include/catalog/pg_statistic_ext_data.h
index 2f2577c218..5729154383 100644
--- a/src/include/catalog/pg_statistic_ext_data.h
+++ b/src/include/catalog/pg_statistic_ext_data.h
@@ -38,6 +38,7 @@ CATALOG(pg_statistic_ext_data,3429,StatisticExtDataRelationId)
pg_ndistinct stxdndistinct; /* ndistinct coefficients (serialized) */
pg_dependencies stxddependencies; /* dependencies (serialized) */
pg_mcv_list stxdmcv; /* MCV (serialized) */
+ pg_statistic stxdexpr[1]; /* stats for expressions */
#endif
diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h
index 1a79540c94..339f29f4c8 100644
--- a/src/include/commands/defrem.h
+++ b/src/include/commands/defrem.h
@@ -81,9 +81,7 @@ extern ObjectAddress AlterOperator(AlterOperatorStmt *stmt);
extern ObjectAddress CreateStatistics(CreateStatsStmt *stmt);
extern ObjectAddress AlterStatistics(AlterStatsStmt *stmt);
extern void RemoveStatisticsById(Oid statsOid);
-extern void UpdateStatisticsForTypeChange(Oid statsOid,
- Oid relationOid, int attnum,
- Oid oldColumnType, Oid newColumnType);
+extern Oid StatisticsGetRelation(Oid statId, bool missing_ok);
/* commands/aggregatecmds.c */
extern ObjectAddress DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle,
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index e22df890ef..299956f329 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -454,6 +454,7 @@ typedef enum NodeTag
T_TypeName,
T_ColumnDef,
T_IndexElem,
+ T_StatsElem,
T_Constraint,
T_DefElem,
T_RangeTblEntry,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 0ce19d98ec..12e0e026dc 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1914,7 +1914,8 @@ typedef enum AlterTableType
AT_AddIdentity, /* ADD IDENTITY */
AT_SetIdentity, /* SET identity column options */
AT_DropIdentity, /* DROP IDENTITY */
- AT_AlterCollationRefreshVersion /* ALTER COLLATION ... REFRESH VERSION */
+ AT_AlterCollationRefreshVersion, /* ALTER COLLATION ... REFRESH VERSION */
+ AT_ReAddStatistics /* internal to commands/tablecmds.c */
} AlterTableType;
typedef struct ReplicaIdentityStmt
@@ -2872,8 +2873,24 @@ typedef struct CreateStatsStmt
List *relations; /* rels to build stats on (list of RangeVar) */
char *stxcomment; /* comment to apply to stats, or NULL */
bool if_not_exists; /* do nothing if stats name already exists */
+ bool transformed; /* true when transformStatsStmt is finished */
} CreateStatsStmt;
+/*
+ * StatsElem - statistics parameters (used in CREATE STATISTICS)
+ *
+ * For a plain attribute, 'name' is the name of the referenced table column
+ * and 'expr' is NULL. For an expression, 'name' is NULL and 'expr' is the
+ * expression tree.
+ */
+typedef struct StatsElem
+{
+ NodeTag type;
+ char *name; /* name of attribute to index, or NULL */
+ Node *expr; /* expression to index, or NULL */
+} StatsElem;
+
+
/* ----------------------
* Alter Statistics Statement
* ----------------------
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index c13642e35e..e4b554f811 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -923,6 +923,7 @@ typedef struct StatisticExtInfo
RelOptInfo *rel; /* back-link to statistic's table */
char kind; /* statistics kind of this entry */
Bitmapset *keys; /* attnums of the columns covered */
+ List *exprs; /* expressions */
} StatisticExtInfo;
/*
diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h
index 176b9f37c1..a71d7e1f74 100644
--- a/src/include/parser/parse_node.h
+++ b/src/include/parser/parse_node.h
@@ -69,6 +69,7 @@ typedef enum ParseExprKind
EXPR_KIND_FUNCTION_DEFAULT, /* default parameter value for function */
EXPR_KIND_INDEX_EXPRESSION, /* index expression */
EXPR_KIND_INDEX_PREDICATE, /* index predicate */
+ EXPR_KIND_STATS_EXPRESSION, /* extended statistics expression */
EXPR_KIND_ALTER_COL_TRANSFORM, /* transform expr in ALTER COLUMN TYPE */
EXPR_KIND_EXECUTE_PARAMETER, /* parameter value in EXECUTE */
EXPR_KIND_TRIGGER_WHEN, /* WHEN condition in CREATE TRIGGER */
diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h
index bfa4a6b0f2..1056bf081b 100644
--- a/src/include/parser/parse_utilcmd.h
+++ b/src/include/parser/parse_utilcmd.h
@@ -26,6 +26,8 @@ extern AlterTableStmt *transformAlterTableStmt(Oid relid, AlterTableStmt *stmt,
List **afterStmts);
extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt,
const char *queryString);
+extern CreateStatsStmt *transformStatsStmt(Oid relid, CreateStatsStmt *stmt,
+ const char *queryString);
extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
List **actions, Node **whereClause);
extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index a0a3cf5b0f..55cd9252a5 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -57,19 +57,27 @@ typedef struct SortItem
int count;
} SortItem;
-extern MVNDistinct *statext_ndistinct_build(double totalrows,
- int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats);
+/* a unified representation of the data the statistics is built on */
+typedef struct StatsBuildData
+{
+ int numrows;
+ int nattnums;
+ AttrNumber *attnums;
+ VacAttrStats **stats;
+ Datum **values;
+ bool **nulls;
+} StatsBuildData;
+
+
+extern MVNDistinct *statext_ndistinct_build(double totalrows, StatsBuildData *data);
extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct);
extern MVNDistinct *statext_ndistinct_deserialize(bytea *data);
-extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats);
+extern MVDependencies *statext_dependencies_build(StatsBuildData *data);
extern bytea *statext_dependencies_serialize(MVDependencies *dependencies);
extern MVDependencies *statext_dependencies_deserialize(bytea *data);
-extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats,
+extern MCVList *statext_mcv_build(StatsBuildData *data,
double totalrows, int stattarget);
extern bytea *statext_mcv_serialize(MCVList *mcv, VacAttrStats **stats);
extern MCVList *statext_mcv_deserialize(bytea *data);
@@ -85,14 +93,14 @@ extern int multi_sort_compare_dims(int start, int end, const SortItem *a,
extern int compare_scalars_simple(const void *a, const void *b, void *arg);
extern int compare_datums_simple(Datum a, Datum b, SortSupport ssup);
-extern AttrNumber *build_attnums_array(Bitmapset *attrs, int *numattrs);
+extern AttrNumber *build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs);
-extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows,
- TupleDesc tdesc, MultiSortSupport mss,
+extern SortItem *build_sorted_items(StatsBuildData *data, int *nitems,
+ MultiSortSupport mss,
int numattrs, AttrNumber *attnums);
-extern bool examine_clause_args(List *args, Var **varp,
- Const **cstp, bool *varonleftp);
+extern bool examine_opclause_args(List *args, Node **exprp,
+ Const **cstp, bool *expronleftp);
extern Selectivity mcv_combine_selectivities(Selectivity simple_sel,
Selectivity mcv_sel,
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index fec50688ea..326cf26fea 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -26,7 +26,8 @@
typedef struct MVNDistinctItem
{
double ndistinct; /* ndistinct value for this combination */
- Bitmapset *attrs; /* attr numbers of items */
+ int nattributes; /* number of attributes */
+ AttrNumber *attributes; /* attribute numbers */
} MVNDistinctItem;
/* A MVNDistinct object, comprising all possible combinations of columns */
@@ -121,6 +122,8 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root,
extern bool has_stats_of_kind(List *stats, char requiredkind);
extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind,
Bitmapset **clause_attnums,
+ List **clause_exprs,
int nclauses);
+extern HeapTuple statext_expressions_load(Oid stxoid, int idx);
#endif /* STATISTICS_H */
diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h
index ac3d0a6742..d333e5e8a5 100644
--- a/src/include/utils/ruleutils.h
+++ b/src/include/utils/ruleutils.h
@@ -41,4 +41,6 @@ extern char *generate_collation_name(Oid collid);
extern char *generate_opclass_name(Oid opclass);
extern char *get_range_partbound_string(List *bound_datums);
+extern char *pg_get_statisticsobjdef_string(Oid statextid);
+
#endif /* RULEUTILS_H */
diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out
index 10d17be23c..4dc5e6aa5f 100644
--- a/src/test/regress/expected/create_table_like.out
+++ b/src/test/regress/expected/create_table_like.out
@@ -304,7 +304,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text);
CREATE INDEX ctlt1_b_key ON ctlt1 (b);
CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
+CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
+COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
COMMENT ON COLUMN ctlt1.a IS 'A';
COMMENT ON COLUMN ctlt1.b IS 'B';
COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';
@@ -414,7 +416,8 @@ Indexes:
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
- "public"."ctlt_all_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt_all
+ "public"."ctlt_all_a_b_stat" ON a, b FROM ctlt_all
+ "public"."ctlt_all_expr_stat" ON ((a || b)) FROM ctlt_all
SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_class c WHERE classoid = 'pg_class'::regclass AND objoid = i.indexrelid AND c.oid = i.indexrelid AND i.indrelid = 'ctlt_all'::regclass ORDER BY c.relname, objsubid;
relname | objsubid | description
@@ -424,10 +427,11 @@ SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_clas
(2 rows)
SELECT s.stxname, objsubid, description FROM pg_description, pg_statistic_ext s WHERE classoid = 'pg_statistic_ext'::regclass AND objoid = s.oid AND s.stxrelid = 'ctlt_all'::regclass ORDER BY s.stxname, objsubid;
- stxname | objsubid | description
--------------------+----------+-------------
- ctlt_all_a_b_stat | 0 | ab stats
-(1 row)
+ stxname | objsubid | description
+--------------------+----------+---------------
+ ctlt_all_a_b_stat | 0 | ab stats
+ ctlt_all_expr_stat | 0 | ab expr stats
+(2 rows)
CREATE TABLE inh_error1 () INHERITS (ctlt1, ctlt4);
NOTICE: merging multiple inherited definitions of column "a"
@@ -452,7 +456,8 @@ Indexes:
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
- "public"."pg_attrdef_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM public.pg_attrdef
+ "public"."pg_attrdef_a_b_stat" ON a, b FROM public.pg_attrdef
+ "public"."pg_attrdef_expr_stat" ON ((a || b)) FROM public.pg_attrdef
DROP TABLE public.pg_attrdef;
-- Check that LIKE isn't confused when new table masks the old, either
@@ -473,7 +478,8 @@ Indexes:
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
- "ctl_schema"."ctlt1_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt1
+ "ctl_schema"."ctlt1_a_b_stat" ON a, b FROM ctlt1
+ "ctl_schema"."ctlt1_expr_stat" ON ((a || b)) FROM ctlt1
ROLLBACK;
DROP TABLE ctlt1, ctlt2, ctlt3, ctlt4, ctlt12_storage, ctlt12_comments, ctlt1_inh, ctlt13_inh, ctlt13_like, ctlt_all, ctla, ctlb CASCADE;
diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out
index 50d046d3ef..1461e947cd 100644
--- a/src/test/regress/expected/oidjoins.out
+++ b/src/test/regress/expected/oidjoins.out
@@ -151,11 +151,6 @@ NOTICE: checking pg_aggregate {aggmfinalfn} => pg_proc {oid}
NOTICE: checking pg_aggregate {aggsortop} => pg_operator {oid}
NOTICE: checking pg_aggregate {aggtranstype} => pg_type {oid}
NOTICE: checking pg_aggregate {aggmtranstype} => pg_type {oid}
-NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid}
-NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
-NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid}
-NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
-NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
NOTICE: checking pg_statistic {starelid} => pg_class {oid}
NOTICE: checking pg_statistic {staop1} => pg_operator {oid}
NOTICE: checking pg_statistic {staop2} => pg_operator {oid}
@@ -168,6 +163,11 @@ NOTICE: checking pg_statistic {stacoll3} => pg_collation {oid}
NOTICE: checking pg_statistic {stacoll4} => pg_collation {oid}
NOTICE: checking pg_statistic {stacoll5} => pg_collation {oid}
NOTICE: checking pg_statistic {starelid,staattnum} => pg_attribute {attrelid,attnum}
+NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid}
+NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
+NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid}
+NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
+NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
NOTICE: checking pg_rewrite {ev_class} => pg_class {oid}
NOTICE: checking pg_trigger {tgrelid} => pg_class {oid}
NOTICE: checking pg_trigger {tgparentid} => pg_trigger {oid}
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 9b12cc122a..9b59a7b4a5 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2418,6 +2418,7 @@ pg_stats_ext| SELECT cn.nspname AS schemaname,
( SELECT array_agg(a.attname ORDER BY a.attnum) AS array_agg
FROM (unnest(s.stxkeys) k(k)
JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))) AS attnames,
+ pg_get_statisticsobjdef_expressions(s.oid) AS exprs,
s.stxkind AS kinds,
sd.stxdndistinct AS n_distinct,
sd.stxddependencies AS dependencies,
@@ -2439,6 +2440,78 @@ pg_stats_ext| SELECT cn.nspname AS schemaname,
FROM (unnest(s.stxkeys) k(k)
JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))
WHERE (NOT has_column_privilege(c.oid, a.attnum, 'select'::text))))) AND ((c.relrowsecurity = false) OR (NOT row_security_active(c.oid))));
+pg_stats_ext_exprs| SELECT cn.nspname AS schemaname,
+ c.relname AS tablename,
+ sn.nspname AS statistics_schemaname,
+ s.stxname AS statistics_name,
+ pg_get_userbyid(s.stxowner) AS statistics_owner,
+ stat.expr,
+ (stat.a).stanullfrac AS null_frac,
+ (stat.a).stawidth AS avg_width,
+ (stat.a).stadistinct AS n_distinct,
+ CASE
+ WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS most_common_vals,
+ CASE
+ WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stanumbers1
+ WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stanumbers2
+ WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stanumbers3
+ WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stanumbers4
+ WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stanumbers5
+ ELSE NULL::real[]
+ END AS most_common_freqs,
+ CASE
+ WHEN ((stat.a).stakind1 = 2) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 2) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 2) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 2) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 2) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS histogram_bounds,
+ CASE
+ WHEN ((stat.a).stakind1 = 3) THEN (stat.a).stanumbers1[1]
+ WHEN ((stat.a).stakind2 = 3) THEN (stat.a).stanumbers2[1]
+ WHEN ((stat.a).stakind3 = 3) THEN (stat.a).stanumbers3[1]
+ WHEN ((stat.a).stakind4 = 3) THEN (stat.a).stanumbers4[1]
+ WHEN ((stat.a).stakind5 = 3) THEN (stat.a).stanumbers5[1]
+ ELSE NULL::real
+ END AS correlation,
+ CASE
+ WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS most_common_elems,
+ CASE
+ WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stanumbers1
+ WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stanumbers2
+ WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stanumbers3
+ WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stanumbers4
+ WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stanumbers5
+ ELSE NULL::real[]
+ END AS most_common_elem_freqs,
+ CASE
+ WHEN ((stat.a).stakind1 = 5) THEN (stat.a).stanumbers1
+ WHEN ((stat.a).stakind2 = 5) THEN (stat.a).stanumbers2
+ WHEN ((stat.a).stakind3 = 5) THEN (stat.a).stanumbers3
+ WHEN ((stat.a).stakind4 = 5) THEN (stat.a).stanumbers4
+ WHEN ((stat.a).stakind5 = 5) THEN (stat.a).stanumbers5
+ ELSE NULL::real[]
+ END AS elem_count_histogram
+ FROM (((((pg_statistic_ext s
+ JOIN pg_class c ON ((c.oid = s.stxrelid)))
+ LEFT JOIN pg_statistic_ext_data sd ON ((s.oid = sd.stxoid)))
+ LEFT JOIN pg_namespace cn ON ((cn.oid = c.relnamespace)))
+ LEFT JOIN pg_namespace sn ON ((sn.oid = s.stxnamespace)))
+ JOIN LATERAL ( SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+ unnest(sd.stxdexpr) AS a) stat ON ((stat.expr IS NOT NULL)));
pg_tables| SELECT n.nspname AS schemaname,
c.relname AS tablename,
pg_get_userbyid(c.relowner) AS tableowner,
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 07af1e29e3..60e9bfcd78 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -25,7 +25,7 @@ begin
end;
$$;
-- Verify failures
-CREATE TABLE ext_stats_test (x int, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int);
CREATE STATISTICS tst;
ERROR: syntax error at or near ";"
LINE 1: CREATE STATISTICS tst;
@@ -44,12 +44,25 @@ CREATE STATISTICS tst ON a, b FROM ext_stats_test;
ERROR: column "a" does not exist
CREATE STATISTICS tst ON x, x, y FROM ext_stats_test;
ERROR: duplicate column name in statistics definition
-CREATE STATISTICS tst ON x + y FROM ext_stats_test;
-ERROR: only simple column references are allowed in CREATE STATISTICS
-CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
-ERROR: only simple column references are allowed in CREATE STATISTICS
+CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test;
+ERROR: cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+ERROR: cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+ERROR: cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test;
+ERROR: duplicate expression in statistics definition
CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test;
ERROR: unrecognized statistics kind "unrecognized"
+-- incorrect expressions
+CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
+ERROR: syntax error at or near "+"
+LINE 1: CREATE STATISTICS tst ON y + z FROM ext_stats_test;
+ ^
+CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
+ERROR: syntax error at or near ","
+LINE 1: CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
+ ^
DROP TABLE ext_stats_test;
-- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
@@ -79,7 +92,7 @@ ALTER TABLE ab1 DROP COLUMN a;
b | integer | | |
c | integer | | |
Statistics objects:
- "public"."ab1_b_c_stats" (ndistinct, dependencies, mcv) ON b, c FROM ab1
+ "public"."ab1_b_c_stats" ON b, c FROM ab1
-- Ensure statistics are dropped when table is
SELECT stxname FROM pg_statistic_ext WHERE stxname LIKE 'ab1%';
@@ -111,7 +124,7 @@ ALTER STATISTICS ab1_a_b_stats SET STATISTICS 0;
a | integer | | |
b | integer | | |
Statistics objects:
- "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1; STATISTICS 0
+ "public"."ab1_a_b_stats" ON a, b FROM ab1; STATISTICS 0
ANALYZE ab1;
SELECT stxname, stxdndistinct, stxddependencies, stxdmcv
@@ -131,7 +144,7 @@ ALTER STATISTICS ab1_a_b_stats SET STATISTICS -1;
a | integer | | | | plain | |
b | integer | | | | plain | |
Statistics objects:
- "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1
+ "public"."ab1_a_b_stats" ON a, b FROM ab1
-- partial analyze doesn't build stats either
ANALYZE ab1 (a);
@@ -150,6 +163,39 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1;
ANALYZE ab1;
DROP TABLE ab1 CASCADE;
NOTICE: drop cascades to table ab1c
+-- basic test for statistics on expressions
+CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
+-- expression stats may be built on a single expression column
+CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1;
+-- with a single expression, we only enable expression statistics
+CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2';
+ stxkind
+---------
+ {e}
+(1 row)
+
+-- adding anything to the expression builds all statistics kinds
+CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3';
+ stxkind
+-----------
+ {d,f,m,e}
+(1 row)
+
+-- date_trunc on timestamptz is not immutable, but that should not matter
+CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1;
+-- date_trunc on timestamp is immutable
+CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1;
+-- insert some data and run analyze, to test that these cases build properly
+INSERT INTO ab1
+SELECT
+ generate_series(1,10),
+ generate_series(1,10),
+ generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'),
+ generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day');
+ANALYZE ab1;
+DROP TABLE ab1;
-- Verify supported object types for extended statistics
CREATE schema tststats;
CREATE TABLE tststats.t (a int, b int, c text);
@@ -244,6 +290,30 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
200 | 11
(1 row)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
-- correct command
CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
ANALYZE ndistinct;
@@ -282,6 +352,32 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b
11 | 11
(1 row)
+-- partial improvement (match on attributes)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
+-- expressions - no improvement
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
@@ -343,6 +439,30 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d
200 | 13
(1 row)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 1000 | 1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
DROP STATISTICS s10;
SELECT s.stxkind, d.stxdndistinct
FROM pg_statistic_ext s, pg_statistic_ext_data d
@@ -383,6 +503,413 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d
200 | 13
(1 row)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual
+-----------+--------
+ 100 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 100 | 1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 221
+(1 row)
+
+-- ndistinct estimates with statistics on expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 100 | 1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 221
+(1 row)
+
+CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+ stxkind | stxdndistinct
+---------+-------------------------------------------------------------------
+ {d,e} | {"-1, -2": 221, "-1, -3": 247, "-2, -3": 323, "-1, -2, -3": 1000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 1000 | 1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
+DROP STATISTICS s10;
+-- a mix of attributes and expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 100 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+ estimated | actual
+-----------+--------
+ 100 | 247
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+ estimated | actual
+-----------+--------
+ 100 | 1000
+(1 row)
+
+CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+ stxkind | stxdndistinct
+---------+-------------------------------------------------------------
+ {d,e} | {"3, 4": 221, "3, -1": 247, "4, -1": 323, "3, 4, -1": 1000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+ estimated | actual
+-----------+--------
+ 247 | 247
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+ estimated | actual
+-----------+--------
+ 1000 | 1000
+(1 row)
+
+DROP STATISTICS s10;
+-- combination of multiple ndistinct statistics, with/without expressions
+TRUNCATE ndistinct;
+-- two mostly independent groups of columns
+INSERT INTO ndistinct (a, b, c, d)
+ SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20)
+ FROM generate_series(1,1000) s(i);
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 27 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual
+-----------+--------
+ 27 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual
+-----------+--------
+ 27 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual
+-----------+--------
+ 27 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual
+-----------+--------
+ 100 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual
+-----------+--------
+ 100 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
+(1 row)
+
+-- basic statistics on both attributes (no expressions)
+CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
+(1 row)
+
+-- replace the second statistics by statistics on expressions
+DROP STATISTICS s12;
+CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
+(1 row)
+
+-- replace the second statistics by statistics on both attributes and expressions
+DROP STATISTICS s12;
+CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
+(1 row)
+
+-- replace the other statistics by statistics on both attributes and expressions
+DROP STATISTICS s11;
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
+(1 row)
+
+-- replace statistics by somewhat overlapping ones (this expected to get worse estimate
+-- because the first statistics shall be applied to 3 columns, and the second one can't
+-- be really applied)
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual
+-----------+--------
+ 100 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
+(1 row)
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
-- functional dependencies tests
CREATE TABLE functional_dependencies (
filler1 TEXT,
@@ -430,6 +957,40 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
-- a => b, a => c, b => c
TRUNCATE functional_dependencies;
DROP STATISTICS func_deps_stat;
+-- now do the same thing, but with expressions
+INSERT INTO functional_dependencies (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,5000) s(i);
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 35
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 5
+(1 row)
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies;
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+ estimated | actual
+-----------+--------
+ 35 | 35
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+ estimated | actual
+-----------+--------
+ 5 | 5
+(1 row)
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
INSERT INTO functional_dependencies (a, b, c, filler1)
SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
ANALYZE functional_dependencies;
@@ -755,14 +1316,14 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
1 | 0
(1 row)
--- changing the type of column c causes its single-column stats to be dropped,
--- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
--- clauses estimated with functional dependencies does not exceed this
+-- changing the type of column c causes all its stats to be dropped, reverting
+-- to default estimates without any statistics, i.e. 0.5% selectivity for each
+-- condition
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
estimated | actual
-----------+--------
- 25 | 50
+ 1 | 50
(1 row)
ANALYZE functional_dependencies;
@@ -772,6 +1333,332 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
50 | 50
(1 row)
+DROP STATISTICS func_deps_stat;
+-- now try functional dependencies with expressions
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+ estimated | actual
+-----------+--------
+ 1 | 400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+ estimated | actual
+-----------+--------
+ 1 | 400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+ estimated | actual
+-----------+--------
+ 926 | 1900
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1543 | 2250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 2229 | 2050
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+-- create statistics on expressions
+CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies;
+ANALYZE functional_dependencies;
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+ dependencies
+------------------------------------------------------------------------------------------------------------------------
+ {"-1 => -2": 1.000000, "-1 => -3": 1.000000, "-2 => -3": 1.000000, "-1, -2 => -3": 1.000000, "-1, -3 => -2": 1.000000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+ estimated | actual
+-----------+--------
+ 400 | 400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 99 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 99 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 197 | 200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 3 | 100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+ estimated | actual
+-----------+--------
+ 400 | 400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+ estimated | actual
+-----------+--------
+ 1957 | 1900
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 2933 | 2250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 3548 | 2050
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+ estimated | actual
+-----------+--------
+ 2 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
-- check the ability to use multiple functional dependencies
CREATE TABLE functional_dependencies_multi (
a INTEGER,
@@ -896,6 +1783,39 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
1 | 1
(1 row)
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+-- random data (no MCV list), but with expression
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 13
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+ estimated | actual
+-----------+--------
+ 13 | 13
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
-- 100 distinct combinations, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
@@ -1121,6 +2041,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
200 | 200
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
estimated | actual
-----------+--------
@@ -1207,6 +2133,212 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
50 | 50
(1 row)
+-- 100 distinct combinations, all in the MCV list, but with expressions
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+ANALYZE mcv_lists;
+-- without any stats on the expressions, we have to use default selectivities, which
+-- is why the estimates here are different from the pre-computed case above
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual
+-----------+--------
+ 111 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 111 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 15 | 120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual
+-----------+--------
+ 11 | 150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats)
+CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 149 | 120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual
+-----------+--------
+ 20 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 20 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual
+-----------+--------
+ 116 | 150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual
+-----------+--------
+ 12 | 100
+(1 row)
+
+DROP STATISTICS mcv_lists_stats_1;
+DROP STATISTICS mcv_lists_stats_2;
+DROP STATISTICS mcv_lists_stats_3;
+-- create statistics with both MCV and expressions
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 105 | 120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual
+-----------+--------
+ 150 | 150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
-- 100 distinct combinations with NULL values, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
@@ -1712,6 +2844,100 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR
(1 row)
DROP TABLE mcv_lists_multi;
+-- statistics on integer expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+DROP STATISTICS expr_stats_1;
+DROP TABLE expr_stats;
+-- statistics on a mix columns and expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+DROP TABLE expr_stats;
+-- statistics on expressions with different data types
+CREATE TABLE expr_stats (a int, b name, c text);
+INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+ estimated | actual
+-----------+--------
+ 11 | 100
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+DROP TABLE expr_stats;
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
-- the underlying table.
@@ -1743,21 +2969,21 @@ create statistics stts_s2.stts_yama (dependencies, mcv) on col1, col3 from stts_
insert into stts_t1 select i,i from generate_series(1,100) i;
analyze stts_t1;
\dX
- List of extended statistics
- Schema | Name | Definition | Ndistinct | Dependencies | MCV
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public | func_deps_stat | a, b, c FROM functional_dependencies | | defined |
- public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
- public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
- public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
- public | stts_1 | a, b FROM stts_t1 | defined | |
- public | stts_2 | a, b FROM stts_t1 | defined | defined |
- public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
- public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
- public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
- stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
- stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
- tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
+ List of extended statistics
+ Schema | Name | Definition | Ndistinct | Dependencies | MCV
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined |
+ public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
+ public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
+ public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
+ public | stts_1 | a, b FROM stts_t1 | defined | |
+ public | stts_2 | a, b FROM stts_t1 | defined | defined |
+ public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
+ public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
+ public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
+ stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
+ stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
+ tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
(12 rows)
\dX stts_?
@@ -1778,21 +3004,21 @@ analyze stts_t1;
(1 row)
\dX+
- List of extended statistics
- Schema | Name | Definition | Ndistinct | Dependencies | MCV
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public | func_deps_stat | a, b, c FROM functional_dependencies | | defined |
- public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
- public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
- public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
- public | stts_1 | a, b FROM stts_t1 | defined | |
- public | stts_2 | a, b FROM stts_t1 | defined | defined |
- public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
- public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
- public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
- stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
- stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
- tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
+ List of extended statistics
+ Schema | Name | Definition | Ndistinct | Dependencies | MCV
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined |
+ public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
+ public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
+ public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
+ public | stts_1 | a, b FROM stts_t1 | defined | |
+ public | stts_2 | a, b FROM stts_t1 | defined | defined |
+ public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
+ public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
+ public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
+ stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
+ stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
+ tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
(12 rows)
\dX+ stts_?
@@ -1822,21 +3048,21 @@ analyze stts_t1;
create role regress_stats_ext nosuperuser;
set role regress_stats_ext;
\dX
- List of extended statistics
- Schema | Name | Definition | Ndistinct | Dependencies | MCV
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public | func_deps_stat | a, b, c FROM functional_dependencies | | defined |
- public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
- public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
- public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
- public | stts_1 | a, b FROM stts_t1 | defined | |
- public | stts_2 | a, b FROM stts_t1 | defined | defined |
- public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
- public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
- public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
- stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
- stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
- tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
+ List of extended statistics
+ Schema | Name | Definition | Ndistinct | Dependencies | MCV
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined |
+ public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
+ public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
+ public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
+ public | stts_1 | a, b FROM stts_t1 | defined | |
+ public | stts_2 | a, b FROM stts_t1 | defined | defined |
+ public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
+ public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
+ public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
+ stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
+ stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
+ tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
(12 rows)
reset role;
diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql
index 06b76f949d..4929d373a2 100644
--- a/src/test/regress/sql/create_table_like.sql
+++ b/src/test/regress/sql/create_table_like.sql
@@ -124,7 +124,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text);
CREATE INDEX ctlt1_b_key ON ctlt1 (b);
CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
+CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
+COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
COMMENT ON COLUMN ctlt1.a IS 'A';
COMMENT ON COLUMN ctlt1.b IS 'B';
COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index f9f12769e4..fb3af6e523 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -28,16 +28,21 @@ end;
$$;
-- Verify failures
-CREATE TABLE ext_stats_test (x int, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int);
CREATE STATISTICS tst;
CREATE STATISTICS tst ON a, b;
CREATE STATISTICS tst FROM sometab;
CREATE STATISTICS tst ON a, b FROM nonexistent;
CREATE STATISTICS tst ON a, b FROM ext_stats_test;
CREATE STATISTICS tst ON x, x, y FROM ext_stats_test;
-CREATE STATISTICS tst ON x + y FROM ext_stats_test;
-CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
+CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test;
+CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test;
CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test;
+-- incorrect expressions
+CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
+CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
DROP TABLE ext_stats_test;
-- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
@@ -97,6 +102,36 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1;
ANALYZE ab1;
DROP TABLE ab1 CASCADE;
+-- basic test for statistics on expressions
+CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
+
+-- expression stats may be built on a single expression column
+CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1;
+
+-- with a single expression, we only enable expression statistics
+CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2';
+
+-- adding anything to the expression builds all statistics kinds
+CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3';
+
+-- date_trunc on timestamptz is not immutable, but that should not matter
+CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1;
+
+-- date_trunc on timestamp is immutable
+CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1;
+
+-- insert some data and run analyze, to test that these cases build properly
+INSERT INTO ab1
+SELECT
+ generate_series(1,10),
+ generate_series(1,10),
+ generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'),
+ generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day');
+ANALYZE ab1;
+DROP TABLE ab1;
+
-- Verify supported object types for extended statistics
CREATE schema tststats;
@@ -164,6 +199,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
-- correct command
CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
@@ -184,6 +227,16 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
+-- partial improvement (match on attributes)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+-- expressions - no improvement
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
@@ -216,6 +269,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
DROP STATISTICS s10;
SELECT s.stxkind, d.stxdndistinct
@@ -234,6 +295,206 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+-- ndistinct estimates with statistics on expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+DROP STATISTICS s10;
+
+-- a mix of attributes and expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+
+CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+
+DROP STATISTICS s10;
+
+-- combination of multiple ndistinct statistics, with/without expressions
+TRUNCATE ndistinct;
+
+-- two mostly independent groups of columns
+INSERT INTO ndistinct (a, b, c, d)
+ SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20)
+ FROM generate_series(1,1000) s(i);
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+-- basic statistics on both attributes (no expressions)
+CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct;
+
+CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the second statistics by statistics on expressions
+
+DROP STATISTICS s12;
+
+CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the second statistics by statistics on both attributes and expressions
+
+DROP STATISTICS s12;
+
+CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the other statistics by statistics on both attributes and expressions
+
+DROP STATISTICS s11;
+
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace statistics by somewhat overlapping ones (this expected to get worse estimate
+-- because the first statistics shall be applied to 3 columns, and the second one can't
+-- be really applied)
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+
-- functional dependencies tests
CREATE TABLE functional_dependencies (
filler1 TEXT,
@@ -272,6 +533,29 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
TRUNCATE functional_dependencies;
DROP STATISTICS func_deps_stat;
+-- now do the same thing, but with expressions
+INSERT INTO functional_dependencies (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,5000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies;
+
+ANALYZE functional_dependencies;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
+
INSERT INTO functional_dependencies (a, b, c, filler1)
SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
@@ -397,9 +681,9 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
--- changing the type of column c causes its single-column stats to be dropped,
--- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
--- clauses estimated with functional dependencies does not exceed this
+-- changing the type of column c causes all its stats to be dropped, reverting
+-- to default estimates without any statistics, i.e. 0.5% selectivity for each
+-- condition
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
@@ -408,6 +692,132 @@ ANALYZE functional_dependencies;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+DROP STATISTICS func_deps_stat;
+
+-- now try functional dependencies with expressions
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+-- create statistics on expressions
+CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies;
+
+ANALYZE functional_dependencies;
+
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
-- check the ability to use multiple functional dependencies
CREATE TABLE functional_dependencies_multi (
a INTEGER,
@@ -479,6 +889,28 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+-- random data (no MCV list), but with expression
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+
-- 100 distinct combinations, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
@@ -565,6 +997,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
@@ -602,6 +1036,98 @@ ANALYZE mcv_lists;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+
+-- 100 distinct combinations, all in the MCV list, but with expressions
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+
+ANALYZE mcv_lists;
+
+-- without any stats on the expressions, we have to use default selectivities, which
+-- is why the estimates here are different from the pre-computed case above
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats)
+CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+DROP STATISTICS mcv_lists_stats_1;
+DROP STATISTICS mcv_lists_stats_2;
+DROP STATISTICS mcv_lists_stats_3;
+
+-- create statistics with both MCV and expressions
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL');
+
-- 100 distinct combinations with NULL values, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
@@ -894,6 +1420,57 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR
DROP TABLE mcv_lists_multi;
+
+-- statistics on integer expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+
+DROP STATISTICS expr_stats_1;
+DROP TABLE expr_stats;
+
+-- statistics on a mix columns and expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+
+DROP TABLE expr_stats;
+
+-- statistics on expressions with different data types
+CREATE TABLE expr_stats (a int, b name, c text);
+INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+
+DROP TABLE expr_stats;
+
+
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
-- the underlying table.