summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2007-01-03 22:39:49 +0000
committerTom Lane <tgl@sss.pgh.pa.us>2007-01-03 22:39:49 +0000
commit85690a0e1a71d751076066f6093ea5498440f72c (patch)
tree856f7a5e35ab09d433cc6ddf5018e92c090a0237
parentcbca48aa240190f3eacfcddaee09b5f16ebf46bf (diff)
downloadpostgresql-85690a0e1a71d751076066f6093ea5498440f72c.tar.gz
Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the
form '^(foo)$'. Before, these could never be optimized into indexscans. The recent changes to make psql and pg_dump generate such patterns (for \d commands and -t and related switches, respectively) therefore represented a big performance hit for people with large pg_class catalogs, as seen in recent gripe from Erik Jones. While at it, be more paranoid about case-sensitivity checking in multibyte encodings, and fix some other corner cases in which a regex might be interpreted too liberally.
-rw-r--r--src/backend/utils/adt/regexp.c11
-rw-r--r--src/backend/utils/adt/selfuncs.c145
-rw-r--r--src/include/utils/builtins.h3
3 files changed, 108 insertions, 51 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index cff81b7475..d999d24f9e 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.56.4.1 2006/04/13 18:01:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.56.4.2 2007/01/03 22:39:49 tgl Exp $
*
* Alistair Crooks added the code for the regex caching
* agc - cached the regular expressions used - there's a good chance
@@ -557,3 +557,12 @@ similar_escape(PG_FUNCTION_ARGS)
PG_RETURN_TEXT_P(result);
}
+
+/*
+ * report whether regex_flavor is currently BASIC
+ */
+bool
+regex_flavor_is_basic(void)
+{
+ return (regex_flavor == REG_BASIC);
+}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index b9e250ef29..3a6473919d 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.169.4.5 2005/06/01 17:05:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.169.4.6 2007/01/03 22:39:49 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -3369,7 +3369,10 @@ get_variable_maximum(Query *root, VariableStatData *vardata,
* These routines support analysis of LIKE and regular-expression patterns
* by the planner/optimizer. It's important that they agree with the
* regular-expression code in backend/regex/ and the LIKE code in
- * backend/utils/adt/like.c.
+ * backend/utils/adt/like.c. Also, the computation of the fixed prefix
+ * must be conservative: if we report a string longer than the true fixed
+ * prefix, the query may produce actually wrong answers, rather than just
+ * getting a bad selectivity estimate!
*
* Note that the prefix-analysis functions are called from
* backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -3401,6 +3404,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
Oid typeid = patt_const->consttype;
int pos,
match_pos;
+ bool is_multibyte = (pg_database_encoding_max_length() > 1);
/* the right-hand const is type text or bytea */
Assert(typeid == BYTEAOID || typeid == TEXTOID);
@@ -3450,11 +3454,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
}
/*
- * XXX I suspect isalpha() is not an adequately locale-sensitive
- * test for characters that can vary under case folding?
+ * XXX In multibyte character sets, we can't trust isalpha, so assume
+ * any multibyte char is potentially case-varying.
*/
- if (case_insensitive && isalpha((unsigned char) patt[pos]))
- break;
+ if (case_insensitive)
+ {
+ if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+ break;
+ if (isalpha((unsigned char) patt[pos]))
+ break;
+ }
/*
* NOTE: this code used to think that %% meant a literal %, but
@@ -3501,11 +3510,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
int pos,
match_pos,
prev_pos,
- prev_match_pos,
- paren_depth;
+ prev_match_pos;
+ bool have_leading_paren;
char *patt;
char *rest;
Oid typeid = patt_const->consttype;
+ bool is_basic = regex_flavor_is_basic();
+ bool is_multibyte = (pg_database_encoding_max_length() > 1);
/*
* Should be unnecessary, there are no bytea regex operators defined.
@@ -3520,8 +3531,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
/* the right-hand const is type text for all of these */
patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
+ /*
+ * Check for ARE director prefix. It's worth our trouble to recognize
+ * this because similar_escape() uses it.
+ */
+ pos = 0;
+ if (strncmp(patt, "***:", 4) == 0)
+ {
+ pos = 4;
+ is_basic = false;
+ }
+
/* Pattern must be anchored left */
- if (patt[0] != '^')
+ if (patt[pos] != '^')
{
rest = patt;
@@ -3530,74 +3552,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
return Pattern_Prefix_None;
}
+ pos++;
/*
- * If unquoted | is present at paren level 0 in pattern, then there
- * are multiple alternatives for the start of the string.
+ * If '|' is present in pattern, then there may be multiple alternatives
+ * for the start of the string. (There are cases where this isn't so,
+ * for instance if the '|' is inside parens, but detecting that reliably
+ * is too hard.)
*/
- paren_depth = 0;
- for (pos = 1; patt[pos]; pos++)
+ if (strchr(patt + pos, '|') != NULL)
{
- if (patt[pos] == '|' && paren_depth == 0)
- {
- rest = patt;
+ rest = patt;
- *prefix_const = NULL;
- *rest_const = string_to_const(rest, typeid);
+ *prefix_const = NULL;
+ *rest_const = string_to_const(rest, typeid);
- return Pattern_Prefix_None;
- }
- else if (patt[pos] == '(')
- paren_depth++;
- else if (patt[pos] == ')' && paren_depth > 0)
- paren_depth--;
- else if (patt[pos] == '\\')
- {
- /* backslash quotes the next character */
- pos++;
- if (patt[pos] == '\0')
- break;
- }
+ return Pattern_Prefix_None;
}
/* OK, allocate space for pattern */
match = palloc(strlen(patt) + 1);
prev_match_pos = match_pos = 0;
- /* note start at pos 1 to skip leading ^ */
- for (prev_pos = pos = 1; patt[pos]; )
+ /*
+ * We special-case the syntax '^(...)$' because psql uses it. But beware:
+ * in BRE mode these parentheses are just ordinary characters. Also,
+ * sequences beginning "(?" are not what they seem, unless they're "(?:".
+ * (We should recognize that, too, because of similar_escape().)
+ *
+ * Note: it's a bit bogus to be depending on the current regex_flavor
+ * setting here, because the setting could change before the pattern is
+ * used. We minimize the risk by trusting the flavor as little as we can,
+ * but perhaps it would be a good idea to get rid of the "basic" setting.
+ */
+ have_leading_paren = false;
+ if (patt[pos] == '(' && !is_basic &&
+ (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
{
- int len;
+ have_leading_paren = true;
+ pos += (patt[pos + 1] != '?' ? 1 : 3);
+ }
+
+ /* Scan remainder of pattern */
+ prev_pos = pos;
+ while (patt[pos])
+ {
+ int len;
/*
- * Check for characters that indicate multiple possible matches
- * here. XXX I suspect isalpha() is not an adequately
- * locale-sensitive test for characters that can vary under case
- * folding?
+ * Check for characters that indicate multiple possible matches here.
+ * Also, drop out at ')' or '$' so the termination test works right.
*/
if (patt[pos] == '.' ||
patt[pos] == '(' ||
+ patt[pos] == ')' ||
patt[pos] == '[' ||
- patt[pos] == '$' ||
- (case_insensitive && isalpha((unsigned char) patt[pos])))
+ patt[pos] == '^' ||
+ patt[pos] == '$')
break;
/*
- * In AREs, backslash followed by alphanumeric is an escape, not
- * a quoted character. Must treat it as having multiple possible
- * matches.
+ * XXX In multibyte character sets, we can't trust isalpha, so assume
+ * any multibyte char is potentially case-varying.
*/
- if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
- break;
+ if (case_insensitive)
+ {
+ if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+ break;
+ if (isalpha((unsigned char) patt[pos]))
+ break;
+ }
/*
* Check for quantifiers. Except for +, this means the preceding
- * character is optional, so we must remove it from the prefix
- * too!
+ * character is optional, so we must remove it from the prefix too!
+ * Note: in BREs, \{ is a quantifier.
*/
if (patt[pos] == '*' ||
patt[pos] == '?' ||
- patt[pos] == '{')
+ patt[pos] == '{' ||
+ (patt[pos] == '\\' && patt[pos + 1] == '{'))
{
match_pos = prev_match_pos;
pos = prev_pos;
@@ -3608,9 +3642,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
pos = prev_pos;
break;
}
+
+ /*
+ * Normally, backslash quotes the next character. But in AREs,
+ * backslash followed by alphanumeric is an escape, not a quoted
+ * character. Must treat it as having multiple possible matches.
+ * In BREs, \( is a parenthesis, so don't trust that either.
+ * Note: since only ASCII alphanumerics are escapes, we don't have
+ * to be paranoid about multibyte here.
+ */
if (patt[pos] == '\\')
{
- /* backslash quotes the next character */
+ if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
+ break;
pos++;
if (patt[pos] == '\0')
break;
@@ -3628,6 +3672,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
match[match_pos] = '\0';
rest = &patt[pos];
+ if (have_leading_paren && patt[pos] == ')')
+ pos++;
+
if (patt[pos] == '$' && patt[pos + 1] == '\0')
{
rest = &patt[pos + 1];
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 7de4ae5e35..1e5a4ba93f 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
- * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.252 2004/12/31 22:03:45 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.252.4.1 2007/01/03 22:39:49 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -426,6 +426,7 @@ extern Datum texticregexeq(PG_FUNCTION_ARGS);
extern Datum texticregexne(PG_FUNCTION_ARGS);
extern Datum textregexsubstr(PG_FUNCTION_ARGS);
extern Datum similar_escape(PG_FUNCTION_ARGS);
+extern bool regex_flavor_is_basic(void);
/* regproc.c */
extern Datum regprocin(PG_FUNCTION_ARGS);