summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2014-02-28 14:22:43 +0200
committer Arnold D. Robbins <arnold@skeeve.com> 2014-02-28 14:22:43 +0200
commit 0b27d28fad7db725bb6a72c025c91ac39f5eeaf3 (patch)
tree eac9f9b9af6debd10bc6f0e8fa78e06393adef41
parent 4cea49ca8f817354ffd513c6ec808152e9299f21 (diff)
download gawk-0b27d28fad7db725bb6a72c025c91ac39f5eeaf3.tar.gz
Sync dfa with grep, update a test.
-rw-r--r-- ChangeLog 5
-rw-r--r-- dfa.c 142
-rw-r--r-- test/ChangeLog 4
-rw-r--r-- test/regrange.ok 2
4 files changed, 83 insertions, 70 deletions
diff --git a/ChangeLog b/ChangeLog
index 507cd3e7..965cd1f7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-02-28 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.c: Sync with grep. Looks like good improvement with
+ respect to bracket expressions.
+
2014-02-27 Arnold D. Robbins <arnold@skeeve.com>
Fixes for enum/int mismatches as warned by some compilers.
diff --git a/dfa.c b/dfa.c
index 03a61878..2e9d2fd0 100644
--- a/dfa.c
+++ b/dfa.c
@@ -219,7 +219,8 @@ enum
EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
the empty string. */
- BACKREF, /* BACKREF is generated by \<digit>; it
+ BACKREF, /* BACKREF is generated by \<digit>
+ or by any other construct that
is not completely handled. If the scanner
detects a transition on backref, it returns
a kind of "semi-success" indicating that
@@ -811,6 +812,45 @@ using_utf8 (void)
return utf8;
}
+/* Return true if the current locale is known to be a unibyte locale
+ without multicharacter collating sequences and where range
+ comparisons simply use the native encoding. These locales can be
+ processed more efficiently. */
+
+static bool
+using_simple_locale (void)
+{
+ /* True if the native character set is known to be compatible with
+ the C locale. The following test isn't perfect, but it's good
+ enough in practice, as only ASCII and EBCDIC are in common use
+ and this test correctly accepts ASCII and rejects EBCDIC. */
+ enum { native_c_charset =
+ ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+ && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+ && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+ && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+ && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+ && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+ && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+ && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+ && '}' == 125 && '~' == 126)
+ };
+
+ if (! native_c_charset || MB_CUR_MAX > 1)
+ return false;
+ else
+ {
+ static int unibyte_c = -1;
+ if (unibyte_c < 0)
+ {
+ char *locale = setlocale (LC_ALL, 0);
+ unibyte_c = (locale && (STREQ (locale, "C")
+ || STREQ (locale, "POSIX")));
+ }
+ return unibyte_c;
+ }
+}
+
/* Lexical analyzer. All the dross that deals with the obnoxious
GNU Regex syntax bits is located here. The poor, suffering
reader is referred to the GNU Regex documentation for the
@@ -928,7 +968,7 @@ static const struct dfa_ctype prednames[] = {
{"upper", isupper, false},
{"lower", islower, false},
{"digit", isdigit, true},
- {"xdigit", isxdigit, true},
+ {"xdigit", isxdigit, false},
{"space", isspace, false},
{"punct", ispunct, false},
{"alnum", isalnum, false},
@@ -959,6 +999,10 @@ parse_bracket_exp (void)
int c, c1, c2;
charclass ccl;
+ /* True if this is a bracket expression that dfaexec is known to
+ process correctly. */
+ bool known_bracket_exp = true;
+
/* Used to warn about [:space:].
Bit 0 = first character is a colon.
Bit 1 = last character is a colon.
@@ -1000,6 +1044,7 @@ parse_bracket_exp (void)
{
FETCH_WC (c, wc, _("unbalanced ["));
invert = 1;
+ known_bracket_exp = using_simple_locale ();
}
else
invert = 0;
@@ -1014,16 +1059,14 @@ parse_bracket_exp (void)
we just treat it as a bunch of ordinary characters. We can do
this because we assume regex has checked for syntax errors before
dfa is ever called. */
- if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
+ if (c == '[')
{
#define MAX_BRACKET_STRING_LEN 32
char str[MAX_BRACKET_STRING_LEN + 1];
FETCH_WC (c1, wc1, _("unbalanced ["));
- /* If pattern contains '[[:', '[[.', or '[[='. */
- if (c1 == ':'
- /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1. */
- || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
+ if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
+ || c1 == '.' || c1 == '=')
{
size_t len = 0;
for (;;)
@@ -1042,7 +1085,10 @@ parse_bracket_exp (void)
/* Fetch bracket. */
FETCH_WC (c, wc, _("unbalanced ["));
if (c1 == ':')
- /* build character class. */
+ /* Build character class. POSIX allows character
+ classes to match multicharacter collating elements,
+ but the regex code does not support that, so do not
+ worry about that possibility. */
{
char const *class
= (case_fold && (STREQ (str, "upper")
@@ -1066,28 +1112,9 @@ parse_bracket_exp (void)
if (pred->func (c2))
setbit_case_fold_c (c2, ccl);
}
+ else
+ known_bracket_exp = false;
- else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
- {
- char *elem = xmemdup (str, len + 1);
-
- if (c1 == '=')
- /* build equivalence class. */
- {
- REALLOC_IF_NECESSARY (work_mbc->equivs,
- equivs_al, work_mbc->nequivs + 1);
- work_mbc->equivs[work_mbc->nequivs++] = elem;
- }
-
- if (c1 == '.')
- /* build collating element. */
- {
- REALLOC_IF_NECESSARY (work_mbc->coll_elems,
- coll_elems_al,
- work_mbc->ncoll_elems + 1);
- work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
- }
- }
colon_warning_state |= 8;
/* Fetch new lookahead character. */
@@ -1109,6 +1136,16 @@ parse_bracket_exp (void)
/* build range characters. */
{
FETCH_WC (c2, wc2, _("unbalanced ["));
+
+ /* A bracket expression like [a-[.aa.]] matches an unknown set.
+ Treat it like [-a[.aa.]] while parsing it, and
+ remember that the set is unknown. */
+ if (c2 == '[' && *lexptr == '.')
+ {
+ known_bracket_exp = false;
+ c2 = ']';
+ }
+
if (c2 == ']')
{
/* In the case [x-], the - is an ordinary hyphen,
@@ -1146,47 +1183,11 @@ parse_bracket_exp (void)
work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
}
}
+ else if (using_simple_locale ())
+ for (; c <= c2; c++)
+ setbit_case_fold_c (c, ccl);
else
- {
-#ifdef GAWK
- c1 = c;
- if (case_fold)
- {
- c1 = tolower (c1);
- c2 = tolower (c2);
- }
- for (c = c1; c <= c2; c++)
- setbit_case_fold_c (c, ccl);
-#else
- /* Defer to the system regex library about the meaning
- of range expressions. */
- struct re_pattern_buffer re = { 0 };
- char const *compile_msg;
-#if 199901 <= __STDC_VERSION__
- char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' };
-#else
- char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' };
- pattern[2] = c;
- pattern[5] = c2;
-#endif
- re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS);
- compile_msg = re_compile_pattern (pattern, sizeof pattern, &re);
- if (compile_msg)
- dfaerror (compile_msg);
- for (c = 0; c < NOTCHAR; c++)
- {
- char subject = c;
- switch (re_match (&re, &subject, 1, 0, NULL))
- {
- case 1: setbit (c, ccl); break;
- case -1: break;
- default: xalloc_die ();
- }
- }
- regfree (&re);
- re_set_syntax (syntax_bits);
-#endif
- }
+ known_bracket_exp = false;
colon_warning_state |= 8;
FETCH_WC (c1, wc1, _("unbalanced ["));
@@ -1224,6 +1225,9 @@ parse_bracket_exp (void)
if (colon_warning_state == 7)
dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
+ if (! known_bracket_exp)
+ return BACKREF;
+
if (MB_CUR_MAX > 1)
{
static charclass zeroclass;
diff --git a/test/ChangeLog b/test/ChangeLog
index 18b912dd..ab7ff0f2 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,7 @@
+2014-02-28 Arnold D. Robbins <arnold@skeeve.com>
+
+ * regrange.ok: Update after code improvements.
+
2014-02-03 Stepan Kasal <kasal@ucw.cz>
* strftime.awk: the default format uses %e, not %d (Introduced on
diff --git a/test/regrange.ok b/test/regrange.ok
index 1fa00c70..ae8c6499 100644
--- a/test/regrange.ok
+++ b/test/regrange.ok
@@ -3,4 +3,4 @@
"c" ~ /[[a-d]/ --> 1
"\" ~ /[\[-\]]/ --> 1
"[.c.]" ~ /[a-[.e.]]/ --> 1
-"[.d.]" ~ /[[.c.]-[.z.]]/ --> 0
+"[.d.]" ~ /[[.c.]-[.z.]]/ --> 1