summary refs log tree commit diff
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2014-03-03 21:28:10 +0200
committer Arnold D. Robbins <arnold@skeeve.com> 2014-03-03 21:28:10 +0200
commit abe4984928229338afd2dc9bbeb95bb9f361ef94 (patch)
tree 0affec401e159f00842b92da078ed33b86b52957
parent 4bd44218e46dd480a9fa2fa422a1cd58566f2aa9 (diff)
download gawk-abe4984928229338afd2dc9bbeb95bb9f361ef94.tar.gz
Sync dfa with grep.
-rw-r--r-- ChangeLog 4
-rw-r--r-- dfa.c 171
2 files changed, 90 insertions, 85 deletions
diff --git a/ChangeLog b/ChangeLog
index 965cd1f7..1b08057b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2014-03-03 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.c: Sync with grep. Yet again.
+
2014-02-28 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c: Sync with grep. Looks like good improvement with
diff --git a/dfa.c b/dfa.c
index 2e9d2fd0..3dd3c209 100644
--- a/dfa.c
+++ b/dfa.c
@@ -130,7 +130,7 @@ typedef unsigned int charclass[CHARCLASS_INTS];
/* Convert a possibly-signed character to an unsigned character. This is
a bit safer than casting to unsigned char, since it catches some type
errors that the cast doesn't. */
-static inline unsigned char
+static unsigned char
to_uchar (char ch)
{
return ch;
@@ -732,42 +732,27 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
this may happen when folding case in weird Turkish locales where
dotless i/dotted I are not included in the chosen character set.
Return whether a bit was set in the charclass. */
-#if MBS_SUPPORT
static bool
setbit_wc (wint_t wc, charclass c)
{
+#if MBS_SUPPORT
int b = wctob (wc);
if (b == EOF)
return false;
setbit (b, c);
return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
- if it is valid in the current character set. */
-static void
-setbit_c (int b, charclass c)
-{
- /* Do nothing if b is invalid in this character set. */
- if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
- return;
- setbit (b, c);
-}
#else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
abort ();
/*NOTREACHED*/ return false;
-}
#endif
+}
-/* Like setbit_c, but if case is folded, set both cases of a letter. For
- MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
- and the caller takes care of setting the appropriate field of struct
- mb_char_classes. */
+/* Set a bit for B in the charclass C, if B is a valid single byte
+ character in the current character set. If case is folded, set B's
+ lower and upper case variants similarly. If MB_CUR_MAX > 1, the
+ resulting charset is used only as an optimization, and the caller
+ should set the appropriate field of struct mb_char_classes. */
static void
setbit_case_fold_c (int b, charclass c)
{
@@ -776,16 +761,21 @@ setbit_case_fold_c (int b, charclass c)
wint_t wc = btowc (b);
if (wc == WEOF)
return;
- setbit (b, c);
- if (case_fold && iswalpha (wc))
- setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
+ if (case_fold)
+ {
+ setbit_wc (towlower (wc), c);
+ setbit_wc (towupper (wc), c);
+ }
}
else
{
- setbit (b, c);
- if (case_fold && isalpha (b))
- setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
+ if (case_fold)
+ {
+ setbit (tolower (b), c);
+ setbit (toupper (b), c);
+ }
}
+ setbit (b, c);
}
@@ -843,7 +833,7 @@ using_simple_locale (void)
static int unibyte_c = -1;
if (unibyte_c < 0)
{
- char *locale = setlocale (LC_ALL, 0);
+ char *locale = setlocale (LC_ALL, NULL);
unibyte_c = (locale && (STREQ (locale, "C")
|| STREQ (locale, "POSIX")));
}
@@ -1146,52 +1136,51 @@ parse_bracket_exp (void)
c2 = ']';
}
- if (c2 == ']')
+ if (c2 != ']')
{
- /* In the case [x-], the - is an ordinary hyphen,
- which is left in c1, the lookahead character. */
- lexptr -= cur_mb_len;
- lexleft += cur_mb_len;
- }
- }
-
- if (c1 == '-' && c2 != ']')
- {
- if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH_WC (c2, wc2, _("unbalanced ["));
+ if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+ FETCH_WC (c2, wc2, _("unbalanced ["));
- if (MB_CUR_MAX > 1)
- {
- /* When case folding map a range, say [m-z] (or even [M-z])
- to the pair of ranges, [m-z] [M-Z]. */
- REALLOC_IF_NECESSARY (work_mbc->range_sts,
- range_sts_al, work_mbc->nranges + 1);
- REALLOC_IF_NECESSARY (work_mbc->range_ends,
- range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] =
- case_fold ? towlower (wc) : (wchar_t) wc;
- work_mbc->range_ends[work_mbc->nranges++] =
- case_fold ? towlower (wc2) : (wchar_t) wc2;
-
- if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ if (MB_CUR_MAX > 1)
{
+ /* When case folding map a range, say [m-z] (or even [M-z])
+ to the pair of ranges, [m-z] [M-Z]. Although this code
+ is wrong in multiple ways, it's never used in practice.
+ FIXME: Remove this (and related) unused code. */
REALLOC_IF_NECESSARY (work_mbc->range_sts,
range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
REALLOC_IF_NECESSARY (work_mbc->range_ends,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+ work_mbc->range_sts[work_mbc->nranges] =
+ case_fold ? towlower (wc) : (wchar_t) wc;
+ work_mbc->range_ends[work_mbc->nranges++] =
+ case_fold ? towlower (wc2) : (wchar_t) wc2;
+
+ if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+ {
+ REALLOC_IF_NECESSARY (work_mbc->range_sts,
+ range_sts_al, work_mbc->nranges + 1);
+ work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+ REALLOC_IF_NECESSARY (work_mbc->range_ends,
+ range_ends_al, work_mbc->nranges + 1);
+ work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+ }
}
+ else if (using_simple_locale ())
+ for (; c <= c2; c++)
+ setbit_case_fold_c (c, ccl);
+ else
+ known_bracket_exp = false;
+
+ colon_warning_state |= 8;
+ FETCH_WC (c1, wc1, _("unbalanced ["));
+ continue;
}
- else if (using_simple_locale ())
- for (; c <= c2; c++)
- setbit_case_fold_c (c, ccl);
- else
- known_bracket_exp = false;
- colon_warning_state |= 8;
- FETCH_WC (c1, wc1, _("unbalanced ["));
- continue;
+ /* In the case [x-], the - is an ordinary hyphen,
+ which is left in c1, the lookahead character. */
+ lexptr -= cur_mb_len;
+ lexleft += cur_mb_len;
}
colon_warning_state |= (c == ':') ? 2 : 4;
@@ -1202,16 +1191,22 @@ parse_bracket_exp (void)
continue;
}
- if (case_fold && iswalpha (wc))
+ if (case_fold)
{
- wc = towlower (wc);
- if (!setbit_wc (wc, ccl))
+ wint_t folded = towlower (wc);
+ if (folded != wc && !setbit_wc (folded, ccl))
+ {
+ REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+ work_mbc->nchars + 1);
+ work_mbc->chars[work_mbc->nchars++] = folded;
+ }
+ folded = towupper (wc);
+ if (folded != wc && !setbit_wc (folded, ccl))
{
REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = wc;
+ work_mbc->chars[work_mbc->nchars++] = folded;
}
- wc = towupper (wc);
}
if (!setbit_wc (wc, ccl))
{
@@ -1557,7 +1552,7 @@ lex (void)
if (MB_CUR_MAX > 1)
return lasttok = WCHAR;
- if (case_fold && isalpha (c))
+ if (case_fold && (tolower (c) != c || toupper (c) != c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
@@ -1645,10 +1640,11 @@ addtok (token t)
work_mbc->nchars = 0;
}
- /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
+ /* If the MBCSET is non-inverted and includes neither character
+ classes containing multibyte characters, range expressions,
+ equivalence classes, nor collating elements, it can be replaced
+ by a simple CSET. */
if (work_mbc->invert
- || (!using_utf8 () && work_mbc->cset != -1)
- || work_mbc->nchars != 0
|| work_mbc->nch_classes != 0
|| work_mbc->nranges != 0
|| work_mbc->nequivs != 0 || work_mbc->ncoll_elems != 0)
@@ -1663,7 +1659,6 @@ addtok (token t)
that the mbcset is empty now. Do nothing in that case. */
if (work_mbc->cset != -1)
{
- assert (using_utf8 ());
addtok (CSET + work_mbc->cset);
if (need_or)
addtok (OR);
@@ -1801,17 +1796,23 @@ add_utf8_anychar (void)
static void
atom (void)
{
- if (0)
+ if (MBS_SUPPORT && tok == WCHAR)
{
- /* empty */
- }
- else if (MBS_SUPPORT && tok == WCHAR)
- {
- addtok_wc (case_fold ? towlower (wctok) : wctok);
- if (case_fold && iswalpha (wctok))
+ addtok_wc (wctok);
+ if (case_fold)
{
- addtok_wc (towupper (wctok));
- addtok (OR);
+ wint_t folded = towlower (wctok);
+ if (folded != wctok)
+ {
+ addtok_wc (folded);
+ addtok (OR);
+ }
+ folded = towupper (wctok);
+ if (folded != wctok)
+ {
+ addtok_wc (folded);
+ addtok (OR);
+ }
}
tok = lex ();