summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Paolo Bonzini <bonzini@gnu.org> 2010-03-07 11:22:00 +0100
committer Paolo Bonzini <bonzini@gnu.org> 2010-03-17 15:32:54 +0100
commit 8f9106c419d18759f767da351b3b6913f022c8f8 (patch)
tree 598d251c073b0c65c5b10927692dddbc2613a2fb
parent 3cba8f98be7f791a55af6433863349e742054dd0 (diff)
download grep-8f9106c419d18759f767da351b3b6913f022c8f8.tar.gz
dfa: speed up handling of brackets
This patch has two sides.  One is to fold the parsing of brackets
in the single- and multi-byte cases.  The second is to leverage
this change, and use a bitset to test for single-byte characters
in the charset.  Splitting the two would be very hard.

Testcase:

  yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | \
    time grep -c [ABCDEFGHIJKLMNOPQRSTUVWXYZ,]

Before: 59ms (best of three runs); after: 51ms (best of three runs).
Nice, but mostly providing infrastructure for the next patch.

* src/dfa.c (setbit_case_fold): Try applying towlower/towupper.
(looking_at): Remove.
(FETCH_WC): New.
(fetch_wc): Merge into FETCH_WC [MBS_SUPPORT].
(FETCH) [MBS_SUPPORT]: Call FETCH_WC.
(prednames, find_pred, is_blank and other predicates): Move above,
remove K&R syntax support.
(parse_bracket_exp): New name of parse_bracket_exp_mb, rewritten
to include single-byte character set parsing of brackets.
(lex): Adjust for fetch_wc->FETCH_WC change, remove single-byte
character set parsing of brackets.
(match_mb_charset): Test against work_mbc->cset.
* src/dfa.h (struct mb_char_classes): Add cset.
-rw-r--r-- src/dfa.c 609
-rw-r--r-- src/dfa.h 1
2 files changed, 305 insertions, 305 deletions
diff --git a/src/dfa.c b/src/dfa.c
index 352782a5..3b0d8610 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -238,17 +238,40 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
eolbyte = eol;
}
-/* Like setbit, but if case is folded, set both cases of a letter. */
+/* Like setbit, but if case is folded, set both cases of a letter.
+ For MB_CUR_MAX > 1, one or both of the two cases may not be set,
+ so the resulting charset may only be used as an optimization. */
static void
setbit_case_fold (unsigned b, charclass c)
{
- setbit (b, c);
if (case_fold)
{
- if (ISUPPER (b))
- setbit (tolower (b), c);
- else if (ISLOWER (b))
- setbit (toupper (b), c);
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ {
+ wint_t b1 = iswupper(b) ? towlower(b) : b;
+ wint_t b2 = iswlower(b) ? towupper(b) : b;
+ if (wctob ((unsigned char)b1) == b1)
+ setbit (b1, c);
+ if (b2 != b1 && wctob ((unsigned char)b2) == b2)
+ setbit (b2, c);
+ }
+ else
+ {
+#endif
+ unsigned char b1 = ISUPPER(b) ? tolower(b) : b;
+ unsigned char b2 = ISLOWER(b) ? toupper(b) : b;
+ setbit (b1, c);
+ if (b2 != b1)
+ setbit (b2, c);
+ }
+ }
+ else
+ {
+#ifdef MBS_SUPPORT
+ if (wctob ((unsigned char)b) == b)
+#endif
+ setbit (b, c);
}
}
@@ -315,43 +338,39 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */
#ifdef MBS_SUPPORT
/* Note that characters become unsigned here. */
-# define FETCH(c, eoferr) \
+# define FETCH_WC(c, wc, eoferr) \
do { \
if (! lexleft) \
- { \
- if (eoferr != 0) \
+ { \
+ if (eoferr != 0) \
dfaerror (eoferr); \
- else \
+ else \
return lasttok = END; \
} \
- (c) = (unsigned char) *lexptr++; \
- --lexleft; \
+ else \
+ { \
+ cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); \
+ if (cur_mb_len <= 0) \
+ { \
+ cur_mb_len = 1; \
+ --lexleft; \
+ wc = c = (unsigned char) *lexptr++; \
+ } \
+ else \
+ { \
+ lexptr += cur_mb_len; \
+ lexleft -= cur_mb_len; \
+ (c) = wctob(wc); \
+ } \
+ } \
} while(0)
-/* This function fetch a wide character, and update cur_mb_len,
- used only if the current locale is a multibyte environment. */
-static wint_t
-fetch_wc (char const *eoferr)
-{
- wchar_t wc;
- if (! lexleft)
- {
- if (eoferr != 0)
- dfaerror (eoferr);
- else
- return WEOF;
- }
+# define FETCH(c, eoferr) \
+ do { \
+ wint_t _wc; \
+ FETCH_WC(c, _wc, eoferr); \
+ } while(0)
- cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);
- if (cur_mb_len <= 0)
- {
- cur_mb_len = 1;
- wc = (unsigned char) *lexptr;
- }
- lexptr += cur_mb_len;
- lexleft -= cur_mb_len;
- return wc;
-}
#else
/* Note that characters become unsigned here. */
# define FETCH(c, eoferr) \
@@ -366,6 +385,10 @@ fetch_wc (char const *eoferr)
(c) = (unsigned char) *lexptr++; \
--lexleft; \
} while(0)
+
+# define FETCH_WC(c, unused, eoferr) \
+ FETCH(c, eoferr)
+
#endif /* MBS_SUPPORT */
static int
@@ -375,13 +398,70 @@ in_coll_range (char ch, char from, char to)
return strcoll (&c[0], &c[2]) <= 0 && 0 <= strcoll (&c[2], &c[4]);
}
-#ifdef MBS_SUPPORT
+static int is_alpha(int c) { return ISALPHA(c); }
+static int is_upper(int c) { return ISUPPER(c); }
+static int is_lower(int c) { return ISLOWER(c); }
+static int is_digit(int c) { return ISDIGIT(c); }
+static int is_xdigit(int c) { return ISXDIGIT(c); }
+static int is_space(int c) { return ISSPACE(c); }
+static int is_punct(int c) { return ISPUNCT(c); }
+static int is_alnum(int c) { return ISALNUM(c); }
+static int is_print(int c) { return ISPRINT(c); }
+static int is_graph(int c) { return ISGRAPH(c); }
+static int is_cntrl(int c) { return ISCNTRL(c); }
+
+static int
+is_blank (int c)
+{
+ return (c == ' ' || c == '\t');
+}
+
+typedef int predicate (int);
+
+/* The following list maps the names of the Posix named character classes
+ to predicate functions that determine whether a given character is in
+ the class. The leading [ has already been eaten by the lexical analyzer. */
+static struct {
+ const char *name;
+ predicate *pred;
+} const prednames[] = {
+ { "alpha", is_alpha },
+ { "upper", is_upper },
+ { "lower", is_lower },
+ { "digit", is_digit },
+ { "xdigit", is_xdigit },
+ { "space", is_space },
+ { "punct", is_punct },
+ { "alnum", is_alnum },
+ { "print", is_print },
+ { "graph", is_graph },
+ { "cntrl", is_cntrl },
+ { "blank", is_blank },
+ { NULL, NULL }
+};
+
+static predicate *
+find_pred (const char *str)
+{
+ unsigned int i;
+ for (i = 0; prednames[i].name; ++i)
+ if (!strcmp(str, prednames[i].name))
+ break;
+
+ return prednames[i].pred;
+}
+
/* Multibyte character handling sub-routine for lex.
This function parse a bracket expression and build a struct
mb_char_classes. */
static token
-parse_bracket_exp_mb (void)
+parse_bracket_exp (void)
{
+ int invert;
+ int c, c1, c2;
+ charclass ccl;
+
+#ifdef MBS_SUPPORT
wint_t wc, wc1, wc2;
/* Work area to build a mb_char_classes. */
@@ -389,63 +469,68 @@ parse_bracket_exp_mb (void)
int chars_al, range_sts_al, range_ends_al, ch_classes_al,
equivs_al, coll_elems_al;
- REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
- dfa->mbcsets_alloc, dfa->nmbcsets + 1);
- /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
- We will update dfa->multibyte_prop[] in addtok(), because we can't
- decide the index in dfa->tokens[]. */
-
- /* Initialize work are */
- work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
-
chars_al = 1;
range_sts_al = range_ends_al = 0;
ch_classes_al = equivs_al = coll_elems_al = 0;
+ if (MB_CUR_MAX > 1)
+ {
+ REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
+ dfa->mbcsets_alloc, dfa->nmbcsets + 1);
+
+ /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
+ We will update dfa->multibyte_prop[] in addtok(), because we can't
+ decide the index in dfa->tokens[]. */
+
+ /* Initialize work area. */
+ work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
+ work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
+ work_mbc->nequivs = work_mbc->ncoll_elems = 0;
+ work_mbc->chars = NULL;
+ work_mbc->ch_classes = NULL;
+ work_mbc->range_sts = work_mbc->range_ends = NULL;
+ work_mbc->equivs = work_mbc->coll_elems = NULL;
+ }
+ else
+ work_mbc = NULL;
+#endif
- work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;
- work_mbc->nequivs = work_mbc->ncoll_elems = 0;
- work_mbc->chars = NULL;
- work_mbc->ch_classes = NULL;
- work_mbc->range_sts = work_mbc->range_ends = NULL;
- work_mbc->equivs = work_mbc->coll_elems = NULL;
-
- wc = fetch_wc(_("unbalanced ["));
- if (wc == L'^')
+ memset (ccl, 0, sizeof(ccl));
+ FETCH_WC (c, wc, _("unbalanced ["));
+ if (c == '^')
{
- wc = fetch_wc(_("unbalanced ["));
- work_mbc->invert = 1;
+ FETCH_WC (c, wc, _("unbalanced ["));
+ invert = 1;
}
else
- work_mbc->invert = 0;
+ invert = 0;
+
do
{
- wc1 = WEOF; /* mark wc1 is not initialized". */
+ c1 = EOF; /* mark c1 is not initialized". */
/* Note that if we're looking at some other [:...:] construct,
we just treat it as a bunch of ordinary characters. We can do
this because we assume regex has checked for syntax errors before
dfa is ever called. */
- if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))
+ if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
{
#define BRACKET_BUFFER_SIZE 128
char str[BRACKET_BUFFER_SIZE];
- wc1 = wc;
- wc = fetch_wc(_("unbalanced ["));
+ FETCH_WC (c1, wc1, _("unbalanced ["));
/* If pattern contains `[[:', `[[.', or `[[='. */
- if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))
+ if (c1 == ':'
+#ifdef MBS_SUPPORT
+ /* TODO: handle `[[.' and `[[=' also for MB_CUR_MAX == 1. */
+ || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '='))
+#endif
+ )
{
- unsigned char c;
- unsigned char delim = (unsigned char)wc;
int len = 0;
for (;;)
{
- if (! lexleft)
- dfaerror (_("unbalanced ["));
- c = (unsigned char) *lexptr++;
- --lexleft;
-
- if ((c == delim && *lexptr == ']') || lexleft == 0)
+ FETCH (c, _("unbalanced ["));
+ if ((c == c1 && *lexptr == ']') || lexleft == 0)
break;
if (len < BRACKET_BUFFER_SIZE)
str[len++] = c;
@@ -455,18 +540,9 @@ parse_bracket_exp_mb (void)
}
str[len] = '\0';
- if (lexleft == 0)
- {
- REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
- work_mbc->nchars + 2);
- work_mbc->chars[work_mbc->nchars++] = L'[';
- work_mbc->chars[work_mbc->nchars++] = delim;
- break;
- }
-
- if (--lexleft, *lexptr++ != ']')
- dfaerror (_("unbalanced ["));
- if (delim == ':')
+ /* Fetch bracket. */
+ FETCH (c, _("unbalanced ["));
+ if (c1 == ':')
/* build character class. */
{
char const *class
@@ -474,24 +550,39 @@ parse_bracket_exp_mb (void)
|| !strcmp (str, "lower"))
? "alpha"
: str);
- /* Query the character class as wctype_t. */
- wctype_t wt = wctype (class);
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ {
+ /* Store the character class as wctype_t. */
+ wctype_t wt = wctype (class);
+
+ if (ch_classes_al == 0)
+ MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
+ REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
+ ch_classes_al,
+ work_mbc->nch_classes + 1);
+ work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
+ }
+#endif
- if (ch_classes_al == 0)
- MALLOC(work_mbc->ch_classes, wctype_t, ++ch_classes_al);
- REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
- ch_classes_al,
- work_mbc->nch_classes + 1);
- work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
+ {
+ predicate *pred = find_pred (class);
+ if (!pred)
+ dfaerror(_("invalid character class"));
+ for (c2 = 0; c2 < NOTCHAR; ++c2)
+ if ((*pred)(c2))
+ setbit_case_fold (c2, ccl);
+ }
+ }
- }
- else if (delim == '=' || delim == '.')
+#ifdef MBS_SUPPORT
+ else if (c1 == '=' || c1 == '.')
{
char *elem;
MALLOC(elem, char, len + 1);
strncpy(elem, str, len + 1);
- if (delim == '=')
+ if (c1 == '=')
/* build equivalent class. */
{
if (equivs_al == 0)
@@ -502,7 +593,7 @@ parse_bracket_exp_mb (void)
work_mbc->equivs[work_mbc->nequivs++] = elem;
}
- if (delim == '.')
+ if (c1 == '.')
/* build collating element. */
{
if (coll_elems_al == 0)
@@ -513,158 +604,157 @@ parse_bracket_exp_mb (void)
work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
}
}
- wc1 = wc = WEOF;
- }
- else
- /* We treat '[' as a normal character here. */
- {
- wc2 = wc1; wc1 = wc; wc = wc2; /* swap */
+#endif
+
+ /* Fetch new lookahead character. */
+ FETCH_WC (c1, wc1, _("unbalanced ["));
+ continue;
}
- }
- else
- {
- if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- wc = fetch_wc(("unbalanced ["));
+
+ /* We treat '[' as a normal character here. c/c1/wc/wc1
+ are already set up. */
}
- if (wc1 == WEOF)
- wc1 = fetch_wc(_("unbalanced ["));
+ if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+ FETCH_WC(c, wc, _("unbalanced ["));
- if (wc1 == L'-')
+ if (c1 == EOF)
+ FETCH_WC(c1, wc1, _("unbalanced ["));
+
+ if (c1 == '-')
/* build range characters. */
{
- wc2 = fetch_wc(_("unbalanced ["));
- if (wc2 == L']')
+ FETCH_WC(c2, wc2, _("unbalanced ["));
+ if (c2 == ']')
{
/* In the case [x-], the - is an ordinary hyphen,
which is left in c1, the lookahead character. */
lexptr -= cur_mb_len;
lexleft += cur_mb_len;
- wc2 = wc;
- }
- else
- {
- if (wc2 == L'\\'
- && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- wc2 = fetch_wc(_("unbalanced ["));
- wc1 = fetch_wc(_("unbalanced ["));
- }
+ }
+ }
- /* When case folding map a range, say [m-z] (or even [M-z]) to the
- pair of ranges, [m-z] [M-Z]. */
- if (range_sts_al == 0)
- {
- MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
- MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
- }
- REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
- range_sts_al, work_mbc->nranges + 1);
- REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
- range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] =
- case_fold ? towlower(wc) : (wchar_t)wc;
- work_mbc->range_ends[work_mbc->nranges++] =
- case_fold ? towlower(wc2) : (wchar_t)wc2;
+ if (c1 == '-' && c2 != ']')
+ {
+ if (c2 == '\\'
+ && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+ FETCH_WC(c2, wc2, _("unbalanced ["));
-#ifndef GREP
- if (case_fold)
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
{
+ /* When case folding map a range, say [m-z] (or even [M-z])
+ to the pair of ranges, [m-z] [M-Z]. */
+ if (range_sts_al == 0)
+ {
+ MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
+ MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
+ }
REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+ work_mbc->range_sts[work_mbc->nranges] =
+ case_fold ? towlower(wc) : (wchar_t)wc;
+ work_mbc->range_ends[work_mbc->nranges++] =
+ case_fold ? towlower(wc2) : (wchar_t)wc2;
+
+#ifndef GREP
+ if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
+ {
+ REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+ range_sts_al, work_mbc->nranges + 1);
+ work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
+ REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+ range_ends_al, work_mbc->nranges + 1);
+ work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
+ }
+#endif
}
+ else
#endif
+ {
+ c1 = c;
+ if (case_fold)
+ {
+ c1 = tolower (c1);
+ c2 = tolower (c2);
+ }
+ if (!hard_LC_COLLATE)
+ for (c = c1; c <= c2; c++)
+ setbit_case_fold (c, ccl);
+ else
+ for (c = 0; c < NOTCHAR; ++c)
+ if (!(case_fold && ISUPPER (c))
+ && in_coll_range (c, c1, c2))
+ setbit_case_fold (c, ccl);
+ }
+
+ FETCH_WC(c1, wc1, _("unbalanced ["));
+ continue;
}
- else if (wc != WEOF)
- /* build normal characters. */
- {
- REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] =
- (wchar_t) (case_fold ? towlower(wc) : wc);
-#ifndef GREP
- if (case_fold)
+
+ setbit_case_fold (c, ccl);
+#ifdef MBS_SUPPORT
+ /* Build normal characters. */
+ if (MB_CUR_MAX > 1)
+ {
+ if (case_fold && iswalpha(wc))
+ {
+ wc = towlower(wc);
+ c = wctob(wc);
+ if (c == EOF || (wint_t)c != (wint_t)wc)
+ {
+ REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+ work_mbc->nchars + 1);
+ work_mbc->chars[work_mbc->nchars++] = wc;
+ }
+#ifdef GREP
+ continue;
+#else
+ wc = towupper(wc);
+ c = wctob(wc);
+#endif
+ }
+ if (c == EOF || (wint_t)c != (wint_t)wc)
{
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = towupper(wc);
+ work_mbc->chars[work_mbc->nchars++] = wc;
}
#endif
}
}
- while ((wc = wc1) != L']');
- return MBCSET;
-}
-#endif /* MBS_SUPPORT */
+ while ((wc = wc1, (c = c1) != L']'));
-#ifdef __STDC__
-#define FUNC(F, P) static int F(int c) { return P(c); }
-#else
-#define FUNC(F, P) static int F(c) int c; { return P(c); }
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ {
+ static charclass zeroclass;
+ work_mbc->invert = invert;
+ work_mbc->cset = equal(ccl, zeroclass) ? -1 : charclass_index(ccl);
+ return MBCSET;
+ }
#endif
-FUNC(is_alpha, ISALPHA)
-FUNC(is_upper, ISUPPER)
-FUNC(is_lower, ISLOWER)
-FUNC(is_digit, ISDIGIT)
-FUNC(is_xdigit, ISXDIGIT)
-FUNC(is_space, ISSPACE)
-FUNC(is_punct, ISPUNCT)
-FUNC(is_alnum, ISALNUM)
-FUNC(is_print, ISPRINT)
-FUNC(is_graph, ISGRAPH)
-FUNC(is_cntrl, ISCNTRL)
+ if (invert)
+ {
+ notset(ccl);
+ if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
+ clrbit(eolbyte, ccl);
+ }
-static int
-is_blank (int c)
-{
- return (c == ' ' || c == '\t');
+ return CSET + charclass_index(ccl);
}
-/* The following list maps the names of the Posix named character classes
- to predicate functions that determine whether a given character is in
- the class. The leading [ has already been eaten by the lexical analyzer. */
-static struct {
- const char *name;
- int (*pred) (int);
-} const prednames[] = {
- { ":alpha:]", is_alpha },
- { ":upper:]", is_upper },
- { ":lower:]", is_lower },
- { ":digit:]", is_digit },
- { ":xdigit:]", is_xdigit },
- { ":space:]", is_space },
- { ":punct:]", is_punct },
- { ":alnum:]", is_alnum },
- { ":print:]", is_print },
- { ":graph:]", is_graph },
- { ":cntrl:]", is_cntrl },
- { ":blank:]", is_blank },
- { 0, 0 }
-};
-
/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
-static int
-looking_at (char const *s)
-{
- size_t len;
-
- len = strlen(s);
- if (lexleft < len)
- return 0;
- return strncmp(s, lexptr, len) == 0;
-}
-
static token
lex (void)
{
- unsigned c, c1, c2;
- int backslash = 0, invert;
+ unsigned c, c2;
+ int backslash = 0;
charclass ccl;
int i;
@@ -679,10 +769,7 @@ lex (void)
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
- wint_t wi = fetch_wc (NULL);
- if (wi == WEOF)
- return lasttok = EOF;
- wctok = wi, c = wctob (wi);
+ FETCH_WC (c, wctok, NULL);
if ((int)c == EOF)
goto normal_char;
}
@@ -963,100 +1050,7 @@ lex (void)
if (backslash)
goto normal_char;
laststart = 0;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- /* In multibyte environment a bracket expression may contain
- multibyte characters, which must be treated as characters
- (not bytes). So we parse it by parse_bracket_exp_mb(). */
- return lasttok = parse_bracket_exp_mb();
- }
-#endif
- zeroset(ccl);
- FETCH(c, _("unbalanced ["));
- if (c == '^')
- {
- FETCH(c, _("unbalanced ["));
- invert = 1;
- }
- else
- invert = 0;
- do
- {
- /* Nobody ever said this had to be fast. :-)
- Note that if we're looking at some other [:...:]
- construct, we just treat it as a bunch of ordinary
- characters. We can do this because we assume
- regex has checked for syntax errors before
- dfa is ever called. */
- if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
- for (c1 = 0; prednames[c1].name; ++c1)
- if (looking_at(prednames[c1].name))
- {
- int (*pred) (int) = prednames[c1].pred;
-
- for (c2 = 0; c2 < NOTCHAR; ++c2)
- if ((*pred)(c2))
- setbit_case_fold (c2, ccl);
- lexptr += strlen(prednames[c1].name);
- lexleft -= strlen(prednames[c1].name);
- FETCH(c1, _("unbalanced ["));
- goto skip;
- }
- if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH(c, _("unbalanced ["));
- FETCH(c1, _("unbalanced ["));
- if (c1 == '-')
- {
- FETCH(c2, _("unbalanced ["));
- if (c2 == ']')
- {
- /* In the case [x-], the - is an ordinary hyphen,
- which is left in c1, the lookahead character. */
- --lexptr;
- ++lexleft;
- }
- else
- {
- if (c2 == '\\'
- && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH(c2, _("unbalanced ["));
-
- c1 = c;
- if (!hard_LC_COLLATE)
- for (c = c1; c <= c2; c++)
- setbit_case_fold (c, ccl);
- else
- {
- if (case_fold)
- {
- c1 = tolower (c1);
- c2 = tolower (c2);
- }
- for (c = 0; c < NOTCHAR; ++c)
- if (!(case_fold && ISUPPER (c))
- && in_coll_range (c, c1, c2))
- setbit_case_fold (c, ccl);
- }
-
- FETCH(c1, _("unbalanced ["));
- continue;
- }
- }
-
- setbit_case_fold (c, ccl);
-
- skip:
- ;
- }
- while ((c = c1) != ']');
- if (invert)
- {
- notset(ccl);
- if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
- clrbit(eolbyte, ccl);
- }
- return lasttok = CSET + charclass_index(ccl);
+ return lasttok = parse_bracket_exp();
default:
normal_char:
@@ -2499,6 +2493,11 @@ match_mb_charset (struct dfa *d, int s, position pos, int idx)
match = !work_mbc->invert;
match_len = (mblen_buf[idx] == 0)? 1 : mblen_buf[idx];
+ /* Match in range 0-255? */
+ if (wc < NOTCHAR && work_mbc->cset != -1
+ && tstbit((unsigned char)wc, d->charclasses[work_mbc->cset]))
+ goto charset_matched;
+
/* match with a character class? */
for (i = 0; i<work_mbc->nch_classes; i++)
{
diff --git a/src/dfa.h b/src/dfa.h
index 4928d822..594e25cf 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -243,6 +243,7 @@ struct dfamust
e.g. [a-c], [[:alpha:]], etc. */
struct mb_char_classes
{
+ int cset;
int invert;
wchar_t *chars; /* Normal characters. */
int nchars;