summary refs log tree commit diff
diff options
context:
space:
mode:
author Arnold D. Robbins <arnold@skeeve.com> 2016-08-23 05:48:38 +0300
committer Arnold D. Robbins <arnold@skeeve.com> 2016-08-23 05:48:38 +0300
commit 9b2cc2ba3a51f012d9006596811a867eb7846265 (patch)
tree eba141889df81975d6b671f2faa20473b17442a2
parent 9346a8f73056487f614d81532c50b8703e3a3cf0 (diff)
download gawk-9b2cc2ba3a51f012d9006596811a867eb7846265.tar.gz
Update dfa, including API changes.
-rw-r--r-- ChangeLog 9
-rw-r--r-- dfa.c 859
-rw-r--r-- dfa.h 13
-rw-r--r-- node.c 2
-rw-r--r-- re.c 17
5 files changed, 478 insertions, 422 deletions
diff --git a/ChangeLog b/ChangeLog
index 72e3abe6..356722de 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2016-08-23 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.h: Sync with grep. API changes.
+ * dfa.c: Sync with grep.
+ * re.c (make_regexp): Adjust for API changes, move call to dfasyntax
+ into stanza that compiles the regex.
+ (resetup): Call dfa_init.
+ * node.c (str2wstr): using_utf8 is now called dfa_using_utf8.
+
2016-08-18 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c: Sync with grep.
diff --git a/dfa.c b/dfa.c
index aeb38df9..4d1e1ab0 100644
--- a/dfa.c
+++ b/dfa.c
@@ -363,14 +363,85 @@ struct mb_char_classes
size_t nchars;
};
+struct regex_syntax
+{
+ /* Syntax bits controlling the behavior of the lexical analyzer. */
+ reg_syntax_t syntax_bits;
+ bool syntax_bits_set;
+
+ /* Flag for case-folding letters into sets. */
+ bool case_fold;
+
+ /* End-of-line byte in data. */
+ unsigned char eolbyte;
+
+ /* Cache of char-context values. */
+ int sbit[NOTCHAR];
+
+ /* If never_trail[B], the byte B cannot be a non-initial byte in a
+ multibyte character. */
+ bool never_trail[NOTCHAR];
+
+ /* Set of characters considered letters. */
+ charclass letters;
+
+ /* Set of characters that are newline. */
+ charclass newline;
+};
+
+/* Lexical analyzer. All the dross that deals with the obnoxious
+ GNU Regex syntax bits is located here. The poor, suffering
+ reader is referred to the GNU Regex documentation for the
+ meaning of the @#%!@#%^!@ syntax bits. */
+struct lexer_state
+{
+ char const *lexptr; /* Pointer to next input character. */
+ size_t lexleft; /* Number of characters remaining. */
+ token lasttok; /* Previous token returned; initially END. */
+ size_t parens; /* Count of outstanding left parens. */
+ int minrep, maxrep; /* Repeat counts for {m,n}. */
+
+ /* Wide character representation of the current multibyte character,
+ or WEOF if there was an encoding error. Used only if
+ MB_CUR_MAX > 1. */
+ wint_t wctok;
+
+ /* Length of the multibyte representation of wctok. */
+ int cur_mb_len;
+
+ /* We're separated from beginning or (, | only by zero-width characters. */
+ bool laststart;
+};
+
+/* Recursive descent parser for regular expressions. */
+
+struct parser_state
+{
+ token tok; /* Lookahead token. */
+ size_t depth; /* Current depth of a hypothetical stack
+ holding deferred productions. This is
+ used to determine the depth that will be
+ required of the real stack later on in
+ dfaanalyze. */
+};
+
/* A compiled regular expression. */
struct dfa
{
+ /* Syntax configuration */
+ struct regex_syntax syntax;
+
/* Fields filled by the scanner. */
charclass *charclasses; /* Array of character sets for CSET tokens. */
size_t cindex; /* Index for adding new charclasses. */
size_t calloc; /* Number of charclasses allocated. */
+ /* Scanner state */
+ struct lexer_state lexstate;
+
+ /* Parser state */
+ struct parser_state parsestate;
+
/* Fields filled by the parser. */
token *tokens; /* Postfix parse array. */
size_t tindex; /* Index for adding new tokens. */
@@ -486,7 +557,7 @@ struct dfa
#define ACCEPTS_IN_CONTEXT(prev, curr, state, dfa) \
SUCCEEDS_IN_CONTEXT ((dfa).states[state].constraint, prev, curr)
-static void regexp (void);
+static void regexp (struct dfa *dfa);
/* A table indexed by byte values that contains the corresponding wide
character (if any) for that byte. WEOF means the byte is not a
@@ -705,39 +776,6 @@ dfa_charclass_index (struct dfa *d, charclass const s)
return i;
}
-/* A pointer to the current dfa is kept here during parsing. */
-static struct dfa *dfa;
-
-/* Find the index of charclass S in the current DFA, or allocate a new one. */
-static size_t
-charclass_index (charclass const s)
-{
- return dfa_charclass_index (dfa, s);
-}
-
-/* Syntax bits controlling the behavior of the lexical analyzer. */
-static reg_syntax_t syntax_bits;
-static bool syntax_bits_set;
-
-/* Flag for case-folding letters into sets. */
-static bool case_fold;
-
-/* End-of-line byte in data. */
-static unsigned char eolbyte;
-
-/* Cache of char-context values. */
-static int sbit[NOTCHAR];
-
-/* If never_trail[B], the byte B cannot be a non-initial byte in a
- multibyte character. */
-static bool never_trail[NOTCHAR];
-
-/* Set of characters considered letters. */
-static charclass letters;
-
-/* Set of characters that are newline. */
-static charclass newline;
-
static bool
unibyte_word_constituent (unsigned char c)
{
@@ -745,25 +783,29 @@ unibyte_word_constituent (unsigned char c)
}
static int
-char_context (unsigned char c)
+char_context (struct dfa const *dfa, unsigned char c)
{
- if (c == eolbyte)
+ if (c == dfa->syntax.eolbyte)
return CTX_NEWLINE;
if (unibyte_word_constituent (c))
return CTX_LETTER;
return CTX_NONE;
}
-/* Entry point to set syntax options. */
-void
-dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol)
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+ assume in a multibyte encoding. */
+static bool using_utf8;
+
+bool
+dfa_using_utf8 (void)
{
- int i;
- syntax_bits_set = true;
- syntax_bits = bits;
- case_fold = fold;
- eolbyte = eol;
+ return using_utf8;
+}
+static void
+init_mbrtowc_cache (void)
+{
+ int i;
for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
{
char c = i;
@@ -771,23 +813,39 @@ dfasyntax (reg_syntax_t bits, bool fold, unsigned char eol)
mbstate_t s = { 0 };
wchar_t wc;
mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
+ }
+}
+
+/* Entry point to set syntax options. */
+void
+dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
+{
+ int i;
+ dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.syntax_bits = bits;
+ dfa->syntax.case_fold = fold;
+ dfa->syntax.eolbyte = eol;
- /* Now that mbrtowc_cache[uc] is set, use it to calculate sbit. */
- sbit[uc] = char_context (uc);
- switch (sbit[uc])
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ unsigned char uc = i;
+
+ /* Use mbrtowc_cache to calculate sbit. */
+ dfa->syntax.sbit[uc] = char_context (dfa, uc);
+ switch (dfa->syntax.sbit[uc])
{
case CTX_LETTER:
- setbit (uc, letters);
+ setbit (uc, dfa->syntax.letters);
break;
case CTX_NEWLINE:
- setbit (uc, newline);
+ setbit (uc, dfa->syntax.newline);
break;
}
/* POSIX requires that the five bytes in "\n\r./" (including the
terminating NUL) cannot occur inside a multibyte character. */
- never_trail[uc] = (using_utf8 () ? (uc & 0xc0) != 0x80
- : strchr ("\n\r./", uc) != NULL);
+ dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
+ : strchr ("\n\r./", uc) != NULL);
}
}
@@ -819,25 +877,21 @@ setbit_case_fold_c (int b, charclass c)
setbit (i, c);
}
+static void check_utf8 (void)
+{
+ wchar_t wc;
+ mbstate_t mbs = { 0 };
+ using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
+}
+static bool unibyte_c;
-/* UTF-8 encoding allows some optimizations that we can't otherwise
- assume in a multibyte encoding. */
-bool
-using_utf8 (void)
+static void check_unibyte_c (void)
{
- static int utf8 = -1;
- if (utf8 < 0)
- {
- wchar_t wc;
- mbstate_t mbs = { 0 };
- utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-#ifdef LIBC_IS_BORKED
- if (gawk_mb_cur_max == 1)
- utf8 = 0;
-#endif
- }
- return utf8;
+ char const *locale = setlocale (LC_ALL, NULL);
+ unibyte_c = (!locale
+ || STREQ (locale, "C")
+ || STREQ (locale, "POSIX"));
}
/* The current locale is known to be a unibyte locale
@@ -846,7 +900,7 @@ using_utf8 (void)
processed more efficiently. */
static bool
-using_simple_locale (void)
+using_simple_locale (struct dfa const *dfa)
{
/* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
@@ -864,44 +918,9 @@ using_simple_locale (void)
&& '}' == 125 && '~' == 126)
};
- if (! native_c_charset || dfa->multibyte)
- return false;
- else
- {
- static int unibyte_c = -1;
- if (unibyte_c < 0)
- {
- char const *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (!locale
- || STREQ (locale, "C")
- || STREQ (locale, "POSIX"));
- }
- return unibyte_c;
- }
+ return (!native_c_charset || dfa->multibyte) ? false : unibyte_c;
}
-/* Lexical analyzer. All the dross that deals with the obnoxious
- GNU Regex syntax bits is located here. The poor, suffering
- reader is referred to the GNU Regex documentation for the
- meaning of the @#%!@#%^!@ syntax bits. */
-
-static char const *lexptr; /* Pointer to next input character. */
-static size_t lexleft; /* Number of characters remaining. */
-static token lasttok; /* Previous token returned; initially END. */
-static bool laststart; /* We're separated from beginning or (,
- | only by zero-width characters. */
-static size_t parens; /* Count of outstanding left parens. */
-static int minrep, maxrep; /* Repeat counts for {m,n}. */
-
-static int cur_mb_len = 1; /* Length of the multibyte representation of
- wctok. */
-
-static wint_t wctok; /* Wide character representation of the current
- multibyte character, or WEOF if there was
- an encoding error. Used only if
- MB_CUR_MAX > 1. */
-
-
/* Fetch the next lexical input character. Set C (of type int) to the
next input byte, except set C to EOF if the input is a multibyte
character of length greater than 1. Set WC (of type wint_t) to the
@@ -909,24 +928,25 @@ static wint_t wctok; /* Wide character representation of the current
of length 1); otherwise set WC to WEOF. If there is no more input,
report EOFERR if EOFERR is not null, and return lasttok = END
otherwise. */
-# define FETCH_WC(c, wc, eoferr) \
+# define FETCH_WC(dfa, c, wc, eoferr) \
do { \
- if (! lexleft) \
+ if (! dfa->lexstate.lexleft) \
{ \
if ((eoferr) != 0) \
dfaerror (eoferr); \
else \
- return lasttok = END; \
+ return dfa->lexstate.lasttok = END; \
} \
else \
{ \
wint_t _wc; \
- size_t nbytes = mbs_to_wchar (&_wc, lexptr, lexleft, dfa); \
- cur_mb_len = nbytes; \
+ size_t nbytes = mbs_to_wchar (&_wc, dfa->lexstate.lexptr, \
+ dfa->lexstate.lexleft, dfa); \
+ dfa->lexstate.cur_mb_len = nbytes; \
(wc) = _wc; \
- (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF; \
- lexptr += nbytes; \
- lexleft -= nbytes; \
+ (c) = nbytes == 1 ? to_uchar (*dfa->lexstate.lexptr) : EOF; \
+ dfa->lexstate.lexptr += nbytes; \
+ dfa->lexstate.lexleft -= nbytes; \
} \
} while (false)
@@ -1023,7 +1043,7 @@ find_pred (const char *str)
/* Multibyte character handling sub-routine for lex.
Parse a bracket expression and build a struct mb_char_classes. */
static token
-parse_bracket_exp (void)
+parse_bracket_exp (struct dfa *dfa)
{
bool invert;
int c, c1, c2;
@@ -1067,12 +1087,12 @@ parse_bracket_exp (void)
work_mbc = NULL;
memset (ccl, 0, sizeof ccl);
- FETCH_WC (c, wc, _("unbalanced ["));
+ FETCH_WC (dfa, c, wc, _("unbalanced ["));
if (c == '^')
{
- FETCH_WC (c, wc, _("unbalanced ["));
+ FETCH_WC (dfa, c, wc, _("unbalanced ["));
invert = true;
- known_bracket_exp = using_simple_locale ();
+ known_bracket_exp = using_simple_locale (dfa);
}
else
invert = false;
@@ -1089,9 +1109,9 @@ parse_bracket_exp (void)
dfa is ever called. */
if (c == '[')
{
- FETCH_WC (c1, wc1, _("unbalanced ["));
+ FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
- if ((c1 == ':' && (syntax_bits & RE_CHAR_CLASSES))
+ if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
|| c1 == '.' || c1 == '=')
{
enum { MAX_BRACKET_STRING_LEN = 32 };
@@ -1099,8 +1119,9 @@ parse_bracket_exp (void)
size_t len = 0;
for (;;)
{
- FETCH_WC (c, wc, _("unbalanced ["));
- if ((c == c1 && *lexptr == ']') || lexleft == 0)
+ FETCH_WC (dfa, c, wc, _("unbalanced ["));
+ if ((c == c1 && *dfa->lexstate.lexptr == ']')
+ || dfa->lexstate.lexleft == 0)
break;
if (len < MAX_BRACKET_STRING_LEN)
str[len++] = c;
@@ -1111,7 +1132,7 @@ parse_bracket_exp (void)
str[len] = '\0';
/* Fetch bracket. */
- FETCH_WC (c, wc, _("unbalanced ["));
+ FETCH_WC (dfa, c, wc, _("unbalanced ["));
if (c1 == ':')
/* Build character class. POSIX allows character
classes to match multicharacter collating elements,
@@ -1119,8 +1140,9 @@ parse_bracket_exp (void)
worry about that possibility. */
{
char const *class
- = (case_fold && (STREQ (str, "upper")
- || STREQ (str, "lower")) ? "alpha" : str);
+ = (dfa->syntax.case_fold && (STREQ (str, "upper")
+ || STREQ (str, "lower")) ?
+ "alpha" : str);
const struct dfa_ctype *pred = find_pred (class);
if (!pred)
dfaerror (_("invalid character class"));
@@ -1138,7 +1160,7 @@ parse_bracket_exp (void)
colon_warning_state |= 8;
/* Fetch new lookahead character. */
- FETCH_WC (c1, wc1, _("unbalanced ["));
+ FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
continue;
}
@@ -1146,21 +1168,21 @@ parse_bracket_exp (void)
are already set up. */
}
- if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH_WC (c, wc, _("unbalanced ["));
+ if (c == '\\' && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+ FETCH_WC (dfa, c, wc, _("unbalanced ["));
if (c1 == NOTCHAR)
- FETCH_WC (c1, wc1, _("unbalanced ["));
+ FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
if (c1 == '-')
/* build range characters. */
{
- FETCH_WC (c2, wc2, _("unbalanced ["));
+ FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
/* A bracket expression like [a-[.aa.]] matches an unknown set.
Treat it like [-a[.aa.]] while parsing it, and
remember that the set is unknown. */
- if (c2 == '[' && *lexptr == '.')
+ if (c2 == '[' && *dfa->lexstate.lexptr == '.')
{
known_bracket_exp = false;
c2 = ']';
@@ -1170,28 +1192,29 @@ parse_bracket_exp (void)
{
/* In the case [x-], the - is an ordinary hyphen,
which is left in c1, the lookahead character. */
- lexptr -= cur_mb_len;
- lexleft += cur_mb_len;
+ dfa->lexstate.lexptr -= dfa->lexstate.cur_mb_len;
+ dfa->lexstate.lexleft += dfa->lexstate.cur_mb_len;
}
else
{
- if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH_WC (c2, wc2, _("unbalanced ["));
+ if (c2 == '\\' && (dfa->syntax.syntax_bits
+ & RE_BACKSLASH_ESCAPE_IN_LISTS))
+ FETCH_WC (dfa, c2, wc2, _("unbalanced ["));
colon_warning_state |= 8;
- FETCH_WC (c1, wc1, _("unbalanced ["));
+ FETCH_WC (dfa, c1, wc1, _("unbalanced ["));
/* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF)
{
if (dfa->multibyte)
known_bracket_exp = false;
- else if (using_simple_locale ())
+ else if (using_simple_locale (dfa))
{
int ci;
for (ci = c; ci <= c2; ci++)
setbit (ci, ccl);
- if (case_fold)
+ if (dfa->syntax.case_fold)
{
int uc = toupper (c);
int uc2 = toupper (c2);
@@ -1215,7 +1238,7 @@ parse_bracket_exp (void)
if (!dfa->multibyte)
{
- if (case_fold)
+ if (dfa->syntax.case_fold)
setbit_case_fold_c (c, ccl);
else
setbit (c, ccl);
@@ -1228,7 +1251,7 @@ parse_bracket_exp (void)
{
wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
unsigned int i;
- unsigned int n = (case_fold
+ unsigned int n = (dfa->syntax.case_fold
? case_folded_counterparts (wc, folded + 1) + 1
: 1);
folded[0] = wc;
@@ -1253,7 +1276,7 @@ parse_bracket_exp (void)
if (dfa->multibyte)
{
work_mbc->invert = invert;
- work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (ccl);
+ work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
return MBCSET;
}
@@ -1261,29 +1284,29 @@ parse_bracket_exp (void)
{
assert (!dfa->multibyte);
notset (ccl);
- if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
+ if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit ('\n', ccl);
}
- return CSET + charclass_index (ccl);
+ return CSET + dfa_charclass_index (dfa, ccl);
}
#define PUSH_LEX_STATE(s) \
do \
{ \
- char const *lexptr_saved = lexptr; \
- size_t lexleft_saved = lexleft; \
- lexptr = (s); \
- lexleft = strlen (lexptr)
+ char const *lexptr_saved = dfa->lexstate.lexptr; \
+ size_t lexleft_saved = dfa->lexstate.lexleft; \
+ dfa->lexstate.lexptr = (s); \
+ dfa->lexstate.lexleft = strlen (dfa->lexstate.lexptr)
#define POP_LEX_STATE() \
- lexptr = lexptr_saved; \
- lexleft = lexleft_saved; \
+ dfa->lexstate.lexptr = lexptr_saved; \
+ dfa->lexstate.lexleft = lexleft_saved; \
} \
while (false)
static token
-lex (void)
+lex (struct dfa *dfa)
{
int c, c2;
bool backslash = false;
@@ -1298,14 +1321,14 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH_WC (c, wctok, NULL);
+ FETCH_WC (dfa, c, dfa->lexstate.wctok, NULL);
switch (c)
{
case '\\':
if (backslash)
goto normal_char;
- if (lexleft == 0)
+ if (dfa->lexstate.lexleft == 0)
dfaerror (_("unfinished \\ escape"));
backslash = true;
break;
@@ -1313,25 +1336,29 @@ lex (void)
case '^':
if (backslash)
goto normal_char;
- if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || lasttok == END || lasttok == LPAREN || lasttok == OR)
- return lasttok = BEGLINE;
+ if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
+ || dfa->lexstate.lasttok == END || dfa->lexstate.lasttok == LPAREN
+ || dfa->lexstate.lasttok == OR)
+ return dfa->lexstate.lasttok = BEGLINE;
goto normal_char;
case '$':
if (backslash)
goto normal_char;
- if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
- || lexleft == 0
- || (syntax_bits & RE_NO_BK_PARENS
- ? lexleft > 0 && *lexptr == ')'
- : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
- || (syntax_bits & RE_NO_BK_VBAR
- ? lexleft > 0 && *lexptr == '|'
- : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
- || ((syntax_bits & RE_NEWLINE_ALT)
- && lexleft > 0 && *lexptr == '\n'))
- return lasttok = ENDLINE;
+ if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
+ || dfa->lexstate.lexleft == 0
+ || (dfa->syntax.syntax_bits & RE_NO_BK_PARENS
+ ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == ')'
+ : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
+ && dfa->lexstate.lexptr[1] == ')')
+ || (dfa->syntax.syntax_bits & RE_NO_BK_VBAR
+ ? dfa->lexstate.lexleft > 0 && *dfa->lexstate.lexptr == '|'
+ : dfa->lexstate.lexleft > 1 && dfa->lexstate.lexptr[0] == '\\'
+ && dfa->lexstate.lexptr[1] == '|')
+ || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
+ && dfa->lexstate.lexleft > 0
+ && *dfa->lexstate.lexptr == '\n'))
+ return dfa->lexstate.lasttok = ENDLINE;
goto normal_char;
case '1':
@@ -1343,74 +1370,84 @@ lex (void)
case '7':
case '8':
case '9':
- if (backslash && !(syntax_bits & RE_NO_BK_REFS))
+ if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
{
- laststart = false;
- return lasttok = BACKREF;
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = BACKREF;
}
goto normal_char;
case '`':
- if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
- return lasttok = BEGLINE; /* FIXME: should be beginning of string */
+ if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ {
+ /* FIXME: should be beginning of string */
+ return dfa->lexstate.lasttok = BEGLINE;
+ }
goto normal_char;
case '\'':
- if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
- return lasttok = ENDLINE; /* FIXME: should be end of string */
+ if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ {
+ /* FIXME: should be end of string */
+ return dfa->lexstate.lasttok = ENDLINE;
+ }
goto normal_char;
case '<':
- if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
- return lasttok = BEGWORD;
+ if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ return dfa->lexstate.lasttok = BEGWORD;
goto normal_char;
case '>':
- if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
- return lasttok = ENDWORD;
+ if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ return dfa->lexstate.lasttok = ENDWORD;
goto normal_char;
case 'b':
- if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
- return lasttok = LIMWORD;
+ if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ return dfa->lexstate.lasttok = LIMWORD;
goto normal_char;
case 'B':
- if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
- return lasttok = NOTLIMWORD;
+ if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
+ return dfa->lexstate.lasttok = NOTLIMWORD;
goto normal_char;
case '?':
- if (syntax_bits & RE_LIMITED_OPS)
+ if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
goto normal_char;
- if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
+ if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
- if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+ if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
+ && dfa->lexstate.laststart)
goto normal_char;
- return lasttok = QMARK;
+ return dfa->lexstate.lasttok = QMARK;
case '*':
if (backslash)
goto normal_char;
- if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+ if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
+ && dfa->lexstate.laststart)
goto normal_char;
- return lasttok = STAR;
+ return dfa->lexstate.lasttok = STAR;
case '+':
- if (syntax_bits & RE_LIMITED_OPS)
+ if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
goto normal_char;
- if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
+ if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
goto normal_char;
- if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+ if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
+ && dfa->lexstate.laststart)
goto normal_char;
- return lasttok = PLUS;
+ return dfa->lexstate.lasttok = PLUS;
case '{':
- if (!(syntax_bits & RE_INTERVALS))
+ if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
goto normal_char;
- if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
+ if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
goto normal_char;
- if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+ if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
+ && dfa->lexstate.laststart)
goto normal_char;
/* Cases:
@@ -1420,79 +1457,86 @@ lex (void)
{,} - 0 to infinity (same as '*')
{M,N} - M through N */
{
- char const *p = lexptr;
- char const *lim = p + lexleft;
- minrep = maxrep = -1;
+ char const *p = dfa->lexstate.lexptr;
+ char const *lim = p + dfa->lexstate.lexleft;
+ dfa->lexstate.minrep = dfa->lexstate.maxrep = -1;
for (; p != lim && ISASCIIDIGIT (*p); p++)
{
- if (minrep < 0)
- minrep = *p - '0';
+ if (dfa->lexstate.minrep < 0)
+ dfa->lexstate.minrep = *p - '0';
else
- minrep = MIN (RE_DUP_MAX + 1, minrep * 10 + *p - '0');
+ dfa->lexstate.minrep = MIN (RE_DUP_MAX + 1,
+ (dfa->lexstate.minrep
+ * 10 + *p - '0'));
}
if (p != lim)
{
if (*p != ',')
- maxrep = minrep;
+ dfa->lexstate.maxrep = dfa->lexstate.minrep;
else
{
- if (minrep < 0)
- minrep = 0;
+ if (dfa->lexstate.minrep < 0)
+ dfa->lexstate.minrep = 0;
while (++p != lim && ISASCIIDIGIT (*p))
{
- if (maxrep < 0)
- maxrep = *p - '0';
+ if (dfa->lexstate.maxrep < 0)
+ dfa->lexstate.maxrep = *p - '0';
else
- maxrep = MIN (RE_DUP_MAX + 1, maxrep * 10 + *p - '0');
+ dfa->lexstate.maxrep = MIN (RE_DUP_MAX + 1,
+ (dfa->lexstate.maxrep
+ * 10 + *p - '0'));
}
}
}
if (! ((! backslash || (p != lim && *p++ == '\\'))
&& p != lim && *p++ == '}'
- && 0 <= minrep && (maxrep < 0 || minrep <= maxrep)))
+ && 0 <= dfa->lexstate.minrep
+ && (dfa->lexstate.maxrep < 0
+ || dfa->lexstate.minrep <= dfa->lexstate.maxrep)))
{
- if (syntax_bits & RE_INVALID_INTERVAL_ORD)
+ if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
goto normal_char;
dfaerror (_("invalid content of \\{\\}"));
}
- if (RE_DUP_MAX < maxrep)
+ if (RE_DUP_MAX < dfa->lexstate.maxrep)
dfaerror (_("regular expression too big"));
- lexptr = p;
- lexleft = lim - p;
+ dfa->lexstate.lexptr = p;
+ dfa->lexstate.lexleft = lim - p;
}
- laststart = false;
- return lasttok = REPMN;
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = REPMN;
case '|':
- if (syntax_bits & RE_LIMITED_OPS)
+ if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
goto normal_char;
- if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
+ if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
goto normal_char;
- laststart = true;
- return lasttok = OR;
+ dfa->lexstate.laststart = true;
+ return dfa->lexstate.lasttok = OR;
case '\n':
- if (syntax_bits & RE_LIMITED_OPS
- || backslash || !(syntax_bits & RE_NEWLINE_ALT))
+ if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
+ || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
goto normal_char;
- laststart = true;
- return lasttok = OR;
+ dfa->lexstate.laststart = true;
+ return dfa->lexstate.lasttok = OR;
case '(':
- if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
+ if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- ++parens;
- laststart = true;
- return lasttok = LPAREN;
+ ++dfa->lexstate.parens;
+ dfa->lexstate.laststart = true;
+ return dfa->lexstate.lasttok = LPAREN;
case ')':
- if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
+ if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
goto normal_char;
- if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
+ if (dfa->lexstate.parens == 0
+ && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
goto normal_char;
- --parens;
- laststart = false;
- return lasttok = RPAREN;
+ --dfa->lexstate.parens;
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = RPAREN;
case '.':
if (backslash)
@@ -1501,21 +1545,21 @@ lex (void)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
- laststart = false;
- return lasttok = ANYCHAR;
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = ANYCHAR;
}
zeroset (ccl);
notset (ccl);
- if (!(syntax_bits & RE_DOT_NEWLINE))
+ if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
clrbit ('\n', ccl);
- if (syntax_bits & RE_DOT_NOT_NULL)
+ if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
- laststart = false;
- return lasttok = CSET + charclass_index (ccl);
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa, ccl);
case 's':
case 'S':
- if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
+ if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
if (!dfa->multibyte)
{
@@ -1525,8 +1569,9 @@ lex (void)
setbit (c2, ccl);
if (c == 'S')
notset (ccl);
- laststart = false;
- return lasttok = CSET + charclass_index (ccl);
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
+ ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1537,16 +1582,16 @@ lex (void)
strings, each minus its "already processed" '['. */
PUSH_LEX_STATE (c == 's' ? "[:space:]]" : "^[:space:]]");
- lasttok = parse_bracket_exp ();
+ dfa->lexstate.lasttok = parse_bracket_exp (dfa);
POP_LEX_STATE ();
- laststart = false;
- return lasttok;
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok;
case 'w':
case 'W':
- if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
+ if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
if (!dfa->multibyte)
@@ -1557,8 +1602,9 @@ lex (void)
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
- laststart = false;
- return lasttok = CSET + charclass_index (ccl);
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
+ ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1569,35 +1615,36 @@ lex (void)
strings, each minus its "already processed" '['. */
PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]");
- lasttok = parse_bracket_exp ();
+ dfa->lexstate.lasttok = parse_bracket_exp (dfa);
POP_LEX_STATE ();
- laststart = false;
- return lasttok;
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok;
case '[':
if (backslash)
goto normal_char;
- laststart = false;
- return lasttok = parse_bracket_exp ();
+ dfa->lexstate.laststart = false;
+ return dfa->lexstate.lasttok = parse_bracket_exp (dfa);
default:
normal_char:
- laststart = false;
+ dfa->lexstate.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
if (dfa->multibyte)
- return lasttok = WCHAR;
+ return dfa->lexstate.lasttok = WCHAR;
- if (case_fold && isalpha (c))
+ if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return lasttok = CSET + charclass_index (ccl);
+ return dfa->lexstate.lasttok = CSET + dfa_charclass_index (dfa,
+ ccl);
}
- return lasttok = c;
+ return dfa->lexstate.lasttok = c;
}
}
@@ -1607,17 +1654,8 @@ lex (void)
return END; /* keeps pedantic compilers happy. */
}
-/* Recursive descent parser for regular expressions. */
-
-static token tok; /* Lookahead token. */
-static size_t depth; /* Current depth of a hypothetical stack
- holding deferred productions. This is
- used to determine the depth that will be
- required of the real stack later on in
- dfaanalyze. */
-
static void
-addtok_mb (token t, int mbprop)
+addtok_mb (struct dfa *dfa, token t, int mbprop)
{
if (dfa->talloc == dfa->tindex)
{
@@ -1640,7 +1678,7 @@ addtok_mb (token t, int mbprop)
case CAT:
case OR:
- --depth;
+ --dfa->parsestate.depth;
break;
case BACKREF:
@@ -1650,19 +1688,19 @@ addtok_mb (token t, int mbprop)
++dfa->nleaves;
/* fallthrough */
case EMPTY:
- ++depth;
+ ++dfa->parsestate.depth;
break;
}
- if (depth > dfa->depth)
- dfa->depth = depth;
+ if (dfa->parsestate.depth > dfa->depth)
+ dfa->depth = dfa->parsestate.depth;
}
-static void addtok_wc (wint_t wc);
+static void addtok_wc (struct dfa *dfa, wint_t wc);
/* Add the given token to the parse tree, maintaining the depth count and
updating the maximum depth if necessary. */
static void
-addtok (token t)
+addtok (struct dfa *dfa, token t)
{
if (dfa->multibyte && t == MBCSET)
{
@@ -1674,9 +1712,9 @@ addtok (token t)
This does not require UTF-8. */
for (i = 0; i < work_mbc->nchars; i++)
{
- addtok_wc (work_mbc->chars[i]);
+ addtok_wc (dfa, work_mbc->chars[i]);
if (need_or)
- addtok (OR);
+ addtok (dfa, OR);
need_or = true;
}
work_mbc->nchars = 0;
@@ -1685,14 +1723,14 @@ addtok (token t)
that the mbcset is empty now. Do nothing in that case. */
if (work_mbc->cset != -1)
{
- addtok (CSET + work_mbc->cset);
+ addtok (dfa, CSET + work_mbc->cset);
if (need_or)
- addtok (OR);
+ addtok (dfa, OR);
}
}
else
{
- addtok_mb (t, 3);
+ addtok_mb (dfa, t, 3);
}
}
@@ -1703,7 +1741,7 @@ addtok (token t)
<mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
<mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
static void
-addtok_wc (wint_t wc)
+addtok_wc (struct dfa *dfa, wint_t wc)
{
unsigned char buf[MB_LEN_MAX];
mbstate_t s = { 0 };
@@ -1711,25 +1749,25 @@ addtok_wc (wint_t wc)
size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
if (stored_bytes != (size_t) -1)
- cur_mb_len = stored_bytes;
+ dfa->lexstate.cur_mb_len = stored_bytes;
else
{
/* This is merely stop-gap. buf[0] is undefined, yet skipping
the addtok_mb call altogether can corrupt the heap. */
- cur_mb_len = 1;
+ dfa->lexstate.cur_mb_len = 1;
buf[0] = 0;
}
- addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
- for (i = 1; i < cur_mb_len; i++)
+ addtok_mb (dfa, buf[0], dfa->lexstate.cur_mb_len == 1 ? 3 : 1);
+ for (i = 1; i < dfa->lexstate.cur_mb_len; i++)
{
- addtok_mb (buf[i], i == cur_mb_len - 1 ? 2 : 0);
- addtok (CAT);
+ addtok_mb (dfa, buf[i], i == dfa->lexstate.cur_mb_len - 1 ? 2 : 0);
+ addtok (dfa, CAT);
}
}
static void
-add_utf8_anychar (void)
+add_utf8_anychar (struct dfa *dfa)
{
static charclass const utf8_classes[5] = {
/* 80-bf: non-leading bytes. */
@@ -1758,12 +1796,12 @@ add_utf8_anychar (void)
copyset (utf8_classes[i], c);
if (i == 1)
{
- if (!(syntax_bits & RE_DOT_NEWLINE))
+ if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
clrbit ('\n', c);
- if (syntax_bits & RE_DOT_NOT_NULL)
+ if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', c);
}
- dfa->utf8_anychar_classes[i] = CSET + charclass_index (c);
+ dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
}
/* A valid UTF-8 character is
@@ -1777,12 +1815,12 @@ add_utf8_anychar (void)
and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
for (i = 1; i < n; i++)
- addtok (dfa->utf8_anychar_classes[i]);
+ addtok (dfa, dfa->utf8_anychar_classes[i]);
while (--i > 1)
{
- addtok (dfa->utf8_anychar_classes[0]);
- addtok (CAT);
- addtok (OR);
+ addtok (dfa, dfa->utf8_anychar_classes[0]);
+ addtok (dfa, CAT);
+ addtok (dfa, OR);
}
}
@@ -1822,31 +1860,32 @@ add_utf8_anychar (void)
The parser builds a parse tree in postfix form in an array of tokens. */
static void
-atom (void)
+atom (struct dfa *dfa)
{
- if (tok == WCHAR)
+ if (dfa->parsestate.tok == WCHAR)
{
- if (wctok == WEOF)
- addtok (BACKREF);
+ if (dfa->lexstate.wctok == WEOF)
+ addtok (dfa, BACKREF);
else
{
- addtok_wc (wctok);
+ addtok_wc (dfa, dfa->lexstate.wctok);
- if (case_fold)
+ if (dfa->syntax.case_fold)
{
wchar_t folded[CASE_FOLDED_BUFSIZE];
- unsigned int i, n = case_folded_counterparts (wctok, folded);
+ unsigned int i, n = case_folded_counterparts (dfa->lexstate.wctok,
+ folded);
for (i = 0; i < n; i++)
{
- addtok_wc (folded[i]);
- addtok (OR);
+ addtok_wc (dfa, folded[i]);
+ addtok (dfa, OR);
}
}
}
- tok = lex ();
+ dfa->parsestate.tok = lex (dfa);
}
- else if (tok == ANYCHAR && using_utf8 ())
+ else if (dfa->parsestate.tok == ANYCHAR && using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1855,32 +1894,35 @@ atom (void)
it is done above in add_utf8_anychar. So, let's start with
UTF-8: it is the most used, and the structure of the encoding
makes the correctness more obvious. */
- add_utf8_anychar ();
- tok = lex ();
+ add_utf8_anychar (dfa);
+ dfa->parsestate.tok = lex (dfa);
}
- else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
- || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
- || tok == ANYCHAR || tok == MBCSET
- || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
+ else if ((dfa->parsestate.tok >= 0 && dfa->parsestate.tok < NOTCHAR)
+ || dfa->parsestate.tok >= CSET || dfa->parsestate.tok == BACKREF
+ || dfa->parsestate.tok == BEGLINE || dfa->parsestate.tok == ENDLINE
+ || dfa->parsestate.tok == BEGWORD || dfa->parsestate.tok == ANYCHAR
+ || dfa->parsestate.tok == MBCSET || dfa->parsestate.tok == ENDWORD
+ || dfa->parsestate.tok == LIMWORD
+ || dfa->parsestate.tok == NOTLIMWORD)
{
- addtok (tok);
- tok = lex ();
+ addtok (dfa, dfa->parsestate.tok);
+ dfa->parsestate.tok = lex (dfa);
}
- else if (tok == LPAREN)
+ else if (dfa->parsestate.tok == LPAREN)
{
- tok = lex ();
- regexp ();
- if (tok != RPAREN)
+ dfa->parsestate.tok = lex (dfa);
+ regexp (dfa);
+ if (dfa->parsestate.tok != RPAREN)
dfaerror (_("unbalanced ("));
- tok = lex ();
+ dfa->parsestate.tok = lex (dfa);
}
else
- addtok (EMPTY);
+ addtok (dfa, EMPTY);
}
/* Return the number of tokens in the given subexpression. */
static size_t _GL_ATTRIBUTE_PURE
-nsubtoks (size_t tindex)
+nsubtoks (struct dfa const *dfa, size_t tindex)
{
size_t ntoks1;
@@ -1891,90 +1933,93 @@ nsubtoks (size_t tindex)
case QMARK:
case STAR:
case PLUS:
- return 1 + nsubtoks (tindex - 1);
+ return 1 + nsubtoks (dfa, tindex - 1);
case CAT:
case OR:
- ntoks1 = nsubtoks (tindex - 1);
- return 1 + ntoks1 + nsubtoks (tindex - 1 - ntoks1);
+ ntoks1 = nsubtoks (dfa, tindex - 1);
+ return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
}
}
/* Copy the given subexpression to the top of the tree. */
static void
-copytoks (size_t tindex, size_t ntokens)
+copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
{
size_t i;
if (dfa->multibyte)
for (i = 0; i < ntokens; ++i)
- addtok_mb (dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
+ addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
else
for (i = 0; i < ntokens; ++i)
- addtok_mb (dfa->tokens[tindex + i], 3);
+ addtok_mb (dfa, dfa->tokens[tindex + i], 3);
}
static void
-closure (void)
+closure (struct dfa *dfa)
{
int i;
size_t tindex, ntokens;
- atom ();
- while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
- if (tok == REPMN && (minrep || maxrep))
+ atom (dfa);
+ while (dfa->parsestate.tok == QMARK || dfa->parsestate.tok == STAR
+ || dfa->parsestate.tok == PLUS || dfa->parsestate.tok == REPMN)
+ if (dfa->parsestate.tok == REPMN
+ && (dfa->lexstate.minrep || dfa->lexstate.maxrep))
{
- ntokens = nsubtoks (dfa->tindex);
+ ntokens = nsubtoks (dfa, dfa->tindex);
tindex = dfa->tindex - ntokens;
- if (maxrep < 0)
- addtok (PLUS);
- if (minrep == 0)
- addtok (QMARK);
- for (i = 1; i < minrep; ++i)
+ if (dfa->lexstate.maxrep < 0)
+ addtok (dfa, PLUS);
+ if (dfa->lexstate.minrep == 0)
+ addtok (dfa, QMARK);
+ for (i = 1; i < dfa->lexstate.minrep; ++i)
{
- copytoks (tindex, ntokens);
- addtok (CAT);
+ copytoks (dfa, tindex, ntokens);
+ addtok (dfa, CAT);
}
- for (; i < maxrep; ++i)
+ for (; i < dfa->lexstate.maxrep; ++i)
{
- copytoks (tindex, ntokens);
- addtok (QMARK);
- addtok (CAT);
+ copytoks (dfa, tindex, ntokens);
+ addtok (dfa, QMARK);
+ addtok (dfa, CAT);
}
- tok = lex ();
+ dfa->parsestate.tok = lex (dfa);
}
- else if (tok == REPMN)
+ else if (dfa->parsestate.tok == REPMN)
{
- dfa->tindex -= nsubtoks (dfa->tindex);
- tok = lex ();
- closure ();
+ dfa->tindex -= nsubtoks (dfa, dfa->tindex);
+ dfa->parsestate.tok = lex (dfa);
+ closure (dfa);
}
else
{
- addtok (tok);
- tok = lex ();
+ addtok (dfa, dfa->parsestate.tok);
+ dfa->parsestate.tok = lex (dfa);
}
}
static void
-branch (void)
+branch (struct dfa* dfa)
{
- closure ();
- while (tok != RPAREN && tok != OR && tok >= 0)
+ closure (dfa);
+ while (dfa->parsestate.tok != RPAREN && dfa->parsestate.tok != OR
+ && dfa->parsestate.tok >= 0)
{
- closure ();
- addtok (CAT);
+ closure (dfa);
+ addtok (dfa, CAT);
}
}
static void
-regexp (void)
+regexp (struct dfa *dfa)
{
- branch ();
- while (tok == OR)
+ branch (dfa);
+ while (dfa->parsestate.tok == OR)
{
- tok = lex ();
- branch ();
- addtok (OR);
+ dfa->parsestate.tok = lex (dfa);
+ branch (dfa);
+ addtok (dfa, OR);
}
}
@@ -1984,34 +2029,33 @@ regexp (void)
static void
dfaparse (char const *s, size_t len, struct dfa *d)
{
- dfa = d;
- lexptr = s;
- lexleft = len;
- lasttok = END;
- laststart = true;
- parens = 0;
- if (dfa->multibyte)
+ d->lexstate.lexptr = s;
+ d->lexstate.lexleft = len;
+ d->lexstate.lasttok = END;
+ d->lexstate.laststart = true;
+ d->lexstate.parens = 0;
+ if (d->multibyte)
{
- cur_mb_len = 0;
+ d->lexstate.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
}
- if (!syntax_bits_set)
+ if (!d->syntax.syntax_bits_set)
dfaerror (_("no syntax specified"));
- tok = lex ();
- depth = d->depth;
+ d->parsestate.tok = lex (d);
+ d->parsestate.depth = d->depth;
- regexp ();
+ regexp (d);
- if (tok != END)
+ if (d->parsestate.tok != END)
dfaerror (_("unbalanced )"));
- addtok (END - d->nregexps);
- addtok (CAT);
+ addtok (d, END - d->nregexps);
+ addtok (d, CAT);
if (d->nregexps)
- addtok (OR);
+ addtok (d, OR);
++d->nregexps;
}
@@ -2282,19 +2326,19 @@ epsclosure (position_set *s, struct dfa const *d, char *visited)
character included in C. */
static int
-charclass_context (charclass c)
+charclass_context (struct dfa const *dfa, charclass c)
{
int context = 0;
unsigned int j;
- if (tstbit (eolbyte, c))
+ if (tstbit (dfa->syntax.eolbyte, c))
context |= CTX_NEWLINE;
for (j = 0; j < CHARCLASS_WORDS; ++j)
{
- if (c[j] & letters[j])
+ if (c[j] & dfa->syntax.letters[j])
context |= CTX_LETTER;
- if (c[j] & ~(letters[j] | newline[j]))
+ if (c[j] & ~(dfa->syntax.letters[j] | dfa->syntax.newline[j]))
context |= CTX_NONE;
}
@@ -2689,15 +2733,15 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
d->states[s].context, CTX_NEWLINE))
for (j = 0; j < CHARCLASS_WORDS; ++j)
- matches[j] &= ~newline[j];
+ matches[j] &= ~d->syntax.newline[j];
if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
d->states[s].context, CTX_LETTER))
for (j = 0; j < CHARCLASS_WORDS; ++j)
- matches[j] &= ~letters[j];
+ matches[j] &= ~d->syntax.letters[j];
if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
d->states[s].context, CTX_NONE))
for (j = 0; j < CHARCLASS_WORDS; ++j)
- matches[j] &= letters[j] | newline[j];
+ matches[j] &= d->syntax.letters[j] | d->syntax.newline[j];
/* If there are no characters left, there's no point in going on. */
for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
@@ -2803,7 +2847,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (i = 0; i < NOTCHAR; ++i)
trans[i] = unibyte_word_constituent (i) ? state_letter : state;
- trans[eolbyte] = state_newline;
+ trans[d->syntax.eolbyte] = state_newline;
}
else
for (i = 0; i < NOTCHAR; ++i)
@@ -2859,7 +2903,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
}
/* Find out if the new state will want any context information. */
- possible_contexts = charclass_context (labels[i]);
+ possible_contexts = charclass_context (d, labels[i]);
separate_contexts = state_separate_contexts (&follows);
/* Find the state(s) corresponding to the union of the follows. */
@@ -2906,7 +2950,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
{
int c = j * CHARCLASS_WORD_BITS + k;
- if (c == eolbyte)
+ if (c == d->syntax.eolbyte)
trans[c] = state_newline;
else if (unibyte_word_constituent (c))
trans[c] = state_letter;
@@ -3032,8 +3076,8 @@ build_state (state_num s, struct dfa *d)
/* Keep the newline transition in a special place so we can use it as
a sentinel. */
- d->newlines[s] = trans[eolbyte];
- trans[eolbyte] = -1;
+ d->newlines[s] = trans[d->syntax.eolbyte];
+ trans[d->syntax.eolbyte] = -1;
if (ACCEPTING (s, *d))
d->fails[s] = trans;
@@ -3052,7 +3096,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
{
state_num *t;
- if (**pp == eolbyte)
+ if (**pp == d->syntax.eolbyte)
{
/* S is always an initial state in transit_state, so the
transition table for the state must have been built already. */
@@ -3095,7 +3139,7 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
size_t i, j;
int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
- int context = wc == eolbyte ? CTX_NEWLINE : CTX_NONE;
+ int context = wc == d->syntax.eolbyte ? CTX_NEWLINE : CTX_NONE;
bool context_newline = context == CTX_NEWLINE;
/* This state has some operators which can match a multibyte character. */
@@ -3213,7 +3257,7 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
unsigned char const *mbp, char const *end, wint_t *wcp)
{
wint_t wc = WEOF;
- if (never_trail[*p])
+ if (d->syntax.never_trail[*p])
return p;
while (mbp < p)
mbp += mbs_to_wchar (&wc, (char const *) mbp,
@@ -3251,7 +3295,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
unsigned char const *p, *mbp; /* Current input character. */
state_num **trans, *t; /* Copy of d->trans so it can be optimized
into a register. */
- unsigned char eol = eolbyte; /* Likewise for eolbyte. */
+ unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
unsigned char saved_end;
size_t nlcount = 0;
@@ -3318,8 +3362,8 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
}
if (d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl)
- || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
- || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+ || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
+ || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
|| (char *) p >= end)
{
/* If an input character does not match ANYCHAR, do it
@@ -3382,14 +3426,14 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
}
else if (d->fails[s])
{
- if (d->success[s] & sbit[*p])
+ if (d->success[s] & d->syntax.sbit[*p])
goto done;
s1 = s;
if (!multibyte || d->states[s].mbps.nelem == 0
|| (*p == eol && !allow_nl)
- || (*p == '\n' && !(syntax_bits & RE_DOT_NEWLINE))
- || (*p == '\0' && (syntax_bits & RE_DOT_NOT_NULL))
+ || (*p == '\n' && !(d->syntax.syntax_bits & RE_DOT_NEWLINE))
+ || (*p == '\0' && (d->syntax.syntax_bits & RE_DOT_NOT_NULL))
|| (char *) p >= end)
{
/* If a input character does not match ANYCHAR, do it
@@ -3491,17 +3535,6 @@ free_mbdata (struct dfa *d)
}
}
-/* Initialize the components of a dfa that the other routines don't
- initialize for themselves. */
-static void
-dfainit (struct dfa *d)
-{
- memset (d, 0, sizeof *d);
- d->multibyte = MB_CUR_MAX > 1;
- d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
- d->fast = !d->multibyte;
-}
-
/* Return true if every construct in D is supported by this DFA matcher. */
static bool _GL_ATTRIBUTE_PURE
dfa_supported (struct dfa const *d)
@@ -3533,7 +3566,7 @@ dfaoptimize (struct dfa *d)
size_t i;
bool have_backref = false;
- if (!using_utf8 ())
+ if (!using_utf8)
return;
for (i = 0; i < d->tindex; ++i)
@@ -3652,7 +3685,6 @@ dfassbuild (struct dfa *d)
void
dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
{
- dfainit (d);
dfaparse (s, len, d);
dfassbuild (d);
@@ -3969,7 +4001,7 @@ dfamust (struct dfa const *d)
size_t rj;
bool need_begline = false;
bool need_endline = false;
- bool case_fold_unibyte = case_fold && MB_CUR_MAX == 1;
+ bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
struct dfamust *dm;
for (ri = 0; ri < d->tindex; ++ri)
@@ -4206,7 +4238,20 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- return xmalloc (sizeof (struct dfa));
+ struct dfa *d = xcalloc (1, sizeof (struct dfa));
+ d->multibyte = MB_CUR_MAX > 1;
+ d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
+ d->fast = !d->multibyte;
+ d->lexstate.cur_mb_len = 1;
+ return d;
+}
+
+void
+dfa_init (void)
+{
+ check_utf8 ();
+ check_unibyte_c ();
+ init_mbrtowc_cache ();
}
/* vim:set shiftwidth=2: */
diff --git a/dfa.h b/dfa.h
index 242f4cfa..02f56f44 100644
--- a/dfa.h
+++ b/dfa.h
@@ -54,10 +54,10 @@ extern struct dfamust *dfamust (struct dfa const *);
/* Free the storage held by the components of a struct dfamust. */
extern void dfamustfree (struct dfamust *);
-/* dfasyntax() takes three arguments; the first sets the syntax bits described
- earlier in this file, the second sets the case-folding flag, and the
- third specifies the line terminator. */
-extern void dfasyntax (reg_syntax_t, bool, unsigned char);
+/* dfasyntax() takes four arguments; the first is the dfa to operate on, the
+ second sets the syntax bits described earlier in this file, the third sets
+ the case-folding flag, and the fourth specifies the line terminator. */
+extern void dfasyntax (struct dfa *, reg_syntax_t, bool, unsigned char);
/* Compile the given string of the given length into the given struct dfa.
Final argument is a flag specifying whether to build a searching or an
@@ -104,4 +104,7 @@ extern void dfawarn (const char *);
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);
-extern bool using_utf8 (void);
+extern bool dfa_using_utf8 (void) _GL_ATTRIBUTE_PURE;
+
+/* This must be called before calling any of the above dfa*() functions. */
+extern void dfa_init (void);
diff --git a/node.c b/node.c
index 24f1493c..b63017aa 100644
--- a/node.c
+++ b/node.c
@@ -756,7 +756,7 @@ str2wstr(NODE *n, size_t **ptr)
* stopping early. This is particularly important
* for match() where we need to build the indices.
*/
- if (using_utf8()) {
+ if (dfa_using_utf8()) {
count = 1;
wc = 0xFFFD; /* unicode replacement character */
goto set_wc;
diff --git a/re.c b/re.c
index 8a325308..878c884e 100644
--- a/re.c
+++ b/re.c
@@ -203,14 +203,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
syn &= ~RE_ICASE;
}
- /* only call dfasyntax if we're using dfa; saves time */
- if (dfa && ! no_dfa) {
- dfa_syn = syn;
- /* FIXME: dfa doesn't pay attention RE_ICASE */
- if (ignorecase)
- dfa_syn |= RE_ICASE;
- dfasyntax(dfa_syn, ignorecase, '\n');
- }
+ dfa_syn = syn;
+ /* FIXME: dfa doesn't pay attention to RE_ICASE */
+ if (ignorecase)
+ dfa_syn |= RE_ICASE;
+
re_set_syntax(syn);
if ((rerr = re_compile_pattern(buf, len, &(rp->pat))) != NULL) {
@@ -228,6 +225,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
if (dfa && ! no_dfa) {
rp->dfa = true;
rp->dfareg = dfaalloc();
+ dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
dfacomp(buf, len, rp->dfareg, true);
} else
rp->dfa = false;
@@ -423,7 +421,8 @@ resetup()
syn |= RE_INTERVALS | RE_INVALID_INTERVAL_ORD | RE_NO_BK_BRACES;
(void) re_set_syntax(syn);
- dfasyntax(syn, false, '\n');
+
+ dfa_init();
}
/* avoid_dfa --- return true if we should not use the DFA matcher */