summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorPaolo Bonzini <bonzini@gnu.org>2004-11-22 15:49:28 +0000
committerPaolo Bonzini <bonzini@gnu.org>2008-01-09 16:10:54 +0100
commit611e77d579b58b2cf6da9249d6d850b6fdb8e6e8 (patch)
tree8d963d74f87a2f431040a108372f2f3610cb8bc5 /lib
parent7f53c1c16e7654427dc884e528f5e2d201c15fc5 (diff)
downloadsed-611e77d579b58b2cf6da9249d6d850b6fdb8e6e8.tar.gz
git-archimport-id: bonzini@gnu.org--2004b/sed--stable--4.1--patch-20
Diffstat (limited to 'lib')
-rw-r--r--lib/regcomp.c121
-rw-r--r--lib/regex_.h4
-rw-r--r--lib/regex_internal.c1
-rw-r--r--lib/regex_internal.h7
-rw-r--r--lib/regexec.c142
5 files changed, 197 insertions, 78 deletions
diff --git a/lib/regcomp.c b/lib/regcomp.c
index 91958c9..dafad9b 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -33,6 +33,14 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa);
#ifdef RE_ENABLE_I18N
static void optimize_utf8 (re_dfa_t *dfa);
#endif
+struct subexp_optimize
+{
+ re_dfa_t *dfa;
+ re_token_t *nodes;
+ int no_sub, re_nsub;
+};
+static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
+ bin_tree_t *node, int sidx, int depth);
static reg_errcode_t analyze (re_dfa_t *dfa);
static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
@@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, bufp)
/* And GNU code determines whether or not to get register information
by passing null for the REGS argument to re_match, etc., not by
- setting no_sub. */
- bufp->no_sub = 0;
+ setting no_sub, unless RE_NO_SUB is set. */
+ bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
/* Match anchors at newline. */
bufp->newline_anchor = 1;
@@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa)
if (dfa->sb_char != utf8_sb_map)
re_free (dfa->sb_char);
#endif
+ re_free (dfa->subexp_map);
#ifdef DEBUG
re_free (dfa->re_str);
#endif
@@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, length, syntax)
optimize_utf8 (dfa);
#endif
+ if (preg->re_nsub > 0)
+ {
+ struct subexp_optimize so;
+
+ so.dfa = dfa;
+ so.nodes = dfa->nodes;
+ so.no_sub = preg->no_sub;
+ so.re_nsub = preg->re_nsub;
+ dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
+ }
+
/* Analyze the tree and collect information which is necessary to
create the dfa. */
err = analyze (dfa);
@@ -842,9 +862,9 @@ init_dfa (dfa, pat_len)
int pat_len;
{
int table_size;
- #ifndef _LIBC
+#ifndef _LIBC
char *codeset_name;
- #endif
+#endif
memset (dfa, '\0', sizeof (re_dfa_t));
@@ -868,9 +888,9 @@ init_dfa (dfa, pat_len)
dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
dfa->mb_cur_max = MB_CUR_MAX;
-#if defined _LIBC
- codeset_name = _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME);
- if (dfa->mb_cur_max == 6 && strcmp (codeset_name, "UTF-8") == 0)
+#ifdef _LIBC
+ if (dfa->mb_cur_max == 6
+ && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
dfa->is_utf8 = 1;
dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
!= 0);
@@ -927,9 +947,9 @@ init_dfa (dfa, pat_len)
# endif
}
}
- }
- #endif
-
+ }
+#endif
+
if (BE (dfa->nodes == NULL || dfa->state_table == NULL
|| dfa->subexps == NULL, 0))
return REG_ESPACE;
@@ -1121,6 +1141,82 @@ optimize_utf8 (dfa)
}
#endif
+static bin_tree_t *
+optimize_subexps (so, node, sidx, depth)
+ struct subexp_optimize *so;
+ bin_tree_t *node;
+ int sidx, depth;
+{
+ int idx, new_depth, new_sidx;
+ bin_tree_t *ret;
+ if (node == NULL)
+ return NULL;
+
+ new_depth = 0;
+ new_sidx = sidx;
+ if ((depth & 1) && node->type == CONCAT
+ && node->right && node->right->type == 0
+ && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
+ {
+ new_depth = depth + 1;
+ if (new_depth == 2
+ || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
+ && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
+ new_sidx = so->nodes[idx].opr.idx;
+ }
+ node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
+ new_depth = (depth & 1) == 0 && node->type == CONCAT
+ && node->left && node->left->type == 0
+ && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
+ ? depth + 1 : 0;
+ node->right = optimize_subexps (so, node->right, sidx, new_depth);
+
+ if (node->type != CONCAT)
+ return node;
+ if ((depth & 1) == 0
+ && node->left
+ && node->left->type == 0
+ && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
+ ret = node->right;
+ else if ((depth & 1)
+ && node->right
+ && node->right->type == 0
+ && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
+ ret = node->left;
+ else
+ return node;
+
+ if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
+ && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
+ return node;
+
+ if (!so->no_sub)
+ {
+ int i;
+
+ if (depth < 2)
+ return node;
+
+ if (so->dfa->subexp_map == NULL)
+ {
+ so->dfa->subexp_map = re_malloc (int, so->re_nsub);
+ if (so->dfa->subexp_map == NULL)
+ return node;
+
+ for (i = 0; i < so->re_nsub; i++)
+ so->dfa->subexp_map[i] = i;
+ }
+
+ i = so->nodes[idx].opr.idx;
+ assert (sidx < i);
+ so->dfa->subexp_map[i] = sidx;
+ }
+
+ so->nodes[idx].type = OP_DELETED_SUBEXP;
+ ret->parent = node->parent;
+ return ret;
+}
+
/* Analyze the structure tree, and calculate "first", "next", "edest",
"eclosure", and "inveclosure". */
@@ -1525,6 +1621,8 @@ calc_inveclosure (dfa)
int src, idx, dest;
for (src = 0; src < dfa->nodes_len; ++src)
{
+ if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
+ continue;
for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
{
dest = dfa->eclosures[src].elems[idx];
@@ -1560,6 +1658,9 @@ calc_eclosure (dfa)
#ifdef DEBUG
assert (dfa->eclosures[node_idx].nelem != -1);
#endif
+ if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
+ continue;
+
/* If we have already calculated, skip it. */
if (dfa->eclosures[node_idx].nelem != 0)
continue;
diff --git a/lib/regex_.h b/lib/regex_.h
index 4279dbc..b2d9a62 100644
--- a/lib/regex_.h
+++ b/lib/regex_.h
@@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t;
immediately after an alternation or begin-group operator. */
#define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)
+/* If this bit is set, then no_sub will be set to 1 during
+ re_compile_pattern. */
+#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
+
/* This global variable defines the particular regexp syntax to use (for
some interfaces). When a regexp is compiled, the syntax used is
stored in the pattern buffer, so changing this does not affect
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index d6f44b6..bb1d73d 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -1641,7 +1641,6 @@ free_state (state)
re_free (state->entrance_nodes);
}
re_node_set_free (&state->nodes);
- re_free (state->word_trtable);
re_free (state->trtable);
re_free (state);
}
diff --git a/lib/regex_internal.h b/lib/regex_internal.h
index 3a93b5b..a778032 100644
--- a/lib/regex_internal.h
+++ b/lib/regex_internal.h
@@ -189,6 +189,7 @@ typedef enum
OP_DUP_PLUS = EPSILON_BIT | 4,
OP_DUP_QUESTION = EPSILON_BIT | 5,
ANCHOR = EPSILON_BIT | 6,
+ OP_DELETED_SUBEXP = EPSILON_BIT | 7,
/* Tree type, these are used only by tree. */
CONCAT = 16,
@@ -481,7 +482,7 @@ struct re_dfastate_t
unsigned int hash;
re_node_set nodes;
re_node_set *entrance_nodes;
- struct re_dfastate_t **trtable, **word_trtable;
+ struct re_dfastate_t **trtable;
unsigned int context : 4;
unsigned int halt : 1;
/* If this state can accept `multi byte'.
@@ -491,6 +492,7 @@ struct re_dfastate_t
/* If this state has backreference node(s). */
unsigned int has_backref : 1;
unsigned int has_constraint : 1;
+ unsigned int word_trtable : 1;
};
typedef struct re_dfastate_t re_dfastate_t;
@@ -549,7 +551,7 @@ struct re_backref_cache_entry
int subexp_to;
char more;
char unused;
- short eps_reachable_subexps_map;
+ unsigned short int eps_reachable_subexps_map;
};
typedef struct
@@ -643,6 +645,7 @@ struct re_dfa_t
int mb_cur_max;
bitset word_char;
reg_syntax_t syntax;
+ int *subexp_map;
#ifdef DEBUG
char* re_str;
#endif
diff --git a/lib/regexec.c b/lib/regexec.c
index cab3300..5877ade 100644
--- a/lib/regexec.c
+++ b/lib/regexec.c
@@ -175,8 +175,8 @@ static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
re_node_set *cur_nodes, int cur_str,
int subexp_num, int type) internal_function;
-static int build_trtable (re_dfa_t *dfa,
- re_dfastate_t *state) internal_function;
+static re_dfastate_t **build_trtable (re_dfa_t *dfa,
+ re_dfastate_t *state) internal_function;
#ifdef RE_ENABLE_I18N
static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
const re_string_t *input, int idx) internal_function;
@@ -684,11 +684,12 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
left_lim = (range < 0) ? start + range : start;
right_lim = (range < 0) ? start : start + range;
sb = dfa->mb_cur_max == 1;
- match_kind =
- (fastmap ? 8 : 0)
- | (sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
- | (range >= 0 ? 2 : 0)
- | (t != NULL ? 1 : 0);
+ match_kind =
+ (fastmap
+ ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
+ | (range >= 0 ? 2 : 0)
+ | (t != NULL ? 1 : 0))
+ : 8);
for (;; match_first += incr)
{
@@ -703,19 +704,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
save on code size. We use a switch statement for speed. */
switch (match_kind)
{
- case 0: case 1: case 2: case 3:
- case 4: case 5: case 6: case 7:
+ case 8:
/* No fastmap. */
break;
- case 15:
+ case 7:
/* Fastmap with single-byte translation, match forward. */
while (BE (match_first < right_lim, 1)
&& !fastmap[t[(unsigned char) string[match_first]]])
++match_first;
goto forward_match_found_start_or_reached_end;
- case 14:
+ case 6:
/* Fastmap without translation, match forward. */
while (BE (match_first < right_lim, 1)
&& !fastmap[(unsigned char) string[match_first]])
@@ -731,8 +731,8 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
}
break;
- case 12:
- case 13:
+ case 4:
+ case 5:
/* Fastmap without multi-byte translation, match backwards. */
while (match_first >= left_lim)
{
@@ -745,7 +745,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
if (match_first < left_lim)
goto free_return;
break;
-
+
default:
/* In this case, we can't determine easily the current byte,
since it might be a component byte of a multibyte
@@ -754,19 +754,20 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
{
/* If MATCH_FIRST is out of the valid range, reconstruct the
buffers. */
- if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len <= match_first
- || match_first < mctx.input.raw_mbs_idx)
+ unsigned int offset = match_first - mctx.input.raw_mbs_idx;
+ if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
{
err = re_string_reconstruct (&mctx.input, match_first,
eflags);
if (BE (err != REG_NOERROR, 0))
goto free_return;
+
+ offset = match_first - mctx.input.raw_mbs_idx;
}
/* If MATCH_FIRST is out of the buffer, leave it as '\0'.
Note that MATCH_FIRST must not be smaller than 0. */
- ch = ((match_first >= length) ? 0
- : re_string_byte_at (&mctx.input,
- match_first - mctx.input.raw_mbs_idx));
+ ch = (match_first >= length
+ ? 0 : re_string_byte_at (&mctx.input, offset));
if (fastmap[ch])
break;
match_first += incr;
@@ -881,6 +882,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
pmatch[reg_idx].rm_so += match_first;
pmatch[reg_idx].rm_eo += match_first;
}
+
+ if (dfa->subexp_map)
+ for (reg_idx = 0;
+ reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
+ reg_idx++)
+ if (dfa->subexp_map[reg_idx] != reg_idx)
+ {
+ pmatch[reg_idx + 1].rm_so
+ = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
+ pmatch[reg_idx + 1].rm_eo
+ = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
+ }
}
free_return:
@@ -2200,36 +2213,36 @@ transit_state (err, mctx, state)
re_match_context_t *mctx;
re_dfastate_t *state;
{
+ re_dfa_t *const dfa = mctx->dfa;
re_dfastate_t **trtable;
unsigned char ch;
#ifdef RE_ENABLE_I18N
- /* If the current state can accept multibyte. */
- if (BE (state->accept_mb, 0))
- {
- *err = transit_state_mb (mctx, state);
- if (BE (*err != REG_NOERROR, 0))
- return NULL;
- }
+ /* If the current state can accept multibyte. */
+ if (BE (state->accept_mb, 0))
+ {
+ *err = transit_state_mb (mctx, state);
+ if (BE (*err != REG_NOERROR, 0))
+ return NULL;
+ }
#endif /* RE_ENABLE_I18N */
/* Then decide the next state with the single byte. */
-#if 0
- if (0)
- /* don't use transition table */
- return transit_state_sb (err, mctx, state);
-#endif
-
- /* Use transition table */
- ch = re_string_fetch_byte (&mctx->input);
- for (;;)
+ if (1)
{
+ /* Use transition table */
+ ch = re_string_fetch_byte (&mctx->input);
trtable = state->trtable;
- if (BE (trtable != NULL, 1))
- return trtable[ch];
-
- trtable = state->word_trtable;
- if (BE (trtable != NULL, 1))
+ if (trtable == NULL)
+ {
+ trtable = build_trtable (dfa, state);
+ if (trtable == NULL)
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+ }
+ if (BE (state->word_trtable, 0))
{
unsigned int context;
context
@@ -2241,15 +2254,14 @@ transit_state (err, mctx, state)
else
return trtable[ch];
}
-
- if (!build_trtable (mctx->dfa, state))
- {
- *err = REG_ESPACE;
- return NULL;
- }
-
- /* Retry, we now have a transition table. */
+ else
+ return trtable[ch];
}
+#if 0
+ else
+ /* don't use transition table */
+ return transit_state_sb (err, mctx, state);
+#endif
}
/* Update the state_log if we need */
@@ -3253,15 +3265,15 @@ expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
}
/* Build transition table for the state.
- Return 1 if succeeded, otherwise return NULL. */
+ Return the new table if succeeded, otherwise return NULL. */
-static int
+static re_dfastate_t **
build_trtable (dfa, state)
re_dfa_t *dfa;
re_dfastate_t *state;
{
reg_errcode_t err;
- int i, j, ch, need_word_trtable = 0;
+ int i, j, ch;
unsigned int elem, mask;
int dests_node_malloced = 0, dest_states_malloced = 0;
int ndests; /* Number of the destination states from `state'. */
@@ -3285,13 +3297,13 @@ build_trtable (dfa, state)
dests_node = (re_node_set *)
malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
if (BE (dests_node == NULL, 0))
- return 0;
+ return NULL;
dests_node_malloced = 1;
}
dests_ch = (bitset *) (dests_node + SBC_MAX);
/* Initialize transiton table. */
- state->word_trtable = state->trtable = 0;
+ state->word_trtable = 0;
/* At first, group all nodes belonging to `state' into several
destinations. */
@@ -3300,14 +3312,14 @@ build_trtable (dfa, state)
{
if (dests_node_malloced)
free (dests_node);
- /* Return 0 in case of an error, 1 otherwise. */
+ /* Return NULL in case of an error, trtable otherwise. */
if (ndests == 0)
{
state->trtable = (re_dfastate_t **)
calloc (sizeof (re_dfastate_t *), SBC_MAX);;
- return 1;
+ return state->trtable;
}
- return 0;
+ return NULL;
}
err = re_node_set_alloc (&follows, ndests + 1);
@@ -3334,7 +3346,7 @@ out_free:
re_node_set_free (dests_node + i);
if (dests_node_malloced)
free (dests_node);
- return 0;
+ return NULL;
}
dest_states_malloced = 1;
}
@@ -3372,7 +3384,7 @@ out_free:
if (dest_states[i] != dest_states_word[i]
&& dfa->mb_cur_max > 1)
- need_word_trtable = 1;
+ state->word_trtable = 1;
dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
CONTEXT_NEWLINE);
@@ -3387,14 +3399,13 @@ out_free:
bitset_merge (acceptable, dests_ch[i]);
}
- if (!BE (need_word_trtable, 0))
+ if (!BE (state->word_trtable, 0))
{
/* We don't care about whether the following character is a word
character, or we are in a single-byte character set so we can
discern by looking at the character code: allocate a
256-entry transition table. */
- trtable = state->trtable =
- (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
+ trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
if (BE (trtable == NULL, 0))
goto out_free;
@@ -3424,8 +3435,8 @@ out_free:
by looking at the character code: build two 256-entry
transition tables, one starting at trtable[0] and one
starting at trtable[SBC_MAX]. */
- trtable = state->word_trtable =
- (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
+ trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
+ 2 * SBC_MAX);
if (BE (trtable == NULL, 0))
goto out_free;
@@ -3456,7 +3467,7 @@ out_free:
{
/* k-th destination accepts newline character. */
trtable[NEWLINE_CHAR] = dest_states_nl[j];
- if (need_word_trtable)
+ if (state->word_trtable)
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
/* There must be only one destination which accepts
newline. See group_nodes_into_DFAstates. */
@@ -3474,7 +3485,8 @@ out_free:
if (dests_node_malloced)
free (dests_node);
- return 1;
+ state->trtable = trtable;
+ return trtable;
}
/* Group all nodes belonging to STATE into several destinations.