diff options
author | Paolo Bonzini <bonzini@gnu.org> | 2004-11-22 15:49:28 +0000 |
---|---|---|
committer | Paolo Bonzini <bonzini@gnu.org> | 2008-01-09 16:10:54 +0100 |
commit | 611e77d579b58b2cf6da9249d6d850b6fdb8e6e8 (patch) | |
tree | 8d963d74f87a2f431040a108372f2f3610cb8bc5 /lib | |
parent | 7f53c1c16e7654427dc884e528f5e2d201c15fc5 (diff) | |
download | sed-611e77d579b58b2cf6da9249d6d850b6fdb8e6e8.tar.gz |
git-archimport-id: bonzini@gnu.org--2004b/sed--stable--4.1--patch-20
Diffstat (limited to 'lib')
-rw-r--r-- | lib/regcomp.c | 121 | ||||
-rw-r--r-- | lib/regex_.h | 4 | ||||
-rw-r--r-- | lib/regex_internal.c | 1 | ||||
-rw-r--r-- | lib/regex_internal.h | 7 | ||||
-rw-r--r-- | lib/regexec.c | 142 |
5 files changed, 197 insertions, 78 deletions
diff --git a/lib/regcomp.c b/lib/regcomp.c index 91958c9..dafad9b 100644 --- a/lib/regcomp.c +++ b/lib/regcomp.c @@ -33,6 +33,14 @@ static reg_errcode_t create_initial_state (re_dfa_t *dfa); #ifdef RE_ENABLE_I18N static void optimize_utf8 (re_dfa_t *dfa); #endif +struct subexp_optimize +{ + re_dfa_t *dfa; + re_token_t *nodes; + int no_sub, re_nsub; +}; +static bin_tree_t *optimize_subexps (struct subexp_optimize *so, + bin_tree_t *node, int sidx, int depth); static reg_errcode_t analyze (re_dfa_t *dfa); static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node); static void calc_first (re_dfa_t *dfa, bin_tree_t *node); @@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, bufp) /* And GNU code determines whether or not to get register information by passing null for the REGS argument to re_match, etc., not by - setting no_sub. */ - bufp->no_sub = 0; + setting no_sub, unless RE_NO_SUB is set. */ + bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); /* Match anchors at newline. */ bufp->newline_anchor = 1; @@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa) if (dfa->sb_char != utf8_sb_map) re_free (dfa->sb_char); #endif + re_free (dfa->subexp_map); #ifdef DEBUG re_free (dfa->re_str); #endif @@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, length, syntax) optimize_utf8 (dfa); #endif + if (preg->re_nsub > 0) + { + struct subexp_optimize so; + + so.dfa = dfa; + so.nodes = dfa->nodes; + so.no_sub = preg->no_sub; + so.re_nsub = preg->re_nsub; + dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0); + } + /* Analyze the tree and collect information which is necessary to create the dfa. */ err = analyze (dfa); @@ -842,9 +862,9 @@ init_dfa (dfa, pat_len) int pat_len; { int table_size; - #ifndef _LIBC +#ifndef _LIBC char *codeset_name; - #endif +#endif memset (dfa, '\0', sizeof (re_dfa_t)); @@ -868,9 +888,9 @@ init_dfa (dfa, pat_len) dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc); dfa->mb_cur_max = MB_CUR_MAX; -#if defined _LIBC - codeset_name = _NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME); - if (dfa->mb_cur_max == 6 && strcmp (codeset_name, "UTF-8") == 0) +#ifdef _LIBC + if (dfa->mb_cur_max == 6 + && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) dfa->is_utf8 = 1; dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) != 0); @@ -927,9 +947,9 @@ init_dfa (dfa, pat_len) # endif } } - } - #endif - + } +#endif + if (BE (dfa->nodes == NULL || dfa->state_table == NULL || dfa->subexps == NULL, 0)) return REG_ESPACE; @@ -1121,6 +1141,82 @@ optimize_utf8 (dfa) } #endif +static bin_tree_t * +optimize_subexps (so, node, sidx, depth) + struct subexp_optimize *so; + bin_tree_t *node; + int sidx, depth; +{ + int idx, new_depth, new_sidx; + bin_tree_t *ret; + if (node == NULL) + return NULL; + + new_depth = 0; + new_sidx = sidx; + if ((depth & 1) && node->type == CONCAT + && node->right && node->right->type == 0 + && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP) + { + new_depth = depth + 1; + if (new_depth == 2 + || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map) + && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))) + new_sidx = so->nodes[idx].opr.idx; + } + node->left = optimize_subexps (so, node->left, new_sidx, new_depth); + new_depth = (depth & 1) == 0 && node->type == CONCAT + && node->left && node->left->type == 0 + && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP + ? depth + 1 : 0; + node->right = optimize_subexps (so, node->right, sidx, new_depth); + + if (node->type != CONCAT) + return node; + if ((depth & 1) == 0 + && node->left + && node->left->type == 0 + && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP) + ret = node->right; + else if ((depth & 1) + && node->right + && node->right->type == 0 + && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP) + ret = node->left; + else + return node; + + if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map) + && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)) + return node; + + if (!so->no_sub) + { + int i; + + if (depth < 2) + return node; + + if (so->dfa->subexp_map == NULL) + { + so->dfa->subexp_map = re_malloc (int, so->re_nsub); + if (so->dfa->subexp_map == NULL) + return node; + + for (i = 0; i < so->re_nsub; i++) + so->dfa->subexp_map[i] = i; + } + + i = so->nodes[idx].opr.idx; + assert (sidx < i); + so->dfa->subexp_map[i] = sidx; + } + + so->nodes[idx].type = OP_DELETED_SUBEXP; + ret->parent = node->parent; + return ret; +} + /* Analyze the structure tree, and calculate "first", "next", "edest", "eclosure", and "inveclosure". */ @@ -1525,6 +1621,8 @@ calc_inveclosure (dfa) int src, idx, dest; for (src = 0; src < dfa->nodes_len; ++src) { + if (dfa->nodes[src].type == OP_DELETED_SUBEXP) + continue; for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx) { dest = dfa->eclosures[src].elems[idx]; @@ -1560,6 +1658,9 @@ calc_eclosure (dfa) #ifdef DEBUG assert (dfa->eclosures[node_idx].nelem != -1); #endif + if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP) + continue; + /* If we have already calculated, skip it. */ if (dfa->eclosures[node_idx].nelem != 0) continue; diff --git a/lib/regex_.h b/lib/regex_.h index 4279dbc..b2d9a62 100644 --- a/lib/regex_.h +++ b/lib/regex_.h @@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t; immediately after an alternation or begin-group operator. */ #define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1) +/* If this bit is set, then no_sub will be set to 1 during + re_compile_pattern. */ +#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1) + /* This global variable defines the particular regexp syntax to use (for some interfaces). When a regexp is compiled, the syntax used is stored in the pattern buffer, so changing this does not affect diff --git a/lib/regex_internal.c b/lib/regex_internal.c index d6f44b6..bb1d73d 100644 --- a/lib/regex_internal.c +++ b/lib/regex_internal.c @@ -1641,7 +1641,6 @@ free_state (state) re_free (state->entrance_nodes); } re_node_set_free (&state->nodes); - re_free (state->word_trtable); re_free (state->trtable); re_free (state); } diff --git a/lib/regex_internal.h b/lib/regex_internal.h index 3a93b5b..a778032 100644 --- a/lib/regex_internal.h +++ b/lib/regex_internal.h @@ -189,6 +189,7 @@ typedef enum OP_DUP_PLUS = EPSILON_BIT | 4, OP_DUP_QUESTION = EPSILON_BIT | 5, ANCHOR = EPSILON_BIT | 6, + OP_DELETED_SUBEXP = EPSILON_BIT | 7, /* Tree type, these are used only by tree. */ CONCAT = 16, @@ -481,7 +482,7 @@ struct re_dfastate_t unsigned int hash; re_node_set nodes; re_node_set *entrance_nodes; - struct re_dfastate_t **trtable, **word_trtable; + struct re_dfastate_t **trtable; unsigned int context : 4; unsigned int halt : 1; /* If this state can accept `multi byte'. @@ -491,6 +492,7 @@ struct re_dfastate_t /* If this state has backreference node(s). */ unsigned int has_backref : 1; unsigned int has_constraint : 1; + unsigned int word_trtable : 1; }; typedef struct re_dfastate_t re_dfastate_t; @@ -549,7 +551,7 @@ struct re_backref_cache_entry int subexp_to; char more; char unused; - short eps_reachable_subexps_map; + unsigned short int eps_reachable_subexps_map; }; typedef struct @@ -643,6 +645,7 @@ struct re_dfa_t int mb_cur_max; bitset word_char; reg_syntax_t syntax; + int *subexp_map; #ifdef DEBUG char* re_str; #endif diff --git a/lib/regexec.c b/lib/regexec.c index cab3300..5877ade 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -175,8 +175,8 @@ static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa, static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes, int cur_str, int subexp_num, int type) internal_function; -static int build_trtable (re_dfa_t *dfa, - re_dfastate_t *state) internal_function; +static re_dfastate_t **build_trtable (re_dfa_t *dfa, + re_dfastate_t *state) internal_function; #ifdef RE_ENABLE_I18N static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx, const re_string_t *input, int idx) internal_function; @@ -684,11 +684,12 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, left_lim = (range < 0) ? start + range : start; right_lim = (range < 0) ? start : start + range; sb = dfa->mb_cur_max == 1; - match_kind = - (fastmap ? 8 : 0) - | (sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) - | (range >= 0 ? 2 : 0) - | (t != NULL ? 1 : 0); + match_kind = + (fastmap + ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0) + | (range >= 0 ? 2 : 0) + | (t != NULL ? 1 : 0)) + : 8); for (;; match_first += incr) { @@ -703,19 +704,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, save on code size. We use a switch statement for speed. */ switch (match_kind) { - case 0: case 1: case 2: case 3: - case 4: case 5: case 6: case 7: + case 8: /* No fastmap. */ break; - case 15: + case 7: /* Fastmap with single-byte translation, match forward. */ while (BE (match_first < right_lim, 1) && !fastmap[t[(unsigned char) string[match_first]]]) ++match_first; goto forward_match_found_start_or_reached_end; - case 14: + case 6: /* Fastmap without translation, match forward. */ while (BE (match_first < right_lim, 1) && !fastmap[(unsigned char) string[match_first]]) @@ -731,8 +731,8 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, } break; - case 12: - case 13: + case 4: + case 5: /* Fastmap without multi-byte translation, match backwards. */ while (match_first >= left_lim) { @@ -745,7 +745,7 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, if (match_first < left_lim) goto free_return; break; - + default: /* In this case, we can't determine easily the current byte, since it might be a component byte of a multibyte @@ -754,19 +754,20 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, { /* If MATCH_FIRST is out of the valid range, reconstruct the buffers. */ - if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len <= match_first - || match_first < mctx.input.raw_mbs_idx) + unsigned int offset = match_first - mctx.input.raw_mbs_idx; + if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0)) { err = re_string_reconstruct (&mctx.input, match_first, eflags); if (BE (err != REG_NOERROR, 0)) goto free_return; + + offset = match_first - mctx.input.raw_mbs_idx; } /* If MATCH_FIRST is out of the buffer, leave it as '\0'. Note that MATCH_FIRST must not be smaller than 0. */ - ch = ((match_first >= length) ? 0 - : re_string_byte_at (&mctx.input, - match_first - mctx.input.raw_mbs_idx)); + ch = (match_first >= length + ? 0 : re_string_byte_at (&mctx.input, offset)); if (fastmap[ch]) break; match_first += incr; @@ -881,6 +882,18 @@ re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch, pmatch[reg_idx].rm_so += match_first; pmatch[reg_idx].rm_eo += match_first; } + + if (dfa->subexp_map) + for (reg_idx = 0; + reg_idx + 1 < nmatch && reg_idx < preg->re_nsub; + reg_idx++) + if (dfa->subexp_map[reg_idx] != reg_idx) + { + pmatch[reg_idx + 1].rm_so + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so; + pmatch[reg_idx + 1].rm_eo + = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo; + } } free_return: @@ -2200,36 +2213,36 @@ transit_state (err, mctx, state) re_match_context_t *mctx; re_dfastate_t *state; { + re_dfa_t *const dfa = mctx->dfa; re_dfastate_t **trtable; unsigned char ch; #ifdef RE_ENABLE_I18N - /* If the current state can accept multibyte. */ - if (BE (state->accept_mb, 0)) - { - *err = transit_state_mb (mctx, state); - if (BE (*err != REG_NOERROR, 0)) - return NULL; - } + /* If the current state can accept multibyte. */ + if (BE (state->accept_mb, 0)) + { + *err = transit_state_mb (mctx, state); + if (BE (*err != REG_NOERROR, 0)) + return NULL; + } #endif /* RE_ENABLE_I18N */ /* Then decide the next state with the single byte. */ -#if 0 - if (0) - /* don't use transition table */ - return transit_state_sb (err, mctx, state); -#endif - - /* Use transition table */ - ch = re_string_fetch_byte (&mctx->input); - for (;;) + if (1) { + /* Use transition table */ + ch = re_string_fetch_byte (&mctx->input); trtable = state->trtable; - if (BE (trtable != NULL, 1)) - return trtable[ch]; - - trtable = state->word_trtable; - if (BE (trtable != NULL, 1)) + if (trtable == NULL) + { + trtable = build_trtable (dfa, state); + if (trtable == NULL) + { + *err = REG_ESPACE; + return NULL; + } + } + if (BE (state->word_trtable, 0)) { unsigned int context; context @@ -2241,15 +2254,14 @@ transit_state (err, mctx, state) else return trtable[ch]; } - - if (!build_trtable (mctx->dfa, state)) - { - *err = REG_ESPACE; - return NULL; - } - - /* Retry, we now have a transition table. */ + else + return trtable[ch]; } +#if 0 + else + /* don't use transition table */ + return transit_state_sb (err, mctx, state); +#endif } /* Update the state_log if we need */ @@ -3253,15 +3265,15 @@ expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num, } /* Build transition table for the state. - Return 1 if succeeded, otherwise return NULL. */ + Return the new table if succeeded, otherwise return NULL. */ -static int +static re_dfastate_t ** build_trtable (dfa, state) re_dfa_t *dfa; re_dfastate_t *state; { reg_errcode_t err; - int i, j, ch, need_word_trtable = 0; + int i, j, ch; unsigned int elem, mask; int dests_node_malloced = 0, dest_states_malloced = 0; int ndests; /* Number of the destination states from `state'. */ @@ -3285,13 +3297,13 @@ build_trtable (dfa, state) dests_node = (re_node_set *) malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX); if (BE (dests_node == NULL, 0)) - return 0; + return NULL; dests_node_malloced = 1; } dests_ch = (bitset *) (dests_node + SBC_MAX); /* Initialize transiton table. */ - state->word_trtable = state->trtable = 0; + state->word_trtable = 0; /* At first, group all nodes belonging to `state' into several destinations. */ @@ -3300,14 +3312,14 @@ build_trtable (dfa, state) { if (dests_node_malloced) free (dests_node); - /* Return 0 in case of an error, 1 otherwise. */ + /* Return NULL in case of an error, trtable otherwise. */ if (ndests == 0) { state->trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);; - return 1; + return state->trtable; } - return 0; + return NULL; } err = re_node_set_alloc (&follows, ndests + 1); @@ -3334,7 +3346,7 @@ out_free: re_node_set_free (dests_node + i); if (dests_node_malloced) free (dests_node); - return 0; + return NULL; } dest_states_malloced = 1; } @@ -3372,7 +3384,7 @@ out_free: if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1) - need_word_trtable = 1; + state->word_trtable = 1; dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows, CONTEXT_NEWLINE); @@ -3387,14 +3399,13 @@ out_free: bitset_merge (acceptable, dests_ch[i]); } - if (!BE (need_word_trtable, 0)) + if (!BE (state->word_trtable, 0)) { /* We don't care about whether the following character is a word character, or we are in a single-byte character set so we can discern by looking at the character code: allocate a 256-entry transition table. */ - trtable = state->trtable = - (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); + trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX); if (BE (trtable == NULL, 0)) goto out_free; @@ -3424,8 +3435,8 @@ out_free: by looking at the character code: build two 256-entry transition tables, one starting at trtable[0] and one starting at trtable[SBC_MAX]. */ - trtable = state->word_trtable = - (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX); + trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), + 2 * SBC_MAX); if (BE (trtable == NULL, 0)) goto out_free; @@ -3456,7 +3467,7 @@ out_free: { /* k-th destination accepts newline character. */ trtable[NEWLINE_CHAR] = dest_states_nl[j]; - if (need_word_trtable) + if (state->word_trtable) trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j]; /* There must be only one destination which accepts newline. See group_nodes_into_DFAstates. */ @@ -3474,7 +3485,8 @@ out_free: if (dests_node_malloced) free (dests_node); - return 1; + state->trtable = trtable; + return trtable; } /* Group all nodes belonging to STATE into several destinations. |