diff options
author | Alain Magloire <alainm@rcsm.ee.mcgill.ca> | 2001-03-07 04:11:26 +0000 |
---|---|---|
committer | Alain Magloire <alainm@rcsm.ee.mcgill.ca> | 2001-03-07 04:11:26 +0000 |
commit | 519059dbf2c9dfe4e60621f84a6c4490a941b4e7 (patch) | |
tree | 1ff381805f8f229579a9818966915c81e8c18a2f /src/dfa.h | |
parent | 2b9c2eb1c5f396c5ba1727278caa6ba04b2f415f (diff) | |
download | grep-519059dbf2c9dfe4e60621f84a6c4490a941b4e7.tar.gz |
Fix the search bugs
Only the Regex patterns should be split in an array, patterns[].
The dfa and KWset compiled patterns should remain global and the
patterns compiled all at once.
* src/search.c : include "error.h" and "xalloc.h" to get prototyping
of x*alloc() and error().
(kwsinit) : Revert to the previous behaviour; takes no argument.
(kwsmusts) : Likewise.
(Gcompile) : For the regex patterns, split them so that each pattern
is put in its own compiled structure in patterns[]. The patterns
are given to dfacomp() and kwsmusts() as is.
(Ecompile) : Likewise.
(Fcompile) : Revert to the old behaviour of compiling the entire
set of patterns in one shot.
(EGexecute) : If falling back to GNU regex for the matching, loop over the
array of compiled patterns[] to find a match.
(error) : Many error () calls had their arguments in the wrong order.
* tests/file.sh : Simple test to check for pattern in files.
In response to a bug report filed by Greg Louis <glouis@dynamicro.on.ca>
In multibyte environments, handle multibyte characters as single
characters in bracket expressions.
* src/dfa.h (mb_char_classes) : new structure.
(mbcsets) : new variable.
(nmbcsets) : new variable.
(mbcsets_alloc) : new variable.
* src/dfa.c (prtok) : handle MBCSET.
(fetch_wc) : new function to fetch a wide character.
(parse_bracket_exp_mb) : new function to handle multibyte characters
in lex().
(lex) : invoke parse_bracket_exp_mb() for multibyte bracket expression.
(atom) : handle MBCSET.
(epsclosure) : likewise.
(dfaanalyze) : likewise.
(dfastate) : likewise.
(match_mb_charset) : new function to judge whether a bracket expression
matches a multibyte character.
(check_matching_with_multibyte_ops) : handle MBCSET.
(dfainit) : initialize new variables.
(dfafree) : free new variables.
Diffstat (limited to 'src/dfa.h')
-rw-r--r-- | src/dfa.h | 30 |
1 files changed, 30 insertions, 0 deletions
@@ -152,6 +152,9 @@ typedef enum ANYCHAR, /* ANYCHAR is a terminal symbol that matches any multibyte(or singlebyte) characters. It is used only if MB_CUR_MAX > 1. */ + + MBCSET, /* MBCSET is similar to CSET, but for + multibyte characters. */ #endif /* MBS_SUPPORT */ CSET /* CSET and (and any value greater) is a @@ -258,6 +261,26 @@ struct dfamust struct dfamust *next; }; +#ifdef MBS_SUPPORT +/* A bracket operator. + e.g. [a-c], [[:alpha:]], etc. */ +struct mb_char_classes +{ + int invert; + wchar_t *chars; /* Normal characters. */ + int nchars; + wctype_t *ch_classes; /* Character classes. */ + int nch_classes; + wchar_t *range_sts; /* Range characters (start of the range). */ + wchar_t *range_ends; /* Range characters (end of the range). */ + int nranges; + char **equivs; /* Equivalent classes. */ + int nequivs; + char **coll_elems; + int ncoll_elems; /* Collating elements. */ +}; +#endif + /* A compiled regular expression. */ struct dfa { @@ -286,6 +309,8 @@ struct dfa a multibyte character. bit 0 : tokens[i] is a singlebyte character, or the 1st-byte of a multibyte character. + if tokens[i] = MBCSET + ("the index of mbcsets correspnd to this operator" << 2) + 3 e.g. tokens @@ -294,6 +319,11 @@ struct dfa multibyte_prop = 3 , 1 , 0 , 2 , 3 */ + + /* Array of the bracket expressoin in the DFA. */ + struct mb_char_classes *mbcsets; + int nmbcsets; + int mbcsets_alloc; #endif /* Stuff owned by the state builder. */ |