diff options
author | Norihiro Tanaka <noritnk@kcn.ne.jp> | 2014-10-25 01:46:01 +0900 |
---|---|---|
committer | Jim Meyering <meyering@fb.com> | 2014-10-28 13:05:27 -0700 |
commit | f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87 (patch) | |
tree | 297b2308c20f28542c98535650a85d70fab812a7 /src | |
parent | 1519c4e5e4bf68ec348bfe4261f78768710aa985 (diff) | |
download | grep-f66dafc2181bf997f8e7192ad49d3d6ec9dc2b87.tar.gz |
dfa: make \w and \W work in multibyte locales
Reported by Jaroslav Skarvada in: http://bugs.gnu.org/18817
Now, \w and \W are supported not only in single-byte locales but also in
multibyte locales.
* src/dfa.c (PUSH_LEX_STATE, POP_LEX_STATE): Move definitions "up",
so they are not within the function.
(lex): Make \w and \W work in a multibyte locale, the same way
we made \s and \S work.
* tests/word-multibyte: New test for this change.
* tests/Makefile.am: Add a rule to build the new test.
* NEWS (Bug fixes): Mention it.
Diffstat (limited to 'src')
-rw-r--r-- | src/dfa.c | 61 |
1 files changed, 40 insertions, 21 deletions
@@ -1249,6 +1249,20 @@ parse_bracket_exp (void) return CSET + charclass_index (ccl); } +#define PUSH_LEX_STATE(s) \ + do \ + { \ + char const *lexptr_saved = lexptr; \ + size_t lexleft_saved = lexleft; \ + lexptr = (s); \ + lexleft = strlen (lexptr) + +#define POP_LEX_STATE() \ + lexptr = lexptr_saved; \ + lexleft = lexleft_saved; \ + } \ + while (0) + static token lex (void) { @@ -1496,20 +1510,6 @@ lex (void) return lasttok = CSET + charclass_index (ccl); } -#define PUSH_LEX_STATE(s) \ - do \ - { \ - char const *lexptr_saved = lexptr; \ - size_t lexleft_saved = lexleft; \ - lexptr = (s); \ - lexleft = strlen (lexptr) - -#define POP_LEX_STATE() \ - lexptr = lexptr_saved; \ - lexleft = lexleft_saved; \ - } \ - while (0) - /* FIXME: see if optimizing this, as is done with ANYCHAR and add_utf8_anychar, makes sense. */ @@ -1529,14 +1529,33 @@ lex (void) case 'W': if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - zeroset (ccl); - for (c2 = 0; c2 < NOTCHAR; ++c2) - if (IS_WORD_CONSTITUENT (c2)) - setbit (c2, ccl); - if (c == 'W') - notset (ccl); + + if (!dfa->multibyte) + { + zeroset (ccl); + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (IS_WORD_CONSTITUENT (c2)) + setbit (c2, ccl); + if (c == 'W') + notset (ccl); + laststart = false; + return lasttok = CSET + charclass_index (ccl); + } + + /* FIXME: see if optimizing this, as is done with ANYCHAR and + add_utf8_anychar, makes sense. */ + + /* \w and \W are documented to be equivalent to [_[:alnum:]] and + [^_[:alnum:]] respectively, so tell the lexer to process those + strings, each minus its "already processed" '['. */ + PUSH_LEX_STATE (c == 'w' ? "_[:alnum:]]" : "^_[:alnum:]]"); + + lasttok = parse_bracket_exp (); + + POP_LEX_STATE (); + laststart = false; - return lasttok = CSET + charclass_index (ccl); + return lasttok; case '[': if (backslash) |