From 9234beafca50945843fe41697000d4bcc7d5b0ba Mon Sep 17 00:00:00 2001
From: "Arnold D. Robbins"
Date: Mon, 10 Mar 2014 22:15:11 +0200
Subject: Sync dfa with grep.

---
 ChangeLog |   4 ++
 dfa.c     | 142 +++++++++++++++++++++++++++++++++++++++-----------------------
 dfa.h     |   8 ++++
 3 files changed, 101 insertions(+), 53 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 31d1d616..bdd31107 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2014-03-10 Arnold D. Robbins
+
+	* dfa.h, dfa.c: Sync with grep. Yet again.
+
 2014-03-08 Andrew J. Schorr
 
 	* gawkapi.c (api_impl): Add memory allocation function pointers.
diff --git a/dfa.c b/dfa.c
index 3dd3c209..8771bbee 100644
--- a/dfa.c
+++ b/dfa.c
@@ -45,6 +45,11 @@
 
 #include "dfa.h"
 
+/* Gawk doesn't use Gnulib, so don't assume static_assert is present. */
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+    extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
 
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
@@ -748,34 +753,16 @@ setbit_wc (wint_t wc, charclass c)
 #endif
 }
 
-/* Set a bit for B in the charclass C, if B is a valid single byte
-   character in the current character set. If case is folded, set B's
-   lower and upper case variants similarly. If MB_CUR_MAX > 1, the
-   resulting charset is used only as an optimization, and the caller
-   should set the appropriate field of struct mb_char_classes. */
+/* Set a bit for B and its case variants in the charclass C.
+   MB_CUR_MAX must be 1. */
 static void
 setbit_case_fold_c (int b, charclass c)
 {
-  if (MB_CUR_MAX > 1)
-    {
-      wint_t wc = btowc (b);
-      if (wc == WEOF)
-        return;
-      if (case_fold)
-        {
-          setbit_wc (towlower (wc), c);
-          setbit_wc (towupper (wc), c);
-        }
-    }
-  else
-    {
-      if (case_fold)
-        {
-          setbit (tolower (b), c);
-          setbit (toupper (b), c);
-        }
-    }
-  setbit (b, c);
+  int ub = toupper (b);
+  int i;
+  for (i = 0; i < NOTCHAR; i++)
+    if (toupper (i) == ub)
+      setbit (i, c);
 }
 
 
@@ -940,6 +927,50 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003). */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
+                == CASE_FOLDED_BUFSIZE),
+               "CASE_FOLDED_BUFSIZE is wrong");
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED. Return the number of characters
+   stored. */
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
+
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1100,7 +1131,7 @@ parse_bracket_exp (void)
 
                   for (c2 = 0; c2 < NOTCHAR; ++c2)
                     if (pred->func (c2))
-                      setbit_case_fold_c (c2, ccl);
+                      setbit (c2, ccl);
                 }
               else
                 known_bracket_exp = false;
@@ -1167,8 +1198,21 @@ parse_bracket_exp (void)
                     }
                 }
               else if (using_simple_locale ())
-                for (; c <= c2; c++)
-                  setbit_case_fold_c (c, ccl);
+                {
+                  for (c1 = c; c1 <= c2; c1++)
+                    setbit (c1, ccl);
+                  if (case_fold)
+                    {
+                      int uc = toupper (c);
+                      int uc2 = toupper (c2);
+                      for (c1 = 0; c1 < NOTCHAR; c1++)
+                        {
+                          int uc1 = toupper (c1);
+                          if (uc <= uc1 && uc1 <= uc2)
+                            setbit (c1, ccl);
+                        }
+                    }
+                }
               else
                 known_bracket_exp = false;
 
@@ -1187,26 +1231,22 @@ parse_bracket_exp (void)
 
       if (MB_CUR_MAX == 1)
         {
-          setbit_case_fold_c (c, ccl);
+          if (case_fold)
+            setbit_case_fold_c (c, ccl);
+          else
+            setbit (c, ccl);
           continue;
         }
 
       if (case_fold)
         {
-          wint_t folded = towlower (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
-            {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = folded;
-            }
-          folded = towupper (wc);
-          if (folded != wc && !setbit_wc (folded, ccl))
-            {
-              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
-                                    work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = folded;
-            }
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wc, folded);
+          REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                work_mbc->nchars + n);
+          for (i = 0; i < n; i++)
+            if (!setbit_wc (folded[i], ccl))
+              work_mbc->chars[work_mbc->nchars++] = folded[i];
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1552,7 +1592,7 @@ lex (void)
           if (MB_CUR_MAX > 1)
             return lasttok = WCHAR;
 
-          if (case_fold && (tolower (c) != c || toupper (c) != c))
+          if (case_fold && isalpha (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
@@ -1799,18 +1839,14 @@ atom (void)
   if (MBS_SUPPORT && tok == WCHAR)
     {
       addtok_wc (wctok);
+
       if (case_fold)
         {
-          wint_t folded = towlower (wctok);
-          if (folded != wctok)
-            {
-              addtok_wc (folded);
-              addtok (OR);
-            }
-          folded = towupper (wctok);
-          if (folded != wctok)
+          wchar_t folded[CASE_FOLDED_BUFSIZE];
+          int i, n = case_folded_counterparts (wctok, folded);
+          for (i = 0; i < n; i++)
             {
-              addtok_wc (folded);
+              addtok_wc (folded[i]);
               addtok (OR);
             }
         }
diff --git a/dfa.h b/dfa.h
index 7e0674fc..24fbcbe7 100644
--- a/dfa.h
+++ b/dfa.h
@@ -101,3 +101,11 @@ extern void dfawarn (const char *);
 extern _Noreturn void dfaerror (const char *);
 
 extern int using_utf8 (void);
+
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself. This is 1 for towupper, 1 for towlower, and 1 for each
+   entry in LONESOME_LOWER; see dfa.c. */
+enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
-- 
cgit v1.2.1