diff options
-rw-r--r-- | NEWS | 5 |
-rw-r--r-- | THANKS | 1 |
-rw-r--r-- | src/dfasearch.c | 13 |
-rw-r--r-- | src/kwsearch.c | 14 |
-rw-r--r-- | src/search.h | 21 |
-rw-r--r-- | src/searchutils.c | 25 |
-rw-r--r-- | tests/Makefile.am | 1 |
7 files changed, 68 insertions, 12 deletions
@@ -4,6 +4,11 @@ GNU grep NEWS -*- outline -*- ** Bug fixes + grep -i, in a multi-byte locale, when matching a line containing a character + like the UTF-8 Turkish I-with-dot (U+0130) (whose lower-case representation + occupies fewer bytes), would print an incomplete output line. + [bug introduced in grep-2.6] + --include and --exclude can again be combined, and again apply to the command line, e.g., "grep --include='*.[ch]' --exclude='system.h' PATTERN *" again reads all *.c and *.h files except for system.h. @@ -84,6 +84,7 @@ Shannon Hill <hill@synnet.com> Sotiris Vassilopoulos <Sotiris.Vassilopoulos@betatech.gr> Standish Parsley <adsspamtrap01@yahoo.com> Stewart Levin <stew@sep.stanford.edu> +Strahinja Kustudic <kustodian@gmail.com> Sven Joachim <svenjoac@gmx.de> Sydoruk Stepan <step@unitex.kiev.ua> Tapani Tarvainen <tt@mit.jyu.fi> diff --git a/src/dfasearch.c b/src/dfasearch.c index bd09aa67..a48333d2 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -78,8 +78,9 @@ static char const * kwsincr_case (const char *must) { size_t n = strlen (must); + unsigned char *map = NULL; const char *buf = (match_icase && MB_CUR_MAX > 1 - ? mbtolower (must, &n) + ? mbtolower (must, &n, &map) : must); return kwsincr (kwset, buf, n); } @@ -217,13 +218,15 @@ EGexecute (char const *buf, size_t size, size_t *match_size, ptrdiff_t len, best_len; struct kwsmatch kwsm; size_t i, ret_val; + unsigned char *map = NULL; + if (MB_CUR_MAX > 1) { if (match_icase) { /* mbtolower adds a NUL byte at the end. That will provide space for the sentinel byte dfaexec may add. 
*/ - char *case_buf = mbtolower (buf, &size); + char *case_buf = mbtolower (buf, &size, &map); if (start_ptr) start_ptr = case_buf + (start_ptr - buf); buf = case_buf; @@ -408,9 +411,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size, success: len = end - beg; - success_in_len: + success_in_len:; + size_t off = beg - buf; + mb_case_map_apply (map, &off, &len); *match_size = len; - ret_val = beg - buf; + ret_val = off; out: return ret_val; } diff --git a/src/kwsearch.c b/src/kwsearch.c index f1a802e7..d0bb201f 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -34,8 +34,9 @@ Fcompile (char const *pattern, size_t size) { char const *err; size_t psize = size; + unsigned char *map = NULL; char const *pat = (match_icase && MB_CUR_MAX > 1 - ? mbtolower (pattern, &psize) + ? mbtolower (pattern, &psize, &map) : pattern); kwsinit (&kwset); @@ -83,11 +84,13 @@ Fexecute (char const *buf, size_t size, size_t *match_size, char eol = eolbyte; struct kwsmatch kwsmatch; size_t ret_val; + unsigned char *map = NULL; + if (MB_CUR_MAX > 1) { if (match_icase) { - char *case_buf = mbtolower (buf, &size); + char *case_buf = mbtolower (buf, &size, &map); if (start_ptr) start_ptr = case_buf + (start_ptr - buf); buf = case_buf; @@ -162,9 +165,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size, while (buf < beg && beg[-1] != eol) --beg; len = end - beg; - success_in_beg_and_len: + success_in_beg_and_len:; + size_t off = beg - buf; + mb_case_map_apply (map, &off, &len); + *match_size = len; - ret_val = beg - buf; + ret_val = off; out: return ret_val; } diff --git a/src/search.h b/src/search.h index 3074407e..529e7e23 100644 --- a/src/search.h +++ b/src/search.h @@ -38,7 +38,7 @@ /* searchutils.c */ extern void kwsinit (kwset_t *); -extern char *mbtolower (const char *, size_t *); +extern char *mbtolower (const char *, size_t *, unsigned char **); extern bool is_mb_middle (const char **, const char *, const char *, size_t); /* dfasearch.c */ @@ -53,4 +53,23 @@ 
extern size_t Fexecute (char const *, size_t, size_t *, char const *); extern void Pcompile (char const *, size_t); extern size_t Pexecute (char const *, size_t, size_t *, char const *); +/* Apply a non-NULL MAP from mbtolower to the lowercase-buffer-relative + *OFF and *LEN, converting them to be relative to the original buffer. */ +static inline void +mb_case_map_apply (unsigned char const *map, size_t *off, size_t *len) +{ + if (map) + { + size_t off_incr = 0; + size_t len_incr = 0; + size_t k; + for (k = 0; k < *off; k++) + off_incr += map[k]; + for (k = *off; k < *off + *len; k++) + len_incr += map[k]; + *off += off_incr; + *len += len_incr; + } +} + #endif /* GREP_SEARCH_H */ diff --git a/src/searchutils.c b/src/searchutils.c index b787fe67..4942c516 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -53,25 +53,38 @@ kwsinit (kwset_t *kwset) Note that while this function returns a pointer to malloc'd storage, the caller must not free it, since this function retains a pointer to the buffer and reuses it on any subsequent call. As a consequence, - this function is not thread-safe. */ + this function is not thread-safe. + + When the lowercase result string has the same length as the input string, + set *LEN_MAP_P to NULL. Otherwise, set it to a malloc'd buffer (like the + returned buffer, this must not be freed by caller) of the same length as + the result string. (*LEN_MAP_P)[J] is one less than the length-in-bytes + of the character in BEG that formed byte J of the result. This map is + used by the caller to convert offset,length pairs that reference the + lowercase result to numbers that refer to the corresponding parts of + the original buffer. 
*/ char * -mbtolower (const char *beg, size_t *n) +mbtolower (const char *beg, size_t *n, unsigned char **len_map_p) { static char *out; + static unsigned char *len_map; static size_t outalloc; size_t outlen, mb_cur_max; mbstate_t is, os; const char *end; char *p; + unsigned char *m; if (*n > outalloc || outalloc == 0) { outalloc = MAX(1, *n); out = xrealloc (out, outalloc); + len_map = xrealloc (len_map, outalloc); } /* appease clang-2.6 */ assert (out); + assert (len_map); if (*n == 0) return out; @@ -81,6 +94,7 @@ mbtolower (const char *beg, size_t *n) mb_cur_max = MB_CUR_MAX; p = out; + m = len_map; outlen = 0; while (beg < end) { @@ -88,14 +102,18 @@ mbtolower (const char *beg, size_t *n) size_t mbclen = mbrtowc(&wc, beg, end - beg, &is); if (outlen + mb_cur_max >= outalloc) { + size_t dm = m - len_map; out = x2nrealloc (out, &outalloc, 1); + len_map = xrealloc (len_map, outalloc); p = out + outlen; + m = len_map + dm; } if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) { /* An invalid sequence, or a truncated multi-octet character. We treat it as a single-octet character. */ + *m++ = 0; *p++ = *beg++; outlen++; memset (&is, 0, sizeof (is)); @@ -103,6 +121,7 @@ mbtolower (const char *beg, size_t *n) } else { + *m++ = mbclen - 1; beg += mbclen; mbclen = wcrtomb (p, towlower ((wint_t) wc), &os); p += mbclen; @@ -110,6 +129,8 @@ mbtolower (const char *beg, size_t *n) } } + /* If the new length differs from the original, give caller the map. */ + *len_map_p = p - out == *n ? NULL : len_map; *n = p - out; *p = 0; return out; diff --git a/tests/Makefile.am b/tests/Makefile.am index 7be788c2..167e3186 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -32,7 +32,6 @@ XFAIL_TESTS = \ if USE_INCLUDED_REGEX XFAIL_TESTS += equiv-classes endif -XFAIL_TESTS += turkish-I TESTS = \ backref \ |