summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJim Meyering <meyering@redhat.com>2012-06-01 21:18:00 +0200
committerJim Meyering <meyering@redhat.com>2012-06-02 11:06:08 +0200
commit7aa698d36b5b2eeb8e90e7a327eb7ebe46d59e87 (patch)
treecadef59a85a9ffe401994e8addeb1a995394c279 /src
parent2665746b756bd372ba856e165388dc98032362fd (diff)
downloadgrep-7aa698d36b5b2eeb8e90e7a327eb7ebe46d59e87.tar.gz
grep: fix how -i works with a match containing the Turkish I-with-dot
Fix a long-standing problem in the way grep's -i interacts with data whose byte count changes when we convert it to lower case. For example, the UTF-8 Turkish I-with-dot (İ) occupies two bytes, but its lower case analog, i, occupies just one byte. The code converts both search string and the haystack data to lower case, and then searches for the modified string in the modified buffer. The trouble arose when using a lowercase buffer <offset,length> pair to manipulate the original (longer) buffer. The solution is to change mbtolower to return additional information: a malloc'd mapping vector. With that, the caller maps the lowercase-relative <offset,length> to numbers that refer to the original buffer. This mapping is used only when lengths actually differ, so the cost in general should be small. * src/searchutils.c (mbtolower): Add the new map parameter. * src/search.h (mb_case_map_apply): New function. * src/kwsearch.c (Fexecute): Update mbtolower caller, and upon success, apply the new map. * src/dfasearch.c (EGexecute): Likewise. * tests/Makefile.am (XFAIL_TESTS): Remove turkish-I from this list; that test is no longer expected to fail. * NEWS (Bug fixes): Mention it. Reported by Ilya Basin in http://thread.gmane.org/gmane.comp.gnu.grep.bugs/3413 and later by Strahinja Kustudic in http://savannah.gnu.org/bugs/?36567
Diffstat (limited to 'src')
-rw-r--r--src/dfasearch.c13
-rw-r--r--src/kwsearch.c14
-rw-r--r--src/search.h21
-rw-r--r--src/searchutils.c25
4 files changed, 62 insertions, 11 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c
index bd09aa67..a48333d2 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -78,8 +78,9 @@ static char const *
kwsincr_case (const char *must)
{
size_t n = strlen (must);
+ unsigned char *map = NULL;
const char *buf = (match_icase && MB_CUR_MAX > 1
- ? mbtolower (must, &n)
+ ? mbtolower (must, &n, &map)
: must);
return kwsincr (kwset, buf, n);
}
@@ -217,13 +218,15 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
ptrdiff_t len, best_len;
struct kwsmatch kwsm;
size_t i, ret_val;
+ unsigned char *map = NULL;
+
if (MB_CUR_MAX > 1)
{
if (match_icase)
{
/* mbtolower adds a NUL byte at the end. That will provide
space for the sentinel byte dfaexec may add. */
- char *case_buf = mbtolower (buf, &size);
+ char *case_buf = mbtolower (buf, &size, &map);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
@@ -408,9 +411,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
success:
len = end - beg;
- success_in_len:
+ success_in_len:;
+ size_t off = beg - buf;
+ mb_case_map_apply (map, &off, &len);
*match_size = len;
- ret_val = beg - buf;
+ ret_val = off;
out:
return ret_val;
}
diff --git a/src/kwsearch.c b/src/kwsearch.c
index f1a802e7..d0bb201f 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -34,8 +34,9 @@ Fcompile (char const *pattern, size_t size)
{
char const *err;
size_t psize = size;
+ unsigned char *map = NULL;
char const *pat = (match_icase && MB_CUR_MAX > 1
- ? mbtolower (pattern, &psize)
+ ? mbtolower (pattern, &psize, &map)
: pattern);
kwsinit (&kwset);
@@ -83,11 +84,13 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
char eol = eolbyte;
struct kwsmatch kwsmatch;
size_t ret_val;
+ unsigned char *map = NULL;
+
if (MB_CUR_MAX > 1)
{
if (match_icase)
{
- char *case_buf = mbtolower (buf, &size);
+ char *case_buf = mbtolower (buf, &size, &map);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
@@ -162,9 +165,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
while (buf < beg && beg[-1] != eol)
--beg;
len = end - beg;
- success_in_beg_and_len:
+ success_in_beg_and_len:;
+ size_t off = beg - buf;
+ mb_case_map_apply (map, &off, &len);
+
*match_size = len;
- ret_val = beg - buf;
+ ret_val = off;
out:
return ret_val;
}
diff --git a/src/search.h b/src/search.h
index 3074407e..529e7e23 100644
--- a/src/search.h
+++ b/src/search.h
@@ -38,7 +38,7 @@
/* searchutils.c */
extern void kwsinit (kwset_t *);
-extern char *mbtolower (const char *, size_t *);
+extern char *mbtolower (const char *, size_t *, unsigned char **);
extern bool is_mb_middle (const char **, const char *, const char *, size_t);
/* dfasearch.c */
@@ -53,4 +53,23 @@ extern size_t Fexecute (char const *, size_t, size_t *, char const *);
extern void Pcompile (char const *, size_t);
extern size_t Pexecute (char const *, size_t, size_t *, char const *);
+/* Apply a non-NULL MAP from mbtolower to the lowercase-buffer-relative
+ *OFF and *LEN, converting them to be relative to the original buffer.
+ When MAP is NULL, *OFF and *LEN are left unchanged.  Otherwise *OFF is
+ increased by the sum of MAP[0 .. *OFF - 1] and *LEN is increased by the
+ sum of MAP[*OFF .. *OFF + *LEN - 1]; both sums are computed from the
+ incoming values of *OFF and *LEN before either one is updated.
+ NOTE(review): MAP is indexed up to *OFF + *LEN - 1 with no bounds check,
+ so it presumably must have at least *OFF + *LEN entries -- confirm
+ against the callers.  */
+static inline void
+mb_case_map_apply (unsigned char const *map, size_t *off, size_t *len)
+{
+ if (map)
+ {
+ size_t off_incr = 0; /* adjustment accumulated for *OFF */
+ size_t len_incr = 0; /* adjustment accumulated for *LEN */
+ size_t k;
+ for (k = 0; k < *off; k++) /* map entries preceding the match */
+ off_incr += map[k];
+ for (k = *off; k < *off + *len; k++) /* map entries covering the match */
+ len_incr += map[k];
+ *off += off_incr; /* updated only after both loops have read *off */
+ *len += len_incr;
+ }
+}
+
#endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
index b787fe67..4942c516 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -53,25 +53,38 @@ kwsinit (kwset_t *kwset)
Note that while this function returns a pointer to malloc'd storage,
the caller must not free it, since this function retains a pointer
to the buffer and reuses it on any subsequent call. As a consequence,
- this function is not thread-safe. */
+ this function is not thread-safe.
+
+ When the lowercase result string has the same length as the input string,
+ set *LEN_MAP_P to NULL. Otherwise, set it to a malloc'd buffer (like the
+ returned buffer, this must not be freed by caller) of the same length as
+ the result string. (*LEN_MAP_P)[J] is one less than the length-in-bytes
+ of the character in BEG that formed byte J of the result. This map is
+ used by the caller to convert offset,length pairs that reference the
+ lowercase result to numbers that refer to the corresponding parts of
+ the original buffer. */
char *
-mbtolower (const char *beg, size_t *n)
+mbtolower (const char *beg, size_t *n, unsigned char **len_map_p)
{
static char *out;
+ static unsigned char *len_map;
static size_t outalloc;
size_t outlen, mb_cur_max;
mbstate_t is, os;
const char *end;
char *p;
+ unsigned char *m;
if (*n > outalloc || outalloc == 0)
{
outalloc = MAX(1, *n);
out = xrealloc (out, outalloc);
+ len_map = xrealloc (len_map, outalloc);
}
/* appease clang-2.6 */
assert (out);
+ assert (len_map);
if (*n == 0)
return out;
@@ -81,6 +94,7 @@ mbtolower (const char *beg, size_t *n)
mb_cur_max = MB_CUR_MAX;
p = out;
+ m = len_map;
outlen = 0;
while (beg < end)
{
@@ -88,14 +102,18 @@ mbtolower (const char *beg, size_t *n)
size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
if (outlen + mb_cur_max >= outalloc)
{
+ size_t dm = m - len_map;
out = x2nrealloc (out, &outalloc, 1);
+ len_map = xrealloc (len_map, outalloc);
p = out + outlen;
+ m = len_map + dm;
}
if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
{
/* An invalid sequence, or a truncated multi-octet character.
We treat it as a single-octet character. */
+ *m++ = 0;
*p++ = *beg++;
outlen++;
memset (&is, 0, sizeof (is));
@@ -103,6 +121,7 @@ mbtolower (const char *beg, size_t *n)
}
else
{
+ *m++ = mbclen - 1;
beg += mbclen;
mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
p += mbclen;
@@ -110,6 +129,8 @@ mbtolower (const char *beg, size_t *n)
}
}
+ /* If the new length differs from the original, give caller the map. */
+ *len_map_p = p - out == *n ? NULL : len_map;
*n = p - out;
*p = 0;
return out;