summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS5
-rw-r--r--THANKS1
-rw-r--r--src/dfasearch.c13
-rw-r--r--src/kwsearch.c14
-rw-r--r--src/search.h21
-rw-r--r--src/searchutils.c25
-rw-r--r--tests/Makefile.am1
7 files changed, 68 insertions, 12 deletions
diff --git a/NEWS b/NEWS
index 69262765..d0ea60ab 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,11 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
+ grep -i, in a multi-byte locale, when matching a line containing a character
+ like the UTF-8 Turkish I-with-dot (U+0130) (whose lower-case representation
+ occupies fewer bytes), would print an incomplete output line.
+ [bug introduced in grep-2.6]
+
--include and --exclude can again be combined, and again apply to
the command line, e.g., "grep --include='*.[ch]' --exclude='system.h'
PATTERN *" again reads all *.c and *.h files except for system.h.
diff --git a/THANKS b/THANKS
index 80df5ec3..1720232d 100644
--- a/THANKS
+++ b/THANKS
@@ -84,6 +84,7 @@ Shannon Hill <hill@synnet.com>
Sotiris Vassilopoulos <Sotiris.Vassilopoulos@betatech.gr>
Standish Parsley <adsspamtrap01@yahoo.com>
Stewart Levin <stew@sep.stanford.edu>
+Strahinja Kustudic <kustodian@gmail.com>
Sven Joachim <svenjoac@gmx.de>
Sydoruk Stepan <step@unitex.kiev.ua>
Tapani Tarvainen <tt@mit.jyu.fi>
diff --git a/src/dfasearch.c b/src/dfasearch.c
index bd09aa67..a48333d2 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -78,8 +78,9 @@ static char const *
kwsincr_case (const char *must)
{
size_t n = strlen (must);
+ unsigned char *map = NULL;
const char *buf = (match_icase && MB_CUR_MAX > 1
- ? mbtolower (must, &n)
+ ? mbtolower (must, &n, &map)
: must);
return kwsincr (kwset, buf, n);
}
@@ -217,13 +218,15 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
ptrdiff_t len, best_len;
struct kwsmatch kwsm;
size_t i, ret_val;
+ unsigned char *map = NULL;
+
if (MB_CUR_MAX > 1)
{
if (match_icase)
{
/* mbtolower adds a NUL byte at the end. That will provide
space for the sentinel byte dfaexec may add. */
- char *case_buf = mbtolower (buf, &size);
+ char *case_buf = mbtolower (buf, &size, &map);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
@@ -408,9 +411,11 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
success:
len = end - beg;
- success_in_len:
+ success_in_len:;
+ size_t off = beg - buf;
+ mb_case_map_apply (map, &off, &len);
*match_size = len;
- ret_val = beg - buf;
+ ret_val = off;
out:
return ret_val;
}
diff --git a/src/kwsearch.c b/src/kwsearch.c
index f1a802e7..d0bb201f 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -34,8 +34,9 @@ Fcompile (char const *pattern, size_t size)
{
char const *err;
size_t psize = size;
+ unsigned char *map = NULL;
char const *pat = (match_icase && MB_CUR_MAX > 1
- ? mbtolower (pattern, &psize)
+ ? mbtolower (pattern, &psize, &map)
: pattern);
kwsinit (&kwset);
@@ -83,11 +84,13 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
char eol = eolbyte;
struct kwsmatch kwsmatch;
size_t ret_val;
+ unsigned char *map = NULL;
+
if (MB_CUR_MAX > 1)
{
if (match_icase)
{
- char *case_buf = mbtolower (buf, &size);
+ char *case_buf = mbtolower (buf, &size, &map);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
@@ -162,9 +165,12 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
while (buf < beg && beg[-1] != eol)
--beg;
len = end - beg;
- success_in_beg_and_len:
+ success_in_beg_and_len:;
+ size_t off = beg - buf;
+ mb_case_map_apply (map, &off, &len);
+
*match_size = len;
- ret_val = beg - buf;
+ ret_val = off;
out:
return ret_val;
}
diff --git a/src/search.h b/src/search.h
index 3074407e..529e7e23 100644
--- a/src/search.h
+++ b/src/search.h
@@ -38,7 +38,7 @@
/* searchutils.c */
extern void kwsinit (kwset_t *);
-extern char *mbtolower (const char *, size_t *);
+extern char *mbtolower (const char *, size_t *, unsigned char **);
extern bool is_mb_middle (const char **, const char *, const char *, size_t);
/* dfasearch.c */
@@ -53,4 +53,23 @@ extern size_t Fexecute (char const *, size_t, size_t *, char const *);
extern void Pcompile (char const *, size_t);
extern size_t Pexecute (char const *, size_t, size_t *, char const *);
+/* Apply a non-NULL MAP from mbtolower to the lowercase-buffer-relative
+ *OFF and *LEN, converting them to be relative to the original buffer.
+ When MAP is NULL, leave *OFF and *LEN unchanged.
+
+ MAP is read at indices 0 through *OFF + *LEN - 1. The entries before
+ *OFF are summed and added to *OFF; the entries from *OFF up to (but not
+ including) *OFF + *LEN are summed and added to *LEN. MAP entries are
+ unsigned, so this adjustment can only increase *OFF and *LEN. */
+static inline void
+mb_case_map_apply (unsigned char const *map, size_t *off, size_t *len)
+{
+ if (map)
+ {
+ size_t off_incr = 0; /* sum of MAP entries that precede the match */
+ size_t len_incr = 0; /* sum of MAP entries that cover the match */
+ size_t k;
+ for (k = 0; k < *off; k++)
+ off_incr += map[k];
+ for (k = *off; k < *off + *len; k++) /* still the unadjusted *OFF */
+ len_incr += map[k];
+ *off += off_incr; /* updated only after both sums are complete */
+ *len += len_incr;
+ }
+}
+
#endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
index b787fe67..4942c516 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -53,25 +53,38 @@ kwsinit (kwset_t *kwset)
Note that while this function returns a pointer to malloc'd storage,
the caller must not free it, since this function retains a pointer
to the buffer and reuses it on any subsequent call. As a consequence,
- this function is not thread-safe. */
+ this function is not thread-safe.
+
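+ When the lowercase result string has the same length as the input string,
+ set *LEN_MAP_P to NULL. Otherwise, set it to a malloc'd buffer (like the
+ returned buffer, this must not be freed by the caller) of the same length
+ as the result string. (*LEN_MAP_P)[J] is one less than the length-in-bytes
+ of the character in BEG that formed byte J of the result. This map is
+ used by the caller to convert (offset, length) pairs that reference the
+ lowercase result to numbers that refer to the corresponding parts of
+ the original buffer.
+
+ Note that exactly one map entry is stored per input character, so the
+ byte-J indexing described above holds only while each lowercased
+ character occupies a single byte of the result. Also, when *N is 0 this
+ function returns before storing anything in *LEN_MAP_P, so the caller
+ must initialize that pointer (e.g., to NULL) beforehand. */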
char *
-mbtolower (const char *beg, size_t *n)
+mbtolower (const char *beg, size_t *n, unsigned char **len_map_p)
{
static char *out;
+ static unsigned char *len_map;
static size_t outalloc;
size_t outlen, mb_cur_max;
mbstate_t is, os;
const char *end;
char *p;
+ unsigned char *m;
if (*n > outalloc || outalloc == 0)
{
outalloc = MAX(1, *n);
out = xrealloc (out, outalloc);
+ len_map = xrealloc (len_map, outalloc);
}
/* appease clang-2.6 */
assert (out);
+ assert (len_map);
if (*n == 0)
return out;
@@ -81,6 +94,7 @@ mbtolower (const char *beg, size_t *n)
mb_cur_max = MB_CUR_MAX;
p = out;
+ m = len_map;
outlen = 0;
while (beg < end)
{
@@ -88,14 +102,18 @@ mbtolower (const char *beg, size_t *n)
size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
if (outlen + mb_cur_max >= outalloc)
{
+ size_t dm = m - len_map;
out = x2nrealloc (out, &outalloc, 1);
+ len_map = xrealloc (len_map, outalloc);
p = out + outlen;
+ m = len_map + dm;
}
if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
{
/* An invalid sequence, or a truncated multi-octet character.
We treat it as a single-octet character. */
+ *m++ = 0;
*p++ = *beg++;
outlen++;
memset (&is, 0, sizeof (is));
@@ -103,6 +121,7 @@ mbtolower (const char *beg, size_t *n)
}
else
{
+ *m++ = mbclen - 1;
beg += mbclen;
mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
p += mbclen;
@@ -110,6 +129,8 @@ mbtolower (const char *beg, size_t *n)
}
}
+ /* If the new length differs from the original, give caller the map. */
+ *len_map_p = p - out == *n ? NULL : len_map;
*n = p - out;
*p = 0;
return out;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 7be788c2..167e3186 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -32,7 +32,6 @@ XFAIL_TESTS = \
if USE_INCLUDED_REGEX
XFAIL_TESTS += equiv-classes
endif
-XFAIL_TESTS += turkish-I
TESTS = \
backref \