summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCharles Levert <charles_levert@gna.org>2005-11-10 19:57:54 +0000
committerCharles Levert <charles_levert@gna.org>2005-11-10 19:57:54 +0000
commit8a025cf81cdf5c3e809c3ecf0656419a2d596796 (patch)
tree1f5dcbf8a4e683dbfdd87fe4cacc9c687ecd6d66
parent8f11f423903b40b89aca547f5c4062bdd75de995 (diff)
downloadgrep-8a025cf81cdf5c3e809c3ecf0656419a2d596796.tar.gz
The introduction of the --only-matching and --color GNU extensions
to grep added the requirement that each execute() implementation not only be able to identify matching lines as a whole, but also individual "exact" matches within a line known to be matching, from leftmost to rightmost match, when the output from matching lines is actually produced. The interface and implementations of execute() were not up to it. This set of changes aims to rectify that situation. Previously failing tests relative to left anchors (^ and \<) and -w should now pass. This fixes <http://savannah.gnu.org/bugs/?func=detailitem&item_id=11579>, <http://savannah.gnu.org/patch/?func=detailitem&item_id=1834>, <http://savannah.gnu.org/bugs/?func=detailitem&item_id=8243>, and possibly part of other, bigger, pending patches. The problem was also compounded by the POSIX requirement to support a pattern list instead of just an individual pattern (for -G and -E as well). * tests/foad1.sh: Test for increasing/decreasing-length word matches, given pattern order, and leftmost/longest match. * tests/fmbtest.sh: Modify test #6 according to new expectations. Better document what tests #6 and #7 are actually for. Eliminate test #5 in favor of bringing tests #6 and #7 within the F G E loop. * src/grep.h (EXECUTE_ARGS): Change last argument from "int exact" to "char const *start_ptr". Testing for "start_ptr" being non-NULL retains the same semantics as testing for "exact" being non-zero. * src/grep.c (print_line_middle): Call execute() with whole buffer to work on, but using current position as start_ptr. * src/grep.c (prpending, grepbuf): Call execute() with NULL as start_ptr. * src/search.c (EGexecute, Fexecute, Pexecute): When start_ptr is non-NULL, return first match from it as an offset relative to buf. * src/search.c (EGexecute): Consider all patterns if many and, for an exact match, return the best one (leftmost, then longest). Don't explore worst solutions, of course (branch and bound).
-rw-r--r--ChangeLog36
-rw-r--r--src/grep.c29
-rw-r--r--src/grep.h3
-rw-r--r--src/search.c173
-rwxr-xr-xtests/fmbtest.sh26
-rwxr-xr-xtests/foad1.sh28
6 files changed, 184 insertions, 111 deletions
diff --git a/ChangeLog b/ChangeLog
index d673fd27..8a768a2b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,41 @@
2005-11-10 Charles Levert <charles_levert@gna.org>
+ The introduction of the --only-matching and --color GNU extensions
+ to grep added the requirement that each execute() implementation
+ not only be able to identify matching lines as a whole, but also
+ individual "exact" matches within a line known to be matching,
+ from leftmost to rightmost match, when the output from matching
+ lines is actually produced. The interface and implementations
+ of execute() were not up to it. This set of changes aims to
+ rectify that situation. Previously failing tests relative to
+ left anchors (^ and \<) and -w should now pass. This fixes
+ <http://savannah.gnu.org/bugs/?func=detailitem&item_id=11579>,
+ <http://savannah.gnu.org/patch/?func=detailitem&item_id=1834>,
+ <http://savannah.gnu.org/bugs/?func=detailitem&item_id=8243>,
+ and possibly part of other, bigger, pending patches. The problem
+ was also compounded by the POSIX requirement to support a pattern
+ list instead of just an individual pattern (for -G and -E as well).
+
+ * tests/foad1.sh: Test for increasing/decreasing-length word
+ matches, given pattern order, and leftmost/longest match.
+ * tests/fmbtest.sh: Modify test #6 according to new expectations.
+ Better document what tests #6 and #7 are actually for. Eliminate
+ test #5 in favor of bringing tests #6 and #7 within the F G E loop.
+ * src/grep.h (EXECUTE_ARGS): Change last argument from "int exact"
+ to "char const *start_ptr". Testing for "start_ptr" being non-NULL
+ retains the same semantics as testing for "exact" being non-zero.
+ * src/grep.c (print_line_middle): Call execute() with whole
+ buffer to work on, but using current position as start_ptr.
+ * src/grep.c (prpending, grepbuf): Call execute() with NULL
+ as start_ptr.
+ * src/search.c (EGexecute, Fexecute, Pexecute): When start_ptr is
+ non-NULL, return first match from it as an offset relative to buf.
+ * src/search.c (EGexecute): Consider all patterns if many and,
+ for an exact match, return the best one (leftmost, then longest).
+ Don't explore worst solutions, of course (branch and bound).
+
+2005-11-10 Charles Levert <charles_levert@gna.org>
+
* src/grep.c (main): Fix a subtle memory allocation bug introduced
with the mb_icase_keys() function, which can call xrealloc() or
free() on keys, by making sure keys is always dynamically allocated.
diff --git a/src/grep.c b/src/grep.c
index a0cf3ba8..28ae71be 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -761,6 +761,7 @@ print_line_middle (const char *beg, const char *lim)
{
size_t match_size;
size_t match_offset;
+ const char *cur = beg;
const char *mid = NULL;
char *buf; /* XXX */
const char *ibeg; /* XXX */
@@ -781,9 +782,9 @@ print_line_middle (const char *beg, const char *lim)
ibeg = beg;
}
- while ( lim > beg
- && ( (match_offset = execute(ibeg, lim - beg, &match_size, 1))
- != (size_t) -1))
+ while ( lim > cur
+ && ((match_offset = execute(ibeg, lim - beg, &match_size,
+ ibeg + (cur - beg))) != (size_t) -1))
{
char const *b = beg + match_offset;
@@ -798,7 +799,7 @@ print_line_middle (const char *beg, const char *lim)
/* XXX - Could really advance by one whole multi-octet character. */
match_size = 1;
if (!mid)
- mid = beg;
+ mid = cur;
}
else
{
@@ -809,11 +810,10 @@ print_line_middle (const char *beg, const char *lim)
PR_SGR_START(mlines_color);
if (mid)
{
- fwrite (mid, sizeof (char), (beg - mid) + match_offset, stdout);
+ cur = mid;
mid = NULL;
}
- else
- fwrite (beg, sizeof (char), match_offset, stdout);
+ fwrite (cur, sizeof (char), b - cur, stdout);
}
PR_SGR_START_IF(grep_color);
@@ -822,19 +822,18 @@ print_line_middle (const char *beg, const char *lim)
if (only_matching)
fputs("\n", stdout);
}
- beg = b + match_size;
- ibeg += match_offset + match_size; /* XXX */
+ cur = b + match_size;
}
if (buf)
free(buf); /* XXX */
if (only_matching)
- beg = lim;
+ cur = lim;
else if (mid)
- beg = mid;
+ cur = mid;
- return beg;
+ return cur;
}
static const char *
@@ -904,7 +903,8 @@ prpending (char const *lim)
size_t match_size;
--pending;
if (outleft
- || ((execute(lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1)
+ || ((execute(lastout, nl + 1 - lastout,
+ &match_size, NULL) == (size_t) -1)
== !out_invert))
prline (lastout, nl + 1, SEP_CHAR_CONTEXT);
else
@@ -994,7 +994,8 @@ grepbuf (char const *beg, char const *lim)
nlines = 0;
p = beg;
- while ((match_offset = execute(p, lim - p, &match_size, 0)) != (size_t) -1)
+ while ((match_offset = execute(p, lim - p, &match_size,
+ NULL)) != (size_t) -1)
{
char const *b = p + match_offset;
char const *endp = b + match_size;
diff --git a/src/grep.h b/src/grep.h
index 3384c40c..f8b6f454 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -34,7 +34,8 @@
(char const *pattern, size_t size)
#define EXECUTE_RET size_t
#define EXECUTE_ARGS \
- (char const *buf, size_t size, size_t *match_size, int exact)
+ (char const *buf, size_t size, size_t *match_size, char const *start_ptr)
+ /* start_ptr == NULL means the caller is not looking for an exact match. */
#ifdef GREP_PROGRAM
/* Function definitions. */
diff --git a/src/search.c b/src/search.c
index f06a75e2..206260d7 100644
--- a/src/search.c
+++ b/src/search.c
@@ -297,9 +297,9 @@ COMPILE_FCT(Ecompile)
EXECUTE_FCT(EGexecute)
{
- register char const *buflim, *beg, *end;
+ register char const *buflim, *beg, *end, *match, *best_match;
char eol = eolbyte;
- int backref, start, len;
+ int backref, start, len, best_len;
struct kwsmatch kwsm;
size_t i, ret_val;
#ifdef MBS_SUPPORT
@@ -310,6 +310,8 @@ EXECUTE_FCT(EGexecute)
{
char *case_buf = xmalloc(size);
memcpy(case_buf, buf, size);
+ if (start_ptr)
+ start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
}
if (kwset)
@@ -321,8 +323,9 @@ EXECUTE_FCT(EGexecute)
for (beg = end = buf; end < buflim; beg = end)
{
- if (!exact)
+ if (!start_ptr)
{
+ /* We don't care about an exact match. */
if (kwset)
{
/* Find a possible match using the KWset matcher. */
@@ -363,26 +366,38 @@ EXECUTE_FCT(EGexecute)
goto success;
}
else
- end = beg + size;
+ {
+ /* We are looking for the leftmost (then longest) exact match.
+ We will go through the outer loop only once. */
+ beg = start_ptr;
+ end = buflim;
+ }
/* If we've made it to this point, this means DFA has seen
a probable match, and we need to run it through Regex. */
+ best_match = end;
+ best_len = 0;
for (i = 0; i < pcount; i++)
{
patterns[i].regexbuf.not_eol = 0;
- if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
- end - beg - 1, 0,
- end - beg - 1, &(patterns[i].regs))))
+ if (0 <= (start = re_search (&(patterns[i].regexbuf),
+ buf, end - buf - 1,
+ beg - buf, end - beg - 1,
+ &(patterns[i].regs))))
{
len = patterns[i].regs.end[0] - start;
- if (exact)
- {
- *match_size = len;
- return start;
- }
+ match = buf + start;
+ if (match > best_match)
+ continue;
+ if (start_ptr && !match_words)
+ goto assess_pattern_match;
if ((!match_lines && !match_words)
|| (match_lines && len == end - beg - 1))
- goto success;
+ {
+ match = beg;
+ len = end - beg;
+ goto assess_pattern_match;
+ }
/* If -w, check if the match aligns with word boundaries.
We do this iteratively because:
(a) the line may contain more than one occurence of the
@@ -391,53 +406,75 @@ EXECUTE_FCT(EGexecute)
given point, and we may need to consider a shorter one to
find a word boundary. */
if (match_words)
- while (start >= 0)
+ while (match <= best_match)
{
- if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
+ if ((match == buf || !WCHAR ((unsigned char) match[-1]))
&& (len == end - beg - 1
- || !WCHAR ((unsigned char) beg[start + len])))
- goto success;
+ || !WCHAR ((unsigned char) match[len])))
+ goto assess_pattern_match;
if (len > 0)
{
/* Try a shorter length anchored at the same place. */
--len;
patterns[i].regexbuf.not_eol = 1;
- len = re_match (&(patterns[i].regexbuf), beg,
- start + len, start,
+ len = re_match (&(patterns[i].regexbuf),
+ buf, match + len - beg, match - buf,
&(patterns[i].regs));
}
if (len <= 0)
{
/* Try looking further on. */
- if (start == end - beg - 1)
+ if (match == end - 1)
break;
- ++start;
+ match++;
patterns[i].regexbuf.not_eol = 0;
- start = re_search (&(patterns[i].regexbuf), beg,
- end - beg - 1,
- start, end - beg - 1 - start,
+ start = re_search (&(patterns[i].regexbuf),
+ buf, end - buf - 1,
+ match - buf, end - match - 1,
&(patterns[i].regs));
+ if (start < 0)
+ break;
len = patterns[i].regs.end[0] - start;
+ match = buf + start;
}
- }
- }
+ } /* while (match <= best_match) */
+ continue;
+ assess_pattern_match:
+ if (!start_ptr)
+ {
+ /* Good enough for a non-exact match.
+ No need to look at further patterns, if any. */
+ beg = match;
+ goto success_in_len;
+ }
+ if (match < best_match || (match == best_match && len > best_len))
+ {
+ /* Best exact match: leftmost, then longest. */
+ best_match = match;
+ best_len = len;
+ }
+ } /* if re_search >= 0 */
} /* for Regex patterns. */
+ if (best_match < end)
+ {
+ /* We have found an exact match. We were just
+ waiting for the best one (leftmost then longest). */
+ beg = best_match;
+ len = best_len;
+ goto success_in_len;
+ }
} /* for (beg = end ..) */
failure:
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free((char*)buf);
- if (mb_properties)
- free(mb_properties);
- }
-#endif /* MBS_SUPPORT */
- return (size_t) -1;
+ ret_val = -1;
+ goto out;
success:
+ len = end - beg;
+ success_in_len:
+ *match_size = len;
ret_val = beg - buf;
+ out:
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
@@ -447,7 +484,6 @@ EXECUTE_FCT(EGexecute)
free(mb_properties);
}
#endif /* MBS_SUPPORT */
- *match_size = end - beg;
return ret_val;
}
#endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
@@ -490,47 +526,27 @@ EXECUTE_FCT(Fexecute)
{
char *case_buf = xmalloc(size);
memcpy(case_buf, buf, size);
+ if (start_ptr)
+ start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
}
mb_properties = check_multibyte_string(buf, size);
}
#endif /* MBS_SUPPORT */
- for (beg = buf; beg <= buf + size; ++beg)
+ for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
{
size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
if (offset == (size_t) -1)
- {
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free ((char*)buf);
- free(mb_properties);
- }
-#endif /* MBS_SUPPORT */
- return offset;
- }
+ goto failure;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
continue; /* It is a part of multibyte character. */
#endif /* MBS_SUPPORT */
beg += offset;
len = kwsmatch.size[0];
- if (exact)
- {
- *match_size = len;
- ret_val = beg - buf;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free ((char*)buf);
- free(mb_properties);
- }
-#endif /* MBS_SUPPORT */
- return ret_val;
- }
+ if (start_ptr && !match_words)
+ goto success_in_beg_and_len;
if (match_lines)
{
if (beg > buf && beg[-1] != eol)
@@ -552,31 +568,29 @@ EXECUTE_FCT(Fexecute)
try = beg + offset;
len = kwsmatch.size[0];
}
- else
+ else if (!start_ptr)
goto success;
- }
+ else
+ goto success_in_beg_and_len;
+ } /* for (try) */
else
goto success;
- }
+ } /* for (beg in buf) */
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free((char*)buf);
- if (mb_properties)
- free(mb_properties);
- }
-#endif /* MBS_SUPPORT */
- return -1;
+ failure:
+ ret_val = -1;
+ goto out;
success:
end = memchr (beg + len, eol, (buf + size) - (beg + len));
end++;
while (buf < beg && beg[-1] != eol)
--beg;
- *match_size = end - beg;
+ len = end - beg;
+ success_in_beg_and_len:
+ *match_size = len;
ret_val = beg - buf;
+ out:
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
@@ -673,7 +687,8 @@ EXECUTE_FCT(Pexecute)
is just for performance improvement in pcre_exec. */
int sub[300];
- int e = pcre_exec (cre, extra, buf, size, 0, 0,
+ int e = pcre_exec (cre, extra, buf, size,
+ start_ptr ? (start_ptr - buf) : 0, 0,
sub, sizeof sub / sizeof *sub);
if (e <= 0)
@@ -697,7 +712,7 @@ EXECUTE_FCT(Pexecute)
char const *end = buf + sub[1];
char const *buflim = buf + size;
char eol = eolbyte;
- if (!exact)
+ if (!start_ptr)
{
/* FIXME: The case when '\n' is not found indicates a bug:
Since grep is line oriented, the match should never contain
diff --git a/tests/fmbtest.sh b/tests/fmbtest.sh
index 7aebe4a7..9b5b4e63 100755
--- a/tests/fmbtest.sh
+++ b/tests/fmbtest.sh
@@ -65,31 +65,19 @@ if test "$test4" != "01 02 08 13 17 19"; then
failures=1
fi
-done
-
-# Test that -F --color=always prefers longer matches.
-test5="`echo 'Cosi tu ČišÍ...' \
- | LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -Fi -e 'čiš' -e 'čiší'`"
-if echo "$test5" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*m\(.\[K\)\?ČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then
- :
-else
- echo "Test #5 F failed: $test5"
- failures=1
-fi
-
-for mode in G E; do
-
-# Test that -{G,E} --color=always prefers earlier pattern matches.
+# Test that --color=always does not depend on individual pattern order within the pattern
+# list, and that a longer match is preferred to a shorter one starting at the same point.
test6="`echo 'Cosi tu ČišÍ...' \
| LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiš' -e 'čiší'`"
-if echo "$test6" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*m\(.\[K\)\?Čiš.*\[.*m\(.\[K\)\?Í\.\.\.'; then
+if echo "$test6" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*m\(.\[K\)\?ČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then
:
else
echo "Test #6 ${mode} failed: $test6"
failures=1
fi
-# Test that -{G,E} --color=always prefers earlier pattern matches.
+# Test that --color=always does not depend on individual pattern order within the pattern
+# list, and that a longer match is preferred to a shorter one starting at the same point.
test7="`echo 'Cosi tu ČišÍ...' \
| LC_ALL=cs_CZ.UTF-8 ${GREP} --color=always -${mode}i -e 'čiší' -e 'čiš'`"
if echo "$test7" | LC_ALL=C ${GREP} -q 'Cosi tu .*\[.*m\(.\[K\)\?ČišÍ.*\[.*m\(.\[K\)\?\.\.\.'; then
@@ -99,6 +87,10 @@ else
failures=1
fi
+done
+
+for mode in G E; do
+
test8="$(echo `LC_ALL=cs_CZ.UTF-8 ${GREP} -${mode}i -e 'Č.šE' -e 'Č[a-f]s' csinput \
| LC_ALL=C sed 's/^.*\([0-9][0-9]\).*$/\1/'`)"
if test "$test8" != "01 02 07 08 10 11 12 13 14 15 16 17 18 19 20"; then
diff --git a/tests/foad1.sh b/tests/foad1.sh
index 1e04a01d..08cf1173 100755
--- a/tests/foad1.sh
+++ b/tests/foad1.sh
@@ -96,6 +96,34 @@ grep_test "LIN7C 55327/" "" -wF -e 5327 -e 5532
grep_test 'xyz/' 'y/' -o 'y*'
grep_test 'xyz/' "x${CB}y${CE}z/" --color=always 'y*'
+# Test for increasing/decreasing-length word matches,
+# for independence from pattern order within the pattern list,
+# and for preferring the longest match at a given position.
+x0='a bb ccc dddd/'
+x1='dddd ccc bb a/'
+x2='bcd abcd abc bc bcd abc/'
+x3='bc abcd bc/'
+y0="a ${CB}bb${CE} ${CB}ccc${CE} dddd/"
+y1="dddd ${CB}ccc${CE} ${CB}bb${CE} a/"
+y2="bcd abcd abc ${CB}bc${CE} bcd abc/"
+y3="${CB}bc${CE} abcd ${CB}bc${CE}/"
+grep_test "$x0" "$y0" -E --color=always -e bb -e cc -e ccc
+grep_test "$x0" "$y0" -F --color=always -e bb -e cc -e ccc
+grep_test "$x0" "$y0" -E --color=always -e bb -e ccc -e cc
+grep_test "$x0" "$y0" -F --color=always -e bb -e ccc -e cc
+grep_test "$x0" "$y0" -E -w --color=always -e bb -e ccc
+grep_test "$x0" "$y0" -F -w --color=always -e bb -e ccc
+grep_test "$x0" "$y0" -E -w --color=always -e ccc -e bb
+grep_test "$x0" "$y0" -F -w --color=always -e ccc -e bb
+grep_test "$x1" "$y1" -E -w --color=always -e bb -e ccc
+grep_test "$x1" "$y1" -F -w --color=always -e bb -e ccc
+grep_test "$x1" "$y1" -E -w --color=always -e ccc -e bb
+grep_test "$x1" "$y1" -F -w --color=always -e ccc -e bb
+grep_test "$x2" "$y2" -E -w --color=always bc
+grep_test "$x2" "$y2" -F -w --color=always bc
+grep_test "$x3" "$y3" -E -w --color=always bc
+grep_test "$x3" "$y3" -F -w --color=always bc
+
# The rest of this file is meant to be executed under this locale.
LC_ALL=cs_CZ.UTF-8; export LC_ALL