diff options
author | Charles Levert <charles_levert@gna.org> | 2005-11-10 19:57:54 +0000 |
---|---|---|
committer | Charles Levert <charles_levert@gna.org> | 2005-11-10 19:57:54 +0000 |
commit | 8a025cf81cdf5c3e809c3ecf0656419a2d596796 (patch) | |
tree | 1f5dcbf8a4e683dbfdd87fe4cacc9c687ecd6d66 /src | |
parent | 8f11f423903b40b89aca547f5c4062bdd75de995 (diff) | |
download | grep-8a025cf81cdf5c3e809c3ecf0656419a2d596796.tar.gz |
The introduction of the --only-matching and --color GNU extensions
to grep added the requirement that each execute() implementation
not only be able to identify matching lines as a whole, but also
individual "exact" matches within a line known to be matching,
from leftmost to rightmost match, when the output from matching
lines is actually produced. The interface and implementations
of execute() were not up to it. This set of changes aims to
rectify that situation. Previously failing tests relative to
left anchors (^ and \<) and -w should now pass. This fixes
<http://savannah.gnu.org/bugs/?func=detailitem&item_id=11579>,
<http://savannah.gnu.org/patch/?func=detailitem&item_id=1834>,
<http://savannah.gnu.org/bugs/?func=detailitem&item_id=8243>,
and possibly part of other, bigger, pending patches. The problem
was also compounded by the POSIX requirement to support a pattern
list instead of just an individual pattern (for -G and -E as well).
* tests/foad1.sh: Test for increasing/decreasing-length word
matches, given pattern order, and leftmost/longest match.
* tests/fmbtest.sh: Modify test #6 according to new expectations.
Better document what tests #6 and #7 are actually for. Eliminate
test #5 in favor of bringing tests #6 and #7 within the F G E loop.
* src/grep.h (EXECUTE_ARGS): Change last argument from "int exact"
to "char const *start_ptr". Testing for "start_ptr" being non-NULL
retains the same semantics as testing for "exact" being non-zero.
* src/grep.c (print_line_middle): Call execute() with whole
buffer to work on, but using current position as start_ptr.
* src/grep.c (prpending, grepbuf): Call execute() with NULL
as start_ptr.
* src/search.c (EGexecute, Fexecute, Pexecute): When start_ptr is
non-NULL, return first match from it as an offset relative to buf.
* src/search.c (EGexecute): Consider all patterns if many and,
for an exact match, return the best one (leftmost, then longest).
Don't explore worst solutions, of course (branch and bound).
Diffstat (limited to 'src')
-rw-r--r-- | src/grep.c | 29 | ||||
-rw-r--r-- | src/grep.h | 3 | ||||
-rw-r--r-- | src/search.c | 173 |
3 files changed, 111 insertions, 94 deletions
@@ -761,6 +761,7 @@ print_line_middle (const char *beg, const char *lim) { size_t match_size; size_t match_offset; + const char *cur = beg; const char *mid = NULL; char *buf; /* XXX */ const char *ibeg; /* XXX */ @@ -781,9 +782,9 @@ print_line_middle (const char *beg, const char *lim) ibeg = beg; } - while ( lim > beg - && ( (match_offset = execute(ibeg, lim - beg, &match_size, 1)) - != (size_t) -1)) + while ( lim > cur + && ((match_offset = execute(ibeg, lim - beg, &match_size, + ibeg + (cur - beg))) != (size_t) -1)) { char const *b = beg + match_offset; @@ -798,7 +799,7 @@ print_line_middle (const char *beg, const char *lim) /* XXX - Could really advance by one whole multi-octet character. */ match_size = 1; if (!mid) - mid = beg; + mid = cur; } else { @@ -809,11 +810,10 @@ print_line_middle (const char *beg, const char *lim) PR_SGR_START(mlines_color); if (mid) { - fwrite (mid, sizeof (char), (beg - mid) + match_offset, stdout); + cur = mid; mid = NULL; } - else - fwrite (beg, sizeof (char), match_offset, stdout); + fwrite (cur, sizeof (char), b - cur, stdout); } PR_SGR_START_IF(grep_color); @@ -822,19 +822,18 @@ print_line_middle (const char *beg, const char *lim) if (only_matching) fputs("\n", stdout); } - beg = b + match_size; - ibeg += match_offset + match_size; /* XXX */ + cur = b + match_size; } if (buf) free(buf); /* XXX */ if (only_matching) - beg = lim; + cur = lim; else if (mid) - beg = mid; + cur = mid; - return beg; + return cur; } static const char * @@ -904,7 +903,8 @@ prpending (char const *lim) size_t match_size; --pending; if (outleft - || ((execute(lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1) + || ((execute(lastout, nl + 1 - lastout, + &match_size, NULL) == (size_t) -1) == !out_invert)) prline (lastout, nl + 1, SEP_CHAR_CONTEXT); else @@ -994,7 +994,8 @@ grepbuf (char const *beg, char const *lim) nlines = 0; p = beg; - while ((match_offset = execute(p, lim - p, &match_size, 0)) != (size_t) -1) + while ((match_offset = execute(p, lim - p, &match_size, + NULL)) != (size_t) -1) { char const *b = p + match_offset; char const *endp = b + match_size; @@ -34,7 +34,8 @@ (char const *pattern, size_t size) #define EXECUTE_RET size_t #define EXECUTE_ARGS \ - (char const *buf, size_t size, size_t *match_size, int exact) + (char const *buf, size_t size, size_t *match_size, char const *start_ptr) + /* start_ptr == NULL means the caller is not looking for an exact match. */ #ifdef GREP_PROGRAM /* Function definitions. */ diff --git a/src/search.c b/src/search.c index f06a75e2..206260d7 100644 --- a/src/search.c +++ b/src/search.c @@ -297,9 +297,9 @@ COMPILE_FCT(Ecompile) EXECUTE_FCT(EGexecute) { - register char const *buflim, *beg, *end; + register char const *buflim, *beg, *end, *match, *best_match; char eol = eolbyte; - int backref, start, len; + int backref, start, len, best_len; struct kwsmatch kwsm; size_t i, ret_val; #ifdef MBS_SUPPORT @@ -310,6 +310,8 @@ EXECUTE_FCT(EGexecute) { char *case_buf = xmalloc(size); memcpy(case_buf, buf, size); + if (start_ptr) + start_ptr = case_buf + (start_ptr - buf); buf = case_buf; } if (kwset) @@ -321,8 +323,9 @@ EXECUTE_FCT(EGexecute) for (beg = end = buf; end < buflim; beg = end) { - if (!exact) + if (!start_ptr) { + /* We don't care about an exact match. */ if (kwset) { /* Find a possible match using the KWset matcher. */ @@ -363,26 +366,38 @@ EXECUTE_FCT(EGexecute) goto success; } else - end = beg + size; + { + /* We are looking for the leftmost (then longest) exact match. + We will go through the outer loop only once. */ + beg = start_ptr; + end = buflim; + } /* If we've made it to this point, this means DFA has seen a probable match, and we need to run it through Regex. */ + best_match = end; + best_len = 0; for (i = 0; i < pcount; i++) { patterns[i].regexbuf.not_eol = 0; - if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, - end - beg - 1, 0, - end - beg - 1, &(patterns[i].regs)))) + if (0 <= (start = re_search (&(patterns[i].regexbuf), + buf, end - buf - 1, + beg - buf, end - beg - 1, + &(patterns[i].regs)))) { len = patterns[i].regs.end[0] - start; - if (exact) - { - *match_size = len; - return start; - } + match = buf + start; + if (match > best_match) + continue; + if (start_ptr && !match_words) + goto assess_pattern_match; if ((!match_lines && !match_words) || (match_lines && len == end - beg - 1)) - goto success; + { + match = beg; + len = end - beg; + goto assess_pattern_match; + } /* If -w, check if the match aligns with word boundaries. We do this iteratively because: (a) the line may contain more than one occurence of the @@ -391,53 +406,75 @@ EXECUTE_FCT(EGexecute) given point, and we may need to consider a shorter one to find a word boundary. */ if (match_words) - while (start >= 0) + while (match <= best_match) { - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) + if ((match == buf || !WCHAR ((unsigned char) match[-1])) && (len == end - beg - 1 - || !WCHAR ((unsigned char) beg[start + len]))) - goto success; + || !WCHAR ((unsigned char) match[len]))) + goto assess_pattern_match; if (len > 0) { /* Try a shorter length anchored at the same place. */ --len; patterns[i].regexbuf.not_eol = 1; - len = re_match (&(patterns[i].regexbuf), beg, - start + len, start, + len = re_match (&(patterns[i].regexbuf), + buf, match + len - beg, match - buf, &(patterns[i].regs)); } if (len <= 0) { /* Try looking further on. */ - if (start == end - beg - 1) + if (match == end - 1) break; - ++start; + match++; patterns[i].regexbuf.not_eol = 0; - start = re_search (&(patterns[i].regexbuf), beg, - end - beg - 1, - start, end - beg - 1 - start, + start = re_search (&(patterns[i].regexbuf), + buf, end - buf - 1, + match - buf, end - match - 1, &(patterns[i].regs)); + if (start < 0) + break; len = patterns[i].regs.end[0] - start; + match = buf + start; } - } - } + } /* while (match <= best_match) */ + continue; + assess_pattern_match: + if (!start_ptr) + { + /* Good enough for a non-exact match. + No need to look at further patterns, if any. */ + beg = match; + goto success_in_len; + } + if (match < best_match || (match == best_match && len > best_len)) + { + /* Best exact match: leftmost, then longest. */ + best_match = match; + best_len = len; + } + } /* if re_search >= 0 */ } /* for Regex patterns. */ + if (best_match < end) + { + /* We have found an exact match. We were just + waiting for the best one (leftmost then longest). */ + beg = best_match; + len = best_len; + goto success_in_len; + } } /* for (beg = end ..) */ failure: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free((char*)buf); - if (mb_properties) - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return (size_t) -1; + ret_val = -1; + goto out; success: + len = end - beg; + success_in_len: + *match_size = len; ret_val = beg - buf; + out: #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { @@ -447,7 +484,6 @@ EXECUTE_FCT(EGexecute) free(mb_properties); } #endif /* MBS_SUPPORT */ - *match_size = end - beg; return ret_val; } #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */ @@ -490,47 +526,27 @@ EXECUTE_FCT(Fexecute) { char *case_buf = xmalloc(size); memcpy(case_buf, buf, size); + if (start_ptr) + start_ptr = case_buf + (start_ptr - buf); buf = case_buf; } mb_properties = check_multibyte_string(buf, size); } #endif /* MBS_SUPPORT */ - for (beg = buf; beg <= buf + size; ++beg) + for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) { size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); if (offset == (size_t) -1) - { -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free ((char*)buf); - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return offset; - } + goto failure; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) continue; /* It is a part of multibyte character. */ #endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; - if (exact) - { - *match_size = len; - ret_val = beg - buf; -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free ((char*)buf); - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return ret_val; - } + if (start_ptr && !match_words) + goto success_in_beg_and_len; if (match_lines) { if (beg > buf && beg[-1] != eol) @@ -552,31 +568,29 @@ EXECUTE_FCT(Fexecute) try = beg + offset; len = kwsmatch.size[0]; } - else + else if (!start_ptr) goto success; - } + else + goto success_in_beg_and_len; + } /* for (try) */ else goto success; - } + } /* for (beg in buf) */ -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free((char*)buf); - if (mb_properties) - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return -1; + failure: + ret_val = -1; + goto out; success: end = memchr (beg + len, eol, (buf + size) - (beg + len)); end++; while (buf < beg && beg[-1] != eol) --beg; - *match_size = end - beg; + len = end - beg; + success_in_beg_and_len: + *match_size = len; ret_val = beg - buf; + out: #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { @@ -673,7 +687,8 @@ EXECUTE_FCT(Pexecute) is just for performance improvement in pcre_exec. */ int sub[300]; - int e = pcre_exec (cre, extra, buf, size, 0, 0, + int e = pcre_exec (cre, extra, buf, size, + start_ptr ? (start_ptr - buf) : 0, 0, sub, sizeof sub / sizeof *sub); if (e <= 0) @@ -697,7 +712,7 @@ EXECUTE_FCT(Pexecute) char const *end = buf + sub[1]; char const *buflim = buf + size; char eol = eolbyte; - if (!exact) + if (!start_ptr) { /* FIXME: The case when '\n' is not found indicates a bug: Since grep is line oriented, the match should never contain |