diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/grep.c | 29 | ||||
-rw-r--r-- | src/grep.h | 3 | ||||
-rw-r--r-- | src/search.c | 173 |
3 files changed, 111 insertions, 94 deletions
@@ -761,6 +761,7 @@ print_line_middle (const char *beg, const char *lim) { size_t match_size; size_t match_offset; + const char *cur = beg; const char *mid = NULL; char *buf; /* XXX */ const char *ibeg; /* XXX */ @@ -781,9 +782,9 @@ print_line_middle (const char *beg, const char *lim) ibeg = beg; } - while ( lim > beg - && ( (match_offset = execute(ibeg, lim - beg, &match_size, 1)) - != (size_t) -1)) + while ( lim > cur + && ((match_offset = execute(ibeg, lim - beg, &match_size, + ibeg + (cur - beg))) != (size_t) -1)) { char const *b = beg + match_offset; @@ -798,7 +799,7 @@ print_line_middle (const char *beg, const char *lim) /* XXX - Could really advance by one whole multi-octet character. */ match_size = 1; if (!mid) - mid = beg; + mid = cur; } else { @@ -809,11 +810,10 @@ print_line_middle (const char *beg, const char *lim) PR_SGR_START(mlines_color); if (mid) { - fwrite (mid, sizeof (char), (beg - mid) + match_offset, stdout); + cur = mid; mid = NULL; } - else - fwrite (beg, sizeof (char), match_offset, stdout); + fwrite (cur, sizeof (char), b - cur, stdout); } PR_SGR_START_IF(grep_color); @@ -822,19 +822,18 @@ print_line_middle (const char *beg, const char *lim) if (only_matching) fputs("\n", stdout); } - beg = b + match_size; - ibeg += match_offset + match_size; /* XXX */ + cur = b + match_size; } if (buf) free(buf); /* XXX */ if (only_matching) - beg = lim; + cur = lim; else if (mid) - beg = mid; + cur = mid; - return beg; + return cur; } static const char * @@ -904,7 +903,8 @@ prpending (char const *lim) size_t match_size; --pending; if (outleft - || ((execute(lastout, nl + 1 - lastout, &match_size, 0) == (size_t) -1) + || ((execute(lastout, nl + 1 - lastout, + &match_size, NULL) == (size_t) -1) == !out_invert)) prline (lastout, nl + 1, SEP_CHAR_CONTEXT); else @@ -994,7 +994,8 @@ grepbuf (char const *beg, char const *lim) nlines = 0; p = beg; - while ((match_offset = execute(p, lim - p, &match_size, 0)) != (size_t) -1) + while ((match_offset = execute(p, lim - p, &match_size, + NULL)) != (size_t) -1) { char const *b = p + match_offset; char const *endp = b + match_size; @@ -34,7 +34,8 @@ (char const *pattern, size_t size) #define EXECUTE_RET size_t #define EXECUTE_ARGS \ - (char const *buf, size_t size, size_t *match_size, int exact) + (char const *buf, size_t size, size_t *match_size, char const *start_ptr) + /* start_ptr == NULL means the caller is not looking for an exact match. */ #ifdef GREP_PROGRAM /* Function definitions. */ diff --git a/src/search.c b/src/search.c index f06a75e2..206260d7 100644 --- a/src/search.c +++ b/src/search.c @@ -297,9 +297,9 @@ COMPILE_FCT(Ecompile) EXECUTE_FCT(EGexecute) { - register char const *buflim, *beg, *end; + register char const *buflim, *beg, *end, *match, *best_match; char eol = eolbyte; - int backref, start, len; + int backref, start, len, best_len; struct kwsmatch kwsm; size_t i, ret_val; #ifdef MBS_SUPPORT @@ -310,6 +310,8 @@ EXECUTE_FCT(EGexecute) { char *case_buf = xmalloc(size); memcpy(case_buf, buf, size); + if (start_ptr) + start_ptr = case_buf + (start_ptr - buf); buf = case_buf; } if (kwset) @@ -321,8 +323,9 @@ EXECUTE_FCT(EGexecute) for (beg = end = buf; end < buflim; beg = end) { - if (!exact) + if (!start_ptr) { + /* We don't care about an exact match. */ if (kwset) { /* Find a possible match using the KWset matcher. */ @@ -363,26 +366,38 @@ EXECUTE_FCT(EGexecute) goto success; } else - end = beg + size; + { + /* We are looking for the leftmost (then longest) exact match. + We will go through the outer loop only once. */ + beg = start_ptr; + end = buflim; + } /* If we've made it to this point, this means DFA has seen a probable match, and we need to run it through Regex. */ + best_match = end; + best_len = 0; for (i = 0; i < pcount; i++) { patterns[i].regexbuf.not_eol = 0; - if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, - end - beg - 1, 0, - end - beg - 1, &(patterns[i].regs)))) + if (0 <= (start = re_search (&(patterns[i].regexbuf), + buf, end - buf - 1, + beg - buf, end - beg - 1, + &(patterns[i].regs)))) { len = patterns[i].regs.end[0] - start; - if (exact) - { - *match_size = len; - return start; - } + match = buf + start; + if (match > best_match) + continue; + if (start_ptr && !match_words) + goto assess_pattern_match; if ((!match_lines && !match_words) || (match_lines && len == end - beg - 1)) - goto success; + { + match = beg; + len = end - beg; + goto assess_pattern_match; + } /* If -w, check if the match aligns with word boundaries. We do this iteratively because: (a) the line may contain more than one occurence of the @@ -391,53 +406,75 @@ EXECUTE_FCT(EGexecute) given point, and we may need to consider a shorter one to find a word boundary. */ if (match_words) - while (start >= 0) + while (match <= best_match) { - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) + if ((match == buf || !WCHAR ((unsigned char) match[-1])) && (len == end - beg - 1 - || !WCHAR ((unsigned char) beg[start + len]))) - goto success; + || !WCHAR ((unsigned char) match[len]))) + goto assess_pattern_match; if (len > 0) { /* Try a shorter length anchored at the same place. */ --len; patterns[i].regexbuf.not_eol = 1; - len = re_match (&(patterns[i].regexbuf), beg, - start + len, start, + len = re_match (&(patterns[i].regexbuf), + buf, match + len - beg, match - buf, &(patterns[i].regs)); } if (len <= 0) { /* Try looking further on. */ - if (start == end - beg - 1) + if (match == end - 1) break; - ++start; + match++; patterns[i].regexbuf.not_eol = 0; - start = re_search (&(patterns[i].regexbuf), beg, - end - beg - 1, - start, end - beg - 1 - start, + start = re_search (&(patterns[i].regexbuf), + buf, end - buf - 1, + match - buf, end - match - 1, &(patterns[i].regs)); + if (start < 0) + break; len = patterns[i].regs.end[0] - start; + match = buf + start; } - } - } + } /* while (match <= best_match) */ + continue; + assess_pattern_match: + if (!start_ptr) + { + /* Good enough for a non-exact match. + No need to look at further patterns, if any. */ + beg = match; + goto success_in_len; + } + if (match < best_match || (match == best_match && len > best_len)) + { + /* Best exact match: leftmost, then longest. */ + best_match = match; + best_len = len; + } + } /* if re_search >= 0 */ } /* for Regex patterns. */ + if (best_match < end) + { + /* We have found an exact match. We were just + waiting for the best one (leftmost then longest). */ + beg = best_match; + len = best_len; + goto success_in_len; + } } /* for (beg = end ..) */ failure: -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free((char*)buf); - if (mb_properties) - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return (size_t) -1; + ret_val = -1; + goto out; success: + len = end - beg; + success_in_len: + *match_size = len; ret_val = beg - buf; + out: #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { @@ -447,7 +484,6 @@ EXECUTE_FCT(EGexecute) free(mb_properties); } #endif /* MBS_SUPPORT */ - *match_size = end - beg; return ret_val; } #endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */ @@ -490,47 +526,27 @@ EXECUTE_FCT(Fexecute) { char *case_buf = xmalloc(size); memcpy(case_buf, buf, size); + if (start_ptr) + start_ptr = case_buf + (start_ptr - buf); buf = case_buf; } mb_properties = check_multibyte_string(buf, size); } #endif /* MBS_SUPPORT */ - for (beg = buf; beg <= buf + size; ++beg) + for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) { size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); if (offset == (size_t) -1) - { -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free ((char*)buf); - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return offset; - } + goto failure; #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) continue; /* It is a part of multibyte character. */ #endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; - if (exact) - { - *match_size = len; - ret_val = beg - buf; -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free ((char*)buf); - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return ret_val; - } + if (start_ptr && !match_words) + goto success_in_beg_and_len; if (match_lines) { if (beg > buf && beg[-1] != eol) @@ -552,31 +568,29 @@ EXECUTE_FCT(Fexecute) try = beg + offset; len = kwsmatch.size[0]; } - else + else if (!start_ptr) goto success; - } + else + goto success_in_beg_and_len; + } /* for (try) */ else goto success; - } + } /* for (beg in buf) */ -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - free((char*)buf); - if (mb_properties) - free(mb_properties); - } -#endif /* MBS_SUPPORT */ - return -1; + failure: + ret_val = -1; + goto out; success: end = memchr (beg + len, eol, (buf + size) - (beg + len)); end++; while (buf < beg && beg[-1] != eol) --beg; - *match_size = end - beg; + len = end - beg; + success_in_beg_and_len: + *match_size = len; ret_val = beg - buf; + out: #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { @@ -673,7 +687,8 @@ EXECUTE_FCT(Pexecute) is just for performance improvement in pcre_exec. */ int sub[300]; - int e = pcre_exec (cre, extra, buf, size, 0, 0, + int e = pcre_exec (cre, extra, buf, size, + start_ptr ? (start_ptr - buf) : 0, 0, sub, sizeof sub / sizeof *sub); if (e <= 0) @@ -697,7 +712,7 @@ EXECUTE_FCT(Pexecute) char const *end = buf + sub[1]; char const *buflim = buf + size; char eol = eolbyte; - if (!exact) + if (!start_ptr) { /* FIXME: The case when '\n' is not found indicates a bug: Since grep is line oriented, the match should never contain |