diff options
author | Paolo Bonzini <bonzini@gnu.org> | 2010-03-18 13:40:10 +0100 |
---|---|---|
committer | Paolo Bonzini <bonzini@gnu.org> | 2010-03-22 09:55:30 +0100 |
commit | c59a6cd03de84dc38c577083f34e3b0dfe87e36d (patch) | |
tree | 4971c8e1233d4823579dc7beb2fab820b0729cb3 | |
parent | 59040143e96ce960476c5a360d829256759ff4ab (diff) | |
download | grep-c59a6cd03de84dc38c577083f34e3b0dfe87e36d.tar.gz |
grep: split search.c
* po/POTFILES.in: Update.
* src/Makefile.am (grep_SOURCES, egrep_SOURCES, fgrep_SOURCES): Move
kwset.c and dfa.c to libsearch.a. Add searchutils.c there too.
* src/search.h, src/dfasearch.c, src/pcresearch.c, src/kwsearch.c,
src/searchutils.c: New files, split out of src/search.c.
* src/esearch.c, src/fsearch.c: Include the new files instead of search.c.
* src/gsearch.c: Likewise, plus move Gcompile/Acompile here.
-rw-r--r-- | po/POTFILES.in | 3 | ||||
-rw-r--r-- | src/Makefile.am | 18 | ||||
-rw-r--r-- | src/dfasearch.c (renamed from src/search.c) | 462 | ||||
-rw-r--r-- | src/esearch.c | 3 | ||||
-rw-r--r-- | src/fsearch.c | 3 | ||||
-rw-r--r-- | src/gsearch.c | 20 | ||||
-rw-r--r-- | src/kwsearch.c | 162 | ||||
-rw-r--r-- | src/pcresearch.c | 178 | ||||
-rw-r--r-- | src/search.h | 47 | ||||
-rw-r--r-- | src/searchutils.c | 141 |
10 files changed, 565 insertions, 472 deletions
diff --git a/po/POTFILES.in b/po/POTFILES.in index 920413ec..e2454a0b 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -26,4 +26,5 @@ lib/xstrtol-error.c src/dfa.c src/grep.c src/kwset.c -src/search.c +src/dfasearch.c +src/pcresearch.c diff --git a/src/Makefile.am b/src/Makefile.am index 0b0140ec..7ebc126d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -19,16 +19,20 @@ LN = ln AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS) bin_PROGRAMS = grep egrep fgrep -grep_SOURCES = grep.c gsearch.c kwset.c dfa.c -egrep_SOURCES = egrep.c esearch.c kwset.c dfa.c -fgrep_SOURCES = fgrep.c fsearch.c kwset.c -noinst_HEADERS = grep.h dfa.h kwset.h system.h mbsupport.h +grep_SOURCES = grep.c gsearch.c +egrep_SOURCES = egrep.c esearch.c +fgrep_SOURCES = fgrep.c fsearch.c +noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h -LDADD = $(LIBINTL) ../lib/libgreputils.a -grep_LDADD = $(PCRE_LIBS) $(LDADD) +noinst_LIBRARIES = libsearch.a +libsearch_a_SOURCES = kwset.c dfa.c searchutils.c + +LDADD = $(LIBINTL) libsearch.a ../lib/libgreputils.a +grep_LDADD = $(LDADD) $(PCRE_LIBS) localedir = $(datadir)/locale INCLUDES = -I$(top_srcdir)/lib -DLOCALEDIR=\"$(localedir)\" EXTRA_DIST = \ - dosbuf.c search.c \ + dosbuf.c \ + pcresearch.c dfasearch.c kwsearch.c \ vms_fab.c vms_fab.h diff --git a/src/search.c b/src/dfasearch.c index 5e542516..707874c4 100644 --- a/src/search.c +++ b/src/dfasearch.c @@ -1,4 +1,4 @@ -/* search.c - searching subroutines using dfa, kwset and regex for grep. +/* dfasearch.c - searching subroutines using dfa and regex for grep. Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify @@ -19,30 +19,8 @@ /* Written August 1992 by Mike Haertel. */ #include <config.h> - -#include <sys/types.h> - -#include "mbsupport.h" -#ifdef MBS_SUPPORT -/* We can handle multibyte strings. */ -# include <wchar.h> -# include <wctype.h> -#endif - -#include "system.h" -#include "grep.h" -#ifndef FGREP_PROGRAM -# include <regex.h> -# include "dfa.h" -#endif -#include "kwset.h" -#include "error.h" -#include "xalloc.h" -#ifdef HAVE_LIBPCRE -# include <pcre.h> -#endif - -#define NCHAR (UCHAR_MAX + 1) +#include "search.h" +#include "dfa.h" /* For -w, we also consider _ to be word constituent. */ #define WCHAR(C) (ISALNUM(C) || (C) == '_') @@ -52,96 +30,6 @@ any string matching the regexp. */ static kwset_t kwset; -static void -kwsinit (void) -{ - static char trans[NCHAR]; - int i; - - if (match_icase && MB_CUR_MAX == 1) - { - for (i = 0; i < NCHAR; ++i) - trans[i] = TOLOWER (i); - - kwset = kwsalloc (trans); - } - else - kwset = kwsalloc (NULL); - - if (!kwset) - xalloc_die (); -} - -#ifdef MBS_SUPPORT -/* Convert the *N-byte string, BEG, to lowercase, and write the - NUL-terminated result into malloc'd storage. Upon success, set *N - to the length (in bytes) of the resulting string (not including the - trailing NUL byte), and return a pointer to the lowercase string. - Upon memory allocation failure, this function exits. - - Note that while this function returns a pointer to malloc'd storage, - the caller must not free it, since this function retains a pointer - to the buffer and reuses it on any subsequent call. As a consequence, - this function is not thread-safe. */ -static char * -mbtolower (const char *beg, size_t *n) -{ - static char *out; - static size_t outalloc; - size_t outlen, mb_cur_max; - mbstate_t is, os; - const char *end; - char *p; - - if (*n > outalloc) - { - out = xrealloc (out, *n); - outalloc = *n; - } - - memset (&is, 0, sizeof (is)); - memset (&os, 0, sizeof (os)); - end = beg + *n; - - mb_cur_max = MB_CUR_MAX; - p = out; - outlen = 0; - while (beg < end) - { - wchar_t wc; - size_t mbclen = mbrtowc(&wc, beg, end - beg, &is); - if (outlen + mb_cur_max >= outalloc) - { - out = x2nrealloc (out, &outalloc, 1); - p = out + outlen; - } - - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) - { - /* An invalid sequence, or a truncated multi-octet character. - We treat it as a single-octet character. */ - *p++ = *beg++; - outlen++; - memset (&is, 0, sizeof (is)); - memset (&os, 0, sizeof (os)); - } - else - { - beg += mbclen; - mbclen = wcrtomb (p, towlower ((wint_t) wc), &os); - p += mbclen; - outlen += mbclen; - } - } - - *n = p - out; - *p++ = 0; - return out; -} -#endif - - -#ifndef FGREP_PROGRAM /* DFA compiled regexp. */ static struct dfa dfa; @@ -196,7 +84,7 @@ kwsmusts (void) if (dfa.musts) { - kwsinit (); + kwsinit (&kwset); /* First, we compile in the substrings known to be exact matches. The kwset matcher will return the index of the matching string that it chooses. */ @@ -221,42 +109,7 @@ kwsmusts (void) error (EXIT_TROUBLE, 0, "%s", err); } } -#endif /* !FGREP_PROGRAM */ -#ifdef MBS_SUPPORT - -static bool -is_mb_middle(const char **good, const char *buf, const char *end) -{ - const char *p = *good; - const char *prev = p; - mbstate_t cur_state; - - /* TODO: can be optimized for UTF-8. */ - memset(&cur_state, 0, sizeof(mbstate_t)); - while (p < buf) - { - size_t mbclen = mbrlen(p, end - p, &cur_state); - - /* Store the beginning of the previous complete multibyte character. */ - if (mbclen != (size_t) -2) - prev = p; - - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) - { - /* An invalid sequence, or a truncated multibyte character. - We treat it as a single byte character. */ - mbclen = 1; - } - p += mbclen; - } - - *good = prev; - return p > buf; -} -#endif /* MBS_SUPPORT */ - -#if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) /* No __VA_ARGS__ in C89. So we have to do it this way. */ static void GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) @@ -342,23 +195,6 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) free(motif); } -#ifndef EGREP_PROGRAM -static void -Gcompile (char const *pattern, size_t size) -{ - return GEAcompile (pattern, size, - (RE_SYNTAX_GREP - | RE_HAT_LISTS_NOT_NEWLINE - | RE_NO_EMPTY_RANGES)); -} - -static void -Acompile (char const *pattern, size_t size) -{ - return GEAcompile (pattern, size, RE_SYNTAX_AWK); -} -#endif /* !EGREP_PROGRAM */ - static void Ecompile (char const *pattern, size_t size) { @@ -557,293 +393,3 @@ EGexecute (char const *buf, size_t size, size_t *match_size, out: return ret_val; } -#endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */ - -#if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) -static void -Fcompile (char const *pattern, size_t size) -{ - char const *beg, *end, *lim, *err, *pat; - size_t psize; - - kwsinit (); - psize = size; - if (match_icase && MB_CUR_MAX > 1) - pat = mbtolower (pattern, &psize); - else - pat = pattern; - - beg = pat; - do - { - for (lim = beg;; ++lim) - { - end = lim; - if (lim >= pat + psize) - break; - if (*lim == '\n') - { - lim++; - break; - } -#if HAVE_DOS_FILE_CONTENTS - if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n') - { - lim += 2; - break; - } -#endif - } - - if ((err = kwsincr (kwset, beg, end - beg)) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); - beg = lim; - } - while (beg < pat + psize); - - if ((err = kwsprep (kwset)) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); -} - -static size_t -Fexecute (char const *buf, size_t size, size_t *match_size, - char const *start_ptr) -{ - char const *beg, *try, *end, *mb_start; - size_t len; - char eol = eolbyte; - struct kwsmatch kwsmatch; - size_t ret_val; -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) - { - if (match_icase) - { - char *case_buf = mbtolower (buf, &size); - if (start_ptr) - start_ptr = case_buf + (start_ptr - buf); - buf = case_buf; - } - } -#endif /* MBS_SUPPORT */ - - for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) - { - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); - if (offset == (size_t) -1) - goto failure; -#ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size)) - { - beg = mb_start - 1; - continue; /* It is a part of multibyte character. */ - } -#endif /* MBS_SUPPORT */ - beg += offset; - len = kwsmatch.size[0]; - if (start_ptr && !match_words) - goto success_in_beg_and_len; - if (match_lines) - { - if (beg > buf && beg[-1] != eol) - continue; - if (beg + len < buf + size && beg[len] != eol) - continue; - goto success; - } - else if (match_words) - for (try = beg; len; ) - { - if (try > buf && WCHAR((unsigned char) try[-1])) - break; - if (try + len < buf + size && WCHAR((unsigned char) try[len])) - { - offset = kwsexec (kwset, beg, --len, &kwsmatch); - if (offset == (size_t) -1) - break; - try = beg + offset; - len = kwsmatch.size[0]; - } - else if (!start_ptr) - goto success; - else - goto success_in_beg_and_len; - } /* for (try) */ - else - goto success; - } /* for (beg in buf) */ - - failure: - ret_val = -1; - goto out; - - success: - if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL) - end++; - else - end = buf + size; - while (buf < beg && beg[-1] != eol) - --beg; - len = end - beg; - success_in_beg_and_len: - *match_size = len; - ret_val = beg - buf; - out: - return ret_val; -} -#endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */ - -#ifdef GREP_PROGRAM -#if HAVE_LIBPCRE -/* Compiled internal form of a Perl regular expression. */ -static pcre *cre; - -/* Additional information about the pattern. */ -static pcre_extra *extra; -#endif - -static void -Pcompile (char const *pattern, size_t size) -{ -#if !HAVE_LIBPCRE - error (EXIT_TROUBLE, 0, "%s", - _("support for the -P option is not compiled into " - "this --disable-perl-regexp binary")); -#else - int e; - char const *ep; - char *re = xmalloc (4 * size + 7); - int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); - char const *patlim = pattern + size; - char *n = re; - char const *p; - char const *pnul; - - /* FIXME: Remove these restrictions. */ - if (memchr(pattern, '\n', size)) - error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); - - *n = '\0'; - if (match_lines) - strcpy (n, "^("); - if (match_words) - strcpy (n, "\\b("); - n += strlen (n); - - /* The PCRE interface doesn't allow NUL bytes in the pattern, so - replace each NUL byte in the pattern with the four characters - "\000", removing a preceding backslash if there are an odd - number of backslashes before the NUL. - - FIXME: This method does not work with some multibyte character - encodings, notably Shift-JIS, where a multibyte character can end - in a backslash byte. */ - for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) - { - memcpy (n, p, pnul - p); - n += pnul - p; - for (p = pnul; pattern < p && p[-1] == '\\'; p--) - continue; - n -= (pnul - p) & 1; - strcpy (n, "\\000"); - n += 4; - } - - memcpy (n, p, patlim - p); - n += patlim - p; - *n = '\0'; - if (match_words) - strcpy (n, ")\\b"); - if (match_lines) - strcpy (n, ")$"); - - cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); - if (!cre) - error (EXIT_TROUBLE, 0, "%s", ep); - - extra = pcre_study (cre, 0, &ep); - if (ep) - error (EXIT_TROUBLE, 0, "%s", ep); - - free (re); -#endif -} - -static size_t -Pexecute (char const *buf, size_t size, size_t *match_size, - char const *start_ptr) -{ -#if !HAVE_LIBPCRE - abort (); - return -1; -#else - /* This array must have at least two elements; everything after that - is just for performance improvement in pcre_exec. */ - int sub[300]; - - const char *line_buf, *line_end, *line_next; - int e = PCRE_ERROR_NOMATCH; - ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0; - - /* PCRE can't limit the matching to single lines, therefore we have to - match each line in the buffer separately. */ - for (line_next = buf; - e == PCRE_ERROR_NOMATCH && line_next < buf + size; - start_ofs -= line_next - line_buf) - { - line_buf = line_next; - line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf); - if (line_end == NULL) - line_next = line_end = buf + size; - else - line_next = line_end + 1; - - if (start_ptr && start_ptr >= line_end) - continue; - - e = pcre_exec (cre, extra, line_buf, line_end - line_buf, - start_ofs < 0 ? 0 : start_ofs, 0, - sub, sizeof sub / sizeof *sub); - } - - if (e <= 0) - { - switch (e) - { - case PCRE_ERROR_NOMATCH: - return -1; - - case PCRE_ERROR_NOMEMORY: - error (EXIT_TROUBLE, 0, _("memory exhausted")); - - default: - abort (); - } - } - else - { - /* Narrow down to the line we've found. */ - char const *beg = line_buf + sub[0]; - char const *end = line_buf + sub[1]; - char const *buflim = buf + size; - char eol = eolbyte; - if (!start_ptr) - { - /* FIXME: The case when '\n' is not found indicates a bug: - Since grep is line oriented, the match should never contain - a newline, so there _must_ be a newline following. - */ - if (!(end = memchr (end, eol, buflim - end))) - end = buflim; - else - end++; - while (buf < beg && beg[-1] != eol) - --beg; - } - - *match_size = end - beg; - return beg - buf; - } -#endif -} -#endif /* GREP_PROGRAM */ diff --git a/src/esearch.c b/src/esearch.c index d76c310a..8c749c8b 100644 --- a/src/esearch.c +++ b/src/esearch.c @@ -1,5 +1,4 @@ -#define EGREP_PROGRAM -#include "search.c" +#include "dfasearch.c" struct matcher const matchers[] = { { "egrep", Ecompile, EGexecute }, diff --git a/src/fsearch.c b/src/fsearch.c index e1ca0b19..b16e7693 100644 --- a/src/fsearch.c +++ b/src/fsearch.c @@ -1,5 +1,4 @@ -#define FGREP_PROGRAM -#include "search.c" +#include "kwsearch.c" struct matcher const matchers[] = { { "fgrep", Fcompile, Fexecute }, diff --git a/src/gsearch.c b/src/gsearch.c index e3e0423a..4d8b7730 100644 --- a/src/gsearch.c +++ b/src/gsearch.c @@ -1,4 +1,21 @@ -#include "search.c" +#include "dfasearch.c" +#include "pcresearch.c" +#include "kwsearch.c" + +static void +Gcompile (char const *pattern, size_t size) +{ + return GEAcompile (pattern, size, + RE_SYNTAX_GREP + | RE_HAT_LISTS_NOT_NEWLINE + | RE_NO_EMPTY_RANGES); +} + +static void +Acompile (char const *pattern, size_t size) +{ + return GEAcompile (pattern, size, RE_SYNTAX_AWK); +} struct matcher const matchers[] = { { "grep", Gcompile, EGexecute }, @@ -8,4 +25,3 @@ struct matcher const matchers[] = { { "perl", Pcompile, Pexecute }, { NULL, NULL, NULL }, }; - diff --git a/src/kwsearch.c b/src/kwsearch.c new file mode 100644 index 00000000..245ccf0a --- /dev/null +++ b/src/kwsearch.c @@ -0,0 +1,162 @@ +/* kwsearch.c - searching subroutines using kwset for grep. + Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written August 1992 by Mike Haertel. */ + +#include <config.h> +#include "search.h" + +/* For -w, we also consider _ to be word constituent. */ +#define WCHAR(C) (ISALNUM(C) || (C) == '_') + +/* KWset compiled pattern. For Ecompile and Gcompile, we compile + a list of strings, at least one of which is known to occur in + any string matching the regexp. */ +static kwset_t kwset; + +static void +Fcompile (char const *pattern, size_t size) +{ + char const *beg, *end, *lim, *err, *pat; + size_t psize; + + kwsinit (&kwset); + psize = size; + if (match_icase && MB_CUR_MAX > 1) + pat = mbtolower (pattern, &psize); + else + pat = pattern; + + beg = pat; + do + { + for (lim = beg;; ++lim) + { + end = lim; + if (lim >= pat + psize) + break; + if (*lim == '\n') + { + lim++; + break; + } +#if HAVE_DOS_FILE_CONTENTS + if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n') + { + lim += 2; + break; + } +#endif + } + + if ((err = kwsincr (kwset, beg, end - beg)) != NULL) + error (EXIT_TROUBLE, 0, "%s", err); + beg = lim; + } + while (beg < pat + psize); + + if ((err = kwsprep (kwset)) != NULL) + error (EXIT_TROUBLE, 0, "%s", err); +} + +static size_t +Fexecute (char const *buf, size_t size, size_t *match_size, + char const *start_ptr) +{ + char const *beg, *try, *end, *mb_start; + size_t len; + char eol = eolbyte; + struct kwsmatch kwsmatch; + size_t ret_val; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + if (match_icase) + { + char *case_buf = mbtolower (buf, &size); + if (start_ptr) + start_ptr = case_buf + (start_ptr - buf); + buf = case_buf; + } + } +#endif /* MBS_SUPPORT */ + + for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) + { + size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + if (offset == (size_t) -1) + goto failure; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size)) + { + beg = mb_start - 1; + continue; /* It is a part of multibyte character. */ + } +#endif /* MBS_SUPPORT */ + beg += offset; + len = kwsmatch.size[0]; + if (start_ptr && !match_words) + goto success_in_beg_and_len; + if (match_lines) + { + if (beg > buf && beg[-1] != eol) + continue; + if (beg + len < buf + size && beg[len] != eol) + continue; + goto success; + } + else if (match_words) + for (try = beg; len; ) + { + if (try > buf && WCHAR((unsigned char) try[-1])) + break; + if (try + len < buf + size && WCHAR((unsigned char) try[len])) + { + offset = kwsexec (kwset, beg, --len, &kwsmatch); + if (offset == (size_t) -1) + break; + try = beg + offset; + len = kwsmatch.size[0]; + } + else if (!start_ptr) + goto success; + else + goto success_in_beg_and_len; + } /* for (try) */ + else + goto success; + } /* for (beg in buf) */ + + failure: + ret_val = -1; + goto out; + + success: + if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL) + end++; + else + end = buf + size; + while (buf < beg && beg[-1] != eol) + --beg; + len = end - beg; + success_in_beg_and_len: + *match_size = len; + ret_val = beg - buf; + out: + return ret_val; +} diff --git a/src/pcresearch.c b/src/pcresearch.c new file mode 100644 index 00000000..f09acdc7 --- /dev/null +++ b/src/pcresearch.c @@ -0,0 +1,178 @@ +/* pcresearch.c - searching subroutines using PCRE for grep. + Copyright 2000, 2007, 2009-2010 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* Written August 1992 by Mike Haertel. */ + +#include <config.h> +#include "search.h" +#ifdef HAVE_LIBPCRE +# include <pcre.h> +#endif + +#if HAVE_LIBPCRE +/* Compiled internal form of a Perl regular expression. */ +static pcre *cre; + +/* Additional information about the pattern. */ +static pcre_extra *extra; +#endif + +static void +Pcompile (char const *pattern, size_t size) +{ +#if !HAVE_LIBPCRE + error (EXIT_TROUBLE, 0, "%s", + _("support for the -P option is not compiled into " + "this --disable-perl-regexp binary")); +#else + int e; + char const *ep; + char *re = xmalloc (4 * size + 7); + int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); + char const *patlim = pattern + size; + char *n = re; + char const *p; + char const *pnul; + + /* FIXME: Remove these restrictions. */ + if (memchr(pattern, '\n', size)) + error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); + + *n = '\0'; + if (match_lines) + strcpy (n, "^("); + if (match_words) + strcpy (n, "\\b("); + n += strlen (n); + + /* The PCRE interface doesn't allow NUL bytes in the pattern, so + replace each NUL byte in the pattern with the four characters + "\000", removing a preceding backslash if there are an odd + number of backslashes before the NUL. + + FIXME: This method does not work with some multibyte character + encodings, notably Shift-JIS, where a multibyte character can end + in a backslash byte. */ + for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) + { + memcpy (n, p, pnul - p); + n += pnul - p; + for (p = pnul; pattern < p && p[-1] == '\\'; p--) + continue; + n -= (pnul - p) & 1; + strcpy (n, "\\000"); + n += 4; + } + + memcpy (n, p, patlim - p); + n += patlim - p; + *n = '\0'; + if (match_words) + strcpy (n, ")\\b"); + if (match_lines) + strcpy (n, ")$"); + + cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); + if (!cre) + error (EXIT_TROUBLE, 0, "%s", ep); + + extra = pcre_study (cre, 0, &ep); + if (ep) + error (EXIT_TROUBLE, 0, "%s", ep); + + free (re); +#endif +} + +static size_t +Pexecute (char const *buf, size_t size, size_t *match_size, + char const *start_ptr) +{ +#if !HAVE_LIBPCRE + abort (); + return -1; +#else + /* This array must have at least two elements; everything after that + is just for performance improvement in pcre_exec. */ + int sub[300]; + + const char *line_buf, *line_end, *line_next; + int e = PCRE_ERROR_NOMATCH; + ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0; + + /* PCRE can't limit the matching to single lines, therefore we have to + match each line in the buffer separately. */ + for (line_next = buf; + e == PCRE_ERROR_NOMATCH && line_next < buf + size; + start_ofs -= line_next - line_buf) + { + line_buf = line_next; + line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf); + if (line_end == NULL) + line_next = line_end = buf + size; + else + line_next = line_end + 1; + + if (start_ptr && start_ptr >= line_end) + continue; + + e = pcre_exec (cre, extra, line_buf, line_end - line_buf, + start_ofs < 0 ? 0 : start_ofs, 0, + sub, sizeof sub / sizeof *sub); + } + + if (e <= 0) + { + switch (e) + { + case PCRE_ERROR_NOMATCH: + return -1; + + case PCRE_ERROR_NOMEMORY: + error (EXIT_TROUBLE, 0, _("memory exhausted")); + + default: + abort (); + } + } + else + { + /* Narrow down to the line we've found. */ + char const *beg = line_buf + sub[0]; + char const *end = line_buf + sub[1]; + char const *buflim = buf + size; + char eol = eolbyte; + if (!start_ptr) + { + /* FIXME: The case when '\n' is not found indicates a bug: + Since grep is line oriented, the match should never contain + a newline, so there _must_ be a newline following. + */ + if (!(end = memchr (end, eol, buflim - end))) + end = buflim; + else + end++; + while (buf < beg && beg[-1] != eol) + --beg; + } + + *match_size = end - beg; + return beg - buf; + } +#endif +} diff --git a/src/search.h b/src/search.h new file mode 100644 index 00000000..cb3b535f --- /dev/null +++ b/src/search.h @@ -0,0 +1,47 @@ +/* search.c - searching subroutines using dfa, kwset and regex for grep. + Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#ifndef GREP_SEARCH_H +#define GREP_SEARCH_H 1 + +#include <config.h> + +#include <sys/types.h> + +#include "mbsupport.h" +#ifdef MBS_SUPPORT +/* We can handle multibyte strings. */ +# include <wchar.h> +# include <wctype.h> +#endif + +#include <regex.h> +#include "system.h" +#include "grep.h" +#include "error.h" +#include "kwset.h" +#include "xalloc.h" + +void kwsinit (kwset_t *); + +#ifdef MBS_SUPPORT +char * mbtolower (const char *, size_t *); +bool is_mb_middle(const char **, const char *, const char *); +#endif + +#endif /* GREP_SEARCH_H */ diff --git a/src/searchutils.c b/src/searchutils.c new file mode 100644 index 00000000..ef4fef39 --- /dev/null +++ b/src/searchutils.c @@ -0,0 +1,141 @@ +/* searchutils.c - helper subroutines for grep's matchers. + Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA + 02110-1301, USA. */ + +#include <config.h> +#include "search.h" + +#define NCHAR (UCHAR_MAX + 1) + +void +kwsinit (kwset_t *kwset) +{ + static char trans[NCHAR]; + int i; + + if (match_icase && MB_CUR_MAX == 1) + { + for (i = 0; i < NCHAR; ++i) + trans[i] = TOLOWER (i); + + *kwset = kwsalloc (trans); + } + else + *kwset = kwsalloc (NULL); + + if (!*kwset) + xalloc_die (); +} + +#ifdef MBS_SUPPORT +/* Convert the *N-byte string, BEG, to lowercase, and write the + NUL-terminated result into malloc'd storage. Upon success, set *N + to the length (in bytes) of the resulting string (not including the + trailing NUL byte), and return a pointer to the lowercase string. + Upon memory allocation failure, this function exits. + + Note that while this function returns a pointer to malloc'd storage, + the caller must not free it, since this function retains a pointer + to the buffer and reuses it on any subsequent call. As a consequence, + this function is not thread-safe. */ +char * +mbtolower (const char *beg, size_t *n) +{ + static char *out; + static size_t outalloc; + size_t outlen, mb_cur_max; + mbstate_t is, os; + const char *end; + char *p; + + if (*n > outalloc) + { + out = xrealloc (out, *n); + outalloc = *n; + } + + memset (&is, 0, sizeof (is)); + memset (&os, 0, sizeof (os)); + end = beg + *n; + + mb_cur_max = MB_CUR_MAX; + p = out; + outlen = 0; + while (beg < end) + { + wchar_t wc; + size_t mbclen = mbrtowc(&wc, beg, end - beg, &is); + if (outlen + mb_cur_max >= outalloc) + { + out = x2nrealloc (out, &outalloc, 1); + p = out + outlen; + } + + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + { + /* An invalid sequence, or a truncated multi-octet character. + We treat it as a single-octet character. */ + *p++ = *beg++; + outlen++; + memset (&is, 0, sizeof (is)); + memset (&os, 0, sizeof (os)); + } + else + { + beg += mbclen; + mbclen = wcrtomb (p, towlower ((wint_t) wc), &os); + p += mbclen; + outlen += mbclen; + } + } + + *n = p - out; + *p++ = 0; + return out; +} + + +bool +is_mb_middle(const char **good, const char *buf, const char *end) +{ + const char *p = *good; + const char *prev = p; + mbstate_t cur_state; + + /* TODO: can be optimized for UTF-8. */ + memset(&cur_state, 0, sizeof(mbstate_t)); + while (p < buf) + { + size_t mbclen = mbrlen(p, end - p, &cur_state); + + /* Store the beginning of the previous complete multibyte character. */ + if (mbclen != (size_t) -2) + prev = p; + + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + { + /* An invalid sequence, or a truncated multibyte character. + We treat it as a single byte character. */ + mbclen = 1; + } + p += mbclen; + } + + *good = prev; + return p > buf; +} +#endif /* MBS_SUPPORT */ |