grep: split search.c

* po/POTFILES.in: Update. * src/Makefile.am (grep_SOURCES, egrep_SOURCES, fgrep_SOURCES): Move kwset.c and dfa.c to libsearch.a. Add searchutils.c there too. * src/search.h, src/dfasearch.c, src/pcresearch.c, src/kwsearch.c, src/searchutils.c: New files, split out of src/search.c. * src/esearch.c, src/fsearch.c: Include the new files instead of search.c. * src/gsearch.c: Likewise, plus move Gcompile/Acompile here.
author: Paolo Bonzini <bonzini@gnu.org> 2010-03-18 13:40:10 +0100
committer: Paolo Bonzini <bonzini@gnu.org> 2010-03-22 09:55:30 +0100
commit: c59a6cd03de84dc38c577083f34e3b0dfe87e36d (patch)
tree: 4971c8e1233d4823579dc7beb2fab820b0729cb3
parent: 59040143e96ce960476c5a360d829256759ff4ab (diff)
download: grep-c59a6cd03de84dc38c577083f34e3b0dfe87e36d.tar.gz
10 files changed, 565 insertions, 472 deletions
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 920413ec..e2454a0b 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -26,4 +26,5 @@ lib/xstrtol-error.c
 src/dfa.c
 src/grep.c
 src/kwset.c
-src/search.c
+src/dfasearch.c
+src/pcresearch.c
diff --git a/src/Makefile.am b/src/Makefile.am
index 0b0140ec..7ebc126d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,16 +19,20 @@ LN = ln
 AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
 
 bin_PROGRAMS = grep egrep fgrep
-grep_SOURCES  =  grep.c gsearch.c kwset.c dfa.c
-egrep_SOURCES = egrep.c esearch.c kwset.c dfa.c
-fgrep_SOURCES = fgrep.c fsearch.c kwset.c
-noinst_HEADERS = grep.h dfa.h kwset.h system.h mbsupport.h
+grep_SOURCES  =  grep.c gsearch.c
+egrep_SOURCES = egrep.c esearch.c
+fgrep_SOURCES = fgrep.c fsearch.c
+noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h
 
-LDADD = $(LIBINTL) ../lib/libgreputils.a
-grep_LDADD = $(PCRE_LIBS) $(LDADD)
+noinst_LIBRARIES = libsearch.a
+libsearch_a_SOURCES = kwset.c dfa.c searchutils.c
+
+LDADD = $(LIBINTL) libsearch.a ../lib/libgreputils.a
+grep_LDADD = $(LDADD) $(PCRE_LIBS)
 localedir = $(datadir)/locale
 INCLUDES = -I$(top_srcdir)/lib -DLOCALEDIR=\"$(localedir)\"
 
 EXTRA_DIST = \
-             dosbuf.c search.c \
+             dosbuf.c \
+	     pcresearch.c dfasearch.c kwsearch.c \
              vms_fab.c vms_fab.h
diff --git a/src/search.c b/src/dfasearch.c
index 5e542516..707874c4 100644
--- a/src/search.c
+++ b/src/dfasearch.c
@@ -1,4 +1,4 @@
-/* search.c - searching subroutines using dfa, kwset and regex for grep.
+/* dfasearch.c - searching subroutines using dfa and regex for grep.
    Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
@@ -19,30 +19,8 @@
 /* Written August 1992 by Mike Haertel. */
 
 #include <config.h>
-
-#include <sys/types.h>
-
-#include "mbsupport.h"
-#ifdef MBS_SUPPORT
-/* We can handle multibyte strings. */
-# include <wchar.h>
-# include <wctype.h>
-#endif
-
-#include "system.h"
-#include "grep.h"
-#ifndef FGREP_PROGRAM
-# include <regex.h>
-# include "dfa.h"
-#endif
-#include "kwset.h"
-#include "error.h"
-#include "xalloc.h"
-#ifdef HAVE_LIBPCRE
-# include <pcre.h>
-#endif
-
-#define NCHAR (UCHAR_MAX + 1)
+#include "search.h"
+#include "dfa.h"
 
 /* For -w, we also consider _ to be word constituent.  */
 #define WCHAR(C) (ISALNUM(C) || (C) == '_')
@@ -52,96 +30,6 @@
    any string matching the regexp. */
 static kwset_t kwset;
 
-static void
-kwsinit (void)
-{
-  static char trans[NCHAR];
-  int i;
-
-  if (match_icase && MB_CUR_MAX == 1)
-    {
-      for (i = 0; i < NCHAR; ++i)
-        trans[i] = TOLOWER (i);
-
-      kwset = kwsalloc (trans);
-    }
-  else
-    kwset = kwsalloc (NULL);
-
-  if (!kwset)
-    xalloc_die ();
-}
-
-#ifdef MBS_SUPPORT
-/* Convert the *N-byte string, BEG, to lowercase, and write the
-   NUL-terminated result into malloc'd storage.  Upon success, set *N
-   to the length (in bytes) of the resulting string (not including the
-   trailing NUL byte), and return a pointer to the lowercase string.
-   Upon memory allocation failure, this function exits.
-
-   Note that while this function returns a pointer to malloc'd storage,
-   the caller must not free it, since this function retains a pointer
-   to the buffer and reuses it on any subsequent call.  As a consequence,
-   this function is not thread-safe.  */
-static char *
-mbtolower (const char *beg, size_t *n)
-{
-  static char *out;
-  static size_t outalloc;
-  size_t outlen, mb_cur_max;
-  mbstate_t is, os;
-  const char *end;
-  char *p;
-
-  if (*n > outalloc)
-    {
-      out = xrealloc (out, *n);
-      outalloc = *n;
-    }
-
-  memset (&is, 0, sizeof (is));
-  memset (&os, 0, sizeof (os));
-  end = beg + *n;
-
-  mb_cur_max = MB_CUR_MAX;
-  p = out;
-  outlen = 0;
-  while (beg < end)
-    {
-      wchar_t wc;
-      size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
-      if (outlen + mb_cur_max >= outalloc)
-        {
-          out = x2nrealloc (out, &outalloc, 1);
-          p = out + outlen;
-        }
-
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-        {
-          /* An invalid sequence, or a truncated multi-octet character.
-             We treat it as a single-octet character.  */
-          *p++ = *beg++;
-          outlen++;
-          memset (&is, 0, sizeof (is));
-          memset (&os, 0, sizeof (os));
-        }
-      else
-        {
-          beg += mbclen;
-          mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
-          p += mbclen;
-          outlen += mbclen;
-        }
-    }
-
-  *n = p - out;
-  *p++ = 0;
-  return out;
-}
-#endif
-
-
-#ifndef FGREP_PROGRAM
 /* DFA compiled regexp. */
 static struct dfa dfa;
 
@@ -196,7 +84,7 @@ kwsmusts (void)
 
   if (dfa.musts)
     {
-      kwsinit ();
+      kwsinit (&kwset);
       /* First, we compile in the substrings known to be exact
 	 matches.  The kwset matcher will return the index
 	 of the matching string that it chooses. */
@@ -221,42 +109,7 @@ kwsmusts (void)
 	error (EXIT_TROUBLE, 0, "%s", err);
     }
 }
-#endif /* !FGREP_PROGRAM */
 
-#ifdef MBS_SUPPORT
-
-static bool
-is_mb_middle(const char **good, const char *buf, const char *end)
-{
-  const char *p = *good;
-  const char *prev = p;
-  mbstate_t cur_state;
-
-  /* TODO: can be optimized for UTF-8.  */
-  memset(&cur_state, 0, sizeof(mbstate_t));
-  while (p < buf)
-    {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
-
-      /* Store the beginning of the previous complete multibyte character.  */
-      if (mbclen != (size_t) -2)
-        prev = p;
-
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-	{
-	  /* An invalid sequence, or a truncated multibyte character.
-	     We treat it as a single byte character.  */
-	  mbclen = 1;
-	}
-      p += mbclen;
-    }
-
-  *good = prev;
-  return p > buf;
-}
-#endif /* MBS_SUPPORT */
-
-#if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
 /* No __VA_ARGS__ in C89.  So we have to do it this way.  */
 static void
 GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
@@ -342,23 +195,6 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
   free(motif);
 }
 
-#ifndef EGREP_PROGRAM
-static void
-Gcompile (char const *pattern, size_t size)
-{
-  return GEAcompile (pattern, size,
-		     (RE_SYNTAX_GREP
-		      | RE_HAT_LISTS_NOT_NEWLINE
-		      | RE_NO_EMPTY_RANGES));
-}
-
-static void
-Acompile (char const *pattern, size_t size)
-{
-  return GEAcompile (pattern, size, RE_SYNTAX_AWK);
-}
-#endif /* !EGREP_PROGRAM */
-
 static void
 Ecompile (char const *pattern, size_t size)
 {
@@ -557,293 +393,3 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
  out:
   return ret_val;
 }
-#endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
-
-#if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
-static void
-Fcompile (char const *pattern, size_t size)
-{
-  char const *beg, *end, *lim, *err, *pat;
-  size_t psize;
-
-  kwsinit ();
-  psize = size;
-  if (match_icase && MB_CUR_MAX > 1)
-    pat = mbtolower (pattern, &psize);
-  else
-    pat = pattern;
-
-  beg = pat;
-  do
-    {
-      for (lim = beg;; ++lim)
-	{
-	  end = lim;
-	  if (lim >= pat + psize)
-	    break;
-	 if (*lim == '\n')
-	   {
-	     lim++;
-	     break;
-	   }
-#if HAVE_DOS_FILE_CONTENTS
-	 if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
-	   {
-	     lim += 2;
-	     break;
-	   }
-#endif
-	}
-
-      if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
-	error (EXIT_TROUBLE, 0, "%s", err);
-      beg = lim;
-    }
-  while (beg < pat + psize);
-
-  if ((err = kwsprep (kwset)) != NULL)
-    error (EXIT_TROUBLE, 0, "%s", err);
-}
-
-static size_t
-Fexecute (char const *buf, size_t size, size_t *match_size,
-	  char const *start_ptr)
-{
-  char const *beg, *try, *end, *mb_start;
-  size_t len;
-  char eol = eolbyte;
-  struct kwsmatch kwsmatch;
-  size_t ret_val;
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          char *case_buf = mbtolower (buf, &size);
-	  if (start_ptr)
-	    start_ptr = case_buf + (start_ptr - buf);
-          buf = case_buf;
-        }
-    }
-#endif /* MBS_SUPPORT */
-
-  for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
-    {
-      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
-      if (offset == (size_t) -1)
-	goto failure;
-#ifdef MBS_SUPPORT
-      if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size))
-        {
-          beg = mb_start - 1;
-          continue; /* It is a part of multibyte character.  */
-        }
-#endif /* MBS_SUPPORT */
-      beg += offset;
-      len = kwsmatch.size[0];
-      if (start_ptr && !match_words)
-	goto success_in_beg_and_len;
-      if (match_lines)
-	{
-	  if (beg > buf && beg[-1] != eol)
-	    continue;
-	  if (beg + len < buf + size && beg[len] != eol)
-	    continue;
-	  goto success;
-	}
-      else if (match_words)
-	for (try = beg; len; )
-	  {
-	    if (try > buf && WCHAR((unsigned char) try[-1]))
-	      break;
-	    if (try + len < buf + size && WCHAR((unsigned char) try[len]))
-	      {
-		offset = kwsexec (kwset, beg, --len, &kwsmatch);
-		if (offset == (size_t) -1)
-		  break;
-		try = beg + offset;
-		len = kwsmatch.size[0];
-	      }
-	    else if (!start_ptr)
-	      goto success;
-	    else
-	      goto success_in_beg_and_len;
-	  } /* for (try) */
-      else
-	goto success;
-    } /* for (beg in buf) */
-
- failure:
-  ret_val = -1;
-  goto out;
-
- success:
-  if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL)
-    end++;
-  else
-    end = buf + size;
-  while (buf < beg && beg[-1] != eol)
-    --beg;
-  len = end - beg;
- success_in_beg_and_len:
-  *match_size = len;
-  ret_val = beg - buf;
- out:
-  return ret_val;
-}
-#endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */
-
-#ifdef GREP_PROGRAM
-#if HAVE_LIBPCRE
-/* Compiled internal form of a Perl regular expression.  */
-static pcre *cre;
-
-/* Additional information about the pattern.  */
-static pcre_extra *extra;
-#endif
-
-static void
-Pcompile (char const *pattern, size_t size)
-{
-#if !HAVE_LIBPCRE
-  error (EXIT_TROUBLE, 0, "%s",
-	 _("support for the -P option is not compiled into "
-	   "this --disable-perl-regexp binary"));
-#else
-  int e;
-  char const *ep;
-  char *re = xmalloc (4 * size + 7);
-  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
-  char const *patlim = pattern + size;
-  char *n = re;
-  char const *p;
-  char const *pnul;
-
-  /* FIXME: Remove these restrictions.  */
-  if (memchr(pattern, '\n', size))
-    error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
-
-  *n = '\0';
-  if (match_lines)
-    strcpy (n, "^(");
-  if (match_words)
-    strcpy (n, "\\b(");
-  n += strlen (n);
-
-  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
-     replace each NUL byte in the pattern with the four characters
-     "\000", removing a preceding backslash if there are an odd
-     number of backslashes before the NUL.
-
-     FIXME: This method does not work with some multibyte character
-     encodings, notably Shift-JIS, where a multibyte character can end
-     in a backslash byte.  */
-  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
-    {
-      memcpy (n, p, pnul - p);
-      n += pnul - p;
-      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
-	continue;
-      n -= (pnul - p) & 1;
-      strcpy (n, "\\000");
-      n += 4;
-    }
-
-  memcpy (n, p, patlim - p);
-  n += patlim - p;
-  *n = '\0';
-  if (match_words)
-    strcpy (n, ")\\b");
-  if (match_lines)
-    strcpy (n, ")$");
-
-  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
-  if (!cre)
-    error (EXIT_TROUBLE, 0, "%s", ep);
-
-  extra = pcre_study (cre, 0, &ep);
-  if (ep)
-    error (EXIT_TROUBLE, 0, "%s", ep);
-
-  free (re);
-#endif
-}
-
-static size_t
-Pexecute (char const *buf, size_t size, size_t *match_size,
-	  char const *start_ptr)
-{
-#if !HAVE_LIBPCRE
-  abort ();
-  return -1;
-#else
-  /* This array must have at least two elements; everything after that
-     is just for performance improvement in pcre_exec.  */
-  int sub[300];
-
-  const char *line_buf, *line_end, *line_next;
-  int e = PCRE_ERROR_NOMATCH;
-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
-
-  /* PCRE can't limit the matching to single lines, therefore we have to
-     match each line in the buffer separately.  */
-  for (line_next = buf;
-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
-       start_ofs -= line_next - line_buf)
-    {
-      line_buf = line_next;
-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
-      if (line_end == NULL)
-        line_next = line_end = buf + size;
-      else
-        line_next = line_end + 1;
-
-      if (start_ptr && start_ptr >= line_end)
-        continue;
-
-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
-                     start_ofs < 0 ? 0 : start_ofs, 0,
-                     sub, sizeof sub / sizeof *sub);
-    }
-
-  if (e <= 0)
-    {
-      switch (e)
-	{
-	case PCRE_ERROR_NOMATCH:
-	  return -1;
-
-	case PCRE_ERROR_NOMEMORY:
-	  error (EXIT_TROUBLE, 0, _("memory exhausted"));
-
-	default:
-	  abort ();
-	}
-    }
-  else
-    {
-      /* Narrow down to the line we've found.  */
-      char const *beg = line_buf + sub[0];
-      char const *end = line_buf + sub[1];
-      char const *buflim = buf + size;
-      char eol = eolbyte;
-      if (!start_ptr)
-	{
-	  /* FIXME: The case when '\n' is not found indicates a bug:
-	     Since grep is line oriented, the match should never contain
-	     a newline, so there _must_ be a newline following.
-	   */
-	  if (!(end = memchr (end, eol, buflim - end)))
-	    end = buflim;
-	  else
-	    end++;
-	  while (buf < beg && beg[-1] != eol)
-	    --beg;
-	}
-
-      *match_size = end - beg;
-      return beg - buf;
-    }
-#endif
-}
-#endif /* GREP_PROGRAM */
diff --git a/src/esearch.c b/src/esearch.c
index d76c310a..8c749c8b 100644
--- a/src/esearch.c
+++ b/src/esearch.c
@@ -1,5 +1,4 @@
-#define EGREP_PROGRAM
-#include "search.c"
+#include "dfasearch.c"
 
 struct matcher const matchers[] = {
   { "egrep", Ecompile, EGexecute },
diff --git a/src/fsearch.c b/src/fsearch.c
index e1ca0b19..b16e7693 100644
--- a/src/fsearch.c
+++ b/src/fsearch.c
@@ -1,5 +1,4 @@
-#define FGREP_PROGRAM
-#include "search.c"
+#include "kwsearch.c"
 
 struct matcher const matchers[] = {
   { "fgrep", Fcompile, Fexecute },
diff --git a/src/gsearch.c b/src/gsearch.c
index e3e0423a..4d8b7730 100644
--- a/src/gsearch.c
+++ b/src/gsearch.c
@@ -1,4 +1,21 @@
-#include "search.c"
+#include "dfasearch.c"
+#include "pcresearch.c"
+#include "kwsearch.c"
+
+static void
+Gcompile (char const *pattern, size_t size)
+{
+  return GEAcompile (pattern, size,
+		     RE_SYNTAX_GREP
+		     | RE_HAT_LISTS_NOT_NEWLINE
+		     | RE_NO_EMPTY_RANGES);
+}
+
+static void
+Acompile (char const *pattern, size_t size)
+{
+  return GEAcompile (pattern, size, RE_SYNTAX_AWK);
+}
 
 struct matcher const matchers[] = {
   { "grep",    Gcompile, EGexecute },
@@ -8,4 +25,3 @@ struct matcher const matchers[] = {
   { "perl",    Pcompile, Pexecute },
   { NULL, NULL, NULL },
 };
-
diff --git a/src/kwsearch.c b/src/kwsearch.c
new file mode 100644
index 00000000..245ccf0a
--- /dev/null
+++ b/src/kwsearch.c
@@ -0,0 +1,162 @@
+/* kwsearch.c - searching subroutines using kwset for grep.
+   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+
+/* For -w, we also consider _ to be word constituent.  */
+#define WCHAR(C) (ISALNUM(C) || (C) == '_')
+
+/* KWset compiled pattern.  For Fcompile, we compile the list of
+   fixed strings found in the pattern, one per line; Fexecute then
+   searches the buffer for an occurrence of any of them. */
+static kwset_t kwset;
+
+static void
+Fcompile (char const *pattern, size_t size)
+{
+  char const *beg, *end, *lim, *err, *pat;
+  size_t psize;
+
+  kwsinit (&kwset);
+  psize = size;
+  if (match_icase && MB_CUR_MAX > 1)
+    pat = mbtolower (pattern, &psize);
+  else
+    pat = pattern;
+
+  beg = pat;
+  do
+    {
+      for (lim = beg;; ++lim)
+	{
+	  end = lim;
+	  if (lim >= pat + psize)
+	    break;
+	 if (*lim == '\n')
+	   {
+	     lim++;
+	     break;
+	   }
+#if HAVE_DOS_FILE_CONTENTS
+	 if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
+	   {
+	     lim += 2;
+	     break;
+	   }
+#endif
+	}
+
+      if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
+	error (EXIT_TROUBLE, 0, "%s", err);
+      beg = lim;
+    }
+  while (beg < pat + psize);
+
+  if ((err = kwsprep (kwset)) != NULL)
+    error (EXIT_TROUBLE, 0, "%s", err);
+}
+
+static size_t
+Fexecute (char const *buf, size_t size, size_t *match_size,
+	  char const *start_ptr)
+{
+  char const *beg, *try, *end, *mb_start;
+  size_t len;
+  char eol = eolbyte;
+  struct kwsmatch kwsmatch;
+  size_t ret_val;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      if (match_icase)
+        {
+          char *case_buf = mbtolower (buf, &size);
+	  if (start_ptr)
+	    start_ptr = case_buf + (start_ptr - buf);
+          buf = case_buf;
+        }
+    }
+#endif /* MBS_SUPPORT */
+
+  for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
+    {
+      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+      if (offset == (size_t) -1)
+	goto failure;
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size))
+        {
+          beg = mb_start - 1;
+          continue; /* It is a part of multibyte character.  */
+        }
+#endif /* MBS_SUPPORT */
+      beg += offset;
+      len = kwsmatch.size[0];
+      if (start_ptr && !match_words)
+	goto success_in_beg_and_len;
+      if (match_lines)
+	{
+	  if (beg > buf && beg[-1] != eol)
+	    continue;
+	  if (beg + len < buf + size && beg[len] != eol)
+	    continue;
+	  goto success;
+	}
+      else if (match_words)
+	for (try = beg; len; )
+	  {
+	    if (try > buf && WCHAR((unsigned char) try[-1]))
+	      break;
+	    if (try + len < buf + size && WCHAR((unsigned char) try[len]))
+	      {
+		offset = kwsexec (kwset, beg, --len, &kwsmatch);
+		if (offset == (size_t) -1)
+		  break;
+		try = beg + offset;
+		len = kwsmatch.size[0];
+	      }
+	    else if (!start_ptr)
+	      goto success;
+	    else
+	      goto success_in_beg_and_len;
+	  } /* for (try) */
+      else
+	goto success;
+    } /* for (beg in buf) */
+
+ failure:
+  ret_val = -1;
+  goto out;
+
+ success:
+  if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL)
+    end++;
+  else
+    end = buf + size;
+  while (buf < beg && beg[-1] != eol)
+    --beg;
+  len = end - beg;
+ success_in_beg_and_len:
+  *match_size = len;
+  ret_val = beg - buf;
+ out:
+  return ret_val;
+}
diff --git a/src/pcresearch.c b/src/pcresearch.c
new file mode 100644
index 00000000..f09acdc7
--- /dev/null
+++ b/src/pcresearch.c
@@ -0,0 +1,178 @@
+/* pcresearch.c - searching subroutines using PCRE for grep.
+   Copyright 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
+
+#if HAVE_LIBPCRE
+/* Compiled internal form of a Perl regular expression.  */
+static pcre *cre;
+
+/* Additional information about the pattern.  */
+static pcre_extra *extra;
+#endif
+
+static void
+Pcompile (char const *pattern, size_t size)
+{
+#if !HAVE_LIBPCRE
+  error (EXIT_TROUBLE, 0, "%s",
+	 _("support for the -P option is not compiled into "
+	   "this --disable-perl-regexp binary"));
+#else
+  int e;
+  char const *ep;
+  char *re = xmalloc (4 * size + 7);
+  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+  char const *patlim = pattern + size;
+  char *n = re;
+  char const *p;
+  char const *pnul;
+
+  /* FIXME: Remove these restrictions.  */
+  if (memchr(pattern, '\n', size))
+    error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
+
+  *n = '\0';
+  if (match_lines)
+    strcpy (n, "^(");
+  if (match_words)
+    strcpy (n, "\\b(");
+  n += strlen (n);
+
+  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
+     replace each NUL byte in the pattern with the four characters
+     "\000", removing a preceding backslash if there are an odd
+     number of backslashes before the NUL.
+
+     FIXME: This method does not work with some multibyte character
+     encodings, notably Shift-JIS, where a multibyte character can end
+     in a backslash byte.  */
+  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+    {
+      memcpy (n, p, pnul - p);
+      n += pnul - p;
+      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
+	continue;
+      n -= (pnul - p) & 1;
+      strcpy (n, "\\000");
+      n += 4;
+    }
+
+  memcpy (n, p, patlim - p);
+  n += patlim - p;
+  *n = '\0';
+  if (match_words)
+    strcpy (n, ")\\b");
+  if (match_lines)
+    strcpy (n, ")$");
+
+  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+  if (!cre)
+    error (EXIT_TROUBLE, 0, "%s", ep);
+
+  extra = pcre_study (cre, 0, &ep);
+  if (ep)
+    error (EXIT_TROUBLE, 0, "%s", ep);
+
+  free (re);
+#endif
+}
+
+static size_t
+Pexecute (char const *buf, size_t size, size_t *match_size,
+	  char const *start_ptr)
+{
+#if !HAVE_LIBPCRE
+  abort ();
+  return -1;
+#else
+  /* This array must have at least two elements; everything after that
+     is just for performance improvement in pcre_exec.  */
+  int sub[300];
+
+  const char *line_buf, *line_end, *line_next;
+  int e = PCRE_ERROR_NOMATCH;
+  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
+
+  /* PCRE can't limit the matching to single lines, therefore we have to
+     match each line in the buffer separately.  */
+  for (line_next = buf;
+       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
+       start_ofs -= line_next - line_buf)
+    {
+      line_buf = line_next;
+      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
+      if (line_end == NULL)
+        line_next = line_end = buf + size;
+      else
+        line_next = line_end + 1;
+
+      if (start_ptr && start_ptr >= line_end)
+        continue;
+
+      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
+                     start_ofs < 0 ? 0 : start_ofs, 0,
+                     sub, sizeof sub / sizeof *sub);
+    }
+
+  if (e <= 0)
+    {
+      switch (e)
+	{
+	case PCRE_ERROR_NOMATCH:
+	  return -1;
+
+	case PCRE_ERROR_NOMEMORY:
+	  error (EXIT_TROUBLE, 0, _("memory exhausted"));
+
+	default:
+	  abort ();
+	}
+    }
+  else
+    {
+      /* Narrow down to the line we've found.  */
+      char const *beg = line_buf + sub[0];
+      char const *end = line_buf + sub[1];
+      char const *buflim = buf + size;
+      char eol = eolbyte;
+      if (!start_ptr)
+	{
+	  /* FIXME: The case when '\n' is not found indicates a bug:
+	     Since grep is line oriented, the match should never contain
+	     a newline, so there _must_ be a newline following.
+	   */
+	  if (!(end = memchr (end, eol, buflim - end)))
+	    end = buflim;
+	  else
+	    end++;
+	  while (buf < beg && beg[-1] != eol)
+	    --beg;
+	}
+
+      *match_size = end - beg;
+      return beg - buf;
+    }
+#endif
+}
diff --git a/src/search.h b/src/search.h
new file mode 100644
index 00000000..cb3b535f
--- /dev/null
+++ b/src/search.h
@@ -0,0 +1,47 @@
+/* search.h - declarations shared by grep's dfa, kwset and regex searching subroutines.
+   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+#ifndef GREP_SEARCH_H
+#define GREP_SEARCH_H 1
+
+#include <config.h>
+
+#include <sys/types.h>
+
+#include "mbsupport.h"
+#ifdef MBS_SUPPORT
+/* We can handle multibyte strings. */
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
+#include <regex.h>
+#include "system.h"
+#include "grep.h"
+#include "error.h"
+#include "kwset.h"
+#include "xalloc.h"
+
+void kwsinit (kwset_t *);
+
+#ifdef MBS_SUPPORT
+char * mbtolower (const char *, size_t *);
+bool is_mb_middle(const char **, const char *, const char *);
+#endif
+
+#endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
new file mode 100644
index 00000000..ef4fef39
--- /dev/null
+++ b/src/searchutils.c
@@ -0,0 +1,141 @@
+/* searchutils.c - helper subroutines for grep's matchers.
+   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+#include <config.h>
+#include "search.h"
+
+#define NCHAR (UCHAR_MAX + 1)
+
+void
+kwsinit (kwset_t *kwset)
+{
+  static char trans[NCHAR];
+  int i;
+
+  if (match_icase && MB_CUR_MAX == 1)
+    {
+      for (i = 0; i < NCHAR; ++i)
+        trans[i] = TOLOWER (i);
+
+      *kwset = kwsalloc (trans);
+    }
+  else
+    *kwset = kwsalloc (NULL);
+
+  if (!*kwset)
+    xalloc_die ();
+}
+
+#ifdef MBS_SUPPORT
+/* Convert the *N-byte string, BEG, to lowercase, and write the
+   NUL-terminated result into malloc'd storage.  Upon success, set *N
+   to the length (in bytes) of the resulting string (not including the
+   trailing NUL byte), and return a pointer to the lowercase string.
+   Upon memory allocation failure, this function exits.
+
+   Note that while this function returns a pointer to malloc'd storage,
+   the caller must not free it, since this function retains a pointer
+   to the buffer and reuses it on any subsequent call.  As a consequence,
+   this function is not thread-safe.  */
+char *
+mbtolower (const char *beg, size_t *n)
+{
+  static char *out;
+  static size_t outalloc;
+  size_t outlen, mb_cur_max;
+  mbstate_t is, os;
+  const char *end;
+  char *p;
+
+  if (*n > outalloc)
+    {
+      out = xrealloc (out, *n);
+      outalloc = *n;
+    }
+
+  memset (&is, 0, sizeof (is));
+  memset (&os, 0, sizeof (os));
+  end = beg + *n;
+
+  mb_cur_max = MB_CUR_MAX;
+  p = out;
+  outlen = 0;
+  while (beg < end)
+    {
+      wchar_t wc;
+      size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
+      if (outlen + mb_cur_max >= outalloc)
+        {
+          out = x2nrealloc (out, &outalloc, 1);
+          p = out + outlen;
+        }
+
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+        {
+          /* An invalid sequence, or a truncated multi-octet character.
+             We treat it as a single-octet character.  */
+          *p++ = *beg++;
+          outlen++;
+          memset (&is, 0, sizeof (is));
+          memset (&os, 0, sizeof (os));
+        }
+      else
+        {
+          beg += mbclen;
+          mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
+          p += mbclen;
+          outlen += mbclen;
+        }
+    }
+
+  *n = p - out;
+  *p++ = 0;
+  return out;
+}
+
+
+bool
+is_mb_middle(const char **good, const char *buf, const char *end)
+{
+  const char *p = *good;
+  const char *prev = p;
+  mbstate_t cur_state;
+
+  /* TODO: can be optimized for UTF-8.  */
+  memset(&cur_state, 0, sizeof(mbstate_t));
+  while (p < buf)
+    {
+      size_t mbclen = mbrlen(p, end - p, &cur_state);
+
+      /* Store the beginning of the previous complete multibyte character.  */
+      if (mbclen != (size_t) -2)
+        prev = p;
+
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+	{
+	  /* An invalid sequence, or a truncated multibyte character.
+	     We treat it as a single byte character.  */
+	  mbclen = 1;
+	}
+      p += mbclen;
+    }
+
+  *good = prev;
+  return p > buf;
+}
+#endif /* MBS_SUPPORT */
author	Paolo Bonzini <bonzini@gnu.org>	2010-03-18 13:40:10 +0100
committer	Paolo Bonzini <bonzini@gnu.org>	2010-03-22 09:55:30 +0100
commit	c59a6cd03de84dc38c577083f34e3b0dfe87e36d (patch)
tree	4971c8e1233d4823579dc7beb2fab820b0729cb3
parent	59040143e96ce960476c5a360d829256759ff4ab (diff)
download	grep-c59a6cd03de84dc38c577083f34e3b0dfe87e36d.tar.gz