summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlain Magloire <alainm@rcsm.ee.mcgill.ca>2001-02-18 04:13:21 +0000
committerAlain Magloire <alainm@rcsm.ee.mcgill.ca>2001-02-18 04:13:21 +0000
commit1236f00774c60964d9c1661e7a8f6833d45596f5 (patch)
tree2392756d9ab216677151659ec06073f76ee16f6e
parent67c5b94f135ef975f657bac7e896719761646db9 (diff)
downloadgrep-1236f00774c60964d9c1661e7a8f6833d45596f5.tar.gz
New option back-references are local, beefup manual.
* doc/grep.texi : Document the new options and the new behaviour back-references are local. Use excerpt from Karl Berry regex texinfo. * bootstrap/Makefile.try : Added xstrtoumax.o xstrtoul.o hard-local.o From Guglielmo 'bond' Bondioni : The bug was that using a multi line file that contained REs (one per line), backreferences in the REs were considered global (to the file) and not local (to the line). That is, \1 in line n refers to the first \(.\) in the whole file, rather than in the line itself. From Tapani Tarvainen : # Re: grep -e '\(a\)\1' -e '\(b\)\1' That's not the way it should work: multiple -e arguments should be treated as independent patterns and back references should not refer to previous ones. From Paul Eggert : GNU grep currently does not issue diagnostics for the following two cases, both of which are erroneous: grep -e '[' -e ']' grep '[ ]' POSIX requires a diagnostic in both cases because '[' is not a valid regular expression. To overcome those problems, grep no longer pass the concatenate patterns to GNU regex but rather compile each patterns separately and keep the result in an array. * src/search.c (patterns) : New global variable; a structure array holding the compiled patterns. Declare function prototypes to minimize error. (dfa, kswset, regexbuf, regs) : Removed, no longer static globals, but rather fields in patterns[] structure per motif. (Fcompile) : Alloc an entry in patterns[] to hold the regex. (Ecompile) : Alloc an entry per motif in the patterns[] array. (Gcompile) : Likewise. (EGexecute) : Loop through of array of patterns[] for a match. From Bernd Strieder : # tail -f logfile | grep important | do_something_urgent # tail -f logfile | grep important | do_something_taking_very_long If grep does full buffering in these cases then the urgent operation does not happen as it should in the first case, and in the second case time is lost due to waiting for the buffer to be filled. 
This is clearly spoken not grep's fault in the first place, but libc's. There is a heuristic in libc that make a stream line-buffered only if a terminal is on the other end. This doesn't take care of the cases where this connection is somehow indirect. * src/grep.c (line_buffered) : new option variable. (prline) : if line_buffered is set fflush() is call. (usage) : line_buffered new option. Input from Paul Eggert, doing setvbuf() may not be portable and breaks grep -z. This patch prevent kwset_matcher from following problems. For example, in SJIS encoding, one character has the codepoint 0x895c. So the second byte of the character can match with '\' incorrectly. And in eucJP encoding, there are the characters whose codepoints are 0xa5b9, 0xa5c8. On the other hand, there is one character whose codepoint is 0xb9a5. So 0xb9a5 can match with 2nd byte of 0xa5b9 and 1st byte of 0xa5c8. (EGexecute) : call check_multibyte_string when kwset is set. (Fexecute) : call to check_multibyte_string. (MB_CUR_MAX) : new macro.
-rw-r--r--.cvsignore7
-rw-r--r--ChangeLog75
-rw-r--r--NEWS14
-rw-r--r--THANKS123
-rw-r--r--bootstrap/.cvsignore2
-rw-r--r--bootstrap/Makefile.try4
-rw-r--r--configure.in4
-rw-r--r--djgpp/.cvsignore2
-rw-r--r--doc/.cvsignore3
-rw-r--r--doc/grep.15
-rw-r--r--doc/grep.texi291
-rw-r--r--src/.cvsignore2
-rw-r--r--src/grep.c24
-rw-r--r--src/posix/.cvsignore2
-rw-r--r--src/search.c575
-rw-r--r--tests/.cvsignore2
-rw-r--r--vms/.cvsignore2
17 files changed, 746 insertions, 391 deletions
diff --git a/.cvsignore b/.cvsignore
index 2df40277..3c11108a 100644
--- a/.cvsignore
+++ b/.cvsignore
@@ -1,6 +1,13 @@
*.gmo
+Makefile
Makefile.in
aclocal.m4
+acinclude.m4
configure
+config.cache
+config.h
+config.hin
config.log
+config.status
stamp-h.in
+stamp-h
diff --git a/ChangeLog b/ChangeLog
index 15c0c26b..909f7f9e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,11 +1,86 @@
+2001-02-17 Alain Magloire
+
+ * doc/grep.texi : Document the new options and the new behaviour
+ back-references are local. Use excerpt from Karl Berry regex
+ texinfo.
+
+ * bootstrap/Makefile.try : Added xstrtoumax.o xstrtoul.o hard-local.o
+
+2001-02-17 Alain Magloire
+
+ From Guglielmo 'bond' Bondioni :
+ The bug was that using a multi line file that contained REs (one per
+ line), backreferences in the REs were considered global (to the file)
+ and not local (to the line).
+ That is, \1 in line n refers to the first \(.\) in the whole file,
+ rather than in the line itself.
+
+ From Tapani Tarvainen :
+ # Re: grep -e '\(a\)\1' -e '\(b\)\1'
+ That's not the way it should work: multiple -e arguments
+ should be treated as independent patterns and back references
+ should not refer to previous ones.
+
+ From Paul Eggert :
+ GNU grep currently does not issue
+ diagnostics for the following two cases, both of which are erroneous:
+ grep -e '[' -e ']'
+ grep '[
+ ]'
+ POSIX requires a diagnostic in both cases because '[' is not a valid
+ regular expression.
+
+ To overcome those problems, grep no longer passes the concatenated
+ patterns to GNU regex but rather compiles each pattern separately
+ and keeps the results in an array.
+
+ * src/search.c (patterns) : New global variable; a structure array
+ holding the compiled patterns.
+ Declare function prototypes to minimize error.
+ (dfa, kswset, regexbuf, regs) : Removed, no longer static globals, but
+ rather fields in patterns[] structure per motif.
+ (Fcompile) : Alloc an entry in patterns[] to hold the regex.
+ (Ecompile) : Alloc an entry per motif in the patterns[] array.
+ (Gcompile) : Likewise.
+ (EGexecute) : Loop through the array of patterns[] for a match.
+
+2001-02-17 Alain Magloire
+
+ From Bernd Strieder :
+ # tail -f logfile | grep important | do_something_urgent
+ # tail -f logfile | grep important | do_something_taking_very_long
+ If grep does full buffering in these cases then the urgent operation
+ does not happen as it should in the first case, and in the second case
+ time is lost due to waiting for the buffer to be filled.
+ This is clearly spoken not grep's fault in the first place, but libc's.
+ There is a heuristic in libc that make a stream line-buffered only if a
+ terminal is on the other end. This doesn't take care of the cases where
+ this connection is somehow indirect.
+
+ * src/grep.c (line_buffered) : new option variable.
+ (prline) : if line_buffered is set fflush() is called.
+ (usage) : line_buffered new option.
+ Input from Paul Eggert, doing setvbuf() may not be portable
+ and breaks grep -z.
+
2001-02-16 Alain Magloire
Patch from Isamu Hasegawa, for multibyte support.
+ This patch protects kwset_matcher from the following problems.
+ For example, in SJIS encoding, one character has the codepoint 0x895c.
+ So the second byte of the character can match with '\' incorrectly.
+ And in eucJP encoding, there are the characters whose codepoints are
+ 0xa5b9, 0xa5c8. On the other hand, there is one character whose
+ codepoint is 0xb9a5. So 0xb9a5 can match with 2nd byte of 0xa5b9
+ and 1st byte of 0xa5c8.
* configure.in : Add check for mbrtowc.
* src/search.c (check_multibyte_string) : new function.
Support for multibyte string.
+ (EGexecute) : call check_multibyte_string when kwset is set.
+ (Fexecute) : call to check_multibyte_string.
(MBS_SUPPORT) : new macro.
+ (MB_CUR_MAX) : new macro.
2001-02-16 Alain Magloire
diff --git a/NEWS b/NEWS
index 4543090d..38977983 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,17 @@
+Version 2.5
+
+ - The new option --line-buffered flushes the output after every line. There
+ is a noticeable slowdown when forcing line buffering.
+
+ - Back references are now local to the regex.
+ grep -e '\(a\)\1' -e '\(b\)\1'
+ The last backref \1 in the second expression refers to \(b\)
+
- The new option --include=PATTERN will only search matching files
- when recursing in directories and --exclude=PATTERN, will skip them.
+ when recursing in directories
+
+ - The new option --exclude=PATTERN will skip matching files when
+ recursing in directories.
- The new option --color will use the environment variable GREP_COLOR
(default is red) to highlight the matching string.
diff --git a/THANKS b/THANKS
index e4025e42..d085075e 100644
--- a/THANKS
+++ b/THANKS
@@ -1,60 +1,65 @@
-Aharon Robbins <arnold@gnu.org>
-Akim Demaille <akim@epita.fr>
-Alain Magloire <alainm@gnu.org>
-Andreas Schwab <schwab@suse.de>
-Andreas Ley <andy@rz.uni-karlsruhe.de>
-Ben Elliston <bje@cygnus.com>
+Aharon Robbins <arnold@gnu.org>
+Akim Demaille <akim@epita.fr>
+Alain Magloire <alainm@gnu.org>
+Andreas Schwab <schwab@suse.de>
+Andreas Ley <andy@rz.uni-karlsruhe.de>
Bastiaan "Darquan" Stougie <darquan@zonnet.nl>
-Bob Proulx <rwp@hprwp.fc.hp.com>
-Brian Youmans <3diff@gnu.org>
-Bruno Haible <haible@ilog.fr>
-Christian Groessler <cpg@aladdin.de>
-David J MacKenzie <djm@catapult.va.pubnix.com>
-David O'Brien <obrien@freebsd.org>
-Eli Zaretskii <eliz@is.elta.co.il>
-Florian La Roche <florian@knorke.saar.de>
-Franc,ois Pinard <pinard@IRO.UMontreal.CA>
-Gerald Stoller <gerald_stoller@hotmail.com>
-Grant McDorman <grant@isgtec.com>
-Harald Hanche-Olsen <hanche@math.ntnu.no>
-H. Merijn Brand <h.m.brand@hccnet.nl>
-Jeff Bailey <jbailey@nisa.net>
-Jim Hand <jhand@austx.tandem.com>
-Jim Meyering <meyering@asic.sc.ti.com>
-Jochen Hein <jochen.hein@delphi.central.de>
-Joel N. Weber II <devnull@gnu.org>
-John Hughes <john@nitelite.calvacom.fr>
-Jorge Stolfi <stolfi@dcc.unicamp.br>
-Juan Manuel Guerrero <ST001906@HRZ1.HRZ.TU-Darmstadt.De>
-Karl Berry <karl@cs.umb.edu>
-Karl Heuer <kwzh@gnu.org>
-Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
-Kazuro Furukawa <furukawa@apricot.kek.jp>
-Keith Bostic <bostic@bsdi.com>
-Krishna Sethuraman <krishna@sgihub.corp.sgi.com>
-Mark Waite <markw@mddmew.fc.hp.com>
-Martin P.J. Zinser <zinser@decus.de>
-Martin Rex <martin.rex@sap-ag.de>
-Michael Aichlmayr <mikla@nx.com>
-Miles Bader <miles@ccs.mt.nec.co.jp>
-Olaf Kirch <okir@ns.lst.de>
-Paul Eggert <eggert@twinsun.com>
-Paul Kimoto <kimoto@spacenet.tn.cornell.edu>
-Phillip C. Brisco <phillip.craig.brisco@ccmail.census.gov>
-Philippe Defert <Philippe.Defert@cern.ch>
-Philippe De Muyter <phdm@info.ucl.ac.be>
-Philip Hazel <ph10@cus.cam.ac.uk>
-Roland Roberts <rroberts@muller.com>
-Ruslan Ermilov <ru@freebsd.org>
-Shannon Hill <hill@synnet.com>
-Sotiris Vassilopoulos <Sotiris.Vassilopoulos@betatech.gr>
-Stewart Levin <stew@sep.stanford.edu>
-Sydoruk Stepan <step@unitex.kiev.ua>
-Tom 'moof' Spindler <dogcow@ccs.neu.edu>
-Tom Tromey <tromey@creche.cygnus.com>
-Ulrich Drepper <drepper@cygnus.com>
-UEBAYASHI Masao <masao@nf.enveng.titech.ac.jp>
-Volker Borchert <bt@teknon.de>
-Wichert Akkerman <wichert@cistron.nl>
-William Bader <william@nscs.fast.net>
-Wolfgang Schludi <schludi@syscomp.de>
+Ben Elliston <bje@cygnus.com>
+Bernd Strieder <strieder@student.uni-kl.de>
+Bob Proulx <rwp@hprwp.fc.hp.com>
+Brian Youmans <3diff@gnu.org>
+Bruno Haible <haible@ilog.fr>
+Christian Groessler <cpg@aladdin.de>
+David J MacKenzie <djm@catapult.va.pubnix.com>
+David O'Brien <obrien@freebsd.org>
+Eli Zaretskii <eliz@is.elta.co.il>
+Florian La Roche <florian@knorke.saar.de>
+Franc,ois Pinard <pinard@IRO.UMontreal.CA>
+Gerald Stoller <gerald_stoller@hotmail.com>
+Grant McDorman <grant@isgtec.com>
+Guglielmo 'bond' Bondioni <g.bondioni@libero.it>
+H. Merijn Brand <h.m.brand@hccnet.nl>
+Harald Hanche-Olsen <hanche@math.ntnu.no>
+Hans-Bernhard Broeker <broeker@physik.rwth-aachen.de>
+Isamu Hasegawa <isamu@yamato.ibm.co.jp>
+Jeff Bailey <jbailey@nisa.net>
+Jim Hand <jhand@austx.tandem.com>
+Jim Meyering <meyering@asic.sc.ti.com>
+Jochen Hein <jochen.hein@delphi.central.de>
+Joel N. Weber II <devnull@gnu.org>
+John Hughes <john@nitelite.calvacom.fr>
+Jorge Stolfi <stolfi@dcc.unicamp.br>
+Juan Manuel Guerrero <ST001906@HRZ1.HRZ.TU-Darmstadt.De>
+Karl Berry <karl@cs.umb.edu>
+Karl Heuer <kwzh@gnu.org>
+Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
+Kazuro Furukawa <furukawa@apricot.kek.jp>
+Keith Bostic <bostic@bsdi.com>
+Krishna Sethuraman <krishna@sgihub.corp.sgi.com>
+Mark Waite <markw@mddmew.fc.hp.com>
+Martin P.J. Zinser <zinser@decus.de>
+Martin Rex <martin.rex@sap-ag.de>
+Michael Aichlmayr <mikla@nx.com>
+Miles Bader <miles@ccs.mt.nec.co.jp>
+Olaf Kirch <okir@ns.lst.de>
+Paul Eggert <eggert@twinsun.com>
+Paul Kimoto <kimoto@spacenet.tn.cornell.edu>
+Phillip C. Brisco <phillip.craig.brisco@ccmail.census.gov>
+Philippe Defert <Philippe.Defert@cern.ch>
+Philippe De Muyter <phdm@info.ucl.ac.be>
+Philip Hazel <ph10@cus.cam.ac.uk>
+Roland Roberts <rroberts@muller.com>
+Ruslan Ermilov <ru@freebsd.org>
+Shannon Hill <hill@synnet.com>
+Sotiris Vassilopoulos <Sotiris.Vassilopoulos@betatech.gr>
+Stewart Levin <stew@sep.stanford.edu>
+Sydoruk Stepan <step@unitex.kiev.ua>
+Tapani Tarvainen <tt@mit.jyu.fi>
+Tom 'moof' Spindler <dogcow@ccs.neu.edu>
+Tom Tromey <tromey@creche.cygnus.com>
+Ulrich Drepper <drepper@cygnus.com>
+UEBAYASHI Masao <masao@nf.enveng.titech.ac.jp>
+Volker Borchert <bt@teknon.de>
+Wichert Akkerman <wichert@cistron.nl>
+William Bader <william@nscs.fast.net>
+Wolfgang Schludi <schludi@syscomp.de>
diff --git a/bootstrap/.cvsignore b/bootstrap/.cvsignore
new file mode 100644
index 00000000..282522db
--- /dev/null
+++ b/bootstrap/.cvsignore
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/bootstrap/Makefile.try b/bootstrap/Makefile.try
index bd9aef16..b34f0b2d 100644
--- a/bootstrap/Makefile.try
+++ b/bootstrap/Makefile.try
@@ -13,11 +13,15 @@ OBJS = alloca.$(OBJEXT) \
getopt1.$(OBJEXT) \
grep.$(OBJEXT) \
kwset.$(OBJEXT) \
+ hard-local.$(OBJEXT) \
+ isdir.$(OBJEXT) \
memchr.$(OBJEXT) \
obstack.$(OBJEXT) \
regex.$(OBJEXT) \
savedir.$(OBJEXT) \
search.$(OBJEXT) \
+ xstrtol.$(OBJEXT) \
+ xstrtoumax.$(OBJEXT) \
stpcpy.$(OBJEXT)
# Where is DIR and opendir/readdir defined.
diff --git a/configure.in b/configure.in
index 93e958c3..50a32ebd 100644
--- a/configure.in
+++ b/configure.in
@@ -1,6 +1,6 @@
# Configuration for grep
#
-# Alain Magloire <alainm@rcsm.ee.mcgill.ca>
+# Alain Magloire <alainm@gnu.org>
#
dnl Process this file with autoconf to produce a configure script
AC_INIT(src/grep.c)
@@ -8,7 +8,7 @@ AC_DEFINE(GREP)
AC_PREREQ(2.13)
dnl Automake stuff.
-AM_INIT_AUTOMAKE(grep, 2.5b)
+AM_INIT_AUTOMAKE(grep, 2.5c)
AM_CONFIG_HEADER(config.h:config.hin)
dnl Checks for programs.
diff --git a/djgpp/.cvsignore b/djgpp/.cvsignore
new file mode 100644
index 00000000..282522db
--- /dev/null
+++ b/djgpp/.cvsignore
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/doc/.cvsignore b/doc/.cvsignore
new file mode 100644
index 00000000..14bb26d9
--- /dev/null
+++ b/doc/.cvsignore
@@ -0,0 +1,3 @@
+Makefile
+Makefile.in
+version.texi
diff --git a/doc/grep.1 b/doc/grep.1
index 387e0024..752eb334 100644
--- a/doc/grep.1
+++ b/doc/grep.1
@@ -12,7 +12,7 @@
.de Id
.ds Dt \\$4
..
-.Id $Id: grep.1,v 1.17 2001/02/16 05:50:23 alainm Exp $
+.Id $Id: grep.1,v 1.18 2001/02/18 04:13:21 alainm Exp $
.TH GREP 1 \*(Dt "GNU Project"
.SH NAME
grep, egrep, fgrep \- print lines matching a pattern
@@ -269,6 +269,9 @@ is operating, or if an I/O error occurs.
Prefix each line of output with the line number
within its input file.
.TP
+.BR \-\^\-line-buffered
+Use line buffering; this can incur a performance penalty.
+.TP
.BR \-q ", " \-\^\-quiet ", " \-\^\-silent
Quiet; do not write anything to standard output.
Exit immediately with zero status if any match is found,
diff --git a/doc/grep.texi b/doc/grep.texi
index 42ad1c2a..66ea5394 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -147,7 +147,7 @@ extensions.
@item -c
@itemx --count
@opindex -c
-@opindex -count
+@opindex --count
@cindex counting lines
Suppress normal output; instead print a count of matching
lines for each input file. With the @samp{-v}, @samp{--invert-match} option,
@@ -261,12 +261,8 @@ Print @var{num} lines of leading context before matching lines.
@cindex context
Print @var{num} lines of output context.
+@item --colour
@itemx --color
-@opindex --color
-@cindex highlight, color, colour
-Equivalent to @samp{--colour}.
-
-@itemx --colour
@opindex --colour
@cindex highlight, color, colour
The matching string is surrounded by the marker specify in @var{GREP_COLOR}.
@@ -346,6 +342,11 @@ Print the filename for each match.
@cindex no filename prefix
Suppress the prefixing of filenames on output when multiple files are searched.
+@item --line-buffered
+@opindex --line-buffered
+@cindex line buffering
+Use line buffering on output; this can incur a performance penalty.
+
@item -L
@itemx --files-without-match
@opindex -L
@@ -381,12 +382,8 @@ it must be either at the end of the line or followed by
a non-word constituent character. Word-constituent
characters are letters, digits, and the underscore.
-@item -R
-@cindex recursive search
-@cindex searching directory trees
-Equivalent to @sam{--directories=recurse}.
-
@item -r
+@itemx -R
@itemx --recursive
@opindex -r
@opindex --recursive
@@ -396,18 +393,18 @@ For each directory mentioned in the command line, read and process all
files in that directory, recursively. This is the same as the
@samp{--directories=recurse} option.
-@item --include=@var{pattern}
+@item --include=@var{file_pattern}
@opindex --include
-@cindex recursive search
+@cindex include files
@cindex searching directory trees
-When processing directories recursively, only files matching @var{pattern}
-@var{pattern} will be search.
+When processing directories recursively, only files matching @var{file_pattern}
+will be searched.
-@item --exclude=@var{pattern}
+@item --exclude=@var{file_pattern}
@opindex --exclude
-@cindex recursive search
+@cindex exclude files
@cindex searching directory trees
-When processing directories recursively, skip files matching @var{pattern}.
+When processing directories recursively, skip files matching @var{file_pattern}.
@item -m @var{num}
@itemx --max-count=@var{num}
@@ -558,7 +555,7 @@ specify an option containing whitespace or a backslash.
@item GREP_COLOR
@vindex GREP_COLOR
-@cindex default options environment variable, highlight, color, coulor
+@cindex highlight markers
This variable specifies the surrounding markers use to highlight the matching
text. The default is control ascii red.
@@ -690,8 +687,8 @@ A @dfn{regular expression} is a pattern that describes a set of strings.
Regular expressions are constructed analogously to arithmetic expressions,
by using various operators to combine smaller expressions.
@command{grep} understands two different versions of regular expression
-syntax: ``basic'' and ``extended''. In @sc{gnu} @command{grep}, there is no
-difference in available functionality using either syntax.
+syntax: ``basic''(BRE) and ``extended''(ERE). In @sc{gnu} @command{grep},
+there is no difference in available functionality using either syntax.
In other implementations, basic regular expressions are less powerful.
The following description applies to extended regular expressions;
differences for basic regular expressions are summarized afterwards.
@@ -701,13 +698,74 @@ a single character. Most characters, including all letters and digits,
are regular expressions that match themselves. Any metacharacter
with special meaning may be quoted by preceding it with a backslash.
+A regular expression may be followed by one of several
+repetition operators:
+
+@table @samp
+
+@item .
+@opindex .
+@cindex dot
+@cindex period
+The period @samp{.} matches any single character.
+
+@item ?
+@opindex ?
+@cindex question mark
+@cindex match sub-expression at most once
+The preceding item is optional and will be matched at most once.
+
+@item *
+@opindex *
+@cindex asterisk
+@cindex match sub-expression zero or more times
+The preceding item will be matched zero or more times.
+
+@item +
+@opindex +
+@cindex plus sign
+The preceding item will be matched one or more times.
+
+@item @{@var{n}@}
+@opindex @{n@}
+@cindex braces, one argument
+@cindex match sub-expression n times
+The preceding item is matched exactly @var{n} times.
+
+@item @{@var{n},@}
+@opindex @{n,@}
+@cindex braces, second argument omitted
+@cindex match sub-expression n or more times
+The preceding item is matched n or more times.
+
+@item @{@var{n},@var{m}@}
+@opindex @{n,m@}
+@cindex braces, two arguments
+The preceding item is matched at least @var{n} times, but not more than
+@var{m} times.
+
+@end table
+
+Two regular expressions may be concatenated; the resulting regular
+expression matches any string formed by concatenating two substrings
+that respectively match the concatenated subexpressions.
+
+Two regular expressions may be joined by the infix operator @samp{|}; the
+resulting regular expression matches any string matching either subexpression.
+
+Repetition takes precedence over concatenation, which in turn
+takes precedence over alternation. A whole subexpression may be
+enclosed in parentheses to override these precedence rules.
+
+@section Character Class
+
@cindex bracket expression
+@cindex character class
A @dfn{bracket expression} is a list of characters enclosed by @samp{[} and
-@samp{]}. It matches any
-single character in that list; if the first character of the list is the
-caret @samp{^}, then it
-matches any character @strong{not} in the list. For example, the regular
-expression @samp{[0123456789]} matches any single digit.
+@samp{]}. It matches any single character in that list; if the first
+character of the list is the caret @samp{^}, then it matches any character
+@strong{not} in the list. For example, the regular expression
+@samp{[0123456789]} matches any single digit.
@cindex range expression
Within a bracket expression, a @dfn{range expression} consists of two
@@ -812,82 +870,96 @@ depends upon the C locale and the @sc{ascii} character
encoding, whereas the former is independent of locale and character set.
(Note that the brackets in these class names are
part of the symbolic names, and must be included in addition to
-the brackets delimiting the bracket list.) Most metacharacters lose
-their special meaning inside lists. To include a literal @samp{]}, place it
-first in the list. Similarly, to include a literal @samp{^}, place it anywhere
-but first. Finally, to include a literal @samp{-}, place it last.
+the brackets delimiting the bracket list.)
-The period @samp{.} matches any single character. The symbol @samp{\w}
-is a synonym for @samp{[[:alnum:]]} and @samp{\W} is a synonym for
-@samp{[^[:alnum]]}.
+Most metacharacters lose their special meaning inside lists.
-The caret @samp{^} and the dollar sign @samp{$} are metacharacters that
-respectively match the empty string at the beginning and end
-of a line. The symbols @samp{\<} and @samp{\>} respectively match the
-empty string at the beginning and end of a word. The symbol
-@samp{\b} matches the empty string at the edge of a word, and @samp{\B}
-matches the empty string provided it's not at the edge of a word.
+@table @samp
+@item ]
+ends the list if it's not the first list item. So, if you want to make
+the @samp{]} character a list item, you must put it first.
-A regular expression may be followed by one of several
-repetition operators:
+@item [.
+represents the open collating symbol.
+
+@item .]
+represents the close collating symbol.
+
+@item [=
+represents the open equivalence class.
+
+@item =]
+represents the close equivalence class.
+
+@item [:
+represents the open character class followed by a valid character class name.
+
+@item :]
+represents the close character class followed by a valid character class name.
+@item -
+represents the range if it's not first or last in a list or the ending point
+of a range.
+
+@item ^
+represents the characters not in the list. If you want to make the @samp{^}
+character a list item, place it anywhere but first.
+
+@end table
+
+@section Backslash Character
+@cindex backslash
+
+The @samp{\} character, when followed by certain ordinary characters, takes a
+special meaning:
@table @samp
-@item ?
-@opindex ?
-@cindex question mark
-@cindex match sub-expression at most once
-The preceding item is optional and will be matched at most once.
+@item @samp{\b}
+Match the empty string at the edge of a word.
-@item *
-@opindex *
-@cindex asterisk
-@cindex match sub-expression zero or more times
-The preceding item will be matched zero or more times.
+@item @samp{\B}
+Match the empty string provided it's not at the edge of a word.
-@item +
-@opindex +
-@cindex plus sign
-The preceding item will be matched one or more times.
+@item @samp{\<}
+Match the empty string at the beginning of word.
-@item @{@var{n}@}
-@opindex @{n@}
-@cindex braces, one argument
-@cindex match sub-expression n times
-The preceding item is matched exactly @var{n} times.
+@item @samp{\>}
+Match the empty string at the end of word.
-@item @{@var{n},@}
-@opindex @{n,@}
-@cindex braces, second argument omitted
-@cindex match sub-expression n or more times
-The preceding item is matched n or more times.
+@item @samp{\w}
+Match word constituent, it is a synonym for @samp{[[:alnum:]]}.
-@item @{@var{n},@var{m}@}
-@opindex @{n,m@}
-@cindex braces, two arguments
-The preceding item is matched at least @var{n} times, but not more than
-@var{m} times.
+@item @samp{\W}
+Match non word constituent, it is a synonym for @samp{[^[:alnum:]]}.
@end table
-Two regular expressions may be concatenated; the resulting regular
-expression matches any string formed by concatenating two substrings
-that respectively match the concatenated subexpressions.
+For example, @samp{\brat\b} matches the separate word @samp{rat},
+@samp{c\Brat\Be} matches @samp{crate}, but @samp{dirty \Brat} doesn't
+match @samp{dirty rat}.
-Two regular expressions may be joined by the infix operator @samp{|}; the
-resulting regular expression matches any string matching either
-subexpression.
+@section Anchoring
+@cindex anchoring
-Repetition takes precedence over concatenation, which in turn
-takes precedence over alternation. A whole subexpression may be
-enclosed in parentheses to override these precedence rules.
+The caret @samp{^} and the dollar sign @samp{$} are metacharacters that
+respectively match the empty string at the beginning and end of a line.
+
+@section Back-reference
+@cindex back-reference
-The backreference @samp{\@var{n}}, where @var{n} is a single digit, matches the
-substring previously matched by the @var{n}th parenthesized subexpression
-of the regular expression.
+The back-reference @samp{\@var{n}}, where @var{n} is a single digit, matches
+the substring previously matched by the @var{n}th parenthesized subexpression
+of the regular expression. For example, @samp{(a)\1} matches @samp{aa}.
+When used with alternation, if the group does not participate in the match then
+the back-reference makes the whole match fail. For example, @samp{a(.)|b\1}
+will not match @samp{ba}. When multiple regular expressions are given with
+@samp{-e} or from a file @samp{-f file}, the back-references are local to
+each expression.
+@section Basic vs Extended
@cindex basic regular expressions
+
In basic regular expressions the metacharacters @samp{?}, @samp{+},
@samp{@{}, @samp{|}, @samp{(}, and @samp{)} lose their special meaning;
instead use the backslashed versions @samp{\?}, @samp{\+}, @samp{\@{},
@@ -1038,6 +1110,9 @@ ps -ef | grep '[c]ron'
If the pattern had been written without the square brackets, it would
have matched not only the @command{ps} output line for @command{cron},
but also the @command{ps} output line for @command{grep}.
+Note that on some platforms @command{ps} limits the output to the width
+of the screen; grep does not have any limit on the length of a line
+except the available memory.
@item
Why does @command{grep} report ``Binary file matches''?
@@ -1077,6 +1152,56 @@ Use the special file name @samp{-}:
@example
cat /etc/passwd | grep 'alain' - /etc/motd
@end example
+
+@item
+@cindex palindromes
+How to express palindromes in a regular expression?
+
+It can be done by using back-references; for example, a palindrome
+of 5 characters can be written in BRE.
+
+@example
+grep -w -e '\(.\)\(.\).\2\1' file
+@end example
+
+It matches the word "radar" or "civic".
+
+Guglielmo Bondioni proposed a single RE that finds all the palindromes up to 19
+characters long.
+
+@example
+egrep -e '^(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?)(.?).?\9\8\7\6\5\4\3\2\1$' file
+@end example
+
+Note this is done by using GNU ERE extensions, it might not be portable on
+other greps.
+
+@item
+Why do my expressions with the vertical bar fail?
+
+@example
+/bin/echo "ba" | egrep '(a)\1|(b)\1'
+@end example
+
+The first alternate branch fails, so the first group does not participate in the
+match; this makes the second alternate branch fail too. For example, "aaba" will
+match: the first group participates in the match and can be reused in the
+second branch.
+
+@item
+What do @command{grep, fgrep, egrep} stand for?
+
+grep comes from the way line editing was done on Unix. For example,
+@command{ed} uses this syntax to print a list of matching lines on the screen.
+
+@example
+global/regular expression/print
+g/re/p
+@end example
+
+@command{fgrep} stands for Fixed @command{grep}, @command{egrep} Extended
+@command{grep}.
+
@end enumerate
@node Reporting Bugs
@@ -1090,7 +1215,7 @@ Large repetition counts in the @samp{@{m,n@}} construct may cause
@command{grep} to use lots of memory. In addition, certain other
obscure regular expressions require exponential time and
space, and may cause grep to run out of memory.
-Backreferences are very slow, and may require exponential time.
+Back-references are very slow, and may require exponential time.
@page
@node Concept Index
diff --git a/src/.cvsignore b/src/.cvsignore
new file mode 100644
index 00000000..282522db
--- /dev/null
+++ b/src/.cvsignore
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/src/grep.c b/src/grep.c
index cce9bc90..32758aa2 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -80,6 +80,7 @@ enum
COLOR_OPTION,
INCLUDE_OPTION,
EXCLUDE_OPTION,
+ LINE_BUFFERED_OPTION
};
/* Long options equivalences. */
@@ -105,6 +106,7 @@ static struct option const long_options[] =
{"help", no_argument, &show_help, 1},
{"include", required_argument, NULL, INCLUDE_OPTION},
{"ignore-case", no_argument, NULL, 'i'},
+ {"line-buffered", no_argument, NULL, LINE_BUFFERED_OPTION},
{"line-number", no_argument, NULL, 'n'},
{"line-regexp", no_argument, NULL, 'x'},
{"max-count", required_argument, NULL, 'm'},
@@ -455,11 +457,12 @@ fillbuf (size_t save, struct stats const *stats)
/* Flags controlling the style of output. */
static enum
- {
- BINARY_BINARY_FILES,
- TEXT_BINARY_FILES,
- WITHOUT_MATCH_BINARY_FILES
- } binary_files; /* How to handle binary files. */
+{
+ BINARY_BINARY_FILES,
+ TEXT_BINARY_FILES,
+ WITHOUT_MATCH_BINARY_FILES
+} binary_files; /* How to handle binary files. */
+
static int filename_mask; /* If zero, output nulls after filenames. */
static int out_quiet; /* Suppress all normal output. */
static int out_invert; /* Print nonmatching stuff. */
@@ -473,6 +476,9 @@ static int list_files; /* List matching files. */
static int no_filenames; /* Suppress file names. */
static off_t max_count; /* Stop after outputting this many
lines from an input file. */
+static int line_buffered; /* If nonzero, use line buffering, i.e.
+ fflush everyline out. */
+
/* Internal variables to keep track of byte count, context, etc. */
static uintmax_t totalcc; /* Total character count before bufbeg. */
@@ -573,6 +579,8 @@ prline (char const *beg, char const *lim, int sep)
if (ferror (stdout))
error (_("writing output"), errno);
lastout = lim;
+ if (line_buffered)
+ fflush (stdout);
}
/* Print pending lines of trailing context prior to LIM. Trailing context ends
@@ -1047,6 +1055,7 @@ Output control:\n\
-m, --max-count=NUM stop after NUM matches\n\
-b, --byte-offset print the byte offset with output lines\n\
-n, --line-number print line number with output lines\n\
+ --line-buffered flush output on every line\n\
-H, --with-filename print the filename for each match\n\
-h, --no-filename suppress the prefixing filename on output\n\
-q, --quiet, --silent suppress all normal output\n\
@@ -1060,7 +1069,7 @@ Output control:\n\
--include=PATTERN equivalent to --directories=recurse but only\n\
files that match PATTERN will be examine\n\
--exclude=PATTERN equivalent to --directories=recurse, files that\n\
- match PATTERN will be skip.
+ match PATTERN will be skip.\n\
-L, --files-without-match only print FILE names containing no match\n\
-l, --files-with-matches only print FILE names containing matches\n\
-c, --count only print a count of matching lines per FILE\n\
@@ -1477,6 +1486,9 @@ main (int argc, char **argv)
include_pattern = optarg;
directories = RECURSE_DIRECTORIES;
break;
+ case LINE_BUFFERED_OPTION:
+ line_buffered = 1;
+ break;
case 0:
/* long options */
break;
diff --git a/src/posix/.cvsignore b/src/posix/.cvsignore
new file mode 100644
index 00000000..282522db
--- /dev/null
+++ b/src/posix/.cvsignore
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/src/search.c b/src/search.c
index 5ec8647b..84becc8c 100644
--- a/src/search.c
+++ b/src/search.c
@@ -43,21 +43,44 @@
# include <wctype.h>
#endif
-/* DFA compiled regexp. */
-static struct dfa dfa;
+/* The compiled patterns. */
+static struct patterns
+{
+ /* Regex compiled regexp. */
+ struct re_pattern_buffer regexbuf;
+
+ /* DFA compiled regexp. */
+ struct dfa dfa;
-/* Regex compiled regexp. */
-static struct re_pattern_buffer regexbuf;
+ /* KWset compiled pattern. For Ecompile and Gcompile, we compile
+ a list of strings, at least one of which is known to occur in
+ any string matching the regexp. */
+ kwset_t kwset;
-/* KWset compiled pattern. For Ecompile and Gcompile, we compile
- a list of strings, at least one of which is known to occur in
- any string matching the regexp. */
-static kwset_t kwset;
+ /* Number of compiled fixed strings known to exactly match the regexp.
+ If kwsexec returns < kwset_exact_matches, then we don't need to
+ call the regexp matcher at all. */
+ int kwset_exact_matches;
-/* Number of compiled fixed strings known to exactly match the regexp.
- If kwsexec returns < kwset_exact_matches, then we don't need to
- call the regexp matcher at all. */
-static int kwset_exact_matches;
+ struct re_registers regs; /* This is here on account of a BRAIN-DEAD
+ Q@#%!# library interface in regex.c. */
+} patterns0; /* Zero-initialized template copied into each new patterns[] entry. */
+
+struct patterns *patterns;
+size_t pcount;
+
+#if defined(MBS_SUPPORT)
+static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
+#endif
+static void kwsinit PARAMS ((kwset_t *));
+static void kwsmusts PARAMS ((struct patterns *));
+static void Gcompile PARAMS ((char const *, size_t));
+static void Ecompile PARAMS ((char const *, size_t));
+static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
+static void Fcompile PARAMS ((char const *, size_t));
+static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
+static void Pcompile PARAMS ((char const *, size_t ));
+static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
void
dfaerror (char const *mesg)
@@ -66,7 +89,7 @@ dfaerror (char const *mesg)
}
static void
-kwsinit (void)
+kwsinit (kwset_t *pkwset)
{
static char trans[NCHAR];
int i;
@@ -75,8 +98,8 @@ kwsinit (void)
for (i = 0; i < NCHAR; ++i)
trans[i] = TOLOWER(i);
- if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
- fatal("memory exhausted", 0);
+ if (!((*pkwset) = kwsalloc (match_icase ? trans : (char *) 0)))
+ fatal(_("memory exhausted"), 0);
}
/* If the DFA turns out to have some set of fixed strings one of
@@ -84,35 +107,35 @@ kwsinit (void)
to find those strings, and thus quickly filter out impossible
matches. */
static void
-kwsmusts (void)
+kwsmusts (struct patterns *pats)
{
struct dfamust const *dm;
char const *err;
- if (dfa.musts)
+ if (pats->dfa.musts)
{
- kwsinit();
+ kwsinit (&(pats->kwset));
/* First, we compile in the substrings known to be exact
matches. The kwset matcher will return the index
of the matching string that it chooses. */
- for (dm = dfa.musts; dm; dm = dm->next)
+ for (dm = pats->dfa.musts; dm; dm = dm->next)
{
if (!dm->exact)
continue;
- ++kwset_exact_matches;
- if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
+ ++(pats->kwset_exact_matches);
+ if ((err = kwsincr(pats->kwset, dm->must, strlen(dm->must))) != 0)
fatal(err, 0);
}
/* Now, we compile the substrings that will require
the use of the regexp matcher. */
- for (dm = dfa.musts; dm; dm = dm->next)
+ for (dm = pats->dfa.musts; dm; dm = dm->next)
{
if (dm->exact)
continue;
- if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
+ if ((err = kwsincr(pats->kwset, dm->must, strlen(dm->must))) != 0)
fatal(err, 0);
}
- if ((err = kwsprep(kwset)) != 0)
+ if ((err = kwsprep(pats->kwset)) != 0)
fatal(err, 0);
}
}
@@ -164,51 +187,81 @@ check_multibyte_string(char const *buf, size_t size)
#endif
static void
-Gcompile (char const *pattern, size_t size)
+Gcompile (char const *motif, size_t total)
{
const char *err;
+ size_t size;
+ const char *sep;
re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
- if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
- fatal(err, 0);
-
- /* In the match_words and match_lines cases, we use a different pattern
- for the DFA matcher that will quickly throw out cases that won't work.
- Then if DFA succeeds we do some hairy stuff using the regex matcher
- to decide whether the match should really count. */
- if (match_words || match_lines)
+ do
{
- /* In the whole-word case, we use the pattern:
- \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
- In the whole-line case, we use the pattern:
- ^\(userpattern\)$. */
-
- static char const line_beg[] = "^\\(";
- static char const line_end[] = "\\)$";
- static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
- static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
- size_t i;
- strcpy (n, match_lines ? line_beg : word_beg);
- i = strlen(n);
- memcpy(n + i, pattern, size);
- i += size;
- strcpy (n + i, match_lines ? line_end : word_end);
- i += strlen(n + i);
- pattern = n;
- size = i;
- }
+ sep = memchr (motif, '\n', total); /* Scan only the bytes still unparsed. */
+ if (sep)
+ {
+ size = sep - motif;
+ total -= size + 1; /* Consume the motif and its newline separator. */
+ sep++;
+ }
+ else
+ size = total; /* Last motif: it runs to the end of the buffer. */
- dfacomp (pattern, size, &dfa, 1);
- kwsmusts();
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+ if (patterns == NULL)
+ {
+ fatal (_("memory exhausted"), 0);
+ }
+
+ patterns[pcount] = patterns0;
+
+ if ((err = re_compile_pattern(motif, size,
+ &(patterns[pcount].regexbuf))) != 0)
+ fatal(err, 0);
+
+ /* In the match_words and match_lines cases, we use a different pattern
+ for the DFA matcher that will quickly throw out cases that won't work.
+ Then if DFA succeeds we do some hairy stuff using the regex matcher
+ to decide whether the match should really count. */
+ if (match_words || match_lines)
+ {
+ /* In the whole-word case, we use the pattern:
+ \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
+ In the whole-line case, we use the pattern:
+ ^\(userpattern\)$. */
+
+ static char const line_beg[] = "^\\(";
+ static char const line_end[] = "\\)$";
+ static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
+ static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
+ char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+ size_t i;
+ strcpy (n, match_lines ? line_beg : word_beg);
+ i = strlen(n);
+ memcpy(n + i, motif, size);
+ i += size;
+ strcpy (n + i, match_lines ? line_end : word_end);
+
+ i += strlen(n + i);
+ motif = n;
+ size = i;
+ }
+
+ dfacomp (motif, size, &(patterns[pcount].dfa), 1);
+ kwsmusts(&(patterns[pcount]));
+ pcount++;
+
+ motif = sep;
+ } while (sep && total != 0);
}
static void
-Ecompile (char const *pattern, size_t size)
+Ecompile (char const *motif, size_t total)
{
const char *err;
+ const char *sep;
+ size_t size;
if (strcmp(matcher, "awk") == 0)
{
@@ -221,38 +274,60 @@ Ecompile (char const *pattern, size_t size)
dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
}
- if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
- fatal(err, 0);
-
- /* In the match_words and match_lines cases, we use a different pattern
- for the DFA matcher that will quickly throw out cases that won't work.
- Then if DFA succeeds we do some hairy stuff using the regex matcher
- to decide whether the match should really count. */
- if (match_words || match_lines)
+ do
{
- /* In the whole-word case, we use the pattern:
- (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
- In the whole-line case, we use the pattern:
- ^(userpattern)$. */
-
- static char const line_beg[] = "^(";
- static char const line_end[] = ")$";
- static char const word_beg[] = "(^|[^[:alnum:]_])(";
- static char const word_end[] = ")([^[:alnum:]_]|$)";
- char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
- size_t i;
- strcpy (n, match_lines ? line_beg : word_beg);
- i = strlen(n);
- memcpy(n + i, pattern, size);
- i += size;
- strcpy (n + i, match_lines ? line_end : word_end);
- i += strlen(n + i);
- pattern = n;
- size = i;
- }
+ sep = memchr (motif, '\n', total);
+ if (sep)
+ {
+ size = sep - motif;
+ sep++;
+ total -= (size + 1);
+ }
+ else
+ size = total;
+
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+ if (patterns == NULL)
+ fatal (_("memory exhausted"), 0);
+ patterns[pcount] = patterns0;
+
+ if ((err = re_compile_pattern(motif, size,
+ &(patterns[pcount].regexbuf))) != 0)
+ fatal(err, 0);
+
+ /* In the match_words and match_lines cases, we use a different pattern
+ for the DFA matcher that will quickly throw out cases that won't work.
+ Then if DFA succeeds we do some hairy stuff using the regex matcher
+ to decide whether the match should really count. */
+ if (match_words || match_lines)
+ {
+ /* In the whole-word case, we use the pattern:
+ (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
+ In the whole-line case, we use the pattern:
+ ^(userpattern)$. */
+
+ static char const line_beg[] = "^(";
+ static char const line_end[] = ")$";
+ static char const word_beg[] = "(^|[^[:alnum:]_])(";
+ static char const word_end[] = ")([^[:alnum:]_]|$)";
+ char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
+ size_t i;
+ strcpy (n, match_lines ? line_beg : word_beg);
+ i = strlen(n);
+ memcpy(n + i, motif, size);
+ i += size;
+ strcpy (n + i, match_lines ? line_end : word_end);
+ i += strlen(n + i);
+ motif = n;
+ size = i;
+ }
+
+ dfacomp (motif, size, &(patterns[pcount].dfa), 1);
+ kwsmusts (&(patterns[pcount]));
+ pcount++;
- dfacomp (pattern, size, &dfa, 1);
- kwsmusts();
+ motif = sep;
+ } while (sep && total != 0); /* Loop while unparsed motifs remain. */
}
static size_t
@@ -262,131 +337,141 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
char eol = eolbyte;
int backref, start, len;
struct kwsmatch kwsm;
- static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
- Q@#%!# library interface in regex.c. */
+ size_t i;
#ifdef MBS_SUPPORT
- char *mb_properties;
- if (MB_CUR_MAX > 1 && kwset)
- mb_properties = check_multibyte_string(buf, size);
+ char *mb_properties = NULL;
#endif /* MBS_SUPPORT */
- buflim = buf + size;
-
- for (beg = end = buf; end < buflim; beg = end)
+ for (i = 0; i < pcount; i++)
{
- if (!exact)
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && patterns[i].kwset)
+ mb_properties = check_multibyte_string(buf, size);
+#endif /* MBS_SUPPORT */
+
+ buflim = buf + size;
+
+ for (beg = end = buf; end < buflim; beg = end)
{
- if (kwset)
+ if (!exact)
{
- /* Find a possible match using the KWset matcher. */
- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
- if (offset == (size_t) -1)
+ if (patterns[i].kwset)
{
+ /* Find a possible match using the KWset matcher. */
+ size_t offset = kwsexec (patterns[i].kwset, beg,
+ buflim - beg, &kwsm);
+ if (offset == (size_t) -1)
+ {
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free(mb_properties);
+ free(mb_properties); /* free(NULL) is a no-op when nothing was allocated. */
+ mb_properties = NULL; /* The cleanup after this loop must not free it again. */
#endif
- return (size_t) -1;
- }
- beg += offset;
- /* Narrow down to the line containing the candidate, and
- run it through DFA. */
- end = memchr(beg, eol, buflim - beg);
- end++;
- while (beg > buf && beg[-1] != eol)
- --beg;
- if (kwsm.index < kwset_exact_matches)
- {
+ break;
+ }
+ beg += offset;
+ /* Narrow down to the line containing the candidate, and
+ run it through DFA. */
+ end = memchr(beg, eol, buflim - beg);
+ end++;
+ while (beg > buf && beg[-1] != eol)
+ --beg;
+ if (kwsm.index < patterns[i].kwset_exact_matches)
+ {
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX == 1 || mb_properties[beg - buf] != 0)
- goto success;
+ if (MB_CUR_MAX == 1 || mb_properties[beg - buf] != 0)
+ goto success;
#else
- goto success;
+ goto success;
#endif
+ }
+ if (dfaexec (&(patterns[i].dfa), beg, end - beg, &backref) == (size_t) -1)
+ continue;
}
- if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
- continue;
+ else
+ {
+ /* No good fixed strings; start with DFA. */
+ size_t offset = dfaexec (&(patterns[i].dfa), beg, buflim - beg, &backref);
+ if (offset == (size_t) -1)
+ break;
+ /* Narrow down to the line we've found. */
+ beg += offset;
+ end = memchr(beg, eol, buflim - beg);
+ end++;
+ while (beg > buf && beg[-1] != eol)
+ --beg;
+ }
+ /* Successful, no backreferences encountered! */
+ if (!backref)
+ goto success;
}
else
+ end = beg + size;
+
+ /* If we've made it to this point, this means DFA has seen
+ a probable match, and we need to run it through Regex. */
+ patterns[i].regexbuf.not_eol = 0;
+ if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
+ end - beg - 1, 0,
+ end - beg - 1, &(patterns[i].regs))))
{
- /* No good fixed strings; start with DFA. */
- size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
- if (offset == (size_t) -1)
- return (size_t) -1;
- /* Narrow down to the line we've found. */
- beg += offset;
- end = memchr(beg, eol, buflim - beg);
- end++;
- while (beg > buf && beg[-1] != eol)
- --beg;
- }
- /* Successful, no backreferences encountered! */
- if (!backref)
- goto success;
- }
- else
- end = beg + size;
-
- /* If we've made it to this point, this means DFA has seen
- a probable match, and we need to run it through Regex. */
- regexbuf.not_eol = 0;
- if (0 <= (start = re_search (&regexbuf, beg,
- end - beg - 1, 0,
- end - beg - 1, &regs)))
- {
- len = regs.end[0] - start;
- if (exact)
- {
- *match_size = len;
- return start;
- }
- if ((!match_lines && !match_words)
- || (match_lines && len == end - beg - 1))
- goto success;
- /* If -w, check if the match aligns with word boundaries.
- We do this iteratively because:
- (a) the line may contain more than one occurence of the pattern, and
- (b) Several alternatives in the pattern might be valid at a given
- point, and we may need to consider a shorter one to find a word
- boundary. */
- if (match_words)
- while (start >= 0)
- {
- if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
- && (len == end - beg - 1
- || !WCHAR ((unsigned char) beg[start + len])))
- goto success;
- if (len > 0)
- {
- /* Try a shorter length anchored at the same place. */
- --len;
- regexbuf.not_eol = 1;
- len = re_match(&regexbuf, beg, start + len, start, &regs);
- }
- if (len <= 0)
+ len = patterns[i].regs.end[0] - start;
+ if (exact)
+ {
+ *match_size = len;
+ return start;
+ }
+ if ((!match_lines && !match_words)
+ || (match_lines && len == end - beg - 1))
+ goto success;
+ /* If -w, check if the match aligns with word boundaries.
+ We do this iteratively because:
+ (a) the line may contain more than one occurrence of the
+ pattern, and
+ (b) Several alternatives in the pattern might be valid at a
+ given point, and we may need to consider a shorter one to
+ find a word boundary. */
+ if (match_words)
+ while (start >= 0)
{
- /* Try looking further on. */
- if (start == end - beg - 1)
- break;
- ++start;
- regexbuf.not_eol = 0;
- start = re_search (&regexbuf, beg, end - beg - 1,
- start, end - beg - 1 - start, &regs);
- len = regs.end[0] - start;
+ if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
+ && (len == end - beg - 1
+ || !WCHAR ((unsigned char) beg[start + len])))
+ goto success;
+ if (len > 0)
+ {
+ /* Try a shorter length anchored at the same place. */
+ --len;
+ patterns[i].regexbuf.not_eol = 1;
+ len = re_match(&(patterns[i].regexbuf), beg,
+ start + len, start,
+ &(patterns[i].regs));
+ }
+ if (len <= 0)
+ {
+ /* Try looking further on. */
+ if (start == end - beg - 1)
+ break;
+ ++start;
+ patterns[i].regexbuf.not_eol = 0;
+ start = re_search (&(patterns[i].regexbuf), beg,
+ end - beg - 1,
+ start, end - beg - 1 - start,
+ &(patterns[i].regs));
+ len = patterns[i].regs.end[0] - start;
+ }
}
- }
- }
- }
-
+ }
+ } /* for (beg = end ..) */
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && kwset)
- free(mb_properties);
+ free(mb_properties); /* NULL unless allocated for this pattern. */
+ mb_properties = NULL; /* Never carry a freed pointer into the next pattern. */
#endif /* MBS_SUPPORT */
+ } /* for patterns */
return (size_t) -1;
success:
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && kwset)
+ if (MB_CUR_MAX > 1 && mb_properties)
free(mb_properties);
#endif /* MBS_SUPPORT */
*match_size = end - beg;
@@ -394,26 +479,32 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
}
static void
-Fcompile (char const *pattern, size_t size)
+Fcompile (char const *motif, size_t size)
{
char const *beg, *lim, *err;
- kwsinit();
- beg = pattern;
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+ if (patterns == NULL)
+ fatal (_("memory exhausted"), 0);
+ patterns[pcount] = patterns0;
+
+ kwsinit(&(patterns[pcount].kwset));
+ beg = motif;
do
{
- for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
+ for (lim = beg; lim < motif + size && *lim != '\n'; ++lim)
;
- if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
+ if ((err = kwsincr(patterns[pcount].kwset, beg, lim - beg)) != 0)
fatal(err, 0);
- if (lim < pattern + size)
+ if (lim < motif + size)
++lim;
beg = lim;
}
- while (beg < pattern + size);
+ while (beg < motif + size);
- if ((err = kwsprep(kwset)) != 0)
+ if ((err = kwsprep(patterns[pcount].kwset)) != 0)
fatal(err, 0);
+ pcount++;
}
static size_t
@@ -423,70 +514,76 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
register size_t len;
char eol = eolbyte;
struct kwsmatch kwsmatch;
+ size_t i;
#ifdef MBS_SUPPORT
char *mb_properties;
if (MB_CUR_MAX > 1)
mb_properties = check_multibyte_string(buf, size);
#endif /* MBS_SUPPORT */
- for (beg = buf; beg <= buf + size; ++beg)
+ for (i = 0; i < pcount; i++)
{
- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
- if (offset == (size_t) -1)
+ for (beg = buf; beg <= buf + size; ++beg)
{
+ size_t offset = kwsexec (patterns[i].kwset, beg,
+ buf + size - beg, &kwsmatch);
+ if (offset == (size_t) -1)
+ {
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free(mb_properties);
+ if (MB_CUR_MAX > 1)
+ free(mb_properties);
#endif /* MBS_SUPPORT */
- return offset;
- }
+ return offset;
+ }
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && mb_properties[offset] == 0)
- continue; /* It is a part of multibyte character. */
+ if (MB_CUR_MAX > 1 && mb_properties[offset] == 0)
+ continue; /* It is a part of multibyte character. */
#endif /* MBS_SUPPORT */
- beg += offset;
- len = kwsmatch.size[0];
- if (exact)
- {
- *match_size = len;
+ beg += offset;
+ len = kwsmatch.size[0];
+ if (exact)
+ {
+ *match_size = len;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free(mb_properties);
+ if (MB_CUR_MAX > 1)
+ free(mb_properties);
#endif /* MBS_SUPPORT */
- return beg - buf;
- }
- if (match_lines)
- {
- if (beg > buf && beg[-1] != eol)
- continue;
- if (beg + len < buf + size && beg[len] != eol)
- continue;
- goto success;
- }
- else if (match_words)
- for (try = beg; len; )
- {
- if (try > buf && WCHAR((unsigned char) try[-1]))
- break;
- if (try + len < buf + size && WCHAR((unsigned char) try[len]))
+ return beg - buf;
+ }
+ if (match_lines)
+ {
+ if (beg > buf && beg[-1] != eol)
+ continue;
+ if (beg + len < buf + size && beg[len] != eol)
+ continue;
+ goto success;
+ }
+ else if (match_words)
+ for (try = beg; len; )
{
- size_t offset = kwsexec (kwset, beg, --len, &kwsmatch);
- if (offset == (size_t) -1)
+ if (try > buf && WCHAR((unsigned char) try[-1]))
+ break;
+ if (try + len < buf + size && WCHAR((unsigned char) try[len]))
{
+ size_t offset = kwsexec (patterns[i].kwset, beg,
+ --len, &kwsmatch);
+ if (offset == (size_t) -1)
+ {
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free(mb_properties);
+ if (MB_CUR_MAX > 1)
+ free(mb_properties);
#endif /* MBS_SUPPORT */
- return offset;
+ return offset;
+ }
+ try = beg + offset;
+ len = kwsmatch.size[0];
}
- try = beg + offset;
- len = kwsmatch.size[0];
+ else
+ goto success;
}
- else
- goto success;
- }
- else
- goto success;
+ else
+ goto success;
+ }
}
#ifdef MBS_SUPPORT
diff --git a/tests/.cvsignore b/tests/.cvsignore
new file mode 100644
index 00000000..282522db
--- /dev/null
+++ b/tests/.cvsignore
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/vms/.cvsignore b/vms/.cvsignore
new file mode 100644
index 00000000..282522db
--- /dev/null
+++ b/vms/.cvsignore
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in