summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com> 2015-08-02 20:41:21 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2015-08-02 20:41:21 +0300
commit: 2a5231bdfeb289a826bcc2c176d3363f0b2b89ed (patch)
tree: fde8992b44829592dd695e2175f2152c2a774777
parent: 0222717b832975115a091d74f9f78bb816760bc6 (diff)
parent: c5137ae530c49765049adb53777c79ebb7607ebe (diff)
download: gawk-feature/zOS.tar.gz
Merge branch 'gawk-4.1-stable' into feature/zOS (branch: feature/zOS)
-rw-r--r--  ChangeLog                4
-rw-r--r--  dfa.c                  103
-rw-r--r--  extension/ChangeLog      6
-rw-r--r--  extension/revoutput.3am  4
-rw-r--r--  extension/revoutput.c   15
5 files changed, 84 insertions, 48 deletions
diff --git a/ChangeLog b/ChangeLog
index bbca613b..c8d655a6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2015-08-02 Arnold D. Robbins <arnold@skeeve.com>
+
+ * dfa.c: Sync with GNU grep. Yet again.
+
2015-07-21 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c: Sync with GNU grep.
diff --git a/dfa.c b/dfa.c
index 782ef5cb..c55a5c93 100644
--- a/dfa.c
+++ b/dfa.c
@@ -317,8 +317,6 @@ typedef struct
size_t hash; /* Hash of the positions of this state. */
position_set elems; /* Positions this state could match. */
unsigned char context; /* Context from previous state. */
- bool has_backref; /* This state matches a \<digit>. */
- bool has_mbcset; /* This state matches a MBCSET. */
unsigned short constraint; /* Constraint for this state to accept. */
token first_end; /* Token value of the first END in elems. */
position_set mbps; /* Positions which can match multibyte
@@ -2207,8 +2205,6 @@ state_index (struct dfa *d, position_set const *s, int context)
alloc_position_set (&d->states[i].elems, s->nelem);
copy (s, &d->states[i].elems);
d->states[i].context = context;
- d->states[i].has_backref = false;
- d->states[i].has_mbcset = false;
d->states[i].constraint = 0;
d->states[i].first_end = 0;
d->states[i].mbps.nelem = 0;
@@ -2224,10 +2220,7 @@ state_index (struct dfa *d, position_set const *s, int context)
d->states[i].first_end = d->tokens[s->elems[j].index];
}
else if (d->tokens[s->elems[j].index] == BACKREF)
- {
- d->states[i].constraint = NO_CONSTRAINT;
- d->states[i].has_backref = true;
- }
+ d->states[i].constraint = NO_CONSTRAINT;
++d->sindex;
@@ -2686,9 +2679,6 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
if (d->tokens[pos.index] == MBCSET
|| d->tokens[pos.index] == ANYCHAR)
{
- /* MB_CUR_MAX > 1 */
- if (d->tokens[pos.index] == MBCSET)
- d->states[s].has_mbcset = true;
/* ANYCHAR and MBCSET must match with a single character, so we
must put it to d->states[s].mbps, which contains the positions
which can match with a single character not a byte. */
@@ -3400,15 +3390,18 @@ skip_remains_mb (struct dfa *d, unsigned char const *p,
When ALLOW_NL is nonzero, newlines may appear in the matching string.
If COUNT is non-NULL, increment *COUNT once for each newline processed.
Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
- encountered a back-reference (1) or not (0). The caller may use this
- to decide whether to fall back on a backtracking matcher.
-
- If MULTIBYTE, the input consists of multibyte characters and/or
- encoding-error bytes. Otherwise, the input consists of single-byte
- characters. */
+ encountered a DFA-unfriendly construct. The caller may use this to
+ decide whether to fall back on a matcher like regex. If MULTIBYTE,
+ the input consists of multibyte characters and/or encoding-error bytes.
+ Otherwise, the input consists of single-byte characters.
+ Here is the list of features that make this DFA matcher punt:
+ - [M-N]-range-in-MB-locale: regex is up to 25% faster on [a-z]
+ - back-reference: (.)\1
+ - word-delimiter-in-MB-locale: \<, \>, \b
+ */
static inline char *
-dfaexec_main (struct dfa *d, char const *begin, char *end,
- int allow_nl, size_t *count, int *backref, bool multibyte)
+dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl,
+ size_t *count, bool multibyte)
{
state_num s, s1; /* Current state. */
unsigned char const *p, *mbp; /* Current input character. */
@@ -3498,16 +3491,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
Use a macro to avoid the risk that they diverge. */
#define State_transition() \
do { \
- /* Falling back to the glibc matcher in this case gives \
- better performance (up to 25% better on [a-z], for \
- example) and enables support for collating symbols and \
- equivalence classes. */ \
- if (d->states[s].has_mbcset && backref) \
- { \
- *backref = 1; \
- goto done; \
- } \
- \
/* Can match with a multibyte character (and multi-character \
collating element). Transition table might be updated. */ \
s = transit_state (d, s, &p, (unsigned char *) end); \
@@ -3581,11 +3564,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end,
if (d->fails[s])
{
if (d->success[s] & sbit[*p])
- {
- if (backref)
- *backref = d->states[s].has_backref;
- goto done;
- }
+ goto done;
s1 = s;
if (multibyte)
@@ -3615,14 +3594,24 @@ static char *
dfaexec_mb (struct dfa *d, char const *begin, char *end,
int allow_nl, size_t *count, int *backref)
{
- return dfaexec_main (d, begin, end, allow_nl, count, backref, true);
+ return dfaexec_main (d, begin, end, allow_nl, count, true);
}
static char *
dfaexec_sb (struct dfa *d, char const *begin, char *end,
int allow_nl, size_t *count, int *backref)
{
- return dfaexec_main (d, begin, end, allow_nl, count, backref, false);
+ return dfaexec_main (d, begin, end, allow_nl, count, false);
+}
+
+/* Always set *BACKREF and return BEGIN. Use this wrapper for
+ any regexp that uses a construct not supported by this code. */
+static char *
+dfaexec_noop (struct dfa *d, char const *begin, char *end,
+ int allow_nl, size_t *count, int *backref)
+{
+ *backref = 1;
+ return (char *) begin;
}
/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte),
@@ -3688,6 +3677,31 @@ dfainit (struct dfa *d)
d->fast = !d->multibyte;
}
+/* Return true if every construct in D is supported by this DFA matcher. */
+static bool _GL_ATTRIBUTE_PURE
+dfa_supported (struct dfa const *d)
+{
+ size_t i;
+ for (i = 0; i < d->tindex; i++)
+ {
+ switch (d->tokens[i])
+ {
+ case BEGWORD:
+ case ENDWORD:
+ case LIMWORD:
+ case NOTLIMWORD:
+ if (!d->multibyte)
+ continue;
+ /* fallthrough */
+
+ case BACKREF:
+ case MBCSET:
+ return false;
+ }
+ }
+ return true;
+}
+
static void
dfaoptimize (struct dfa *d)
{
@@ -3785,10 +3799,8 @@ dfassbuild (struct dfa *d)
if (d->multibyte)
{
/* These constraints aren't supported in a multibyte locale.
- Ignore them in the superset DFA, and treat them as
- backreferences in the main DFA. */
+ Ignore them in the superset DFA. */
sup->tokens[j++] = EMPTY;
- d->tokens[i] = BACKREF;
break;
}
default:
@@ -3818,8 +3830,17 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
dfambcache (d);
dfaparse (s, len, d);
dfassbuild (d);
- dfaoptimize (d);
- dfaanalyze (d, searchflag);
+
+ if (dfa_supported (d))
+ {
+ dfaoptimize (d);
+ dfaanalyze (d, searchflag);
+ }
+ else
+ {
+ d->dfaexec = dfaexec_noop;
+ }
+
if (d->superset)
{
d->fast = true;
diff --git a/extension/ChangeLog b/extension/ChangeLog
index 993729a5..3cd932b6 100644
--- a/extension/ChangeLog
+++ b/extension/ChangeLog
@@ -1,3 +1,9 @@
+2015-08-02 Arnold D. Robbins <arnold@skeeve.com>
+
+ * revoutput.c (init_revoutput): Don't install REVOUT if it's
+ there already. Makes the extension usable with -v.
+ * revoutput.3am: Add a BUGS section.
+
2015-06-17 Andrew J. Schorr <aschorr@telemetry-investments.com>
* inplace.3am (BUGS): Document that ACLs are not preserved, and
diff --git a/extension/revoutput.3am b/extension/revoutput.3am
index 9c8f062f..8620935b 100644
--- a/extension/revoutput.3am
+++ b/extension/revoutput.3am
@@ -1,4 +1,4 @@
-.TH REVOUTPUT 3am "Jan 15 2013" "Free Software Foundation" "GNU Awk Extension Modules"
+.TH REVOUTPUT 3am "Aug 02 2015" "Free Software Foundation" "GNU Awk Extension Modules"
.SH NAME
revoutput \- Reverse output strings sample extension
.SH SYNOPSIS
@@ -35,6 +35,8 @@ The output from this program is:
dlrow ,olleh
.fi
.ft R
+.SH BUGS
+This extension does not affect the default standard output.
.SH "SEE ALSO"
.IR "GAWK: Effective AWK Programming" ,
.IR filefuncs (3am),
diff --git a/extension/revoutput.c b/extension/revoutput.c
index ae4b444a..69257167 100644
--- a/extension/revoutput.c
+++ b/extension/revoutput.c
@@ -7,7 +7,7 @@
*/
/*
- * Copyright (C) 2012, 2013 the Free Software Foundation, Inc.
+ * Copyright (C) 2012, 2013, 2015 the Free Software Foundation, Inc.
*
* This file is part of GAWK, the GNU implementation of the
* AWK Programming Language.
@@ -47,7 +47,7 @@
static const gawk_api_t *api; /* for convenience macros to work */
static awk_ext_id_t *ext_id;
-static const char *ext_version = "revoutput extension: version 1.0";
+static const char *ext_version = "revoutput extension: version 1.1";
static awk_bool_t init_revoutput(void);
static awk_bool_t (*init_func)(void) = init_revoutput;
@@ -120,11 +120,14 @@ init_revoutput()
register_output_wrapper(& output_wrapper);
- make_number(0.0, & value); /* init to false */
- if (! sym_update("REVOUT", & value)) {
- warning(ext_id, _("revoutput: could not initialize REVOUT variable"));
+ if (! sym_lookup("REVOUT", AWK_SCALAR, & value)) {
+ /* only install it if not there, e.g. -v REVOUT=1 */
+ make_number(0.0, & value); /* init to false */
+ if (! sym_update("REVOUT", & value)) {
+ warning(ext_id, _("revoutput: could not initialize REVOUT variable"));
- return awk_false;
+ return awk_false;
+ }
}
return awk_true;