diff options
| -rw-r--r-- | src/ChangeLog | 14 | ||||
| -rw-r--r-- | src/regex.c | 149 | ||||
| -rw-r--r-- | src/regex.h | 14 | 
3 files changed, 121 insertions, 56 deletions
| diff --git a/src/ChangeLog b/src/ChangeLog index f72cf43685a..27b79e8998a 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,17 @@ +2000-08-31  Stefan Monnier  <monnier@cs.yale.edu> + +	* regex.h (RE_NO_NEWLINE_ANCHOR): New syntax flag. +	(struct re_pattern_buffer): Remove newline_anchor. +	* regex.c: Keep namespace clean for GNU libc by renaming <fun> +	to __<fun> and using `weak_alias (__<fun>, <fun>)'. +	(re_max_failures, fail_stack): Use size_t rather than unsigned. +	(regex_compile): For ^ and $, choose between buffer and line (beg|end) +	depending on the new RE_NO_NEWLINE_ANCHOR syntax flag. +	(print_compiled_pattern, re_search_2, mutually_exclusive_p) +	(re_match_2_internal, re_compile_pattern, re_comp, regcomp): +	Get rid of references to newline_anchor. +	(regcomp): Allocate and precompute a fastmap. +  2000-08-31  Gerd Moellmann  <gerd@gnu.org>  	* lread.c (openp): GCPRO local variable `filename'. diff --git a/src/regex.c b/src/regex.c index 71c9dfe4507..f779d9d82e1 100644 --- a/src/regex.c +++ b/src/regex.c @@ -25,7 +25,6 @@     - replace succeed_n + jump_n with a combined operation so that the counter       can simply be decremented when popping the failure_point without having       to stack up failure_count entries. -   - get rid of `newline_anchor'.   */  /* AIX requires this to be the first thing in the file. */ @@ -47,6 +46,38 @@  # include <sys/types.h>  #endif +#ifdef _LIBC +/* We have to keep the namespace clean.  */ +# define regfree(preg) __regfree (preg) +# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) +# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) +# define regerror(errcode, preg, errbuf, errbuf_size) \ +	__regerror(errcode, preg, errbuf, errbuf_size) +# define re_set_registers(bu, re, nu, st, en) \ +	__re_set_registers (bu, re, nu, st, en) +# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ +	__re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) +# define re_match(bufp, string, size, pos, regs) \ +	__re_match (bufp, string, size, pos, regs) +# define re_search(bufp, string, size, startpos, range, regs) \ +	__re_search (bufp, string, size, startpos, range, regs) +# define re_compile_pattern(pattern, length, bufp) \ +	__re_compile_pattern (pattern, length, bufp) +# define re_set_syntax(syntax) __re_set_syntax (syntax) +# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ +	__re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) +# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) + +# define WEAK_ALIAS(a,b) weak_alias (a, b) + +/* We are also using some library internals.  */ +# include <locale/localeinfo.h> +# include <locale/elem-hash.h> +# include <langinfo.h> +#else +# define WEAK_ALIAS(a,b) +#endif +  /* This is for other GNU distributions with internationalized messages.  */  #if HAVE_LIBINTL_H || defined _LIBC  # include <libintl.h> @@ -1108,7 +1139,6 @@ print_compiled_pattern (bufp)    printf ("re_nsub: %d\t", bufp->re_nsub);    printf ("regs_alloc: %d\t", bufp->regs_allocated);    printf ("can_be_null: %d\t", bufp->can_be_null); -  printf ("newline_anchor: %d\n", bufp->newline_anchor);    printf ("no_sub: %d\t", bufp->no_sub);    printf ("not_bol: %d\t", bufp->not_bol);    printf ("not_eol: %d\t", bufp->not_eol); @@ -1184,6 +1214,7 @@ re_set_syntax (syntax)    re_syntax_options = syntax;    return ret;  } +WEAK_ALIAS (__re_set_syntax, re_set_syntax)  /* This table gives an error message for each of the error codes listed     in regex.h.  Obviously the order here has to be same as there. @@ -1264,21 +1295,22 @@ static const char *re_error_msgid[] =  /* Roughly the maximum number of failure points on the stack.  Would be     exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed.     This is a variable only so users of regex can assign to it; we never -   change it ourselves.	 */ -#if defined MATCH_MAY_ALLOCATE -/* Note that 4400 is enough to cause a crash on Alpha OSF/1, +   change it ourselves.  */ +# if defined MATCH_MAY_ALLOCATE +/* Note that 4400 was enough to cause a crash on Alpha OSF/1,     whose default stack limit is 2mb.  In order for a larger     value to work reliably, you have to try to make it accord     with the process stack limit.  */ -int re_max_failures = 40000; -#else -int re_max_failures = 4000; -#endif +size_t re_max_failures = 40000; +# else +size_t re_max_failures = 4000; +# endif  union fail_stack_elt  { -   const unsigned char *pointer; -  unsigned int integer; +  const unsigned char *pointer; +  /* This should be the biggest `int' that's no bigger than a pointer.  */ +  long integer;  };  typedef union fail_stack_elt fail_stack_elt_t; @@ -1286,9 +1318,9 @@ typedef union fail_stack_elt fail_stack_elt_t;  typedef struct  {    fail_stack_elt_t *stack; -  unsigned size; -  unsigned avail;		/* Offset of next open position.  */ -  unsigned frame;		/* Offset of the cur constructed frame.  */ +  size_t size; +  size_t avail;	/* Offset of next open position.  */ +  size_t frame;	/* Offset of the cur constructed frame.  */  } fail_stack_type;  #define PATTERN_STACK_EMPTY()     (fail_stack.avail == 0) @@ -1963,8 +1995,7 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type       `re_nsub' is the number of subexpressions in PATTERN;       `not_bol' and `not_eol' are zero; -   The `fastmap' and `newline_anchor' fields are neither -   examined nor set.  */ +   The `fastmap' field is neither examined nor set.  */  /* Insert the `jump' from the end of last alternative to "here".     The space for the jump has already been allocated. */ @@ -2126,7 +2157,7 @@ regex_compile (pattern, size, syntax, bufp)  		|| syntax & RE_CONTEXT_INDEP_ANCHORS  		   /* Otherwise, depends on what's come before.	 */  		|| at_begline_loc_p (pattern, p, syntax)) -	      BUF_PUSH (begline); +	      BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? begbuf : begline);  	    else  	      goto normal_char;  	  } @@ -2141,7 +2172,7 @@ regex_compile (pattern, size, syntax, bufp)  		|| syntax & RE_CONTEXT_INDEP_ANCHORS  		   /* Otherwise, depends on what's next.  */  		|| at_endline_loc_p (p, pend, syntax)) -	       BUF_PUSH (endline); +	       BUF_PUSH ((syntax & RE_NO_NEWLINE_ANCHOR) ? endbuf : endline);  	     else  	       goto normal_char;  	   } @@ -3399,7 +3430,6 @@ analyse_first (p, pend, fastmap, multibyte)       so that `p' is monotonically increasing.  More to the point, we       never set `p' (or push) anything `<= p1'.  */ -  /* If can_be_null is set, then the fastmap will not be used anyway.  */    while (1)      {        /* `p1' is used as a marker of how far back a `on_failure_jump' @@ -3689,9 +3719,9 @@ re_compile_fastmap (bufp)    analysis = analyse_first (bufp->buffer, bufp->buffer + bufp->used,  			    fastmap, RE_MULTIBYTE_P (bufp)); +  bufp->can_be_null = (analysis != 0);    if (analysis < -1)      return analysis; -  bufp->can_be_null = (analysis != 0);    return 0;  } /* re_compile_fastmap */ @@ -3729,6 +3759,7 @@ re_set_registers (bufp, regs, num_regs, starts, ends)        regs->start = regs->end = (regoff_t *) 0;      }  } +WEAK_ALIAS (__re_set_registers, re_set_registers)  /* Searching routines.	*/ @@ -3745,6 +3776,7 @@ re_search (bufp, string, size, startpos, range, regs)    return re_search_2 (bufp, NULL, 0, string, size, startpos, range,  		      regs, size);  } +WEAK_ALIAS (__re_search, re_search)  /* End address of virtual concatenation of string.  */  #define STOP_ADDR_VSTRING(P)				\ @@ -3792,7 +3824,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)    register RE_TRANSLATE_TYPE translate = bufp->translate;    int total_size = size1 + size2;    int endpos = startpos + range; -  int anchored_start = 0; +  boolean anchored_start;    /* Nonzero if we have to concern multibyte character.	 */    const boolean multibyte = RE_MULTIBYTE_P (bufp); @@ -3836,8 +3868,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)        return -2;    /* See whether the pattern is anchored.  */ -  if (bufp->buffer[0] == begline) -    anchored_start = 1; +  anchored_start = (bufp->buffer[0] == begline);  #ifdef emacs    gl_state.object = re_match_object; @@ -3857,10 +3888,9 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)  	 because that case doesn't repeat.  */        if (anchored_start && startpos > 0)  	{ -	  if (! (bufp->newline_anchor -		 && ((startpos <= size1 ? string1[startpos - 1] -		      : string2[startpos - size1 - 1]) -		     == '\n'))) +	  if (! ((startpos <= size1 ? string1[startpos - 1] +		  : string2[startpos - size1 - 1]) +		 == '\n'))  	    goto advance;  	} @@ -4009,6 +4039,7 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)      }    return -1;  } /* re_search_2 */ +WEAK_ALIAS (__re_search_2, re_search_2)  /* Declarations and macros for re_match_2.  */ @@ -4213,9 +4244,6 @@ mutually_exclusive_p (bufp, p1, p2)        break;      case endline: -      if (!bufp->newline_anchor) -	break; -      /* Fallthrough */      case exactn:        {  	register unsigned int c @@ -4377,6 +4405,7 @@ re_match (bufp, string, size, pos, regs)  # endif    return result;  } +WEAK_ALIAS (__re_match, re_match)  #endif /* not emacs */  #ifdef emacs @@ -4424,6 +4453,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  #endif    return result;  } +WEAK_ALIAS (__re_match_2, re_match_2)  /* This is a separate function so that we can force an alloca cleanup     afterwards.	*/ @@ -5089,8 +5119,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)  	/* begline matches the empty string at the beginning of the string -	   (unless `not_bol' is set in `bufp'), and, if -	   `newline_anchor' is set, after newlines.  */ +	   (unless `not_bol' is set in `bufp'), and after newlines.  */  	case begline:  	  DEBUG_PRINT1 ("EXECUTING begline.\n"); @@ -5102,7 +5131,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)  	    {  	      unsigned char c;  	      GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2); -	      if (c == '\n' && bufp->newline_anchor) +	      if (c == '\n')  		break;  	    }  	  /* In all other cases, we fail.  */ @@ -5120,7 +5149,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)  	  else  	    {  	      PREFETCH_NOLIMIT (); -	      if (*d == '\n' && bufp->newline_anchor) +	      if (*d == '\n')  		break;  	    }  	  goto fail; @@ -5645,15 +5674,13 @@ re_compile_pattern (pattern, length, bufp)       setting no_sub.  */    bufp->no_sub = 0; -  /* Match anchors at newline.  */ -  bufp->newline_anchor = 1; -    ret = regex_compile ((re_char*) pattern, length, re_syntax_options, bufp);    if (!ret)      return NULL;    return gettext (re_error_msgid[(int) ret]);  } +WEAK_ALIAS (__re_compile_pattern, re_compile_pattern)  /* Entry points compatible with 4.2 BSD regex library.  We don't define     them unless specifically requested.  */ @@ -5700,9 +5727,6 @@ re_comp (s)    /* Since `re_exec' always passes NULL for the `regs' argument, we       don't need to initialize the pattern buffer fields which affect it.  */ -  /* Match anchors at newlines.  */ -  re_comp_buf.newline_anchor = 1; -    ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);    if (!ret) @@ -5740,8 +5764,8 @@ re_exec (s)       `syntax' to RE_SYNTAX_POSIX_EXTENDED if the         REG_EXTENDED bit in CFLAGS is set; otherwise, to         RE_SYNTAX_POSIX_BASIC; -     `newline_anchor' to REG_NEWLINE being set in CFLAGS; -     `fastmap' and `fastmap_accurate' to zero; +     `fastmap' to an allocated space for the fastmap; +     `fastmap_accurate' to zero;       `re_nsub' to the number of subexpressions in PATTERN.     PATTERN is the address of the pattern string. @@ -5780,11 +5804,8 @@ regcomp (preg, pattern, cflags)    preg->allocated = 0;    preg->used = 0; -  /* Don't bother to use a fastmap when searching.  This simplifies the -     REG_NEWLINE case: if we used a fastmap, we'd have to put all the -     characters after newlines into the fastmap.  This way, we just try -     every character.  */ -  preg->fastmap = 0; +  /* Try to allocate space for the fastmap.  */ +  preg->fastmap = (char *) malloc (1 << BYTEWIDTH);    if (cflags & REG_ICASE)      { @@ -5808,11 +5829,9 @@ regcomp (preg, pattern, cflags)      { /* REG_NEWLINE implies neither . nor [^...] match newline.  */        syntax &= ~RE_DOT_NEWLINE;        syntax |= RE_HAT_LISTS_NOT_NEWLINE; -      /* It also changes the matching behavior.  */ -      preg->newline_anchor = 1;      }    else -    preg->newline_anchor = 0; +    syntax |= RE_NO_NEWLINE_ANCHOR;    preg->no_sub = !!(cflags & REG_NOSUB); @@ -5822,10 +5841,22 @@ regcomp (preg, pattern, cflags)    /* POSIX doesn't distinguish between an unmatched open-group and an       unmatched close-group: both are REG_EPAREN.  */ -  if (ret == REG_ERPAREN) ret = REG_EPAREN; - +  if (ret == REG_ERPAREN) +    ret = REG_EPAREN; + +  if (ret == REG_NOERROR && preg->fastmap) +    { /* Compute the fastmap now, since regexec cannot modify the pattern +	 buffer.  */ +      re_compile_fastmap (preg); +      if (preg->can_be_null) +	{ /* The fastmap can't be used anyway.  */ +	  free (preg->fastmap); +	  preg->fastmap = NULL; +	} +    }    return (int) ret;  } +WEAK_ALIAS (__regcomp, regcomp)  /* regexec searches for a given pattern, specified by PREG, in the @@ -5854,7 +5885,7 @@ regexec (preg, string, nmatch, pmatch, eflags)    struct re_registers regs;    regex_t private_preg;    int len = strlen (string); -  boolean want_reg_info = !preg->no_sub && nmatch > 0; +  boolean want_reg_info = !preg->no_sub && nmatch > 0 && pmatch;    private_preg = *preg; @@ -5875,6 +5906,15 @@ regexec (preg, string, nmatch, pmatch, eflags)        regs.end = regs.start + nmatch;      } +  /* Instead of using not_eol to implement REG_NOTEOL, we could simply +     pass (&private_preg, string, len + 1, 0, len, ...) pretending the string +     was a little bit longer but still only matching the real part. +     This works because the `endline' will check for a '\n' and will find a +     '\0', correctly deciding that this is not the end of a line. +     But it doesn't work out so nicely for REG_NOTBOL, since we don't have +     a convenient '\0' there.  For all we know, the string could be preceded +     by '\n' which would throw things off.  */ +    /* Perform the searching operation.  */    ret = re_search (&private_preg, string, len,  		   /* start: */ 0, /* range: */ len, @@ -5901,6 +5941,7 @@ regexec (preg, string, nmatch, pmatch, eflags)    /* We want zero return to mean success, unlike `re_search'.  */    return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;  } +WEAK_ALIAS (__regexec, regexec)  /* Returns a message corresponding to an error code, ERRCODE, returned @@ -5941,6 +5982,7 @@ regerror (errcode, preg, errbuf, errbuf_size)    return msg_size;  } +WEAK_ALIAS (__regerror, regerror)  /* Free dynamically allocated space used by PREG.  */ @@ -5965,5 +6007,6 @@ regfree (preg)      free (preg->translate);    preg->translate = NULL;  } +WEAK_ALIAS (__regfree, regfree)  #endif /* not emacs  */ diff --git a/src/regex.h b/src/regex.h index 46f2a633c3a..ef4284cdce2 100644 --- a/src/regex.h +++ b/src/regex.h @@ -150,6 +150,17 @@ typedef unsigned long int reg_syntax_t;  /* If this bit is set, then (?:...) is treated as a shy group.  */  #define RE_SHY_GROUPS (RE_FRUGAL << 1) +/* If this bit is set, ^ and $ only match at beg/end of buffer.  */ +#define RE_NO_NEWLINE_ANCHOR (RE_SHY_GROUPS << 1) + +/* If this bit is set, turn on internal regex debugging. +   If not set, and debugging was on, turn it off. +   This only works if regex.c is compiled -DDEBUG. +   We define this bit always, so that all that's needed to turn on +   debugging is to recompile regex.c; the calling code can always have +   this bit set, and it won't affect anything in the normal case. */ +#define RE_DEBUG (RE_NO_NEWLINE_ANCHOR << 1) +  /* This global variable defines the particular regexp syntax to use (for     some interfaces).  When a regexp is compiled, the syntax used is     stored in the pattern buffer, so changing this does not affect @@ -379,9 +390,6 @@ struct re_pattern_buffer          /* Similarly for an end-of-line anchor.  */    unsigned not_eol : 1; -        /* If true, an anchor at a newline matches.  */ -  unsigned newline_anchor : 1; -  #ifdef emacs    /* If true, multi-byte form in the `buffer' should be recognized as a       multibyte character. */ | 
