diff options
| author | Karl Berry <karl@gnu.org> | 1992-11-21 01:51:33 +0000 | 
|---|---|---|
| committer | Karl Berry <karl@gnu.org> | 1992-11-21 01:51:33 +0000 | 
| commit | 9114e2792fe477299d6d7f8856c616fe4ce31d21 (patch) | |
| tree | 0289e6410211f571f96229fc8321d9e83b1abef1 /src/regex.c | |
| parent | 9549c46d0b7b76c104c152e78feb33029c0e562a (diff) | |
| download | emacs-9114e2792fe477299d6d7f8856c616fe4ce31d21.tar.gz | |
*** empty log message ***
Diffstat (limited to 'src/regex.c')
| -rw-r--r-- | src/regex.c | 166 | 
1 files changed, 98 insertions, 68 deletions
diff --git a/src/regex.c b/src/regex.c index 71aa4cc87e1..112e0d6d7ad 100644 --- a/src/regex.c +++ b/src/regex.c @@ -47,9 +47,15 @@     `BSTRING', as far as I know, and neither of them use this code.  */  #if USG || STDC_HEADERS  #include <string.h> +#ifndef bcmp  #define bcmp(s1, s2, n)	memcmp ((s1), (s2), (n)) +#endif +#ifndef bcopy  #define bcopy(s, d, n)	memcpy ((d), (s), (n)) +#endif +#ifndef bzero  #define bzero(s, n)	memset ((s), 0, (n)) +#endif  #else  #include <strings.h>  #endif @@ -135,12 +141,8 @@ init_syntax_once ()     (Per Bothner suggested the basic approach.)  */  #undef SIGN_EXTEND_CHAR  #if __STDC__ -#ifndef VMS  #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) -#else /* On VMS, VAXC doesn't recognize `signed' before `char' */ -#define SIGN_EXTEND_CHAR(c) ((char) (c)) -#endif /* VMS */ -#else +#else  /* not __STDC__ */  /* As in Harbison and Steele.  */  #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)  #endif @@ -447,6 +449,7 @@ static int debug = 0;  #define DEBUG_PRINT1(x) if (debug) printf (x)  #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)  #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)  #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 				\    if (debug) print_partial_compiled_pattern (s, e)  #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)			\ @@ -760,6 +763,7 @@ print_double_string (where, string1, size1, string2, size2)  #define DEBUG_PRINT1(x)  #define DEBUG_PRINT2(x1, x2)  #define DEBUG_PRINT3(x1, x2, x3) +#define DEBUG_PRINT4(x1, x2, x3, x4)  #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)  #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) @@ -1025,9 +1029,9 @@ typedef struct       `buffer' is the compiled pattern;       `syntax' is set to SYNTAX;       `used' is set to the length of the compiled pattern; -     `fastmap_accurate' is set to zero; -     `re_nsub' is set to the number of groups in PATTERN; -     `not_bol' and `not_eol' are set to zero. +     `fastmap_accurate' is zero; +     `re_nsub' is the number of subexpressions in PATTERN; +     `not_bol' and `not_eol' are zero;     The `fastmap' and `newline_anchor' fields are neither     examined nor set.  */ @@ -1676,10 +1680,10 @@ regex_compile (pattern, size, syntax, bufp)                            |   v |   v                            a | b   | c    -                 If we are at `b,' then fixup_alt_jump right now points to a -                 three-byte space after `a.'  We'll put in the jump, set -                 fixup_alt_jump to right after `b,' and leave behind three -                 bytes which we'll fill in when we get to after `c.'  */ +                 If we are at `b', then fixup_alt_jump right now points to a +                 three-byte space after `a'.  We'll put in the jump, set +                 fixup_alt_jump to right after `b', and leave behind three +                 bytes which we'll fill in when we get to after `c'.  */                if (fixup_alt_jump)                  STORE_JUMP (jump_past_alt, fixup_alt_jump, b); @@ -2320,6 +2324,7 @@ typedef struct      int this_reg;							\      									\      DEBUG_STATEMENT (failure_id++);					\ +    DEBUG_STATEMENT (nfailure_points_pushed++);				\      DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id);		\      DEBUG_PRINT2 ("  Before push, next avail: %d\n", (fail_stack).avail);\      DEBUG_PRINT2 ("                     size: %d\n", (fail_stack).size);\ @@ -2473,6 +2478,8 @@ typedef struct        regstart[this_reg] = (const char *) POP_FAILURE_ITEM ();		\        DEBUG_PRINT2 ("      start: 0x%x\n", regstart[this_reg]);		\      }									\ +									\ +  DEBUG_STATEMENT (nfailure_points_popped++);				\  } /* POP_FAILURE_POINT */  /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in @@ -2860,15 +2867,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)    else if (endpos > total_size)      range = total_size - startpos; -  /* Update the fastmap now if not correct already.  */ -  if (fastmap && !bufp->fastmap_accurate) -    if (re_compile_fastmap (bufp) == -2) -      return -2; -      /* If the search isn't to be a backwards one, don't waste time in a -     long search for a pattern that says it is anchored.  */ -  if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf -      && range > 0) +     search for a pattern that must be anchored.  */ +  if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)      {        if (startpos > 0)  	return -1; @@ -2876,6 +2877,12 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)  	range = 1;      } +  /* Update the fastmap now if not correct already.  */ +  if (fastmap && !bufp->fastmap_accurate) +    if (re_compile_fastmap (bufp) == -2) +      return -2; +   +  /* Loop through the string, looking for a place to start matching.  */    for (;;)      {         /* If a fastmap is supplied, skip quickly over characters that @@ -2913,7 +2920,7 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)                                   ? string2[startpos - size1]                                    : string1[startpos]); -	      if (!fastmap[TRANSLATE (c)]) +	      if (!fastmap[(unsigned char) TRANSLATE (c)])  		goto advance;  	    }  	} @@ -2987,12 +2994,9 @@ typedef union  #define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something) -/* Call this when have matched something; it sets `matched' flags for the -   registers corresponding to the group of which we currently are inside.   -   Also records whether this group ever matched something.  We only care -   about this information at `stop_memory', and then only about the -   previous time through the loop (if the group is starred or whatever). -   So it is ok to clear all the nonactive registers here.  */ +/* Call this when have matched a real character; it sets `matched' flags +   for the subexpressions which we are currently inside.  Also records +   that those subexprs have matched.  */  #define SET_REGS_MATCHED()						\    do									\      {									\ @@ -3037,24 +3041,24 @@ typedef union  /* Test if at very beginning or at very end of the virtual concatenation     of `string1' and `string2'.  If only one string, it's `string2'.  */ -#define AT_STRINGS_BEG() (d == (size1 ? string1 : string2) || !size2) -#define AT_STRINGS_END() (d == end2)	 +#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) +#define AT_STRINGS_END(d) ((d) == end2)	  /* Test if D points to a character which is word-constituent.  We have     two special cases to check for: if past the end of string1, look at     the first character in string2; and if before the beginning of -   string2, look at the last character in string1. -    -   Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG ().  */ -#define LETTER_P(d)							\ +   string2, look at the last character in string1.  */ +#define WORDCHAR_P(d)							\    (SYNTAX ((d) == end1 ? *string2					\ -  : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == Sword) +           : (d) == string2 - 1 ? *(end1 - 1) : *(d))			\ +   == Sword)  /* Test if the character before D and the one at D differ with respect     to being word-constituent.  */  #define AT_WORD_BOUNDARY(d)						\ -  (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d - 1) != LETTER_P (d)) +  (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)				\ +   || WORDCHAR_P (d - 1) != WORDCHAR_P (d))  /* Free everything we malloc.  */ @@ -3161,6 +3165,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)    fail_stack_type fail_stack;  #ifdef DEBUG    static unsigned failure_id = 0; +  unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;  #endif    /* We fill all the registers internally, independent of what we @@ -3254,8 +3259,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)    else      {        /* We must initialize all our variables to NULL, so that -         `FREE_VARIABLES' doesn't try to free them.  Too bad this isn't -         Lisp, so we could have a list of variables.  As it is, */ +         `FREE_VARIABLES' doesn't try to free them.  */        regstart = regend = old_regstart = old_regend = best_regstart          = best_regend = reg_dummy = NULL;        reg_info = reg_info_dummy = (register_info_type *) NULL; @@ -3339,8 +3343,10 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)        if (p == pend)  	{ /* End of pattern means we might have succeeded.  */ -          DEBUG_PRINT1 ("End of pattern: "); -	  /* If not end of string, try backtracking.  Otherwise done.  */ +          DEBUG_PRINT1 ("end of pattern ... "); +           +	  /* If we haven't matched the entire string, and we want the +             longest match, try backtracking.  */            if (d != end_match_2)  	    {                DEBUG_PRINT1 ("backtracking.\n"); @@ -3378,6 +3384,8 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)                       For example, the pattern `x.*y.*z' against the                       strings `x-' and `y-z-', if the two strings are                       not consecutive in memory.  */ +                  DEBUG_PRINT1 ("Restoring best registers.\n"); +                                      d = match_end;                    dend = ((d >= string1 && d <= end1)  		           ? end_match_1 : end_match_2); @@ -3390,7 +3398,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)                  }              } /* d != end_match_2 */ -          DEBUG_PRINT1 ("\nAccepting match.\n"); +          DEBUG_PRINT1 ("Accepting match.\n");            /* If caller wants register contents data back, do it.  */            if (regs && !bufp->no_sub) @@ -3456,7 +3464,10 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	    } /* regs && !bufp->no_sub */            FREE_VARIABLES (); -          DEBUG_PRINT2 ("%d registers pushed.\n", num_regs_pushed); +          DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", +                        nfailure_points_pushed, nfailure_points_popped, +                        nfailure_points_pushed - nfailure_points_popped); +          DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);            mcnt = d - pos - (MATCHING_IN_FIRST_STRING   			    ? string1  @@ -3658,7 +3669,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)            /* If just failed to match something this time around with a               group that's operated on by a repetition operator, try to -             force exit from the ``loop,'' and restore the register +             force exit from the ``loop'', and restore the register               information for this group that we had before trying this               last match.  */            if ((!MATCHED_SOMETHING (reg_info[*p]) @@ -3802,7 +3813,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	case begline:            DEBUG_PRINT1 ("EXECUTING begline.\n"); -          if (AT_STRINGS_BEG ()) +          if (AT_STRINGS_BEG (d))              {                if (!bufp->not_bol) break;              } @@ -3818,7 +3829,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	case endline:            DEBUG_PRINT1 ("EXECUTING endline.\n"); -          if (AT_STRINGS_END ()) +          if (AT_STRINGS_END (d))              {                if (!bufp->not_eol) break;              } @@ -3835,7 +3846,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	/* Match at the very beginning of the data.  */          case begbuf:            DEBUG_PRINT1 ("EXECUTING begbuf.\n"); -          if (AT_STRINGS_BEG ()) +          if (AT_STRINGS_BEG (d))              break;            goto fail; @@ -3843,7 +3854,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	/* Match at the very end of the data.  */          case endbuf:            DEBUG_PRINT1 ("EXECUTING endbuf.\n"); -	  if (AT_STRINGS_END ()) +	  if (AT_STRINGS_END (d))  	    break;            goto fail; @@ -3897,7 +3908,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)               the original * applied to a group), save the information               for that group and all inner ones, so that if we fail back               to this point, the group's information will be correct. -             For example, in \(a*\)*\1, we only need the preceding group, +             For example, in \(a*\)*\1, we need the preceding group,               and in \(\(a*\)b*\)\2, we need the inner group.  */            /* We can't use `p' to check ahead because we push @@ -3927,8 +3938,8 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)            break; -        /* A smart repeat ends with a maybe_pop_jump. -	   We change it either to a pop_failure_jump or a jump.  */ +        /* A smart repeat ends with `maybe_pop_jump'. +	   We change it to either `pop_failure_jump' or `jump'.  */          case maybe_pop_jump:            EXTRACT_NUMBER_AND_INCR (mcnt, p);            DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); @@ -3956,10 +3967,21 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)              /* If we're at the end of the pattern, we can change.  */              if (p2 == pend) -              { +              { /* But if we're also at the end of the string, we might +                   as well skip changing anything.  For example, in `a+' +                   against `a', we'll have already matched the `a', and +                   I don't see the the point of changing the opcode, +                   popping the failure point, finding out it fails, and +                   then going into our endgame.  */ +                if (d == dend) +                  { +                    p = pend; +                    DEBUG_PRINT1 ("  End of pattern & string => done.\n"); +                    continue; +                  } +                    	        p[-3] = (unsigned char) pop_failure_jump; -                DEBUG_PRINT1 -                  ("  End of pattern: change to `pop_failure_jump'.\n"); +                DEBUG_PRINT1 ("  End of pattern => pop_failure_jump.\n");                }              else if ((re_opcode_t) *p2 == exactn @@ -3973,7 +3995,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)                     to the `maybe_finalize_jump' of this case.  Examine what                      follows.  */                  if ((re_opcode_t) p1[3] == exactn && p1[5] != c) -		  p[-3] = (unsigned char) pop_failure_jump; +                  { +  		    p[-3] = (unsigned char) pop_failure_jump; +                    DEBUG_PRINT3 ("  %c != %c => pop_failure_jump.\n", +                                  c, p1[5]); +                  } +                    		else if ((re_opcode_t) p1[3] == charset  			 || (re_opcode_t) p1[3] == charset_not)  		  { @@ -3988,9 +4015,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  		    if (!not)                        {    		        p[-3] = (unsigned char) pop_failure_jump; -                        DEBUG_PRINT1 -                          ("  No match: change to `pop_failure_jump'.\n"); - +                        DEBUG_PRINT1 ("  No match => pop_failure_jump.\n");                        }  		  }  	      } @@ -3999,6 +4024,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	  if ((re_opcode_t) p[-1] != pop_failure_jump)  	    {  	      p[-1] = (unsigned char) jump; +              DEBUG_PRINT1 ("  Match => jump.\n");  	      goto unconditional_jump;  	    }          /* Note fall through.  */ @@ -4060,7 +4086,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)          /* At the end of an alternative, we need to push a dummy failure -           point in case we are followed by a pop_failure_jump', because +           point in case we are followed by a `pop_failure_jump', because             we don't want the failure point for the alternative to be             popped.  For example, matching `(a|ab)*' against `aab'             requires that we match the `ab' alternative.  */ @@ -4137,14 +4163,14 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	case wordbeg:            DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); -	  if (LETTER_P (d) && (AT_STRINGS_BEG () || !LETTER_P (d - 1))) +	  if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))  	    break;            goto fail;  	case wordend:            DEBUG_PRINT1 ("EXECUTING wordend.\n"); -	  if (!AT_STRINGS_BEG () && LETTER_P (d - 1) -              && (!LETTER_P (d) || AT_STRINGS_END ())) +	  if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) +              && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))  	    break;            goto fail; @@ -4181,11 +4207,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	  goto matchsyntax;          case wordchar: -          DEBUG_PRINT1 ("EXECUTING wordchar.\n"); +          DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");  	  mcnt = (int) Sword;          matchsyntax:  	  PREFETCH (); -	  if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail; +	  if (SYNTAX (*d++) != (enum syntaxcode) mcnt) +            goto fail;            SET_REGS_MATCHED ();  	  break; @@ -4195,11 +4222,12 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	  goto matchnotsyntax;          case notwordchar: -          DEBUG_PRINT1 ("EXECUTING notwordchar.\n"); +          DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");  	  mcnt = (int) Sword; -        matchnotsyntax: /* We goto here from notsyntaxspec.  */ +        matchnotsyntax:  	  PREFETCH (); -	  if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail; +	  if (SYNTAX (*d++) == (enum syntaxcode) mcnt) +            goto fail;  	  SET_REGS_MATCHED ();            break; @@ -4207,17 +4235,19 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)  	case wordchar:            DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");  	  PREFETCH (); -          if (!LETTER_P (d)) +          if (!WORDCHAR_P (d))              goto fail;  	  SET_REGS_MATCHED (); +          d++;  	  break;  	case notwordchar:            DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");  	  PREFETCH (); -	  if (LETTER_P (d)) +	  if (WORDCHAR_P (d))              goto fail;            SET_REGS_MATCHED (); +          d++;  	  break;  #endif /* not emacs */ @@ -4812,7 +4842,7 @@ regexec (preg, string, nmatch, pmatch, eflags)  /* Returns a message corresponding to an error code, ERRCODE, returned -   from either regcomp or regexec.   */ +   from either regcomp or regexec.   We don't use PREG here.  */  size_t  regerror (errcode, preg, errbuf, errbuf_size)  | 
