diff options
Diffstat (limited to 'src/regex.c')
-rw-r--r-- | src/regex.c | 781 |
1 files changed, 513 insertions, 268 deletions
diff --git a/src/regex.c b/src/regex.c index 1e80b9bbeef..c6d6c36a0bc 100644 --- a/src/regex.c +++ b/src/regex.c @@ -3,11 +3,12 @@ internationalization features.) Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. + 2002, 2003, 2004, 2005, 2006, 2007 + Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) + the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, @@ -123,7 +124,7 @@ # define SYNTAX_ENTRY_VIA_PROPERTY # include "syntax.h" -# include "charset.h" +# include "character.h" # include "category.h" # ifdef malloc @@ -144,28 +145,51 @@ # define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) # define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte) -# define RE_STRING_CHAR(p, s) \ +# define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte) +# define RE_STRING_CHAR(p, s, multibyte) \ (multibyte ? (STRING_CHAR (p, s)) : (*(p))) -# define RE_STRING_CHAR_AND_LENGTH(p, s, len) \ +# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) \ (multibyte ? (STRING_CHAR_AND_LENGTH (p, s, len)) : ((len) = 1, *(p))) -/* Set C a (possibly multibyte) character before P. P points into a - string which is the virtual concatenation of STR1 (which ends at - END1) or STR2 (which ends at END2). */ -# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ - do { \ - if (multibyte) \ - { \ - re_char *dtemp = (p) == (str2) ? (end1) : (p); \ - re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ - re_char *d0 = dtemp; \ - PREV_CHAR_BOUNDARY (d0, dlimit); \ - c = STRING_CHAR (d0, dtemp - d0); \ - } \ - else \ - (c = ((p) == (str2) ? (end1) : (p))[-1]); \ +# define RE_CHAR_TO_MULTIBYTE(c) unibyte_to_multibyte_table[(c)] + +# define RE_CHAR_TO_UNIBYTE(c) \ + (ASCII_CHAR_P (c) ? (c) \ + : CHAR_BYTE8_P (c) ? CHAR_TO_BYTE8 (c) \ + : multibyte_char_to_unibyte_safe (c)) + +/* Set C a (possibly converted to multibyte) character before P. P + points into a string which is the virtual concatenation of STR1 + (which ends at END1) or STR2 (which ends at END2). */ +# define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ + do { \ + if (target_multibyte) \ + { \ + re_char *dtemp = (p) == (str2) ? (end1) : (p); \ + re_char *dlimit = ((p) > (str2) && (p) <= (end2)) ? (str2) : (str1); \ + while (dtemp-- > dlimit && !CHAR_HEAD_P (*dtemp)); \ + c = STRING_CHAR (dtemp, (p) - dtemp); \ + } \ + else \ + { \ + (c = ((p) == (str2) ? (end1) : (p))[-1]); \ + (c) = RE_CHAR_TO_MULTIBYTE (c); \ + } \ } while (0) +/* Set C a (possibly converted to multibyte) character at P, and set + LEN to the byte length of that character. */ +# define GET_CHAR_AFTER(c, p, len) \ + do { \ + if (target_multibyte) \ + (c) = STRING_CHAR_AND_LENGTH (p, 0, len); \ + else \ + { \ + (c) = *p; \ + len = 1; \ + (c) = RE_CHAR_TO_MULTIBYTE (c); \ + } \ + } while (0) #else /* not emacs */ @@ -277,6 +301,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; # define CHARSET_LEADING_CODE_BASE(c) 0 # define MAX_MULTIBYTE_LENGTH 1 # define RE_MULTIBYTE_P(x) 0 +# define RE_TARGET_MULTIBYTE_P(x) 0 # define WORD_BOUNDARY_P(c1, c2) (0) # define CHAR_HEAD_P(p) (1) # define SINGLE_BYTE_CHAR_P(c) (1) @@ -284,13 +309,21 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 }; # define MULTIBYTE_FORM_LENGTH(p, s) (1) # define PREV_CHAR_BOUNDARY(p, limit) ((p)--) # define STRING_CHAR(p, s) (*(p)) -# define RE_STRING_CHAR STRING_CHAR +# define RE_STRING_CHAR(p, s, multibyte) STRING_CHAR ((p), (s)) # define CHAR_STRING(c, s) (*(s) = (c), 1) # define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) -# define RE_STRING_CHAR_AND_LENGTH STRING_CHAR_AND_LENGTH +# define RE_STRING_CHAR_AND_LENGTH(p, s, len, multibyte) STRING_CHAR_AND_LENGTH ((p), (s), (len)) +# define RE_CHAR_TO_MULTIBYTE(c) (c) +# define RE_CHAR_TO_UNIBYTE(c) (c) # define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) +# define GET_CHAR_AFTER(c, p, len) \ + (c = *p, len = 1) # define MAKE_CHAR(charset, c1, c2) (c1) +# define BYTE8_TO_CHAR(c) (c) +# define CHAR_BYTE8_P(c) (0) +# define CHAR_LEADING_CODE(c) (c) + #endif /* not emacs */ #ifndef RE_TRANSLATE @@ -496,7 +529,7 @@ init_syntax_once () # ifdef __GNUC__ # define alloca __builtin_alloca # else /* not __GNUC__ */ -# if HAVE_ALLOCA_H +# ifdef HAVE_ALLOCA_H # include <alloca.h> # endif /* HAVE_ALLOCA_H */ # endif /* not __GNUC__ */ @@ -1736,7 +1769,7 @@ static int analyse_first _RE_ARGS ((re_char *p, re_char *pend, do { \ int len; \ if (p == pend) return REG_EEND; \ - c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len); \ + c = RE_STRING_CHAR_AND_LENGTH (p, pend - p, len, multibyte); \ p += len; \ } while (0) @@ -1947,10 +1980,10 @@ struct range_table_work_area #define EXTEND_RANGE_TABLE(work_area, n) \ do { \ - if (((work_area)->used + (n)) * sizeof (int) > (work_area)->allocated) \ + if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated) \ { \ - extend_range_table_work_area (work_area); \ - if ((work_area)->table == 0) \ + extend_range_table_work_area (&work_area); \ + if ((work_area).table == 0) \ return (REG_ESPACE); \ } \ } while (0) @@ -1967,15 +2000,12 @@ struct range_table_work_area #define BIT_UPPER 0x10 #define BIT_MULTIBYTE 0x20 -/* Set a range START..END to WORK_AREA. - The range is passed through TRANSLATE, so START and END - should be untranslated. */ -#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end) \ +/* Set a range (RANGE_START, RANGE_END) to WORK_AREA. */ +#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end) \ do { \ - int tem; \ - tem = set_image_of_range (&work_area, start, end, translate); \ - if (tem > 0) \ - FREE_STACK_RETURN (tem); \ + EXTEND_RANGE_TABLE ((work_area), 2); \ + (work_area).table[(work_area).used++] = (range_start); \ + (work_area).table[(work_area).used++] = (range_end); \ } while (0) /* Free allocated memory for WORK_AREA. */ @@ -1995,6 +2025,113 @@ struct range_table_work_area #define SET_LIST_BIT(c) (b[((c)) / BYTEWIDTH] |= 1 << ((c) % BYTEWIDTH)) +#ifdef emacs + +/* Store characters in the range FROM to TO in the bitmap at B (for + ASCII and unibyte characters) and WORK_AREA (for multibyte + characters) while translating them and paying attention to the + continuity of translated characters. + + Implementation note: It is better to implement these fairly big + macros by a function, but it's not that easy because macros called + in this macro assume various local variables already declared. */ + +/* Both FROM and TO are ASCII characters. */ + +#define SETUP_ASCII_RANGE(work_area, FROM, TO) \ + do { \ + int C0, C1; \ + \ + for (C0 = (FROM); C0 <= (TO); C0++) \ + { \ + C1 = TRANSLATE (C0); \ + if (! ASCII_CHAR_P (C1)) \ + { \ + SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \ + if ((C1 = RE_CHAR_TO_UNIBYTE (C1)) < 0) \ + C1 = C0; \ + } \ + SET_LIST_BIT (C1); \ + } \ + } while (0) + + +/* Both FROM and TO are unibyte characters (0x80..0xFF). */ + +#define SETUP_UNIBYTE_RANGE(work_area, FROM, TO) \ + do { \ + int C0, C1, C2, I; \ + int USED = RANGE_TABLE_WORK_USED (work_area); \ + \ + for (C0 = (FROM); C0 <= (TO); C0++) \ + { \ + C1 = RE_CHAR_TO_MULTIBYTE (C0); \ + if (CHAR_BYTE8_P (C1)) \ + SET_LIST_BIT (C0); \ + else \ + { \ + C2 = TRANSLATE (C1); \ + if (C2 == C1 \ + || (C1 = RE_CHAR_TO_UNIBYTE (C2)) < 0) \ + C1 = C0; \ + SET_LIST_BIT (C1); \ + for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \ + { \ + int from = RANGE_TABLE_WORK_ELT (work_area, I); \ + int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \ + \ + if (C2 >= from - 1 && C2 <= to + 1) \ + { \ + if (C2 == from - 1) \ + RANGE_TABLE_WORK_ELT (work_area, I)--; \ + else if (C2 == to + 1) \ + RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \ + break; \ + } \ + } \ + if (I < USED) \ + SET_RANGE_TABLE_WORK_AREA ((work_area), C2, C2); \ + } \ + } \ + } while (0) + + +/* Both FROM and TO are mulitbyte characters. */ + +#define SETUP_MULTIBYTE_RANGE(work_area, FROM, TO) \ + do { \ + int C0, C1, C2, I, USED = RANGE_TABLE_WORK_USED (work_area); \ + \ + SET_RANGE_TABLE_WORK_AREA ((work_area), (FROM), (TO)); \ + for (C0 = (FROM); C0 <= (TO); C0++) \ + { \ + C1 = TRANSLATE (C0); \ + if ((C2 = RE_CHAR_TO_UNIBYTE (C1)) >= 0 \ + || (C1 != C0 && (C2 = RE_CHAR_TO_UNIBYTE (C0)) >= 0)) \ + SET_LIST_BIT (C2); \ + if (C1 >= (FROM) && C1 <= (TO)) \ + continue; \ + for (I = RANGE_TABLE_WORK_USED (work_area) - 2; I >= USED; I -= 2) \ + { \ + int from = RANGE_TABLE_WORK_ELT (work_area, I); \ + int to = RANGE_TABLE_WORK_ELT (work_area, I + 1); \ + \ + if (C1 >= from - 1 && C1 <= to + 1) \ + { \ + if (C1 == from - 1) \ + RANGE_TABLE_WORK_ELT (work_area, I)--; \ + else if (C1 == to + 1) \ + RANGE_TABLE_WORK_ELT (work_area, I + 1)++; \ + break; \ + } \ + } \ + if (I < USED) \ + SET_RANGE_TABLE_WORK_AREA ((work_area), C1, C1); \ + } \ + } while (0) + +#endif /* emacs */ + /* Get the next unsigned number in the uncompiled pattern. */ #define GET_UNSIGNED_NUMBER(num) \ do { \ @@ -2118,6 +2255,7 @@ extend_range_table_work_area (work_area) = (int *) malloc (work_area->allocated); } +#if 0 #ifdef emacs /* Carefully find the ranges of codes that are equivalent @@ -2350,6 +2488,7 @@ set_image_of_range (work_area, start, end, translate) return -1; } +#endif /* 0 */ #ifndef MATCH_MAY_ALLOCATE @@ -2488,6 +2627,9 @@ regex_compile (pattern, size, syntax, bufp) /* If the object matched can contain multibyte characters. */ const boolean multibyte = RE_MULTIBYTE_P (bufp); + /* If a target of matching can contain multibyte characters. */ + const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp); + /* Nonzero if we have pushed down into a subpattern. */ int in_subpattern = 0; @@ -2840,6 +2982,7 @@ regex_compile (pattern, size, syntax, bufp) { boolean escaped_char = false; const unsigned char *p2 = p; + re_wchar_t ch, c2; if (p == pend) FREE_STACK_RETURN (REG_EBRACK); @@ -2866,10 +3009,6 @@ regex_compile (pattern, size, syntax, bufp) break; } - /* What should we do for the character which is - greater than 0x7F, but not BASE_LEADING_CODE_P? - XXX */ - /* See if we're at the beginning of a possible character class. */ @@ -2906,8 +3045,8 @@ regex_compile (pattern, size, syntax, bufp) them). */ if (c == ':' && *p == ']') { - re_wchar_t ch; re_wctype_t cc; + int limit; cc = re_wctype (str); @@ -2920,6 +3059,15 @@ regex_compile (pattern, size, syntax, bufp) if (p == pend) FREE_STACK_RETURN (REG_EBRACK); +#ifndef emacs + for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) + if (re_iswctype (btowc (ch), cc)) + { + c = TRANSLATE (ch); + if (c < (1 << BYTEWIDTH)) + SET_LIST_BIT (c); + } +#else /* emacs */ /* Most character classes in a multibyte match just set a flag. Exceptions are is_blank, is_digit, is_cntrl, and is_xdigit, since @@ -2927,18 +3075,25 @@ regex_compile (pattern, size, syntax, bufp) don't need to handle them for multibyte. They are distinguished by a negative wctype. */ - if (multibyte) - SET_RANGE_TABLE_WORK_AREA_BIT (range_table_work, - re_wctype_to_bit (cc)); - - for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) + for (ch = 0; ch < 256; ++ch) { - int translated = TRANSLATE (ch); - if (translated < (1 << BYTEWIDTH) - && re_iswctype (btowc (ch), cc)) - SET_LIST_BIT (translated); + c = RE_CHAR_TO_MULTIBYTE (ch); + if (! CHAR_BYTE8_P (c) + && re_iswctype (c, cc)) + { + SET_LIST_BIT (ch); + c1 = TRANSLATE (c); + if (c1 == c) + continue; + if (ASCII_CHAR_P (c1)) + SET_LIST_BIT (c1); + else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) + SET_LIST_BIT (c1); + } } - + SET_RANGE_TABLE_WORK_AREA_BIT + (range_table_work, re_wctype_to_bit (cc)); +#endif /* emacs */ /* In most cases the matching rule for char classes only uses the syntax table for multibyte chars, so that the content of the syntax-table it is not @@ -2971,64 +3126,63 @@ regex_compile (pattern, size, syntax, bufp) /* Fetch the character which ends the range. */ PATFETCH (c1); - - if (SINGLE_BYTE_CHAR_P (c)) - { - if (! SINGLE_BYTE_CHAR_P (c1)) - { - /* Handle a range starting with a - character of less than 256, and ending - with a character of not less than 256. - Split that into two ranges, the low one - ending at 0377, and the high one - starting at the smallest character in - the charset of C1 and ending at C1. */ - int charset = CHAR_CHARSET (c1); - re_wchar_t c2 = MAKE_CHAR (charset, 0, 0); - - SET_RANGE_TABLE_WORK_AREA (range_table_work, - c2, c1); - c1 = 0377; - } - } - else if (!SAME_CHARSET_P (c, c1)) - FREE_STACK_RETURN (REG_ERANGEX); +#ifdef emacs + if (CHAR_BYTE8_P (c1) + && ! ASCII_CHAR_P (c) && ! CHAR_BYTE8_P (c)) + /* Treat the range from a multibyte character to + raw-byte character as empty. */ + c = c1 + 1; +#endif /* emacs */ } else /* Range from C to C. */ c1 = c; - /* Set the range ... */ - if (SINGLE_BYTE_CHAR_P (c)) - /* ... into bitmap. */ + if (c > c1) { - re_wchar_t this_char; - re_wchar_t range_start = c, range_end = c1; - - /* If the start is after the end, the range is empty. */ - if (range_start > range_end) + if (syntax & RE_NO_EMPTY_RANGES) + FREE_STACK_RETURN (REG_ERANGEX); + /* Else, repeat the loop. */ + } + else + { +#ifndef emacs + /* Set the range into bitmap */ + for (; c <= c1; c++) { - if (syntax & RE_NO_EMPTY_RANGES) - FREE_STACK_RETURN (REG_ERANGE); - /* Else, repeat the loop. */ + ch = TRANSLATE (c); + if (ch < (1 << BYTEWIDTH)) + SET_LIST_BIT (ch); } - else +#else /* emacs */ + if (c < 128) + { + ch = MIN (127, c1); + SETUP_ASCII_RANGE (range_table_work, c, ch); + c = ch + 1; + if (CHAR_BYTE8_P (c1)) + c = BYTE8_TO_CHAR (128); + } + if (c <= c1) { - for (this_char = range_start; this_char <= range_end; - this_char++) + if (CHAR_BYTE8_P (c)) + { + c = CHAR_TO_BYTE8 (c); + c1 = CHAR_TO_BYTE8 (c1); + for (; c <= c1; c++) + SET_LIST_BIT (c); + } + else if (multibyte) + { + SETUP_MULTIBYTE_RANGE (range_table_work, c, c1); + } + else { - int translated = TRANSLATE (this_char); - if (translated < (1 << BYTEWIDTH)) - SET_LIST_BIT (translated); - else - SET_RANGE_TABLE_WORK_AREA - (range_table_work, translated, translated); + SETUP_UNIBYTE_RANGE (range_table_work, c, c1); } } +#endif /* emacs */ } - else - /* ... into range table. */ - SET_RANGE_TABLE_WORK_AREA (range_table_work, c, c1); } /* Discard any (non)matching list bytes that are all 0 at the @@ -3639,12 +3793,25 @@ regex_compile (pattern, size, syntax, bufp) { int len; - c = TRANSLATE (c); if (multibyte) - len = CHAR_STRING (c, b); + { + c = TRANSLATE (c); + len = CHAR_STRING (c, b); + b += len; + } else - *b = c, len = 1; - b += len; + { + c1 = RE_CHAR_TO_MULTIBYTE (c); + if (! CHAR_BYTE8_P (c1)) + { + re_wchar_t c2 = TRANSLATE (c1); + + if (c1 != c2 && (c1 = RE_CHAR_TO_UNIBYTE (c2)) >= 0) + c = c1; + } + *b++ = c; + len = 1; + } (*pending_exact) += len; } @@ -3914,14 +4081,21 @@ analyse_first (p, pend, fastmap, multibyte) case exactn: if (fastmap) { - int c = RE_STRING_CHAR (p + 1, pend - p); - /* When fast-scanning, the fastmap can be indexed either with - a char (smaller than 256) or with the first byte of - a char's byte sequence. So we have to conservatively add - both to the table. */ - if (SINGLE_BYTE_CHAR_P (c)) - fastmap[c] = 1; + /* If multibyte is nonzero, the first byte of each + character is an ASCII or a leading code. Otherwise, + each byte is a character. Thus, this works in both + cases. */ fastmap[p[1]] = 1; + if (! multibyte) + { + /* For the case of matching this unibyte regex + against multibyte, we must set a leading code of + the corresponding multibyte character. */ + int c = RE_CHAR_TO_MULTIBYTE (p[1]); + + if (! CHAR_BYTE8_P (c)) + fastmap[CHAR_LEADING_CODE (c)] = 1; + } } break; @@ -3934,18 +4108,14 @@ analyse_first (p, pend, fastmap, multibyte) case charset_not: - /* Chars beyond end of bitmap are possible matches. - All the single-byte codes can occur in multibyte buffers. - So any that are not listed in the charset - are possible matches, even in multibyte buffers. */ if (!fastmap) break; - /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially - because it will automatically be set when needed by virtue of - being larger than the highest char of its charset (0xbf) but - smaller than (1<<BYTEWIDTH). */ - for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; - j < (1 << BYTEWIDTH); j++) - fastmap[j] = 1; + { + /* Chars beyond end of bitmap are possible matches. */ + for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH; + j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + } + /* Fallthrough */ case charset: if (!fastmap) break; @@ -3953,28 +4123,24 @@ analyse_first (p, pend, fastmap, multibyte) for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++; j >= 0; j--) if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not) - { - fastmap[j] = 1; -#ifdef emacs - if (j >= 0x80 && j < 0xa0) - fastmap[LEADING_CODE_8_BIT_CONTROL] = 1; -#endif - } + fastmap[j] = 1; - if ((not && multibyte) - /* Any character set can possibly contain a character +#ifdef emacs + if (/* Any leading code can possibly start a character which doesn't match the specified set of characters. */ - || (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) - && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)) - /* If we can match a character class, we can match - any character set. */ + not + || + /* If we can match a character class, we can match any + multibyte characters. */ + (CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) + && CHARSET_RANGE_TABLE_BITS (&p[-2]) != 0)) + { - set_fastmap_for_multibyte_characters: if (match_any_multibyte_characters == false) { - for (j = 0x80; j < 0xA0; j++) /* XXX */ - if (BASE_LEADING_CODE_P (j)) - fastmap[j] = 1; + for (j = MIN_MULTIBYTE_LEADING_CODE; + j <= MAX_MULTIBYTE_LEADING_CODE; j++) + fastmap[j] = 1; match_any_multibyte_characters = true; } } @@ -3982,9 +4148,10 @@ analyse_first (p, pend, fastmap, multibyte) else if (!not && CHARSET_RANGE_TABLE_EXISTS_P (&p[-2]) && match_any_multibyte_characters == false) { - /* Set fastmap[I] 1 where I is a base leading code of each - multibyte character in the range table. */ + /* Set fastmap[I] to 1 where I is a leading code of each + multibyte characer in the range table. */ int c, count; + unsigned char lc1, lc2; /* Make P points the range table. `+ 2' is to skip flag bits for a character class. */ @@ -3992,14 +4159,19 @@ analyse_first (p, pend, fastmap, multibyte) /* Extract the number of ranges in range table into COUNT. */ EXTRACT_NUMBER_AND_INCR (count, p); - for (; count > 0; count--, p += 2 * 3) /* XXX */ + for (; count > 0; count--, p += 3) { - /* Extract the start of each range. */ + /* Extract the start and end of each range. */ + EXTRACT_CHARACTER (c, p); + lc1 = CHAR_LEADING_CODE (c); + p += 3; EXTRACT_CHARACTER (c, p); - j = CHAR_CHARSET (c); - fastmap[CHARSET_LEADING_CODE_BASE (j)] = 1; + lc2 = CHAR_LEADING_CODE (c); + for (j = lc1; j <= lc2; j++) + fastmap[j] = 1; } } +#endif break; case syntaxspec: @@ -4022,14 +4194,19 @@ analyse_first (p, pend, fastmap, multibyte) if (!fastmap) break; not = (re_opcode_t)p[-1] == notcategoryspec; k = *p++; - for (j = 0; j < (1 << BYTEWIDTH); j++) + for (j = (1 << BYTEWIDTH); j >= 0; j--) if ((CHAR_HAS_CATEGORY (j, k)) ^ not) fastmap[j] = 1; - if (multibyte) - /* Any character set can possibly contain a character - whose category is K (or not). */ - goto set_fastmap_for_multibyte_characters; + /* Any leading code can possibly start a character which + has or doesn't has the specified category. */ + if (match_any_multibyte_characters == false) + { + for (j = MIN_MULTIBYTE_LEADING_CODE; + j <= MAX_MULTIBYTE_LEADING_CODE; j++) + fastmap[j] = 1; + match_any_multibyte_characters = true; + } break; /* All cases after this match the empty string. These end with @@ -4279,9 +4456,8 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) int total_size = size1 + size2; int endpos = startpos + range; boolean anchored_start; - - /* Nonzero if we have to concern multibyte character. */ - const boolean multibyte = RE_MULTIBYTE_P (bufp); + /* Nonzero if we are searching multibyte string. */ + const boolean multibyte = RE_TARGET_MULTIBYTE_P (bufp); /* Check for out-of-range STARTPOS. */ if (startpos < 0 || startpos > total_size) @@ -4377,59 +4553,51 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim, buf_charlen); - buf_ch = RE_TRANSLATE (translate, buf_ch); - if (buf_ch >= 0400 - || fastmap[buf_ch]) + if (fastmap[CHAR_LEADING_CODE (buf_ch)]) break; range -= buf_charlen; d += buf_charlen; } else - { - /* Convert *d to integer to shut up GCC's - whining about comparison that is always - true. */ - int di = *d; - - while (range > lim - && !fastmap[RE_TRANSLATE (translate, di)]) - { - di = *(++d); - range--; - } - } - } - else - do - { - re_char *d_start = d; - while (range > lim && !fastmap[*d]) + while (range > lim) { + register re_wchar_t ch, translated; + + buf_ch = *d; + ch = RE_CHAR_TO_MULTIBYTE (buf_ch); + translated = RE_TRANSLATE (translate, ch); + if (translated != ch + && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0) + buf_ch = ch; + if (fastmap[buf_ch]) + break; d++; range--; } -#ifdef emacs - if (multibyte && range > lim) + } + else + { + if (multibyte) + while (range > lim) { - /* Check that we are at the beginning of a char. */ - int at_boundary; - AT_CHAR_BOUNDARY_P (at_boundary, d, d_start); - if (at_boundary) + int buf_charlen; + + buf_ch = STRING_CHAR_AND_LENGTH (d, range - lim, + buf_charlen); + if (fastmap[CHAR_LEADING_CODE (buf_ch)]) break; - else - { /* We have matched an internal byte of a char - rather than the leading byte, so it's a false - positive: we should keep scanning. */ - d++; range--; - } + range -= buf_charlen; + d += buf_charlen; } - else -#endif - break; - } while (1); - + else + while (range > lim && !fastmap[*d]) + { + d++; + range--; + } + } startpos += irange - range; } else /* Searching backwards. */ @@ -4437,12 +4605,26 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop) int room = (startpos >= size1 ? size2 + size1 - startpos : size1 - startpos); - buf_ch = RE_STRING_CHAR (d, room); - buf_ch = TRANSLATE (buf_ch); - - if (! (buf_ch >= 0400 - || fastmap[buf_ch])) - goto advance; + if (multibyte) + { + buf_ch = STRING_CHAR (d, room); + buf_ch = TRANSLATE (buf_ch); + if (! fastmap[CHAR_LEADING_CODE (buf_ch)]) + goto advance; + } + else + { + register re_wchar_t ch, translated; + + buf_ch = *d; + ch = RE_CHAR_TO_MULTIBYTE (buf_ch); + translated = TRANSLATE (ch); + if (translated != ch + && (ch = RE_CHAR_TO_UNIBYTE (translated)) >= 0) + buf_ch = ch; + if (! fastmap[TRANSLATE (buf_ch)]) + goto advance; + } } } @@ -4721,11 +4903,11 @@ mutually_exclusive_p (bufp, p1, p2) { register re_wchar_t c = (re_opcode_t) *p2 == endline ? '\n' - : RE_STRING_CHAR (p2 + 2, pend - p2 - 2); + : RE_STRING_CHAR (p2 + 2, pend - p2 - 2, multibyte); if ((re_opcode_t) *p1 == exactn) { - if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2)) + if (c != RE_STRING_CHAR (p1 + 2, pend - p1 - 2, multibyte)) { DEBUG_PRINT3 (" '%c' != '%c' => fast loop.\n", c, p1[2]); return 1; @@ -4739,7 +4921,7 @@ mutually_exclusive_p (bufp, p1, p2) /* Test if C is listed in charset (or charset_not) at `p1'. */ - if (SINGLE_BYTE_CHAR_P (c)) + if (! multibyte || IS_REAL_ASCII (c)) { if (c < CHARSET_BITMAP_SIZE (p1) * BYTEWIDTH && p1[2 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) @@ -4782,9 +4964,10 @@ mutually_exclusive_p (bufp, p1, p2) size of bitmap table of P1 is extracted by using macro `CHARSET_BITMAP_SIZE'. - Since we know that all the character listed in - P2 is ASCII, it is enough to test only bitmap - table of P1. */ + In a multibyte case, we know that all the character + listed in P2 is ASCII. In a unibyte case, P1 has only a + bitmap table. So, in both cases, it is enough to test + only the bitmap table of P1. */ if ((re_opcode_t) *p1 == charset) { @@ -4948,6 +5131,7 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) } WEAK_ALIAS (__re_match_2, re_match_2) + /* This is a separate function so that we can force an alloca cleanup afterwards. */ static int @@ -4984,12 +5168,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) re_char *p = bufp->buffer; re_char *pend = p + bufp->used; - /* We use this to map every character in the string. */ + /* We use this to map every character in the string. */ RE_TRANSLATE_TYPE translate = bufp->translate; - /* Nonzero if we have to concern multibyte character. */ + /* Nonzero if BUFP is setup from a multibyte regex. */ const boolean multibyte = RE_MULTIBYTE_P (bufp); + /* Nonzero if STRING1/STRING2 are multibyte. */ + const boolean target_multibyte = RE_TARGET_MULTIBYTE_P (bufp); + /* Failure point stack. Each place that can handle a failure further down the line pushes a failure point on this stack. It consists of regstart, and regend for all registers corresponding to @@ -5341,63 +5528,99 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Remember the start point to rollback upon failure. */ dfail = d; +#ifndef emacs /* This is written out as an if-else so we don't waste time testing `translate' inside the loop. */ if (RE_TRANSLATE_P (translate)) - { - if (multibyte) - do + do + { + PREFETCH (); + if (RE_TRANSLATE (translate, *d) != *p++) { - int pat_charlen, buf_charlen; - unsigned int pat_ch, buf_ch; + d = dfail; + goto fail; + } + d++; + } + while (--mcnt); + else + do + { + PREFETCH (); + if (*d++ != *p++) + { + d = dfail; + goto fail; + } + } + while (--mcnt); +#else /* emacs */ + /* The cost of testing `translate' is comparatively small. */ + if (target_multibyte) + do + { + int pat_charlen, buf_charlen; + int pat_ch, buf_ch; - PREFETCH (); - pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); - buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + PREFETCH (); + if (multibyte) + pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); + else + { + pat_ch = RE_CHAR_TO_MULTIBYTE (*p); + pat_charlen = 1; + } + buf_ch = STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); - if (RE_TRANSLATE (translate, buf_ch) - != pat_ch) - { - d = dfail; - goto fail; - } + if (TRANSLATE (buf_ch) != pat_ch) + { + d = dfail; + goto fail; + } + + p += pat_charlen; + d += buf_charlen; + mcnt -= pat_charlen; + } + while (mcnt > 0); + else + do + { + int pat_charlen, buf_charlen; + int pat_ch, buf_ch; - p += pat_charlen; - d += buf_charlen; - mcnt -= pat_charlen; + PREFETCH (); + if (multibyte) + { + pat_ch = STRING_CHAR_AND_LENGTH (p, pend - p, pat_charlen); + if (CHAR_BYTE8_P (pat_ch)) + pat_ch = CHAR_TO_BYTE8 (pat_ch); + else + pat_ch = RE_CHAR_TO_UNIBYTE (pat_ch); } - while (mcnt > 0); - else - do + else { - /* Avoid compiler whining about comparison being - always true. */ - int di; - - PREFETCH (); - di = *d; - if (RE_TRANSLATE (translate, di) != *p++) - { - d = dfail; - goto fail; - } - d++; + pat_ch = *p; + pat_charlen = 1; } - while (--mcnt); - } - else - { - do - { - PREFETCH (); - if (*d++ != *p++) - { - d = dfail; - goto fail; - } - } - while (--mcnt); - } + buf_ch = RE_CHAR_TO_MULTIBYTE (*d); + if (! CHAR_BYTE8_P (buf_ch)) + { + buf_ch = TRANSLATE (buf_ch); + buf_ch = RE_CHAR_TO_UNIBYTE (buf_ch); + if (buf_ch < 0) + buf_ch = *d; + } + if (buf_ch != pat_ch) + { + d = dfail; + goto fail; + } + p += pat_charlen; + d++; + } + while (--mcnt); +#endif break; @@ -5410,7 +5633,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT1 ("EXECUTING anychar.\n"); PREFETCH (); - buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen); + buf_ch = RE_STRING_CHAR_AND_LENGTH (d, dend - d, buf_charlen, + target_multibyte); buf_ch = TRANSLATE (buf_ch); if ((!(bufp->syntax & RE_DOT_NEWLINE) @@ -5454,10 +5678,30 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } PREFETCH (); - c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); - c = TRANSLATE (c); /* The character to match. */ + c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len, target_multibyte); + if (target_multibyte) + { + int c1; - if (SINGLE_BYTE_CHAR_P (c)) + c = TRANSLATE (c); + c1 = RE_CHAR_TO_UNIBYTE (c); + if (c1 >= 0) + c = c1; + } + else + { + int c1 = RE_CHAR_TO_MULTIBYTE (c); + + if (! CHAR_BYTE8_P (c1)) + { + c1 = TRANSLATE (c1); + c1 = RE_CHAR_TO_UNIBYTE (c1); + if (c1 >= 0) + c = c1; + } + } + + if (c < (1 << BYTEWIDTH)) { /* Lookup bitmap. */ /* Cast to `unsigned' instead of `unsigned char' in case the bit list is a full 32 bytes long. */ @@ -5597,7 +5841,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Compare that many; failure if mismatch, else move past them. */ if (RE_TRANSLATE_P (translate) - ? bcmp_translate (d, d2, mcnt, translate, multibyte) + ? bcmp_translate (d, d2, mcnt, translate, target_multibyte) : memcmp (d, d2, mcnt)) { d = dfail; @@ -5620,7 +5864,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } else { - unsigned char c; + unsigned c; GET_CHAR_BEFORE_2 (c, d, string1, end1, string2, end2); if (c == '\n') break; @@ -5887,6 +6131,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ re_wchar_t c1, c2; int s1, s2; + int dummy; #ifdef emacs int offset = PTR_TO_OFFSET (d - 1); int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5898,7 +6143,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); #endif PREFETCH_NOLIMIT (); - c2 = RE_STRING_CHAR (d, dend - d); + GET_CHAR_AFTER (c2, d, dummy); s2 = SYNTAX (c2); if (/* Case 2: Only one of S1 and S2 is Sword. */ @@ -5927,13 +6172,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ re_wchar_t c1, c2; int s1, s2; + int dummy; #ifdef emacs int offset = PTR_TO_OFFSET (d); int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); UPDATE_SYNTAX_TABLE (charpos); #endif PREFETCH (); - c2 = RE_STRING_CHAR (d, dend - d); + GET_CHAR_AFTER (c2, d, dummy); s2 = SYNTAX (c2); /* Case 2: S2 is not Sword. */ @@ -5971,6 +6217,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) is the character at D, and S2 is the syntax of C2. */ re_wchar_t c1, c2; int s1, s2; + int dummy; #ifdef emacs int offset = PTR_TO_OFFSET (d) - 1; int charpos = SYNTAX_TABLE_BYTE_TO_CHAR (offset); @@ -5987,9 +6234,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (!AT_STRINGS_END (d)) { PREFETCH_NOLIMIT (); - c2 = RE_STRING_CHAR (d, dend - d); + GET_CHAR_AFTER (c2, d, dummy); #ifdef emacs - UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); + UPDATE_SYNTAX_TABLE_FORWARD (charpos); #endif s2 = SYNTAX (c2); @@ -6021,7 +6268,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) UPDATE_SYNTAX_TABLE (charpos); #endif PREFETCH (); - c2 = RE_STRING_CHAR (d, dend - d); + c2 = RE_STRING_CHAR (d, dend - d, target_multibyte); s2 = SYNTAX (c2); /* Case 2: S2 is neither Sword nor Ssymbol. */ @@ -6074,7 +6321,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (!AT_STRINGS_END (d)) { PREFETCH_NOLIMIT (); - c2 = RE_STRING_CHAR (d, dend - d); + c2 = RE_STRING_CHAR (d, dend - d, target_multibyte); #ifdef emacs UPDATE_SYNTAX_TABLE_FORWARD (charpos + 1); #endif @@ -6104,8 +6351,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) int len; re_wchar_t c; - c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); - + GET_CHAR_AFTER (c, d, len); if ((SYNTAX (c) != (enum syntaxcode) mcnt) ^ not) goto fail; d += len; @@ -6141,8 +6387,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) int len; re_wchar_t c; - c = RE_STRING_CHAR_AND_LENGTH (d, dend - d, len); - + GET_CHAR_AFTER (c, d, len); if ((!CHAR_HAS_CATEGORY (c, mcnt)) ^ not) goto fail; d += len; @@ -6217,11 +6462,11 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) bytes; nonzero otherwise. */ static int -bcmp_translate (s1, s2, len, translate, multibyte) +bcmp_translate (s1, s2, len, translate, target_multibyte) re_char *s1, *s2; register int len; RE_TRANSLATE_TYPE translate; - const int multibyte; + const int target_multibyte; { register re_char *p1 = s1, *p2 = s2; re_char *p1_end = s1 + len; @@ -6234,8 +6479,8 @@ bcmp_translate (s1, s2, len, translate, multibyte) int p1_charlen, p2_charlen; re_wchar_t p1_ch, p2_ch; - p1_ch = RE_STRING_CHAR_AND_LENGTH (p1, p1_end - p1, p1_charlen); - p2_ch = RE_STRING_CHAR_AND_LENGTH (p2, p2_end - p2, p2_charlen); + GET_CHAR_AFTER (p1_ch, p1, p1_charlen); + GET_CHAR_AFTER (p2_ch, p2, p2_charlen); if (RE_TRANSLATE (translate, p1_ch) != RE_TRANSLATE (translate, p2_ch)) |