diff options
Diffstat (limited to 'regex.c')
-rw-r--r-- | regex.c | 2692 |
1 files changed, 2303 insertions, 389 deletions
@@ -2,7 +2,7 @@ version 0.12. (Implements POSIX draft P1003.2/D11.2, except for some of the internationalization features.) - Copyright (C) 1993, 94, 95, 96, 97, 98, 99, 2000 Free Software Foundation, Inc. + Copyright (C) 1993, 94, 95, 96, 97, 98, 99, 2000, 2002 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -38,6 +38,12 @@ # endif /* GCC. */ #endif /* Not PARAMS. */ +#ifndef INSIDE_RECURSION + +#if !defined __GNUC__ +# define __alignof__(type) sizeof(type) +#endif + #if defined STDC_HEADERS && !defined emacs # include <stddef.h> #else @@ -45,7 +51,21 @@ # include <sys/types.h> #endif +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC +/* We can handle multibyte string. */ +# define MBS_SUPPORT +# if !WIDE_CHAR_SUPPORT +# define WIDE_CHAR_SUPPORT 1 +# endif +#else #define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) +#endif + +/* Activate this if definition of alignof() for non-GNU C is good. +#if !defined __GNUC__ +# undef MBS_SUPPORT +#endif +*/ /* For platform which support the ISO C amendement 1 functionality we support user defined character classes. */ @@ -80,9 +100,14 @@ #define btowc __btowc #endif +/* Don't use gettext if ENABLE_NLS is not defined */ +#ifdef ENABLE_NLS /* This is for other GNU distributions with internationalized messages. */ -#if HAVE_LIBINTL_H || defined _LIBC -# include <libintl.h> +# if HAVE_LIBINTL_H || defined _LIBC +# include <libintl.h> +# else +# define gettext(msgid) (msgid) +# endif #else # define gettext(msgid) (msgid) #endif @@ -165,6 +190,14 @@ char *realloc (); #endif /* not emacs */ +# if defined _LIBC || HAVE_LIMITS_H +# include <limits.h> +# endif + +# ifndef MB_LEN_MAX +# define MB_LEN_MAX 1 +# endif + /* Get the interface, including the syntax bits. */ #include <regex.h> @@ -374,12 +407,46 @@ typedef char boolean; #define false 0 #define true 1 -static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp, +static reg_errcode_t byte_regex_compile _RE_ARGS ((const char *pattern, size_t size, + reg_syntax_t syntax, + struct re_pattern_buffer *bufp)); + +static int byte_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp, const char *string1, int size1, const char *string2, int size2, int pos, struct re_registers *regs, int stop)); +static int byte_re_search_2 PARAMS ((struct re_pattern_buffer *bufp, + const char *string1, int size1, + const char *string2, int size2, + int startpos, int range, + struct re_registers *regs, int stop)); +static int byte_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp)); + +#ifdef MBS_SUPPORT +static reg_errcode_t wcs_regex_compile _RE_ARGS ((const char *pattern, size_t size, + reg_syntax_t syntax, + struct re_pattern_buffer *bufp)); + + +static int wcs_re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp, + const char *cstring1, int csize1, + const char *cstring2, int csize2, + int pos, + struct re_registers *regs, + int stop, + wchar_t *string1, int size1, + wchar_t *string2, int size2, + int *mbs_offset1, int *mbs_offset2)); +static int wcs_re_search_2 PARAMS ((struct re_pattern_buffer *bufp, + const char *string1, int size1, + const char *string2, int size2, + int startpos, int range, + struct re_registers *regs, int stop)); +static int wcs_re_compile_fastmap PARAMS ((struct re_pattern_buffer *bufp)); +#endif + /* These are the command codes that appear in compiled regular expressions. Some opcodes are followed by argument bytes. A @@ -396,6 +463,11 @@ typedef enum /* Followed by one byte giving n, then by n literal bytes. */ exactn, +# ifdef MBS_SUPPORT + /* Same as exactn, but contains binary data. */ + exactn_bin, +# endif + /* Matches any (more or less) character. */ anychar, @@ -405,6 +477,13 @@ typedef enum are ordered low-bit-first. A character is in the set if its bit is 1. A character too large to have a bit in the map is automatically not in the set. */ + /* ifdef MBS_SUPPORT, following element is length of character + classes, length of collating symbols, length of equivalence + classes, length of character ranges, and length of characters. + Next, character class element, collating symbols elements, + equivalence class elements, range elements, and character + elements follow. + See regex_compile function. */ charset, /* Same parameters as charset, but match any character that is @@ -454,6 +533,7 @@ typedef enum /* Followed by two-byte relative address of place to resume at in case of failure. */ + /* ifdef WCHAR, the size of address is 1. */ on_failure_jump, /* Like on_failure_jump, but pushes a placeholder instead of the @@ -462,6 +542,7 @@ typedef enum /* Throw away latest failure point and then jump to following two-byte relative address. */ + /* ifdef WCHAR, the size of address is 1. */ pop_failure_jump, /* Change to pop_failure_jump if know won't have to backtrack to @@ -471,6 +552,7 @@ typedef enum sure that there is no use backtracking out of repetitions already matched, then we change it to a pop_failure_jump. Followed by two-byte address. */ + /* ifdef WCHAR, the size of address is 1. */ maybe_pop_jump, /* Jump to following two-byte address, and push a dummy failure @@ -478,6 +560,7 @@ typedef enum is made to use it for a failure. A `+' construct makes this before the first repeat. Also used as an intermediary kind of jump when compiling an alternative. */ + /* ifdef WCHAR, the size of address is 1. */ dummy_failure_jump, /* Push a dummy failure point and continue. Used at the end of @@ -486,15 +569,18 @@ typedef enum /* Followed by two-byte relative address and two-byte number n. After matching N times, jump to the address upon failure. */ + /* ifdef WCHAR, the size of address is 1. */ succeed_n, /* Followed by two-byte relative address, and two-byte number n. Jump to the address N times, then fail. */ + /* ifdef WCHAR, the size of address is 1. */ jump_n, /* Set the following two-byte relative address to the subsequent two-byte number. The address *includes* the two bytes of number. */ + /* ifdef WCHAR, the size of address is 1. */ set_number_at, wordchar, /* Matches any word-constituent character. */ @@ -519,51 +605,131 @@ typedef enum notsyntaxspec #endif /* emacs */ } re_opcode_t; +#endif /* not INSIDE_RECURSION */ + +#ifdef BYTE +# define CHAR_T char +# define UCHAR_T unsigned char +# define COMPILED_BUFFER_VAR bufp->buffer +/* Offset address size in compiled pattern. */ +# define OFFSET_ADDRESS_SIZE 2 +#ifdef HAVE_STRINGIZE /* if it supports #, it probably supports ## too */ +# define PREFIX(name) byte_##name +#else +# define PREFIX(name) byte_/**/name +#endif +# define ARG_PREFIX(name) name +# define PUT_CHAR(c) putchar (c) +#else +#ifdef WCHAR +# define CHAR_T wchar_t +# define CHAR_T_SIGN (1 << (sizeof(CHAR_T) * 8 - 1)) +# if defined _AIX +# define WCHAR_T_NEED_SIGNEXTEND 1 +# endif /* _AIX */ +# define UCHAR_T wchar_t +# define COMPILED_BUFFER_VAR wc_buffer +/* Offset address size in compiled pattern. */ +# define OFFSET_ADDRESS_SIZE 1 +# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1) +#ifdef HAVE_STRINGIZE /* if it supports #, it probably supports ## too */ +# define PREFIX(name) wcs_##name +#else +# define PREFIX(name) wcs_/**/name +#endif +# define ARG_PREFIX(name) c##name +/* Should we use wide stream?? */ +# define PUT_CHAR(c) printf ("%C", c); +# define TRUE 1 +# define FALSE 0 +#else +# ifdef MBS_SUPPORT +# define WCHAR +# define INSIDE_RECURSION +# include "regex.c" +# undef INSIDE_RECURSION +# endif /* MBS_SUPPORT */ +# define BYTE +# define INSIDE_RECURSION +# include "regex.c" +# undef INSIDE_RECURSION +#endif /* WCHAR */ +#endif /* BYTE */ + +#ifdef INSIDE_RECURSION /* Common operations on the compiled pattern. */ /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ +/* ifdef WCHAR, we store NUMBER in 1 element. */ -#define STORE_NUMBER(destination, number) \ +# ifdef WCHAR +# define STORE_NUMBER(destination, number) \ + do { \ + *(destination) = (UCHAR_T)(number); \ + } while (0) +# else /* BYTE */ +# define STORE_NUMBER(destination, number) \ do { \ (destination)[0] = (number) & 0377; \ (destination)[1] = (number) >> 8; \ } while (0) +# endif /* WCHAR */ /* Same as STORE_NUMBER, except increment DESTINATION to the byte after where the number is stored. Therefore, DESTINATION must be an lvalue. */ +/* ifdef WCHAR, we store NUMBER in 1 element. */ #define STORE_NUMBER_AND_INCR(destination, number) \ do { \ STORE_NUMBER (destination, number); \ - (destination) += 2; \ + (destination) += OFFSET_ADDRESS_SIZE; \ } while (0) /* Put into DESTINATION a number stored in two contiguous bytes starting at SOURCE. */ +/* ifdef WCHAR, we store NUMBER in 1 element. */ -#define EXTRACT_NUMBER(destination, source) \ +# ifdef WCHAR +# ifdef WCHAR_T_NEED_SIGNEXTEND +# define EXTRACT_NUMBER(destination, source) \ + (destination) = (*(source) ^ CHAR_T_SIGN) - CHAR_T_SIGN; +# else +# define EXTRACT_NUMBER(destination, source) \ + (destination) = *(source) +# endif /* WCHAR_T_NEED_SIGNEXTEND */ +# else /* BYTE */ +# define EXTRACT_NUMBER(destination, source) \ do { \ (destination) = *(source) & 0377; \ (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ } while (0) +# endif #ifdef DEBUG -static void extract_number _RE_ARGS ((int *dest, unsigned char *source)); +static void PREFIX(extract_number) _RE_ARGS ((int *dest, UCHAR_T *source)); static void -extract_number (dest, source) +PREFIX(extract_number) (dest, source) int *dest; - unsigned char *source; + UCHAR_T *source; { +# ifdef WCHAR +# ifdef WCHAR_T_NEED_SIGNEXTEND + *dest = (*source ^ CHAR_T_SIGN) - CHAR_T_SIGN; +# else + *dest = *source; +# endif /* WCHAR_T_NEED_SIGNEXTEND */ +# else /* BYTE */ int temp = SIGN_EXTEND_CHAR (*(source + 1)); *dest = *source & 0377; *dest += temp << 8; +# endif } # ifndef EXTRACT_MACROS /* To debug the macros. */ # undef EXTRACT_NUMBER -# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) +# define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src) # endif /* not EXTRACT_MACROS */ #endif /* DEBUG */ @@ -574,25 +740,25 @@ extract_number (dest, source) #define EXTRACT_NUMBER_AND_INCR(destination, source) \ do { \ EXTRACT_NUMBER (destination, source); \ - (source) += 2; \ + (source) += OFFSET_ADDRESS_SIZE; \ } while (0) #ifdef DEBUG -static void extract_number_and_incr _RE_ARGS ((int *destination, - unsigned char **source)); +static void PREFIX(extract_number_and_incr) _RE_ARGS ((int *destination, + UCHAR_T **source)); static void -extract_number_and_incr (destination, source) +PREFIX(extract_number_and_incr) (destination, source) int *destination; - unsigned char **source; + UCHAR_T **source; { - extract_number (destination, *source); - *source += 2; + PREFIX(extract_number) (destination, *source); + *source += OFFSET_ADDRESS_SIZE; } # ifndef EXTRACT_MACROS # undef EXTRACT_NUMBER_AND_INCR # define EXTRACT_NUMBER_AND_INCR(dest, src) \ - extract_number_and_incr (&dest, &src) + PREFIX(extract_number_and_incr) (&dest, &src) # endif /* not EXTRACT_MACROS */ #endif /* DEBUG */ @@ -605,6 +771,8 @@ extract_number_and_incr (destination, source) #ifdef DEBUG +#ifndef DEFINED_ONCE + /* We use standard I/O for debugging. */ # include <stdio.h> @@ -618,14 +786,17 @@ static int debug; # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) +#endif /* not DEFINED_ONCE */ + # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ - if (debug) print_partial_compiled_pattern (s, e) + if (debug) PREFIX(print_partial_compiled_pattern) (s, e) # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ - if (debug) print_double_string (w, s1, sz1, s2, sz2) + if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2) /* Print the fastmap in human-readable form. */ +#ifndef DEFINED_ONCE void print_fastmap (fastmap) char *fastmap; @@ -653,20 +824,21 @@ print_fastmap (fastmap) } putchar ('\n'); } +#endif /* not DEFINED_ONCE */ /* Print a compiled pattern string in human-readable form, starting at the START pointer into it and ending just before the pointer END. */ void -print_partial_compiled_pattern (start, end) - unsigned char *start; - unsigned char *end; +PREFIX(print_partial_compiled_pattern) (start, end) + UCHAR_T *start; + UCHAR_T *end; { int mcnt, mcnt2; - unsigned char *p1; - unsigned char *p = start; - unsigned char *pend = end; + UCHAR_T *p1; + UCHAR_T *p = start; + UCHAR_T *pend = end; if (start == NULL) { @@ -691,10 +863,22 @@ print_partial_compiled_pattern (start, end) do { putchar ('/'); - putchar (*p++); + PUT_CHAR (*p++); + } + while (--mcnt); + break; + +# ifdef MBS_SUPPORT + case exactn_bin: + mcnt = *p++; + printf ("/exactn_bin/%d", mcnt); + do + { + printf("/%lx", (long int) *p++); } while (--mcnt); break; +# endif /* MBS_SUPPORT */ case start_memory: mcnt = *p++; @@ -717,6 +901,45 @@ print_partial_compiled_pattern (start, end) case charset: case charset_not: { +# ifdef WCHAR + int i, length; + wchar_t *workp = p; + printf ("/charset [%s", + (re_opcode_t) *(workp - 1) == charset_not ? "^" : ""); + p += 5; + length = *workp++; /* the length of char_classes */ + for (i=0 ; i<length ; i++) + printf("[:%lx:]", (long int) *p++); + length = *workp++; /* the length of collating_symbol */ + for (i=0 ; i<length ;) + { + printf("[."); + while(*p != 0) + PUT_CHAR((i++,*p++)); + i++,p++; + printf(".]"); + } + length = *workp++; /* the length of equivalence_class */ + for (i=0 ; i<length ;) + { + printf("[="); + while(*p != 0) + PUT_CHAR((i++,*p++)); + i++,p++; + printf("=]"); + } + length = *workp++; /* the length of char_range */ + for (i=0 ; i<length ; i++) + { + wchar_t range_start = *p++; + wchar_t range_end = *p++; + printf("%C-%C", range_start, range_end); + } + length = *workp++; /* the length of char */ + for (i=0 ; i<length ; i++) + printf("%C", *p++); + putchar (']'); +# else register int c, last = -100; register int in_range = 0; @@ -754,6 +977,7 @@ print_partial_compiled_pattern (start, end) putchar (']'); p += 1 + *p; +# endif /* WCHAR */ } break; @@ -766,17 +990,17 @@ print_partial_compiled_pattern (start, end) break; case on_failure_jump: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); printf ("/on_failure_jump to %d", p + mcnt - start); break; case on_failure_keep_string_jump: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); break; case dummy_failure_jump: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); printf ("/dummy_failure_jump to %d", p + mcnt - start); break; @@ -785,43 +1009,43 @@ print_partial_compiled_pattern (start, end) break; case maybe_pop_jump: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); printf ("/maybe_pop_jump to %d", p + mcnt - start); break; case pop_failure_jump: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); printf ("/pop_failure_jump to %d", p + mcnt - start); break; case jump_past_alt: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); printf ("/jump_past_alt to %d", p + mcnt - start); break; case jump: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); printf ("/jump to %d", p + mcnt - start); break; case succeed_n: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); p1 = p + mcnt; - extract_number_and_incr (&mcnt2, &p); + PREFIX(extract_number_and_incr) (&mcnt2, &p); printf ("/succeed_n to %d, %d times", p1 - start, mcnt2); break; case jump_n: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); p1 = p + mcnt; - extract_number_and_incr (&mcnt2, &p); + PREFIX(extract_number_and_incr) (&mcnt2, &p); printf ("/jump_n to %d, %d times", p1 - start, mcnt2); break; case set_number_at: - extract_number_and_incr (&mcnt, &p); + PREFIX(extract_number_and_incr) (&mcnt, &p); p1 = p + mcnt; - extract_number_and_incr (&mcnt2, &p); + PREFIX(extract_number_and_incr) (&mcnt2, &p); printf ("/set_number_at location %d to %d", p1 - start, mcnt2); break; @@ -894,12 +1118,13 @@ print_partial_compiled_pattern (start, end) void -print_compiled_pattern (bufp) +PREFIX(print_compiled_pattern) (bufp) struct re_pattern_buffer *bufp; { - unsigned char *buffer = bufp->buffer; + UCHAR_T *buffer = (UCHAR_T*) bufp->buffer; - print_partial_compiled_pattern (buffer, buffer + bufp->used); + PREFIX(print_partial_compiled_pattern) (buffer, buffer + + bufp->used / sizeof(UCHAR_T)); printf ("%ld bytes used/%ld bytes allocated.\n", bufp->used, bufp->allocated); @@ -922,10 +1147,10 @@ print_compiled_pattern (bufp) void -print_double_string (where, string1, size1, string2, size2) - const char *where; - const char *string1; - const char *string2; +PREFIX(print_double_string) (where, string1, size1, string2, size2) + const CHAR_T *where; + const CHAR_T *string1; + const CHAR_T *string2; int size1; int size2; { @@ -938,25 +1163,28 @@ print_double_string (where, string1, size1, string2, size2) if (FIRST_STRING_P (where)) { for (this_char = where - string1; this_char < size1; this_char++) - putchar (string1[this_char]); + PUT_CHAR (string1[this_char]); where = string2; } for (this_char = where - string2; this_char < size2; this_char++) - putchar (string2[this_char]); + PUT_CHAR (string2[this_char]); } } +# ifndef DEFINED_ONCE void printchar (c) int c; { putc (c, stderr); } +#endif #else /* not DEBUG */ +#ifndef DEFINED_ONCE # undef assert # define assert(e) @@ -965,11 +1193,88 @@ printchar (c) # define DEBUG_PRINT2(x1, x2) # define DEBUG_PRINT3(x1, x2, x3) # define DEBUG_PRINT4(x1, x2, x3, x4) +#endif /* not DEFINED_ONCE */ # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) #endif /* not DEBUG */ +# ifdef WCHAR +/* This convert a multibyte string to a wide character string. + And write their correspondances to offset_buffer(see below) + and write whether each wchar_t is binary data to is_binary. + This assume invalid multibyte sequences as binary data. + We assume offset_buffer and is_binary is already allocated + enough space. */ + +static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src, + size_t len, int *offset_buffer, + char *is_binary); +static size_t +convert_mbs_to_wcs (dest, src, len, offset_buffer, is_binary) + CHAR_T *dest; + const unsigned char* src; + size_t len; /* the length of multibyte string. */ + + /* It hold correspondances between src(char string) and + dest(wchar_t string) for optimization. + e.g. src = "xxxyzz" + dest = {'X', 'Y', 'Z'} + (each "xxx", "y" and "zz" represent one multibyte character + corresponding to 'X', 'Y' and 'Z'.) + offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")} + = {0, 3, 4, 6} + */ + int *offset_buffer; + char *is_binary; +{ + wchar_t *pdest = dest; + const unsigned char *psrc = src; + size_t wc_count = 0; + + mbstate_t mbs; + int i, consumed; + size_t mb_remain = len; + size_t mb_count = 0; + + /* Initialize the conversion state. */ + memset (&mbs, 0, sizeof (mbstate_t)); + + offset_buffer[0] = 0; + for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed, + psrc += consumed) + { + consumed = mbrtowc (pdest, psrc, mb_remain, &mbs); + + if (consumed <= 0) + /* failed to convert. maybe src contains binary data. + So we consume 1 byte manualy. */ + { + *pdest = *psrc; + consumed = 1; + is_binary[wc_count] = TRUE; + } + else + is_binary[wc_count] = FALSE; + /* In sjis encoding, we use yen sign as escape character in + place of reverse solidus. So we convert 0x5c(yen sign in + sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse + solidus in UCS2). */ + if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5) + *pdest = (wchar_t) *psrc; + + offset_buffer[wc_count + 1] = mb_count += consumed; + } + + /* Fill remain of the buffer with sentinel. */ + for (i = wc_count + 1 ; i <= len ; i++) + offset_buffer[i] = mb_count + 1; + + return wc_count; +} +# endif /* WCHAR */ + +#else /* not INSIDE_RECURSION */ /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can also be assigned to arbitrarily: each pattern buffer stores its own syntax, so it can be changed between regex compilations. */ @@ -1033,6 +1338,9 @@ static const char *re_error_msgid[] = gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */ }; +#endif /* INSIDE_RECURSION */ + +#ifndef DEFINED_ONCE /* Avoiding alloca during matching, to placate r_alloc. */ /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the @@ -1069,8 +1377,10 @@ static const char *re_error_msgid[] = #if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs # undef MATCH_MAY_ALLOCATE #endif +#endif /* not DEFINED_ONCE */ +#ifdef INSIDE_RECURSION /* Failure stack declarations and macros; both re_compile_fastmap and re_match_2 use a failure stack. These have to be macros because of REGEX_ALLOCATE_STACK. */ @@ -1090,59 +1400,93 @@ static const char *re_error_msgid[] = #ifdef INT_IS_16BIT -# if defined MATCH_MAY_ALLOCATE +#ifndef DEFINED_ONCE + +# ifdef REGEX_MALLOC +/* + * Set RE_MAX_FAILURES to the largest reasonable value, + * to avoid spurious "not enough memory" messages. + * The '80' is computed as follows: + * regex.c's MAX_FAILURE_ITEMS is at most 20. + * regex.c computes 2 * MAX_FAILURE_ITEMS, giving us 40. + * re_max_failures is signed, not unsigned, for another factor of 2, + * giving us 80. + */ +long int re_max_failures = ((unsigned) -1) / 80; +# else /* ! not REGEX_MALLOC */ +# if defined MATCH_MAY_ALLOCATE /* 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. */ long int re_max_failures = 4000; -# else +# else long int re_max_failures = 2000; -# endif +# endif +# endif /* ! not REGEX_MALLOC */ +#endif /* ! DEFINED_ONCE */ -union fail_stack_elt +union PREFIX(fail_stack_elt) { - unsigned char *pointer; + UCHAR_T *pointer; long int integer; }; -typedef union fail_stack_elt fail_stack_elt_t; +typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t); typedef struct { - fail_stack_elt_t *stack; + PREFIX(fail_stack_elt_t) *stack; unsigned long int size; unsigned long int avail; /* Offset of next open position. */ -} fail_stack_type; +} PREFIX(fail_stack_type); #else /* not INT_IS_16BIT */ -# if defined MATCH_MAY_ALLOCATE +#ifndef DEFINED_ONCE + +# ifdef REGEX_MALLOC +/* + * Set RE_MAX_FAILURES to the largest reasonable value, + * to avoid spurious "not enough memory" messages. + * The '80' is computed as follows: + * regex.c's MAX_FAILURE_ITEMS is at most 20. + * regex.c computes 2 * MAX_FAILURE_ITEMS, giving us 40. + * re_max_failures is signed, not unsigned, for another factor of 2, + * giving us 80. + */ +long int re_max_failures = ((unsigned) -1) / 80; +# else /* ! not REGEX_MALLOC */ +# if defined MATCH_MAY_ALLOCATE /* 4400 was enough to cause a crash on Alpha OSF/1, whose default stack limit is 2mb. */ int re_max_failures = 20000; -# else +# else int re_max_failures = 2000; -# endif +# endif +# endif /* ! not REGEX_MALLOC */ +#endif -union fail_stack_elt +union PREFIX(fail_stack_elt) { - unsigned char *pointer; + UCHAR_T *pointer; int integer; }; -typedef union fail_stack_elt fail_stack_elt_t; +typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t); typedef struct { - fail_stack_elt_t *stack; + PREFIX(fail_stack_elt_t) *stack; unsigned size; unsigned avail; /* Offset of next open position. */ -} fail_stack_type; +} PREFIX(fail_stack_type); #endif /* INT_IS_16BIT */ +#ifndef DEFINED_ONCE #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) +#endif /* Define macros to initialize and free the failure stack. @@ -1151,8 +1495,8 @@ typedef struct #ifdef MATCH_MAY_ALLOCATE # define INIT_FAIL_STACK() \ do { \ - fail_stack.stack = (fail_stack_elt_t *) \ - REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \ + REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \ \ if (fail_stack.stack == NULL) \ return -2; \ @@ -1182,10 +1526,10 @@ typedef struct #define DOUBLE_FAIL_STACK(fail_stack) \ ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ ? 0 \ - : ((fail_stack).stack = (fail_stack_elt_t *) \ + : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \ REGEX_REALLOCATE_STACK ((fail_stack).stack, \ - (fail_stack).size * sizeof (fail_stack_elt_t), \ - ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \ + ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\ \ (fail_stack).stack == NULL \ ? 0 \ @@ -1207,7 +1551,7 @@ typedef struct Assumes the variable `fail_stack'. Probably should only be called from within `PUSH_FAILURE_POINT'. */ #define PUSH_FAILURE_POINTER(item) \ - fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) + fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item) /* This pushes an integer-valued item onto the failure stack. Assumes the variable `fail_stack'. Probably should only @@ -1325,6 +1669,7 @@ typedef struct DEBUG_PUSH (failure_id); \ } while (0) +#ifndef DEFINED_ONCE /* This is the number of items that are pushed and popped on the stack for each register. */ #define NUM_REG_ITEMS 3 @@ -1351,6 +1696,7 @@ typedef struct /* How many items can still be added to the stack without overflowing it. */ #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) +#endif /* not DEFINED_ONCE */ /* Pops what PUSH_FAIL_STACK pushes. @@ -1369,7 +1715,7 @@ typedef struct { \ DEBUG_STATEMENT (unsigned failure_id;) \ active_reg_t this_reg; \ - const unsigned char *string_temp; \ + const UCHAR_T *string_temp; \ \ assert (!FAIL_STACK_EMPTY ()); \ \ @@ -1388,13 +1734,13 @@ typedef struct saved NULL, thus retaining our current position in the string. */ \ string_temp = POP_FAILURE_POINTER (); \ if (string_temp != NULL) \ - str = (const char *) string_temp; \ + str = (const CHAR_T *) string_temp; \ \ DEBUG_PRINT2 (" Popping string %p: `", str); \ DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ DEBUG_PRINT1 ("'\n"); \ \ - pat = (unsigned char *) POP_FAILURE_POINTER (); \ + pat = (UCHAR_T *) POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ \ @@ -1414,10 +1760,10 @@ typedef struct DEBUG_PRINT2 (" info: %p\n", \ reg_info[this_reg].word.pointer); \ \ - regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ \ - regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ + regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \ DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ } \ else \ @@ -1453,7 +1799,7 @@ typedef struct typedef union { - fail_stack_elt_t word; + PREFIX(fail_stack_elt_t) word; struct { /* This field is one if this group can match the empty string, @@ -1464,8 +1810,9 @@ typedef union unsigned matched_something : 1; unsigned ever_matched_something : 1; } bits; -} register_info_type; +} PREFIX(register_info_type); +#ifndef DEFINED_ONCE #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) #define IS_ACTIVE(R) ((R).bits.is_active) #define MATCHED_SOMETHING(R) ((R).bits.matched_something) @@ -1491,51 +1838,71 @@ typedef union } \ } \ while (0) +#endif /* not DEFINED_ONCE */ /* Registers are set to a sentinel when they haven't yet matched. */ -static char reg_unset_dummy; -#define REG_UNSET_VALUE (®_unset_dummy) +static CHAR_T PREFIX(reg_unset_dummy); +#define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy)) #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) /* Subroutine declarations and macros for regex_compile. */ - -static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size, - reg_syntax_t syntax, - struct re_pattern_buffer *bufp)); -static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg)); -static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, +static void PREFIX(store_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, int arg)); +static void PREFIX(store_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, int arg1, int arg2)); -static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, - int arg, unsigned char *end)); -static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, - int arg1, int arg2, unsigned char *end)); -static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p, +static void PREFIX(insert_op1) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, + int arg, UCHAR_T *end)); +static void PREFIX(insert_op2) _RE_ARGS ((re_opcode_t op, UCHAR_T *loc, + int arg1, int arg2, UCHAR_T *end)); +static boolean PREFIX(at_begline_loc_p) _RE_ARGS ((const CHAR_T *pattern, const CHAR_T *p, reg_syntax_t syntax)); -static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend, +static boolean PREFIX(at_endline_loc_p) _RE_ARGS ((const CHAR_T *p, const CHAR_T *pend, reg_syntax_t syntax)); -static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, - const char *pend, - char *translate, - reg_syntax_t syntax, - unsigned char *b)); +# ifdef WCHAR +static reg_errcode_t wcs_compile_range _RE_ARGS ((CHAR_T range_start, + const CHAR_T **p_ptr, + const CHAR_T *pend, + char *translate, + reg_syntax_t syntax, + UCHAR_T *b, + CHAR_T *char_set)); +static void insert_space _RE_ARGS ((int num, CHAR_T *loc, CHAR_T *end)); +# else /* BYTE */ +static reg_errcode_t byte_compile_range _RE_ARGS ((unsigned int range_start, + const char **p_ptr, + const char *pend, + char *translate, + reg_syntax_t syntax, + unsigned char *b)); +# endif /* WCHAR */ /* Fetch the next character in the uncompiled pattern---translating it if necessary. Also cast from a signed character in the constant string passed to us by the user to an unsigned char that we can use as an array index (in, e.g., `translate'). */ +/* ifdef WCHAR, we translate only if character <= 0xff, + because it is impossible to allocate 4GB array for some encodings + which have 4 byte character_set like UCS4. */ #ifndef PATFETCH -# define PATFETCH(c) \ +# ifdef WCHAR +# define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (UCHAR_T) *p++; \ + if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c]; \ + } while (0) +# else /* BYTE */ +# define PATFETCH(c) \ do {if (p == pend) return REG_EEND; \ c = (unsigned char) *p++; \ if (translate) c = (unsigned char) translate[c]; \ } while (0) +# endif /* WCHAR */ #endif /* Fetch the next character in the uncompiled pattern, with no translation. */ #define PATFETCH_RAW(c) \ do {if (p == pend) return REG_EEND; \ - c = (unsigned char) *p++; \ + c = (UCHAR_T) *p++; \ } while (0) /* Go backwards one character in the pattern. */ @@ -1546,27 +1913,43 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, cast the subscript to translate because some data is declared as `char *', to avoid warnings when a string constant is passed. But when we use a character as a subscript we must make it unsigned. */ +/* ifdef WCHAR, we translate only if character <= 0xff, + because it is impossible to allocate 4GB array for some encodings + which have 4 byte character_set like UCS4. */ #ifndef TRANSLATE -# define TRANSLATE(d) \ +# ifdef WCHAR +# define TRANSLATE(d) \ + ((translate && ((UCHAR_T) (d)) <= 0xff) \ + ? (char) translate[(unsigned char) (d)] : (d)) +# else /* BYTE */ +# define TRANSLATE(d) \ (translate ? (char) translate[(unsigned char) (d)] : (d)) +# endif /* WCHAR */ #endif /* Macros for outputting the compiled pattern into `buffer'. */ /* If the buffer isn't allocated when it comes in, use this. */ -#define INIT_BUF_SIZE 32 +# define INIT_BUF_SIZE (32 * sizeof(UCHAR_T)) /* Make sure we have at least N more bytes of space in buffer. */ -#define GET_BUFFER_SPACE(n) \ +# ifdef WCHAR +# define GET_BUFFER_SPACE(n) \ + while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \ + + (n)*sizeof(CHAR_T)) > bufp->allocated) \ + EXTEND_BUFFER () +# else /* BYTE */ +# define GET_BUFFER_SPACE(n) \ while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ EXTEND_BUFFER () +# endif /* WCHAR */ /* Make sure we have one more byte of buffer space and then add C to it. */ #define BUF_PUSH(c) \ do { \ GET_BUFFER_SPACE (1); \ - *b++ = (unsigned char) (c); \ + *b++ = (UCHAR_T) (c); \ } while (0) @@ -1574,8 +1957,8 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, #define BUF_PUSH_2(c1, c2) \ do { \ GET_BUFFER_SPACE (2); \ - *b++ = (unsigned char) (c1); \ - *b++ = (unsigned char) (c2); \ + *b++ = (UCHAR_T) (c1); \ + *b++ = (UCHAR_T) (c2); \ } while (0) @@ -1583,28 +1966,28 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, #define BUF_PUSH_3(c1, c2, c3) \ do { \ GET_BUFFER_SPACE (3); \ - *b++ = (unsigned char) (c1); \ - *b++ = (unsigned char) (c2); \ - *b++ = (unsigned char) (c3); \ + *b++ = (UCHAR_T) (c1); \ + *b++ = (UCHAR_T) (c2); \ + *b++ = (UCHAR_T) (c3); \ } while (0) /* Store a jump with opcode OP at LOC to location TO. We store a relative address offset by the three bytes the jump itself occupies. */ #define STORE_JUMP(op, loc, to) \ - store_op1 (op, loc, (int) ((to) - (loc) - 3)) + PREFIX(store_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE))) /* Likewise, for a two-argument jump. */ #define STORE_JUMP2(op, loc, to, arg) \ - store_op2 (op, loc, (int) ((to) - (loc) - 3), arg) + PREFIX(store_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg) /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ #define INSERT_JUMP(op, loc, to) \ - insert_op1 (op, loc, (int) ((to) - (loc) - 3), b) + PREFIX(insert_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b) /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ #define INSERT_JUMP2(op, loc, to, arg) \ - insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b) + PREFIX(insert_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg, b) /* This is not an arbitrary limit: the arguments which represent offsets @@ -1615,6 +1998,7 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up reallocating to 0 bytes. Such thing is not going to work too well. You have been warned!! */ +#ifndef DEFINED_ONCE #if defined _MSC_VER && !defined WIN32 /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. The REALLOC define eliminates a flurry of conversion warnings, @@ -1650,34 +2034,72 @@ static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, # define MOVE_BUFFER_POINTER(P) (P) += incr # define ELSE_EXTEND_BUFFER_HIGH_BOUND #endif -#define EXTEND_BUFFER() \ - do { \ - unsigned char *old_buffer = bufp->buffer; \ - if (bufp->allocated == MAX_BUF_SIZE) \ +#endif /* not DEFINED_ONCE */ +# ifdef WCHAR +# define EXTEND_BUFFER() \ + do { \ + UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \ + int wchar_count; \ + if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \ return REG_ESIZE; \ bufp->allocated <<= 1; \ if (bufp->allocated > MAX_BUF_SIZE) \ - bufp->allocated = MAX_BUF_SIZE; \ - bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\ - if (bufp->buffer == NULL) \ + bufp->allocated = MAX_BUF_SIZE; \ + /* How many characters the new buffer can have? */ \ + wchar_count = bufp->allocated / sizeof(UCHAR_T); \ + if (wchar_count == 0) wchar_count = 1; \ + /* Truncate the buffer to CHAR_T align. */ \ + bufp->allocated = wchar_count * sizeof(UCHAR_T); \ + RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \ + bufp->buffer = (char*)COMPILED_BUFFER_VAR; \ + if (COMPILED_BUFFER_VAR == NULL) \ return REG_ESPACE; \ /* If the buffer moved, move all the pointers into it. */ \ - if (old_buffer != bufp->buffer) \ + if (old_buffer != COMPILED_BUFFER_VAR) \ { \ - int incr = bufp->buffer - old_buffer; \ + int incr = COMPILED_BUFFER_VAR - old_buffer; \ + MOVE_BUFFER_POINTER (b); \ + MOVE_BUFFER_POINTER (begalt); \ + if (fixup_alt_jump) \ + MOVE_BUFFER_POINTER (fixup_alt_jump); \ + if (laststart) \ + MOVE_BUFFER_POINTER (laststart); \ + if (pending_exact) \ + MOVE_BUFFER_POINTER (pending_exact); \ + } \ + ELSE_EXTEND_BUFFER_HIGH_BOUND \ + } while (0) +# else /* BYTE */ +# define EXTEND_BUFFER() \ + do { \ + UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, \ + bufp->allocated); \ + if (COMPILED_BUFFER_VAR == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != COMPILED_BUFFER_VAR) \ + { \ + int incr = COMPILED_BUFFER_VAR - old_buffer; \ MOVE_BUFFER_POINTER (b); \ MOVE_BUFFER_POINTER (begalt); \ - if (fixup_alt_jump) \ + if (fixup_alt_jump) \ MOVE_BUFFER_POINTER (fixup_alt_jump); \ - if (laststart) \ + if (laststart) \ MOVE_BUFFER_POINTER (laststart); \ - if (pending_exact) \ + if (pending_exact) \ MOVE_BUFFER_POINTER (pending_exact); \ } \ ELSE_EXTEND_BUFFER_HIGH_BOUND \ } while (0) +# endif /* WCHAR */ - +#ifndef DEFINED_ONCE /* Since we have one byte reserved for the register number argument to {start,stop}_memory, the maximum number of groups we can report things about is what fits in that byte. */ @@ -1721,12 +2143,15 @@ typedef struct /* The next available element. */ #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) +#endif /* not DEFINED_ONCE */ /* Set the bit for character C in a list. */ +#ifndef DEFINED_ONCE #define SET_LIST_BIT(c) \ (b[((unsigned char) (c)) / BYTEWIDTH] \ |= 1 << (((unsigned char) c) % BYTEWIDTH)) +#endif /* DEFINED_ONCE */ /* Get the next unsigned number in the uncompiled pattern. */ #define GET_UNSIGNED_NUMBER(num) \ @@ -1745,6 +2170,7 @@ typedef struct } \ } +#ifndef DEFINED_ONCE #if defined _LIBC || WIDE_CHAR_SUPPORT /* The GNU C library provides support for user-defined character classes and the functions from ISO C amendement 1. */ @@ -1772,6 +2198,7 @@ typedef struct || STREQ (string, "punct") || STREQ (string, "graph") \ || STREQ (string, "cntrl") || STREQ (string, "blank")) #endif +#endif /* DEFINED_ONCE */ #ifndef MATCH_MAY_ALLOCATE @@ -1782,25 +2209,28 @@ typedef struct The register vectors, we adjust in size each time we compile a regexp, according to the number of registers it needs. */ -static fail_stack_type fail_stack; +static PREFIX(fail_stack_type) fail_stack; /* Size with which the following vectors are currently allocated. That is so we can make them bigger as needed, but never make them smaller. */ +#ifdef DEFINED_ONCE static int regs_allocated_size; static const char ** regstart, ** regend; static const char ** old_regstart, ** old_regend; static const char **best_regstart, **best_regend; -static register_info_type *reg_info; static const char **reg_dummy; -static register_info_type *reg_info_dummy; +#endif /* DEFINED_ONCE */ + +static PREFIX(register_info_type) *PREFIX(reg_info); +static PREFIX(register_info_type) *PREFIX(reg_info_dummy); /* Make the register vectors big enough for NUM_REGS registers, but don't make them smaller. */ -static -regex_grow_registers (num_regs) +static void +PREFIX(regex_grow_registers) (num_regs) int num_regs; { if (num_regs > regs_allocated_size) @@ -1811,9 +2241,9 @@ regex_grow_registers (num_regs) RETALLOC_IF (old_regend, num_regs, const char *); RETALLOC_IF (best_regstart, num_regs, const char *); RETALLOC_IF (best_regend, num_regs, const char *); - RETALLOC_IF (reg_info, num_regs, register_info_type); + RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type)); RETALLOC_IF (reg_dummy, num_regs, const char *); - RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); + RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type)); regs_allocated_size = num_regs; } @@ -1821,9 +2251,11 @@ regex_grow_registers (num_regs) #endif /* not MATCH_MAY_ALLOCATE */ +#ifndef DEFINED_ONCE static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type compile_stack, regnum_t regnum)); +#endif /* not DEFINED_ONCE */ /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. Returns one of error codes defined in `regex.h', or zero for success. @@ -1844,33 +2276,55 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type examined nor set. */ /* Return, freeing storage we allocated. */ -#define FREE_STACK_RETURN(value) \ +# ifdef WCHAR +# define FREE_STACK_RETURN(value) \ + return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value) +# else +# define FREE_STACK_RETURN(value) \ return (free (compile_stack.stack), value) +# endif /* WCHAR */ static reg_errcode_t -regex_compile (pattern, size, syntax, bufp) - const char *pattern; - size_t size; +PREFIX(regex_compile) (ARG_PREFIX(pattern), ARG_PREFIX(size), syntax, bufp) + const char *ARG_PREFIX(pattern); + size_t ARG_PREFIX(size); reg_syntax_t syntax; struct re_pattern_buffer *bufp; { /* We fetch characters from PATTERN here. Even though PATTERN is `char *' (i.e., signed), we declare these variables as unsigned, so they can be reliably used as array indices. */ - register unsigned char c, c1; + register UCHAR_T c, c1; + +#ifdef WCHAR + /* A temporary space to keep wchar_t pattern and compiled pattern. */ + CHAR_T *pattern, *COMPILED_BUFFER_VAR; + size_t size; + /* offset buffer for optimization. See convert_mbs_to_wc. */ + int *mbs_offset = NULL; + /* It hold whether each wchar_t is binary data or not. */ + char *is_binary = NULL; + /* A flag whether exactn is handling binary data or not. */ + char is_exactn_bin = FALSE; +#endif /* WCHAR */ /* A random temporary spot in PATTERN. */ - const char *p1; + const CHAR_T *p1; /* Points to the end of the buffer, where we should append. */ - register unsigned char *b; + register UCHAR_T *b; /* Keeps track of unclosed groups. */ compile_stack_type compile_stack; /* Points to the current (ending) position in the pattern. */ - const char *p = pattern; - const char *pend = pattern + size; +#ifdef WCHAR + const CHAR_T *p; + const CHAR_T *pend; +#else /* BYTE */ + const CHAR_T *p = pattern; + const CHAR_T *pend = pattern + size; +#endif /* WCHAR */ /* How to translate the characters in the pattern. */ RE_TRANSLATE_TYPE translate = bufp->translate; @@ -1879,30 +2333,54 @@ regex_compile (pattern, size, syntax, bufp) command. This makes it possible to tell if a new exact-match character can be added to that command or if the character requires a new `exactn' command. */ - unsigned char *pending_exact = 0; + UCHAR_T *pending_exact = 0; /* Address of start of the most recently finished expression. This tells, e.g., postfix * where to find the start of its operand. Reset at the beginning of groups and alternatives. */ - unsigned char *laststart = 0; + UCHAR_T *laststart = 0; /* Address of beginning of regexp, or inside of last group. */ - unsigned char *begalt; + UCHAR_T *begalt; /* Place in the uncompiled pattern (i.e., the {) to which to go back if the interval is invalid. */ - const char *beg_interval; + const CHAR_T *beg_interval; /* Address of the place where a forward jump should go to the end of the containing expression. Each alternative of an `or' -- except the last -- ends with a forward jump of this sort. */ - unsigned char *fixup_alt_jump = 0; + UCHAR_T *fixup_alt_jump = 0; /* Counts open-groups as they are encountered. Remembered for the matching close-group on the compile stack, so the same register number is put in the stop_memory as the start_memory. */ regnum_t regnum = 0; +#ifdef WCHAR + /* Initialize the wchar_t PATTERN and offset_buffer. */ + p = pend = pattern = TALLOC(csize + 1, CHAR_T); + mbs_offset = TALLOC(csize + 1, int); + is_binary = TALLOC(csize + 1, char); + if (pattern == NULL || mbs_offset == NULL || is_binary == NULL) + { + free(pattern); + free(mbs_offset); + free(is_binary); + return REG_ESPACE; + } + pattern[csize] = L'\0'; /* sentinel */ + size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary); + pend = p + size; + if (size < 0) + { + free(pattern); + free(mbs_offset); + free(is_binary); + return REG_BADPAT; + } +#endif + #ifdef DEBUG DEBUG_PRINT1 ("\nCompiling pattern: "); if (debug) @@ -1910,7 +2388,7 @@ regex_compile (pattern, size, syntax, bufp) unsigned debug_count; for (debug_count = 0; debug_count < size; debug_count++) - putchar (pattern[debug_count]); + PUT_CHAR (pattern[debug_count]); putchar ('\n'); } #endif /* DEBUG */ @@ -1918,7 +2396,15 @@ regex_compile (pattern, size, syntax, bufp) /* Initialize the compile stack. */ compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); if (compile_stack.stack == NULL) - return REG_ESPACE; + { +#ifdef WCHAR + free(pattern); + free(mbs_offset); + free(is_binary); +#endif + return REG_ESPACE; + } + compile_stack.size = INIT_COMPILE_STACK_SIZE; compile_stack.avail = 0; @@ -1947,18 +2433,34 @@ regex_compile (pattern, size, syntax, bufp) { /* If zero allocated, but buffer is non-null, try to realloc enough space. This loses if buffer's address is bogus, but that is the user's responsibility. */ - RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); +#ifdef WCHAR + /* Free bufp->buffer and allocate an array for wchar_t pattern + buffer. */ + free(bufp->buffer); + COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T), + UCHAR_T); +#else + RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T); +#endif /* WCHAR */ } else { /* Caller did not allocate a buffer. Do it for them. */ - bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); + COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T), + UCHAR_T); } - if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); + if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE); +#ifdef WCHAR + bufp->buffer = (char*)COMPILED_BUFFER_VAR; +#endif /* WCHAR */ bufp->allocated = INIT_BUF_SIZE; } +#ifdef WCHAR + else + COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer; +#endif - begalt = b = bufp->buffer; + begalt = b = COMPILED_BUFFER_VAR; /* Loop through the uncompiled pattern until we're at the end. */ while (p != pend) @@ -1974,7 +2476,7 @@ regex_compile (pattern, size, syntax, bufp) /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's come before. */ - || at_begline_loc_p (pattern, p, syntax)) + || PREFIX(at_begline_loc_p) (pattern, p, syntax)) BUF_PUSH (begline); else goto normal_char; @@ -1989,7 +2491,7 @@ regex_compile (pattern, size, syntax, bufp) /* If context independent, it's an operator. */ || syntax & RE_CONTEXT_INDEP_ANCHORS /* Otherwise, depends on what's next. */ - || at_endline_loc_p (p, pend, syntax)) + || PREFIX(at_endline_loc_p) (p, pend, syntax)) BUF_PUSH (endline); else goto normal_char; @@ -2083,7 +2585,7 @@ regex_compile (pattern, size, syntax, bufp) assert (p - 1 > pattern); /* Allocate the space for the jump. */ - GET_BUFFER_SPACE (3); + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); /* We know we are not at the first character of the pattern, because laststart was nonzero. And we've already @@ -2100,20 +2602,23 @@ regex_compile (pattern, size, syntax, bufp) } else /* Anything else. */ - STORE_JUMP (maybe_pop_jump, b, laststart - 3); + STORE_JUMP (maybe_pop_jump, b, laststart - + (1 + OFFSET_ADDRESS_SIZE)); /* We've added more stuff to the buffer. */ - b += 3; + b += 1 + OFFSET_ADDRESS_SIZE; } /* On failure, jump from laststart to b + 3, which will be the end of the buffer after this jump is inserted. */ - GET_BUFFER_SPACE (3); + /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of + 'b + 3'. */ + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump : on_failure_jump, - laststart, b + 3); + laststart, b + 1 + OFFSET_ADDRESS_SIZE); pending_exact = 0; - b += 3; + b += 1 + OFFSET_ADDRESS_SIZE; if (!zero_times_ok) { @@ -2122,9 +2627,10 @@ regex_compile (pattern, size, syntax, bufp) `on_failure_jump' instruction of the loop. This effects a skip over that instruction the first time we hit that loop. */ - GET_BUFFER_SPACE (3); - INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); - b += 3; + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + INSERT_JUMP (dummy_failure_jump, laststart, laststart + + 2 + 2 * OFFSET_ADDRESS_SIZE); + b += 1 + OFFSET_ADDRESS_SIZE; } } break; @@ -2139,9 +2645,343 @@ regex_compile (pattern, size, syntax, bufp) case '[': { boolean had_char_class = false; - +#ifdef WCHAR + CHAR_T range_start = 0xffffffff; +#else + unsigned int range_start = 0xffffffff; +#endif if (p == pend) FREE_STACK_RETURN (REG_EBRACK); +#ifdef WCHAR + /* We assume a charset(_not) structure as a wchar_t array. + charset[0] = (re_opcode_t) charset(_not) + charset[1] = l (= length of char_classes) + charset[2] = m (= length of collating_symbols) + charset[3] = n (= length of equivalence_classes) + charset[4] = o (= length of char_ranges) + charset[5] = p (= length of chars) + + charset[6] = char_class (wctype_t) + charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t) + ... + charset[l+5] = char_class (wctype_t) + + charset[l+6] = collating_symbol (wchar_t) + ... + charset[l+m+5] = collating_symbol (wchar_t) + ifdef _LIBC we use the index if + _NL_COLLATE_SYMB_EXTRAMB instead of + wchar_t string. + + charset[l+m+6] = equivalence_classes (wchar_t) + ... + charset[l+m+n+5] = equivalence_classes (wchar_t) + ifdef _LIBC we use the index in + _NL_COLLATE_WEIGHT instead of + wchar_t string. + + charset[l+m+n+6] = range_start + charset[l+m+n+7] = range_end + ... + charset[l+m+n+2o+4] = range_start + charset[l+m+n+2o+5] = range_end + ifdef _LIBC we use the value looked up + in _NL_COLLATE_COLLSEQ instead of + wchar_t character. + + charset[l+m+n+2o+6] = char + ... + charset[l+m+n+2o+p+5] = char + + */ + + /* We need at least 6 spaces: the opcode, the length of + char_classes, the length of collating_symbols, the length of + equivalence_classes, the length of char_ranges, the length of + chars. */ + GET_BUFFER_SPACE (6); + + /* Save b as laststart. And We use laststart as the pointer + to the first element of the charset here. + In other words, laststart[i] indicates charset[i]. */ + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH (*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Push the length of char_classes, the length of + collating_symbols, the length of equivalence_classes, the + length of char_ranges and the length of chars. */ + BUF_PUSH_3 (0, 0, 0); + BUF_PUSH_2 (0, 0); + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-6] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + { + BUF_PUSH('\n'); + laststart[5]++; /* Update the length of characters */ + } + + /* Read in characters and ranges, setting map bits. */ + for (;;) + { + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + PATFETCH (c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') + { + if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); + + PATFETCH (c1); + BUF_PUSH(c1); + laststart[5]++; /* Update the length of chars */ + range_start = c1; + continue; + } + + /* Could be the end of the bracket expression. If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + FREE_STACK_RETURN (REG_ERANGE); + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' + && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') + { + reg_errcode_t ret; + /* Allocate the space for range_start and range_end. */ + GET_BUFFER_SPACE (2); + /* Update the pointer to indicate end of buffer. */ + b += 2; + ret = wcs_compile_range (range_start, &p, pend, translate, + syntax, b, laststart); + if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; + } + else if (p[0] == '-' && p[1] != ']') + { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH (c1); + /* Allocate the space for range_start and range_end. */ + GET_BUFFER_SPACE (2); + /* Update the pointer to indicate end of buffer. */ + b += 2; + ret = wcs_compile_range (c, &p, pend, translate, syntax, b, + laststart); + if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; + } + + /* See if we're at the beginning of a possible character + class. */ + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == ':' && *p == ']') || p == pend) + break; + if (c1 < CHAR_CLASS_MAX_LENGTH) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and `:]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but store them as character). */ + if (c == ':' && *p == ']') + { + wctype_t wt; + unsigned long int alignedp; + + /* Query the character class as wctype_t. */ + wt = IS_CHAR_CLASS (str); + if (wt == 0) + FREE_STACK_RETURN (REG_ECTYPE); + + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); + + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + /* Allocate the space for character class. */ + GET_BUFFER_SPACE(CHAR_CLASS_SIZE); + /* Update the pointer to indicate end of buffer. */ + b += CHAR_CLASS_SIZE; + /* Move data which follow character classes + not to violate the data. */ + insert_space(CHAR_CLASS_SIZE, + laststart + 6 + laststart[1], + b - 1); + alignedp = ((unsigned long int)(laststart + 6 + laststart[1]) + + __alignof__(wctype_t) - 1) + & ~(unsigned long int)(__alignof__(wctype_t) - 1); + /* Store the character class. */ + *((wctype_t*)alignedp) = wt; + /* Update length of char_classes */ + laststart[1] += CHAR_CLASS_SIZE; + + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + BUF_PUSH ('['); + BUF_PUSH (':'); + laststart[5] += 2; /* Update the length of characters */ + range_start = ':'; + had_char_class = false; + } + } + else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '=' + || *p == '.')) + { + CHAR_T str[128]; /* Should be large enough. */ + CHAR_T delim = *p; /* '=' or '.' */ + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[=' or '[[.'. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == delim && *p == ']') || p == pend) + break; + if (c1 < sizeof (str) - 1) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + if (c == delim && *p == ']' && str[0] != '\0') + { + unsigned int i, offset; + /* If we have no collation data we use the default + collation in which each character is in a class + by itself. It also means that ASCII is the + character set and therefore we cannot have character + with more than one byte in the multibyte + representation. */ + + /* If not defined _LIBC, we push the name and + `\0' for the sake of matching performance. */ + int datasize = c1 + 1; + + if (c1 != 1) + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class (or collating symbol). */ + PATFETCH (c); + + /* Allocate the space for the equivalence class + (or collating symbol) (and '\0' if needed). */ + GET_BUFFER_SPACE(datasize); + /* Update the pointer to indicate end of buffer. */ + b += datasize; + + if (delim == '=') + { /* equivalence class */ + /* Calculate the offset of char_ranges, + which is next to equivalence_classes. */ + offset = laststart[1] + laststart[2] + + laststart[3] +6; + /* Insert space. */ + insert_space(datasize, laststart + offset, b - 1); + + /* Write the equivalence_class and \0. */ + for (i = 0 ; i < datasize ; i++) + laststart[offset + i] = str[i]; + + /* Update the length of equivalence_classes. */ + laststart[3] += datasize; + had_char_class = true; + } + else /* delim == '.' */ + { /* collating symbol */ + /* Calculate the offset of the equivalence_classes, + which is next to collating_symbols. */ + offset = laststart[1] + laststart[2] + 6; + /* Insert space and write the collationg_symbol + and \0. */ + insert_space(datasize, laststart + offset, b-1); + for (i = 0 ; i < datasize ; i++) + laststart[offset + i] = str[i]; + + /* In re_match_2_internal if range_start < -1, we + assume -range_start is the offset of the + collating symbol which is specified as + the character of the range start. So we assign + -(laststart[1] + laststart[2] + 6) to + range_start. */ + range_start = -(laststart[1] + laststart[2] + 6); + /* Update the length of collating_symbol. */ + laststart[2] += datasize; + had_char_class = false; + } + } + else + { + c1++; + while (c1--) + PATUNFETCH; + BUF_PUSH ('['); + BUF_PUSH (delim); + laststart[5] += 2; /* Update the length of characters */ + range_start = delim; + had_char_class = false; + } + } + else + { + had_char_class = false; + BUF_PUSH(c); + laststart[5]++; /* Update the length of characters */ + range_start = c; + } + } + +#else /* BYTE */ /* Ensure that we have enough space to push a charset: the opcode, the length count, and the bitset; 34 bytes in all. */ GET_BUFFER_SPACE (34); @@ -2182,6 +3022,7 @@ regex_compile (pattern, size, syntax, bufp) PATFETCH (c1); SET_LIST_BIT (c1); + range_start = c1; continue; } @@ -2206,8 +3047,10 @@ regex_compile (pattern, size, syntax, bufp) && *p != ']') { reg_errcode_t ret - = compile_range (&p, pend, translate, syntax, b); + = byte_compile_range (range_start, &p, pend, translate, + syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; } else if (p[0] == '-' && p[1] != ']') @@ -2217,8 +3060,9 @@ regex_compile (pattern, size, syntax, bufp) /* Move past the `-'. */ PATFETCH (c1); - ret = compile_range (&p, pend, translate, syntax, b); + ret = byte_compile_range (c, &p, pend, translate, syntax, b); if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); + range_start = 0xffffffff; } /* See if we're at the beginning of a possible character @@ -2341,6 +3185,119 @@ regex_compile (pattern, size, syntax, bufp) PATUNFETCH; SET_LIST_BIT ('['); SET_LIST_BIT (':'); + range_start = ':'; + had_char_class = false; + } + } + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') + { + unsigned char str[MB_LEN_MAX + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[='. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == '=' && *p == ']') || p == pend) + break; + if (c1 < MB_LEN_MAX) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + if (c == '=' && *p == ']' && str[0] != '\0') + { + /* If we have no collation data we use the default + collation in which each character is in a class + by itself. It also means that ASCII is the + character set and therefore we cannot have character + with more than one byte in the multibyte + representation. */ + { + if (c1 != 1) + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class. */ + PATFETCH (c); + + /* Set the bit for the character. */ + SET_LIST_BIT (str[0]); + } + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT ('='); + range_start = '='; + had_char_class = false; + } + } + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') + { + unsigned char str[128]; /* Should be large enough. */ + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[.'. */ + if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + + for (;;) + { + PATFETCH (c); + if ((c == '.' && *p == ']') || p == pend) + break; + if (c1 < sizeof (str)) + str[c1++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[c1] = '\0'; + + if (c == '.' && *p == ']' && str[0] != '\0') + { + /* If we have no collation data we use the default + collation in which each character is the name + for its own class which contains only the one + character. It also means that ASCII is the + character set and therefore we cannot have character + with more than one byte in the multibyte + representation. */ + { + if (c1 != 1) + FREE_STACK_RETURN (REG_ECOLLATE); + + /* Throw away the ] at the end of the equivalence + class. */ + PATFETCH (c); + + /* Set the bit for the character. */ + SET_LIST_BIT (str[0]); + range_start = ((const unsigned char *) str)[0]; + } + had_char_class = false; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT ('.'); + range_start = '.'; had_char_class = false; } } @@ -2348,6 +3305,7 @@ regex_compile (pattern, size, syntax, bufp) { had_char_class = false; SET_LIST_BIT (c); + range_start = c; } } @@ -2356,6 +3314,7 @@ regex_compile (pattern, size, syntax, bufp) while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) b[-1]--; b += b[-1]; +#endif /* WCHAR */ } break; @@ -2426,10 +3385,10 @@ regex_compile (pattern, size, syntax, bufp) group. They are all relative offsets, so that if the whole pattern moves because of realloc, they will still be valid. */ - COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR; COMPILE_STACK_TOP.fixup_alt_jump - = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; - COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR; COMPILE_STACK_TOP.regnum = regnum; /* We will eventually replace the 0 with the number of @@ -2438,7 +3397,8 @@ regex_compile (pattern, size, syntax, bufp) represent in the compiled pattern. */ if (regnum <= MAX_REGNUM) { - COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; + COMPILE_STACK_TOP.inner_group_offset = b + - COMPILED_BUFFER_VAR + 2; BUF_PUSH_3 (start_memory, regnum, 0); } @@ -2497,12 +3457,12 @@ regex_compile (pattern, size, syntax, bufp) regnum_t this_group_regnum; compile_stack.avail--; - begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset; fixup_alt_jump = COMPILE_STACK_TOP.fixup_alt_jump - ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 + ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1 : 0; - laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset; this_group_regnum = COMPILE_STACK_TOP.regnum; /* If we've reached MAX_REGNUM groups, then this open won't actually generate any code, so we'll have to @@ -2513,8 +3473,8 @@ regex_compile (pattern, size, syntax, bufp) groups were inside this one. */ if (this_group_regnum <= MAX_REGNUM) { - unsigned char *inner_group_loc - = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; + UCHAR_T *inner_group_loc + = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset; *inner_group_loc = regnum - this_group_regnum; BUF_PUSH_3 (stop_memory, this_group_regnum, @@ -2533,10 +3493,11 @@ regex_compile (pattern, size, syntax, bufp) /* Insert before the previous alternative a jump which jumps to this alternative if the former fails. */ - GET_BUFFER_SPACE (3); - INSERT_JUMP (on_failure_jump, begalt, b + 6); + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + INSERT_JUMP (on_failure_jump, begalt, + b + 2 + 2 * OFFSET_ADDRESS_SIZE); pending_exact = 0; - b += 3; + b += 1 + OFFSET_ADDRESS_SIZE; /* The alternative before this one has a jump after it which gets executed if it gets matched. Adjust that @@ -2561,8 +3522,8 @@ regex_compile (pattern, size, syntax, bufp) to be filled in later either by next alternative or when know we're at the end of a series of alternatives. */ fixup_alt_jump = b; - GET_BUFFER_SPACE (3); - b += 3; + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + b += 1 + OFFSET_ADDRESS_SIZE; laststart = 0; begalt = b; @@ -2646,11 +3607,13 @@ regex_compile (pattern, size, syntax, bufp) /* If the upper bound is zero, don't want to succeed at all; jump from `laststart' to `b + 3', which will be the end of the buffer after we insert the jump. */ + /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' + instead of 'b + 3'. */ if (upper_bound == 0) { - GET_BUFFER_SPACE (3); - INSERT_JUMP (jump, laststart, b + 3); - b += 3; + GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); + INSERT_JUMP (jump, laststart, b + 1 + OFFSET_ADDRESS_SIZE); + b += 1 + OFFSET_ADDRESS_SIZE; } /* Otherwise, we have a nontrivial interval. When @@ -2665,7 +3628,8 @@ regex_compile (pattern, size, syntax, bufp) else { /* If the upper bound is > 1, we need to insert more at the end of the loop. */ - unsigned nbytes = 10 + (upper_bound > 1) * 10; + unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE + + (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE); GET_BUFFER_SPACE (nbytes); @@ -2675,16 +3639,21 @@ regex_compile (pattern, size, syntax, bufp) because `re_compile_fastmap' needs to know. Jump to the `jump_n' we might insert below. */ INSERT_JUMP2 (succeed_n, laststart, - b + 5 + (upper_bound > 1) * 5, + b + 1 + 2 * OFFSET_ADDRESS_SIZE + + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE), lower_bound); - b += 5; + b += 1 + 2 * OFFSET_ADDRESS_SIZE; /* Code to initialize the lower bound. Insert before the `succeed_n'. The `5' is the last two bytes of this `set_number_at', plus 3 bytes of the following `succeed_n'. */ - insert_op2 (set_number_at, laststart, 5, lower_bound, b); - b += 5; + /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE' + is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE' + of the following `succeed_n'. */ + PREFIX(insert_op2) (set_number_at, laststart, + 1 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b); + b += 1 + 2 * OFFSET_ADDRESS_SIZE; if (upper_bound > 1) { /* More than one repetition is allowed, so @@ -2694,9 +3663,10 @@ regex_compile (pattern, size, syntax, bufp) When we've reached this during matching, we'll have matched the interval once, so jump back only `upper_bound - 1' times. */ - STORE_JUMP2 (jump_n, b, laststart + 5, + STORE_JUMP2 (jump_n, b, laststart + + 1 + 2 * OFFSET_ADDRESS_SIZE, upper_bound - 1); - b += 5; + b += 1 + 2 * OFFSET_ADDRESS_SIZE; /* The location we want to set is the second parameter of the `jump_n'; that is `b-2' as @@ -2712,9 +3682,10 @@ regex_compile (pattern, size, syntax, bufp) We insert this at the beginning of the loop so that if we fail during matching, we'll reinitialize the bounds. */ - insert_op2 (set_number_at, laststart, b - laststart, - upper_bound - 1, b); - b += 5; + PREFIX(insert_op2) (set_number_at, laststart, + b - laststart, + upper_bound - 1, b); + b += 1 + 2 * OFFSET_ADDRESS_SIZE; } } pending_exact = 0; @@ -2853,6 +3824,11 @@ regex_compile (pattern, size, syntax, bufp) normal_char: /* If no exactn currently being built. */ if (!pending_exact +#ifdef WCHAR + /* If last exactn handle binary(or character) and + new exactn handle character(or binary). */ + || is_exactn_bin != is_binary[p - 1 - pattern] +#endif /* WCHAR */ /* If last exactn not at current position. */ || pending_exact + *pending_exact + 1 != b @@ -2874,7 +3850,16 @@ regex_compile (pattern, size, syntax, bufp) laststart = b; +#ifdef WCHAR + /* Is this exactn binary data or character? */ + is_exactn_bin = is_binary[p - 1 - pattern]; + if (is_exactn_bin) + BUF_PUSH_2 (exactn_bin, 0); + else + BUF_PUSH_2 (exactn, 0); +#else BUF_PUSH_2 (exactn, 0); +#endif /* WCHAR */ pending_exact = b - 1; } @@ -2898,16 +3883,25 @@ regex_compile (pattern, size, syntax, bufp) if (syntax & RE_NO_POSIX_BACKTRACKING) BUF_PUSH (succeed); +#ifdef WCHAR + free (pattern); + free (mbs_offset); + free (is_binary); +#endif free (compile_stack.stack); /* We have succeeded; set the length of the buffer. */ +#ifdef WCHAR + bufp->used = (unsigned long int) b - (unsigned long int) COMPILED_BUFFER_VAR; +#else bufp->used = b - bufp->buffer; +#endif #ifdef DEBUG if (debug) { DEBUG_PRINT1 ("\nCompiled pattern: \n"); - print_compiled_pattern (bufp); + PREFIX(print_compiled_pattern) (bufp); } #endif /* DEBUG */ @@ -2928,27 +3922,27 @@ regex_compile (pattern, size, syntax, bufp) # ifdef emacs if (! fail_stack.stack) fail_stack.stack - = (fail_stack_elt_t *) xmalloc (fail_stack.size - * sizeof (fail_stack_elt_t)); + = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size + * sizeof (PREFIX(fail_stack_elt_t))); else fail_stack.stack - = (fail_stack_elt_t *) xrealloc (fail_stack.stack, + = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack, (fail_stack.size - * sizeof (fail_stack_elt_t))); + * sizeof (PREFIX(fail_stack_elt_t)))); # else /* not emacs */ if (! fail_stack.stack) fail_stack.stack - = (fail_stack_elt_t *) malloc (fail_stack.size - * sizeof (fail_stack_elt_t)); + = (PREFIX(fail_stack_elt_t) *) malloc (fail_stack.size + * sizeof (PREFIX(fail_stack_elt_t))); else fail_stack.stack - = (fail_stack_elt_t *) realloc (fail_stack.stack, + = (PREFIX(fail_stack_elt_t) *) realloc (fail_stack.stack, (fail_stack.size - * sizeof (fail_stack_elt_t))); + * sizeof (PREFIX(fail_stack_elt_t)))); # endif /* not emacs */ } - regex_grow_registers (num_regs); + PREFIX(regex_grow_registers (num_regs)); } #endif /* not MATCH_MAY_ALLOCATE */ @@ -2958,68 +3952,72 @@ regex_compile (pattern, size, syntax, bufp) /* Subroutines for `regex_compile'. */ /* Store OP at LOC followed by two-byte integer parameter ARG. */ +/* ifdef WCHAR, integer parameter is 1 wchar_t. */ static void -store_op1 (op, loc, arg) +PREFIX(store_op1) (op, loc, arg) re_opcode_t op; - unsigned char *loc; + UCHAR_T *loc; int arg; { - *loc = (unsigned char) op; + *loc = (UCHAR_T) op; STORE_NUMBER (loc + 1, arg); } /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ +/* ifdef WCHAR, integer parameter is 1 wchar_t. */ static void -store_op2 (op, loc, arg1, arg2) +PREFIX(store_op2) (op, loc, arg1, arg2) re_opcode_t op; - unsigned char *loc; + UCHAR_T *loc; int arg1, arg2; { - *loc = (unsigned char) op; + *loc = (UCHAR_T) op; STORE_NUMBER (loc + 1, arg1); - STORE_NUMBER (loc + 3, arg2); + STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2); } /* Copy the bytes from LOC to END to open up three bytes of space at LOC for OP followed by two-byte integer parameter ARG. */ +/* ifdef WCHAR, integer parameter is 1 wchar_t. */ static void -insert_op1 (op, loc, arg, end) +PREFIX(insert_op1) (op, loc, arg, end) re_opcode_t op; - unsigned char *loc; + UCHAR_T *loc; int arg; - unsigned char *end; + UCHAR_T *end; { - register unsigned char *pfrom = end; - register unsigned char *pto = end + 3; + register UCHAR_T *pfrom = end; + register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE; while (pfrom != loc) *--pto = *--pfrom; - store_op1 (op, loc, arg); + PREFIX(store_op1) (op, loc, arg); } /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ +/* ifdef WCHAR, integer parameter is 1 wchar_t. */ static void -insert_op2 (op, loc, arg1, arg2, end) +PREFIX(insert_op2) (op, loc, arg1, arg2, end) re_opcode_t op; - unsigned char *loc; + UCHAR_T *loc; int arg1, arg2; - unsigned char *end; + UCHAR_T *end; { - register unsigned char *pfrom = end; - register unsigned char *pto = end + 5; + register UCHAR_T *pfrom = end; + register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE; while (pfrom != loc) *--pto = *--pfrom; - store_op2 (op, loc, arg1, arg2); + PREFIX(store_op2) (op, loc, arg1, arg2); } @@ -3028,11 +4026,11 @@ insert_op2 (op, loc, arg1, arg2, end) least one character before the ^. */ static boolean -at_begline_loc_p (pattern, p, syntax) - const char *pattern, *p; +PREFIX(at_begline_loc_p) (pattern, p, syntax) + const CHAR_T *pattern, *p; reg_syntax_t syntax; { - const char *prev = p - 2; + const CHAR_T *prev = p - 2; boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; return @@ -3047,13 +4045,13 @@ at_begline_loc_p (pattern, p, syntax) at least one character after the $, i.e., `P < PEND'. */ static boolean -at_endline_loc_p (p, pend, syntax) - const char *p, *pend; +PREFIX(at_endline_loc_p) (p, pend, syntax) + const CHAR_T *p, *pend; reg_syntax_t syntax; { - const char *next = p; + const CHAR_T *next = p; boolean next_backslash = *next == '\\'; - const char *next_next = p + 1 < pend ? p + 1 : 0; + const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0; return /* Before a subexpression? */ @@ -3064,6 +4062,7 @@ at_endline_loc_p (p, pend, syntax) : next_backslash && next_next && *next_next == '|'); } +#else /* not INSIDE_RECURSION */ /* Returns true if REGNUM is in one of COMPILE_STACK's elements and false if it's not. */ @@ -3084,7 +4083,63 @@ group_in_compile_stack (compile_stack, regnum) return false; } +#endif /* not INSIDE_RECURSION */ + +#ifdef INSIDE_RECURSION + +#ifdef WCHAR +/* This insert space, which size is "num", into the pattern at "loc". + "end" must point the end of the allocated buffer. */ +static void +insert_space (num, loc, end) + int num; + CHAR_T *loc; + CHAR_T *end; +{ + register CHAR_T *pto = end; + register CHAR_T *pfrom = end - num; + + while (pfrom >= loc) + *pto-- = *pfrom--; +} +#endif /* WCHAR */ +#ifdef WCHAR +static reg_errcode_t +wcs_compile_range (range_start_char, p_ptr, pend, translate, syntax, b, + char_set) + CHAR_T range_start_char; + const CHAR_T **p_ptr, *pend; + CHAR_T *char_set, *b; + RE_TRANSLATE_TYPE translate; + reg_syntax_t syntax; +{ + const CHAR_T *p = *p_ptr; + CHAR_T range_start, range_end; + reg_errcode_t ret; + if (p == pend) + return REG_ERANGE; + + range_start = (range_start_char >= 0)? TRANSLATE (range_start_char): + range_start_char; + range_end = TRANSLATE (p[0]); + /* Report an error if the range is empty and the syntax prohibits + this. */ + ret = ((syntax & RE_NO_EMPTY_RANGES) + && (range_start > range_end))? REG_ERANGE : REG_NOERROR; + + /* Insert space to the end of the char_ranges. */ + insert_space(2, b - char_set[5] - 2, b - 1); + *(b - char_set[5] - 2) = range_start; + *(b - char_set[5] - 1) = range_end; + char_set[4]++; /* ranges_index */ + + /* Have to increment the pointer into the pattern string, so the + caller isn't still at the ending character. */ + (*p_ptr)++; + return ret; +} +#else /* BYTE */ /* Read the ending character of a range (in a bracket expression) from the uncompiled pattern *P_PTR (which ends at PEND). We assume the starting character is in `P[-2]'. (`P[-1]' is the character `-'.) @@ -3095,52 +4150,51 @@ group_in_compile_stack (compile_stack, regnum) We use these short variable names so we can use the same macros as `regex_compile' itself. */ - static reg_errcode_t -compile_range (p_ptr, pend, translate, syntax, b) - const char **p_ptr, *pend; - RE_TRANSLATE_TYPE translate; - reg_syntax_t syntax; - unsigned char *b; +byte_compile_range (range_start_char, p_ptr, pend, translate, syntax, b) + unsigned int range_start_char; + const char **p_ptr, *pend; + RE_TRANSLATE_TYPE translate; + reg_syntax_t syntax; + unsigned char *b; { unsigned this_char; const char *p = *p_ptr; - unsigned int range_start, range_end; + reg_errcode_t ret; + unsigned end_char; + if (p == pend) return REG_ERANGE; - /* Even though the pattern is a signed `char *', we need to fetch - with unsigned char *'s; if the high bit of the pattern character - is set, the range endpoints will be negative if we fetch using a - signed char *. - - We also want to fetch the endpoints without translating them; the - appropriate translation is done in the bit-setting loop below. */ - /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */ - range_start = ((const unsigned char *) p)[-2]; - range_end = ((const unsigned char *) p)[0]; - /* Have to increment the pointer into the pattern string, so the caller isn't still at the ending character. */ (*p_ptr)++; - /* If the start is after the end, the range is empty. */ - if (range_start > range_end) - return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; + /* Report an error if the range is empty and the syntax prohibits this. */ + ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; /* Here we see why `this_char' has to be larger than an `unsigned - char' -- the range is inclusive, so if `range_end' == 0xff - (assuming 8-bit characters), we would otherwise go into an infinite - loop, since all characters <= 0xff. */ - for (this_char = range_start; this_char <= range_end; this_char++) + char' -- we would otherwise go into an infinite loop, since all + characters <= 0xff. */ + range_start_char = TRANSLATE (range_start_char); + /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE, + and some compilers cast it to int implicitly, so following for_loop + may fall to (almost) infinite loop. + e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff. + To avoid this, we cast p[0] to unsigned int and truncate it. */ + end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1)); + + for (this_char = range_start_char; this_char <= end_char; ++this_char) { SET_LIST_BIT (TRANSLATE (this_char)); + ret = REG_NOERROR; } - return REG_NOERROR; + return ret; } +# endif /* WCHAR */ /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible @@ -3155,27 +4209,50 @@ compile_range (p_ptr, pend, translate, syntax, b) Returns 0 if we succeed, -2 if an internal error. */ -int -re_compile_fastmap (bufp) +#ifdef WCHAR +/* local function for re_compile_fastmap. + truncate wchar_t character to char. */ +static unsigned char truncate_wchar (CHAR_T c); + +static unsigned char +truncate_wchar (c) + CHAR_T c; +{ + unsigned char buf[MB_LEN_MAX]; + int retval = wctomb(buf, c); + return retval > 0 ? buf[0] : (unsigned char)c; +} +#endif /* WCHAR */ + +static int +PREFIX(re_compile_fastmap) (bufp) struct re_pattern_buffer *bufp; { int j, k; #ifdef MATCH_MAY_ALLOCATE - fail_stack_type fail_stack; + PREFIX(fail_stack_type) fail_stack; #endif #ifndef REGEX_MALLOC char *destination; #endif register char *fastmap = bufp->fastmap; - unsigned char *pattern = bufp->buffer; - unsigned char *p = pattern; - register unsigned char *pend = pattern + bufp->used; + +#ifdef WCHAR + /* We need to cast pattern to (wchar_t*), because we casted this compiled + pattern to (char*) in regex_compile. */ + UCHAR_T *pattern = (UCHAR_T*)bufp->buffer; + register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used); +#else /* BYTE */ + UCHAR_T *pattern = bufp->buffer; + register UCHAR_T *pend = pattern + bufp->used; +#endif /* WCHAR */ + UCHAR_T *p = pattern; #ifdef REL_ALLOC /* This holds the pointer to the failure stack, when it is allocated relocatably. */ - fail_stack_elt_t *failure_stack_ptr; + PREFIX(fail_stack_elt_t) *failure_stack_ptr; #endif /* Assume that each path through the pattern can be null until @@ -3233,11 +4310,32 @@ re_compile_fastmap (bufp) /* Following are the cases which match a character. These end with `break'. */ +#ifdef WCHAR + case exactn: + fastmap[truncate_wchar(p[1])] = 1; + break; +#else /* BYTE */ case exactn: fastmap[p[1]] = 1; break; +#endif /* WCHAR */ +#ifdef MBS_SUPPORT + case exactn_bin: + fastmap[p[1]] = 1; + break; +#endif +#ifdef WCHAR + /* It is hard to distinguish fastmap from (multi byte) characters + which depends on current locale. */ + case charset: + case charset_not: + case wordchar: + case notwordchar: + bufp->can_be_null = 1; + goto done; +#else /* BYTE */ case charset: for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) @@ -3268,6 +4366,7 @@ re_compile_fastmap (bufp) if (SYNTAX (j) != Sword) fastmap[j] = 1; break; +#endif /* WCHAR */ case anychar: @@ -3398,13 +4497,13 @@ re_compile_fastmap (bufp) case succeed_n: /* Get to the number of times to succeed. */ - p += 2; + p += OFFSET_ADDRESS_SIZE; /* Increment p past the n for when k != 0. */ EXTRACT_NUMBER_AND_INCR (k, p); if (k == 0) { - p -= 4; + p -= 2 * OFFSET_ADDRESS_SIZE; succeed_n_p = true; /* Spaghetti code alert. */ goto handle_on_failure_jump; } @@ -3412,7 +4511,7 @@ re_compile_fastmap (bufp) case set_number_at: - p += 4; + p += 2 * OFFSET_ADDRESS_SIZE; continue; @@ -3443,6 +4542,20 @@ re_compile_fastmap (bufp) done: RESET_FAIL_STACK (); return 0; +} + +#else /* not INSIDE_RECURSION */ + +int +re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; +{ +# ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + return wcs_re_compile_fastmap(bufp); + else +# endif + return byte_re_compile_fastmap(bufp); } /* re_compile_fastmap */ #ifdef _LIBC weak_alias (__re_compile_fastmap, re_compile_fastmap) @@ -3537,11 +4650,84 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) struct re_registers *regs; int stop; { +# ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos, + range, regs, stop); + else +# endif + return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos, + range, regs, stop); +} /* re_search_2 */ +#ifdef _LIBC +weak_alias (__re_search_2, re_search_2) +#endif + +#endif /* not INSIDE_RECURSION */ + +#ifdef INSIDE_RECURSION + +#ifdef MATCH_MAY_ALLOCATE +# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL +#else +# define FREE_VAR(var) if (var) free (var); var = NULL +#endif + +#ifdef WCHAR +# define MAX_ALLOCA_SIZE 2000 + +# define FREE_WCS_BUFFERS() \ + do { \ + if (size1 > MAX_ALLOCA_SIZE) \ + { \ + free (wcs_string1); \ + free (mbs_offset1); \ + } \ + else \ + { \ + FREE_VAR (wcs_string1); \ + FREE_VAR (mbs_offset1); \ + } \ + if (size2 > MAX_ALLOCA_SIZE) \ + { \ + free (wcs_string2); \ + free (mbs_offset2); \ + } \ + else \ + { \ + FREE_VAR (wcs_string2); \ + FREE_VAR (mbs_offset2); \ + } \ + } while (0) + +#endif + +static int +PREFIX(re_search_2) (bufp, string1, size1, string2, size2, startpos, range, + regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int startpos; + int range; + struct re_registers *regs; + int stop; +{ int val; register char *fastmap = bufp->fastmap; register RE_TRANSLATE_TYPE translate = bufp->translate; int total_size = size1 + size2; int endpos = startpos + range; +#ifdef WCHAR + /* We need wchar_t* buffers correspond to cstring1, cstring2. */ + wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL; + /* We need the size of wchar_t buffers correspond to csize1, csize2. */ + int wcs_size1 = 0, wcs_size2 = 0; + /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ + int *mbs_offset1 = NULL, *mbs_offset2 = NULL; + /* They hold whether each wchar_t is binary data or not. */ + char *is_binary = NULL; +#endif /* WCHAR */ /* Check for out-of-range STARTPOS. */ if (startpos < 0 || startpos > total_size) @@ -3585,6 +4771,80 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) if (re_compile_fastmap (bufp) == -2) return -2; +#ifdef WCHAR + /* Allocate wchar_t array for wcs_string1 and wcs_string2 and + fill them with converted string. */ + if (size1 != 0) + { + if (size1 > MAX_ALLOCA_SIZE) + { + wcs_string1 = TALLOC (size1 + 1, CHAR_T); + mbs_offset1 = TALLOC (size1 + 1, int); + is_binary = TALLOC (size1 + 1, char); + } + else + { + wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T); + mbs_offset1 = REGEX_TALLOC (size1 + 1, int); + is_binary = REGEX_TALLOC (size1 + 1, char); + } + if (!wcs_string1 || !mbs_offset1 || !is_binary) + { + if (size1 > MAX_ALLOCA_SIZE) + { + free (wcs_string1); + free (mbs_offset1); + free (is_binary); + } + else + { + FREE_VAR (wcs_string1); + FREE_VAR (mbs_offset1); + FREE_VAR (is_binary); + } + return -2; + } + wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1, + mbs_offset1, is_binary); + wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */ + if (size1 > MAX_ALLOCA_SIZE) + free (is_binary); + else + FREE_VAR (is_binary); + } + if (size2 != 0) + { + if (size2 > MAX_ALLOCA_SIZE) + { + wcs_string2 = TALLOC (size2 + 1, CHAR_T); + mbs_offset2 = TALLOC (size2 + 1, int); + is_binary = TALLOC (size2 + 1, char); + } + else + { + wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T); + mbs_offset2 = REGEX_TALLOC (size2 + 1, int); + is_binary = REGEX_TALLOC (size2 + 1, char); + } + if (!wcs_string2 || !mbs_offset2 || !is_binary) + { + FREE_WCS_BUFFERS (); + if (size2 > MAX_ALLOCA_SIZE) + free (is_binary); + else + FREE_VAR (is_binary); + return -2; + } + wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2, + mbs_offset2, is_binary); + wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */ + if (size2 > MAX_ALLOCA_SIZE) + free (is_binary); + else + FREE_VAR (is_binary); + } +#endif /* WCHAR */ + /* Loop through the string, looking for a place to start matching. */ for (;;) { @@ -3620,9 +4880,9 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) } else /* Searching backwards. */ { - register char c = (size1 == 0 || startpos >= size1 - ? string2[startpos - size1] - : string1[startpos]); + register CHAR_T c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); if (!fastmap[(unsigned char) TRANSLATE (c)]) goto advance; @@ -3632,10 +4892,24 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) /* If can't match the null string, and that's all we have left, fail. */ if (range >= 0 && startpos == total_size && fastmap && !bufp->can_be_null) - return -1; + { +#ifdef WCHAR + FREE_WCS_BUFFERS (); +#endif + return -1; + } + +#ifdef WCHAR + val = wcs_re_match_2_internal (bufp, string1, size1, string2, + size2, startpos, regs, stop, + wcs_string1, wcs_size1, + wcs_string2, wcs_size2, + mbs_offset1, mbs_offset2); +#else /* BYTE */ + val = byte_re_match_2_internal (bufp, string1, size1, string2, + size2, startpos, regs, stop); +#endif /* BYTE */ - val = re_match_2_internal (bufp, string1, size1, string2, size2, - startpos, regs, stop); #ifndef REGEX_MALLOC # ifdef C_ALLOCA alloca (0); @@ -3643,10 +4917,20 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) #endif if (val >= 0) - return startpos; + { +#ifdef WCHAR + FREE_WCS_BUFFERS (); +#endif + return startpos; + } if (val == -2) - return -2; + { +#ifdef WCHAR + FREE_WCS_BUFFERS (); +#endif + return -2; + } advance: if (!range) @@ -3662,18 +4946,31 @@ re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) startpos--; } } +#ifdef WCHAR + FREE_WCS_BUFFERS (); +#endif return -1; + } /* re_search_2 */ -#ifdef _LIBC -weak_alias (__re_search_2, re_search_2) -#endif - + +#ifdef WCHAR +/* This converts PTR, a pointer into one of the search wchar_t strings + `string1' and `string2' into an multibyte string offset from the + beginning of that string. We use mbs_offset to optimize. + See convert_mbs_to_wcs. */ +# define POINTER_TO_OFFSET(ptr) \ + (FIRST_STRING_P (ptr) \ + ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \ + : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \ + + csize1))) +#else /* BYTE */ /* This converts PTR, a pointer into one of the search strings `string1' and `string2' into an offset from the beginning of that string. */ #define POINTER_TO_OFFSET(ptr) \ (FIRST_STRING_P (ptr) \ ? ((regoff_t) ((ptr) - string1)) \ : ((regoff_t) ((ptr) - string2 + size1))) +#endif /* WCHAR */ /* Macros for dealing with the split strings in re_match_2. */ @@ -3703,10 +5000,19 @@ weak_alias (__re_search_2, re_search_2) two special cases to check for: if past the end of string1, look at the first character in string2; and if before the beginning of string2, look at the last character in string1. */ +#ifdef WCHAR +/* Use internationalized API instead of SYNTAX. */ +# define WORDCHAR_P(d) \ + (iswalnum ((wint_t)((d) == end1 ? *string2 \ + : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \ + || ((d) == end1 ? *string2 \ + : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_') +#else /* BYTE */ #define WORDCHAR_P(d) \ (SYNTAX ((d) == end1 ? *string2 \ : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ == Sword) +#endif /* WCHAR */ /* Disabled due to a compiler bug -- see comment at case wordbound */ #if 0 @@ -3719,7 +5025,7 @@ weak_alias (__re_search_2, re_search_2) /* Free everything we malloc. */ #ifdef MATCH_MAY_ALLOCATE -# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL +# ifdef WCHAR # define FREE_VARIABLES() \ do { \ REGEX_FREE_STACK (fail_stack.stack); \ @@ -3732,9 +5038,44 @@ weak_alias (__re_search_2, re_search_2) FREE_VAR (reg_info); \ FREE_VAR (reg_dummy); \ FREE_VAR (reg_info_dummy); \ + if (!cant_free_wcs_buf) \ + { \ + FREE_VAR (string1); \ + FREE_VAR (string2); \ + FREE_VAR (mbs_offset1); \ + FREE_VAR (mbs_offset2); \ + } \ + } while (0) +# else /* BYTE */ +# define FREE_VARIABLES() \ + do { \ + REGEX_FREE_STACK (fail_stack.stack); \ + FREE_VAR (regstart); \ + FREE_VAR (regend); \ + FREE_VAR (old_regstart); \ + FREE_VAR (old_regend); \ + FREE_VAR (best_regstart); \ + FREE_VAR (best_regend); \ + FREE_VAR (reg_info); \ + FREE_VAR (reg_dummy); \ + FREE_VAR (reg_info_dummy); \ } while (0) +# endif /* WCHAR */ #else +# ifdef WCHAR +# define FREE_VARIABLES() \ + do { \ + if (!cant_free_wcs_buf) \ + { \ + FREE_VAR (string1); \ + FREE_VAR (string2); \ + FREE_VAR (mbs_offset1); \ + FREE_VAR (mbs_offset2); \ + } \ + } while (0) +# else /* BYTE */ # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ +# endif /* WCHAR */ #endif /* not MATCH_MAY_ALLOCATE */ /* These values must meet several constraints. They must not be valid @@ -3746,6 +5087,7 @@ weak_alias (__re_search_2, re_search_2) to actually save any registers when none are active. */ #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) +#else /* not INSIDE_RECURSION */ /* Matching routines. */ @@ -3759,7 +5101,15 @@ re_match (bufp, string, size, pos, regs) int size, pos; struct re_registers *regs; { - int result = re_match_2_internal (bufp, NULL, 0, string, size, + int result; +# ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + result = wcs_re_match_2_internal (bufp, NULL, 0, string, size, + pos, regs, size, + NULL, 0, NULL, 0, NULL, NULL); + else +# endif + result = byte_re_match_2_internal (bufp, NULL, 0, string, size, pos, regs, size); # ifndef REGEX_MALLOC # ifdef C_ALLOCA @@ -3773,17 +5123,21 @@ weak_alias (__re_match, re_match) # endif #endif /* not emacs */ -static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p, - unsigned char *end, - register_info_type *reg_info)); -static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p, - unsigned char *end, - register_info_type *reg_info)); -static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p, - unsigned char *end, - register_info_type *reg_info)); -static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2, +#endif /* not INSIDE_RECURSION */ + +#ifdef INSIDE_RECURSION +static boolean PREFIX(group_match_null_string_p) _RE_ARGS ((UCHAR_T **p, + UCHAR_T *end, + PREFIX(register_info_type) *reg_info)); +static boolean PREFIX(alt_match_null_string_p) _RE_ARGS ((UCHAR_T *p, + UCHAR_T *end, + PREFIX(register_info_type) *reg_info)); +static boolean PREFIX(common_op_match_null_string_p) _RE_ARGS ((UCHAR_T **p, + UCHAR_T *end, + PREFIX(register_info_type) *reg_info)); +static int PREFIX(bcmp_translate) _RE_ARGS ((const CHAR_T *s1, const CHAR_T *s2, int len, char *translate)); +#else /* not INSIDE_RECURSION */ /* re_match_2 matches the compiled pattern in BUFP against the the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 @@ -3807,8 +5161,16 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) struct re_registers *regs; int stop; { - int result = re_match_2_internal (bufp, string1, size1, string2, size2, - pos, regs, stop); + int result; +# ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + result = wcs_re_match_2_internal (bufp, string1, size1, string2, size2, + pos, regs, stop, + NULL, 0, NULL, 0, NULL, NULL); + else +# endif + result = byte_re_match_2_internal (bufp, string1, size1, string2, size2, + pos, regs, stop); #ifndef REGEX_MALLOC # ifdef C_ALLOCA alloca (0); @@ -3820,38 +5182,123 @@ re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) weak_alias (__re_match_2, re_match_2) #endif +#endif /* not INSIDE_RECURSION */ + +#ifdef INSIDE_RECURSION + +#ifdef WCHAR +static int count_mbs_length PARAMS ((int *, int)); + +/* This check the substring (from 0, to length) of the multibyte string, + to which offset_buffer correspond. And count how many wchar_t_characters + the substring occupy. We use offset_buffer to optimization. + See convert_mbs_to_wcs. */ + +static int +count_mbs_length(offset_buffer, length) + int *offset_buffer; + int length; +{ + int upper, lower; + + /* Check whether the size is valid. */ + if (length < 0) + return -1; + + if (offset_buffer == NULL) + return 0; + + /* If there are no multibyte character, offset_buffer[i] == i. + Optmize for this case. */ + if (offset_buffer[length] == length) + return length; + + /* Set up upper with length. (because for all i, offset_buffer[i] >= i) */ + upper = length; + lower = 0; + + while (true) + { + int middle = (lower + upper) / 2; + if (middle == lower || middle == upper) + break; + if (offset_buffer[middle] > length) + upper = middle; + else if (offset_buffer[middle] < length) + lower = middle; + else + return middle; + } + + return -1; +} +#endif /* WCHAR */ + /* This is a separate function so that we can force an alloca cleanup afterwards. */ +#ifdef WCHAR +static int +wcs_re_match_2_internal (bufp, cstring1, csize1, cstring2, csize2, pos, + regs, stop, string1, size1, string2, size2, + mbs_offset1, mbs_offset2) + struct re_pattern_buffer *bufp; + const char *cstring1, *cstring2; + int csize1, csize2; + int pos; + struct re_registers *regs; + int stop; + /* string1 == string2 == NULL means string1/2, size1/2 and + mbs_offset1/2 need seting up in this function. */ + /* We need wchar_t* buffers correspond to cstring1, cstring2. */ + wchar_t *string1, *string2; + /* We need the size of wchar_t buffers correspond to csize1, csize2. */ + int size1, size2; + /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ + int *mbs_offset1, *mbs_offset2; +#else /* BYTE */ static int -re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) +byte_re_match_2_internal (bufp, string1, size1,string2, size2, pos, + regs, stop) struct re_pattern_buffer *bufp; const char *string1, *string2; int size1, size2; int pos; struct re_registers *regs; int stop; +#endif /* BYTE */ { /* General temporaries. */ int mcnt; - unsigned char *p1; + UCHAR_T *p1; +#ifdef WCHAR + /* They hold whether each wchar_t is binary data or not. */ + char *is_binary = NULL; + /* If true, we can't free string1/2, mbs_offset1/2. */ + int cant_free_wcs_buf = 1; +#endif /* WCHAR */ /* Just past the end of the corresponding string. */ - const char *end1, *end2; + const CHAR_T *end1, *end2; /* Pointers into string1 and string2, just past the last characters in each to consider matching. */ - const char *end_match_1, *end_match_2; + const CHAR_T *end_match_1, *end_match_2; /* Where we are in the data, and the end of the current string. */ - const char *d, *dend; + const CHAR_T *d, *dend; /* Where we are in the pattern, and the end of the pattern. */ - unsigned char *p = bufp->buffer; - register unsigned char *pend = p + bufp->used; +#ifdef WCHAR + UCHAR_T *pattern, *p; + register UCHAR_T *pend; +#else /* BYTE */ + UCHAR_T *p = bufp->buffer; + register UCHAR_T *pend = p + bufp->used; +#endif /* WCHAR */ /* Mark the opcode just after a start_memory, so we can test for an empty subpattern when we get to the stop_memory. */ - unsigned char *just_past_start_mem = 0; + UCHAR_T *just_past_start_mem = 0; /* We use this to map every character in the string. */ RE_TRANSLATE_TYPE translate = bufp->translate; @@ -3866,7 +5313,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) a ``dummy''; if a failure happens and the failure point is a dummy, it gets discarded and the next next one is tried. */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ - fail_stack_type fail_stack; + PREFIX(fail_stack_type) fail_stack; #endif #ifdef DEBUG static unsigned failure_id; @@ -3896,7 +5343,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) stopped matching the regnum-th subexpression. (The zeroth register keeps track of what the whole pattern matches.) */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **regstart, **regend; + const CHAR_T **regstart, **regend; #endif /* If a group that's operated upon by a repetition operator fails to @@ -3905,7 +5352,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) are when we last see its open-group operator. Similarly for a register's end. */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **old_regstart, **old_regend; + const CHAR_T **old_regstart, **old_regend; #endif /* The is_active field of reg_info helps us keep track of which (possibly @@ -3915,7 +5362,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) subexpression. These two fields get reset each time through any loop their register is in. */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ - register_info_type *reg_info; + PREFIX(register_info_type) *reg_info; #endif /* The following record the register info as found in the above @@ -3924,7 +5371,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) turn happens only if we have not yet matched the entire string. */ unsigned best_regs_set = false; #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **best_regstart, **best_regend; + const CHAR_T **best_regstart, **best_regend; #endif /* Logically, this is `best_regend[0]'. But we don't want to have to @@ -3935,15 +5382,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) the end of the best match so far in a separate variable. We initialize this to NULL so that when we backtrack the first time and need to test it, it's not garbage. */ - const char *match_end = NULL; + const CHAR_T *match_end = NULL; /* This helps SET_REGS_MATCHED avoid doing redundant work. */ int set_regs_matched_done = 0; /* Used when we pop values we don't care about. */ #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ - const char **reg_dummy; - register_info_type *reg_info_dummy; + const CHAR_T **reg_dummy; + PREFIX(register_info_type) *reg_info_dummy; #endif #ifdef DEBUG @@ -3963,15 +5410,15 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) array indexing. We should fix this. */ if (bufp->re_nsub) { - regstart = REGEX_TALLOC (num_regs, const char *); - regend = REGEX_TALLOC (num_regs, const char *); - old_regstart = REGEX_TALLOC (num_regs, const char *); - old_regend = REGEX_TALLOC (num_regs, const char *); - best_regstart = REGEX_TALLOC (num_regs, const char *); - best_regend = REGEX_TALLOC (num_regs, const char *); - reg_info = REGEX_TALLOC (num_regs, register_info_type); - reg_dummy = REGEX_TALLOC (num_regs, const char *); - reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); + regstart = REGEX_TALLOC (num_regs, const CHAR_T *); + regend = REGEX_TALLOC (num_regs, const CHAR_T *); + old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *); + old_regend = REGEX_TALLOC (num_regs, const CHAR_T *); + best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *); + best_regend = REGEX_TALLOC (num_regs, const CHAR_T *); + reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type)); + reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *); + reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type)); if (!(regstart && regend && old_regstart && old_regend && reg_info && best_regstart && best_regend && reg_dummy && reg_info_dummy)) @@ -3986,17 +5433,72 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) `FREE_VARIABLES' doesn't try to free them. */ regstart = regend = old_regstart = old_regend = best_regstart = best_regend = reg_dummy = NULL; - reg_info = reg_info_dummy = (register_info_type *) NULL; + reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL; } #endif /* MATCH_MAY_ALLOCATE */ /* The starting position is bogus. */ +#ifdef WCHAR + if (pos < 0 || pos > csize1 + csize2) +#else /* BYTE */ if (pos < 0 || pos > size1 + size2) +#endif { FREE_VARIABLES (); return -1; } +#ifdef WCHAR + /* Allocate wchar_t array for string1 and string2 and + fill them with converted string. */ + if (string1 == NULL && string2 == NULL) + { + /* We need seting up buffers here. */ + + /* We must free wcs buffers in this function. */ + cant_free_wcs_buf = 0; + + if (csize1 != 0) + { + string1 = REGEX_TALLOC (csize1 + 1, CHAR_T); + mbs_offset1 = REGEX_TALLOC (csize1 + 1, int); + is_binary = REGEX_TALLOC (csize1 + 1, char); + if (!string1 || !mbs_offset1 || !is_binary) + { + FREE_VAR (string1); + FREE_VAR (mbs_offset1); + FREE_VAR (is_binary); + return -2; + } + } + if (csize2 != 0) + { + string2 = REGEX_TALLOC (csize2 + 1, CHAR_T); + mbs_offset2 = REGEX_TALLOC (csize2 + 1, int); + is_binary = REGEX_TALLOC (csize2 + 1, char); + if (!string2 || !mbs_offset2 || !is_binary) + { + FREE_VAR (string1); + FREE_VAR (mbs_offset1); + FREE_VAR (string2); + FREE_VAR (mbs_offset2); + FREE_VAR (is_binary); + return -2; + } + size2 = convert_mbs_to_wcs(string2, cstring2, csize2, + mbs_offset2, is_binary); + string2[size2] = L'\0'; /* for a sentinel */ + FREE_VAR (is_binary); + } + } + + /* We need to cast pattern to (wchar_t*), because we casted this compiled + pattern to (char*) in regex_compile. */ + p = pattern = (CHAR_T*)bufp->buffer; + pend = (CHAR_T*)(bufp->buffer + bufp->used); + +#endif /* WCHAR */ + /* Initialize subexpression text positions to -1 to mark ones that no start_memory/stop_memory has been seen for. Also initialize the register information struct. */ @@ -4019,11 +5521,38 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) size2 = size1; string1 = 0; size1 = 0; +#ifdef WCHAR + mbs_offset2 = mbs_offset1; + csize2 = csize1; + mbs_offset1 = NULL; + csize1 = 0; +#endif } end1 = string1 + size1; end2 = string2 + size2; /* Compute where to stop matching, within the two strings. */ +#ifdef WCHAR + if (stop <= csize1) + { + mcnt = count_mbs_length(mbs_offset1, stop); + end_match_1 = string1 + mcnt; + end_match_2 = string2; + } + else + { + if (stop > csize1 + csize2) + stop = csize1 + csize2; + end_match_1 = end1; + mcnt = count_mbs_length(mbs_offset2, stop-csize1); + end_match_2 = string2 + mcnt; + } + if (mcnt < 0) + { /* count_mbs_length return error. */ + FREE_VARIABLES (); + return -1; + } +#else if (stop <= size1) { end_match_1 = string1 + stop; @@ -4034,6 +5563,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) end_match_1 = end1; end_match_2 = string2 + stop - size1; } +#endif /* WCHAR */ /* `p' scans through the pattern as `d' scans through the data. `dend' is the end of the input string that `d' points within. `d' @@ -4041,6 +5571,26 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) this happens before fetching; therefore, at the beginning of the loop, `d' can be pointing at the end of a string, but it cannot equal `string2'. */ +#ifdef WCHAR + if (size1 > 0 && pos <= csize1) + { + mcnt = count_mbs_length(mbs_offset1, pos); + d = string1 + mcnt; + dend = end_match_1; + } + else + { + mcnt = count_mbs_length(mbs_offset2, pos-csize1); + d = string2 + mcnt; + dend = end_match_2; + } + + if (mcnt < 0) + { /* count_mbs_length return error. */ + FREE_VARIABLES (); + return -1; + } +#else if (size1 > 0 && pos <= size1) { d = string1 + pos; @@ -4051,6 +5601,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) d = string2 + pos - size1; dend = end_match_2; } +#endif /* WCHAR */ DEBUG_PRINT1 ("The compiled pattern is:\n"); DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); @@ -4188,9 +5739,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (regs->num_regs > 0) { regs->start[0] = pos; +#ifdef WCHAR + if (MATCHING_IN_FIRST_STRING) + regs->end[0] = mbs_offset1 != NULL ? + mbs_offset1[d-string1] : 0; + else + regs->end[0] = csize1 + (mbs_offset2 != NULL ? + mbs_offset2[d-string2] : 0); +#else regs->end[0] = (MATCHING_IN_FIRST_STRING ? ((regoff_t) (d - string1)) : ((regoff_t) (d - string2 + size1))); +#endif /* WCHAR */ } /* Go through the first `min (num_regs, regs->num_regs)' @@ -4223,9 +5783,18 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) nfailure_points_pushed - nfailure_points_popped); DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); +#ifdef WCHAR + if (MATCHING_IN_FIRST_STRING) + mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0; + else + mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) + + csize1; + mcnt -= pos; +#else mcnt = d - pos - (MATCHING_IN_FIRST_STRING ? string1 : string2 - size1); +#endif /* WCHAR */ DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); @@ -4250,6 +5819,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) byte in the pattern defines n, and the n bytes after that are the characters to match. */ case exactn: +#ifdef MBS_SUPPORT + case exactn_bin: +#endif mcnt = *p++; DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); @@ -4260,9 +5832,23 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) do { PREFETCH (); - if ((unsigned char) translate[(unsigned char) *d++] - != (unsigned char) *p++) +#ifdef WCHAR + if (*d <= 0xff) + { + if ((UCHAR_T) translate[(unsigned char) *d++] + != (UCHAR_T) *p++) + goto fail; + } + else + { + if (*d++ != (CHAR_T) *p++) + goto fail; + } +#else + if ((UCHAR_T) translate[(unsigned char) *d++] + != (UCHAR_T) *p++) goto fail; +#endif /* WCHAR */ } while (--mcnt); } @@ -4271,7 +5857,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) do { PREFETCH (); - if (*d++ != (char) *p++) goto fail; + if (*d++ != (CHAR_T) *p++) goto fail; } while (--mcnt); } @@ -4298,7 +5884,14 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case charset: case charset_not: { - register unsigned char c; +#ifdef WCHAR + unsigned int i, char_class_length, coll_symbol_length, + equiv_class_length, ranges_length, chars_length, length; + CHAR_T *workp, *workp2, *charset_top; +# define WORK_BUFFER_SIZE 128 + CHAR_T str_buf[WORK_BUFFER_SIZE]; +#endif /* WCHAR */ + register UCHAR_T c; boolean not = (re_opcode_t) *(p - 1) == charset_not; DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); @@ -4306,6 +5899,199 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) PREFETCH (); c = TRANSLATE (*d); /* The character to match. */ +#ifdef WCHAR + charset_top = p - 1; + char_class_length = *p++; + coll_symbol_length = *p++; + equiv_class_length = *p++; + ranges_length = *p++; + chars_length = *p++; + /* p points charset[6], so the address of the next instruction + (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'], + where l=length of char_classes, m=length of collating_symbol, + n=equivalence_class, o=length of char_range, + p'=length of character. */ + workp = p; + /* Update p to indicate the next instruction. */ + p += char_class_length + coll_symbol_length+ equiv_class_length + + 2*ranges_length + chars_length; + + /* match with char_class? */ + for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE) + { + wctype_t wctype; + unsigned long int alignedp = ((unsigned long int)workp + + __alignof__(wctype_t) - 1) + & ~(unsigned long int)(__alignof__(wctype_t) - 1); + wctype = *((wctype_t*)alignedp); + workp += CHAR_CLASS_SIZE; + if (iswctype((wint_t)c, wctype)) + goto char_set_matched; + } + + /* match with collating_symbol? */ + /* If we can't look up collation data, we use wcscoll + instead. */ + for (workp2 = workp + coll_symbol_length ; workp < workp2 ;) + { + const CHAR_T *backup_d = d, *backup_dend = dend; + length = wcslen(workp); + + /* If wcscoll(the collating symbol, whole string) > 0, + any substring of the string never match with the + collating symbol. */ + if (wcscoll(workp, d) > 0) + { + workp += length + 1; + continue; + } + + /* First, we compare the collating symbol with + the first character of the string. + If it don't match, we add the next character to + the compare buffer in turn. */ + for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++) + { + int match; + if (d == dend) + { + if (dend == end_match_2) + break; + d = string2; + dend = end_match_2; + } + + /* add next character to the compare buffer. */ + str_buf[i] = TRANSLATE(*d); + str_buf[i+1] = '\0'; + + match = wcscoll(workp, str_buf); + if (match == 0) + goto char_set_matched; + + if (match < 0) + /* (str_buf > workp) indicate (str_buf + X > workp), + because for all X (str_buf + X > str_buf). + So we don't need continue this loop. */ + break; + + /* Otherwise(str_buf < workp), + (str_buf+next_character) may equals (workp). + So we continue this loop. */ + } + /* not matched */ + d = backup_d; + dend = backup_dend; + workp += length + 1; + } + /* match with equivalence_class? */ + /* If we can't look up collation data, we use wcscoll + instead. */ + for (workp2 = workp + equiv_class_length ; workp < workp2 ;) + { + const CHAR_T *backup_d = d, *backup_dend = dend; + length = wcslen(workp); + + /* If wcscoll(the collating symbol, whole string) > 0, + any substring of the string never match with the + collating symbol. */ + if (wcscoll(workp, d) > 0) + { + workp += length + 1; + break; + } + + /* First, we compare the equivalence class with + the first character of the string. + If it don't match, we add the next character to + the compare buffer in turn. */ + for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++) + { + int match; + if (d == dend) + { + if (dend == end_match_2) + break; + d = string2; + dend = end_match_2; + } + + /* add next character to the compare buffer. */ + str_buf[i] = TRANSLATE(*d); + str_buf[i+1] = '\0'; + + match = wcscoll(workp, str_buf); + + if (match == 0) + goto char_set_matched; + + if (match < 0) + /* (str_buf > workp) indicate (str_buf + X > workp), + because for all X (str_buf + X > str_buf). + So we don't need continue this loop. */ + break; + + /* Otherwise(str_buf < workp), + (str_buf+next_character) may equals (workp). + So we continue this loop. */ + } + /* not matched */ + d = backup_d; + dend = backup_dend; + workp += length + 1; + } + + /* match with char_range? */ + /* We set range_start_char at str_buf[0], range_end_char + at str_buf[4], and compared char at str_buf[2]. */ + str_buf[1] = 0; + str_buf[2] = c; + str_buf[3] = 0; + str_buf[5] = 0; + for (; workp < p - chars_length ;) + { + wchar_t *range_start_char, *range_end_char; + + /* match if (range_start_char <= c <= range_end_char). */ + + /* If range_start(or end) < 0, we assume -range_start(end) + is the offset of the collating symbol which is specified + as the character of the range start(end). */ + + /* range_start */ + if (*workp < 0) + range_start_char = charset_top - (*workp++); + else + { + str_buf[0] = *workp++; + range_start_char = str_buf; + } + + /* range_end */ + if (*workp < 0) + range_end_char = charset_top - (*workp++); + else + { + str_buf[4] = *workp++; + range_end_char = str_buf + 4; + } + + if (wcscoll(range_start_char, str_buf+2) <= 0 && + wcscoll(str_buf+2, range_end_char) <= 0) + + goto char_set_matched; + } + + /* match with char? */ + for (; workp < p ; workp++) + if (c == *workp) + goto char_set_matched; + + not = !not; + + char_set_matched: + if (not) goto fail; +#else /* Cast to `unsigned' instead of `unsigned char' in case the bit list is a full 32 bytes long. */ if (c < (unsigned) (*p * BYTEWIDTH) @@ -4315,7 +6101,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) p += 1 + *p; if (!not) goto fail; - +#undef WORK_BUFFER_SIZE +#endif /* WCHAR */ SET_REGS_MATCHED (); d++; break; @@ -4335,7 +6122,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) REG_MATCH_NULL_STRING_P (reg_info[*p]) - = group_match_null_string_p (&p1, pend, reg_info); + = PREFIX(group_match_null_string_p) (&p1, pend, reg_info); /* Save the position in the string where we were the last time we were at this open-group operator in case the group is @@ -4410,7 +6197,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) it isn't necessarily one less than now: consider (a(b)c(d(e)f)g). When group 3 ends, after the f), the new highest active register is 1. */ - unsigned char r = *p - 1; + UCHAR_T r = *p - 1; while (r > 0 && !IS_ACTIVE (reg_info[r])) r--; @@ -4453,7 +6240,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) case dummy_failure_jump: EXTRACT_NUMBER_AND_INCR (mcnt, p1); if (is_a_jump_n) - p1 += 2; + p1 += OFFSET_ADDRESS_SIZE; break; default: @@ -4467,7 +6254,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) by forcing a failure after pushing on the stack the on_failure_jump's jump in the pattern, and d. */ if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump - && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) + && (re_opcode_t) p1[1 + OFFSET_ADDRESS_SIZE] == start_memory + && p1[2 + OFFSET_ADDRESS_SIZE] == *p) { /* If this group ever matched anything, then restore what its registers were before trying this last @@ -4513,7 +6301,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) followed by the numeric value of <digit> as the register number. */ case duplicate: { - register const char *d2, *dend2; + register const CHAR_T *d2, *dend2; int regno = *p++; /* Get which register to match against. */ DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); @@ -4562,8 +6350,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Compare that many; failure if mismatch, else move past them. */ if (translate - ? bcmp_translate (d, d2, mcnt, translate) - : memcmp (d, d2, mcnt)) + ? PREFIX(bcmp_translate) (d, d2, mcnt, translate) + : memcmp (d, d2, mcnt*sizeof(UCHAR_T))) goto fail; d += mcnt, d2 += mcnt; @@ -4719,7 +6507,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) EXTRACT_NUMBER_AND_INCR (mcnt, p); DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); { - register unsigned char *p2 = p; + register UCHAR_T *p2 = p; /* Compare the beginning of the repeat with what in the pattern follows its end. If we can establish that there @@ -4744,9 +6532,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) && ((re_opcode_t) *p2 == stop_memory || (re_opcode_t) *p2 == start_memory)) p2 += 3; - else if (p2 + 6 < pend + else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend && (re_opcode_t) *p2 == dummy_failure_jump) - p2 += 6; + p2 += 2 + 2 * OFFSET_ADDRESS_SIZE; else break; } @@ -4762,24 +6550,39 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Consider what happens when matching ":\(.*\)" against ":/". I don't really understand this code yet. */ - p[-3] = (unsigned char) pop_failure_jump; + p[-(1 + OFFSET_ADDRESS_SIZE)] = (UCHAR_T) pop_failure_jump; DEBUG_PRINT1 (" End of pattern: change to `pop_failure_jump'.\n"); } else if ((re_opcode_t) *p2 == exactn +#ifdef MBS_SUPPORT + || (re_opcode_t) *p2 == exactn_bin +#endif || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) { - register unsigned char c - = *p2 == (unsigned char) endline ? '\n' : p2[2]; + register UCHAR_T c + = *p2 == (UCHAR_T) endline ? '\n' : p2[2]; - if ((re_opcode_t) p1[3] == exactn && p1[5] != c) + if (((re_opcode_t) p1[1 + OFFSET_ADDRESS_SIZE] == exactn +#ifdef MBS_SUPPORT + || (re_opcode_t) p1[1 + OFFSET_ADDRESS_SIZE] == exactn_bin +#endif + ) && p1[3 + OFFSET_ADDRESS_SIZE] != c) { - p[-3] = (unsigned char) pop_failure_jump; - DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", - c, p1[5]); + p[-(1 + OFFSET_ADDRESS_SIZE)] = (UCHAR_T) pop_failure_jump; +#ifdef WCHAR + DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n", + (wint_t) c, + (wint_t) p1[3+OFFSET_ADDRESS_SIZE]); +#else + DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", + (char) c, + (char) p1[3+OFFSET_ADDRESS_SIZE]); +#endif } +#ifndef WCHAR else if ((re_opcode_t) p1[3] == charset || (re_opcode_t) p1[3] == charset_not) { @@ -4797,7 +6600,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); } } +#endif /* not WCHAR */ } +#ifndef WCHAR else if ((re_opcode_t) *p2 == charset) { /* We win if the first character of the loop is not part @@ -4846,11 +6651,12 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) } } } +#endif /* not WCHAR */ } - p -= 2; /* Point at relative address again. */ + p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */ if ((re_opcode_t) p[-1] != pop_failure_jump) { - p[-1] = (unsigned char) jump; + p[-1] = (UCHAR_T) jump; DEBUG_PRINT1 (" Match => jump.\n"); goto unconditional_jump; } @@ -4871,8 +6677,8 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) register from the stack, since lowest will == highest in `pop_failure_point'. */ active_reg_t dummy_low_reg, dummy_high_reg; - unsigned char *pdummy; - const char *sdummy; + UCHAR_T *pdummy = NULL; + const CHAR_T *sdummy = NULL; DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); POP_FAILURE_POINT (sdummy, pdummy, @@ -4937,7 +6743,7 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) /* Have to succeed matching what follows at least n times. After that, handle like `on_failure_jump'. */ case succeed_n: - EXTRACT_NUMBER (mcnt, p + 2); + EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); assert (mcnt >= 0); @@ -4945,46 +6751,57 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) if (mcnt > 0) { mcnt--; - p += 2; + p += OFFSET_ADDRESS_SIZE; STORE_NUMBER_AND_INCR (p, mcnt); #ifdef _LIBC - DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt); + DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE, + mcnt); #else - DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE, + mcnt); #endif } else if (mcnt == 0) { #ifdef _LIBC - DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2); + DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", + p + OFFSET_ADDRESS_SIZE); #else - DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); + DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", + p + OFFSET_ADDRESS_SIZE); #endif - p[2] = (unsigned char) no_op; - p[3] = (unsigned char) no_op; + +#ifdef WCHAR + p[1] = (UCHAR_T) no_op; +#else + p[2] = (UCHAR_T) no_op; + p[3] = (UCHAR_T) no_op; +#endif /* WCHAR */ goto on_failure; } break; case jump_n: - EXTRACT_NUMBER (mcnt, p + 2); + EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); /* Originally, this is how many times we CAN jump. */ if (mcnt) { mcnt--; - STORE_NUMBER (p + 2, mcnt); + STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt); #ifdef _LIBC - DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt); + DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE, + mcnt); #else - DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE, + mcnt); #endif goto unconditional_jump; } /* If don't have to jump any more, skip over the rest of command. */ else - p += 4; + p += 2 * OFFSET_ADDRESS_SIZE; break; case set_number_at: @@ -5215,13 +7032,13 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) We don't handle duplicates properly (yet). */ static boolean -group_match_null_string_p (p, end, reg_info) - unsigned char **p, *end; - register_info_type *reg_info; +PREFIX(group_match_null_string_p) (p, end, reg_info) + UCHAR_T **p, *end; + PREFIX(register_info_type) *reg_info; { int mcnt; /* Point to after the args to the start_memory. */ - unsigned char *p1 = *p + 2; + UCHAR_T *p1 = *p + 2; while (p1 < end) { @@ -5259,14 +7076,16 @@ group_match_null_string_p (p, end, reg_info) with an on_failure_jump (see above) that jumps to right past a jump_past_alt. */ - while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) + while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] == + jump_past_alt) { /* `mcnt' holds how many bytes long the alternative is, including the ending `jump_past_alt' and its number. */ - if (!alt_match_null_string_p (p1, p1 + mcnt - 3, - reg_info)) + if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt - + (1 + OFFSET_ADDRESS_SIZE), + reg_info)) return false; /* Move to right after this alternative, including the @@ -5282,10 +7101,11 @@ group_match_null_string_p (p, end, reg_info) alternative that starts with an on_failure_jump. */ p1++; EXTRACT_NUMBER_AND_INCR (mcnt, p1); - if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) + if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] != + jump_past_alt) { /* Get to the beginning of the n-th alternative. */ - p1 -= 3; + p1 -= 1 + OFFSET_ADDRESS_SIZE; break; } } @@ -5293,9 +7113,9 @@ group_match_null_string_p (p, end, reg_info) /* Deal with the last alternative: go back and get number of the `jump_past_alt' just before it. `mcnt' contains the length of the alternative. */ - EXTRACT_NUMBER (mcnt, p1 - 2); + EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE); - if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) + if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info)) return false; p1 += mcnt; /* Get past the n-th alternative. */ @@ -5310,7 +7130,7 @@ group_match_null_string_p (p, end, reg_info) default: - if (!common_op_match_null_string_p (&p1, end, reg_info)) + if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info)) return false; } } /* while p1 < end */ @@ -5324,12 +7144,12 @@ group_match_null_string_p (p, end, reg_info) byte past the last. The alternative can contain groups. */ static boolean -alt_match_null_string_p (p, end, reg_info) - unsigned char *p, *end; - register_info_type *reg_info; +PREFIX(alt_match_null_string_p) (p, end, reg_info) + UCHAR_T *p, *end; + PREFIX(register_info_type) *reg_info; { int mcnt; - unsigned char *p1 = p; + UCHAR_T *p1 = p; while (p1 < end) { @@ -5346,7 +7166,7 @@ alt_match_null_string_p (p, end, reg_info) break; default: - if (!common_op_match_null_string_p (&p1, end, reg_info)) + if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info)) return false; } } /* while p1 < end */ @@ -5361,14 +7181,14 @@ alt_match_null_string_p (p, end, reg_info) Sets P to one after the op and its arguments, if any. */ static boolean -common_op_match_null_string_p (p, end, reg_info) - unsigned char **p, *end; - register_info_type *reg_info; +PREFIX(common_op_match_null_string_p) (p, end, reg_info) + UCHAR_T **p, *end; + PREFIX(register_info_type) *reg_info; { int mcnt; boolean ret; int reg_no; - unsigned char *p1 = *p; + UCHAR_T *p1 = *p; switch ((re_opcode_t) *p1++) { @@ -5391,7 +7211,7 @@ common_op_match_null_string_p (p, end, reg_info) case start_memory: reg_no = *p1; assert (reg_no > 0 && reg_no <= MAX_REGNUM); - ret = group_match_null_string_p (&p1, end, reg_info); + ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info); /* Have to set this here in case we're checking a group which contains a group and a back reference to it. */ @@ -5414,12 +7234,12 @@ common_op_match_null_string_p (p, end, reg_info) case succeed_n: /* Get to the number of times to succeed. */ - p1 += 2; + p1 += OFFSET_ADDRESS_SIZE; EXTRACT_NUMBER_AND_INCR (mcnt, p1); if (mcnt == 0) { - p1 -= 4; + p1 -= 2 * OFFSET_ADDRESS_SIZE; EXTRACT_NUMBER_AND_INCR (mcnt, p1); p1 += mcnt; } @@ -5433,7 +7253,7 @@ common_op_match_null_string_p (p, end, reg_info) break; case set_number_at: - p1 += 4; + p1 += 2 * OFFSET_ADDRESS_SIZE; default: /* All other opcodes mean we cannot match the empty string. */ @@ -5449,21 +7269,30 @@ common_op_match_null_string_p (p, end, reg_info) bytes; nonzero otherwise. */ static int -bcmp_translate (s1, s2, len, translate) - const char *s1, *s2; +PREFIX(bcmp_translate) (s1, s2, len, translate) + const CHAR_T *s1, *s2; register int len; RE_TRANSLATE_TYPE translate; { - register const unsigned char *p1 = (const unsigned char *) s1; - register const unsigned char *p2 = (const unsigned char *) s2; + register const UCHAR_T *p1 = (const UCHAR_T *) s1; + register const UCHAR_T *p2 = (const UCHAR_T *) s2; while (len) { +#ifdef WCHAR + if (((*p1<=0xff)?translate[*p1++]:*p1++) + != ((*p2<=0xff)?translate[*p2++]:*p2++)) + return 1; +#else /* BYTE */ if (translate[*p1++] != translate[*p2++]) return 1; +#endif /* WCHAR */ len--; } return 0; } + +#else /* not INSIDE_RECURSION */ + /* Entry points for GNU code. */ /* re_compile_pattern is the GNU regular expression compiler: it @@ -5495,7 +7324,12 @@ re_compile_pattern (pattern, length, bufp) /* Match anchors at newline. */ bufp->newline_anchor = 1; - ret = regex_compile (pattern, length, re_syntax_options, bufp); +# ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp); + else +# endif + ret = byte_regex_compile (pattern, length, re_syntax_options, bufp); if (!ret) return NULL; @@ -5550,7 +7384,12 @@ re_comp (s) /* Match anchors at newlines. */ re_comp_buf.newline_anchor = 1; - ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); +# ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); + else +# endif + ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); if (!ret) return NULL; @@ -5667,7 +7506,12 @@ regcomp (preg, pattern, cflags) /* POSIX says a null character in the pattern terminates it, so we can use strlen here in compiling the pattern. */ - ret = regex_compile (pattern, strlen (pattern), syntax, preg); +# ifdef MBS_SUPPORT + if (MB_CUR_MAX != 1) + ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg); + else +# endif + ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg); /* POSIX doesn't distinguish between an unmatched open-group and an unmatched close-group: both are REG_EPAREN. */ @@ -5852,3 +7696,73 @@ weak_alias (__regfree, regfree) #endif #endif /* not emacs */ + +#endif /* not INSIDE_RECURSION */ + +#undef STORE_NUMBER +#undef STORE_NUMBER_AND_INCR +#undef EXTRACT_NUMBER +#undef EXTRACT_NUMBER_AND_INCR + +#undef DEBUG_PRINT_COMPILED_PATTERN +#undef DEBUG_PRINT_DOUBLE_STRING + +#undef INIT_FAIL_STACK +#undef RESET_FAIL_STACK +#undef DOUBLE_FAIL_STACK +#undef PUSH_PATTERN_OP +#undef PUSH_FAILURE_POINTER +#undef PUSH_FAILURE_INT +#undef PUSH_FAILURE_ELT +#undef POP_FAILURE_POINTER +#undef POP_FAILURE_INT +#undef POP_FAILURE_ELT +#undef DEBUG_PUSH +#undef DEBUG_POP +#undef PUSH_FAILURE_POINT +#undef POP_FAILURE_POINT + +#undef REG_UNSET_VALUE +#undef REG_UNSET + +#undef PATFETCH +#undef PATFETCH_RAW +#undef PATUNFETCH +#undef TRANSLATE + +#undef INIT_BUF_SIZE +#undef GET_BUFFER_SPACE +#undef BUF_PUSH +#undef BUF_PUSH_2 +#undef BUF_PUSH_3 +#undef STORE_JUMP +#undef STORE_JUMP2 +#undef INSERT_JUMP +#undef INSERT_JUMP2 +#undef EXTEND_BUFFER +#undef GET_UNSIGNED_NUMBER +#undef FREE_STACK_RETURN + +#undef POINTER_TO_OFFSET +#undef MATCHING_IN_FRST_STRING +#undef PREFETCH +#undef AT_STRINGS_BEG +#undef AT_STRINGS_END +#undef WORDCHAR_P +#undef FREE_VAR +#undef FREE_VARIABLES +#undef NO_HIGHEST_ACTIVE_REG +#undef NO_LOWEST_ACTIVE_REG + +#undef CHAR_T +#undef UCHAR_T +#undef COMPILED_BUFFER_VAR +#undef OFFSET_ADDRESS_SIZE +#undef CHAR_CLASS_SIZE +#undef PREFIX +#undef ARG_PREFIX +#undef PUT_CHAR +#undef BYTE +#undef WCHAR + +#define DEFINED_ONCE |