summary refs log tree commit diff
path: root/src/regex.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/regex.c')
-rw-r--r-- src/regex.c 63
1 files changed, 55 insertions, 8 deletions
diff --git a/src/regex.c b/src/regex.c
index 846c87041b1..ae80ad0cee8 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -2530,6 +2530,7 @@ regex_compile (pattern, size, syntax, bufp)
bufp->syntax = syntax;
bufp->fastmap_accurate = 0;
bufp->not_bol = bufp->not_eol = 0;
+ bufp->used_syntax = 0;
/* Set `used' to zero, so that if we return an error, the pattern
printer (for debugging) will think there's no pattern. We reset it
@@ -2942,6 +2943,14 @@ regex_compile (pattern, size, syntax, bufp)
SET_LIST_BIT (translated);
}
+ /* In most cases the matching rule for char classes
+ only uses the syntax table for multibyte chars,
+ so that the content of the syntax-table is not
+ hardcoded in the range_table. SPACE and WORD are
+ the two exceptions. */
+ if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD)))
+ bufp->used_syntax = 1;
+
/* Repeat the loop. */
continue;
}
@@ -3877,11 +3886,13 @@ analyse_first (p, pend, fastmap, multibyte)
if (fastmap)
{
int c = RE_STRING_CHAR (p + 1, pend - p);
-
+ /* When fast-scanning, the fastmap can be indexed either with
+ a char (smaller than 256) or with the first byte of
+ a char's byte sequence. So we have to conservatively add
+ both to the table. */
if (SINGLE_BYTE_CHAR_P (c))
fastmap[c] = 1;
- else
- fastmap[p[1]] = 1;
+ fastmap[p[1]] = 1;
}
break;
@@ -3899,6 +3910,10 @@ analyse_first (p, pend, fastmap, multibyte)
So any that are not listed in the charset
are possible matches, even in multibyte buffers. */
if (!fastmap) break;
+ /* We don't need to mark LEADING_CODE_8_BIT_CONTROL specially
+ because it will automatically be set when needed by virtue of
+ being larger than the highest char of its charset (0xbf) but
+ smaller than (1<<BYTEWIDTH). */
for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH;
j < (1 << BYTEWIDTH); j++)
fastmap[j] = 1;
@@ -3909,7 +3924,13 @@ analyse_first (p, pend, fastmap, multibyte)
for (j = CHARSET_BITMAP_SIZE (&p[-1]) * BYTEWIDTH - 1, p++;
j >= 0; j--)
if (!!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) ^ not)
- fastmap[j] = 1;
+ {
+ fastmap[j] = 1;
+#ifdef emacs
+ if (j >= 0x80 && j < 0xa0)
+ fastmap[LEADING_CODE_8_BIT_CONTROL] = 1;
+#endif
+ }
if ((not && multibyte)
/* Any character set can possibly contain a character
@@ -4352,11 +4373,33 @@ re_search_2 (bufp, str1, size1, str2, size2, startpos, range, regs, stop)
}
}
else
- while (range > lim && !fastmap[*d])
+ do
{
- d++;
- range--;
- }
+ re_char *d_start = d;
+ while (range > lim && !fastmap[*d])
+ {
+ d++;
+ range--;
+ }
+#ifdef emacs
+ if (multibyte && range > lim)
+ {
+ /* Check that we are at the beginning of a char. */
+ int at_boundary;
+ AT_CHAR_BOUNDARY_P (at_boundary, d, d_start);
+ if (at_boundary)
+ break;
+ else
+ { /* We have matched an internal byte of a char
+ rather than the leading byte, so it's a false
+ positive: we should keep scanning. */
+ d++; range--;
+ }
+ }
+ else
+#endif
+ break;
+ } while (1);
startpos += irange - range;
}
@@ -6197,6 +6240,10 @@ re_compile_pattern (pattern, length, bufp)
{
reg_errcode_t ret;
+#ifdef emacs
+ gl_state.current_syntax_table = current_buffer->syntax_table;
+#endif
+
/* GNU code is written to assume at least RE_NREGS registers will be set
(and at least one extra will be -1). */
bufp->regs_allocated = REGS_UNALLOCATED;