summaryrefslogtreecommitdiff
path: root/ext/pcre/pcrelib/pcre_study.c
diff options
context:
space:
mode:
authorIlia Alshanetsky <iliaa@php.net>2010-08-14 14:37:13 +0000
committerIlia Alshanetsky <iliaa@php.net>2010-08-14 14:37:13 +0000
commitc5602787fca62e64a63b99e7ae3f310f72e2cac9 (patch)
treec03f412a77986b5f66e76ca2a1155ba7efc511f0 /ext/pcre/pcrelib/pcre_study.c
parentc1019643253d2d65d813358e1bc37ecf5dd365ed (diff)
downloadphp-git-c5602787fca62e64a63b99e7ae3f310f72e2cac9.tar.gz
MFH: Upgraded bundled PCRE to version 8.10.
Diffstat (limited to 'ext/pcre/pcrelib/pcre_study.c')
-rw-r--r--ext/pcre/pcrelib/pcre_study.c274
1 files changed, 208 insertions, 66 deletions
diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c
index 2653624e0a..3ac7e81496 100644
--- a/ext/pcre/pcrelib/pcre_study.c
+++ b/ext/pcre/pcrelib/pcre_study.c
@@ -46,6 +46,7 @@ supporting functions. */
#include "pcre_internal.h"
+#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
/* Returns from set_start_bits() */
@@ -411,6 +412,15 @@ for (;;)
#endif
break;
+ /* Skip these, but we need to add in the name length. */
+
+ case OP_MARK:
+ case OP_PRUNE_ARG:
+ case OP_SKIP_ARG:
+ case OP_THEN_ARG:
+ cc += _pcre_OP_lengths[op] + cc[1];
+ break;
+
/* For the record, these are the opcodes that are matched by "default":
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
OP_THEN. */
@@ -429,25 +439,121 @@ for (;;)
* Set a bit and maybe its alternate case *
*************************************************/
-/* Given a character, set its bit in the table, and also the bit for the other
-version of a letter if we are caseless.
+/* Given a character, set its first byte's bit in the table, and also the
+corresponding bit for the other version of a letter if we are caseless. In
+UTF-8 mode, for characters greater than 127, we can only do the caseless thing
+when Unicode property support is available.
Arguments:
start_bits points to the bit map
- c is the character
+ p points to the character
caseless the caseless flag
cd the block with char table pointers
+ utf8 TRUE for UTF-8 mode
+
+Returns: pointer after the character
+*/
+
+static const uschar *
+set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
+ compile_data *cd, BOOL utf8)
+{
+unsigned int c = *p;
+
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF8
+if (utf8 && c > 127)
+ {
+ GETCHARINC(c, p);
+#ifdef SUPPORT_UCP
+ if (caseless)
+ {
+ uschar buff[8];
+ c = UCD_OTHERCASE(c);
+ (void)_pcre_ord2utf8(c, buff);
+ SET_BIT(buff[0]);
+ }
+#endif
+ return p;
+ }
+#endif
+
+/* Not UTF-8 mode, or character is less than 127. */
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+}
+
+
-Returns: nothing
+/*************************************************
+* Set bits for a positive character type *
+*************************************************/
+
+/* This function sets starting bits for a character type. In UTF-8 mode, we can
+only do a direct setting for bytes less than 128, as otherwise there can be
+confusion with bytes in the middle of UTF-8 characters. In a "traditional"
+environment, the tables will only recognize ASCII characters anyway, but in at
+least one Windows environment, some higher bytes bits were set in the tables.
+So we deal with that case by considering the UTF-8 encoding.
+
+Arguments:
+ start_bits the starting bitmap
+ cbit type the type of character wanted
+ table_limit 32 for non-UTF-8; 16 for UTF-8
+ cd the block with char table pointers
+
+Returns: nothing
*/
static void
-set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
+set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
-start_bits[c/8] |= (1 << (c&7));
-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
- start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+if (table_limit == 32) return;
+for (c = 128; c < 256; c++)
+ {
+ if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
+ {
+ uschar buff[8];
+ (void)_pcre_ord2utf8(c, buff);
+ SET_BIT(buff[0]);
+ }
+ }
+}
+
+
+/*************************************************
+* Set bits for a negative character type *
+*************************************************/
+
+/* This function sets starting bits for a negative character type such as \D.
+In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
+otherwise there can be confusion with bytes in the middle of UTF-8 characters.
+Unlike in the positive case, where we can set appropriate starting bits for
+specific high-valued UTF-8 characters, in this case we have to set the bits for
+all high-valued characters. The lowest is 0xc2, but we overkill by starting at
+0xc0 (192) for simplicity.
+
+Arguments:
+ start_bits the starting bitmap
+ cbit type the type of character wanted
+ table_limit 32 for non-UTF-8; 16 for UTF-8
+ cd the block with char table pointers
+
+Returns: nothing
+*/
+
+static void
+set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+ compile_data *cd)
+{
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
+if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
}
@@ -482,6 +588,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
{
register int c;
int yield = SSB_DONE;
+int table_limit = utf8? 16:32;
#if 0
/* ========================================================================= */
@@ -605,12 +712,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
- set_table_bit(start_bits, tcode[1], caseless, cd);
- tcode += 2;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
break;
/* Single-char upto sets the bit and tries the next */
@@ -618,12 +720,7 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
- set_table_bit(start_bits, tcode[3], caseless, cd);
- tcode += 4;
-#ifdef SUPPORT_UTF8
- if (utf8 && tcode[-1] >= 0xc0)
- tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+ tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
break;
/* At least one single char sets the bit and stops */
@@ -636,59 +733,86 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
- set_table_bit(start_bits, tcode[1], caseless, cd);
+ (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
try_next = FALSE;
break;
- /* Single character type sets the bits and stops */
+ /* Special spacing and line-terminating items. These recognize specific
+ lists of characters. The difference between VSPACE and ANYNL is that the
+ latter can match the two-character CRLF sequence, but that is not
+ relevant for finding the first character, so their code here is
+ identical. */
+
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+00A0 */
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ else SET_BIT(0xA0);
+ try_next = FALSE;
+ break;
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
+ try_next = FALSE;
+ break;
+
+ /* Single character types set the bits and stop. Note that if PCRE_UCP
+ is set, we do not see these op codes because \d etc are converted to
+ properties. Therefore, these apply in the case when only characters less
+ than 256 are recognized to match the types. */
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it is set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ not set it from the table. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
try_next = FALSE;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
@@ -697,6 +821,7 @@ do
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
+ case OP_TYPEPOSPLUS:
tcode++;
break;
@@ -720,52 +845,69 @@ do
case OP_TYPEPOSQUERY:
switch(tcode[1])
{
+ default:
case OP_ANY:
case OP_ALLANY:
return SSB_FAIL;
+ case OP_HSPACE:
+ SET_BIT(0x09);
+ SET_BIT(0x20);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+00A0 */
+ SET_BIT(0xE1); /* For U+1680, U+180E */
+ SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
+ SET_BIT(0xE3); /* For U+3000 */
+ }
+ else SET_BIT(0xA0);
+ break;
+
+ case OP_ANYNL:
+ case OP_VSPACE:
+ SET_BIT(0x0A);
+ SET_BIT(0x0B);
+ SET_BIT(0x0C);
+ SET_BIT(0x0D);
+ if (utf8)
+ {
+ SET_BIT(0xC2); /* For U+0085 */
+ SET_BIT(0xE2); /* For U+2028, U+2029 */
+ }
+ else SET_BIT(0x85);
+ break;
+
case OP_NOT_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_digit];
+ set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
break;
case OP_DIGIT:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_digit];
+ set_type_bits(start_bits, cbit_digit, table_limit, cd);
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ ensure it gets set as not whitespace. */
case OP_NOT_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= ~d;
- }
+ set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] |= 0x08;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
- discard it. */
+ avoid setting it. */
case OP_WHITESPACE:
- for (c = 0; c < 32; c++)
- {
- int d = cd->cbits[c+cbit_space];
- if (c == 1) d &= ~0x08;
- start_bits[c] |= d;
- }
+ c = start_bits[1]; /* Save in case it was already set */
+ set_type_bits(start_bits, cbit_space, table_limit, cd);
+ start_bits[1] = (start_bits[1] & ~0x08) | c;
break;
case OP_NOT_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= ~cd->cbits[c+cbit_word];
+ set_nottype_bits(start_bits, cbit_word, table_limit, cd);
break;
case OP_WORDCHAR:
- for (c = 0; c < 32; c++)
- start_bits[c] |= cd->cbits[c+cbit_word];
+ set_type_bits(start_bits, cbit_word, table_limit, cd);
break;
}