summary refs log tree commit diff
diff options
context:
space:
mode:
author: Karl Williamson <khw@cpan.org> 2014-12-18 10:42:30 -0700
committer: Karl Williamson <khw@cpan.org> 2014-12-29 13:52:56 -0700
commit: a4525e789871d3846f20d0ea7d2d239c6a21a5a4 (patch)
tree: 06441d6a5a5038c8ef54a185ce585d54f7c56b3a
parent: e7fd4aa18abbfe0099d4947060c99ca85f42f764 (diff)
download: perl-a4525e789871d3846f20d0ea7d2d239c6a21a5a4.tar.gz
Add regex nodes for locale
These will be used in a future commit to distinguish /l patterns from non-/l ones.
-rw-r--r-- pod/perldebguts.pod 6
-rw-r--r-- regcomp.c 61
-rw-r--r-- regcomp.h 10
-rw-r--r-- regcomp.sym 3
-rw-r--r-- regexec.c 66
-rw-r--r-- regnodes.h 311
6 files changed, 277 insertions, 180 deletions
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index 8b90342453..57fa1f42ab 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -593,6 +593,7 @@ will be lost.
CANY no Match any one byte.
ANYOF sv 1 Match character in (or not in) this class,
single char match only
+ ANYOFL sv 1 Like ANYOF, but /l is in effect
# POSIX Character Classes:
POSIXD none Some [[:class:]] under /d; the FLAGS field
@@ -626,6 +627,7 @@ will be lost.
# Literals
EXACT str Match this string (preceded by length).
+ EXACTL str Like EXACT, but /l is in effect.
EXACTF str Match this non-UTF-8 string (not guaranteed
to be folded) using /id rules (w/len).
EXACTFL str Match this string (not guaranteed to be
@@ -635,9 +637,13 @@ will be lost.
UTF-8) using /iu rules (w/len).
EXACTFA str Match this string (not guaranteed to be
folded) using /iaa rules (w/len).
+
EXACTFU_SS str Match this string (folded iff in UTF-8,
length in folding may change even if not in
UTF-8) using /iu rules (w/len).
+ EXACTFLU8 str Rare circumstances: like EXACTFU, but is
+ under /l, UTF-8, folded, and everything in
+ it is above 255.
EXACTFA_NO_TRIE str Match this string (which is not trie-able;
not guaranteed to be folded) using /iaa
rules (w/len).
diff --git a/regcomp.c b/regcomp.c
index 25d382cd87..905d41bcae 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1879,7 +1879,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
May be the same as tail.
tail : item following the branch sequence
count : words in the sequence
- flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
+ flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
depth : indent depth
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
@@ -2143,10 +2143,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
#endif
switch (flags) {
- case EXACT: break;
+ case EXACT: case EXACTL: break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU: folder = PL_fold_latin1; break;
+ case EXACTFU:
+ case EXACTFLU8: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
}
@@ -2157,7 +2158,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (flags == EXACT)
+ if (flags == EXACT || flags == EXACTL)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -3500,7 +3501,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
- if (OP(scan) != EXACT) {
+ if (OP(scan) != EXACT && OP(scan) != EXACTL) {
U8* s0 = (U8*) STRING(scan);
U8* s = s0;
U8* s_end = s0 + STR_LEN(scan);
@@ -4148,6 +4149,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
EXACTFU | EXACTFU
EXACTFU_SS | EXACTFU
EXACTFA | EXACTFA
+ EXACTL | EXACTL
+ EXACTFLU8 | EXACTFLU8
*/
@@ -4159,7 +4162,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
? EXACTFU \
: ( EXACTFA == (X) ) \
? EXACTFA \
- : 0 )
+ : ( EXACTL == (X) ) \
+ ? EXACTL \
+ : ( EXACTFLU8 == (X) ) \
+ ? EXACTFLU8 \
+ : 0 )
/* don't use tail as the end marker for this traverse */
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
@@ -4475,7 +4482,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
continue;
}
}
- else if (OP(scan) == EXACT) {
+ else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
SSize_t l = STR_LEN(scan);
UV uc;
if (UTF) {
@@ -4593,7 +4600,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
case PLUS:
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
- if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
+ if (OP(next) == EXACT
+ || OP(next) == EXACTL
+ || (flags & SCF_DO_STCLASS))
+ {
mincount = 1;
maxcount = REG_INFTY;
next = regnext(scan);
@@ -5074,6 +5084,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
}
break;
+ case ANYOFL:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
@@ -6986,7 +6997,7 @@ reStudy:
DEBUG_PEEP("first:",first,0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
- if (OP(first) == EXACT)
+ if (OP(first) == EXACT || OP(first) == EXACTL)
NOOP; /* Empty, get anchored substr later. */
else
ri->regstclass = first;
@@ -7336,7 +7347,7 @@ reStudy:
&& OP(regnext(first)) == END)
r->extflags |= RXf_WHITE;
else if ( r->extflags & RXf_SPLIT
- && fop == EXACT
+ && (fop == EXACT || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& OP(regnext(first)) == END )
@@ -11352,7 +11363,9 @@ S_compute_EXACTish(RExC_state_t *pRExC_state)
PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
if (! FOLD) {
- return EXACT;
+ return (LOC)
+ ? EXACTL
+ : EXACT;
}
op = get_regex_charset(RExC_flags);
@@ -11450,7 +11463,9 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
for those. */
&& ! _invlist_contains_cp(PL_utf8_foldable, code_point))
{
- OP(node) = EXACT;
+ OP(node) = (LOC)
+ ? EXACTL
+ : EXACT;
}
}
else if (code_point <= MAX_UTF8_TWO_BYTE) {
@@ -12751,10 +12766,14 @@ tryagain:
* differently depending on UTF8ness of the target string
* (for /u), or depending on locale for /l */
if (maybe_exact) {
- OP(ret) = EXACT;
+ OP(ret) = (LOC)
+ ? EXACTL
+ : EXACT;
}
else if (maybe_exactfu) {
- OP(ret) = EXACTFU;
+ OP(ret) = (LOC)
+ ? EXACTFLU8
+ : EXACTFU;
}
}
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
@@ -13808,7 +13827,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
DEBUG_PARSE("clas");
/* Assume we are going to generate an ANYOF node. */
- ret = reganode(pRExC_state, ANYOF, 0);
+ ret = reganode(pRExC_state,
+ (LOC)
+ ? ANYOFL
+ : ANYOF,
+ 0);
if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
@@ -15287,7 +15310,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
value = start;
if (! FOLD) {
- op = EXACT;
+ op = (LOC)
+ ? EXACTL
+ : EXACT;
}
else if (LOC) {
@@ -16000,10 +16025,12 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
if ( exact ) {
switch (OP(scan)) {
case EXACT:
+ case EXACTL:
case EXACTF:
case EXACTFA_NO_TRIE:
case EXACTFA:
case EXACTFU:
+ case EXACTFLU8:
case EXACTFU_SS:
case EXACTFL:
if( exact == PSEUDO )
@@ -16417,7 +16444,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
SV* bitmap_invlist; /* Will hold what the bit map contains */
- if (flags & ANYOF_LOCALE_FLAGS)
+ if (OP(o) == ANYOFL)
sv_catpvs(sv, "{loc}");
if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
diff --git a/regcomp.h b/regcomp.h
index 049ac43a7e..a11189380a 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -389,7 +389,15 @@ struct regnode_ssc {
* probably better than that commit anyway. But it could be reinstated if we
* need a bit. The LOC flags are only for /l nodes; the reverted commit was
* only for /d, so there are no combinatorial issues. The LOC flag to use is
- * probably the POSIXL one.
+ * probably the POSIXL one. Now that there is an ANYOFL (locale) node, another
+ * option would be to make all of those include the POSIXL data structure,
+ * which would get rid of needing a separate POSIXL flag. But it would
+ * increase the size of all such nodes, so it's probably not as attractive as
+ * having an ANYOF_POSIXL node type. But if we did do it, note that not all 32
+ * bits of that extra space are used, one bit of that could be set aside for
+ * the LOC_FOLD flag, yielding yet another bit. This would require extra code
+ * for masking, so again not the most attractive solution.
+ *
* Several flags are not used in synthetic start class (SSC) nodes, so could be
* shared should new flags be needed for SSCs, like SSC_MATCHES_EMPTY_STRING
* now. */
diff --git a/regcomp.sym b/regcomp.sym
index f2ddab1ae3..c20c5aaad1 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -58,6 +58,7 @@ REG_ANY REG_ANY, no 0 S ; Match any one character (except newline).
SANY REG_ANY, no 0 S ; Match any one character.
CANY REG_ANY, no 0 S ; Match any one byte.
ANYOF ANYOF, sv 1 S ; Match character in (or not in) this class, single char match only
+ANYOFL ANYOF, sv 1 S ; Like ANYOF, but /l is in effect
#* POSIX Character Classes:
# Order of the below is important. See ordering comment above.
@@ -90,6 +91,7 @@ BRANCH BRANCH, node 0 V ; Match this alternative, or the next...
# NOTE: the relative ordering of these types is important; do not change it
EXACT EXACT, str ; Match this string (preceded by length).
+EXACTL EXACT, str ; Like EXACT, but /l is in effect.
EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len).
EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len).
EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
@@ -98,6 +100,7 @@ EXACTFA EXACT, str ; Match this string (not guaranteed to be folded)
# End of important relative ordering.
EXACTFU_SS EXACT, str ; Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len).
+EXACTFLU8 EXACT, str ; Rare circumstances: like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 255.
EXACTFA_NO_TRIE EXACT, str ; Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len).
#*Do nothing types
diff --git a/regexec.c b/regexec.c
index 10343a0f88..4526d23e14 100644
--- a/regexec.c
+++ b/regexec.c
@@ -231,15 +231,15 @@ static const char* const non_utf8_target_but_utf8_required
#if 0
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
- we don't need this definition. */
+ we don't need this definition. XXX These are now out of sync. */
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as it's faster. */
-#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
+#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
@@ -1434,26 +1434,34 @@ Perl_re_intuit_start(pTHX_
#define DECL_TRIE_TYPE(scan) \
const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
- trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
+ trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
+ trie_utf8l, trie_flu8 } \
trie_type = ((scan->flags == EXACT) \
? (utf8_target ? trie_utf8 : trie_plain) \
- : (scan->flags == EXACTFA) \
- ? (utf8_target \
- ? trie_utf8_exactfa_fold \
- : trie_latin_utf8_exactfa_fold) \
- : (utf8_target \
- ? trie_utf8_fold \
- : trie_latin_utf8_fold))
+ : (scan->flags == EXACTL) \
+ ? (utf8_target ? trie_utf8l : trie_plain) \
+ : (scan->flags == EXACTFA) \
+ ? (utf8_target \
+ ? trie_utf8_exactfa_fold \
+ : trie_latin_utf8_exactfa_fold) \
+ : (scan->flags == EXACTFLU8 \
+ ? trie_flu8 \
+ : (utf8_target \
+ ? trie_utf8_fold \
+ : trie_latin_utf8_fold)))
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START { \
STRLEN skiplen; \
U8 flags = FOLD_FLAGS_FULL; \
switch (trie_type) { \
+ case trie_flu8: \
+ goto do_trie_utf8_fold; \
case trie_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
/* FALLTHROUGH */ \
case trie_utf8_fold: \
+ do_trie_utf8_fold: \
if ( foldlen>0 ) { \
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
foldlen -= len; \
@@ -1484,6 +1492,7 @@ STMT_START {
uscan = foldbuf + skiplen; \
} \
break; \
+ case trie_utf8l: \
case trie_utf8: \
uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
break; \
@@ -1743,6 +1752,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
/* We know what class it must start with. */
switch (OP(c)) {
+ case ANYOFL:
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
@@ -1798,6 +1808,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
goto do_exactf_utf8;
+ case EXACTFLU8:
+ if (! utf8_target) { /* All code points in this node require
+ UTF-8 to express. */
+ break;
+ }
+ utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
+ goto do_exactf_utf8;
+
case EXACTFU:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -3652,7 +3670,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
U8 *pat = (U8*)STRING(text_node);
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
- if (OP(text_node) == EXACT) {
+ if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
/* In an exact node, only one thing can be matched, that first
* character. If both the pat and the target are UTF-8, we can just
@@ -4429,6 +4447,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
#undef ST
+ case EXACTL: /* /abc/l */
case EXACT: { /* /abc/ */
char *s = STRING(scan);
ln = STR_LEN(scan);
@@ -4520,6 +4539,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
fold_utf8_flags = FOLDEQ_LOCALE;
goto do_exactf;
+ case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
+ is effectively /u; hence to match, target
+ must be UTF-8. */
+ if (! utf8_target) {
+ sayNO;
+ }
+ fold_utf8_flags = FOLDEQ_S1_ALREADY_FOLDED;
+ goto do_exactf;
+
case EXACTFU_SS: /* /\x{df}/iu */
case EXACTFU: /* /abc/iu */
folder = foldEQ_latin1;
@@ -4665,7 +4693,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
sayNO;
break;
- case ANYOF: /* /[abc]/ */
+ case ANYOFL: /* /[abc]/l */
+ case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
if (utf8_target) {
@@ -7178,6 +7207,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
scan = loceol;
}
break;
+ case EXACTL:
case EXACT:
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
@@ -7259,6 +7289,13 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
utf8_flags = 0;
goto do_exactf;
+ case EXACTFLU8:
+ if (! utf8_target) {
+ break;
+ }
+ utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
+ goto do_exactf;
+
case EXACTFU_SS:
case EXACTFU:
utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -7322,6 +7359,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
}
break;
}
+ case ANYOFL:
case ANYOF:
if (utf8_target) {
while (hardcount < max
@@ -7631,7 +7669,7 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit,
/*
- reginclass - determine if a character falls into a character class
- n is the ANYOF regnode
+ n is the ANYOF-type regnode
p is the target string
p_end points to one byte beyond the end of the target string
utf8_target tells whether p is in UTF-8.
diff --git a/regnodes.h b/regnodes.h
index 41662a05cc..94616a60a6 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 90
-#define REGMATCH_STATE_MAX 130
+#define REGNODE_MAX 93
+#define REGMATCH_STATE_MAX 133
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -31,77 +31,80 @@
#define SANY 17 /* 0x11 Match any one character. */
#define CANY 18 /* 0x12 Match any one byte. */
#define ANYOF 19 /* 0x13 Match character in (or not in) this class, single char match only */
-#define POSIXD 20 /* 0x14 Some [[:class:]] under /d; the FLAGS field gives which one */
-#define POSIXL 21 /* 0x15 Some [[:class:]] under /l; the FLAGS field gives which one */
-#define POSIXU 22 /* 0x16 Some [[:class:]] under /u; the FLAGS field gives which one */
-#define POSIXA 23 /* 0x17 Some [[:class:]] under /a; the FLAGS field gives which one */
-#define NPOSIXD 24 /* 0x18 complement of POSIXD, [[:^class:]] */
-#define NPOSIXL 25 /* 0x19 complement of POSIXL, [[:^class:]] */
-#define NPOSIXU 26 /* 0x1a complement of POSIXU, [[:^class:]] */
-#define NPOSIXA 27 /* 0x1b complement of POSIXA, [[:^class:]] */
-#define CLUMP 28 /* 0x1c Match any extended grapheme cluster sequence */
-#define BRANCH 29 /* 0x1d Match this alternative, or the next... */
-#define EXACT 30 /* 0x1e Match this string (preceded by length). */
-#define EXACTF 31 /* 0x1f Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
-#define EXACTFL 32 /* 0x20 Match this string (not guaranteed to be folded) using /il rules (w/len). */
-#define EXACTFU 33 /* 0x21 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFA 34 /* 0x22 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
-#define EXACTFU_SS 35 /* 0x23 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
-#define EXACTFA_NO_TRIE 36 /* 0x24 Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */
-#define NOTHING 37 /* 0x25 Match empty string. */
-#define TAIL 38 /* 0x26 Match empty string. Can jump here from outside. */
-#define STAR 39 /* 0x27 Match this (simple) thing 0 or more times. */
-#define PLUS 40 /* 0x28 Match this (simple) thing 1 or more times. */
-#define CURLY 41 /* 0x29 Match this simple thing {n,m} times. */
-#define CURLYN 42 /* 0x2a Capture next-after-this simple thing */
-#define CURLYM 43 /* 0x2b Capture this medium-complex thing {n,m} times. */
-#define CURLYX 44 /* 0x2c Match this complex thing {n,m} times. */
-#define WHILEM 45 /* 0x2d Do curly processing and see if rest matches. */
-#define OPEN 46 /* 0x2e Mark this point in input as start of #n. */
-#define CLOSE 47 /* 0x2f Analogous to OPEN. */
-#define REF 48 /* 0x30 Match some already matched string */
-#define REFF 49 /* 0x31 Match already matched string, folded using native charset rules for non-utf8 */
-#define REFFL 50 /* 0x32 Match already matched string, folded in loc. */
-#define REFFU 51 /* 0x33 Match already matched string, folded using unicode rules for non-utf8 */
-#define REFFA 52 /* 0x34 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
-#define NREF 53 /* 0x35 Match some already matched string */
-#define NREFF 54 /* 0x36 Match already matched string, folded using native charset rules for non-utf8 */
-#define NREFFL 55 /* 0x37 Match already matched string, folded in loc. */
-#define NREFFU 56 /* 0x38 Match already matched string, folded using unicode rules for non-utf8 */
-#define NREFFA 57 /* 0x39 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
-#define LONGJMP 58 /* 0x3a Jump far away. */
-#define BRANCHJ 59 /* 0x3b BRANCH with long offset. */
-#define IFMATCH 60 /* 0x3c Succeeds if the following matches. */
-#define UNLESSM 61 /* 0x3d Fails if the following matches. */
-#define SUSPEND 62 /* 0x3e "Independent" sub-RE. */
-#define IFTHEN 63 /* 0x3f Switch, should be preceded by switcher. */
-#define GROUPP 64 /* 0x40 Whether the group matched. */
-#define EVAL 65 /* 0x41 Execute some Perl code. */
-#define MINMOD 66 /* 0x42 Next operator is not greedy. */
-#define LOGICAL 67 /* 0x43 Next opcode should set the flag only. */
-#define RENUM 68 /* 0x44 Group with independently numbered parens. */
-#define TRIE 69 /* 0x45 Match many EXACT(F[ALU]?)? at once. flags==type */
-#define TRIEC 70 /* 0x46 Same as TRIE, but with embedded charclass data */
-#define AHOCORASICK 71 /* 0x47 Aho Corasick stclass. flags==type */
-#define AHOCORASICKC 72 /* 0x48 Same as AHOCORASICK, but with embedded charclass data */
-#define GOSUB 73 /* 0x49 recurse to paren arg1 at (signed) ofs arg2 */
-#define GOSTART 74 /* 0x4a recurse to start of pattern */
-#define NGROUPP 75 /* 0x4b Whether the group matched. */
-#define INSUBP 76 /* 0x4c Whether we are in a specific recurse. */
-#define DEFINEP 77 /* 0x4d Never execute directly. */
-#define ENDLIKE 78 /* 0x4e Used only for the type field of verbs */
-#define OPFAIL 79 /* 0x4f Same as (?!) */
-#define ACCEPT 80 /* 0x50 Accepts the current matched string. */
-#define VERB 81 /* 0x51 Used only for the type field of verbs */
-#define PRUNE 82 /* 0x52 Pattern fails at this startpoint if no-backtracking through this */
-#define MARKPOINT 83 /* 0x53 Push the current location for rollback by cut. */
-#define SKIP 84 /* 0x54 On failure skip forward (to the mark) before retrying */
-#define COMMIT 85 /* 0x55 Pattern fails outright if backtracking through this */
-#define CUTGROUP 86 /* 0x56 On failure go to the next alternation in the group */
-#define KEEPS 87 /* 0x57 $& begins here. */
-#define LNBREAK 88 /* 0x58 generic newline pattern */
-#define OPTIMIZED 89 /* 0x59 Placeholder for dump. */
-#define PSEUDO 90 /* 0x5a Pseudo opcode for internal use. */
+#define ANYOFL 20 /* 0x14 Like ANYOF, but /l is in effect */
+#define POSIXD 21 /* 0x15 Some [[:class:]] under /d; the FLAGS field gives which one */
+#define POSIXL 22 /* 0x16 Some [[:class:]] under /l; the FLAGS field gives which one */
+#define POSIXU 23 /* 0x17 Some [[:class:]] under /u; the FLAGS field gives which one */
+#define POSIXA 24 /* 0x18 Some [[:class:]] under /a; the FLAGS field gives which one */
+#define NPOSIXD 25 /* 0x19 complement of POSIXD, [[:^class:]] */
+#define NPOSIXL 26 /* 0x1a complement of POSIXL, [[:^class:]] */
+#define NPOSIXU 27 /* 0x1b complement of POSIXU, [[:^class:]] */
+#define NPOSIXA 28 /* 0x1c complement of POSIXA, [[:^class:]] */
+#define CLUMP 29 /* 0x1d Match any extended grapheme cluster sequence */
+#define BRANCH 30 /* 0x1e Match this alternative, or the next... */
+#define EXACT 31 /* 0x1f Match this string (preceded by length). */
+#define EXACTL 32 /* 0x20 Like EXACT, but /l is in effect. */
+#define EXACTF 33 /* 0x21 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */
+#define EXACTFL 34 /* 0x22 Match this string (not guaranteed to be folded) using /il rules (w/len). */
+#define EXACTFU 35 /* 0x23 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFA 36 /* 0x24 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */
+#define EXACTFU_SS 37 /* 0x25 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */
+#define EXACTFLU8 38 /* 0x26 Rare circumstances: like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 255. */
+#define EXACTFA_NO_TRIE 39 /* 0x27 Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */
+#define NOTHING 40 /* 0x28 Match empty string. */
+#define TAIL 41 /* 0x29 Match empty string. Can jump here from outside. */
+#define STAR 42 /* 0x2a Match this (simple) thing 0 or more times. */
+#define PLUS 43 /* 0x2b Match this (simple) thing 1 or more times. */
+#define CURLY 44 /* 0x2c Match this simple thing {n,m} times. */
+#define CURLYN 45 /* 0x2d Capture next-after-this simple thing */
+#define CURLYM 46 /* 0x2e Capture this medium-complex thing {n,m} times. */
+#define CURLYX 47 /* 0x2f Match this complex thing {n,m} times. */
+#define WHILEM 48 /* 0x30 Do curly processing and see if rest matches. */
+#define OPEN 49 /* 0x31 Mark this point in input as start of #n. */
+#define CLOSE 50 /* 0x32 Analogous to OPEN. */
+#define REF 51 /* 0x33 Match some already matched string */
+#define REFF 52 /* 0x34 Match already matched string, folded using native charset rules for non-utf8 */
+#define REFFL 53 /* 0x35 Match already matched string, folded in loc. */
+#define REFFU 54 /* 0x36 Match already matched string, folded using unicode rules for non-utf8 */
+#define REFFA 55 /* 0x37 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
+#define NREF 56 /* 0x38 Match some already matched string */
+#define NREFF 57 /* 0x39 Match already matched string, folded using native charset rules for non-utf8 */
+#define NREFFL 58 /* 0x3a Match already matched string, folded in loc. */
+#define NREFFU 59 /* 0x3b Match already matched string, folded using unicode rules for non-utf8 */
+#define NREFFA 60 /* 0x3c Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */
+#define LONGJMP 61 /* 0x3d Jump far away. */
+#define BRANCHJ 62 /* 0x3e BRANCH with long offset. */
+#define IFMATCH 63 /* 0x3f Succeeds if the following matches. */
+#define UNLESSM 64 /* 0x40 Fails if the following matches. */
+#define SUSPEND 65 /* 0x41 "Independent" sub-RE. */
+#define IFTHEN 66 /* 0x42 Switch, should be preceded by switcher. */
+#define GROUPP 67 /* 0x43 Whether the group matched. */
+#define EVAL 68 /* 0x44 Execute some Perl code. */
+#define MINMOD 69 /* 0x45 Next operator is not greedy. */
+#define LOGICAL 70 /* 0x46 Next opcode should set the flag only. */
+#define RENUM 71 /* 0x47 Group with independently numbered parens. */
+#define TRIE 72 /* 0x48 Match many EXACT(F[ALU]?)? at once. flags==type */
+#define TRIEC 73 /* 0x49 Same as TRIE, but with embedded charclass data */
+#define AHOCORASICK 74 /* 0x4a Aho Corasick stclass. flags==type */
+#define AHOCORASICKC 75 /* 0x4b Same as AHOCORASICK, but with embedded charclass data */
+#define GOSUB 76 /* 0x4c recurse to paren arg1 at (signed) ofs arg2 */
+#define GOSTART 77 /* 0x4d recurse to start of pattern */
+#define NGROUPP 78 /* 0x4e Whether the group matched. */
+#define INSUBP 79 /* 0x4f Whether we are in a specific recurse. */
+#define DEFINEP 80 /* 0x50 Never execute directly. */
+#define ENDLIKE 81 /* 0x51 Used only for the type field of verbs */
+#define OPFAIL 82 /* 0x52 Same as (?!) */
+#define ACCEPT 83 /* 0x53 Accepts the current matched string. */
+#define VERB 84 /* 0x54 Used only for the type field of verbs */
+#define PRUNE 85 /* 0x55 Pattern fails at this startpoint if no-backtracking through this */
+#define MARKPOINT 86 /* 0x56 Push the current location for rollback by cut. */
+#define SKIP 87 /* 0x57 On failure skip forward (to the mark) before retrying */
+#define COMMIT 88 /* 0x58 Pattern fails outright if backtracking through this */
+#define CUTGROUP 89 /* 0x59 On failure go to the next alternation in the group */
+#define KEEPS 90 /* 0x5a $& begins here. */
+#define LNBREAK 91 /* 0x5b generic newline pattern */
+#define OPTIMIZED 92 /* 0x5c Placeholder for dump. */
+#define PSEUDO 93 /* 0x5d Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
@@ -170,6 +173,7 @@ EXTCONST U8 PL_regkind[] = {
REG_ANY, /* SANY */
REG_ANY, /* CANY */
ANYOF, /* ANYOF */
+ ANYOF, /* ANYOFL */
POSIXD, /* POSIXD */
POSIXD, /* POSIXL */
POSIXD, /* POSIXU */
@@ -181,11 +185,13 @@ EXTCONST U8 PL_regkind[] = {
CLUMP, /* CLUMP */
BRANCH, /* BRANCH */
EXACT, /* EXACT */
+ EXACT, /* EXACTL */
EXACT, /* EXACTF */
EXACT, /* EXACTFL */
EXACT, /* EXACTFU */
EXACT, /* EXACTFA */
EXACT, /* EXACTFU_SS */
+ EXACT, /* EXACTFLU8 */
EXACT, /* EXACTFA_NO_TRIE */
NOTHING, /* NOTHING */
NOTHING, /* TAIL */
@@ -309,6 +315,7 @@ static const U8 regarglen[] = {
0, /* SANY */
0, /* CANY */
EXTRA_SIZE(struct regnode_1), /* ANYOF */
+ EXTRA_SIZE(struct regnode_1), /* ANYOFL */
0, /* POSIXD */
0, /* POSIXL */
0, /* POSIXU */
@@ -320,11 +327,13 @@ static const U8 regarglen[] = {
0, /* CLUMP */
0, /* BRANCH */
0, /* EXACT */
+ 0, /* EXACTL */
0, /* EXACTF */
0, /* EXACTFL */
0, /* EXACTFU */
0, /* EXACTFA */
0, /* EXACTFU_SS */
+ 0, /* EXACTFLU8 */
0, /* EXACTFA_NO_TRIE */
0, /* NOTHING */
0, /* TAIL */
@@ -405,6 +414,7 @@ static const char reg_off_by_arg[] = {
0, /* SANY */
0, /* CANY */
0, /* ANYOF */
+ 0, /* ANYOFL */
0, /* POSIXD */
0, /* POSIXL */
0, /* POSIXU */
@@ -416,11 +426,13 @@ static const char reg_off_by_arg[] = {
0, /* CLUMP */
0, /* BRANCH */
0, /* EXACT */
+ 0, /* EXACTL */
0, /* EXACTF */
0, /* EXACTFL */
0, /* EXACTFU */
0, /* EXACTFA */
0, /* EXACTFU_SS */
+ 0, /* EXACTFLU8 */
0, /* EXACTFA_NO_TRIE */
0, /* NOTHING */
0, /* TAIL */
@@ -506,77 +518,80 @@ EXTCONST char * const PL_reg_name[] = {
"SANY", /* 0x11 */
"CANY", /* 0x12 */
"ANYOF", /* 0x13 */
- "POSIXD", /* 0x14 */
- "POSIXL", /* 0x15 */
- "POSIXU", /* 0x16 */
- "POSIXA", /* 0x17 */
- "NPOSIXD", /* 0x18 */
- "NPOSIXL", /* 0x19 */
- "NPOSIXU", /* 0x1a */
- "NPOSIXA", /* 0x1b */
- "CLUMP", /* 0x1c */
- "BRANCH", /* 0x1d */
- "EXACT", /* 0x1e */
- "EXACTF", /* 0x1f */
- "EXACTFL", /* 0x20 */
- "EXACTFU", /* 0x21 */
- "EXACTFA", /* 0x22 */
- "EXACTFU_SS", /* 0x23 */
- "EXACTFA_NO_TRIE", /* 0x24 */
- "NOTHING", /* 0x25 */
- "TAIL", /* 0x26 */
- "STAR", /* 0x27 */
- "PLUS", /* 0x28 */
- "CURLY", /* 0x29 */
- "CURLYN", /* 0x2a */
- "CURLYM", /* 0x2b */
- "CURLYX", /* 0x2c */
- "WHILEM", /* 0x2d */
- "OPEN", /* 0x2e */
- "CLOSE", /* 0x2f */
- "REF", /* 0x30 */
- "REFF", /* 0x31 */
- "REFFL", /* 0x32 */
- "REFFU", /* 0x33 */
- "REFFA", /* 0x34 */
- "NREF", /* 0x35 */
- "NREFF", /* 0x36 */
- "NREFFL", /* 0x37 */
- "NREFFU", /* 0x38 */
- "NREFFA", /* 0x39 */
- "LONGJMP", /* 0x3a */
- "BRANCHJ", /* 0x3b */
- "IFMATCH", /* 0x3c */
- "UNLESSM", /* 0x3d */
- "SUSPEND", /* 0x3e */
- "IFTHEN", /* 0x3f */
- "GROUPP", /* 0x40 */
- "EVAL", /* 0x41 */
- "MINMOD", /* 0x42 */
- "LOGICAL", /* 0x43 */
- "RENUM", /* 0x44 */
- "TRIE", /* 0x45 */
- "TRIEC", /* 0x46 */
- "AHOCORASICK", /* 0x47 */
- "AHOCORASICKC", /* 0x48 */
- "GOSUB", /* 0x49 */
- "GOSTART", /* 0x4a */
- "NGROUPP", /* 0x4b */
- "INSUBP", /* 0x4c */
- "DEFINEP", /* 0x4d */
- "ENDLIKE", /* 0x4e */
- "OPFAIL", /* 0x4f */
- "ACCEPT", /* 0x50 */
- "VERB", /* 0x51 */
- "PRUNE", /* 0x52 */
- "MARKPOINT", /* 0x53 */
- "SKIP", /* 0x54 */
- "COMMIT", /* 0x55 */
- "CUTGROUP", /* 0x56 */
- "KEEPS", /* 0x57 */
- "LNBREAK", /* 0x58 */
- "OPTIMIZED", /* 0x59 */
- "PSEUDO", /* 0x5a */
+ "ANYOFL", /* 0x14 */
+ "POSIXD", /* 0x15 */
+ "POSIXL", /* 0x16 */
+ "POSIXU", /* 0x17 */
+ "POSIXA", /* 0x18 */
+ "NPOSIXD", /* 0x19 */
+ "NPOSIXL", /* 0x1a */
+ "NPOSIXU", /* 0x1b */
+ "NPOSIXA", /* 0x1c */
+ "CLUMP", /* 0x1d */
+ "BRANCH", /* 0x1e */
+ "EXACT", /* 0x1f */
+ "EXACTL", /* 0x20 */
+ "EXACTF", /* 0x21 */
+ "EXACTFL", /* 0x22 */
+ "EXACTFU", /* 0x23 */
+ "EXACTFA", /* 0x24 */
+ "EXACTFU_SS", /* 0x25 */
+ "EXACTFLU8", /* 0x26 */
+ "EXACTFA_NO_TRIE", /* 0x27 */
+ "NOTHING", /* 0x28 */
+ "TAIL", /* 0x29 */
+ "STAR", /* 0x2a */
+ "PLUS", /* 0x2b */
+ "CURLY", /* 0x2c */
+ "CURLYN", /* 0x2d */
+ "CURLYM", /* 0x2e */
+ "CURLYX", /* 0x2f */
+ "WHILEM", /* 0x30 */
+ "OPEN", /* 0x31 */
+ "CLOSE", /* 0x32 */
+ "REF", /* 0x33 */
+ "REFF", /* 0x34 */
+ "REFFL", /* 0x35 */
+ "REFFU", /* 0x36 */
+ "REFFA", /* 0x37 */
+ "NREF", /* 0x38 */
+ "NREFF", /* 0x39 */
+ "NREFFL", /* 0x3a */
+ "NREFFU", /* 0x3b */
+ "NREFFA", /* 0x3c */
+ "LONGJMP", /* 0x3d */
+ "BRANCHJ", /* 0x3e */
+ "IFMATCH", /* 0x3f */
+ "UNLESSM", /* 0x40 */
+ "SUSPEND", /* 0x41 */
+ "IFTHEN", /* 0x42 */
+ "GROUPP", /* 0x43 */
+ "EVAL", /* 0x44 */
+ "MINMOD", /* 0x45 */
+ "LOGICAL", /* 0x46 */
+ "RENUM", /* 0x47 */
+ "TRIE", /* 0x48 */
+ "TRIEC", /* 0x49 */
+ "AHOCORASICK", /* 0x4a */
+ "AHOCORASICKC", /* 0x4b */
+ "GOSUB", /* 0x4c */
+ "GOSTART", /* 0x4d */
+ "NGROUPP", /* 0x4e */
+ "INSUBP", /* 0x4f */
+ "DEFINEP", /* 0x50 */
+ "ENDLIKE", /* 0x51 */
+ "OPFAIL", /* 0x52 */
+ "ACCEPT", /* 0x53 */
+ "VERB", /* 0x54 */
+ "PRUNE", /* 0x55 */
+ "MARKPOINT", /* 0x56 */
+ "SKIP", /* 0x57 */
+ "COMMIT", /* 0x58 */
+ "CUTGROUP", /* 0x59 */
+ "KEEPS", /* 0x5a */
+ "LNBREAK", /* 0x5b */
+ "OPTIMIZED", /* 0x5c */
+ "PSEUDO", /* 0x5d */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
@@ -711,7 +726,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
EXTCONST U8 PL_varies_bitmask[];
#else
EXTCONST U8 PL_varies_bitmask[] = {
- 0x00, 0x00, 0x00, 0x30, 0x80, 0x3F, 0xFF, 0xCB, 0x00, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0x00, 0x60, 0x00, 0xFC, 0xF9, 0x5F, 0x06, 0x00, 0x00, 0x00
};
#endif /* DOINIT */
@@ -723,8 +738,8 @@ EXTCONST U8 PL_varies_bitmask[] = {
EXTCONST U8 PL_simple[] __attribute__deprecated__;
#else
EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
- REG_ANY, SANY, CANY, ANYOF, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD,
- NPOSIXL, NPOSIXU, NPOSIXA,
+ REG_ANY, SANY, CANY, ANYOF, ANYOFL, POSIXD, POSIXL, POSIXU, POSIXA,
+ NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA,
0
};
#endif /* DOINIT */
@@ -733,7 +748,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
EXTCONST U8 PL_simple_bitmask[];
#else
EXTCONST U8 PL_simple_bitmask[] = {
- 0x00, 0x00, 0xFF, 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ 0x00, 0x00, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
#endif /* DOINIT */