summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-02-17 15:03:32 -0700
committerKarl Williamson <khw@cpan.org>2015-02-19 22:55:01 -0700
commit64935bc6975bb01af403817752e88d6540c8711d (patch)
tree6a1619ac46d1501a5b296a2b69d9af0b98db8c58 /regcomp.c
parent0e0b935601a8b7a2c56653412a94a36f986bc34f (diff)
downloadperl-64935bc6975bb01af403817752e88d6540c8711d.tar.gz
Add qr/\b{gcb}/
A function implements seeing if the space between any two characters is a grapheme cluster break. Afer I wrote this, I realized that an array lookup might be a better implementation, but the deadline for v5.22 was too close to change it. I did see that my gcc optimized it down to an array lookup. This makes the implementation of \X go from being complicated to trivial.
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c96
1 files changed, 83 insertions, 13 deletions
diff --git a/regcomp.c b/regcomp.c
index 82be6417d7..80c937786c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,7 +87,6 @@ EXTERN_C const struct regexp_engine my_reg_engine;
#endif
#include "dquote_static.c"
-#include "charclass_invlists.h"
#include "inline_invlist.c"
#include "unicode_constants.h"
@@ -11772,27 +11771,90 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
invert = 1;
/* FALLTHROUGH */
case 'b':
+ {
+ regex_charset charset = get_regex_charset(RExC_flags);
+
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = BOUND + get_regex_charset(RExC_flags);
- if (op > BOUNDA) { /* /aa is same as /a */
- op = BOUNDA;
- }
- else if (op == BOUNDL) {
- RExC_contains_locale = 1;
- }
+ op = BOUND + charset;
- if (invert) {
- op += NBOUND - BOUND;
+ if (op == BOUNDL) {
+ RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
*flagp |= SIMPLE;
- if ((U8) *(RExC_parse + 1) == '{') {
- /* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
+ if (*(RExC_parse + 1) != '{') {
+ FLAGS(ret) = TRADITIONAL_BOUND;
+ if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDA;
+ }
+ }
+ else {
+ STRLEN length;
+ char name = *RExC_parse;
+ char * endbrace;
+ RExC_parse += 2;
+ endbrace = strchr(RExC_parse, '}');
+
+ if (! endbrace) {
+ vFAIL2("Missing right brace on \\%c{}", name);
+ }
+ /* XXX Need to decide whether to take spaces or not. Should be
+ * consistent with \p{}, but that currently is SPACE, which
+ * means vertical too, which seems wrong
+ * while (isBLANK(*RExC_parse)) {
+ RExC_parse++;
+ }*/
+ if (endbrace == RExC_parse) {
+ RExC_parse++; /* After the '}' */
+ vFAIL2("Empty \\%c{}", name);
+ }
+ length = endbrace - RExC_parse;
+ /*while (isBLANK(*(RExC_parse + length - 1))) {
+ length--;
+ }*/
+ switch (*RExC_parse) {
+ case 'g':
+ if (length != 1
+ && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+ {
+ goto bad_bound_type;
+ }
+ FLAGS(ret) = GCB_BOUND;
+ break;
+ default:
+ bad_bound_type:
+ RExC_parse = endbrace;
+ vFAIL2utf8f(
+ "'%"UTF8f"' is an unknown bound type",
+ UTF8fARG(UTF, length, endbrace - length));
+ NOT_REACHED; /*NOTREACHED*/
+ }
+ RExC_parse = endbrace;
+ RExC_uni_semantics = 1;
+
+ if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDU;
+ length += 4;
+
+ /* Don't have to worry about UTF-8, in this message because
+ * to get here the contents of the \b must be ASCII */
+ ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
+ "Using /u for '%.*s' instead of /%s",
+ (unsigned) length,
+ endbrace - length + 1,
+ (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+ ? ASCII_RESTRICT_PAT_MODS
+ : ASCII_MORE_RESTRICT_PAT_MODS);
+ }
}
+
+ if (PASS2 && invert) {
+ OP(ret) += NBOUND - BOUND;
+ }
goto finish_meta_pat;
+ }
case 'D':
invert = 1;
@@ -16735,6 +16797,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
}
}
+ else if (k == BOUND || k == NBOUND) {
+ /* Must be synced with order of 'bound_type' in regcomp.h */
+ const char * const bounds[] = {
+ "", /* Traditional */
+ "{gcb}"
+ };
+ sv_catpv(sv, bounds[FLAGS(o)]);
+ }
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
else if (OP(o) == SBOL)