summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--embedvar.h1
-rw-r--r--intrpvar.h1
-rw-r--r--lib/unicore/mktables47
-rw-r--r--perl.c3
-rw-r--r--perl.h1
-rw-r--r--pod/perlcheat.pod2
-rw-r--r--pod/perldebguts.pod26
-rw-r--r--pod/perldelta.pod9
-rw-r--r--pod/perldiag.pod24
-rw-r--r--pod/perlre.pod12
-rw-r--r--pod/perlrebackslash.pod39
-rw-r--r--pod/perlreref.pod2
-rw-r--r--pod/perlunicode.pod8
-rw-r--r--proto.h3
-rw-r--r--regcomp.c96
-rw-r--r--regcomp.h5
-rw-r--r--regcomp.sym16
-rw-r--r--regexec.c500
-rw-r--r--regnodes.h16
-rw-r--r--sv.c1
-rw-r--r--t/lib/warnings/regexec15
-rw-r--r--t/re/reg_mesg.t17
-rw-r--r--utf8.c1
25 files changed, 587 insertions, 260 deletions
diff --git a/embed.fnc b/embed.fnc
index cfe634ff5d..b74beae280 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2276,6 +2276,7 @@ Es |void |to_utf8_substr |NN regexp * prog
Es |bool |to_byte_substr |NN regexp * prog
ERsn |I32 |reg_check_named_buff_matched |NN const regexp *rex \
|NN const regnode *scan
+EsnR |bool |isGCB |const PL_GCB_enum before|const PL_GCB_enum after
# ifdef DEBUGGING
Es |void |dump_exec_pos |NN const char *locinput|NN const regnode *scan|NN const char *loc_regeol\
|NN const char *loc_bostr|NN const char *loc_reg_starttry|const bool do_utf8
diff --git a/embed.h b/embed.h
index 802b624598..ea5b7e92f3 100644
--- a/embed.h
+++ b/embed.h
@@ -1054,6 +1054,7 @@
#define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e)
#define isFOO_lc(a,b) S_isFOO_lc(aTHX_ a,b)
#define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b)
+#define isGCB S_isGCB
#define reg_check_named_buff_matched S_reg_check_named_buff_matched
#define regcppop(a,b) S_regcppop(aTHX_ a,b)
#define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c)
diff --git a/embedvar.h b/embedvar.h
index da3c331634..dde2340c73 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -53,6 +53,7 @@
#define PL_DBtrace (vTHX->IDBtrace)
#define PL_Dir (vTHX->IDir)
#define PL_Env (vTHX->IEnv)
+#define PL_GCB_invlist (vTHX->IGCB_invlist)
#define PL_HasMultiCharFold (vTHX->IHasMultiCharFold)
#define PL_InBitmap (vTHX->IInBitmap)
#define PL_LIO (vTHX->ILIO)
diff --git a/intrpvar.h b/intrpvar.h
index b88f6dfb0b..dc44b31cd7 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -610,6 +610,7 @@ PERLVAR(I, utf8_charname_continue, SV *)
PERLVARA(I, utf8_swash_ptrs, POSIX_SWASH_COUNT, SV *)
PERLVARA(I, Posix_ptrs, POSIX_CC_COUNT, SV *)
PERLVARA(I, XPosix_ptrs, POSIX_CC_COUNT, SV *)
+PERLVAR(I, GCB_invlist, SV *)
PERLVAR(I, last_swash_hv, HV *)
PERLVAR(I, last_swash_tmps, U8 *)
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 2da7bb3416..511ad020ee 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -18762,6 +18762,7 @@ sub _test_break($$) {
my @should_match = map { eval "\"$_\"" } @should_display;
# If a string can be represented in both non-ut8 and utf8, test both cases
+ my $display_upgrade = "";
UPGRADE:
for my $to_upgrade (0 .. 1) {
@@ -18771,8 +18772,54 @@ sub _test_break($$) {
next UPGRADE if utf8::is_utf8($string);
utf8::upgrade($string);
+ $display_upgrade = " (utf8-upgraded)";
+ }
+
+ # The /l modifier has C after it to indicate the locale to try
+ my @modifiers = qw(a aa d lC u i);
+ push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+ # Test for each of the regex modifiers.
+ for my $modifier (@modifiers) {
+ my $display_locale = "";
+
+ # For /l, set the locale to what it says to.
+ if ($modifier =~ / ^ l (.*) /x) {
+ my $locale = $1;
+ $display_locale = "(locale = $locale)";
+ use Config;
+ if (defined $Config{d_setlocale}) {
+ eval { require POSIX; import POSIX 'locale_h'; };
+ if (defined &POSIX::LC_CTYPE) {
+ POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
+ }
+ }
+ $modifier = 'l';
+ }
+
+ no warnings qw(locale regexp surrogate);
+ my $pattern = "(?$modifier:$break_pattern)";
+
+ # Actually do the test
+ my $matched = $string =~ qr/$pattern/;
+ print "not " unless $matched;
+
+ # Fancy display of test results
+ $matched = ($matched) ? "matched" : "failed to match";
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+
+ # Repeat with the first \B{} in the pattern. This makes sure the
+ # code in regexec.c:find_byclass() for \B gets executed
+ if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
+ my $B_pattern = "$1$2";
+ $matched = $string =~ qr/$B_pattern/;
+ print "not " unless $matched;
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+ }
}
+ next if $break_type ne 'gcb';
+
# Finally, do the \X match.
my @matches = $string =~ /(\X)/g;
diff --git a/perl.c b/perl.c
index cda99ff700..27338cbd3f 100644
--- a/perl.c
+++ b/perl.c
@@ -33,7 +33,6 @@
#include "perl.h"
#include "patchlevel.h" /* for local_patches */
#include "XSUB.h"
-#include "charclass_invlists.h"
#ifdef NETWARE
#include "nwutil.h"
@@ -391,6 +390,7 @@ perl_construct(pTHXx)
PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(XPosixWord_invlist);
PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
+ PL_GCB_invlist = _new_invlist_C_array(Grapheme_Cluster_Break_invlist);
ENTER;
}
@@ -1060,6 +1060,7 @@ perl_destruct(pTHXx)
SvREFCNT_dec(PL_XPosix_ptrs[i]);
PL_XPosix_ptrs[i] = NULL;
}
+ PL_GCB_invlist = NULL;
if (!specialWARN(PL_compiling.cop_warnings))
PerlMemShared_free(PL_compiling.cop_warnings);
diff --git a/perl.h b/perl.h
index 9976f86bdd..b3b77ba121 100644
--- a/perl.h
+++ b/perl.h
@@ -2685,6 +2685,7 @@ typedef struct padname PADNAME;
#endif
#include "handy.h"
+#include "charclass_invlists.h"
#if defined(USE_LARGE_FILES) && !defined(NO_64_BIT_RAWIO)
# if LSEEKSIZE == 8 && !defined(USE_64_BIT_RAWIO)
diff --git a/pod/perlcheat.pod b/pod/perlcheat.pod
index f288692a87..6e4e919ff5 100644
--- a/pod/perlcheat.pod
+++ b/pod/perlcheat.pod
@@ -46,7 +46,7 @@ already be overwhelming.
, => /a ASCII /aa safe {3,7} repeat in range
list ops /l locale /d dual | alternation
not /u Unicode [] character class
- and /e evaluate /ee rpts \b word boundary
+ and /e evaluate /ee rpts \b boundary
or xor /g global \z string end
/o compile pat once () capture
DEBUG (?:p) no capture
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod
index 57fa1f42ab..591e69bd12 100644
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -573,19 +573,23 @@ will be lost.
# Word Boundary Opcodes:
BOUND no Match "" at any word boundary using native
- charset rules for non-utf8
- BOUNDL no Match "" at any locale word boundary
- BOUNDU no Match "" at any word boundary using Unicode
- rules
- BOUNDA no Match "" at any word boundary using ASCII
- rules
+ charset rules for non-utf8, otherwise
+ Unicode rules
+ BOUNDL no Match "" at any boundary of a given type
+ using locale rules
+ BOUNDU no Match "" at any boundary of a given type
+ using Unicode rules
+ BOUNDA no Match "" at any boundary of a given type
+ using ASCII rules
NBOUND no Match "" at any word non-boundary using
- native charset rules for non-utf8
- NBOUNDL no Match "" at any locale word non-boundary
- NBOUNDU no Match "" at any word non-boundary using
+ native charset rules for non-utf8, otherwise
Unicode rules
- NBOUNDA no Match "" at any word non-boundary using
- ASCII rules
+ NBOUNDL no Match "" at any boundary of a given type
+ using locale rules
+ NBOUNDU no Match "" at any boundary of a given type
+ using using Unicode rules
+ NBOUNDA no Match "" at any boundary of a given type
+ using using ASCII rules
# [Special] alternatives:
REG_ANY no Match any one character (except newline).
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 5a80e745d9..5db41e2bdc 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -25,7 +25,14 @@ XXX New core language features go here. Summarize user-visible core language
enhancements. Particularly prominent performance optimisations could go
here, but most should go in the L</Performance Enhancements> section.
-[ List each enhancement as a =head2 entry ]
+=head2 qr/\b{gcb}/ is now handled in regular expressions
+
+C<gcb> stands for Grapheme Cluster Boundary. It is a Unicode property
+that finds the boundary between sequences of characters that look like a
+single character to a native speaker of a language. Perl has long had
+the ability to deal with these through the C<\X> regular escape
+sequence. Now, there is an alternative way of handling these. See
+L<perlrebackslash/\b{}, \b, \B{}, \B> for details.
=head1 Security
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 877b992270..7db5b5414d 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -2894,6 +2894,12 @@ with 'useperlio'.
(F) Your machine doesn't implement the sockatmark() functionality,
neither as a system call nor an ioctl call (SIOCATMARK).
+=item '%s' is an unknown bound type in regex; marked by <-- HERE in m/%s/
+
+(F) You used C<\b{...}> or C<\B{...}> and the C<...> is not known to
+Perl. The current valid ones are given in
+L<perlrebackslash/\b{}, \b, \B{}, \B>.
+
=item "%s" is more clearly written simply as "%s" in regex; marked by <-- HERE in m/%s/
(W regexp) (only under C<S<use re 'strict'>> or within C<(?[...])>)
@@ -6638,6 +6644,15 @@ is deprecated. See L<perlvar/"$[">.
form if you wish to use an empty line as the terminator of the
here-document.
+=item Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale
+
+(W locale) You are matching a regular expression using locale rules,
+and a Unicode boundary is being matched, but the locale is not a Unicode
+one. This doesn't make sense. Perl will continue, assuming a Unicode
+(UTF-8) locale, but the results could well be wrong except if the locale
+happens to be ISO-8859-1 (Latin1) where this message is spurious and can
+be ignored.
+
=item Use of chdir('') or chdir(undef) as chdir() deprecated
(D deprecated) chdir() with no arguments is documented to change to
@@ -6859,6 +6874,15 @@ a range. For these, what should happen isn't clear at all. In
these circumstances, Perl discards all but the first character
of the returned sequence, which is not likely what you want.
+=item Using /u for '%s' instead of /%s in regex; marked by <-- HERE in m/%s/
+
+(W regexp) You used a Unicode boundary (C<\b{...}> or C<\B{...}>) in a
+portion of a regular expression where the character set modifiers C</a>
+or C</aa> are in effect. These two modifiers indicate an ASCII
+interpretation, and this doesn't make sense for a Unicode definiton.
+The generated regular expression will compile so that the boundary uses
+all of Unicode. No other portion of the regular expression is affected.
+
=item Using !~ with %s doesn't make sense
(F) Using the C<!~> operator with C<s///r>, C<tr///r> or C<y///r> is
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 4231e99591..90858b1157 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -388,6 +388,10 @@ the pattern uses a Unicode property (C<\p{...}> or C<\P{...}>); or
=item 6
+the pattern uses a Unicode break (C<\b{...}> or C<\B{...}>); or
+
+=item 7
+
the pattern uses L</C<(?[ ])>>
=back
@@ -770,6 +774,8 @@ X<regexp, zero-width assertion>
X<regular expression, zero-width assertion>
X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G>
+ \b{} Match at Unicode boundary of specified type
+ \B{} Match where corresponding \b{} doesn't match
\b Match a word boundary
\B Match except at a word boundary
\A Match only at beginning of string
@@ -778,6 +784,12 @@ X<\b> X<\B> X<\A> X<\Z> X<\z> X<\G>
\G Match only at pos() (e.g. at the end-of-match position
of prior m//g)
+A Unicode boundary (C<\b{}>), available starting in v5.22, is a spot
+between two characters, or before the first character in the string, or
+after the final character in the string where certain criteria defined
+by Unicode are met. See L<perlrebackslash/\b{}, \b, \B{}, \B> for
+details.
+
A word boundary (C<\b>) is a spot between two characters
that has a C<\w> on one side of it and a C<\W> on the other side
of it (in either order), counting the imaginary characters off the
diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod
index 230e76dea8..ea460cb82c 100644
--- a/pod/perlrebackslash.pod
+++ b/pod/perlrebackslash.pod
@@ -66,8 +66,8 @@ as C<Not in [].>
\1 Absolute backreference. Not in [].
\a Alarm or bell.
\A Beginning of string. Not in [].
- \b Word/non-word boundary. (Backspace in []).
- \B Not a word/non-word boundary. Not in [].
+ \b{}, \b Boundary. (\b is a backspace in []).
+ \B{}, \B Not a boundary.
\cX Control-X.
\C Single octet, even under UTF-8. Not in [].
(Deprecated)
@@ -134,7 +134,8 @@ description. (For EBCDIC platforms, see L<perlebcdic/OPERATOR DIFFERENCES>.)
=item [1]
C<\b> is the backspace character only inside a character class. Outside a
-character class, C<\b> is a word/non-word boundary.
+character class, C<\b> alone is a word-character/non-word-character
+boundary, and C<\b{}> is some other type of boundary.
=item [2]
@@ -525,10 +526,21 @@ or the beginning of that string if there was no previous match.
Mnemonic: I<G>lobal.
-=item \b, \B
+=item \b{}, \b, \B{}, \B
-C<\b> matches at any place between a word and a non-word character; C<\B>
-matches at any place between characters where C<\b> doesn't match. C<\b>
+C<\b{...}>, available starting in v5.22, matches a boundary (between two
+characters, or before the first character of the string, or after the
+final character of the string) based on the Unicode rules for the
+boundary type specified inside the braces. The currently known boundary
+types are given a few paragraphs below. C<\B{...}> matches at any place
+between characters where C<\b{...}> of the same type doesn't match.
+
+C<\b> when not immediately followed by a C<"{"> matches at any place
+between a word (something matched by C<\w>) and a non-word character
+(C<\W>); C<\B> when not immediately followed by a C<"{"> matches at any
+place between characters where C<\b> doesn't match.
+
+C<\b>
and C<\B> assume there's a non-word character before the beginning and after
the end of the source string; so C<\b> will match at the beginning (or end)
of the source string if the source string begins (or ends) with a word
@@ -537,13 +549,22 @@ character. Otherwise, C<\B> will match.
Do not use something like C<\b=head\d\b> and expect it to match the
beginning of a line. It can't, because for there to be a boundary before
the non-word "=", there must be a word character immediately previous.
-All boundary determinations look for word characters alone, not for
-non-words characters nor for string ends. It may help to understand how
+All plain C<\b> and C<\B> boundary determinations look for word
+characters alone, not for
+non-word characters nor for string ends. It may help to understand how
<\b> and <\B> work by equating them as follows:
\b really means (?:(?<=\w)(?!\w)|(?<!\w)(?=\w))
\B really means (?:(?<=\w)(?=\w)|(?<!\w)(?!\w))
+In contrast, C<\b{...}> always matches at the beginning and end of the
+line (and C<\B{...}> never does). The only boundary type currently
+"Grapheme Cluster Boundary". (Actually Perl always uses the improved
+"extended" grapheme cluster"). These are explained below under C<\X>.
+In fact, C<\X> is another way to get the same functionality. It is
+equivalent to C</.+?\b{gcb}/>. Use whichever is most convenient for
+your situation.
+
Mnemonic: I<b>oundary.
=back
@@ -650,6 +671,8 @@ were a single character.
The match is greedy and non-backtracking, so that the cluster is never
broken up into smaller components.
+See also L<C<\b{gcb}>|/\b{}, \b, \B{}, \B>.
+
Mnemonic: eI<X>tended Unicode character.
=back
diff --git a/pod/perlreref.pod b/pod/perlreref.pod
index 7ae8f6cfdf..bc4bef76f4 100644
--- a/pod/perlreref.pod
+++ b/pod/perlreref.pod
@@ -201,6 +201,8 @@ All are zero-width assertions.
^ Match string start (or line, if /m is used)
$ Match string end (or line, if /m is used) or before newline
+ \b{} Match boundary of type specified within the braces
+ \B{} Match wherever \b{} doesn't match
\b Match word boundary (between \w and \W)
\B Match except at word boundary (between \w and \w or \W and \W)
\A Match string start (regardless of /m)
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 0482d92596..ee99198e2d 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1100,7 +1100,8 @@ Level 2 - Extended Unicode Support
[10] see UAX#15 "Unicode Normalization Forms"
[11] have Unicode::Normalize but not integrated to regexes
- [12] have \X but we don't have a "Grapheme Cluster Mode"
+ [12] have \X and \b{gcb} but we don't have a "Grapheme Cluster
+ Mode"
[14] see UAX#29, Word Boundaries
[15] This is covered in Chapter 3.13 (in Unicode 6.0)
@@ -1575,8 +1576,9 @@ regular expressions outside the scope.
=item *
-Matching any of several properties in regular expressions, namely C<\b>,
-C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
+Matching any of several properties in regular expressions, namely
+C<\b> (without braces), C<\B> (without braces), C<\s>, C<\S>, C<\w>,
+C<\W>, and all the Posix character classes
I<except> C<[[:ascii:]]>.
Starting in Perl 5.14.0, regular expressions compiled within
the scope of C<unicode_strings> use character semantics
diff --git a/proto.h b/proto.h
index 966c6d880d..3ba6666e2b 100644
--- a/proto.h
+++ b/proto.h
@@ -7432,6 +7432,9 @@ STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
#define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \
assert(character)
+STATIC bool S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+ __attribute__warn_unused_result__;
+
STATIC I32 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
__attribute__warn_unused_result__
__attribute__nonnull__(1)
diff --git a/regcomp.c b/regcomp.c
index 82be6417d7..80c937786c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -87,7 +87,6 @@ EXTERN_C const struct regexp_engine my_reg_engine;
#endif
#include "dquote_static.c"
-#include "charclass_invlists.h"
#include "inline_invlist.c"
#include "unicode_constants.h"
@@ -11772,27 +11771,90 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
invert = 1;
/* FALLTHROUGH */
case 'b':
+ {
+ regex_charset charset = get_regex_charset(RExC_flags);
+
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = BOUND + get_regex_charset(RExC_flags);
- if (op > BOUNDA) { /* /aa is same as /a */
- op = BOUNDA;
- }
- else if (op == BOUNDL) {
- RExC_contains_locale = 1;
- }
+ op = BOUND + charset;
- if (invert) {
- op += NBOUND - BOUND;
+ if (op == BOUNDL) {
+ RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
*flagp |= SIMPLE;
- if ((U8) *(RExC_parse + 1) == '{') {
- /* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
+ if (*(RExC_parse + 1) != '{') {
+ FLAGS(ret) = TRADITIONAL_BOUND;
+ if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDA;
+ }
+ }
+ else {
+ STRLEN length;
+ char name = *RExC_parse;
+ char * endbrace;
+ RExC_parse += 2;
+ endbrace = strchr(RExC_parse, '}');
+
+ if (! endbrace) {
+ vFAIL2("Missing right brace on \\%c{}", name);
+ }
+ /* XXX Need to decide whether to take spaces or not. Should be
+ * consistent with \p{}, but that currently is SPACE, which
+ * means vertical too, which seems wrong
+ * while (isBLANK(*RExC_parse)) {
+ RExC_parse++;
+ }*/
+ if (endbrace == RExC_parse) {
+ RExC_parse++; /* After the '}' */
+ vFAIL2("Empty \\%c{}", name);
+ }
+ length = endbrace - RExC_parse;
+ /*while (isBLANK(*(RExC_parse + length - 1))) {
+ length--;
+ }*/
+ switch (*RExC_parse) {
+ case 'g':
+ if (length != 1
+ && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+ {
+ goto bad_bound_type;
+ }
+ FLAGS(ret) = GCB_BOUND;
+ break;
+ default:
+ bad_bound_type:
+ RExC_parse = endbrace;
+ vFAIL2utf8f(
+ "'%"UTF8f"' is an unknown bound type",
+ UTF8fARG(UTF, length, endbrace - length));
+ NOT_REACHED; /*NOTREACHED*/
+ }
+ RExC_parse = endbrace;
+ RExC_uni_semantics = 1;
+
+ if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDU;
+ length += 4;
+
+ /* Don't have to worry about UTF-8, in this message because
+ * to get here the contents of the \b must be ASCII */
+ ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
+ "Using /u for '%.*s' instead of /%s",
+ (unsigned) length,
+ endbrace - length + 1,
+ (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+ ? ASCII_RESTRICT_PAT_MODS
+ : ASCII_MORE_RESTRICT_PAT_MODS);
+ }
}
+
+ if (PASS2 && invert) {
+ OP(ret) += NBOUND - BOUND;
+ }
goto finish_meta_pat;
+ }
case 'D':
invert = 1;
@@ -16735,6 +16797,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
}
}
+ else if (k == BOUND || k == NBOUND) {
+ /* Must be synced with order of 'bound_type' in regcomp.h */
+ const char * const bounds[] = {
+ "", /* Traditional */
+ "{gcb}"
+ };
+ sv_catpv(sv, bounds[FLAGS(o)]);
+ }
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
else if (OP(o) == SBOL)
diff --git a/regcomp.h b/regcomp.h
index c17bf62d42..ee9be7a505 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -993,6 +993,11 @@ re.pm, especially to the documentation.
#endif /* DEBUG RELATED DEFINES */
+typedef enum {
+ TRADITIONAL_BOUND = _CC_WORDCHAR,
+ GCB_BOUND
+} bound_type;
+
/*
* Local variables:
* c-indentation-style: bsd
diff --git a/regcomp.sym b/regcomp.sym
index c20c5aaad1..7daa241dba 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -43,15 +43,15 @@ GPOS GPOS, no ; Matches where last m//g left off.
# in regcomp.c uses the enum value of the modifier as an offset from the /d
# version. The complements must come after the non-complements.
# BOUND, POSIX and their complements are affected, as well as EXACTF.
-BOUND BOUND, no ; Match "" at any word boundary using native charset rules for non-utf8
-BOUNDL BOUND, no ; Match "" at any locale word boundary
-BOUNDU BOUND, no ; Match "" at any word boundary using Unicode rules
-BOUNDA BOUND, no ; Match "" at any word boundary using ASCII rules
+BOUND BOUND, no ; Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules
+BOUNDL BOUND, no ; Match "" at any boundary of a given type using locale rules
+BOUNDU BOUND, no ; Match "" at any boundary of a given type using Unicode rules
+BOUNDA BOUND, no ; Match "" at any boundary of a given type using ASCII rules
# All NBOUND nodes are required by code in regexec.c to be greater than all BOUND ones
-NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset rules for non-utf8
-NBOUNDL NBOUND, no ; Match "" at any locale word non-boundary
-NBOUNDU NBOUND, no ; Match "" at any word non-boundary using Unicode rules
-NBOUNDA NBOUND, no ; Match "" at any word non-boundary using ASCII rules
+NBOUND NBOUND, no ; Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules
+NBOUNDL NBOUND, no ; Match "" at any boundary of a given type using locale rules
+NBOUNDU NBOUND, no ; Match "" at any boundary of a given type using using Unicode rules
+NBOUNDA NBOUND, no ; Match "" at any boundary of a given type using using ASCII rules
#* [Special] alternatives:
REG_ANY REG_ANY, no 0 S ; Match any one character (except newline).
diff --git a/regexec.c b/regexec.c
index a4fea0a735..95dae10cae 100644
--- a/regexec.c
+++ b/regexec.c
@@ -37,6 +37,9 @@
#include "re_top.h"
#endif
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
+ "Use of \\b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
@@ -191,18 +194,6 @@ static const char* const non_utf8_target_but_utf8_required
PL_XPosix_ptrs[_CC_WORDCHAR], \
LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
-#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
- STMT_START { \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
- "_X_regular_begin", \
- NULL, \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
- LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
- "_X_extend", \
- NULL, \
- COMBINING_GRAVE_ACCENT_UTF8); \
- } STMT_END
-
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
@@ -262,16 +253,6 @@ static const char* const non_utf8_target_but_utf8_required
} \
} STMT_END
-/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
- * These are for the pre-composed Hangul syllables, which are all in a
- * contiguous block and arranged there in such a way so as to facilitate
- * alorithmic determination of their characteristics. As such, they don't need
- * a swash, but can be determined by simple arithmetic. Almost all are
- * GCB=LVT, but every 28th one is a GCB=LV */
-#define SBASE 0xAC00 /* Start of block */
-#define SCount 11172 /* Length of block */
-#define TCount 28
-
#define SLAB_FIRST(s) (&(s)->states[0])
#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
@@ -1728,6 +1709,33 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
+/* Takes a pointer to an inversion list, a pointer to its corresponding
+ * inversion map, and a code point, and returns the code point's value
+ * according to the two arrays. It assumes that all code points have a value.
+ * This is used as the base macro for macros for particular properties */
+#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
+ invmap[_invlist_search(invlist, cp)]
+
+/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
+ * of a code point, returning the value for the first code point in the string.
+ * And it takes the particular macro name that finds the desired value given a
+ * code point. Merely convert the UTF-8 to code point and call the cp macro */
+#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
+ (__ASSERT_(pos < strend) \
+ /* Note assumes is valid UTF-8 */ \
+ (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
+
+/* Returns the GCB value for the input code point */
+#define getGCB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_GCB_invlist, \
+ Grapheme_Cluster_Break_invmap, \
+ (cp))
+
+/* Returns the GCB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getGCB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
/* We know what class REx starts with. Try to find this position... */
/* if reginfo->intuit, its a dryrun */
@@ -1937,30 +1945,120 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
case BOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ goto do_boundu;
+ }
+
FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
+
case NBOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (FLAGS(c) != TRADITIONAL_BOUND) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ goto do_nboundu;
+ }
+
FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
- case BOUND:
+
+ case BOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case BOUNDA:
+
+ case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_BOUND_A(isWORDCHAR_A);
break;
- case NBOUND:
+
+ case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
- case NBOUNDA:
+
+ case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
+ meaning */
+ assert(FLAGS(c) == TRADITIONAL_BOUND);
+
FBC_NBOUND_A(isWORDCHAR_A);
break;
- case BOUNDU:
- FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
- break;
+
case NBOUNDU:
- FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
+ FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ }
+
+ do_nboundu:
+
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case BOUNDU:
+ do_boundu:
+ switch((bound_type) FLAGS(c)) {
+ case TRADITIONAL_BOUND:
+ FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+ break;
+ case GCB_BOUND:
+ if (s == reginfo->strbeg) { /* GCB always matches at begin and
+ end */
+ if (to_complement ^ cBOOL(reginfo->intuit
+ || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ }
+
+ if (utf8_target) {
+ PL_GCB_enum before = getGCB_VAL_UTF8(
+ reghop3((U8*)s, -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ PL_GCB_enum after = getGCB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if (to_complement ^ isGCB(before, after)) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ before = after;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. Everything is a GCB except between CR and
+ LF */
+ while (s < strend) {
+ if (to_complement ^ (UCHARAT(s - 1) != '\r'
+ || UCHARAT(s) != '\n'))
+ {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s++;
+ }
+ }
+ }
+
+ if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+ break;
+ }
break;
+
case LNBREAK:
REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
is_LNBREAK_latin1_safe(s, strend)
@@ -3892,6 +3990,105 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
return TRUE;
}
+/* This creates a single number by combining two, with 'before' being like the
+ * 10's digit, but this isn't necessarily base 10; it is base however many
+ * elements of the enum there are */
+#define GCBcase(before, after) ((PL_GCB_ENUM_COUNT * before) + after)
+
+STATIC bool
+S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+{
+ /* returns a boolean indicating if there is a Grapheme Cluster Boundary
+ * between the inputs. See http://www.unicode.org/reports/tr29/ */
+
+ switch (GCBcase(before, after)) {
+
+ /* Break at the start and end of text.
+ GB1. sot ÷
+ GB2. ÷ eot
+
+ Break before and after controls except between CR and LF
+ GB4. ( Control | CR | LF ) ÷
+ GB5. ÷ ( Control | CR | LF )
+
+ Otherwise, break everywhere.
+ GB10. Any ÷ Any */
+ default:
+ return TRUE;
+
+ /* Do not break between a CR and LF.
+ GB3. CR × LF */
+ case GCBcase(PL_GCB_CR, PL_GCB_LF):
+ return FALSE;
+
+ /* Do not break Hangul syllable sequences.
+ GB6. L × ( L | V | LV | LVT ) */
+ case GCBcase(PL_GCB_L, PL_GCB_L):
+ case GCBcase(PL_GCB_L, PL_GCB_V):
+ case GCBcase(PL_GCB_L, PL_GCB_LV):
+ case GCBcase(PL_GCB_L, PL_GCB_LVT):
+ return FALSE;
+
+ /* GB7. ( LV | V ) × ( V | T ) */
+ case GCBcase(PL_GCB_LV, PL_GCB_V):
+ case GCBcase(PL_GCB_LV, PL_GCB_T):
+ case GCBcase(PL_GCB_V, PL_GCB_V):
+ case GCBcase(PL_GCB_V, PL_GCB_T):
+ return FALSE;
+
+ /* GB8. ( LVT | T) × T */
+ case GCBcase(PL_GCB_LVT, PL_GCB_T):
+ case GCBcase(PL_GCB_T, PL_GCB_T):
+ return FALSE;
+
+ /* Do not break between regional indicator symbols.
+ GB8a. Regional_Indicator × Regional_Indicator */
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Regional_Indicator):
+ return FALSE;
+
+ /* Do not break before extending characters.
+ GB9. × Extend */
+ case GCBcase(PL_GCB_Other, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Extend, PL_GCB_Extend):
+ case GCBcase(PL_GCB_L, PL_GCB_Extend):
+ case GCBcase(PL_GCB_LV, PL_GCB_Extend):
+ case GCBcase(PL_GCB_LVT, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Extend):
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Extend):
+ case GCBcase(PL_GCB_SpacingMark, PL_GCB_Extend):
+ case GCBcase(PL_GCB_T, PL_GCB_Extend):
+ case GCBcase(PL_GCB_V, PL_GCB_Extend):
+ return FALSE;
+
+ /* Do not break before SpacingMarks, or after Prepend characters.
+ GB9a. × SpacingMark */
+ case GCBcase(PL_GCB_Other, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Extend, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_L, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_LV, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_LVT, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_SpacingMark, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_T, PL_GCB_SpacingMark):
+ case GCBcase(PL_GCB_V, PL_GCB_SpacingMark):
+ return FALSE;
+
+ /* GB9b. Prepend × */
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Other):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_L):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_LV):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_LVT):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Prepend):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_Regional_Indicator):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_T):
+ case GCBcase(PL_GCB_Prepend, PL_GCB_V):
+ return FALSE;
+ }
+
+ NOT_REACHED;
+}
+
/* returns -1 on failure, $+[0] on success */
STATIC SSize_t
S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
@@ -3964,6 +4161,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
int to_complement; /* Invert the result? */
_char_class_number classnum;
bool is_utf8_pat = reginfo->is_utf8_pat;
+ bool match = FALSE;
+
#ifdef DEBUGGING
GET_RE_DEBUG_FLAGS_DECL;
@@ -4623,13 +4822,21 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
break;
}
- /* XXX At that point regcomp.c would no longer * have to set the FLAGS fields of these */
case NBOUNDL: /* /\B/l */
to_complement = 1;
/* FALLTHROUGH */
case BOUNDL: /* /\b/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (FLAGS(scan) != TRADITIONAL_BOUND) {
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
+ goto boundu;
+ }
+
if (utf8_target) {
if (locinput == reginfo->strbeg)
ln = isWORDCHAR_LC('\n');
@@ -4696,9 +4903,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
/* FALLTHROUGH */
case BOUNDU: /* /\b/u */
+
+ boundu:
if (utf8_target) {
- bound_utf8:
+ bound_utf8:
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
ln = (locinput == reginfo->strbeg)
? isWORDCHAR_L1('\n')
: isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
@@ -4706,18 +4917,55 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
n = (NEXTCHR_IS_EOS)
? isWORDCHAR_L1('\n')
: isWORDCHAR_utf8((U8*)locinput);
+
+ match = ln != n;
+ break;
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else {
+ /* Find the gcb values of previous and current
+ * chars, then see if is a break point */
+ match = isGCB(getGCB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend));
+ }
+ break;
+ }
}
- else {
+ else { /* Not utf8 target */
+ switch((bound_type) FLAGS(scan)) {
+ case TRADITIONAL_BOUND:
ln = (locinput == reginfo->strbeg)
? isWORDCHAR_L1('\n')
: isWORDCHAR_L1(UCHARAT(locinput - 1));
n = (NEXTCHR_IS_EOS)
? isWORDCHAR_L1('\n')
: isWORDCHAR_L1(nextchr);
+ match = ln != n;
+ break;
+ case GCB_BOUND:
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE; /* GCB always matches at begin and
+ end */
+ }
+ else { /* Only CR-LF combo isn't a GCB in 0-255
+ range */
+ match = UCHARAT(locinput - 1) != '\r'
+ || UCHARAT(locinput) != '\n';
+ }
+ break;
+ }
}
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ ! match) {
sayNO;
}
break;
@@ -4921,38 +5169,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
case CLUMP: /* Match \X: logical Unicode character. This is defined as
a Unicode extended Grapheme Cluster */
- /* From http://www.unicode.org/reports/tr29 (5.2 version). An
- extended Grapheme Cluster is:
-
- CR LF
- | Prepend* Begin Extend*
- | .
-
- Begin is: ( Special_Begin | ! Control )
- Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- Extend is: ( Grapheme_Extend | Spacing_Mark )
- Control is: [ GCB_Control | CR | LF ]
- Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- If we create a 'Regular_Begin' = Begin - Special_Begin, then
- we can rewrite
-
- Begin is ( Regular_Begin + Special Begin )
-
- It turns out that 98.4% of all Unicode code points match
- Regular_Begin. Doing it this way eliminates a table match in
- the previous implementation for almost all Unicode code points.
-
- There is a subtlety with Prepend* which showed up in testing.
- Note that the Begin, and only the Begin is required in:
- | Prepend* Begin Extend*
- Also, Begin contains '! Control'. A Prepend must be a
- '! Control', which means it must also be a Begin. What it
- comes down to is that if we match Prepend* and then find no
- suitable Begin afterwards, that if we backtrack the last
- Prepend, that one will be a suitable Begin.
- */
-
if (NEXTCHR_IS_EOS)
sayNO;
if (! utf8_target) {
@@ -4970,147 +5186,27 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
}
else {
- /* Utf8: See if is ( CR LF ); already know that locinput <
- * reginfo->strend, so locinput+1 is in bounds */
- if ( nextchr == '\r' && locinput+1 < reginfo->strend
- && UCHARAT(locinput + 1) == '\n')
- {
- locinput += 2;
- }
- else {
- STRLEN len;
-
- /* In case have to backtrack to beginning, then match '.' */
- char *starting = locinput;
-
- /* In case have to backtrack the last prepend */
- char *previous_prepend = NULL;
+ /* Get the gcb type for the current character */
+ PL_GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
- LOAD_UTF8_CHARCLASS_GCB();
-
- /* Match (prepend)* */
- while (locinput < reginfo->strend
- && (len = is_GCB_Prepend_utf8(locinput)))
- {
- previous_prepend = locinput;
- locinput += len;
- }
-
- /* As noted above, if we matched a prepend character, but
- * the next thing won't match, back off the last prepend we
- * matched, as it is guaranteed to match the begin */
- if (previous_prepend
- && (locinput >= reginfo->strend
- || (! swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)
- && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
- )
- {
- locinput = previous_prepend;
- }
-
- /* Note that here we know reginfo->strend > locinput, as we
- * tested that upon input to this switch case, and if we
- * moved locinput forward, we tested the result just above
- * and it either passed, or we backed off so that it will
- * now pass */
- if (swash_fetch(PL_utf8_X_regular_begin,
- (U8*)locinput, utf8_target)) {
- locinput += UTF8SKIP(locinput);
+ /* Then scan through the input until we get to the first
+ * character whose type is supposed to be a gcb with the
+ * current character. (There is always a break at the
+ * end-of-input) */
+ locinput += UTF8SKIP(locinput);
+ while (locinput < reginfo->strend) {
+ PL_GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend);
+ if (isGCB(prev_gcb, cur_gcb)) {
+ break;
}
- else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
-
- /* Here did not match the required 'Begin' in the
- * second term. So just match the very first
- * character, the '.' of the final term of the regex */
- locinput = starting + UTF8SKIP(starting);
- goto exit_utf8;
- } else {
-
- /* Here is a special begin. It can be composed of
- * several individual characters. One possibility is
- * RI+ */
- if ((len = is_GCB_RI_utf8(locinput))) {
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_RI_utf8(locinput)))
- {
- locinput += len;
- }
- } else if ((len = is_GCB_T_utf8(locinput))) {
- /* Another possibility is T+ */
- locinput += len;
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- } else {
- /* Here, neither RI+ nor T+; must be some other
- * Hangul. That means it is one of the others: L,
- * LV, LVT or V, and matches:
- * L* (L | LVT T* | V * V* T* | LV V* T*) */
-
- /* Match L* */
- while (locinput < reginfo->strend
- && (len = is_GCB_L_utf8(locinput)))
- {
- locinput += len;
- }
-
- /* Here, have exhausted L*. If the next character
- * is not an LV, LVT nor V, it means we had to have
- * at least one L, so matches L+ in the original
- * equation, we have a complete hangul syllable.
- * Are done. */
+ prev_gcb = cur_gcb;
+ locinput += UTF8SKIP(locinput);
+ }
- if (locinput < reginfo->strend
- && is_GCB_LV_LVT_V_utf8(locinput))
- {
- /* Otherwise keep going. Must be LV, LVT or V.
- * See if LVT, by first ruling out V, then LV */
- if (! is_GCB_V_utf8(locinput)
- /* All but every TCount one is LV */
- && (valid_utf8_to_uvchr((U8 *) locinput,
- NULL)
- - SBASE)
- % TCount != 0)
- {
- locinput += UTF8SKIP(locinput);
- } else {
-
- /* Must be V or LV. Take it, then match
- * V* */
- locinput += UTF8SKIP(locinput);
- while (locinput < reginfo->strend
- && (len = is_GCB_V_utf8(locinput)))
- {
- locinput += len;
- }
- }
- /* And any of LV, LVT, or V can be followed
- * by T* */
- while (locinput < reginfo->strend
- && (len = is_GCB_T_utf8(locinput)))
- {
- locinput += len;
- }
- }
- }
- }
-
- /* Match any extender */
- while (locinput < reginfo->strend
- && swash_fetch(PL_utf8_X_extend,
- (U8*)locinput, utf8_target))
- {
- locinput += UTF8SKIP(locinput);
- }
- }
- exit_utf8:
- if (locinput > reginfo->strend) sayNO;
}
break;
diff --git a/regnodes.h b/regnodes.h
index 439fa8d7b1..144d6f63b5 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -19,14 +19,14 @@
#define MEOL 5 /* 0x05 Same, assuming multiline: /$/m */
#define EOS 6 /* 0x06 Match "" at end of string: /\z/ */
#define GPOS 7 /* 0x07 Matches where last m//g left off. */
-#define BOUND 8 /* 0x08 Match "" at any word boundary using native charset rules for non-utf8 */
-#define BOUNDL 9 /* 0x09 Match "" at any locale word boundary */
-#define BOUNDU 10 /* 0x0a Match "" at any word boundary using Unicode rules */
-#define BOUNDA 11 /* 0x0b Match "" at any word boundary using ASCII rules */
-#define NBOUND 12 /* 0x0c Match "" at any word non-boundary using native charset rules for non-utf8 */
-#define NBOUNDL 13 /* 0x0d Match "" at any locale word non-boundary */
-#define NBOUNDU 14 /* 0x0e Match "" at any word non-boundary using Unicode rules */
-#define NBOUNDA 15 /* 0x0f Match "" at any word non-boundary using ASCII rules */
+#define BOUND 8 /* 0x08 Match "" at any word boundary using native charset rules for non-utf8, otherwise Unicode rules */
+#define BOUNDL 9 /* 0x09 Match "" at any boundary of a given type using locale rules */
+#define BOUNDU 10 /* 0x0a Match "" at any boundary of a given type using Unicode rules */
+#define BOUNDA 11 /* 0x0b Match "" at any boundary of a given type using ASCII rules */
+#define NBOUND 12 /* 0x0c Match "" at any word non-boundary using native charset rules for non-utf8, otherwise Unicode rules */
+#define NBOUNDL 13 /* 0x0d Match "" at any boundary of a given type using locale rules */
+#define NBOUNDU 14 /* 0x0e Match "" at any boundary of a given type using using Unicode rules */
+#define NBOUNDA 15 /* 0x0f Match "" at any boundary of a given type using using ASCII rules */
#define REG_ANY 16 /* 0x10 Match any one character (except newline). */
#define SANY 17 /* 0x11 Match any one character. */
#define CANY 18 /* 0x12 Match any one byte. */
diff --git a/sv.c b/sv.c
index 5670fd1fe1..3b65510a88 100644
--- a/sv.c
+++ b/sv.c
@@ -14944,6 +14944,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
for (i = 0; i < POSIX_CC_COUNT; i++) {
PL_XPosix_ptrs[i] = sv_dup_inc(proto_perl->IXPosix_ptrs[i], param);
}
+ PL_GCB_invlist = sv_dup_inc(proto_perl->IGCB_invlist, param);
PL_utf8_mark = sv_dup_inc(proto_perl->Iutf8_mark, param);
PL_utf8_X_regular_begin = sv_dup_inc(proto_perl->Iutf8_X_regular_begin, param);
PL_utf8_X_extend = sv_dup_inc(proto_perl->Iutf8_X_extend, param);
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 0c6a16a5ba..3f15db0e19 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -143,3 +143,18 @@ Wide character (U+100) in pattern match (m//) at - line 10.
Wide character (U+100) in pattern match (m//) at - line 11.
Wide character (U+100) in pattern match (m//) at - line 12.
Wide character (U+100) in pattern match (m//) at - line 12.
+########
+# NAME \b{} in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+ print("SKIPPED\n# no POSIX\n"),exit;
+}
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+"a" =~ /\b{gcb}/l;
+no warnings 'locale';
+"a" =~ /\b{gcb}/l;
+EXPECT
+Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8.
+Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 8.
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index c985c8e1b0..452d982d17 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -142,8 +142,6 @@ my @death =
'/(?lil:foo)/' => 'Regexp modifier "l" may not appear twice {#} m/(?lil{#}:foo)/',
'/(?aaia:foo)/' => 'Regexp modifier "a" may appear a maximum of twice {#} m/(?aaia{#}:foo)/',
'/(?i-l:foo)/' => 'Regexp modifier "l" may not appear after the "-" {#} m/(?i-l{#}:foo)/',
-'/a\b{cde/' => 'Use "\b\{" instead of "\b{" {#} m/a\{#}b{cde/',
-'/a\B{cde/' => 'Use "\B\{" instead of "\B{" {#} m/a\{#}B{cde/',
'/((x)/' => 'Unmatched ( {#} m/({#}(x)/',
@@ -188,8 +186,17 @@ my @death =
'/[z-a]/' => 'Invalid [] range "z-a" {#} m/[z-a{#}]/',
'/\p/' => 'Empty \p{} {#} m/\p{#}/',
-
'/\P{}/' => 'Empty \P{} {#} m/\P{{#}}/',
+
+'/a\b{cde/' => 'Missing right brace on \b{} {#} m/a\b{{#}cde/',
+'/a\B{cde/' => 'Missing right brace on \B{} {#} m/a\B{{#}cde/',
+
+ '/\b{}/' => 'Empty \b{} {#} m/\b{}{#}/',
+ '/\B{}/' => 'Empty \B{} {#} m/\B{}{#}/',
+
+ '/\b{gc}/' => "'gc' is an unknown bound type {#} m/\\b{gc{#}}/",
+ '/\B{gc}/' => "'gc' is an unknown bound type {#} m/\\B{gc{#}}/",
+
'/(?[[[:word]]])/' => "Unmatched ':' in POSIX class {#} m/(?[[[:word{#}]]])/",
'/(?[[:word]])/' => "Unmatched ':' in POSIX class {#} m/(?[[:word{#}]])/",
'/(?[[[:digit: ])/' => "Unmatched '[' in POSIX class {#} m/(?[[[:digit:{#} ])/",
@@ -417,6 +424,8 @@ my @death_utf8 = mark_as_utf8(
'/(?[ \t + \e # ネ This was supposed to be a comment ])/' => 'Syntax error in (?[...]) in regex m/(?[ \t + \e # ネ This was supposed to be a comment ])/',
'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
'/\cネ/' => "Character following \"\\c\" must be printable ASCII",
+ '/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
+ '/\B{ネ}/' => "'ネ' is an unknown bound type {#} m/\\B{ネ{#}}/",
);
push @death, @death_utf8;
@@ -450,6 +459,8 @@ my @death_utf8_only_under_strict = (
my @warning = (
'm/\b*\x{100}/' => '\b* matches null string many times {#} m/\b*{#}\x{100}/',
+ '/\b{g}/a' => "Using /u for '\\b{g}' instead of /a {#} m/\\b{g}{#}/",
+ '/\B{gcb}/a' => "Using /u for '\\B{gcb}' instead of /a {#} m/\\B{gcb}{#}/",
'm/[:blank:]\x{100}/' => 'POSIX syntax [: :] belongs inside character classes {#} m/[:blank:]{#}\x{100}/',
'm/[[:cntrl:]][:^ascii:]\x{100}/' => 'POSIX syntax [: :] belongs inside character classes {#} m/[[:cntrl:]][:^ascii:]{#}\x{100}/',
"m'\\y\\x{100}'" => 'Unrecognized escape \y passed through {#} m/\y{#}\x{100}/',
diff --git a/utf8.c b/utf8.c
index 179a96988e..efb8d8647f 100644
--- a/utf8.c
+++ b/utf8.c
@@ -32,7 +32,6 @@
#define PERL_IN_UTF8_C
#include "perl.h"
#include "inline_invlist.c"
-#include "charclass_invlists.h"
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";