summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-02-17 14:43:10 -0700
committerKarl Williamson <public@khwilliamson.com>2011-02-17 15:41:54 -0700
commitc11ff9433950cda8448b773418d1cb2592eea29d (patch)
treef0284cd5865f0db8c7484123153ab8fb860aa129
parent0167186c6da6afb0eb6708879a543c70c612fc45 (diff)
downloadperl-c11ff9433950cda8448b773418d1cb2592eea29d.tar.gz
handy.h: isIDFIRST_utf8() changed to use XIDStart
Previously this used a home-grown definition of an identifier start, stemming from a bug in some early Unicode versions. This led to some problems, fixed by #74022. But the home-grown solution did not track Unicode, and allowed for characters, like marks, to begin words when they shouldn't. This change brings this macro into compliance with Unicode going-forward.
-rw-r--r--embed.fnc2
-rw-r--r--embed.h2
-rw-r--r--embedvar.h4
-rw-r--r--global.sym2
-rw-r--r--handy.h17
-rw-r--r--intrpvar.h2
-rw-r--r--pod/perldelta.pod6
-rw-r--r--proto.h12
-rw-r--r--sv.c2
-rw-r--r--utf8.c25
10 files changed, 64 insertions, 10 deletions
diff --git a/embed.fnc b/embed.fnc
index b1f2334819..8663b212fd 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -583,7 +583,9 @@ Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
ApR |bool |is_utf8_alnum |NN const U8 *p
ApR |bool |is_utf8_idfirst|NN const U8 *p
+ApR |bool |is_utf8_xidfirst|NN const U8 *p
ApR |bool |is_utf8_idcont |NN const U8 *p
+ApR |bool |is_utf8_xidcont |NN const U8 *p
ApR |bool |is_utf8_alpha |NN const U8 *p
ApR |bool |is_utf8_ascii |NN const U8 *p
ApR |bool |is_utf8_space |NN const U8 *p
diff --git a/embed.h b/embed.h
index 297e55b07f..727e921f0c 100644
--- a/embed.h
+++ b/embed.h
@@ -251,6 +251,8 @@
#define is_utf8_string_loclen Perl_is_utf8_string_loclen
#define is_utf8_upper(a) Perl_is_utf8_upper(aTHX_ a)
#define is_utf8_xdigit(a) Perl_is_utf8_xdigit(aTHX_ a)
+#define is_utf8_xidcont(a) Perl_is_utf8_xidcont(aTHX_ a)
+#define is_utf8_xidfirst(a) Perl_is_utf8_xidfirst(aTHX_ a)
#define leave_scope(a) Perl_leave_scope(aTHX_ a)
#define lex_bufutf8() Perl_lex_bufutf8(aTHX)
#define lex_discard_to(a) Perl_lex_discard_to(aTHX_ a)
diff --git a/embedvar.h b/embedvar.h
index 7b78a5ebdd..bb179fdcf4 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -361,6 +361,8 @@
#define PL_utf8_toupper (vTHX->Iutf8_toupper)
#define PL_utf8_upper (vTHX->Iutf8_upper)
#define PL_utf8_xdigit (vTHX->Iutf8_xdigit)
+#define PL_utf8_xidcont (vTHX->Iutf8_xidcont)
+#define PL_utf8_xidstart (vTHX->Iutf8_xidstart)
#define PL_utf8cache (vTHX->Iutf8cache)
#define PL_utf8locale (vTHX->Iutf8locale)
#define PL_warnhook (vTHX->Iwarnhook)
@@ -693,6 +695,8 @@
#define PL_Iutf8_toupper PL_utf8_toupper
#define PL_Iutf8_upper PL_utf8_upper
#define PL_Iutf8_xdigit PL_utf8_xdigit
+#define PL_Iutf8_xidcont PL_utf8_xidcont
+#define PL_Iutf8_xidstart PL_utf8_xidstart
#define PL_Iutf8cache PL_utf8cache
#define PL_Iutf8locale PL_utf8locale
#define PL_Iwarnhook PL_warnhook
diff --git a/global.sym b/global.sym
index 9064f98699..dde11d4072 100644
--- a/global.sym
+++ b/global.sym
@@ -287,6 +287,8 @@ Perl_is_utf8_string_loc
Perl_is_utf8_string_loclen
Perl_is_utf8_upper
Perl_is_utf8_xdigit
+Perl_is_utf8_xidcont
+Perl_is_utf8_xidfirst
Perl_leave_scope
Perl_lex_bufutf8
Perl_lex_discard_to
diff --git a/handy.h b/handy.h
index ad2e4b68b2..6541c95c35 100644
--- a/handy.h
+++ b/handy.h
@@ -883,16 +883,13 @@ EXTCONST U32 PL_charclass[];
#define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */
#define isALNUM_utf8(p) is_utf8_alnum(p)
-/* The ID_Start of Unicode was originally quite limiting: it assumed an
- * L-class character (meaning that you could not have, say, a CJK charac-
- * ter). So, instead, perl has for a long time allowed ID_Continue but
- * not digits.
- * We still preserve that for backward compatibility. But we also make sure
- * that it is alphanumeric, so S_scan_word in toke.c will not hang. See
- * http://rt.perl.org/rt3/Ticket/Display.html?id=74022
- * for more detail than you ever wanted to know about. */
-#define isIDFIRST_utf8(p) \
- (is_utf8_idcont(p) && !is_utf8_digit(p) && is_utf8_alnum(p))
+/* To prevent S_scan_word in toke.c from hanging, we have to make sure that
+ * IDFIRST is an alnum. See
+ * http://rt.perl.org/rt3/Ticket/Display.html?id=74022
+ * for more detail than you ever wanted to know about. This used to be not the
+ * XID version, but we decided to go with the more modern Unicode definition */
+#define isIDFIRST_utf8(p) (is_utf8_xidfirst(p) && is_utf8_alnum(p))
+#define isIDCONT_utf8(p) is_utf8_xidcont(p)
#define isALPHA_utf8(p) is_utf8_alpha(p)
#define isSPACE_utf8(p) is_utf8_space(p)
#define isDIGIT_utf8(p) is_utf8_digit(p)
diff --git a/intrpvar.h b/intrpvar.h
index b12f21b283..a4beda6851 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -678,6 +678,8 @@ PERLVAR(Idebug_pad, struct perl_debug_pad) /* always needed because of the re ex
PERLVAR(Iutf8_idstart, SV *)
PERLVAR(Iutf8_idcont, SV *)
+PERLVAR(Iutf8_xidstart, SV *)
+PERLVAR(Iutf8_xidcont, SV *)
PERLVAR(Isort_RealCmp, SVCOMPARE_t)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index e8f4715aa6..f00cb3382e 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -801,6 +801,12 @@ again and, if the entries are re-created too many times, dies with a
[perl #78494] When pipes are shared between threads, the C<close> function
(and any implicit close, such as on thread exit) no longer blocks.
+=item *
+
+Several contexts no longer allow a Unicode character to begin a word
+that should never begin words, for an example an accent that must follow
+another character previously could precede all other characters.
+
=back
=head1 Known Problems
diff --git a/proto.h b/proto.h
index 960e5f5246..d4642aa5eb 100644
--- a/proto.h
+++ b/proto.h
@@ -1801,6 +1801,18 @@ PERL_CALLCONV bool Perl_is_utf8_xdigit(pTHX_ const U8 *p)
#define PERL_ARGS_ASSERT_IS_UTF8_XDIGIT \
assert(p)
+PERL_CALLCONV bool Perl_is_utf8_xidcont(pTHX_ const U8 *p)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_XIDCONT \
+ assert(p)
+
+PERL_CALLCONV bool Perl_is_utf8_xidfirst(pTHX_ const U8 *p)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST \
+ assert(p)
+
PERL_CALLCONV OP* Perl_jmaybe(pTHX_ OP *o)
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_JMAYBE \
diff --git a/sv.c b/sv.c
index 4bd68503fe..9254ad19e5 100644
--- a/sv.c
+++ b/sv.c
@@ -13119,7 +13119,9 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param);
PL_utf8_tofold = sv_dup_inc(proto_perl->Iutf8_tofold, param);
PL_utf8_idstart = sv_dup_inc(proto_perl->Iutf8_idstart, param);
+ PL_utf8_xidstart = sv_dup_inc(proto_perl->Iutf8_xidstart, param);
PL_utf8_idcont = sv_dup_inc(proto_perl->Iutf8_idcont, param);
+ PL_utf8_xidcont = sv_dup_inc(proto_perl->Iutf8_xidcont, param);
PL_utf8_foldable = hv_dup_inc(proto_perl->Iutf8_foldable, param);
/* Did the locale setup indicate UTF-8? */
diff --git a/utf8.c b/utf8.c
index b5d853188b..808d9a80a7 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1501,6 +1501,19 @@ Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
}
bool
+Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
+
+ if (*p == '_')
+ return TRUE;
+ /* is_utf8_idstart would be more logical. */
+ return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
+}
+
+bool
Perl_is_utf8_idcont(pTHX_ const U8 *p)
{
dVAR;
@@ -1513,6 +1526,18 @@ Perl_is_utf8_idcont(pTHX_ const U8 *p)
}
bool
+Perl_is_utf8_xidcont(pTHX_ const U8 *p)
+{
+ dVAR;
+
+ PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
+
+ if (*p == '_')
+ return TRUE;
+ return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
+}
+
+bool
Perl_is_utf8_alpha(pTHX_ const U8 *p)
{
dVAR;