handy.h: isIDFIRST_utf8() changed to use XIDStart

Previously this used a home-grown definition of an identifier start, stemming from a bug in some early Unicode versions. This led to some problems, fixed by #74022. But the home-grown solution did not track Unicode, and allowed for characters, like marks, to begin words when they shouldn't. This change brings this macro into compliance with Unicode going-forward.
author: Karl Williamson <public@khwilliamson.com> 2011-02-17 14:43:10 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-02-17 15:41:54 -0700
commit: c11ff9433950cda8448b773418d1cb2592eea29d (patch)
tree: f0284cd5865f0db8c7484123153ab8fb860aa129
parent: 0167186c6da6afb0eb6708879a543c70c612fc45 (diff)
download: perl-c11ff9433950cda8448b773418d1cb2592eea29d.tar.gz
10 files changed, 64 insertions, 10 deletions
diff --git a/embed.fnc b/embed.fnc
index b1f2334819..8663b212fd 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -583,7 +583,9 @@ Anpdmb	|bool	|is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
 Anpd	|bool	|is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
 ApR	|bool	|is_utf8_alnum	|NN const U8 *p
 ApR	|bool	|is_utf8_idfirst|NN const U8 *p
+ApR	|bool	|is_utf8_xidfirst|NN const U8 *p
 ApR	|bool	|is_utf8_idcont	|NN const U8 *p
+ApR	|bool	|is_utf8_xidcont	|NN const U8 *p
 ApR	|bool	|is_utf8_alpha	|NN const U8 *p
 ApR	|bool	|is_utf8_ascii	|NN const U8 *p
 ApR	|bool	|is_utf8_space	|NN const U8 *p
diff --git a/embed.h b/embed.h
index 297e55b07f..727e921f0c 100644
--- a/embed.h
+++ b/embed.h
@@ -251,6 +251,8 @@
 #define is_utf8_string_loclen	Perl_is_utf8_string_loclen
 #define is_utf8_upper(a)	Perl_is_utf8_upper(aTHX_ a)
 #define is_utf8_xdigit(a)	Perl_is_utf8_xdigit(aTHX_ a)
+#define is_utf8_xidcont(a)	Perl_is_utf8_xidcont(aTHX_ a)
+#define is_utf8_xidfirst(a)	Perl_is_utf8_xidfirst(aTHX_ a)
 #define leave_scope(a)		Perl_leave_scope(aTHX_ a)
 #define lex_bufutf8()		Perl_lex_bufutf8(aTHX)
 #define lex_discard_to(a)	Perl_lex_discard_to(aTHX_ a)
diff --git a/embedvar.h b/embedvar.h
index 7b78a5ebdd..bb179fdcf4 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -361,6 +361,8 @@
 #define PL_utf8_toupper		(vTHX->Iutf8_toupper)
 #define PL_utf8_upper		(vTHX->Iutf8_upper)
 #define PL_utf8_xdigit		(vTHX->Iutf8_xdigit)
+#define PL_utf8_xidcont		(vTHX->Iutf8_xidcont)
+#define PL_utf8_xidstart	(vTHX->Iutf8_xidstart)
 #define PL_utf8cache		(vTHX->Iutf8cache)
 #define PL_utf8locale		(vTHX->Iutf8locale)
 #define PL_warnhook		(vTHX->Iwarnhook)
@@ -693,6 +695,8 @@
 #define PL_Iutf8_toupper	PL_utf8_toupper
 #define PL_Iutf8_upper		PL_utf8_upper
 #define PL_Iutf8_xdigit		PL_utf8_xdigit
+#define PL_Iutf8_xidcont	PL_utf8_xidcont
+#define PL_Iutf8_xidstart	PL_utf8_xidstart
 #define PL_Iutf8cache		PL_utf8cache
 #define PL_Iutf8locale		PL_utf8locale
 #define PL_Iwarnhook		PL_warnhook
diff --git a/global.sym b/global.sym
index 9064f98699..dde11d4072 100644
--- a/global.sym
+++ b/global.sym
@@ -287,6 +287,8 @@ Perl_is_utf8_string_loc
 Perl_is_utf8_string_loclen
 Perl_is_utf8_upper
 Perl_is_utf8_xdigit
+Perl_is_utf8_xidcont
+Perl_is_utf8_xidfirst
 Perl_leave_scope
 Perl_lex_bufutf8
 Perl_lex_discard_to
diff --git a/handy.h b/handy.h
index ad2e4b68b2..6541c95c35 100644
--- a/handy.h
+++ b/handy.h
@@ -883,16 +883,13 @@ EXTCONST U32 PL_charclass[];
 #define isBLANK_LC_uni(c)	isBLANK(c) /* could be wrong */
 
 #define isALNUM_utf8(p)		is_utf8_alnum(p)
-/* The ID_Start of Unicode was originally quite limiting: it assumed an
- * L-class character (meaning that you could not have, say, a CJK charac-
- * ter). So, instead, perl has for a long time allowed ID_Continue but
- * not digits.
- * We still preserve that for backward compatibility. But we also make sure
- * that it is alphanumeric, so S_scan_word in toke.c will not hang. See
- *    http://rt.perl.org/rt3/Ticket/Display.html?id=74022
- * for more detail than you ever wanted to know about. */
-#define isIDFIRST_utf8(p) \
-    (is_utf8_idcont(p) && !is_utf8_digit(p) && is_utf8_alnum(p))
+/* To prevent S_scan_word in toke.c from hanging, we have to make sure that
+ * IDFIRST is an alnum.  See
+ * http://rt.perl.org/rt3/Ticket/Display.html?id=74022
+ * for more detail than you ever wanted to know about.  This used to be not the
+ * XID version, but we decided to go with the more modern Unicode definition */
+#define isIDFIRST_utf8(p)	(is_utf8_xidfirst(p) && is_utf8_alnum(p))
+#define isIDCONT_utf8(p)	is_utf8_xidcont(p)
 #define isALPHA_utf8(p)		is_utf8_alpha(p)
 #define isSPACE_utf8(p)		is_utf8_space(p)
 #define isDIGIT_utf8(p)		is_utf8_digit(p)
diff --git a/intrpvar.h b/intrpvar.h
index b12f21b283..a4beda6851 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -678,6 +678,8 @@ PERLVAR(Idebug_pad,	struct perl_debug_pad)	/* always needed because of the re ex
 
 PERLVAR(Iutf8_idstart,	SV *)
 PERLVAR(Iutf8_idcont,	SV *)
+PERLVAR(Iutf8_xidstart,	SV *)
+PERLVAR(Iutf8_xidcont,	SV *)
 
 PERLVAR(Isort_RealCmp,  SVCOMPARE_t)
 
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index e8f4715aa6..f00cb3382e 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -801,6 +801,12 @@ again and, if the entries are re-created too many times, dies with a
 [perl #78494] When pipes are shared between threads, the C<close> function
 (and any implicit close, such as on thread exit) no longer blocks.
 
+=item *
+
+Several contexts no longer allow a Unicode character to begin a word
+that should never begin words, for an example an accent that must follow
+another character previously could precede all other characters.
+
 =back
 
 =head1 Known Problems
diff --git a/proto.h b/proto.h
index 960e5f5246..d4642aa5eb 100644
--- a/proto.h
+++ b/proto.h
@@ -1801,6 +1801,18 @@ PERL_CALLCONV bool	Perl_is_utf8_xdigit(pTHX_ const U8 *p)
 #define PERL_ARGS_ASSERT_IS_UTF8_XDIGIT	\
 	assert(p)
 
+PERL_CALLCONV bool	Perl_is_utf8_xidcont(pTHX_ const U8 *p)
+			__attribute__warn_unused_result__
+			__attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_XIDCONT	\
+	assert(p)
+
+PERL_CALLCONV bool	Perl_is_utf8_xidfirst(pTHX_ const U8 *p)
+			__attribute__warn_unused_result__
+			__attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST	\
+	assert(p)
+
 PERL_CALLCONV OP*	Perl_jmaybe(pTHX_ OP *o)
 			__attribute__nonnull__(pTHX_1);
 #define PERL_ARGS_ASSERT_JMAYBE	\
diff --git a/sv.c b/sv.c
index 4bd68503fe..9254ad19e5 100644
--- a/sv.c
+++ b/sv.c
@@ -13119,7 +13119,9 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
     PL_utf8_tolower	= sv_dup_inc(proto_perl->Iutf8_tolower, param);
     PL_utf8_tofold	= sv_dup_inc(proto_perl->Iutf8_tofold, param);
     PL_utf8_idstart	= sv_dup_inc(proto_perl->Iutf8_idstart, param);
+    PL_utf8_xidstart	= sv_dup_inc(proto_perl->Iutf8_xidstart, param);
     PL_utf8_idcont	= sv_dup_inc(proto_perl->Iutf8_idcont, param);
+    PL_utf8_xidcont	= sv_dup_inc(proto_perl->Iutf8_xidcont, param);
     PL_utf8_foldable	= hv_dup_inc(proto_perl->Iutf8_foldable, param);
 
     /* Did the locale setup indicate UTF-8? */
diff --git a/utf8.c b/utf8.c
index b5d853188b..808d9a80a7 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1501,6 +1501,19 @@ Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
 }
 
 bool
+Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
+
+    if (*p == '_')
+	return TRUE;
+    /* is_utf8_idstart would be more logical. */
+    return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
+}
+
+bool
 Perl_is_utf8_idcont(pTHX_ const U8 *p)
 {
     dVAR;
@@ -1513,6 +1526,18 @@ Perl_is_utf8_idcont(pTHX_ const U8 *p)
 }
 
 bool
+Perl_is_utf8_xidcont(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
+
+    if (*p == '_')
+	return TRUE;
+    return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
+}
+
+bool
 Perl_is_utf8_alpha(pTHX_ const U8 *p)
 {
     dVAR;
author	Karl Williamson <public@khwilliamson.com>	2011-02-17 14:43:10 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-02-17 15:41:54 -0700
commit	c11ff9433950cda8448b773418d1cb2592eea29d (patch)
tree	f0284cd5865f0db8c7484123153ab8fb860aa129
parent	0167186c6da6afb0eb6708879a543c70c612fc45 (diff)
download	perl-c11ff9433950cda8448b773418d1cb2592eea29d.tar.gz