diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-02-17 14:43:10 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-02-17 15:41:54 -0700 |
commit | c11ff9433950cda8448b773418d1cb2592eea29d (patch) | |
tree | f0284cd5865f0db8c7484123153ab8fb860aa129 | |
parent | 0167186c6da6afb0eb6708879a543c70c612fc45 (diff) | |
download | perl-c11ff9433950cda8448b773418d1cb2592eea29d.tar.gz |
handy.h: isIDFIRST_utf8() changed to use XIDStart
Previously this used a home-grown definition of an identifier start,
stemming from a bug in some early Unicode versions. This led to some
problems, fixed by #74022.
But the home-grown solution did not track Unicode, and allowed for
characters, like marks, to begin words when they shouldn't. This change
brings this macro into compliance with Unicode going-forward.
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | embedvar.h | 4 | ||||
-rw-r--r-- | global.sym | 2 | ||||
-rw-r--r-- | handy.h | 17 | ||||
-rw-r--r-- | intrpvar.h | 2 | ||||
-rw-r--r-- | pod/perldelta.pod | 6 | ||||
-rw-r--r-- | proto.h | 12 | ||||
-rw-r--r-- | sv.c | 2 | ||||
-rw-r--r-- | utf8.c | 25 |
10 files changed, 64 insertions, 10 deletions
@@ -583,7 +583,9 @@ Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p Anpd |bool |is_utf8_string_loclen|NN const U8 *s|STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el ApR |bool |is_utf8_alnum |NN const U8 *p ApR |bool |is_utf8_idfirst|NN const U8 *p +ApR |bool |is_utf8_xidfirst|NN const U8 *p ApR |bool |is_utf8_idcont |NN const U8 *p +ApR |bool |is_utf8_xidcont |NN const U8 *p ApR |bool |is_utf8_alpha |NN const U8 *p ApR |bool |is_utf8_ascii |NN const U8 *p ApR |bool |is_utf8_space |NN const U8 *p @@ -251,6 +251,8 @@ #define is_utf8_string_loclen Perl_is_utf8_string_loclen #define is_utf8_upper(a) Perl_is_utf8_upper(aTHX_ a) #define is_utf8_xdigit(a) Perl_is_utf8_xdigit(aTHX_ a) +#define is_utf8_xidcont(a) Perl_is_utf8_xidcont(aTHX_ a) +#define is_utf8_xidfirst(a) Perl_is_utf8_xidfirst(aTHX_ a) #define leave_scope(a) Perl_leave_scope(aTHX_ a) #define lex_bufutf8() Perl_lex_bufutf8(aTHX) #define lex_discard_to(a) Perl_lex_discard_to(aTHX_ a) diff --git a/embedvar.h b/embedvar.h index 7b78a5ebdd..bb179fdcf4 100644 --- a/embedvar.h +++ b/embedvar.h @@ -361,6 +361,8 @@ #define PL_utf8_toupper (vTHX->Iutf8_toupper) #define PL_utf8_upper (vTHX->Iutf8_upper) #define PL_utf8_xdigit (vTHX->Iutf8_xdigit) +#define PL_utf8_xidcont (vTHX->Iutf8_xidcont) +#define PL_utf8_xidstart (vTHX->Iutf8_xidstart) #define PL_utf8cache (vTHX->Iutf8cache) #define PL_utf8locale (vTHX->Iutf8locale) #define PL_warnhook (vTHX->Iwarnhook) @@ -693,6 +695,8 @@ #define PL_Iutf8_toupper PL_utf8_toupper #define PL_Iutf8_upper PL_utf8_upper #define PL_Iutf8_xdigit PL_utf8_xdigit +#define PL_Iutf8_xidcont PL_utf8_xidcont +#define PL_Iutf8_xidstart PL_utf8_xidstart #define PL_Iutf8cache PL_utf8cache #define PL_Iutf8locale PL_utf8locale #define PL_Iwarnhook PL_warnhook diff --git a/global.sym b/global.sym index 9064f98699..dde11d4072 100644 --- a/global.sym +++ b/global.sym @@ -287,6 +287,8 @@ Perl_is_utf8_string_loc Perl_is_utf8_string_loclen Perl_is_utf8_upper Perl_is_utf8_xdigit +Perl_is_utf8_xidcont +Perl_is_utf8_xidfirst Perl_leave_scope Perl_lex_bufutf8 Perl_lex_discard_to @@ -883,16 +883,13 @@ EXTCONST U32 PL_charclass[]; #define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */ #define isALNUM_utf8(p) is_utf8_alnum(p) -/* The ID_Start of Unicode was originally quite limiting: it assumed an - * L-class character (meaning that you could not have, say, a CJK charac- - * ter). So, instead, perl has for a long time allowed ID_Continue but - * not digits. - * We still preserve that for backward compatibility. But we also make sure - * that it is alphanumeric, so S_scan_word in toke.c will not hang. See - * http://rt.perl.org/rt3/Ticket/Display.html?id=74022 - * for more detail than you ever wanted to know about. */ -#define isIDFIRST_utf8(p) \ - (is_utf8_idcont(p) && !is_utf8_digit(p) && is_utf8_alnum(p)) +/* To prevent S_scan_word in toke.c from hanging, we have to make sure that + * IDFIRST is an alnum. See + * http://rt.perl.org/rt3/Ticket/Display.html?id=74022 + * for more detail than you ever wanted to know about. This used to be not the + * XID version, but we decided to go with the more modern Unicode definition */ +#define isIDFIRST_utf8(p) (is_utf8_xidfirst(p) && is_utf8_alnum(p)) +#define isIDCONT_utf8(p) is_utf8_xidcont(p) #define isALPHA_utf8(p) is_utf8_alpha(p) #define isSPACE_utf8(p) is_utf8_space(p) #define isDIGIT_utf8(p) is_utf8_digit(p) diff --git a/intrpvar.h b/intrpvar.h index b12f21b283..a4beda6851 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -678,6 +678,8 @@ PERLVAR(Idebug_pad, struct perl_debug_pad) /* always needed because of the re ex PERLVAR(Iutf8_idstart, SV *) PERLVAR(Iutf8_idcont, SV *) +PERLVAR(Iutf8_xidstart, SV *) +PERLVAR(Iutf8_xidcont, SV *) PERLVAR(Isort_RealCmp, SVCOMPARE_t) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index e8f4715aa6..f00cb3382e 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -801,6 +801,12 @@ again and, if the entries are re-created too many times, dies with a [perl #78494] When pipes are shared between threads, the C<close> function (and any implicit close, such as on thread exit) no longer blocks. +=item * + +Several contexts no longer allow a Unicode character to begin a word +that should never begin words, for an example an accent that must follow +another character previously could precede all other characters. + =back =head1 Known Problems @@ -1801,6 +1801,18 @@ PERL_CALLCONV bool Perl_is_utf8_xdigit(pTHX_ const U8 *p) #define PERL_ARGS_ASSERT_IS_UTF8_XDIGIT \ assert(p) +PERL_CALLCONV bool Perl_is_utf8_xidcont(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_XIDCONT \ + assert(p) + +PERL_CALLCONV bool Perl_is_utf8_xidfirst(pTHX_ const U8 *p) + __attribute__warn_unused_result__ + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST \ + assert(p) + PERL_CALLCONV OP* Perl_jmaybe(pTHX_ OP *o) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_JMAYBE \ @@ -13119,7 +13119,9 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param); PL_utf8_tofold = sv_dup_inc(proto_perl->Iutf8_tofold, param); PL_utf8_idstart = sv_dup_inc(proto_perl->Iutf8_idstart, param); + PL_utf8_xidstart = sv_dup_inc(proto_perl->Iutf8_xidstart, param); PL_utf8_idcont = sv_dup_inc(proto_perl->Iutf8_idcont, param); + PL_utf8_xidcont = sv_dup_inc(proto_perl->Iutf8_xidcont, param); PL_utf8_foldable = hv_dup_inc(proto_perl->Iutf8_foldable, param); /* Did the locale setup indicate UTF-8? */ @@ -1501,6 +1501,19 @@ Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ } bool +Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */ +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST; + + if (*p == '_') + return TRUE; + /* is_utf8_idstart would be more logical. */ + return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart"); +} + +bool Perl_is_utf8_idcont(pTHX_ const U8 *p) { dVAR; @@ -1513,6 +1526,18 @@ Perl_is_utf8_idcont(pTHX_ const U8 *p) } bool +Perl_is_utf8_xidcont(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_XIDCONT; + + if (*p == '_') + return TRUE; + return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue"); +} + +bool Perl_is_utf8_alpha(pTHX_ const U8 *p) { dVAR; |