diff options
-rw-r--r-- | dist/Data-Dumper/Dumper.pm | 2 | ||||
-rw-r--r-- | dist/Data-Dumper/Dumper.xs | 23 | ||||
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | ext/POSIX/lib/POSIX.pod | 2 | ||||
-rw-r--r-- | lib/locale.pm | 84 | ||||
-rw-r--r-- | locale.c | 21 | ||||
-rw-r--r-- | numeric.c | 4 | ||||
-rw-r--r-- | op.c | 10 | ||||
-rw-r--r-- | perl.h | 27 | ||||
-rw-r--r-- | pod/perldelta.pod | 6 | ||||
-rw-r--r-- | pod/perlembed.pod | 2 | ||||
-rw-r--r-- | pod/perlfunc.pod | 13 | ||||
-rw-r--r-- | pod/perllocale.pod | 153 | ||||
-rw-r--r-- | pp.c | 36 | ||||
-rw-r--r-- | pp_sort.c | 2 | ||||
-rw-r--r-- | pp_sys.c | 2 | ||||
-rw-r--r-- | proto.h | 1 | ||||
-rw-r--r-- | t/run/locale.t | 49 | ||||
-rw-r--r-- | utf8.h | 7 |
20 files changed, 318 insertions, 128 deletions
diff --git a/dist/Data-Dumper/Dumper.pm b/dist/Data-Dumper/Dumper.pm index 6f7a0d8226..9afeac77b7 100644 --- a/dist/Data-Dumper/Dumper.pm +++ b/dist/Data-Dumper/Dumper.pm @@ -1398,7 +1398,7 @@ modify it under the same terms as Perl itself. =head1 VERSION -Version 2.152 (March 7 2014) +Version 2.153 (June 5 2014) =head1 SEE ALSO diff --git a/dist/Data-Dumper/Dumper.xs b/dist/Data-Dumper/Dumper.xs index e98c6d70d0..03515aec53 100644 --- a/dist/Data-Dumper/Dumper.xs +++ b/dist/Data-Dumper/Dumper.xs @@ -838,15 +838,24 @@ DD_dump(pTHX_ SV *val, const char *name, STRLEN namelen, SV *retval, HV *seenhv, av_push(keys, sv); } # ifdef USE_LOCALE_NUMERIC - sortsv(AvARRAY(keys), - av_len(keys)+1, - IN_LOCALE ? Perl_sv_cmp_locale : Perl_sv_cmp); -# else - sortsv(AvARRAY(keys), - av_len(keys)+1, - Perl_sv_cmp); +# ifdef IN_LC /* Use this if available */ + if (IN_LC(LC_COLLATE)) +# else + if (IN_LOCALE) +# endif + { + sortsv(AvARRAY(keys), + av_len(keys)+1, + Perl_sv_cmp_locale); + } + else # endif #endif + { + sortsv(AvARRAY(keys), + av_len(keys)+1, + Perl_sv_cmp); + } } if (sortkeys != &PL_sv_yes) { dSP; ENTER; SAVETMPS; PUSHMARK(sp); @@ -1102,6 +1102,7 @@ ApOM |void |new_numeric |NULLOK const char* newcoll Ap |void |set_numeric_local Ap |void |set_numeric_radix Ap |void |set_numeric_standard +ApM |bool |_is_in_locale_category|const bool compiling|const int category ApdO |void |require_pv |NN const char* pv Apd |void |pack_cat |NN SV *cat|NN const char *pat|NN const char *patend \ |NN SV **beglist|NN SV **endlist|NN SV ***next_in_list|U32 flags @@ -27,6 +27,7 @@ /* Hide global symbols */ #define Gv_AMupdate(a,b) Perl_Gv_AMupdate(aTHX_ a,b) +#define _is_in_locale_category(a,b) Perl__is_in_locale_category(aTHX_ a,b) #define _is_uni_FOO(a,b) Perl__is_uni_FOO(aTHX_ a,b) #define _is_uni_perl_idcont(a) Perl__is_uni_perl_idcont(aTHX_ a) #define _is_uni_perl_idstart(a) Perl__is_uni_perl_idstart(aTHX_ a) diff --git a/ext/POSIX/lib/POSIX.pod b/ext/POSIX/lib/POSIX.pod index 59cbd22fc2..61b0f71c45 100644 --- a/ext/POSIX/lib/POSIX.pod +++ b/ext/POSIX/lib/POSIX.pod @@ -1186,7 +1186,7 @@ L<a section devoted to this function|perllocale/The setlocale function>. The discussion here is merely a summary reference for C<setlocale()>. Note that Perl itself is almost entirely unaffected by the locale except within the scope of S<C<"use locale">>. (Exceptions are listed -in L<perllocale/Not within the scope of any "use locale" variant>.) +in L<perllocale/Not within the scope of "use locale">.) The following examples assume diff --git a/lib/locale.pm b/lib/locale.pm index f7575f5007..52279a1693 100644 --- a/lib/locale.pm +++ b/lib/locale.pm @@ -1,6 +1,6 @@ package locale; -our $VERSION = '1.03'; +our $VERSION = '1.04'; use Config; $Carp::Internal{ (__PACKAGE__) } = 1; @@ -35,40 +35,84 @@ to behave as if in the "C" locale; attempts to change the locale will fail. =cut -# A separate bit is used for each of the two forms of the pragma, as they are -# mostly independent, and interact with each other and the unicode_strings -# feature. This allows for fast determination of which one(s) of the three -# are to be used at any given point, and no code has to be written to deal -# with coming in and out of scopes--it falls automatically out from the hint -# handling +# A separate bit is used for each of the two forms of the pragma, to save +# having to look at %^H for the normal case of a plain 'use locale' without an +# argument. $locale::hint_bits = 0x4; -$locale::not_chars_hint_bits = 0x10; +$locale::partial_hint_bits = 0x10; # If pragma has an argument + +# The pseudo-category :characters consists of 2 real ones; but it also is +# given its own number, -1, because in the complement form it also has the +# side effect of "use feature 'unicode_strings'" sub import { shift; # should be 'locale'; not checked - my $found_not_chars = 0; - while (defined (my $arg = shift)) { - if ($arg eq ":not_characters") { - $^H |= $locale::not_chars_hint_bits; + $^H{locale} = 0 unless defined $^H{locale}; + if (! @_) { # If no parameter, use the plain form that changes all categories + $^H |= $locale::hint_bits; + + } + else { + my @categories = ( qw(:ctype :collate :messages + :numeric :monetary :time) ); + for (my $i = 0; $i < @_; $i++) { + my $arg = $_[$i]; + $complement = $arg =~ s/ : ( ! | not_ ) /:/x; + if (! grep { $arg eq $_ } @categories, ":characters") { + require Carp; + Carp::croak("Unknown parameter '$_[$i]' to 'use locale'"); + } + + if ($complement) { + if ($i != 0 || $i < @_ - 1) { + require Carp; + Carp::croak("Only one argument to 'use locale' allowed" + . "if is $complement"); + } + + if ($arg eq ':characters') { + push @_, grep { $_ ne ':ctype' && $_ ne ':collate' } + @categories; + # We add 1 to the category number; This category number + # is -1 + $^H{locale} |= (1 << 0); + } + else { + push @_, grep { $_ ne $arg } @categories; + } + next; + } + elsif ($arg eq ':characters') { + push @_, ':ctype', ':collate'; + next; + } + + $^H |= $locale::partial_hint_bits; # This form of the pragma overrides the other $^H &= ~$locale::hint_bits; - $found_not_chars = 1; - } - else { - require Carp; - Carp::croak("Unknown parameter '$arg' to 'use locale'"); + + $arg =~ s/^://; + + # Map our names to the ones defined by POSIX + $arg = "LC_" . uc($arg); + use POSIX 'locale_h'; + my $bit = eval "&POSIX::$arg"; + if (defined $bit) { + # 1 is added so that the pseudo-category :characters, which is + # -1, comes out 0. + $^H{locale} |= 1 << ($bit + 1); + } } } - # Use the plain form if not doing the :not_characters one. - $^H |= $locale::hint_bits unless $found_not_chars; } sub unimport { - $^H &= ~($locale::hint_bits|$locale::not_chars_hint_bits); + $^H &= ~($locale::hint_bits|$locale::partial_hint_bits); + $^H{locale} = 0; } 1; @@ -1385,6 +1385,27 @@ Perl__is_cur_LC_category_utf8(pTHX_ int category) #endif + +bool +Perl__is_in_locale_category(pTHX_ const bool compiling, const int category) +{ + /* Internal function which returns if we are in the scope of a pragma that + * enables the locale category 'category'. 'compiling' should indicate if + * this is during the compilation phase (TRUE) or not (FALSE). */ + + const COP * const cop = (compiling) ? &PL_compiling : PL_curcop; + + SV *categories = cop_hints_fetch_pvs(cop, "locale", 0); + if (! categories || categories == &PL_sv_placeholder) { + return FALSE; + } + + /* The pseudo-category 'not_characters' is -1, so just add 1 to each to get + * a valid unsigned */ + assert(category >= -1); + return cBOOL(SvUV(categories) & (1U << (category + 1))); +} + /* * Local variables: * c-indentation-style: bsd @@ -528,7 +528,7 @@ Perl_grok_numeric_radix(pTHX_ const char **sp, const char *send) PERL_ARGS_ASSERT_GROK_NUMERIC_RADIX; - if (IN_SOME_LOCALE_FORM) { + if (IN_LC(LC_NUMERIC)) { DECLARE_STORE_LC_NUMERIC_SET_TO_NEEDED(); if (PL_numeric_radix_sv) { STRLEN len; @@ -860,7 +860,7 @@ Perl_my_atof(pTHX_ const char* s) { DECLARE_STORE_LC_NUMERIC_SET_TO_NEEDED(); - if (PL_numeric_radix_sv && IN_SOME_LOCALE_FORM) { + if (PL_numeric_radix_sv && IN_LC(LC_NUMERIC)) { const char *standard = NULL, *local = NULL; bool use_standard_radix; @@ -3523,14 +3523,20 @@ S_fold_constants(pTHX_ OP *o) case OP_UC: case OP_LC: case OP_FC: + if (IN_LC_COMPILETIME(LC_CTYPE)) + goto nope; + break; case OP_SLT: case OP_SGT: case OP_SLE: case OP_SGE: case OP_SCMP: + if (IN_LC_COMPILETIME(LC_COLLATE)) + goto nope; + break; case OP_SPRINTF: /* XXX what about the numeric ops? */ - if (IN_LOCALE_COMPILETIME) + if (IN_LC_COMPILETIME(LC_NUMERIC)) goto nope; break; case OP_PACK: @@ -4731,7 +4737,7 @@ Perl_newPMOP(pTHX_ I32 type, I32 flags) if (PL_hints & HINT_RE_TAINT) pmop->op_pmflags |= PMf_RETAINT; - if (IN_LOCALE_COMPILETIME) { + if (IN_LC_COMPILETIME(LC_CTYPE)) { set_regex_charset(&(pmop->op_pmflags), REGEX_LOCALE_CHARSET); } else if (IN_UNI_8_BIT) { @@ -4794,12 +4794,18 @@ typedef enum { However, bitops store HINT_INTEGER in their op_private. NOTE: The typical module using these has the bit value hard-coded, so don't - blindly change the values of these */ + blindly change the values of these. + + If we run out of bits, the 2 locale ones could be combined. The PARTIAL one + is for "use locale 'FOO'" which excludes some categories. It requires going + to %^H to find out which are in and which are out. This could be extended + for the normal case of a plain HINT_LOCALE, so that %^H would be used for + any locale form. */ #define HINT_INTEGER 0x00000001 /* integer pragma */ #define HINT_STRICT_REFS 0x00000002 /* strict pragma */ #define HINT_LOCALE 0x00000004 /* locale pragma */ #define HINT_BYTES 0x00000008 /* bytes pragma */ -#define HINT_LOCALE_NOT_CHARS 0x00000010 /* locale ':not_characters' pragma */ +#define HINT_LOCALE_PARTIAL 0x00000010 /* locale, but a subset of categories */ #define HINT_EXPLICIT_STRICT_REFS 0x00000020 /* strict.pm */ #define HINT_EXPLICIT_STRICT_SUBS 0x00000040 /* strict.pm */ @@ -5303,17 +5309,18 @@ typedef struct am_table_short AMTS; #define PERLDB_SAVESRC_NOSUBS (PL_perldb && (PL_perldb & PERLDBf_SAVESRC_NOSUBS)) #define PERLDB_SAVESRC_INVALID (PL_perldb && (PL_perldb & PERLDBf_SAVESRC_INVALID)) +/* These locale things are all subject to change */ /* Returns TRUE if the plain locale pragma without a parameter is in effect */ #define IN_LOCALE_RUNTIME cBOOL(CopHINTS_get(PL_curcop) & HINT_LOCALE) /* Returns TRUE if either form of the locale pragma is in effect */ #define IN_SOME_LOCALE_FORM_RUNTIME \ - cBOOL(CopHINTS_get(PL_curcop) & (HINT_LOCALE|HINT_LOCALE_NOT_CHARS)) + cBOOL(CopHINTS_get(PL_curcop) & (HINT_LOCALE|HINT_LOCALE_PARTIAL)) #define IN_LOCALE_COMPILETIME cBOOL(PL_hints & HINT_LOCALE) #define IN_SOME_LOCALE_FORM_COMPILETIME \ - cBOOL(PL_hints & (HINT_LOCALE|HINT_LOCALE_NOT_CHARS)) + cBOOL(PL_hints & (HINT_LOCALE|HINT_LOCALE_PARTIAL)) #define IN_LOCALE \ (IN_PERL_COMPILETIME ? IN_LOCALE_COMPILETIME : IN_LOCALE_RUNTIME) @@ -5321,6 +5328,16 @@ typedef struct am_table_short AMTS; (IN_PERL_COMPILETIME ? IN_SOME_LOCALE_FORM_COMPILETIME \ : IN_SOME_LOCALE_FORM_RUNTIME) +#define IN_LC_ALL_COMPILETIME IN_LOCALE_COMPILETIME +#define IN_LC_ALL_RUNTIME IN_LOCALE_RUNTIME + +#define IN_LC_PARTIAL_COMPILETIME cBOOL(PL_hints & HINT_LOCALE_PARTIAL) +#define IN_LC_PARTIAL_RUNTIME cBOOL(CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL) + +#define IN_LC_COMPILETIME(category) (IN_LC_ALL_COMPILETIME || (IN_LC_PARTIAL_COMPILETIME && _is_in_locale_category(TRUE, (category)))) +#define IN_LC_RUNTIME(category) (IN_LC_ALL_RUNTIME || (IN_LC_PARTIAL_RUNTIME && _is_in_locale_category(FALSE, (category)))) +#define IN_LC(category) (IN_LC_COMPILETIME(category) || IN_LC_RUNTIME(category)) + #ifdef USE_LOCALE_NUMERIC /* These macros are for toggling between the underlying locale (LOCAL) and the @@ -5345,7 +5362,7 @@ typedef struct am_table_short AMTS; void (*_restore_LC_NUMERIC_function)(pTHX) = NULL; #define STORE_LC_NUMERIC_SET_TO_NEEDED() \ - if (IN_SOME_LOCALE_FORM) { \ + if (IN_LC(LC_NUMERIC)) { \ if (_NOT_IN_NUMERIC_LOCAL) { \ set_numeric_local(); \ _restore_LC_NUMERIC_function = &Perl_set_numeric_standard; \ diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 21c59b626e..79de1c44db 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -41,6 +41,12 @@ U+2028 LINE SEPARATOR, and U+2029 PARAGRAPH SEPARATOR. +=head2 S<C<use locale>> can restrict which locale categories are affected + +It is now possible to pass a parameter to S<C<use locale>> to specify +a subset of locale categories to be locale-aware, with the remaining +ones unaffected. See L<perllocale/The "use locale" pragma> for details. + =head1 Security XXX Any security-related notices go here. In particular, any security diff --git a/pod/perlembed.pod b/pod/perlembed.pod index 4560ecc786..596f28781a 100644 --- a/pod/perlembed.pod +++ b/pod/perlembed.pod @@ -1089,7 +1089,7 @@ When a Perl interpreter normally starts up, it tells the system it wants to use the system's default locale. This is often, but not necessarily, the "C" or "POSIX" locale. Absent a S<C<"use locale">> within the perl code, this mostly has no effect (but see L<perllocale/Not within the -scope of any "use locale" variant>). Also, there is not a problem if the +scope of "use locale">). Also, there is not a problem if the locale you want to use in your embedded Perl is the same as the system default. However, this doesn't work if you have set up and want to use a locale that isn't the system default one. Starting in Perl v5.20, you diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index aff2cd5660..5b9c0058d6 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -3337,9 +3337,9 @@ What gets returned depends on several factors: The results follow ASCII rules. Only the characters C<A-Z> change, to C<a-z> respectively. -=item Otherwise, if C<use locale> (but not C<use locale ':not_characters'>) is in effect: +=item Otherwise, if C<use locale> for C<LC_CTYPE> is in effect: -Respects current LC_CTYPE locale for code points < 256; and uses Unicode +Respects current C<LC_CTYPE> locale for code points < 256; and uses Unicode rules for the remaining code points (this last can only happen if the UTF8 flag is also set). See L<perllocale>. @@ -5240,10 +5240,11 @@ LIST are actually parsed as a single list. The first argument of the list will be interpreted as the C<printf> format. This means that C<printf(@_)> will use C<$_[0]> as the format. See L<sprintf|/sprintf FORMAT, LIST> for an -explanation of the format argument. If C<use locale> (including -C<use locale ':not_characters'>) is in effect and +explanation of the format argument. If C<use locale> for C<LC_NUMERIC> +Look for this throught pod +is in effect and POSIX::setlocale() has been called, the character used for the decimal -separator in formatted floating-point numbers is affected by the LC_NUMERIC +separator in formatted floating-point numbers is affected by the C<LC_NUMERIC> locale setting. See L<perllocale> and L<POSIX>. For historical reasons, if you omit the list, C<$_> is used as the format; @@ -7373,7 +7374,7 @@ index, the C<$> may need escaping: If C<use locale> (including C<use locale 'not_characters'>) is in effect and POSIX::setlocale() has been called, the character used for the decimal separator in formatted floating-point -numbers is affected by the LC_NUMERIC locale. See L<perllocale> +numbers is affected by the C<LC_NUMERIC> locale. See L<perllocale> and L<POSIX>. =item sqrt EXPR diff --git a/pod/perllocale.pod b/pod/perllocale.pod index 19ec397b9e..f371b9b66c 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -91,7 +91,7 @@ This indicates, for example if a character is an uppercase letter. Some platforms have other categories, dealing with such things as measurement units and paper sizes. None of these are used directly by Perl, but outside operations that Perl interacts with may use -these. See L</Not within the scope of any "use locale" variant> below. +these. See L</Not within the scope of "use locale"> below. =back @@ -140,7 +140,7 @@ C<define>. If you want a Perl application to process and present your data according to a particular locale, the application code should include -the S<C<use locale>> pragma (see L<The use locale pragma>) where +the S<C<use locale>> pragma (see L<The "use locale" pragma>) where appropriate, and B<at least one> of the following must be true: =over 4 @@ -160,24 +160,12 @@ L<The setlocale function>. =head1 USING LOCALES -=head2 The use locale pragma +=head2 The C<"use locale"> pragma By default, Perl itself ignores the current locale. The S<C<use locale>> pragma tells Perl to use the current locale for some operations. -Starting in v5.16, there is an optional parameter to this pragma: - - use locale ':not_characters'; - -This parameter allows better mixing of locales and Unicode (less useful -in v5.20 and later), and is -described fully in L</Unicode and UTF-8>, but briefly, it tells Perl to -not use the character portions of the locale definition, that is -the C<LC_CTYPE> and C<LC_COLLATE> categories. Instead it will use the -native character set (extended by Unicode). When using this parameter, -you are responsible for getting the external character set translated -into the native/Unicode one (which it already will be if it is one of -the increasingly popular UTF-8 locales). There are convenient ways of -doing this, as described in L</Unicode and UTF-8>. +Starting in v5.16, there are optional parameters to this pragma, +described below, which restrict which operations are affected by it. The current locale is set at execution time by L<setlocale()|/The setlocale function> described below. If that function @@ -194,9 +182,10 @@ The operations that are affected by locale are: =over 4 -=item B<Not within the scope of any C<"use locale"> variant> +=item B<Not within the scope of C<"use locale">> -Only operations originating outside Perl should be affected, as follows: +Only certain operations originating outside Perl should be affected, as +follows: =over 4 @@ -252,7 +241,7 @@ E<160> =item B<Lingering effects of C<S<use locale>>> Certain Perl operations that are set-up within the scope of a -C<use locale> variant retain that effect even outside the scope. +C<use locale> retain that effect even outside the scope. These include: =over 4 @@ -262,7 +251,7 @@ These include: The output format of a L<write()|perlfunc/write> is determined by an earlier format declaration (L<perlfunc/format>), so whether or not the output is affected by locale is determined by if the C<format()> is -within the scope of a C<use locale> variant, not whether the C<write()> +within the scope of a C<use locale>, not whether the C<write()> is. =item * @@ -278,15 +267,16 @@ behavior, not if the matches are done within such a scope or not. =for comment The nbsp below makes this look better (though not great) + E<160> -=item B<Under C<"use locale ':not_characters';">> +=item B<Under C<"use locale";>> =over 4 =item * -All the non-Perl operations. +All the above operations =item * @@ -303,21 +293,6 @@ C<say()>, and C<sprintf()>. -=back - -=for comment -The nbsp below makes this look better (though not great) - -E<160> - -=item B<Under just plain C<"use locale";>> - -=over 4 - -=item * - -All the above operations - =item * B<The comparison operators> (C<lt>, C<le>, C<cmp>, C<ge>, and C<gt>) use @@ -353,6 +328,66 @@ The string result of any operation that uses locale information is tainted, as it is possible for a locale to be untrustworthy. See L<"SECURITY">. +Starting in Perl v5.16 in a very limited way, and more generally in +v5.22, you can restrict which category or categories are enabled by this +particular instance of the pragma by adding parameters to it. For +example, + + use locale qw(:ctype :numeric); + +enables locale awareness within its scope of only those operations +(listed above) that are affected by C<LC_CTYPE> and C<LC_NUMERIC>. + +The possible categories are: C<:collate>, C<:ctype>, C<:messages>, +C<:monetary>, C<:numeric>, C<:time>, and the pseudo category +C<:characters> (described below). + +Thus you can say + + use locale ':messages'; + +and only L<$!|perlvar/$ERRNO> and L<$^E|perlvar/$EXTENDED_OS_ERROR> +will be locale aware. Everything else is unaffected. + +Since Perl doesn't currently do anything with the C<LC_MONETARY> +category, specifying C<:monetary> does effectively nothing. Some +systems have other categories, such as C<LC_PAPER_SIZE>, but Perl +also doesn't know anything about them, and there is no way to specify +them in this pragma's arguments. + +You can also easily say to use all categories but one, by either, for +example, + + use locale ':!ctype'; + use locale ':not_ctype'; + +both of which mean to enable locale awarness of all categories but +C<LC_CTYPE>. Only one category argument may be specified in a +S<C<use locale>> if it is of the negated form. + +Prior to v5.22 only one form of the pragma with arguments is available: + + use locale ':not_characters'; + +(and you have to say C<not_>; you can't use the bang C<!> form). This +pseudo category is a shorthand for specifying both C<:collate> and +C<:ctype>. Hence, in the negated form, it is nearly the same thing as +saying + + use locale qw(:messages :monetary :numeric :time); + +We use the term "nearly", because C<:not_characters> also turns on +S<C<use feature 'unicode_strings'>> within its scope. This form is +less useful in v5.20 and later, and is described fully in +L</Unicode and UTF-8>, but briefly, it tells Perl to not use the +character portions of the locale definition, that is the C<LC_CTYPE> and +C<LC_COLLATE> categories. Instead it will use the native character set +(extended by Unicode). When using this parameter, you are responsible +for getting the external character set translated into the +native/Unicode one (which it already will be if it is one of the +increasingly popular UTF-8 locales). There are convenient ways of doing +this, as described in L</Unicode and UTF-8>. + =head2 The setlocale function You can switch locales as often as you wish at run time with the @@ -419,8 +454,8 @@ return to the default that was in force when Perl started up: changes to the environment made by the application after startup may or may not be noticed, depending on your system's C library. -Note that Perl ignores the current C<LC_CTYPE> and C<LC_COLLATE> locales -within the scope of a C<use locale ':not_characters'>. +Note that when a form of C<use locale> that doesn't include all +categories is specified, Perl ignores the excluded categories. If C<set_locale()> fails for some reason (for example, an attempt to set to a locale unknown to the system), the locale for the category is not @@ -720,8 +755,8 @@ basic category at a time. See L<"ENVIRONMENT"> for a discussion of these. =head2 Category C<LC_COLLATE>: Collation -In the scope of S<C<use locale>> (but not a -C<use locale ':not_characters'>), Perl looks to the C<LC_COLLATE> +In the scope of a S<C<use locale>> form that includes collation, Perl +looks to the C<LC_COLLATE> environment variable to determine the application's notions on collation (ordering) of characters. For example, "b" follows "a" in Latin alphabets, but where do "E<aacute>" and "E<aring>" belong? And while @@ -807,8 +842,8 @@ always obey the current C<LC_COLLATE> locale. =head2 Category C<LC_CTYPE>: Character Types -In the scope of S<C<use locale>> (but not a -C<use locale ':not_characters'>), Perl obeys the C<LC_CTYPE> locale +In the scope of a S<C<use locale>> form that includes C<LC_CTYPE>, Perl +obeys the C<LC_CTYPE> locale setting. This controls the application's notion of which characters are alphabetic, numeric, punctuation, I<etc>. This affects Perl's C<\w> regular expression metanotation, @@ -866,10 +901,10 @@ should use C<\w> with the C</a> regular expression modifier. See L<"SECURITY">. =head2 Category C<LC_NUMERIC>: Numeric Formatting -After a proper C<POSIX::setlocale()> call, and within the scope of one -of the C<use locale> variants, Perl obeys the C<LC_NUMERIC> -locale information, which controls an application's idea of how numbers -should be formatted for human readability. +After a proper C<POSIX::setlocale()> call, and within the scope of +of a C<use locale> form that includes numerics, Perl obeys the +C<LC_NUMERIC> locale information, which controls an application's idea +of how numbers should be formatted for human readability. In most implementations the only effect is to change the character used for the decimal point--perhaps from "." to ",". The functions aren't aware of such niceties as thousands separation and @@ -1011,8 +1046,8 @@ Scalar true/false (or less/equal/greater) result is never tainted. B<Case-mapping interpolation> (with C<\l>, C<\L>, C<\u>, C<\U>, or C<\F>) -Result string containing interpolated material is tainted if -C<use locale> (but not S<C<use locale ':not_characters'>>) is in effect. +The result string containing interpolated material is tainted if +a C<use locale> form that includes C<LC_CTYPE> is in effect. =item * @@ -1021,8 +1056,8 @@ B<Matching operator> (C<m//>): Scalar true/false result never tainted. All subpatterns, either delivered as a list-context result or as C<$1> -I<etc>., are tainted if C<use locale> (but not -S<C<use locale ':not_characters'>>) is in effect, and the subpattern +I<etc>., are tainted if a C<use locale> form that includes +C<LC_CTYPE> is in effect, and the subpattern regular expression contains a locale-dependent construct. These constructs include C<\w> (to match an alphanumeric character), C<\W> (non-alphanumeric character), C<\b> and C<\B> (word-boundary and @@ -1046,8 +1081,8 @@ The matched-pattern variables, C<$&>, C<$`> (pre-match), C<$'> B<Substitution operator> (C<s///>): Has the same behavior as the match operator. Also, the left -operand of C<=~> becomes tainted when C<use locale> -(but not S<C<use locale ':not_characters'>>) is in effect if modified as +operand of C<=~> becomes tainted when a C<use locale> +form that includes C<LC_CTYPE> is in effect, if modified as a result of a substitution based on a regular expression match involving any of the things mentioned in the previous item, or of case-mapping, such as C<\l>, C<\L>,C<\u>, C<\U>, or C<\F>. @@ -1064,8 +1099,8 @@ effect. B<Case-mapping functions> (C<lc()>, C<lcfirst()>, C<uc()>, C<ucfirst()>): -Results are tainted if C<use locale> (but not -S<C<use locale ':not_characters'>>) is in effect. +Results are tainted if a C<use locale> form that includes C<LC_CTYPE> is +in effect. =item * @@ -1274,6 +1309,10 @@ something like: This prints C<2.7>. +You could also exclude C<LC_NUMERIC>, if you don't need it, by + + use locale ':!numeric'; + =head2 Backward compatibility Versions of Perl prior to 5.004 B<mostly> ignored locale information, @@ -1282,7 +1321,7 @@ always in force, even if the program environment suggested otherwise (see L<The setlocale function>). By default, Perl still behaves this way for backward compatibility. If you want a Perl application to pay attention to locale information, you B<must> use the S<C<use locale>> -pragma (see L<The use locale pragma>) or, in the unlikely event +pragma (see L<The "use locale" pragma>) or, in the unlikely event that you want to do so for just pattern matching, the C</l> regular expression modifier (see L<perlre/Character set modifiers>) to instruct it to do so. @@ -2133,7 +2133,7 @@ PP(pp_sle) tryAMAGICbin_MG(amg_type, AMGf_set); { dPOPTOPssrl; - const int cmp = (IN_LOCALE_RUNTIME + const int cmp = (IN_LC_RUNTIME(LC_COLLATE) ? sv_cmp_locale_flags(left, right, 0) : sv_cmp_flags(left, right, 0)); SETs(boolSV(cmp * multiplier < rhs)); @@ -2169,7 +2169,7 @@ PP(pp_scmp) tryAMAGICbin_MG(scmp_amg, 0); { dPOPTOPssrl; - const int cmp = (IN_LOCALE_RUNTIME + const int cmp = (IN_LC_RUNTIME(LC_COLLATE) ? sv_cmp_locale_flags(left, right, 0) : sv_cmp_flags(left, right, 0)); SETi( cmp ); @@ -3506,10 +3506,10 @@ PP(pp_ucfirst) doing_utf8 = TRUE; ulen = UTF8SKIP(s); if (op_type == OP_UCFIRST) { - _to_utf8_title_flags(s, tmpbuf, &tculen, IN_LOCALE_RUNTIME); + _to_utf8_title_flags(s, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE)); } else { - _to_utf8_lower_flags(s, tmpbuf, &tculen, IN_LOCALE_RUNTIME); + _to_utf8_lower_flags(s, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE)); } /* we can't do in-place if the length changes. */ @@ -3527,11 +3527,11 @@ PP(pp_ucfirst) if (op_type == OP_LCFIRST) { /* lower case the first letter: no trickiness for any character */ - *tmpbuf = (IN_LOCALE_RUNTIME) ? toLOWER_LC(*s) : + *tmpbuf = (IN_LC_RUNTIME(LC_CTYPE)) ? toLOWER_LC(*s) : ((IN_UNI_8_BIT) ? toLOWER_LATIN1(*s) : toLOWER(*s)); } /* is ucfirst() */ - else if (IN_LOCALE_RUNTIME) { + else if (IN_LC_RUNTIME(LC_CTYPE)) { if (IN_UTF8_CTYPE_LOCALE) { goto do_uni_rules; } @@ -3683,7 +3683,7 @@ PP(pp_ucfirst) SvCUR_set(dest, need - 1); } } - if (IN_LOCALE_RUNTIME) { + if (IN_LC_RUNTIME(LC_CTYPE)) { TAINT; SvTAINTED_on(dest); } @@ -3714,7 +3714,7 @@ PP(pp_uc) (SvTEMP(source) && !SvSMAGICAL(source) && SvREFCNT(source) == 1)) && !SvREADONLY(source) && SvPOK(source) && !DO_UTF8(source) - && ((IN_LOCALE_RUNTIME) + && ((IN_LC_RUNTIME(LC_CTYPE)) ? ! IN_UTF8_CTYPE_LOCALE : ! IN_UNI_8_BIT)) { @@ -3781,7 +3781,7 @@ PP(pp_uc) * and copy it to the output buffer */ u = UTF8SKIP(s); - uv = _to_utf8_upper_flags(s, tmpbuf, &ulen, IN_LOCALE_RUNTIME); + uv = _to_utf8_upper_flags(s, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE)); #define GREEK_CAPITAL_LETTER_IOTA 0x0399 #define COMBINING_GREEK_YPOGEGRAMMENI 0x0345 if (uv == GREEK_CAPITAL_LETTER_IOTA @@ -3824,7 +3824,7 @@ PP(pp_uc) /* Use locale casing if in locale; regular style if not treating * latin1 as having case; otherwise the latin1 casing. Do the * whole thing in a tight loop, for speed, */ - if (IN_LOCALE_RUNTIME) { + if (IN_LC_RUNTIME(LC_CTYPE)) { if (IN_UTF8_CTYPE_LOCALE) { goto do_uni_rules; } @@ -3926,7 +3926,7 @@ PP(pp_uc) SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); } } /* End of isn't utf8 */ - if (IN_LOCALE_RUNTIME) { + if (IN_LC_RUNTIME(LC_CTYPE)) { TAINT; SvTAINTED_on(dest); } @@ -3987,7 +3987,7 @@ PP(pp_lc) const STRLEN u = UTF8SKIP(s); STRLEN ulen; - _to_utf8_lower_flags(s, tmpbuf, &ulen, IN_LOCALE_RUNTIME); + _to_utf8_lower_flags(s, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE)); /* Here is where we would do context-sensitive actions. See the * commit message for 86510fb15 for why there isn't any */ @@ -4024,7 +4024,7 @@ PP(pp_lc) /* Use locale casing if in locale; regular style if not treating * latin1 as having case; otherwise the latin1 casing. Do the * whole thing in a tight loop, for speed, */ - if (IN_LOCALE_RUNTIME) { + if (IN_LC_RUNTIME(LC_CTYPE)) { for (; s < send; d++, s++) *d = toLOWER_LC(*s); } @@ -4044,7 +4044,7 @@ PP(pp_lc) SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); } } - if (IN_LOCALE_RUNTIME) { + if (IN_LC_RUNTIME(LC_CTYPE)) { TAINT; SvTAINTED_on(dest); } @@ -4081,7 +4081,7 @@ PP(pp_quotemeta) /* In locale, we quote all non-ASCII Latin1 chars. * Otherwise use the quoting rules */ - if (IN_LOCALE_RUNTIME + if (IN_LC_RUNTIME(LC_CTYPE) || _isQUOTEMETA(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s + 1)))) { to_quote = TRUE; @@ -4143,7 +4143,7 @@ PP(pp_fc) U8 tmpbuf[UTF8_MAXBYTES_CASE + 1]; const bool full_folding = TRUE; const U8 flags = ( full_folding ? FOLD_FLAGS_FULL : 0 ) - | ( IN_LOCALE_RUNTIME ? FOLD_FLAGS_LOCALE : 0 ); + | ( IN_LC_RUNTIME(LC_CTYPE) ? FOLD_FLAGS_LOCALE : 0 ); /* This is a facsimile of pp_lc, but with a thousand bugs thanks to me. * You are welcome(?) -Hugmeir @@ -4191,7 +4191,7 @@ PP(pp_fc) SvUTF8_on(dest); } /* Unflagged string */ else if (len) { - if ( IN_LOCALE_RUNTIME ) { /* Under locale */ + if ( IN_LC_RUNTIME(LC_CTYPE) ) { /* Under locale */ if (IN_UTF8_CTYPE_LOCALE) { goto do_uni_folding; } @@ -4270,7 +4270,7 @@ PP(pp_fc) *d = '\0'; SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); - if (IN_LOCALE_RUNTIME) { + if (IN_LC_RUNTIME(LC_CTYPE)) { TAINT; SvTAINTED_on(dest); } @@ -1727,7 +1727,7 @@ PP(pp_sort) ? ( ( ( priv & OPpSORT_INTEGER) || all_SIVs) ? ( overloading ? S_amagic_i_ncmp : S_sv_i_ncmp) : ( overloading ? S_amagic_ncmp : S_sv_ncmp ) ) - : ( IN_LOCALE_RUNTIME + : ( IN_LC_RUNTIME(LC_COLLATE) ? ( overloading ? (SVCOMPARE_t)S_amagic_cmp_locale : (SVCOMPARE_t)sv_cmp_locale_static) @@ -3470,7 +3470,7 @@ PP(pp_fttext) #else else if (*s & 128) { #ifdef USE_LOCALE - if (IN_LOCALE_RUNTIME && isALPHA_LC(*s)) + if (IN_LC_RUNTIME(LC_CTYPE) && isALPHA_LC(*s)) continue; #endif /* utf8 characters don't count as odd */ @@ -42,6 +42,7 @@ PERL_CALLCONV void Perl_Slab_Free(pTHX_ void *op) #define PERL_ARGS_ASSERT_SLAB_FREE \ assert(op) +PERL_CALLCONV bool Perl__is_in_locale_category(pTHX_ const bool compiling, const int category); PERL_CALLCONV bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) __attribute__warn_unused_result__; diff --git a/t/run/locale.t b/t/run/locale.t index e662e3ca32..4ecc2b24f1 100644 --- a/t/run/locale.t +++ b/t/run/locale.t @@ -111,7 +111,8 @@ EOF "format() does not look at LC_NUMERIC without 'use locale'"); { - fresh_perl_is(<<'EOF', $difference, {}, + fresh_perl_is(<<'EOF', "$difference\n", {}, +use POSIX; use locale; format STDOUT = @.# @@ -133,15 +134,55 @@ EOF } { + my $categories = ":collate :characters :collate :ctype :monetary :time"; + fresh_perl_is(<<"EOF", "4.2", {}, +use locale qw($categories); +format STDOUT = +@.# +4.179 +. +write; +EOF + "format() does not look at LC_NUMERIC with 'use locale qw($categories)'"); + } + + { + fresh_perl_is(<<'EOF', $difference, {}, +use locale; +format STDOUT = +@.# +4.179 +. +write; +EOF + "format() looks at LC_NUMERIC with 'use locale'"); + } + + for my $category (qw(collate characters collate ctype monetary time)) { + for my $negation ("!", "not_") { + fresh_perl_is(<<"EOF", $difference, {}, +use locale ":$negation$category"; +format STDOUT = +@.# +4.179 +. +write; +EOF + "format() looks at LC_NUMERIC with 'use locale \":" + . "$negation$category\"'"); + } + } + + { fresh_perl_is(<<'EOF', $difference, {}, -use locale ":not_characters"; +use locale ":numeric"; format STDOUT = @.# 4.179 . write; EOF - "format() looks at LC_NUMERIC with 'use locale \":not_characters\"'"); + "format() looks at LC_NUMERIC with 'use locale \":numeric\"'"); } { @@ -371,4 +412,4 @@ EOF } -sub last { 21 } +sub last { 35 } @@ -422,8 +422,11 @@ Perl's extended UTF-8 means we can have start bytes up to FF. #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES) #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES) #define IN_UNI_8_BIT \ - (CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS) \ - && ! IN_LOCALE_RUNTIME && ! IN_BYTES) + (((CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT)) \ + || (CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL \ + /* -1 below is for :not_characters */ \ + && _is_in_locale_category(FALSE, -1))) \ + && ! IN_BYTES) #define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */ |