summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-02-15 12:27:23 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-02-15 18:02:35 -0700
commit: 20adcf7c80fc3ad8cffa7d52b95a40d87bd781d0 (patch)
tree: 42b6ebc3be70e41366f78282be3e658182a06468
parent: 29050de536086566fb4d4a9db818b8068dd118a0 (diff)
download: perl-20adcf7c80fc3ad8cffa7d52b95a40d87bd781d0.tar.gz
pp_quote_meta: in locale, quote all UTF-8 Latin1 non-ASCII
Under locale rules, this commit quotes all non-ASCII Latin1 characters in UTF-8 encoded strings. This makes the function's behavior consistent both internally (regardless of how the string is encoded) and with other functions, such as lc().
-rw-r--r-- pod/perlfunc.pod 6
-rw-r--r-- pp.c 6
-rw-r--r-- t/op/quotemeta.t 48
3 files changed, 58 insertions, 2 deletions
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index 7cec3bbcc2..8d97258a8d 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -4976,6 +4976,12 @@ compatibility for old programs which do not use Unicode. (Note that
C<unicode_strings> is automatically enabled within the scope of a
S<C<use v5.12>> or greater.)
+Within the scope of C<use locale>, all non-ASCII Latin1 code points
+are quoted whether the string is encoded as UTF-8 or not. As mentioned
+above, locale does not affect the quoting of ASCII-range characters.
+This protects against those locales where characters such as C<"|"> are
+considered to be word characters.
+
Otherwise, Perl quotes non-ASCII characters using an adaptation from
Unicode (see L<http://www.unicode.org/reports/tr31/>.)
The only code points that are quoted are those that have any of the
diff --git a/pp.c b/pp.c
index a2b34b30eb..f1d79f7b0a 100644
--- a/pp.c
+++ b/pp.c
@@ -4097,7 +4097,11 @@ PP(pp_quotemeta)
}
}
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (_isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1))))
+
+ /* In locale, we quote all non-ASCII Latin1 chars.
+ * Otherwise use the quoting rules */
+ if (IN_LOCALE_RUNTIME
+ || _isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1))))
{
to_quote = TRUE;
}
diff --git a/t/op/quotemeta.t b/t/op/quotemeta.t
index 9cec0bdd89..d62563c309 100644
--- a/t/op/quotemeta.t
+++ b/t/op/quotemeta.t
@@ -7,7 +7,7 @@ BEGIN {
require "test.pl";
}
-plan tests => 40;
+plan tests => 60;
if ($Config{ebcdic} eq 'define') {
$_ = join "", map chr($_), 129..233;
@@ -75,6 +75,25 @@ is(length(quotemeta($char)), 1, "quotemeta '\\N{U+D8}' in UTF-8 length");
is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
is(quotemeta("\x{d8}"), "\\\x{d8}", "quotemeta Latin1 no unicode_strings quoted");
is(length(quotemeta("\x{d8}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
+
+ use locale;
+
+ my $char = ":";
+ is(quotemeta($char), "\\$char", "quotemeta '$char' locale");
+ is(length(quotemeta($char)), 2, "quotemeta '$char' locale");
+
+ $char = "M";
+ utf8::upgrade($char);
+ is(quotemeta($char), "$char", "quotemeta '$char' locale");
+ is(length(quotemeta($char)), 1, "quotemeta '$char' locale");
+
+ my $char = "\x{D7}";
+ is(quotemeta($char), "\\$char", "quotemeta '\\x{D7}' locale");
+ is(length(quotemeta($char)), 2, "quotemeta '\\x{D7}' locale length");
+
+ $char = "\x{D8}"; # Every non-ASCII Latin1 is quoted in locale.
+ is(quotemeta($char), "\\$char", "quotemeta '\\x{D8}' locale");
+ is(length(quotemeta($char)), 2, "quotemeta '\\x{D8}' locale length");
}
{
use feature 'unicode_strings';
@@ -82,6 +101,33 @@ is(length(quotemeta($char)), 1, "quotemeta '\\N{U+D8}' in UTF-8 length");
is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 unicode_strings quoted length");
is(quotemeta("\x{d8}"), "\x{d8}", "quotemeta Latin1 unicode_strings nonquoted");
is(length(quotemeta("\x{d8}")), 1, "quotemeta Latin1 unicode_strings nonquoted length");
+
+ use locale;
+
+ my $char = ":";
+ utf8::upgrade($char);
+ is(quotemeta($char), "\\$char", "quotemeta '$char' locale in UTF-8");
+ is(length(quotemeta($char)), 2, "quotemeta '$char' locale in UTF-8 length");
+
+ $char = "M";
+ utf8::upgrade($char);
+ is(quotemeta($char), "$char", "quotemeta '$char' locale in UTF-8");
+ is(length(quotemeta($char)), 1, "quotemeta '$char' locale in UTF-8 length");
+
+ my $char = "\N{U+D7}";
+ utf8::upgrade($char);
+ is(quotemeta($char), "\\$char", "quotemeta '\\N{U+D7}' locale in UTF-8");
+ is(length(quotemeta($char)), 2, "quotemeta '\\N{U+D7}' locale in UTF-8 length");
+
+ $char = "\N{U+D8}"; # Every non-ASCII Latin1 is quoted in locale.
+ utf8::upgrade($char);
+ is(quotemeta($char), "\\$char", "quotemeta '\\N{U+D8}' locale in UTF-8");
+ is(length(quotemeta($char)), 2, "quotemeta '\\N{U+D8}' locale in UTF-8 length");
+
+ is(quotemeta("\x{263a}"), "\\\x{263a}", "quotemeta locale Unicode quoted");
+ is(length(quotemeta("\x{263a}")), 2, "quotemeta locale Unicode quoted length");
+ is(quotemeta("\x{100}"), "\x{100}", "quotemeta locale Unicode nonquoted");
+ is(length(quotemeta("\x{100}")), 1, "quotemeta locale Unicode nonquoted length");
}
$a = "foo|bar";