summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MANIFEST1
-rw-r--r--pod/perlunicode.pod77
-rw-r--r--t/op/turkish.t100
3 files changed, 171 insertions, 7 deletions
diff --git a/MANIFEST b/MANIFEST
index a676231146..197d35966f 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -4506,6 +4506,7 @@ t/op/tie.t See if tie/untie functions work
t/op/time_loop.t Test that very large values don't hang gmtime and localtime.
t/op/time.t See if time functions work
t/op/tr.t See if tr works
+t/op/turkish.t See if we can implement Turkish casing
t/op/undef.t See if undef works
t/op/universal.t See if UNIVERSAL class works
t/op/unshift.t See if unshift works
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 1b4c249339..bd193f8b92 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -899,20 +899,83 @@ to be mapped to "A"; all other characters will remain unchanged.
(For serious hackers only) The above means you have to furnish a complete
mapping; you can't just override a couple of characters and leave the rest
-unchanged. You can find all the mappings in the directory
+unchanged. You can find all the official mappings in the directory
C<$Config{privlib}>F</unicore/To/>. The mapping data is returned as the
here-document. The C<utf8::ToSpecI<Foo>> hashes in those files are special
exception mappings derived from
-C<$Config{privlib}>F</unicore/SpecialCasing.txt>. The "Digit" and
+C<$Config{privlib}>F</unicore/SpecialCasing.txt>. (The "Digit" and
"Fold" mappings that one can see in the directory are not directly
user-accessible, one can use either the L<Unicode::UCD> module, or just match
-case-insensitively (that's when the "Fold" mapping is used).
+case-insensitively, which is what uses the "Fold" mapping. Neither are user
+overridable.)
-The mappings will only take effect on scalars that have been marked as having
-Unicode characters, for example by using C<utf8::upgrade()>.
-Old byte-style strings are not affected.
+If you have many mappings to change, you can take the official mapping data,
+change by hand the affected code points, and place the whole thing into your
+subroutine. But this will only be valid on Perls that use the same Unicode
+version. Another option would be to have your subroutine read the official
+mapping file(s) and overwrite the affected code points.
-The mappings are in effect for the package they are defined in.
+If you have only a few mappings to change, starting in 5.14 you can use the
+following trick, here illustrated for Turkish.
+
+ use Config;
+
+ sub ToUpper {
+ my $official = do "$Config{privlib}/unicore/To/Upper.pl";
+ $utf8::ToSpecUpper{'i'} =
+ "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}";
+ return $official;
+ }
+
+This takes the official mappings and overrides just one, for "LATIN SMALL
+LETTER I". The keys to the hash must be in UTF-8 (or on EBCDIC platforms,
+UTF-EBCDIC), as illustrated by the inverse function.
+
+ sub ToLower {
+ my $official = do $lower;
+ $utf8::ToSpecLower{"\xc4\xb0"} = "i";
+ return $official;
+ }
+
+This example is for an ASCII platform, and C<\xc4\xb0> is the UTF-8 string that
+represents C<\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}>, C<U+0130>.
+
+(The trick illustrated here does work in earlier releases, but only if all the
+characters you want to override have ordinal values of 256 or higher.)
+
+The mappings are in effect only for the package they are defined in, and only
+on scalars that have been marked as having Unicode characters, for example by
+using C<utf8::upgrade()>. You can get around the latter restriction in the
+scope of a C<S<use subs>>:
+
+ use subs qw(uc ucfirst lc lcfirst);
+
+ sub uc($) {
+ my $string = shift;
+ utf8::upgrade($string);
+ return CORE::uc($string);
+ }
+
+ sub lc($) {
+ my $string = shift;
+ utf8::upgrade($string);
+
+ # Unless an I is before a dot_above, it turns into a dotless i.
+ $string =~
+ s/I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/gx;
+
+ # But when the I is followed by a dot_above, remove the
+ # dot_above so the end result will be i.
+ $string =~ s/I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/gx;
+ return CORE::lc($string);
+ }
+
+These examples (also for Turkish) make sure the input is in UTF-8, and then
+call the corresponding official function, which will use the C<ToUpper()> and
+C<ToLower()> functions you have defined in the package. The C<lc()> example
+shows how you can add context-dependent casing. (For Turkish, there other
+required functions: C<ucfirst>, C<lcfirst>, and C<ToTitle>. These are very
+similar to the ones given above.)
=head2 Character Encodings for Input and Output
diff --git a/t/op/turkish.t b/t/op/turkish.t
new file mode 100644
index 0000000000..08e2baceb0
--- /dev/null
+++ b/t/op/turkish.t
@@ -0,0 +1,100 @@
+# Verifies that can implement Turkish casing as defined by Unicode 5.2.
+
+use Config;
+
+BEGIN {
+ chdir 't';
+ @INC = '../lib';
+ require './test.pl';
+}
+
+use subs qw(lc lcfirst uc ucfirst);
+
+sub uc($) {
+ my $string = shift;
+ utf8::upgrade($string);
+ return CORE::uc($string);
+}
+
+sub ucfirst($) {
+ my $string = shift;
+ utf8::upgrade($string);
+ return CORE::ucfirst($string);
+}
+
+sub lc($) {
+ my $string = shift;
+ utf8::upgrade($string);
+
+ # Unless an I is before a dot_above, it turns into a dotless i.
+ $string =~ s/I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/gx;
+
+ # But when the I is followed by a dot_above, remove the dot_above so
+ # the end result will be i.
+ $string =~ s/I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/gx;
+ return CORE::lc($string);
+}
+
+sub lcfirst($) {
+ my $string = shift;
+ utf8::upgrade($string);
+
+ # Unless an I is before a dot_above, it turns into a dotless i.
+ $string =~ s/^I (?! [^\p{ccc=0}\p{ccc=Above}]* \x{0307} )/\x{131}/x;
+
+ # But when the I is followed by a dot_above, remove the dot_above so
+ # the end result will be i.
+ $string =~ s/^I ([^\p{ccc=0}\p{ccc=Above}]* ) \x{0307}/i$1/x;
+ return CORE::lcfirst($string);
+}
+
+plan tests => 22;
+
+my $map_directory = "../lib/unicore/To";
+my $upper = "$map_directory/Upper.pl";
+my $lower = "$map_directory/Lower.pl";
+my $title = "$map_directory/Title.pl";
+
+sub ToUpper {
+ my $official = do $upper;
+ $utf8::ToSpecUpper{'i'} = "\x{0130}";
+ return $official;
+}
+
+sub ToTitle {
+ my $official = do $title;
+ $utf8::ToSpecTitle{'i'} = "\x{0130}";
+ return $official;
+}
+
+sub ToLower {
+ my $official = do $lower;
+ $utf8::ToSpecLower{"\xc4\xb0"} = "i";
+ return $official;
+}
+
+is(uc("\x{DF}\x{DF}"), "SSSS", "Verify that uc of non-overridden multi-char works");
+is(uc("aa"), "AA", "Verify that uc of non-overridden ASCII works");
+is(uc("\x{101}\x{101}"), "\x{100}\x{100}", "Verify that uc of non-overridden utf8 works");
+is(uc("ii"), "\x{130}\x{130}", "Verify uc('i') eq \\x{130}");
+
+is(ucfirst("\x{DF}\x{DF}"), "Ss\x{DF}", "Verify that ucfirst of non-overridden multi-char works");
+is(ucfirst("\x{101}\x{101}"), "\x{100}\x{101}", "Verify that ucfirst of non-overridden utf8 works");
+is(ucfirst("aa"), "Aa", "Verify that ucfirst of non-overridden ASCII works");
+is(ucfirst("ii"), "\x{130}i", "Verify ucfirst('ii') eq \"\\x{130}i\"");
+
+is(lc("AA"), "aa", "Verify that lc of non-overridden ASCII works");
+is(lc("\x{C0}\x{C0}"), "\x{E0}\x{E0}", "Verify that lc of non-overridden latin1 works");
+is(lc("\x{0178}\x{0178}"), "\x{FF}\x{FF}", "Verify that lc of non-overridden utf8 works");
+is(lc("II"), "\x{131}\x{131}", "Verify that lc('I') eq \\x{131}");
+is(lc("IG\x{0307}IG\x{0307}"), "\x{131}g\x{0307}\x{131}g\x{0307}", "Verify that lc(\"I...\\x{0307}\") eq \"\\x{131}...\\x{0307}\"");
+is(lc("I\x{0307}I\x{0307}"), "ii", "Verify that lc(\"I\\x{0307}\") removes the \\x{0307}, leaving 'i'");
+is(lc("\x{130}\x{130}"), "ii", "Verify that lc(\"\\x{130}\") eq 'i'");
+
+is(lcfirst("AA"), "aA", "Verify that lcfirst of non-overridden ASCII works");
+is(lcfirst("\x{C0}\x{C0}"), "\x{E0}\x{C0}", "Verify that lcfirst of non-overridden latin1 works");
+is(lcfirst("\x{0178}\x{0178}"), "\x{FF}\x{0178}", "Verify that lcfirst of non-overridden utf8 works");
+is(lcfirst("I"), "\x{131}", "Verify that lcfirst('II') eq \"\\x{131}I\"");
+is(lcfirst("IG\x{0307}"), "\x{131}G\x{0307}", "Verify that lcfirst(\"I...\\x{0307}\") eq \"\\x{131}...\\x{0307}\"");
+is(lcfirst("I\x{0307}I\x{0307}"), "iI\x{0307}", "Verify that lcfirst(\"I\\x{0307}I\\x{0307}\") removes the first \\x{0307}, leaving 'iI\\x{0307}'");
+is(lcfirst("\x{130}\x{130}"), "i\x{130}", "Verify that lcfirst(\"\\x{130}\\x{130}\") eq \"i\\x{130}\"");