summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@khw-desktop.(none)>2009-11-12 22:40:21 -0700
committerRafael Garcia-Suarez <rgs@consttype.org>2009-11-14 23:34:55 +0100
commit00f254e235ff10d6223aa9a402ad5b7a85689829 (patch)
treeecbf0c57e651245f5632419e0b86788eb252dac1
parenta1248f17ffcfa8fe0e91df962317b46b81fc0ce5 (diff)
downloadperl-00f254e235ff10d6223aa9a402ad5b7a85689829.tar.gz
add code for Unicode semantics for non-utf8 latin1 chars
-rw-r--r--MANIFEST1
-rw-r--r--handy.h13
-rwxr-xr-xlib/legacy.pm100
-rw-r--r--lib/legacy.t146
-rw-r--r--perl.h118
-rw-r--r--pp.c736
-rw-r--r--utf8.h1
7 files changed, 972 insertions, 143 deletions
diff --git a/MANIFEST b/MANIFEST
index 93042f644e..22db6a3c90 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3476,6 +3476,7 @@ lib/integer.pm For "use integer"
lib/integer.t For "use integer" testing
lib/Internals.t For Internals::* testing
lib/legacy.pm Pragma to preserve legacy behavior
+lib/legacy.t For "use legacy" testing
lib/less.pm For "use less"
lib/less.t See if less support works
lib/locale.pm For "use locale"
diff --git a/handy.h b/handy.h
index 9ec64e0a09..848cc0e9bb 100644
--- a/handy.h
+++ b/handy.h
@@ -429,7 +429,7 @@ Returns a boolean indicating whether the C C<char> is a US-ASCII (Basic Latin)
alphanumeric character (including underscore) or digit.
=for apidoc Am|bool|isALPHA|char ch
-Returns a boolean indicating whether the C C<char> is a US-ASCII (Basic Latin)
+Returns a boolean indicating whether the C C<char> is a US-ASCII (Basic Latin)
alphabetic character.
=for apidoc Am|bool|isSPACE|char ch
@@ -479,7 +479,9 @@ US-ASCII (Basic Latin) range are viewed as not having any case.
# define isPUNCT(c) ispunct(c)
# define isXDIGIT(c) isxdigit(c)
# define toUPPER(c) toupper(c)
+# define toUPPER_LATIN1_MOD(c) UNI_TO_NATIVE(PL_mod_latin1_uc[(U8) NATIVE_TO_UNI(c)])
# define toLOWER(c) tolower(c)
+# define toLOWER_LATIN1(c) UNI_TO_NATIVE(PL_latin1_lc[(U8) NATIVE_TO_UNI(c)])
#else
# define isUPPER(c) ((c) >= 'A' && (c) <= 'Z')
# define isLOWER(c) ((c) >= 'a' && (c) <= 'z')
@@ -490,6 +492,15 @@ US-ASCII (Basic Latin) range are viewed as not having any case.
# define isPRINT(c) (((c) >= 32 && (c) < 127))
# define isPUNCT(c) (((c) >= 33 && (c) <= 47) || ((c) >= 58 && (c) <= 64) || ((c) >= 91 && (c) <= 96) || ((c) >= 123 && (c) <= 126))
# define isXDIGIT(c) (isDIGIT(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F'))
+
+/* Use table lookup for speed */
+# define toLOWER_LATIN1(c) (PL_latin1_lc[(U8) c])
+
+/* Modified uc. Is correct uc except for three non-ascii chars which are
+ * all mapped to one of them, and these need special handling */
+# define toUPPER_LATIN1_MOD(c) (PL_mod_latin1_uc[(U8) c])
+
+/* ASCII casing. */
# define toUPPER(c) (isLOWER(c) ? (c) - ('a' - 'A') : (c))
# define toLOWER(c) (isUPPER(c) ? (c) + ('a' - 'A') : (c))
#endif
diff --git a/lib/legacy.pm b/lib/legacy.pm
index a1f21a6fc1..3ffea6912c 100755
--- a/lib/legacy.pm
+++ b/lib/legacy.pm
@@ -2,7 +2,7 @@ package legacy;
our $VERSION = '1.00';
-$unicode8bit::hint_bits = 0x00000800;
+$unicode8bit::hint_uni8bit = 0x00000800;
my %legacy_bundle = (
"5.10" => [qw(unicode8bit)],
@@ -20,21 +20,19 @@ behaviors
use legacy ':5.10'; # Keeps semantics the same as in perl 5.10
- no legacy;
-
-=cut
+ use legacy qw(unicode8bit);
- #no legacy qw(unicode8bit);
+ no legacy;
-=pod
+ no legacy qw(unicode8bit);
=head1 DESCRIPTION
Some programs may rely on behaviors that for others are problematic or
even wrong. A new version of Perl may change behaviors from past ones,
and when it is viewed that the old way of doing things may be required
-to still be supported, that behavior will be added to the list recognized
-by this pragma to allow that.
+to still be supported, the new behavior will be able to be turned off by using
+this pragma.
Additionally, a new behavior may be supported in a new version of Perl, but
for whatever reason the default remains the old one. This pragma can enable
@@ -44,24 +42,92 @@ Like other pragmas (C<use feature>, for example), C<use legacy qw(foo)> will
only make the legacy behavior for "foo" available from that point to the end of
the enclosing block.
-B<This pragma is, for the moment, a skeleton and does not actually affect any
-behaviors yet>
-
=head2 B<use legacy>
Preserve the old way of doing things when a new version of Perl is
-released that changes things
+released that would otherwise change the behavior.
+
+The one current possibility is:
+
+=head3 unicode8bit
+
+THIS IS SUBJECT TO CHANGE
+
+Use legacy semantics for the 128 characters on ASCII systems that have the 8th
+bit set. (See L</EBCDIC platforms> below for EBCDIC systems.) Unless
+C<S<use locale>> is specified, or the scalar containing such a character is
+known by Perl to be encoded in UTF8, the semantics are essentially that the
+characters have an ordinal number, and that's it. They are caseless, and
+aren't anything: they're not controls, not letters, not punctuation, ..., not
+anything.
+
+This behavior stems from when Perl did not support Unicode, and ASCII was the
+only known character set outside of C<S<use locale>>. In order to not
+possibly break pre_Unicode programs, these characters have retained their old
+non-meanings, except when it is clear to Perl that Unicode is what is meant,
+for example by calling utf::upgrade() on a scalar, or if the scalar also
+contains characters that are only available in Unicode. Then these 128
+characters take on their Unicode meanings.
+
+The problem with this behavior is that a scalar that encodes these characters
+has a different meaning depending on if it is stored as utf8 or not.
+In general, the internal storage method should not affect the
+external behavior.
+
+The behavior is known to have effects on these areas:
+
+=over 4
+
+=item
+
+Changing the case of a scalar, that is, using C<uc()>,
+C<ucfirst()>,
+C<lc()>,
+and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression substitutions.
+
+=item
+
+Using caseless (C</i>) regular expression matching
+
+=item
+
+Matching a number of properties in regular expressions, such as C<\w>
+
+=item
+
+User-defined case change mappings. You can create a C<ToUpper()> function, for
+example, which overrides Perl's built-in case mappings. The scalar must be
+encoded in utf8 for your function to actually be invoked.
+
+=back
+
+B<This lack of semantics for these characters is currently the default,>
+outside of C<use locale>. See below for EBCDIC.
+To turn on B<case changing semantics only> for these characters, use
+C<S<no legacy>>.
+The other legacy behaviors regarding these characters are currently
+unaffected by this pragma.
+
+=head4 EBCDIC platforms
+
+On EBCDIC platforms, the situation is somewhat different. The legacy
+semantics are whatever the underlying semantics of the native C language
+library are. Each of the three EBCDIC encodings currently known by Perl is an
+isomorph of the Latin-1 character set. That means every character in Latin-1
+has a corresponding EBCDIC equivalent, and vice-versa. Specifying C<S<no
+legacy>> currently makes sure that all EBCDIC characters have the same
+B<casing only> semantics as their corresponding Latin-1 characters.
=head2 B<no legacy>
Turn on a new behavior in a version of Perl that understands
it but has it turned off by default. For example, C<no legacy 'foo'> turns on
-behavior C<foo> in the lexical scope of the pragma. Simply C<no legacy>
-turns on all new behaviors known to the pragma.
+behavior C<foo> in the lexical scope of the pragma. C<no legacy>
+without any modifier turns on all new behaviors known to the pragma.
=head1 LEGACY BUNDLES
-It's possible to turn off all new behaviors past a given release by
+It's possible to turn off all new behaviors past a given release by
using a I<legacy bundle>, which is the name of the release prefixed with
a colon, to distinguish it from an individual legacy behavior.
@@ -93,7 +159,7 @@ sub import {
if (!exists $legacy{$name}) {
unknown_legacy($name);
}
- $^H &= ~$unicode8bit::hint_bits; # The only thing it could be as of yet
+ $^H &= ~$unicode8bit::hint_uni8bit; # The only valid thing as of yet
}
}
@@ -116,7 +182,7 @@ sub unimport {
unknown_legacy($name);
}
else {
- $^H |= $unicode8bit::hint_bits; # The only thing it could be as of yet
+ $^H |= $unicode8bit::hint_uni8bit; # The only valid thing as of yet
}
}
}
diff --git a/lib/legacy.t b/lib/legacy.t
new file mode 100644
index 0000000000..1d332b7be3
--- /dev/null
+++ b/lib/legacy.t
@@ -0,0 +1,146 @@
+use warnings;
+use strict;
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+ require './test.pl';
+}
+
+#use Test::More;
+
+#plan("no_plan");
+plan(13312);
+
+# First compute the case mappings without resorting to the functions we're
+# testing.
+
+# Initialize the arrays so each $i maps to itself.
+my @posix_to_upper;
+for my $i (0 .. 255) {
+ $posix_to_upper[$i] = chr($i);
+}
+my @posix_to_lower
+= my @posix_to_title
+= my @latin1_to_upper
+= my @latin1_to_lower
+= my @latin1_to_title
+= @posix_to_upper;
+
+# Override the elements in the to_lower arrays that have different lower case
+# mappings with those mappings.
+for my $i (0x41 .. 0x5A) {
+ $posix_to_lower[$i] = chr(ord($posix_to_lower[$i]) + 32);
+ $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+
+# Same for upper and title
+for my $i (0x61 .. 0x7A) {
+ $posix_to_upper[$i] = chr(ord($posix_to_upper[$i]) - 32);
+ $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+ $posix_to_title[$i] = chr(ord($posix_to_title[$i]) - 32);
+ $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# And the same for those in the latin1 range
+for my $i (0xC0 .. 0xD6, 0xD8 .. 0xDE) {
+ $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+for my $i (0xE0 .. 0xF6, 0xF8 .. 0xFE) {
+ $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+ $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# Override the abnormal cases.
+$latin1_to_upper[0xB5] = chr(0x39C);
+$latin1_to_title[0xB5] = chr(0x39C);
+$latin1_to_upper[0xDF] = 'SS';
+$latin1_to_title[0xDF] = 'Ss';
+$latin1_to_upper[0xFF] = chr(0x178);
+$latin1_to_title[0xFF] = chr(0x178);
+
+my $repeat = 25; # Length to make strings.
+
+# Create hashes of strings in several ranges, both for uc and lc.
+my %posix;
+$posix{'uc'} = 'A' x $repeat;
+$posix{'lc'} = 'a' x $repeat ;
+
+my %cyrillic;
+$cyrillic{'uc'} = chr(0x42F) x $repeat;
+$cyrillic{'lc'} = chr(0x44F) x $repeat;
+
+my %latin1;
+$latin1{'uc'} = chr(0xD8) x $repeat;
+$latin1{'lc'} = chr(0xF8) x $repeat;
+
+my %empty;
+$empty{'lc'} = $empty{'uc'} = "";
+
+# Loop so prefix each character being tested with nothing, and the various
+# strings; then loop for suffixes of those strings as well.
+for my $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
+ for my $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {
+ for my $i (0 .. 255) { # For each possible posix or latin1 character
+ my $cp = sprintf "%02X", $i;
+
+ # First try using latin1 (Unicode) semantics.
+ no legacy "unicode8bit";
+
+ my $phrase = 'with unicode';
+ my $char = chr($i);
+ my $pre_lc = $prefix->{'lc'};
+ my $pre_uc = $prefix->{'uc'};
+ my $post_lc = $suffix->{'lc'};
+ my $post_uc = $suffix->{'uc'};
+ my $to_upper = $pre_lc . $char . $post_lc;
+ my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc;
+ my $to_lower = $pre_uc . $char . $post_uc;
+ my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;
+
+ is (uc($to_upper), $expected_upper,
+
+ # The names are commented out for now to avoid 'wide character
+ # in print' messages.
+ ); #"$cp: $phrase: uc('$to_upper') eq '$expected_upper'");
+ is (lc($to_lower), $expected_lower,
+ ); #"$cp: $phrase: lc('$to_lower') eq '$expected_lower'");
+
+ if ($pre_uc eq "") { # Title case if null prefix.
+ my $expected_title = $latin1_to_title[$i] . $post_lc;
+ is (ucfirst($to_upper), $expected_title,
+ ); #"$cp: $phrase: ucfirst('$to_upper') eq '$expected_title'");
+ my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
+ is (lcfirst($to_lower), $expected_lcfirst,
+ ); #"$cp: $phrase: lcfirst('$to_lower') eq '$expected_lcfirst'");
+ }
+
+ # Then try with posix semantics.
+ use legacy "unicode8bit";
+ $phrase = 'no unicode';
+
+ # These don't contribute anything in this case.
+ next if $suffix == \%cyrillic;
+ next if $suffix == \%latin1;
+ next if $prefix == \%cyrillic;
+ next if $prefix == \%latin1;
+
+ $expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc;
+ $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;
+
+ is (uc($to_upper), $expected_upper,
+ ); #"$cp: $phrase: uc('$to_upper') eq '$expected_upper'");
+ is (lc($to_lower), $expected_lower,
+ ); #"$cp: $phrase: lc('$to_lower') eq '$expected_lower'");
+
+ if ($pre_uc eq "") {
+ my $expected_title = $posix_to_title[$i] . $post_lc;
+ is (ucfirst($to_upper), $expected_title,
+ ); #"$cp: $phrase: ucfirst('$to_upper') eq '$expected_title'");
+ my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
+ is (lcfirst($to_lower), $expected_lcfirst,
+ ); #"$cp: $phrase: lcfirst('$to_lower') eq '$expected_lcfirst'");
+ }
+ }
+ }
+}
diff --git a/perl.h b/perl.h
index 874d0c34ea..fe6b7fcec0 100644
--- a/perl.h
+++ b/perl.h
@@ -28,7 +28,7 @@
#ifdef VOIDUSED
# undef VOIDUSED
-#endif
+#endif
#define VOIDUSED 1
#ifdef PERL_MICRO
@@ -270,13 +270,13 @@
#define CALLREG_PACKAGE(rx) \
CALL_FPTR(RX_ENGINE(rx)->qr_package)(aTHX_ (rx))
-#if defined(USE_ITHREADS)
+#if defined(USE_ITHREADS)
#define CALLREGDUPE(prog,param) \
Perl_re_dup(aTHX_ (prog),(param))
#define CALLREGDUPE_PVT(prog,param) \
(prog ? CALL_FPTR(RX_ENGINE(prog)->dupe)(aTHX_ (prog),(param)) \
- : (REGEXP *)NULL)
+ : (REGEXP *)NULL)
#endif
@@ -310,7 +310,7 @@
# define PERL_UNUSED_DECL
# endif
#endif
-
+
/* gcc -Wall:
* for silencing unused variables that are actually used most of the time,
* but we cannot quite get rid of, such as "ax" in PPCODE+noargs xsubs
@@ -947,7 +947,7 @@ EXTERN_C int usleep(unsigned int);
#define PERL_DEFAULT_DO_EXEC3_IMPLEMENTATION
#endif
-/* Cannot include embed.h here on Win32 as win32.h has not
+/* Cannot include embed.h here on Win32 as win32.h has not
yet been included and defines some config variables e.g. HAVE_INTERP_INTERN
*/
#if !defined(PERL_FOR_X2P) && !(defined(WIN32)||defined(VMS))
@@ -1198,7 +1198,7 @@ EXTERN_C int usleep(unsigned int);
#endif
/* In Tru64 use the 4.4BSD struct msghdr, not the 4.3 one.
- * This is important for using IPv6.
+ * This is important for using IPv6.
* For OSF/1 3.2, however, defining _SOCKADDR_LEN would be
* a bad idea since it breaks send() and recv(). */
#if defined(__osf__) && defined(__alpha) && !defined(_SOCKADDR_LEN) && !defined(DEC_OSF1_3_X)
@@ -2780,7 +2780,7 @@ freeing any remaining Perl interpreters.
# define HASATTRIBUTE_WARN_UNUSED_RESULT
# endif
#endif
-#endif /* #ifndef PERL_MICRO */
+#endif /* #ifndef PERL_MICRO */
/* USE_5005THREADS needs to be after unixish.h as <pthread.h> includes
* <sys/signal.h> which defines NSIG - which will stop inclusion of <signal.h>
@@ -2871,7 +2871,7 @@ typedef pthread_key_t perl_key;
/* This is complicated. The child processes return a true native VMS
status which must be saved. But there is an assumption in Perl that
the UNIX child status has some relationship to errno values, so
- Perl tries to translate it to text in some of the tests.
+ Perl tries to translate it to text in some of the tests.
In order to get the string translation correct, for the error, errno
must be EVMSERR, but that generates a different text message
than what the test programs are expecting. So an errno value must
@@ -3131,16 +3131,16 @@ typedef pthread_key_t perl_key;
# define PERL_SET_THX(t) PERL_SET_CONTEXT(t)
#endif
-/*
+/*
This replaces the previous %_ "hack" by the "%p" hacks.
All that is required is that the perl source does not
- use "%-p" or "%-<number>p" or "%<number>p" formats.
- These formats will still work in perl code.
+ use "%-p" or "%-<number>p" or "%<number>p" formats.
+ These formats will still work in perl code.
See comments in sv.c for futher details.
Robin Barker 2005-07-14
- No longer use %1p for VDf = %vd. RMB 2007-10-19
+ No longer use %1p for VDf = %vd. RMB 2007-10-19
*/
#ifndef SVf_
@@ -3162,7 +3162,7 @@ typedef pthread_key_t perl_key;
#define SVfARG(p) ((void*)(p))
#ifdef PERL_CORE
-/* not used; but needed for backward compatibilty with XS code? - RMB */
+/* not used; but needed for backward compatibilty with XS code? - RMB */
# undef VDf
#else
# ifndef VDf
@@ -3171,7 +3171,7 @@ typedef pthread_key_t perl_key;
#endif
#ifdef PERL_CORE
-/* not used; but needed for backward compatibilty with XS code? - RMB */
+/* not used; but needed for backward compatibilty with XS code? - RMB */
# undef UVf
#else
# ifndef UVf
@@ -3251,7 +3251,7 @@ typedef pthread_key_t perl_key;
#ifdef PRINTF_FORMAT_NULL_OK
# define __attribute__format__null_ok__(x,y,z) __attribute__format__(x,y,z)
#else
-# define __attribute__format__null_ok__(x,y,z)
+# define __attribute__format__null_ok__(x,y,z)
#endif
#ifdef HAS_BUILTIN_EXPECT
@@ -3354,7 +3354,7 @@ typedef struct crypt_data { /* straight from /usr/include/crypt.h */
#endif
#ifdef __LIBCATAMOUNT__
-#undef HAS_PASSWD /* unixish.h but not unixish enough. */
+#undef HAS_PASSWD /* unixish.h but not unixish enough. */
#undef HAS_GROUP
#define FAKE_BIT_BUCKET
#endif
@@ -4345,9 +4345,85 @@ EXTCONST unsigned char PL_fold[] = {
240, 241, 242, 243, 244, 245, 246, 247,
248, 249, 250, 251, 252, 253, 254, 255
};
-#endif /* !EBCDIC */
-#else
+#endif /* !EBCDIC, but still in DOINIT */
+
+/* If these tables are accessed through ebcdic, the access will be converted to
+ * latin1 first */
+EXTCONST unsigned char PL_latin1_lc[] = { /* lowercasing */
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
+ 'x', 'y', 'z', 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 185, 186, 187, 188, 189, 190, 191,
+ 192+32, 193+32, 194+32, 195+32, 196+32, 197+32, 198+32, 199+32,
+ 200+32, 201+32, 202+32, 203+32, 204+32, 205+32, 206+32, 207+32,
+ 208+32, 209+32, 210+32, 211+32, 212+32, 213+32, 214+32, 215,
+ 216+32, 217+32, 218+32, 219+32, 220+32, 221+32, 222+32, 223,
+ 224, 225, 226, 227, 228, 229, 230, 231,
+ 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247,
+ 248, 249, 250, 251, 252, 253, 254, 255
+};
+
+/* upper and title case of latin1 characters, modified so that the three tricky
+ * ones are mapped to 255 (which is one of the three) */
+EXTCONST unsigned char PL_mod_latin1_uc[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
+ 'X', 'Y', 'Z', 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 255 /*micro*/, 182, 183,
+ 184, 185, 186, 187, 188, 189, 190, 191,
+ 192, 193, 194, 195, 196, 197, 198, 199,
+ 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215,
+ 216, 217, 218, 219, 220, 221, 222, 255 /*sharp s*/,
+ 224-32, 225-32, 226-32, 227-32, 228-32, 229-32, 230-32, 231-32,
+ 232-32, 233-32, 234-32, 235-32, 236-32, 237-32, 238-32, 239-32,
+ 240-32, 241-32, 242-32, 243-32, 244-32, 245-32, 246-32, 247,
+ 248-32, 249-32, 250-32, 251-32, 252-32, 253-32, 254-32, 255
+};
+#else /* ! DOINIT */
EXTCONST unsigned char PL_fold[];
+EXTCONST unsigned char PL_mod_latin1_uc[];
+EXTCONST unsigned char PL_latin1_lc[];
#endif
#ifndef PERL_GLOBAL_STRUCT /* or perlvars.h */
@@ -5970,8 +6046,8 @@ extern void moncontrol(int);
#define NO_ENV_ARRAY_IN_MAIN
#endif
-/* These are used by Perl_pv_escape() and Perl_pv_pretty()
- * are here so that they are available throughout the core
+/* These are used by Perl_pv_escape() and Perl_pv_pretty()
+ * are here so that they are available throughout the core
* NOTE that even though some are for _escape and some for _pretty
* there must not be any clashes as the flags from _pretty are
* passed straight through to _escape.
@@ -5985,7 +6061,7 @@ extern void moncontrol(int);
#define PERL_PV_ESCAPE_FIRSTCHAR 0x0008
-#define PERL_PV_ESCAPE_UNI 0x0100
+#define PERL_PV_ESCAPE_UNI 0x0100
#define PERL_PV_ESCAPE_UNI_DETECT 0x0200
#define PERL_PV_ESCAPE_ALL 0x1000
diff --git a/pp.c b/pp.c
index 7641b54edf..eaeb89f5c9 100644
--- a/pp.c
+++ b/pp.c
@@ -3525,22 +3525,97 @@ PP(pp_crypt)
#endif
}
+/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So
+ * most comments below say UTF-8, when in fact they mean UTF-EBCDIC as well */
+
+/* Both the characters below can be stored in two UTF-8 bytes. In UTF-8 the max
+ * character that 2 bytes can hold is U+07FF, and in UTF-EBCDIC it is U+03FF.
+ * See http://www.unicode.org/unicode/reports/tr16 */
+#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
+#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRON */
+
+/* Below are several macros that generate code */
+/* Generates code to store a unicode codepoint c that is known to occupy
+ * exactly two UTF-8 and UTF-EBCDIC bytes; it is stored into p and p+1. */
+#define STORE_UNI_TO_UTF8_TWO_BYTE(p, c) \
+ STMT_START { \
+ *(p) = UTF8_TWO_BYTE_HI(c); \
+ *((p)+1) = UTF8_TWO_BYTE_LO(c); \
+ } STMT_END
+
+/* Like STORE_UNI_TO_UTF8_TWO_BYTE, but advances p to point to the next
+ * available byte after the two bytes */
+#define CAT_UNI_TO_UTF8_TWO_BYTE(p, c) \
+ STMT_START { \
+ *(p)++ = UTF8_TWO_BYTE_HI(c); \
+ *((p)++) = UTF8_TWO_BYTE_LO(c); \
+ } STMT_END
+
+/* Generates code to store the upper case of latin1 character l which is known
+ * to have its upper case be non-latin1 into the two bytes p and p+1. There
+ * are only two characters that fit this description, and this macro knows
+ * about them, and that the upper case values fit into two UTF-8 or UTF-EBCDIC
+ * bytes */
+#define STORE_NON_LATIN1_UC(p, l) \
+STMT_START { \
+ if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \
+ STORE_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); \
+ } else { /* Must be the following letter */ \
+ STORE_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU); \
+ } \
+} STMT_END
+
+/* Like STORE_NON_LATIN1_UC, but advances p to point to the next available byte
+ * after the character stored */
+#define CAT_NON_LATIN1_UC(p, l) \
+STMT_START { \
+ if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \
+ CAT_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); \
+ } else { \
+ CAT_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU); \
+ } \
+} STMT_END
+
+/* Generates code to add the two UTF-8 bytes (probably u) that are the upper
+ * case of l into p and p+1. u must be the result of toUPPER_LATIN1_MOD(l),
+ * and must require two bytes to store it. Advances p to point to the next
+ * available position */
+#define CAT_TWO_BYTE_UNI_UPPER_MOD(p, l, u) \
+STMT_START { \
+ if ((u) != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \
+ CAT_UNI_TO_UTF8_TWO_BYTE((p), (u)); /* not special, just save it */ \
+ } else if (l == LATIN_SMALL_LETTER_SHARP_S) { \
+ *(p)++ = 'S'; *(p)++ = 'S'; /* upper case is 'SS' */ \
+ } else {/* else is one of the other two special cases */ \
+ CAT_NON_LATIN1_UC((p), (l)); \
+ } \
+} STMT_END
+
PP(pp_ucfirst)
{
+ /* Actually is both lcfirst() and ucfirst(). Only the first character
+ * changes. This means that possibly we can change in-place, ie., just
+ * take the source and change that one character and store it back, but not
+ * if read-only etc, or if the length changes */
+
dVAR;
dSP;
SV *source = TOPs;
- STRLEN slen;
+ STRLEN slen; /* slen is the byte length of the whole SV. */
STRLEN need;
SV *dest;
- bool inplace = TRUE;
- bool doing_utf8;
+ bool inplace; /* ? Convert first char only, in-place */
+ bool doing_utf8 = FALSE; /* ? using utf8 */
+ bool convert_source_to_utf8 = FALSE; /* ? need to convert */
const int op_type = PL_op->op_type;
const U8 *s;
U8 *d;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
- STRLEN ulen;
- STRLEN tculen;
+ STRLEN ulen; /* ulen is the byte length of the original Unicode character
+ * stored as UTF-8 at s. */
+ STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or
+ * lowercased) character stored in tmpbuf. May be either
+ * UTF-8 or not, but in either case is the number of bytes */
SvGETMAGIC(source);
if (SvOK(source)) {
@@ -3552,25 +3627,187 @@ PP(pp_ucfirst)
slen = 0;
}
- if (slen && DO_UTF8(source) && UTF8_IS_START(*s)) {
+ /* We may be able to get away with changing only the first character, in
+ * place, but not if read-only, etc. Later we may discover more reasons to
+ * not convert in-place. */
+ inplace = SvPADTMP(source) && !SvREADONLY(source) && SvTEMP(source);
+
+ /* First calculate what the changed first character should be. This affects
+ * whether we can just swap it out, leaving the rest of the string unchanged,
+ * or even if have to convert the dest to UTF-8 when the source isn't */
+
+ if (! slen) { /* If empty */
+ need = 1; /* still need a trailing NUL */
+ }
+ else if (DO_UTF8(source)) { /* Is the source utf8? */
doing_utf8 = TRUE;
- utf8_to_uvchr(s, &ulen);
- if (op_type == OP_UCFIRST) {
- toTITLE_utf8(s, tmpbuf, &tculen);
- } else {
- toLOWER_utf8(s, tmpbuf, &tculen);
+
+/* TODO: This is #ifdefd out because it has hard-coded the standard mappings,
+ * and doesn't allow for the user to specify their own. When code is added to
+ * detect if there is a user-defined mapping in force here, and if so to use
+ * that, then the code below can be compiled. The detection would be a good
+ * thing anyway, as currently the user-defined mappings only work on utf8
+ * strings, and thus depend on the chosen internal storage method, which is a
+ * bad thing */
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+ if (UTF8_IS_INVARIANT(*s)) {
+
+ /* An invariant source character is either ASCII or, in EBCDIC, an
+ * ASCII equivalent or a caseless C1 control. In both these cases,
+ * the lower and upper cases of any character are also invariants
+ * (and title case is the same as upper case). So it is safe to
+ * use the simple case change macros which avoid the overhead of
+ * the general functions. Note that if perl were to be extended to
+ * do locale handling in UTF-8 strings, this wouldn't be true in,
+ * for example, Lithuanian or Turkic. */
+ *tmpbuf = (op_type == OP_LCFIRST) ? toLOWER(*s) : toUPPER(*s);
+ tculen = ulen = 1;
+ need = slen + 1;
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+ U8 chr;
+
+ /* Similarly, if the source character isn't invariant but is in the
+ * latin1 range (or EBCDIC equivalent thereof), we have the case
+ * changes compiled into perl, and can avoid the overhead of the
+ * general functions. In this range, the characters are stored as
+ * two UTF-8 bytes, and it so happens that any changed-case version
+ * is also two bytes (in both ASCIIish and EBCDIC machines). */
+ tculen = ulen = 2;
+ need = slen + 1;
+
+ /* Convert the two source bytes to a single Unicode code point
+ * value, change case and save for below */
+ chr = UTF8_ACCUMULATE(*s, *(s+1));
+ if (op_type == OP_LCFIRST) { /* lower casing is easy */
+ U8 lower = toLOWER_LATIN1(chr);
+ STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, lower);
+ }
+ else { /* ucfirst */
+ U8 upper = toUPPER_LATIN1_MOD(chr);
+
+ /* Most of the latin1 range characters are well-behaved. Their
+ * title and upper cases are the same, and are also in the
+ * latin1 range. The macro above returns their upper (hence
+ * title) case, and all that need be done is to save the result
+ * for below. However, several characters are problematic, and
+ * have to be handled specially. The MOD in the macro name
+ * above means that these tricky characters all get mapped to
+ * the single character LATIN_SMALL_LETTER_Y_WITH_DIAERESIS.
+ * This mapping saves some tests for the majority of the
+ * characters */
+
+ if (upper != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {
+
+ /* Not tricky. Just save it. */
+ STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, upper);
+ }
+ else if (chr == LATIN_SMALL_LETTER_SHARP_S) {
+
+ /* This one is tricky because it is two characters long,
+ * though the UTF-8 is still two bytes, so the stored
+ * length doesn't change */
+ *tmpbuf = 'S'; /* The UTF-8 is 'Ss' */
+ *(tmpbuf + 1) = 's';
+ }
+ else {
+
+ /* The other two have their title and upper cases the same,
+ * but are tricky because the changed-case characters
+ * aren't in the latin1 range. They, however, do fit into
+ * two UTF-8 bytes */
+ STORE_NON_LATIN1_UC(tmpbuf, chr);
+ }
+ }
}
- /* If the two differ, we definately cannot do inplace. */
- inplace = (ulen == tculen);
- need = slen + 1 - ulen + tculen;
- } else {
- doing_utf8 = FALSE;
- need = slen + 1;
+ else {
+#endif /* end of dont want to break user-defined casing */
+
+ /* Here, can't short-cut the general case */
+
+ utf8_to_uvchr(s, &ulen);
+ if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
+ else toLOWER_utf8(s, tmpbuf, &tculen);
+
+ /* we can't do in-place if the length changes. */
+ if (ulen != tculen) inplace = FALSE;
+ need = slen + 1 - ulen + tculen;
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+ }
+#endif
}
+ else { /* Non-zero length, non-UTF-8, Need to consider locale and if
+ * latin1 is treated as caseless. Note that a locale takes
+ * precedence */
+ tculen = 1; /* Most characters will require one byte, but this will
+ * need to be overridden for the tricky ones */
+ need = slen + 1;
+
+ if (op_type == OP_LCFIRST) {
+
+ /* lower case the first letter: no trickiness for any character */
+ *tmpbuf = (IN_LOCALE_RUNTIME) ? toLOWER_LC(*s) :
+ ((IN_UNI_8_BIT) ? toLOWER_LATIN1(*s) : toLOWER(*s));
+ }
+ /* is ucfirst() */
+ else if (IN_LOCALE_RUNTIME) {
+ *tmpbuf = toUPPER_LC(*s); /* This would be a bug if any locales
+ * have upper and title case different
+ */
+ }
+ else if (! IN_UNI_8_BIT) {
+ *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or
+ * on EBCDIC machines whatever the
+ * native function does */
+ }
+ else { /* is ucfirst non-UTF-8, not in locale, and cased latin1 */
+ *tmpbuf = toUPPER_LATIN1_MOD(*s);
+
+ /* tmpbuf now has the correct title case for all latin1 characters
+ * except for the several ones that have tricky handling. All
+ * of these are mapped by the MOD to the letter below. */
+ if (*tmpbuf == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {
+
+ /* The length is going to change, with all three of these, so
+ * can't replace just the first character */
+ inplace = FALSE;
+
+ /* We use the original to distinguish between these tricky
+ * cases */
+ if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+ /* Two character title case 'Ss', but can remain non-UTF-8 */
+ need = slen + 2;
+ *tmpbuf = 'S';
+ *(tmpbuf + 1) = 's'; /* Assert: length(tmpbuf) >= 2 */
+ tculen = 2;
+ }
+ else {
+
+ /* The other two tricky ones have their title case outside
+ * latin1. It is the same as their upper case. */
+ doing_utf8 = TRUE;
+ STORE_NON_LATIN1_UC(tmpbuf, *s);
+
+ /* The UTF-8 and UTF-EBCDIC lengths of both these characters
+ * and their upper cases is 2. */
+ tculen = ulen = 2;
+
+ /* The entire result will have to be in UTF-8. Assume worst
+ * case sizing in conversion. (all latin1 characters occupy
+ * at most two bytes in utf8) */
+ convert_source_to_utf8 = TRUE;
+ need = slen * 2 + 1;
+ }
+ } /* End of is one of the three special chars */
+ } /* End of use Unicode (Latin1) semantics */
+ } /* End of changing the case of the first character */
- if (SvPADTMP(source) && !SvREADONLY(source) && inplace && SvTEMP(source)) {
- /* We can convert in place. */
+ /* Here, have the first character's changed case stored in tmpbuf. Ready to
+ * generate the result */
+ if (inplace) {
+ /* We can convert in place. This means we change just the first
+ * character without disturbing the rest; no need to grow */
dest = source;
s = d = (U8*)SvPV_force_nomg(source, slen);
} else {
@@ -3578,53 +3815,83 @@ PP(pp_ucfirst)
dest = TARG;
+ /* Here, we can't convert in place; we earlier calculated how much
+ * space we will need, so grow to accommodate that */
SvUPGRADE(dest, SVt_PV);
d = (U8*)SvGROW(dest, need);
(void)SvPOK_only(dest);
SETs(dest);
-
- inplace = FALSE;
}
if (doing_utf8) {
- if(!inplace) {
- /* slen is the byte length of the whole SV.
- * ulen is the byte length of the original Unicode character
- * stored as UTF-8 at s.
- * tculen is the byte length of the freshly titlecased (or
- * lowercased) Unicode character stored as UTF-8 at tmpbuf.
- * We first set the result to be the titlecased (/lowercased)
- * character, and then append the rest of the SV data. */
- sv_setpvn(dest, (char*)tmpbuf, tculen);
- if (slen > ulen)
- sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
+ if (! inplace) {
+ if (! convert_source_to_utf8) {
+
+ /* Here both source and dest are in UTF-8, but have to create
+ * the entire output. We initialize the result to be the
+ * title/lower cased first character, and then append the rest
+ * of the string. */
+ sv_setpvn(dest, (char*)tmpbuf, tculen);
+ if (slen > ulen) {
+ sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
+ }
+ }
+ else {
+ const U8 *const send = s + slen;
+
+ /* Here the dest needs to be in UTF-8, but the source isn't,
+ * except we earlier UTF-8'd the first character of the source
+ * into tmpbuf. First put that into dest, and then append the
+ * rest of the source, converting it to UTF-8 as we go. */
+
+ /* Assert tculen is 2 here because the only two characters that
+ * get to this part of the code have 2-byte UTF-8 equivalents */
+ *d++ = *tmpbuf;
+ *d++ = *(tmpbuf + 1);
+ s++; /* We have just processed the 1st char */
+
+ for (; s < send; s++) {
+ d = uvchr_to_utf8(d, *s);
+ }
+ *d = '\0';
+ SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+ }
SvUTF8_on(dest);
}
- else {
+ else { /* in-place UTF-8. Just overwrite the first character */
Copy(tmpbuf, d, tculen, U8);
SvCUR_set(dest, need - 1);
}
}
- else {
- if (*s) {
+ else { /* Neither source nor dest are in or need to be UTF-8 */
+ if (slen) {
if (IN_LOCALE_RUNTIME) {
TAINT;
SvTAINTED_on(dest);
- *d = (op_type == OP_UCFIRST)
- ? toUPPER_LC(*s) : toLOWER_LC(*s);
}
- else
- *d = (op_type == OP_UCFIRST) ? toUPPER(*s) : toLOWER(*s);
- } else {
- /* See bug #39028 */
+ if (inplace) { /* in-place, only need to change the 1st char */
+ *d = *tmpbuf;
+ }
+ else { /* Not in-place */
+
+ /* Copy the case-changed character(s) from tmpbuf */
+ Copy(tmpbuf, d, tculen, U8);
+ d += tculen - 1; /* Code below expects d to point to final
+ * character stored */
+ }
+ }
+ else { /* empty source */
+ /* See bug #39028: Don't taint if empty */
*d = *s;
}
+ /* In a "use bytes" we don't treat the source as UTF-8, but, still want
+ * the destination to retain that flag */
if (SvUTF8(source))
SvUTF8_on(dest);
- if (!inplace) {
+ if (!inplace) { /* Finish the rest of the string, unchanged */
/* This will copy the trailing NUL */
Copy(s + 1, d + 1, slen, U8);
SvCUR_set(dest, need - 1);
@@ -3636,7 +3903,7 @@ PP(pp_ucfirst)
/* There's so much setup/teardown code common between uc and lc, I wonder if
it would be worth merging the two, and just having a switch outside each
- of the three tight loops. */
+ of the three tight loops. There is less and less commonality though */
PP(pp_uc)
{
dVAR;
@@ -3651,9 +3918,16 @@ PP(pp_uc)
SvGETMAGIC(source);
if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source)
- && SvTEMP(source) && !DO_UTF8(source)) {
- /* We can convert in place. */
-
+ && SvTEMP(source) && !DO_UTF8(source)
+ && (IN_LOCALE_RUNTIME || ! IN_UNI_8_BIT)) {
+
+ /* We can convert in place. The reason we can't if in UNI_8_BIT is to
+ * make the loop tight, so we overwrite the source with the dest before
+ * looking at it, and we need to look at the original source
+ * afterwards. There would also need to be code added to handle
+ * switching to not in-place in midstream if we run into characters
+ * that change the length.
+ */
dest = source;
s = d = (U8*)SvPV_force_nomg(source, len);
min = len + 1;
@@ -3693,48 +3967,209 @@ PP(pp_uc)
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES+1];
+/* This is ifdefd out because it needs more work and thought. It isn't clear
+ * that we should do it. These are hard-coded rules from the Unicode standard,
+ * and may change. 5.2 gives new guidance on the iota subscript, for example,
+ * which has not been checked against this; and secondly it may be that we are
+ * passed a subset of the context, via a \U...\E, for example, and its not
+ * clear what the best approach is to that */
+#ifdef CONTEXT_DEPENDENT_CASING
+ bool in_iota_subscript = FALSE;
+#endif
+
while (s < send) {
- const STRLEN u = UTF8SKIP(s);
- STRLEN ulen;
-
- toUPPER_utf8(s, tmpbuf, &ulen);
- if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
- /* If the eventually required minimum size outgrows
- * the available space, we need to grow. */
- const UV o = d - (U8*)SvPVX_const(dest);
-
- /* If someone uppercases one million U+03B0s we SvGROW() one
- * million times. Or we could try guessing how much to
- allocate without allocating too much. Such is life. */
- SvGROW(dest, min);
- d = (U8*)SvPVX(dest) + o;
+#ifdef CONTEXT_DEPENDENT_CASING
+ if (in_iota_subscript && ! is_utf8_mark(s)) {
+ /* A non-mark. Time to output the iota subscript */
+#define GREEK_CAPITAL_LETTER_IOTA 0x0399
+#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
+
+ CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA);
+ in_iota_subscript = FALSE;
+ }
+#endif
+
+
+/* See comments at the first instance in this file of this ifdef */
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+
+ /* If the UTF-8 character is invariant, then it is in the range
+ * known by the standard macro; result is only one byte long */
+ if (UTF8_IS_INVARIANT(*s)) {
+ *d++ = toUPPER(*s);
+ s++;
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+ /* Likewise, if it fits in a byte, its case change is in our
+ * table */
+ U8 orig = UTF8_ACCUMULATE(*s, *(s+1));
+ U8 upper = toUPPER_LATIN1_MOD(orig);
+ CAT_TWO_BYTE_UNI_UPPER_MOD(d, orig, upper);
+ s += 2;
+ }
+ else {
+#else
+ {
+#endif
+
+ /* Otherwise, need the general UTF-8 case. Get the changed
+ * case value and copy it to the output buffer */
+
+ const STRLEN u = UTF8SKIP(s);
+ STRLEN ulen;
+
+#ifndef CONTEXT_DEPENDENT_CASING
+ toUPPER_utf8(s, tmpbuf, &ulen);
+#else
+ const UV uv = toUPPER_utf8(s, tmpbuf, &ulen);
+ if (uv == GREEK_CAPITAL_LETTER_IOTA && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI) {
+ in_iota_subscript = TRUE;
+ }
+ else {
+#endif
+ if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
+ /* If the eventually required minimum size outgrows
+ * the available space, we need to grow. */
+ const UV o = d - (U8*)SvPVX_const(dest);
+
+ /* If someone uppercases one million U+03B0s we
+ * SvGROW() one million times. Or we could try
+ * guessing how much to allocate without allocating too
+ * much. Such is life. See corresponding comment in lc code
+ * for another option */
+ SvGROW(dest, min);
+ d = (U8*)SvPVX(dest) + o;
+ }
+ Copy(tmpbuf, d, ulen, U8);
+ d += ulen;
+#ifdef CONTEXT_DEPENDENT_CASING
+ }
+#endif
+ s += u;
}
- Copy(tmpbuf, d, ulen, U8);
- d += ulen;
- s += u;
}
+#ifdef CONTEXT_DEPENDENT_CASING
+ if (in_iota_subscript) CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA);
+#endif
SvUTF8_on(dest);
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
- } else {
+ } else { /* Not UTF-8 */
if (len) {
const U8 *const send = s + len;
+
+ /* Use locale casing if in locale; regular style if not treating
+ * latin1 as having case; otherwise the latin1 casing. Do the
+ * whole thing in a tight loop, for speed, */
if (IN_LOCALE_RUNTIME) {
TAINT;
SvTAINTED_on(dest);
for (; s < send; d++, s++)
*d = toUPPER_LC(*s);
}
- else {
- for (; s < send; d++, s++)
+ else if (! IN_UNI_8_BIT) {
+ for (; s < send; d++, s++) {
*d = toUPPER(*s);
+ }
}
- }
+ else {
+ for (; s < send; d++, s++) {
+ *d = toUPPER_LATIN1_MOD(*s);
+ if (*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) continue;
+
+ /* The mainstream case is the tight loop above. To avoid
+ * extra tests in that, all three characters that require
+ * special handling are mapped by the MOD to the one tested
+ * just above.
+ * Use the source to distinguish between the three cases */
+
+ if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+
+ /* uc() of this requires 2 characters, but they are
+ * ASCII. If not enough room, grow the string */
+ if (SvLEN(dest) < ++min) {
+ const UV o = d - (U8*)SvPVX_const(dest);
+ SvGROW(dest, min);
+ d = (U8*)SvPVX(dest) + o;
+ }
+ *d++ = 'S'; *d = 'S'; /* upper case is 'SS' */
+ continue; /* Back to the tight loop; still in ASCII */
+ }
+
+ /* The other two special handling characters have their
+ * upper cases outside the latin1 range, hence need to be
+ * in UTF-8, so the whole result needs to be in UTF-8. So,
+ * here we are somewhere in the middle of processing a
+ * non-UTF-8 string, and realize that we will have to convert
+ * the whole thing to UTF-8. What to do? There are
+ * several possibilities. The simplest to code is to
+ * convert what we have so far, set a flag, and continue on
+ * in the loop. The flag would be tested each time through
+ * the loop, and if set, the next character would be
+ * converted to UTF-8 and stored. But, I (khw) didn't want
+ * to slow down the mainstream case at all for this fairly
+ * rare case, so I didn't want to add a test that didn't
+ * absolutely have to be there in the loop, besides the
+ * possibility that it would get too complicated for
+ * optimizers to deal with. Another possibility is to just
+ * give up, convert the source to UTF-8, and restart the
+ * function that way. Another possibility is to convert
+ * both what has already been processed and what is yet to
+ * come separately to UTF-8, then jump into the loop that
+ * handles UTF-8. But the most efficient time-wise of the
+ * ones I could think of is what follows, and turned out to
+ * not require much extra code. */
+
+ /* Convert what we have so far into UTF-8, telling the
+ * function that we know it should be converted, and to
+ * allow extra space for what we haven't processed yet.
+ * Assume the worst case space requirements for converting
+ * what we haven't processed so far: that it will require
+ * two bytes for each remaining source character, plus the
+ * NUL at the end. This may cause the string pointer to
+ * move, so re-find it. */
+
+ len = d - (U8*)SvPVX_const(dest);
+ SvCUR_set(dest, len);
+ len = sv_utf8_upgrade_flags_grow(dest,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ (send -s) * 2 + 1);
+ d = (U8*)SvPVX(dest) + len;
+
+ /* And append the current character's upper case in UTF-8 */
+ CAT_NON_LATIN1_UC(d, *s);
+
+ /* Now process the remainder of the source, converting to
+ * upper and UTF-8. If a resulting byte is invariant in
+ * UTF-8, output it as-is, otherwise convert to UTF-8 and
+ * append it to the output. */
+
+ s++;
+ for (; s < send; s++) {
+ U8 upper = toUPPER_LATIN1_MOD(*s);
+ if UTF8_IS_INVARIANT(upper) {
+ *d++ = upper;
+ }
+ else {
+ CAT_TWO_BYTE_UNI_UPPER_MOD(d, *s, upper);
+ }
+ }
+
+ /* Here have processed the whole source; no need to continue
+ * with the outer loop. Each character has been converted
+ * to upper case and converted to UTF-8 */
+
+ break;
+ } /* End of processing all latin1-style chars */
+ } /* End of processing all chars */
+ } /* End of source is not empty */
+
if (source != dest) {
- *d = '\0';
+ *d = '\0'; /* Here d points to 1 after last char, add NUL */
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
}
- }
+ } /* End of isn't utf8 */
SvSETMAGIC(dest);
RETURN;
}
@@ -3754,8 +4189,9 @@ PP(pp_lc)
if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source)
&& SvTEMP(source) && !DO_UTF8(source)) {
- /* We can convert in place. */
+ /* We can convert in place, as lowercasing anything in the latin1 range
+ * (or else DO_UTF8 would have been on) doesn't lengthen it */
dest = source;
s = d = (U8*)SvPV_force_nomg(source, len);
min = len + 1;
@@ -3796,56 +4232,148 @@ PP(pp_lc)
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
while (s < send) {
- const STRLEN u = UTF8SKIP(s);
- STRLEN ulen;
- const UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
+/* See comments at the first instance in this file of this ifdef */
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+ if (UTF8_IS_INVARIANT(*s)) {
-#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */
- if (uv == GREEK_CAPITAL_LETTER_SIGMA) {
- NOOP;
- /*
- * Now if the sigma is NOT followed by
- * /$ignorable_sequence$cased_letter/;
- * and it IS preceded by /$cased_letter$ignorable_sequence/;
- * where $ignorable_sequence is [\x{2010}\x{AD}\p{Mn}]*
- * and $cased_letter is [\p{Ll}\p{Lo}\p{Lt}]
- * then it should be mapped to 0x03C2,
- * (GREEK SMALL LETTER FINAL SIGMA),
- * instead of staying 0x03A3.
- * "should be": in other words, this is not implemented yet.
- * See lib/unicore/SpecialCasing.txt.
+ /* Invariant characters use the standard mappings compiled in.
*/
+ *d++ = toLOWER(*s);
+ s++;
}
- if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
- /* If the eventually required minimum size outgrows
- * the available space, we need to grow. */
- const UV o = d - (U8*)SvPVX_const(dest);
-
- /* If someone lowercases one million U+0130s we SvGROW() one
- * million times. Or we could try guessing how much to
- allocate without allocating too much. Such is life. */
- SvGROW(dest, min);
- d = (U8*)SvPVX(dest) + o;
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+ /* As do the ones in the Latin1 range */
+ U8 lower = toLOWER_LATIN1(UTF8_ACCUMULATE(*s, *(s+1)));
+ CAT_UNI_TO_UTF8_TWO_BYTE(d, lower);
+ s += 2;
}
- Copy(tmpbuf, d, ulen, U8);
- d += ulen;
- s += u;
- }
+ else {
+#endif
+ /* Here, is utf8 not in Latin-1 range, have to go out and get
+ * the mappings from the tables. */
+
+ const STRLEN u = UTF8SKIP(s);
+ STRLEN ulen;
+
+/* See comments at the first instance in this file of this ifdef */
+#ifndef CONTEXT_DEPENDENT_CASING
+ toLOWER_utf8(s, tmpbuf, &ulen);
+#else
+ /* Here is context dependent casing, not compiled in currently;
+ * needs more thought and work */
+
+ const UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
+
+ /* If the lower case is a small sigma, it may be that we need
+ * to change it to a final sigma. This happens at the end of
+ * a word that contains more than just this character, and only
+ * when we started with a capital sigma. */
+ if (uv == UNICODE_GREEK_SMALL_LETTER_SIGMA &&
+ s > send - len && /* Makes sure not the first letter */
+ utf8_to_uvchr(s, 0) == UNICODE_GREEK_CAPITAL_LETTER_SIGMA
+ ) {
+
+ /* We use the algorithm in:
+ * http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf (C
+ * is a CAPITAL SIGMA): If C is preceded by a sequence
+ * consisting of a cased letter and a case-ignorable
+ * sequence, and C is not followed by a sequence consisting
+ * of a case ignorable sequence and then a cased letter,
+ * then when lowercasing C, C becomes a final sigma */
+
+ /* To determine if this is the end of a word, need to peek
+ * ahead. Look at the next character */
+ const U8 *peek = s + u;
+
+ /* Skip any case ignorable characters */
+ while (peek < send && is_utf8_case_ignorable(peek)) {
+ peek += UTF8SKIP(peek);
+ }
+
+ /* If we reached the end of the string without finding any
+ * non-case ignorable characters, or if the next such one
+ * is not-cased, then we have met the conditions for it
+ * being a final sigma with regards to peek ahead, and so
+ * must do peek behind for the remaining conditions. (We
+ * know there is stuff behind to look at since we tested
+ * above that this isn't the first letter) */
+ if (peek >= send || ! is_utf8_cased(peek)) {
+ peek = utf8_hop(s, -1);
+
+ /* Here are at the beginning of the first character
+ * before the original upper case sigma. Keep backing
+ * up, skipping any case ignorable characters */
+ while (is_utf8_case_ignorable(peek)) {
+ peek = utf8_hop(peek, -1);
+ }
+
+ /* Here peek points to the first byte of the closest
+ * non-case-ignorable character before the capital
+ * sigma. If it is cased, then by the Unicode
+ * algorithm, we should use a small final sigma instead
+ * of what we have */
+ if (is_utf8_cased(peek)) {
+ STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf,
+ UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA);
+ }
+ }
+ }
+ else { /* Not a context sensitive mapping */
+#endif /* End of commented out context sensitive */
+ if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
+
+ /* If the eventually required minimum size outgrows
+ * the available space, we need to grow. */
+ const UV o = d - (U8*)SvPVX_const(dest);
+
+ /* If someone lowercases one million U+0130s we
+ * SvGROW() one million times. Or we could try
+ * guessing how much to allocate without allocating too
+ * much. Such is life. Another option would be to
+ * grow an extra byte or two more each time we need to
+ * grow, which would cut down the million to 500K, with
+ * little waste */
+ SvGROW(dest, min);
+ d = (U8*)SvPVX(dest) + o;
+ }
+#ifdef CONTEXT_DEPENDENT_CASING
+ }
+#endif
+ /* Copy the newly lowercased letter to the output buffer we're
+ * building */
+ Copy(tmpbuf, d, ulen, U8);
+ d += ulen;
+ s += u;
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+ }
+#endif
+ } /* End of looping through the source string */
SvUTF8_on(dest);
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
- } else {
+ } else { /* Not utf8 */
if (len) {
const U8 *const send = s + len;
+
+ /* Use locale casing if in locale; regular style if not treating
+ * latin1 as having case; otherwise the latin1 casing. Do the
+ * whole thing in a tight loop, for speed, */
if (IN_LOCALE_RUNTIME) {
TAINT;
SvTAINTED_on(dest);
for (; s < send; d++, s++)
*d = toLOWER_LC(*s);
}
- else {
- for (; s < send; d++, s++)
+ else if (! IN_UNI_8_BIT) {
+ for (; s < send; d++, s++) {
*d = toLOWER(*s);
+ }
+ }
+ else {
+ for (; s < send; d++, s++) {
+ *d = toLOWER_LATIN1(*s);
+ }
}
}
if (source != dest) {
diff --git a/utf8.h b/utf8.h
index 1c8e06b59e..7c205d1523 100644
--- a/utf8.h
+++ b/utf8.h
@@ -207,6 +207,7 @@ encoded character.
#define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
+#define IN_UNI_8_BIT (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
#define UTF8_ALLOW_EMPTY 0x0001
#define UTF8_ALLOW_CONTINUATION 0x0002