add code for Unicode semantics for non-utf8 latin1 chars

author: Karl Williamson <khw@khw-desktop.(none)> 2009-11-12 22:40:21 -0700
committer: Rafael Garcia-Suarez <rgs@consttype.org> 2009-11-14 23:34:55 +0100
commit: 00f254e235ff10d6223aa9a402ad5b7a85689829 (patch)
tree: ecbf0c57e651245f5632419e0b86788eb252dac1
parent: a1248f17ffcfa8fe0e91df962317b46b81fc0ce5 (diff)
download: perl-00f254e235ff10d6223aa9a402ad5b7a85689829.tar.gz
7 files changed, 972 insertions, 143 deletions
diff --git a/MANIFEST b/MANIFEST
index 93042f644e..22db6a3c90 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -3476,6 +3476,7 @@ lib/integer.pm			For "use integer"
 lib/integer.t			For "use integer" testing
 lib/Internals.t			For Internals::* testing
 lib/legacy.pm			Pragma to preserve legacy behavior
+lib/legacy.t			For "use legacy" testing
 lib/less.pm			For "use less"
 lib/less.t			See if less support works
 lib/locale.pm			For "use locale"
diff --git a/handy.h b/handy.h
index 9ec64e0a09..848cc0e9bb 100644
--- a/handy.h
+++ b/handy.h
@@ -429,7 +429,7 @@ Returns a boolean indicating whether the C C<char> is a US-ASCII (Basic Latin)
 alphanumeric character (including underscore) or digit.
 
 =for apidoc Am|bool|isALPHA|char ch
-Returns a boolean indicating whether the C C<char> is a US-ASCII (Basic Latin) 
+Returns a boolean indicating whether the C C<char> is a US-ASCII (Basic Latin)
 alphabetic character.
 
 =for apidoc Am|bool|isSPACE|char ch
@@ -479,7 +479,9 @@ US-ASCII (Basic Latin) range are viewed as not having any case.
 #   define isPUNCT(c)	ispunct(c)
 #   define isXDIGIT(c)	isxdigit(c)
 #   define toUPPER(c)	toupper(c)
+#   define toUPPER_LATIN1_MOD(c)    UNI_TO_NATIVE(PL_mod_latin1_uc[(U8) NATIVE_TO_UNI(c)])
 #   define toLOWER(c)	tolower(c)
+#   define toLOWER_LATIN1(c)	UNI_TO_NATIVE(PL_latin1_lc[(U8) NATIVE_TO_UNI(c)])
 #else
 #   define isUPPER(c)	((c) >= 'A' && (c) <= 'Z')
 #   define isLOWER(c)	((c) >= 'a' && (c) <= 'z')
@@ -490,6 +492,15 @@ US-ASCII (Basic Latin) range are viewed as not having any case.
 #   define isPRINT(c)	(((c) >= 32 && (c) < 127))
 #   define isPUNCT(c)	(((c) >= 33 && (c) <= 47) || ((c) >= 58 && (c) <= 64)  || ((c) >= 91 && (c) <= 96) || ((c) >= 123 && (c) <= 126))
 #   define isXDIGIT(c)  (isDIGIT(c) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F'))
+
+/* Use table lookup for speed */
+#   define toLOWER_LATIN1(c)	(PL_latin1_lc[(U8) c])
+
+/* Modified uc.  Is correct uc except for three non-ascii chars which are
+ * all mapped to one of them, and these need special handling */
+#   define toUPPER_LATIN1_MOD(c)    (PL_mod_latin1_uc[(U8) c])
+
+/* ASCII casing. */
 #   define toUPPER(c)	(isLOWER(c) ? (c) - ('a' - 'A') : (c))
 #   define toLOWER(c)	(isUPPER(c) ? (c) + ('a' - 'A') : (c))
 #endif
diff --git a/lib/legacy.pm b/lib/legacy.pm
index a1f21a6fc1..3ffea6912c 100755
--- a/lib/legacy.pm
+++ b/lib/legacy.pm
@@ -2,7 +2,7 @@ package legacy;
 
 our $VERSION = '1.00';
 
-$unicode8bit::hint_bits = 0x00000800;
+$unicode8bit::hint_uni8bit = 0x00000800;
 
 my %legacy_bundle = (
     "5.10" => [qw(unicode8bit)],
@@ -20,21 +20,19 @@ behaviors
 
  use legacy ':5.10'; # Keeps semantics the same as in perl 5.10
 
- no legacy;
-
-=cut
+ use legacy qw(unicode8bit);
 
-    #no legacy qw(unicode8bit);
+ no legacy;
 
-=pod
+ no legacy qw(unicode8bit);
 
 =head1 DESCRIPTION
 
 Some programs may rely on behaviors that for others are problematic or
 even wrong.  A new version of Perl may change behaviors from past ones,
 and when it is viewed that the old way of doing things may be required
-to still be supported, that behavior will be added to the list recognized
-by this pragma to allow that.
+to still be supported, the new behavior will be able to be turned off by using
+this pragma.
 
 Additionally, a new behavior may be supported in a new version of Perl, but
 for whatever reason the default remains the old one.  This pragma can enable
@@ -44,24 +42,92 @@ Like other pragmas (C<use feature>, for example), C<use legacy qw(foo)> will
 only make the legacy behavior for "foo" available from that point to the end of
 the enclosing block.
 
-B<This pragma is, for the moment, a skeleton and does not actually affect any
-behaviors yet>
-
 =head2 B<use legacy>
 
 Preserve the old way of doing things when a new version of Perl is
-released that changes things
+released that would otherwise change the behavior.
+
+The one current possibility is:
+
+=head3 unicode8bit
+
+THIS IS SUBJECT TO CHANGE
+
+Use legacy semantics for the 128 characters on ASCII systems that have the 8th
+bit set.  (See L</EBCDIC platforms> below for EBCDIC systems.)  Unless
+C<S<use locale>> is specified, or the scalar containing such a character is
+known by Perl to be encoded in UTF8, the semantics are essentially that the
+characters have an ordinal number, and that's it.  They are caseless, and
+aren't anything: they're not controls, not letters, not punctuation, ..., not
+anything.
+
+This behavior stems from when Perl did not support Unicode, and ASCII was the
+only known character set outside of C<S<use locale>>.  In order to not
+possibly break pre_Unicode programs, these characters have retained their old
+non-meanings, except when it is clear to Perl that Unicode is what is meant,
+for example by calling utf::upgrade() on a scalar, or if the scalar also
+contains characters that are only available in Unicode.  Then these 128
+characters take on their Unicode meanings.
+
+The problem with this behavior is that a scalar that encodes these characters
+has a different meaning depending on if it is stored as utf8 or not.
+In general, the internal storage method should not affect the
+external behavior.
+
+The behavior is known to have effects on these areas:
+
+=over 4
+
+=item
+
+Changing the case of a scalar, that is, using C<uc()>,
+C<ucfirst()>,
+C<lc()>,
+and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression substitutions.
+
+=item
+
+Using caseless (C</i>) regular expression matching
+
+=item
+
+Matching a number of properties in regular expressions, such as C<\w>
+
+=item
+
+User-defined case change mappings.  You can create a C<ToUpper()> function, for
+example, which overrides Perl's built-in case mappings.  The scalar must be
+encoded in utf8 for your function to actually be invoked.
+
+=back
+
+B<This lack of semantics for these characters is currently the default,>
+outside of C<use locale>.  See below for EBCDIC.
+To turn on B<case changing semantics only> for these characters, use
+C<S<no legacy>>.
+The other legacy behaviors regarding these characters are currently
+unaffected by this pragma.
+
+=head4 EBCDIC platforms
+
+On EBCDIC platforms, the situation is somewhat different.  The legacy
+semantics are whatever the underlying semantics of the native C language
+library are.  Each of the three EBCDIC encodings currently known by Perl is an
+isomorph of the Latin-1 character set.  That means every character in Latin-1
+has a corresponding EBCDIC equivalent, and vice-versa.  Specifying C<S<no
+legacy>> currently makes sure that all EBCDIC characters have the same
+B<casing only> semantics as their corresponding Latin-1 characters.
 
 =head2 B<no legacy>
 
 Turn on a new behavior in a version of Perl that understands
 it but has it turned off by default.  For example, C<no legacy 'foo'> turns on
-behavior C<foo> in the lexical scope of the pragma.  Simply C<no legacy>
-turns on all new behaviors known to the pragma.
+behavior C<foo> in the lexical scope of the pragma.  C<no legacy>
+without any modifier turns on all new behaviors known to the pragma.
 
 =head1 LEGACY BUNDLES
 
-It's possible to turn off all new behaviors past a given release by 
+It's possible to turn off all new behaviors past a given release by
 using a I<legacy bundle>, which is the name of the release prefixed with
 a colon, to distinguish it from an individual legacy behavior.
 
@@ -93,7 +159,7 @@ sub import {
         if (!exists $legacy{$name}) {
             unknown_legacy($name);
         }
-        $^H &= ~$unicode8bit::hint_bits;    # The only thing it could be as of yet
+        $^H &= ~$unicode8bit::hint_uni8bit;    # The only valid thing as of yet
     }
 }
 
@@ -116,7 +182,7 @@ sub unimport {
             unknown_legacy($name);
         }
         else {
-            $^H |= $unicode8bit::hint_bits; # The only thing it could be as of yet
+            $^H |= $unicode8bit::hint_uni8bit; # The only valid thing as of yet
         }
     }
 }
diff --git a/lib/legacy.t b/lib/legacy.t
new file mode 100644
index 0000000000..1d332b7be3
--- /dev/null
+++ b/lib/legacy.t
@@ -0,0 +1,146 @@
+use warnings;
+use strict;
+
+BEGIN {
+    chdir 't' if -d 't';
+    @INC = '../lib';
+    require './test.pl';
+}
+
+#use Test::More;
+
+#plan("no_plan");
+plan(13312);
+
+# First compute the case mappings without resorting to the functions we're
+# testing.
+
+# Initialize the arrays so each $i maps to itself.
+my @posix_to_upper;
+for my $i (0 .. 255) {
+    $posix_to_upper[$i] = chr($i);
+}
+my @posix_to_lower
+= my @posix_to_title
+= my @latin1_to_upper
+= my @latin1_to_lower
+= my @latin1_to_title
+= @posix_to_upper;
+
+# Override the elements in the to_lower arrays that have different lower case 
+# mappings with those mappings.
+for my $i (0x41 .. 0x5A) {
+    $posix_to_lower[$i] = chr(ord($posix_to_lower[$i]) + 32);
+    $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+
+# Same for upper and title
+for my $i (0x61 .. 0x7A) {
+    $posix_to_upper[$i] = chr(ord($posix_to_upper[$i]) - 32);
+    $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+    $posix_to_title[$i] = chr(ord($posix_to_title[$i]) - 32);
+    $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# And the same for those in the latin1 range
+for my $i (0xC0 .. 0xD6, 0xD8 .. 0xDE) {
+    $latin1_to_lower[$i] = chr(ord($latin1_to_lower[$i]) + 32);
+}
+for my $i (0xE0 .. 0xF6, 0xF8 .. 0xFE) {
+    $latin1_to_upper[$i] = chr(ord($latin1_to_upper[$i]) - 32);
+    $latin1_to_title[$i] = chr(ord($latin1_to_title[$i]) - 32);
+}
+
+# Override the abnormal cases.
+$latin1_to_upper[0xB5] = chr(0x39C);
+$latin1_to_title[0xB5] = chr(0x39C);
+$latin1_to_upper[0xDF] = 'SS';
+$latin1_to_title[0xDF] = 'Ss';
+$latin1_to_upper[0xFF] = chr(0x178);
+$latin1_to_title[0xFF] = chr(0x178);
+
+my $repeat = 25;    # Length to make strings.
+
+# Create hashes of strings in several ranges, both for uc and lc.
+my %posix;
+$posix{'uc'} = 'A' x $repeat;
+$posix{'lc'} = 'a' x $repeat ;
+
+my %cyrillic;
+$cyrillic{'uc'} = chr(0x42F) x $repeat;
+$cyrillic{'lc'} = chr(0x44F) x $repeat;
+
+my %latin1;
+$latin1{'uc'} = chr(0xD8) x $repeat;
+$latin1{'lc'} = chr(0xF8) x $repeat;
+
+my %empty;
+$empty{'lc'} = $empty{'uc'} = "";
+
+# Loop so prefix each character being tested with nothing, and the various
+# strings; then loop for suffixes of those strings as well.
+for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
+    for my  $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {
+        for my $i (0 .. 255) {  # For each possible posix or latin1 character
+            my $cp = sprintf "%02X", $i;
+
+            # First try using latin1 (Unicode) semantics.
+            no legacy "unicode8bit";    
+
+            my $phrase = 'with unicode';
+            my $char = chr($i);
+            my $pre_lc = $prefix->{'lc'};
+            my $pre_uc = $prefix->{'uc'};
+            my $post_lc = $suffix->{'lc'};
+            my $post_uc = $suffix->{'uc'};
+            my $to_upper = $pre_lc . $char . $post_lc;
+            my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc;
+            my $to_lower = $pre_uc . $char . $post_uc;
+            my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;
+
+            is (uc($to_upper), $expected_upper,
+
+                # The names are commented out for now to avoid 'wide character
+                # in print' messages.
+                ); #"$cp: $phrase: uc('$to_upper') eq '$expected_upper'");
+            is (lc($to_lower), $expected_lower,
+                ); #"$cp: $phrase: lc('$to_lower') eq '$expected_lower'");
+
+            if ($pre_uc eq "") {    # Title case if null prefix.
+                my $expected_title = $latin1_to_title[$i] . $post_lc;
+                is (ucfirst($to_upper), $expected_title,
+                    ); #"$cp: $phrase: ucfirst('$to_upper') eq '$expected_title'");
+                my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
+                is (lcfirst($to_lower), $expected_lcfirst,
+                    ); #"$cp: $phrase: lcfirst('$to_lower') eq '$expected_lcfirst'");
+            }
+
+            # Then try with posix semantics.
+            use legacy "unicode8bit";
+            $phrase = 'no unicode';
+
+            # These don't contribute anything in this case.
+            next if $suffix == \%cyrillic;
+            next if $suffix == \%latin1;
+            next if $prefix == \%cyrillic;
+            next if $prefix == \%latin1;
+
+            $expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc;
+            $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;
+
+            is (uc($to_upper), $expected_upper,
+                ); #"$cp: $phrase: uc('$to_upper') eq '$expected_upper'");
+            is (lc($to_lower), $expected_lower,
+                ); #"$cp: $phrase: lc('$to_lower') eq '$expected_lower'");
+
+            if ($pre_uc eq "") {
+                my $expected_title = $posix_to_title[$i] . $post_lc;
+                is (ucfirst($to_upper), $expected_title,
+                    ); #"$cp: $phrase: ucfirst('$to_upper') eq '$expected_title'");
+                my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
+                is (lcfirst($to_lower), $expected_lcfirst,
+                    ); #"$cp: $phrase: lcfirst('$to_lower') eq '$expected_lcfirst'");
+            }
+        }
+    }
+}
diff --git a/perl.h b/perl.h
index 874d0c34ea..fe6b7fcec0 100644
--- a/perl.h
+++ b/perl.h
@@ -28,7 +28,7 @@
 
 #ifdef VOIDUSED
 #   undef VOIDUSED
-#endif 
+#endif
 #define VOIDUSED 1
 
 #ifdef PERL_MICRO
@@ -270,13 +270,13 @@
 #define CALLREG_PACKAGE(rx) \
     CALL_FPTR(RX_ENGINE(rx)->qr_package)(aTHX_ (rx))
 
-#if defined(USE_ITHREADS)         
+#if defined(USE_ITHREADS)
 #define CALLREGDUPE(prog,param) \
     Perl_re_dup(aTHX_ (prog),(param))
 
 #define CALLREGDUPE_PVT(prog,param) \
     (prog ? CALL_FPTR(RX_ENGINE(prog)->dupe)(aTHX_ (prog),(param)) \
-          : (REGEXP *)NULL) 
+          : (REGEXP *)NULL)
 #endif
 
 
@@ -310,7 +310,7 @@
 #    define PERL_UNUSED_DECL
 #  endif
 #endif
- 
+
 /* gcc -Wall:
  * for silencing unused variables that are actually used most of the time,
  * but we cannot quite get rid of, such as "ax" in PPCODE+noargs xsubs
@@ -947,7 +947,7 @@ EXTERN_C int usleep(unsigned int);
 #define PERL_DEFAULT_DO_EXEC3_IMPLEMENTATION
 #endif
 
-/* Cannot include embed.h here on Win32 as win32.h has not 
+/* Cannot include embed.h here on Win32 as win32.h has not
    yet been included and defines some config variables e.g. HAVE_INTERP_INTERN
  */
 #if !defined(PERL_FOR_X2P) && !(defined(WIN32)||defined(VMS))
@@ -1198,7 +1198,7 @@ EXTERN_C int usleep(unsigned int);
 #endif
 
 /* In Tru64 use the 4.4BSD struct msghdr, not the 4.3 one.
- * This is important for using IPv6. 
+ * This is important for using IPv6.
  * For OSF/1 3.2, however, defining _SOCKADDR_LEN would be
  * a bad idea since it breaks send() and recv(). */
 #if defined(__osf__) && defined(__alpha) && !defined(_SOCKADDR_LEN) && !defined(DEC_OSF1_3_X)
@@ -2780,7 +2780,7 @@ freeing any remaining Perl interpreters.
 #    define HASATTRIBUTE_WARN_UNUSED_RESULT
 #  endif
 #endif
-#endif /* #ifndef PERL_MICRO */ 
+#endif /* #ifndef PERL_MICRO */
 
 /* USE_5005THREADS needs to be after unixish.h as <pthread.h> includes
  * <sys/signal.h> which defines NSIG - which will stop inclusion of <signal.h>
@@ -2871,7 +2871,7 @@ typedef pthread_key_t	perl_key;
 /* This is complicated.  The child processes return a true native VMS
    status which must be saved.  But there is an assumption in Perl that
    the UNIX child status has some relationship to errno values, so
-   Perl tries to translate it to text in some of the tests.  
+   Perl tries to translate it to text in some of the tests.
    In order to get the string translation correct, for the error, errno
    must be EVMSERR, but that generates a different text message
    than what the test programs are expecting.  So an errno value must
@@ -3131,16 +3131,16 @@ typedef pthread_key_t	perl_key;
 #  define PERL_SET_THX(t)		PERL_SET_CONTEXT(t)
 #endif
 
-/* 
+/*
     This replaces the previous %_ "hack" by the "%p" hacks.
     All that is required is that the perl source does not
-    use "%-p" or "%-<number>p" or "%<number>p" formats.  
-    These formats will still work in perl code.   
+    use "%-p" or "%-<number>p" or "%<number>p" formats.
+    These formats will still work in perl code.
     See comments in sv.c for futher details.
 
     Robin Barker 2005-07-14
 
-    No longer use %1p for VDf = %vd.  RMB 2007-10-19 
+    No longer use %1p for VDf = %vd.  RMB 2007-10-19
 */
 
 #ifndef SVf_
@@ -3162,7 +3162,7 @@ typedef pthread_key_t	perl_key;
 #define SVfARG(p) ((void*)(p))
 
 #ifdef PERL_CORE
-/* not used; but needed for backward compatibilty with XS code? - RMB */ 
+/* not used; but needed for backward compatibilty with XS code? - RMB */
 #  undef VDf
 #else
 #  ifndef VDf
@@ -3171,7 +3171,7 @@ typedef pthread_key_t	perl_key;
 #endif
 
 #ifdef PERL_CORE
-/* not used; but needed for backward compatibilty with XS code? - RMB */ 
+/* not used; but needed for backward compatibilty with XS code? - RMB */
 #  undef UVf
 #else
 #  ifndef UVf
@@ -3251,7 +3251,7 @@ typedef pthread_key_t	perl_key;
 #ifdef PRINTF_FORMAT_NULL_OK
 #  define __attribute__format__null_ok__(x,y,z)  __attribute__format__(x,y,z)
 #else
-#  define __attribute__format__null_ok__(x,y,z)  
+#  define __attribute__format__null_ok__(x,y,z)
 #endif
 
 #ifdef HAS_BUILTIN_EXPECT
@@ -3354,7 +3354,7 @@ typedef        struct crypt_data {     /* straight from /usr/include/crypt.h */
 #endif
 
 #ifdef __LIBCATAMOUNT__
-#undef HAS_PASSWD  /* unixish.h but not unixish enough. */ 
+#undef HAS_PASSWD  /* unixish.h but not unixish enough. */
 #undef HAS_GROUP
 #define FAKE_BIT_BUCKET
 #endif
@@ -4345,9 +4345,85 @@ EXTCONST  unsigned char PL_fold[] = {
 	240,	241,	242,	243,	244,	245,	246,	247,
 	248,	249,	250,	251,	252,	253,	254,	255
 };
-#endif  /* !EBCDIC */
-#else
+#endif  /* !EBCDIC, but still in DOINIT */
+
+/* If these tables are accessed through ebcdic, the access will be converted to
+ * latin1 first */
+EXTCONST  unsigned char PL_latin1_lc[] = {  /* lowercasing */
+	0,	1,	2,	3,	4,	5,	6,	7,
+	8,	9,	10,	11,	12,	13,	14,	15,
+	16,	17,	18,	19,	20,	21,	22,	23,
+	24,	25,	26,	27,	28,	29,	30,	31,
+	32,	33,	34,	35,	36,	37,	38,	39,
+	40,	41,	42,	43,	44,	45,	46,	47,
+	48,	49,	50,	51,	52,	53,	54,	55,
+	56,	57,	58,	59,	60,	61,	62,	63,
+	64,	'a',	'b',	'c',	'd',	'e',	'f',	'g',
+	'h',	'i',	'j',	'k',	'l',	'm',	'n',	'o',
+	'p',	'q',	'r',	's',	't',	'u',	'v',	'w',
+	'x',	'y',	'z',	91,	92,	93,	94,	95,
+	96,	97,	98,	99,	100,	101,	102,	103,
+	104,	105,	106,	107,	108,	109,	110,	111,
+	112,	113,	114,	115,	116,	117,	118,	119,
+	120,	121,	122,	123,	124,	125,	126,	127,
+	128,	129,	130,	131,	132,	133,	134,	135,
+	136,	137,	138,	139,	140,	141,	142,	143,
+	144,	145,	146,	147,	148,	149,	150,	151,
+	152,	153,	154,	155,	156,	157,	158,	159,
+	160,	161,	162,	163,	164,	165,	166,	167,
+	168,	169,	170,	171,	172,	173,	174,	175,
+	176,	177,	178,	179,	180,	181,	182,	183,
+	184,	185,	186,	187,	188,	189,	190,	191,
+	192+32,	193+32,	194+32,	195+32,	196+32,	197+32,	198+32,	199+32,
+	200+32,	201+32,	202+32,	203+32,	204+32,	205+32,	206+32,	207+32,
+	208+32,	209+32,	210+32,	211+32,	212+32,	213+32,	214+32,	215,
+	216+32,	217+32,	218+32,	219+32,	220+32,	221+32,	222+32,	223,	
+	224,	225,	226,	227,	228,	229,	230,	231,
+	232,	233,	234,	235,	236,	237,	238,	239,
+	240,	241,	242,	243,	244,	245,	246,	247,
+	248,	249,	250,	251,	252,	253,	254,	255
+};
+
+/* upper and title case of latin1 characters, modified so that the three tricky
+ * ones are mapped to 255 (which is one of the three) */
+EXTCONST  unsigned char PL_mod_latin1_uc[] = {
+	0,	1,	2,	3,	4,	5,	6,	7,
+	8,	9,	10,	11,	12,	13,	14,	15,
+	16,	17,	18,	19,	20,	21,	22,	23,
+	24,	25,	26,	27,	28,	29,	30,	31,
+	32,	33,	34,	35,	36,	37,	38,	39,
+	40,	41,	42,	43,	44,	45,	46,	47,
+	48,	49,	50,	51,	52,	53,	54,	55,
+	56,	57,	58,	59,	60,	61,	62,	63,
+	64,	65,	66,	67,	68,	69,	70,	71,
+	72,	73,	74,	75,	76,	77,	78,	79,
+	80,	81,	82,	83,	84,	85,	86,	87,
+	88,	89,	90,	91,	92,	93,	94,	95,
+	96,	'A',	'B',	'C',	'D',	'E',	'F',	'G',
+	'H',	'I',	'J',	'K',	'L',	'M',	'N',	'O',
+	'P',	'Q',	'R',	'S',	'T',	'U',	'V',	'W',
+	'X',	'Y',	'Z',	123,	124,	125,	126,	127,
+	128,	129,	130,	131,	132,	133,	134,	135,
+	136,	137,	138,	139,	140,	141,	142,	143,
+	144,	145,	146,	147,	148,	149,	150,	151,
+	152,	153,	154,	155,	156,	157,	158,	159,
+	160,	161,	162,	163,	164,	165,	166,	167,
+	168,	169,	170,	171,	172,	173,	174,	175,
+	176,	177,	178,	179,	180,	255 /*micro*/,	182,	183,
+	184,	185,	186,	187,	188,	189,	190,	191,
+	192,	193,	194,	195,	196,	197,	198,	199,
+	200,	201,	202,	203,	204,	205,	206,	207,
+	208,	209,	210,	211,	212,	213,	214,	215,
+	216,	217,	218,	219,	220,	221,	222,	255 /*sharp s*/,	
+	224-32,	225-32,	226-32,	227-32,	228-32,	229-32,	230-32,	231-32,
+	232-32,	233-32,	234-32,	235-32,	236-32,	237-32,	238-32,	239-32,
+	240-32,	241-32,	242-32,	243-32,	244-32,	245-32,	246-32,	247,
+	248-32,	249-32,	250-32,	251-32,	252-32,	253-32,	254-32,	255
+};
+#else	/* ! DOINIT */
 EXTCONST unsigned char PL_fold[];
+EXTCONST unsigned char PL_mod_latin1_uc[];
+EXTCONST unsigned char PL_latin1_lc[];
 #endif
 
 #ifndef PERL_GLOBAL_STRUCT /* or perlvars.h */
@@ -5970,8 +6046,8 @@ extern void moncontrol(int);
 #define NO_ENV_ARRAY_IN_MAIN
 #endif
 
-/* These are used by Perl_pv_escape() and Perl_pv_pretty() 
- * are here so that they are available throughout the core 
+/* These are used by Perl_pv_escape() and Perl_pv_pretty()
+ * are here so that they are available throughout the core
  * NOTE that even though some are for _escape and some for _pretty
  * there must not be any clashes as the flags from _pretty are
  * passed straight through to _escape.
@@ -5985,7 +6061,7 @@ extern void moncontrol(int);
 
 #define PERL_PV_ESCAPE_FIRSTCHAR    0x0008
 
-#define PERL_PV_ESCAPE_UNI          0x0100     
+#define PERL_PV_ESCAPE_UNI          0x0100
 #define PERL_PV_ESCAPE_UNI_DETECT   0x0200
 
 #define PERL_PV_ESCAPE_ALL	    0x1000
diff --git a/pp.c b/pp.c
index 7641b54edf..eaeb89f5c9 100644
--- a/pp.c
+++ b/pp.c
@@ -3525,22 +3525,97 @@ PP(pp_crypt)
 #endif
 }
 
+/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level.  So 
+ * most comments below say UTF-8, when in fact they mean UTF-EBCDIC as well */
+
+/* Both the characters below can be stored in two UTF-8 bytes.  In UTF-8 the max
+ * character that 2 bytes can hold is U+07FF, and in UTF-EBCDIC it is U+03FF.
+ * See http://www.unicode.org/unicode/reports/tr16 */
+#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178	/* Also is title case */
+#define GREEK_CAPITAL_LETTER_MU 0x039C	/* Upper and title case of MICRON */
+
+/* Below are several macros that generate code */
+/* Generates code to store a unicode codepoint c that is known to occupy
+ * exactly two UTF-8 and UTF-EBCDIC bytes; it is stored into p and p+1. */
+#define STORE_UNI_TO_UTF8_TWO_BYTE(p, c)				    \
+    STMT_START {							    \
+	*(p) = UTF8_TWO_BYTE_HI(c);					    \
+	*((p)+1) = UTF8_TWO_BYTE_LO(c);					    \
+    } STMT_END
+
+/* Like STORE_UNI_TO_UTF8_TWO_BYTE, but advances p to point to the next
+ * available byte after the two bytes */
+#define CAT_UNI_TO_UTF8_TWO_BYTE(p, c)					    \
+    STMT_START {							    \
+	*(p)++ = UTF8_TWO_BYTE_HI(c);					    \
+	*((p)++) = UTF8_TWO_BYTE_LO(c);					    \
+    } STMT_END
+
+/* Generates code to store the upper case of latin1 character l which is known
+ * to have its upper case be non-latin1 into the two bytes p and p+1.  There
+ * are only two characters that fit this description, and this macro knows
+ * about them, and that the upper case values fit into two UTF-8 or UTF-EBCDIC
+ * bytes */
+#define STORE_NON_LATIN1_UC(p, l)					    \
+STMT_START {								    \
+    if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {			    \
+	STORE_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);  \
+    } else { /* Must be the following letter */								    \
+	STORE_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU);	    \
+    }									    \
+} STMT_END
+
+/* Like STORE_NON_LATIN1_UC, but advances p to point to the next available byte
+ * after the character stored */
+#define CAT_NON_LATIN1_UC(p, l)						    \
+STMT_START {								    \
+    if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {			    \
+	CAT_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);    \
+    } else {								    \
+	CAT_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU);		    \
+    }									    \
+} STMT_END
+
+/* Generates code to add the two UTF-8 bytes (probably u) that are the upper
+ * case of l into p and p+1.  u must be the result of toUPPER_LATIN1_MOD(l),
+ * and must require two bytes to store it.  Advances p to point to the next
+ * available position */
+#define CAT_TWO_BYTE_UNI_UPPER_MOD(p, l, u)				    \
+STMT_START {								    \
+    if ((u) != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {			    \
+	CAT_UNI_TO_UTF8_TWO_BYTE((p), (u)); /* not special, just save it */ \
+    } else if (l == LATIN_SMALL_LETTER_SHARP_S) {			    \
+	*(p)++ = 'S'; *(p)++ = 'S'; /* upper case is 'SS' */		    \
+    } else {/* else is one of the other two special cases */		    \
+	CAT_NON_LATIN1_UC((p), (l));					    \
+    }									    \
+} STMT_END
+
 PP(pp_ucfirst)
 {
+    /* Actually is both lcfirst() and ucfirst().  Only the first character
+     * changes.  This means that possibly we can change in-place, ie., just
+     * take the source and change that one character and store it back, but not
+     * if read-only etc, or if the length changes */
+
     dVAR;
     dSP;
     SV *source = TOPs;
-    STRLEN slen;
+    STRLEN slen; /* slen is the byte length of the whole SV. */
     STRLEN need;
     SV *dest;
-    bool inplace = TRUE;
-    bool doing_utf8;
+    bool inplace;   /* ? Convert first char only, in-place */
+    bool doing_utf8 = FALSE;		   /* ? using utf8 */
+    bool convert_source_to_utf8 = FALSE;   /* ? need to convert */
     const int op_type = PL_op->op_type;
     const U8 *s;
     U8 *d;
     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-    STRLEN ulen;
-    STRLEN tculen;
+    STRLEN ulen;    /* ulen is the byte length of the original Unicode character
+		     * stored as UTF-8 at s. */
+    STRLEN tculen;  /* tculen is the byte length of the freshly titlecased (or
+		     * lowercased) character stored in tmpbuf.  May be either
+		     * UTF-8 or not, but in either case is the number of bytes */
 
     SvGETMAGIC(source);
     if (SvOK(source)) {
@@ -3552,25 +3627,187 @@ PP(pp_ucfirst)
 	slen = 0;
     }
 
-    if (slen && DO_UTF8(source) && UTF8_IS_START(*s)) {
+    /* We may be able to get away with changing only the first character, in
+     * place, but not if read-only, etc.  Later we may discover more reasons to
+     * not convert in-place. */
+    inplace = SvPADTMP(source) && !SvREADONLY(source) && SvTEMP(source);
+
+    /* First calculate what the changed first character should be.  This affects
+     * whether we can just swap it out, leaving the rest of the string unchanged,
+     * or even if have to convert the dest to UTF-8 when the source isn't */
+
+    if (! slen) {   /* If empty */
+	need = 1; /* still need a trailing NUL */
+    }
+    else if (DO_UTF8(source)) {	/* Is the source utf8? */
 	doing_utf8 = TRUE;
-	utf8_to_uvchr(s, &ulen);
-	if (op_type == OP_UCFIRST) {
-	    toTITLE_utf8(s, tmpbuf, &tculen);
-	} else {
-	    toLOWER_utf8(s, tmpbuf, &tculen);
+
+/* TODO: This is #ifdefd out because it has hard-coded the standard mappings,
+ * and doesn't allow for the user to specify their own.  When code is added to
+ * detect if there is a user-defined mapping in force here, and if so to use
+ * that, then the code below can be compiled.  The detection would be a good
+ * thing anyway, as currently the user-defined mappings only work on utf8
+ * strings, and thus depend on the chosen internal storage method, which is a
+ * bad thing */
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+	if (UTF8_IS_INVARIANT(*s)) {
+
+	    /* An invariant source character is either ASCII or, in EBCDIC, an
+	     * ASCII equivalent or a caseless C1 control.  In both these cases,
+	     * the lower and upper cases of any character are also invariants
+	     * (and title case is the same as upper case).  So it is safe to
+	     * use the simple case change macros which avoid the overhead of
+	     * the general functions.  Note that if perl were to be extended to
+	     * do locale handling in UTF-8 strings, this wouldn't be true in,
+	     * for example, Lithuanian or Turkic.  */
+	    *tmpbuf = (op_type == OP_LCFIRST) ? toLOWER(*s) : toUPPER(*s);
+	    tculen = ulen = 1;
+	    need = slen + 1;
+	}
+	else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+	    U8 chr;
+
+	    /* Similarly, if the source character isn't invariant but is in the
+	     * latin1 range (or EBCDIC equivalent thereof), we have the case
+	     * changes compiled into perl, and can avoid the overhead of the
+	     * general functions.  In this range, the characters are stored as
+	     * two UTF-8 bytes, and it so happens that any changed-case version
+	     * is also two bytes (in both ASCIIish and EBCDIC machines). */
+	    tculen = ulen = 2;
+	    need = slen + 1;
+
+	    /* Convert the two source bytes to a single Unicode code point
+	     * value, change case and save for below */
+	    chr = UTF8_ACCUMULATE(*s, *(s+1));
+	    if (op_type == OP_LCFIRST) {    /* lower casing is easy */
+		U8 lower = toLOWER_LATIN1(chr);
+		STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, lower);
+	    }
+	    else {	/* ucfirst */
+		U8 upper = toUPPER_LATIN1_MOD(chr);
+
+		/* Most of the latin1 range characters are well-behaved.  Their
+		 * title and upper cases are the same, and are also in the
+		 * latin1 range.  The macro above returns their upper (hence
+		 * title) case, and all that need be done is to save the result
+		 * for below.  However, several characters are problematic, and
+		 * have to be handled specially.  The MOD in the macro name
+		 * above means that these tricky characters all get mapped to
+		 * the single character LATIN_SMALL_LETTER_Y_WITH_DIAERESIS.
+		 * This mapping saves some tests for the majority of the
+		 * characters */
+
+		if (upper != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {
+
+		    /* Not tricky.  Just save it. */
+		    STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, upper);
+		}
+		else if (chr == LATIN_SMALL_LETTER_SHARP_S) {
+
+		    /* This one is tricky because it is two characters long,
+		     * though the UTF-8 is still two bytes, so the stored
+		     * length doesn't change */
+		    *tmpbuf = 'S';  /* The UTF-8 is 'Ss' */
+		    *(tmpbuf + 1) = 's';
+		}
+		else {
+
+		    /* The other two have their title and upper cases the same,
+		     * but are tricky because the changed-case characters
+		     * aren't in the latin1 range.  They, however, do fit into
+		     * two UTF-8 bytes */
+		    STORE_NON_LATIN1_UC(tmpbuf, chr);    
+		}
+	    }
 	}
-	/* If the two differ, we definately cannot do inplace.  */
-	inplace = (ulen == tculen);
-	need = slen + 1 - ulen + tculen;
-    } else {
-	doing_utf8 = FALSE;
-	need = slen + 1;
+	else {
+#endif	/* end of dont want to break user-defined casing */
+
+	    /* Here, can't short-cut the general case */
+
+	    utf8_to_uvchr(s, &ulen);
+	    if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
+	    else toLOWER_utf8(s, tmpbuf, &tculen);
+
+	    /* we can't do in-place if the length changes.  */
+	    if (ulen != tculen) inplace = FALSE;
+	    need = slen + 1 - ulen + tculen;
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+	}
+#endif
     }
+    else { /* Non-zero length, non-UTF-8,  Need to consider locale and if
+	    * latin1 is treated as caseless.  Note that a locale takes
+	    * precedence */ 
+	tculen = 1;	/* Most characters will require one byte, but this will
+			 * need to be overridden for the tricky ones */
+	need = slen + 1;
+
+	if (op_type == OP_LCFIRST) {
+
+	    /* lower case the first letter: no trickiness for any character */
+	    *tmpbuf = (IN_LOCALE_RUNTIME) ? toLOWER_LC(*s) :
+			((IN_UNI_8_BIT) ? toLOWER_LATIN1(*s) : toLOWER(*s));
+	}
+	/* is ucfirst() */
+	else if (IN_LOCALE_RUNTIME) {
+	    *tmpbuf = toUPPER_LC(*s);	/* This would be a bug if any locales
+					 * have upper and title case different
+					 */
+	}
+	else if (! IN_UNI_8_BIT) {
+	    *tmpbuf = toUPPER(*s);	/* Returns caseless for non-ascii, or
+					 * on EBCDIC machines whatever the
+					 * native function does */
+	}
+	else { /* is ucfirst non-UTF-8, not in locale, and cased latin1 */
+	    *tmpbuf = toUPPER_LATIN1_MOD(*s);
+
+	    /* tmpbuf now has the correct title case for all latin1 characters
+	     * except for the several ones that have tricky handling.  All
+	     * of these are mapped by the MOD to the letter below. */
+	    if (*tmpbuf == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {
+
+		/* The length is going to change, with all three of these, so
+		 * can't replace just the first character */
+		inplace = FALSE;
+
+		/* We use the original to distinguish between these tricky
+		 * cases */
+		if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+		    /* Two character title case 'Ss', but can remain non-UTF-8 */
+		    need = slen + 2;
+		    *tmpbuf = 'S';
+		    *(tmpbuf + 1) = 's';   /* Assert: length(tmpbuf) >= 2 */
+		    tculen = 2;
+		}
+		else {
+
+		    /* The other two tricky ones have their title case outside
+		     * latin1.  It is the same as their upper case. */
+		    doing_utf8 = TRUE;
+		    STORE_NON_LATIN1_UC(tmpbuf, *s);
+
+		    /* The UTF-8 and UTF-EBCDIC lengths of both these characters
+		     * and their upper cases is 2. */
+		    tculen = ulen = 2;
+
+		    /* The entire result will have to be in UTF-8.  Assume worst
+		     * case sizing in conversion. (all latin1 characters occupy
+		     * at most two bytes in utf8) */
+		    convert_source_to_utf8 = TRUE;
+		    need = slen * 2 + 1;
+		}
+	    } /* End of is one of the three special chars */
+	} /* End of use Unicode (Latin1) semantics */
+    } /* End of changing the case of the first character */
 
-    if (SvPADTMP(source) && !SvREADONLY(source) && inplace && SvTEMP(source)) {
-	/* We can convert in place.  */
+    /* Here, have the first character's changed case stored in tmpbuf.  Ready to
+     * generate the result */
+    if (inplace) {
 
+	/* We can convert in place.  This means we change just the first
+	 * character without disturbing the rest; no need to grow */
 	dest = source;
 	s = d = (U8*)SvPV_force_nomg(source, slen);
     } else {
@@ -3578,53 +3815,83 @@ PP(pp_ucfirst)
 
 	dest = TARG;
 
+	/* Here, we can't convert in place; we earlier calculated how much
+	 * space we will need, so grow to accommodate that */
 	SvUPGRADE(dest, SVt_PV);
 	d = (U8*)SvGROW(dest, need);
 	(void)SvPOK_only(dest);
 
 	SETs(dest);
-
-	inplace = FALSE;
     }
 
     if (doing_utf8) {
-	if(!inplace) {
-	    /* slen is the byte length of the whole SV.
-	     * ulen is the byte length of the original Unicode character
-	     * stored as UTF-8 at s.
-	     * tculen is the byte length of the freshly titlecased (or
-	     * lowercased) Unicode character stored as UTF-8 at tmpbuf.
-	     * We first set the result to be the titlecased (/lowercased)
-	     * character, and then append the rest of the SV data. */
-	    sv_setpvn(dest, (char*)tmpbuf, tculen);
-	    if (slen > ulen)
-	        sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
+	if (! inplace) {
+	    if (! convert_source_to_utf8) {
+
+		/* Here  both source and dest are in UTF-8, but have to create
+		 * the entire output.  We initialize the result to be the
+		 * title/lower cased first character, and then append the rest
+		 * of the string. */
+		sv_setpvn(dest, (char*)tmpbuf, tculen);
+		if (slen > ulen) {
+		    sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
+		}
+	    }
+	    else {
+		const U8 *const send = s + slen;
+
+		/* Here the dest needs to be in UTF-8, but the source isn't,
+		 * except we earlier UTF-8'd the first character of the source
+		 * into tmpbuf.  First put that into dest, and then append the
+		 * rest of the source, converting it to UTF-8 as we go. */
+
+		/* Assert tculen is 2 here because the only two characters that
+		 * get to this part of the code have 2-byte UTF-8 equivalents */
+		*d++ = *tmpbuf;
+		*d++ = *(tmpbuf + 1);
+		s++;	/* We have just processed the 1st char */
+
+		for (; s < send; s++) {
+		    d = uvchr_to_utf8(d, *s);
+		}
+		*d = '\0';
+		SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
+	    }
 	    SvUTF8_on(dest);
 	}
-	else {
+	else {   /* in-place UTF-8.  Just overwrite the first character */
 	    Copy(tmpbuf, d, tculen, U8);
 	    SvCUR_set(dest, need - 1);
 	}
     }
-    else {
-	if (*s) {
+    else {  /* Neither source nor dest are in or need to be UTF-8 */
+	if (slen) {
 	    if (IN_LOCALE_RUNTIME) {
 		TAINT;
 		SvTAINTED_on(dest);
-		*d = (op_type == OP_UCFIRST)
-		    ? toUPPER_LC(*s) : toLOWER_LC(*s);
 	    }
-	    else
-		*d = (op_type == OP_UCFIRST) ? toUPPER(*s) : toLOWER(*s);
-	} else {
-	    /* See bug #39028  */
+	    if (inplace) {  /* in-place, only need to change the 1st char */
+		*d = *tmpbuf;
+	    }
+	    else {	/* Not in-place */
+
+		/* Copy the case-changed character(s) from tmpbuf */
+		Copy(tmpbuf, d, tculen, U8);
+		d += tculen - 1; /* Code below expects d to point to final
+				  * character stored */
+	    }
+	}
+	else {	/* empty source */
+	    /* See bug #39028: Don't taint if empty  */
 	    *d = *s;
 	}
 
+	/* In a "use bytes" we don't treat the source as UTF-8, but, still want
+	 * the destination to retain that flag */
 	if (SvUTF8(source))
 	    SvUTF8_on(dest);
 
-	if (!inplace) {
+	if (!inplace) {	/* Finish the rest of the string, unchanged */
 	    /* This will copy the trailing NUL  */
 	    Copy(s + 1, d + 1, slen, U8);
 	    SvCUR_set(dest, need - 1);
@@ -3636,7 +3903,7 @@ PP(pp_ucfirst)
 
 /* There's so much setup/teardown code common between uc and lc, I wonder if
    it would be worth merging the two, and just having a switch outside each
-   of the three tight loops.  */
+   of the three tight loops.  There is less and less commonality though */
 PP(pp_uc)
 {
     dVAR;
@@ -3651,9 +3918,16 @@ PP(pp_uc)
     SvGETMAGIC(source);
 
     if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source)
-	&& SvTEMP(source) && !DO_UTF8(source)) {
-	/* We can convert in place.  */
-
+	&& SvTEMP(source) && !DO_UTF8(source)
+	&& (IN_LOCALE_RUNTIME || ! IN_UNI_8_BIT)) {
+
+	/* We can convert in place.  The reason we can't if in UNI_8_BIT is to
+	 * make the loop tight, so we overwrite the source with the dest before
+	 * looking at it, and we need to look at the original source
+	 * afterwards.  There would also need to be code added to handle
+	 * switching to not in-place in midstream if we run into characters
+	 * that change the length.
+	 */
 	dest = source;
 	s = d = (U8*)SvPV_force_nomg(source, len);
 	min = len + 1;
@@ -3693,48 +3967,209 @@ PP(pp_uc)
 	const U8 *const send = s + len;
 	U8 tmpbuf[UTF8_MAXBYTES+1];
 
+/* This is ifdefd out because it needs more work and thought.  It isn't clear
+ * that we should do it.  These are hard-coded rules from the Unicode standard,
+ * and may change.  5.2 gives new guidance on the iota subscript, for example,
+ * which has not been checked against this; and secondly it may be that we are
+ * passed a subset of the context, via a \U...\E, for example, and its not
+ * clear what the best approach is to that */
+#ifdef CONTEXT_DEPENDENT_CASING
+	bool in_iota_subscript = FALSE;
+#endif
+
 	while (s < send) {
-	    const STRLEN u = UTF8SKIP(s);
-	    STRLEN ulen;
-
-	    toUPPER_utf8(s, tmpbuf, &ulen);
-	    if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
-		/* If the eventually required minimum size outgrows
-		 * the available space, we need to grow. */
-		const UV o = d - (U8*)SvPVX_const(dest);
-
-		/* If someone uppercases one million U+03B0s we SvGROW() one
-		 * million times.  Or we could try guessing how much to
-		 allocate without allocating too much.  Such is life. */
-		SvGROW(dest, min);
-		d = (U8*)SvPVX(dest) + o;
+#ifdef CONTEXT_DEPENDENT_CASING
+	    if (in_iota_subscript && ! is_utf8_mark(s)) {
+		/* A non-mark.  Time to output the iota subscript */
+#define GREEK_CAPITAL_LETTER_IOTA 0x0399
+#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
+
+		CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA);
+		in_iota_subscript = FALSE;
+	    }
+#endif
+
+
+/* See comments at the first instance in this file of this ifdef */
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+
+	    /* If the UTF-8 character is invariant, then it is in the range
+	     * known by the standard macro; result is only one byte long */
+	    if (UTF8_IS_INVARIANT(*s)) {
+		*d++ = toUPPER(*s);
+		s++;
+	    }
+	    else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+		/* Likewise, if it fits in a byte, its case change is in our
+		 * table */
+		U8 orig = UTF8_ACCUMULATE(*s, *(s+1));
+		U8 upper = toUPPER_LATIN1_MOD(orig);
+		CAT_TWO_BYTE_UNI_UPPER_MOD(d, orig, upper);
+		s += 2;
+	    }
+	    else {
+#else
+	    {
+#endif
+
+		/* Otherwise, need the general UTF-8 case.  Get the changed
+		 * case value and copy it to the output buffer */
+
+		const STRLEN u = UTF8SKIP(s);
+		STRLEN ulen;
+
+#ifndef CONTEXT_DEPENDENT_CASING
+		toUPPER_utf8(s, tmpbuf, &ulen);
+#else
+		const UV uv = toUPPER_utf8(s, tmpbuf, &ulen);
+		if (uv == GREEK_CAPITAL_LETTER_IOTA && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI) {
+		    in_iota_subscript = TRUE;
+		}
+		else {
+#endif
+		    if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
+			/* If the eventually required minimum size outgrows
+			 * the available space, we need to grow. */
+			const UV o = d - (U8*)SvPVX_const(dest);
+
+			/* If someone uppercases one million U+03B0s we
+			 * SvGROW() one million times.  Or we could try
+			 * guessing how much to allocate without allocating too
+			 * much.  Such is life.  See corresponding comment in lc code
+			 * for another option */
+			SvGROW(dest, min);
+			d = (U8*)SvPVX(dest) + o;
+		    }
+		    Copy(tmpbuf, d, ulen, U8);
+		    d += ulen;
+#ifdef CONTEXT_DEPENDENT_CASING
+		}
+#endif
+		s += u;
 	    }
-	    Copy(tmpbuf, d, ulen, U8);
-	    d += ulen;
-	    s += u;
 	}
+#ifdef CONTEXT_DEPENDENT_CASING
+	if (in_iota_subscript) CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA);
+#endif
 	SvUTF8_on(dest);
 	*d = '\0';
 	SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
-    } else {
+    } else {	/* Not UTF-8 */
 	if (len) {
 	    const U8 *const send = s + len;
+
+	    /* Use locale casing if in locale; regular style if not treating
+	     * latin1 as having case; otherwise the latin1 casing.  Do the
+	     * whole thing in a tight loop, for speed, */
 	    if (IN_LOCALE_RUNTIME) {
 		TAINT;
 		SvTAINTED_on(dest);
 		for (; s < send; d++, s++)
 		    *d = toUPPER_LC(*s);
 	    }
-	    else {
-		for (; s < send; d++, s++)
+	    else if (! IN_UNI_8_BIT) {
+		for (; s < send; d++, s++) {
 		    *d = toUPPER(*s);
+		}
 	    }
-	}
+	    else {
+		for (; s < send; d++, s++) {
+		    *d = toUPPER_LATIN1_MOD(*s);
+		    if (*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) continue;
+
+		    /* The mainstream case is the tight loop above.  To avoid
+		     * extra tests in that, all three characters that require
+		     * special handling are mapped by the MOD to the one tested
+		     * just above.  
+		     * Use the source to distinguish between the three cases */
+
+		    if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+
+			/* uc() of this requires 2 characters, but they are
+			 * ASCII.  If not enough room, grow the string */
+			if (SvLEN(dest) < ++min) {	
+			    const UV o = d - (U8*)SvPVX_const(dest);
+			    SvGROW(dest, min);
+			    d = (U8*)SvPVX(dest) + o;
+			}
+			*d++ = 'S'; *d = 'S'; /* upper case is 'SS' */
+			continue;   /* Back to the tight loop; still in ASCII */
+		    }
+
+		    /* The other two special handling characters have their
+		     * upper cases outside the latin1 range, hence need to be
+		     * in UTF-8, so the whole result needs to be in UTF-8.  So,
+		     * here we are somewhere in the middle of processing a
+		     * non-UTF-8 string, and realize that we will have to convert
+		     * the whole thing to UTF-8.  What to do?  There are
+		     * several possibilities.  The simplest to code is to
+		     * convert what we have so far, set a flag, and continue on
+		     * in the loop.  The flag would be tested each time through
+		     * the loop, and if set, the next character would be
+		     * converted to UTF-8 and stored.  But, I (khw) didn't want
+		     * to slow down the mainstream case at all for this fairly
+		     * rare case, so I didn't want to add a test that didn't
+		     * absolutely have to be there in the loop, besides the
+		     * possibility that it would get too complicated for
+		     * optimizers to deal with.  Another possibility is to just
+		     * give up, convert the source to UTF-8, and restart the
+		     * function that way.  Another possibility is to convert
+		     * both what has already been processed and what is yet to
+		     * come separately to UTF-8, then jump into the loop that
+		     * handles UTF-8.  But the most efficient time-wise of the
+		     * ones I could think of is what follows, and turned out to
+		     * not require much extra code.  */
+
+		    /* Convert what we have so far into UTF-8, telling the
+		     * function that we know it should be converted, and to
+		     * allow extra space for what we haven't processed yet.
+		     * Assume the worst case space requirements for converting
+		     * what we haven't processed so far: that it will require
+		     * two bytes for each remaining source character, plus the
+		     * NUL at the end.  This may cause the string pointer to
+		     * move, so re-find it. */
+
+		    len = d - (U8*)SvPVX_const(dest);
+		    SvCUR_set(dest, len);
+		    len = sv_utf8_upgrade_flags_grow(dest,
+						SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+						(send -s) * 2 + 1);
+		    d = (U8*)SvPVX(dest) + len;
+
+		    /* And append the current character's upper case in UTF-8 */
+		    CAT_NON_LATIN1_UC(d, *s);
+
+		    /* Now process the remainder of the source, converting to
+		     * upper and UTF-8.  If a resulting byte is invariant in
+		     * UTF-8, output it as-is, otherwise convert to UTF-8 and
+		     * append it to the output. */
+
+		    s++;
+		    for (; s < send; s++) {
+			U8 upper = toUPPER_LATIN1_MOD(*s);
+			if UTF8_IS_INVARIANT(upper) {
+			    *d++ = upper;
+			}
+			else {
+			    CAT_TWO_BYTE_UNI_UPPER_MOD(d, *s, upper);
+			}
+		    }
+
+		    /* Here have processed the whole source; no need to continue
+		     * with the outer loop.  Each character has been converted
+		     * to upper case and converted to UTF-8 */
+
+		    break;
+		} /* End of processing all latin1-style chars */
+	    } /* End of processing all chars */
+	} /* End of source is not empty */
+
 	if (source != dest) {
-	    *d = '\0';
+	    *d = '\0';  /* Here d points to 1 after last char, add NUL */
 	    SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
 	}
-    }
+    } /* End of isn't utf8 */
     SvSETMAGIC(dest);
     RETURN;
 }
@@ -3754,8 +4189,9 @@ PP(pp_lc)
 
     if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source)
 	&& SvTEMP(source) && !DO_UTF8(source)) {
-	/* We can convert in place.  */
 
+	/* We can convert in place, as lowercasing anything in the latin1 range
+	 * (or else DO_UTF8 would have been on) doesn't lengthen it */
 	dest = source;
 	s = d = (U8*)SvPV_force_nomg(source, len);
 	min = len + 1;
@@ -3796,56 +4232,148 @@ PP(pp_lc)
 	U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
 
 	while (s < send) {
-	    const STRLEN u = UTF8SKIP(s);
-	    STRLEN ulen;
-	    const UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
+/* See comments at the first instance in this file of this ifdef */
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+	    if (UTF8_IS_INVARIANT(*s)) {
 
-#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */
-	    if (uv == GREEK_CAPITAL_LETTER_SIGMA) {
-		NOOP;
-		/*
-		 * Now if the sigma is NOT followed by
-		 * /$ignorable_sequence$cased_letter/;
-		 * and it IS preceded by /$cased_letter$ignorable_sequence/;
-		 * where $ignorable_sequence is [\x{2010}\x{AD}\p{Mn}]*
-		 * and $cased_letter is [\p{Ll}\p{Lo}\p{Lt}]
-		 * then it should be mapped to 0x03C2,
-		 * (GREEK SMALL LETTER FINAL SIGMA),
-		 * instead of staying 0x03A3.
-		 * "should be": in other words, this is not implemented yet.
-		 * See lib/unicore/SpecialCasing.txt.
+		/* Invariant characters use the standard mappings compiled in.
 		 */
+		*d++ = toLOWER(*s);
+		s++;
 	    }
-	    if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
-		/* If the eventually required minimum size outgrows
-		 * the available space, we need to grow. */
-		const UV o = d - (U8*)SvPVX_const(dest);
-
-		/* If someone lowercases one million U+0130s we SvGROW() one
-		 * million times.  Or we could try guessing how much to
-		 allocate without allocating too much.  Such is life. */
-		SvGROW(dest, min);
-		d = (U8*)SvPVX(dest) + o;
+	    else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+		/* As do the ones in the Latin1 range */
+		U8 lower = toLOWER_LATIN1(UTF8_ACCUMULATE(*s, *(s+1)));
+		CAT_UNI_TO_UTF8_TWO_BYTE(d, lower);
+		s += 2;
 	    }
-	    Copy(tmpbuf, d, ulen, U8);
-	    d += ulen;
-	    s += u;
-	}
+	    else {
+#endif
+		/* Here, is utf8 not in Latin-1 range, have to go out and get
+		 * the mappings from the tables. */
+
+		const STRLEN u = UTF8SKIP(s);
+		STRLEN ulen;
+
+/* See comments at the first instance in this file of this ifdef */
+#ifndef CONTEXT_DEPENDENT_CASING
+		toLOWER_utf8(s, tmpbuf, &ulen);
+#else
+		/* Here is context dependent casing, not compiled in currently;
+		 * needs more thought and work */
+
+		const UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
+
+		/* If the lower case is a small sigma, it may be that we need
+		 * to change it to a final sigma.  This happens at the end of 
+		 * a word that contains more than just this character, and only
+		 * when we started with a capital sigma. */
+		if (uv == UNICODE_GREEK_SMALL_LETTER_SIGMA &&
+		    s > send - len &&	/* Makes sure not the first letter */
+		    utf8_to_uvchr(s, 0) == UNICODE_GREEK_CAPITAL_LETTER_SIGMA
+		) {
+
+		    /* We use the algorithm in:
+		     * http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf (C
+		     * is a CAPITAL SIGMA): If C is preceded by a sequence
+		     * consisting of a cased letter and a case-ignorable
+		     * sequence, and C is not followed by a sequence consisting
+		     * of a case ignorable sequence and then a cased letter,
+		     * then when lowercasing C, C becomes a final sigma */
+
+		    /* To determine if this is the end of a word, need to peek
+		     * ahead.  Look at the next character */
+		    const U8 *peek = s + u;
+
+		    /* Skip any case ignorable characters */
+		    while (peek < send && is_utf8_case_ignorable(peek)) {
+			peek += UTF8SKIP(peek);
+		    }
+
+		    /* If we reached the end of the string without finding any
+		     * non-case ignorable characters, or if the next such one
+		     * is not-cased, then we have met the conditions for it
+		     * being a final sigma with regards to peek ahead, and so
+		     * must do peek behind for the remaining conditions. (We
+		     * know there is stuff behind to look at since we tested
+		     * above that this isn't the first letter) */
+		    if (peek >= send || ! is_utf8_cased(peek)) {
+			peek = utf8_hop(s, -1);
+
+			/* Here are at the beginning of the first character
+			 * before the original upper case sigma.  Keep backing
+			 * up, skipping any case ignorable characters */
+			while (is_utf8_case_ignorable(peek)) {
+			    peek = utf8_hop(peek, -1);
+			}
+
+			/* Here peek points to the first byte of the closest
+			 * non-case-ignorable character before the capital
+			 * sigma.  If it is cased, then by the Unicode
+			 * algorithm, we should use a small final sigma instead
+			 * of what we have */
+			if (is_utf8_cased(peek)) {
+			    STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf,
+					UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA);
+			}
+		    }
+		}
+		else {	/* Not a context sensitive mapping */
+#endif	/* End of commented out context sensitive */
+		    if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
+
+			/* If the eventually required minimum size outgrows
+			 * the available space, we need to grow. */
+			const UV o = d - (U8*)SvPVX_const(dest);
+
+			/* If someone lowercases one million U+0130s we
+			 * SvGROW() one million times.  Or we could try
+			 * guessing how much to allocate without allocating too
+			 * much.  Such is life.  Another option would be to
+			 * grow an extra byte or two more each time we need to
+			 * grow, which would cut down the million to 500K, with
+			 * little waste */
+			SvGROW(dest, min);
+			d = (U8*)SvPVX(dest) + o;
+		    }
+#ifdef CONTEXT_DEPENDENT_CASING
+		}
+#endif
+		/* Copy the newly lowercased letter to the output buffer we're
+		 * building */
+		Copy(tmpbuf, d, ulen, U8);
+		d += ulen;
+		s += u;
+#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS
+	    }
+#endif
+	}   /* End of looping through the source string */
 	SvUTF8_on(dest);
 	*d = '\0';
 	SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
-    } else {
+    } else {	/* Not utf8 */
 	if (len) {
 	    const U8 *const send = s + len;
+
+	    /* Use locale casing if in locale; regular style if not treating
+	     * latin1 as having case; otherwise the latin1 casing.  Do the
+	     * whole thing in a tight loop, for speed, */
 	    if (IN_LOCALE_RUNTIME) {
 		TAINT;
 		SvTAINTED_on(dest);
 		for (; s < send; d++, s++)
 		    *d = toLOWER_LC(*s);
 	    }
-	    else {
-		for (; s < send; d++, s++)
+	    else if (! IN_UNI_8_BIT) {
+		for (; s < send; d++, s++) {
 		    *d = toLOWER(*s);
+		}
+	    }
+	    else {
+		for (; s < send; d++, s++) {
+		    *d = toLOWER_LATIN1(*s);
+		}
 	    }
 	}
 	if (source != dest) {
diff --git a/utf8.h b/utf8.h
index 1c8e06b59e..7c205d1523 100644
--- a/utf8.h
+++ b/utf8.h
@@ -207,6 +207,7 @@ encoded character.
 
 #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
+#define IN_UNI_8_BIT (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
 
 #define UTF8_ALLOW_EMPTY		0x0001
 #define UTF8_ALLOW_CONTINUATION		0x0002
author	Karl Williamson <khw@khw-desktop.(none)>	2009-11-12 22:40:21 -0700
committer	Rafael Garcia-Suarez <rgs@consttype.org>	2009-11-14 23:34:55 +0100
commit	00f254e235ff10d6223aa9a402ad5b7a85689829 (patch)
tree	ecbf0c57e651245f5632419e0b86788eb252dac1
parent	a1248f17ffcfa8fe0e91df962317b46b81fc0ce5 (diff)
download	perl-00f254e235ff10d6223aa9a402ad5b7a85689829.tar.gz