#1477: ur'\U0010FFFF' used to raise an error in narrow unicode builds.

Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, like the unicode-escape codec does. Backport of r61793 and r61853.
author: Amaury Forgeot d'Arc <amauryfa@gmail.com> 2008-03-24 21:16:28 +0000
committer: Amaury Forgeot d'Arc <amauryfa@gmail.com> 2008-03-24 21:16:28 +0000
commit: e4342a538c3f4c08f118f87cf4a8c3e1230c5d1f (patch)
tree: 2911335b1a7f948a8c6893600d54a958ba8fdd0b
parent: b2b04e4e98b58f1ff0fe563feb7e323a812fe989 (diff)
download: cpython-e4342a538c3f4c08f118f87cf4a8c3e1230c5d1f.tar.gz
3 files changed, 64 insertions, 6 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 4f75771458..55fb8e17f0 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -736,12 +736,25 @@ class UnicodeTest(
         print >>out, u'def\n'
 
     def test_ucs4(self):
-        if sys.maxunicode == 0xFFFF:
-            return
         x = u'\U00100000'
         y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
         self.assertEqual(x, y)
 
+        y = r'\U00100000'
+        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
+        self.assertEqual(x, y)
+        y = r'\U00010000'
+        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
+        self.assertEqual(x, y)
+
+        try:
+            '\U11111111'.decode("raw-unicode-escape")
+        except UnicodeDecodeError, e:
+            self.assertEqual(e.start, 0)
+            self.assertEqual(e.end, 10)
+        else:
+            self.fail("Should have raised UnicodeDecodeError")
+
     def test_conversion(self):
         # Make sure __unicode__() works properly
         class Foo0:
diff --git a/Misc/NEWS b/Misc/NEWS
index a97fa52b26..3e95a44da9 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -11,6 +11,13 @@ What's New in Python 2.5.3?
 
 Core and builtins
 -----------------
+
+- Issue #1477: With narrow Unicode builds, the unicode escape sequence
+  \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane.  This
+  affected raw unicode literals and the 'raw-unicode-escape' codec.  Now
+  UTF-16 surrogates are generated in this case, like normal unicode literals
+  and the 'unicode-escape' codec.
+
 - Issue #2321: use pymalloc for unicode object string data to reduce
   memory usage in some circumstances.
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c749ac51a7..e2f1ed323d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2273,8 +2273,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 	    else
 		x += 10 + c - 'A';
 	}
-#ifndef Py_UNICODE_WIDE
-        if (x > 0x10000) {
+        if (x <= 0xffff)
+                /* UCS-2 character */
+                *p++ = (Py_UNICODE) x;
+        else if (x <= 0x10ffff) {
+                /* UCS-4 character. Either store directly, or as
+                   surrogate pair. */
+#ifdef Py_UNICODE_WIDE
+                *p++ = (Py_UNICODE) x;
+#else
+                x -= 0x10000L;
+                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
+                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
+#endif
+        } else {
+            endinpos = s-starts;
+            outpos = p-PyUnicode_AS_UNICODE(v);
             if (unicode_decode_call_errorhandler(
                     errors, &errorHandler,
                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
@@ -2282,8 +2296,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 		    (PyObject **)&v, &outpos, &p))
 		    goto onError;
         }
-#endif
-	*p++ = x;
 	nextByte:
 	;
     }
@@ -2337,6 +2349,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
             *p++ = hexdigit[ch & 15];
         }
         else
+#else
+	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+	if (ch >= 0xD800 && ch < 0xDC00) {
+	    Py_UNICODE ch2;
+	    Py_UCS4 ucs;
+
+	    ch2 = *s++;
+	    size--;
+	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+		*p++ = '\\';
+		*p++ = 'U';
+		*p++ = hexdigit[(ucs >> 28) & 0xf];
+		*p++ = hexdigit[(ucs >> 24) & 0xf];
+		*p++ = hexdigit[(ucs >> 20) & 0xf];
+		*p++ = hexdigit[(ucs >> 16) & 0xf];
+		*p++ = hexdigit[(ucs >> 12) & 0xf];
+		*p++ = hexdigit[(ucs >> 8) & 0xf];
+		*p++ = hexdigit[(ucs >> 4) & 0xf];
+		*p++ = hexdigit[ucs & 0xf];
+		continue;
+	    }
+	    /* Fall through: isolated surrogates are copied as-is */
+	    s--;
+	    size++;
+	}
 #endif
 	/* Map 16-bit characters to '\uxxxx' */
 	if (ch >= 256) {
author	Amaury Forgeot d'Arc <amauryfa@gmail.com>	2008-03-24 21:16:28 +0000
committer	Amaury Forgeot d'Arc <amauryfa@gmail.com>	2008-03-24 21:16:28 +0000
commit	e4342a538c3f4c08f118f87cf4a8c3e1230c5d1f (patch)
tree	2911335b1a7f948a8c6893600d54a958ba8fdd0b
parent	b2b04e4e98b58f1ff0fe563feb7e323a812fe989 (diff)
download	cpython-e4342a538c3f4c08f118f87cf4a8c3e1230c5d1f.tar.gz