diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2001-06-27 06:28:56 +0000 |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2001-06-27 06:28:56 +0000 |
commit | ce9b5a55e164f1128756478b6a2bb548abec1980 (patch) | |
tree | 0b616e0fae5ec7204f723235d196ae2b7c124d78 /Objects | |
parent | 236d8b79748fec890d57ad0dd99ea3f1c3ba57df (diff) | |
download | cpython-git-ce9b5a55e164f1128756478b6a2bb548abec1980.tar.gz |
Encode surrogates in UTF-8 even for a wide Py_UNICODE.
Implement sys.maxunicode.
Explicitly wrap around upper/lower computations for wide Py_UNICODE.
In the tests that decode large characters from UTF-8, write the expected
results using the \U notation.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodectype.c | 35 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 19 |
2 files changed, 39 insertions, 15 deletions
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 3bc19b2d44..13fc6128c1 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch) /* Returns the titlecase Unicode characters corresponding to ch or just ch if no titlecase mapping is known. */ -Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch) +Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->title) - return ch + ctype->title; - - return ch + ctype->upper; + ch += ctype->title; + else + ch += ctype->upper; + +#ifdef USE_UCS4_STORAGE + /* The database assumes that the values wrap around at 0x10000. */ + if (ch > 0x10000) + ch -= 0x10000; +#endif + return ch; } /* Returns 1 for Unicode characters having the category 'Lt', 0 @@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch) /* Returns the uppercase Unicode characters corresponding to ch or just ch if no uppercase mapping is known. */ -Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch) +Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - return ch + ctype->upper; + ch += ctype->upper; +#ifdef USE_UCS4_STORAGE + /* The database assumes that the values wrap around at 0x10000. */ + if (ch > 0x10000) + ch -= 0x10000; +#endif + return ch; } /* Returns the lowercase Unicode characters corresponding to ch or just ch if no lowercase mapping is known. */ -Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch) +Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - return ch + ctype->lower; + ch += ctype->lower; +#ifdef USE_UCS4_STORAGE + /* The database assumes that the values wrap around at 0x10000. 
*/ + if (ch > 0x10000) + ch -= 0x10000; +#endif + return ch; } /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ffac3710df..2f66c3cf93 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256]; */ static char unicode_default_encoding[100]; +Py_UNICODE +PyUnicode_GetMax() +{ +#ifdef USE_UCS4_STORAGE + return 0x10FFFF; +#else + /* This is actually an illegal character, so it should + not be passed to unichr. */ + return 0xFFFF; +#endif +} + /* --- Unicode Object ----------------------------------------------------- */ static @@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, cbWritten += 2; } else if (ch < 0x10000) { -#if Py_UNICODE_SIZE == 4 - *p++ = 0xe0 | (ch>>12); - *p++ = 0x80 | ((ch>>6) & 0x3f); - *p++ = 0x80 | (ch & 0x3f); - cbWritten += 3; -#else /* Check for high surrogate */ if (0xD800 <= ch && ch <= 0xDBFF) { if (i != size) { @@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, } *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f)); -#endif } else { *p++ = 0xf0 | (ch>>18); *p++ = 0x80 | ((ch>>12) & 0x3f); |