diff options
author | Benjamin Peterson <benjamin@python.org> | 2012-01-11 18:17:06 -0500 |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2012-01-11 18:17:06 -0500 |
commit | b2bf01d824ea5a13b375d0aa79211c01f8ab726a (patch) | |
tree | c2e840d182aff5a4ae272ca9a80b6a1cf3c1db3d /Objects/unicodectype.c | |
parent | 9007f72db095212a169b3234194fcc08bd14bf6e (diff) | |
download | cpython-git-b2bf01d824ea5a13b375d0aa79211c01f8ab726a.tar.gz |
use full unicode mappings for upper/lower/title case (#12736)
Also broaden the category of characters that count as lowercase/uppercase.
Diffstat (limited to 'Objects/unicodectype.c')
-rw-r--r-- | Objects/unicodectype.c | 97 |
1 file changed, 75 insertions, 22 deletions
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 9f6ac89b9f..05b63cc430 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -21,8 +21,10 @@ #define XID_START_MASK 0x100 #define XID_CONTINUE_MASK 0x200 #define PRINTABLE_MASK 0x400 -#define NODELTA_MASK 0x800 -#define NUMERIC_MASK 0x1000 +#define NUMERIC_MASK 0x800 +#define CASE_IGNORABLE_MASK 0x1000 +#define CASED_MASK 0x2000 +#define EXTENDED_CASE_MASK 0x4000 typedef struct { const Py_UCS4 upper; @@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code) Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->title; - if (ctype->flags & NODELTA_MASK) - return delta; - - if (delta >= 32768) - delta -= 65536; - - return ch + delta; + return ctype->title ? ctype->title : ch; } /* Returns 1 for Unicode characters having the category 'Lt', 0 @@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch) Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->upper; - if (ctype->flags & NODELTA_MASK) - return delta; - if (delta >= 32768) - delta -= 65536; - return ch + delta; + + if (ctype->flags & EXTENDED_CASE_MASK) + return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF]; + return ctype->upper ? ctype->upper : ch; } /* Returns the lowercase Unicode characters corresponding to ch or just @@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); - int delta = ctype->lower; - if (ctype->flags & NODELTA_MASK) - return delta; - if (delta >= 32768) - delta -= 65536; - return ch + delta; + + if (ctype->flags & EXTENDED_CASE_MASK) + return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF]; + return ctype->lower ? 
ctype->lower : ch; +} + +int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->lower & 0xFFFFFF; + int n = ctype->lower >> 24; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + res[0] = ctype->lower ? ctype->lower : ch; + return 1; +} + +int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->title & 0xFFFFFF; + int n = ctype->title >> 24; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + res[0] = ctype->title ? ctype->title : ch; + return 1; +} + +int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + if (ctype->flags & EXTENDED_CASE_MASK) { + int index = ctype->upper & 0xFFFFFF; + int n = ctype->upper >> 24; + int i; + for (i = 0; i < n; i++) + res[i] = _PyUnicode_ExtendedCase[index + i]; + return n; + } + res[0] = ctype->upper ? ctype->upper : ch; + return 1; +} + +int _PyUnicode_IsCased(Py_UCS4 ch) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + return (ctype->flags & CASED_MASK) != 0; +} + +int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch) +{ + const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); + + return (ctype->flags & CASE_IGNORABLE_MASK) != 0; } /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt', |