summary | refs | log | tree | commit | diff
path: root/Objects/unicodectype.c
diff options
context:
space:
mode:
author: Benjamin Peterson <benjamin@python.org> 2012-01-11 18:17:06 -0500
committer: Benjamin Peterson <benjamin@python.org> 2012-01-11 18:17:06 -0500
commit: b2bf01d824ea5a13b375d0aa79211c01f8ab726a (patch)
tree: c2e840d182aff5a4ae272ca9a80b6a1cf3c1db3d /Objects/unicodectype.c
parent: 9007f72db095212a169b3234194fcc08bd14bf6e (diff)
download: cpython-git-b2bf01d824ea5a13b375d0aa79211c01f8ab726a.tar.gz
use full unicode mappings for upper/lower/title case (#12736)
Also broaden the category of characters that count as lowercase/uppercase.
Diffstat (limited to 'Objects/unicodectype.c')
-rw-r--r-- Objects/unicodectype.c 97
1 files changed, 75 insertions, 22 deletions
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
index 9f6ac89b9f..05b63cc430 100644
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -21,8 +21,10 @@
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
-#define NODELTA_MASK 0x800
-#define NUMERIC_MASK 0x1000
+#define NUMERIC_MASK 0x800
+#define CASE_IGNORABLE_MASK 0x1000
+#define CASED_MASK 0x2000
+#define EXTENDED_CASE_MASK 0x4000
typedef struct {
const Py_UCS4 upper;
@@ -57,15 +59,8 @@ gettyperecord(Py_UCS4 code)
Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- int delta = ctype->title;
- if (ctype->flags & NODELTA_MASK)
- return delta;
-
- if (delta >= 32768)
- delta -= 65536;
-
- return ch + delta;
+ return ctype->title ? ctype->title : ch;
}
/* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -188,12 +183,10 @@ int _PyUnicode_IsUppercase(Py_UCS4 ch)
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- int delta = ctype->upper;
- if (ctype->flags & NODELTA_MASK)
- return delta;
- if (delta >= 32768)
- delta -= 65536;
- return ch + delta;
+
+ if (ctype->flags & EXTENDED_CASE_MASK)
+ return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
+ return ctype->upper ? ctype->upper : ch;
}
/* Returns the lowercase Unicode characters corresponding to ch or just
@@ -202,12 +195,72 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- int delta = ctype->lower;
- if (ctype->flags & NODELTA_MASK)
- return delta;
- if (delta >= 32768)
- delta -= 65536;
- return ch + delta;
+
+ if (ctype->flags & EXTENDED_CASE_MASK)
+ return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
+ return ctype->lower ? ctype->lower : ch;
+}
+
+int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+ if (ctype->flags & EXTENDED_CASE_MASK) {
+ int index = ctype->lower & 0xFFFFFF;
+ int n = ctype->lower >> 24;
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ res[0] = ctype->lower ? ctype->lower : ch;
+ return 1;
+}
+
+int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+ if (ctype->flags & EXTENDED_CASE_MASK) {
+ int index = ctype->title & 0xFFFFFF;
+ int n = ctype->title >> 24;
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ res[0] = ctype->title ? ctype->title : ch;
+ return 1;
+}
+
+int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+ if (ctype->flags & EXTENDED_CASE_MASK) {
+ int index = ctype->upper & 0xFFFFFF;
+ int n = ctype->upper >> 24;
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ res[0] = ctype->upper ? ctype->upper : ch;
+ return 1;
+}
+
+int _PyUnicode_IsCased(Py_UCS4 ch)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+ return (ctype->flags & CASED_MASK) != 0;
+}
+
+int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+ return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
}
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',