summary | refs | log | tree | commit | diff
path: root/Tools/unicode
diff options
context:
space:
mode:
author: Benjamin Peterson <benjamin@python.org> 2012-01-11 18:17:06 -0500
committer: Benjamin Peterson <benjamin@python.org> 2012-01-11 18:17:06 -0500
commit: b2bf01d824ea5a13b375d0aa79211c01f8ab726a (patch)
tree: c2e840d182aff5a4ae272ca9a80b6a1cf3c1db3d /Tools/unicode
parent: 9007f72db095212a169b3234194fcc08bd14bf6e (diff)
download: cpython-git-b2bf01d824ea5a13b375d0aa79211c01f8ab726a.tar.gz
use full unicode mappings for upper/lower/title case (#12736)
Also broaden the category of characters that count as lowercase/uppercase.
Diffstat (limited to 'Tools/unicode')
-rw-r--r-- Tools/unicode/makeunicodedata.py 101
1 file changed, 69 insertions, 32 deletions
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index d9770979e1..140fc6484f 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -22,6 +22,7 @@
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
+# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
@@ -47,6 +48,7 @@ DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
+SPECIAL_CASING = "SpecialCasing%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
@@ -84,8 +86,10 @@ UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
-NODELTA_MASK = 0x800
-NUMERIC_MASK = 0x1000
+NUMERIC_MASK = 0x800
+CASE_IGNORABLE_MASK = 0x1000
+CASED_MASK = 0x2000
+EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
@@ -384,6 +388,7 @@ def makeunicodetype(unicode, trace):
numeric = {}
spaces = []
linebreaks = []
+ extra_casing = []
for char in unicode.chars:
record = unicode.table[char]
@@ -396,7 +401,7 @@ def makeunicodetype(unicode, trace):
delta = True
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
- if category == "Ll":
+ if "Lowercase" in properties:
flags |= LOWER_MASK
if 'Line_Break' in properties or bidirectional == "B":
flags |= LINEBREAK_MASK
@@ -406,7 +411,7 @@ def makeunicodetype(unicode, trace):
spaces.append(char)
if category == "Lt":
flags |= TITLE_MASK
- if category == "Lu":
+ if "Uppercase" in properties:
flags |= UPPER_MASK
if char == ord(" ") or category[0] not in ("C", "Z"):
flags |= PRINTABLE_MASK
@@ -414,35 +419,41 @@ def makeunicodetype(unicode, trace):
flags |= XID_START_MASK
if "XID_Continue" in properties:
flags |= XID_CONTINUE_MASK
- # use delta predictor for upper/lower/title if it fits
- if record[12]:
- upper = int(record[12], 16)
- else:
- upper = char
- if record[13]:
- lower = int(record[13], 16)
- else:
- lower = char
- if record[14]:
- title = int(record[14], 16)
- else:
- # UCD.html says that a missing title char means that
- # it defaults to the uppercase character, not to the
- # character itself. Apparently, in the current UCD (5.x)
- # this feature is never used
- title = upper
- upper_d = upper - char
- lower_d = lower - char
- title_d = title - char
- if -32768 <= upper_d <= 32767 and \
- -32768 <= lower_d <= 32767 and \
- -32768 <= title_d <= 32767:
- # use deltas
- upper = upper_d & 0xffff
- lower = lower_d & 0xffff
- title = title_d & 0xffff
+ if "Cased" in properties:
+ flags |= CASED_MASK
+ if "Case_Ignorable" in properties:
+ flags |= CASE_IGNORABLE_MASK
+ sc = unicode.special_casing.get(char)
+ if sc is None:
+ if record[12]:
+ upper = int(record[12], 16)
+ else:
+ upper = char
+ if record[13]:
+ lower = int(record[13], 16)
+ else:
+ lower = char
+ if record[14]:
+ title = int(record[14], 16)
+ else:
+ title = upper
+ if upper == lower == title:
+ upper = lower = title = 0
else:
- flags |= NODELTA_MASK
+ # This happens when some character maps to more than one
+ # character in uppercase, lowercase, or titlecase. The extra
+ # characters are stored in a different array.
+ flags |= EXTENDED_CASE_MASK
+ lower = len(extra_casing) | (len(sc[0]) << 24)
+ extra_casing.extend(sc[0])
+ upper = len(extra_casing) | (len(sc[2]) << 24)
+ extra_casing.extend(sc[2])
+ # Title is probably equal to upper.
+ if sc[1] == sc[2]:
+ title = upper
+ else:
+ title = len(extra_casing) | (len(sc[1]) << 24)
+ extra_casing.extend(sc[1])
# decimal digit, integer digit
decimal = 0
if record[6]:
@@ -469,6 +480,7 @@ def makeunicodetype(unicode, trace):
print(sum(map(len, numeric.values())), "numeric code points")
print(len(spaces), "whitespace code points")
print(len(linebreaks), "linebreak code points")
+ print(len(extra_casing), "extended case array")
print("--- Writing", FILE, "...")
@@ -482,6 +494,14 @@ def makeunicodetype(unicode, trace):
print("};", file=fp)
print(file=fp)
+ print("/* extended case mappings */", file=fp)
+ print(file=fp)
+ print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
+ for c in extra_casing:
+ print(" %d," % c, file=fp)
+ print("};", file=fp)
+ print(file=fp)
+
# split decomposition index table
index1, index2, shift = splitbins(index, trace)
@@ -1070,6 +1090,23 @@ class UnicodeData:
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
+ sc = self.special_casing = {}
+ with open_data(SPECIAL_CASING, version) as file:
+ for s in file:
+ s = s[:-1].split('#', 1)[0]
+ if not s:
+ continue
+ data = s.split("; ")
+ if data[4]:
+ # We ignore all conditionals (since they depend on
+ # languages) except for one, which is hardcoded. See
+ # handle_capital_sigma in unicodeobject.c.
+ continue
+ c = int(data[0], 16)
+ lower = [int(char, 16) for char in data[1].split()]
+ title = [int(char, 16) for char in data[2].split()]
+ upper = [int(char, 16) for char in data[3].split()]
+ sc[c] = (lower, title, upper)
def uselatin1(self):
# restrict character range to ISO Latin 1