summary | refs | log | tree | commit | diff
path: root/localedata/unicode-gen/utf8_gen.py
diff options
context:
space:
mode:
author: Carlos O'Donell <carlos@systemhalted.org>, 2015-12-09 22:27:41 -0500
committer: Carlos O'Donell <carlos@systemhalted.org>, 2015-12-09 22:52:13 -0500
commit: dd8e8e547647bf7a3f6feb816a848a846feeaf14 (patch)
tree: a2565747c02ddaa9b178a5aa9de6fa42aa5ae979 /localedata/unicode-gen/utf8_gen.py
parent: 40b59cace2fd5e5aa04367073a54efc995059376 (diff)
download: glibc-dd8e8e547647bf7a3f6feb816a848a846feeaf14.tar.gz
Update transliteration support to Unicode 7.0.0.
The transliteration files are now autogenerated from upstream Unicode data.
Diffstat (limited to 'localedata/unicode-gen/utf8_gen.py')
-rwxr-xr-x localedata/unicode-gen/utf8_gen.py 28
1 files changed, 11 insertions, 17 deletions
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index f1b88f5b29..bc84c07617 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -29,6 +29,7 @@ It will output UTF-8 file
import sys
import re
+import unicode_utils
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.
@@ -49,13 +50,6 @@ JAMO_FINAL_SHORT_NAME = (
'P', 'H'
)
-def ucs_symbol(code_point):
- '''Return the UCS symbol string for a Unicode character.'''
- if code_point < 0x10000:
- return '<U{:04X}>'.format(code_point)
- else:
- return '<U{:08X}>'.format(code_point)
-
def process_range(start, end, outfile, name):
'''Writes a range of code points into the CHARMAP section of the
output file
@@ -78,7 +72,7 @@ def process_range(start, end, outfile, name):
+ JAMO_MEDIAL_SHORT_NAME[index2] \
+ JAMO_FINAL_SHORT_NAME[index3]
outfile.write('{:<11s} {:<12s} {:s}\n'.format(
- ucs_symbol(i), convert_to_hex(i),
+ unicode_utils.ucs_symbol(i), convert_to_hex(i),
hangul_syllable_name))
return
# UnicodeData.txt file has contains code point ranges like this:
@@ -95,14 +89,14 @@ def process_range(start, end, outfile, name):
for i in range(int(start, 16), int(end, 16), 64 ):
if i > (int(end, 16)-64):
outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
- ucs_symbol(i),
- ucs_symbol(int(end,16)),
+ unicode_utils.ucs_symbol(i),
+ unicode_utils.ucs_symbol(int(end,16)),
convert_to_hex(i),
name))
break
outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
- ucs_symbol(i),
- ucs_symbol(i+63),
+ unicode_utils.ucs_symbol(i),
+ unicode_utils.ucs_symbol(i+63),
convert_to_hex(i),
name))
@@ -168,7 +162,7 @@ def process_charmap(flines, outfile):
# comments, so we keep these comment lines.
outfile.write('%')
outfile.write('{:<11s} {:<12s} {:s}\n'.format(
- ucs_symbol(int(fields[0], 16)),
+ unicode_utils.ucs_symbol(int(fields[0], 16)),
convert_to_hex(int(fields[0], 16)),
fields[1]))
@@ -230,7 +224,7 @@ def process_width(outfile, ulines, elines):
for line in ulines:
fields = line.split(";")
if fields[4] == "NSM" or fields[2] == "Cf":
- width_dict[int(fields[0], 16)] = ucs_symbol(
+ width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
int(fields[0], 16)) + '\t0'
for line in elines:
@@ -238,7 +232,7 @@ def process_width(outfile, ulines, elines):
# UnicodeData.txt:
fields = line.split(";")
if not '..' in fields[0]:
- width_dict[int(fields[0], 16)] = ucs_symbol(
+ width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
int(fields[0], 16)) + '\t2'
else:
code_points = fields[0].split("..")
@@ -247,8 +241,8 @@ def process_width(outfile, ulines, elines):
if key in width_dict:
del width_dict[key]
width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
- ucs_symbol(int(code_points[0], 16)),
- ucs_symbol(int(code_points[1], 16)))
+ unicode_utils.ucs_symbol(int(code_points[0], 16)),
+ unicode_utils.ucs_symbol(int(code_points[1], 16)))
for key in sorted(width_dict):
outfile.write(width_dict[key]+'\n')