author    | Simos Xenitellis <simos@src.gnome.org> | 2008-03-04 11:21:48 +0000
committer | Simos Xenitellis <simos@src.gnome.org> | 2008-03-04 11:21:48 +0000
commit    | 11abc0d6917e043ad47c9f67dae4b4ebde11a22b (patch)
tree      | 4395ed7cf43346dc9e321536e2c6e45a0acf071f /gtk/compose-parse.py
parent    | b9001703a2e6ea313860dbc3f9a094547bf24a2e (diff)
download  | gtk+-11abc0d6917e043ad47c9f67dae4b4ebde11a22b.tar.gz
Fix #321896 - Synch gtkimcontextsimple.c with Xorg
svn path=/trunk/; revision=19706
Diffstat (limited to 'gtk/compose-parse.py')
-rwxr-xr-x | gtk/compose-parse.py | 856
1 file changed, 856 insertions, 0 deletions
diff --git a/gtk/compose-parse.py b/gtk/compose-parse.py new file mode 100755 index 0000000000..af86939cd5 --- /dev/null +++ b/gtk/compose-parse.py @@ -0,0 +1,856 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# compose-parse.py, version 1.3 +# +# multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c) +# the script produces statistics and information about the whole process, run with --help for more. +# +# You may need to switch your python installation to utf-8, if you get 'ascii' codec errors. +# +# Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft. + +from re import findall, match, split, sub +from string import atoi +from unicodedata import normalize +from urllib import urlretrieve +from os.path import isfile, getsize +from copy import copy + +import sys +import getopt + +# We grab files off the web, left and right. +URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre' +URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt" +URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h" +URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt' + +# We currently support keysyms of size 2; once upstream xorg gets sorted, +# we might produce some tables with size 2 and some with size 4. +SIZEOFINT = 2 + +# Current max compose sequence length; in case it gets increased. +WIDTHOFCOMPOSETABLE = 5 + +keysymdatabase = {} +keysymunicodedatabase = {} +unicodedatabase = {} + +headerfile_start = """/* GTK - The GIMP Tool Kit + * Copyright (C) 2007, 2008 GNOME Foundation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896 + * using the input files + * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre + * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt + * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt + * + * This table is optimised for space and requires special handling to access the content. + * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c + * + * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h + * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896 + */ + +/* + * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS + * file for a list of people on the GTK+ Team. See the ChangeLog + * files for a list of changes. These files are distributed with + * GTK+ at ftp://ftp.gtk.org/pub/gtk/. 
+ */ + +#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ +#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ + +/* === These are the original comments of the file; we keep for historical purposes === + * + * The following table was generated from the X compose tables include with + * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com> + * to obtain the relevant perl scripts. + * + * The following compose letter letter sequences confliced + * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over + * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ] + * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ] + * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ] + * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ] + * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ] + * + * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for + * spanish. atilde and otilde are used at least for Portuguese ] + * + * at and Aring; resolved to Aring [ AA ] + * guillemotleft and caron; resolved to guillemotleft [ << ] + * ogonek and cedilla; resolved to cedilla [ ,, ] + * + * This probably should be resolved by first checking an additional set of compose tables + * that depend on the locale or selected input method. + */ + +static const guint16 gtk_compose_seqs_compact[] = {""" + +headerfile_end = """}; + +#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */ +""" + +def stringtohex(str): return atoi(str, 16) + +def factorial(n): + if n <= 1: + return 1 + else: + return n * factorial(n-1) + +def uniq(*args) : + """ Performs a uniq operation on a list or lists """ + theInputList = [] + for theList in args: + theInputList += theList + theFinalList = [] + for elem in theInputList: + if elem not in theFinalList: + theFinalList.append(elem) + return theFinalList + + + +def all_permutations(seq): + """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """ + """ Produces all permutations of the items of a list """ + if len(seq) <=1: + yield seq + else: + for perm in all_permutations(seq[1:]): + for i in range(len(perm)+1): + #nb str[0:1] works in both string and list contexts + yield perm[:i] + seq[0:1] + perm[i:] + +def usage(): + print """compose-parse available parameters: + -h, --help this craft + -s, --statistics show overall statistics (both algorithmic, non-algorithmic) + -a, --algorithmic show sequences saved with algorithmic optimisation + -g, --gtk show entries that go to GTK+ + -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org) + -v, --verbose show verbose output + -p, --plane1 show plane1 compose sequences + -n, --numeric when used with --gtk, create file with numeric values only + -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+ + + Default is to show statistics. 
+ """ + +try: + opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt", + "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"]) +except: + usage() + sys.exit(2) + +opt_statistics = False +opt_algorithmic = False +opt_gtk = False +opt_unicodedatatxt = False +opt_verbose = False +opt_plane1 = False +opt_numeric = False +opt_gtkexpanded = False + +for o, a in opts: + if o in ("-h", "--help"): + usage() + sys.exit() + if o in ("-s", "--statistics"): + opt_statistics = True + if o in ("-a", "--algorithmic"): + opt_algorithmic = True + if o in ("-g", "--gtk"): + opt_gtk = True + if o in ("-u", "--unicodedatatxt"): + opt_unicodedatatxt = True + if o in ("-v", "--verbose"): + opt_verbose = True + if o in ("-p", "--plane1"): + opt_plane1 = True + if o in ("-n", "--numeric"): + opt_numeric = True + if o in ("-e", "--gtk-expanded"): + opt_gtkexpanded = True + +if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt: + opt_statistics = True + +def download_hook(blocks_transferred, block_size, file_size): + """ A download hook to provide some feedback when downloading """ + if blocks_transferred == 0: + if file_size > 0: + if opt_verbose: + print "Downloading", file_size, "bytes: ", + else: + if opt_verbose: + print "Downloading: ", + sys.stdout.write('#') + sys.stdout.flush() + + +def download_file(url): + """ Downloads a file provided a URL. Returns the filename. """ + """ Borks on failure """ + localfilename = url.split('/')[-1] + if not isfile(localfilename) or getsize(localfilename) <= 0: + if opt_verbose: + print "Downloading ", url, "..." + try: + urlretrieve(url, localfilename, download_hook) + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + print " done." 
+ else: + if opt_verbose: + print "Using cached file for ", url + return localfilename + +def process_gdkkeysymsh(): + """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """ + """ Fills up keysymdb with contents """ + filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH) + try: + gdkkeysymsh = open(filename_gdkkeysymsh, 'r') + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + + """ Parse the gdkkeysyms.h file and place contents in keysymdb """ + linenum_gdkkeysymsh = 0 + keysymdb = {} + for line in gdkkeysymsh.readlines(): + linenum_gdkkeysymsh += 1 + line = line.strip() + if line == "" or not match('^#define GDK_', line): + continue + components = split('\s+', line) + if len(components) < 3: + print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ + % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} + print "Was expecting 3 items in the line" + sys.exit(-1) + if not match('^GDK_', components[1]): + print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ + % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} + print "Was expecting a keysym starting with GDK_" + sys.exit(-1) + if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]): + unival = atoi(components[2][2:], 16) + if unival == 0: + continue + keysymdb[components[1][4:]] = unival + else: + print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ + % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} + print "Was expecting a hexadecimal number at the end of the line" + sys.exit(-1) + gdkkeysymsh.close() + + """ Patch up the keysymdb with some of our own stuff """ + + """ This is for a missing keysym from the currently upstread file """ + keysymdb['dead_stroke'] = 0x338 + + """ This is^Wwas preferential treatment for Greek """ + # keysymdb['dead_tilde'] = 0x342 + """ This is^was preferential treatment for Greek """ + #keysymdb['combining_tilde'] = 0x342 + + """ Fixing VoidSymbol """ + keysymdb['VoidSymbol'] = 0xFFFF + + return keysymdb + +def process_keysymstxt(): + """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """ + """ This file keeps a record between keysyms <-> unicode chars """ + filename_keysymstxt = download_file(URL_KEYSYMSTXT) + try: + keysymstxt = open(filename_keysymstxt, 'r') + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + + """ Parse the keysyms.txt file and place content in keysymdb """ + linenum_keysymstxt = 0 + keysymdb = {} + for line in keysymstxt.readlines(): + linenum_keysymstxt += 1 + line = line.strip() + if line == "" or match('^#', line): + continue + components = split('\s+', line) + if len(components) < 5: + print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\ + % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line} + print "Was expecting 5 items in the line" + sys.exit(-1) + if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]): + unival = atoi(components[1][1:], 16) + if unival == 0: + continue + keysymdb[components[4]] = unival + keysymstxt.close() + + """ Patch up the keysymdb with some of our own stuff """ + + """ This is preferential treatment for Greek """ + """ => we get more savings if used for Greek """ + # keysymdb['dead_tilde'] = 0x342 + """ This is 
preferential treatment for Greek """ + # keysymdb['combining_tilde'] = 0x342 + + """ This is for a missing keysym from Marcus Khun's db """ + keysymdb['dead_stroke'] = 0x338 + """ This is for a missing keysym from Marcus Khun's db """ + # keysymdb['Oslash'] = 0x0d8 + + """ This is for a missing (recently added) keysym """ + keysymdb['dead_psili'] = 0x313 + """ This is for a missing (recently added) keysym """ + keysymdb['dead_dasia'] = 0x314 + + """ Allows to import Multi_key sequences """ + keysymdb['Multi_key'] = 0xff20 + + return keysymdb + +def keysymvalue(keysym, file = "n/a", linenum = 0): + """ Extracts a value from the keysym """ + """ Find the value of keysym, using the data from keysyms """ + """ Use file and linenum to when reporting errors """ + if keysym == "": + return 0 + if keysymdatabase.has_key(keysym): + return keysymdatabase[keysym] + elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): + return atoi(keysym[1:], 16) + elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): + return atoi(keysym[2:], 16) + else: + print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym } + sys.exit(-1) + +def keysymunicodevalue(keysym, file = "n/a", linenum = 0): + """ Extracts a value from the keysym """ + """ Find the value of keysym, using the data from keysyms """ + """ Use file and linenum to when reporting errors """ + if keysym == "": + return 0 + if keysymunicodedatabase.has_key(keysym): + return keysymunicodedatabase[keysym] + elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): + return atoi(keysym[1:], 16) + elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): + return atoi(keysym[2:], 16) + else: + print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym } + sys.exit(-1) + +def rename_combining(seq): + filtered_sequence = [] + for ks in seq: + if findall('^combining_', ks): + filtered_sequence.append(sub('^combining_', 'dead_', ks)) + else: + filtered_sequence.append(ks) + return filtered_sequence + + +keysymunicodedatabase = process_keysymstxt() +keysymdatabase = process_gdkkeysymsh() + +""" Grab and open the compose file from upstream """ +filename_compose = download_file(URL_COMPOSE) +try: + composefile = open(filename_compose, 'r') +except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) +except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + +""" Parse the compose file in xorg_compose_sequences""" +xorg_compose_sequences = [] +xorg_compose_sequences_algorithmic = [] +linenum_compose = 0 +for line in composefile.readlines(): + linenum_compose += 1 + line = line.strip() + if line is "" or match("^XCOMM", line) or match("^#", line): + continue + + line = line[:-1] + components = split(':', line) + if len(components) != 2: + print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\ + /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose } + exit(-1) + (seq, val ) = split(':', line) + seq = seq.strip() + val = val.strip() + raw_sequence = findall('\w+', seq) + values = split('\s+', val) + unichar_temp = split('"', values[0]) + unichar = unichar_temp[1] + codepointstr = values[1] + if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]): + raw_sequence[0] = '0x' + raw_sequence[0][1:] + if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]): + codepoint = atoi(codepointstr[1:], 16) + elif keysymdatabase.has_key(codepointstr): + codepoint = keysymdatabase[codepointstr] + else: + print + print "Invalid codepoint at 
line %(linenum_compose)d in %(filename)s:\ + %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line } + exit(-1) + sequence = rename_combining(raw_sequence) + reject_this = False + for i in sequence: + if keysymvalue(i) > 0xFFFF: + reject_this = True + if opt_plane1: + print sequence + break + if reject_this: + continue + if "U0313" in sequence or "U0314" in sequence or "0x0313" in sequence or "0x0314" in sequence: + continue + for i in range(len(sequence)): + if sequence[i] == "0x0342": + sequence[i] = "dead_tilde" + if "Multi_key" not in sequence: + """ Ignore for now >0xFFFF keysyms """ + if codepoint < 0xFFFF: + original_sequence = copy(sequence) + stats_sequence = copy(sequence) + base = sequence.pop() + basechar = keysymvalue(base, filename_compose, linenum_compose) + + if basechar < 0xFFFF: + counter = 1 + unisequence = [] + not_normalised = True + skipping_this = False + for i in range(0, len(sequence)): + """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically + because of lack of dead_perispomeni (i.e. conflict) + """ + bc = basechar + if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): + skipping_this = True + break + if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): + skipping_this = True + break + if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): + skipping_this = True + break + if sequence[-1] == "dead_psili": + sequence[i] = "dead_horn" + if sequence[-1] == "dead_dasia": + sequence[-1] = "dead_ogonek" + unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose))) + + if skipping_this: + unisequence = [] + for perm in all_permutations(unisequence): + # print counter, original_sequence, unichr(basechar) + "".join(perm) + # print counter, map(unichr, perm) + normalized = normalize('NFC', unichr(basechar) + "".join(perm)) + if len(normalized) == 1: + # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \ + # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint }, + # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter } + stats_sequence_data = map(keysymunicodevalue, stats_sequence) + stats_sequence_data.append(normalized) + xorg_compose_sequences_algorithmic.append(stats_sequence_data) + not_normalised = False + break; + counter += 1 + if not_normalised: + original_sequence.append(codepoint) + xorg_compose_sequences.append(original_sequence) + """ print xorg_compose_sequences[-1] """ + + else: + print "Error in base char !?!" 
+ exit(-2) + else: + print "OVER", sequence + exit(-1) + else: + sequence.append(codepoint) + xorg_compose_sequences.append(sequence) + """ print xorg_compose_sequences[-1] """ + +def sequence_cmp(x, y): + if keysymvalue(x[0]) > keysymvalue(y[0]): + return 1 + elif keysymvalue(x[0]) < keysymvalue(y[0]): + return -1 + elif len(x) > len(y): + return 1 + elif len(x) < len(y): + return -1 + elif keysymvalue(x[1]) > keysymvalue(y[1]): + return 1 + elif keysymvalue(x[1]) < keysymvalue(y[1]): + return -1 + elif len(x) < 4: + return 0 + elif keysymvalue(x[2]) > keysymvalue(y[2]): + return 1 + elif keysymvalue(x[2]) < keysymvalue(y[2]): + return -1 + elif len(x) < 5: + return 0 + elif keysymvalue(x[3]) > keysymvalue(y[3]): + return 1 + elif keysymvalue(x[3]) < keysymvalue(y[3]): + return -1 + elif len(x) < 6: + return 0 + elif keysymvalue(x[4]) > keysymvalue(y[4]): + return 1 + elif keysymvalue(x[4]) < keysymvalue(y[4]): + return -1 + else: + return 0 + +def sequence_unicode_cmp(x, y): + if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]): + return 1 + elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]): + return -1 + elif len(x) > len(y): + return 1 + elif len(x) < len(y): + return -1 + elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]): + return 1 + elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]): + return -1 + elif len(x) < 4: + return 0 + elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]): + return 1 + elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]): + return -1 + elif len(x) < 5: + return 0 + elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]): + return 1 + elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]): + return -1 + elif len(x) < 6: + return 0 + elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]): + return 1 + elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]): + return -1 + else: + return 0 + +def sequence_algorithmic_cmp(x, y): + if len(x) < len(y): + return -1 + elif len(x) > len(y): + return 1 + else: + for i in range(len(x)): + if x[i] < y[i]: + return -1 + elif x[i] > y[i]: + return 1 + return 0 + + +xorg_compose_sequences.sort(sequence_cmp) + +xorg_compose_sequences_uniqued = [] +first_time = True +item = None +for next_item in xorg_compose_sequences: + if first_time: + first_time = False + item = next_item + if sequence_unicode_cmp(item, next_item) != 0: + xorg_compose_sequences_uniqued.append(item) + item = next_item + +xorg_compose_sequences = copy(xorg_compose_sequences_uniqued) + +counter_multikey = 0 +for item in xorg_compose_sequences: + if findall('Multi_key', "".join(item[:-1])) != []: + counter_multikey += 1 + +xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp) +xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic) + +firstitem = "" +num_first_keysyms = 0 +zeroes = 0 +num_entries = 0 +num_algorithmic_greek = 0 +for sequence in xorg_compose_sequences: + if keysymvalue(firstitem) != keysymvalue(sequence[0]): + firstitem = sequence[0] + num_first_keysyms += 1 + zeroes += 6 - len(sequence) + 1 + num_entries += 1 + +for sequence in xorg_compose_sequences_algorithmic_uniqued: + ch = ord(sequence[-1:][0]) + if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff: + num_algorithmic_greek += 1 + + +if opt_algorithmic: + for sequence in xorg_compose_sequences_algorithmic_uniqued: + letter = "".join(sequence[-1:]) + print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] }, + for elem in sequence[:-2]: + 
print "<0x%(keysym)04X>," % { 'keysym': elem }, + """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """ + print "], recomposed as", letter, "verified" + +def num_of_keysyms(seq): + return len(seq) - 1 + +def convert_UnotationToHex(arg): + if isinstance(arg, str): + if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg): + return sub('^U', '0x', arg) + return arg + +def addprefix_GDK(arg): + if match('^0x', arg): + return '%(arg)s, ' % { 'arg': arg } + else: + return 'GDK_%(arg)s, ' % { 'arg': arg } + +if opt_gtk: + first_keysym = "" + sequence = [] + compose_table = [] + ct_second_part = [] + ct_sequence_width = 2 + start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1) + we_finished = False + counter = 0 + + sequence_iterator = iter(xorg_compose_sequences) + sequence = sequence_iterator.next() + while True: + first_keysym = sequence[0] # Set the first keysym + compose_table.append([first_keysym, 0, 0, 0, 0, 0]) + while sequence[0] == first_keysym: + compose_table[counter][num_of_keysyms(sequence)-1] += 1 + try: + sequence = sequence_iterator.next() + except StopIteration: + we_finished = True + break + if we_finished: + break + counter += 1 + + ct_index = start_offset + for line_num in range(len(compose_table)): + for i in range(WIDTHOFCOMPOSETABLE): + occurences = compose_table[line_num][i+1] + compose_table[line_num][i+1] = ct_index + ct_index += occurences * (i+2) + + for sequence in xorg_compose_sequences: + ct_second_part.append(map(convert_UnotationToHex, sequence)) + + print headerfile_start + for i in compose_table: + if opt_gtkexpanded: + print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) }, + print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) } + elif not match('^0x', i[0]): + print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) } + else: + print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) } + for i in ct_second_part: + if opt_numeric: + for ks in i[1:][:-1]: + print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) }, + print '0x%(cp)04X, ' % { 'cp':i[-1] } + """ + for ks in i[:-1]: + print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) }, + print '0x%(cp)04X, ' % { 'cp':i[-1] } + """ + elif opt_gtkexpanded: + print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] } + else: + print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] } + print headerfile_end + +def redecompose(codepoint): + (name, decomposition, combiningclass) = unicodedatabase[codepoint] + if decomposition[0] == '' or decomposition[0] == '0': + return [codepoint] + if match('<\w+>', decomposition[0]): + numdecomposition = map(stringtohex, decomposition[1:]) + return map(redecompose, numdecomposition) + numdecomposition = map(stringtohex, decomposition) + return map(redecompose, numdecomposition) + +def process_unicodedata_file(verbose = False): + """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """ + filename_unicodedatatxt = download_file(URL_UNICODEDATATXT) + try: + unicodedatatxt = open(filename_unicodedatatxt, 'r') + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + for line in unicodedatatxt.readlines(): + if line[0] == "" or line[0] == '#': + continue + line = line[:-1] + uniproperties = split(';', line) + codepoint = stringtohex(uniproperties[0]) + """ We don't do Plane 1 or CJK blocks. 
The latter require reading additional files. """ + if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): + continue + name = uniproperties[1] + category = uniproperties[2] + combiningclass = uniproperties[3] + decomposition = uniproperties[5] + unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass] + + counter_combinations = 0 + counter_combinations_greek = 0 + counter_entries = 0 + counter_entries_greek = 0 + + for item in unicodedatabase.keys(): + (name, decomposition, combiningclass) = unicodedatabase[item] + if decomposition[0] == '': + continue + print name, "is empty" + elif match('<\w+>', decomposition[0]): + continue + print name, "has weird", decomposition[0] + else: + sequence = map(stringtohex, decomposition) + chrsequence = map(unichr, sequence) + normalized = normalize('NFC', "".join(chrsequence)) + + """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """ + decomposedsequence = [] + for subseq in map(redecompose, sequence): + for seqitem in subseq: + if isinstance(seqitem, list): + for i in seqitem: + if isinstance(i, list): + for j in i: + decomposedsequence.append(j) + else: + decomposedsequence.append(i) + else: + decomposedsequence.append(seqitem) + recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence))) + if len(recomposedchar) == 1 and len(decomposedsequence) > 1: + counter_entries += 1 + counter_combinations += factorial(len(decomposedsequence)-1) + ch = item + if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff: + counter_entries_greek += 1 + counter_combinations_greek += factorial(len(decomposedsequence)-1) + if verbose: + print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) }, + print "[", + for elem in decomposedsequence: + print '<0x%(hex)04X>,' % { 'hex': elem }, + print "], recomposed as", recomposedchar, + if unichr(item) == recomposedchar: + print "verified" + + if verbose == False: + print "Unicode statistics from UnicodeData.txt" + print "Number of entries that can be algorithmically produced :", counter_entries + print " of which are for Greek :", counter_entries_greek + print "Number of compose sequence combinations requiring :", counter_combinations + print " of which are for Greek :", counter_combinations_greek + print "Note: We do not include partial compositions, " + print "thus the slight discrepancy in the figures" + print + +if opt_unicodedatatxt: + process_unicodedata_file(True) + +if opt_statistics: + print + print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic) + print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic) + print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences) + print " of which have Multi_key :", counter_multikey + print + print "Algorithmic (stats for Xorg Compose file)" + print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic) + print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued) + print " of which are for Greek :", num_algorithmic_greek + print + process_unicodedata_file() + print "Not algorithmic (stats from Xorg Compose file)" + print "Number of sequences :", len(xorg_compose_sequences) + print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 
bytes per row)" + print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library" + print "Number of items in flat array :", len(xorg_compose_sequences) * 6 + print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent" + print "Number of different first items :", num_first_keysyms + print "Number of max bytes (if using flat array) :", num_entries * 2 * 6 + print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5 + print + print "Memory needs if both algorithmic+optimised table in latest Xorg compose file" + print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5 + print + print "Existing (old) implementation in GTK+" + print "Number of sequences in old gtkimcontextsimple.c :", 691 + print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes" |
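
The header comments in the generated file describe the gtk_compose_seqs_compact table as "optimised for space" and as requiring "special handling to access the content". For orientation, here is a minimal Python sketch of how a lookup against the flat array emitted by `--gtk` could work, inferred from the offset arithmetic in the `opt_gtk` block of the script above. The `lookup` helper and the `WIDTH`/`ROW` names are illustrative assumptions, not part of the committed code; the real consumer is the C implementation in gtkimcontextsimple.c.

```python
# Sketch only: looking up a compose sequence in the flat array produced
# by "compose-parse.py --gtk".  Layout as emitted by the script:
#   - an index of num_first_keysyms rows, each WIDTHOFCOMPOSETABLE+1 = 6
#     integers wide: [first_keysym, off_1, off_2, off_3, off_4, off_5],
#     where off_n is the element offset of the block of sequences that
#     begin with first_keysym and have n further keysyms;
#   - after the index, every stored sequence is its remaining keysyms
#     followed by the resulting Unicode code point.

WIDTH = 5            # WIDTHOFCOMPOSETABLE in the script
ROW = WIDTH + 1      # integers per index row


def lookup(table, num_first_keysyms, keysyms):
    """Return the composed code point for keysyms, or None if absent."""
    first, rest = keysyms[0], list(keysyms[1:])
    n = len(rest)                      # 1..WIDTH further keysyms
    if not 1 <= n <= WIDTH:
        return None
    for row in range(num_first_keysyms):
        base = row * ROW
        if table[base] != first:
            continue
        start = table[base + n]        # where this length's block begins
        # The block ends where the next block begins: the next offset in
        # this row, the first block of the next row, or the end of the
        # array for the very last block.
        if n < WIDTH:
            end = table[base + n + 1]
        elif row + 1 < num_first_keysyms:
            end = table[(row + 1) * ROW + 1]
        else:
            end = len(table)
        step = n + 1                   # n keysyms plus one code point
        for i in range(start, end, step):
            if list(table[i:i + n]) == rest:
                return table[i + n]
        return None
    return None
```

Within each block the sequences are stored in the sorted order produced by sequence_cmp, so the linear scan in the sketch could be replaced by a binary search; the sketch keeps it linear for clarity.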