author    | Simos Xenitellis <simos@src.gnome.org> | 2008-03-04 11:21:48 +0000
committer | Simos Xenitellis <simos@src.gnome.org> | 2008-03-04 11:21:48 +0000
commit    | 11abc0d6917e043ad47c9f67dae4b4ebde11a22b (patch)
tree      | 4395ed7cf43346dc9e321536e2c6e45a0acf071f /gtk/compose-parse.py
parent    | b9001703a2e6ea313860dbc3f9a094547bf24a2e (diff)
download  | gtk+-11abc0d6917e043ad47c9f67dae4b4ebde11a22b.tar.gz
Fix #321896 - Synch gtkimcontextsimple.c with Xorg
svn path=/trunk/; revision=19706
Diffstat (limited to 'gtk/compose-parse.py')
-rwxr-xr-x | gtk/compose-parse.py | 856
1 file changed, 856 insertions, 0 deletions
diff --git a/gtk/compose-parse.py b/gtk/compose-parse.py new file mode 100755 index 0000000000..af86939cd5 --- /dev/null +++ b/gtk/compose-parse.py @@ -0,0 +1,856 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# compose-parse.py, version 1.3 +# +# multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c) +# the script produces statistics and information about the whole process, run with --help for more. +# +# You may need to switch your python installation to utf-8, if you get 'ascii' codec errors. +# +# Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft. + +from re import findall, match, split, sub +from string import atoi +from unicodedata import normalize +from urllib import urlretrieve +from os.path import isfile, getsize +from copy import copy + +import sys +import getopt + +# We grab files off the web, left and right. +URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre' +URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt" +URL_GDKKEYSYMSH = "http://svn.gnome.org/svn/gtk%2B/trunk/gdk/gdkkeysyms.h" +URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt' + +# We currently support keysyms of size 2; once upstream xorg gets sorted, +# we might produce some tables with size 2 and some with size 4. +SIZEOFINT = 2 + +# Current max compose sequence length; in case it gets increased. +WIDTHOFCOMPOSETABLE = 5 + +keysymdatabase = {} +keysymunicodedatabase = {} +unicodedatabase = {} + +headerfile_start = """/* GTK - The GIMP Tool Kit + * Copyright (C) 2007, 2008 GNOME Foundation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896 + * using the input files + * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre + * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt + * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt + * + * This table is optimised for space and requires special handling to access the content. + * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c + * + * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h + * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896 + */ + +/* + * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS + * file for a list of people on the GTK+ Team. See the ChangeLog + * files for a list of changes. These files are distributed with + * GTK+ at ftp://ftp.gtk.org/pub/gtk/. 
+ */ + +#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ +#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ + +/* === These are the original comments of the file; we keep for historical purposes === + * + * The following table was generated from the X compose tables include with + * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com> + * to obtain the relevant perl scripts. + * + * The following compose letter letter sequences confliced + * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over + * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ] + * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ] + * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ] + * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ] + * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ] + * + * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for + * spanish. atilde and otilde are used at least for Portuguese ] + * + * at and Aring; resolved to Aring [ AA ] + * guillemotleft and caron; resolved to guillemotleft [ << ] + * ogonek and cedilla; resolved to cedilla [ ,, ] + * + * This probably should be resolved by first checking an additional set of compose tables + * that depend on the locale or selected input method. + */ + +static const guint16 gtk_compose_seqs_compact[] = {""" + +headerfile_end = """}; + +#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */ +""" + +def stringtohex(str): return atoi(str, 16) + +def factorial(n): + if n <= 1: + return 1 + else: + return n * factorial(n-1) + +def uniq(*args) : + """ Performs a uniq operation on a list or lists """ + theInputList = [] + for theList in args: + theInputList += theList + theFinalList = [] + for elem in theInputList: + if elem not in theFinalList: + theFinalList.append(elem) + return theFinalList + + + +def all_permutations(seq): + """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """ + """ Produces all permutations of the items of a list """ + if len(seq) <=1: + yield seq + else: + for perm in all_permutations(seq[1:]): + for i in range(len(perm)+1): + #nb str[0:1] works in both string and list contexts + yield perm[:i] + seq[0:1] + perm[i:] + +def usage(): + print """compose-parse available parameters: + -h, --help this craft + -s, --statistics show overall statistics (both algorithmic, non-algorithmic) + -a, --algorithmic show sequences saved with algorithmic optimisation + -g, --gtk show entries that go to GTK+ + -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org) + -v, --verbose show verbose output + -p, --plane1 show plane1 compose sequences + -n, --numeric when used with --gtk, create file with numeric values only + -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+ + + Default is to show statistics. 
+ """ + +try: + opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt", + "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded"]) +except: + usage() + sys.exit(2) + +opt_statistics = False +opt_algorithmic = False +opt_gtk = False +opt_unicodedatatxt = False +opt_verbose = False +opt_plane1 = False +opt_numeric = False +opt_gtkexpanded = False + +for o, a in opts: + if o in ("-h", "--help"): + usage() + sys.exit() + if o in ("-s", "--statistics"): + opt_statistics = True + if o in ("-a", "--algorithmic"): + opt_algorithmic = True + if o in ("-g", "--gtk"): + opt_gtk = True + if o in ("-u", "--unicodedatatxt"): + opt_unicodedatatxt = True + if o in ("-v", "--verbose"): + opt_verbose = True + if o in ("-p", "--plane1"): + opt_plane1 = True + if o in ("-n", "--numeric"): + opt_numeric = True + if o in ("-e", "--gtk-expanded"): + opt_gtkexpanded = True + +if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt: + opt_statistics = True + +def download_hook(blocks_transferred, block_size, file_size): + """ A download hook to provide some feedback when downloading """ + if blocks_transferred == 0: + if file_size > 0: + if opt_verbose: + print "Downloading", file_size, "bytes: ", + else: + if opt_verbose: + print "Downloading: ", + sys.stdout.write('#') + sys.stdout.flush() + + +def download_file(url): + """ Downloads a file provided a URL. Returns the filename. """ + """ Borks on failure """ + localfilename = url.split('/')[-1] + if not isfile(localfilename) or getsize(localfilename) <= 0: + if opt_verbose: + print "Downloading ", url, "..." + try: + urlretrieve(url, localfilename, download_hook) + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + print " done." 
+ else: + if opt_verbose: + print "Using cached file for ", url + return localfilename + +def process_gdkkeysymsh(): + """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """ + """ Fills up keysymdb with contents """ + filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH) + try: + gdkkeysymsh = open(filename_gdkkeysymsh, 'r') + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + + """ Parse the gdkkeysyms.h file and place contents in keysymdb """ + linenum_gdkkeysymsh = 0 + keysymdb = {} + for line in gdkkeysymsh.readlines(): + linenum_gdkkeysymsh += 1 + line = line.strip() + if line == "" or not match('^#define GDK_', line): + continue + components = split('\s+', line) + if len(components) < 3: + print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ + % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} + print "Was expecting 3 items in the line" + sys.exit(-1) + if not match('^GDK_', components[1]): + print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ + % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} + print "Was expecting a keysym starting with GDK_" + sys.exit(-1) + if components[2][:2] == '0x' and match('[0-9a-fA-F]+$', components[2][2:]): + unival = atoi(components[2][2:], 16) + if unival == 0: + continue + keysymdb[components[1][4:]] = unival + else: + print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ + % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} + print "Was expecting a hexadecimal number at the end of the line" + sys.exit(-1) + gdkkeysymsh.close() + + """ Patch up the keysymdb with some of our own stuff """ + + """ This is for a missing keysym from the currently upstread file """ + keysymdb['dead_stroke'] = 0x338 + + """ This is^Wwas preferential treatment for Greek """ + # keysymdb['dead_tilde'] = 0x342 + """ This is^was preferential treatment for Greek """ + #keysymdb['combining_tilde'] = 0x342 + + """ Fixing VoidSymbol """ + keysymdb['VoidSymbol'] = 0xFFFF + + return keysymdb + +def process_keysymstxt(): + """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """ + """ This file keeps a record between keysyms <-> unicode chars """ + filename_keysymstxt = download_file(URL_KEYSYMSTXT) + try: + keysymstxt = open(filename_keysymstxt, 'r') + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + + """ Parse the keysyms.txt file and place content in keysymdb """ + linenum_keysymstxt = 0 + keysymdb = {} + for line in keysymstxt.readlines(): + linenum_keysymstxt += 1 + line = line.strip() + if line == "" or match('^#', line): + continue + components = split('\s+', line) + if len(components) < 5: + print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\ + % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line} + print "Was expecting 5 items in the line" + sys.exit(-1) + if components[1][0] == 'U' and match('[0-9a-fA-F]+$', components[1][1:]): + unival = atoi(components[1][1:], 16) + if unival == 0: + continue + keysymdb[components[4]] = unival + keysymstxt.close() + + """ Patch up the keysymdb with some of our own stuff """ + + """ This is preferential treatment for Greek """ + """ => we get more savings if used for Greek """ + # keysymdb['dead_tilde'] = 0x342 + """ This is 
preferential treatment for Greek """ + # keysymdb['combining_tilde'] = 0x342 + + """ This is for a missing keysym from Marcus Khun's db """ + keysymdb['dead_stroke'] = 0x338 + """ This is for a missing keysym from Marcus Khun's db """ + # keysymdb['Oslash'] = 0x0d8 + + """ This is for a missing (recently added) keysym """ + keysymdb['dead_psili'] = 0x313 + """ This is for a missing (recently added) keysym """ + keysymdb['dead_dasia'] = 0x314 + + """ Allows to import Multi_key sequences """ + keysymdb['Multi_key'] = 0xff20 + + return keysymdb + +def keysymvalue(keysym, file = "n/a", linenum = 0): + """ Extracts a value from the keysym """ + """ Find the value of keysym, using the data from keysyms """ + """ Use file and linenum to when reporting errors """ + if keysym == "": + return 0 + if keysymdatabase.has_key(keysym): + return keysymdatabase[keysym] + elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): + return atoi(keysym[1:], 16) + elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): + return atoi(keysym[2:], 16) + else: + print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym } + sys.exit(-1) + +def keysymunicodevalue(keysym, file = "n/a", linenum = 0): + """ Extracts a value from the keysym """ + """ Find the value of keysym, using the data from keysyms """ + """ Use file and linenum to when reporting errors """ + if keysym == "": + return 0 + if keysymunicodedatabase.has_key(keysym): + return keysymunicodedatabase[keysym] + elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): + return atoi(keysym[1:], 16) + elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): + return atoi(keysym[2:], 16) + else: + print 'UNKNOWN{%(keysym)s}' % { "keysym": keysym } + sys.exit(-1) + +def rename_combining(seq): + filtered_sequence = [] + for ks in seq: + if findall('^combining_', ks): + filtered_sequence.append(sub('^combining_', 'dead_', ks)) + else: + filtered_sequence.append(ks) + return filtered_sequence + + +keysymunicodedatabase = process_keysymstxt() +keysymdatabase = process_gdkkeysymsh() + +""" Grab and open the compose file from upstream """ +filename_compose = download_file(URL_COMPOSE) +try: + composefile = open(filename_compose, 'r') +except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) +except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + +""" Parse the compose file in xorg_compose_sequences""" +xorg_compose_sequences = [] +xorg_compose_sequences_algorithmic = [] +linenum_compose = 0 +for line in composefile.readlines(): + linenum_compose += 1 + line = line.strip() + if line is "" or match("^XCOMM", line) or match("^#", line): + continue + + line = line[:-1] + components = split(':', line) + if len(components) != 2: + print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\ + /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose } + exit(-1) + (seq, val ) = split(':', line) + seq = seq.strip() + val = val.strip() + raw_sequence = findall('\w+', seq) + values = split('\s+', val) + unichar_temp = split('"', values[0]) + unichar = unichar_temp[1] + codepointstr = values[1] + if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]): + raw_sequence[0] = '0x' + raw_sequence[0][1:] + if codepointstr[0] == 'U' and match('[0-9a-fA-F]+$', codepointstr[1:]): + codepoint = atoi(codepointstr[1:], 16) + elif keysymdatabase.has_key(codepointstr): + codepoint = keysymdatabase[codepointstr] + else: + print + print "Invalid codepoint at 
line %(linenum_compose)d in %(filename)s:\ + %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line } + exit(-1) + sequence = rename_combining(raw_sequence) + reject_this = False + for i in sequence: + if keysymvalue(i) > 0xFFFF: + reject_this = True + if opt_plane1: + print sequence + break + if reject_this: + continue + if "U0313" in sequence or "U0314" in sequence or "0x0313" in sequence or "0x0314" in sequence: + continue + for i in range(len(sequence)): + if sequence[i] == "0x0342": + sequence[i] = "dead_tilde" + if "Multi_key" not in sequence: + """ Ignore for now >0xFFFF keysyms """ + if codepoint < 0xFFFF: + original_sequence = copy(sequence) + stats_sequence = copy(sequence) + base = sequence.pop() + basechar = keysymvalue(base, filename_compose, linenum_compose) + + if basechar < 0xFFFF: + counter = 1 + unisequence = [] + not_normalised = True + skipping_this = False + for i in range(0, len(sequence)): + """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically + because of lack of dead_perispomeni (i.e. conflict) + """ + bc = basechar + if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): + skipping_this = True + break + if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): + skipping_this = True + break + if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): + skipping_this = True + break + if sequence[-1] == "dead_psili": + sequence[i] = "dead_horn" + if sequence[-1] == "dead_dasia": + sequence[-1] = "dead_ogonek" + unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose))) + + if skipping_this: + unisequence = [] + for perm in all_permutations(unisequence): + # print counter, original_sequence, unichr(basechar) + "".join(perm) + # print counter, map(unichr, perm) + normalized = normalize('NFC', unichr(basechar) + "".join(perm)) + if len(normalized) == 1: + # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \ + # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint }, + # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter } + stats_sequence_data = map(keysymunicodevalue, stats_sequence) + stats_sequence_data.append(normalized) + xorg_compose_sequences_algorithmic.append(stats_sequence_data) + not_normalised = False + break; + counter += 1 + if not_normalised: + original_sequence.append(codepoint) + xorg_compose_sequences.append(original_sequence) + """ print xorg_compose_sequences[-1] """ + + else: + print "Error in base char !?!" 
+ exit(-2) + else: + print "OVER", sequence + exit(-1) + else: + sequence.append(codepoint) + xorg_compose_sequences.append(sequence) + """ print xorg_compose_sequences[-1] """ + +def sequence_cmp(x, y): + if keysymvalue(x[0]) > keysymvalue(y[0]): + return 1 + elif keysymvalue(x[0]) < keysymvalue(y[0]): + return -1 + elif len(x) > len(y): + return 1 + elif len(x) < len(y): + return -1 + elif keysymvalue(x[1]) > keysymvalue(y[1]): + return 1 + elif keysymvalue(x[1]) < keysymvalue(y[1]): + return -1 + elif len(x) < 4: + return 0 + elif keysymvalue(x[2]) > keysymvalue(y[2]): + return 1 + elif keysymvalue(x[2]) < keysymvalue(y[2]): + return -1 + elif len(x) < 5: + return 0 + elif keysymvalue(x[3]) > keysymvalue(y[3]): + return 1 + elif keysymvalue(x[3]) < keysymvalue(y[3]): + return -1 + elif len(x) < 6: + return 0 + elif keysymvalue(x[4]) > keysymvalue(y[4]): + return 1 + elif keysymvalue(x[4]) < keysymvalue(y[4]): + return -1 + else: + return 0 + +def sequence_unicode_cmp(x, y): + if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]): + return 1 + elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]): + return -1 + elif len(x) > len(y): + return 1 + elif len(x) < len(y): + return -1 + elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]): + return 1 + elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]): + return -1 + elif len(x) < 4: + return 0 + elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]): + return 1 + elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]): + return -1 + elif len(x) < 5: + return 0 + elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]): + return 1 + elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]): + return -1 + elif len(x) < 6: + return 0 + elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]): + return 1 + elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]): + return -1 + else: + return 0 + +def sequence_algorithmic_cmp(x, y): + if len(x) < len(y): + return -1 + elif len(x) > len(y): + return 1 + else: + for i in range(len(x)): + if x[i] < y[i]: + return -1 + elif x[i] > y[i]: + return 1 + return 0 + + +xorg_compose_sequences.sort(sequence_cmp) + +xorg_compose_sequences_uniqued = [] +first_time = True +item = None +for next_item in xorg_compose_sequences: + if first_time: + first_time = False + item = next_item + if sequence_unicode_cmp(item, next_item) != 0: + xorg_compose_sequences_uniqued.append(item) + item = next_item + +xorg_compose_sequences = copy(xorg_compose_sequences_uniqued) + +counter_multikey = 0 +for item in xorg_compose_sequences: + if findall('Multi_key', "".join(item[:-1])) != []: + counter_multikey += 1 + +xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp) +xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic) + +firstitem = "" +num_first_keysyms = 0 +zeroes = 0 +num_entries = 0 +num_algorithmic_greek = 0 +for sequence in xorg_compose_sequences: + if keysymvalue(firstitem) != keysymvalue(sequence[0]): + firstitem = sequence[0] + num_first_keysyms += 1 + zeroes += 6 - len(sequence) + 1 + num_entries += 1 + +for sequence in xorg_compose_sequences_algorithmic_uniqued: + ch = ord(sequence[-1:][0]) + if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff: + num_algorithmic_greek += 1 + + +if opt_algorithmic: + for sequence in xorg_compose_sequences_algorithmic_uniqued: + letter = "".join(sequence[-1:]) + print '0x%(cp)04X, %(uni)c, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter, 'base': sequence[-2] }, + for elem in sequence[:-2]: + 
print "<0x%(keysym)04X>," % { 'keysym': elem }, + """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """ + print "], recomposed as", letter, "verified" + +def num_of_keysyms(seq): + return len(seq) - 1 + +def convert_UnotationToHex(arg): + if isinstance(arg, str): + if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg): + return sub('^U', '0x', arg) + return arg + +def addprefix_GDK(arg): + if match('^0x', arg): + return '%(arg)s, ' % { 'arg': arg } + else: + return 'GDK_%(arg)s, ' % { 'arg': arg } + +if opt_gtk: + first_keysym = "" + sequence = [] + compose_table = [] + ct_second_part = [] + ct_sequence_width = 2 + start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1) + we_finished = False + counter = 0 + + sequence_iterator = iter(xorg_compose_sequences) + sequence = sequence_iterator.next() + while True: + first_keysym = sequence[0] # Set the first keysym + compose_table.append([first_keysym, 0, 0, 0, 0, 0]) + while sequence[0] == first_keysym: + compose_table[counter][num_of_keysyms(sequence)-1] += 1 + try: + sequence = sequence_iterator.next() + except StopIteration: + we_finished = True + break + if we_finished: + break + counter += 1 + + ct_index = start_offset + for line_num in range(len(compose_table)): + for i in range(WIDTHOFCOMPOSETABLE): + occurences = compose_table[line_num][i+1] + compose_table[line_num][i+1] = ct_index + ct_index += occurences * (i+2) + + for sequence in xorg_compose_sequences: + ct_second_part.append(map(convert_UnotationToHex, sequence)) + + print headerfile_start + for i in compose_table: + if opt_gtkexpanded: + print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) }, + print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) } + elif not match('^0x', i[0]): + print 'GDK_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) } + else: + print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) } + for i in ct_second_part: + if opt_numeric: + for ks in i[1:][:-1]: + print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) }, + print '0x%(cp)04X, ' % { 'cp':i[-1] } + """ + for ks in i[:-1]: + print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) }, + print '0x%(cp)04X, ' % { 'cp':i[-1] } + """ + elif opt_gtkexpanded: + print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] } + else: + print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] } + print headerfile_end + +def redecompose(codepoint): + (name, decomposition, combiningclass) = unicodedatabase[codepoint] + if decomposition[0] == '' or decomposition[0] == '0': + return [codepoint] + if match('<\w+>', decomposition[0]): + numdecomposition = map(stringtohex, decomposition[1:]) + return map(redecompose, numdecomposition) + numdecomposition = map(stringtohex, decomposition) + return map(redecompose, numdecomposition) + +def process_unicodedata_file(verbose = False): + """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """ + filename_unicodedatatxt = download_file(URL_UNICODEDATATXT) + try: + unicodedatatxt = open(filename_unicodedatatxt, 'r') + except IOError, (errno, strerror): + print "I/O error(%s): %s" % (errno, strerror) + sys.exit(-1) + except: + print "Unexpected error: ", sys.exc_info()[0] + sys.exit(-1) + for line in unicodedatatxt.readlines(): + if line[0] == "" or line[0] == '#': + continue + line = line[:-1] + uniproperties = split(';', line) + codepoint = stringtohex(uniproperties[0]) + """ We don't do Plane 1 or CJK blocks. 
The latter require reading additional files. """ + if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): + continue + name = uniproperties[1] + category = uniproperties[2] + combiningclass = uniproperties[3] + decomposition = uniproperties[5] + unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass] + + counter_combinations = 0 + counter_combinations_greek = 0 + counter_entries = 0 + counter_entries_greek = 0 + + for item in unicodedatabase.keys(): + (name, decomposition, combiningclass) = unicodedatabase[item] + if decomposition[0] == '': + continue + print name, "is empty" + elif match('<\w+>', decomposition[0]): + continue + print name, "has weird", decomposition[0] + else: + sequence = map(stringtohex, decomposition) + chrsequence = map(unichr, sequence) + normalized = normalize('NFC', "".join(chrsequence)) + + """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """ + decomposedsequence = [] + for subseq in map(redecompose, sequence): + for seqitem in subseq: + if isinstance(seqitem, list): + for i in seqitem: + if isinstance(i, list): + for j in i: + decomposedsequence.append(j) + else: + decomposedsequence.append(i) + else: + decomposedsequence.append(seqitem) + recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence))) + if len(recomposedchar) == 1 and len(decomposedsequence) > 1: + counter_entries += 1 + counter_combinations += factorial(len(decomposedsequence)-1) + ch = item + if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff: + counter_entries_greek += 1 + counter_combinations_greek += factorial(len(decomposedsequence)-1) + if verbose: + print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) }, + print "[", + for elem in decomposedsequence: + print '<0x%(hex)04X>,' % { 'hex': elem }, + print "], recomposed as", recomposedchar, + if unichr(item) == recomposedchar: + print "verified" + + if verbose == False: + print "Unicode statistics from UnicodeData.txt" + print "Number of entries that can be algorithmically produced :", counter_entries + print " of which are for Greek :", counter_entries_greek + print "Number of compose sequence combinations requiring :", counter_combinations + print " of which are for Greek :", counter_combinations_greek + print "Note: We do not include partial compositions, " + print "thus the slight discrepancy in the figures" + print + +if opt_unicodedatatxt: + process_unicodedata_file(True) + +if opt_statistics: + print + print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic) + print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic) + print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences) + print " of which have Multi_key :", counter_multikey + print + print "Algorithmic (stats for Xorg Compose file)" + print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic) + print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued) + print " of which are for Greek :", num_algorithmic_greek + print + process_unicodedata_file() + print "Not algorithmic (stats from Xorg Compose file)" + print "Number of sequences :", len(xorg_compose_sequences) + print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 
bytes per row)" + print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library" + print "Number of items in flat array :", len(xorg_compose_sequences) * 6 + print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent" + print "Number of different first items :", num_first_keysyms + print "Number of max bytes (if using flat array) :", num_entries * 2 * 6 + print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5 + print + print "Memory needs if both algorithmic+optimised table in latest Xorg compose file" + print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5 + print + print "Existing (old) implementation in GTK+" + print "Number of sequences in old gtkimcontextsimple.c :", 691 + print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes" |
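
The header comments in the generated file describe the gtk_compose_seqs_compact table as "optimised for space" and as requiring "special handling to access the content". For orientation, here is a minimal Python sketch of how a lookup against the flat array emitted by `--gtk` could work, inferred from the offset arithmetic in the `opt_gtk` block of the script above. The `lookup` helper and the `WIDTH`/`ROW` names are illustrative assumptions, not part of the committed code; the real consumer is the C implementation in gtkimcontextsimple.c.

```python
# Sketch only: looking up a compose sequence in the flat array produced
# by "compose-parse.py --gtk".  Layout as emitted by the script:
#   - an index of num_first_keysyms rows, each WIDTHOFCOMPOSETABLE+1 = 6
#     integers wide: [first_keysym, off_1, off_2, off_3, off_4, off_5],
#     where off_n is the element offset of the block of sequences that
#     begin with first_keysym and have n further keysyms;
#   - after the index, every stored sequence is its remaining keysyms
#     followed by the resulting Unicode code point.

WIDTH = 5            # WIDTHOFCOMPOSETABLE in the script
ROW = WIDTH + 1      # integers per index row


def lookup(table, num_first_keysyms, keysyms):
    """Return the composed code point for keysyms, or None if absent."""
    first, rest = keysyms[0], list(keysyms[1:])
    n = len(rest)                      # 1..WIDTH further keysyms
    if not 1 <= n <= WIDTH:
        return None
    for row in range(num_first_keysyms):
        base = row * ROW
        if table[base] != first:
            continue
        start = table[base + n]        # where this length's block begins
        # The block ends where the next block begins: the next offset in
        # this row, the first block of the next row, or the end of the
        # array for the very last block.
        if n < WIDTH:
            end = table[base + n + 1]
        elif row + 1 < num_first_keysyms:
            end = table[(row + 1) * ROW + 1]
        else:
            end = len(table)
        step = n + 1                   # n keysyms plus one code point
        for i in range(start, end, step):
            if list(table[i:i + n]) == rest:
                return table[i + n]
        return None
    return None
```

Within each block the sequences are stored in the sorted order produced by sequence_cmp, so the linear scan in the sketch could be replaced by a binary search; the sketch keeps it linear for clarity.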