Imported anthy zipcode.t into ibus-anthy.

author: fujiwarat <takao.fujiwara1@gmail.com> 2013-08-27 23:17:47 +0900
committer: fujiwarat <takao.fujiwara1@gmail.com> 2013-08-29 12:57:40 +0900
commit: 17d8871211a531c755cd4397a75f4892222412ff (patch)
tree: 4a845f7c2b1084f0a0e8d3db90e41c5258c98135 /data
parent: e1a94ef87577a38b3402a0ebaa1f0e5088f99c2a (diff)
download: ibus-anthy-17d8871211a531c755cd4397a75f4892222412ff.tar.gz
2 files changed, 64 insertions, 0 deletions
diff --git a/data/Makefile.am b/data/Makefile.am
index 3aa240a..f187c93 100644
--- a/data/Makefile.am
+++ b/data/Makefile.am
@@ -27,8 +27,18 @@ dicts_DATA = \
 
 dictsdir = $(pkgdatadir)/dicts
 
+if ENABLE_ZIPCODE
+dicts_DATA += zipcode.t
+
+zipcode.t: $(ANTHY_ZIPCODE_FILE)
+	$(PYTHON) zipcode-textdic.py $<
+
+CLEANFILES = zipcode.t
+endif
+
 EXTRA_DIST = \
         era.t \
         oldchar.t \
         symbol.t \
+        zipcode-textdic.py \
         $(NULL)
diff --git a/data/zipcode-textdic.py b/data/zipcode-textdic.py
new file mode 100755
index 0000000..2864f85
--- /dev/null
+++ b/data/zipcode-textdic.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+
+# We finally decided to import anthy's zipcode.t into ibus-anthy as UTF-8
+# because if the engine greps for digits without a hyphen, it could cause
+# a timeout issue. If digits without a hyphen are sent to anthy,
+# digits with a hyphen also need to be sent to anthy, so the lookup could
+# include too many unnecessary candidates.
+# We also wish to install it under the filename 'zipcode.t' to simplify the engine.
+
+import sys
+
+if len(sys.argv) < 2:
+    print >> sys.stderr, 'usage: %s /usr/share/anthy/zipcode.t' % sys.argv[0]
+    exit(-1)
+
+anthy_zipfile = sys.argv[1]
+
+try:
+    contents = unicode(open(anthy_zipfile).read(), 'euc_jp').encode('utf-8')
+except UnicodeDecodeError, e:
+    print >> sys.stderr, 'Your file is not eucJP? %s' % anthy_zipfile
+    contents = open(anthy_zipfile).read()
+
+output_zipfile = open('zipcode.t', 'w')
+output_zipfile.write('# copied %s with UTF-8.\n#\n' % anthy_zipfile)
+
+for line in contents.split('\n'):
+    if len(line) == 0 or line[0] == '#':
+        output_zipfile.write('%s\n' % line)
+        continue
+
+    words = line.split()
+    if len(words) < 3:
+        continue
+
+    if len(words[0]) < 1 or ord(unicode(words[0], 'utf-8')[0]) > 0xff:
+        mbcs_addr = words[0]
+    else:
+        uni_addr = ''
+        i = 0
+        for word in words[0]:
+            # Convert ASCII number char to wide number char.
+            uni_addr += unichr(0xfee0 + ord(word))
+            if i == 2:
+                # Insert the wide hyphen (U+30FC) after the third digit
+                uni_addr += unichr(0x30fc)
+            i += 1
+        mbcs_addr = uni_addr.encode('utf-8')
+
+    output_zipfile.write('%s %s %s\n' % \
+            (mbcs_addr, '#T35*500', words[2]))
+
+output_zipfile.flush()
+output_zipfile.close()
author	fujiwarat <takao.fujiwara1@gmail.com>	2013-08-27 23:17:47 +0900
committer	fujiwarat <takao.fujiwara1@gmail.com>	2013-08-29 12:57:40 +0900
commit	17d8871211a531c755cd4397a75f4892222412ff (patch)
tree	4a845f7c2b1084f0a0e8d3db90e41c5258c98135 /data
parent	e1a94ef87577a38b3402a0ebaa1f0e5088f99c2a (diff)
download	ibus-anthy-17d8871211a531c755cd4397a75f4892222412ff.tar.gz