diff options
Diffstat (limited to 'scripts/genpytable.py')
-rw-r--r-- | scripts/genpytable.py | 439 |
1 files changed, 0 insertions, 439 deletions
diff --git a/scripts/genpytable.py b/scripts/genpytable.py deleted file mode 100644 index 5dfe2d0..0000000 --- a/scripts/genpytable.py +++ /dev/null @@ -1,439 +0,0 @@ -# vim:set et sts=4: -# -*- coding: utf-8 -*- - -from pydict import * -from bopomofo import * - -def str_cmp(a, b): - if len(a) == len(b): - return cmp(a, b) - else: - return len(a) - len(b) - -pinyin_list = PINYIN_DICT.keys() -pinyin_list.sort() - -shengmu_list = SHENGMU_DICT.keys() -shengmu_list.remove("") -shengmu_list.sort() - -auto_correct = [ - # "correct", "wrong" - ("ng", "gn"), - ("ng", "mg"), - ("iu", "iou"), - ("ui", "uei"), - ("un", "uen"), -# ("ue", "ve"), - ("ve", "ue"), - ("ong", "on"), -] - -auto_correct_ext = [ - # "correct", "wrong", flag - ("ju", "jv", "PINYIN_CORRECT_V_TO_U"), - ("qu", "qv", "PINYIN_CORRECT_V_TO_U"), - ("xu", "xv", "PINYIN_CORRECT_V_TO_U"), - ("yu", "yv", "PINYIN_CORRECT_V_TO_U"), - - ("jue", "jve", "PINYIN_CORRECT_V_TO_U"), - ("que", "qve", "PINYIN_CORRECT_V_TO_U"), - ("xue", "xve", "PINYIN_CORRECT_V_TO_U"), - ("yue", "yve", "PINYIN_CORRECT_V_TO_U"), - - ("juan", "jvan", "PINYIN_CORRECT_V_TO_U"), - ("quan", "qvan", "PINYIN_CORRECT_V_TO_U"), - ("xuan", "xvan", "PINYIN_CORRECT_V_TO_U"), - ("yuan", "yvan", "PINYIN_CORRECT_V_TO_U"), - - ("jun", "jvn", "PINYIN_CORRECT_V_TO_U"), - ("qun", "qvn", "PINYIN_CORRECT_V_TO_U"), - ("xun", "xvn", "PINYIN_CORRECT_V_TO_U"), - ("yun", "yvn", "PINYIN_CORRECT_V_TO_U"), - - ("juang", "jvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), - ("quang", "qvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), - ("xuang", "xvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), - ("yuang", "yvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"), - - ("jun", "jven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), - ("qun", "qven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), - ("xun", "xven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), - ("yun", "yven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"), -] - -fuzzy_shengmu = [ - ("c", "ch"), - ("ch", "c"), - ("z", "zh"), - ("zh", "z"), - ("s", "sh"), - ("sh", "s"), - ("l", "n"), - ("n", "l"), - ("f", "h"), - ("h", "f"), - ("l", "r"), - ("r", "l"), - ("k", "g"), - ("g", "k"), -] - -fuzzy_yunmu = [ - ("an", "ang"), - ("ang", "an"), - ("en", "eng"), - ("eng", "en"), - ("in", "ing"), - ("ing", "in"), - ("ian", "iang"), - ("iang", "ian"), - ("uan", "uang"), - ("uang", "uan"), -] - -def get_sheng_yun(pinyin): - if pinyin == None: - return None, None - if pinyin == "ng": - return "", "ng" - for i in range(2, 0, -1): - s = pinyin[:i] - if s in shengmu_list: - return s, pinyin[i:] - return "", pinyin - -yunmu_list = set([]) -for p in pinyin_list: - s, y = get_sheng_yun(p) - yunmu_list |= set([y]) -yunmu_list = list(yunmu_list) -yunmu_list.sort() - -shengmu_yunmu_list = shengmu_list + yunmu_list -id_dict = {} -for i, y in enumerate(shengmu_yunmu_list): - id_dict[y] = i + 1 - -fuzzy_shengmu_dict = {} -for s1, s2 in fuzzy_shengmu: - if s1 not in fuzzy_shengmu_dict: - fuzzy_shengmu_dict[s1] = [] - fuzzy_shengmu_dict[s1].append(s2) - -fuzzy_yunmu_dict = {} -for y1, y2 in fuzzy_yunmu: - if y1 not in fuzzy_yunmu_dict: - fuzzy_yunmu_dict[y1] = [] - fuzzy_yunmu_dict[y1].append(y2) - -def encode_pinyin(pinyin): - if pinyin == None or pinyin == "": - return 0 - return id_dict[pinyin] - - e = 0 - for c in pinyin: - e = (e << 5) + (ord(c) - ord('a') + 1) - return e - -def get_pinyin(): - for p in pinyin_list: - s, y = get_sheng_yun(p) - yield p, s, y, len(p), [] - - for s in shengmu_list: - yield s, s, "", len(s), ["PINYIN_INCOMPLETE_PINYIN"] - - for c, w in auto_correct: - flag = "PINYIN_CORRECT_%s_TO_%s" % (w.upper(), c.upper()) - for p in pinyin_list: - if p.endswith(c) and p != c: - wp = p.replace(c, w) - s, y = get_sheng_yun(p) - yield wp, s, y, len(wp), [flag] - - for c, w, flag in auto_correct_ext: - s, y = get_sheng_yun(c) - yield w, s, y, len(w), [flag] - - for s1, s2 in fuzzy_shengmu: - flag = "PINYIN_FUZZY_%s_%s" % (s1.upper(), s2.upper()) - for y in yunmu_list: - if s1 + y not in pinyin_list and s2 + y in pinyin_list: - yield s1 + y, s1, y, len(s1) + len(y), [flag] - for c, w in auto_correct: - if s1 + w not in pinyin_list and \ - s1 + c not in pinyin_list and \ - s2 + w not in pinyin_list and \ - s2 + c in pinyin_list: - flag_correct = "PINYIN_CORRECT_%s_TO_%s" % (w.upper(), c.upper()) - yield s1 + w, s1, c, len(s2) + len(w), ["%s | %s" % (flag, flag_correct)] - - # if s2 + y not in pinyin_list and s1 + y in pinyin_list: - # yield s2 + y, s2, y, len (s2) + len(y), [flag] - - for y1, y2 in fuzzy_yunmu: - flag = "PINYIN_FUZZY_%s_%s" % (y1.upper(), y2.upper()) - for s in shengmu_list: - if s + y1 not in pinyin_list and s + y2 in pinyin_list: - yield s + y1, s, y1, len(s) + len(y1), [flag] - # if s + y2 not in pinyin_list and s + y1 in pinyin_list: - # yield s + y2, s, y2, len(s) + len(y2), [flag] - - -def get_pinyin_with_fuzzy(): - for text, s, y, l, flags in get_pinyin(): - fss = fuzzy_shengmu_dict.get(s, ["", ""]) - fys = fuzzy_yunmu_dict.get(y, ["", ""]) - - try: - fs1, fs2 = fss - except: - fs1, fs2 = fss[0], "" - - try: - fy1, fy2 = fys - except: - fy1, fy2 = fys[0], "" - - if fs1 and \ - (fs1 + y not in pinyin_list) and \ - (fy1 and fs1 + fy1 not in pinyin_list) and \ - (fy2 and fs1 + fy2 not in pinyin_list): - fs1 = "" - - if fs2 and \ - (fs2 + y not in pinyin_list) and \ - (fy1 and fs2 + fy1 not in pinyin_list) and \ - (fy2 and fs2 + fy2 not in pinyin_list): - fs2 = "" - - if fy1 and \ - (s + fy1 not in pinyin_list) and \ - (fs1 and fs1 + fy1 not in pinyin_list) and \ - (fs2 and fs2 + fy1 not in pinyin_list): - fy1 = "" - - if fy2 and \ - (s + fy2 not in pinyin_list) and \ - (fs1 and fs1 + fy2 not in pinyin_list) and \ - (fs2 and fs2 + fy2 not in pinyin_list): - fy2 = "" - - bopomofo = pinyin_bopomofo_map.get(text, "") - - if bopomofo == "": - if all([f.startswith("PINYIN_FUZZY_") for f in flags[0].split(" | ")]): - #if it is fuzzy pinyin or normal pinyin - if s in sheng_yun_bopomofo_map and y in sheng_yun_bopomofo_map: - if isinstance(sheng_yun_bopomofo_map[s], str): - bopomofo = sheng_yun_bopomofo_map[s] - else: - if y in sheng_yun_bopomofo_map[s][1][0]: - bopomofo += sheng_yun_bopomofo_map[s][1][1] - else: - bopomofo += sheng_yun_bopomofo_map[s][0] - - if isinstance(sheng_yun_bopomofo_map[y], str): - bopomofo += sheng_yun_bopomofo_map[y] - else: - if s in sheng_yun_bopomofo_map[y][1][0]: - bopomofo += sheng_yun_bopomofo_map[y][1][1] - else: - bopomofo += sheng_yun_bopomofo_map[y][0] - else: - print text - - yield text, bopomofo, s, y, fs1, fy1, fs2, fy2, l, flags - - -def gen_header(): - print '''/* Please do not modify this file. It is generated by script */ -#include "Types.h" - -namespace PY { -''' - -def gen_macros(): - print '#define PINYIN_ID_VOID (-1)' - print '#define PINYIN_ID_ZERO (0)' - for y in shengmu_list: - print '#define PINYIN_ID_%s (%d)' % (y.upper(), encode_pinyin(y)) - - for y in yunmu_list: - print '#define PINYIN_ID_%s (%d)' % (y.upper(), encode_pinyin(y)) - print - print - print - -def gen_option_check(name, fuzzy): - print '''static gboolean -%s (guint option, gint id, gint fid) -{ - switch ((id << 16) | fid) {''' % name - for y1, y2 in fuzzy: - flag = "PINYIN_FUZZY_%s_%s" % (y1.upper(), y2.upper()) - args = tuple(["PINYIN_ID_%s" % y.upper() for y in [y1, y2]]) + (flag, ) - print ''' case (%s << 16) | %s: - return (option & %s);''' % args - - print ' default: return FALSE;' - print ' }' - print '}' - -def union_dups(a): - n = {} - for r in a: - if r[:-1] in n: - n[r[:-1]] += r[-1] - else: - n[r[:-1]] = r[-1] - na = [] - for k, flags in n.items(): - na.append (tuple(list(k) + [" | ".join(flags) if flags else "0"])) - na.sort() - return na - -def gen_table(): - - pinyins = list(get_pinyin_with_fuzzy()) - pinyins = union_dups(pinyins) - - print 'static const Pinyin pinyin_table[] = {' - for i, (text, bopomofo, s, y, fs1, fy1, fs2, fy2, l, flags) in enumerate(pinyins): - s_id = "PINYIN_ID_%s" % s.upper() if s else "PINYIN_ID_ZERO" - y_id = "PINYIN_ID_%s" % y.upper() if y else "PINYIN_ID_ZERO" - fs1_id = "PINYIN_ID_%s" % fs1.upper() if fs1 else "PINYIN_ID_ZERO" - fy1_id = "PINYIN_ID_%s" % fy1.upper() if fy1 else "PINYIN_ID_ZERO" - fs2_id = "PINYIN_ID_%s" % fs2.upper() if fs2 else "PINYIN_ID_ZERO" - fy2_id = "PINYIN_ID_%s" % fy2.upper() if fy2 else "PINYIN_ID_ZERO" - - # args = (i, ) + tuple(['"%s"' % s for s in p[:3]]) + tuple(["PINYIN_ID_%s" % s.upper() if s else "PINYIN_ID_ZERO" for s in p[3:9]]) + p[9:-1] + (str(p[-1]), ) - print ''' { /* %d */ - text : "%s", - bopomofo : L"%s", - sheng : "%s", - yun : "%s", - pinyin_id : {{ %s, %s }, { %s, %s }, { %s, %s }}, - len : %d, - flags : %s - },''' % (i, text, bopomofo, s, y.replace("v", "ΓΌ"), s_id, y_id, fs1_id, fy1_id, fs2_id, fy2_id, l, flags) - - print '};' - print - - return pinyins - -def gen_bopomofo_table(pinyins): - bopomofo_table = [ (i, p) for i, p in enumerate(pinyins)] - bopomofo_table.sort(lambda a, b: cmp(a[1][1], b[1][1])) - print 'static const Pinyin *bopomofo_table[] = {' - for i, p in bopomofo_table: - if p[1]: - print ' %-20s %s' % ('&pinyin_table[%d],' % i, '// "%s" => "%s"' % (p[1], p[0])) - print '};' - print - -def get_all_special(pinyins): - for p in pinyins: - if p[-1] in ["n", "g", "r"]: - for yun in yunmu_list: - if yun not in pinyin_list: - continue - new_pinyin = p[-1] + yun - # if new_pinyin in pinyin_list: - yield p, yun, p[:-1], new_pinyin - elif p[-1] in ["e"]: - yield p, "r", p[:-1], "er" - -def get_max_freq_2(db, p1, p2): - s1, y1 = get_sheng_yun(p1) - s2, y2 = get_sheng_yun(p2) - - sql = "select max(freq), phrase from py_phrase_1 where s0 = %d and y0 = %d and s1 = %d and y1 = %d" - - c = db.execute(sql % (encode_pinyin(s1), encode_pinyin(y1), encode_pinyin(s2), encode_pinyin(y2))) - for r in c: - return r[0] - return 0 - -def get_max_freq_1(db, p1): - s1, y1 = get_sheng_yun(p1) - - sql = "select max(freq), phrase from py_phrase_0 where s0 = %d and y0 = %d" - - c = db.execute(sql % (encode_pinyin(s1), encode_pinyin(y1))) - for r in c: - return r[0] if r[0] else 0 - return 0 - -def compaired_special(pinyins): - import sqlite3 - db = sqlite3.connect("open-phrase.db") - # db = sqlite3.connect("main.db") - - for p1, p2, p3, p4 in get_all_special(pinyins): - if p3 not in pinyin_list or p4 not in pinyin_list: - continue - if p1 not in pinyin_list or p2 not in pinyin_list: - yield p1, p2, p3, p4 - continue - - if p3 not in pinyin_list or p4 not in pinyin_list: - continue - - a1 = get_max_freq_2(db, p1, p2) - a2 = get_max_freq_2(db, p3, p4) - if a1 == a2: - a1 = get_max_freq_1(db, p1) + get_max_freq_1(db, p2) - a2 = get_max_freq_1(db, p3) + get_max_freq_1(db, p4) - if a1 < a2: - yield p1, p2, p3, p4 - -def gen_full_pinyin_table(pinyins): - _dict = {} - for i in xrange(0, len(pinyins)): - _dict[pinyins[i]] = i - full_pinyin = [] - for i in xrange(0, len(pinyins)): - if pinyins[i][0] in pinyin_list: - full_pinyin.append (pinyins[i]) - full_pinyin.sort(lambda a, b: (cmp(a[1], b[1]) << 16) + cmp(a[2],b[4])) - print 'static const Pinyin *full_pinyin_table[] = {' - for p in full_pinyin: - print " &pinyin_table[%d], // %s" % (_dict[p], p[0]) - print '};' - print - - -def gen_special_table(pinyins): - _dict = {} - for i in xrange(0, len(pinyins)): - _dict[pinyins[i][0]] = i - - l = list(compaired_special(_dict.keys())) - l.sort() - print 'static const Pinyin *special_table[][4] = {' - for r in l: - ids = [("&pinyin_table[%d]," % _dict[py]).ljust(20) for py in r] - - print ' { %s %s %s %s },' % tuple(ids), "/* %s %s => %s %s */" % r - print '};' - print - - -def main(): - # gen_header() - # gen_macros() - pinyins = gen_table() - # gen_full_pinyin_table (pinyins) - gen_bopomofo_table(pinyins) - gen_special_table(pinyins) - # gen_option_check("pinyin_option_check_sheng", fuzzy_shengmu) - # gen_option_check("pinyin_option_check_yun", fuzzy_yunmu) - - -if __name__ == "__main__": - main() - |