merge specialtable.py into fullpinyintable.py

author: Peng Wu <alexepico@gmail.com> 2015-09-07 14:34:06 +0800
committer: Peng Wu <alexepico@gmail.com> 2015-09-07 14:34:06 +0800
commit: 58ceaafb09440502c0a30895cf2ca4447cc2323d (patch)
tree: 1703077e8f2e4480916f53ad544572b2964c7010
parent: 1a7731b1c634be215175f4a8418f60c059f9323c (diff)
download: libpinyin-58ceaafb09440502c0a30895cf2ca4447cc2323d.tar.gz
1 file changed, 112 insertions, 12 deletions
diff --git a/scripts2/fullpinyintable.py b/scripts2/fullpinyintable.py
index c1c812b..6ad05be 100644
--- a/scripts2/fullpinyintable.py
+++ b/scripts2/fullpinyintable.py
@@ -19,26 +19,18 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 
+import os
 import operator
 import itertools
 import chewing
 from pyzymap import ZHUYIN_PINYIN_MAP, ZHUYIN_LUOMA_PINYIN_MAP, ZHUYIN_SECONDARY_ZHUYIN_MAP
 from pyzymap import PINYIN_ZHUYIN_MAP, ZHUYIN_SPECIAL_INITIAL_SET_IN_PINYIN_FORM
-from fullpinyin import PINYIN_LIST, SHENGMU_LIST
+from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST
 from options import *
 from utils import shuffle_all
 
 
-content_table = []
-pinyin_index = []
-luoma_pinyin_index = []
-zhuyin_index = []
-shuffle_zhuyin_index = []
-secondary_zhuyin_index = []
-hsu_zhuyin_index = []
-eten26_zhuyin_index = []
-
-
+#pinyins
 pinyin_list = sorted(PINYIN_ZHUYIN_MAP.keys())
 shengmu_list = sorted(SHENGMU_LIST)
 
@@ -171,6 +163,16 @@ def gen_u_to_v():
 
 
 #pinyin table
+content_table = []
+pinyin_index = []
+luoma_pinyin_index = []
+zhuyin_index = []
+shuffle_zhuyin_index = []
+secondary_zhuyin_index = []
+hsu_zhuyin_index = []
+eten26_zhuyin_index = []
+
+
 def filter_pinyin_list():
     for (pinyin, bopomofo, flags, chewing) in gen_pinyin_list():
         (luoma, second) = (None, None)
@@ -401,13 +403,105 @@ def gen_table_index_for_chewing_key(content_table):
     return ",\n".join(entries)
 
 
-#init code
+#special table
+pinyin_list = sorted(PINYIN_LIST)
+shengmu_list = sorted(SHENGMU_LIST)
+yunmu_list = sorted(YUNMU_LIST)
+
+phrase_dict = {}
+
+
+def load_phrase(filename):
+    phrasefile = open(filename, "r")
+    for line in phrasefile.readlines():
+        line = line.rstrip(os.linesep)
+        (pinyin_str, freq) = line.split(None, 1)
+        freq = int(freq)
+        if 0 == freq:
+            #print(pinyin_str)
+            continue
+
+        # no duplicate here
+        if "'" in pinyin_str:
+            (first_key, second_key) = pinyin_str.split("'")
+            phrase_dict[(first_key, second_key)] = freq
+        else:
+            phrase_dict[pinyin_str] = freq
+    phrasefile.close()
+
+
+def gen_all_divided():
+    for pinyin_key in pinyin_list:
+        for first_key in pinyin_list:
+            if len(pinyin_key) <= len(first_key):
+                continue
+            if not pinyin_key.startswith(first_key):
+                continue
+            second_key = pinyin_key[len(first_key):]
+            if second_key in pinyin_list:
+                yield pinyin_key, first_key, second_key
+
+
+def filter_divided():
+    for (pinyin_key, first_key, second_key) in gen_all_divided():
+        if not (first_key, second_key) in phrase_dict:
+            continue
+        orig_freq = 0
+        if pinyin_key in phrase_dict:
+            orig_freq = phrase_dict[pinyin_key]
+        new_freq = phrase_dict[(first_key, second_key)]
+        yield pinyin_key, orig_freq, first_key, second_key, new_freq
+
+
+def gen_all_resplit():
+    for pinyin_key in pinyin_list:
+        if pinyin_key[-1] in ["n", "g", "r"]:
+            for yun in yunmu_list:
+                if yun not in pinyin_list:
+                    continue
+                #check first new pinyin key
+                if not pinyin_key[:-1] in pinyin_list:
+                    continue
+                #check second new pinyin key
+                new_pinyin_key = pinyin_key[-1] + yun
+                if new_pinyin_key in pinyin_list:
+                    yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key
+'''
+        elif pinyin_key[-1] in ["e"]:
+            #check first new pinyin key
+            if pinyin_key[:-1] in pinyin_list:
+                yield pinyin_key, "r", pinyin_key[:-1], "er"
+'''
+
+
+def filter_resplit():
+    for (orig_first_key, orig_second_key, new_first_key, new_second_key) \
+    in gen_all_resplit():
+        #do the reverse here, as libpinyin's pinyin parser differs from
+        #ibus-pinyin's parser.
+        (orig_first_key, orig_second_key, new_first_key, new_second_key) = \
+            (new_first_key, new_second_key, orig_first_key, orig_second_key)
+        if (new_first_key, new_second_key) not in phrase_dict:
+            continue
+        orig_freq = 0
+        new_freq = phrase_dict[(new_first_key, new_second_key)]
+        if (orig_first_key, orig_second_key) in phrase_dict:
+            orig_freq = phrase_dict[(orig_first_key, orig_second_key)]
+        yield orig_first_key, orig_second_key, orig_freq, \
+        new_first_key, new_second_key, new_freq
+
+
+#init full pinyin table code
 filter_pinyin_list()
 check_rules(hsu_correct, hsu_correct_special)
 check_rules(eten26_correct, eten26_correct_special)
 populate_more_zhuyin_index()
 sort_all()
 
+#init resplit/divided table code
+load_phrase("pinyins.txt")
+#load_phrase("specials.txt")
+
 
 ### main function ###
 if __name__ == "__main__":
@@ -423,3 +517,9 @@ if __name__ == "__main__":
     s = gen_hsu_zhuyin_index() + gen_eten26_zhuyin_index()
     s = gen_table_index_for_chewing_key(content_table)
     print(s)
+
+    #dump
+    for p in filter_divided():
+        print (p)
+    for p in filter_resplit():
+        print (p)
author	Peng Wu <alexepico@gmail.com>	2015-09-07 14:34:06 +0800
committer	Peng Wu <alexepico@gmail.com>	2015-09-07 14:34:06 +0800
commit	58ceaafb09440502c0a30895cf2ca4447cc2323d (patch)
tree	1703077e8f2e4480916f53ad544572b2964c7010
parent	1a7731b1c634be215175f4a8418f60c059f9323c (diff)
download	libpinyin-58ceaafb09440502c0a30895cf2ca4447cc2323d.tar.gz