summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Peng Wu <alexepico@gmail.com> 2015-09-07 14:34:06 +0800
committer: Peng Wu <alexepico@gmail.com> 2015-09-07 14:34:06 +0800
commit: 58ceaafb09440502c0a30895cf2ca4447cc2323d (patch)
tree: 1703077e8f2e4480916f53ad544572b2964c7010
parent: 1a7731b1c634be215175f4a8418f60c059f9323c (diff)
download: libpinyin-58ceaafb09440502c0a30895cf2ca4447cc2323d.tar.gz
merge specialtable.py into fullpinyintable.py
-rw-r--r-- scripts2/fullpinyintable.py 124
1 file changed, 112 insertions, 12 deletions
diff --git a/scripts2/fullpinyintable.py b/scripts2/fullpinyintable.py
index c1c812b..6ad05be 100644
--- a/scripts2/fullpinyintable.py
+++ b/scripts2/fullpinyintable.py
@@ -19,26 +19,18 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+import os
import operator
import itertools
import chewing
from pyzymap import ZHUYIN_PINYIN_MAP, ZHUYIN_LUOMA_PINYIN_MAP, ZHUYIN_SECONDARY_ZHUYIN_MAP
from pyzymap import PINYIN_ZHUYIN_MAP, ZHUYIN_SPECIAL_INITIAL_SET_IN_PINYIN_FORM
-from fullpinyin import PINYIN_LIST, SHENGMU_LIST
+from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST
from options import *
from utils import shuffle_all
-content_table = []
-pinyin_index = []
-luoma_pinyin_index = []
-zhuyin_index = []
-shuffle_zhuyin_index = []
-secondary_zhuyin_index = []
-hsu_zhuyin_index = []
-eten26_zhuyin_index = []
-
-
+#pinyins
pinyin_list = sorted(PINYIN_ZHUYIN_MAP.keys())
shengmu_list = sorted(SHENGMU_LIST)
@@ -171,6 +163,16 @@ def gen_u_to_v():
#pinyin table
+content_table = []
+pinyin_index = []
+luoma_pinyin_index = []
+zhuyin_index = []
+shuffle_zhuyin_index = []
+secondary_zhuyin_index = []
+hsu_zhuyin_index = []
+eten26_zhuyin_index = []
+
+
def filter_pinyin_list():
for (pinyin, bopomofo, flags, chewing) in gen_pinyin_list():
(luoma, second) = (None, None)
@@ -401,13 +403,105 @@ def gen_table_index_for_chewing_key(content_table):
return ",\n".join(entries)
-#init code
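+#special table -- NOTE(review): the assignments below rebind the module-level pinyin_list and shengmu_list defined earlier under "#pinyins" (pinyin_list was sorted(PINYIN_ZHUYIN_MAP.keys())); confirm that code run after this point is meant to see the PINYIN_LIST-based values.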
+pinyin_list = sorted(PINYIN_LIST)
+shengmu_list = sorted(SHENGMU_LIST)
+yunmu_list = sorted(YUNMU_LIST)
+
# Maps a pinyin string, or a (first, second) tuple for apostrophe-separated
# pairs, to its integer frequency.  Filled in by load_phrase().
phrase_dict = {}


def load_phrase(filename):
    """Load pinyin frequencies from *filename* into ``phrase_dict``.

    Every non-blank line has the form ``<pinyin> <freq>``, separated by
    whitespace.  Entries whose frequency is 0 are skipped.  A pinyin that
    contains an apostrophe (``first'second``) is stored under the tuple key
    ``(first, second)``; any other pinyin is stored under the string itself.

    Args:
        filename: path of the frequency file to read.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a line has no frequency column, the frequency is not
            an integer, or the pinyin contains more than one apostrophe.
    """
    # "with" guarantees the handle is closed even when a malformed line
    # raises part-way through the file.
    with open(filename, "r") as phrasefile:
        # Iterate lazily instead of materialising every line up front.
        for line in phrasefile:
            # Tolerate blank lines (e.g. a trailing one) rather than failing
            # on the tuple unpacking below.
            if not line.strip():
                continue
            (pinyin_str, freq) = line.split(None, 1)
            # int() ignores surrounding whitespace, including the newline,
            # so no explicit stripping of the line ending is required.
            freq = int(freq)
            if freq == 0:
                continue

            # Keys are expected to be unique in the input; a repeated key
            # simply overwrites the earlier frequency.
            if "'" in pinyin_str:
                (first_key, second_key) = pinyin_str.split("'")
                phrase_dict[(first_key, second_key)] = freq
            else:
                phrase_dict[pinyin_str] = freq
+
+
def gen_all_divided(pinyins=None):
    """Yield every pinyin that can be divided into two shorter pinyins.

    Args:
        pinyins: iterable of pinyin strings to examine.  Defaults to the
            module-level ``pinyin_list``.

    Yields:
        ``(pinyin_key, first_key, second_key)`` tuples in which
        ``first_key + second_key == pinyin_key`` and all three strings are
        members of *pinyins*.  Results follow the iteration order of
        *pinyins* (outer loop: ``pinyin_key``; inner loop: ``first_key``).
    """
    if pinyins is None:
        pinyins = pinyin_list
    # Materialise once so one-shot iterables can be walked twice.
    pinyins = list(pinyins)
    # Set lookup keeps the membership test O(1) inside the double loop.
    known = set(pinyins)

    for pinyin_key in pinyins:
        for first_key in pinyins:
            # first_key must be a proper prefix of pinyin_key.
            if len(pinyin_key) <= len(first_key):
                continue
            if not pinyin_key.startswith(first_key):
                continue
            second_key = pinyin_key[len(first_key):]
            if second_key in known:
                yield pinyin_key, first_key, second_key
+
+
def filter_divided():
    """Yield divided pinyins whose two-key form has a recorded frequency.

    Walks the candidates produced by ``gen_all_divided()`` and keeps those
    whose ``(first, second)`` pair is a key of ``phrase_dict``.

    Yields:
        ``(pinyin_key, orig_freq, first_key, second_key, new_freq)`` where
        ``orig_freq`` is the frequency stored for the undivided pinyin (0 if
        it has none) and ``new_freq`` is the frequency stored for the pair.
    """
    for (whole, head, tail) in gen_all_divided():
        pair = (head, tail)
        if pair in phrase_dict:
            whole_freq = phrase_dict.get(whole, 0)
            pair_freq = phrase_dict[pair]
            yield whole, whole_freq, head, tail, pair_freq
+
+
def gen_all_resplit(pinyins=None, yunmus=None):
    """Yield pinyin pairs whose boundary can be moved one letter to the left.

    For a pinyin ending in "n", "g" or "r" followed by a final that is itself
    a pinyin, the trailing consonant may instead start the second syllable,
    provided both resulting strings are pinyins.

    Args:
        pinyins: iterable of pinyin strings.  Defaults to the module-level
            ``pinyin_list``.
        yunmus: iterable of finals.  Defaults to the module-level
            ``yunmu_list``.

    Yields:
        ``(pinyin_key, yun, pinyin_key[:-1], pinyin_key[-1] + yun)`` tuples,
        ordered by *pinyins* first and *yunmus* second.
    """
    if pinyins is None:
        pinyins = pinyin_list
    if yunmus is None:
        yunmus = yunmu_list
    # Materialise once so one-shot iterables can be reused.
    pinyins = list(pinyins)
    # Set lookup replaces repeated linear scans of the list.
    known = set(pinyins)
    # Only finals that are complete pinyins on their own qualify; this does
    # not depend on pinyin_key, so compute it once.
    standalone_yunmus = [yun for yun in yunmus if yun in known]

    for pinyin_key in pinyins:
        # endswith() is also safe for an empty key, unlike indexing [-1].
        if not pinyin_key.endswith(("n", "g", "r")):
            continue
        #check first new pinyin key (independent of yun, so tested once)
        first_new_key = pinyin_key[:-1]
        if first_new_key not in known:
            continue
        moved_letter = pinyin_key[-1]
        for yun in standalone_yunmus:
            #check second new pinyin key
            new_pinyin_key = moved_letter + yun
            if new_pinyin_key in known:
                yield pinyin_key, yun, first_new_key, new_pinyin_key


# Disabled branch for keys ending in "e", kept for reference only:
#    elif pinyin_key[-1] in ["e"]:
#        #check first new pinyin key
#        if pinyin_key[:-1] in pinyin_list:
#            yield pinyin_key, "r", pinyin_key[:-1], "er"
+
+
def filter_resplit():
    """Yield re-split pinyin pairs whose target form has a recorded frequency.

    Each candidate from ``gen_all_resplit()`` is read in the reverse
    direction, because the libpinyin pinyin parser differs from
    ibus-pinyin's: the pair with the consonant moved to the second syllable
    is the "original", and the pair that keeps the consonant on the first
    syllable is the "new" form.  Only candidates whose new form is a key of
    ``phrase_dict`` are kept.

    Yields:
        ``(orig_first_key, orig_second_key, orig_freq,
        new_first_key, new_second_key, new_freq)`` where ``orig_freq`` is 0
        when the original pair has no entry in ``phrase_dict``.
    """
    for (merged_key, yun, head_key, tail_key) in gen_all_resplit():
        # Reversed roles: what the generator lists first is the "new" pair.
        target = (merged_key, yun)
        if target not in phrase_dict:
            continue
        source = (head_key, tail_key)
        source_freq = phrase_dict.get(source, 0)
        target_freq = phrase_dict[target]
        yield head_key, tail_key, source_freq, \
            merged_key, yun, target_freq
+
+
+#init full pinyin table code
filter_pinyin_list()
check_rules(hsu_correct, hsu_correct_special)
check_rules(eten26_correct, eten26_correct_special)
populate_more_zhuyin_index()
sort_all()
+#init resplit/divided table code
+load_phrase("pinyins.txt")
+#load_phrase("specials.txt")
+
### main function ###
if __name__ == "__main__":
@@ -423,3 +517,9 @@ if __name__ == "__main__":
s = gen_hsu_zhuyin_index() + gen_eten26_zhuyin_index()
s = gen_table_index_for_chewing_key(content_table)
print(s)
+
+ #dump
+ for p in filter_divided():
+ print (p)
+ for p in filter_resplit():
+ print (p)