diff options
author | Peng Wu <alexepico@gmail.com> | 2015-09-07 14:34:06 +0800 |
---|---|---|
committer | Peng Wu <alexepico@gmail.com> | 2015-09-07 14:34:06 +0800 |
commit | 58ceaafb09440502c0a30895cf2ca4447cc2323d (patch) | |
tree | 1703077e8f2e4480916f53ad544572b2964c7010 | |
parent | 1a7731b1c634be215175f4a8418f60c059f9323c (diff) | |
download | libpinyin-58ceaafb09440502c0a30895cf2ca4447cc2323d.tar.gz |
merge specialtable.py into fullpinyintable.py
-rw-r--r-- | scripts2/fullpinyintable.py | 124 |
1 files changed, 112 insertions, 12 deletions
diff --git a/scripts2/fullpinyintable.py b/scripts2/fullpinyintable.py index c1c812b..6ad05be 100644 --- a/scripts2/fullpinyintable.py +++ b/scripts2/fullpinyintable.py @@ -19,26 +19,18 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +import os import operator import itertools import chewing from pyzymap import ZHUYIN_PINYIN_MAP, ZHUYIN_LUOMA_PINYIN_MAP, ZHUYIN_SECONDARY_ZHUYIN_MAP from pyzymap import PINYIN_ZHUYIN_MAP, ZHUYIN_SPECIAL_INITIAL_SET_IN_PINYIN_FORM -from fullpinyin import PINYIN_LIST, SHENGMU_LIST +from fullpinyin import PINYIN_LIST, SHENGMU_LIST, YUNMU_LIST from options import * from utils import shuffle_all -content_table = [] -pinyin_index = [] -luoma_pinyin_index = [] -zhuyin_index = [] -shuffle_zhuyin_index = [] -secondary_zhuyin_index = [] -hsu_zhuyin_index = [] -eten26_zhuyin_index = [] - - +#pinyins pinyin_list = sorted(PINYIN_ZHUYIN_MAP.keys()) shengmu_list = sorted(SHENGMU_LIST) @@ -171,6 +163,16 @@ def gen_u_to_v(): #pinyin table +content_table = [] +pinyin_index = [] +luoma_pinyin_index = [] +zhuyin_index = [] +shuffle_zhuyin_index = [] +secondary_zhuyin_index = [] +hsu_zhuyin_index = [] +eten26_zhuyin_index = [] + + def filter_pinyin_list(): for (pinyin, bopomofo, flags, chewing) in gen_pinyin_list(): (luoma, second) = (None, None) @@ -401,13 +403,105 @@ def gen_table_index_for_chewing_key(content_table): return ",\n".join(entries) -#init code +#special table +pinyin_list = sorted(PINYIN_LIST) +shengmu_list = sorted(SHENGMU_LIST) +yunmu_list = sorted(YUNMU_LIST) + +phrase_dict = {} + + +def load_phrase(filename): + phrasefile = open(filename, "r") + for line in phrasefile.readlines(): + line = line.rstrip(os.linesep) + (pinyin_str, freq) = line.split(None, 1) + freq = int(freq) + if 0 == freq: + #print(pinyin_str) + continue + + # no duplicate here + if "'" in pinyin_str: + (first_key, second_key) = pinyin_str.split("'") + phrase_dict[(first_key, second_key)] = freq + else: + phrase_dict[pinyin_str] = freq + phrasefile.close() + + +def gen_all_divided(): + for pinyin_key in pinyin_list: + for first_key in pinyin_list: + if len(pinyin_key) <= len(first_key): + continue + if not pinyin_key.startswith(first_key): + continue + second_key = pinyin_key[len(first_key):] + if second_key in pinyin_list: + yield pinyin_key, first_key, second_key + + +def filter_divided(): + for (pinyin_key, first_key, second_key) in gen_all_divided(): + if not (first_key, second_key) in phrase_dict: + continue + orig_freq = 0 + if pinyin_key in phrase_dict: + orig_freq = phrase_dict[pinyin_key] + new_freq = phrase_dict[(first_key, second_key)] + yield pinyin_key, orig_freq, first_key, second_key, new_freq + + +def gen_all_resplit(): + for pinyin_key in pinyin_list: + if pinyin_key[-1] in ["n", "g", "r"]: + for yun in yunmu_list: + if yun not in pinyin_list: + continue + #check first new pinyin key + if not pinyin_key[:-1] in pinyin_list: + continue + #check second new pinyin key + new_pinyin_key = pinyin_key[-1] + yun + if new_pinyin_key in pinyin_list: + yield pinyin_key, yun, pinyin_key[:-1], new_pinyin_key +''' + elif pinyin_key[-1] in ["e"]: + #check first new pinyin key + if pinyin_key[:-1] in pinyin_list: + yield pinyin_key, "r", pinyin_key[:-1], "er" +''' + + +def filter_resplit(): + for (orig_first_key, orig_second_key, new_first_key, new_second_key) \ + in gen_all_resplit(): + #do the reverse here, as libpinyin pinyin parser is different with + #ibus-pinyin's parser. + (orig_first_key, orig_second_key, new_first_key, new_second_key) = \ + (new_first_key, new_second_key, orig_first_key, orig_second_key) + if (new_first_key, new_second_key) not in phrase_dict: + continue + orig_freq = 0 + new_freq = phrase_dict[(new_first_key, new_second_key)] + if (orig_first_key, orig_second_key) in phrase_dict: + orig_freq = phrase_dict[(orig_first_key, orig_second_key)] + yield orig_first_key, orig_second_key, orig_freq, \ + new_first_key, new_second_key, new_freq + + +#init full pinyin table code filter_pinyin_list() check_rules(hsu_correct, hsu_correct_special) check_rules(eten26_correct, eten26_correct_special) populate_more_zhuyin_index() sort_all() +#init resplit/divided table code +load_phrase("pinyins.txt") +#load_phrase("specials.txt") + ### main function ### if __name__ == "__main__": @@ -423,3 +517,9 @@ if __name__ == "__main__": s = gen_hsu_zhuyin_index() + gen_eten26_zhuyin_index() s = gen_table_index_for_chewing_key(content_table) print(s) + + #dump + for p in filter_divided(): + print (p) + for p in filter_resplit(): + print (p) |