diff options
Diffstat (limited to 'src-splitter/metaword.c')
-rw-r--r-- | src-splitter/metaword.c | 967 |
1 files changed, 967 insertions, 0 deletions
diff --git a/src-splitter/metaword.c b/src-splitter/metaword.c new file mode 100644 index 0000000..0491035 --- /dev/null +++ b/src-splitter/metaword.c @@ -0,0 +1,967 @@ +/* + * 文節もしくは単語を一つ以上セットにしてmetawordとして扱う。 + * ここでは各種のmetawordを生成する + * + * init_metaword_tab() metaword処理のための情報を構成する + * anthy_make_metaword_all() context中のmetawordを構成する + * anthy_print_metaword() 指定されたmetawordを表示する + * + * Funded by IPA未踏ソフトウェア創造事業 2001 10/29 + * Copyright (C) 2000-2006 TABATA Yusuke + * Copyright (C) 2004-2006 YOSHIDA Yuichi + * Copyright (C) 2000-2003 UGAWA Tomoharu + */ +#include <stdlib.h> +#include <stdio.h> +#include <math.h> + +#include <anthy/record.h> +#include <anthy/splitter.h> +#include <anthy/xchar.h> +#include <anthy/xstr.h> +#include <anthy/segment.h> +#include <anthy/segclass.h> +#include "wordborder.h" + +/* 各種meta_wordをどのように処理するか */ +struct metaword_type_tab_ anthy_metaword_type_tab[] = { + {MW_DUMMY,"dummy",MW_STATUS_NONE,MW_CHECK_SINGLE}, + {MW_SINGLE,"single",MW_STATUS_NONE,MW_CHECK_SINGLE}, + {MW_WRAP,"wrap",MW_STATUS_WRAPPED,MW_CHECK_WRAP}, + {MW_COMPOUND_HEAD,"compound_head",MW_STATUS_NONE,MW_CHECK_COMPOUND}, + {MW_COMPOUND,"compound",MW_STATUS_NONE,MW_CHECK_NONE}, + {MW_COMPOUND_LEAF,"compound_leaf",MW_STATUS_COMPOUND,MW_CHECK_NONE}, + {MW_COMPOUND_PART,"compound_part",MW_STATUS_COMPOUND_PART,MW_CHECK_SINGLE}, + {MW_V_RENYOU_A,"v_renyou_a",MW_STATUS_COMBINED,MW_CHECK_BORDER}, + {MW_V_RENYOU_NOUN,"v_renyou_noun",MW_STATUS_COMBINED,MW_CHECK_BORDER}, + {MW_NUMBER,"number",MW_STATUS_COMBINED,MW_CHECK_NUMBER}, + {MW_OCHAIRE,"ochaire",MW_STATUS_OCHAIRE,MW_CHECK_OCHAIRE}, + /**/ + {MW_END,"end",MW_STATUS_NONE,MW_CHECK_NONE} +}; + +static void +combine_metaword(struct splitter_context *sc, struct meta_word *mw); + +/* コンテキスト中にmetawordを追加する */ +void +anthy_commit_meta_word(struct splitter_context *sc, + struct meta_word *mw) +{ + struct word_split_info_cache *info = sc->word_split_info; + /* 同じ開始点を持つノードのリスト */ + mw->next = info->cnode[mw->from].mw; + info->cnode[mw->from].mw = mw; + /**/ + if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_MW) { + anthy_print_metaword(sc, mw); + } +} + +static void +print_metaword_features(int features) +{ + if (features & MW_FEATURE_SV) { + printf(":sv"); + } + if (features & MW_FEATURE_WEAK_CONN) { + printf(":weak"); + } + if (features & MW_FEATURE_SUFFIX) { + printf(":suffix"); + } + if (features & MW_FEATURE_NUM) { + printf(":num"); + } + if (features & MW_FEATURE_CORE1) { + printf(":c1"); + } + if (features & MW_FEATURE_HIGH_FREQ) { + printf(":hf"); + } +} + +static void +anthy_do_print_metaword(struct splitter_context *sc, + struct meta_word *mw, + int indent) +{ + int i; + for (i = 0; i < indent; i++) { + printf(" "); + } + printf("*meta word type=%s(%d-%d):score=%d:seg_class=%s", + anthy_metaword_type_tab[mw->type].name, + mw->from, mw->len, mw->score, + anthy_seg_class_name(mw->seg_class)); + print_metaword_features(mw->mw_features); + printf(":can_use=%d*\n", mw->can_use); + if (mw->wl) { + anthy_print_word_list(sc, mw->wl); + } + if (mw->cand_hint.str) { + printf("("); + anthy_putxstr(&mw->cand_hint); + printf(")\n"); + } + if (mw->mw1) { + anthy_do_print_metaword(sc, mw->mw1, indent + 1); + } + if (mw->mw2) { + anthy_do_print_metaword(sc, mw->mw2, indent + 1); + } +} + +void +anthy_print_metaword(struct splitter_context *sc, + struct meta_word *mw) +{ + anthy_do_print_metaword(sc, mw, 0); +} + +static struct meta_word * +alloc_metaword(struct splitter_context *sc) +{ + struct meta_word *mw; + mw = anthy_smalloc(sc->word_split_info->MwAllocator); + mw->type = MW_SINGLE; + mw->score = 0; + mw->struct_score = 0; + mw->dep_word_hash = 0; + mw->core_wt = anthy_wt_none; + mw->mw_features = 0; + mw->dep_class = DEP_NONE; + mw->wl = NULL; + mw->mw1 = NULL; + mw->mw2 = NULL; + mw->cand_hint.str = NULL; + mw->cand_hint.len = 0; + mw->seg_class = SEG_HEAD; + mw->can_use = ok; + return mw; +} + + +/* + * wlの接頭辞部分と接尾辞部分を文字列として取り出す + */ +static void +get_surrounding_text(struct splitter_context* sc, + struct word_list* wl, + xstr* xs_pre, xstr* xs_post) +{ + int post_len = wl->part[PART_DEPWORD].len + wl->part[PART_POSTFIX].len; + int pre_len = wl->part[PART_PREFIX].len; + + xs_pre->str = sc->ce[wl->from].c; + xs_pre->len = pre_len; + xs_post->str = sc->ce[wl->from + wl->len - post_len].c; + xs_post->len = post_len; +} + +static int +count_vu(xstr *xs) { + int i, r = 0; + for (i = 0; i < xs->len; i++) { + if (xs->str[i] == KK_VU) { + r++; + } + } + return r; +} + +/* + * 複合語であるwlからn番めの部分を取り出してmwにする + */ +static struct meta_word* +make_compound_nth_metaword(struct splitter_context* sc, + compound_ent_t ce, int nth, + struct word_list* wl, + enum metaword_type type) +{ + int i; + int len = 0; + int from = wl->from; + int seg_num = anthy_compound_get_nr_segments(ce); + struct meta_word* mw; + xstr xs_pre, xs_core, xs_post; + + get_surrounding_text(sc, wl, &xs_pre, &xs_post); + + for (i = 0; i <= nth; ++i) { + xstr part; + from += len; + len = anthy_compound_get_nth_segment_len(ce, i); + part.str = sc->ce[from].c; + part.len = len; + len -= count_vu(&part); + if (i == 0) { + len += xs_pre.len; + } + if (i == seg_num - 1) { + len += xs_post.len; + } + } + + mw = alloc_metaword(sc); + mw->from = from; + mw->len = len; + mw->type = type; + mw->score = 1000; + mw->seg_class = wl->seg_class; + + anthy_compound_get_nth_segment_xstr(ce, nth, &xs_core); + if (nth == 0) { + anthy_xstrcat(&mw->cand_hint, &xs_pre); + } + anthy_xstrcat(&mw->cand_hint, &xs_core); + if (nth == seg_num - 1) { + anthy_xstrcat(&mw->cand_hint, &xs_post); + } + return mw; +} + + +/* + * metawordを実際に結合する + */ +static struct meta_word * +anthy_do_cons_metaword(struct splitter_context *sc, + enum metaword_type type, + struct meta_word *mw, struct meta_word *mw2) +{ + struct meta_word *n; + + n = alloc_metaword(sc); + n->from = mw->from; + n->len = mw->len + (mw2 ? mw2->len : 0); + + if (mw2) { + n->score = sqrt(mw->score) * sqrt(mw2->score); + } else { + n->score = mw->score; + } + n->type = type; + n->mw1 = mw; + n->mw2 = mw2; + if (mw2) { + n->seg_class = mw2->seg_class; + n->nr_parts = mw->nr_parts + mw2->nr_parts; + n->dep_word_hash = mw2->dep_word_hash; + } else { + n->seg_class = mw->seg_class; + n->nr_parts = mw->nr_parts; + n->dep_word_hash = mw->dep_word_hash; + } + anthy_commit_meta_word(sc, n); + return n; +} + +/* + * 複合語用のmeta_wordを作成する。 + */ +static void +make_compound_metaword(struct splitter_context* sc, struct word_list* wl) +{ + int i, j; + seq_ent_t se = wl->part[PART_CORE].seq; + int ent_num = anthy_get_nr_dic_ents(se, NULL); + + for (i = 0; i < ent_num; ++i) { + compound_ent_t ce; + int seg_num; + struct meta_word *mw = NULL; + struct meta_word *mw2 = NULL; + if (!anthy_get_nth_dic_ent_is_compound(se, i)) { + continue; + } + ce = anthy_get_nth_compound_ent(se, i); + seg_num = anthy_compound_get_nr_segments(ce); + + for (j = seg_num - 1; j >= 0; --j) { + enum metaword_type type; + mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_LEAF); + anthy_commit_meta_word(sc, mw); + + type = j == 0 ? MW_COMPOUND_HEAD : MW_COMPOUND; + mw2 = anthy_do_cons_metaword(sc, type, mw, mw2); + } + } +} + +/* + * 複合語の中の個々の文節を結合したmeta_wordを作成する。 + */ +static void +make_compound_part_metaword(struct splitter_context* sc, struct word_list* wl) +{ + int i, j, k; + seq_ent_t se = wl->part[PART_CORE].seq; + int ent_num = anthy_get_nr_dic_ents(se, NULL); + + for (i = 0; i < ent_num; ++i) { + compound_ent_t ce; + int seg_num; + struct meta_word *mw = NULL; + struct meta_word *mw2 = NULL; + + if (!anthy_get_nth_dic_ent_is_compound(se, i)) { + continue; + } + + ce = anthy_get_nth_compound_ent(se, i); + seg_num = anthy_compound_get_nr_segments(ce); + + /* 後ろから */ + for (j = seg_num - 1; j >= 0; --j) { + mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_PART); + for (k = j - 1; k >= 0; --k) { + mw2 = make_compound_nth_metaword(sc, ce, k, wl, MW_COMPOUND_PART); + mw2->len += mw->len; + mw2->score += mw->score; + anthy_xstrcat(&mw2->cand_hint, &mw->cand_hint); + + anthy_commit_meta_word(sc, mw2); + mw = mw2; + } + } + } +} + +/* + * 単文節単語 + */ +static void +make_simple_metaword(struct splitter_context *sc, struct word_list* wl) +{ + struct meta_word *mw = alloc_metaword(sc); + mw->wl = wl; + mw->from = wl->from; + mw->len = wl->len; + mw->score = 1000; + mw->type = MW_SINGLE; + mw->dep_class = wl->part[PART_DEPWORD].dc; + mw->seg_class = wl->seg_class; + if (wl->part[PART_CORE].len) { + mw->core_wt = wl->part[PART_CORE].wt; + } + mw->nr_parts = NR_PARTS; + mw->dep_word_hash = wl->dep_word_hash; + mw->mw_features = wl->mw_features; + anthy_commit_meta_word(sc, mw); +} + +/* + * wordlist一個からなる、metawordを作成 + */ +static void +make_metaword_from_word_list(struct splitter_context *sc) +{ + int i; + for (i = 0; i < sc->char_count; i++) { + struct word_list *wl; + for (wl = sc->word_split_info->cnode[i].wl; + wl; wl = wl->next) { + if (wl->is_compound) { + make_compound_part_metaword(sc, wl); + make_compound_metaword(sc, wl); + } else { + make_simple_metaword(sc, wl); + } + } + } +} + +/* + * metawordをリスト風に結合する + */ +static struct meta_word * +list_metaword(struct splitter_context *sc, + enum metaword_type type, + struct meta_word *mw, struct meta_word *mw2) +{ + struct meta_word *wrapped_mw = anthy_do_cons_metaword(sc, type, mw2, NULL); + struct meta_word *n = anthy_do_cons_metaword(sc, type, mw, wrapped_mw); + + n->mw_features = mw->mw_features | mw2->mw_features; + + return n; +} + +/* + * 動詞連用形 + 形容詞化接尾語 「〜しやすい」など + */ +static void +try_combine_v_renyou_a(struct splitter_context *sc, + struct meta_word *mw, struct meta_word *mw2) +{ + wtype_t w2; + if (!mw->wl || !mw2->wl) return; + + w2 = mw2->wl->part[PART_CORE].wt; + + if (mw->wl->head_pos == POS_V && + mw->wl->tail_ct == CT_RENYOU && + anthy_wtype_get_pos(w2) == POS_D2KY) { + /* 形容詞ではあるので次のチェック */ + if (anthy_get_seq_ent_wtype_freq(mw2->wl->part[PART_CORE].seq, + anthy_wtype_a_tail_of_v_renyou)) { + list_metaword(sc, MW_V_RENYOU_A, mw, mw2); + } + } +} + +/* + * 動詞連用形 + 名詞化接尾語(#D2T35) 「入れ たて(のお茶)」など + */ +static void +try_combine_v_renyou_noun(struct splitter_context *sc, + struct meta_word *mw, struct meta_word *mw2) +{ + wtype_t w2; + if (!mw->wl || !mw2->wl) return; + + w2 = mw2->wl->part[PART_CORE].wt; + if (mw->wl->head_pos == POS_V && + mw->wl->tail_ct == CT_RENYOU && + anthy_wtype_get_pos(w2) == POS_NOUN && + anthy_wtype_get_scos(w2) == SCOS_T40) { + list_metaword(sc, MW_V_RENYOU_NOUN, mw, mw2); + } +} + +/* + * 数字を結合する + */ +static void +try_combine_number(struct splitter_context *sc, + struct meta_word *mw1, struct meta_word *mw2) +{ + struct word_list *wl1 = mw1->wl; + struct word_list *wl2 = mw2->wl; + struct meta_word *combined_mw; + int recursive = wl2 ? 0 : 1; /* combinedなmwを結合する場合1 */ + + /* 左mwは数詞 */ + + if (anthy_wtype_get_pos(wl1->part[PART_CORE].wt) != POS_NUMBER) return; + if (recursive) { + /* 右mwは数字を結合したmw */ + if (mw2->type != MW_NUMBER) return; + wl2 = mw2->mw1->wl; + } else { + /* 右mwは数詞 */ + if (anthy_wtype_get_pos(wl2->part[PART_CORE].wt) != POS_NUMBER) return; + } + /* 左mwの後ろに文字が付いていなければ */ + if (wl1->part[PART_POSTFIX].len == 0 && + wl1->part[PART_DEPWORD].len == 0) { + int scos1 = anthy_wtype_get_scos(wl1->part[PART_CORE].wt); + int scos2 = anthy_wtype_get_scos(wl2->part[PART_CORE].wt); + + /* #NNは対象外 */ + if (scos2 == SCOS_NONE) return; + /* + 左mwの種類によって、後ろにつくことができる右mwの種類が変わる + 例えば一〜九の後ろには万〜九万、億〜九億しかつくことができないが、 + 十〜九十の後ろには、あわせて一〜九などもつくことができる + */ + switch (scos1) { + case SCOS_N1: + if (scos2 == SCOS_N1) return; /* 後ろに一〜九がついてはいけない */ + case SCOS_N10: + if (scos2 == SCOS_N10) return; /* 後ろに十〜九十がついてはいけない */ + case SCOS_N100: + if (scos2 == SCOS_N100) return; /* 後ろに百〜九百がついてはいけない */ + case SCOS_N1000: + if (scos2 == SCOS_N1000) return; /* 後ろに千〜九千がついてはいけない */ + case SCOS_N10000: + /* 万〜九万、億〜九億…などは、 + いつでも後ろにつくことができる */ + break; + default: + return; + } + + if (recursive) { + combined_mw = anthy_do_cons_metaword(sc, MW_NUMBER, mw1, mw2); + } else { + /* 初めて結合する場合は後ろにnullをつけてlistにする */ + combined_mw = list_metaword(sc, MW_NUMBER, mw1, mw2); + } + combine_metaword(sc, combined_mw); + } +} + +/* 右隣のmetawordと結合できるかチェック */ +static void +try_combine_metaword(struct splitter_context *sc, + struct meta_word *mw1, struct meta_word *mw2) +{ + if (!mw1->wl) return; + + /* metawordの結合を行うためには、後続の + metawordに接頭辞がないことが必要 */ + if (mw2->wl && mw2->wl->part[PART_PREFIX].len > 0) { + return; + } + + try_combine_v_renyou_a(sc, mw1, mw2); + try_combine_v_renyou_noun(sc, mw1, mw2); + try_combine_number(sc, mw1, mw2); +} + +static void +combine_metaword(struct splitter_context *sc, struct meta_word *mw) +{ + struct word_split_info_cache *info = sc->word_split_info; + int i; + + if (mw->mw_features & MW_FEATURE_DEP_ONLY) { + /* 付属語だけの文節とは結合しない */ + return; + } + + for (i = mw->from - 1; i >= 0; i--) { + struct meta_word *mw_left; + for (mw_left = info->cnode[i].mw; mw_left; mw_left = mw_left->next) { + if (mw_left->from + mw_left->len == mw->from) { + /* 結合できるかチェック */ + try_combine_metaword(sc, mw_left, mw); + } + } + } +} + +static void +combine_metaword_all(struct splitter_context *sc) +{ + int i; + + struct word_split_info_cache *info = sc->word_split_info; + /* metawordの左端によるループ */ + for (i = sc->char_count - 1; i >= 0; i--){ + struct meta_word *mw; + /* 各metawordのループ */ + for (mw = info->cnode[i].mw; + mw; mw = mw->next) { + combine_metaword(sc, mw); + } + } +} + +static void +make_dummy_metaword(struct splitter_context *sc, int from, + int len, int orig_len) +{ + int score = 0; + struct meta_word *mw, *n; + + for (mw = sc->word_split_info->cnode[from].mw; mw; mw = mw->next) { + if (mw->len != orig_len) continue; + if (mw->score > score) { + score = mw->score; + } + } + + n = alloc_metaword(sc); + n->type = MW_DUMMY; + n->from = from; + n->len = len; + n->score = 3 * score * len / orig_len; + if (mw) { + mw->nr_parts = 0; + } + anthy_commit_meta_word(sc, n); +} + +/* + * 文節を伸ばしたらそれを覚えておく + */ +static void +make_expanded_metaword_all(struct splitter_context *sc) +{ + int i, j; + if (anthy_select_section("EXPANDPAIR", 0) == -1) { + return ; + } + for (i = 0; i < sc->char_count; i++) { + for (j = 1; j < sc->char_count - i; j++) { + /* 全ての部分文字列に対して */ + xstr xs; + xs.len = j; + xs.str = sc->ce[i].c; + if (anthy_select_row(&xs, 0) == 0) { + /* この部分文字列は過去に拡大の対象となった */ + int k; + int nr = anthy_get_nr_values(); + for (k = 0; k < nr; k++) { + xstr *exs; + exs = anthy_get_nth_xstr(k); + if (exs && exs->len <= sc->char_count - i) { + xstr txs; + txs.str = sc->ce[i].c; + txs.len = exs->len; + if (!anthy_xstrcmp(&txs, exs)) { + make_dummy_metaword(sc, i, txs.len, j); + } + } + } + } + } + } +} + +/* お茶入れ学習のmetawordを作る */ +static void +make_ochaire_metaword(struct splitter_context *sc, + int from, int len) +{ + struct meta_word *mw; + int count; + int s; + int j; + int seg_len; + int mw_len = 0; + xstr* xs; + + (void)len; + + /* 文節数を取得 */ + count = anthy_get_nth_value(0); + /* 一番右の文節をのぞいた文字数の合計を計算 */ + for (s = 0, j = 0; j < count - 1; j++) { + s += anthy_get_nth_value(j * 2 + 1); + } + /* 一番右の文節のmetawordを構成 */ + xs = anthy_get_nth_xstr((count - 1) * 2 + 2); + if (!xs) { + return ; + } + seg_len = anthy_get_nth_value((count - 1) * 2 + 1); + mw = alloc_metaword(sc); + mw->type = MW_OCHAIRE; + mw->from = from + s; + mw->len = seg_len; + mw->score = OCHAIRE_SCORE; + mw->cand_hint.str = malloc(sizeof(xchar)*xs->len); + anthy_xstrcpy(&mw->cand_hint, xs); + anthy_commit_meta_word(sc, mw); + mw_len += seg_len; + /* それ以外の文節でmetawordを構成 */ + for (j-- ; j >= 0; j--) { + struct meta_word *n; + seg_len = anthy_get_nth_value(j * 2 + 1); + s -= seg_len; + xs = anthy_get_nth_xstr(j * 2 + 2); + if (!xs) { + return ; + } + n = alloc_metaword(sc); + n->type = MW_OCHAIRE; + /* 右のmetawordをつなぐ */ + n->mw1 = mw; + n->from = from + s; + n->len = seg_len; + n->score = OCHAIRE_SCORE; + n->cand_hint.str = malloc(sizeof(xchar)*xs->len); + anthy_xstrcpy(&n->cand_hint, xs); + anthy_commit_meta_word(sc, n); + mw = n; + mw_len += seg_len; + } +} + +/* + * 複数の文節の組を履歴から検索する + */ +static void +make_ochaire_metaword_all(struct splitter_context *sc) +{ + int i; + if (anthy_select_section("OCHAIRE", 0) == -1) { + return ; + } + + for (i = 0; i < sc->char_count; i++) { + xstr xs; + xs.len = sc->char_count - i; + xs.str = sc->ce[i].c; + if (anthy_select_longest_row(&xs) == 0) { + xstr* key; + int len; + anthy_mark_row_used(); + key = anthy_get_index_xstr(); + len = key->len; + + make_ochaire_metaword(sc, i, len); + /* 今回見つかった meta_word の次の文字から始める */ + i += len - 1; + break; + } + } +} + +static void +add_dummy_metaword(struct splitter_context *sc, + int from) +{ + struct meta_word *n; + n = alloc_metaword(sc); + n->from = from; + n->len = 1; + n->type = MW_SINGLE; + n->score = 1; + n->seg_class = SEG_BUNSETSU; + anthy_commit_meta_word(sc, n); +} + +/* 指定したmetawordをwrapしてj文字長いmeta_wordを作る */ +static void +expand_meta_word(struct splitter_context *sc, + struct meta_word *mw, int from, int len, + int destroy_seg_class, int j) +{ + struct meta_word *n; + n = alloc_metaword(sc); + n->from = from; + n->len = len + j; + if (mw) { + n->type = MW_WRAP; + n->mw1 = mw; + n->score = mw->score; + n->nr_parts = mw->nr_parts; + if (destroy_seg_class) { + n->seg_class = SEG_BUNSETSU; + n->score /= 10; + } else { + n->seg_class = mw->seg_class; + } + } else { + n->type = MW_SINGLE; + n->score = 1; + n->seg_class = SEG_BUNSETSU; + } + anthy_commit_meta_word(sc, n); +} + +/* + * metawordの後ろの雑多な文字をくっつけたmetawordを構成する + */ +static void +make_metaword_with_depchar(struct splitter_context *sc, + struct meta_word *mw) +{ + int j; + int destroy_seg_class = 0; + int from = mw ? mw->from : 0; + int len = mw ? mw->len : 0; + + /* metawordの直後の文字の種類を調べる */ + int type; + if (sc->char_count <= from + len) { + return ; + } + type = anthy_get_xchar_type(*sc->ce[from + len].c); + if (!(type & XCT_SYMBOL) && + !(type & XCT_PART)) { + return; + } + if (type & XCT_PUNCTUATION) { + /* 句読点ならば別の文節にする */ + return ; + } + + /* 同じ種類の文字でなければくっつけるのをうちきり */ + for (j = 0; from + len + j < sc->char_count; j++) { + int p = from + len + j; + if ((anthy_get_xchar_type(*sc->ce[p].c) != type)) { + break; + } + if (!(p + 1 < sc->char_count) || + *sc->ce[p].c != *sc->ce[p + 1].c) { + destroy_seg_class = 1; + } + } + + /* 上のループを抜けた時、jには独立できない文字の数が入っている */ + + /* 独立できない文字があるので、それを付けたmetawordを作る */ + if (j > 0) { + expand_meta_word(sc, mw, from, len, destroy_seg_class, j); + } +} + +static void +make_metaword_with_depchar_all(struct splitter_context *sc) +{ + int i; + struct word_split_info_cache *info = sc->word_split_info; + + /* 全metawordに対して */ + for (i = 0; i < sc->char_count; i++) { + struct meta_word *mw; + for (mw = info->cnode[i].mw; + mw; mw = mw->next) { + make_metaword_with_depchar(sc, mw); + } + if (!info->cnode[i].mw) { + /**/ + add_dummy_metaword(sc, i); + } + } + /* 文の左端から始まるもの */ + make_metaword_with_depchar(sc, NULL); +} + +static int +is_single(xstr* xs) +{ + int i; + int xct; + for (i = xs->len - 1; i >= 1; --i) { + xct = anthy_get_xchar_type(xs->str[i]); + if (!(xct & XCT_PART)) { + return 0; + } + } + return 1; +} + +static void +bias_to_single_char_metaword(struct splitter_context *sc) +{ + int i; + + for (i = sc->char_count - 1; i >= 0; --i) { + struct meta_word *mw; + xstr xs; + int xct; + + struct char_node *cnode = &sc->word_split_info->cnode[i]; + + /* カッコの場合は一文字で文節を構成できる */ + xct = anthy_get_xchar_type(*sc->ce[i].c); + if (xct & (XCT_OPEN|XCT_CLOSE)) { + continue; + } + + xs.str = sc->ce[i].c; + for (mw = cnode->mw; mw; mw = mw->next) { + /* 付属語のみの文節は減点しない */ + if (mw->mw_features & MW_FEATURE_DEP_ONLY) { + continue; + } + /* 一文字(+直前につながる文字の繰り返し)のスコアを下げる */ + xs.len = mw->len; + if (is_single(&xs)) { + mw->score /= 10; + } + } + } +} + +void +anthy_mark_border_by_metaword(struct splitter_context* sc, + struct meta_word* mw) +{ + struct word_split_info_cache* info = sc->word_split_info; + if (!mw) return; + + switch (mw->type) { + case MW_DUMMY: + /* BREAK THROUGH */ + case MW_SINGLE: + /* BREAK THROUGH */ + case MW_COMPOUND_PART: + info->seg_border[mw->from] = 1; + break; + case MW_COMPOUND_LEAF: + info->seg_border[mw->from] = 1; + info->best_mw[mw->from] = mw; + mw->can_use = ok; + break; + case MW_COMPOUND_HEAD: + /* BREAK THROUGH */ + case MW_COMPOUND: + /* BREAK THROUGH */ + case MW_NUMBER: + info->best_mw[mw->mw1->from] = mw->mw1; + anthy_mark_border_by_metaword(sc, mw->mw1); + anthy_mark_border_by_metaword(sc, mw->mw2); + break; + case MW_V_RENYOU_A: + /* BREAK THROUGH */ + case MW_V_RENYOU_NOUN: + info->seg_border[mw->from] = 1; + break; + case MW_WRAP: + anthy_mark_border_by_metaword(sc, mw->mw1); + break; + case MW_OCHAIRE: + info->seg_border[mw->from] = 1; + anthy_mark_border_by_metaword(sc, mw->mw1); + break; + default: + break; + } +} + +void +anthy_make_metaword_all(struct splitter_context *sc) +{ + /* まず、word_list一個のmetawordを作る */ + make_metaword_from_word_list(sc); + + /* metawordを結合する */ + combine_metaword_all(sc); + + /* 拡大された文節を処理する */ + make_expanded_metaword_all(sc); + + /* 濁点や長音などの記号、その他の記号を処理 */ + make_metaword_with_depchar_all(sc); + + /* おちゃをいれる */ + make_ochaire_metaword_all(sc); + + /* 一文字の文節は減点 */ + bias_to_single_char_metaword(sc); +} + +/* + * 指定された領域をカバーするmetawordを数える + */ +int +anthy_get_nr_metaword(struct splitter_context *sc, + int from, int len) +{ + struct meta_word *mw; + int n; + + for (n = 0, mw = sc->word_split_info->cnode[from].mw; + mw; mw = mw->next) { + if (mw->len == len && mw->can_use == ok) { + n++; + } + } + return n; +} + +struct meta_word * +anthy_get_nth_metaword(struct splitter_context *sc, + int from, int len, int nth) +{ + struct meta_word *mw; + int n; + for (n = 0, mw = sc->word_split_info->cnode[from].mw; + mw; mw = mw->next) { + if (mw->len == len && mw->can_use == ok) { + if (n == nth) { + return mw; + } + n++; + } + } + return NULL; +} |