summaryrefslogtreecommitdiff
path: root/src-splitter/metaword.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-splitter/metaword.c')
-rw-r--r--src-splitter/metaword.c967
1 files changed, 967 insertions, 0 deletions
diff --git a/src-splitter/metaword.c b/src-splitter/metaword.c
new file mode 100644
index 0000000..0491035
--- /dev/null
+++ b/src-splitter/metaword.c
@@ -0,0 +1,967 @@
+/*
+ * Groups one or more segments (bunsetsu) or words into a set that is
+ * handled as a "metaword".  The various kinds of metaword are built here.
+ *
+ * init_metaword_tab() sets up the information needed for metaword processing
+ * anthy_make_metaword_all() builds the metawords in a context
+ * anthy_print_metaword() prints the given metaword
+ *
+ * Funded by IPA未踏ソフトウェア創造事業 (IPA Exploratory Software Project) 2001 10/29
+ * Copyright (C) 2000-2006 TABATA Yusuke
+ * Copyright (C) 2004-2006 YOSHIDA Yuichi
+ * Copyright (C) 2000-2003 UGAWA Tomoharu
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <anthy/record.h>
+#include <anthy/splitter.h>
+#include <anthy/xchar.h>
+#include <anthy/xstr.h>
+#include <anthy/segment.h>
+#include <anthy/segclass.h>
+#include "wordborder.h"
+
+/* How each kind of meta_word is to be processed.  Each row holds a MW_* type, its printable name (read as .name by the debug printer), a MW_STATUS_* value and a MW_CHECK_* value.  The debug printer indexes this table directly with mw->type, so rows are presumably kept in the same order as enum metaword_type -- verify when adding a type. */
+struct metaword_type_tab_ anthy_metaword_type_tab[] = {
+ {MW_DUMMY,"dummy",MW_STATUS_NONE,MW_CHECK_SINGLE},
+ {MW_SINGLE,"single",MW_STATUS_NONE,MW_CHECK_SINGLE},
+ {MW_WRAP,"wrap",MW_STATUS_WRAPPED,MW_CHECK_WRAP},
+ {MW_COMPOUND_HEAD,"compound_head",MW_STATUS_NONE,MW_CHECK_COMPOUND},
+ {MW_COMPOUND,"compound",MW_STATUS_NONE,MW_CHECK_NONE},
+ {MW_COMPOUND_LEAF,"compound_leaf",MW_STATUS_COMPOUND,MW_CHECK_NONE},
+ {MW_COMPOUND_PART,"compound_part",MW_STATUS_COMPOUND_PART,MW_CHECK_SINGLE},
+ {MW_V_RENYOU_A,"v_renyou_a",MW_STATUS_COMBINED,MW_CHECK_BORDER},
+ {MW_V_RENYOU_NOUN,"v_renyou_noun",MW_STATUS_COMBINED,MW_CHECK_BORDER},
+ {MW_NUMBER,"number",MW_STATUS_COMBINED,MW_CHECK_NUMBER},
+ {MW_OCHAIRE,"ochaire",MW_STATUS_OCHAIRE,MW_CHECK_OCHAIRE},
+ /* final row (MW_END) */
+ {MW_END,"end",MW_STATUS_NONE,MW_CHECK_NONE}
+};
+
+static void
+combine_metaword(struct splitter_context *sc, struct meta_word *mw);
+
+/*
+ * Register a meta_word in the splitter context.
+ *
+ * The meta_word is pushed onto the head of the list of meta_words that
+ * start at the same character position (cnode[mw->from].mw), so mw->from
+ * must already be set.  When the SPLITTER_DEBUG_MW debug flag is on, the
+ * newly registered meta_word is also printed.
+ */
+void
+anthy_commit_meta_word(struct splitter_context *sc,
+                       struct meta_word *mw)
+{
+  struct char_node *slot = &sc->word_split_info->cnode[mw->from];
+
+  /* prepend to the list of nodes sharing this start position */
+  mw->next = slot->mw;
+  slot->mw = mw;
+
+  if ((anthy_splitter_debug_flags() & SPLITTER_DEBUG_MW) != 0) {
+    anthy_print_metaword(sc, mw);
+  }
+}
+
+/*
+ * Debug helper: write a ":tag" marker to stdout for every MW_FEATURE_*
+ * bit that is set in `features`.  Markers appear in table order.
+ */
+static void
+print_metaword_features(int features)
+{
+  static const struct {
+    int bit;
+    const char *tag;
+  } tags[] = {
+    {MW_FEATURE_SV, ":sv"},
+    {MW_FEATURE_WEAK_CONN, ":weak"},
+    {MW_FEATURE_SUFFIX, ":suffix"},
+    {MW_FEATURE_NUM, ":num"},
+    {MW_FEATURE_CORE1, ":c1"},
+    {MW_FEATURE_HIGH_FREQ, ":hf"}
+  };
+  const int nr_tags = (int)(sizeof(tags) / sizeof(tags[0]));
+  int t;
+
+  for (t = 0; t < nr_tags; t++) {
+    if (features & tags[t].bit) {
+      fputs(tags[t].tag, stdout);
+    }
+  }
+}
+
+/*
+ * Recursively print one meta_word to stdout.
+ *
+ * The header line is preceded by `indent` spaces and shows the type name,
+ * "(from-len)", score, segment class, feature tags and can_use.  It is
+ * followed by the word list (if any), the candidate hint (if any) and
+ * then the children mw1 and mw2, each printed one indent level deeper.
+ */
+static void
+anthy_do_print_metaword(struct splitter_context *sc,
+                        struct meta_word *mw,
+                        int indent)
+{
+  struct meta_word *children[2];
+  int pad;
+  int c;
+
+  for (pad = indent; pad > 0; pad--) {
+    printf(" ");
+  }
+  printf("*meta word type=%s(%d-%d):score=%d:seg_class=%s",
+         anthy_metaword_type_tab[mw->type].name,
+         mw->from, mw->len, mw->score,
+         anthy_seg_class_name(mw->seg_class));
+  print_metaword_features(mw->mw_features);
+  printf(":can_use=%d*\n", mw->can_use);
+
+  if (mw->wl != NULL) {
+    anthy_print_word_list(sc, mw->wl);
+  }
+
+  if (mw->cand_hint.str != NULL) {
+    printf("(");
+    anthy_putxstr(&mw->cand_hint);
+    printf(")\n");
+  }
+
+  children[0] = mw->mw1;
+  children[1] = mw->mw2;
+  for (c = 0; c < 2; c++) {
+    if (children[c] != NULL) {
+      anthy_do_print_metaword(sc, children[c], indent + 1);
+    }
+  }
+}
+
+/*
+ * Print the given meta_word, together with the meta_words nested below
+ * it, to stdout starting at indent level 0.
+ */
+void
+anthy_print_metaword(struct splitter_context *sc,
+                     struct meta_word *mw)
+{
+  const int top_level = 0;
+
+  anthy_do_print_metaword(sc, mw, top_level);
+}
+
+/*
+ * Allocate a meta_word from the context's MwAllocator and give every
+ * field used in this file a defined default value.
+ *
+ * Defaults: type MW_SINGLE, all scores 0, no word list, no children, an
+ * empty candidate hint, seg_class SEG_HEAD, can_use ok.  from, len,
+ * nr_parts and next are cleared as well: several creators in this file
+ * never assign nr_parts, yet it is later read when meta_words are
+ * combined or wrapped, so it must not be left indeterminate.
+ * NOTE(review): whether anthy_smalloc() already zero-fills is not visible
+ * from here; the explicit initialisation makes this independent of that.
+ *
+ * Callers are expected to set from/len before committing the meta_word.
+ */
+static struct meta_word *
+alloc_metaword(struct splitter_context *sc)
+{
+  struct meta_word *mw;
+  mw = anthy_smalloc(sc->word_split_info->MwAllocator);
+  mw->type = MW_SINGLE;
+  mw->from = 0;
+  mw->len = 0;
+  mw->score = 0;
+  mw->struct_score = 0;
+  mw->dep_word_hash = 0;
+  mw->core_wt = anthy_wt_none;
+  mw->mw_features = 0;
+  mw->dep_class = DEP_NONE;
+  mw->nr_parts = 0;
+  mw->wl = NULL;
+  mw->mw1 = NULL;
+  mw->mw2 = NULL;
+  mw->next = NULL;
+  mw->cand_hint.str = NULL;
+  mw->cand_hint.len = 0;
+  mw->seg_class = SEG_HEAD;
+  mw->can_use = ok;
+  return mw;
+}
+
+
+/*
+ * Extract the prefix part and the trailing part of wl as strings.
+ *
+ * xs_pre covers the PART_PREFIX characters at the start of wl; xs_post
+ * covers the PART_POSTFIX plus PART_DEPWORD characters at its end.  Both
+ * results point into sc->ce (nothing is copied).
+ */
+static void
+get_surrounding_text(struct splitter_context* sc,
+                     struct word_list* wl,
+                     xstr* xs_pre, xstr* xs_post)
+{
+  int tail_len = wl->part[PART_POSTFIX].len + wl->part[PART_DEPWORD].len;
+  int tail_from = wl->from + wl->len - tail_len;
+
+  xs_pre->len = wl->part[PART_PREFIX].len;
+  xs_pre->str = sc->ce[wl->from].c;
+
+  xs_post->len = tail_len;
+  xs_post->str = sc->ce[tail_from].c;
+}
+
+/* Return how many characters of xs are equal to KK_VU. */
+static int
+count_vu(xstr *xs) {
+  int hits = 0;
+  int pos = xs->len;
+
+  while (pos-- > 0) {
+    if (xs->str[pos] == KK_VU) {
+      hits++;
+    }
+  }
+  return hits;
+}
+
+/*
+ * Take the nth segment of the compound word wl and turn it into a
+ * meta_word of the given type.
+ *
+ * The start position and length are found by walking segments 0..nth.
+ * Each segment's length comes from the compound entry, minus the number
+ * of KK_VU characters found in the corresponding input text
+ * (NOTE(review): presumably compensating for a different representation
+ * of that character on the dictionary side -- confirm).  The first
+ * segment additionally absorbs wl's prefix and the last segment absorbs
+ * its postfix and dependent word.
+ *
+ * The returned meta_word has score 1000, wl's seg_class and a cand_hint
+ * assembled from prefix / segment text / postfix.  It is NOT committed;
+ * that is left to the caller.
+ */
+static struct meta_word*
+make_compound_nth_metaword(struct splitter_context* sc,
+                           compound_ent_t ce, int nth,
+                           struct word_list* wl,
+                           enum metaword_type type)
+{
+  int last_seg = anthy_compound_get_nr_segments(ce) - 1;
+  int start = wl->from;
+  int span = 0;
+  int seg;
+  struct meta_word *result;
+  xstr xs_pre, xs_core, xs_post;
+
+  get_surrounding_text(sc, wl, &xs_pre, &xs_post);
+
+  /* advance over the preceding segments; ends with start/span of nth */
+  for (seg = 0; seg <= nth; seg++) {
+    xstr raw;
+
+    start += span;
+    raw.len = anthy_compound_get_nth_segment_len(ce, seg);
+    raw.str = sc->ce[start].c;
+    span = raw.len - count_vu(&raw);
+    if (seg == 0) {
+      span += xs_pre.len;
+    }
+    if (seg == last_seg) {
+      span += xs_post.len;
+    }
+  }
+
+  result = alloc_metaword(sc);
+  result->type = type;
+  result->from = start;
+  result->len = span;
+  result->score = 1000;
+  result->seg_class = wl->seg_class;
+
+  /* candidate hint: [prefix] + segment text + [postfix] */
+  anthy_compound_get_nth_segment_xstr(ce, nth, &xs_core);
+  if (nth == 0) {
+    anthy_xstrcat(&result->cand_hint, &xs_pre);
+  }
+  anthy_xstrcat(&result->cand_hint, &xs_core);
+  if (nth == last_seg) {
+    anthy_xstrcat(&result->cand_hint, &xs_post);
+  }
+  return result;
+}
+
+
+/*
+ * Actually join meta_words: create, commit and return a new meta_word of
+ * the given type whose children are mw (left) and mw2 (right).
+ *
+ * mw2 may be NULL, in which case the new node simply wraps mw and copies
+ * its len, score, seg_class, nr_parts and dep_word_hash.  Otherwise the
+ * length and nr_parts are summed, the score is
+ * sqrt(mw->score) * sqrt(mw2->score), and seg_class / dep_word_hash are
+ * taken from the right-hand side.
+ */
+static struct meta_word *
+anthy_do_cons_metaword(struct splitter_context *sc,
+                       enum metaword_type type,
+                       struct meta_word *mw, struct meta_word *mw2)
+{
+  struct meta_word *joined = alloc_metaword(sc);
+
+  joined->type = type;
+  joined->from = mw->from;
+  joined->mw1 = mw;
+  joined->mw2 = mw2;
+
+  if (mw2 == NULL) {
+    joined->len = mw->len;
+    joined->score = mw->score;
+    joined->seg_class = mw->seg_class;
+    joined->nr_parts = mw->nr_parts;
+    joined->dep_word_hash = mw->dep_word_hash;
+  } else {
+    joined->len = mw->len + mw2->len;
+    joined->score = sqrt(mw->score) * sqrt(mw2->score);
+    joined->seg_class = mw2->seg_class;
+    joined->nr_parts = mw->nr_parts + mw2->nr_parts;
+    joined->dep_word_hash = mw2->dep_word_hash;
+  }
+
+  anthy_commit_meta_word(sc, joined);
+  return joined;
+}
+
+/*
+ * Build the chain of meta_words for one compound dictionary entry.
+ * Segments are processed from the last to the first: each becomes a
+ * committed MW_COMPOUND_LEAF, which is then consed in front of the chain
+ * built so far.  The node created for segment 0 is MW_COMPOUND_HEAD, the
+ * others are MW_COMPOUND.
+ */
+static void
+commit_compound_chain(struct splitter_context* sc, struct word_list* wl,
+                      compound_ent_t ce)
+{
+  struct meta_word *chain = NULL;
+  int seg = anthy_compound_get_nr_segments(ce);
+
+  while (--seg >= 0) {
+    enum metaword_type type;
+    struct meta_word *leaf =
+      make_compound_nth_metaword(sc, ce, seg, wl, MW_COMPOUND_LEAF);
+
+    anthy_commit_meta_word(sc, leaf);
+
+    if (seg == 0) {
+      type = MW_COMPOUND_HEAD;
+    } else {
+      type = MW_COMPOUND;
+    }
+    chain = anthy_do_cons_metaword(sc, type, leaf, chain);
+  }
+}
+
+/*
+ * Create the meta_words for a compound word: every dictionary entry of
+ * wl's core word that is a compound gets its own leaf/cons chain.
+ */
+static void
+make_compound_metaword(struct splitter_context* sc, struct word_list* wl)
+{
+  seq_ent_t se = wl->part[PART_CORE].seq;
+  int nr_ents = anthy_get_nr_dic_ents(se, NULL);
+  int ent;
+
+  for (ent = 0; ent < nr_ents; ent++) {
+    if (anthy_get_nth_dic_ent_is_compound(se, ent)) {
+      commit_compound_chain(sc, wl, anthy_get_nth_compound_ent(se, ent));
+    }
+  }
+}
+
+/*
+ * For one compound entry, commit MW_COMPOUND_PART meta_words that merge
+ * runs of adjacent segments.  For every end segment (taken from the
+ * back), the run is grown leftwards one segment at a time; each grown
+ * run adds up length and score, appends the candidate hint of the part
+ * to its right, and is committed.  The single end segment itself is only
+ * used as the seed and is not committed here.
+ */
+static void
+commit_compound_parts(struct splitter_context* sc, struct word_list* wl,
+                      compound_ent_t ce)
+{
+  int end;
+
+  for (end = anthy_compound_get_nr_segments(ce) - 1; end >= 0; end--) {
+    struct meta_word *run =
+      make_compound_nth_metaword(sc, ce, end, wl, MW_COMPOUND_PART);
+    int begin = end;
+
+    while (--begin >= 0) {
+      struct meta_word *grown =
+        make_compound_nth_metaword(sc, ce, begin, wl, MW_COMPOUND_PART);
+
+      grown->len += run->len;
+      grown->score += run->score;
+      anthy_xstrcat(&grown->cand_hint, &run->cand_hint);
+
+      anthy_commit_meta_word(sc, grown);
+      run = grown;
+    }
+  }
+}
+
+/*
+ * Create meta_words that join individual segments inside a compound
+ * word, for every compound dictionary entry of wl's core word.
+ */
+static void
+make_compound_part_metaword(struct splitter_context* sc, struct word_list* wl)
+{
+  seq_ent_t se = wl->part[PART_CORE].seq;
+  int nr_ents = anthy_get_nr_dic_ents(se, NULL);
+  int ent;
+
+  for (ent = 0; ent < nr_ents; ent++) {
+    if (anthy_get_nth_dic_ent_is_compound(se, ent)) {
+      commit_compound_parts(sc, wl, anthy_get_nth_compound_ent(se, ent));
+    }
+  }
+}
+
+/*
+ * Single-segment word: create and commit an MW_SINGLE meta_word that
+ * covers exactly the word_list wl (score 1000).  Position, length,
+ * segment class, dependent-word class/hash and feature bits are copied
+ * from wl; core_wt is taken from the core part only when that part is
+ * non-empty.
+ */
+static void
+make_simple_metaword(struct splitter_context *sc, struct word_list* wl)
+{
+  struct meta_word *single = alloc_metaword(sc);
+
+  single->type = MW_SINGLE;
+  single->score = 1000;
+  single->nr_parts = NR_PARTS;
+
+  single->wl = wl;
+  single->from = wl->from;
+  single->len = wl->len;
+  single->seg_class = wl->seg_class;
+  single->dep_class = wl->part[PART_DEPWORD].dc;
+  single->dep_word_hash = wl->dep_word_hash;
+  single->mw_features = wl->mw_features;
+  if (wl->part[PART_CORE].len != 0) {
+    single->core_wt = wl->part[PART_CORE].wt;
+  }
+
+  anthy_commit_meta_word(sc, single);
+}
+
+/*
+ * Create the meta_words that consist of a single word_list.
+ * Every word_list at every start position is visited; compound words get
+ * their part and chain meta_words, everything else a simple meta_word.
+ */
+static void
+make_metaword_from_word_list(struct splitter_context *sc)
+{
+  struct word_list *wl;
+  int pos;
+
+  for (pos = 0; pos < sc->char_count; pos++) {
+    wl = sc->word_split_info->cnode[pos].wl;
+    while (wl != NULL) {
+      if (!wl->is_compound) {
+        make_simple_metaword(sc, wl);
+      } else {
+        make_compound_part_metaword(sc, wl);
+        make_compound_metaword(sc, wl);
+      }
+      wl = wl->next;
+    }
+  }
+}
+
+/*
+ * Join two meta_words list-style: mw2 is first wrapped in a node whose
+ * right child is NULL, then mw is consed in front of that wrapper.  Both
+ * new nodes get the given type and are committed.  The returned head
+ * carries the union of the feature bits of mw and mw2.
+ */
+static struct meta_word *
+list_metaword(struct splitter_context *sc,
+              enum metaword_type type,
+              struct meta_word *mw, struct meta_word *mw2)
+{
+  struct meta_word *tail;
+  struct meta_word *head;
+
+  tail = anthy_do_cons_metaword(sc, type, mw2, NULL);
+  head = anthy_do_cons_metaword(sc, type, mw, tail);
+  head->mw_features = mw->mw_features | mw2->mw_features;
+
+  return head;
+}
+
+/*
+ * Verb in continuative (renyou) form + adjective-forming suffix,
+ * e.g. "〜しやすい".
+ * Combines mw and mw2 into an MW_V_RENYOU_A list when both have a word
+ * list, mw is a verb ending in CT_RENYOU, mw2's core word has POS_D2KY
+ * and its dictionary entry has a non-zero frequency for
+ * anthy_wtype_a_tail_of_v_renyou.
+ */
+static void
+try_combine_v_renyou_a(struct splitter_context *sc,
+                       struct meta_word *mw, struct meta_word *mw2)
+{
+  wtype_t right_wt;
+
+  if (mw->wl == NULL || mw2->wl == NULL) {
+    return;
+  }
+  right_wt = mw2->wl->part[PART_CORE].wt;
+
+  if (mw->wl->head_pos != POS_V) {
+    return;
+  }
+  if (mw->wl->tail_ct != CT_RENYOU) {
+    return;
+  }
+  if (anthy_wtype_get_pos(right_wt) != POS_D2KY) {
+    return;
+  }
+
+  /* it is an adjective at least, so do the next check */
+  if (!anthy_get_seq_ent_wtype_freq(mw2->wl->part[PART_CORE].seq,
+                                    anthy_wtype_a_tail_of_v_renyou)) {
+    return;
+  }
+  list_metaword(sc, MW_V_RENYOU_A, mw, mw2);
+}
+
+/*
+ * Verb in continuative (renyou) form + noun-forming suffix (#D2T35),
+ * e.g. "入れ たて(のお茶)".
+ * Combines mw and mw2 into an MW_V_RENYOU_NOUN list when both have a
+ * word list, mw is a verb ending in CT_RENYOU and mw2's core word is a
+ * POS_NOUN with SCOS_T40.
+ * NOTE(review): the description names #D2T35 while the test below uses
+ * SCOS_T40 -- confirm which one is intended.
+ */
+static void
+try_combine_v_renyou_noun(struct splitter_context *sc,
+                          struct meta_word *mw, struct meta_word *mw2)
+{
+  wtype_t right_wt;
+
+  if (mw->wl == NULL || mw2->wl == NULL) {
+    return;
+  }
+  right_wt = mw2->wl->part[PART_CORE].wt;
+
+  if (mw->wl->head_pos != POS_V || mw->wl->tail_ct != CT_RENYOU) {
+    return;
+  }
+  if (anthy_wtype_get_pos(right_wt) != POS_NOUN) {
+    return;
+  }
+  if (anthy_wtype_get_scos(right_wt) != SCOS_T40) {
+    return;
+  }
+  list_metaword(sc, MW_V_RENYOU_NOUN, mw, mw2);
+}
+
+/*
+ * Combine numerals.
+ *
+ * mw1 (left) must be a numeral with a word list; the caller guarantees
+ * mw1->wl is non-NULL.  mw2 (right) is either a plain numeral or, when
+ * it has no word list of its own, an MW_NUMBER meta_word produced by an
+ * earlier combination -- in that case its first child supplies the word
+ * list that is examined.  On success the combined meta_word is handed
+ * straight back to combine_metaword() so that it can in turn be joined
+ * with a numeral further to the left.
+ */
+static void
+try_combine_number(struct splitter_context *sc,
+                   struct meta_word *mw1, struct meta_word *mw2)
+{
+  struct word_list *left_wl = mw1->wl;
+  struct word_list *right_wl = mw2->wl;
+  struct meta_word *joined;
+  int chained = (right_wl == NULL); /* 1 when mw2 is already combined */
+  int left_scos;
+  int right_scos;
+
+  /* the left mw must be a numeral */
+  if (anthy_wtype_get_pos(left_wl->part[PART_CORE].wt) != POS_NUMBER) {
+    return;
+  }
+
+  if (chained) {
+    /* the right mw must be a combined number */
+    if (mw2->type != MW_NUMBER) {
+      return;
+    }
+    right_wl = mw2->mw1->wl;
+  } else if (anthy_wtype_get_pos(right_wl->part[PART_CORE].wt)
+             != POS_NUMBER) {
+    /* the right mw must be a numeral */
+    return;
+  }
+
+  /* nothing may be attached behind the left mw */
+  if (left_wl->part[PART_POSTFIX].len != 0 ||
+      left_wl->part[PART_DEPWORD].len != 0) {
+    return;
+  }
+
+  left_scos = anthy_wtype_get_scos(left_wl->part[PART_CORE].wt);
+  right_scos = anthy_wtype_get_scos(right_wl->part[PART_CORE].wt);
+
+  /* #NN is not handled */
+  if (right_scos == SCOS_NONE) {
+    return;
+  }
+
+  /*
+   * Which right mw may follow depends on the kind of the left mw.
+   * E.g. 一〜九 may only be followed by 万〜九万, 億〜九億, whereas
+   * 十〜九十 may additionally be followed by 一〜九.
+   * The cases cascade on purpose: each kind also inherits every
+   * restriction of the larger kinds listed below it.
+   */
+  switch (left_scos) {
+  case SCOS_N1:
+    if (right_scos == SCOS_N1) {
+      return; /* 一〜九 must not follow */
+    }
+    /* fall through */
+  case SCOS_N10:
+    if (right_scos == SCOS_N10) {
+      return; /* 十〜九十 must not follow */
+    }
+    /* fall through */
+  case SCOS_N100:
+    if (right_scos == SCOS_N100) {
+      return; /* 百〜九百 must not follow */
+    }
+    /* fall through */
+  case SCOS_N1000:
+    if (right_scos == SCOS_N1000) {
+      return; /* 千〜九千 must not follow */
+    }
+    /* fall through */
+  case SCOS_N10000:
+    /* 万〜九万, 億〜九億 ... may always follow */
+    break;
+  default:
+    return;
+  }
+
+  if (chained) {
+    joined = anthy_do_cons_metaword(sc, MW_NUMBER, mw1, mw2);
+  } else {
+    /* first combination: build a list terminated by NULL */
+    joined = list_metaword(sc, MW_NUMBER, mw1, mw2);
+  }
+  combine_metaword(sc, joined);
+}
+
+/*
+ * Check whether mw1 can be combined with mw2, its right-hand neighbour,
+ * and try each combination rule in turn.  Nothing is attempted when mw1
+ * has no word list, or when mw2 has a word list that carries a prefix:
+ * combining requires the following meta_word to have no prefix.
+ */
+static void
+try_combine_metaword(struct splitter_context *sc,
+                     struct meta_word *mw1, struct meta_word *mw2)
+{
+  int right_has_prefix;
+
+  if (mw1->wl == NULL) {
+    return;
+  }
+
+  right_has_prefix = (mw2->wl != NULL &&
+                      mw2->wl->part[PART_PREFIX].len > 0);
+  if (right_has_prefix) {
+    return;
+  }
+
+  try_combine_v_renyou_a(sc, mw1, mw2);
+  try_combine_v_renyou_noun(sc, mw1, mw2);
+  try_combine_number(sc, mw1, mw2);
+}
+
+/*
+ * Try to combine mw with every meta_word that ends exactly where mw
+ * starts.  All start positions to the left of mw are scanned, nearest
+ * first.  Meta_words flagged MW_FEATURE_DEP_ONLY (dependent words only)
+ * are never used as the right-hand side.
+ */
+static void
+combine_metaword(struct splitter_context *sc, struct meta_word *mw)
+{
+  struct word_split_info_cache *info = sc->word_split_info;
+  struct meta_word *left;
+  int start;
+
+  if ((mw->mw_features & MW_FEATURE_DEP_ONLY) != 0) {
+    /* do not combine with a segment made of dependent words only */
+    return;
+  }
+
+  for (start = mw->from - 1; start >= 0; start--) {
+    for (left = info->cnode[start].mw; left != NULL; left = left->next) {
+      if (left->from + left->len != mw->from) {
+        continue; /* not adjacent */
+      }
+      try_combine_metaword(sc, left, mw);
+    }
+  }
+}
+
+/*
+ * Run combine_metaword() on every meta_word of the context, visiting
+ * start positions from the last character back to the first.
+ */
+static void
+combine_metaword_all(struct splitter_context *sc)
+{
+  struct word_split_info_cache *info = sc->word_split_info;
+  int left_edge = sc->char_count;
+
+  /* loop over the left edge of the meta_words */
+  while (--left_edge >= 0) {
+    struct meta_word *cur = info->cnode[left_edge].mw;
+
+    /* loop over the meta_words starting here */
+    while (cur != NULL) {
+      combine_metaword(sc, cur);
+      cur = cur->next;
+    }
+  }
+}
+
+/*
+ * Commit an MW_DUMMY meta_word covering [from, from + len).
+ *
+ * Its score is derived from the best-scoring existing meta_word that
+ * starts at `from` and has length orig_len, scaled by
+ * 3 * len / orig_len; it is 0 when no such meta_word exists.
+ * The dummy has no parts, so nr_parts is set to 0 on the new node.
+ * (An earlier form cleared nr_parts through the scan cursor, which is
+ * always NULL once the scan loop has finished, so nothing was cleared.)
+ *
+ * orig_len must be positive; a non-positive value is rejected so that
+ * the scaling can never divide by zero.
+ */
+static void
+make_dummy_metaword(struct splitter_context *sc, int from,
+                    int len, int orig_len)
+{
+  int score = 0;
+  struct meta_word *mw, *n;
+
+  if (orig_len <= 0) {
+    return;
+  }
+
+  for (mw = sc->word_split_info->cnode[from].mw; mw; mw = mw->next) {
+    if (mw->len != orig_len) continue;
+    if (mw->score > score) {
+      score = mw->score;
+    }
+  }
+
+  n = alloc_metaword(sc);
+  n->type = MW_DUMMY;
+  n->from = from;
+  n->len = len;
+  n->score = 3 * score * len / orig_len;
+  n->nr_parts = 0;
+  anthy_commit_meta_word(sc, n);
+}
+
+/*
+ * Remember segments that were expanded in the past.
+ *
+ * Every substring that starts at position i and is shorter than the
+ * rest of the input is looked up in the "EXPANDPAIR" record section.
+ * For each stored value that fits into the remaining input and matches
+ * the input text at i, a dummy meta_word of the stored (expanded) length
+ * is created.  Does nothing when the section cannot be selected.
+ */
+static void
+make_expanded_metaword_all(struct splitter_context *sc)
+{
+  int i, j, k;
+
+  if (anthy_select_section("EXPANDPAIR", 0) == -1) {
+    return;
+  }
+
+  for (i = 0; i < sc->char_count; i++) {
+    int remaining = sc->char_count - i;
+
+    /* for every substring starting at i */
+    for (j = 1; j < remaining; j++) {
+      xstr key;
+      int nr_values;
+
+      key.str = sc->ce[i].c;
+      key.len = j;
+      if (anthy_select_row(&key, 0) != 0) {
+        continue;
+      }
+
+      /* this substring was the target of an expansion before */
+      nr_values = anthy_get_nr_values();
+      for (k = 0; k < nr_values; k++) {
+        xstr *grown = anthy_get_nth_xstr(k);
+        xstr here;
+
+        if (grown == NULL || grown->len > remaining) {
+          continue;
+        }
+        here.str = sc->ce[i].c;
+        here.len = grown->len;
+        if (anthy_xstrcmp(&here, grown) == 0) {
+          make_dummy_metaword(sc, i, here.len, j);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Create and commit one MW_OCHAIRE meta_word covering [from, from + len)
+ * whose candidate hint is a heap copy of `hint` and whose mw1 points to
+ * `right`, the meta_word of the segment to its right (NULL for the
+ * rightmost segment).
+ * Returns the new meta_word, or NULL when the hint buffer could not be
+ * allocated; in that case nothing is committed.
+ */
+static struct meta_word *
+new_ochaire_segment(struct splitter_context *sc, int from, int len,
+                    xstr *hint, struct meta_word *right)
+{
+  struct meta_word *mw;
+  xchar *buf;
+
+  buf = malloc(sizeof(xchar) * hint->len);
+  if (!buf && hint->len > 0) {
+    /* out of memory (malloc(0) may legitimately return NULL) */
+    return NULL;
+  }
+
+  mw = alloc_metaword(sc);
+  mw->type = MW_OCHAIRE;
+  mw->mw1 = right;
+  mw->from = from;
+  mw->len = len;
+  mw->score = OCHAIRE_SCORE;
+  mw->cand_hint.str = buf;
+  anthy_xstrcpy(&mw->cand_hint, hint);
+  anthy_commit_meta_word(sc, mw);
+  return mw;
+}
+
+/*
+ * Build the meta_words for an "ochaire" learning record, using the
+ * currently selected record row.
+ *
+ * Row layout as read here: value 0 is the number of segments; for
+ * segment j, value j*2+1 is its length and value j*2+2 its candidate
+ * string.  Segments are created from right to left, each one linked
+ * through mw1 to the segment on its right.
+ *
+ * from: input position where the matched key starts.
+ * len:  length of the matched key (currently unused).
+ *
+ * Stops silently when the segment count is not positive, a candidate
+ * string is missing, or memory runs out; segments committed up to that
+ * point stay committed.
+ */
+static void
+make_ochaire_metaword(struct splitter_context *sc,
+                      int from, int len)
+{
+  struct meta_word *mw;
+  int count;
+  int last;
+  int s;
+  int j;
+  int seg_len;
+  xstr* xs;
+
+  (void)len;
+
+  /* number of segments */
+  count = anthy_get_nth_value(0);
+  if (count < 1) {
+    return;
+  }
+  last = count - 1;
+
+  /* total length of all segments except the rightmost one */
+  for (s = 0, j = 0; j < last; j++) {
+    s += anthy_get_nth_value(j * 2 + 1);
+  }
+
+  /* meta_word for the rightmost segment */
+  xs = anthy_get_nth_xstr(last * 2 + 2);
+  if (!xs) {
+    return;
+  }
+  seg_len = anthy_get_nth_value(last * 2 + 1);
+  mw = new_ochaire_segment(sc, from + s, seg_len, xs, NULL);
+  if (!mw) {
+    return;
+  }
+
+  /* meta_words for the remaining segments, right to left */
+  for (j = last - 1; j >= 0; j--) {
+    seg_len = anthy_get_nth_value(j * 2 + 1);
+    s -= seg_len;
+    xs = anthy_get_nth_xstr(j * 2 + 2);
+    if (!xs) {
+      return;
+    }
+    /* chain the meta_word on the right */
+    mw = new_ochaire_segment(sc, from + s, seg_len, xs, mw);
+    if (!mw) {
+      return;
+    }
+  }
+}
+
+/*
+ * Search the history for a stored group of several segments.
+ *
+ * Start positions are tried from left to right; at each one the longest
+ * row of the "OCHAIRE" section matching the rest of the input is looked
+ * up.  For the first position that matches, the row is marked as used
+ * and its meta_words are built; the search then ends.
+ * NOTE(review): scanning does not resume after the match, so at most
+ * one stored group is applied per call -- confirm this is intended.
+ */
+static void
+make_ochaire_metaword_all(struct splitter_context *sc)
+{
+  int start;
+
+  if (anthy_select_section("OCHAIRE", 0) == -1) {
+    return;
+  }
+
+  for (start = 0; start < sc->char_count; start++) {
+    xstr rest;
+    xstr *key;
+    int key_len;
+
+    rest.str = sc->ce[start].c;
+    rest.len = sc->char_count - start;
+    if (anthy_select_longest_row(&rest) != 0) {
+      continue;
+    }
+
+    anthy_mark_row_used();
+    key = anthy_get_index_xstr();
+    key_len = key->len;
+    make_ochaire_metaword(sc, start, key_len);
+    return;
+  }
+}
+
+/*
+ * Commit a one-character placeholder meta_word at position `from`:
+ * type MW_SINGLE, score 1, segment class SEG_BUNSETSU.
+ */
+static void
+add_dummy_metaword(struct splitter_context *sc,
+                   int from)
+{
+  struct meta_word *filler = alloc_metaword(sc);
+
+  filler->type = MW_SINGLE;
+  filler->seg_class = SEG_BUNSETSU;
+  filler->score = 1;
+  filler->from = from;
+  filler->len = 1;
+
+  anthy_commit_meta_word(sc, filler);
+}
+
+/*
+ * Wrap the given meta_word in a new one that is j characters longer.
+ *
+ * With mw != NULL the result is an MW_WRAP node around mw that inherits
+ * its score and nr_parts.  When destroy_seg_class is set, the segment
+ * class becomes SEG_BUNSETSU and the score is divided by 10; otherwise
+ * mw's segment class is kept.  With mw == NULL a plain MW_SINGLE node
+ * with score 1 and class SEG_BUNSETSU is produced instead.
+ * The new node covers [from, from + len + j) and is committed.
+ */
+static void
+expand_meta_word(struct splitter_context *sc,
+                 struct meta_word *mw, int from, int len,
+                 int destroy_seg_class, int j)
+{
+  struct meta_word *wrapper = alloc_metaword(sc);
+
+  wrapper->from = from;
+  wrapper->len = len + j;
+
+  if (mw == NULL) {
+    wrapper->type = MW_SINGLE;
+    wrapper->score = 1;
+    wrapper->seg_class = SEG_BUNSETSU;
+  } else {
+    wrapper->type = MW_WRAP;
+    wrapper->mw1 = mw;
+    wrapper->nr_parts = mw->nr_parts;
+    if (destroy_seg_class) {
+      wrapper->seg_class = SEG_BUNSETSU;
+      wrapper->score = mw->score / 10;
+    } else {
+      wrapper->seg_class = mw->seg_class;
+      wrapper->score = mw->score;
+    }
+  }
+
+  anthy_commit_meta_word(sc, wrapper);
+}
+
+/*
+ * Build a meta_word that has the miscellaneous characters following mw
+ * attached to it.
+ *
+ * mw may be NULL, which stands for an empty meta_word at position 0.
+ * The character right after mw must be of type XCT_SYMBOL or XCT_PART
+ * and must not be punctuation (punctuation forms its own segment).
+ * Following characters are absorbed as long as their character type is
+ * identical to that of the first one.  The segment class is marked to be
+ * destroyed as soon as an absorbed character is the last input character
+ * or differs from the character after it.
+ */
+static void
+make_metaword_with_depchar(struct splitter_context *sc,
+                           struct meta_word *mw)
+{
+  int from = 0;
+  int len = 0;
+  int tail;
+  int p;
+  int extra;
+  int type;
+  int destroy_seg_class = 0;
+
+  if (mw != NULL) {
+    from = mw->from;
+    len = mw->len;
+  }
+  tail = from + len;
+
+  /* look at the kind of the character right after the meta_word */
+  if (tail >= sc->char_count) {
+    return;
+  }
+  type = anthy_get_xchar_type(*sc->ce[tail].c);
+  if ((type & (XCT_SYMBOL | XCT_PART)) == 0) {
+    return;
+  }
+  if (type & XCT_PUNCTUATION) {
+    /* punctuation becomes a separate segment */
+    return;
+  }
+
+  /* stop attaching at the first character of a different kind */
+  for (p = tail; p < sc->char_count; p++) {
+    if (anthy_get_xchar_type(*sc->ce[p].c) != type) {
+      break;
+    }
+    if (p + 1 >= sc->char_count ||
+        *sc->ce[p].c != *sc->ce[p + 1].c) {
+      destroy_seg_class = 1;
+    }
+  }
+
+  /* number of characters that cannot stand on their own */
+  extra = p - tail;
+  if (extra > 0) {
+    expand_meta_word(sc, mw, from, len, destroy_seg_class, extra);
+  }
+}
+
+/*
+ * Apply make_metaword_with_depchar() to every meta_word of the context
+ * and finally to the empty meta_word at the left end of the sentence.
+ * Any start position that still has no meta_word afterwards receives a
+ * one-character dummy.
+ */
+static void
+make_metaword_with_depchar_all(struct splitter_context *sc)
+{
+  struct word_split_info_cache *info = sc->word_split_info;
+  struct meta_word *cur;
+  int pos;
+
+  /* for all meta_words */
+  for (pos = 0; pos < sc->char_count; pos++) {
+    cur = info->cnode[pos].mw;
+    while (cur != NULL) {
+      make_metaword_with_depchar(sc, cur);
+      cur = cur->next;
+    }
+    if (info->cnode[pos].mw == NULL) {
+      add_dummy_metaword(sc, pos);
+    }
+  }
+
+  /* the one starting at the left end of the sentence */
+  make_metaword_with_depchar(sc, NULL);
+}
+
+/*
+ * Return 1 when every character of xs after the first one has the
+ * XCT_PART type bit set (trivially true for strings of length 0 or 1),
+ * 0 otherwise.  The first character itself is not examined.
+ */
+static int
+is_single(xstr* xs)
+{
+  int pos;
+
+  for (pos = 1; pos < xs->len; pos++) {
+    if ((anthy_get_xchar_type(xs->str[pos]) & XCT_PART) == 0) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+/*
+ * Penalise meta_words that consist of a single character (optionally
+ * followed only by XCT_PART characters): their score is divided by 10.
+ * Positions holding an opening or closing bracket are skipped, and
+ * meta_words flagged MW_FEATURE_DEP_ONLY are left untouched.
+ */
+static void
+bias_to_single_char_metaword(struct splitter_context *sc)
+{
+  int pos = sc->char_count;
+
+  while (--pos >= 0) {
+    struct char_node *node = &sc->word_split_info->cnode[pos];
+    struct meta_word *cur;
+    xstr text;
+
+    /* a bracket may form a segment on its own */
+    if (anthy_get_xchar_type(*sc->ce[pos].c) & (XCT_OPEN|XCT_CLOSE)) {
+      continue;
+    }
+
+    text.str = sc->ce[pos].c;
+    for (cur = node->mw; cur != NULL; cur = cur->next) {
+      /* segments made of dependent words only are not penalised */
+      if (cur->mw_features & MW_FEATURE_DEP_ONLY) {
+        continue;
+      }
+      /* lower the score of a single character (plus attached marks) */
+      text.len = cur->len;
+      if (is_single(&text)) {
+        cur->score /= 10;
+      }
+    }
+  }
+}
+
+/*
+ * Mark segment borders according to the structure of the meta_word mw.
+ *
+ * Depending on mw->type, the start of mw is flagged in seg_border, mw
+ * (or its first child) is recorded in best_mw, and/or the function
+ * recurses into the children.  A NULL mw is accepted and ignored, which
+ * is what ends the recursion at missing children.
+ */
+void
+anthy_mark_border_by_metaword(struct splitter_context* sc,
+                              struct meta_word* mw)
+{
+  struct word_split_info_cache* info;
+
+  if (mw == NULL) {
+    return;
+  }
+  info = sc->word_split_info;
+
+  switch (mw->type) {
+  case MW_DUMMY:
+  case MW_SINGLE:
+  case MW_COMPOUND_PART:
+  case MW_V_RENYOU_A:
+  case MW_V_RENYOU_NOUN:
+    /* these only mark their own start as a border */
+    info->seg_border[mw->from] = 1;
+    break;
+
+  case MW_COMPOUND_LEAF:
+    info->seg_border[mw->from] = 1;
+    info->best_mw[mw->from] = mw;
+    mw->can_use = ok;
+    break;
+
+  case MW_COMPOUND_HEAD:
+  case MW_COMPOUND:
+  case MW_NUMBER:
+    /* cons cells: record the left child, then descend into both */
+    info->best_mw[mw->mw1->from] = mw->mw1;
+    anthy_mark_border_by_metaword(sc, mw->mw1);
+    anthy_mark_border_by_metaword(sc, mw->mw2);
+    break;
+
+  case MW_OCHAIRE:
+    info->seg_border[mw->from] = 1;
+    /* fall through: continue with the chained meta_word */
+  case MW_WRAP:
+    anthy_mark_border_by_metaword(sc, mw->mw1);
+    break;
+
+  default:
+    break;
+  }
+}
+
+/*
+ * Build all meta_words of the context by running the generation stages
+ * in a fixed order.
+ */
+void
+anthy_make_metaword_all(struct splitter_context *sc)
+{
+  static void (* const stages[])(struct splitter_context *) = {
+    /* first, meta_words made of a single word_list */
+    make_metaword_from_word_list,
+    /* combine meta_words */
+    combine_metaword_all,
+    /* handle segments that were expanded in the past */
+    make_expanded_metaword_all,
+    /* voiced-sound marks, long-vowel marks and other symbols */
+    make_metaword_with_depchar_all,
+    /* "ochaire" learning */
+    make_ochaire_metaword_all,
+    /* penalise one-character segments */
+    bias_to_single_char_metaword
+  };
+  const int nr_stages = (int)(sizeof(stages) / sizeof(stages[0]));
+  int s;
+
+  for (s = 0; s < nr_stages; s++) {
+    stages[s](sc);
+  }
+}
+
+/*
+ * Count the meta_words that cover the given region, i.e. those that
+ * start at `from`, have length `len` and whose can_use is ok.
+ */
+int
+anthy_get_nr_metaword(struct splitter_context *sc,
+                      int from, int len)
+{
+  struct meta_word *cur = sc->word_split_info->cnode[from].mw;
+  int found = 0;
+
+  while (cur != NULL) {
+    if (cur->len == len && cur->can_use == ok) {
+      found++;
+    }
+    cur = cur->next;
+  }
+  return found;
+}
+
+/*
+ * Return the nth (0-based, in list order) meta_word that starts at
+ * `from`, has length `len` and whose can_use is ok, or NULL when there
+ * are fewer than nth + 1 such meta_words.
+ */
+struct meta_word *
+anthy_get_nth_metaword(struct splitter_context *sc,
+                       int from, int len, int nth)
+{
+  struct meta_word *cur;
+  int to_skip = nth;
+
+  for (cur = sc->word_split_info->cnode[from].mw;
+       cur != NULL; cur = cur->next) {
+    if (cur->len != len || cur->can_use != ok) {
+      continue;
+    }
+    if (to_skip == 0) {
+      return cur;
+    }
+    to_skip--;
+  }
+  return NULL;
+}