diff options
Diffstat (limited to 'src-ordering/commit.c')
-rw-r--r-- | src-ordering/commit.c | 275 |
1 files changed, 275 insertions, 0 deletions
diff --git a/src-ordering/commit.c b/src-ordering/commit.c new file mode 100644 index 0000000..3a43a4f --- /dev/null +++ b/src-ordering/commit.c @@ -0,0 +1,275 @@ +/* + * 確定(コミット)後の処理をする。 + * 各種の学習処理を呼び出す + * + * anthy_proc_commit() が外部から呼ばれる + */ +#include <stdlib.h> +#include <time.h> + +#include <anthy/ordering.h> +#include <anthy/record.h> +#include <anthy/splitter.h> +#include <anthy/segment.h> +#include "sorter.h" + +#define MAX_OCHAIRE_ENTRY_COUNT 100 +#define MAX_OCHAIRE_LEN 32 +#define MAX_PREDICTION_ENTRY 100 + +#define MAX_UNKNOWN_WORD 100 + +/* 交換された候補を探す */ +static void +learn_swapped_candidates(struct segment_list *sl) +{ + int i; + struct seg_ent *seg; + for (i = 0; i < sl->nr_segments; i++) { + seg = anthy_get_nth_segment(sl, i); + if (seg->committed != 0) { + /* 最初の候補(0番目)でない候補(seg->committed番目)がコミットされた */ + anthy_swap_cand_ent(seg->cands[0], + seg->cands[seg->committed]); + } + } + anthy_cand_swap_ageup(); +} + +/* 長さが変わった文節の変更後に対して */ +static void +learn_resized_segment(struct splitter_context *sc, + struct segment_list *sl) + +{ + int i; + struct meta_word **mw + = alloca(sizeof(struct meta_word*) * sl->nr_segments); + int *len_array + = alloca(sizeof(int) * sl->nr_segments); + + /* 各文節の長さの配列とmeta_wordの配列を用意する */ + for (i = 0; i < sl->nr_segments; i++) { + struct seg_ent *se = anthy_get_nth_segment(sl, i); + mw[i] = se->cands[se->committed]->mw; + len_array[i] = se->str.len; + } + + anthy_commit_border(sc, sl->nr_segments, mw, len_array); +} + +/* 長さが変わった文節の変更前に対して */ +static void +clear_resized_segment(struct splitter_context *sc, + struct segment_list *sl) +{ + int *mark, i, from; + struct seg_ent *seg; + mark = alloca(sizeof(int)*sc->char_count); + for (i = 0; i < sc->char_count; i++) { + mark[i] = 0; + } + /* 実際に確定された文節の長さをマークする */ + from = 0; + for (i = 0; i < sl->nr_segments; i++) { + seg = anthy_get_nth_segment(sl, i); + mark[from] = seg->len; + from = from + seg->len; + } + for (i = 0; i < sc->char_count; i++) { + int len = sc->ce[i].initial_seg_len; + /* 最初の長さと確定された長さが異なれば、 + 使われなかった未知語の可能性がある */ + if (len && len != mark[i]) { + xstr xs; + xs.str = sc->ce[i].c; + xs.len = len; + anthy_forget_unused_unknown_word(&xs); + } + } + if (!anthy_select_section("UNKNOWN_WORD", 0)) { + anthy_truncate_section(MAX_UNKNOWN_WORD); + } +} + +/* recordにお茶入れ学習の結果を書き込む */ +static void +commit_ochaire(struct seg_ent *seg, int count, xstr* xs) +{ + int i; + if (xs->len >= MAX_OCHAIRE_LEN) { + return ; + } + if (anthy_select_row(xs, 1)) { + return ; + } + anthy_set_nth_value(0, count); + for (i = 0; i < count; i++, seg = seg->next) { + anthy_set_nth_value(i * 2 + 1, seg->len); + anthy_set_nth_xstr(i * 2 + 2, &seg->cands[seg->committed]->str); + } +} + +/* recordの領域を節約するために、お茶入れ学習のネガティブな + エントリを消す */ +static void +release_negative_ochaire(struct splitter_context *sc, + struct segment_list *sl) +{ + int start, len; + xstr xs; + (void)sl; + /* 変換前のひらがな文字列 */ + xs.len = sc->char_count; + xs.str = sc->ce[0].c; + + /* xsの部分文字列に対して */ + for (start = 0; start < xs.len; start ++) { + for (len = 1; len <= xs.len - start && len < MAX_OCHAIRE_LEN; len ++) { + xstr part; + part.str = &xs.str[start]; + part.len = len; + if (anthy_select_row(&part, 0) == 0) { + anthy_release_row(); + } + } + } +} + +/* お茶入れ学習を行う */ +static void +learn_ochaire(struct splitter_context *sc, + struct segment_list *sl) +{ + int i; + int count; + + if (anthy_select_section("OCHAIRE", 1)) { + return ; + } + + /* お茶入れ学習のネガティブなエントリを消す */ + release_negative_ochaire(sc, sl); + + /* お茶入れ学習をする */ + for (count = 2; count <= sl->nr_segments && count < 5; count++) { + /* 2文節以上の長さの文節列に対して */ + + for (i = 0; i <= sl->nr_segments - count; i++) { + struct seg_ent *head = anthy_get_nth_segment(sl, i); + struct seg_ent *s; + xstr xs; + int j; + xs = head->str; + if (xs.len < 2 && count < 3) { + /* 細切れの文節を学習することを避ける、 + * いい加減なheuristics */ + continue; + } + /* 文節列を構成する文字列を作る */ + for (j = 1, s = head->next; j < count; j++, s = s->next) { + xs.len += s->str.len; + } + /**/ + commit_ochaire(head, count, &xs); + } + } + if (anthy_select_section("OCHAIRE", 1)) { + return ; + } + anthy_truncate_section(MAX_OCHAIRE_ENTRY_COUNT); +} + +static int +learn_prediction_str(xstr *idx, xstr *xs) +{ + int nr_predictions; + int i; + time_t t = time(NULL); + if (anthy_select_row(idx, 1)) { + return 0; + } + nr_predictions = anthy_get_nr_values(); + + /* 既に履歴にある場合はタイムスタンプだけ更新 */ + for (i = 0; i < nr_predictions; i += 2) { + xstr *log = anthy_get_nth_xstr(i + 1); + if (!log) { + continue; + } + if (anthy_xstrcmp(log, xs) == 0) { + anthy_set_nth_value(i, t); + break; + } + } + + /* ない場合は末尾に追加 */ + if (i == nr_predictions) { + anthy_set_nth_value(nr_predictions, t); + anthy_set_nth_xstr(nr_predictions + 1, xs); + anthy_mark_row_used(); + return 1; + } + anthy_mark_row_used(); + return 0; +} + +static void +learn_prediction(struct segment_list *sl) +{ + int i; + int added = 0; + if (anthy_select_section("PREDICTION", 1)) { + return ; + } + for (i = 0; i < sl->nr_segments; i++) { + struct seg_ent *seg = anthy_get_nth_segment(sl, i); + xstr *xs = &seg->cands[seg->committed]->str; + + if (seg->committed < 0) { + continue; + } + if (learn_prediction_str(&seg->str, xs)) { + added = 1; + } + } + if (added) { + anthy_truncate_section(MAX_PREDICTION_ENTRY); + } +} + +static void +learn_unknown(struct segment_list *sl) +{ + int i; + for (i = 0; i < sl->nr_segments; i++) { + struct seg_ent *seg = anthy_get_nth_segment(sl, i); + struct cand_ent *ce = seg->cands[seg->committed]; + if (ce->nr_words == 0) { + anthy_add_unknown_word(&seg->str, &ce->str); + } + } +} + +void +anthy_do_commit_prediction(xstr *src, xstr *xs) +{ + if (anthy_select_section("PREDICTION", 1)) { + return ; + } + learn_prediction_str(src, xs); +} + +void +anthy_proc_commit(struct segment_list *sl, + struct splitter_context *sc) +{ + /* 各種の学習を行う */ + learn_swapped_candidates(sl); + learn_resized_segment(sc, sl); + clear_resized_segment(sc, sl); + learn_ochaire(sc, sl); + learn_prediction(sl); + learn_unknown(sl); + anthy_learn_cand_history(sl); +} |