summaryrefslogtreecommitdiff
path: root/src-ordering/commit.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-ordering/commit.c')
-rw-r--r--src-ordering/commit.c275
1 files changed, 275 insertions, 0 deletions
diff --git a/src-ordering/commit.c b/src-ordering/commit.c
new file mode 100644
index 0000000..3a43a4f
--- /dev/null
+++ b/src-ordering/commit.c
@@ -0,0 +1,275 @@
+/*
+ * 確定(コミット)後の処理をする。
+ * 各種の学習処理を呼び出す
+ *
+ * anthy_proc_commit() が外部から呼ばれる
+ */
+#include <stdlib.h>
+#include <time.h>
+
+#include <anthy/ordering.h>
+#include <anthy/record.h>
+#include <anthy/splitter.h>
+#include <anthy/segment.h>
+#include "sorter.h"
+
+#define MAX_OCHAIRE_ENTRY_COUNT 100
+#define MAX_OCHAIRE_LEN 32
+#define MAX_PREDICTION_ENTRY 100
+
+#define MAX_UNKNOWN_WORD 100
+
+/* 交換された候補を探す */
+static void
+learn_swapped_candidates(struct segment_list *sl)
+{
+ int i;
+ struct seg_ent *seg;
+ for (i = 0; i < sl->nr_segments; i++) {
+ seg = anthy_get_nth_segment(sl, i);
+ if (seg->committed != 0) {
+ /* 最初の候補(0番目)でない候補(seg->committed番目)がコミットされた */
+ anthy_swap_cand_ent(seg->cands[0],
+ seg->cands[seg->committed]);
+ }
+ }
+ anthy_cand_swap_ageup();
+}
+
+/* 長さが変わった文節の変更後に対して */
+static void
+learn_resized_segment(struct splitter_context *sc,
+ struct segment_list *sl)
+
+{
+ int i;
+ struct meta_word **mw
+ = alloca(sizeof(struct meta_word*) * sl->nr_segments);
+ int *len_array
+ = alloca(sizeof(int) * sl->nr_segments);
+
+ /* 各文節の長さの配列とmeta_wordの配列を用意する */
+ for (i = 0; i < sl->nr_segments; i++) {
+ struct seg_ent *se = anthy_get_nth_segment(sl, i);
+ mw[i] = se->cands[se->committed]->mw;
+ len_array[i] = se->str.len;
+ }
+
+ anthy_commit_border(sc, sl->nr_segments, mw, len_array);
+}
+
+/* 長さが変わった文節の変更前に対して */
+static void
+clear_resized_segment(struct splitter_context *sc,
+ struct segment_list *sl)
+{
+ int *mark, i, from;
+ struct seg_ent *seg;
+ mark = alloca(sizeof(int)*sc->char_count);
+ for (i = 0; i < sc->char_count; i++) {
+ mark[i] = 0;
+ }
+ /* 実際に確定された文節の長さをマークする */
+ from = 0;
+ for (i = 0; i < sl->nr_segments; i++) {
+ seg = anthy_get_nth_segment(sl, i);
+ mark[from] = seg->len;
+ from = from + seg->len;
+ }
+ for (i = 0; i < sc->char_count; i++) {
+ int len = sc->ce[i].initial_seg_len;
+ /* 最初の長さと確定された長さが異なれば、
+ 使われなかった未知語の可能性がある */
+ if (len && len != mark[i]) {
+ xstr xs;
+ xs.str = sc->ce[i].c;
+ xs.len = len;
+ anthy_forget_unused_unknown_word(&xs);
+ }
+ }
+ if (!anthy_select_section("UNKNOWN_WORD", 0)) {
+ anthy_truncate_section(MAX_UNKNOWN_WORD);
+ }
+}
+
+/* recordにお茶入れ学習の結果を書き込む */
+static void
+commit_ochaire(struct seg_ent *seg, int count, xstr* xs)
+{
+ int i;
+ if (xs->len >= MAX_OCHAIRE_LEN) {
+ return ;
+ }
+ if (anthy_select_row(xs, 1)) {
+ return ;
+ }
+ anthy_set_nth_value(0, count);
+ for (i = 0; i < count; i++, seg = seg->next) {
+ anthy_set_nth_value(i * 2 + 1, seg->len);
+ anthy_set_nth_xstr(i * 2 + 2, &seg->cands[seg->committed]->str);
+ }
+}
+
+/* recordの領域を節約するために、お茶入れ学習のネガティブな
+ エントリを消す */
+static void
+release_negative_ochaire(struct splitter_context *sc,
+ struct segment_list *sl)
+{
+ int start, len;
+ xstr xs;
+ (void)sl;
+ /* 変換前のひらがな文字列 */
+ xs.len = sc->char_count;
+ xs.str = sc->ce[0].c;
+
+ /* xsの部分文字列に対して */
+ for (start = 0; start < xs.len; start ++) {
+ for (len = 1; len <= xs.len - start && len < MAX_OCHAIRE_LEN; len ++) {
+ xstr part;
+ part.str = &xs.str[start];
+ part.len = len;
+ if (anthy_select_row(&part, 0) == 0) {
+ anthy_release_row();
+ }
+ }
+ }
+}
+
+/* お茶入れ学習を行う */
+static void
+learn_ochaire(struct splitter_context *sc,
+ struct segment_list *sl)
+{
+ int i;
+ int count;
+
+ if (anthy_select_section("OCHAIRE", 1)) {
+ return ;
+ }
+
+ /* お茶入れ学習のネガティブなエントリを消す */
+ release_negative_ochaire(sc, sl);
+
+ /* お茶入れ学習をする */
+ for (count = 2; count <= sl->nr_segments && count < 5; count++) {
+ /* 2文節以上の長さの文節列に対して */
+
+ for (i = 0; i <= sl->nr_segments - count; i++) {
+ struct seg_ent *head = anthy_get_nth_segment(sl, i);
+ struct seg_ent *s;
+ xstr xs;
+ int j;
+ xs = head->str;
+ if (xs.len < 2 && count < 3) {
+ /* 細切れの文節を学習することを避ける、
+ * いい加減なheuristics */
+ continue;
+ }
+ /* 文節列を構成する文字列を作る */
+ for (j = 1, s = head->next; j < count; j++, s = s->next) {
+ xs.len += s->str.len;
+ }
+ /**/
+ commit_ochaire(head, count, &xs);
+ }
+ }
+ if (anthy_select_section("OCHAIRE", 1)) {
+ return ;
+ }
+ anthy_truncate_section(MAX_OCHAIRE_ENTRY_COUNT);
+}
+
+static int
+learn_prediction_str(xstr *idx, xstr *xs)
+{
+ int nr_predictions;
+ int i;
+ time_t t = time(NULL);
+ if (anthy_select_row(idx, 1)) {
+ return 0;
+ }
+ nr_predictions = anthy_get_nr_values();
+
+ /* 既に履歴にある場合はタイムスタンプだけ更新 */
+ for (i = 0; i < nr_predictions; i += 2) {
+ xstr *log = anthy_get_nth_xstr(i + 1);
+ if (!log) {
+ continue;
+ }
+ if (anthy_xstrcmp(log, xs) == 0) {
+ anthy_set_nth_value(i, t);
+ break;
+ }
+ }
+
+ /* ない場合は末尾に追加 */
+ if (i == nr_predictions) {
+ anthy_set_nth_value(nr_predictions, t);
+ anthy_set_nth_xstr(nr_predictions + 1, xs);
+ anthy_mark_row_used();
+ return 1;
+ }
+ anthy_mark_row_used();
+ return 0;
+}
+
+static void
+learn_prediction(struct segment_list *sl)
+{
+ int i;
+ int added = 0;
+ if (anthy_select_section("PREDICTION", 1)) {
+ return ;
+ }
+ for (i = 0; i < sl->nr_segments; i++) {
+ struct seg_ent *seg = anthy_get_nth_segment(sl, i);
+ xstr *xs = &seg->cands[seg->committed]->str;
+
+ if (seg->committed < 0) {
+ continue;
+ }
+ if (learn_prediction_str(&seg->str, xs)) {
+ added = 1;
+ }
+ }
+ if (added) {
+ anthy_truncate_section(MAX_PREDICTION_ENTRY);
+ }
+}
+
+static void
+learn_unknown(struct segment_list *sl)
+{
+ int i;
+ for (i = 0; i < sl->nr_segments; i++) {
+ struct seg_ent *seg = anthy_get_nth_segment(sl, i);
+ struct cand_ent *ce = seg->cands[seg->committed];
+ if (ce->nr_words == 0) {
+ anthy_add_unknown_word(&seg->str, &ce->str);
+ }
+ }
+}
+
+void
+anthy_do_commit_prediction(xstr *src, xstr *xs)
+{
+ if (anthy_select_section("PREDICTION", 1)) {
+ return ;
+ }
+ learn_prediction_str(src, xs);
+}
+
+void
+anthy_proc_commit(struct segment_list *sl,
+ struct splitter_context *sc)
+{
+ /* 各種の学習を行う */
+ learn_swapped_candidates(sl);
+ learn_resized_segment(sc, sl);
+ clear_resized_segment(sc, sl);
+ learn_ochaire(sc, sl);
+ learn_prediction(sl);
+ learn_unknown(sl);
+ anthy_learn_cand_history(sl);
+}