src-ordering/candhistory.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207

/*
 * 候補の履歴を覚える
 *
 *
 * ある読みの履歴が 候補A 候補B 候補A 候補A 候補A
 * であったというような情報をもとに候補のスコアを加点する。
 *
 * Copyright (C) 2006-2007 TABATA Yusuke
 *
 */
#include <stdlib.h>

#include <anthy/segment.h>
#include <anthy/record.h>
#include "sorter.h"

#define HISTORY_DEPTH 8
#define MAX_HISTORY_ENTRY 200

/** 文節のコミットを履歴に追加する */
static void
learn_cand_history(struct seg_ent *seg)
{
  int nr, i;

  if (anthy_select_section("CAND_HISTORY", 1)) {
    return ;
  }
  if (anthy_select_row(&seg->str, 1)) {
    return ;
  }
  /* シフトする */
  nr = anthy_get_nr_values();
  nr ++;
  if (nr > HISTORY_DEPTH) {
    nr = HISTORY_DEPTH;
  }
  for (i = nr - 1; i > 0; i--) {
    xstr *xs = anthy_get_nth_xstr(i - 1);
    anthy_set_nth_xstr(i, xs);
  }
  /* 0番目に設定 */
  anthy_set_nth_xstr(0, &seg->cands[seg->committed]->str);
  anthy_mark_row_used();
}

static void
learn_suffix_history(struct seg_ent *seg)
{
  int i;
  struct cand_ent *cand = seg->cands[seg->committed];
  if (anthy_select_section("SUFFIX_HISTORY", 1)) {
    return ;
  }
  for (i = 0; i < cand->nr_words; i++) {
    struct cand_elm *elm = &cand->elm[i];
    xstr xs;
    if (elm->nth == -1) {
      continue;
    }
    if (anthy_wtype_get_pos(elm->wt) != POS_SUC) {
      continue;
    }
    if (anthy_select_row(&elm->str, 1)) {
      continue;
    }
    if (anthy_get_nth_dic_ent_str(elm->se, &elm->str, elm->nth, &xs)) {
      continue;
    }
    anthy_set_nth_xstr(0, &xs);
    free(xs.str);
  }
}

/** 外から呼ばれる関数 
 * 履歴に追加する */
void
anthy_learn_cand_history(struct segment_list *sl)
{
  int i, nr = 0;
  for (i = 0; i < sl->nr_segments; i++) {
    struct seg_ent *seg = anthy_get_nth_segment(sl, i);
    xstr *xs = &seg->str;
    if (seg->committed < 0) {
      continue;
    }
    if (anthy_select_row(xs, 0)) {
      if (seg->committed == 0) {
	/* 候補のエントリが無くて、コミットされた候補も先頭のものであればパス */
	continue;
      }
    }
    /**/
    learn_cand_history(seg);
    learn_suffix_history(seg);
    nr ++;
  }
  if (nr > 0) {
    if (!anthy_select_section("CAND_HISTORY", 1)) {
      anthy_truncate_section(MAX_HISTORY_ENTRY);
    }
    if (!anthy_select_section("SUFFIX_HISTORY", 1)) {
      anthy_truncate_section(MAX_HISTORY_ENTRY);
    }
  }
}

/* 履歴をみて候補の重みを計算する */
static int
get_history_weight(xstr *xs)
{
  int i, nr = anthy_get_nr_values();
  int w = 0;
  for (i = 0; i < nr; i++) {
    xstr *h = anthy_get_nth_xstr(i);
    if (!h) {
      continue;
    }
    if (!anthy_xstrcmp(xs, h)) {
      w++;
      if (i == 0) {
	/* 直前に確定されたものには高いスコア*/
	w += (HISTORY_DEPTH / 2);
      }
    }
  }
  return w;
}

static void
reorder_by_candidate(struct seg_ent *se)
{
  int i, primary_score;
  /**/
  if (anthy_select_section("CAND_HISTORY", 1)) {
    return ;
  }
  if (anthy_select_row(&se->str, 0)) {
    return ;
  }
  /* 最も評価の高い候補 */
  primary_score = se->cands[0]->score;
  /**/
  for (i = 0; i < se->nr_cands; i++) {
    struct cand_ent *ce = se->cands[i];
    int weight = get_history_weight(&ce->str);
    ce->score += primary_score / (HISTORY_DEPTH /2) * weight;
  }
  anthy_mark_row_used();
}

/* 接尾辞の学習を適用する */
static void
reorder_by_suffix(struct seg_ent *se)
{
  int i, j;
  int delta = 0;
  int top_cand = -1;
  if (anthy_select_section("SUFFIX_HISTORY", 0)) {
    return ;
  }
  /* 各候補 */
  for (i = 0; i < se->nr_cands; i++) {
    struct cand_ent *ce = se->cands[i];
    /* 候補を構成する各単語 */
    for (j = 0; j < ce->nr_words; j++) {
      struct cand_elm *elm = &ce->elm[j];
      xstr xs;
      if (elm->nth == -1) {
	continue;
      }
      if (anthy_wtype_get_pos(elm->wt) != POS_SUC) {
	continue;
      }
      /* 変換元の文字列をキーに検索 */
      if (anthy_select_row(&elm->str, 0)) {
	continue;
      }
      /* 変換後の文字列を取得 */
      if (anthy_get_nth_dic_ent_str(elm->se, &elm->str, elm->nth, &xs)) {
	continue;
      }
      /* 履歴中の文字列と比較する */
      if (anthy_xstrcmp(&xs, anthy_get_nth_xstr(0))) {
	free(xs.str);
	continue;
      }
      /**/
      if (top_cand < 0) {
	top_cand = i;
      }
      if (delta == 0) {
	delta = (se->cands[top_cand]->score - ce->score) + 1;
      }
      ce->score += delta;
      free(xs.str);
    }
  }
}

/* 履歴で加点する */
void
anthy_reorder_candidates_by_history(struct seg_ent *se)
{
  reorder_by_candidate(se);
  reorder_by_suffix(se);
}