1 files changed, 542 insertions, 0 deletions
diff --git a/src-util/convdb.c b/src-util/convdb.c
new file mode 100644
index 0000000..3512403
--- /dev/null
+++ b/src-util/convdb.c
@@ -0,0 +1,542 @@
+/*
+ * 変換エンジンの内部情報を使うため、意図的に
+ * layer violationを放置している。
+ *
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <anthy/anthy.h>
+#include <anthy/convdb.h>
+#include <anthy/segment.h>
+#include <anthy/feature_set.h>
+/**/
+#include "../src-main/main.h"
+#include "../src-splitter/wordborder.h"
+#include "../src-worddic/dic_ent.h"
+
+
+/* 自立語部か付属語部か */
+#define WORD_INDEP 0
+#define WORD_DEP 1
+
+/* 単語(自立語or付属語) */
+struct word {
+  /* WORD_* */
+  int type;
+  /* 付属語のhash(WORD_INDEP)もしくは変換後の文字列のhash(WORD_DEP) */
+  int hash;
+  /* 読みの文字列のhash */
+  int yomi_hash;
+  /* 変換前の文字列 */
+  xstr *raw_xs;
+  /* 変換後の文字列 */
+  xstr *conv_xs;
+  /* 変換後の品詞 */
+  const char *wt;
+};
+
+static struct cand_ent *
+selected_candidate(struct seg_ent *seg)
+{
+  if (seg->committed > -1) {
+    return seg->cands[seg->committed];
+  }
+  return seg->cands[0];
+}
+
+static void
+get_res(anthy_context_t ac, char *res_buf, int conv)
+{
+  struct anthy_conv_stat acs;
+  int i;
+
+  anthy_get_stat(ac, &acs);
+  res_buf[0] = 0;
+  if (!conv) {
+    strcat(res_buf, "|");
+  }
+  for (i = 0; i < acs.nr_segment; i++) {
+    char buf[1024];
+    if (conv) {
+      anthy_get_segment(ac, i, 0, buf, 1024);
+      strcat(res_buf, buf);
+    } else {
+      anthy_get_segment(ac, i, NTH_UNCONVERTED_CANDIDATE, buf, 1024);
+      strcat(res_buf, buf);
+      strcat(res_buf, "|");
+    }
+  }
+}
+
+static struct conv_res *
+do_find_conv_res(struct res_db *db, const char *src, const char *res)
+{
+  struct conv_res *cr;
+
+  for (cr = db->res_list.next; cr; cr = cr->next) {
+    if (((!cr->res_str && !res) ||
+	 !strcmp(cr->res_str, res)) &&
+	!strcmp(cr->src_str, src)) {
+      return cr;
+    }
+  }
+  cr = (struct conv_res *)malloc(sizeof(struct conv_res));
+  cr->src_str = strdup(src);
+  if (res) {
+    cr->res_str = strdup(res);
+  } else {
+    cr->res_str = NULL;
+  }
+  cr->cand_str = NULL;
+  cr->check = CHK_UNKNOWN;
+  cr->used = 0;
+  cr->cand_check = NULL;
+  /**/
+  db->tail->next = cr;
+  cr->next = NULL;
+  db->tail = cr;
+  return cr;
+}
+
+struct conv_res *
+find_conv_res(struct res_db *db, anthy_context_t ac,
+	      const char *src, int conv)
+{
+  char res_buf[1024];
+  get_res(ac, res_buf, conv);
+
+  return do_find_conv_res(db, src, res_buf);
+}
+
+static void
+chomp_line(char *buf)
+{
+  int len = strlen(buf);
+  if (buf[len-1] == '\n') {
+    buf[len-1] = 0;
+  }
+}
+
+struct res_db *
+create_db(void)
+{
+  struct res_db *db;
+
+  db = malloc(sizeof(struct res_db));
+  db->res_list.next = NULL;
+  db->tail = &db->res_list;
+  db->total = 0;
+  db->res.unknown = 0;
+  db->res.ok = 0;
+  db->res.miss = 0;
+  db->res.dontcare = 0;
+  db->split.unknown = 0;
+  db->split.ok = 0;
+  db->split.miss = 0;
+  db->split.dontcare = 0;
+
+  return db;
+}
+
+static void
+strip_separator_vbar(char *buf, const char *str)
+{
+  const char *src = str;
+  char *dst = buf;
+  while (*src) {
+    if (*src != '|' && *src != '~') {
+      *dst = *src;
+      dst ++;
+    }
+    src ++;
+  }
+  *dst = 0;
+}
+
+static void
+parse_line(struct res_db *db, char *line)
+{
+  char buf1[1024], buf2[1024], buf3[1024], buf4[1024];
+  char *src, *res;
+  const char *check;
+  struct conv_res *cr;
+  int nr;
+  chomp_line(line);
+  if (line[0] == '#' || line[0] == 0) {
+    return ;
+  }
+  nr = sscanf(line, "%s %s %s", buf1, buf2, buf3);
+  if (nr == 1) {
+    cr = do_find_conv_res(db, buf1, NULL);
+    cr->check = CHK_UNKNOWN;
+    return ;
+  }
+  if (nr < 2) {
+    return ;
+  }
+  if (buf1[0] != '|') {
+    /* buf1 buf2    buf3
+     * 平文 区切り文
+     * 平文 区切り文 変換後
+     * 平文 区切り文 check
+     */
+    src = buf1;
+    res = buf2;
+    if (nr == 3) {
+      check = buf3;
+    } else {
+      check = "?";
+    }
+  } else {
+    /* buf1    buf2  (buf3)
+     * 区切り文
+     * 区切り文 変換後
+     * 区切り文 check
+     */
+    strip_separator_vbar(buf4, buf1);
+    src = buf4;
+    res = buf1;
+    check = buf2;
+  }
+  cr = do_find_conv_res(db, src, res);
+  if (nr == 2 && check[0] != '|') {
+    cr->check = CHK_OK;
+    return ;
+  }
+  if (check[0] == 'O') {
+    cr->check = CHK_OK;
+  } else if (check[0] == 'X') {
+    cr->check = CHK_MISS;
+  } else if (check[0] == '*') {
+    cr->check = CHK_DONTCARE;
+  } else if (check[0] == '|') {
+    cr->check = CHK_UNKNOWN;
+    cr->cand_str = strdup(check);
+  } else {
+    cr->check = CHK_UNKNOWN;
+  }
+}
+
+void
+read_db(struct res_db *db, const char *fn)
+{
+  FILE *fp;
+  char line[1024];
+
+  if (!fn) {
+    return ;
+  }
+  fp = fopen(fn, "r");
+  if (!fp) {
+    return ;
+  }
+  while (fgets(line, 1024, fp)) {
+    parse_line(db, line);
+  }
+}
+
+static void
+fill_conv_info(struct word *w, struct cand_elm *elm)
+{
+  /*w->conv_xs, w->wt*/
+  struct dic_ent *de;
+  if (elm->nth == -1 ||
+      elm->nth >= elm->se->nr_dic_ents) {
+    w->conv_xs = NULL;
+    w->wt = NULL;
+    return ;
+  }
+  if (!elm->se->dic_ents) {
+    w->conv_xs = NULL;
+    w->wt = NULL;
+    return ;
+  }
+  /**/
+  de = elm->se->dic_ents[elm->nth];
+  w->conv_xs = anthy_xstr_dup(&de->str);
+  w->wt = de->wt_name;
+  w->hash = anthy_xstr_hash(w->conv_xs);
+}
+
+static void
+init_word(struct word *w, int type)
+{
+  w->type = type;
+  w->raw_xs = NULL;
+  w->conv_xs = NULL;
+  w->wt = NULL;
+}
+
+static void
+free_word(struct word *w)
+{
+  anthy_free_xstr(w->raw_xs);
+  anthy_free_xstr(w->conv_xs);
+}
+
+/* 自立語を作る */
+static void
+fill_indep_word(struct word *w, struct cand_elm *elm)
+{
+  init_word(w, WORD_INDEP);
+  /* 変換前の読みを取得する */
+  w->raw_xs = anthy_xstr_dup(&elm->str);
+  w->yomi_hash = anthy_xstr_hash(w->raw_xs);
+  w->hash = 0;
+  /**/
+  fill_conv_info(w, elm);
+}
+
+/* 付属語を作る */
+static void
+fill_dep_word(struct word *w, struct cand_elm *elm)
+{
+  init_word(w, WORD_DEP);
+  /**/
+  w->hash = anthy_xstr_hash(&elm->str);
+  w->yomi_hash = w->hash;
+  w->raw_xs = anthy_xstr_dup(&elm->str);
+}
+
+static void
+print_features(struct feature_list *fl)
+{
+  int i, nr;
+  if (!fl) {
+    return ;
+  }
+  nr = anthy_feature_list_nr(fl);
+  if (nr == 0) {
+    return ;
+  }
+  printf(" features=");
+  for (i = 0; i < nr; i++) {
+    if (i > 0) {
+      printf(",");
+    }
+    printf("%d", anthy_feature_list_nth(fl, i));
+  }
+}
+
+static void
+print_word(const char *prefix, struct word *w, struct feature_list *fl)
+{
+  printf("%s", prefix);
+  if (w->type == WORD_DEP) {
+    /* 付属語 */
+    printf("dep_word hash=%d ", w->hash);
+    anthy_putxstrln(w->raw_xs);
+    return ;
+  }
+  /* 自立語 */
+  printf("indep_word hash=%d", w->hash);
+  /**/
+  if (fl) {
+    print_features(fl);
+  }
+  /* 品詞 */
+  if (w->wt) {
+    printf(" %s", w->wt);
+  } else {
+    printf(" null");
+  }
+  /* 文字列 */
+  if (w->conv_xs) {
+    printf(" ");
+    anthy_putxstr(w->conv_xs);
+  } else {
+    printf(" null");
+  }
+  printf(" ");
+  anthy_putxstrln(w->raw_xs);
+}
+
+/** segの文節クラスを返す
+ * segがnullであれば、clをクラスとする
+ */
+static int
+get_seg_class(struct seg_ent *seg, int cl)
+{
+  struct cand_ent *ce;
+  if (!seg) {
+    return cl;
+  }
+  ce = selected_candidate(seg);
+  if (ce->mw) {
+    return ce->mw->seg_class;
+  }
+  return SEG_BUNSETSU;
+}
+
+static void
+set_features(struct feature_list *fl,
+	     struct seg_ent *prev_seg,
+	     struct seg_ent *cur_seg)
+{
+  int cl, pc;
+  cl = get_seg_class(cur_seg, SEG_TAIL);
+  pc = get_seg_class(prev_seg, SEG_HEAD);
+
+  anthy_feature_list_set_cur_class(fl, cl);
+  if (cur_seg) {
+    struct cand_ent *ce =  selected_candidate(cur_seg);
+    anthy_feature_list_set_dep_word(fl, ce->dep_word_hash);
+    if (ce->mw) {
+      anthy_feature_list_set_dep_class(fl, ce->mw->dep_class);
+      anthy_feature_list_set_mw_features(fl, ce->mw->mw_features);
+      anthy_feature_list_set_noun_cos(fl, ce->mw->core_wt);
+    }
+  }
+  anthy_feature_list_set_class_trans(fl, pc, cl);
+  /**/
+  anthy_feature_list_sort(fl);
+}
+
+static void
+print_element(const char *prefix,
+	      struct cand_elm *elm, struct feature_list *fl)
+{
+  struct word w;
+
+  if (elm->str.len == 0) {
+    return ;
+  }
+  if (elm->id != -1) {
+    /* 自立語 */
+    fill_indep_word(&w, elm);
+    print_word(prefix, &w, fl);
+  } else {
+    /* 付属語 */
+    fill_dep_word(&w, elm);
+    print_word(prefix, &w, NULL);
+  }
+  free_word(&w);
+}
+
+static void
+print_unconverted(struct cand_ent *ce)
+{
+  printf("unknown ");
+  anthy_putxstrln(&ce->str);
+}
+
+static void
+print_eos(struct seg_ent *prev_seg)
+{
+  struct feature_list fl;
+  anthy_feature_list_init(&fl);
+  set_features(&fl, prev_seg, NULL);
+  printf("eos ");
+  print_features(&fl);
+  printf("\n");
+  anthy_feature_list_free(&fl);
+}
+
+/* 候補のミスには '~'、文節長のミスには '!'を付ける
+ * 同じ文節内の二つめ以降の自立語には '^'を付ける
+ */
+static const char *
+get_prefix(int flag)
+{
+  if (flag & CONV_INVALID) {
+    return "^";
+  }
+  if (flag & CONV_SIZE_MISS) {
+    return "!";
+  }
+  if (flag & CONV_CAND_MISS) {
+    return "~";
+  }
+  return "";
+}
+
+static void
+print_segment_info(int is_negative,
+		   struct seg_ent *prev_seg,
+		   struct seg_ent *seg)
+{
+  int i;
+  struct feature_list fl;
+  struct cand_ent *ce =  selected_candidate(seg);
+  int nr_indep = 0;
+  const char *prefix = get_prefix(is_negative);
+
+  anthy_feature_list_init(&fl);
+  set_features(&fl, prev_seg, seg);
+  for (i = 0; i < ce->nr_words; i++) {
+    struct cand_elm *elm = &ce->elm[i];
+    prefix = get_prefix(is_negative);
+    if (nr_indep > 0 && elm->id != -1) {
+      prefix = get_prefix(is_negative | CONV_INVALID);
+    }
+    /* 出力する */
+    print_element(prefix, elm, &fl);
+    /* 自立語を数える */
+    if (elm->id != -1) {
+      nr_indep ++;
+    }
+  }
+  anthy_feature_list_free(&fl);
+}
+
+void
+print_size_miss_segment_info(anthy_context_t ac, int nth)
+{
+  struct seg_ent *prev_seg = NULL;
+  struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
+  if (nth > 0) {
+    prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1);
+  }
+  print_segment_info(CONV_SIZE_MISS, prev_seg, seg);
+}
+
+void
+print_cand_miss_segment_info(anthy_context_t ac, int nth)
+{
+  struct seg_ent *prev_seg = NULL;
+  struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
+  if (nth > 0) {
+    prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1);
+  }
+  print_segment_info(CONV_CAND_MISS, prev_seg, seg);
+}
+
+void
+print_context_info(anthy_context_t ac, struct conv_res *cr)
+{
+  int i;
+  struct seg_ent *prev_seg = NULL;
+
+  printf("segments: %d\n", ac->seg_list.nr_segments);
+  /* 各文節に対して */
+  for (i = 0; i < ac->seg_list.nr_segments; i++) {
+    struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, i);
+    struct cand_ent *ce = selected_candidate(seg);
+    int is_negative = 0;
+    if (cr && cr->cand_check && cr->cand_check[i]) {
+      is_negative = CONV_CAND_MISS;
+    }
+
+    /* 各要素に対して */
+    if (!ce->nr_words) {
+      /* 要素が無いものはそのまま表示 */
+      print_unconverted(ce);
+    } else {
+      /* 候補の変更があった場合はそれを表示 */
+      if (seg->committed > 0) {
+	int tmp = seg->committed;
+	seg->committed = 0;
+	print_cand_miss_segment_info(ac, i);
+	seg->committed = tmp;
+      }
+      /* 文節の構成を表示 */
+      print_segment_info(is_negative, prev_seg, seg);
+    }
+    /**/
+    prev_seg = seg;
+  }
+  print_eos(prev_seg);
+  printf("\n");
+}