summaryrefslogtreecommitdiff
path: root/src-util/convdb.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-util/convdb.c')
-rw-r--r--src-util/convdb.c542
1 files changed, 542 insertions, 0 deletions
diff --git a/src-util/convdb.c b/src-util/convdb.c
new file mode 100644
index 0000000..3512403
--- /dev/null
+++ b/src-util/convdb.c
@@ -0,0 +1,542 @@
+/*
+ * 変換エンジンの内部情報を使うため、意図的に
+ * layer violationを放置している。
+ *
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <anthy/anthy.h>
+#include <anthy/convdb.h>
+#include <anthy/segment.h>
+#include <anthy/feature_set.h>
+/**/
+#include "../src-main/main.h"
+#include "../src-splitter/wordborder.h"
+#include "../src-worddic/dic_ent.h"
+
+
+/* Kind of word: independent-word part or dependent-word part */
+#define WORD_INDEP 0
+#define WORD_DEP 1
+
+/* A word: either an independent word or a dependent word */
+struct word {
+ /* One of the WORD_* constants */
+ int type;
+ /* Hash value of the word: fill_dep_word() stores the hash of the
+  * dependent word's string; fill_indep_word() stores 0, which
+  * fill_conv_info() replaces with the hash of the converted string
+  * when a dictionary entry is found. */
+ int hash;
+ /* Hash of the reading (yomi) string */
+ int yomi_hash;
+ /* String before conversion */
+ xstr *raw_xs;
+ /* String after conversion */
+ xstr *conv_xs;
+ /* Part of speech after conversion */
+ const char *wt;
+};
+
+/*
+ * Returns the candidate currently chosen for seg: the committed one when
+ * seg->committed is 0 or greater, otherwise the first candidate.
+ */
+static struct cand_ent *
+selected_candidate(struct seg_ent *seg)
+{
+  int nth = (seg->committed > -1) ? seg->committed : 0;
+
+  return seg->cands[nth];
+}
+
+/*
+ * Builds a textual summary of the conversion held by ac in res_buf.
+ *
+ * conv != 0: the text anthy_get_segment() yields for candidate index 0 of
+ *            every segment, concatenated without separators.
+ * conv == 0: the text for NTH_UNCONVERTED_CANDIDATE of every segment, with
+ *            a '|' before the first segment and after each segment.
+ *
+ * The capacity of res_buf is not passed in, so the caller has to supply a
+ * buffer large enough for the whole result.
+ */
+static void
+get_res(anthy_context_t ac, char *res_buf, int conv)
+{
+  struct anthy_conv_stat stat;
+  int nth_cand = conv ? 0 : NTH_UNCONVERTED_CANDIDATE;
+  int idx;
+
+  anthy_get_stat(ac, &stat);
+  /* Start with either an empty string or the leading separator */
+  strcpy(res_buf, conv ? "" : "|");
+  for (idx = 0; idx < stat.nr_segment; idx++) {
+    char seg_buf[1024];
+
+    anthy_get_segment(ac, idx, nth_cand, seg_buf, 1024);
+    strcat(res_buf, seg_buf);
+    if (!conv) {
+      strcat(res_buf, "|");
+    }
+  }
+}
+
+/* Returns nonzero when a and b are both NULL or are equal strings. */
+static int
+conv_res_str_equal(const char *a, const char *b)
+{
+  if (!a || !b) {
+    return a == b;
+  }
+  return strcmp(a, b) == 0;
+}
+
+/*
+ * Looks up the entry of db whose source string equals src and whose result
+ * string matches res.  res may be NULL, which only matches entries that
+ * have no result string.  When there is no such entry, a new one is
+ * appended to the tail of the list (check == CHK_UNKNOWN, no candidate
+ * data, used == 0) and returned.
+ *
+ * src and res are copied; the caller keeps ownership of its arguments.
+ * Running out of memory terminates the program, because callers use the
+ * returned pointer without checking it.
+ */
+static struct conv_res *
+do_find_conv_res(struct res_db *db, const char *src, const char *res)
+{
+  struct conv_res *cr;
+
+  for (cr = db->res_list.next; cr; cr = cr->next) {
+    /* A NULL result string on either side must never reach strcmp() */
+    if (conv_res_str_equal(cr->res_str, res) &&
+        !strcmp(cr->src_str, src)) {
+      return cr;
+    }
+  }
+  cr = malloc(sizeof(*cr));
+  if (!cr) {
+    fprintf(stderr, "convdb: out of memory\n");
+    exit(1);
+  }
+  cr->src_str = strdup(src);
+  cr->res_str = res ? strdup(res) : NULL;
+  if (!cr->src_str || (res && !cr->res_str)) {
+    fprintf(stderr, "convdb: out of memory\n");
+    exit(1);
+  }
+  cr->cand_str = NULL;
+  cr->check = CHK_UNKNOWN;
+  cr->used = 0;
+  cr->cand_check = NULL;
+  /* Append to the tail of the list */
+  cr->next = NULL;
+  db->tail->next = cr;
+  db->tail = cr;
+  return cr;
+}
+
+/*
+ * Renders the current conversion of ac as a string (get_res() decides the
+ * format from conv) and returns the db entry for the pair (src, rendered
+ * string), creating the entry when it does not exist yet.
+ */
+struct conv_res *
+find_conv_res(struct res_db *db, anthy_context_t ac,
+              const char *src, int conv)
+{
+  char rendered[1024];
+  struct conv_res *entry;
+
+  get_res(ac, rendered, conv);
+  entry = do_find_conv_res(db, src, rendered);
+  return entry;
+}
+
+/*
+ * Removes one trailing '\n' from buf, if there is one.
+ * An empty string is left untouched.
+ */
+static void
+chomp_line(char *buf)
+{
+  size_t len = strlen(buf);
+
+  /* With len == 0, buf[len - 1] would read in front of the buffer */
+  if (len > 0 && buf[len - 1] == '\n') {
+    buf[len - 1] = 0;
+  }
+}
+
+/*
+ * Allocates an empty result database: no entries, the tail pointing at the
+ * list head, and every counter set to zero.
+ * Returns NULL when the allocation fails.
+ */
+struct res_db *
+create_db(void)
+{
+  struct res_db *db = malloc(sizeof(*db));
+
+  if (!db) {
+    return NULL;
+  }
+  /* Empty list: the tail is the dummy head node */
+  db->res_list.next = NULL;
+  db->tail = &db->res_list;
+  /* Statistics */
+  db->total = 0;
+  db->res.unknown = 0;
+  db->res.ok = 0;
+  db->res.miss = 0;
+  db->res.dontcare = 0;
+  db->split.unknown = 0;
+  db->split.ok = 0;
+  db->split.miss = 0;
+  db->split.dontcare = 0;
+
+  return db;
+}
+
+/*
+ * Copies str into buf, dropping every '|' and '~' character, and
+ * NUL-terminates the result.  buf needs room for strlen(str) + 1 bytes.
+ */
+static void
+strip_separator_vbar(char *buf, const char *str)
+{
+  size_t in;
+  size_t out = 0;
+
+  for (in = 0; str[in] != 0; in++) {
+    char c = str[in];
+
+    if (c == '|' || c == '~') {
+      continue;
+    }
+    buf[out++] = c;
+  }
+  buf[out] = 0;
+}
+
+/*
+ * Parses one line of a result database file and records it in db.
+ *
+ * Lines that are empty or start with '#' are ignored.  A line holds up to
+ * three whitespace-separated fields:
+ *
+ *   one field only:
+ *     the field is the source string; the entry has no result string and
+ *     is marked CHK_UNKNOWN
+ *   first field does not start with '|':
+ *     plain-text  segmented-text
+ *     plain-text  segmented-text  check-or-candidates
+ *   first field starts with '|':
+ *     segmented-text  check-or-candidates
+ *     (the source string is the first field with '|' and '~' removed;
+ *      a third field is not looked at)
+ *
+ * A two-field line whose check field does not start with '|' is recorded
+ * as CHK_OK (in the plain-text layout the check field defaults to "?").
+ * Otherwise the first character of the check field decides:
+ *   'O' -> CHK_OK, 'X' -> CHK_MISS, '*' -> CHK_DONTCARE,
+ *   '|' -> CHK_UNKNOWN with a copy of the field stored in cand_str,
+ *   anything else -> CHK_UNKNOWN.
+ *
+ * line is modified in place (its trailing newline is removed).
+ */
+static void
+parse_line(struct res_db *db, char *line)
+{
+  char buf1[1024], buf2[1024], buf3[1024], buf4[1024];
+  const char *src, *res;
+  const char *check;
+  struct conv_res *cr;
+  int nr;
+
+  chomp_line(line);
+  if (line[0] == '#' || line[0] == 0) {
+    return ;
+  }
+  /* The field widths keep every token inside its 1024-byte buffer */
+  nr = sscanf(line, "%1023s %1023s %1023s", buf1, buf2, buf3);
+  if (nr == 1) {
+    cr = do_find_conv_res(db, buf1, NULL);
+    cr->check = CHK_UNKNOWN;
+    return ;
+  }
+  if (nr < 2) {
+    return ;
+  }
+  if (buf1[0] != '|') {
+    /* buf1 = plain text, buf2 = segmented text, buf3 = optional check */
+    src = buf1;
+    res = buf2;
+    check = (nr == 3) ? buf3 : "?";
+  } else {
+    /* buf1 = segmented text, buf2 = check; derive the plain text */
+    strip_separator_vbar(buf4, buf1);
+    src = buf4;
+    res = buf1;
+    check = buf2;
+  }
+  cr = do_find_conv_res(db, src, res);
+  if (nr == 2 && check[0] != '|') {
+    cr->check = CHK_OK;
+    return ;
+  }
+  switch (check[0]) {
+  case 'O':
+    cr->check = CHK_OK;
+    break;
+  case 'X':
+    cr->check = CHK_MISS;
+    break;
+  case '*':
+    cr->check = CHK_DONTCARE;
+    break;
+  case '|':
+    /* A candidate string instead of a verdict */
+    cr->check = CHK_UNKNOWN;
+    cr->cand_str = strdup(check);
+    break;
+  default:
+    cr->check = CHK_UNKNOWN;
+    break;
+  }
+}
+
+/*
+ * Reads the result database file fn line by line into db.
+ * Does nothing when fn is NULL or the file cannot be opened.
+ */
+void
+read_db(struct res_db *db, const char *fn)
+{
+  FILE *fp;
+  char line[1024];
+
+  if (!fn) {
+    return ;
+  }
+  fp = fopen(fn, "r");
+  if (!fp) {
+    return ;
+  }
+  while (fgets(line, sizeof(line), fp)) {
+    parse_line(db, line);
+  }
+  /* Release the stream once every line has been parsed */
+  fclose(fp);
+}
+
+/*
+ * Fills w->conv_xs, w->wt and w->hash from the dictionary entry that elm
+ * selects (elm->se->dic_ents[elm->nth]).
+ * When elm->nth is -1 or not below elm->se->nr_dic_ents, or when there is
+ * no dic_ents array, conv_xs and wt are set to NULL and hash is left as
+ * it was.
+ */
+static void
+fill_conv_info(struct word *w, struct cand_elm *elm)
+{
+  struct dic_ent *ent;
+  int usable = elm->nth != -1 &&
+               elm->nth < elm->se->nr_dic_ents &&
+               elm->se->dic_ents != NULL;
+
+  if (!usable) {
+    w->conv_xs = NULL;
+    w->wt = NULL;
+    return ;
+  }
+  ent = elm->se->dic_ents[elm->nth];
+  w->conv_xs = anthy_xstr_dup(&ent->str);
+  w->wt = ent->wt_name;
+  w->hash = anthy_xstr_hash(w->conv_xs);
+}
+
+/*
+ * Sets the type of w and clears its string and part-of-speech pointers.
+ * The hash fields are not touched.
+ */
+static void
+init_word(struct word *w, int type)
+{
+  w->wt = NULL;
+  w->conv_xs = NULL;
+  w->raw_xs = NULL;
+  w->type = type;
+}
+
+/* Hands both strings of w to anthy_free_xstr(); w itself is not freed */
+static void
+free_word(struct word *w)
+{
+  anthy_free_xstr(w->conv_xs);
+  anthy_free_xstr(w->raw_xs);
+}
+
+/* Builds an independent word from elm */
+static void
+fill_indep_word(struct word *w, struct cand_elm *elm)
+{
+  init_word(w, WORD_INDEP);
+  /* Default hash; fill_conv_info() replaces it when it finds an entry */
+  w->hash = 0;
+  /* Keep a copy of the reading before conversion, and its hash */
+  w->raw_xs = anthy_xstr_dup(&elm->str);
+  w->yomi_hash = anthy_xstr_hash(w->raw_xs);
+  /* Converted string and part of speech */
+  fill_conv_info(w, elm);
+}
+
+/*
+ * Builds a dependent word from elm.  Both hash fields get the hash of the
+ * element's string; no converted string or part of speech is set.
+ */
+static void
+fill_dep_word(struct word *w, struct cand_elm *elm)
+{
+  int str_hash = anthy_xstr_hash(&elm->str);
+
+  init_word(w, WORD_DEP);
+  w->raw_xs = anthy_xstr_dup(&elm->str);
+  w->yomi_hash = str_hash;
+  w->hash = str_hash;
+}
+
+/*
+ * Prints " features=" followed by the comma-separated values of fl.
+ * Prints nothing when fl is NULL or holds no features.
+ */
+static void
+print_features(struct feature_list *fl)
+{
+  int idx = 0;
+  int count = fl ? anthy_feature_list_nr(fl) : 0;
+
+  if (count == 0) {
+    return ;
+  }
+  printf(" features=");
+  while (idx < count) {
+    /* A comma goes in front of every value but the first */
+    if (idx != 0) {
+      printf(",");
+    }
+    printf("%d", anthy_feature_list_nth(fl, idx));
+    idx++;
+  }
+}
+
+/*
+ * Prints one line describing w, starting with prefix.
+ *
+ * Dependent word:
+ *   <prefix>dep_word hash=<hash> <raw string>
+ * Independent word:
+ *   <prefix>indep_word hash=<hash>[ features=...] <part of speech>
+ *   <converted string> <raw string>
+ *   (all on one line; "null" stands in for a missing part of speech or
+ *   converted string, and the features are printed only when fl is given)
+ */
+static void
+print_word(const char *prefix, struct word *w, struct feature_list *fl)
+{
+  printf("%s", prefix);
+  if (w->type != WORD_DEP) {
+    /* Independent word */
+    printf("indep_word hash=%d", w->hash);
+    if (fl) {
+      print_features(fl);
+    }
+    /* Part of speech */
+    if (!w->wt) {
+      printf(" null");
+    } else {
+      printf(" %s", w->wt);
+    }
+    /* Converted string */
+    if (!w->conv_xs) {
+      printf(" null");
+    } else {
+      printf(" ");
+      anthy_putxstr(w->conv_xs);
+    }
+    printf(" ");
+    anthy_putxstrln(w->raw_xs);
+  } else {
+    /* Dependent word */
+    printf("dep_word hash=%d ", w->hash);
+    anthy_putxstrln(w->raw_xs);
+  }
+}
+
+/**
+ * Returns the segment class of seg.
+ * When seg is NULL, cl is returned instead.  When the selected candidate
+ * of seg has no mw, SEG_BUNSETSU is returned.
+ */
+static int
+get_seg_class(struct seg_ent *seg, int cl)
+{
+  struct cand_ent *cand;
+
+  if (seg == NULL) {
+    return cl;
+  }
+  cand = selected_candidate(seg);
+  return cand->mw ? cand->mw->seg_class : SEG_BUNSETSU;
+}
+
+/*
+ * Fills fl with the features of cur_seg in the context of prev_seg and
+ * sorts the list.
+ * A NULL prev_seg is given the class SEG_HEAD and a NULL cur_seg the class
+ * SEG_TAIL; for a NULL cur_seg only the current-class and class-transition
+ * features are set.
+ */
+static void
+set_features(struct feature_list *fl,
+             struct seg_ent *prev_seg,
+             struct seg_ent *cur_seg)
+{
+  int cur_class = get_seg_class(cur_seg, SEG_TAIL);
+  int prev_class = get_seg_class(prev_seg, SEG_HEAD);
+
+  anthy_feature_list_set_cur_class(fl, cur_class);
+  if (cur_seg != NULL) {
+    struct cand_ent *cand = selected_candidate(cur_seg);
+
+    anthy_feature_list_set_dep_word(fl, cand->dep_word_hash);
+    if (cand->mw != NULL) {
+      anthy_feature_list_set_dep_class(fl, cand->mw->dep_class);
+      anthy_feature_list_set_mw_features(fl, cand->mw->mw_features);
+      anthy_feature_list_set_noun_cos(fl, cand->mw->core_wt);
+    }
+  }
+  anthy_feature_list_set_class_trans(fl, prev_class, cur_class);
+  /* Sort once every feature has been added */
+  anthy_feature_list_sort(fl);
+}
+
+/*
+ * Prints one element of a candidate as a word line.
+ * Elements with an empty string are skipped.  An element whose id is not
+ * -1 is printed as an independent word together with fl; any other element
+ * is printed as a dependent word without features.
+ */
+static void
+print_element(const char *prefix,
+              struct cand_elm *elm, struct feature_list *fl)
+{
+  struct word w;
+  int is_indep = (elm->id != -1);
+
+  if (elm->str.len == 0) {
+    return ;
+  }
+  if (is_indep) {
+    fill_indep_word(&w, elm);
+  } else {
+    fill_dep_word(&w, elm);
+  }
+  print_word(prefix, &w, is_indep ? fl : NULL);
+  free_word(&w);
+}
+
+/* Prints "unknown " and then the string of ce through anthy_putxstrln() */
+static void
+print_unconverted(struct cand_ent *ce)
+{
+  fputs("unknown ", stdout);
+  anthy_putxstrln(&ce->str);
+}
+
+/*
+ * Prints the end-of-sentence line: "eos " followed by the features that
+ * set_features() computes for prev_seg with no current segment.
+ */
+static void
+print_eos(struct seg_ent *prev_seg)
+{
+  struct feature_list eos_fl;
+
+  anthy_feature_list_init(&eos_fl);
+  set_features(&eos_fl, prev_seg, NULL);
+  fputs("eos ", stdout);
+  print_features(&eos_fl);
+  putchar('\n');
+  anthy_feature_list_free(&eos_fl);
+}
+
+/*
+ * Returns the marker printed in front of a word line for the CONV_* bits
+ * set in flag:
+ *   "^"  CONV_INVALID   (second and later independent word of a segment)
+ *   "!"  CONV_SIZE_MISS (segment length miss)
+ *   "~"  CONV_CAND_MISS (candidate miss)
+ *   ""   none of the above
+ * When several bits are set, the first match in this order wins.
+ */
+static const char *
+get_prefix(int flag)
+{
+  return (flag & CONV_INVALID)   ? "^"
+       : (flag & CONV_SIZE_MISS) ? "!"
+       : (flag & CONV_CAND_MISS) ? "~"
+       : "";
+}
+
+/*
+ * Prints every element of the selected candidate of seg, one word per
+ * line.  is_negative holds the CONV_* flags that pick the line prefix; the
+ * second and later independent words of the segment (elements whose id is
+ * not -1) additionally get CONV_INVALID.  The features printed with
+ * independent words are computed from prev_seg and seg.
+ */
+static void
+print_segment_info(int is_negative,
+                   struct seg_ent *prev_seg,
+                   struct seg_ent *seg)
+{
+  struct cand_ent *cand = selected_candidate(seg);
+  struct feature_list fl;
+  int indep_seen = 0;
+  int n;
+
+  anthy_feature_list_init(&fl);
+  set_features(&fl, prev_seg, seg);
+  for (n = 0; n < cand->nr_words; n++) {
+    struct cand_elm *elm = &cand->elm[n];
+    int is_indep = (elm->id != -1);
+    int flags = is_negative;
+
+    if (is_indep && indep_seen > 0) {
+      flags |= CONV_INVALID;
+    }
+    /* Print the element */
+    print_element(get_prefix(flags), elm, &fl);
+    /* Count the independent words seen so far */
+    if (is_indep) {
+      indep_seen++;
+    }
+  }
+  anthy_feature_list_free(&fl);
+}
+
+/*
+ * Prints segment nth of ac with its lines flagged CONV_SIZE_MISS.
+ * The preceding segment, when there is one, supplies the feature context.
+ */
+void
+print_size_miss_segment_info(anthy_context_t ac, int nth)
+{
+  struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
+  struct seg_ent *prev_seg =
+    (nth > 0) ? anthy_get_nth_segment(&ac->seg_list, nth - 1) : NULL;
+
+  print_segment_info(CONV_SIZE_MISS, prev_seg, seg);
+}
+
+/*
+ * Prints segment nth of ac with its lines flagged CONV_CAND_MISS.
+ * The preceding segment, when there is one, supplies the feature context.
+ */
+void
+print_cand_miss_segment_info(anthy_context_t ac, int nth)
+{
+  struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
+  struct seg_ent *prev_seg =
+    (nth > 0) ? anthy_get_nth_segment(&ac->seg_list, nth - 1) : NULL;
+
+  print_segment_info(CONV_CAND_MISS, prev_seg, seg);
+}
+
+/*
+ * Prints the whole conversion held by ac: a "segments: N" line, the words
+ * of every segment, an "eos" line and a blank line.
+ *
+ * cr may be NULL.  When cr->cand_check is set and its entry for a segment
+ * is nonzero, that segment is printed with the CONV_CAND_MISS flag.
+ * For a segment whose committed candidate index is above 0, candidate 0 is
+ * printed first as a candidate miss, then the committed candidate.
+ */
+void
+print_context_info(anthy_context_t ac, struct conv_res *cr)
+{
+  struct seg_ent *prev_seg = NULL;
+  int idx;
+
+  printf("segments: %d\n", ac->seg_list.nr_segments);
+  /* For each segment */
+  for (idx = 0; idx < ac->seg_list.nr_segments; idx++) {
+    struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, idx);
+    struct cand_ent *cand = selected_candidate(seg);
+    int flags = 0;
+
+    if (cr && cr->cand_check && cr->cand_check[idx]) {
+      flags = CONV_CAND_MISS;
+    }
+
+    if (cand->nr_words == 0) {
+      /* A candidate without elements is printed as it is */
+      print_unconverted(cand);
+      prev_seg = seg;
+      continue;
+    }
+    if (seg->committed > 0) {
+      /* The candidate was changed: show candidate 0 as the miss, then
+       * restore the committed index */
+      int saved = seg->committed;
+
+      seg->committed = 0;
+      print_cand_miss_segment_info(ac, idx);
+      seg->committed = saved;
+    }
+    /* Print the composition of the segment */
+    print_segment_info(flags, prev_seg, seg);
+    prev_seg = seg;
+  }
+  print_eos(prev_seg);
+  printf("\n");
+}