diff options
Diffstat (limited to 'src-util/convdb.c')
-rw-r--r-- | src-util/convdb.c | 542 |
1 files changed, 542 insertions, 0 deletions
diff --git a/src-util/convdb.c b/src-util/convdb.c new file mode 100644 index 0000000..3512403 --- /dev/null +++ b/src-util/convdb.c @@ -0,0 +1,542 @@ +/* + * 変換エンジンの内部情報を使うため、意図的に + * layer violationを放置している。 + * + */ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include <anthy/anthy.h> +#include <anthy/convdb.h> +#include <anthy/segment.h> +#include <anthy/feature_set.h> +/**/ +#include "../src-main/main.h" +#include "../src-splitter/wordborder.h" +#include "../src-worddic/dic_ent.h" + + +/* 自立語部か付属語部か */ +#define WORD_INDEP 0 +#define WORD_DEP 1 + +/* 単語(自立語or付属語) */ +struct word { + /* WORD_* */ + int type; + /* 付属語のhash(WORD_INDEP)もしくは変換後の文字列のhash(WORD_DEP) */ + int hash; + /* 読みの文字列のhash */ + int yomi_hash; + /* 変換前の文字列 */ + xstr *raw_xs; + /* 変換後の文字列 */ + xstr *conv_xs; + /* 変換後の品詞 */ + const char *wt; +}; + +static struct cand_ent * +selected_candidate(struct seg_ent *seg) +{ + if (seg->committed > -1) { + return seg->cands[seg->committed]; + } + return seg->cands[0]; +} + +static void +get_res(anthy_context_t ac, char *res_buf, int conv) +{ + struct anthy_conv_stat acs; + int i; + + anthy_get_stat(ac, &acs); + res_buf[0] = 0; + if (!conv) { + strcat(res_buf, "|"); + } + for (i = 0; i < acs.nr_segment; i++) { + char buf[1024]; + if (conv) { + anthy_get_segment(ac, i, 0, buf, 1024); + strcat(res_buf, buf); + } else { + anthy_get_segment(ac, i, NTH_UNCONVERTED_CANDIDATE, buf, 1024); + strcat(res_buf, buf); + strcat(res_buf, "|"); + } + } +} + +static struct conv_res * +do_find_conv_res(struct res_db *db, const char *src, const char *res) +{ + struct conv_res *cr; + + for (cr = db->res_list.next; cr; cr = cr->next) { + if (((!cr->res_str && !res) || + !strcmp(cr->res_str, res)) && + !strcmp(cr->src_str, src)) { + return cr; + } + } + cr = (struct conv_res *)malloc(sizeof(struct conv_res)); + cr->src_str = strdup(src); + if (res) { + cr->res_str = strdup(res); + } else { + cr->res_str = NULL; + } + cr->cand_str = NULL; + cr->check = CHK_UNKNOWN; + cr->used = 0; + cr->cand_check = NULL; + /**/ + db->tail->next = cr; + cr->next = NULL; + db->tail = cr; + return cr; +} + +struct conv_res * +find_conv_res(struct res_db *db, anthy_context_t ac, + const char *src, int conv) +{ + char res_buf[1024]; + get_res(ac, res_buf, conv); + + return do_find_conv_res(db, src, res_buf); +} + +static void +chomp_line(char *buf) +{ + int len = strlen(buf); + if (buf[len-1] == '\n') { + buf[len-1] = 0; + } +} + +struct res_db * +create_db(void) +{ + struct res_db *db; + + db = malloc(sizeof(struct res_db)); + db->res_list.next = NULL; + db->tail = &db->res_list; + db->total = 0; + db->res.unknown = 0; + db->res.ok = 0; + db->res.miss = 0; + db->res.dontcare = 0; + db->split.unknown = 0; + db->split.ok = 0; + db->split.miss = 0; + db->split.dontcare = 0; + + return db; +} + +static void +strip_separator_vbar(char *buf, const char *str) +{ + const char *src = str; + char *dst = buf; + while (*src) { + if (*src != '|' && *src != '~') { + *dst = *src; + dst ++; + } + src ++; + } + *dst = 0; +} + +static void +parse_line(struct res_db *db, char *line) +{ + char buf1[1024], buf2[1024], buf3[1024], buf4[1024]; + char *src, *res; + const char *check; + struct conv_res *cr; + int nr; + chomp_line(line); + if (line[0] == '#' || line[0] == 0) { + return ; + } + nr = sscanf(line, "%s %s %s", buf1, buf2, buf3); + if (nr == 1) { + cr = do_find_conv_res(db, buf1, NULL); + cr->check = CHK_UNKNOWN; + return ; + } + if (nr < 2) { + return ; + } + if (buf1[0] != '|') { + /* buf1 buf2 buf3 + * 平文 区切り文 + * 平文 区切り文 変換後 + * 平文 区切り文 check + */ + src = buf1; + res = buf2; + if (nr == 3) { + check = buf3; + } else { + check = "?"; + } + } else { + /* buf1 buf2 (buf3) + * 区切り文 + * 区切り文 変換後 + * 区切り文 check + */ + strip_separator_vbar(buf4, buf1); + src = buf4; + res = buf1; + check = buf2; + } + cr = do_find_conv_res(db, src, res); + if (nr == 2 && check[0] != '|') { + cr->check = CHK_OK; + return ; + } + if (check[0] == 'O') { + cr->check = CHK_OK; + } else if (check[0] == 'X') { + cr->check = CHK_MISS; + } else if (check[0] == '*') { + cr->check = CHK_DONTCARE; + } else if (check[0] == '|') { + cr->check = CHK_UNKNOWN; + cr->cand_str = strdup(check); + } else { + cr->check = CHK_UNKNOWN; + } +} + +void +read_db(struct res_db *db, const char *fn) +{ + FILE *fp; + char line[1024]; + + if (!fn) { + return ; + } + fp = fopen(fn, "r"); + if (!fp) { + return ; + } + while (fgets(line, 1024, fp)) { + parse_line(db, line); + } +} + +static void +fill_conv_info(struct word *w, struct cand_elm *elm) +{ + /*w->conv_xs, w->wt*/ + struct dic_ent *de; + if (elm->nth == -1 || + elm->nth >= elm->se->nr_dic_ents) { + w->conv_xs = NULL; + w->wt = NULL; + return ; + } + if (!elm->se->dic_ents) { + w->conv_xs = NULL; + w->wt = NULL; + return ; + } + /**/ + de = elm->se->dic_ents[elm->nth]; + w->conv_xs = anthy_xstr_dup(&de->str); + w->wt = de->wt_name; + w->hash = anthy_xstr_hash(w->conv_xs); +} + +static void +init_word(struct word *w, int type) +{ + w->type = type; + w->raw_xs = NULL; + w->conv_xs = NULL; + w->wt = NULL; +} + +static void +free_word(struct word *w) +{ + anthy_free_xstr(w->raw_xs); + anthy_free_xstr(w->conv_xs); +} + +/* 自立語を作る */ +static void +fill_indep_word(struct word *w, struct cand_elm *elm) +{ + init_word(w, WORD_INDEP); + /* 変換前の読みを取得する */ + w->raw_xs = anthy_xstr_dup(&elm->str); + w->yomi_hash = anthy_xstr_hash(w->raw_xs); + w->hash = 0; + /**/ + fill_conv_info(w, elm); +} + +/* 付属語を作る */ +static void +fill_dep_word(struct word *w, struct cand_elm *elm) +{ + init_word(w, WORD_DEP); + /**/ + w->hash = anthy_xstr_hash(&elm->str); + w->yomi_hash = w->hash; + w->raw_xs = anthy_xstr_dup(&elm->str); +} + +static void +print_features(struct feature_list *fl) +{ + int i, nr; + if (!fl) { + return ; + } + nr = anthy_feature_list_nr(fl); + if (nr == 0) { + return ; + } + printf(" features="); + for (i = 0; i < nr; i++) { + if (i > 0) { + printf(","); + } + printf("%d", anthy_feature_list_nth(fl, i)); + } +} + +static void +print_word(const char *prefix, struct word *w, struct feature_list *fl) +{ + printf("%s", prefix); + if (w->type == WORD_DEP) { + /* 付属語 */ + printf("dep_word hash=%d ", w->hash); + anthy_putxstrln(w->raw_xs); + return ; + } + /* 自立語 */ + printf("indep_word hash=%d", w->hash); + /**/ + if (fl) { + print_features(fl); + } + /* 品詞 */ + if (w->wt) { + printf(" %s", w->wt); + } else { + printf(" null"); + } + /* 文字列 */ + if (w->conv_xs) { + printf(" "); + anthy_putxstr(w->conv_xs); + } else { + printf(" null"); + } + printf(" "); + anthy_putxstrln(w->raw_xs); +} + +/** segの文節クラスを返す + * segがnullであれば、clをクラスとする + */ +static int +get_seg_class(struct seg_ent *seg, int cl) +{ + struct cand_ent *ce; + if (!seg) { + return cl; + } + ce = selected_candidate(seg); + if (ce->mw) { + return ce->mw->seg_class; + } + return SEG_BUNSETSU; +} + +static void +set_features(struct feature_list *fl, + struct seg_ent *prev_seg, + struct seg_ent *cur_seg) +{ + int cl, pc; + cl = get_seg_class(cur_seg, SEG_TAIL); + pc = get_seg_class(prev_seg, SEG_HEAD); + + anthy_feature_list_set_cur_class(fl, cl); + if (cur_seg) { + struct cand_ent *ce = selected_candidate(cur_seg); + anthy_feature_list_set_dep_word(fl, ce->dep_word_hash); + if (ce->mw) { + anthy_feature_list_set_dep_class(fl, ce->mw->dep_class); + anthy_feature_list_set_mw_features(fl, ce->mw->mw_features); + anthy_feature_list_set_noun_cos(fl, ce->mw->core_wt); + } + } + anthy_feature_list_set_class_trans(fl, pc, cl); + /**/ + anthy_feature_list_sort(fl); +} + +static void +print_element(const char *prefix, + struct cand_elm *elm, struct feature_list *fl) +{ + struct word w; + + if (elm->str.len == 0) { + return ; + } + if (elm->id != -1) { + /* 自立語 */ + fill_indep_word(&w, elm); + print_word(prefix, &w, fl); + } else { + /* 付属語 */ + fill_dep_word(&w, elm); + print_word(prefix, &w, NULL); + } + free_word(&w); +} + +static void +print_unconverted(struct cand_ent *ce) +{ + printf("unknown "); + anthy_putxstrln(&ce->str); +} + +static void +print_eos(struct seg_ent *prev_seg) +{ + struct feature_list fl; + anthy_feature_list_init(&fl); + set_features(&fl, prev_seg, NULL); + printf("eos "); + print_features(&fl); + printf("\n"); + anthy_feature_list_free(&fl); +} + +/* 候補のミスには '~'、文節長のミスには '!'を付ける + * 同じ文節内の二つめ以降の自立語には '^'を付ける + */ +static const char * +get_prefix(int flag) +{ + if (flag & CONV_INVALID) { + return "^"; + } + if (flag & CONV_SIZE_MISS) { + return "!"; + } + if (flag & CONV_CAND_MISS) { + return "~"; + } + return ""; +} + +static void +print_segment_info(int is_negative, + struct seg_ent *prev_seg, + struct seg_ent *seg) +{ + int i; + struct feature_list fl; + struct cand_ent *ce = selected_candidate(seg); + int nr_indep = 0; + const char *prefix = get_prefix(is_negative); + + anthy_feature_list_init(&fl); + set_features(&fl, prev_seg, seg); + for (i = 0; i < ce->nr_words; i++) { + struct cand_elm *elm = &ce->elm[i]; + prefix = get_prefix(is_negative); + if (nr_indep > 0 && elm->id != -1) { + prefix = get_prefix(is_negative | CONV_INVALID); + } + /* 出力する */ + print_element(prefix, elm, &fl); + /* 自立語を数える */ + if (elm->id != -1) { + nr_indep ++; + } + } + anthy_feature_list_free(&fl); +} + +void +print_size_miss_segment_info(anthy_context_t ac, int nth) +{ + struct seg_ent *prev_seg = NULL; + struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth); + if (nth > 0) { + prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1); + } + print_segment_info(CONV_SIZE_MISS, prev_seg, seg); +} + +void +print_cand_miss_segment_info(anthy_context_t ac, int nth) +{ + struct seg_ent *prev_seg = NULL; + struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth); + if (nth > 0) { + prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1); + } + print_segment_info(CONV_CAND_MISS, prev_seg, seg); +} + +void +print_context_info(anthy_context_t ac, struct conv_res *cr) +{ + int i; + struct seg_ent *prev_seg = NULL; + + printf("segments: %d\n", ac->seg_list.nr_segments); + /* 各文節に対して */ + for (i = 0; i < ac->seg_list.nr_segments; i++) { + struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, i); + struct cand_ent *ce = selected_candidate(seg); + int is_negative = 0; + if (cr && cr->cand_check && cr->cand_check[i]) { + is_negative = CONV_CAND_MISS; + } + + /* 各要素に対して */ + if (!ce->nr_words) { + /* 要素が無いものはそのまま表示 */ + print_unconverted(ce); + } else { + /* 候補の変更があった場合はそれを表示 */ + if (seg->committed > 0) { + int tmp = seg->committed; + seg->committed = 0; + print_cand_miss_segment_info(ac, i); + seg->committed = tmp; + } + /* 文節の構成を表示 */ + print_segment_info(is_negative, prev_seg, seg); + } + /**/ + prev_seg = seg; + } + print_eos(prev_seg); + printf("\n"); +} |