From a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 Mon Sep 17 00:00:00 2001 From: Lorry Tar Creator Date: Sat, 7 Feb 2009 16:32:56 +0000 Subject: anthy-9100h --- calctrans/proccorpus.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 calctrans/proccorpus.c (limited to 'calctrans/proccorpus.c') diff --git a/calctrans/proccorpus.c b/calctrans/proccorpus.c new file mode 100644 index 0000000..e1db67e --- /dev/null +++ b/calctrans/proccorpus.c @@ -0,0 +1,176 @@ +/* + * コーパスとなる文章を読んで、文節の長さを調整して + * 形態素解析の結果を出力する + * + * 出力形式について + * まず伸縮を行った文節が最初の長さで出力される + * 次に各文節毎に(あれば)誤った候補、正しい候補の順で情報を出力する + * + * + * Copyright (C) 2006-2007 TABATA Yusuke + * + */ +#include +#include +#include +#include + +static int verbose; + +/* 文節の長さを例文にあわせる */ +static int +trim_segment(anthy_context_t ac, struct conv_res *cr, + int nth, char *seg) +{ + int len = strlen(seg); + int resized = 0; + (void)cr; + + while (1) { + char seg_buf[1024]; + int cur_len; + + anthy_get_segment(ac, nth, NTH_UNCONVERTED_CANDIDATE, seg_buf, 1024); + cur_len = strlen(seg_buf); + if (len == cur_len) { + return 1; + } + if (!resized) { + resized = 1; + /* 伸縮前の文節の情報を表示する */ + print_size_miss_segment_info(ac, nth); + } + if (len > cur_len) { + anthy_resize_segment(ac, nth, 1); + } else { + anthy_resize_segment(ac, nth, -1); + } + } + return 0; +} + +/* + * nth番目の文節で候補segを探して確定する + */ +static int +find_candidate(anthy_context_t ac, struct conv_res *cr, + int nth, char *seg) +{ + char seg_buf[1024]; + int i; + struct anthy_segment_stat ass; + + if (seg[0] == '~') { + /* 候補ミスのマーク「~」をスキップする */ + seg++; + cr->cand_check[nth] = 1; + } + + anthy_get_segment_stat(ac, nth, &ass); + for (i = 0; i < ass.nr_candidate; i++) { + anthy_get_segment(ac, nth, i, seg_buf, 1024); + if (!strcmp(seg_buf, seg)) { + /* 一致する候補を見つけたので確定する */ + anthy_commit_segment(ac, nth, i); + return 0; + } + } + return 0; +} + +/* '|' で文節に区切られた文字列の各文節を引数にfnを呼ぶ */ +static int +for_each_segment(anthy_context_t ac, struct conv_res *cr, + const char *res_str, + int (*fn)(anthy_context_t ac, struct conv_res *cr, + int nth, char *seg)) +{ + char *str, *cur, *cur_seg; + int nth; + if (!res_str) { + return 0; + } + + str = strdup(res_str); + cur = str; + cur ++; + cur_seg = cur; + nth = 0; + while ((cur = strchr(cur, '|'))) { + *cur = 0; + /**/ + if (fn) { + fn(ac, cr, nth, cur_seg); + } + /**/ + nth ++; + cur ++; + cur_seg = cur; + } + + free(str); + + return 1; +} + +static void +proc_sentence(anthy_context_t ac, struct conv_res *cr) +{ + int i; + struct anthy_conv_stat acs; + /*printf("(%s)\n", cr->src_str);*/ + anthy_set_string(ac, cr->src_str); + /* 文節の長さを調節する */ + if (!for_each_segment(ac, cr, cr->res_str, trim_segment)) { + return ; + } + /**/ + if (anthy_get_stat(ac, &acs)) { + return ; + } + cr->cand_check = malloc(sizeof(int) * acs.nr_segment); + for (i = 0; i < acs.nr_segment; i++) { + cr->cand_check[i] = 0; + } + + /* 候補を選択する */ + if (cr->cand_str) { + for_each_segment(ac, cr, cr->cand_str, find_candidate); + } + + if (verbose) { + anthy_print_context(ac); + } + /* 出力する */ + print_context_info(ac, cr); +} + +int +main(int argc, char **argv) +{ + struct res_db *db; + struct conv_res *cr; + anthy_context_t ac; + int i; + + db = create_db(); + for (i = 1; i < argc; i++) { + if (!strcmp("-v", argv[i])) { + verbose = 1; + } else { + read_db(db, argv[i]); + } + } + + anthy_conf_override("CONFFILE", "../anthy-conf"); + anthy_conf_override("DIC_FILE", "../mkanthydic/anthy.dic"); + anthy_init(); + anthy_set_personality(""); + ac = anthy_create_context(); + + /**/ + for (cr = db->res_list.next; cr; cr = cr->next) { + proc_sentence(ac, cr); + } + return 0; +} -- cgit v1.2.1