summaryrefslogtreecommitdiff
path: root/calctrans/proccorpus.c
diff options
context:
space:
mode:
Diffstat (limited to 'calctrans/proccorpus.c')
-rw-r--r--calctrans/proccorpus.c176
1 files changed, 176 insertions, 0 deletions
diff --git a/calctrans/proccorpus.c b/calctrans/proccorpus.c
new file mode 100644
index 0000000..e1db67e
--- /dev/null
+++ b/calctrans/proccorpus.c
@@ -0,0 +1,176 @@
+/*
+ * コーパスとなる文章を読んで、文節の長さを調整して
+ * 形態素解析の結果を出力する
+ *
+ * 出力形式について
+ * まず伸縮を行った文節が最初の長さで出力される
+ * 次に各文節毎に(あれば)誤った候補、正しい候補の順で情報を出力する
+ *
+ *
+ * Copyright (C) 2006-2007 TABATA Yusuke
+ *
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <anthy/convdb.h>
+
+static int verbose; /* set to 1 by the "-v" argument in main(); makes proc_sentence() call anthy_print_context() */
+
+/*
+ * Resize segment nth of the conversion held in ac until its unconverted
+ * text has the same byte length (strlen) as the example segment seg.
+ *
+ * The first time a length mismatch is seen, print_size_miss_segment_info()
+ * is called for the segment, before any resizing is done.
+ *
+ * cr is unused; the parameter exists so that this function matches the
+ * callback signature expected by for_each_segment().
+ *
+ * Returns 1 when the lengths match. Returns 0 when the segment text cannot
+ * be fetched or when the requested length cannot be reached.
+ */
+static int
+trim_segment(anthy_context_t ac, struct conv_res *cr,
+             int nth, char *seg)
+{
+  char seg_buf[1024];
+  int len = strlen(seg);
+  int cur_len;
+  int prev_len = -1; /* length seen on the previous iteration, -1 = none */
+  int prev_dir = 0;  /* direction of the previous resize, 0 = none */
+  int dir;
+  int resized = 0;
+  (void)cr;
+
+  while (1) {
+    /*
+     * seg_buf is only meaningful when the fetch succeeds.
+     * NOTE(review): relies on anthy_get_segment() returning a negative
+     * value on failure - confirm against the Anthy API documentation.
+     */
+    if (anthy_get_segment(ac, nth, NTH_UNCONVERTED_CANDIDATE,
+                          seg_buf, sizeof(seg_buf)) < 0) {
+      return 0;
+    }
+    cur_len = strlen(seg_buf);
+    if (len == cur_len) {
+      return 1;
+    }
+    dir = (len > cur_len) ? 1 : -1;
+    /*
+     * Give up instead of spinning forever: either the previous resize had
+     * no effect on the length, or we stepped over the target and would
+     * keep bouncing between two lengths.
+     */
+    if (cur_len == prev_len || (prev_dir != 0 && dir != prev_dir)) {
+      return 0;
+    }
+    if (!resized) {
+      resized = 1;
+      /* report the segment as it was before it gets stretched or shrunk */
+      print_size_miss_segment_info(ac, nth);
+    }
+    anthy_resize_segment(ac, nth, dir);
+    prev_len = cur_len;
+    prev_dir = dir;
+  }
+}
+
+/*
+ * Search segment nth for the candidate whose text equals seg and commit it
+ * with anthy_commit_segment().
+ *
+ * A leading '~' on seg is the "wrong candidate" mark: it is skipped before
+ * comparing and cr->cand_check[nth] is set to 1.
+ *
+ * Always returns 0, whether or not a candidate was committed.
+ */
+static int
+find_candidate(anthy_context_t ac, struct conv_res *cr,
+               int nth, char *seg)
+{
+  char seg_buf[1024];
+  int i;
+  struct anthy_segment_stat ass;
+
+  /*
+   * Validate nth before touching cr->cand_check[nth] or reading ass.
+   * NOTE(review): assumes a non-zero return means "no such segment", the
+   * same convention proc_sentence() applies to anthy_get_stat() - confirm
+   * against the Anthy API documentation.
+   */
+  if (anthy_get_segment_stat(ac, nth, &ass)) {
+    return 0;
+  }
+
+  if (seg[0] == '~') {
+    /* skip the candidate-miss mark and remember that it was present */
+    seg++;
+    cr->cand_check[nth] = 1;
+  }
+
+  for (i = 0; i < ass.nr_candidate; i++) {
+    /* skip candidates that could not be copied into seg_buf */
+    if (anthy_get_segment(ac, nth, i, seg_buf, sizeof(seg_buf)) < 0) {
+      continue;
+    }
+    if (!strcmp(seg_buf, seg)) {
+      /* found the matching candidate, commit it */
+      anthy_commit_segment(ac, nth, i);
+      return 0;
+    }
+  }
+  return 0;
+}
+
+/*
+ * Split a '|'-delimited string into segments and call fn once per segment.
+ *
+ * The first character of res_str is skipped without being inspected
+ * (NOTE(review): presumably the leading '|' of the corpus format - confirm).
+ * After that, every run of text that is terminated by a '|' is passed to
+ * fn together with its zero-based index. Text after the last '|' is not
+ * passed to fn. The value returned by fn is ignored.
+ *
+ * The seg pointer handed to fn points into a temporary copy of res_str that
+ * is freed before this function returns.
+ *
+ * Returns 0 when res_str is NULL or empty, or when the temporary copy cannot
+ * be allocated; returns 1 otherwise.
+ */
+static int
+for_each_segment(anthy_context_t ac, struct conv_res *cr,
+                 const char *res_str,
+                 int (*fn)(anthy_context_t ac, struct conv_res *cr,
+                           int nth, char *seg))
+{
+  char *str, *cur_seg, *sep;
+  int nth = 0;
+
+  /* an empty string has no first character to skip */
+  if (!res_str || !res_str[0]) {
+    return 0;
+  }
+
+  str = strdup(res_str);
+  if (!str) {
+    return 0;
+  }
+
+  cur_seg = str + 1;
+  while ((sep = strchr(cur_seg, '|'))) {
+    /* terminate the current segment in place */
+    *sep = 0;
+    if (fn) {
+      fn(ac, cr, nth, cur_seg);
+    }
+    nth++;
+    cur_seg = sep + 1;
+  }
+
+  free(str);
+
+  return 1;
+}
+
+/*
+ * Process one corpus entry.
+ *
+ * Steps, in order:
+ *  1. feed cr->src_str to the context with anthy_set_string();
+ *  2. run trim_segment() over every segment of cr->res_str;
+ *  3. allocate cr->cand_check with one zeroed int per segment reported by
+ *     anthy_get_stat();
+ *  4. if cr->cand_str is set, run find_candidate() over its segments;
+ *  5. when verbose is set, call anthy_print_context();
+ *  6. call print_context_info().
+ *
+ * Returns early, without printing, when for_each_segment() returns 0 for
+ * cr->res_str, when anthy_get_stat() returns non-zero, or when the
+ * cand_check array cannot be allocated.
+ *
+ * cr->cand_check is not freed here.
+ */
+static void
+proc_sentence(anthy_context_t ac, struct conv_res *cr)
+{
+  struct anthy_conv_stat acs;
+  size_t nr_check;
+
+  anthy_set_string(ac, cr->src_str);
+  /* adjust the segment lengths */
+  if (!for_each_segment(ac, cr, cr->res_str, trim_segment)) {
+    return;
+  }
+  if (anthy_get_stat(ac, &acs)) {
+    return;
+  }
+
+  /*
+   * calloc() hands back zeroed memory, so no explicit clearing loop is
+   * needed. Always ask for at least one element so that a NULL result
+   * can only mean allocation failure.
+   */
+  nr_check = acs.nr_segment > 0 ? (size_t)acs.nr_segment : 1;
+  cr->cand_check = calloc(nr_check, sizeof(*cr->cand_check));
+  if (!cr->cand_check) {
+    fprintf(stderr, "proccorpus: out of memory\n");
+    return;
+  }
+
+  /* select the candidates */
+  if (cr->cand_str) {
+    for_each_segment(ac, cr, cr->cand_str, find_candidate);
+  }
+
+  if (verbose) {
+    anthy_print_context(ac);
+  }
+  /* emit the result */
+  print_context_info(ac, cr);
+}
+
+/*
+ * Entry point.
+ *
+ * Command line: the argument "-v" sets verbose; every other argument is
+ * passed to read_db() as-is.
+ *
+ * After the arguments are consumed, the CONFFILE and DIC_FILE settings are
+ * overridden with paths relative to the current directory, Anthy is
+ * initialized, and proc_sentence() is called for every entry on
+ * db->res_list.
+ *
+ * Returns 0 on success, 1 when the database, the Anthy library or the
+ * conversion context cannot be set up.
+ */
+int
+main(int argc, char **argv)
+{
+  struct res_db *db;
+  struct conv_res *cr;
+  anthy_context_t ac;
+  int i;
+
+  db = create_db();
+  /* db is dereferenced below, so a NULL result cannot be tolerated */
+  if (!db) {
+    fprintf(stderr, "proccorpus: failed to create the database\n");
+    return 1;
+  }
+  for (i = 1; i < argc; i++) {
+    if (!strcmp("-v", argv[i])) {
+      verbose = 1;
+    } else {
+      read_db(db, argv[i]);
+    }
+  }
+
+  anthy_conf_override("CONFFILE", "../anthy-conf");
+  anthy_conf_override("DIC_FILE", "../mkanthydic/anthy.dic");
+  /*
+   * NOTE(review): assumes anthy_init() returns non-zero on failure -
+   * confirm against the Anthy API documentation.
+   */
+  if (anthy_init()) {
+    fprintf(stderr, "proccorpus: failed to initialize anthy\n");
+    return 1;
+  }
+  anthy_set_personality("");
+  ac = anthy_create_context();
+  if (!ac) {
+    fprintf(stderr, "proccorpus: failed to create a conversion context\n");
+    return 1;
+  }
+
+  /* process every entry that was read */
+  for (cr = db->res_list.next; cr; cr = cr->next) {
+    proc_sentence(ac, cr);
+  }
+  return 0;
+}