summary refs log tree commit diff
path: root/calctrans/calctrans.c
diff options
context:
space:
mode:
Diffstat (limited to 'calctrans/calctrans.c')
-rw-r--r-- calctrans/calctrans.c 725
1 files changed, 725 insertions, 0 deletions
diff --git a/calctrans/calctrans.c b/calctrans/calctrans.c
new file mode 100644
index 0000000..e7f7ddf
--- /dev/null
+++ b/calctrans/calctrans.c
@@ -0,0 +1,725 @@
+/*
+ * 文節の遷移行列を作成する
+ *
+ * このコマンドは二つの機能を持っている。(-cオプションで制御)
+ * (1) proccorpusの結果からテキスト形式で経験的確率の表を作る
+ * (2) テキスト形式の表からバイナリ形式に変換する
+ *
+ * morphological-analyzerの出力には下記のマークが付けてある
+ * ~ 候補の誤り
+ * ! 文節長の誤り
+ * ^ 複合文節の2つめ以降の要素
+ *
+ * generate transition matrix
+ *
+ * Copyright (C) 2006 HANAOKA Toshiyuki
+ * Copyright (C) 2006-2007 TABATA Yusuke
+ *
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <anthy/anthy.h>
+#include <anthy/xstr.h>
+#include <anthy/feature_set.h>
+#include <anthy/diclib.h>
+#include "input_set.h"
+#include <anthy/corpus.h>
+
+#define FEATURE_SET_SIZE NR_EM_FEATURES /* minimum number of feature columns dump_line() writes per row */
+
+#define ARRAY_SIZE 16 /* capacity of struct array's f[] */
+
+struct array { /* fixed-capacity list of integer feature values */
+ int len; /* number of entries of f[] currently in use */
+ int f[ARRAY_SIZE]; /* the feature values, filled by parse_features() */
+};
+
+#define MAX_SEGMENT 64 /* capacity of the per-sentence segment arrays */
+
+struct segment_info { /* hashes recorded for one segment of a sentence */
+ int orig_hash; /* hash taken from a line tagged '~' (see set_hash) */
+ int hash; /* hash taken from a line with any other tag, or no tag */
+};
+
+struct sentence_info { /* segments collected for the sentence currently being read */
+ int nr_segments; /* number of segs[] entries completed so far */
+ struct segment_info segs[MAX_SEGMENT];
+};
+
+/* Probability tables accumulated while reading the corpus */
+struct input_info {
+ /* features of whole candidates */
+ struct input_set *cand_is;
+ /* features of segments */
+ struct input_set *seg_is;
+ /* full-text search data for independent words */
+ struct corpus *indep_corpus;
+
+ /* features of the last '~'-tagged line, kept until the next untagged line is parsed (see parse_indep) */
+ struct array missed_cand_features;
+
+ /* zero-based index of the input file currently being read (set by proc_corpus) */
+ int nth_input_file;
+
+ /* statistics about the amount of example text that was read */
+ int nr_sentences;
+ int nr_connections;
+};
+
+/*
+ * Allocate an input_info and put it into its empty state: fresh input sets
+ * for segment and candidate features, a new corpus, no pending missed
+ * candidate, and all counters at zero.
+ *
+ * Returns the new structure. Aborts the program when the structure itself
+ * cannot be allocated, instead of dereferencing a NULL pointer.
+ */
+static struct input_info *
+init_input_info(void)
+{
+  struct input_info *m = malloc(sizeof(*m));
+  if (!m) {
+    fprintf(stderr, "out of memory\n");
+    abort();
+  }
+  m->seg_is = input_set_create();
+  m->cand_is = input_set_create();
+  m->indep_corpus = corpus_new();
+  m->missed_cand_features.len = 0;
+  m->nth_input_file = 0;
+  m->nr_sentences = 0;
+  m->nr_connections = 0;
+  return m;
+}
+
+/*
+ * Parse a comma separated list of integers ("1,2,3,...") into features.
+ *
+ * features  receives the parsed values; len is reset to 0 first
+ * s         writable text starting at the first number; strtok() replaces
+ *           the commas in it with NUL bytes
+ *
+ * An input without any token leaves the array empty (strtok() returns NULL
+ * in that case, which must not be handed to atoi()).  Values beyond
+ * ARRAY_SIZE are dropped rather than written past the end of f[].
+ */
+static void
+parse_features(struct array *features, char *s)
+{
+  char *tok;
+  features->len = 0;
+  for (tok = strtok(s, ","); tok; tok = strtok(NULL, ",")) {
+    if (features->len >= ARRAY_SIZE) {
+      fprintf(stderr, "too many features, ignoring the rest\n");
+      break;
+    }
+    features->f[features->len] = atoi(tok);
+    features->len++;
+  }
+}
+
+/*
+ * Add a feature list with the given weight to the candidate input set
+ * (m->cand_is).
+ */
+static void
+add_seg_struct_info(struct input_info *m,
+                    struct array *features,
+                    int weight)
+{
+  struct input_set *cand_set = m->cand_is;
+
+  input_set_set_features(cand_set, features->f, features->len, weight);
+}
+
+/*
+ * Record a hash for the segment currently being filled in.
+ *
+ * sinfo        sentence under construction
+ * error_class  non-zero for lines carrying an error tag; such lines do not
+ *              complete the segment
+ * tag          first character of the line; '~' stores into orig_hash,
+ *              anything else into hash
+ * hash         value to store
+ *
+ * Once MAX_SEGMENT segments have been completed further hashes are ignored,
+ * so a very long sentence cannot write past the end of segs[].
+ */
+static void
+set_hash(struct sentence_info *sinfo, int error_class,
+         char tag, int hash)
+{
+  struct segment_info *seg;
+
+  if (sinfo->nr_segments >= MAX_SEGMENT) {
+    return;
+  }
+  seg = &sinfo->segs[sinfo->nr_segments];
+  if (tag == '~') {
+    seg->orig_hash = hash;
+  } else {
+    seg->hash = hash;
+  }
+  if (!error_class) {
+    sinfo->nr_segments++;
+  }
+}
+
+/*
+ * Compare two feature arrays.
+ * Returns 0 when both have the same length and the same values in the same
+ * order, 1 otherwise.
+ */
+static int
+compare_array(struct array *a1, struct array *a2)
+{
+  int idx = 0;
+
+  if (a1->len != a2->len) {
+    return 1;
+  }
+  while (idx < a1->len) {
+    if (a1->f[idx] != a2->f[idx]) {
+      return 1;
+    }
+    idx++;
+  }
+  return 0;
+}
+
+/*
+ * Parse one independent-word (or eos) line and add it to the statistics.
+ *
+ * m            accumulator to update
+ * sinfo        sentence being assembled; receives the line's hash
+ * line         the raw line; line[0] is the error tag ('~', '!', '^') if any
+ * buf          the line with the tag skipped
+ * error_class  non-zero when the line carries an error tag
+ *
+ * "hash=" and "features=" are both located before the feature list is
+ * tokenised, because parse_features() overwrites the commas in buf with NUL
+ * bytes and a later strstr() would stop at the first of them.
+ * A line without "features=" contributes only its hash; the feature tables
+ * are left untouched instead of being fed an uninitialised array.
+ */
+static void
+parse_indep(struct input_info *m, struct sentence_info *sinfo,
+            char *line, char *buf, int error_class)
+{
+  struct array features;
+  char *feat_str, *hash_str;
+  int weight = 1;
+
+  features.len = 0;
+  feat_str = strstr(buf, "features=");
+  hash_str = strstr(buf, "hash=");
+  if (hash_str) {
+    set_hash(sinfo, error_class, line[0], atoi(hash_str + 5));
+  }
+  if (!feat_str) {
+    return;
+  }
+  parse_features(&features, feat_str + 9);
+  m->nr_connections++;
+
+  /* accumulate */
+  if (error_class) {
+    if (line[0] == '~') {
+      /* remember the structure of the wrong candidate */
+      m->missed_cand_features = features;
+    }
+    if (line[0] == '!') {
+      /* wrong segment length */
+      input_set_set_features(m->seg_is, features.f, features.len, -weight);
+    }
+  } else {
+    /* connection matrix */
+    input_set_set_features(m->seg_is, features.f, features.len, weight);
+    /* candidate structure */
+    if (m->missed_cand_features.len != 0 &&
+        compare_array(&features, &m->missed_cand_features)) {
+      /* the missed candidate differs from the correct one: count it negatively */
+      add_seg_struct_info(m, &m->missed_cand_features, -weight);
+    }
+    m->missed_cand_features.len = 0;
+    add_seg_struct_info(m, &features, weight);
+  }
+}
+
+/*
+ * Reset a sentence_info to the empty state: no segments, and every hash
+ * slot set to zero.
+ */
+static void
+init_sentence_info(struct sentence_info *sinfo)
+{
+  memset(sinfo->segs, 0, sizeof(sinfo->segs));
+  sinfo->nr_segments = 0;
+}
+
+/*
+ * Called when a whole sentence has been read: append its segment hashes to
+ * the full-text search corpus.
+ *
+ * Only the first input file (nth_input_file == 0) contributes.  The first
+ * segment of the sentence is flagged with ELM_BOS.  Only the "hash" value of
+ * each segment is pushed; orig_hash is currently not used here.
+ */
+static void
+complete_sentence_info(struct input_info *m, struct sentence_info *sinfo)
+{
+  int seg;
+
+  if (m->nth_input_file > 0) {
+    /* second and later input files are not used */
+    return;
+  }
+  for (seg = 0; seg < sinfo->nr_segments; seg++) {
+    int elm[2];
+    int flags = ELM_NONE;
+
+    if (seg == 0) {
+      flags |= ELM_BOS;
+    }
+    elm[0] = sinfo->segs[seg].hash;
+    corpus_push_back(m->indep_corpus, elm, 1, flags);
+  }
+}
+
+/*
+ * Read an analysed corpus stream line by line.
+ *
+ * A line starting with "eos" closes the current sentence: the sentence
+ * counter is bumped, the collected segments are handed to
+ * complete_sentence_info() and the sentence state is reset.
+ * A leading '~', '!' or '^' marks the line as belonging to an error class
+ * and is skipped before the keyword is examined.
+ * "indep_word" and "eos" lines are then passed on to parse_indep().
+ */
+static void
+do_read_file(struct input_info *m, FILE *fp)
+{
+  char line[1024];
+  struct sentence_info sinfo;
+
+  init_sentence_info(&sinfo);
+
+  while (fgets(line, sizeof(line), fp)) {
+    char *body = line;
+    int tagged;
+
+    if (strncmp(body, "eos", 3) == 0) {
+      m->nr_sentences++;
+      complete_sentence_info(m, &sinfo);
+      init_sentence_info(&sinfo);
+    }
+
+    tagged = (line[0] == '~' || line[0] == '!' || line[0] == '^');
+    if (tagged) {
+      body++;
+    }
+
+    if (strncmp(body, "indep_word", 10) == 0 ||
+        strncmp(body, "eos", 3) == 0) {
+      parse_indep(m, &sinfo, line, body, tagged);
+    }
+  }
+}
+
+/*
+ * Open the named corpus file and feed it to do_read_file().
+ *
+ * A file that cannot be opened is reported on stderr and skipped, in the
+ * same way convert_data() and extract_word() report it, so a mistyped file
+ * name no longer goes unnoticed.
+ */
+static void
+read_file(struct input_info *m, char *fn)
+{
+  FILE *ifp = fopen(fn, "r");
+  if (!ifp) {
+    fprintf(stderr, "failed to open (%s)\n", fn);
+    return;
+  }
+  do_read_file(m, ifp);
+  fclose(ifp);
+}
+
+/*
+ * Write one int to fp after passing it through anthy_dic_htonl().
+ */
+static void
+write_nl(FILE *fp, int i)
+{
+  int converted = anthy_dic_htonl(i);
+
+  fwrite(&converted, sizeof(int), 1, fp);
+}
+
+/*
+ * Write one input line as text: its feature values separated by ", ",
+ * padded with 0 up to FEATURE_SET_SIZE columns, followed by
+ * ",<negative weight>,<weight>" and a newline.
+ */
+static void
+dump_line(FILE *ofp, struct input_line *il)
+{
+  int col = 0;
+
+  while (col < FEATURE_SET_SIZE || col < il->nr_features) {
+    if (col != 0) {
+      fprintf(ofp, ", ");
+    }
+    if (col >= il->nr_features) {
+      /* padding column */
+      fprintf(ofp, "0");
+    } else {
+      fprintf(ofp, "%d", il->features[col]);
+    }
+    col++;
+  }
+  fprintf(ofp,",%d,%d\n", (int)il->negative_weight, (int)il->weight);
+}
+
+/*
+ * qsort() comparator for an array of struct input_line pointers.
+ * Lines are ordered by their feature values, position by position; when one
+ * line is a prefix of the other, the one with fewer features sorts first.
+ */
+static int
+compare_line(const void *p1, const void *p2)
+{
+  const struct input_line *lhs = *(const struct input_line *const *)p1;
+  const struct input_line *rhs = *(const struct input_line *const *)p2;
+  int k = 0;
+
+  while (k < lhs->nr_features && k < rhs->nr_features) {
+    if (lhs->features[k] != rhs->features[k]) {
+      return lhs->features[k] - rhs->features[k];
+    }
+    k++;
+  }
+  return lhs->nr_features - rhs->nr_features;
+}
+
+/*
+ * Write every line of an input set in text form: first a summary row
+ * "<total weight> <line count> total_line_weight,count", then the lines
+ * sorted with compare_line(), one per row via dump_line().
+ *
+ * The temporary pointer array used for sorting is checked after allocation
+ * and released before returning (it used to be leaked).
+ */
+static void
+dump_features(FILE *ofp, struct input_set *is)
+{
+  struct input_line *il, **lines;
+  int i, nr = 0;
+  int weight = 0;
+
+  /* count lines and sum up their weights */
+  for (il = input_set_get_input_line(is); il; il = il->next_line) {
+    nr++;
+    weight += (int)il->weight;
+  }
+  /* never request 0 bytes, so that NULL always means failure */
+  lines = malloc(sizeof(*lines) * (nr > 0 ? nr : 1));
+  if (!lines) {
+    fprintf(stderr, "out of memory\n");
+    abort();
+  }
+  /* copy line pointers */
+  for (il = input_set_get_input_line(is), i = 0; i < nr;
+       i++, il = il->next_line) {
+    lines[i] = il;
+  }
+  /* sort */
+  qsort(lines, nr, sizeof(*lines), compare_line);
+  /* output */
+  fprintf(ofp, "%d %d total_line_weight,count\n", weight, nr);
+  for (i = 0; i < nr; i++) {
+    dump_line(ofp, lines[i]);
+  }
+  free(lines);
+}
+
+/*
+ * Write all collected tables in text form.  Each table is introduced by a
+ * "section <name> " prefix; the rest of that line and the following rows are
+ * produced by the table's own writer.
+ */
+static void
+dump_input_info(FILE *ofp, struct input_info *m)
+{
+  /* segment transition features */
+  fputs("section anthy.trans_info ", ofp);
+  dump_features(ofp, m->seg_is);
+
+  /* candidate structure features */
+  fputs("section anthy.cand_info ", ofp);
+  dump_features(ofp, m->cand_is);
+
+  /* corpus of independent words */
+  fputs("section anthy.corpus_bucket ", ofp);
+  corpus_write_bucket(ofp, m->indep_corpus);
+  fputs("section anthy.corpus_array ", ofp);
+  corpus_write_array(ofp, m->indep_corpus);
+
+  /* feature frequencies of the segment set */
+  fputs("section anthy.feature_info ", ofp);
+  input_set_output_feature_freq(ofp, m->seg_is);
+}
+
+/*
+ * Convert one text row of comma separated integers to binary: every token
+ * is converted with atoi() and written with write_nl().
+ *
+ * buf is modified by strtok().  A row without any token writes nothing;
+ * strtok() returns NULL in that case, which must not be passed to atoi().
+ */
+static void
+convert_line(FILE *ofp, char *buf)
+{
+  char *tok;
+  for (tok = strtok(buf, ","); tok; tok = strtok(NULL, ",")) {
+    write_nl(ofp, atoi(tok));
+  }
+}
+
+/*
+ * Convert a text dictionary to binary files.
+ *
+ * Lines starting with '#' are skipped.  A line "section <file> [<w> [<n>]]"
+ * closes the previous output and opens <file>; the new file starts with w, n
+ * and NR_EM_FEATURES zero words.  Every other line is a row of integers that
+ * convert_line() appends to the current output file.
+ *
+ * Changes over the previous version:
+ *  - w and n default to 0 when the section line does not carry them, instead
+ *    of being written uninitialised;
+ *  - the file name conversion is bounded to the size of fn;
+ *  - a section line without a file name aborts with a message;
+ *  - data rows that appear before the first section are reported and
+ *    skipped instead of being written through a NULL stream;
+ *  - output files are opened in binary mode, as they hold binary data.
+ */
+static void
+convert_file(FILE *ifp)
+{
+  char buf[1024];
+  FILE *ofp = NULL;
+  while (fgets(buf, sizeof(buf), ifp)) {
+    if (buf[0] == '#') {
+      continue;
+    }
+    if (!strncmp("section", buf, 7)) {
+      int w = 0, n = 0, i;
+      char fn[1024];
+      if (ofp) {
+        fclose(ofp);
+        ofp = NULL;
+      }
+      if (sscanf(buf, "section %1023s %d %d", fn, &w, &n) < 1) {
+        fprintf(stderr, "section line without a file name\n");
+        abort();
+      }
+      ofp = fopen(fn, "wb");
+      if (!ofp) {
+        fprintf(stderr, "failed to open (%s)\n", fn);
+        abort();
+      }
+      write_nl(ofp, w);
+      write_nl(ofp, n);
+      for (i = 0; i < NR_EM_FEATURES; i++) {
+        write_nl(ofp, 0);
+      }
+    } else if (ofp) {
+      convert_line(ofp, buf);
+    } else {
+      fprintf(stderr, "data before the first section is ignored\n");
+    }
+  }
+  if (ofp) {
+    fclose(ofp);
+  }
+}
+
+/*
+ * Run convert_file() over each of the nr_fn named text dictionaries.
+ * Files that cannot be opened are reported on stderr and skipped.
+ */
+static void
+convert_data(int nr_fn, char **fns)
+{
+  int idx;
+
+  for (idx = 0; idx < nr_fn; idx++) {
+    FILE *ifp = fopen(fns[idx], "r");
+
+    if (ifp) {
+      convert_file(ifp);
+      fclose(ifp);
+    } else {
+      fprintf(stderr, "failed to open (%s)\n", fns[idx]);
+    }
+  }
+}
+
+/* Data structures used by the word extraction mode (extract_word) */
+#define STRING_HASH_SIZE 256 /* number of hash buckets in a string_pool */
+struct string_node { /* one pooled string */
+ int key; /* anthy_xstr_hash() of str; stays 0 until flush_extract_stat() computes it */
+ char *str; /* strdup()ed copy of the string */
+ struct string_node *next_hash; /* next node in the same bucket */
+};
+struct string_pool { /* chained hash table of strings */
+ int nr; /* number of nodes in the pool */
+ struct string_node hash[STRING_HASH_SIZE]; /* bucket heads; only their next_hash member is used */
+ struct string_node **array; /* pointers to all nodes, built by string_pool_sort() */
+};
+struct resize_info { /* one word taken from a "!indep_word " line */
+ char *indep; /* strdup()ed word text */
+ int valid; /* set to 0 when the same word is also found on an untagged indep_word line */
+};
+struct extract_stat { /* words collected since the last flush_extract_stat() */
+ int nr; /* number of info[] entries in use */
+ struct resize_info info[MAX_SEGMENT];
+};
+
+/*
+ * Make a string pool empty: every bucket chain is cleared and the node
+ * count is set to zero.
+ */
+static void
+string_pool_init(struct string_pool *sp)
+{
+  struct string_node *bucket = sp->hash;
+  struct string_node *end = sp->hash + STRING_HASH_SIZE;
+
+  while (bucket != end) {
+    bucket->next_hash = NULL;
+    bucket++;
+  }
+  sp->nr = 0;
+}
+
+/*
+ * qsort() comparator ordering string_node pointers by ascending key.
+ *
+ * Keys are hash values that may lie anywhere in the int range, so they are
+ * compared directly; subtracting them could overflow and report the wrong
+ * order.
+ */
+static int
+compare_string_node(const void *p1, const void *p2)
+{
+  const struct string_node *n1 = *(const struct string_node *const *)p1;
+  const struct string_node *n2 = *(const struct string_node *const *)p2;
+  return (n1->key > n2->key) - (n1->key < n2->key);
+}
+
+/*
+ * Build sp->array, a flat array holding a pointer to every node of the
+ * pool, and sort it with compare_string_node().
+ *
+ * The allocation is checked, and at least one element is always requested
+ * so that an empty pool does not depend on what malloc(0) returns.
+ */
+static void
+string_pool_sort(struct string_pool *sp)
+{
+  int idx = 0, h;
+  sp->array = malloc(sizeof(*sp->array) * (sp->nr > 0 ? sp->nr : 1));
+  if (!sp->array) {
+    fprintf(stderr, "out of memory\n");
+    abort();
+  }
+  /* collect the nodes bucket by bucket */
+  for (h = 0; h < STRING_HASH_SIZE; h++) {
+    struct string_node *node;
+    for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
+      sp->array[idx] = node;
+      idx++;
+    }
+  }
+  qsort(sp->array, sp->nr, sizeof(*sp->array), compare_string_node);
+}
+
+/*
+ * Write the pool as the text section "anthy.weak_words": a header line with
+ * the number of entries, then one key per line in the order of sp->array.
+ */
+static void
+string_pool_dump(FILE *ofp, struct string_pool *sp)
+{
+  int pos = 0;
+
+  fprintf(ofp, "section anthy.weak_words 0 %d\n", sp->nr);
+  while (pos < sp->nr) {
+    const struct string_node *node = sp->array[pos];
+
+    fprintf(ofp, "%d\n", node->key);
+    pos++;
+  }
+}
+
+/*
+ * Hash a NUL terminated byte string to a bucket index in
+ * [0, STRING_HASH_SIZE): each byte is added to the accumulator, which is
+ * then multiplied by 13.
+ */
+static unsigned int
+string_hash(const unsigned char *str)
+{
+  unsigned int acc = 0;
+  const unsigned char *p;
+
+  for (p = str; *p != '\0'; p++) {
+    acc = (acc + *p) * 13;
+  }
+  return acc % STRING_HASH_SIZE;
+}
+
+/*
+ * Look up str in the pool and return its node, creating the node first when
+ * the string is not in the pool yet.
+ *
+ * A new node holds its own copy of str, has key 0 and is inserted at the
+ * head of its bucket chain.  Aborts when memory for the node or for the
+ * copy of the string cannot be obtained, instead of storing NULL pointers
+ * that later lookups would dereference.
+ */
+static struct string_node *
+find_string_node(struct string_pool *sp, const char *str)
+{
+  int h = (int)string_hash((const unsigned char *)str);
+  struct string_node *node;
+  for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
+    if (!strcmp(str, node->str)) {
+      return node;
+    }
+  }
+  /* allocate new */
+  node = malloc(sizeof(*node));
+  if (!node) {
+    fprintf(stderr, "out of memory\n");
+    abort();
+  }
+  node->str = strdup(str);
+  if (!node->str) {
+    fprintf(stderr, "out of memory\n");
+    abort();
+  }
+  node->key = 0;
+  node->next_hash = sp->hash[h].next_hash;
+  sp->hash[h].next_hash = node;
+  sp->nr++;
+  return node;
+}
+
+/*
+ * Move the collected words into the string pool and empty the collection.
+ *
+ * Every entry that is still marked valid is looked up (and thereby
+ * registered) in the pool; a node whose key is still 0 gets the hash of its
+ * string, converted from EUC-JP.  The text of every entry, valid or not, is
+ * freed.
+ */
+static void
+flush_extract_stat(struct extract_stat *es, struct string_pool *sp)
+{
+  int slot;
+
+  for (slot = 0; slot < es->nr; slot++) {
+    struct resize_info *entry = &es->info[slot];
+
+    if (entry->valid) {
+      struct string_node *node = find_string_node(sp, entry->indep);
+
+      if (node->key == 0) {
+        xstr *xs = anthy_cstr_to_xstr(node->str, ANTHY_EUC_JP_ENCODING);
+        node->key = anthy_xstr_hash(xs);
+        anthy_free_xstr(xs);
+      }
+    }
+    free(entry->indep);
+    entry->indep = NULL;
+  }
+  es->nr = 0;
+}
+
+/*
+ * Return the word part of an indep_word line: the text that follows the
+ * second space after the first '#' in buf, without the trailing newline.
+ *
+ * buf is modified in place (the newline is overwritten with NUL) and the
+ * returned pointer points into buf.  Returns NULL when the line has no '#'
+ * or fewer than two spaces after it.
+ *
+ * Only a newline is removed.  A line without one (last line of a file, or a
+ * line longer than the read buffer) keeps its last character, and an empty
+ * word no longer causes a write in front of the string.
+ */
+static char *
+get_indep_part(char *buf)
+{
+  char *c = strchr(buf, '#');
+  if (!c) {
+    return NULL;
+  }
+  c = strchr(c, ' ');
+  if (!c) {
+    return NULL;
+  }
+  c++;
+  c = strchr(c, ' ');
+  if (!c) {
+    return NULL;
+  }
+  c++;
+  c[strcspn(c, "\n")] = '\0';
+  return c;
+}
+
+/*
+ * Handle an untagged indep_word line: every collected word that is equal to
+ * the word on this line is marked invalid.  Lines from which no word can be
+ * extracted are ignored.
+ */
+static void
+fixup_missed_word(struct extract_stat *es, char *buf)
+{
+  char *word = get_indep_part(buf);
+  struct resize_info *entry;
+
+  if (word == NULL) {
+    return;
+  }
+  for (entry = es->info; entry < es->info + es->nr; entry++) {
+    if (strcmp(entry->indep, word) == 0) {
+      entry->valid = 0;
+    }
+  }
+}
+
+/*
+ * Handle a "!indep_word " line: append a copy of its word to the collection
+ * and mark it valid.  Lines from which no word can be extracted are ignored.
+ *
+ * The collection holds at most MAX_SEGMENT words; further words are dropped
+ * instead of being written past the end of info[].  Aborts when the copy of
+ * the word cannot be allocated.
+ */
+static void
+fill_missed_word(struct extract_stat *es, char *buf)
+{
+  char *copy;
+  char *c = get_indep_part(buf);
+  if (!c) {
+    return;
+  }
+  if (es->nr >= MAX_SEGMENT) {
+    return;
+  }
+  copy = strdup(c);
+  if (!copy) {
+    fprintf(stderr, "out of memory\n");
+    abort();
+  }
+  es->info[es->nr].indep = copy;
+  es->info[es->nr].valid = 1;
+  es->nr++;
+}
+
+/*
+ * Scan one analysed corpus stream for words that appear on "!indep_word "
+ * lines.
+ *
+ * Lines starting with '#' are skipped.  A line starting with a newline or a
+ * space flushes the words collected so far into the pool.
+ * "!indep_word " lines add a word to the collection and "indep_word" lines
+ * invalidate matching words.  Whatever is left at end of input is flushed
+ * too.
+ */
+static void
+extract_word_from_file(FILE *ifp, struct string_pool *sp)
+{
+  struct extract_stat es;
+  char buf[1024];
+  int slot;
+
+  es.nr = 0;
+  for (slot = 0; slot < MAX_SEGMENT; slot++) {
+    es.info[slot].indep = NULL;
+  }
+
+  while (fgets(buf, sizeof(buf), ifp)) {
+    switch (buf[0]) {
+    case '#':
+      continue;
+    case '\n':
+    case ' ':
+      flush_extract_stat(&es, sp);
+      continue;
+    default:
+      break;
+    }
+    if (strncmp("!indep_word ", buf, 12) == 0) {
+      fill_missed_word(&es, buf);
+    }
+    if (strncmp("indep_word", buf, 10) == 0) {
+      fixup_missed_word(&es, buf);
+    }
+  }
+  flush_extract_stat(&es, sp);
+}
+
+/*
+ * Extraction mode: collect words from all nr_fn input files into one string
+ * pool, then sort the pool and write it to ofp.  Files that cannot be opened
+ * are reported on stderr and skipped.
+ */
+static void
+extract_word(int nr_fn, char **fns, FILE *ofp)
+{
+  struct string_pool sp;
+  int idx;
+
+  string_pool_init(&sp);
+
+  for (idx = 0; idx < nr_fn; idx++) {
+    FILE *ifp = fopen(fns[idx], "r");
+
+    if (ifp) {
+      extract_word_from_file(ifp, &sp);
+      fclose(ifp);
+    } else {
+      fprintf(stderr, "failed to open (%s)\n", fns[idx]);
+    }
+  }
+
+  string_pool_sort(&sp);
+  string_pool_dump(ofp, &sp);
+}
+
+/*
+ * Build the probability tables from analysed corpus files: read every input
+ * file into one accumulator, build the corpus index, write everything to
+ * ofp in text form and print a short summary on stderr.
+ */
+static void
+proc_corpus(int nr_fn, char **fns, FILE *ofp)
+{
+  struct input_info *m = init_input_info();
+  int file_no = 0;
+  int nr_segments;
+
+  while (file_no < nr_fn) {
+    m->nth_input_file = file_no;
+    read_file(m, fns[file_no]);
+    file_no++;
+  }
+
+  corpus_build(m->indep_corpus);
+  dump_input_info(ofp, m);
+
+  nr_segments = m->nr_connections - m->nr_sentences;
+  fprintf(stderr, " %d sentences\n", m->nr_sentences);
+  fprintf(stderr, " %d connections\n", m->nr_connections);
+  fprintf(stderr, " %d segments\n", nr_segments);
+}
+
+/*
+ * Command line entry point.
+ *
+ *   -o FILE          write text output to FILE
+ *   -c, --convert    convert the input files from text to binary form
+ *   -e, --extract    extract missed words from the input files
+ *   anything else    is taken as an input file name
+ *
+ * With -e the extracted words go to FILE (stdout when no -o was given) and
+ * nothing else is done.  Otherwise the text dictionary is generated when -o
+ * was given, and the inputs are converted when -c was given.
+ *
+ * Returns 0 on success and 1 when -o lacks its argument or memory for the
+ * argument list cannot be allocated.  "-o" as the last argument used to pass
+ * a NULL file name to fopen().
+ */
+int
+main(int argc, char **argv)
+{
+  FILE *ofp = NULL;
+  int i;
+  int nr_input = 0;
+  char **input_files;
+  int convert = 0;
+  int extract = 0;
+
+  input_files = malloc(sizeof(char *) * (argc > 0 ? argc : 1));
+  if (!input_files) {
+    fprintf(stderr, "out of memory\n");
+    return 1;
+  }
+
+  for (i = 1; i < argc; i++) {
+    char *arg = argv[i];
+    if (!strcmp(arg, "-o")) {
+      if (i + 1 >= argc) {
+        fprintf(stderr, "-o requires a file name\n");
+        if (ofp) {
+          fclose(ofp);
+        }
+        free(input_files);
+        return 1;
+      }
+      if (ofp) {
+        /* -o given more than once: only the last file is used */
+        fclose(ofp);
+      }
+      ofp = fopen(argv[i+1], "w");
+      if (!ofp) {
+        fprintf(stderr, "failed to open (%s)\n", argv[i+1]);
+      }
+      i++;
+    } else if (!strcmp(arg, "-c") ||
+               !strcmp(arg, "--convert")) {
+      convert = 1;
+    } else if (!strcmp(arg, "-e") ||
+               !strcmp(arg, "--extract")) {
+      extract = 1;
+    } else {
+      input_files[nr_input] = arg;
+      nr_input++;
+    }
+  }
+  if (extract) {
+    printf(" -- extracting missed words\n");
+    if (!ofp) {
+      ofp = stdout;
+    }
+    extract_word(nr_input, input_files, ofp);
+    if (ofp != stdout) {
+      fclose(ofp);
+    }
+    free(input_files);
+    return 0;
+  }
+  if (ofp) {
+    printf(" -- generating dictionary in text form\n");
+    proc_corpus(nr_input, input_files, ofp);
+    fclose(ofp);
+  }
+  if (convert) {
+    printf(" -- converting dictionary from text to binary form\n");
+    convert_data(nr_input, input_files);
+  }
+
+  free(input_files);
+  return 0;
+}