From a7a06a7ccfe0af1e134357678b8fa6cf87dff3b0 Mon Sep 17 00:00:00 2001 From: Lorry Tar Creator Date: Sat, 7 Feb 2009 16:32:56 +0000 Subject: anthy-9100h --- src-util/dic-tool.c | 448 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 448 insertions(+) create mode 100644 src-util/dic-tool.c (limited to 'src-util/dic-tool.c') diff --git a/src-util/dic-tool.c b/src-util/dic-tool.c new file mode 100644 index 0000000..f5ce076 --- /dev/null +++ b/src-util/dic-tool.c @@ -0,0 +1,448 @@ +/* + * 辞書操作用のユーティリティコマンド + * + * 辞書のライブラリ内部の形式と外部の形式の相互変換を行う + * 外部形式は + * *読み 頻度 単語 + * *品詞の変数1 = 値1 + * *品詞の変数2 = 値2 + * *... + * *<空行> + * になる + */ +/* + * Funded by IPA未踏ソフトウェア創造事業 2001 9/22 + * + * Copyright (C) 2000-2007 TABATA Yusuke + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include + +#include +#include +/**/ +#include +#include "config.h" + +#define UNSPEC 0 +#define DUMP_DIC 1 +#define LOAD_DIC 2 +#define APPEND_DIC 3 + +#define TYPETAB "typetab" +#define USAGE_TEXT "dic-tool-usage.txt" + +#define USAGE \ + "Anthy-dic-util [options]\n"\ + " --help: Show this usage text\n"\ + " --version: Show version\n"\ + " --dump: Dump dictionary\n"\ + " --load: Load dictionary\n"\ + " --append: Append dictionary\n"\ + " --utf8: Use utf8 encoding\n"\ + " --personality=NAME: use NAME as a name of personality\n" + + +static int command = UNSPEC; +static int encoding = ANTHY_EUC_JP_ENCODING; +static FILE *fp_in; +static char *fn; +static const char *personality = ""; + +/* 変数名と値のペア */ +struct var{ + struct var *next; + char *var_name; + char *val; +}; + +/* 品詞のパラメータから品詞名を得るためのテーブル */ +struct trans_tab { + struct trans_tab *next; + char *type_name; /* 内部での型の名前 T35とか */ + struct var var_list; /* 型を決定するためのパラメータ */ +}trans_tab_list; + +static void +print_usage(void) +{ + printf(USAGE); + exit(0); +} + +static FILE * +open_typetab(void) +{ + FILE *fp; + char *fn; + fp = fopen(TYPETAB, "r"); + if (fp) { + return fp; + } + fn = strdup(anthy_dic_util_get_anthydir()); + fn = realloc(fn, strlen(fn) + strlen(TYPETAB) + 4); + strcat(fn, "/"); + strcat(fn, TYPETAB); + fp = fopen(fn, "r"); + return fp; +} + +static FILE * +open_usage_file(void) +{ + FILE *fp; + /* カレントディレクトリにある場合は、それを使用する */ + fp = fopen(USAGE_TEXT, "r"); + if (!fp) { + /* インストールされたものを使用 */ + char *fn; + fn = strdup(anthy_dic_util_get_anthydir()); + fn = realloc(fn, strlen(fn) + strlen(USAGE_TEXT) + 10); + strcat(fn, "/" USAGE_TEXT); + fp = fopen(fn, "r"); + } + return fp; +} + +static void +print_usage_text(void) +{ + char buf[256]; + FILE *fp = open_usage_file(); + if (!fp) { + printf("# Anthy-dic-tool\n#\n"); + return ; + } + fprintf(stdout, "#" PACKAGE " " VERSION "\n"); + if (encoding == ANTHY_UTF8_ENCODING) { + } else { + } + /* そのままファイルの内容を出力 */ + while (fgets(buf, 256, fp)) { + if (encoding == ANTHY_UTF8_ENCODING) { + char *s; + s = anthy_conv_euc_to_utf8(buf); + printf("%s", s); + free(s); + } else { + printf("%s", buf); + } + } + fclose(fp); +} + +static char * +read_line(char *buf, int len, FILE *fp) +{ + while (fgets(buf, len, fp)) { + if (buf[0] != '#') { + /* 改行を削除する */ + int l = strlen(buf); + if (l > 0 && buf[l-1] == '\n') { + buf[l-1] = 0; + } + if (l > 1 && buf[l-2] == '\r') { + buf[l-1] = 0; + } + /**/ + return buf; + } + } + return NULL; +} + +static int +read_typetab_var(struct var *head, FILE *fp, int table) +{ + char buf[256]; + char var[256], eq[256], val[256]; + struct var *v; + if (!read_line(buf, 256, fp)) { + return -1; + } + if (sscanf(buf, "%s %s %s", var, eq, val) != 3) { + return -1; + } + + v = malloc(sizeof(struct var)); + if (encoding == ANTHY_UTF8_ENCODING && table) { + /* UTF-8 */ + v->var_name = anthy_conv_euc_to_utf8(var); + v->val = anthy_conv_euc_to_utf8(val); + } else { + /* do not change */ + v->var_name = strdup(var); + v->val = strdup(val); + } + + /* リストにつなぐ */ + v->next = head->next; + head->next = v; + + return 0; +} + +static int +read_typetab_entry(FILE *fp) +{ + char buf[256], type_name[257]; + char *res; + struct trans_tab *t; + /* 一行目の品詞名を読む */ + do { + res = read_line(buf, 256, fp); + if (!res) { + return -1; + } + } while (res[0] == '#' || res[0] == 0); + t = malloc(sizeof(struct trans_tab)); + sprintf(type_name, "#%s", buf); + t->type_name = strdup(type_name); + t->var_list.next = 0; + /* パラメータを読む */ + while(!read_typetab_var(&t->var_list, fp, 1)); + /* リストにつなぐ */ + t->next = trans_tab_list.next; + trans_tab_list.next = t; + return 0; +} + +static void +read_typetab(void) +{ + FILE *fp = open_typetab(); + if (!fp) { + printf("Failed to open type table.\n"); + exit(1); + } + while (!read_typetab_entry(fp)); +} + +static struct trans_tab * +find_trans_tab_by_name(char *name) +{ + struct trans_tab *t; + for (t = trans_tab_list.next; t; t = t->next) { + if (!strcmp(t->type_name, name)) { + return t; + } + } + return NULL; +} + +static void +print_word_type(struct trans_tab *t) +{ + struct var *v; + for (v = t->var_list.next; v; v = v->next) { + printf("%s\t=\t%s\n", v->var_name, v->val); + } +} + +static void +dump_dic(void) +{ + print_usage_text(); + if (anthy_priv_dic_select_first_entry() == -1) { + printf("# Failed to read private dictionary\n" + "# There are no words or error occurred?\n" + "#\n"); + return ; + } + do { + char idx[100], wt[100], w[100]; + int freq; + if (anthy_priv_dic_get_index(idx, 100) && + anthy_priv_dic_get_wtype(wt, 100) && + anthy_priv_dic_get_word(w, 100)) { + struct trans_tab *t; + freq = anthy_priv_dic_get_freq(); + t = find_trans_tab_by_name(wt); + if (t) { + printf("%s %d %s\n", idx, freq, w); + print_word_type(t); + printf("\n"); + } else { + printf("# Failed to determine word type of %s(%s).\n", w, wt); + } + } + } while (anthy_priv_dic_select_next_entry() == 0); +} + +static void +open_input_file(void) +{ + if (!fn) { + fp_in = stdin; + } else { + fp_in = fopen(fn, "r"); + if (!fp_in) { + exit(1); + } + } +} + +/* vが sの中にあるか */ +static int +match_var(struct var *v, struct var *s) +{ + struct var *i; + for (i = s->next; i; i = i->next) { + if (!strcmp(v->var_name, i->var_name) && + !strcmp(v->val, i->val)) { + return 1; + } + } + return 0; +} + +/* v1がv2の部分集合かどうか */ +static int +var_list_subset_p(struct var *v1, struct var *v2) +{ + struct var *v; + for (v = v1->next; v; v = v->next) { + if (!match_var(v, v2)) { + return 0; + } + } + return 1; +} + +static char * +find_wt(void) +{ + struct var v; + struct trans_tab *t; + v.next = 0; + while(!read_typetab_var(&v, fp_in, 0)); + for (t = trans_tab_list.next; t; t = t->next) { + if (var_list_subset_p(&t->var_list, &v) && + var_list_subset_p(&v, &t->var_list)) { + return t->type_name; + } + } + return NULL; +} + +static int +find_head(char *yomi, char *freq, char *w) +{ + char buf[256]; + do { + if (!read_line(buf, 256, fp_in)) { + return -1; + } + } while (sscanf(buf, "%s %s %[^\n]",yomi, freq, w) != 3); + return 0; +} + +static void +load_dic(void) +{ + char yomi[256], freq[256], w[256]; + while (!find_head(yomi, freq, w)) { + char *wt = find_wt(); + if (wt) { + int ret; + ret = anthy_priv_dic_add_entry(yomi, w, wt, atoi(freq)); + if (ret == -1) { + printf("Failed to register %s\n", yomi); + }else { + printf("Word %s is registered as %s\n", yomi, wt); + } + } else { + printf("Failed to find the type of %s.\n", yomi); + } + } +} + +static void +print_version(void) +{ + printf("Anthy-dic-util "VERSION".\n"); + exit(0); +} + +static void +parse_args(int argc, char **argv) +{ + int i; + for (i = 1 ; i < argc ; i++) { + if (!strncmp(argv[i], "--", 2)) { + char *opt = &argv[i][2]; + if (!strcmp(opt, "help")) { + print_usage(); + } else if (!strcmp(opt, "version")){ + print_version(); + } else if (!strcmp(opt, "dump")) { + command = DUMP_DIC; + } else if (!strcmp(opt,"append") ){ + command = APPEND_DIC; + } else if (!strncmp(opt, "personality=", 12)) { + personality = &opt[12]; + } else if (!strcmp(opt, "utf8")) { + encoding = ANTHY_UTF8_ENCODING; + } else if (!strcmp(opt, "eucjp")) { + encoding = ANTHY_EUC_JP_ENCODING; + } else if (!strcmp(opt, "load")) { + command = LOAD_DIC; + } + }else{ + fn = argv[i]; + } + } +} + +static void +init_lib(void) +{ + anthy_dic_util_init(); + anthy_dic_util_set_encoding(encoding); + read_typetab(); +} + +int +main(int argc,char **argv) +{ + fp_in = stdin; + parse_args(argc, argv); + + switch (command) { + case DUMP_DIC: + init_lib(); + dump_dic(); + break; + case LOAD_DIC: + init_lib(); + anthy_priv_dic_delete(); + open_input_file(); + load_dic(); + break; + case APPEND_DIC: + init_lib(); + open_input_file(); + load_dic(); + break; + case UNSPEC: + default: + print_usage(); + } + return 0; +} -- cgit v1.2.1