summaryrefslogtreecommitdiff
path: root/src-util/dic-tool.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-util/dic-tool.c')
-rw-r--r--src-util/dic-tool.c448
1 files changed, 448 insertions, 0 deletions
diff --git a/src-util/dic-tool.c b/src-util/dic-tool.c
new file mode 100644
index 0000000..f5ce076
--- /dev/null
+++ b/src-util/dic-tool.c
@@ -0,0 +1,448 @@
+/*
+ * 辞書操作用のユーティリティコマンド
+ *
+ * 辞書のライブラリ内部の形式と外部の形式の相互変換を行う
+ * 外部形式は
+ * *読み 頻度 単語
+ * *品詞の変数1 = 値1
+ * *品詞の変数2 = 値2
+ * *...
+ * *<空行>
+ * になる
+ */
+/*
+ * Funded by IPA未踏ソフトウェア創造事業 2001 9/22
+ *
+ * Copyright (C) 2000-2007 TABATA Yusuke
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <anthy/anthy.h>
+#include <anthy/dicutil.h>
+/**/
+#include <anthy/xstr.h>
+#include "config.h"
+
+#define UNSPEC 0
+#define DUMP_DIC 1
+#define LOAD_DIC 2
+#define APPEND_DIC 3
+
+#define TYPETAB "typetab"
+#define USAGE_TEXT "dic-tool-usage.txt"
+
+#define USAGE \
+ "Anthy-dic-util [options]\n"\
+ " --help: Show this usage text\n"\
+ " --version: Show version\n"\
+ " --dump: Dump dictionary\n"\
+ " --load: Load dictionary\n"\
+ " --append: Append dictionary\n"\
+ " --utf8: Use utf8 encoding\n"\
+ " --personality=NAME: use NAME as a name of personality\n"
+
+
+static int command = UNSPEC;
+static int encoding = ANTHY_EUC_JP_ENCODING;
+static FILE *fp_in;
+static char *fn;
+static const char *personality = "";
+
/* A (variable name, value) pair; pairs are chained through `next`. */
struct var {
  struct var *next;
  char *var_name;
  char *val;
};

/* Table used to obtain a part-of-speech name from its parameters. */
struct trans_tab {
  struct trans_tab *next;
  /* internal name of the type (e.g. T35) */
  char *type_name;
  /* parameters that determine the type; only its `next` link is used as list head */
  struct var var_list;
} trans_tab_list;
+
+static void
+print_usage(void)
+{
+ printf(USAGE);
+ exit(0);
+}
+
+static FILE *
+open_typetab(void)
+{
+ FILE *fp;
+ char *fn;
+ fp = fopen(TYPETAB, "r");
+ if (fp) {
+ return fp;
+ }
+ fn = strdup(anthy_dic_util_get_anthydir());
+ fn = realloc(fn, strlen(fn) + strlen(TYPETAB) + 4);
+ strcat(fn, "/");
+ strcat(fn, TYPETAB);
+ fp = fopen(fn, "r");
+ return fp;
+}
+
+static FILE *
+open_usage_file(void)
+{
+ FILE *fp;
+ /* カレントディレクトリにある場合は、それを使用する */
+ fp = fopen(USAGE_TEXT, "r");
+ if (!fp) {
+ /* インストールされたものを使用 */
+ char *fn;
+ fn = strdup(anthy_dic_util_get_anthydir());
+ fn = realloc(fn, strlen(fn) + strlen(USAGE_TEXT) + 10);
+ strcat(fn, "/" USAGE_TEXT);
+ fp = fopen(fn, "r");
+ }
+ return fp;
+}
+
+static void
+print_usage_text(void)
+{
+ char buf[256];
+ FILE *fp = open_usage_file();
+ if (!fp) {
+ printf("# Anthy-dic-tool\n#\n");
+ return ;
+ }
+ fprintf(stdout, "#" PACKAGE " " VERSION "\n");
+ if (encoding == ANTHY_UTF8_ENCODING) {
+ } else {
+ }
+ /* そのままファイルの内容を出力 */
+ while (fgets(buf, 256, fp)) {
+ if (encoding == ANTHY_UTF8_ENCODING) {
+ char *s;
+ s = anthy_conv_euc_to_utf8(buf);
+ printf("%s", s);
+ free(s);
+ } else {
+ printf("%s", buf);
+ }
+ }
+ fclose(fp);
+}
+
/*
 * Read the next line that does not start with '#'.
 *
 * buf/len: destination buffer and its size, passed straight to fgets().
 * fp:      stream to read from.
 *
 * The trailing line terminator ("\n" or "\r\n") is removed.
 * Returns buf, or NULL when fgets() reports end of file or an error.
 */
static char *
read_line(char *buf, int len, FILE *fp)
{
  while (fgets(buf, len, fp)) {
    size_t l;

    if (buf[0] == '#') {
      continue;
    }
    /* drop the newline, then the carriage return of a CRLF pair */
    l = strlen(buf);
    if (l > 0 && buf[l - 1] == '\n') {
      buf[--l] = 0;
    }
    if (l > 0 && buf[l - 1] == '\r') {
      buf[--l] = 0;
    }
    return buf;
  }
  return NULL;
}
+
+static int
+read_typetab_var(struct var *head, FILE *fp, int table)
+{
+ char buf[256];
+ char var[256], eq[256], val[256];
+ struct var *v;
+ if (!read_line(buf, 256, fp)) {
+ return -1;
+ }
+ if (sscanf(buf, "%s %s %s", var, eq, val) != 3) {
+ return -1;
+ }
+
+ v = malloc(sizeof(struct var));
+ if (encoding == ANTHY_UTF8_ENCODING && table) {
+ /* UTF-8 */
+ v->var_name = anthy_conv_euc_to_utf8(var);
+ v->val = anthy_conv_euc_to_utf8(val);
+ } else {
+ /* do not change */
+ v->var_name = strdup(var);
+ v->val = strdup(val);
+ }
+
+ /* リストにつなぐ */
+ v->next = head->next;
+ head->next = v;
+
+ return 0;
+}
+
+static int
+read_typetab_entry(FILE *fp)
+{
+ char buf[256], type_name[257];
+ char *res;
+ struct trans_tab *t;
+ /* 一行目の品詞名を読む */
+ do {
+ res = read_line(buf, 256, fp);
+ if (!res) {
+ return -1;
+ }
+ } while (res[0] == '#' || res[0] == 0);
+ t = malloc(sizeof(struct trans_tab));
+ sprintf(type_name, "#%s", buf);
+ t->type_name = strdup(type_name);
+ t->var_list.next = 0;
+ /* パラメータを読む */
+ while(!read_typetab_var(&t->var_list, fp, 1));
+ /* リストにつなぐ */
+ t->next = trans_tab_list.next;
+ trans_tab_list.next = t;
+ return 0;
+}
+
/*
 * Load the whole type table into trans_tab_list.
 *
 * Prints a message and exits with status 1 when the table cannot be
 * opened. The stream is closed once every entry has been read.
 */
static void
read_typetab(void)
{
  FILE *fp = open_typetab();

  if (!fp) {
    printf("Failed to open type table.\n");
    exit(1);
  }
  while (read_typetab_entry(fp) == 0) {
    /* nothing: each call adds one entry */
  }
  fclose(fp);
}
+
+static struct trans_tab *
+find_trans_tab_by_name(char *name)
+{
+ struct trans_tab *t;
+ for (t = trans_tab_list.next; t; t = t->next) {
+ if (!strcmp(t->type_name, name)) {
+ return t;
+ }
+ }
+ return NULL;
+}
+
+static void
+print_word_type(struct trans_tab *t)
+{
+ struct var *v;
+ for (v = t->var_list.next; v; v = v->next) {
+ printf("%s\t=\t%s\n", v->var_name, v->val);
+ }
+}
+
/*
 * Dump the private dictionary to stdout in the external text format.
 *
 * After the usage header, each entry whose index, word type and word can
 * all be fetched is printed as "index freq word", followed by the
 * parameters of its word type and a blank line. Entries whose word type is
 * not found in the type table produce a '#' comment line instead.
 */
static void
dump_dic(void)
{
  print_usage_text();
  if (anthy_priv_dic_select_first_entry() == -1) {
    printf("# Failed to read private dictionary\n"
           "# There are no words or error occurred?\n"
           "#\n");
    return;
  }

  do {
    char idx[100], wt[100], w[100];
    struct trans_tab *t;
    int freq;

    /* skip the entry unless all three fields are available */
    if (!anthy_priv_dic_get_index(idx, 100) ||
        !anthy_priv_dic_get_wtype(wt, 100) ||
        !anthy_priv_dic_get_word(w, 100)) {
      continue;
    }

    freq = anthy_priv_dic_get_freq();
    t = find_trans_tab_by_name(wt);
    if (!t) {
      printf("# Failed to determine word type of %s(%s).\n", w, wt);
      continue;
    }
    printf("%s %d %s\n", idx, freq, w);
    print_word_type(t);
    printf("\n");
  } while (anthy_priv_dic_select_next_entry() == 0);
}
+
+static void
+open_input_file(void)
+{
+ if (!fn) {
+ fp_in = stdin;
+ } else {
+ fp_in = fopen(fn, "r");
+ if (!fp_in) {
+ exit(1);
+ }
+ }
+}
+
+/* vが sの中にあるか */
+static int
+match_var(struct var *v, struct var *s)
+{
+ struct var *i;
+ for (i = s->next; i; i = i->next) {
+ if (!strcmp(v->var_name, i->var_name) &&
+ !strcmp(v->val, i->val)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/* v1がv2の部分集合かどうか */
+static int
+var_list_subset_p(struct var *v1, struct var *v2)
+{
+ struct var *v;
+ for (v = v1->next; v; v = v->next) {
+ if (!match_var(v, v2)) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static char *
+find_wt(void)
+{
+ struct var v;
+ struct trans_tab *t;
+ v.next = 0;
+ while(!read_typetab_var(&v, fp_in, 0));
+ for (t = trans_tab_list.next; t; t = t->next) {
+ if (var_list_subset_p(&t->var_list, &v) &&
+ var_list_subset_p(&v, &t->var_list)) {
+ return t->type_name;
+ }
+ }
+ return NULL;
+}
+
+static int
+find_head(char *yomi, char *freq, char *w)
+{
+ char buf[256];
+ do {
+ if (!read_line(buf, 256, fp_in)) {
+ return -1;
+ }
+ } while (sscanf(buf, "%s %s %[^\n]",yomi, freq, w) != 3);
+ return 0;
+}
+
/*
 * Read entries from fp_in until the input ends and register each one in
 * the private dictionary, printing one result line per entry on stdout.
 */
static void
load_dic(void)
{
  char yomi[256], freq[256], w[256];

  while (find_head(yomi, freq, w) == 0) {
    char *wt = find_wt();

    if (!wt) {
      printf("Failed to find the type of %s.\n", yomi);
      continue;
    }
    if (anthy_priv_dic_add_entry(yomi, w, wt, atoi(freq)) == -1) {
      printf("Failed to register %s\n", yomi);
    } else {
      printf("Word %s is registered as %s\n", yomi, wt);
    }
  }
}
+
+static void
+print_version(void)
+{
+ printf("Anthy-dic-util "VERSION".\n");
+ exit(0);
+}
+
+static void
+parse_args(int argc, char **argv)
+{
+ int i;
+ for (i = 1 ; i < argc ; i++) {
+ if (!strncmp(argv[i], "--", 2)) {
+ char *opt = &argv[i][2];
+ if (!strcmp(opt, "help")) {
+ print_usage();
+ } else if (!strcmp(opt, "version")){
+ print_version();
+ } else if (!strcmp(opt, "dump")) {
+ command = DUMP_DIC;
+ } else if (!strcmp(opt,"append") ){
+ command = APPEND_DIC;
+ } else if (!strncmp(opt, "personality=", 12)) {
+ personality = &opt[12];
+ } else if (!strcmp(opt, "utf8")) {
+ encoding = ANTHY_UTF8_ENCODING;
+ } else if (!strcmp(opt, "eucjp")) {
+ encoding = ANTHY_EUC_JP_ENCODING;
+ } else if (!strcmp(opt, "load")) {
+ command = LOAD_DIC;
+ }
+ }else{
+ fn = argv[i];
+ }
+ }
+}
+
+static void
+init_lib(void)
+{
+ anthy_dic_util_init();
+ anthy_dic_util_set_encoding(encoding);
+ read_typetab();
+}
+
+int
+main(int argc,char **argv)
+{
+ fp_in = stdin;
+ parse_args(argc, argv);
+
+ switch (command) {
+ case DUMP_DIC:
+ init_lib();
+ dump_dic();
+ break;
+ case LOAD_DIC:
+ init_lib();
+ anthy_priv_dic_delete();
+ open_input_file();
+ load_dic();
+ break;
+ case APPEND_DIC:
+ init_lib();
+ open_input_file();
+ load_dic();
+ break;
+ case UNSPEC:
+ default:
+ print_usage();
+ }
+ return 0;
+}