diff options
Diffstat (limited to 'src-worddic/dic_util.c')
-rw-r--r-- | src-worddic/dic_util.c | 594 |
1 files changed, 594 insertions, 0 deletions
diff --git a/src-worddic/dic_util.c b/src-worddic/dic_util.c new file mode 100644 index 0000000..e03a801 --- /dev/null +++ b/src-worddic/dic_util.c @@ -0,0 +1,594 @@ +/* + * 個人辞書管理用の関数群 + * + * 互換性の都合で + * utf8の辞書はtextdict + * eucjpの辞書はtexttrie + * およびrecordを使ってて混乱しまくり + * textdictへ移行する + * + * 開発予定 + * + * 新規登録はtextdictに対して行うようにする <- todo + * texttrieの単語は移行するようにする + * record関係は消す + * + * + * Funded by IPA未踏ソフトウェア創造事業 2001 10/24 + * + * Copyright (C) 2001-2007 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <anthy/anthy.h> +#include <anthy/conf.h> +#include <anthy/dic.h> +#include <anthy/texttrie.h> +#include <anthy/textdict.h> +#include <anthy/dicutil.h> + +#include "dic_main.h" +#include "dic_personality.h" + +/* + * 個人辞書はtexttrie中に格納されるとき + * 「 見出し 数字」 -> 「#品詞*頻度 単語」という形式をとる + * (UTF8の場合は「 p見出し 数字」 -> 「#品詞*頻度 単語」) + * 最初の2文字の空白は単語情報のセクションであることを意味し、 + * 数字の部分は同音語を区別するために用いられる。 + * + */ + +/* UTF8で32文字 x 3bytes */ +#define MAX_KEY_LEN 96 + +static int gIsInit; +static int dic_util_encoding; + +extern struct text_trie *anthy_private_tt_dic; +extern struct textdict *anthy_private_text_dic; +/* 現在選択されている読み */ +static struct iterate_contex { + /**/ + int in_tt; + /* texttrie */ + char key_buf[MAX_KEY_LEN+32]; + /* textdictの検索用 */ + int dicfile_offset; + char *current_index; + char *current_line; +} word_iterator; +/**/ +struct scan_context { + const char *yomi; + const char *word; + const char *wt_name; + int offset; + int found_word; +}; + +static void +set_current_line(const char *index, const char *line) +{ + if (word_iterator.current_line) { + free(word_iterator.current_line); + word_iterator.current_line = NULL; + } + if (line) { + word_iterator.current_line = strdup(line); + } + if (word_iterator.current_index) { + free(word_iterator.current_index); + word_iterator.current_index = NULL; + } + if (index) { + word_iterator.current_index = strdup(index); + } +} + +/** 個人辞書ライブラリを初期化する */ +void +anthy_dic_util_init(void) +{ + if (gIsInit) { + return ; + } + if (anthy_init_dic() == -1) { + return ; + } + anthy_dic_set_personality("default"); + gIsInit = 1; + dic_util_encoding = ANTHY_EUC_JP_ENCODING; + /**/ + word_iterator.key_buf[0] = 0; + word_iterator.in_tt = 1; +} + +/** 辞書ライブラリを解放する */ +void +anthy_dic_util_quit(void) +{ + if (gIsInit) { + anthy_quit_dic(); + } + set_current_line(NULL, NULL); + gIsInit = 0; +} + +/** 辞書ユーティリティAPIのエンコーディングを設定する */ +int +anthy_dic_util_set_encoding(int enc) +{ + if (enc == ANTHY_UTF8_ENCODING || + enc == ANTHY_EUC_JP_ENCODING) { + dic_util_encoding = enc; + } + return dic_util_encoding; +} + +void +anthy_dic_util_set_personality(const char *id) +{ + anthy_dic_set_personality(id); +} + +static char * +find_next_key(const char *prefix) +{ + char *v; + v = anthy_trie_find_next_key(anthy_private_tt_dic, + word_iterator.key_buf, MAX_KEY_LEN+32); + + if (v && v[0] == prefix[0] && v[1] == prefix[1]) { + /* 次のkeyも指定されたprefixを持っている */ + return v; + } + /**/ + sprintf(word_iterator.key_buf, "%s", prefix); + return NULL; +} + +static void +delete_prefix(const char *prefix) +{ + sprintf(word_iterator.key_buf, "%s", prefix); + anthy_priv_dic_lock(); + /* word_iterator.key_bufがprefixの文字列であれば、find_next_key()は + 最初の単語を返す */ + while (find_next_key(prefix)) { + anthy_trie_delete(anthy_private_tt_dic, word_iterator.key_buf); + sprintf(word_iterator.key_buf, "%s", prefix); + } + anthy_priv_dic_unlock(); +} + +static const char * +encoding_prefix(int encoding) +{ + if (encoding == ANTHY_UTF8_ENCODING) { + return " p"; + } + /* EUC-JP */ + return " "; +} + +/** (API) 個人辞書を全部消す */ +void +anthy_priv_dic_delete(void) +{ + delete_prefix(encoding_prefix(ANTHY_EUC_JP_ENCODING)); + /**/ + while (!anthy_textdict_delete_line(anthy_private_text_dic, 0)) { + /**/ + } +} + +static int +scan_one_word_cb(void *p, int next_offset, const char *key, const char *n) +{ + (void)p; + set_current_line(key, n); + word_iterator.dicfile_offset = next_offset; + return -1; +} + +static int +select_first_entry_in_textdict(void) +{ + word_iterator.dicfile_offset = 0; + set_current_line(NULL, NULL); + anthy_textdict_scan(anthy_private_text_dic, + word_iterator.dicfile_offset, NULL, + scan_one_word_cb); + if (word_iterator.current_line) { + word_iterator.in_tt = 0; + return 0; + } + /* 単語が無い */ + return ANTHY_DIC_UTIL_ERROR; +} + +/** (API) 最初の単語を選択する */ +int +anthy_priv_dic_select_first_entry(void) +{ + if (dic_util_encoding == ANTHY_UTF8_ENCODING) { + return select_first_entry_in_textdict(); + } + if (anthy_private_tt_dic) { + sprintf(word_iterator.key_buf, "%s", encoding_prefix(dic_util_encoding)); + /* prefixの次のエントリが最初のエントリ */ + if (find_next_key(encoding_prefix(dic_util_encoding))) { + word_iterator.in_tt = 1; + return 0; + } + } + /* 単語が無いのでtextdictに移動を試みる */ + return select_first_entry_in_textdict(); +} + +/** (API) 現在選択されている単語の次の単語を選択する */ +int +anthy_priv_dic_select_next_entry(void) +{ + if (!word_iterator.in_tt) { + set_current_line(NULL, NULL); + anthy_textdict_scan(anthy_private_text_dic, word_iterator.dicfile_offset, + NULL, + scan_one_word_cb); + if (word_iterator.current_line) { + return 0; + } + return ANTHY_DIC_UTIL_ERROR; + } + if (find_next_key(encoding_prefix(dic_util_encoding))) { + return 0; + } + /* 単語が無いのでtextdictに移動を試みる */ + return select_first_entry_in_textdict(); +} + +/** 未実装 */ +int +anthy_priv_dic_select_entry(const char *index) +{ + (void)index; + return 0; +} + +/** 現在選択されている単語の読みをを取得する */ +char * +anthy_priv_dic_get_index(char *buf, int len) +{ + int i; + char *src_buf; + if (word_iterator.in_tt) { + src_buf = &word_iterator.key_buf[2]; + } else { + src_buf = word_iterator.current_index; + } + if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) { + /**/ + src_buf = anthy_conv_utf8_to_euc(src_buf); + } else { + src_buf = strdup(src_buf); + } + /* 最初の空白か\0までをコピーする */ + for (i = 0; src_buf[i] && src_buf[i] != ' '; i++) { + if (i >= len - 1) { + free(src_buf); + return NULL; + } + buf[i] = src_buf[i]; + } + buf[i] = 0; + free(src_buf); + return buf; +} + +/** 現在選択されている単語の頻度を取得する */ +int +anthy_priv_dic_get_freq(void) +{ + struct word_line res; + char *v; + if (word_iterator.in_tt) { + v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf); + anthy_parse_word_line(v, &res); + free(v); + } else { + anthy_parse_word_line(word_iterator.current_line, &res); + } + return res.freq; +} + +/** 現在選択されている単語の品詞を取得する */ +char * +anthy_priv_dic_get_wtype(char *buf, int len) +{ + struct word_line res; + char *v; + if (word_iterator.in_tt) { + v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf); + anthy_parse_word_line(v, &res); + free(v); + } else { + anthy_parse_word_line(word_iterator.current_line, &res); + } + if (len - 1 < (int)strlen(res.wt)) { + return NULL; + } + sprintf(buf, "%s", res.wt); + return buf; +} + +/** 現在選択されている単語を取得する */ +char * +anthy_priv_dic_get_word(char *buf, int len) +{ + char *v; + char *s; + if (word_iterator.in_tt) { + v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf); + } else { + v = word_iterator.current_line; + } + if (!v) { + return NULL; + } + /* 品詞の後ろにある単語を取り出す */ + s = strchr(v, ' '); + s++; + if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) { + s = anthy_conv_utf8_to_euc(s); + snprintf(buf, len, "%s", s); + free(s); + } else { + snprintf(buf, len, "%s", s); + } + if (word_iterator.in_tt) { + free(v); + } + return buf; +} + +static int +find_cb(void *p, int next_offset, const char *key, const char *n) +{ + struct scan_context *sc = p; + struct word_line res; + if (strcmp(key, sc->yomi)) { + sc->offset = next_offset; + return 0; + } + anthy_parse_word_line(n, &res); + if (!strcmp(res.wt, sc->wt_name) && + !strcmp(res.word, sc->word)) { + sc->found_word = 1; + return -1; + } + sc->offset = next_offset; + return 0; +} + +static int +order_cb(void *p, int next_offset, const char *key, const char *n) +{ + struct scan_context *sc = p; + (void)n; + if (strcmp(key, sc->yomi) >= 0) { + sc->found_word = 1; + return -1; + } + sc->offset = next_offset; + return 0; +} + +/* 引数はutf8 */ +static int +do_add_word_to_textdict(struct textdict *td, int offset, + const char *yomi, const char *word, + const char *wt_name, int freq) +{ + char *buf = malloc(strlen(yomi) + strlen(word) + strlen(wt_name) + 20); + int rv; + if (!buf) { + return -1; + } + sprintf(buf, "%s %s*%d %s\n", yomi, wt_name, freq, word); + rv = anthy_textdict_insert_line(td, offset, buf); + free(buf); + return rv; +} + +static int +dup_word_check(const char *v, const char *word, const char *wt) +{ + struct word_line res; + + if (anthy_parse_word_line(v, &res)) { + return 0; + } + + /* 読みと単語を比較する */ + if (!strcmp(res.wt, wt) && + !strcmp(res.word, word)) { + return 1; + } + return 0; +} + +static int +find_same_word(char *idx_buf, const char *yomi, + const char *word, const char *wt_name, int yomi_len) +{ + int found = 0; + sprintf(idx_buf, "%s%s ", + encoding_prefix(dic_util_encoding), + yomi); + anthy_trie_find_next_key(anthy_private_tt_dic, + idx_buf, yomi_len + 12); + + /* trieのインデックスを探す */ + do { + char *v; + if (strncmp(&idx_buf[2], yomi, yomi_len) || + idx_buf[yomi_len+2] != ' ') { + /* 見出語が異なるのでループ終了 */ + break; + } + /* texttrieにアクセスして、見出語以外も一致しているかをチェック */ + v = anthy_trie_find(anthy_private_tt_dic, idx_buf); + if (v) { + found = dup_word_check(v, word, wt_name); + free(v); + if (found) { + break; + } + } + } while (anthy_trie_find_next_key(anthy_private_tt_dic, + idx_buf, yomi_len + 12)); + + return found; +} + +static int +add_word_to_textdict(const char *yomi, const char *word, + const char *wt_name, int freq) +{ + struct scan_context sc; + int rv; + int yomi_len = strlen(yomi); + + if (yomi_len > MAX_KEY_LEN || yomi_len == 0) { + return ANTHY_DIC_UTIL_ERROR; + } + + if (wt_name[0] != '#') { + return ANTHY_DIC_UTIL_ERROR; + } + + /* texttrieにあれば消す */ + if (anthy_private_tt_dic) { + char *idx_buf = malloc(yomi_len + 12); + if (find_same_word(idx_buf, yomi, word, wt_name, yomi_len)) { + anthy_trie_delete(anthy_private_tt_dic, idx_buf); + } + free(idx_buf); + } + + /* 同じ物があったら消す */ + sc.yomi = yomi; + sc.word = word; + sc.wt_name = wt_name; + /**/ + sc.offset = 0; + sc.found_word = 0; + anthy_textdict_scan(anthy_private_text_dic, 0, &sc, + find_cb); + if (sc.found_word == 1) { + anthy_textdict_delete_line(anthy_private_text_dic, sc.offset); + } + if (freq == 0) { + return ANTHY_DIC_UTIL_OK; + } + /* 追加する場所を探す */ + sc.offset = 0; + sc.found_word = 0; + anthy_textdict_scan(anthy_private_text_dic, 0, &sc, + order_cb); + /* 追加する */ + rv = do_add_word_to_textdict(anthy_private_text_dic, sc.offset, + yomi, word, wt_name, freq); + if (!rv) { + return ANTHY_DIC_UTIL_OK; + } + return ANTHY_DIC_UTIL_ERROR; +} + +/** 単語を登録する + * 頻度が0の場合は削除 + */ +int +anthy_priv_dic_add_entry(const char *yomi, const char *word, + const char *wt_name, int freq) +{ + if (dic_util_encoding == ANTHY_UTF8_ENCODING) { + return add_word_to_textdict(yomi, word, wt_name, freq); + } else { + int rv; + char *yomi_utf8 = anthy_conv_euc_to_utf8(yomi); + char *word_utf8 = anthy_conv_euc_to_utf8(word); + rv = add_word_to_textdict(yomi_utf8, word_utf8, wt_name, freq); + free(yomi_utf8); + free(word_utf8); + return rv; + } +} + +const char * +anthy_dic_util_get_anthydir(void) +{ + return anthy_conf_get_str("ANTHYDIR"); +} + +/* lookコマンドの辞書を検索するための関数 */ +static char * +do_search(FILE *fp, const char *word) +{ + char buf[32]; + char *res = NULL; + int word_len = strlen(word); + while (fgets(buf, 32, fp)) { + int len = strlen(buf); + buf[len - 1] = 0; + len --; + if (len > word_len) { + continue; + } + if (!strncasecmp(buf, word, len)) { + if (res) { + free(res); + } + res = strdup(buf); + } + } + return res; +} + +/* lookコマンドの辞書を検索するAPI */ +char * +anthy_dic_search_words_file(const char *word) +{ + FILE *fp; + char *res; + const char *words_dict_fn = anthy_conf_get_str("WORDS_FILE"); + if (!words_dict_fn) { + return NULL; + } + fp = fopen(words_dict_fn, "r"); + if (!fp) { + return NULL; + } + res = do_search(fp, word); + fclose(fp); + return res; +} |