summaryrefslogtreecommitdiff
path: root/src-worddic/dic_util.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-worddic/dic_util.c')
-rw-r--r--src-worddic/dic_util.c594
1 files changed, 594 insertions, 0 deletions
diff --git a/src-worddic/dic_util.c b/src-worddic/dic_util.c
new file mode 100644
index 0000000..e03a801
--- /dev/null
+++ b/src-worddic/dic_util.c
@@ -0,0 +1,594 @@
+/*
+ * 個人辞書管理用の関数群
+ *
+ * 互換性の都合で
+ * utf8の辞書はtextdict
+ * eucjpの辞書はtexttrie
+ * およびrecordを使ってて混乱しまくり
+ * textdictへ移行する
+ *
+ * 開発予定
+ *
+ * 新規登録はtextdictに対して行うようにする <- todo
+ * texttrieの単語は移行するようにする
+ * record関係は消す
+ *
+ *
+ * Funded by IPA未踏ソフトウェア創造事業 2001 10/24
+ *
+ * Copyright (C) 2001-2007 TABATA Yusuke
+ *
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <anthy/anthy.h>
+#include <anthy/conf.h>
+#include <anthy/dic.h>
+#include <anthy/texttrie.h>
+#include <anthy/textdict.h>
+#include <anthy/dicutil.h>
+
+#include "dic_main.h"
+#include "dic_personality.h"
+
+/*
+ * 個人辞書はtexttrie中に格納されるとき
+ * 「 見出し 数字」 -> 「#品詞*頻度 単語」という形式をとる
+ * (UTF8の場合は「 p見出し 数字」 -> 「#品詞*頻度 単語」)
+ * 最初の2文字の空白は単語情報のセクションであることを意味し、
+ * 数字の部分は同音語を区別するために用いられる。
+ *
+ */
+
+/* UTF8で32文字 x 3bytes */
+#define MAX_KEY_LEN 96
+
+static int gIsInit;
+static int dic_util_encoding;
+
+extern struct text_trie *anthy_private_tt_dic;
+extern struct textdict *anthy_private_text_dic;
+/* 現在選択されている読み */
+static struct iterate_contex {
+ /**/
+ int in_tt;
+ /* texttrie */
+ char key_buf[MAX_KEY_LEN+32];
+ /* textdictの検索用 */
+ int dicfile_offset;
+ char *current_index;
+ char *current_line;
+} word_iterator;
+/**/
+struct scan_context {
+ const char *yomi;
+ const char *word;
+ const char *wt_name;
+ int offset;
+ int found_word;
+};
+
+static void
+set_current_line(const char *index, const char *line)
+{
+ if (word_iterator.current_line) {
+ free(word_iterator.current_line);
+ word_iterator.current_line = NULL;
+ }
+ if (line) {
+ word_iterator.current_line = strdup(line);
+ }
+ if (word_iterator.current_index) {
+ free(word_iterator.current_index);
+ word_iterator.current_index = NULL;
+ }
+ if (index) {
+ word_iterator.current_index = strdup(index);
+ }
+}
+
+/** 個人辞書ライブラリを初期化する */
+void
+anthy_dic_util_init(void)
+{
+ if (gIsInit) {
+ return ;
+ }
+ if (anthy_init_dic() == -1) {
+ return ;
+ }
+ anthy_dic_set_personality("default");
+ gIsInit = 1;
+ dic_util_encoding = ANTHY_EUC_JP_ENCODING;
+ /**/
+ word_iterator.key_buf[0] = 0;
+ word_iterator.in_tt = 1;
+}
+
+/** 辞書ライブラリを解放する */
+void
+anthy_dic_util_quit(void)
+{
+ if (gIsInit) {
+ anthy_quit_dic();
+ }
+ set_current_line(NULL, NULL);
+ gIsInit = 0;
+}
+
+/** 辞書ユーティリティAPIのエンコーディングを設定する */
+int
+anthy_dic_util_set_encoding(int enc)
+{
+ if (enc == ANTHY_UTF8_ENCODING ||
+ enc == ANTHY_EUC_JP_ENCODING) {
+ dic_util_encoding = enc;
+ }
+ return dic_util_encoding;
+}
+
+void
+anthy_dic_util_set_personality(const char *id)
+{
+ anthy_dic_set_personality(id);
+}
+
+static char *
+find_next_key(const char *prefix)
+{
+ char *v;
+ v = anthy_trie_find_next_key(anthy_private_tt_dic,
+ word_iterator.key_buf, MAX_KEY_LEN+32);
+
+ if (v && v[0] == prefix[0] && v[1] == prefix[1]) {
+ /* 次のkeyも指定されたprefixを持っている */
+ return v;
+ }
+ /**/
+ sprintf(word_iterator.key_buf, "%s", prefix);
+ return NULL;
+}
+
+static void
+delete_prefix(const char *prefix)
+{
+ sprintf(word_iterator.key_buf, "%s", prefix);
+ anthy_priv_dic_lock();
+ /* word_iterator.key_bufがprefixの文字列であれば、find_next_key()は
+ 最初の単語を返す */
+ while (find_next_key(prefix)) {
+ anthy_trie_delete(anthy_private_tt_dic, word_iterator.key_buf);
+ sprintf(word_iterator.key_buf, "%s", prefix);
+ }
+ anthy_priv_dic_unlock();
+}
+
+static const char *
+encoding_prefix(int encoding)
+{
+ if (encoding == ANTHY_UTF8_ENCODING) {
+ return " p";
+ }
+ /* EUC-JP */
+ return " ";
+}
+
+/** (API) 個人辞書を全部消す */
+void
+anthy_priv_dic_delete(void)
+{
+ delete_prefix(encoding_prefix(ANTHY_EUC_JP_ENCODING));
+ /**/
+ while (!anthy_textdict_delete_line(anthy_private_text_dic, 0)) {
+ /**/
+ }
+}
+
+static int
+scan_one_word_cb(void *p, int next_offset, const char *key, const char *n)
+{
+ (void)p;
+ set_current_line(key, n);
+ word_iterator.dicfile_offset = next_offset;
+ return -1;
+}
+
+static int
+select_first_entry_in_textdict(void)
+{
+ word_iterator.dicfile_offset = 0;
+ set_current_line(NULL, NULL);
+ anthy_textdict_scan(anthy_private_text_dic,
+ word_iterator.dicfile_offset, NULL,
+ scan_one_word_cb);
+ if (word_iterator.current_line) {
+ word_iterator.in_tt = 0;
+ return 0;
+ }
+ /* 単語が無い */
+ return ANTHY_DIC_UTIL_ERROR;
+}
+
+/** (API) 最初の単語を選択する */
+int
+anthy_priv_dic_select_first_entry(void)
+{
+ if (dic_util_encoding == ANTHY_UTF8_ENCODING) {
+ return select_first_entry_in_textdict();
+ }
+ if (anthy_private_tt_dic) {
+ sprintf(word_iterator.key_buf, "%s", encoding_prefix(dic_util_encoding));
+ /* prefixの次のエントリが最初のエントリ */
+ if (find_next_key(encoding_prefix(dic_util_encoding))) {
+ word_iterator.in_tt = 1;
+ return 0;
+ }
+ }
+ /* 単語が無いのでtextdictに移動を試みる */
+ return select_first_entry_in_textdict();
+}
+
+/** (API) 現在選択されている単語の次の単語を選択する */
+int
+anthy_priv_dic_select_next_entry(void)
+{
+ if (!word_iterator.in_tt) {
+ set_current_line(NULL, NULL);
+ anthy_textdict_scan(anthy_private_text_dic, word_iterator.dicfile_offset,
+ NULL,
+ scan_one_word_cb);
+ if (word_iterator.current_line) {
+ return 0;
+ }
+ return ANTHY_DIC_UTIL_ERROR;
+ }
+ if (find_next_key(encoding_prefix(dic_util_encoding))) {
+ return 0;
+ }
+ /* 単語が無いのでtextdictに移動を試みる */
+ return select_first_entry_in_textdict();
+}
+
+/** 未実装 */
+int
+anthy_priv_dic_select_entry(const char *index)
+{
+ (void)index;
+ return 0;
+}
+
+/** 現在選択されている単語の読みをを取得する */
+char *
+anthy_priv_dic_get_index(char *buf, int len)
+{
+ int i;
+ char *src_buf;
+ if (word_iterator.in_tt) {
+ src_buf = &word_iterator.key_buf[2];
+ } else {
+ src_buf = word_iterator.current_index;
+ }
+ if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) {
+ /**/
+ src_buf = anthy_conv_utf8_to_euc(src_buf);
+ } else {
+ src_buf = strdup(src_buf);
+ }
+ /* 最初の空白か\0までをコピーする */
+ for (i = 0; src_buf[i] && src_buf[i] != ' '; i++) {
+ if (i >= len - 1) {
+ free(src_buf);
+ return NULL;
+ }
+ buf[i] = src_buf[i];
+ }
+ buf[i] = 0;
+ free(src_buf);
+ return buf;
+}
+
+/** 現在選択されている単語の頻度を取得する */
+int
+anthy_priv_dic_get_freq(void)
+{
+ struct word_line res;
+ char *v;
+ if (word_iterator.in_tt) {
+ v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
+ anthy_parse_word_line(v, &res);
+ free(v);
+ } else {
+ anthy_parse_word_line(word_iterator.current_line, &res);
+ }
+ return res.freq;
+}
+
+/** 現在選択されている単語の品詞を取得する */
+char *
+anthy_priv_dic_get_wtype(char *buf, int len)
+{
+ struct word_line res;
+ char *v;
+ if (word_iterator.in_tt) {
+ v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
+ anthy_parse_word_line(v, &res);
+ free(v);
+ } else {
+ anthy_parse_word_line(word_iterator.current_line, &res);
+ }
+ if (len - 1 < (int)strlen(res.wt)) {
+ return NULL;
+ }
+ sprintf(buf, "%s", res.wt);
+ return buf;
+}
+
+/** 現在選択されている単語を取得する */
+char *
+anthy_priv_dic_get_word(char *buf, int len)
+{
+ char *v;
+ char *s;
+ if (word_iterator.in_tt) {
+ v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
+ } else {
+ v = word_iterator.current_line;
+ }
+ if (!v) {
+ return NULL;
+ }
+ /* 品詞の後ろにある単語を取り出す */
+ s = strchr(v, ' ');
+ s++;
+ if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) {
+ s = anthy_conv_utf8_to_euc(s);
+ snprintf(buf, len, "%s", s);
+ free(s);
+ } else {
+ snprintf(buf, len, "%s", s);
+ }
+ if (word_iterator.in_tt) {
+ free(v);
+ }
+ return buf;
+}
+
+static int
+find_cb(void *p, int next_offset, const char *key, const char *n)
+{
+ struct scan_context *sc = p;
+ struct word_line res;
+ if (strcmp(key, sc->yomi)) {
+ sc->offset = next_offset;
+ return 0;
+ }
+ anthy_parse_word_line(n, &res);
+ if (!strcmp(res.wt, sc->wt_name) &&
+ !strcmp(res.word, sc->word)) {
+ sc->found_word = 1;
+ return -1;
+ }
+ sc->offset = next_offset;
+ return 0;
+}
+
+static int
+order_cb(void *p, int next_offset, const char *key, const char *n)
+{
+ struct scan_context *sc = p;
+ (void)n;
+ if (strcmp(key, sc->yomi) >= 0) {
+ sc->found_word = 1;
+ return -1;
+ }
+ sc->offset = next_offset;
+ return 0;
+}
+
+/* 引数はutf8 */
+static int
+do_add_word_to_textdict(struct textdict *td, int offset,
+ const char *yomi, const char *word,
+ const char *wt_name, int freq)
+{
+ char *buf = malloc(strlen(yomi) + strlen(word) + strlen(wt_name) + 20);
+ int rv;
+ if (!buf) {
+ return -1;
+ }
+ sprintf(buf, "%s %s*%d %s\n", yomi, wt_name, freq, word);
+ rv = anthy_textdict_insert_line(td, offset, buf);
+ free(buf);
+ return rv;
+}
+
+static int
+dup_word_check(const char *v, const char *word, const char *wt)
+{
+ struct word_line res;
+
+ if (anthy_parse_word_line(v, &res)) {
+ return 0;
+ }
+
+ /* 読みと単語を比較する */
+ if (!strcmp(res.wt, wt) &&
+ !strcmp(res.word, word)) {
+ return 1;
+ }
+ return 0;
+}
+
+static int
+find_same_word(char *idx_buf, const char *yomi,
+ const char *word, const char *wt_name, int yomi_len)
+{
+ int found = 0;
+ sprintf(idx_buf, "%s%s ",
+ encoding_prefix(dic_util_encoding),
+ yomi);
+ anthy_trie_find_next_key(anthy_private_tt_dic,
+ idx_buf, yomi_len + 12);
+
+ /* trieのインデックスを探す */
+ do {
+ char *v;
+ if (strncmp(&idx_buf[2], yomi, yomi_len) ||
+ idx_buf[yomi_len+2] != ' ') {
+ /* 見出語が異なるのでループ終了 */
+ break;
+ }
+ /* texttrieにアクセスして、見出語以外も一致しているかをチェック */
+ v = anthy_trie_find(anthy_private_tt_dic, idx_buf);
+ if (v) {
+ found = dup_word_check(v, word, wt_name);
+ free(v);
+ if (found) {
+ break;
+ }
+ }
+ } while (anthy_trie_find_next_key(anthy_private_tt_dic,
+ idx_buf, yomi_len + 12));
+
+ return found;
+}
+
+static int
+add_word_to_textdict(const char *yomi, const char *word,
+ const char *wt_name, int freq)
+{
+ struct scan_context sc;
+ int rv;
+ int yomi_len = strlen(yomi);
+
+ if (yomi_len > MAX_KEY_LEN || yomi_len == 0) {
+ return ANTHY_DIC_UTIL_ERROR;
+ }
+
+ if (wt_name[0] != '#') {
+ return ANTHY_DIC_UTIL_ERROR;
+ }
+
+ /* texttrieにあれば消す */
+ if (anthy_private_tt_dic) {
+ char *idx_buf = malloc(yomi_len + 12);
+ if (find_same_word(idx_buf, yomi, word, wt_name, yomi_len)) {
+ anthy_trie_delete(anthy_private_tt_dic, idx_buf);
+ }
+ free(idx_buf);
+ }
+
+ /* 同じ物があったら消す */
+ sc.yomi = yomi;
+ sc.word = word;
+ sc.wt_name = wt_name;
+ /**/
+ sc.offset = 0;
+ sc.found_word = 0;
+ anthy_textdict_scan(anthy_private_text_dic, 0, &sc,
+ find_cb);
+ if (sc.found_word == 1) {
+ anthy_textdict_delete_line(anthy_private_text_dic, sc.offset);
+ }
+ if (freq == 0) {
+ return ANTHY_DIC_UTIL_OK;
+ }
+ /* 追加する場所を探す */
+ sc.offset = 0;
+ sc.found_word = 0;
+ anthy_textdict_scan(anthy_private_text_dic, 0, &sc,
+ order_cb);
+ /* 追加する */
+ rv = do_add_word_to_textdict(anthy_private_text_dic, sc.offset,
+ yomi, word, wt_name, freq);
+ if (!rv) {
+ return ANTHY_DIC_UTIL_OK;
+ }
+ return ANTHY_DIC_UTIL_ERROR;
+}
+
+/** 単語を登録する
+ * 頻度が0の場合は削除
+ */
+int
+anthy_priv_dic_add_entry(const char *yomi, const char *word,
+ const char *wt_name, int freq)
+{
+ if (dic_util_encoding == ANTHY_UTF8_ENCODING) {
+ return add_word_to_textdict(yomi, word, wt_name, freq);
+ } else {
+ int rv;
+ char *yomi_utf8 = anthy_conv_euc_to_utf8(yomi);
+ char *word_utf8 = anthy_conv_euc_to_utf8(word);
+ rv = add_word_to_textdict(yomi_utf8, word_utf8, wt_name, freq);
+ free(yomi_utf8);
+ free(word_utf8);
+ return rv;
+ }
+}
+
+const char *
+anthy_dic_util_get_anthydir(void)
+{
+ return anthy_conf_get_str("ANTHYDIR");
+}
+
+/* lookコマンドの辞書を検索するための関数 */
+static char *
+do_search(FILE *fp, const char *word)
+{
+ char buf[32];
+ char *res = NULL;
+ int word_len = strlen(word);
+ while (fgets(buf, 32, fp)) {
+ int len = strlen(buf);
+ buf[len - 1] = 0;
+ len --;
+ if (len > word_len) {
+ continue;
+ }
+ if (!strncasecmp(buf, word, len)) {
+ if (res) {
+ free(res);
+ }
+ res = strdup(buf);
+ }
+ }
+ return res;
+}
+
+/* lookコマンドの辞書を検索するAPI */
+char *
+anthy_dic_search_words_file(const char *word)
+{
+ FILE *fp;
+ char *res;
+ const char *words_dict_fn = anthy_conf_get_str("WORDS_FILE");
+ if (!words_dict_fn) {
+ return NULL;
+ }
+ fp = fopen(words_dict_fn, "r");
+ if (!fp) {
+ return NULL;
+ }
+ res = do_search(fp, word);
+ fclose(fp);
+ return res;
+}