diff options
Diffstat (limited to 'src-main/context.c')
-rw-r--r-- | src-main/context.c | 695 |
1 files changed, 695 insertions, 0 deletions
diff --git a/src-main/context.c b/src-main/context.c new file mode 100644 index 0000000..7b426db --- /dev/null +++ b/src-main/context.c @@ -0,0 +1,695 @@ +/* + * 変換や文節の伸縮などの操作が進行中の文字列や候補などを + * まとめて変換コンテキストと呼ぶ。 + * Anthyのコンテキストに対する操作は全てここから呼ばれる。 + * 各操作に対して変換パイプラインの必要なモジュールを順に呼びだす。 + * + * personalityの管理もする。 + * + * Funded by IPA未踏ソフトウェア創造事業 2001 10/29 + * Copyright (C) 2000-2007 TABATA Yusuke + * + * $Id: context.c,v 1.26 2002/11/17 14:45:47 yusuke Exp $ + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include <anthy/anthy.h> +#include <anthy/alloc.h> +#include <anthy/record.h> +#include <anthy/ordering.h> +#include <anthy/splitter.h> +#include <anthy/xstr.h> +#include "main.h" + +/**/ +static allocator context_ator; + +/** 現在のpersonality + * 未設定時: null + * 未設定のまま変換を開始した場合: "default" + * anonymousの場合: "" + */ +static char *current_personality; + +/**/ +#define HISTORY_FILE_LIMIT 100000 + +static void +context_dtor(void *p) +{ + anthy_do_reset_context((struct anthy_context *)p); +} + +/** 現在のpersonalityを返す */ +static char * +get_personality(void) +{ + if (!current_personality) { + current_personality = strdup("default"); + anthy_dic_set_personality(current_personality); + } + return current_personality; +} + +static void +release_segment(struct seg_ent *s) +{ + if (s->cands) { + int i; + for (i = 0; i < s->nr_cands; i++) { + anthy_release_cand_ent(s->cands[i]); + } + free (s->cands); + } + if (s->mw_array) { + free(s->mw_array); + } + free(s); + +} + +/** 文節リストの最後の要素を削除する */ +static void +pop_back_seg_ent(struct anthy_context *c) +{ + struct seg_ent *s; + s = c->seg_list.list_head.prev; + if (s == &c->seg_list.list_head) { + return ; + } + s->prev->next = s->next; + s->next->prev = s->prev; + release_segment(s); + c->seg_list.nr_segments --; +} + + +/** n番目の文節の文字のindexを求める */ +static int +get_nth_segment_index(struct anthy_context *c, int n) +{ + int i,s; + for (i = 0, s = 0; i < c->str.len; i++) { + if (c->split_info.ce[i].seg_border) { + if (s == n) { + return i; + } + s++; + } + } + return -1; +} + +/** n番目の文節の長さを求める. + * segment_listが構成されていなくても計算できるようにする. + */ +static int +get_nth_segment_len(struct anthy_context *c, int sindex) +{ + int a,i,l; + a = get_nth_segment_index(c, sindex); + if ( a == -1){ + return -1; + } + l = 1; + for (i = a+1; !c->split_info.ce[i].seg_border; i++) { + l++; + } + return l; +} + +/** metawordの配列を作る */ +static void +make_metaword_array(struct anthy_context *ac, + struct seg_ent *se) +{ + int i; + se->mw_array = NULL; + for (i = se->len; i > 0; i--) { + int j; + /* 最後に濁点とかがついてたら直前の文字ごと落す */ + if (i < se->len && + anthy_get_xchar_type(se->str.str[i]) & XCT_PART) { + /* FIXME 濁点とかがありえない並びをしてたら */ + i--; + continue ; + } + + se->nr_metaword = anthy_get_nr_metaword(&ac->split_info, se->from, i); + if (!se->nr_metaword) { + continue ; + } + /* metawordを配列に取り込む */ + se->mw_array = malloc(sizeof(struct meta_word*) * se->nr_metaword); + for (j = 0; j < se->nr_metaword; j++) { + se->mw_array[j] = anthy_get_nth_metaword(&ac->split_info, se->from, i, j); + } + return; + } +} + +static struct seg_ent* +create_segment(struct anthy_context *ac, int from, int len, + struct meta_word* best_mw) +{ + struct seg_ent* s; + s = (struct seg_ent *)malloc(sizeof(struct seg_ent)); + s->str.str = &ac->str.str[from]; + s->str.len = len; + s->from = from; + s->len = s->str.len; + s->nr_cands = 0; + s->cands = NULL; + s->best_seg_class = ac->split_info.ce[from].best_seg_class; + s->best_mw = best_mw; + make_metaword_array(ac, s); + return s; +} + +/** 変換コンテキストに文節を追加する */ +static void +push_back_segment(struct anthy_context *ac, struct seg_ent *se) +{ + se->next = &ac->seg_list.list_head; + se->prev = ac->seg_list.list_head.prev; + ac->seg_list.list_head.prev->next = se; + ac->seg_list.list_head.prev = se; + ac->seg_list.nr_segments ++; + se->committed = -1; +} + +/** splitterによって配列中に付けられた文節境界のマークから、 + * 文節のリストを構成する + */ +static void +create_segment_list(struct anthy_context *ac, int from, int to) +{ + int i, n; + struct seg_ent *s; + /* from の所までにいくつの文節があるか調べる */ + i = 0; n = 0; + while (i < from) { + i += get_nth_segment_len(ac, n); + n++; + }; + /**/ + for (i = from; i < to; i++) { + if (ac->split_info.ce[i].seg_border) { + int len = get_nth_segment_len(ac, n); + s = create_segment(ac, i, len, ac->split_info.ce[i].best_mw); + + push_back_segment(ac, s); + n++; + } + } +} + +/** コンテキストを作る */ +struct anthy_context * +anthy_do_create_context(int encoding) +{ + struct anthy_context *ac; + char *p = get_personality(); + + if (!p) { + return NULL; + } + + ac = (struct anthy_context *)anthy_smalloc(context_ator); + ac->str.str = NULL; + ac->str.len = 0; + ac->seg_list.nr_segments = 0; + ac->seg_list.list_head.prev = &ac->seg_list.list_head; + ac->seg_list.list_head.next = &ac->seg_list.list_head; + ac->split_info.word_split_info = NULL; + ac->split_info.ce = NULL; + ac->ordering_info.oc = NULL; + ac->dic_session = NULL; + ac->prediction.str.str = NULL; + ac->prediction.str.len = 0; + ac->prediction.nr_prediction = 0; + ac->prediction.predictions = NULL; + ac->encoding = encoding; + ac->reconversion_mode = ANTHY_RECONVERT_AUTO; + + return ac; +} + +/** コンテキストのアロケータを作る */ +void +anthy_init_contexts(void) +{ + context_ator = anthy_create_allocator(sizeof(struct anthy_context), + context_dtor); +} + +void +anthy_quit_contexts(void) +{ + anthy_free_allocator(context_ator); +} + +static void +release_prediction(struct prediction_cache *pc) +{ + int i; + if (pc->str.str) { + free(pc->str.str); + pc->str.str = NULL; + } + if (pc->predictions) { + for (i = 0; i < pc->nr_prediction; ++i) { + anthy_free_xstr(pc->predictions[i].src_str); + anthy_free_xstr(pc->predictions[i].str); + } + free(pc->predictions); + pc->predictions = NULL; + } +} + +void +anthy_release_segment_list(struct anthy_context *ac) +{ + int i, sc; + sc = ac->seg_list.nr_segments; + for (i = 0; i < sc; i++) { + pop_back_seg_ent(ac); + } + ac->seg_list.nr_segments = 0; +} + +/* resetではcontextのために確保されたリソースを全て解放する */ +void +anthy_do_reset_context(struct anthy_context *ac) +{ + /* まず辞書セッションを解放 */ + if (ac->dic_session) { + anthy_dic_release_session(ac->dic_session); + ac->dic_session = NULL; + } + if (!ac->str.str) { + /* 文字列が設定されていなければ解放すべき物はもう無い */ + return ; + } + free(ac->str.str); + ac->str.str = NULL; + anthy_release_split_context(&ac->split_info); + anthy_release_segment_list(ac); + + /* 予測された文字列の解放 */ + release_prediction(&ac->prediction); +} + +void +anthy_do_release_context(struct anthy_context *ac) +{ + anthy_sfree(context_ator, ac); +} + +static void +make_candidates(struct anthy_context *ac, int from, int from2, int is_reverse) +{ + int i; + int len = ac->str.len; + + /* 文節の境界を設定 */ + /* from と from2の間に境界を作ることを禁止する */ + anthy_mark_border(&ac->split_info, from, from2, len); + create_segment_list(ac, from, len); + anthy_sort_metaword(&ac->seg_list); + + /* 候補を列挙 */ + for (i = 0; i < ac->seg_list.nr_segments; i++) { + anthy_do_make_candidates(&ac->split_info, + anthy_get_nth_segment(&ac->seg_list, i), + is_reverse); + } + /* 候補をソート */ + anthy_sort_candidate(&ac->seg_list, 0); +} + +int +anthy_do_context_set_str(struct anthy_context *ac, xstr *s, int is_reverse) +{ + int i; + + /* 文字列をコピー(一文字分余計にして0をセット) */ + ac->str.str = (xchar *)malloc(sizeof(xchar)*(s->len+1)); + anthy_xstrcpy(&ac->str, s); + ac->str.str[s->len] = 0; + + /* splitterの初期化*/ + anthy_init_split_context(&ac->str, &ac->split_info, is_reverse); + + /* 解の候補を作成 */ + make_candidates(ac, 0, 0, is_reverse); + + /* 最初に設定した文節境界を覚えておく */ + for (i = 0; i < ac->seg_list.nr_segments; i++) { + struct seg_ent *s = anthy_get_nth_segment(&ac->seg_list, i); + ac->split_info.ce[s->from].initial_seg_len = s->len; + } + + return 0; +} + +void +anthy_do_resize_segment(struct anthy_context *ac, + int nth, int resize) +{ + int i; + int index, len, sc; + + /* resizeが可能か検査する */ + if (nth >= ac->seg_list.nr_segments) { + return ; + } + index = get_nth_segment_index(ac, nth); + len = get_nth_segment_len(ac, nth); + if (index + len + resize > ac->str.len) { + return ; + } + if (len + resize < 1) { + return ; + } + + /* nth以降のseg_entを解放する */ + sc = ac->seg_list.nr_segments; + for (i = nth; i < sc; i++) { + pop_back_seg_ent(ac); + } + + /* resizeしたseg_borderをマークする */ + /* 現在のマークを消して新しいマークをつける */ + ac->split_info.ce[index+len].seg_border = 0; + ac->split_info.ce[ac->str.len].seg_border = 1; + for (i = index+len+resize+1; i < ac->str.len; i++) { + ac->split_info.ce[i].seg_border = 0; + } + ac->split_info.ce[index+len+resize].seg_border = 1; + for (i = index; i < ac->str.len; i++) { + ac->split_info.ce[i].best_mw = NULL; + } + + /* 解の候補を作成 */ + make_candidates(ac, index, index+len+resize, 0); +} + +/* + * n番めの文節を取得する、無い場合にはNULLを返す + */ +struct seg_ent * +anthy_get_nth_segment(struct segment_list *sl, int n) +{ + int i; + struct seg_ent *se; + if (n >= sl->nr_segments || + n < 0) { + return NULL; + } + for (i = 0, se = sl->list_head.next; i < n; i++, se = se->next); + return se; +} + +int +anthy_do_set_prediction_str(struct anthy_context *ac, xstr* xs) +{ + struct prediction_cache* prediction = &ac->prediction; + int nr_prediction; + + /* まず辞書セッションを解放 */ + if (ac->dic_session) { + anthy_dic_release_session(ac->dic_session); + ac->dic_session = NULL; + } + /* 予測された文字列の解放 */ + release_prediction(&ac->prediction); + + /* 辞書セッションの開始 */ + if (!ac->dic_session) { + ac->dic_session = anthy_dic_create_session(); + if (!ac->dic_session) { + return -1; + } + } + + prediction->str.str = (xchar*)malloc(sizeof(xchar*)*(xs->len+1)); + anthy_xstrcpy(&prediction->str, xs); + prediction->str.str[xs->len]=0; + + nr_prediction = anthy_traverse_record_for_prediction(xs, NULL); + prediction->nr_prediction = nr_prediction; + + if (nr_prediction) { + prediction->predictions = (struct prediction_t*)malloc(sizeof(struct prediction_t) * + nr_prediction); + anthy_traverse_record_for_prediction(xs, prediction->predictions); + } + return 0; +} + +static const char * +get_change_state(struct anthy_context *ac) +{ + int resize = 0, cand_change = 0; + int i; + for (i = 0; i < ac->seg_list.nr_segments; i++) { + struct seg_ent *s = anthy_get_nth_segment(&ac->seg_list, i); + if (ac->split_info.ce[s->from].initial_seg_len != s->len) { + resize = 1; + } + if (s->committed > 0) { + cand_change = 1; + } + } + /**/ + if (resize && cand_change) { + return "SC"; + } + if (resize) { + return "S"; + } + if (cand_change) { + return "C"; + } + return "-"; +} + +static void +write_history(FILE *fp, struct anthy_context *ac) +{ + int i; + /* 読み */ + fprintf(fp, "|"); + for (i = 0; i < ac->seg_list.nr_segments; i++) { + struct seg_ent *s = anthy_get_nth_segment(&ac->seg_list, i); + char *c = anthy_xstr_to_cstr(&s->str, ANTHY_EUC_JP_ENCODING); + fprintf(fp, "%s|", c); + free(c); + } + fprintf(fp, " |"); + /* 結果 */ + for (i = 0; i < ac->seg_list.nr_segments; i++) { + struct seg_ent *s = anthy_get_nth_segment(&ac->seg_list, i); + char *c; + /**/ + if (s->committed < 0) { + fprintf(fp, "?|"); + continue ; + } + c = anthy_xstr_to_cstr(&s->cands[s->committed]->str, + ANTHY_EUC_JP_ENCODING); + fprintf(fp, "%s|", c); + free(c); + } +} + +void +anthy_save_history(const char *fn, struct anthy_context *ac) +{ + FILE *fp; + struct stat st; + if (!fn) { + return ; + } + fp = fopen(fn, "a"); + if (!fp) { + return ; + } + if (stat(fn, &st) || + st.st_size > HISTORY_FILE_LIMIT) { + fclose(fp); + return ; + } + /**/ + fprintf(fp, "anthy-%s ", anthy_get_version_string()); + fprintf(fp, "%s ", get_change_state(ac)); + write_history(fp, ac); + fprintf(fp, "\n"); + fclose(fp); + /**/ + chmod(fn, S_IREAD | S_IWRITE); +} + +/** 候補を表示する */ +void +anthy_print_candidate(struct cand_ent *ce) +{ + int mod = (ce->score % 1000); + int seg_score = 0; + + if (ce->mw) { + seg_score = ce->mw->score; + } + anthy_putxstr(&ce->str); + printf(":("); + /*if (ce->nr_words == 1) {printf("%d,", ce->elm[0].id); }*/ + if (ce->flag & CEF_OCHAIRE) { + putchar('o'); + } + if (ce->flag & CEF_SINGLEWORD) { + putchar('1'); + } + if (ce->flag & CEF_GUESS) { + putchar('g'); + } + if (ce->flag & (CEF_KATAKANA | CEF_HIRAGANA)) { + putchar('N'); + } + if (ce->flag & CEF_USEDICT) { + putchar('U'); + } + if (ce->flag & CEF_CONTEXT) { + putchar('C'); + } + printf(",%d,", seg_score); + + + if (ce->mw) { + printf("%s,%d", anthy_seg_class_sym(ce->mw->seg_class), + ce->mw->struct_score); + } else { + putchar('-'); + } + printf(")"); + + if (ce->score >= 1000) { + printf("%d,", ce->score/1000); + if (mod < 100) { + printf("0"); + } + if (mod < 10) { + printf("0"); + } + printf("%d ", mod); + } else { + printf("%d ", ce->score); + } +} + +/** 文節を表示する */ +static void +print_segment(struct seg_ent *e) +{ + int i; + + anthy_putxstr(&e->str); + printf("("); + for ( i = 0 ; i < e->nr_cands ; i++) { + anthy_print_candidate(e->cands[i]); + printf(","); + } + printf(")"); + printf(":\n"); +} + +/** コンテキストを表示する */ +void +anthy_do_print_context(struct anthy_context *ac, int encoding) +{ + int i; + struct char_ent *ce; + anthy_xstr_set_print_encoding(encoding); + + ce = ac->split_info.ce; + if (!ce) { + printf("(invalid)\n"); + return ; + } + /* 各文字を表示する */ + for (i = 0, ce = ac->split_info.ce; i < ac->str.len; i++, ce++) { + if (ce->seg_border) { + printf("|"); + } + anthy_putxchar(*(ce->c)); + } + printf("\n"); + /* 各文節を表示する */ + for (i = 0; i < ac->seg_list.nr_segments; i++) { + print_segment(anthy_get_nth_segment(&ac->seg_list, i)); + } + printf("\n"); +} + +void +anthy_release_cand_ent(struct cand_ent *ce) +{ + if (ce->elm) { + free(ce->elm); + } + if (&ce->str) { + anthy_free_xstr_str(&ce->str); + } + free(ce); +} + +int +anthy_do_set_personality(const char *id) +{ + if (current_personality) { + /* すでに設定されてる */ + return -1; + } + if (!id || strchr(id, '/')) { + return -1; + } + current_personality = strdup(id); + anthy_dic_set_personality(current_personality); + return 0; +} + +void +anthy_init_personality(void) +{ + current_personality = NULL; +} + +void +anthy_quit_personality(void) +{ + if (current_personality) { + free(current_personality); + current_personality = NULL; + } +} |