diff options
Diffstat (limited to 'src-splitter/splitter.c')
-rw-r--r-- | src-splitter/splitter.c | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/src-splitter/splitter.c b/src-splitter/splitter.c new file mode 100644 index 0000000..75ace2b --- /dev/null +++ b/src-splitter/splitter.c @@ -0,0 +1,329 @@ +/* + * 文を文節にsplitするsplitter + * + * 文節の境界を検出する + * anthy_init_split_context() 分割用のコンテキストを作って + * anthy_mark_border() 分割をして + * anthy_release_split_context() コンテキストを解放する + * + * anthy_commit_border() コミットされた内容に対して学習をする + * + * Funded by IPA未踏ソフトウェア創造事業 2001 9/22 + * + * Copyright (C) 2004 YOSHIDA Yuichi + * Copyright (C) 2000-2004 TABATA Yusuke + * Copyright (C) 2000-2001 UGAWA Tomoharu + * + * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $ + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdlib.h> +#include <string.h> + +#include <anthy/alloc.h> +#include <anthy/record.h> +#include <anthy/splitter.h> +#include <anthy/logger.h> +#include "wordborder.h" + +#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000 + +static int splitter_debug_flags; + +/**/ +wtype_t anthy_wtype_noun; +wtype_t anthy_wtype_name_noun; +wtype_t anthy_wtype_num_noun; +wtype_t anthy_wtype_prefix; +wtype_t anthy_wtype_num_prefix; +wtype_t anthy_wtype_num_postfix; +wtype_t anthy_wtype_name_postfix; +wtype_t anthy_wtype_sv_postfix; +wtype_t anthy_wtype_a_tail_of_v_renyou; +wtype_t anthy_wtype_v_renyou; +wtype_t anthy_wtype_noun_tail;/* いれ「たて」とか */ +wtype_t anthy_wtype_n1; +wtype_t anthy_wtype_n10; + + +/** make_word_cacheで作成した文節情報を解放する + */ +static void +release_info_cache(struct splitter_context *sc) +{ + struct word_split_info_cache *info = sc->word_split_info; + + anthy_free_allocator(info->MwAllocator); + anthy_free_allocator(info->WlAllocator); + free(info->cnode); + free(info->seq_len); + free(info->rev_seq_len); + free(info); +} + +static void +metaword_dtor(void *p) +{ + struct meta_word *mw = (struct meta_word*)p; + if (mw->cand_hint.str) { + free(mw->cand_hint.str); + } +} + + +static void +alloc_char_ent(xstr *xs, struct splitter_context *sc) +{ + int i; + + sc->char_count = xs->len; + sc->ce = (struct char_ent*) + malloc(sizeof(struct char_ent)*(xs->len + 1)); + for (i = 0; i <= xs->len; i++) { + sc->ce[i].c = &xs->str[i]; + sc->ce[i].seg_border = 0; + sc->ce[i].initial_seg_len = 0; + sc->ce[i].best_seg_class = SEG_HEAD; + sc->ce[i].best_mw = NULL; + } + + /* 左右両端は文節の境界である */ + sc->ce[0].seg_border = 1; + sc->ce[xs->len].seg_border = 1; +} + +/* ここで確保した内容はrelease_info_cacheで解放される + */ +static void +alloc_info_cache(struct splitter_context *sc) +{ + int i; + struct word_split_info_cache *info; + + /* キャッシュのデータを確保 */ + sc->word_split_info = malloc(sizeof(struct word_split_info_cache)); + info = sc->word_split_info; + info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor); + info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0); + info->cnode = + malloc(sizeof(struct char_node) * (sc->char_count + 1)); + + info->seq_len = malloc(sizeof(int) * (sc->char_count + 1)); + info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1)); + + /* 各文字インデックスに対して初期化を行う */ + for (i = 0; i <= sc->char_count; i++) { + info->seq_len[i] = 0; + info->rev_seq_len[i] = 0; + info->cnode[i].wl = NULL; + info->cnode[i].mw = NULL; + info->cnode[i].max_len = 0; + } +} + +/** 外から呼び出されるwordsplitterのトップレベルの関数 */ +void +anthy_mark_border(struct splitter_context *sc, + int from, int from2, int to) +{ + int i; + struct word_split_info_cache *info; + + /* sanity check */ + if ((to - from) <= 0) { + return ; + } + + /* 境界マーク用とlatticeの検索で用いられるクラス用の領域を確保 */ + info = sc->word_split_info; + info->seg_border = alloca(sizeof(int)*(sc->char_count + 1)); + info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1)); + info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1)); + for (i = 0; i < sc->char_count + 1; ++i) { + info->seg_border[i] = sc->ce[i].seg_border; + info->best_seg_class[i] = sc->ce[i].best_seg_class; + info->best_mw[i] = sc->ce[i].best_mw; + } + + /* 境界を決定する */ + anthy_eval_border(sc, from, from2, to); + + for (i = from; i < to; ++i) { + sc->ce[i].seg_border = info->seg_border[i]; + sc->ce[i].best_seg_class = info->best_seg_class[i]; + sc->ce[i].best_mw = info->best_mw[i]; + } +} + +/* 文節が拡大されたので,それを学習する */ +static void +proc_expanded_segment(struct splitter_context *sc, + int from, int len) +{ + int initial_len = sc->ce[from].initial_seg_len; + int i, nr; + xstr from_xs, to_xs, *xs; + + from_xs.str = sc->ce[from].c; + from_xs.len = initial_len; + to_xs.str = sc->ce[from].c; + to_xs.len = len; + if (anthy_select_section("EXPANDPAIR", 1) == -1) { + return ; + } + if (anthy_select_row(&from_xs, 1) == -1) { + return ; + } + nr = anthy_get_nr_values(); + for (i = 0; i < nr; i ++) { + xs = anthy_get_nth_xstr(i); + if (!xs || !anthy_xstrcmp(xs, &to_xs)) { + /* 既にある */ + return ; + } + } + anthy_set_nth_xstr(nr, &to_xs); + anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT); +} + +/* 文節のマージと語尾を学習する */ +void +anthy_commit_border(struct splitter_context *sc, int nr_segments, + struct meta_word **mw, int *seg_len) +{ + int i, from = 0; + + /* 伸ばした文節 */ + for (i = 0; i < nr_segments; i++) { + /* それぞれの文節に対して */ + + int len = seg_len[i]; + int initial_len = sc->ce[from].initial_seg_len; + int real_len = 0; + int l2; + + if (!initial_len || from + initial_len == sc->char_count) { + /* そこは境界ではない */ + goto tail; + } + l2 = sc->ce[from + initial_len].initial_seg_len; + if (initial_len + l2 > len) { + /* 隣の文節を含むほど拡大されたわけではない */ + goto tail; + } + if (mw[i]) { + real_len = mw[i]->len; + } + if (real_len <= initial_len) { + goto tail; + } + /* 右の文節を含む長さに拡張された文節がコミットされた */ + proc_expanded_segment(sc, from, real_len); + tail: + from += len; + } +} + +int +anthy_splitter_debug_flags(void) +{ + return splitter_debug_flags; +} + +void +anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse) +{ + alloc_char_ent(xs, sc); + alloc_info_cache(sc); + sc->is_reverse = is_reverse; + /* 全ての部分文字列をチェックして、文節の候補を列挙する + word_listを構成してからmetawordを構成する */ + anthy_lock_dic(); + anthy_make_word_list_all(sc); + anthy_unlock_dic(); + anthy_make_metaword_all(sc); + +} + +void +anthy_release_split_context(struct splitter_context *sc) +{ + if (sc->word_split_info) { + release_info_cache(sc); + sc->word_split_info = 0; + } + if (sc->ce) { + free(sc->ce); + sc->ce = 0; + } +} + +/** splitter全体の初期化を行う */ +int +anthy_init_splitter(void) +{ + /* デバッグプリントの設定 */ + char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT"); + char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT"); + splitter_debug_flags = SPLITTER_DEBUG_NONE; + if (!dis && en && strlen(en)) { + char *fs = getenv("ANTHY_SPLITTER_PRINT"); + if (fs) { + if (strchr(fs, 'w')) { + splitter_debug_flags |= SPLITTER_DEBUG_WL; + } + if (strchr(fs, 'm')) { + splitter_debug_flags |= SPLITTER_DEBUG_MW; + } + if (strchr(fs, 'l')) { + splitter_debug_flags |= SPLITTER_DEBUG_LN; + } + if (strchr(fs, 'i')) { + splitter_debug_flags |= SPLITTER_DEBUG_ID; + } + if (strchr(fs, 'c')) { + splitter_debug_flags |= SPLITTER_DEBUG_CAND; + } + } + } + /* 付属語グラフの初期化 */ + if (anthy_init_depword_tab()) { + anthy_log(0, "Failed to init dependent word table.\n"); + return -1; + } + /**/ + anthy_wtype_noun = anthy_init_wtype_by_name("名詞35"); + anthy_wtype_name_noun = anthy_init_wtype_by_name("人名"); + anthy_wtype_num_noun = anthy_init_wtype_by_name("数詞"); + anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("形容詞化接尾語"); + anthy_wtype_v_renyou = anthy_init_wtype_by_name("動詞連用形"); + anthy_wtype_noun_tail = anthy_init_wtype_by_name("名詞化接尾語"); + anthy_wtype_prefix = anthy_init_wtype_by_name("名詞接頭辞"); + anthy_wtype_num_prefix = anthy_init_wtype_by_name("数接頭辞"); + anthy_wtype_num_postfix = anthy_init_wtype_by_name("数接尾辞"); + anthy_wtype_name_postfix = anthy_init_wtype_by_name("人名接尾辞"); + anthy_wtype_sv_postfix = anthy_init_wtype_by_name("サ変接尾辞"); + anthy_wtype_n1 = anthy_init_wtype_by_name("数詞1"); + anthy_wtype_n10 = anthy_init_wtype_by_name("数詞10"); + return 0; +} + +void +anthy_quit_splitter(void) +{ + anthy_quit_depword_tab(); +} |