diff options
Diffstat (limited to 'src-ordering/relation.c')
-rw-r--r-- | src-ordering/relation.c | 447 |
1 files changed, 447 insertions, 0 deletions
diff --git a/src-ordering/relation.c b/src-ordering/relation.c new file mode 100644 index 0000000..3c1ae74 --- /dev/null +++ b/src-ordering/relation.c @@ -0,0 +1,447 @@ +/* + * 文節の関係を処理する + * Copyright (C) 2006 Higashiyama Masahiko (thanks google summer of code program) + * Copyright (C) 2002-2007 TABATA Yusuke + * + * anthy_reorder_candidates_by_relation() + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <arpa/inet.h> +#include <stdlib.h> + +#include <anthy/segclass.h> +#include <anthy/segment.h> +#include <anthy/ordering.h> +#include <anthy/dic.h> +#include <anthy/diclib.h> +#include <anthy/feature_set.h> +#include <anthy/corpus.h> +#include "sorter.h" + +#define MAX_COLLISION 4 +#define SEARCH_LIMIT 100 +#define MAX_NEIGHBOR 10 + + +/* 全文検索用のコーパス */ +static struct corpus_ { + /* header */ + void *corpus_bucket; + void *corpus_array; + /**/ + int *bucket; + int *array; + /**/ + int bucket_size; + int array_size; +} corpus_info; + +/* 検索用のiterator */ +struct iterator { + /* 検索のキーと現在の場所 */ + int key; + int idx; + /* 検索回数の上限 */ + int limit; +}; + +struct neighbor { + int nr; + int id[MAX_NEIGHBOR]; +}; + +/** 文節@segの中に@from_word_idの単語と共起関係にある + * 候補があるかどうかを探し、あればスコアを上げる。 + */ +static void +reorder_candidate(int from_word_id, struct seg_ent *seg) +{ + int i, pos; + struct cand_ent *ce; + if (NULL == seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */ + return; + } + ce = seg->cands[0]; + if (ce->core_elm_index == -1) { + return ; + } + /* 0番目の候補の品詞 */ + pos = anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt); + + for (i = 0; i < seg->nr_cands; i++) { + int word_id; + ce = seg->cands[i]; + if (ce->core_elm_index == -1) { + continue; + } + word_id = ce->elm[ce->core_elm_index].id; + if (anthy_dic_check_word_relation(from_word_id, word_id) && + anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt) == pos) { + /* 用例にマッチしたので、候補のスコアを更新 */ + ce->flag |= CEF_USEDICT; + ce->score *= 10; + } + } +} + +static int +get_indep_word_id(struct seg_ent *seg, int nth) +{ + struct cand_ent *ce; + if (NULL == seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */ + return -1; + } + if (seg->cands[nth]->core_elm_index == -1) { + /* 一番目の候補がseq_entから作られた候補ではない */ + return -1; + } + ce = seg->cands[nth]; + /* 自立語のidを取り出す */ + return ce->elm[ce->core_elm_index].id; +} + +/* 用例辞書を使って並び替えをする */ +static void +reorder_by_use_dict(struct segment_list *sl, int nth) +{ + int i; + struct seg_ent *cur_seg; + int word_id; + + cur_seg = anthy_get_nth_segment(sl, nth); + word_id = get_indep_word_id(cur_seg, 0); + if (word_id == -1) { + /**/ + return ; + } + /* 近所の文節を順に見ていく */ + for (i = nth - 2; i < nth + 2 && i < sl->nr_segments; i++) { + struct seg_ent *target_seg; + if (i < 0 || i == nth) { + continue ; + } + /* i番目の文節と前後のj番目の文節に対して */ + target_seg = anthy_get_nth_segment(sl, i); + reorder_candidate(word_id, target_seg); + } +} + +static int +find_border_of_this_word(int idx) +{ + int val; + if (idx < 0) { + return 0; + } + val = ntohl(corpus_info.array[idx * 2]); + while (!(val & ELM_WORD_BORDER) && + idx > -1) { + idx --; + } + return idx; +} + +static int +find_left_word_border(int idx) +{ + int val; + if (idx == -1) { + return -1; + } + val = ntohl(corpus_info.array[idx * 2]); + if (val & ELM_BOS) { + return -1; + } + idx --; + return find_border_of_this_word(idx); +} + +static int +find_right_word_border(int idx) +{ + if (idx == -1) { + return -1; + } + while (idx < corpus_info.array_size - 2) { + int val; + idx ++; + val = ntohl(corpus_info.array[idx * 2]); + if (val & ELM_BOS) { + return -1; + } + if (val & ELM_WORD_BORDER) { + return idx; + } + } + return -1; +} + +static void +push_id(struct neighbor *ctx, + int id) +{ + if (ctx->nr < MAX_NEIGHBOR - 1) { + ctx->id[ctx->nr] = id; + ctx->nr++; + } +} + +static void +collect_word_context(struct neighbor *ctx, int idx) +{ + int id = ntohl(corpus_info.array[idx * 2]) & CORPUS_KEY_MASK; + /*printf(" id=%d\n", id);*/ + push_id(ctx, id); +} + +/* 例文中で周辺の情報を取得する */ +static void +collect_corpus_context(struct neighbor *ctx, + struct iterator *it) +{ + int i; + int this_idx, idx; + + this_idx = find_border_of_this_word(it->idx); + + /*printf(" key=%d\n", it->key);*/ + /* 左へスキャン */ + idx = this_idx; + for (i = 0; i < 2; i++) { + idx = find_left_word_border(idx); + if (idx == -1) { + break; + } + collect_word_context(ctx, idx); + } + /* 右へスキャン */ + idx = this_idx; + for (i = 0; i < 2; i++) { + idx = find_right_word_border(idx); + if (idx == -1) { + break; + } + collect_word_context(ctx, idx); + } +} + +/* 変換対象の文字列の周辺の情報を取得する */ +static void +collect_user_context(struct neighbor *ctx, + struct segment_list *sl, int nth) +{ + int i; + ctx->nr = 0; + for (i = nth - 2; i <= nth + 2 && i < sl->nr_segments; i++) { + int id; + if ((i < 0) || (i == nth)) { + continue; + } + id = get_indep_word_id(anthy_get_nth_segment(sl, i), 0); + if (id > -1) { + id &= CORPUS_KEY_MASK; + /*printf("user_ctx=%d\n", id);*/ + push_id(ctx, id); + } + } +} + +/* 隣接文節の情報を比較する */ +static int +do_compare_context(struct neighbor *n1, + struct neighbor *n2) +{ + int i, j; + int m = 0; + for (i = 0; i < n1->nr; i++) { + for (j = 0; j < n2->nr; j++) { + if (n1->id[i] == n2->id[j]) { + m++; + } + } + } + return m; +} + +/* 隣接文節の情報を取得して比較する */ +static int +compare_context(struct neighbor *user, + struct iterator *it) +{ + struct neighbor sample; + int nr; + /**/ + sample.nr = 0; + /* 例文中の周辺情報を集める */ + collect_corpus_context(&sample, it); + if (sample.nr == 0) { + return 0; + } + /* 比較する */ + nr = do_compare_context(user, &sample); + if (nr >= sample.nr / 2) { + return nr; + } + return 0; +} + +/* keyの最初の出現場所を見つける + * 見つからなかったら-1を返す + */ +static int +find_first_pos(int key) +{ + int i; + for (i = 0; i < MAX_COLLISION; i++) { + int bkt = (key + i) % corpus_info.bucket_size; + if ((int)ntohl(corpus_info.bucket[bkt * 2]) == key) { + return ntohl(corpus_info.bucket[bkt * 2 + 1]); + } + } + return -1; +} + +/* keyの最初の出現場所でiteratorを初期化する + * 見つからなかったら-1を返す + */ +static int +find_first_from_corpus(int key, struct iterator *it, int limit) +{ + key &= CORPUS_KEY_MASK; + it->idx = find_first_pos(key); + it->key = key; + it->limit = limit; + return it->idx; +} + +/* keyの次の出現場所のiteratorを設定する + */ +static int +find_next_from_corpus(struct iterator *it) +{ + int idx = it->idx; + it->limit--; + if (it->limit < 1) { + it->idx = -1; + return -1; + } + it->idx = ntohl(corpus_info.array[it->idx * 2 + 1]); + if (it->idx < 0 || it->idx >= corpus_info.array_size || + it->idx < idx) { + it->idx = -1; + } + return it->idx; +} + +static void +check_candidate_context(struct seg_ent *cur_seg, + int i, + struct neighbor *user) +{ + struct iterator it; + int nr = 0; + int word_id; + word_id = get_indep_word_id(cur_seg, i); + if (word_id == -1) { + return ; + } + /* 各出現場所をスキャンする */ + find_first_from_corpus(word_id, &it, SEARCH_LIMIT); + /*printf("word_id=%d %d\n", word_id, it.idx);*/ + while (it.idx > -1) { + nr += compare_context(user, &it); + /**/ + find_next_from_corpus(&it); + } + /**/ + if (nr > 0) { + cur_seg->cands[i]->flag |= CEF_CONTEXT; + } +} + +/* 全文検索で候補を並び替える */ +static void +reorder_by_corpus(struct segment_list *sl, int nth) +{ + struct seg_ent *cur_seg; + struct neighbor user; + int i; + /* 文節の周辺情報を集める */ + collect_user_context(&user, sl, nth); + if (user.nr == 0) { + return ; + } + cur_seg = anthy_get_nth_segment(sl, nth); + if (NULL == cur_seg->cands) { /* 辞書もしくは学習データが壊れていた時の対策 */ + return; + } + /* 各候補について */ + for (i = 0; i < cur_seg->nr_cands; i++) { + check_candidate_context(cur_seg, i, &user); + } + /* トップの候補に用例があれば、他の候補は見ない */ + if (cur_seg->cands[0]->flag & CEF_CONTEXT) { + cur_seg->cands[0]->flag &= ~CEF_CONTEXT; + return ; + } + /* 用例によるスコア加算 */ + for (i = 1; i < cur_seg->nr_cands; i++) { + if (cur_seg->cands[i]->flag & CEF_CONTEXT) { + cur_seg->cands[i]->score *= 2; + } + } +} + +/* + * 用例を用いて候補を並び替える + * @nth番目以降の文節を対象とする + */ +void +anthy_reorder_candidates_by_relation(struct segment_list *sl, int nth) +{ + int i; + for (i = nth; i < sl->nr_segments; i++) { + reorder_by_use_dict(sl, i); + reorder_by_corpus(sl, i); + } +} + +void +anthy_relation_init(void) +{ + corpus_info.corpus_array = anthy_file_dic_get_section("corpus_array"); + corpus_info.corpus_bucket = anthy_file_dic_get_section("corpus_bucket"); + if (!corpus_info.corpus_array || + !corpus_info.corpus_array) { + return ; + } + corpus_info.array_size = ntohl(((int *)corpus_info.corpus_array)[1]); + corpus_info.bucket_size = ntohl(((int *)corpus_info.corpus_bucket)[1]); + corpus_info.array = &(((int *)corpus_info.corpus_array)[16]); + corpus_info.bucket = &(((int *)corpus_info.corpus_bucket)[16]); + /* + { + int i; + for (i = 0; i < corpus_info.array_size; i++) { + int v = ntohl(corpus_info.array[i * 2]); + printf("%d: %d %d\n", i, v, v & CORPUS_KEY_MASK); + } + } + */ +} |