diff options
Diffstat (limited to 'calctrans/corpus.c')
-rw-r--r-- | calctrans/corpus.c | 277 |
1 files changed, 277 insertions, 0 deletions
diff --git a/calctrans/corpus.c b/calctrans/corpus.c new file mode 100644 index 0000000..54d14d1 --- /dev/null +++ b/calctrans/corpus.c @@ -0,0 +1,277 @@ +/* + * 例文の中を検索できるデータ構造を作る + * 現時点では例文をすべて入れているが、そのうちフィルターすることも考えられる + * + * Copyright (C) 2007 TABATA Yusuke + * + */ +/* + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include <anthy/corpus.h> + +#define MAX_NR_VAL 8 +#define BUCKET_SIZE 8192 +#define MAX_COLLISION 8 + +/* word in source */ +struct node { + int nr; + int val[MAX_NR_VAL]; + int flags; +}; + +/* word feature in corpus file */ +struct element { + /* hash値 */ + int val; + /* 有効な(ELM_INVALIDの無い)エントリとしてのindex */ + int idx; + /* このhash値の次の出現場所 */ + int next_idx; + /**/ + int flags; +}; + +/* index */ +struct bucket { + /* 検索のキー */ + int key; + /* 最初の出現場所 */ + int first_idx; + /**/ + int last_idx; +}; + +struct corpus { + /**/ + struct node *array; + int nr_node; + int array_size; + + /**/ + int nr_values; + struct element *elms; + /**/ + int nr_buckets; + struct bucket *buckets; + + /**/ + int bucket_collision; +}; + +static void +corpus_setup_bucket(struct corpus *c, int nr) +{ + int i; + free(c->buckets); + /**/ + c->nr_buckets = nr; + c->buckets = malloc(sizeof(struct bucket) * nr); + for (i = 0; i < nr; i++) { + c->buckets[i].key = -1; + c->buckets[i].first_idx = -1; + c->buckets[i].last_idx = -1; + } +} + +struct corpus * +corpus_new(void) +{ + struct corpus *c = malloc(sizeof(*c)); + c->nr_node = 0; + c->array_size = 0; + c->array = NULL; + c->nr_values = 0; + c->elms = NULL; + c->buckets = NULL; + c->bucket_collision = 0; + return c; +} + +static void +corpus_ensure_array(struct corpus *c, int nr) +{ + int i, size; + if (c->array_size >= nr) { + return ; + } + size = nr * 2; + c->array = realloc(c->array, sizeof(struct node) * size); + for (i = c->array_size; i < size; i++) { + c->array[i].nr = 0; + } + c->array_size = nr; +} + +void +corpus_dump(struct corpus *c) +{ + int i; + for (i = 0; i < c->nr_values; i++) { + if (c->elms[i].flags & ELM_WORD_BORDER) { + printf("%d:", i); + } + printf("%d(%d) ", c->elms[i].val, c->elms[i].next_idx); + } + printf("\n"); +} + +static int +count_nr_valid_values(struct corpus *c) +{ + int i; + int nr = 0; + for (i = 0; i < c->nr_node; i++) { + struct node *nd = &c->array[i]; + if (nd->flags & ELM_INVALID) { + continue; + } + nr += nd->nr; + } + return nr; +} + +static void +corpus_build_flatten(struct corpus *c) +{ + int i, j; + int idx = 0; + int nr_valid_elms = count_nr_valid_values(c); + c->elms = malloc(sizeof(struct element) * nr_valid_elms); + for (i = 0; i < c->nr_node; i++) { + struct node *nd = &c->array[i]; + if (nd->flags & ELM_INVALID) { + continue; + } + for (j = 0; j < nd->nr; j++) { + c->elms[idx].val = nd->val[j]; + c->elms[idx].next_idx = -1; + c->elms[idx].flags = nd->flags; + if (j == 0) { + c->elms[idx].flags |= ELM_WORD_BORDER; + } + c->elms[idx].idx = idx; + idx++; + } + } +} + +static struct bucket * +find_bucket(struct corpus *c, int val) +{ + int i; + int h = val % c->nr_buckets; + for (i = 0; i < MAX_COLLISION; i++) { + struct bucket *bkt = &c->buckets[h]; + if (bkt->key == val) { + return bkt; + } + if (bkt->key == -1) { + bkt->key = val; + return bkt; + } + /**/ + h ++; + h %= c->nr_buckets; + } + c->bucket_collision ++; + return NULL; +} + +static void +corpus_build_link(struct corpus *c) +{ + int i; + for (i = 0; i < c->nr_values; i++) { + struct element *elm = &c->elms[i]; + struct bucket *bkt = find_bucket(c, elm->val); + if (!bkt) { + continue; + } + if (bkt->first_idx < 0) { + /* 最初の出現 */ + bkt->first_idx = c->elms[i].idx; + } else { + c->elms[bkt->last_idx].next_idx = c->elms[i].idx; + } + bkt->last_idx = c->elms[i].idx; + c->elms[i].next_idx = -1; + } +} + +void +corpus_build(struct corpus *c) +{ + corpus_setup_bucket(c, BUCKET_SIZE); + /**/ + corpus_build_flatten(c); + corpus_build_link(c); + /**/ +} + +void +corpus_push_back(struct corpus *c, int *val, int nr, int flags) +{ + struct node nd; + int i; + for (i = 0; i < nr; i++) { + nd.val[i] = val[i]; + } + nd.nr = nr; + nd.flags = flags; + /**/ + corpus_ensure_array(c, c->nr_node+1); + c->array[c->nr_node] = nd; + c->nr_node++; + c->nr_values += nd.nr; +} + +void +corpus_write_bucket(FILE *fp, struct corpus *c) +{ + int i; + fprintf(fp, "0 %d\n", c->nr_buckets); + for (i = 0; i < c->nr_buckets; i++) { + fprintf(fp, "%d,%d\n", (c->buckets[i].key & CORPUS_KEY_MASK), + c->buckets[i].first_idx); + } + printf(" %d collisions in corpus bucket\n", c->bucket_collision); +} + +void +corpus_write_array(FILE *fp, struct corpus *c) +{ + int i; + fprintf(fp, "0 %d\n", c->nr_values); + for (i = 0; i < c->nr_values; i++) { + struct element *elm = &c->elms[i]; + int val; + val = elm->val; + val &= CORPUS_KEY_MASK; + if (elm->flags & ELM_BOS) { + val |= ELM_BOS; + } + if (elm->flags & ELM_WORD_BORDER) { + val |= ELM_WORD_BORDER; + } + fprintf(fp, "%d,%d\n", val, + c->elms[i].next_idx); + } +} |