summaryrefslogtreecommitdiff
path: root/calctrans/corpus.c
diff options
context:
space:
mode:
Diffstat (limited to 'calctrans/corpus.c')
-rw-r--r--calctrans/corpus.c277
1 files changed, 277 insertions, 0 deletions
diff --git a/calctrans/corpus.c b/calctrans/corpus.c
new file mode 100644
index 0000000..54d14d1
--- /dev/null
+++ b/calctrans/corpus.c
@@ -0,0 +1,277 @@
+/*
+ * 例文の中を検索できるデータ構造を作る
+ * 現時点では例文をすべて入れているが、そのうちフィルターすることも考えられる
+ *
+ * Copyright (C) 2007 TABATA Yusuke
+ *
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <anthy/corpus.h>
+
+#define MAX_NR_VAL 8
+#define BUCKET_SIZE 8192
+#define MAX_COLLISION 8
+
+/* word in source */
+struct node {
+ int nr;
+ int val[MAX_NR_VAL];
+ int flags;
+};
+
+/* word feature in corpus file */
+struct element {
+ /* hash値 */
+ int val;
+ /* 有効な(ELM_INVALIDの無い)エントリとしてのindex */
+ int idx;
+ /* このhash値の次の出現場所 */
+ int next_idx;
+ /**/
+ int flags;
+};
+
+/* index */
+struct bucket {
+ /* 検索のキー */
+ int key;
+ /* 最初の出現場所 */
+ int first_idx;
+ /**/
+ int last_idx;
+};
+
+struct corpus {
+ /**/
+ struct node *array;
+ int nr_node;
+ int array_size;
+
+ /**/
+ int nr_values;
+ struct element *elms;
+ /**/
+ int nr_buckets;
+ struct bucket *buckets;
+
+ /**/
+ int bucket_collision;
+};
+
+static void
+corpus_setup_bucket(struct corpus *c, int nr)
+{
+ int i;
+ free(c->buckets);
+ /**/
+ c->nr_buckets = nr;
+ c->buckets = malloc(sizeof(struct bucket) * nr);
+ for (i = 0; i < nr; i++) {
+ c->buckets[i].key = -1;
+ c->buckets[i].first_idx = -1;
+ c->buckets[i].last_idx = -1;
+ }
+}
+
+struct corpus *
+corpus_new(void)
+{
+ struct corpus *c = malloc(sizeof(*c));
+ c->nr_node = 0;
+ c->array_size = 0;
+ c->array = NULL;
+ c->nr_values = 0;
+ c->elms = NULL;
+ c->buckets = NULL;
+ c->bucket_collision = 0;
+ return c;
+}
+
+static void
+corpus_ensure_array(struct corpus *c, int nr)
+{
+ int i, size;
+ if (c->array_size >= nr) {
+ return ;
+ }
+ size = nr * 2;
+ c->array = realloc(c->array, sizeof(struct node) * size);
+ for (i = c->array_size; i < size; i++) {
+ c->array[i].nr = 0;
+ }
+ c->array_size = nr;
+}
+
+void
+corpus_dump(struct corpus *c)
+{
+ int i;
+ for (i = 0; i < c->nr_values; i++) {
+ if (c->elms[i].flags & ELM_WORD_BORDER) {
+ printf("%d:", i);
+ }
+ printf("%d(%d) ", c->elms[i].val, c->elms[i].next_idx);
+ }
+ printf("\n");
+}
+
+static int
+count_nr_valid_values(struct corpus *c)
+{
+ int i;
+ int nr = 0;
+ for (i = 0; i < c->nr_node; i++) {
+ struct node *nd = &c->array[i];
+ if (nd->flags & ELM_INVALID) {
+ continue;
+ }
+ nr += nd->nr;
+ }
+ return nr;
+}
+
+static void
+corpus_build_flatten(struct corpus *c)
+{
+ int i, j;
+ int idx = 0;
+ int nr_valid_elms = count_nr_valid_values(c);
+ c->elms = malloc(sizeof(struct element) * nr_valid_elms);
+ for (i = 0; i < c->nr_node; i++) {
+ struct node *nd = &c->array[i];
+ if (nd->flags & ELM_INVALID) {
+ continue;
+ }
+ for (j = 0; j < nd->nr; j++) {
+ c->elms[idx].val = nd->val[j];
+ c->elms[idx].next_idx = -1;
+ c->elms[idx].flags = nd->flags;
+ if (j == 0) {
+ c->elms[idx].flags |= ELM_WORD_BORDER;
+ }
+ c->elms[idx].idx = idx;
+ idx++;
+ }
+ }
+}
+
+static struct bucket *
+find_bucket(struct corpus *c, int val)
+{
+ int i;
+ int h = val % c->nr_buckets;
+ for (i = 0; i < MAX_COLLISION; i++) {
+ struct bucket *bkt = &c->buckets[h];
+ if (bkt->key == val) {
+ return bkt;
+ }
+ if (bkt->key == -1) {
+ bkt->key = val;
+ return bkt;
+ }
+ /**/
+ h ++;
+ h %= c->nr_buckets;
+ }
+ c->bucket_collision ++;
+ return NULL;
+}
+
+static void
+corpus_build_link(struct corpus *c)
+{
+ int i;
+ for (i = 0; i < c->nr_values; i++) {
+ struct element *elm = &c->elms[i];
+ struct bucket *bkt = find_bucket(c, elm->val);
+ if (!bkt) {
+ continue;
+ }
+ if (bkt->first_idx < 0) {
+ /* 最初の出現 */
+ bkt->first_idx = c->elms[i].idx;
+ } else {
+ c->elms[bkt->last_idx].next_idx = c->elms[i].idx;
+ }
+ bkt->last_idx = c->elms[i].idx;
+ c->elms[i].next_idx = -1;
+ }
+}
+
+void
+corpus_build(struct corpus *c)
+{
+ corpus_setup_bucket(c, BUCKET_SIZE);
+ /**/
+ corpus_build_flatten(c);
+ corpus_build_link(c);
+ /**/
+}
+
+void
+corpus_push_back(struct corpus *c, int *val, int nr, int flags)
+{
+ struct node nd;
+ int i;
+ for (i = 0; i < nr; i++) {
+ nd.val[i] = val[i];
+ }
+ nd.nr = nr;
+ nd.flags = flags;
+ /**/
+ corpus_ensure_array(c, c->nr_node+1);
+ c->array[c->nr_node] = nd;
+ c->nr_node++;
+ c->nr_values += nd.nr;
+}
+
+void
+corpus_write_bucket(FILE *fp, struct corpus *c)
+{
+ int i;
+ fprintf(fp, "0 %d\n", c->nr_buckets);
+ for (i = 0; i < c->nr_buckets; i++) {
+ fprintf(fp, "%d,%d\n", (c->buckets[i].key & CORPUS_KEY_MASK),
+ c->buckets[i].first_idx);
+ }
+ printf(" %d collisions in corpus bucket\n", c->bucket_collision);
+}
+
+void
+corpus_write_array(FILE *fp, struct corpus *c)
+{
+ int i;
+ fprintf(fp, "0 %d\n", c->nr_values);
+ for (i = 0; i < c->nr_values; i++) {
+ struct element *elm = &c->elms[i];
+ int val;
+ val = elm->val;
+ val &= CORPUS_KEY_MASK;
+ if (elm->flags & ELM_BOS) {
+ val |= ELM_BOS;
+ }
+ if (elm->flags & ELM_WORD_BORDER) {
+ val |= ELM_WORD_BORDER;
+ }
+ fprintf(fp, "%d,%d\n", val,
+ c->elms[i].next_idx);
+ }
+}