summaryrefslogtreecommitdiff
path: root/calctrans/input_set.c
diff options
context:
space:
mode:
Diffstat (limited to 'calctrans/input_set.c')
-rw-r--r--calctrans/input_set.c285
1 files changed, 285 insertions, 0 deletions
diff --git a/calctrans/input_set.c b/calctrans/input_set.c
new file mode 100644
index 0000000..14bfc81
--- /dev/null
+++ b/calctrans/input_set.c
@@ -0,0 +1,285 @@
+/* 入力のセットを管理するコード
+ *
+ * Copyright (C) 2006 HANAOKA Toshiyuki
+ * Copyright (C) 2006-2007 TABATA Yusuke
+ *
+ * Special Thanks: Google Summer of Code Program 2006
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "input_set.h"
+
+#define HASH_SIZE 1024
+
+struct int_map_node {
+ int key;
+ int val;
+ struct int_map_node *next;
+};
+
+struct int_map {
+ /**/
+ int nr;
+ struct int_map_node *hash_head;
+ /**/
+ int array_size;
+ struct int_map_node **array;
+};
+
+struct input_set {
+ /**/
+ struct input_line *lines;
+ struct input_line *buckets[HASH_SIZE];
+ /**/
+ struct int_map *feature_freq;
+};
+
+static int
+line_hash(const int *ar, int nr)
+{
+ int i;
+ unsigned h = 0;
+ for (i = 0; i < nr; i++) {
+ h += ar[i];
+ }
+ return (h % HASH_SIZE);
+}
+
+static struct input_line *
+find_same_line(struct input_set *is, int *features, int nr)
+{
+ struct input_line *il;
+ int h = line_hash(features, nr);
+ for (il = is->buckets[h]; il; il = il->next_in_hash) {
+ int i;
+ if (il->nr_features != nr) {
+ continue;
+ }
+ for (i = 0; i < nr; i++) {
+ if (il->features[i] != features[i]) {
+ break;
+ }
+ }
+ if (i >= nr) {
+ return il;
+ }
+ }
+ return NULL;
+}
+
+static struct input_line *
+add_line(struct input_set *is, int *features, int nr)
+{
+ int i, h;
+ struct input_line *il;
+ il = malloc(sizeof(struct input_line));
+ il->nr_features = nr;
+ il->features = malloc(sizeof(int) * nr);
+ for (i = 0; i < nr; i++) {
+ il->features[i] = features[i];
+ }
+ il->weight = 0;
+ il->negative_weight = 0;
+ /* link */
+ il->next_line = is->lines;
+ is->lines = il;
+ /**/
+ h = line_hash(features, nr);
+ il->next_in_hash = is->buckets[h];
+ is->buckets[h] = il;
+ return il;
+}
+
+static void
+add_feature_count(struct int_map *im, int nr, int *features, int weight)
+{
+ int i;
+ for (i = 0; i < nr; i++) {
+ int f = features[i];
+ int v = int_map_peek(im, f);
+ int_map_set(im, f, v + weight);
+ }
+}
+
+/* input_setに入力を一つ加える */
+void
+input_set_set_features(struct input_set *is, int *features,
+ int nr, int weight)
+{
+ struct input_line *il;
+ int abs_weight = abs(weight);
+
+ /**/
+ il = find_same_line(is, features, nr);
+ if (!il) {
+ il = add_line(is, features, nr);
+ }
+ /**/
+ if (weight > 0) {
+ il->weight += weight;
+ } else {
+ il->negative_weight += abs_weight;
+ }
+ /**/
+ add_feature_count(is->feature_freq, nr, features, abs_weight);
+}
+
+struct input_set *
+input_set_create(void)
+{
+ int i;
+ struct input_set *is;
+ is = malloc(sizeof(struct input_set));
+ is->lines = NULL;
+ /**/
+ for (i = 0; i < HASH_SIZE; i++) {
+ is->buckets[i] = NULL;
+ }
+ /**/
+ is->feature_freq = int_map_new();
+ /**/
+ return is;
+}
+
+struct input_line *
+input_set_get_input_line(struct input_set *is)
+{
+ return is->lines;
+}
+
+struct input_set *
+input_set_filter(struct input_set *is,
+ double pos, double neg)
+{
+ struct input_set *new_is = input_set_create();
+ struct input_line *il;
+ for (il = is->lines; il; il = il->next_line) {
+ if (il->weight > pos ||
+ il->negative_weight > neg) {
+ input_set_set_features(new_is, il->features,
+ il->nr_features, il->weight);
+ input_set_set_features(new_is, il->features,
+ il->nr_features, -il->negative_weight);
+ }
+ }
+ return new_is;
+}
+
+void
+input_set_output_feature_freq(FILE *fp, struct input_set *is)
+{
+ int i;
+ struct int_map *im = is->feature_freq;
+ fprintf(fp, "0 %d\n", im->array_size);
+ int_map_flatten(im);
+ for (i = 0; i < im->array_size; i++) {
+ if (!im->array[i]) {
+ fprintf(fp, "0 0\n");
+ } else {
+ fprintf(fp, "%d %d\n", im->array[i]->key,
+ im->array[i]->val);
+ }
+ }
+}
+
+struct int_map *
+int_map_new(void)
+{
+ int i;
+ struct int_map *im = malloc(sizeof(struct int_map));
+ im->nr = 0;
+ im->hash_head = malloc(sizeof(struct int_map_node) * HASH_SIZE);
+ for (i = 0; i < HASH_SIZE; i++) {
+ im->hash_head[i].next = NULL;
+ }
+ return im;
+}
+
+static int
+node_index(int idx)
+{
+ return idx % HASH_SIZE;
+}
+
+static struct int_map_node *
+find_int_map_node(struct int_map *im, int idx)
+{
+ struct int_map_node *node;
+ int h = node_index(idx);
+ for (node = im->hash_head[h].next; node; node = node->next) {
+ if (node->key == idx) {
+ return node;
+ }
+ }
+ return NULL;
+}
+
+int
+int_map_peek(struct int_map *im, int idx)
+{
+ struct int_map_node *node = find_int_map_node(im, idx);
+ if (node) {
+ return node->val;
+ }
+ return 0;
+}
+
+void
+int_map_set(struct int_map *im, int idx, int val)
+{
+ struct int_map_node *node = find_int_map_node(im, idx);
+ int h;
+ if (node) {
+ node->val = val;
+ return ;
+ }
+ /**/
+ node = malloc(sizeof(struct int_map_node));
+ node->key = idx;
+ node->val = val;
+ h = node_index(idx);
+ node->next = im->hash_head[h].next;
+ im->hash_head[h].next = node;
+ /**/
+ im->nr ++;
+}
+
+void
+int_map_flatten(struct int_map *im)
+{
+ int i;
+ struct int_map_node *node;
+ int max_n = 0;
+ /* 配列を準備する */
+ im->array_size = im->nr * 2;
+ im->array = malloc(sizeof(struct int_map_node *) *
+ im->array_size);
+ for (i = 0; i < im->array_size; i++) {
+ im->array[i] = NULL;
+ }
+ /* 要素を置いていく */
+ for (i = 0; i < HASH_SIZE; i++) {
+ for (node = im->hash_head[i].next; node; node = node->next) {
+ int n = 0;
+ while (1) {
+ int h;
+ h = node->key + n;
+ h %= im->array_size;
+ if (!im->array[h]) {
+ im->array[h] = node;
+ break;
+ }
+ /**/
+ n++;
+ }
+ if (n > max_n) {
+ max_n = n;
+ }
+ }
+ }
+ /**/
+ printf(" max_collision=%d\n", max_n);
+}