summaryrefslogtreecommitdiff
path: root/src-worddic/texttrie.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-worddic/texttrie.c')
-rw-r--r--src-worddic/texttrie.c1516
1 files changed, 1516 insertions, 0 deletions
diff --git a/src-worddic/texttrie.c b/src-worddic/texttrie.c
new file mode 100644
index 0000000..9497a02
--- /dev/null
+++ b/src-worddic/texttrie.c
@@ -0,0 +1,1516 @@
+/*
+ * DEPRECATED, it is too hard to debug.
+ * you may use textdict instead
+ *
+ * Trie in Text
+ *
+ * *issues
+ * +correct API
+ * -iterator vs callback
+ * +robustness
+ * -error detection
+ * -auto correction
+ * -concurrent access
+ * +efficiency
+ * -lower memory consumption
+ * -disk space?
+ *
+ * on some file system like jffs2 on linux, writable mmap
+ * is not allowed, though you can write it.
+ *
+ */
+/*
+ * API
+ * anthy_trie_open()
+ * anthy_trie_close()
+ * anthy_trie_add()
+ * anthy_trie_find()
+ * anthy_trie_delete()
+ * anthy_trie_find_next_key()
+ * anthy_trie_find_prefix()
+ * anthy_trie_print_array()
+ *
+ * Copyright (C) 2005-2006 TABATA Yusuke
+ *
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/* open & mmap */
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+/**/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <anthy/texttrie.h>
+#include <anthy/filemap.h>
+#include "dic_main.h"
+
+/* configs */
+#define OBJ_LEN 20
+#define LINE_LEN 32
+#define EXPAND_COUNT 16
+
+/* cell type */
+#define TT_SUPER 0
+#define TT_UNUSED 1
+#define TT_ALLOCED 2
+#define TT_NODE 3
+#define TT_BODY 4
+#define TT_TAIL 5
+
+/* cell structure */
+struct cell {
+ /* (common) type */
+ int type;
+ /* union */
+ union {
+ /* unused */
+ int next_unused;
+ /* super */
+ struct {
+ int first_unused;
+ int root_cell;
+ int size;
+ int serial;
+ } super;
+ /* node */
+ struct {
+ int key;
+ int next;
+ int child;
+ int body;
+ int parent;
+ } node;
+ /* body */
+ struct {
+ int owner;
+ char *obj;
+ } body;
+ /* tail */
+ struct {
+ char *obj;
+ int prev;
+ } tail;
+ } u;
+ /* body & tail */
+ int next_tail;
+};
+
+struct text_trie {
+ /**/
+ int fatal;
+ /**/
+ char *fn;
+ FILE *wfp;
+ struct filemapping *mapping;
+ char *ptr;
+ /**/
+ struct cell super;
+ int valid_super;
+};
+
+struct path {
+ /**/
+ const char *key_str;
+ /**/
+ int max_len;
+ int *path;
+ int len;
+ int cur;
+};
+
+static void
+print_super_cell(struct cell *c)
+{
+ printf("super first_unused=%d, root_cell=%d, size=%d, serial=%d\n",
+ c->u.super.first_unused, c->u.super.root_cell,
+ c->u.super.size, c->u.super.serial);
+}
+
+static void
+print_alloced_cell(void)
+{
+ printf("alloc-ed\n");
+}
+
+static void
+print_node_cell(struct cell *c)
+{
+ printf("node key=%d", c->u.node.key);
+ if (c->u.node.key > 0 && isprint(c->u.node.key)) {
+ printf("(%c)", c->u.node.key);
+ }
+ printf(" parent=%d next=%d child=%d body=%d\n",
+ c->u.node.parent, c->u.node.next, c->u.node.child, c->u.node.body);
+}
+
+static void
+print_unused_cell(struct cell *c)
+{
+ printf("unused next_unused=%d\n",
+ c->u.next_unused);
+}
+
+static void
+print_body_cell(struct cell *c)
+{
+ printf("body object=(%s), owner=%d, next_tail=%d\n",
+ c->u.body.obj, c->u.body.owner, c->next_tail);
+}
+
+static void
+print_tail_cell(struct cell *c)
+{
+ printf("tail object=(%s), prev=%d, next_tail=%d\n",
+ c->u.tail.obj, c->u.tail.prev, c->next_tail);
+}
+
+static void
+print_cell(int idx, struct cell *c)
+{
+ if (!c) {
+ printf("idx =%d(null cell):\n", idx);
+ return ;
+ }
+ printf("idx=%d:", idx);
+ switch (c->type) {
+ case TT_SUPER:
+ print_super_cell(c);
+ break;
+ case TT_ALLOCED:
+ print_alloced_cell();
+ break;
+ case TT_NODE:
+ print_node_cell(c);
+ break;
+ case TT_UNUSED:
+ print_unused_cell(c);
+ break;
+ case TT_BODY:
+ print_body_cell(c);
+ break;
+ case TT_TAIL:
+ print_tail_cell(c);
+ break;
+ default:
+ printf("unknown\n");
+ }
+}
+
+static void
+path_setup(struct path *path, const char *key, int len, int *buf)
+{
+ unsigned char *p = (unsigned char *)key;
+ path->key_str = key;
+ path->max_len = len;
+ path->path = buf;
+ path->len = 0;
+ path->cur = 0;
+ /**/
+ while (*p) {
+ path->path[path->len] = p[0] * 256 + p[1];
+ path->len ++;
+ p++;
+ if (p[0]) {
+ p++;
+ }
+ }
+}
+
+static void
+path_copy_to_str(struct path *path, char *str, int buf_len)
+{
+ unsigned char *p = (unsigned char *)str;
+ int i, o;
+ for (i = 0, o = 0; i < path->cur && o < buf_len - 2; i++) {
+ p[o] = (path->path[i]>>8)&255;
+ p[o+1] = path->path[i]&255;
+ o += 2;
+ }
+ p[o] = 0;
+}
+
+static int
+sput_int(char *buf, int num)
+{
+ unsigned char *tmp = (unsigned char *)buf;
+ tmp[0] = (num>>24)&255;
+ tmp[1] = (num>>16)&255;
+ tmp[2] = (num>>8)&255;
+ tmp[3] = num&255;
+ return 4;
+}
+
+static char *
+sget_int(char *buf, int *num)
+{
+ unsigned int res;
+ unsigned char *tmp = (unsigned char *)buf;
+ res = 0;
+ res += tmp[0] << 24;
+ res += tmp[1] << 16;
+ res += tmp[2] << 8;
+ res += tmp[3];
+ *num = res;
+ buf += 4;
+ return buf;
+}
+
+static char *
+pass_str(char *buf, const char *str)
+{
+ buf += strlen(str);
+ return buf;
+}
+
+static void
+encode_super(struct cell *c, char *buf)
+{
+ buf += sprintf(buf, "S ");
+ buf += sput_int(buf, c->u.super.size);
+ buf += sput_int(buf, c->u.super.root_cell);
+ buf += sput_int(buf, c->u.super.first_unused);
+ buf += sput_int(buf, c->u.super.serial);
+ buf += sput_int(buf, LINE_LEN);
+}
+
+static void
+encode_node(struct cell *c, char *buf)
+{
+ buf += sprintf(buf, "N ");
+ buf += sput_int(buf, c->u.node.key);
+ buf += sput_int(buf, c->u.node.parent);
+ buf += sput_int(buf, c->u.node.next);
+ buf += sput_int(buf, c->u.node.child);
+ buf += sput_int(buf, c->u.node.body);
+}
+
+static void
+encode_body(struct cell *c, char *buf)
+{
+ buf += sprintf(buf, "B");
+ buf += sput_int(buf, c->next_tail);
+ buf += sput_int(buf, c->u.body.owner);
+ sprintf(buf, "\"%s\"",
+ c->u.body.obj);
+}
+
+static void
+encode_unused(struct cell *c, char *buf)
+{
+ buf += sprintf(buf, "-next=");
+ buf += sput_int(buf, c->u.next_unused);
+}
+
+static void
+encode_tail(struct cell *c, char *buf)
+{
+ buf += sprintf(buf, "T");
+ buf += sput_int(buf, c->u.tail.prev);
+ buf += sput_int(buf, c->next_tail);
+ sprintf(buf, "\"%s\"",
+ c->u.tail.obj);
+}
+
+static void
+encode_unknown(char *buf)
+{
+ sprintf(buf, "?");
+}
+
+static void
+encode_cell(struct cell *c, char *buf)
+{
+ switch (c->type) {
+ case TT_SUPER:
+ encode_super(c, buf);
+ break;
+ case TT_NODE:
+ encode_node(c, buf);
+ break;
+ case TT_UNUSED:
+ encode_unused(c, buf);
+ break;
+ case TT_BODY:
+ encode_body(c, buf);
+ break;
+ case TT_TAIL:
+ encode_tail(c, buf);
+ break;
+ default:
+ encode_unknown(buf);
+ break;
+ }
+}
+
+static void
+write_back_cell(struct text_trie *tt, struct cell *c, int idx)
+{
+ int i;
+ char buf[LINE_LEN+1];
+ /* sanity check */
+ if (((anthy_mmap_size(tt->mapping) / LINE_LEN) < (idx + 1)) ||
+ idx < 0) {
+ return ;
+ }
+ for (i = 0; i < LINE_LEN; i++) {
+ buf[i] = ' ';
+ }
+ encode_cell(c, buf);
+ buf[LINE_LEN-1] = '\n';
+ if (anthy_mmap_is_writable(tt->mapping)) {
+ memcpy(&tt->ptr[idx*LINE_LEN], buf, LINE_LEN);
+ } else {
+ fseek(tt->wfp, idx*LINE_LEN, SEEK_SET);
+ fwrite(buf, LINE_LEN, 1, tt->wfp);
+ fflush(tt->wfp);
+ }
+}
+
+static char *
+decode_str(char *raw_buf, int off)
+{
+ char *head;
+ char copy_buf[LINE_LEN + 1];
+ char *buf;
+ int i;
+ /* from off to before last '\n' */
+ for (i = 0; i < LINE_LEN - off - 1; i++) {
+ copy_buf[i] = raw_buf[i];
+ }
+ copy_buf[i] = 0;
+ buf = copy_buf;
+ /* find first double quote */
+ while (*buf && *buf != '\"') {
+ buf ++;
+ }
+ if (!*buf) {
+ /* cant find double quote */
+ return strdup("");
+ }
+ buf ++;
+ head = buf;
+ /* go to the end of string */
+ while (*buf) {
+ buf ++;
+ }
+ /* find last double quote */
+ while (*buf != '\"') {
+ buf--;
+ }
+ *buf = 0;
+ /**/
+ return strdup(head);
+}
+
+static void
+release_cell_str(struct cell *c)
+{
+ if (!c) {
+ return ;
+ }
+ if (c->type == TT_BODY) {
+ free(c->u.body.obj);
+ }
+ if (c->type == TT_TAIL) {
+ free(c->u.tail.obj);
+ }
+}
+
+static int
+decode_super(struct cell *c, char *buf)
+{
+ c->type = TT_SUPER;
+ buf = pass_str(buf, "S ");
+ buf = sget_int(buf, &c->u.super.size);
+ buf = sget_int(buf, &c->u.super.root_cell);
+ buf = sget_int(buf, &c->u.super.first_unused);
+ buf = sget_int(buf, &c->u.super.serial);
+ return 0;
+}
+
+static int
+decode_unuse(struct cell *c, char *buf)
+{
+ c->type = TT_UNUSED;
+ buf = pass_str(buf, "-next=");
+ buf = sget_int(buf, &c->u.next_unused);
+ return 0;
+}
+
+static int
+decode_node(struct cell *c, char *buf)
+{
+ c->type = TT_NODE;
+ buf = pass_str(buf, "N ");
+ buf = sget_int(buf, &c->u.node.key);
+ buf = sget_int(buf, &c->u.node.parent);
+ buf = sget_int(buf, &c->u.node.next);
+ buf = sget_int(buf, &c->u.node.child);
+ buf = sget_int(buf, &c->u.node.body);
+ return 0;
+}
+
+static int
+decode_body(struct cell *c, char *buf)
+{
+ c->type = TT_BODY;
+ buf = pass_str(buf, "B");
+ buf = sget_int(buf, &c->next_tail);
+ buf = sget_int(buf, &c->u.body.owner);
+ c->u.body.obj = decode_str(buf, 9);
+ return 0;
+}
+
+static int
+decode_tail(struct cell *c, char *buf)
+{
+ c->type = TT_TAIL;
+ buf = pass_str(buf, "T");
+ buf = sget_int(buf, &c->u.tail.prev);
+ buf = sget_int(buf, &c->next_tail);
+ c->u.tail.obj = decode_str(buf, 9);
+ return 0;
+}
+
+static int
+decode_alloced(struct cell *c)
+{
+ c->type = TT_ALLOCED;
+ return 0;
+}
+
+static struct cell *
+decode_nth_cell(struct text_trie *tt, struct cell *c, int nth)
+{
+ int res;
+ char *buf;
+ if (nth < 0 ||
+ (anthy_mmap_size(tt->mapping) / LINE_LEN) <
+ (nth + 1)) {
+ return NULL;
+ }
+ buf = &tt->ptr[nth*LINE_LEN];
+
+ res = -1;
+ switch (buf[0]) {
+ case 'S':
+ res = decode_super(c, buf);
+ break;
+ case '-':
+ res = decode_unuse(c, buf);
+ break;
+ case 'N':
+ res = decode_node(c, buf);
+ break;
+ case 'B':
+ res = decode_body(c, buf);
+ break;
+ case 'T':
+ res = decode_tail(c, buf);
+ break;
+ case '?':
+ res = decode_alloced(c);
+ break;
+ default:
+ /*printf("decode fail (nth=%d::%s).\n", nth, buf);*/
+ ;
+ }
+ if (res) {
+ c->type = TT_UNUSED;
+ }
+ return c;
+}
+
+static struct cell *
+decode_nth_node(struct text_trie *tt, struct cell *c, int nth)
+{
+ if (!decode_nth_cell(tt, c, nth)) {
+ return NULL;
+ }
+ if (c->type != TT_NODE) {
+ return NULL;
+ }
+ return c;
+}
+
+static int
+update_mapping(struct text_trie *tt)
+{
+ if (tt->mapping) {
+ anthy_munmap(tt->mapping);
+ }
+ tt->mapping = anthy_mmap(tt->fn, 1);
+ if (!tt->mapping) {
+ /* try to fall back read-only mmap */
+ tt->mapping = anthy_mmap(tt->fn, 0);
+ }
+ if (!tt->mapping) {
+ tt->ptr = NULL;
+ return 1;
+ }
+ tt->ptr = anthy_mmap_address(tt->mapping);
+ return 0;
+}
+
+static int
+expand_file(struct text_trie *tt, int count)
+{
+ char buf[LINE_LEN+1];
+ int i;
+ for (i = 0; i < LINE_LEN; i++) {
+ buf[i] = ' ';
+ }
+ buf[LINE_LEN-1] = '\n';
+ /**/
+ for (i = 0; i < count; i++) {
+ int res;
+ res = fwrite(buf, LINE_LEN, 1, tt->wfp);
+ if (res != 1) {
+ return 1;
+ }
+ if (fflush(tt->wfp)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+set_file_size(struct text_trie *tt, int len)
+{
+ int size = LINE_LEN * len;
+ int cur_size;
+ int err = 0;
+
+ fseek(tt->wfp, 0, SEEK_END);
+ cur_size = ftell(tt->wfp);
+ if (cur_size == size) {
+ return 0;
+ }
+ if (cur_size > size) {
+ truncate(tt->fn, size);
+ } else {
+ err = expand_file(tt, (size - cur_size) / LINE_LEN);
+ if (!err) {
+ update_mapping(tt);
+ } else {
+ tt->fatal = 1;
+ }
+ }
+
+ return err;
+}
+
+static struct cell *
+get_super_cell(struct text_trie *tt)
+{
+ /* cached? */
+ if (tt->valid_super) {
+ return &tt->super;
+ }
+ /* read */
+ if (decode_nth_cell(tt, &tt->super, 0)) {
+ tt->valid_super = 1;
+ return &tt->super;
+ }
+ /* create now */
+ tt->super.type = TT_SUPER;
+ tt->super.u.super.first_unused = 0;
+ tt->super.u.super.root_cell = 0;
+ tt->super.u.super.size = 1;
+ tt->super.u.super.serial = 1;
+ if (set_file_size(tt, 1) != 0) {
+ return NULL;
+ }
+ write_back_cell(tt, &tt->super, 0);
+ tt->valid_super = 1;
+ return &tt->super;
+}
+
+/* convenience function */
+static int
+get_array_size(struct text_trie *a)
+{
+ struct cell *super = get_super_cell(a);
+ int size = super->u.super.size;
+ return size;
+}
+
+/* convenience function */
+static int
+get_root_idx(struct text_trie *tt)
+{
+ struct cell *super = get_super_cell(tt);
+ if (!super) {
+ return 0;
+ }
+ return super->u.super.root_cell;
+}
+
+static int
+expand_array(struct text_trie *tt, int len)
+{
+ int i;
+ struct cell *super;
+ int res;
+ int size = get_array_size(tt);
+ if (size >= len) {
+ return 0;
+ }
+ /* expand file */
+ res = set_file_size(tt, len);
+ if (res) {
+ return 1;
+ }
+ /* fill unused */
+ super = get_super_cell(tt);
+ for (i = super->u.super.size; i < len; i++) {
+ struct cell ex_cell;
+ ex_cell.type = TT_UNUSED;
+ ex_cell.u.next_unused = super->u.super.first_unused;
+ write_back_cell(tt, &ex_cell, i);
+ super->u.super.first_unused = i;
+ }
+ super->u.super.size = len;
+ write_back_cell(tt, super, 0);
+ return 0;
+}
+
+void
+anthy_trie_print_array(struct text_trie *tt)
+{
+ int i;
+ int size = get_array_size(tt);
+ print_cell(0, get_super_cell(tt));
+ for (i = 1; i < size; i++) {
+ struct cell c;
+ decode_nth_cell(tt, &c, i);
+ print_cell(i, &c);
+ release_cell_str(&c);
+ }
+}
+
+/* get unused cell */
+static int
+get_unused_index(struct text_trie *tt)
+{
+ struct cell *super;
+ int unuse;
+ struct cell new_cell;
+
+ super = get_super_cell(tt);
+ unuse = super->u.super.first_unused;
+ if (!unuse) {
+ /* expand array */
+ expand_array(tt, super->u.super.size + EXPAND_COUNT);
+ unuse = super->u.super.first_unused;
+ if (!unuse) {
+ return 0;
+ }
+ }
+ if (!decode_nth_cell(tt, &new_cell, unuse)) {
+ tt->fatal = 1;
+ return 0;
+ }
+ super->u.super.first_unused = new_cell.u.next_unused;
+ new_cell.type = TT_ALLOCED;
+ write_back_cell(tt, &new_cell, unuse);
+ write_back_cell(tt, super, 0);
+ return unuse;
+}
+
+static void
+free_cell(struct text_trie *tt, int idx)
+{
+ struct cell *super = get_super_cell(tt);
+ struct cell c;
+ if (!decode_nth_cell(tt, &c, idx)) {
+ tt->fatal = 1;
+ } else {
+ c.type = TT_UNUSED;
+ c.u.next_unused = super->u.super.first_unused;
+ write_back_cell(tt, &c, idx);
+ }
+ super->u.super.first_unused = idx;
+ write_back_cell(tt, super, 0);
+}
+
+static void
+load_super(struct text_trie *tt)
+{
+ struct cell root, *super;
+ int root_idx;
+ super = get_super_cell(tt);
+ if (!super) {
+ tt->fatal = 1;
+ return ;
+ }
+ /**/
+ if (super->u.super.root_cell) {
+ return ;
+ }
+ /**/
+ root_idx = get_unused_index(tt);
+ if (root_idx == 0) {
+ tt->fatal = 1;
+ return ;
+ }
+ root.u.node.key = 0;
+ root.type = TT_NODE;
+ root.u.node.parent = 0;
+ root.u.node.next = 0;
+ root.u.node.child = 0;
+ root.u.node.body = 0;
+ write_back_cell(tt, &root, root_idx);
+ /**/
+ tt->super.u.super.root_cell = root_idx;
+ write_back_cell(tt, super, 0);
+}
+
+static void
+purge_cache(struct text_trie *tt)
+{
+ if (tt) {
+ tt->valid_super = 0;
+ }
+}
+
+static FILE *
+do_fopen(const char *fn, int create)
+{
+ int fd;
+ if (!create) {
+ /* check file existance */
+ FILE *fp;
+ fp = fopen(fn, "r");
+ if (!fp) {
+ return NULL;
+ }
+ fclose(fp);
+ }
+ fd = open(fn, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ return NULL;
+ }
+ return fdopen(fd, "w");
+}
+
+static struct text_trie *
+alloc_tt(const char *fn, FILE *wfp)
+{
+ struct text_trie *tt;
+ tt = malloc(sizeof(struct text_trie));
+ tt->fatal = 0;
+ tt->wfp = wfp;
+ tt->valid_super = 0;
+ tt->fn = strdup(fn);
+ tt->mapping = NULL;
+ return tt;
+}
+
+static void
+clear_file(const char *fn)
+{
+ FILE *fp = fopen(fn, "w");
+ if (fp) {
+ fclose(fp);
+ }
+}
+
+static struct text_trie *
+trie_open(const char *fn, int create, int do_retry)
+{
+ struct text_trie *tt;
+ FILE *fp;
+ /**/
+ fp = do_fopen(fn, create);
+ if (!fp) {
+ return NULL;
+ }
+ /**/
+ tt = alloc_tt(fn, fp);
+ if (!tt) {
+ fclose(fp);
+ return NULL;
+ }
+ /**/
+ update_mapping(tt);
+ load_super(tt);
+ /**/
+ if (tt->fatal) {
+ anthy_trie_close(tt);
+ if (!do_retry) {
+ return NULL;
+ }
+ clear_file(fn);
+ return trie_open(fn, create, 0);
+ }
+ /**/
+ return tt;
+}
+
+
+/* API */
+struct text_trie *
+anthy_trie_open(const char *fn, int create)
+{
+ struct text_trie *tt;
+ anthy_priv_dic_lock();
+ tt = trie_open(fn, create, 1);
+ anthy_priv_dic_unlock();
+ purge_cache(tt);
+ return tt;
+}
+
+/* API */
+void
+anthy_trie_close(struct text_trie *tt)
+{
+ if (!tt) {
+ return ;
+ }
+ fclose(tt->wfp);
+ anthy_munmap(tt->mapping);
+ free(tt->fn);
+ free(tt);
+}
+
+/* API */
+void
+anthy_trie_update_mapping(struct text_trie *tt)
+{
+ if (!tt) {
+ return ;
+ }
+ anthy_priv_dic_lock();
+ update_mapping(tt);
+ anthy_priv_dic_unlock();
+}
+
+static void
+graft_child(struct text_trie *tt, int parent_idx, int new_idx)
+{
+ struct cell parent_cell;
+ struct cell new_cell;
+ struct cell cur_youngest_cell;
+ int cur_idx;
+ /**/
+ if (!decode_nth_node(tt, &parent_cell, parent_idx)) {
+ return ;
+ }
+ /**/
+ if (parent_cell.u.node.child == 0) {
+ /* 1st child */
+ parent_cell.u.node.child = new_idx;
+ write_back_cell(tt, &parent_cell, parent_idx);
+ return ;
+ }
+
+ if (!decode_nth_node(tt, &cur_youngest_cell, parent_cell.u.node.child)) {
+ return ;
+ }
+ if (!decode_nth_node(tt, &new_cell, new_idx)) {
+ return ;
+ }
+ if (new_cell.u.node.key < cur_youngest_cell.u.node.key) {
+ /* newly added younger child */
+ new_cell.u.node.next = parent_cell.u.node.child;
+ parent_cell.u.node.child = new_idx;
+ write_back_cell(tt, &new_cell, new_idx);
+ write_back_cell(tt, &parent_cell, parent_idx);
+ return;
+ }
+
+ /* insert some order */
+ cur_idx = parent_cell.u.node.child;
+ while (cur_idx) {
+ int next_idx;
+ struct cell cur_cell, tmp_cell;
+ struct cell *next_cell = NULL;
+ if (!decode_nth_node(tt, &cur_cell, cur_idx)) {
+ return ;
+ }
+ next_idx = cur_cell.u.node.next;
+ /**/
+ if (next_idx) {
+ next_cell = decode_nth_node(tt, &tmp_cell, next_idx);
+ }
+ if (!next_cell) {
+ /* append */
+ new_cell.u.node.next = 0;
+ cur_cell.u.node.next = new_idx;
+ write_back_cell(tt, &cur_cell, cur_idx);
+ break;
+ } else {
+ if (cur_cell.u.node.key < new_cell.u.node.key &&
+ new_cell.u.node.key < next_cell->u.node.key) {
+ cur_cell.u.node.next = new_idx;
+ new_cell.u.node.next = next_idx;
+ write_back_cell(tt, &cur_cell, cur_idx);
+ break;
+ }
+ }
+ cur_idx = next_idx;
+ }
+ write_back_cell(tt, &new_cell, new_idx);
+}
+
+static int
+find_child(struct text_trie *tt, int parent_idx, int key, int exact)
+{
+ int child_idx;
+ int prev_key;
+ struct cell parent_cell;
+
+ if (!decode_nth_node(tt, &parent_cell, parent_idx)) {
+ return 0;
+ }
+
+ /**/
+ prev_key = 0;
+ child_idx = parent_cell.u.node.child;
+
+ /**/
+ while (child_idx) {
+ struct cell child_cell;
+ int this_key;
+ /**/
+ if (!decode_nth_node(tt, &child_cell, child_idx)) {
+ return 0;
+ }
+ this_key = child_cell.u.node.key;
+ if (this_key <= prev_key) {
+ return 0;
+ }
+ /**/
+ if (exact && this_key == key) {
+ return child_idx;
+ }
+ if (!exact && (this_key & 0xff00) == (key & 0xff00)) {
+ return child_idx;
+ }
+ child_idx = child_cell.u.node.next;
+ prev_key = this_key;
+ }
+ return 0;
+}
+
+static int
+trie_search_rec(struct text_trie *tt, struct path *p,
+ int parent_idx, int create)
+{
+ int child_idx;
+ int key = p->path[p->cur];
+ /* special case */
+ if (p->cur == p->len) {
+ return parent_idx;
+ }
+ /* scan child */
+ child_idx = find_child(tt, parent_idx, key, 1);
+ if (!child_idx) {
+ struct cell child_cell;
+ if (!create) {
+ return 0;
+ }
+ /* add child */
+ child_idx = get_unused_index(tt);
+ if (!child_idx) {
+ return 0;
+ }
+ if (!decode_nth_cell(tt, &child_cell, child_idx)) {
+ return 0;
+ }
+ child_cell.type = TT_NODE;
+ child_cell.u.node.parent = parent_idx;
+ child_cell.u.node.key = key;
+ child_cell.u.node.next = 0;
+ child_cell.u.node.child = 0;
+ child_cell.u.node.body = 0;
+ write_back_cell(tt, &child_cell, child_idx);
+ /* graft */
+ graft_child(tt, parent_idx, child_idx);
+ }
+ p->cur ++;
+ key ++;
+ if (!key) {
+ return child_idx;
+ }
+ return trie_search_rec(tt, p, child_idx, create);
+}
+
+static char *
+get_str_part(const char *str, int from)
+{
+ char buf[OBJ_LEN+1];
+ int i;
+ for (i = 0; i < OBJ_LEN; i++) {
+ buf[i] = str[from+i];
+ }
+ buf[i] = 0;
+ return strdup(buf);
+}
+
+static void
+release_body(struct text_trie *tt, int idx)
+{
+ struct cell c;
+ int tail_idx;
+ if (!decode_nth_cell(tt, &c, idx) ||
+ c.type != TT_BODY) {
+ return ;
+ }
+ tail_idx = c.next_tail;
+ while (tail_idx) {
+ struct cell tail_cell;
+ int tmp;
+ if (!decode_nth_cell(tt, &tail_cell, tail_idx)) {
+ break;
+ }
+ tmp = tail_cell.next_tail;
+ free_cell(tt, tail_idx);
+ tail_idx = tmp;
+ }
+ free_cell(tt, idx);
+}
+
+static void
+set_body(struct text_trie *tt, int idx, const char *body_str)
+{
+ int body_idx = get_unused_index(tt);
+ int len;
+ int i;
+ struct cell node_cell;
+ struct cell body_cell;
+ struct cell prev_cell;
+ struct cell tail_cell;
+ int prev_idx;
+ /**/
+ if (!decode_nth_cell(tt, &node_cell, idx)) {
+ return ;
+ }
+ if (node_cell.u.node.body) {
+ release_body(tt, node_cell.u.node.body);
+ }
+ len = strlen(body_str);
+ /**/
+ node_cell.u.node.body = body_idx;
+ write_back_cell(tt, &node_cell, idx);
+ /**/
+ if (!decode_nth_cell(tt, &body_cell, body_idx)) {
+ return ;
+ }
+ body_cell.type = TT_BODY;
+ body_cell.u.body.obj = get_str_part(body_str, 0);
+ body_cell.u.body.owner = idx;
+ body_cell.next_tail = 0;
+ write_back_cell(tt, &body_cell, body_idx);
+ release_cell_str(&body_cell);
+ /**/
+ if (!decode_nth_cell(tt, &body_cell, body_idx)) {
+ return ;
+ }
+ /**/
+ prev_idx = body_idx;
+ prev_cell = body_cell;
+ for (i = OBJ_LEN; i < len; i += OBJ_LEN) {
+ int tail_idx = get_unused_index(tt);
+ if (!decode_nth_cell(tt, &tail_cell, tail_idx)) {
+ return ;
+ }
+ tail_cell.type = TT_TAIL;
+ tail_cell.u.tail.obj = get_str_part(body_str, i);
+ tail_cell.u.tail.prev = prev_idx;
+ tail_cell.next_tail = 0;
+ prev_cell.next_tail = tail_idx;
+ write_back_cell(tt, &tail_cell, tail_idx);
+ write_back_cell(tt, &prev_cell, prev_idx);
+ release_cell_str(&prev_cell);
+ /**/
+ prev_idx = tail_idx;
+ prev_cell = tail_cell;
+ }
+ if (i != OBJ_LEN) {
+ release_cell_str(&tail_cell);
+ }
+}
+
+static int
+trie_add(struct text_trie *tt, struct path *p, const char *body)
+{
+ int root_idx = get_root_idx(tt);
+ int target_idx;
+ /**/
+ if (root_idx == 0) {
+ return -1;
+ }
+ target_idx = trie_search_rec(tt, p, root_idx, 1);
+ if (target_idx) {
+ set_body(tt, target_idx, body);
+ }
+ return 0;
+}
+
+/* API */
+int
+anthy_trie_add(struct text_trie *tt, const char *key, const char *body)
+{
+ int res;
+ int len;
+ struct path p;
+ if (!tt || tt->fatal) {
+ return -1;
+ }
+ len = strlen(key);
+ path_setup(&p, key, len, alloca(sizeof(int)*len));
+ anthy_priv_dic_lock();
+ res = trie_add(tt, &p, body);
+ anthy_priv_dic_unlock();
+ purge_cache(tt);
+ return res;
+}
+
+static int
+get_object_length(struct text_trie *tt, int body_idx)
+{
+ int len = 0;
+ int idx = body_idx;
+ while (idx) {
+ struct cell c;
+ if (!decode_nth_cell(tt, &c, idx)) {
+ return 0;
+ }
+ idx = c.next_tail;
+ len += OBJ_LEN;
+ release_cell_str(&c);
+ }
+ return len;
+}
+
+static char *
+gather_str(struct text_trie *tt, int body_idx)
+{
+ int idx;
+ char *buf;
+ int len;
+ /* count length */
+ len = get_object_length(tt, body_idx);
+ if (len == 0) {
+ return NULL;
+ }
+ /**/
+ buf = malloc(len + 1);
+ idx = body_idx;
+ len = 0;
+ while (idx) {
+ struct cell c;
+ if (!decode_nth_cell(tt, &c, idx)) {
+ free(buf);
+ return NULL;
+ }
+ if (len == 0) {
+ sprintf(&buf[len], "%s", c.u.body.obj);
+ } else {
+ sprintf(&buf[len], "%s", c.u.tail.obj);
+ }
+ idx = c.next_tail;
+ len += OBJ_LEN;
+ release_cell_str(&c);
+ }
+ return buf;
+}
+
+static char *
+trie_find(struct text_trie *tt, struct path *p)
+{
+ int root_idx;
+ int target_idx;
+ root_idx = get_root_idx(tt);
+ if (!root_idx) {
+ return NULL;
+ }
+ target_idx = trie_search_rec(tt, p, root_idx, 0);
+ if (target_idx) {
+ struct cell target_cell;
+ int body_idx;
+ if (!decode_nth_node(tt, &target_cell, target_idx)) {
+ return NULL;
+ }
+ body_idx = target_cell.u.node.body;
+ if (body_idx) {
+ return gather_str(tt, body_idx);
+ }
+ }
+ return NULL;
+}
+
+/* API */
+char *
+anthy_trie_find(struct text_trie *tt, char *key)
+{
+ char *res;
+ struct path p;
+ int len;
+ if (!tt || tt->fatal) {
+ return NULL;
+ }
+ len = strlen(key);
+ path_setup(&p, key, len, alloca(sizeof(int)*len));
+ anthy_priv_dic_lock();
+ res = trie_find(tt, &p);
+ anthy_priv_dic_unlock();
+ purge_cache(tt);
+ return res;
+}
+
+static int
+do_find_next_key(struct text_trie *tt, struct path *p,
+ int root_idx, int target_idx)
+{
+ struct cell *target_cell, tmp_cell;
+ int prev_is_up = 0;
+ target_cell = decode_nth_node(tt, &tmp_cell, target_idx);
+ /**/
+ do {
+ /* one step */
+ if (!target_cell) {
+ return -1;
+ }
+ if (!prev_is_up && target_cell->u.node.child) {
+ prev_is_up = 0;
+ target_idx = target_cell->u.node.child;
+ p->cur++;
+ } else if (target_cell->u.node.next) {
+ prev_is_up = 0;
+ target_idx = target_cell->u.node.next;
+ } else if (target_cell->u.node.parent) {
+ prev_is_up = 1;
+ target_idx = target_cell->u.node.parent;
+ p->cur--;
+ } else {
+ return -1;
+ }
+ target_cell = decode_nth_node(tt, &tmp_cell, target_idx);
+ if (!target_cell) {
+ return -1;
+ }
+ if (p->cur >= p->max_len) {
+ continue;
+ }
+ if (p->cur == 0) {
+ return -1;
+ }
+ p->path[p->cur-1] = target_cell->u.node.key;
+ if (!prev_is_up && target_cell->u.node.body) {
+ return 0;
+ }
+ } while (target_idx != root_idx);
+ return -1;
+}
+
+static int
+find_partial_key(struct text_trie *tt, struct path *p, int idx)
+{
+ struct cell c;
+ if (!decode_nth_node(tt, &c, idx)) {
+ return -1;
+ }
+ if (c.type != TT_NODE) {
+ return -1;
+ }
+ p->len ++;
+ p->path[p->cur] = c.u.node.key;
+ p->cur ++;
+ return 0;
+}
+
+static int
+trie_find_next_key(struct text_trie *tt, struct path *p)
+{
+ int root_idx = get_root_idx(tt);
+ int target_idx;
+ int tmp_idx;
+ /**/
+ target_idx = trie_search_rec(tt, p, root_idx, 0);
+ if (target_idx > 0) {
+ /* easy case */
+ return do_find_next_key(tt, p, root_idx, target_idx);
+ }
+ if ((p->path[p->len-1] & 0xff) != 0) {
+ /* simply not exist in tree */
+ return -1;
+ }
+ /* special case */
+ p->len --;
+ p->cur = 0;
+ target_idx = trie_search_rec(tt, p, root_idx, 0);
+ tmp_idx = find_child(tt, target_idx, p->path[p->len], 0);
+ if (tmp_idx) {
+ return find_partial_key(tt, p, tmp_idx);
+ }
+ return do_find_next_key(tt, p, root_idx, target_idx);
+}
+
+
+/* API */
+char *
+anthy_trie_find_next_key(struct text_trie *tt, char *buf, int buf_len)
+{
+ int res;
+ struct path p;
+ if (!tt || tt->fatal) {
+ return NULL;
+ }
+ path_setup(&p, buf, buf_len, alloca(sizeof(int)*buf_len));
+ anthy_priv_dic_lock();
+ res = trie_find_next_key(tt, &p);
+ anthy_priv_dic_unlock();
+ purge_cache(tt);
+ if (res) {
+ return NULL;
+ }
+ path_copy_to_str(&p, buf, buf_len);
+ return buf;
+}
+
+static void
+trie_find_prefix(struct text_trie *tt, const char *str,
+ char *buf, int buf_len,
+ int (*cb)(const char *key, const char *str))
+{
+ int idx = get_root_idx(tt);
+ int i, len = strlen(str);
+ for (i = 0; i < len && i < buf_len; i++) {
+ struct cell c;
+ idx = find_child(tt, idx, str[i], 1);
+ if (!idx) {
+ return ;
+ }
+ if (!decode_nth_node(tt, &c, idx)) {
+ return ;
+ }
+ buf[i] = idx;
+ buf[i+1] = 0;
+ if (c.u.node.body) {
+ char *s = gather_str(tt, c.u.node.body);
+ if (cb) {
+ cb(buf, s);
+ }
+ free(s);
+ }
+ }
+}
+
+void
+anthy_trie_find_prefix(struct text_trie *tt, const char *str,
+ char *buf, int buf_len,
+ int (*cb)(const char *key, const char *str))
+{
+ if (!tt || tt->fatal) {
+ return ;
+ }
+ anthy_priv_dic_lock();
+ trie_find_prefix(tt, str, buf, buf_len, cb);
+ anthy_priv_dic_unlock();
+ purge_cache(tt);
+}
+
+static void
+disconnect(struct text_trie *tt, int parent_idx, int target_idx)
+{
+ struct cell parent_cell;
+ struct cell target_cell;
+
+ if (!decode_nth_node(tt, &parent_cell, parent_idx) ||
+ !decode_nth_node(tt, &target_cell, target_idx)) {
+ return ;
+ }
+
+ if (parent_cell.u.node.child == target_idx) {
+ /* 1st child */
+ parent_cell.u.node.child = target_cell.u.node.next;
+ write_back_cell(tt, &parent_cell, parent_idx);
+ if (!target_cell.u.node.next &&
+ !parent_cell.u.node.body) {
+ /* only child and parent does not have body, so traverse upward */
+ disconnect(tt, parent_cell.u.node.parent, parent_idx);
+ free_cell(tt, target_idx);
+ return ;
+ }
+ free_cell(tt, target_idx);
+ } else {
+ /* not 1st child */
+ int child_idx = parent_cell.u.node.child;
+ while (child_idx) {
+ struct cell cur;
+ if (!decode_nth_cell(tt, &cur, child_idx)) {
+ return ;
+ }
+ if (cur.u.node.next == target_idx) {
+ /**/
+ cur.u.node.next = target_cell.u.node.next;
+ write_back_cell(tt, &cur, child_idx);
+ free_cell(tt, target_idx);
+ return ;
+ }
+ child_idx = cur.u.node.next;
+ }
+ }
+}
+
+static void
+trie_delete(struct text_trie *tt, struct path *p)
+{
+ struct cell target_cell;
+ int root_idx = get_root_idx(tt);
+ int target_idx, parent_idx;
+ target_idx = trie_search_rec(tt, p, root_idx, 0);
+ if (!target_idx) {
+ return ;
+ }
+ if (!decode_nth_node(tt, &target_cell, target_idx)) {
+ return ;
+ }
+ release_body(tt, target_cell.u.node.body);
+ target_cell.u.node.body = 0;
+ write_back_cell(tt, &target_cell, target_idx);
+ if (target_cell.u.node.child) {
+ return ;
+ }
+ parent_idx = target_cell.u.node.parent;
+ disconnect(tt, parent_idx, target_idx);
+}
+
+/* API */
+void
+anthy_trie_delete(struct text_trie *tt, const char *key)
+{
+ struct path p;
+ int len;
+ if (!tt || tt->fatal) {
+ return ;
+ }
+ len = strlen(key);
+ path_setup(&p, key, len, alloca(sizeof(int)*len));
+ anthy_priv_dic_lock();
+ trie_delete(tt, &p);
+ anthy_priv_dic_unlock();
+ purge_cache(tt);
+}