1 files changed, 442 insertions, 0 deletions
diff --git a/depgraph/mkdepgraph.c b/depgraph/mkdepgraph.c
new file mode 100644
index 0000000..5842946
--- /dev/null
+++ b/depgraph/mkdepgraph.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (C) 2000-2007 TABATA Yusuke
+ * Copyright (C) 2004-2006 YOSHIDA Yuichi
+ */
+/*
+ * 付属語グラフをバイナリ化する
+ * init_word_seq_tab()
+ *   付属語テーブル中のノードへのポインタの初期化
+ */
+/*
+  This library is free software; you can redistribute it and/or
+  modify it under the terms of the GNU Lesser General Public
+  License as published by the Free Software Foundation; either
+  version 2 of the License, or (at your option) any later version.
+
+  This library is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public
+  License along with this library; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <anthy/alloc.h>
+#include <anthy/conf.h>
+#include <anthy/ruleparser.h>
+#include <anthy/xstr.h>
+#include <anthy/logger.h>
+#include <anthy/splitter.h>
+#include <anthy/anthy.h>
+#include <anthy/depgraph.h>
+#include <anthy/diclib.h>
+
+#ifndef SRCDIR
+#define SRCDIR "."
+#endif
+
+static int verbose; /* non-zero: parse_indep() prints each rule it registers */
+
+static struct dep_node* gNodes; /* node table, indexed by node id */
+static char** gNodeNames; /* name of each node, parallel to gNodes */
+static int nrNodes; /* number of entries in gNodes / gNodeNames */
+
+/* Word connection rules */
+static struct wordseq_rule *gRules; /* one entry appended per rule by parse_indep() */
+static int nrRules; /* number of entries in gRules */
+
+/*
+ * Return the id of the node called `name`, registering it first if needed.
+ *
+ * The id is the node's index in gNodes / gNodeNames.  A newly registered
+ * node starts with no branches and keeps its own copy of `name`.
+ * Registering a node may move the gNodes array, so pointers into gNodes
+ * that were taken before the call must not be used afterwards.
+ * Running out of memory is reported on stderr and terminates the program.
+ */
+static int
+get_node_id_by_name(const char *name)
+{
+  struct dep_node *nodes;
+  char **names;
+  char *copy;
+  int i;
+
+  /* Look among the nodes registered so far. */
+  for (i = 0; i < nrNodes; i++) {
+    if (!strcmp(name, gNodeNames[i])) {
+      return i;
+    }
+  }
+
+  /*
+   * Not found, so append a new node.  Each realloc() result is checked
+   * before it replaces the old pointer.
+   */
+  nodes = realloc(gNodes, sizeof(*nodes) * (nrNodes + 1));
+  if (nodes) {
+    gNodes = nodes;
+  }
+  names = realloc(gNodeNames, sizeof(*names) * (nrNodes + 1));
+  if (names) {
+    gNodeNames = names;
+  }
+  copy = strdup(name);
+  if (!nodes || !names || !copy) {
+    fprintf(stderr, "Out of memory while registering node %s.\n", name);
+    exit(1);
+  }
+
+  gNodes[nrNodes].nr_branch = 0;
+  gNodes[nrNodes].branch = 0;
+  gNodeNames[nrNodes] = copy;
+  nrNodes++;
+  return nrNodes - 1;
+}
+
+
+/*
+ * Find the branch of `node` whose transition condition equals `strs`,
+ * creating it when there is none.
+ *
+ * Two conditions are equal when they have the same number of strings and
+ * anthy_xstrcmp() returns 0 for every pair at the same position.
+ *
+ * A newly created branch gets its own array holding the xstr pointers from
+ * `strs` (the pointers are shared, the array is not) and starts with no
+ * transitions.
+ * NOTE(review): when an existing branch is returned the xstr objects in
+ * `strs` are not adopted here; whether anybody releases them is not
+ * visible in this function -- confirm against the caller.
+ *
+ * Running out of memory is reported on stderr and terminates the program.
+ */
+static struct dep_branch *
+find_branch(struct dep_node *node, xstr **strs, int nr_strs)
+{
+  struct dep_branch *db;
+  struct dep_branch *grown;
+  int i, j;
+
+  /* Look for a branch with the same transition condition. */
+  for (i = 0; i < node->nr_branch; i++) {
+    db = &node->branch[i];
+    if (nr_strs != db->nr_strs) {
+      continue;
+    }
+    for (j = 0; j < nr_strs; j++) {
+      if (anthy_xstrcmp(db->str[j], strs[j])) {
+        break;
+      }
+    }
+    if (j == nr_strs) {
+      /* every string matched */
+      return db;
+    }
+  }
+
+  /* Allocate a new branch at the end of the node's branch array. */
+  grown = realloc(node->branch,
+                  sizeof(*grown) * (node->nr_branch + 1));
+  if (!grown) {
+    fprintf(stderr, "Out of memory while adding a branch.\n");
+    exit(1);
+  }
+  node->branch = grown;
+  db = &grown[node->nr_branch];
+
+  db->str = malloc(sizeof(xstr *) * nr_strs);
+  if (!db->str && nr_strs > 0) {
+    /* malloc(0) may legitimately return NULL, hence the nr_strs test */
+    fprintf(stderr, "Out of memory while adding a branch.\n");
+    exit(1);
+  }
+  node->nr_branch++;
+
+  for (i = 0; i < nr_strs; i++) {
+    db->str[i] = strs[i];
+  }
+  db->nr_strs = nr_strs;
+  db->nr_transitions = 0;
+  db->transition = 0;
+  return db;
+}
+
+/*
+ * Parse one transition token into *tr.
+ *  (see doc/SPLITTER)
+ *
+ * A token is a run of attribute characters followed by the name of the
+ * destination node; the name starts at the first '@' and includes it.
+ *   ':' or '.'   mark the transition as weak
+ *   'C' + code   conjugation form        (z y s t k m g)
+ *   'H' + code   part of speech of the independent word (n v j)
+ *   'S' + code   class of the segment    (f k y t e r)
+ * Unknown attribute characters and unknown 'S' codes are reported on
+ * stdout; unknown 'C' and 'H' codes are skipped silently.
+ *
+ * The scan stops at the end of the string as well as at '@', so a token
+ * without a destination can no longer run off the end of the buffer.
+ * Such a token is reported on stderr and terminates the program, because
+ * the transition would have no target.
+ */
+static void
+parse_transition(char *token, struct dep_transition *tr)
+{
+  int ct = CT_NONE;
+  enum dep_class dc = DEP_NONE;
+  char *str = token; /* start of the token, kept for diagnostics */
+
+  tr->head_pos = POS_NONE;
+  tr->weak = 0;
+
+  /* Analyse the attributes in front of the node name. */
+  while (*token != '\0' && *token != '@') {
+    switch (*token) {
+    case ':':
+    case '.':
+      tr->weak = 1;
+      break;
+    case 'C':
+      /* conjugation form */
+      switch (token[1]) {
+      case 'z': ct = CT_MIZEN; break;
+      case 'y': ct = CT_RENYOU; break;
+      case 's': ct = CT_SYUSI; break;
+      case 't': ct = CT_RENTAI; break;
+      case 'k': ct = CT_KATEI; break;
+      case 'm': ct = CT_MEIREI; break;
+      case 'g': ct = CT_HEAD; break;
+      default: break;
+      }
+      /* consume the code character, but never the terminating NUL */
+      if (token[1] != '\0') {
+        token++;
+      }
+      break;
+    case 'H':
+      /* part of speech of the independent word */
+      switch (token[1]) {
+      case 'n': tr->head_pos = POS_NOUN; break;
+      case 'v': tr->head_pos = POS_V; break;
+      case 'j': tr->head_pos = POS_AJV; break;
+      default: break;
+      }
+      if (token[1] != '\0') {
+        token++;
+      }
+      break;
+    case 'S':
+      /* attribute of the segment */
+      switch (token[1]) {
+      case 'f': dc = DEP_FUZOKUGO; break;
+      case 'k': dc = DEP_KAKUJOSHI; break;
+      case 'y': dc = DEP_RENYOU; break;
+      case 't': dc = DEP_RENTAI; break;
+      case 'e': dc = DEP_END; break;
+      case 'r': dc = DEP_RAW; break;
+      default: printf("unknown (S%c)\n", token[1]);
+      }
+      if (token[1] != '\0') {
+        token++;
+      }
+      break;
+    default:
+      printf("Unknown (%c) %s\n", *token, str);
+      break;
+    }
+    token++;
+  }
+
+  if (*token != '@') {
+    fprintf(stderr, "No destination node in transition (%s)\n", str);
+    exit(1);
+  }
+
+  /* Everything from '@' onwards is the name of the node. */
+  tr->next_node = get_node_id_by_name(token);
+  /* no attribute sets pos, so it is always POS_NONE */
+  tr->pos = POS_NONE;
+  tr->ct = ct;
+  tr->dc = dc;
+}
+
+/*
+ * Parse one line of the dependent-word graph definition:
+ *   node-name  transition-condition+  transition+
+ *
+ * tokens[0] names the node.  The following tokens that start with a
+ * double quote are the strings of the transition condition; the leading
+ * quote and the last character of each are dropped before conversion
+ * from EUC-JP.  All remaining tokens are transitions and are appended to
+ * the branch selected by that condition.
+ *
+ * A line without any condition is logged and treated as having the single
+ * empty string as its condition.  A line with no tokens is ignored.
+ * Running out of memory is reported on stderr and terminates the program.
+ */
+static void
+parse_dep(char **tokens, int nr)
+{
+  struct dep_branch *db;
+  struct dep_node *dn;
+  xstr **strs;
+  int nr_strs = 0;
+  int row = 0;
+  int id;
+
+  if (nr < 1) {
+    /* nothing to parse; tokens[0] would be out of range */
+    return;
+  }
+
+  /* Heap storage instead of alloca(): at most nr condition strings. */
+  strs = malloc(sizeof(xstr *) * nr);
+  if (!strs) {
+    fprintf(stderr, "Out of memory while parsing node %s.\n", tokens[0]);
+    exit(1);
+  }
+
+  /* Get the node and its id. */
+  id = get_node_id_by_name(tokens[row]);
+  dn = &gNodes[id];
+  row++;
+
+  /* Build the array of dependent words of the transition condition. */
+  for (; row < nr && tokens[row][0] == '\"'; row++) {
+    size_t len;
+    char *s = strdup(&tokens[row][1]);
+
+    if (!s) {
+      fprintf(stderr, "Out of memory while parsing node %s.\n", tokens[0]);
+      exit(1);
+    }
+    len = strlen(s);
+    if (len > 0) {
+      /* drop the last character; a lone quote leaves nothing to drop */
+      s[len - 1] = '\0';
+    }
+    strs[nr_strs] = anthy_cstr_to_xstr(s, ANTHY_EUC_JP_ENCODING);
+    nr_strs++;
+    free(s);
+  }
+
+  /* Without a condition, warn and add an empty transition condition. */
+  if (nr_strs == 0) {
+    char empty[] = "";
+
+    anthy_log(0, "node %s has a branch without any transition condition.\n",
+              tokens[0]);
+    strs[0] = anthy_cstr_to_xstr(empty, ANTHY_EUC_JP_ENCODING);
+    nr_strs = 1;
+  }
+
+  /*
+   * dn is used before any further node registration, which could move
+   * gNodes.  find_branch() copies the pointers out of strs, so the array
+   * itself can be released right away.
+   */
+  db = find_branch(dn, strs, nr_strs);
+  free(strs);
+
+  /* Append the destination nodes to the branch. */
+  for (; row < nr; row++) {
+    struct dep_transition *grown;
+
+    grown = realloc(db->transition,
+                    sizeof(*grown) * (db->nr_transitions + 1));
+    if (!grown) {
+      fprintf(stderr, "Out of memory while parsing node %s.\n", tokens[0]);
+      exit(1);
+    }
+    db->transition = grown;
+    parse_transition(tokens[row], &grown[db->nr_transitions]);
+    db->nr_transitions++;
+  }
+}
+
+/*
+ * Check whether the grammar definition contains empty nodes and log each
+ * node that has no branch.
+ *
+ * The scan starts at id 1, so node 0 is never reported.
+ */
+static void
+check_nodes(void)
+{
+  int i;
+  for (i = 1; i < nrNodes; i++) {
+    if (gNodes[i].nr_branch == 0) {
+      /* pass the name of this node, not the whole name table */
+      anthy_log(0, "node %s has no branch.\n", gNodeNames[i]);
+    }
+  }
+}
+
+
+/*
+ * Load the dependent-word graph from the file named by the DEPWORD
+ * configuration entry.
+ *
+ * The node "@" is registered before anything else so that it receives
+ * id 0.  Every line of the file is handed to parse_dep(), and the nodes
+ * are checked for missing branches at the end.
+ *
+ * Returns 0 on success, -1 when DEPWORD is not configured or the file
+ * cannot be opened (both cases are logged).
+ */
+static int
+init_depword_tab(void)
+{
+  char **line_tokens;
+  int nr_tokens;
+  const char *path;
+
+  /* Assign id 0 to the empty node. */
+  get_node_id_by_name("@");
+
+  path = anthy_conf_get_str("DEPWORD");
+  if (path == NULL) {
+    anthy_log(0, "Dependent word dictionary is unspecified.\n");
+    return -1;
+  }
+  if (anthy_open_file(path) == -1) {
+    anthy_log(0, "Failed to open dep word dict (%s).\n", path);
+    return -1;
+  }
+
+  /* Read the graph one line at a time until the reader reports non-zero. */
+  for (;;) {
+    if (anthy_read_line(&line_tokens, &nr_tokens)) {
+      break;
+    }
+    parse_dep(line_tokens, nr_tokens);
+    anthy_free_line();
+  }
+  anthy_close_file();
+
+  check_nodes();
+  return 0;
+}
+
+
+/*
+ * Parse one line of the independent-word definition and append the
+ * resulting rule to gRules.
+ *
+ * tokens[0] is the name of a word type and tokens[1] the name of the
+ * graph node it leads to.  Lines with fewer than two tokens are reported
+ * on stdout and skipped.  With `verbose` set, the index and type name of
+ * each new rule are printed.
+ * Running out of memory is reported on stderr and terminates the program.
+ */
+static void
+parse_indep(char **tokens, int nr)
+{
+  struct wordseq_rule *grown;
+
+  if (nr < 2) {
+    printf("Syntax error in indepword defs"
+           " :%d.\n", anthy_get_line_number());
+    return;
+  }
+
+  /* Check the realloc() result before it replaces the old pointer. */
+  grown = realloc(gRules, sizeof(*grown) * (nrRules + 1));
+  if (!grown) {
+    fprintf(stderr, "Out of memory while adding rule for %s.\n", tokens[0]);
+    exit(1);
+  }
+  gRules = grown;
+
+  /* The line starts with the name of the part of speech ... */
+  gRules[nrRules].wt = anthy_init_wtype_by_name(tokens[0]);
+
+  /* ... followed by the name of the node. */
+  gRules[nrRules].node_id = get_node_id_by_name(tokens[1]);
+
+  if (verbose) {
+    printf("%d (%s)\n", nrRules, tokens[0]);
+  }
+
+  nrRules++;
+}
+
+/**
+ * Build the table of transitions that start from independent words.
+ *
+ * The file named by the INDEPWORD configuration entry is read line by
+ * line and every line is handed to parse_indep().
+ *
+ * Returns 0 on success, -1 when INDEPWORD is not configured or the file
+ * cannot be opened (both cases are reported on stdout).
+ */
+static int
+init_indep_word_seq_tab(void)
+{
+  char **line_tokens;
+  int nr_tokens;
+  const char *path = anthy_conf_get_str("INDEPWORD");
+
+  if (path == NULL) {
+    printf("independent word dict unspecified.\n");
+    return -1;
+  }
+  if (anthy_open_file(path) == -1) {
+    printf("Failed to open indep word dict (%s).\n", path);
+    return -1;
+  }
+
+  /* Read the file one line at a time until the reader reports non-zero. */
+  for (;;) {
+    if (anthy_read_line(&line_tokens, &nr_tokens)) {
+      break;
+    }
+    parse_indep(line_tokens, nr_tokens);
+    anthy_free_line();
+  }
+  anthy_close_file();
+
+  return 0;
+}
+
+/*
+ * Write one integer to fp after converting it with anthy_dic_htonl()
+ * (network byte order).  sizeof(int) bytes are written, which the format
+ * expects to be 4.  The result of fwrite() is not checked.
+ */
+static void
+write_nl(FILE* fp, int i)
+{
+  int wire;
+
+  wire = anthy_dic_htonl(i);
+  fwrite(&wire, sizeof wire, 1, fp);
+}
+
+/*
+ * Write one transition to fp as six integers, in this order:
+ * next_node, pos, ct, dc, head_pos, weak.
+ */
+static void
+write_transition(FILE* fp, struct dep_transition* transition)
+{
+  const int fields[] = {
+    transition->next_node,
+    transition->pos,
+    transition->ct,
+    transition->dc,
+    transition->head_pos,
+    transition->weak
+  };
+  const int nr_fields = (int)(sizeof fields / sizeof fields[0]);
+  int k;
+
+  for (k = 0; k < nr_fields; k++) {
+    write_nl(fp, fields[k]);
+  }
+}
+
+/*
+ * Write a string to fp: its length as an integer, then each character
+ * converted with anthy_dic_htonl() and written as one xchar.
+ */
+static void
+write_xstr(FILE* fp, xstr* str)
+{
+  int idx = 0;
+
+  write_nl(fp, str->len);
+
+  while (idx < str->len) {
+    xchar wire = anthy_dic_htonl(str->str[idx]);
+
+    fwrite(&wire, sizeof wire, 1, fp);
+    idx++;
+  }
+}
+
+/*
+ * Write one branch to fp: the number of condition strings followed by
+ * the strings, then the number of transitions followed by the
+ * transitions.
+ */
+static void
+write_branch(FILE* fp, struct dep_branch* branch)
+{
+  int s = 0;
+  int t = 0;
+
+  /* transition condition */
+  write_nl(fp, branch->nr_strs);
+  while (s < branch->nr_strs) {
+    write_xstr(fp, branch->str[s]);
+    s++;
+  }
+
+  /* destinations */
+  write_nl(fp, branch->nr_transitions);
+  while (t < branch->nr_transitions) {
+    write_transition(fp, branch->transition + t);
+    t++;
+  }
+}
+
+/*
+ * Write one node to fp: the number of branches followed by every branch.
+ */
+static void
+write_node(FILE* fp, struct dep_node* node)
+{
+  int b = 0;
+
+  write_nl(fp, node->nr_branch);
+  while (b < node->nr_branch) {
+    write_branch(fp, node->branch + b);
+    b++;
+  }
+}
+
+/*
+ * Write a word type to fp as eight bytes: pos, cos, scos, cc, ct and wf,
+ * one byte each, followed by two zero bytes.
+ */
+static void
+write_wtype(FILE *fp, wtype_t wt)
+{
+  int value;
+  int pad;
+
+  value = anthy_wtype_get_pos(wt);
+  fputc(value, fp);
+  value = anthy_wtype_get_cos(wt);
+  fputc(value, fp);
+  value = anthy_wtype_get_scos(wt);
+  fputc(value, fp);
+  value = anthy_wtype_get_cc(wt);
+  fputc(value, fp);
+  value = anthy_wtype_get_ct(wt);
+  fputc(value, fp);
+  value = anthy_wtype_get_wf(wt);
+  fputc(value, fp);
+
+  /* two trailing zero bytes bring the record to eight bytes */
+  for (pad = 0; pad < 2; pad++) {
+    fputc(0, fp);
+  }
+}
+
+/*
+ * Write the rules and the node graph to `file_name`.
+ *
+ * Layout: the number of rules; for each rule its word type followed by
+ * its node id; the number of nodes; then every node in id order.
+ *
+ * The file is opened in binary mode because the content is not text.
+ * Failure to open, write or close the file is reported on stderr and
+ * terminates the program, since a missing or truncated output file would
+ * otherwise go unnoticed.
+ */
+static void
+write_file(const char* file_name)
+{
+  int i;
+  int failed;
+  FILE *fp = fopen(file_name, "wb");
+
+  if (!fp) {
+    perror(file_name);
+    exit(1);
+  }
+
+  /* each rule */
+  write_nl(fp, nrRules);
+  for (i = 0; i < nrRules; ++i) {
+    write_wtype(fp, gRules[i].wt);
+    write_nl(fp, gRules[i].node_id);
+  }
+
+  /* each node */
+  write_nl(fp, nrNodes);
+  for (i = 0; i < nrNodes; ++i) {
+    write_node(fp, &gNodes[i]);
+  }
+
+  /* The writers do not report errors, so check the stream state here. */
+  failed = ferror(fp);
+  if (fclose(fp) != 0) {
+    failed = 1;
+  }
+  if (failed) {
+    fprintf(stderr, "Failed to write %s.\n", file_name);
+    exit(1);
+  }
+}
+
+/*
+ * Read the dependent-word dictionary and write it out as "anthy.dep".
+ *
+ * Command-line arguments are not used.  The configuration file and the
+ * data directory are overridden with paths relative to the build tree.
+ * Returns 0 on success and EXIT_FAILURE when either input table cannot
+ * be loaded, in which case no output file is written.
+ */
+int
+main(int argc, char* argv[])
+{
+  (void)argc;
+  (void)argv;
+
+  anthy_conf_override("CONFFILE", "../anthy-conf");
+  anthy_conf_override("ANTHYDIR", SRCDIR "/../depgraph/");
+
+  anthy_init_wtypes();
+  anthy_do_conf_init();
+
+  /* dependent-word graph */
+  if (init_depword_tab() != 0) {
+    return EXIT_FAILURE;
+  }
+  /* transition table from independent words */
+  if (init_indep_word_seq_tab() != 0) {
+    return EXIT_FAILURE;
+  }
+
+  write_file("anthy.dep");
+
+  return 0;
+}