summaryrefslogtreecommitdiff
path: root/src-worddic/feature_set.c
diff options
context:
space:
mode:
Diffstat (limited to 'src-worddic/feature_set.c')
-rw-r--r--src-worddic/feature_set.c248
1 files changed, 248 insertions, 0 deletions
diff --git a/src-worddic/feature_set.c b/src-worddic/feature_set.c
new file mode 100644
index 0000000..fcf7aba
--- /dev/null
+++ b/src-worddic/feature_set.c
@@ -0,0 +1,248 @@
+/* features
+ *
+ * 素性の番号と意味を隠蔽して管理する
+ *
+ * Copyright (C) 2006-2007 TABATA Yusuke
+ *
+ */
+/*
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <anthy/segclass.h>
+#include <anthy/feature_set.h>
+/* for MW_FEATURE* constants */
+#include <anthy/splitter.h>
+
+/* 素性の番号
+ *
+ * 0-19 クラス素性
+ * 30-319(30+SEG_SIZE^2) クラス遷移属性
+ * 540-579 その他
+ * 580- (1024個) 付属語の種類
+ */
+
+#define CUR_CLASS_BASE 0
+#define DEP_TYPE_FEATURE_BASE 20
+#define CLASS_TRANS_BASE 30
+#define FEATURE_SV 542
+#define FEATURE_WEAK 543
+#define FEATURE_SUFFIX 544
+#define FEATURE_NUM 546
+#define FEATURE_CORE1 547
+#define FEATURE_HIGH_FREQ 548
+#define FEATURE_WEAK_SEQ 549
+#define COS_BASE 573
+#define DEP_FEATURE_BASE 580
+
+void
+anthy_feature_list_init(struct feature_list *fl)
+{
+ fl->nr = 0;
+ fl->size = NR_EM_FEATURES;
+}
+
+void
+anthy_feature_list_free(struct feature_list *fl)
+{
+ (void)fl;
+}
+
+void
+anthy_feature_list_add(struct feature_list *fl, int f)
+{
+ if (fl->nr < NR_EM_FEATURES) {
+ fl->u.index[fl->nr] = f;
+ fl->nr++;
+ }
+}
+
+int
+anthy_feature_list_nr(const struct feature_list *fl)
+{
+ return fl->nr;
+}
+
+int
+anthy_feature_list_nth(const struct feature_list *fl, int nth)
+{
+ return fl->u.index[nth];
+}
+
+static int
+cmp_short(const void *p1, const void *p2)
+{
+ return *((short *)p1) - *((short *)p2);
+}
+
+void
+anthy_feature_list_sort(struct feature_list *fl)
+{
+ qsort(fl->u.index, fl->nr, sizeof(fl->u.index[0]),
+ cmp_short);
+}
+
+
+void
+anthy_feature_list_set_cur_class(struct feature_list *fl, int cl)
+{
+ anthy_feature_list_add(fl, CUR_CLASS_BASE + cl);
+}
+
+void
+anthy_feature_list_set_class_trans(struct feature_list *fl, int pc, int cc)
+{
+ anthy_feature_list_add(fl, CLASS_TRANS_BASE + pc * SEG_SIZE + cc);
+}
+
+void
+anthy_feature_list_set_dep_word(struct feature_list *fl, int h)
+{
+ anthy_feature_list_add(fl, h + DEP_FEATURE_BASE);
+}
+
+void
+anthy_feature_list_set_dep_class(struct feature_list *fl, int c)
+{
+ anthy_feature_list_add(fl, c + DEP_TYPE_FEATURE_BASE);
+}
+
+void
+anthy_feature_list_set_noun_cos(struct feature_list *fl, wtype_t wt)
+{
+ int c;
+ if (anthy_wtype_get_pos(wt) != POS_NOUN) {
+ return ;
+ }
+ c = anthy_wtype_get_cos(wt);
+ if (c == COS_SUFFIX) {
+ anthy_feature_list_add(fl, COS_BASE + c);
+ }
+}
+
+void
+anthy_feature_list_set_mw_features(struct feature_list *fl, int mask)
+{
+ if (mask & MW_FEATURE_WEAK_CONN) {
+ anthy_feature_list_add(fl, FEATURE_WEAK);
+ }
+ if (mask & MW_FEATURE_SUFFIX) {
+ anthy_feature_list_add(fl, FEATURE_SUFFIX);
+ }
+ if (mask & MW_FEATURE_SV) {
+ anthy_feature_list_add(fl, FEATURE_SV);
+ }
+ if (mask & MW_FEATURE_NUM) {
+ anthy_feature_list_add(fl, FEATURE_NUM);
+ }
+ if (mask & MW_FEATURE_CORE1) {
+ anthy_feature_list_add(fl, FEATURE_CORE1);
+ }
+ if (mask & MW_FEATURE_HIGH_FREQ) {
+ anthy_feature_list_add(fl, FEATURE_HIGH_FREQ);
+ }
+ if (mask & MW_FEATURE_WEAK_SEQ) {
+ anthy_feature_list_add(fl, FEATURE_WEAK_SEQ);
+ }
+}
+
+void
+anthy_feature_list_print(struct feature_list *fl)
+{
+ int i;
+ printf("features=");
+ for (i = 0; i < fl->nr; i++) {
+ if (i) {
+ printf(",");
+ }
+ printf("%d", fl->u.index[i]);
+ }
+ printf("\n");
+}
+
+static int
+compare_line(const void *kp, const void *cp)
+{
+ const int *f = kp;
+ const struct feature_freq *c = cp;
+ int i;
+ for (i = 0; i < NR_EM_FEATURES; i++) {
+ if (f[i] != (int)ntohl(c->f[i])) {
+ return f[i] - ntohl(c->f[i]);
+ }
+ }
+ return 0;
+}
+
+struct feature_freq *
+anthy_find_array_freq(const void *image, int *f, int nr,
+ struct feature_freq *arg)
+{
+ struct feature_freq *res;
+ int nr_lines, i;
+ const int *array = (int *)image;
+ int n[NR_EM_FEATURES];
+ if (!image) {
+ return NULL;
+ }
+ /* コピーする */
+ for (i = 0; i < NR_EM_FEATURES; i++) {
+ if (i < nr) {
+ n[i] = f[i];
+ } else {
+ n[i] = 0;
+ }
+ }
+ /**/
+ nr_lines = ntohl(array[1]);
+ res = bsearch(n, &array[16], nr_lines,
+ sizeof(struct feature_freq),
+ compare_line);
+ if (!res) {
+ return NULL;
+ }
+ for (i = 0; i < NR_EM_FEATURES + 2; i++) {
+ arg->f[i] = ntohl(res->f[i]);
+ }
+ return arg;
+}
+
+struct feature_freq *
+anthy_find_feature_freq(const void *image,
+ const struct feature_list *fl,
+ struct feature_freq *arg)
+{
+ int i, nr;
+ int f[NR_EM_FEATURES + 2];
+
+ /* 配列にコピーする */
+ nr = anthy_feature_list_nr(fl);
+ for (i = 0; i < NR_EM_FEATURES + 2; i++) {
+ if (i < nr) {
+ f[i] = anthy_feature_list_nth(fl, i);
+ } else {
+ f[i] = 0;
+ }
+ }
+ return anthy_find_array_freq(image, f, NR_EM_FEATURES, arg);
+}
+
+void
+anthy_init_features(void)
+{
+}