summaryrefslogtreecommitdiff
path: root/storage/mroonga/vendor/groonga/plugins/token_filters/stop_word.c
blob: 054cd65aa370979b4f8608871aba8bee8355de08 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2014 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/

#include <grn_str.h>

#include <groonga.h>
#include <groonga/token_filter.h>

#include <string.h>

/* Name of the column, looked up on the filtered table, whose value is read
   as a boolean to decide whether a token is a stop word. */
#define COLUMN_NAME "is_stop_word"

/* Per-use state of the stop word token filter. Created by stop_word_init()
   and released by stop_word_fin(). */
typedef struct {
  /* Table handed to stop_word_init(); tokens are looked up in it. */
  grn_obj *table;
  /* Token mode handed to stop_word_init(). */
  grn_token_mode mode;
  /* COLUMN_NAME column of `table`; unlinked in stop_word_fin(). */
  grn_obj *column;
  /* Bool bulk reused as the output buffer when reading `column`. */
  grn_obj value;
  /* Initialized in stop_word_init() and finalized in stop_word_fin();
     not otherwise referenced in this file. */
  grn_tokenizer_token token;
} grn_stop_word_token_filter;

/*
 * Creates the state for the stop word token filter.
 *
 * ctx:   Groonga context used for allocation and error reporting.
 * table: table the filter is attached to; it must have a COLUMN_NAME column.
 * mode:  token mode; the filter is only set up for GRN_TOKEN_GET.
 *
 * Returns a newly allocated grn_stop_word_token_filter that must be released
 * with stop_word_fin(), or NULL when:
 *   - mode is not GRN_TOKEN_GET (no error is reported; stop_word_filter()
 *     and stop_word_fin() treat NULL user data as "do nothing"),
 *   - the allocation fails (GRN_NO_MEMORY_AVAILABLE is reported), or
 *   - the COLUMN_NAME column can't be found on the table
 *     (GRN_TOKEN_FILTER_ERROR is reported).
 */
static void *
stop_word_init(grn_ctx *ctx, grn_obj *table, grn_token_mode mode)
{
  grn_stop_word_token_filter *token_filter;

  if (mode != GRN_TOKEN_GET) {
    return NULL;
  }

  token_filter = GRN_PLUGIN_MALLOC(ctx, sizeof(*token_filter));
  if (!token_filter) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[token-filter][stop-word] "
                     "failed to allocate grn_stop_word_token_filter");
    return NULL;
  }

  token_filter->table = table;
  token_filter->mode = mode;
  token_filter->column = grn_obj_column(ctx,
                                        token_filter->table,
                                        COLUMN_NAME,
                                        strlen(COLUMN_NAME));
  if (!token_filter->column) {
    char table_name[GRN_TABLE_MAX_KEY_SIZE];
    /* Kept as int because it is passed as the "%.*s" precision, which
       must be an int argument. */
    int table_name_size;

    table_name_size = grn_obj_name(ctx,
                                   token_filter->table,
                                   table_name,
                                   GRN_TABLE_MAX_KEY_SIZE);
    GRN_PLUGIN_ERROR(ctx, GRN_TOKEN_FILTER_ERROR,
                     "[token-filter][stop-word] "
                     "column for judging stop word doesn't exist: <%.*s.%s>",
                     table_name_size,
                     table_name,
                     COLUMN_NAME);
    GRN_PLUGIN_FREE(ctx, token_filter);
    return NULL;
  }

  /* Reusable Bool buffer for reading the column in stop_word_filter(). */
  GRN_BOOL_INIT(&(token_filter->value), 0);
  grn_tokenizer_token_init(ctx, &(token_filter->token));

  return token_filter;
}

/*
 * Filter callback: flags the token as skippable when it is a stop word.
 *
 * The data of current_token is looked up in the filter's table. When a
 * record is found and its COLUMN_NAME column reads as true, the status of
 * current_token with GRN_TOKEN_SKIP added is stored into next_token.
 * In every other case next_token is left untouched.
 *
 * user_data is the value returned by stop_word_init(); NULL means the
 * filter is inactive and nothing is done.
 */
static void
stop_word_filter(grn_ctx *ctx,
                 grn_token *current_token,
                 grn_token *next_token,
                 void *user_data)
{
  grn_stop_word_token_filter *filter = user_data;
  grn_obj *token_data;
  grn_obj *is_stop_word;
  grn_id record_id;
  grn_tokenizer_status status;

  if (!filter) {
    return;
  }

  token_data = grn_token_get_data(ctx, current_token);
  record_id = grn_table_get(ctx,
                            filter->table,
                            GRN_TEXT_VALUE(token_data),
                            GRN_TEXT_LEN(token_data));
  if (record_id == GRN_ID_NIL) {
    /* The token is not registered in the table. */
    return;
  }

  /* Reuse the Bool buffer: drop the previous value before reading. */
  is_stop_word = &(filter->value);
  GRN_BULK_REWIND(is_stop_word);
  grn_obj_get_value(ctx, filter->column, record_id, is_stop_word);
  if (!GRN_BOOL_VALUE(is_stop_word)) {
    return;
  }

  status = grn_token_get_status(ctx, current_token);
  status |= GRN_TOKEN_SKIP;
  grn_token_set_status(ctx, next_token, status);
}

/*
 * Releases the state created by stop_word_init().
 *
 * user_data may be NULL (stop_word_init() returned NULL); in that case
 * there is nothing to release.
 */
static void
stop_word_fin(grn_ctx *ctx, void *user_data)
{
  grn_stop_word_token_filter *filter = user_data;

  if (filter) {
    grn_tokenizer_token_fin(ctx, &(filter->token));
    grn_obj_unlink(ctx, filter->column);
    grn_obj_unlink(ctx, &(filter->value));
    GRN_PLUGIN_FREE(ctx, filter);
  }
}

/* Plugin initialization hook: does no setup of its own and reports the
   return code currently stored in the context. */
grn_rc
GRN_PLUGIN_INIT(grn_ctx *ctx)
{
  const grn_rc current_rc = ctx->rc;

  return current_rc;
}

/* Plugin registration hook: registers the "TokenFilterStopWord" token
   filter with its init/filter/fin callbacks and returns the result of the
   registration. */
grn_rc
GRN_PLUGIN_REGISTER(grn_ctx *ctx)
{
  return grn_token_filter_register(ctx,
                                   "TokenFilterStopWord", -1,
                                   stop_word_init,
                                   stop_word_filter,
                                   stop_word_fin);
}

/* Plugin finalization hook: nothing to tear down, always succeeds. */
grn_rc
GRN_PLUGIN_FIN(grn_ctx *ctx)
{
  (void)ctx; /* intentionally unused */

  return GRN_SUCCESS;
}