summaryrefslogtreecommitdiff
path: root/src/searchutils.c
blob: c1fb656d41ef703ef971bf208461db3e8630b13e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
/* searchutils.c - helper subroutines for grep's matchers.
   Copyright 1992, 1998, 2000, 2007, 2009-2012 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
   02110-1301, USA.  */

#include <config.h>
#include <assert.h>
#include "search.h"

#define NCHAR (UCHAR_MAX + 1)

void
kwsinit (kwset_t *kwset)
{
  static char trans[NCHAR];
  int i;

  if (match_icase && MB_CUR_MAX == 1)
    {
      for (i = 0; i < NCHAR; ++i)
        trans[i] = tolower (i);

      *kwset = kwsalloc (trans);
    }
  else
    *kwset = kwsalloc (NULL);

  if (!*kwset)
    xalloc_die ();
}

#if MBS_SUPPORT
/* Convert the *N-byte string, BEG, to lowercase, and write the
   NUL-terminated result into malloc'd storage.  Upon success, set *N
   to the length (in bytes) of the resulting string (not including the
   trailing NUL byte), and return a pointer to the lowercase string.
   Upon memory allocation failure, this function exits.
   Note that on input, *N must be larger than zero.

   Note that while this function returns a pointer to malloc'd storage,
   the caller must not free it, since this function retains a pointer
   to the buffer and reuses it on any subsequent call.  As a consequence,
   this function is not thread-safe.

   When all the characters in the lowercase result string have the
   same length as corresponding characters in the input string,
   set *LEN_MAP_P to NULL.  Otherwise, set it to a malloc'd buffer (like the
   returned buffer, this must not be freed by caller) of the same length as
   the result string.  (*LEN_MAP_P)[J] is one less than the length-in-bytes
   of the character in BEG that formed byte J of the result.  This map is
   used by the caller to convert offset,length pairs that reference the
   lowercase result to numbers that refer to the corresponding parts of
   the original buffer.  */
char *
mbtolower (const char *beg, size_t *n, unsigned char **len_map_p)
{
  static char *out;
  static unsigned char *len_map;
  static size_t outalloc;
  size_t outlen, mb_cur_max;
  mbstate_t is, os;
  const char *end;
  char *p;
  unsigned char *m;
  bool lengths_differ = false;

  if (*n > outalloc || outalloc == 0)
    {
      outalloc = MAX(1, *n);
      out = xrealloc (out, outalloc);
      len_map = xrealloc (len_map, outalloc);
    }

  /* appease clang-2.6 */
  assert (out);
  assert (len_map);
  if (*n == 0)
    return out;

  memset (&is, 0, sizeof (is));
  memset (&os, 0, sizeof (os));
  end = beg + *n;

  mb_cur_max = MB_CUR_MAX;
  p = out;
  m = len_map;
  outlen = 0;
  while (beg < end)
    {
      wchar_t wc;
      size_t mbclen = mbrtowc (&wc, beg, end - beg, &is);
      if (outlen + mb_cur_max >= outalloc)
        {
          size_t dm = m - len_map;
          out = x2nrealloc (out, &outalloc, 1);
          len_map = xrealloc (len_map, outalloc);
          p = out + outlen;
          m = len_map + dm;
        }

      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
        {
          /* An invalid sequence, or a truncated multi-octet character.
             We treat it as a single-octet character.  */
          *m++ = 0;
          *p++ = *beg++;
          outlen++;
          memset (&is, 0, sizeof (is));
          memset (&os, 0, sizeof (os));
        }
      else
        {
          *m++ = mbclen - 1;
          beg += mbclen;
          size_t ombclen = wcrtomb (p, towlower ((wint_t) wc), &os);
          p += ombclen;
          outlen += ombclen;
          lengths_differ |= (mbclen != ombclen);
        }
    }

  *len_map_p = lengths_differ ? len_map : NULL;
  *n = p - out;
  *p = 0;
  return out;
}


bool
is_mb_middle (const char **good, const char *buf, const char *end,
              size_t match_len)
{
  const char *p = *good;
  const char *prev = p;
  mbstate_t cur_state;

  /* TODO: can be optimized for UTF-8.  */
  memset(&cur_state, 0, sizeof(mbstate_t));
  while (p < buf)
    {
      size_t mbclen = mbrlen(p, end - p, &cur_state);

      /* Store the beginning of the previous complete multibyte character.  */
      if (mbclen != (size_t) -2)
        prev = p;

      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
        {
          /* An invalid sequence, or a truncated multibyte character.
             We treat it as a single byte character.  */
          mbclen = 1;
          memset(&cur_state, 0, sizeof cur_state);
        }
      p += mbclen;
    }

  *good = prev;

  if (p > buf)
    return true;

  /* P == BUF here.  */
  return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state);
}
#endif /* MBS_SUPPORT */