/* strings/ctype-uca-scanner_next.inl */
/* Copyright (c) 2004, 2013, Oracle and/or its affiliates.
   Copyright (c) 2009, 2021, MariaDB

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; version 2
   of the License.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
   MA 02110-1335  USA */


/*
  Return helpers for the scanner function defined in this file.

  With SCANNER_NEXT_NCHARS defined, the function returns a
  weight_and_nchars_t, so both macro arguments are used.
  Without it, the function returns only the weight: the second
  argument is discarded by the preprocessor and is never evaluated
  (the caller may therefore pass an expression that only exists
  in the SCANNER_NEXT_NCHARS variant).

  Every macro argument is parenthesized at its point of use, so any
  expression (e.g. "&cnt" or "a ? b : c") can be passed safely.
*/
#ifdef SCANNER_NEXT_NCHARS

/* Return the pair {weight _w, character count _n} */
#define SCANNER_NEXT_RETURN(_w,_n) \
  do { weight_and_nchars_t rc= {(_w), (_n)}; return rc; } while(0)

/*
  Return the first weight of the contraction _cnt, with the character count
  set to _ignorable_nchars plus my_contraction_char_length(_cnt).
*/
#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
  do { \
    weight_and_nchars_t rc= { (_cnt)->weight[0], \
                              (_ignorable_nchars) + \
                              my_contraction_char_length(_cnt) }; \
    return rc; \
  } while(0)

#else

/* Return the weight _w only; _n is dropped */
#define SCANNER_NEXT_RETURN(_w,_n) do { return (_w); } while (0)

/* Return the first weight of the contraction _cnt; the count is dropped */
#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars) \
  do { return (_cnt)->weight[0]; } while(0)

#endif

/*
  Scan the next collation weight from the input string.

  This single body is compiled in two flavours, selected by
  SCANNER_NEXT_NCHARS:
  - scanner_next(): returns the weight as an int.
  - scanner_next_with_nchars(): returns the weight together with a
    character count, and takes "nchars", which is passed to
    my_uca_context_weight_find() as the maximum contraction length
    (the other flavour passes MY_UCA_MAX_CONTRACTION).

  @param scanner  Scanner state. sbeg is advanced past the consumed bytes,
                  page and code are set for the character just read.
  @param param    Scanner parameters (collation level and character set).
  @param nchars   (SCANNER_NEXT_NCHARS only) contraction length limit.

  @return
    - the next weight of the pending expansion, if there is one
      (with 0 as the character count);
    - -1 when no more bytes are left;
    - 0xFFFF for an incomplete or bad byte sequence;
    - otherwise the first non-zero weight found. Characters and
      contractions with a zero weight (ignorable) are skipped; in the
      SCANNER_NEXT_NCHARS flavour each skipped loop step adds one to the
      returned character count.

  NOTE(review): an ignorable contraction advances ignorable_nchars by one
  loop step regardless of how many characters the contraction covers -
  confirm this is intended.
*/
static inline
#ifdef SCANNER_NEXT_NCHARS
weight_and_nchars_t
MY_FUNCTION_NAME(scanner_next_with_nchars)(my_uca_scanner *scanner,
                                           const my_uca_scanner_param *param,
                                           size_t nchars)
#else
int
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner,
                               const my_uca_scanner_param *param)
#endif
{
#ifdef SCANNER_NEXT_NCHARS
  uint ignorable_nchars;
#define LOCAL_MAX_CONTRACTION_LENGTH nchars
#else
#define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION
#endif
  uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
  if (weight)
  {
    /*
      More weights left from the previous step.
      Return the next weight from the current expansion.
      Return "0" as "nchars". The real nchars was set on a previous
      iteration.
    */
    SCANNER_NEXT_RETURN(weight, 0);
  }

#ifdef SCANNER_NEXT_NCHARS
  for (ignorable_nchars= 0 ; ; ignorable_nchars++)
#else
  for ( ; ; )
#endif
  {
    const uint16 *wpage;
    int mblen;
    my_wc_t currwc= 0;
    const uint16 *cweight;

#if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS)
    if (scanner->sbeg + 1 < scanner->send)
    {
      const MY_UCA_2BYTES_ITEM *ww;
      ww= my_uca_level_booster_2bytes_item_addr_const(param->level->booster,
                                                      scanner->sbeg[0],
                                                      scanner->sbeg[1]);
      if (my_uca_2bytes_item_is_applicable(ww))
      {
        /*
          Byte pairs that make 2-byte head characters in previous
          context pairs are marked as not applicable for optimization
          during the collation initialization. So when we come here
          sbeg[0] and sbeg[1] are:
          - either two ASCII characters
          - or one 2-byte character which IS NOT a previous context head
          Just remember sbeg[1] as the previous character for simplicity.
          This may erroneously interpret bytes 0x80..0x9F as previous context
          head characters U+0080..U+009F. However, CLDR does not have any real
          collations that use these characters as previous context heads.
        */
        scanner->page= 0;
        scanner->code= (int) scanner->sbeg[1];
        scanner->sbeg+= 2;
        if ((weight= my_uca_scanner_set_weight(scanner, ww->weight)))
        {
          /*
            TODO: add support for scanner_next_with_nchars and do this:
            SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
          */
          return weight;
        }
        continue; /* Ignorable character */
      }
      /* 2 byte optimization is not applicable, go the slow path */
    }
#endif


    /* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
    /* Get next ASCII character */
    if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
    {
      currwc= scanner->sbeg[0];
      scanner->sbeg+= 1;

#if MY_UCA_COMPILE_CONTRACTIONS
      if (my_uca_needs_context_handling(param->level, currwc))
      {
        const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param,
                                                              currwc,
                                                  LOCAL_MAX_CONTRACTION_LENGTH);
        if (cnt)
        {
          if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
            SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
          continue;  /* Ignorable contraction */
        }
      }
#endif

      /* ASCII characters live on weight page 0 */
      scanner->page= 0;
      scanner->code= (int) currwc;
      cweight= param->level->weights[0] + scanner->code * param->level->lengths[0];
      if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
        SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
      continue; /* Ignorable character */
    }
    else
#endif
    /* Get next MB character */
    if (((mblen= MY_MB_WC(scanner, param, &currwc, scanner->sbeg,
                                            scanner->send)) <= 0))
    {
      if (scanner->sbeg >= scanner->send)
      {
        /* No more bytes, end of line reached */
        SCANNER_NEXT_RETURN(-1, ignorable_nchars);
      }
      /*
        There are some more bytes left. Non-positive mb_len means that
        we got an incomplete or a bad byte sequence. Consume mbminlen bytes.

        sbeg < send is known here, so the remaining length is positive.
        Compare lengths instead of advancing sbeg first: forming a pointer
        beyond one-past-the-end of the buffer is undefined behaviour.
      */
      if ((size_t) (scanner->send - scanner->sbeg) < param->cs->mbminlen)
      {
        /* Fewer than mbminlen bytes left: don't go beyond the string range. */
        scanner->sbeg= scanner->send;
      }
      else
      {
        scanner->sbeg+= param->cs->mbminlen;
      }
      /*
        Treat every complete or incomplete mbminlen unit as a weight which is
        greater than weight for any possible normal character.
        0xFFFF is greater than any possible weight in the UCA weight table.
      */
      SCANNER_NEXT_RETURN(0xFFFF, ignorable_nchars + 1);
    }

    scanner->sbeg+= mblen;
    if (currwc > param->level->maxchar)
    {
      SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner),
                          ignorable_nchars + 1);
    }

#if MY_UCA_COMPILE_CONTRACTIONS
    if (my_uca_needs_context_handling(param->level, currwc))
    {
      const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param, currwc,
                                                LOCAL_MAX_CONTRACTION_LENGTH);
      if (cnt)
      {
        if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
          SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
        continue;  /* Ignorable contraction */
      }
    }
#endif

    /* Process single character */
    scanner->page= currwc >> 8;
    scanner->code= currwc & 0xFF;

    /* If weight page for w[0] does not exist, then calculate algorithmically */
    if (!(wpage= param->level->weights[scanner->page]))
      SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner, param),
                          ignorable_nchars + 1);

    /* Calculate pointer to w[0]'s weight, using page and offset */
    cweight= wpage + scanner->code * param->level->lengths[scanner->page];
    if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
      SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
    continue; /* Ignorable character */
  }

  SCANNER_NEXT_RETURN(0, 0); /* Not reachable */
}

/*
  Drop every helper macro defined by this file, plus the
  SCANNER_NEXT_NCHARS switch tested by it, so that none of
  them stays defined past this point.
*/
#undef LOCAL_MAX_CONTRACTION_LENGTH
#undef SCANNER_NEXT_RETURN_CONTRACTION
#undef SCANNER_NEXT_RETURN
#undef SCANNER_NEXT_NCHARS