summaryrefslogtreecommitdiff
path: root/src/static_libs/libunibreak/graphemebreak.c
blob: 77c3d5f55ca380e11b94e0610c5aa3bd67c76a2c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/*
 * Grapheme breaking in a Unicode sequence.  Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2016 Andreas Röver <roever at users dot sf dot net>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.  If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 *
 * The main reference is Unicode Standard Annex 29 (UAX #29):
 *      <URL:http://unicode.org/reports/tr29>
 *
 * When this library was designed, this annex was at Revision 29, for
 * Unicode 9.0.0:
 *      <URL:http://www.unicode.org/reports/tr29/tr29-29.html>
 *
 * The Unicode Terms of Use are available at
 *      <URL:http://www.unicode.org/copyright.html>
 */

/**
 * @file    graphemebreak.c
 *
 * Implementation of the grapheme breaking algorithm as described in Unicode
 * Standard Annex 29.
 *
 * @author  Andreas Roever
 */

#if defined(_MSC_VER) && _MSC_VER < 1800
typedef int bool;
#define false 0
#define true 1
#else
#include <stdbool.h>
#endif

#include <string.h>
#include "graphemebreak.h"
#include "graphemebreakdata.c"
#include "unibreakdef.h"

#define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))

/**
 * Initializes the wordbreak internals.  It currently does nothing, but
 * it may in the future.
 */
void init_graphemebreak(void)
{
}

/**
 * Gets the grapheme breaking class of a character.
 *
 * @param ch   character to check
 * @return     the grapheme breaking class if found; \c GBP_Other otherwise
 */
static enum GraphemeBreakClass get_char_gb_class(utf32_t ch)
{
    int min = 0;
    int max = ARRAY_LEN(gb_prop_default) - 1;
    int mid;

    do
    {
        mid = (min + max) / 2;

        if (ch < gb_prop_default[mid].start)
            max = mid - 1;
        else if (ch > gb_prop_default[mid].end)
            min = mid + 1;
        else
            return gb_prop_default[mid].prop;
    } while (min <= max);

    return GBP_Other;
}

/**
 * Sets the grapheme breaking information for a generic input string.
 *
 * @param[in]  s             input string
 * @param[in]  len           length of the input
 * @param[out] brks          pointer to the output breaking data, containing
 *                           #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK
 * @param[in] get_next_char  function to get the next UTF-32 character
 */
static void set_graphemebreaks(const void *s, size_t len, char *brks,
                               get_next_char_t get_next_char)
{
    size_t posNext = 0;
    bool rule10Left = false;  // is the left side of rule 10 fulfilled?
    bool evenRegionalIndicators = true;  // is the number of preceeding
                                         // GBP_RegionalIndicator characters
                                         // even

    utf32_t ch = get_next_char(s, len, &posNext);
    enum GraphemeBreakClass current_class = get_char_gb_class(ch);

    // initialize whole output to inside char
    memset(brks, GRAPHEMEBREAK_INSIDEACHAR, len);

    while (true)
    {
        enum GraphemeBreakClass prev_class = current_class;

        // safe position if current character so that we can store the
        // result there later on
        size_t brksPos = posNext - 1;

        // get nect character
        ch = get_next_char(s, len, &posNext);

        if (ch == EOS)
        {
            // done, place one final break after the last character as per
            // algorithm rule GB1
            brks[brksPos] = GRAPHEMEBREAK_BREAK;
            break;
        }

        // get class of current character
        current_class = get_char_gb_class(ch);

        // update some helper variables
        if ((prev_class == GBP_E_Base) || (prev_class == GBP_E_Base_GAZ))
        {
            rule10Left = true;
        }
        else if (prev_class != GBP_Extend)
        {
            rule10Left = false;
        }

        if (prev_class == GBP_Regional_Indicator)
        {
            evenRegionalIndicators = !evenRegionalIndicators;
        }
        else
        {
            evenRegionalIndicators = true;
        }

        // check all rules
        if (prev_class == GBP_CR && current_class == GBP_LF)
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB3
        }
        else if ((prev_class == GBP_CR) || (prev_class == GBP_LF) ||
                 (prev_class == GBP_Control) || (current_class == GBP_CR) ||
                 (current_class == GBP_LF) ||
                 (current_class == GBP_Control))
        {
            brks[brksPos] = GRAPHEMEBREAK_BREAK;  // Rule: GB4 + GB5
        }
        else if ((prev_class == GBP_L) &&
                 ((current_class == GBP_L) || (current_class == GBP_V) ||
                  (current_class == GBP_LV) || (current_class == GBP_LVT)))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB6
        }
        else if (((prev_class == GBP_LV) || (prev_class == GBP_V)) &&
                 ((current_class == GBP_V) || (current_class == GBP_T)))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB7
        }
        else if (((prev_class == GBP_LVT) || (prev_class == GBP_T)) &&
                 (current_class == GBP_T))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB8
        }
        else if ((current_class == GBP_Extend) ||
                 (current_class == GBP_ZWJ))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9
        }
        else if (current_class == GBP_SpacingMark)
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9a
        }
        else if (prev_class == GBP_Prepend)
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB9b
        }
        else if (rule10Left && (current_class == GBP_E_Modifier))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB10
        }
        else if ((prev_class == GBP_ZWJ) &&
                 ((current_class == GBP_Glue_After_Zwj) ||
                  (current_class == GBP_E_Base_GAZ)))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB11
        }
        else if (!evenRegionalIndicators &&
                 (current_class == GBP_Regional_Indicator))
        {
            brks[brksPos] = GRAPHEMEBREAK_NOBREAK;  // Rule: GB12 + GB13
        }
        else
        {
            brks[brksPos] = GRAPHEMEBREAK_BREAK;  // Rule: GB999
        }
    }
}

/**
 * Sets the grapheme breaking information for a UTF-8 input string.
 *
 * @param[in]  s     input UTF-8 string
 * @param[in]  len   length of the input
 * @param[in]  lang  language of the input (reserved for future use)
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
 *                   First element in output array is for the break behind
 *                   the first character the pointer must point to an
 *                   array with at least as many elements as there
 *                   are characters in the string
 */
void set_graphemebreaks_utf8(const utf8_t *s, size_t len, const char *lang,
                             char *brks)
{
    (void)lang;
    set_graphemebreaks(s, len, brks,
                       (get_next_char_t)ub_get_next_char_utf8);
}

/**
 * Sets the grapheme breaking information for a UTF-16 input string.
 *
 * @param[in]  s     input UTF-16 string
 * @param[in]  len   length of the input
 * @param[in]  lang  language of the input (reserved for future use)
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
 *                   First element in output array is for the break behind
 *                   the first character the pointer must point to an
 *                   array with at least as many elements as there
 *                   are characters in the string
 */
void set_graphemebreaks_utf16(const utf16_t *s, size_t len,
                              const char *lang, char *brks)
{
    (void)lang;
    set_graphemebreaks(s, len, brks,
                       (get_next_char_t)ub_get_next_char_utf16);
}

/**
 * Sets the grapheme breaking information for a UTF-32 input string.
 *
 * @param[in]  s     input UTF-32 string
 * @param[in]  len   length of the input
 * @param[in]  lang  language of the input (reserved for future use)
 * @param[out] brks  pointer to the output breaking data, containing
 *                   #GRAPHEMEBREAK_BREAK or #GRAPHEMEBREAK_NOBREAK.
 *                   First element in output array is for the break behind
 *                   the first character the pointer must point to an
 *                   array with at least as many elements as there
 *                   are characters in the string
 */
void set_graphemebreaks_utf32(const utf32_t *s, size_t len,
                              const char *lang, char *brks)
{
    (void)lang;
    set_graphemebreaks(s, len, brks,
                       (get_next_char_t)ub_get_next_char_utf32);
}