summaryrefslogtreecommitdiff
path: root/src/shared/quickjs/libunicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/shared/quickjs/libunicode.c')
-rw-r--r--src/shared/quickjs/libunicode.c1558
1 files changed, 1558 insertions, 0 deletions
diff --git a/src/shared/quickjs/libunicode.c b/src/shared/quickjs/libunicode.c
new file mode 100644
index 000000000..112da72da
--- /dev/null
+++ b/src/shared/quickjs/libunicode.c
@@ -0,0 +1,1558 @@
+/*
+ * Unicode utilities
+ *
+ * Copyright (c) 2017-2018 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <assert.h>
+
+#include "cutils.h"
+#include "libunicode.h"
+#include "libunicode-table.h"
+
+enum {
+ RUN_TYPE_U,
+ RUN_TYPE_L,
+ RUN_TYPE_UF,
+ RUN_TYPE_LF,
+ RUN_TYPE_UL,
+ RUN_TYPE_LSU,
+ RUN_TYPE_U2L_399_EXT2,
+ RUN_TYPE_UF_D20,
+ RUN_TYPE_UF_D1_EXT,
+ RUN_TYPE_U_EXT,
+ RUN_TYPE_LF_EXT,
+ RUN_TYPE_U_EXT2,
+ RUN_TYPE_L_EXT2,
+ RUN_TYPE_U_EXT3,
+};
+
+/* conv_type:
+ 0 = to upper
+ 1 = to lower
+ 2 = case folding (= to lower with modifications)
+*/
+int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
+{
+ if (c < 128) {
+ if (conv_type) {
+ if (c >= 'A' && c <= 'Z') {
+ c = c - 'A' + 'a';
+ }
+ } else {
+ if (c >= 'a' && c <= 'z') {
+ c = c - 'a' + 'A';
+ }
+ }
+ } else {
+ uint32_t v, code, data, type, len, a, is_lower;
+ int idx, idx_min, idx_max;
+
+ is_lower = (conv_type != 0);
+ idx_min = 0;
+ idx_max = countof(case_conv_table1) - 1;
+ while (idx_min <= idx_max) {
+ idx = (unsigned)(idx_max + idx_min) / 2;
+ v = case_conv_table1[idx];
+ code = v >> (32 - 17);
+ len = (v >> (32 - 17 - 7)) & 0x7f;
+ if (c < code) {
+ idx_max = idx - 1;
+ } else if (c >= code + len) {
+ idx_min = idx + 1;
+ } else {
+ type = (v >> (32 - 17 - 7 - 4)) & 0xf;
+ data = ((v & 0xf) << 8) | case_conv_table2[idx];
+ switch(type) {
+ case RUN_TYPE_U:
+ case RUN_TYPE_L:
+ case RUN_TYPE_UF:
+ case RUN_TYPE_LF:
+ if (conv_type == (type & 1) ||
+ (type >= RUN_TYPE_UF && conv_type == 2)) {
+ c = c - code + (case_conv_table1[data] >> (32 - 17));
+ }
+ break;
+ case RUN_TYPE_UL:
+ a = c - code;
+ if ((a & 1) != (1 - is_lower))
+ break;
+ c = (a ^ 1) + code;
+ break;
+ case RUN_TYPE_LSU:
+ a = c - code;
+ if (a == 1) {
+ c += 2 * is_lower - 1;
+ } else if (a == (1 - is_lower) * 2) {
+ c += (2 * is_lower - 1) * 2;
+ }
+ break;
+ case RUN_TYPE_U2L_399_EXT2:
+ if (!is_lower) {
+ res[0] = c - code + case_conv_ext[data >> 6];
+ res[1] = 0x399;
+ return 2;
+ } else {
+ c = c - code + case_conv_ext[data & 0x3f];
+ }
+ break;
+ case RUN_TYPE_UF_D20:
+ if (conv_type == 1)
+ break;
+ c = data + (conv_type == 2) * 0x20;
+ break;
+ case RUN_TYPE_UF_D1_EXT:
+ if (conv_type == 1)
+ break;
+ c = case_conv_ext[data] + (conv_type == 2);
+ break;
+ case RUN_TYPE_U_EXT:
+ case RUN_TYPE_LF_EXT:
+ if (is_lower != (type - RUN_TYPE_U_EXT))
+ break;
+ c = case_conv_ext[data];
+ break;
+ case RUN_TYPE_U_EXT2:
+ case RUN_TYPE_L_EXT2:
+ if (conv_type != (type - RUN_TYPE_U_EXT2))
+ break;
+ res[0] = c - code + case_conv_ext[data >> 6];
+ res[1] = case_conv_ext[data & 0x3f];
+ return 2;
+ default:
+ case RUN_TYPE_U_EXT3:
+ if (conv_type != 0)
+ break;
+ res[0] = case_conv_ext[data >> 8];
+ res[1] = case_conv_ext[(data >> 4) & 0xf];
+ res[2] = case_conv_ext[data & 0xf];
+ return 3;
+ }
+ break;
+ }
+ }
+ }
+ res[0] = c;
+ return 1;
+}
+
+static uint32_t get_le24(const uint8_t *ptr)
+{
+#if defined(__x86__) || defined(__x86_64__)
+ return *(uint16_t *)ptr | (ptr[2] << 16);
+#else
+ return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
+#endif
+}
+
+#define UNICODE_INDEX_BLOCK_LEN 32
+
+/* return -1 if not in table, otherwise the offset in the block */
+static int get_index_pos(uint32_t *pcode, uint32_t c,
+ const uint8_t *index_table, int index_table_len)
+{
+ uint32_t code, v;
+ int idx_min, idx_max, idx;
+
+ idx_min = 0;
+ v = get_le24(index_table);
+ code = v & ((1 << 21) - 1);
+ if (c < code) {
+ *pcode = 0;
+ return 0;
+ }
+ idx_max = index_table_len - 1;
+ code = get_le24(index_table + idx_max * 3);
+ if (c >= code)
+ return -1;
+ /* invariant: tab[idx_min] <= c < tab2[idx_max] */
+ while ((idx_max - idx_min) > 1) {
+ idx = (idx_max + idx_min) / 2;
+ v = get_le24(index_table + idx * 3);
+ code = v & ((1 << 21) - 1);
+ if (c < code) {
+ idx_max = idx;
+ } else {
+ idx_min = idx;
+ }
+ }
+ v = get_le24(index_table + idx_min * 3);
+ *pcode = v & ((1 << 21) - 1);
+ return (idx_min + 1) * UNICODE_INDEX_BLOCK_LEN + (v >> 21);
+}
+
+static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
+ const uint8_t *index_table, int index_table_len)
+{
+ uint32_t code, b, bit;
+ int pos;
+ const uint8_t *p;
+
+ pos = get_index_pos(&code, c, index_table, index_table_len);
+ if (pos < 0)
+ return FALSE; /* outside the table */
+ p = table + pos;
+ bit = 0;
+ for(;;) {
+ b = *p++;
+ if (b < 64) {
+ code += (b >> 3) + 1;
+ if (c < code)
+ return bit;
+ bit ^= 1;
+ code += (b & 7) + 1;
+ } else if (b >= 0x80) {
+ code += b - 0x80 + 1;
+ } else if (b < 0x60) {
+ code += (((b - 0x40) << 8) | p[0]) + 1;
+ p++;
+ } else {
+ code += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
+ p += 2;
+ }
+ if (c < code)
+ return bit;
+ bit ^= 1;
+ }
+}
+
+BOOL lre_is_cased(uint32_t c)
+{
+ uint32_t v, code, len;
+ int idx, idx_min, idx_max;
+
+ idx_min = 0;
+ idx_max = countof(case_conv_table1) - 1;
+ while (idx_min <= idx_max) {
+ idx = (unsigned)(idx_max + idx_min) / 2;
+ v = case_conv_table1[idx];
+ code = v >> (32 - 17);
+ len = (v >> (32 - 17 - 7)) & 0x7f;
+ if (c < code) {
+ idx_max = idx - 1;
+ } else if (c >= code + len) {
+ idx_min = idx + 1;
+ } else {
+ return TRUE;
+ }
+ }
+ return lre_is_in_table(c, unicode_prop_Cased1_table,
+ unicode_prop_Cased1_index,
+ sizeof(unicode_prop_Cased1_index) / 3);
+}
+
+BOOL lre_is_case_ignorable(uint32_t c)
+{
+ return lre_is_in_table(c, unicode_prop_Case_Ignorable_table,
+ unicode_prop_Case_Ignorable_index,
+ sizeof(unicode_prop_Case_Ignorable_index) / 3);
+}
+
+/* character range */
+
+static maybe_unused void cr_dump(CharRange *cr)
+{
+ int i;
+ for(i = 0; i < cr->len; i++)
+ printf("%d: 0x%04x\n", i, cr->points[i]);
+}
+
+static void *cr_default_realloc(void *opaque, void *ptr, size_t size)
+{
+ return realloc(ptr, size);
+}
+
+void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func)
+{
+ cr->len = cr->size = 0;
+ cr->points = NULL;
+ cr->mem_opaque = mem_opaque;
+ cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc;
+}
+
+void cr_free(CharRange *cr)
+{
+ cr->realloc_func(cr->mem_opaque, cr->points, 0);
+}
+
+int cr_realloc(CharRange *cr, int size)
+{
+ int new_size;
+ uint32_t *new_buf;
+
+ if (size > cr->size) {
+ new_size = max_int(size, cr->size * 3 / 2);
+ new_buf = cr->realloc_func(cr->mem_opaque, cr->points,
+ new_size * sizeof(cr->points[0]));
+ if (!new_buf)
+ return -1;
+ cr->points = new_buf;
+ cr->size = new_size;
+ }
+ return 0;
+}
+
+int cr_copy(CharRange *cr, const CharRange *cr1)
+{
+ if (cr_realloc(cr, cr1->len))
+ return -1;
+ memcpy(cr->points, cr1->points, sizeof(cr->points[0]) * cr1->len);
+ cr->len = cr1->len;
+ return 0;
+}
+
+/* merge consecutive intervals and remove empty intervals */
+static void cr_compress(CharRange *cr)
+{
+ int i, j, k, len;
+ uint32_t *pt;
+
+ pt = cr->points;
+ len = cr->len;
+ i = 0;
+ j = 0;
+ k = 0;
+ while ((i + 1) < len) {
+ if (pt[i] == pt[i + 1]) {
+ /* empty interval */
+ i += 2;
+ } else {
+ j = i;
+ while ((j + 3) < len && pt[j + 1] == pt[j + 2])
+ j += 2;
+ /* just copy */
+ pt[k] = pt[i];
+ pt[k + 1] = pt[j + 1];
+ k += 2;
+ i = j + 2;
+ }
+ }
+ cr->len = k;
+}
+
+/* union or intersection */
+int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
+ const uint32_t *b_pt, int b_len, int op)
+{
+ int a_idx, b_idx, is_in;
+ uint32_t v;
+
+ a_idx = 0;
+ b_idx = 0;
+ for(;;) {
+ /* get one more point from a or b in increasing order */
+ if (a_idx < a_len && b_idx < b_len) {
+ if (a_pt[a_idx] < b_pt[b_idx]) {
+ goto a_add;
+ } else if (a_pt[a_idx] == b_pt[b_idx]) {
+ v = a_pt[a_idx];
+ a_idx++;
+ b_idx++;
+ } else {
+ goto b_add;
+ }
+ } else if (a_idx < a_len) {
+ a_add:
+ v = a_pt[a_idx++];
+ } else if (b_idx < b_len) {
+ b_add:
+ v = b_pt[b_idx++];
+ } else {
+ break;
+ }
+ /* add the point if the in/out status changes */
+ switch(op) {
+ case CR_OP_UNION:
+ is_in = (a_idx & 1) | (b_idx & 1);
+ break;
+ case CR_OP_INTER:
+ is_in = (a_idx & 1) & (b_idx & 1);
+ break;
+ case CR_OP_XOR:
+ is_in = (a_idx & 1) ^ (b_idx & 1);
+ break;
+ default:
+ abort();
+ }
+ if (is_in != (cr->len & 1)) {
+ if (cr_add_point(cr, v))
+ return -1;
+ }
+ }
+ cr_compress(cr);
+ return 0;
+}
+
+int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len)
+{
+ CharRange a = *cr;
+ int ret;
+ cr->len = 0;
+ cr->size = 0;
+ cr->points = NULL;
+ ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION);
+ cr_free(&a);
+ return ret;
+}
+
+int cr_invert(CharRange *cr)
+{
+ int len;
+ len = cr->len;
+ if (cr_realloc(cr, len + 2))
+ return -1;
+ memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0]));
+ cr->points[0] = 0;
+ cr->points[len + 1] = UINT32_MAX;
+ cr->len = len + 2;
+ cr_compress(cr);
+ return 0;
+}
+
+#ifdef CONFIG_ALL_UNICODE
+
+BOOL lre_is_id_start(uint32_t c)
+{
+ return lre_is_in_table(c, unicode_prop_ID_Start_table,
+ unicode_prop_ID_Start_index,
+ sizeof(unicode_prop_ID_Start_index) / 3);
+}
+
+BOOL lre_is_id_continue(uint32_t c)
+{
+ return lre_is_id_start(c) ||
+ lre_is_in_table(c, unicode_prop_ID_Continue1_table,
+ unicode_prop_ID_Continue1_index,
+ sizeof(unicode_prop_ID_Continue1_index) / 3);
+}
+
+#define UNICODE_DECOMP_LEN_MAX 18
+
+typedef enum {
+ DECOMP_TYPE_C1, /* 16 bit char */
+ DECOMP_TYPE_L1, /* 16 bit char table */
+ DECOMP_TYPE_L2,
+ DECOMP_TYPE_L3,
+ DECOMP_TYPE_L4,
+ DECOMP_TYPE_L5, /* XXX: not used */
+ DECOMP_TYPE_L6, /* XXX: could remove */
+ DECOMP_TYPE_L7, /* XXX: could remove */
+ DECOMP_TYPE_LL1, /* 18 bit char table */
+ DECOMP_TYPE_LL2,
+ DECOMP_TYPE_S1, /* 8 bit char table */
+ DECOMP_TYPE_S2,
+ DECOMP_TYPE_S3,
+ DECOMP_TYPE_S4,
+ DECOMP_TYPE_S5,
+ DECOMP_TYPE_I1, /* increment 16 bit char value */
+ DECOMP_TYPE_I2_0,
+ DECOMP_TYPE_I2_1,
+ DECOMP_TYPE_I3_1,
+ DECOMP_TYPE_I3_2,
+ DECOMP_TYPE_I4_1,
+ DECOMP_TYPE_I4_2,
+ DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */
+ DECOMP_TYPE_B2,
+ DECOMP_TYPE_B3,
+ DECOMP_TYPE_B4,
+ DECOMP_TYPE_B5,
+ DECOMP_TYPE_B6,
+ DECOMP_TYPE_B7,
+ DECOMP_TYPE_B8,
+ DECOMP_TYPE_B18,
+ DECOMP_TYPE_LS2,
+ DECOMP_TYPE_PAT3,
+ DECOMP_TYPE_S2_UL,
+ DECOMP_TYPE_LS2_UL,
+} DecompTypeEnum;
+
+static uint32_t unicode_get_short_code(uint32_t c)
+{
+ static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };
+
+ if (c < 0x80)
+ return c;
+ else if (c < 0x80 + 0x50)
+ return c - 0x80 + 0x300;
+ else
+ return unicode_short_table[c - 0x80 - 0x50];
+}
+
+static uint32_t unicode_get_lower_simple(uint32_t c)
+{
+ if (c < 0x100 || (c >= 0x410 && c <= 0x42f))
+ c += 0x20;
+ else
+ c++;
+ return c;
+}
+
+static uint16_t unicode_get16(const uint8_t *p)
+{
+ return p[0] | (p[1] << 8);
+}
+
+static int unicode_decomp_entry(uint32_t *res, uint32_t c,
+ int idx, uint32_t code, uint32_t len,
+ uint32_t type)
+{
+ uint32_t c1;
+ int l, i, p;
+ const uint8_t *d;
+
+ if (type == DECOMP_TYPE_C1) {
+ res[0] = unicode_decomp_table2[idx];
+ return 1;
+ } else {
+ d = unicode_decomp_data + unicode_decomp_table2[idx];
+ switch(type) {
+ case DECOMP_TYPE_L1:
+ case DECOMP_TYPE_L2:
+ case DECOMP_TYPE_L3:
+ case DECOMP_TYPE_L4:
+ case DECOMP_TYPE_L5:
+ case DECOMP_TYPE_L6:
+ case DECOMP_TYPE_L7:
+ l = type - DECOMP_TYPE_L1 + 1;
+ d += (c - code) * l * 2;
+ for(i = 0; i < l; i++) {
+ if ((res[i] = unicode_get16(d + 2 * i)) == 0)
+ return 0;
+ }
+ return l;
+ case DECOMP_TYPE_LL1:
+ case DECOMP_TYPE_LL2:
+ {
+ uint32_t k, p;
+ l = type - DECOMP_TYPE_LL1 + 1;
+ k = (c - code) * l;
+ p = len * l * 2;
+ for(i = 0; i < l; i++) {
+ c1 = unicode_get16(d + 2 * k) |
+ (((d[p + (k / 4)] >> ((k % 4) * 2)) & 3) << 16);
+ if (!c1)
+ return 0;
+ res[i] = c1;
+ k++;
+ }
+ }
+ return l;
+ case DECOMP_TYPE_S1:
+ case DECOMP_TYPE_S2:
+ case DECOMP_TYPE_S3:
+ case DECOMP_TYPE_S4:
+ case DECOMP_TYPE_S5:
+ l = type - DECOMP_TYPE_S1 + 1;
+ d += (c - code) * l;
+ for(i = 0; i < l; i++) {
+ if ((res[i] = unicode_get_short_code(d[i])) == 0)
+ return 0;
+ }
+ return l;
+ case DECOMP_TYPE_I1:
+ l = 1;
+ p = 0;
+ goto decomp_type_i;
+ case DECOMP_TYPE_I2_0:
+ case DECOMP_TYPE_I2_1:
+ case DECOMP_TYPE_I3_1:
+ case DECOMP_TYPE_I3_2:
+ case DECOMP_TYPE_I4_1:
+ case DECOMP_TYPE_I4_2:
+ l = 2 + ((type - DECOMP_TYPE_I2_0) >> 1);
+ p = ((type - DECOMP_TYPE_I2_0) & 1) + (l > 2);
+ decomp_type_i:
+ for(i = 0; i < l; i++) {
+ c1 = unicode_get16(d + 2 * i);
+ if (i == p)
+ c1 += c - code;
+ res[i] = c1;
+ }
+ return l;
+ case DECOMP_TYPE_B18:
+ l = 18;
+ goto decomp_type_b;
+ case DECOMP_TYPE_B1:
+ case DECOMP_TYPE_B2:
+ case DECOMP_TYPE_B3:
+ case DECOMP_TYPE_B4:
+ case DECOMP_TYPE_B5:
+ case DECOMP_TYPE_B6:
+ case DECOMP_TYPE_B7:
+ case DECOMP_TYPE_B8:
+ l = type - DECOMP_TYPE_B1 + 1;
+ decomp_type_b:
+ {
+ uint32_t c_min;
+ c_min = unicode_get16(d);
+ d += 2 + (c - code) * l;
+ for(i = 0; i < l; i++) {
+ c1 = d[i];
+ if (c1 == 0xff)
+ c1 = 0x20;
+ else
+ c1 += c_min;
+ res[i] = c1;
+ }
+ }
+ return l;
+ case DECOMP_TYPE_LS2:
+ d += (c - code) * 3;
+ if (!(res[0] = unicode_get16(d)))
+ return 0;
+ res[1] = unicode_get_short_code(d[2]);
+ return 2;
+ case DECOMP_TYPE_PAT3:
+ res[0] = unicode_get16(d);
+ res[2] = unicode_get16(d + 2);
+ d += 4 + (c - code) * 2;
+ res[1] = unicode_get16(d);
+ return 3;
+ case DECOMP_TYPE_S2_UL:
+ case DECOMP_TYPE_LS2_UL:
+ c1 = c - code;
+ if (type == DECOMP_TYPE_S2_UL) {
+ d += c1 & ~1;
+ c = unicode_get_short_code(*d);
+ d++;
+ } else {
+ d += (c1 >> 1) * 3;
+ c = unicode_get16(d);
+ d += 2;
+ }
+ if (c1 & 1)
+ c = unicode_get_lower_simple(c);
+ res[0] = c;
+ res[1] = unicode_get_short_code(*d);
+ return 2;
+ }
+ }
+ return 0;
+}
+
+
+/* return the length of the decomposition (length <=
+ UNICODE_DECOMP_LEN_MAX) or 0 if no decomposition */
+static int unicode_decomp_char(uint32_t *res, uint32_t c, BOOL is_compat1)
+{
+ uint32_t v, type, is_compat, code, len;
+ int idx_min, idx_max, idx;
+
+ idx_min = 0;
+ idx_max = countof(unicode_decomp_table1) - 1;
+ while (idx_min <= idx_max) {
+ idx = (idx_max + idx_min) / 2;
+ v = unicode_decomp_table1[idx];
+ code = v >> (32 - 18);
+ len = (v >> (32 - 18 - 7)) & 0x7f;
+ // printf("idx=%d code=%05x len=%d\n", idx, code, len);
+ if (c < code) {
+ idx_max = idx - 1;
+ } else if (c >= code + len) {
+ idx_min = idx + 1;
+ } else {
+ is_compat = v & 1;
+ if (is_compat1 < is_compat)
+ break;
+ type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
+ return unicode_decomp_entry(res, c, idx, code, len, type);
+ }
+ }
+ return 0;
+}
+
+/* return 0 if no pair found */
+static int unicode_compose_pair(uint32_t c0, uint32_t c1)
+{
+ uint32_t code, len, type, v, idx1, d_idx, d_offset, ch;
+ int idx_min, idx_max, idx, d;
+ uint32_t pair[2];
+
+ idx_min = 0;
+ idx_max = countof(unicode_comp_table) - 1;
+ while (idx_min <= idx_max) {
+ idx = (idx_max + idx_min) / 2;
+ idx1 = unicode_comp_table[idx];
+
+ /* idx1 represent an entry of the decomposition table */
+ d_idx = idx1 >> 6;
+ d_offset = idx1 & 0x3f;
+ v = unicode_decomp_table1[d_idx];
+ code = v >> (32 - 18);
+ len = (v >> (32 - 18 - 7)) & 0x7f;
+ type = (v >> (32 - 18 - 7 - 6)) & 0x3f;
+ ch = code + d_offset;
+ unicode_decomp_entry(pair, ch, d_idx, code, len, type);
+ d = c0 - pair[0];
+ if (d == 0)
+ d = c1 - pair[1];
+ if (d < 0) {
+ idx_max = idx - 1;
+ } else if (d > 0) {
+ idx_min = idx + 1;
+ } else {
+ return ch;
+ }
+ }
+ return 0;
+}
+
+/* return the combining class of character c (between 0 and 255) */
+static int unicode_get_cc(uint32_t c)
+{
+ uint32_t code, n, type, cc, c1, b;
+ int pos;
+ const uint8_t *p;
+
+ pos = get_index_pos(&code, c,
+ unicode_cc_index, sizeof(unicode_cc_index) / 3);
+ if (pos < 0)
+ return 0;
+ p = unicode_cc_table + pos;
+ for(;;) {
+ b = *p++;
+ type = b >> 6;
+ n = b & 0x3f;
+ if (n < 48) {
+ } else if (n < 56) {
+ n = (n - 48) << 8;
+ n |= *p++;
+ n += 48;
+ } else {
+ n = (n - 56) << 8;
+ n |= *p++ << 8;
+ n |= *p++;
+ n += 48 + (1 << 11);
+ }
+ if (type <= 1)
+ p++;
+ c1 = code + n + 1;
+ if (c < c1) {
+ switch(type) {
+ case 0:
+ cc = p[-1];
+ break;
+ case 1:
+ cc = p[-1] + c - code;
+ break;
+ case 2:
+ cc = 0;
+ break;
+ default:
+ case 3:
+ cc = 230;
+ break;
+ }
+ return cc;
+ }
+ code = c1;
+ }
+}
+
+static void sort_cc(int *buf, int len)
+{
+ int i, j, k, cc, cc1, start, ch1;
+
+ for(i = 0; i < len; i++) {
+ cc = unicode_get_cc(buf[i]);
+ if (cc != 0) {
+ start = i;
+ j = i + 1;
+ while (j < len) {
+ ch1 = buf[j];
+ cc1 = unicode_get_cc(ch1);
+ if (cc1 == 0)
+ break;
+ k = j - 1;
+ while (k >= start) {
+ if (unicode_get_cc(buf[k]) <= cc1)
+ break;
+ buf[k + 1] = buf[k];
+ k--;
+ }
+ buf[k + 1] = ch1;
+ j++;
+ }
+#if 0
+ printf("cc:");
+ for(k = start; k < j; k++) {
+ printf(" %3d", unicode_get_cc(buf[k]));
+ }
+ printf("\n");
+#endif
+ i = j;
+ }
+ }
+}
+
+static void to_nfd_rec(DynBuf *dbuf,
+ const int *src, int src_len, int is_compat)
+{
+ uint32_t c, v;
+ int i, l;
+ uint32_t res[UNICODE_DECOMP_LEN_MAX];
+
+ for(i = 0; i < src_len; i++) {
+ c = src[i];
+ if (c >= 0xac00 && c < 0xd7a4) {
+ /* Hangul decomposition */
+ c -= 0xac00;
+ dbuf_put_u32(dbuf, 0x1100 + c / 588);
+ dbuf_put_u32(dbuf, 0x1161 + (c % 588) / 28);
+ v = c % 28;
+ if (v != 0)
+ dbuf_put_u32(dbuf, 0x11a7 + v);
+ } else {
+ l = unicode_decomp_char(res, c, is_compat);
+ if (l) {
+ to_nfd_rec(dbuf, (int *)res, l, is_compat);
+ } else {
+ dbuf_put_u32(dbuf, c);
+ }
+ }
+ }
+}
+
+/* return 0 if not found */
+static int compose_pair(uint32_t c0, uint32_t c1)
+{
+ /* Hangul composition */
+ if (c0 >= 0x1100 && c0 < 0x1100 + 19 &&
+ c1 >= 0x1161 && c1 < 0x1161 + 21) {
+ return 0xac00 + (c0 - 0x1100) * 588 + (c1 - 0x1161) * 28;
+ } else if (c0 >= 0xac00 && c0 < 0xac00 + 11172 &&
+ (c0 - 0xac00) % 28 == 0 &&
+ c1 >= 0x11a7 && c1 < 0x11a7 + 28) {
+ return c0 + c1 - 0x11a7;
+ } else {
+ return unicode_compose_pair(c0, c1);
+ }
+}
+
+int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
+ UnicodeNormalizationEnum n_type,
+ void *opaque, DynBufReallocFunc *realloc_func)
+{
+ int *buf, buf_len, i, p, starter_pos, cc, last_cc, out_len;
+ BOOL is_compat;
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
+
+ is_compat = n_type >> 1;
+
+ dbuf_init2(dbuf, opaque, realloc_func);
+ if (dbuf_realloc(dbuf, sizeof(int) * src_len))
+ goto fail;
+
+ /* common case: latin1 is unaffected by NFC */
+ if (n_type == UNICODE_NFC) {
+ for(i = 0; i < src_len; i++) {
+ if (src[i] >= 0x100)
+ goto not_latin1;
+ }
+ buf = (int *)dbuf->buf;
+ memcpy(buf, src, src_len * sizeof(int));
+ *pdst = (uint32_t *)buf;
+ return src_len;
+ not_latin1: ;
+ }
+
+ to_nfd_rec(dbuf, (const int *)src, src_len, is_compat);
+ if (dbuf_error(dbuf)) {
+ fail:
+ *pdst = NULL;
+ return -1;
+ }
+ buf = (int *)dbuf->buf;
+ buf_len = dbuf->size / sizeof(int);
+
+ sort_cc(buf, buf_len);
+
+ if (buf_len <= 1 || (n_type & 1) != 0) {
+ /* NFD / NFKD */
+ *pdst = (uint32_t *)buf;
+ return buf_len;
+ }
+
+ i = 1;
+ out_len = 1;
+ while (i < buf_len) {
+ /* find the starter character and test if it is blocked from
+ the character at 'i' */
+ last_cc = unicode_get_cc(buf[i]);
+ starter_pos = out_len - 1;
+ while (starter_pos >= 0) {
+ cc = unicode_get_cc(buf[starter_pos]);
+ if (cc == 0)
+ break;
+ if (cc >= last_cc)
+ goto next;
+ last_cc = 256;
+ starter_pos--;
+ }
+ if (starter_pos >= 0 &&
+ (p = compose_pair(buf[starter_pos], buf[i])) != 0) {
+ buf[starter_pos] = p;
+ i++;
+ } else {
+ next:
+ buf[out_len++] = buf[i++];
+ }
+ }
+ *pdst = (uint32_t *)buf;
+ return out_len;
+}
+
+/* char ranges for various unicode properties */
+
+static int unicode_find_name(const char *name_table, const char *name)
+{
+ const char *p, *r;
+ int pos;
+ size_t name_len, len;
+
+ p = name_table;
+ pos = 0;
+ name_len = strlen(name);
+ while (*p) {
+ for(;;) {
+ r = strchr(p, ',');
+ if (!r)
+ len = strlen(p);
+ else
+ len = r - p;
+ if (len == name_len && !memcmp(p, name, name_len))
+ return pos;
+ p += len + 1;
+ if (!r)
+ break;
+ }
+ pos++;
+ }
+ return -1;
+}
+
+/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
+ if not found */
+int unicode_script(CharRange *cr,
+ const char *script_name, BOOL is_ext)
+{
+ int script_idx;
+ const uint8_t *p, *p_end;
+ uint32_t c, c1, b, n, v, v_len, i, type;
+ CharRange cr1_s, *cr1;
+ CharRange cr2_s, *cr2 = &cr2_s;
+ BOOL is_common;
+
+ script_idx = unicode_find_name(unicode_script_name_table, script_name);
+ if (script_idx < 0)
+ return -2;
+ /* Note: we remove the "Unknown" Script */
+ script_idx += UNICODE_SCRIPT_Unknown + 1;
+
+ is_common = (script_idx == UNICODE_SCRIPT_Common ||
+ script_idx == UNICODE_SCRIPT_Inherited);
+ if (is_ext) {
+ cr1 = &cr1_s;
+ cr_init(cr1, cr->mem_opaque, cr->realloc_func);
+ cr_init(cr2, cr->mem_opaque, cr->realloc_func);
+ } else {
+ cr1 = cr;
+ }
+
+ p = unicode_script_table;
+ p_end = unicode_script_table + countof(unicode_script_table);
+ c = 0;
+ while (p < p_end) {
+ b = *p++;
+ type = b >> 7;
+ n = b & 0x7f;
+ if (n < 96) {
+ } else if (n < 112) {
+ n = (n - 96) << 8;
+ n |= *p++;
+ n += 96;
+ } else {
+ n = (n - 112) << 16;
+ n |= *p++ << 8;
+ n |= *p++;
+ n += 96 + (1 << 12);
+ }
+ if (type == 0)
+ v = 0;
+ else
+ v = *p++;
+ c1 = c + n + 1;
+ if (v == script_idx) {
+ if (cr_add_interval(cr1, c, c1))
+ goto fail;
+ }
+ c = c1;
+ }
+
+ if (is_ext) {
+ /* add the script extensions */
+ p = unicode_script_ext_table;
+ p_end = unicode_script_ext_table + countof(unicode_script_ext_table);
+ c = 0;
+ while (p < p_end) {
+ b = *p++;
+ if (b < 128) {
+ n = b;
+ } else if (b < 128 + 64) {
+ n = (b - 128) << 8;
+ n |= *p++;
+ n += 128;
+ } else {
+ n = (b - 128 - 64) << 16;
+ n |= *p++ << 8;
+ n |= *p++;
+ n += 128 + (1 << 14);
+ }
+ c1 = c + n + 1;
+ v_len = *p++;
+ if (is_common) {
+ if (v_len != 0) {
+ if (cr_add_interval(cr2, c, c1))
+ goto fail;
+ }
+ } else {
+ for(i = 0; i < v_len; i++) {
+ if (p[i] == script_idx) {
+ if (cr_add_interval(cr2, c, c1))
+ goto fail;
+ break;
+ }
+ }
+ }
+ p += v_len;
+ c = c1;
+ }
+ if (is_common) {
+ /* remove all the characters with script extensions */
+ if (cr_invert(cr2))
+ goto fail;
+ if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
+ CR_OP_INTER))
+ goto fail;
+ } else {
+ if (cr_op(cr, cr1->points, cr1->len, cr2->points, cr2->len,
+ CR_OP_UNION))
+ goto fail;
+ }
+ cr_free(cr1);
+ cr_free(cr2);
+ }
+ return 0;
+ fail:
+ if (is_ext) {
+ cr_free(cr1);
+ cr_free(cr2);
+ }
+ goto fail;
+}
+
+#define M(id) (1U << UNICODE_GC_ ## id)
+
+static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
+{
+ const uint8_t *p, *p_end;
+ uint32_t c, c0, b, n, v;
+
+ p = unicode_gc_table;
+ p_end = unicode_gc_table + countof(unicode_gc_table);
+ c = 0;
+ while (p < p_end) {
+ b = *p++;
+ n = b >> 5;
+ v = b & 0x1f;
+ if (n == 7) {
+ n = *p++;
+ if (n < 128) {
+ n += 7;
+ } else if (n < 128 + 64) {
+ n = (n - 128) << 8;
+ n |= *p++;
+ n += 7 + 128;
+ } else {
+ n = (n - 128 - 64) << 16;
+ n |= *p++ << 8;
+ n |= *p++;
+ n += 7 + 128 + (1 << 14);
+ }
+ }
+ c0 = c;
+ c += n + 1;
+ if (v == 31) {
+ /* run of Lu / Ll */
+ b = gc_mask & (M(Lu) | M(Ll));
+ if (b != 0) {
+ if (b == (M(Lu) | M(Ll))) {
+ goto add_range;
+ } else {
+ c0 += ((gc_mask & M(Ll)) != 0);
+ for(; c0 < c; c0 += 2) {
+ if (cr_add_interval(cr, c0, c0 + 1))
+ return -1;
+ }
+ }
+ }
+ } else if ((gc_mask >> v) & 1) {
+ add_range:
+ if (cr_add_interval(cr, c0, c))
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int unicode_prop1(CharRange *cr, int prop_idx)
+{
+ const uint8_t *p, *p_end;
+ uint32_t c, c0, b, bit;
+
+ p = unicode_prop_table[prop_idx];
+ p_end = p + unicode_prop_len_table[prop_idx];
+ c = 0;
+ bit = 0;
+ while (p < p_end) {
+ c0 = c;
+ b = *p++;
+ if (b < 64) {
+ c += (b >> 3) + 1;
+ if (bit) {
+ if (cr_add_interval(cr, c0, c))
+ return -1;
+ }
+ bit ^= 1;
+ c0 = c;
+ c += (b & 7) + 1;
+ } else if (b >= 0x80) {
+ c += b - 0x80 + 1;
+ } else if (b < 0x60) {
+ c += (((b - 0x40) << 8) | p[0]) + 1;
+ p++;
+ } else {
+ c += (((b - 0x60) << 16) | (p[0] << 8) | p[1]) + 1;
+ p += 2;
+ }
+ if (bit) {
+ if (cr_add_interval(cr, c0, c))
+ return -1;
+ }
+ bit ^= 1;
+ }
+ return 0;
+}
+
+#define CASE_U (1 << 0)
+#define CASE_L (1 << 1)
+#define CASE_F (1 << 2)
+
+/* use the case conversion table to generate range of characters.
+ CASE_U: set char if modified by uppercasing,
+ CASE_L: set char if modified by lowercasing,
+ CASE_F: set char if modified by case folding,
+ */
+static int unicode_case1(CharRange *cr, int case_mask)
+{
+#define MR(x) (1 << RUN_TYPE_ ## x)
+ const uint32_t tab_run_mask[3] = {
+ MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
+ MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3),
+
+ MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2),
+
+ MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT),
+ };
+#undef MR
+ uint32_t mask, v, code, type, len, i, idx;
+
+ if (case_mask == 0)
+ return 0;
+ mask = 0;
+ for(i = 0; i < 3; i++) {
+ if ((case_mask >> i) & 1)
+ mask |= tab_run_mask[i];
+ }
+ for(idx = 0; idx < countof(case_conv_table1); idx++) {
+ v = case_conv_table1[idx];
+ type = (v >> (32 - 17 - 7 - 4)) & 0xf;
+ code = v >> (32 - 17);
+ len = (v >> (32 - 17 - 7)) & 0x7f;
+ if ((mask >> type) & 1) {
+ // printf("%d: type=%d %04x %04x\n", idx, type, code, code + len - 1);
+ switch(type) {
+ case RUN_TYPE_UL:
+ if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
+ goto def_case;
+ code += ((case_mask & CASE_U) != 0);
+ for(i = 0; i < len; i += 2) {
+ if (cr_add_interval(cr, code + i, code + i + 1))
+ return -1;
+ }
+ break;
+ case RUN_TYPE_LSU:
+ if ((case_mask & CASE_U) && (case_mask & (CASE_L | CASE_F)))
+ goto def_case;
+ if (!(case_mask & CASE_U)) {
+ if (cr_add_interval(cr, code, code + 1))
+ return -1;
+ }
+ if (cr_add_interval(cr, code + 1, code + 2))
+ return -1;
+ if (case_mask & CASE_U) {
+ if (cr_add_interval(cr, code + 2, code + 3))
+ return -1;
+ }
+ break;
+ default:
+ def_case:
+ if (cr_add_interval(cr, code, code + len))
+ return -1;
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+typedef enum {
+ POP_GC,
+ POP_PROP,
+ POP_CASE,
+ POP_UNION,
+ POP_INTER,
+ POP_XOR,
+ POP_INVERT,
+ POP_END,
+} PropOPEnum;
+
+#define POP_STACK_LEN_MAX 4
+
+static int unicode_prop_ops(CharRange *cr, ...)
+{
+ va_list ap;
+ CharRange stack[POP_STACK_LEN_MAX];
+ int stack_len, op, ret, i;
+ uint32_t a;
+
+ va_start(ap, cr);
+ stack_len = 0;
+ for(;;) {
+ op = va_arg(ap, int);
+ switch(op) {
+ case POP_GC:
+ assert(stack_len < POP_STACK_LEN_MAX);
+ a = va_arg(ap, int);
+ cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
+ if (unicode_general_category1(&stack[stack_len - 1], a))
+ goto fail;
+ break;
+ case POP_PROP:
+ assert(stack_len < POP_STACK_LEN_MAX);
+ a = va_arg(ap, int);
+ cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
+ if (unicode_prop1(&stack[stack_len - 1], a))
+ goto fail;
+ break;
+ case POP_CASE:
+ assert(stack_len < POP_STACK_LEN_MAX);
+ a = va_arg(ap, int);
+ cr_init(&stack[stack_len++], cr->mem_opaque, cr->realloc_func);
+ if (unicode_case1(&stack[stack_len - 1], a))
+ goto fail;
+ break;
+ case POP_UNION:
+ case POP_INTER:
+ case POP_XOR:
+ {
+ CharRange *cr1, *cr2, *cr3;
+ assert(stack_len >= 2);
+ assert(stack_len < POP_STACK_LEN_MAX);
+ cr1 = &stack[stack_len - 2];
+ cr2 = &stack[stack_len - 1];
+ cr3 = &stack[stack_len++];
+ cr_init(cr3, cr->mem_opaque, cr->realloc_func);
+ if (cr_op(cr3, cr1->points, cr1->len,
+ cr2->points, cr2->len, op - POP_UNION + CR_OP_UNION))
+ goto fail;
+ cr_free(cr1);
+ cr_free(cr2);
+ *cr1 = *cr3;
+ stack_len -= 2;
+ }
+ break;
+ case POP_INVERT:
+ assert(stack_len >= 1);
+ if (cr_invert(&stack[stack_len - 1]))
+ goto fail;
+ break;
+ case POP_END:
+ goto done;
+ default:
+ abort();
+ }
+ }
+ done:
+ va_end(ap);
+ assert(stack_len == 1);
+ ret = cr_copy(cr, &stack[0]);
+ cr_free(&stack[0]);
+ return ret;
+ fail:
+ va_end(ap);
+ for(i = 0; i < stack_len; i++)
+ cr_free(&stack[i]);
+ return -1;
+}
+
+static const uint32_t unicode_gc_mask_table[] = {
+ M(Lu) | M(Ll) | M(Lt), /* LC */
+ M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo), /* L */
+ M(Mn) | M(Mc) | M(Me), /* M */
+ M(Nd) | M(Nl) | M(No), /* N */
+ M(Sm) | M(Sc) | M(Sk) | M(So), /* S */
+ M(Pc) | M(Pd) | M(Ps) | M(Pe) | M(Pi) | M(Pf) | M(Po), /* P */
+ M(Zs) | M(Zl) | M(Zp), /* Z */
+ M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn), /* C */
+};
+
+/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
+ if not found */
+int unicode_general_category(CharRange *cr, const char *gc_name)
+{
+ int gc_idx;
+ uint32_t gc_mask;
+
+ gc_idx = unicode_find_name(unicode_gc_name_table, gc_name);
+ if (gc_idx < 0)
+ return -2;
+ if (gc_idx <= UNICODE_GC_Co) {
+ gc_mask = (uint64_t)1 << gc_idx;
+ } else {
+ gc_mask = unicode_gc_mask_table[gc_idx - UNICODE_GC_LC];
+ }
+ return unicode_general_category1(cr, gc_mask);
+}
+
+
+/* 'cr' must be initialized and empty. Return 0 if OK, -1 if error, -2
+ if not found */
+int unicode_prop(CharRange *cr, const char *prop_name)
+{
+ int prop_idx, ret;
+
+ prop_idx = unicode_find_name(unicode_prop_name_table, prop_name);
+ if (prop_idx < 0)
+ return -2;
+ prop_idx += UNICODE_PROP_ASCII_Hex_Digit;
+
+ ret = 0;
+ switch(prop_idx) {
+ case UNICODE_PROP_ASCII:
+ if (cr_add_interval(cr, 0x00, 0x7f + 1))
+ return -1;
+ break;
+ case UNICODE_PROP_Any:
+ if (cr_add_interval(cr, 0x00000, 0x10ffff + 1))
+ return -1;
+ break;
+ case UNICODE_PROP_Assigned:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Cn),
+ POP_INVERT,
+ POP_END);
+ break;
+ case UNICODE_PROP_Math:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Sm),
+ POP_PROP, UNICODE_PROP_Other_Math,
+ POP_UNION,
+ POP_END);
+ break;
+ case UNICODE_PROP_Lowercase:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Ll),
+ POP_PROP, UNICODE_PROP_Other_Lowercase,
+ POP_UNION,
+ POP_END);
+ break;
+ case UNICODE_PROP_Uppercase:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Lu),
+ POP_PROP, UNICODE_PROP_Other_Uppercase,
+ POP_UNION,
+ POP_END);
+ break;
+ case UNICODE_PROP_Cased:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Lu) | M(Ll) | M(Lt),
+ POP_PROP, UNICODE_PROP_Other_Uppercase,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Other_Lowercase,
+ POP_UNION,
+ POP_END);
+ break;
+ case UNICODE_PROP_Alphabetic:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
+ POP_PROP, UNICODE_PROP_Other_Uppercase,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Other_Lowercase,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Other_Alphabetic,
+ POP_UNION,
+ POP_END);
+ break;
+ case UNICODE_PROP_Grapheme_Base:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Cc) | M(Cf) | M(Cs) | M(Co) | M(Cn) | M(Zl) | M(Zp) | M(Me) | M(Mn),
+ POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
+ POP_UNION,
+ POP_INVERT,
+ POP_END);
+ break;
+ case UNICODE_PROP_Grapheme_Extend:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Me) | M(Mn),
+ POP_PROP, UNICODE_PROP_Other_Grapheme_Extend,
+ POP_UNION,
+ POP_END);
+ break;
+ case UNICODE_PROP_XID_Start:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_XID_Start1,
+ POP_UNION,
+ POP_INVERT,
+ POP_INTER,
+ POP_END);
+ break;
+ case UNICODE_PROP_XID_Continue:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
+ M(Mn) | M(Mc) | M(Nd) | M(Pc),
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Other_ID_Continue,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_XID_Continue1,
+ POP_UNION,
+ POP_INVERT,
+ POP_INTER,
+ POP_END);
+ break;
+ case UNICODE_PROP_Changes_When_Uppercased:
+ ret = unicode_case1(cr, CASE_U);
+ break;
+ case UNICODE_PROP_Changes_When_Lowercased:
+ ret = unicode_case1(cr, CASE_L);
+ break;
+ case UNICODE_PROP_Changes_When_Casemapped:
+ ret = unicode_case1(cr, CASE_U | CASE_L | CASE_F);
+ break;
+ case UNICODE_PROP_Changes_When_Titlecased:
+ ret = unicode_prop_ops(cr,
+ POP_CASE, CASE_U,
+ POP_PROP, UNICODE_PROP_Changes_When_Titlecased1,
+ POP_XOR,
+ POP_END);
+ break;
+ case UNICODE_PROP_Changes_When_Casefolded:
+ ret = unicode_prop_ops(cr,
+ POP_CASE, CASE_F,
+ POP_PROP, UNICODE_PROP_Changes_When_Casefolded1,
+ POP_XOR,
+ POP_END);
+ break;
+ case UNICODE_PROP_Changes_When_NFKC_Casefolded:
+ ret = unicode_prop_ops(cr,
+ POP_CASE, CASE_F,
+ POP_PROP, UNICODE_PROP_Changes_When_NFKC_Casefolded1,
+ POP_XOR,
+ POP_END);
+ break;
+#if 0
+ case UNICODE_PROP_ID_Start:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl),
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
+ POP_UNION,
+ POP_INVERT,
+ POP_INTER,
+ POP_END);
+ break;
+ case UNICODE_PROP_ID_Continue:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Lu) | M(Ll) | M(Lt) | M(Lm) | M(Lo) | M(Nl) |
+ M(Mn) | M(Mc) | M(Nd) | M(Pc),
+ POP_PROP, UNICODE_PROP_Other_ID_Start,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Other_ID_Continue,
+ POP_UNION,
+ POP_PROP, UNICODE_PROP_Pattern_Syntax,
+ POP_PROP, UNICODE_PROP_Pattern_White_Space,
+ POP_UNION,
+ POP_INVERT,
+ POP_INTER,
+ POP_END);
+ break;
+ case UNICODE_PROP_Case_Ignorable:
+ ret = unicode_prop_ops(cr,
+ POP_GC, M(Mn) | M(Cf) | M(Lm) | M(Sk),
+ POP_PROP, UNICODE_PROP_Case_Ignorable1,
+ POP_XOR,
+ POP_END);
+ break;
+#else
+ /* we use the existing tables */
+ case UNICODE_PROP_ID_Continue:
+ ret = unicode_prop_ops(cr,
+ POP_PROP, UNICODE_PROP_ID_Start,
+ POP_PROP, UNICODE_PROP_ID_Continue1,
+ POP_XOR,
+ POP_END);
+ break;
+#endif
+ default:
+ if (prop_idx >= countof(unicode_prop_table))
+ return -2;
+ ret = unicode_prop1(cr, prop_idx);
+ break;
+ }
+ return ret;
+}
+
+#endif /* CONFIG_ALL_UNICODE */