/* Copyright (c) 2004, 2006 MySQL AB Copyright (c) 2009-2011, Monty Program Ab Use is subject to license terms. Copyright (c) 2009-2011, Monty Program Ab This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ #include #include #include #include "my_global.h" #include "m_ctype.h" #include "ctype-uca.h" #define MAX_ALLOWED_CODE 0x10FFFF typedef struct opt_st { const char *name_prefix; /* Name that goes into all array names */ const char *filename; /* The filename or "-" for stdin */ uint levels; /* The number of levels to dump */ my_bool no_contractions; my_bool case_first_upper; } OPT; static OPT defaults= { "uca", "-", 3, FALSE, FALSE }; typedef struct my_ducet_weight_st { uint16 weight[4][MY_UCA_MAX_WEIGHT_SIZE]; size_t weight_length; } MY_DUCET_WEIGHT; typedef struct my_ducet_single_char_t { MY_DUCET_WEIGHT weight; my_bool is_variable; } MY_DUCET_SINGLE_CHAR; typedef struct my_ducet_char_t { my_wc_t wc[MY_UCA_MAX_CONTRACTION]; size_t length; } MY_DUCET_CHARS; typedef struct my_ducet_contraction_t { MY_DUCET_CHARS chars; MY_DUCET_WEIGHT weights; } MY_DUCET_CONTRACTION; typedef struct my_ducet_contraction_list_st { size_t nitems; MY_DUCET_CONTRACTION item[4*1024]; } MY_DUCET_CONTRACTION_LIST; typedef struct my_ducet_logical_posision_st { my_wc_t first; my_wc_t last; } MY_DUCET_LOGICAL_POSITION; typedef struct my_ducet_logical_positions_st { MY_DUCET_LOGICAL_POSITION tertiary_ignorable; MY_DUCET_LOGICAL_POSITION secondary_ignorable; MY_DUCET_LOGICAL_POSITION primary_ignorable; MY_DUCET_LOGICAL_POSITION variable; MY_DUCET_LOGICAL_POSITION non_ignorable; } MY_DUCET_LOGICAL_POSITIONS; typedef struct my_allkeys_st { MY_DUCET_SINGLE_CHAR single_chars[MAX_ALLOWED_CODE+1]; MY_DUCET_CONTRACTION_LIST contractions; MY_DUCET_LOGICAL_POSITIONS logical_positions; uint version; char version_str[32]; } MY_DUCET; static int my_ducet_weight_cmp_on_level(const MY_DUCET_WEIGHT *a, const MY_DUCET_WEIGHT *b, uint level) { uint i; for (i= 0; i < array_elements(a->weight[level]); i++) { int diff= (int) a->weight[level][i] - (int) b->weight[level][i]; if (diff) return diff; } return 0; } static int my_ducet_weight_cmp(const MY_DUCET_WEIGHT *a, const MY_DUCET_WEIGHT *b) { uint level; for (level= 0; level < array_elements(a->weight); level++) { int diff= my_ducet_weight_cmp_on_level(a, b, level); if (diff) return diff; } return 0; } /* "3.11 Logical Reset Positions" says: The CLDR table (based on UCA) has the following overall structure for weights, going from low to high. */ static my_bool my_ducet_weight_is_tertiary_ignorable(const MY_DUCET_WEIGHT *w) { return w->weight[0][0] == 0 && w->weight[1][0] == 0 && w->weight[2][0] == 0; } static my_bool my_ducet_weight_is_secondary_ignorable(const MY_DUCET_WEIGHT *w) { return w->weight[0][0] == 0 && w->weight[1][0] == 0 && w->weight[2][0] != 0; } static my_bool my_ducet_weight_is_primary_ignorable(const MY_DUCET_WEIGHT *w) { return w->weight[0][0] == 0 && w->weight[1][0] != 0 && w->weight[2][0] != 0; } static my_bool my_ducet_weight_is_primary_non_ignorable(const MY_DUCET_WEIGHT *w) { return w->weight[0][0] > 0 && w->weight[0][0] < 0xFB00; } /* if alternate = non-ignorable p != ignore, if alternate = shifted p, s, t = ignore */ static my_bool my_ducet_single_char_is_variable(const MY_DUCET_SINGLE_CHAR *ch) { return ch->is_variable && my_ducet_weight_is_primary_non_ignorable(&ch->weight); } static void my_ducet_logical_position_set(MY_DUCET_LOGICAL_POSITION *dst, my_wc_t wc) { dst->first= dst->last= wc; } static void my_ducet_logical_position_update(MY_DUCET_LOGICAL_POSITION *dst, const MY_DUCET *ducet, my_wc_t current) { const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars; int diff; if (current >= array_elements(ducet->single_chars)) return; if ((diff= my_ducet_weight_cmp(&chars[current].weight, &chars[dst->first].weight)) < 0 || (diff == 0 && current < dst->first)) dst->first= current; if ((diff= my_ducet_weight_cmp(&chars[current].weight, &chars[dst->last].weight)) > 0 || (diff == 0 && current > dst->last)) dst->last= current; } static void my_ducet_logical_positions_init(MY_DUCET_LOGICAL_POSITIONS *dst, const MY_DUCET *ducet) { uint i; const MY_DUCET_SINGLE_CHAR *chars= ducet->single_chars; for (i= 0; i < array_elements(ducet->single_chars); i++) { if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight)) { my_ducet_logical_position_set(&dst->tertiary_ignorable, i); break; } } for (i= 0; i < array_elements(ducet->single_chars); i++) { if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight)) { my_ducet_logical_position_set(&dst->secondary_ignorable, i); break; } } for (i= 0; i < array_elements(ducet->single_chars); i++) { if (my_ducet_weight_is_primary_ignorable(&chars[i].weight)) { my_ducet_logical_position_set(&dst->primary_ignorable, i); break; } } for (i= 0; i < array_elements(ducet->single_chars); i++) { if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight)) { my_ducet_logical_position_set(&dst->non_ignorable, i); break; } } for (i= 0; i < array_elements(ducet->single_chars); i++) { if (my_ducet_single_char_is_variable(&chars[i])) { my_ducet_logical_position_set(&dst->variable, i); break; } } for (i= 1; i < array_elements(ducet->single_chars); i++) { if (my_ducet_weight_is_primary_non_ignorable(&chars[i].weight)) my_ducet_logical_position_update(&dst->non_ignorable, ducet, i); if (my_ducet_weight_is_primary_ignorable(&chars[i].weight)) my_ducet_logical_position_update(&dst->primary_ignorable, ducet, i); if (my_ducet_weight_is_secondary_ignorable(&chars[i].weight)) my_ducet_logical_position_update(&dst->secondary_ignorable, ducet, i); if (my_ducet_weight_is_tertiary_ignorable(&chars[i].weight)) my_ducet_logical_position_update(&dst->tertiary_ignorable, ducet, i); if (my_ducet_single_char_is_variable(&chars[i])) my_ducet_logical_position_update(&dst->variable, ducet, i); } /* DUCET as of Unicode-14.0.0 does not have any secondary ignorable characters, i.e. with weights [p=0000, s=0000, t!=0000] For compatibility with 4.0.0 and 5.2.0 data in ctype-uca.c, let copy tertiary_ignorable to secondary_ignorable. It gives effectively the same result with just leaving secondary_ignorable as {first=U+0000,last=U+0000}. */ if (dst->secondary_ignorable.first == 0 && dst->secondary_ignorable.last == 0) { dst->secondary_ignorable.first= dst->tertiary_ignorable.first; dst->secondary_ignorable.last= dst->tertiary_ignorable.last; } } static void my_ducet_weight_normalize_on_level(MY_DUCET_WEIGHT *weights, uint level, const OPT *options) { uint dst, src; for (src= 0, dst= 0; src < array_elements(weights->weight[level]); src++) { if (weights->weight[level][src] != 0) weights->weight[level][dst++]= weights->weight[level][src]; } for ( ; dst < array_elements(weights->weight[level]) ; dst++) weights->weight[level][dst]= 0; if (options->case_first_upper && level == 2) { /* Invert weights for secondary level to sort upper case letters before their lower case counter part. */ for (dst= 0; dst < array_elements(weights->weight[level]); dst++) { if (weights->weight[level][dst] == 0) break; if (weights->weight[level][dst] >= 0x20) { fprintf(stderr, "Secondary level is too large: %04X\n", (int) weights->weight[level][dst]); } weights->weight[level][dst]= (uint16) (0x20 - weights->weight[level][dst]); } } } static void my_ducet_weight_normalize(MY_DUCET_WEIGHT *weights, const OPT *options) { uint i; for (i= 0; i < array_elements(weights->weight); i++) my_ducet_weight_normalize_on_level(weights, i, options); } static void my_ducet_normalize(MY_DUCET *ducet, const OPT *options) { uint i; for (i= 0; i < array_elements(ducet->single_chars); i++) my_ducet_weight_normalize(&ducet->single_chars[i].weight, options); for (i= 0; i < array_elements(ducet->contractions.item); i++) my_ducet_weight_normalize(&ducet->contractions.item[i].weights, options); } static my_bool my_ducet_contraction_list_add(MY_DUCET_CONTRACTION_LIST *dst, const MY_DUCET_CHARS *chars, const MY_DUCET_WEIGHT *weights) { if (dst->nitems >= array_elements(dst->item)) { fprintf(stderr, "Too many contractions\n"); return TRUE; } dst->item[dst->nitems].chars= *chars; dst->item[dst->nitems].weights= *weights; dst->nitems++; return FALSE; } #if 0 #define MY_UCA_NPAGES 1024 #define MY_UCA_NCHARS 64 #define MY_UCA_CMASK 63 #define MY_UCA_PSHIFT 6 #else #define MY_UCA_NPAGES 4352 /* 0x110000 characters / 0x100 chars per page */ #define MY_UCA_NCHARS 256 #define MY_UCA_CMASK 255 #define MY_UCA_PSHIFT 8 #endif /* Name prefix that goes into page weight array names after global_name_prefix */ static const char *pname_prefix[]= {"_p", "_p", "_p"}; /* Name suffix that goes into page weight array names after page number */ static const char *pname_suffix[]= {"", "_secondary", "_tertiary"}; void usage(const char *prog) { printf("Usage:\n"); printf("%s [options] filename\n", prog); } static inline int lstrncmp(const char *str, const LEX_CSTRING lstr) { return strncmp(lstr.str, str, lstr.length); } int process_option(OPT *options, const char *opt) { static const LEX_CSTRING opt_name_prefix= {STRING_WITH_LEN("--name-prefix=")}; static const LEX_CSTRING opt_levels= {STRING_WITH_LEN("--levels=")}; static const LEX_CSTRING opt_no_contractions= {STRING_WITH_LEN("--no-contractions")}; static const LEX_CSTRING opt_case_first= {STRING_WITH_LEN("--case-first=")}; if (!lstrncmp(opt, opt_name_prefix)) { options->name_prefix= opt + opt_name_prefix.length; return 0; } if (!lstrncmp(opt, opt_levels)) { options->levels= (uint) strtoul(opt + opt_levels.length, NULL, 10); if (options->levels < 1 || options->levels > 3) { printf("Bad --levels value\n"); return 1; } return 0; } if (!lstrncmp(opt, opt_case_first)) { const char *value= opt + opt_case_first.length; if (!strcasecmp(value, "upper")) { options->case_first_upper= TRUE; return 0; } if (!strcasecmp(value, "lower")) { options->case_first_upper= FALSE; return 0; } fprintf(stderr, "Bad option: %s\n", opt); return 1; } if (!strcmp(opt, opt_no_contractions.str)) { options->no_contractions= TRUE; return 0; } printf("Unknown option: %s\n", opt); return 1; } int process_options(OPT *options, int ac, char **av) { int i; for (i= 1; i < ac; i++) { if (!strncmp(av[i], "--", 2)) { if (process_option(options, av[i])) return 1; } else { if (i + 1 != ac) { usage(av[0]); return 1; } options->filename= av[i]; return 0; } } usage(av[0]); return 1; } FILE *open_file(const char *name) { if (!strcmp(name, "-")) return stdin; return fopen(name, "r"); } void close_file(FILE *file) { if (file != stdin) fclose(file); } char *strrtrim(char *str) { char *end= str + strlen(str); for ( ; str < end; end--) { if (end[-1] != '\r' && end[-1] != '\n' && end[-1] != ' ' && end[-1] != '\t') break; end[-1]= '\0'; } return str; } /* Parse a line starting with '@'. As of 14.0.0, allkeys.txt has @version and @implicitweights lines. Only @version is parsed here. It could also be possible to parse @implicitweights to automatically generate routines responsible for implicit weight handling for Siniform ideographic scripts (Tangut, Nushu, Khitan). But as there are only a few of them at the moment, it was easier to write these routines in ctype-uca.h manually. So @implicitweights lines are ignored here. */ my_bool parse_at_line(MY_DUCET *ducet, const char *str) { static const LEX_CSTRING version= {STRING_WITH_LEN("@version ")}; if (!lstrncmp(str, version)) { /* Examples: @version 4.0.0 @version 5.2.0 @version 14.0.0 */ const char *src= str + version.length; long n[3]= {0}; uint pos; int length; length= snprintf(ducet->version_str, sizeof(ducet->version_str)-1, "%s", src); ducet->version_str[length]= '\0'; for (pos= 0 ; pos < 3; pos++) { char *endptr; n[pos]= strtol(src, &endptr, 10); if (*endptr != '.' && *endptr != '\r' && *endptr != '\n' && *endptr != 0) return TRUE; src= endptr + 1; } ducet->version= MY_UCA_VERSION_ID(n[0], n[1], n[2]); } return FALSE; } static void parse_chars(MY_DUCET_CHARS *dst, char *str) { char *s; const char *delim= " \t"; dst->length= 0; for (s= strtok(str, delim); s ; s= strtok(NULL, delim)) { my_wc_t code= (my_wc_t) strtoul(s, NULL, 16); if (dst->length < array_elements(dst->wc)) dst->wc[dst->length]= code; dst->length++; } } static void parse_weights(MY_DUCET_WEIGHT *dst, my_bool *is_variable, char *weight) { const char *delim= " []"; size_t w; char *weights[64]; char *s; dst->weight_length= 0; *is_variable= FALSE; for (s= strtok(weight, delim) ; s ; s= strtok(NULL, delim)) { if (dst->weight_length < array_elements(weights)) weights[dst->weight_length]= s; dst->weight_length++; } set_if_smaller(dst->weight_length, MY_UCA_MAX_WEIGHT_SIZE-1); for (w= 0; w < dst->weight_length ; w++) { size_t partnum= 0; for (s= weights[w]; *s ;) { char *endptr; uint part= (uint) strtoul(s + 1, &endptr, 16); if (w == 0 && s[0] == '*') *is_variable= TRUE; if (part > 0xFFFF) fprintf(stderr, "Weight is too large: %X\n", (uint) part); dst->weight[partnum][w]= (uint16) part; s= endptr; partnum++; } } } static void print_one_logical_position(const OPT *options, const char *name, const char *name2, my_wc_t value) { printf("#define %s_%s%s 0x%04X\n", options->name_prefix, name, name2, (int) value); } static void my_ducet_weight_print_canonical(const MY_DUCET_WEIGHT *src) { uint i; for (i= 0; i < array_elements(src->weight[0]); i++) { my_bool zero= src->weight[0][i] == 0 && src->weight[1][i] == 0 && src->weight[2][i] == 0; if (zero && i > 0) break; printf("[.%04X.%04X.%04X]", src->weight[0][i], src->weight[1][i], src->weight[2][i]); } } static void my_ducet_logical_position_print(const MY_DUCET_LOGICAL_POSITION *src, const char *name, const MY_DUCET *ducet, const OPT *options) { printf("/*\n"); my_ducet_weight_print_canonical(&ducet->single_chars[src->first].weight); printf("\n"); my_ducet_weight_print_canonical(&ducet->single_chars[src->last].weight); printf("\n*/\n"); print_one_logical_position(options, name, "_first", src->first); print_one_logical_position(options, name, "_last", src->last); printf("\n"); } static void print_logical_positions(const MY_DUCET_LOGICAL_POSITIONS *src, const MY_DUCET *ducet, const OPT *opt) { my_ducet_logical_position_print(&src->tertiary_ignorable, "tertiary_ignorable", ducet, opt); my_ducet_logical_position_print(&src->secondary_ignorable, "secondary_ignorable", ducet, opt); my_ducet_logical_position_print(&src->primary_ignorable, "primary_ignorable", ducet, opt); my_ducet_logical_position_print(&src->variable, "variable", ducet, opt); my_ducet_logical_position_print(&src->non_ignorable, "non_ignorable", ducet, opt); } static void print_version(const MY_DUCET *ducet, const OPT *opt) { printf("\n"); printf("#define %s_version %d /* %s */\n", opt->name_prefix, ducet->version, ducet->version_str); printf("\n"); } static void print_contraction(const MY_DUCET_CONTRACTION *c, uint level, const OPT *options) { size_t j; printf("{"); printf("{"); for (j= 0; j < array_elements(c->chars.wc); j++) { if (j > 0) printf(", "); if (c->chars.wc[j]) printf("0x%04X", (uint) c->chars.wc[j]); else { printf("0"); break; } } printf("}, "); printf("{"); for (j= 0; j < array_elements(c->weights.weight[level]); j++) { if (j > 0) printf(", "); if (c->weights.weight[level][j]) printf("0x%04X", (uint) c->weights.weight[level][j]); else { printf("0"); break; } } printf("}, FALSE"); printf("},\n"); } static void print_contraction_list(const MY_DUCET_CONTRACTION_LIST *src, uint level, const OPT *opt) { size_t i; printf("\n\n/* Contractions, level %d */\n", level); printf("static MY_CONTRACTION %s_contractions%s[%d]={\n", opt->name_prefix, pname_suffix[level], (int) src->nitems); for (i= 0; i < src->nitems; i++) { const MY_DUCET_CONTRACTION *c= &src->item[i]; print_contraction(c, level, opt); } printf("};\n\n"); } int main(int ac, char **av) { char str[1024]; static MY_DUCET ducet; my_wc_t code; uint w; int pageloaded[MY_UCA_NPAGES]; FILE *file; OPT options= defaults; if (process_options(&options, ac, av)) return 1; if (!(file= open_file(options.filename))) { printf("Could not open %s for reading\n", options.filename); return 1; } bzero(&ducet, sizeof(ducet)); bzero(pageloaded, sizeof(pageloaded)); while (fgets(str, sizeof(str), file)) { char *comment; char *weight; MY_DUCET_CHARS chr = {{0,0,0,0,0,0}, 0}; if (str[0] == '#') continue; if (str[0] == '@') { parse_at_line(&ducet, strrtrim(str)); continue; } if ((weight= strchr(str, ';'))) { *weight++= '\0'; for ( ; *weight==' ' ; weight++); } else continue; if ((comment=strchr(weight, '#'))) { *comment++= '\0'; }else continue; parse_chars(&chr, str); if (!chr.length) continue; if (chr.length == 1) { if (chr.wc[0] > MAX_ALLOWED_CODE) continue; parse_weights(&ducet.single_chars[chr.wc[0]].weight, &ducet.single_chars[chr.wc[0]].is_variable, weight); /* Mark that a character from this page was loaded */ pageloaded[chr.wc[0] >> MY_UCA_PSHIFT]++; } else { MY_DUCET_WEIGHT weights= {{{0,0,0,0,0,0,0,0,0}, {0,0,0,0,0,0,0,0,0}, {0,0,0,0,0,0,0,0,0}, {0,0,0,0,0,0,0,0,0}}, 0}; my_bool dummy; if (chr.length >= MY_UCA_MAX_CONTRACTION) { fprintf(stderr, "Too long contraction: %d\n", (int) chr.length); continue; } parse_weights(&weights, &dummy, weight); my_ducet_contraction_list_add(&ducet.contractions, &chr, &weights); continue; } } close_file(file); /* Now set implicit weights */ for (code=0; code <= MAX_ALLOWED_CODE; code++) { uint level; if (ducet.single_chars[code].weight.weight_length) continue; for (level= 0; level < 4; level++) { MY_UCA_IMPLICIT_WEIGHT weight; weight= my_uca_implicit_weight_on_level(ducet.version, code, level); ducet.single_chars[code].weight.weight[level][0]= weight.weight[0]; ducet.single_chars[code].weight.weight[level][1]= weight.weight[1]; } ducet.single_chars[code].weight.weight_length= 2; } my_ducet_normalize(&ducet, &options); my_ducet_logical_positions_init(&ducet.logical_positions, &ducet); printf("/*\n"); printf(" Generated from allkeys.txt version '%s'\n", ducet.version_str); printf("*/\n"); for (w=0; w < options.levels; w++) { my_wc_t page; int pagemaxlen[MY_UCA_NPAGES]; for (page=0; page < MY_UCA_NPAGES; page++) { my_wc_t offs; size_t maxnum= 0; size_t nchars= 0; size_t mchars; size_t ndefs= 0; size_t code_line_start= page * MY_UCA_NCHARS; pagemaxlen[page]= 0; /* Skip this page if no weights were loaded */ if (!pageloaded[page]) continue; /* Calculate maximum weight length for this page */ for (offs=0; offs < MY_UCA_NCHARS; offs++) { size_t i, num; code= page*MY_UCA_NCHARS+offs; /* Calculate only non-zero weights */ for (num=0, i=0; i < ducet.single_chars[code].weight.weight_length; i++) if (ducet.single_chars[code].weight.weight[w][i]) num++; maxnum= maxnum < num ? num : maxnum; /* Check if default weight */ if (w == 1 && num == 1) { /* 0020 0000 ... */ if (ducet.single_chars[code].weight.weight[w][0] == 0x0020) ndefs++; } else if (w == 2 && num == 1) { /* 0002 0000 ... */ if (ducet.single_chars[code].weight.weight[w][0] == 0x0002) ndefs++; } } maxnum++; /* If the page have only default weights then no needs to dump it, skip. */ if (ndefs == MY_UCA_NCHARS) { continue; } switch (maxnum) { case 0: mchars= 8; break; case 1: mchars= 8; break; case 2: mchars= 8; break; case 3: mchars= 9; break; case 4: mchars= 8; break; default: mchars= ducet.single_chars[code].weight.weight_length; } pagemaxlen[page]= (int) maxnum; /* Now print this page */ printf("static const uint16 %s%s%03X%s[]= { /* %04X (%d weights per char) */\n", options.name_prefix, pname_prefix[w], (int) page, pname_suffix[w], (int) page*MY_UCA_NCHARS, (int) maxnum); for (offs=0; offs < MY_UCA_NCHARS; offs++) { size_t i; code= page*MY_UCA_NCHARS+offs; for (i=0; i < maxnum; i++) { int tmp= ducet.single_chars[code].weight.weight[w][i]; printf("0x%04X", tmp); if ((offs+1 != MY_UCA_NCHARS) || (i+1!=maxnum)) printf(","); else printf(" "); nchars++; } if (nchars >=mchars) { printf(" /* %04X */\n", (int) code_line_start); code_line_start= code + 1; nchars=0; } else { printf(" "); } } printf("};\n\n"); } printf("const uchar %s_length%s[%d]={\n", options.name_prefix, pname_suffix[w], MY_UCA_NPAGES); for (page=0; page < MY_UCA_NPAGES; page++) { printf("%d%s%s",pagemaxlen[page],page