#ifndef CTYPE_UCA_H #define CTYPE_UCA_H /* Copyright (c) 2021, MariaDB This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; version 2 of the License. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ #define MY_UCA_VERSION_ID(x,y,z) ((uint) ((x) * 100 + (y) * 10 + (z))) /* Implicit weight handling is done according to the section "Computing Implicit Weights" in https://unicode.org/reports/tr10/#Values_For_Base_Table (as of Unicode 14.0.0) Implicit weights for a code CP are constructed as follows: [.AAAA.0020.0002][.BBBB.0000.0000] - There are two primary weights, depending on the character type and block. - There is one weight on the secondary and tertiary levels. AAAA and BBBB are computed using different formulas for: - Siniform ideographic scripts - Han - Unassigned characters */ typedef struct my_uca_implict_weight_t { uint16 weight[2]; } MY_UCA_IMPLICIT_WEIGHT; /* By default, implicit weights for a code CP are constructed as follows: [.AAAA.0020.0002][.BBBB.0000.0000] where AAAA and BBBB are : AAAA= BASE + (CP >> 15); BBBB= (CP & 0x7FFF) | 0x8000; This formula covers the following implicit weight subtypes: - Core Han Unified Ideographs - All other Han Unified Ideographs - Unassigned characters Every mentioned subtype passes a different BASE. This formula does not cover Siniform ideographic scripts. They are handled by separate functions. */ static inline MY_UCA_IMPLICIT_WEIGHT my_uca_implicit_weight_primary_default(uint16 base, my_wc_t code) { MY_UCA_IMPLICIT_WEIGHT res; res.weight[0]= (uint16) ((code >> 15) + base); res.weight[1]= (uint16) ((code & 0x7FFF)|0x8000); return res; } /** Calculate Unicode-5.2.0 implicit weight on the primary level. According to UCA, BASE is calculated as follows: - FB40 for Unified_Ideograph=True AND ((Block=CJK_Unified_Ideograph) OR (Block=CJK_Compatibility_Ideographs)) - FB80 for Unified_Ideograph=True AND NOT ((Block=CJK_Unified_Ideograph) OR (Block=CJK_Compatibility_Ideographs)) - FBC0 for any other code point But for Unicode-5.2.0 and Unicode-4.0.0 we used a simplified formula as implemented before. */ static inline MY_UCA_IMPLICIT_WEIGHT my_uca_520_implicit_weight_primary(my_wc_t code) { uint16 base; /* 3400; 4DB5; 4E00; 9FA5; */ if (code >= 0x3400 && code <= 0x4DB5) base= 0xFB80; else if (code >= 0x4E00 && code <= 0x9FA5) base= 0xFB40; else base= 0xFBC0; return my_uca_implicit_weight_primary_default(base, code); } typedef enum my_cs_encoding_enum { MY_CS_ENCODING_UTF8MB3= 0, MY_CS_ENCODING_UTF8MB4= 1, MY_CS_ENCODING_UCS2= 2, MY_CS_ENCODING_UTF16= 3, MY_CS_ENCODING_UTF32= 4, } my_cs_encoding_t; #define MY_CS_ENCODING_LAST MY_CS_ENCODING_UTF32 #include "ctype-uca1400.h" static inline MY_UCA_IMPLICIT_WEIGHT my_uca_implicit_weight_primary(uint version, my_wc_t code) { return version >= 1400 ? my_uca_1400_implicit_weight_primary(code) : my_uca_520_implicit_weight_primary(code); } static inline MY_UCA_IMPLICIT_WEIGHT my_uca_implicit_weight_secondary() { MY_UCA_IMPLICIT_WEIGHT res; res.weight[0]= 0x0020; res.weight[1]= 0; return res; } static inline MY_UCA_IMPLICIT_WEIGHT my_uca_implicit_weight_tertiary() { MY_UCA_IMPLICIT_WEIGHT res; res.weight[0]= 0x0002; res.weight[1]= 0; return res; } static inline MY_UCA_IMPLICIT_WEIGHT my_uca_implicit_weight_quaternary() { MY_UCA_IMPLICIT_WEIGHT res; res.weight[0]= 0x0001; res.weight[1]= 0; return res; } static inline MY_UCA_IMPLICIT_WEIGHT my_uca_implicit_weight_on_level(uint version, my_wc_t code, uint level) { switch (level) { case 0: return my_uca_implicit_weight_primary(version, code); case 1: return my_uca_implicit_weight_secondary(); case 2: return my_uca_implicit_weight_tertiary(); default: break; } return my_uca_implicit_weight_quaternary(); } uint my_ci_get_id_uca(CHARSET_INFO *cs, my_collation_id_type_t type); LEX_CSTRING my_ci_get_collation_name_uca(CHARSET_INFO *cs, my_collation_name_mode_t mode); #endif /* CTYPE_UCA_H */