diff options
Diffstat (limited to 'strings')
-rw-r--r-- | strings/CMakeLists.txt | 2 | ||||
-rw-r--r-- | strings/ctype-ascii.h | 189 | ||||
-rw-r--r-- | strings/ctype-big5.c | 2 | ||||
-rw-r--r-- | strings/ctype-cp932.c | 4 | ||||
-rw-r--r-- | strings/ctype-euc_kr.c | 4 | ||||
-rw-r--r-- | strings/ctype-eucjpms.c | 4 | ||||
-rw-r--r-- | strings/ctype-gb2312.c | 2 | ||||
-rw-r--r-- | strings/ctype-gbk.c | 2 | ||||
-rw-r--r-- | strings/ctype-sjis.c | 4 | ||||
-rw-r--r-- | strings/ctype-uca-scanner_next.inl | 74 | ||||
-rw-r--r-- | strings/ctype-uca.c | 81 | ||||
-rw-r--r-- | strings/ctype-uca.inl | 1 | ||||
-rw-r--r-- | strings/ctype-ujis.c | 4 | ||||
-rw-r--r-- | strings/ctype-utf8.c | 297 | ||||
-rw-r--r-- | strings/json_lib.c | 10 | ||||
-rw-r--r-- | strings/json_normalize.c | 852 | ||||
-rw-r--r-- | strings/strcoll.inl | 96 |
17 files changed, 1557 insertions, 71 deletions
diff --git a/strings/CMakeLists.txt b/strings/CMakeLists.txt index 0e62f9e34ad..54612256adc 100644 --- a/strings/CMakeLists.txt +++ b/strings/CMakeLists.txt @@ -23,7 +23,7 @@ SET(STRINGS_SOURCES bchange.c bmove_upp.c ctype-big5.c ctype-bin.c ctype-cp932.c str2int.c strcend.c strend.c strfill.c strmake.c strmov.c strnmov.c strxmov.c strxnmov.c xml.c strmov_overlapp.c - my_strchr.c strcont.c strappend.c json_lib.c) + my_strchr.c strcont.c strappend.c json_lib.c json_normalize.c) IF(NOT HAVE_STRNLEN) # OSX below 10.7 did not have strnlen diff --git a/strings/ctype-ascii.h b/strings/ctype-ascii.h new file mode 100644 index 00000000000..540d01b1a0d --- /dev/null +++ b/strings/ctype-ascii.h @@ -0,0 +1,189 @@ +#ifndef CTYPE_ASCII_INCLUDED +#define CTYPE_ASCII_INCLUDED + +#include "myisampack.h" + +/* + Magic expression. It uses the fact that for any byte value X in + the range 0..31 (0x00..0x1F) the expression ((X+31) & 0x1F)*5 returns + the 7th bit (0x80) set only for the following six (out of 32) values: + 0x00, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F. + These values correspond to offsets of non-letter characters + in the ASCII table: + + The following macro sets the bit 0x20 for the following characters: + ---------------- -------------------------------- + Magic bit 10000000000000000000000000011111 + ASCII 0x00..0x1F ................................ Control + ASCII 0x20..0x3F ................................ Punctuation, digits + ASCII 0x40..0x5F @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_ + ASCII 0x60..0x7F `abcdefghijklmnopqrstuvwxyz{|}~. + ---------------- -------------------------------- + We shift the magic bit 0x80 right twice to make it 0x20. + So on the ranges [40..5F] and [60..7F] the expression + has the bit 0x20 set for all non-letter characters. + Note, other bits contain garbage. + + Requirements: + All bytes must be in the range [00..7F], + to avoid overflow and carry to the next byte. 
+*/ +#define MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(i) \ + (((((i)+0x1F1F1F1F1F1F1F1FULL) & 0x1F1F1F1F1F1F1F1F) * 5) >> 2) + + +/* + The following macro returns the bit 0x20 set to: + - 1 for input bytes in the ranges [60..7F] or [E0..FF] + - 0 otherwise + Bytes in the ranges [40..7F] and [C0..FF] have the bit 0x40 set. + Bytes in the ranges [60..7F] and [E0..FF] have the bit 0x20 set. + Hex BinHi BinLo + ---- -1-- ---- + 0x[4C]X .10. .... + 0x[5D]X .10. .... + 0x[6E]X .11. .... + 0x[7F]X .11. .... +*/ +#define MY_ASCII_20_IS_SET_IF_RANGE_60_7F_OR_E0_FF(i) (((i) >> 1) & ((i))) + + +/* + The following macro evaluates to exactly 0x20 for all + lower case ASCII letters [a-z], and to 0x00 otherwise: + + Value Range Character range Subrange + -------- -------- -------------------------------- ------- + 00000000 0x00..0x3F Control, punctuation, digits + 00000000 0x40..0x5F @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_ letters A-Z + 00000000 0x40..0x5F @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_ non-letters + 00100000 0x60..0x7F `abcdefghijklmnopqrstuvwxyz{|}~. letters a-z + 00000000 0x60..0x7F `abcdefghijklmnopqrstuvwxyz{|}~. non-letters + + Requirements: + All bytes must be in the range [00..7F]. + See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(). +*/ + +#define MY_ASCII_20_IF_IS_LOWER_LETTER(i) \ + (MY_ASCII_20_IS_SET_IF_RANGE_60_7F_OR_E0_FF(i) & \ + ~MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(i) & \ + 0x2020202020202020) + +/* + Convert lower case ASCII letters to upper case by unsetting + the bit 0x20 with help of the magic expression. + + Requirements: + All bytes must be in the range [00..7F]. + See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC() +*/ +#define MY_ASCII_TOUPPER_MAGIC(i) \ + (i ^ MY_ASCII_20_IF_IS_LOWER_LETTER(i)) + + +/* + Convert a string (consisting of 8 bytes stored in uint64) + to upper case algorithmically. + + Requirements: + All bytes must be in the range [00..0x7F]. + See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(). 
+ The result on 8bit data is unpredictable!!! + The caller should make sure not to pass 8bit data. +*/ +static inline ulonglong my_ascii_to_upper_magic_uint64(ulonglong i) +{ + return MY_ASCII_TOUPPER_MAGIC(i); +} + + +/* + Check if: + - both strings "a" and "b" have at least 4 bytes, and + - the leading 4 bytes of both strings have only 7bit data. +*/ +static inline int +my_strcoll_ascii_4bytes_found(const uchar *a, const uchar *ae, + const uchar *b, const uchar *be) +{ + return a + 4 <= ae && b + 4 <= be && + (uint4korr(b) & 0x80808080) == 0 && + (uint4korr(a) & 0x80808080) == 0; +} + + +/* + Compare the leading four 7bit ASCII bytes in two strings case insensitively + by converting letters [a-z] to upper case [A-Z]. + + Requirements: + - The input strings must have at least four bytes, and + - The leading four bytes in both strings must be 7bit ASCII. + The caller must make sure to provide only strings that meet + these requirements. The result on 8-bit data is unpredictable + as 8-bit bytes may cause overflow in my_ascii_to_upper_magic_uint64(). + See comments above. +*/ +static inline int +my_strcoll_ascii_toupper_4bytes(const uchar *a, const uchar *b) +{ + ulonglong abn= (((ulonglong) mi_uint4korr(a)) << 32) | mi_uint4korr(b); + abn= my_ascii_to_upper_magic_uint64(abn); + if ((uint32) (abn >> 32) == (uint32) abn) + return 0; + return ((uint32) (abn >> 32)) < ((uint32) abn) ? -1 : + 1; +} + + +/* + Compare the leading eight 7bit ASCII bytes in two strings case insensitively + by converting letters [a-z] to upper case [A-Z]. + + Requirements: + - The input strings must have at least eight bytes, and + - The leading eight bytes in both strings must be 7bit ASCII. + See comments in my_strcoll_ascii_toupper_4bytes(). 
+*/ +static inline int +my_strcoll_ascii_toupper_8bytes(const uchar *a, const uchar *b) +{ + /* + TODO: + Try to get advantage of SIMD instructions by massive comparison + (16 bytes at a time) of characters against (x>='a' && x<='z') using: + - either explicit intrinsics + - or a loop that can get vectorized automatically by some compilers. + */ + ulonglong an= mi_uint8korr(a); + ulonglong bn= mi_uint8korr(b); + an= my_ascii_to_upper_magic_uint64(an); + bn= my_ascii_to_upper_magic_uint64(bn); + return an == bn ? 0 : an < bn ? -1 : +1; +} + + +/* + Compare the leading four 7bit ASCII bytes in two strings in binary style. +*/ +static inline int +my_strcoll_mb7_bin_4bytes(const uchar *a, const uchar *b) +{ + uint32 an= mi_uint4korr(a); + uint32 bn= mi_uint4korr(b); + return an == bn ? 0 : an < bn ? -1 : +1; +} + + +/* + Compare the leading eight 7bit ASCII bytes in two strings in binary style. +*/ +static inline int +my_strcoll_mb7_bin_8bytes(const uchar *a, const uchar *b) +{ + ulonglong an= mi_uint8korr(a); + ulonglong bn= mi_uint8korr(b); + return an == bn ? 0 : an < bn ? 
-1 : +1; +} + +#endif /* CTYPE_ASCII_INCLUDED */ diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c index d66a2bf8593..2491a5ff7ed 100644 --- a/strings/ctype-big5.c +++ b/strings/ctype-big5.c @@ -6691,6 +6691,7 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _big5_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (big5code(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -6707,6 +6708,7 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _big5_nopad_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (big5code(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c index 9971750ca1c..af3de05509d 100644 --- a/strings/ctype-cp932.c +++ b/strings/ctype-cp932.c @@ -34639,6 +34639,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)), #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define WEIGHT_MB1(x) (256 * (int) sort_order_cp932[(uchar) (x)]) #define WEIGHT_MB2(x,y) (cp932code(x, y)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -34646,6 +34647,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)), #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define WEIGHT_MB1(x) (256 * (int) (uchar) (x)) #define WEIGHT_MB2(x,y) (cp932code(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -34654,6 +34656,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)), #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define WEIGHT_MB1(x) (256 * (int) sort_order_cp932[(uchar) (x)]) #define WEIGHT_MB2(x,y) (cp932code(x, y)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -34662,6 +34665,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)), #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define WEIGHT_MB1(x) (256 * (int) (uchar) (x)) #define WEIGHT_MB2(x,y) (cp932code(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff 
--git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c index 4d159b29494..1f62ebaf636 100644 --- a/strings/ctype-euc_kr.c +++ b/strings/ctype-euc_kr.c @@ -9932,12 +9932,14 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_korean_ci #define WEIGHT_MB1(x) (sort_order_euc_kr[(uchar) (x)]) #define WEIGHT_MB2(x,y) (euckrcode(x, y)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" #define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (euckrcode(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -9945,6 +9947,7 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_korean_nopad_ci #define WEIGHT_MB1(x) (sort_order_euc_kr[(uchar) (x)]) #define WEIGHT_MB2(x,y) (euckrcode(x, y)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -9952,6 +9955,7 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_nopad_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (euckrcode(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c index 72b18b5ec76..ed48917e333 100644 --- a/strings/ctype-eucjpms.c +++ b/strings/ctype-eucjpms.c @@ -212,6 +212,7 @@ static const uchar sort_order_eucjpms[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -221,6 +222,7 @@ static const uchar sort_order_eucjpms[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -231,6 +233,7 @@ static const uchar sort_order_eucjpms[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 
8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -241,6 +244,7 @@ static const uchar sort_order_eucjpms[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c index 7b6b0b080f0..dd3581366fe 100644 --- a/strings/ctype-gb2312.c +++ b/strings/ctype-gb2312.c @@ -6344,6 +6344,7 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (gb2312code(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -6358,6 +6359,7 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_nopad_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (gb2312code(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c index 2501c293fb2..2e72d5bd7a4 100644 --- a/strings/ctype-gbk.c +++ b/strings/ctype-gbk.c @@ -10625,6 +10625,7 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (gbkcode(x,y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -10640,6 +10641,7 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)), #define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_nopad_bin #define WEIGHT_MB1(x) ((uchar) (x)) #define WEIGHT_MB2(x,y) (gbkcode(x,y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c index 313dfaa8f90..c3e64ce0d11 100644 --- a/strings/ctype-sjis.c +++ b/strings/ctype-sjis.c @@ -34027,6 +34027,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc, #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define 
WEIGHT_MB1(x) (256 * (int) sort_order_sjis[(uchar) (x)]) #define WEIGHT_MB2(x,y) (sjiscode(x, y)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -34034,6 +34035,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc, #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define WEIGHT_MB1(x) (256 * (int) (uchar) (x)) #define WEIGHT_MB2(x,y) (sjiscode(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -34042,6 +34044,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc, #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define WEIGHT_MB1(x) (256 * (int) sort_order_sjis[(uchar) (x)]) #define WEIGHT_MB2(x,y) (sjiscode(x, y)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -34050,6 +34053,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc, #define WEIGHT_PAD_SPACE (256 * (int) ' ') #define WEIGHT_MB1(x) (256 * (int) (uchar) (x)) #define WEIGHT_MB2(x,y) (sjiscode(x, y)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/ctype-uca-scanner_next.inl b/strings/ctype-uca-scanner_next.inl index 79d25487b42..acab31f21ef 100644 --- a/strings/ctype-uca-scanner_next.inl +++ b/strings/ctype-uca-scanner_next.inl @@ -1,5 +1,5 @@ /* Copyright (c) 2004, 2013, Oracle and/or its affiliates. - Copyright (c) 2009, 2021, MariaDB + Copyright (c) 2009, 2021, MariaDB This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public @@ -55,13 +55,8 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) #else #define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION #endif - /* - Check if the weights for the previous character have been - already fully scanned. If yes, then get the next character and - initialize wbeg and wlength to its weight string. - */ - - if (scanner->wbeg[0]) + uint16 weight= my_uca_scanner_next_expansion_weight(scanner); + if (weight) { /* More weights left from the previous step. 
@@ -69,7 +64,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) Return "0" as "nchars". The real nchars was set on a previous iteration. */ - SCANNER_NEXT_RETURN(*scanner->wbeg++, 0); + SCANNER_NEXT_RETURN(weight, 0); } #ifdef SCANNER_NEXT_NCHARS @@ -79,39 +74,44 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) #endif { const uint16 *wpage; - my_wc_t wc[MY_UCA_MAX_CONTRACTION]; int mblen; + my_wc_t currwc= 0; + const uint16 *cweight; /* Get next character */ #if MY_UCA_ASCII_OPTIMIZE /* Get next ASCII character */ if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80) { - wc[0]= scanner->sbeg[0]; + currwc= scanner->sbeg[0]; scanner->sbeg+= 1; #if MY_UCA_COMPILE_CONTRACTIONS - if (my_uca_needs_context_handling(scanner->level, wc[0])) + if (my_uca_needs_context_handling(scanner->level, currwc)) { - const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, wc, + const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, currwc, LOCAL_MAX_CONTRACTION_LENGTH); if (cnt) - SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars); + { + if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight))) + SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars); + continue; /* Ignorable contraction */ + } } #endif scanner->page= 0; - scanner->code= (int) wc[0]; - scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0]; - if (scanner->wbeg[0]) - SCANNER_NEXT_RETURN(*scanner->wbeg++, ignorable_nchars + 1); - continue; + scanner->code= (int) currwc; + cweight= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0]; + if ((weight= my_uca_scanner_set_weight(scanner, cweight))) + SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1); + continue; /* Ignorable character */ } else #endif /* Get next MB character */ - if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg, - scanner->send)) <= 0)) + if (((mblen= MY_MB_WC(scanner, &currwc, scanner->sbeg, + scanner->send)) <= 0)) { if (scanner->sbeg >= scanner->send) { 
@@ -136,26 +136,29 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) } scanner->sbeg+= mblen; - if (wc[0] > scanner->level->maxchar) + if (currwc > scanner->level->maxchar) { - /* Return 0xFFFD as weight for all characters outside BMP */ - scanner->wbeg= nochar; - SCANNER_NEXT_RETURN(0xFFFD, ignorable_nchars + 1); + SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner), + ignorable_nchars + 1); } #if MY_UCA_COMPILE_CONTRACTIONS - if (my_uca_needs_context_handling(scanner->level, wc[0])) + if (my_uca_needs_context_handling(scanner->level, currwc)) { - const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, wc, + const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, currwc, LOCAL_MAX_CONTRACTION_LENGTH); if (cnt) - SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars); + { + if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight))) + SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars); + continue; /* Ignorable contraction */ + } } #endif /* Process single character */ - scanner->page= wc[0] >> 8; - scanner->code= wc[0] & 0xFF; + scanner->page= currwc >> 8; + scanner->code= currwc & 0xFF; /* If weight page for w[0] does not exist, then calculate algoritmically */ if (!(wpage= scanner->level->weights[scanner->page])) @@ -163,14 +166,13 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner) ignorable_nchars + 1); /* Calculate pointer to w[0]'s weight, using page and offset */ - scanner->wbeg= wpage + - scanner->code * scanner->level->lengths[scanner->page]; - if (scanner->wbeg[0]) - break; - /* Skip ignorable character and continue the loop */ + cweight= wpage + scanner->code * scanner->level->lengths[scanner->page]; + if ((weight= my_uca_scanner_set_weight(scanner, cweight))) + SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1); + continue; /* Ignorable character */ } - SCANNER_NEXT_RETURN(*scanner->wbeg++, ignorable_nchars + 1); + SCANNER_NEXT_RETURN(0, 0); /* Not reachable */ } #undef SCANNER_NEXT_NCHARS diff 
--git a/strings/ctype-uca.c b/strings/ctype-uca.c index 221f81e18b0..c5b6ad6cbb3 100644 --- a/strings/ctype-uca.c +++ b/strings/ctype-uca.c @@ -31181,6 +31181,33 @@ static const uint16 nochar[]= {0,0}; #define MY_UCA_PREVIOUS_CONTEXT_HEAD 64 #define MY_UCA_PREVIOUS_CONTEXT_TAIL 128 + +static inline uint16 +my_uca_scanner_next_expansion_weight(my_uca_scanner *scanner) +{ + if (scanner->wbeg[0]) + return *scanner->wbeg++; + return 0; +} + + +static inline uint16 +my_uca_scanner_set_weight(my_uca_scanner *scanner, const uint16 *weight) +{ + scanner->wbeg= weight + 1; + return *weight; +} + + +static inline uint16 +my_uca_scanner_set_weight_outside_maxchar(my_uca_scanner *scanner) +{ + /* Return 0xFFFD as weight for all characters outside BMP */ + scanner->wbeg= nochar; + return 0xFFFD; +} + + /********** Helper functions to handle contraction ************/ @@ -31364,7 +31391,7 @@ my_uca_can_be_contraction_part(const MY_CONTRACTIONS *c, my_wc_t wc, int flag) @retval ptr - contraction weight array */ -uint16 * +const uint16 * my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2) { MY_CONTRACTION *c, *last; @@ -31449,14 +31476,30 @@ my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc) @retval non-zero - strings are different */ -static int -my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len) +static inline int +my_wmemcmp(const my_wc_t *a, const my_wc_t *b, size_t len) { return memcmp(a, b, len * sizeof(my_wc_t)); } /* + Test if the MY_CONTRACTION instance is equal to the wide + string with the given length. + Note, only true contractions are checked, + while previous context pairs always return FALSE. +*/ +static inline my_bool +my_uca_true_contraction_eq(const MY_CONTRACTION *c, + const my_wc_t *wc, size_t len) +{ + return (len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) && + !c->with_context && + !my_wmemcmp(c->ch, wc, len); +} + + +/* Return the number of characters in a contraction. 
*/ static inline uint my_contraction_char_length(const MY_CONTRACTION *cnt) @@ -31492,9 +31535,7 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len) for (c= list->item, last= c + list->nitems; c < last; c++) { - if ((len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) && - !c->with_context && - !my_wmemcmp(c->ch, wc, len)) + if (my_uca_true_contraction_eq(c, wc, len)) return c; } return NULL; @@ -31518,12 +31559,15 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len) */ static const MY_CONTRACTION * -my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc, +my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc, size_t max_char_length) { size_t clen= 1; int flag; const uchar *s, *beg[MY_UCA_MAX_CONTRACTION]; + my_wc_t wc[MY_UCA_MAX_CONTRACTION]; + wc[0]= currwc; + memset((void*) beg, 0, sizeof(beg)); /* Scan all contraction candidates */ @@ -31549,7 +31593,6 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc, (cnt= my_uca_contraction_find(&scanner->level->contractions, wc, clen))) { - scanner->wbeg= cnt->weight + 1; scanner->sbeg= beg[clen - 1]; return cnt; } @@ -31573,18 +31616,14 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc, */ static const MY_CONTRACTION * -my_uca_previous_context_find(my_uca_scanner *scanner, +my_uca_previous_context_find(const MY_CONTRACTIONS *list, my_wc_t wc0, my_wc_t wc1) { - const MY_CONTRACTIONS *list= &scanner->level->contractions; MY_CONTRACTION *c, *last; for (c= list->item, last= c + list->nitems; c < last; c++) { if (c->with_context && wc0 == c->ch[0] && wc1 == c->ch[1]) - { - scanner->wbeg= c->weight + 1; return c; - } } return NULL; } @@ -31610,10 +31649,11 @@ my_uca_previous_context_find(my_uca_scanner *scanner, @retval non null pointer - the address of MY_CONTRACTION found */ static inline const MY_CONTRACTION * -my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc, 
+my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc, size_t max_char_length) { const MY_CONTRACTION *cnt; + my_wc_t prevwc; DBUG_ASSERT(scanner->level->contractions.nitems); /* If we have scanned a character which can have previous context, @@ -31625,21 +31665,22 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc, context at the moment. CLDR does not have longer sequences. */ if (my_uca_can_be_previous_context_tail(&scanner->level->contractions, - wc[0]) && + currwc) && scanner->wbeg != nochar && /* if not the very first character */ my_uca_can_be_previous_context_head(&scanner->level->contractions, - (wc[1]= ((scanner->page << 8) + + (prevwc= ((scanner->page << 8) + scanner->code))) && - (cnt= my_uca_previous_context_find(scanner, wc[1], wc[0]))) + (cnt= my_uca_previous_context_find(&scanner->level->contractions, + prevwc, currwc))) { scanner->page= scanner->code= 0; /* Clear for the next character */ return cnt; } else if (my_uca_can_be_contraction_head(&scanner->level->contractions, - wc[0])) + currwc)) { - /* Check if w[0] starts a contraction */ - if ((cnt= my_uca_scanner_contraction_find(scanner, wc, max_char_length))) + /* Check if currwc starts a contraction */ + if ((cnt= my_uca_scanner_contraction_find(scanner, currwc, max_char_length))) return cnt; } return NULL; diff --git a/strings/ctype-uca.inl b/strings/ctype-uca.inl index 1fc3480e5b5..6cf31ace11a 100644 --- a/strings/ctype-uca.inl +++ b/strings/ctype-uca.inl @@ -36,6 +36,7 @@ #error MY_UCA_COLL_INIT is not defined #endif + #include "ctype-uca-scanner_next.inl" #define SCANNER_NEXT_NCHARS #include "ctype-uca-scanner_next.inl" diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c index fb0ab7be6a6..adcd4825d88 100644 --- a/strings/ctype-ujis.c +++ b/strings/ctype-ujis.c @@ -211,6 +211,7 @@ static const uchar sort_order_ujis[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) 
(uchar) z)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -220,6 +221,7 @@ static const uchar sort_order_ujis[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -230,6 +232,7 @@ static const uchar sort_order_ujis[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z)) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -240,6 +243,7 @@ static const uchar sort_order_ujis[]= #define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \ (((uint) (uchar) (y)) << 8)) #define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index 742eeb912e3..611684ff706 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1036,6 +1036,268 @@ static MY_UNICASE_CHARACTER plane05[]={ {0x05FE,0x05FE,0x05FE}, {0x05FF,0x05FF,0x05FF} }; +static MY_UNICASE_CHARACTER plane06[]={ /* This page is dummy */ + {0x0600,0x0600,0x0600}, {0x0601,0x0601,0x0601}, /* 0600 */ + {0x0602,0x0602,0x0602}, {0x0603,0x0603,0x0603}, /* 0602 */ + {0x0604,0x0604,0x0604}, {0x0605,0x0605,0x0605}, /* 0604 */ + {0x0606,0x0606,0x0606}, {0x0607,0x0607,0x0607}, /* 0606 */ + {0x0608,0x0608,0x0608}, {0x0609,0x0609,0x0609}, /* 0608 */ + {0x060A,0x060A,0x060A}, {0x060B,0x060B,0x060B}, /* 060A */ + {0x060C,0x060C,0x060C}, {0x060D,0x060D,0x060D}, /* 060C */ + {0x060E,0x060E,0x060E}, {0x060F,0x060F,0x060F}, /* 060E */ + {0x0610,0x0610,0x0610}, {0x0611,0x0611,0x0611}, /* 0610 */ + {0x0612,0x0612,0x0612}, {0x0613,0x0613,0x0613}, /* 0612 */ + {0x0614,0x0614,0x0614}, {0x0615,0x0615,0x0615}, /* 0614 */ + {0x0616,0x0616,0x0616}, {0x0617,0x0617,0x0617}, /* 0616 */ + {0x0618,0x0618,0x0618}, {0x0619,0x0619,0x0619}, /* 0618 */ + {0x061A,0x061A,0x061A}, 
{0x061B,0x061B,0x061B}, /* 061A */ + {0x061C,0x061C,0x061C}, {0x061D,0x061D,0x061D}, /* 061C */ + {0x061E,0x061E,0x061E}, {0x061F,0x061F,0x061F}, /* 061E */ + {0x0620,0x0620,0x0620}, {0x0621,0x0621,0x0621}, /* 0620 */ + {0x0622,0x0622,0x0622}, {0x0623,0x0623,0x0623}, /* 0622 */ + {0x0624,0x0624,0x0624}, {0x0625,0x0625,0x0625}, /* 0624 */ + {0x0626,0x0626,0x0626}, {0x0627,0x0627,0x0627}, /* 0626 */ + {0x0628,0x0628,0x0628}, {0x0629,0x0629,0x0629}, /* 0628 */ + {0x062A,0x062A,0x062A}, {0x062B,0x062B,0x062B}, /* 062A */ + {0x062C,0x062C,0x062C}, {0x062D,0x062D,0x062D}, /* 062C */ + {0x062E,0x062E,0x062E}, {0x062F,0x062F,0x062F}, /* 062E */ + {0x0630,0x0630,0x0630}, {0x0631,0x0631,0x0631}, /* 0630 */ + {0x0632,0x0632,0x0632}, {0x0633,0x0633,0x0633}, /* 0632 */ + {0x0634,0x0634,0x0634}, {0x0635,0x0635,0x0635}, /* 0634 */ + {0x0636,0x0636,0x0636}, {0x0637,0x0637,0x0637}, /* 0636 */ + {0x0638,0x0638,0x0638}, {0x0639,0x0639,0x0639}, /* 0638 */ + {0x063A,0x063A,0x063A}, {0x063B,0x063B,0x063B}, /* 063A */ + {0x063C,0x063C,0x063C}, {0x063D,0x063D,0x063D}, /* 063C */ + {0x063E,0x063E,0x063E}, {0x063F,0x063F,0x063F}, /* 063E */ + {0x0640,0x0640,0x0640}, {0x0641,0x0641,0x0641}, /* 0640 */ + {0x0642,0x0642,0x0642}, {0x0643,0x0643,0x0643}, /* 0642 */ + {0x0644,0x0644,0x0644}, {0x0645,0x0645,0x0645}, /* 0644 */ + {0x0646,0x0646,0x0646}, {0x0647,0x0647,0x0647}, /* 0646 */ + {0x0648,0x0648,0x0648}, {0x0649,0x0649,0x0649}, /* 0648 */ + {0x064A,0x064A,0x064A}, {0x064B,0x064B,0x064B}, /* 064A */ + {0x064C,0x064C,0x064C}, {0x064D,0x064D,0x064D}, /* 064C */ + {0x064E,0x064E,0x064E}, {0x064F,0x064F,0x064F}, /* 064E */ + {0x0650,0x0650,0x0650}, {0x0651,0x0651,0x0651}, /* 0650 */ + {0x0652,0x0652,0x0652}, {0x0653,0x0653,0x0653}, /* 0652 */ + {0x0654,0x0654,0x0654}, {0x0655,0x0655,0x0655}, /* 0654 */ + {0x0656,0x0656,0x0656}, {0x0657,0x0657,0x0657}, /* 0656 */ + {0x0658,0x0658,0x0658}, {0x0659,0x0659,0x0659}, /* 0658 */ + {0x065A,0x065A,0x065A}, {0x065B,0x065B,0x065B}, /* 065A */ + 
{0x065C,0x065C,0x065C}, {0x065D,0x065D,0x065D}, /* 065C */ + {0x065E,0x065E,0x065E}, {0x065F,0x065F,0x065F}, /* 065E */ + {0x0660,0x0660,0x0660}, {0x0661,0x0661,0x0661}, /* 0660 */ + {0x0662,0x0662,0x0662}, {0x0663,0x0663,0x0663}, /* 0662 */ + {0x0664,0x0664,0x0664}, {0x0665,0x0665,0x0665}, /* 0664 */ + {0x0666,0x0666,0x0666}, {0x0667,0x0667,0x0667}, /* 0666 */ + {0x0668,0x0668,0x0668}, {0x0669,0x0669,0x0669}, /* 0668 */ + {0x066A,0x066A,0x066A}, {0x066B,0x066B,0x066B}, /* 066A */ + {0x066C,0x066C,0x066C}, {0x066D,0x066D,0x066D}, /* 066C */ + {0x066E,0x066E,0x066E}, {0x066F,0x066F,0x066F}, /* 066E */ + {0x0670,0x0670,0x0670}, {0x0671,0x0671,0x0671}, /* 0670 */ + {0x0672,0x0672,0x0672}, {0x0673,0x0673,0x0673}, /* 0672 */ + {0x0674,0x0674,0x0674}, {0x0675,0x0675,0x0675}, /* 0674 */ + {0x0676,0x0676,0x0676}, {0x0677,0x0677,0x0677}, /* 0676 */ + {0x0678,0x0678,0x0678}, {0x0679,0x0679,0x0679}, /* 0678 */ + {0x067A,0x067A,0x067A}, {0x067B,0x067B,0x067B}, /* 067A */ + {0x067C,0x067C,0x067C}, {0x067D,0x067D,0x067D}, /* 067C */ + {0x067E,0x067E,0x067E}, {0x067F,0x067F,0x067F}, /* 067E */ + {0x0680,0x0680,0x0680}, {0x0681,0x0681,0x0681}, /* 0680 */ + {0x0682,0x0682,0x0682}, {0x0683,0x0683,0x0683}, /* 0682 */ + {0x0684,0x0684,0x0684}, {0x0685,0x0685,0x0685}, /* 0684 */ + {0x0686,0x0686,0x0686}, {0x0687,0x0687,0x0687}, /* 0686 */ + {0x0688,0x0688,0x0688}, {0x0689,0x0689,0x0689}, /* 0688 */ + {0x068A,0x068A,0x068A}, {0x068B,0x068B,0x068B}, /* 068A */ + {0x068C,0x068C,0x068C}, {0x068D,0x068D,0x068D}, /* 068C */ + {0x068E,0x068E,0x068E}, {0x068F,0x068F,0x068F}, /* 068E */ + {0x0690,0x0690,0x0690}, {0x0691,0x0691,0x0691}, /* 0690 */ + {0x0692,0x0692,0x0692}, {0x0693,0x0693,0x0693}, /* 0692 */ + {0x0694,0x0694,0x0694}, {0x0695,0x0695,0x0695}, /* 0694 */ + {0x0696,0x0696,0x0696}, {0x0697,0x0697,0x0697}, /* 0696 */ + {0x0698,0x0698,0x0698}, {0x0699,0x0699,0x0699}, /* 0698 */ + {0x069A,0x069A,0x069A}, {0x069B,0x069B,0x069B}, /* 069A */ + {0x069C,0x069C,0x069C}, {0x069D,0x069D,0x069D}, 
/* 069C */ + {0x069E,0x069E,0x069E}, {0x069F,0x069F,0x069F}, /* 069E */ + {0x06A0,0x06A0,0x06A0}, {0x06A1,0x06A1,0x06A1}, /* 06A0 */ + {0x06A2,0x06A2,0x06A2}, {0x06A3,0x06A3,0x06A3}, /* 06A2 */ + {0x06A4,0x06A4,0x06A4}, {0x06A5,0x06A5,0x06A5}, /* 06A4 */ + {0x06A6,0x06A6,0x06A6}, {0x06A7,0x06A7,0x06A7}, /* 06A6 */ + {0x06A8,0x06A8,0x06A8}, {0x06A9,0x06A9,0x06A9}, /* 06A8 */ + {0x06AA,0x06AA,0x06AA}, {0x06AB,0x06AB,0x06AB}, /* 06AA */ + {0x06AC,0x06AC,0x06AC}, {0x06AD,0x06AD,0x06AD}, /* 06AC */ + {0x06AE,0x06AE,0x06AE}, {0x06AF,0x06AF,0x06AF}, /* 06AE */ + {0x06B0,0x06B0,0x06B0}, {0x06B1,0x06B1,0x06B1}, /* 06B0 */ + {0x06B2,0x06B2,0x06B2}, {0x06B3,0x06B3,0x06B3}, /* 06B2 */ + {0x06B4,0x06B4,0x06B4}, {0x06B5,0x06B5,0x06B5}, /* 06B4 */ + {0x06B6,0x06B6,0x06B6}, {0x06B7,0x06B7,0x06B7}, /* 06B6 */ + {0x06B8,0x06B8,0x06B8}, {0x06B9,0x06B9,0x06B9}, /* 06B8 */ + {0x06BA,0x06BA,0x06BA}, {0x06BB,0x06BB,0x06BB}, /* 06BA */ + {0x06BC,0x06BC,0x06BC}, {0x06BD,0x06BD,0x06BD}, /* 06BC */ + {0x06BE,0x06BE,0x06BE}, {0x06BF,0x06BF,0x06BF}, /* 06BE */ + {0x06C0,0x06C0,0x06C0}, {0x06C1,0x06C1,0x06C1}, /* 06C0 */ + {0x06C2,0x06C2,0x06C2}, {0x06C3,0x06C3,0x06C3}, /* 06C2 */ + {0x06C4,0x06C4,0x06C4}, {0x06C5,0x06C5,0x06C5}, /* 06C4 */ + {0x06C6,0x06C6,0x06C6}, {0x06C7,0x06C7,0x06C7}, /* 06C6 */ + {0x06C8,0x06C8,0x06C8}, {0x06C9,0x06C9,0x06C9}, /* 06C8 */ + {0x06CA,0x06CA,0x06CA}, {0x06CB,0x06CB,0x06CB}, /* 06CA */ + {0x06CC,0x06CC,0x06CC}, {0x06CD,0x06CD,0x06CD}, /* 06CC */ + {0x06CE,0x06CE,0x06CE}, {0x06CF,0x06CF,0x06CF}, /* 06CE */ + {0x06D0,0x06D0,0x06D0}, {0x06D1,0x06D1,0x06D1}, /* 06D0 */ + {0x06D2,0x06D2,0x06D2}, {0x06D3,0x06D3,0x06D3}, /* 06D2 */ + {0x06D4,0x06D4,0x06D4}, {0x06D5,0x06D5,0x06D5}, /* 06D4 */ + {0x06D6,0x06D6,0x06D6}, {0x06D7,0x06D7,0x06D7}, /* 06D6 */ + {0x06D8,0x06D8,0x06D8}, {0x06D9,0x06D9,0x06D9}, /* 06D8 */ + {0x06DA,0x06DA,0x06DA}, {0x06DB,0x06DB,0x06DB}, /* 06DA */ + {0x06DC,0x06DC,0x06DC}, {0x06DD,0x06DD,0x06DD}, /* 06DC */ + {0x06DE,0x06DE,0x06DE}, 
{0x06DF,0x06DF,0x06DF}, /* 06DE */ + {0x06E0,0x06E0,0x06E0}, {0x06E1,0x06E1,0x06E1}, /* 06E0 */ + {0x06E2,0x06E2,0x06E2}, {0x06E3,0x06E3,0x06E3}, /* 06E2 */ + {0x06E4,0x06E4,0x06E4}, {0x06E5,0x06E5,0x06E5}, /* 06E4 */ + {0x06E6,0x06E6,0x06E6}, {0x06E7,0x06E7,0x06E7}, /* 06E6 */ + {0x06E8,0x06E8,0x06E8}, {0x06E9,0x06E9,0x06E9}, /* 06E8 */ + {0x06EA,0x06EA,0x06EA}, {0x06EB,0x06EB,0x06EB}, /* 06EA */ + {0x06EC,0x06EC,0x06EC}, {0x06ED,0x06ED,0x06ED}, /* 06EC */ + {0x06EE,0x06EE,0x06EE}, {0x06EF,0x06EF,0x06EF}, /* 06EE */ + {0x06F0,0x06F0,0x06F0}, {0x06F1,0x06F1,0x06F1}, /* 06F0 */ + {0x06F2,0x06F2,0x06F2}, {0x06F3,0x06F3,0x06F3}, /* 06F2 */ + {0x06F4,0x06F4,0x06F4}, {0x06F5,0x06F5,0x06F5}, /* 06F4 */ + {0x06F6,0x06F6,0x06F6}, {0x06F7,0x06F7,0x06F7}, /* 06F6 */ + {0x06F8,0x06F8,0x06F8}, {0x06F9,0x06F9,0x06F9}, /* 06F8 */ + {0x06FA,0x06FA,0x06FA}, {0x06FB,0x06FB,0x06FB}, /* 06FA */ + {0x06FC,0x06FC,0x06FC}, {0x06FD,0x06FD,0x06FD}, /* 06FC */ + {0x06FE,0x06FE,0x06FE}, {0x06FF,0x06FF,0x06FF} /* 06FE */ +}; + +static MY_UNICASE_CHARACTER plane07[]={ /* This page is dummy */ + {0x0700,0x0700,0x0700}, {0x0701,0x0701,0x0701}, /* 0700 */ + {0x0702,0x0702,0x0702}, {0x0703,0x0703,0x0703}, /* 0702 */ + {0x0704,0x0704,0x0704}, {0x0705,0x0705,0x0705}, /* 0704 */ + {0x0706,0x0706,0x0706}, {0x0707,0x0707,0x0707}, /* 0706 */ + {0x0708,0x0708,0x0708}, {0x0709,0x0709,0x0709}, /* 0708 */ + {0x070A,0x070A,0x070A}, {0x070B,0x070B,0x070B}, /* 070A */ + {0x070C,0x070C,0x070C}, {0x070D,0x070D,0x070D}, /* 070C */ + {0x070E,0x070E,0x070E}, {0x070F,0x070F,0x070F}, /* 070E */ + {0x0710,0x0710,0x0710}, {0x0711,0x0711,0x0711}, /* 0710 */ + {0x0712,0x0712,0x0712}, {0x0713,0x0713,0x0713}, /* 0712 */ + {0x0714,0x0714,0x0714}, {0x0715,0x0715,0x0715}, /* 0714 */ + {0x0716,0x0716,0x0716}, {0x0717,0x0717,0x0717}, /* 0716 */ + {0x0718,0x0718,0x0718}, {0x0719,0x0719,0x0719}, /* 0718 */ + {0x071A,0x071A,0x071A}, {0x071B,0x071B,0x071B}, /* 071A */ + {0x071C,0x071C,0x071C}, {0x071D,0x071D,0x071D}, /* 071C */ + 
{0x071E,0x071E,0x071E}, {0x071F,0x071F,0x071F}, /* 071E */ + {0x0720,0x0720,0x0720}, {0x0721,0x0721,0x0721}, /* 0720 */ + {0x0722,0x0722,0x0722}, {0x0723,0x0723,0x0723}, /* 0722 */ + {0x0724,0x0724,0x0724}, {0x0725,0x0725,0x0725}, /* 0724 */ + {0x0726,0x0726,0x0726}, {0x0727,0x0727,0x0727}, /* 0726 */ + {0x0728,0x0728,0x0728}, {0x0729,0x0729,0x0729}, /* 0728 */ + {0x072A,0x072A,0x072A}, {0x072B,0x072B,0x072B}, /* 072A */ + {0x072C,0x072C,0x072C}, {0x072D,0x072D,0x072D}, /* 072C */ + {0x072E,0x072E,0x072E}, {0x072F,0x072F,0x072F}, /* 072E */ + {0x0730,0x0730,0x0730}, {0x0731,0x0731,0x0731}, /* 0730 */ + {0x0732,0x0732,0x0732}, {0x0733,0x0733,0x0733}, /* 0732 */ + {0x0734,0x0734,0x0734}, {0x0735,0x0735,0x0735}, /* 0734 */ + {0x0736,0x0736,0x0736}, {0x0737,0x0737,0x0737}, /* 0736 */ + {0x0738,0x0738,0x0738}, {0x0739,0x0739,0x0739}, /* 0738 */ + {0x073A,0x073A,0x073A}, {0x073B,0x073B,0x073B}, /* 073A */ + {0x073C,0x073C,0x073C}, {0x073D,0x073D,0x073D}, /* 073C */ + {0x073E,0x073E,0x073E}, {0x073F,0x073F,0x073F}, /* 073E */ + {0x0740,0x0740,0x0740}, {0x0741,0x0741,0x0741}, /* 0740 */ + {0x0742,0x0742,0x0742}, {0x0743,0x0743,0x0743}, /* 0742 */ + {0x0744,0x0744,0x0744}, {0x0745,0x0745,0x0745}, /* 0744 */ + {0x0746,0x0746,0x0746}, {0x0747,0x0747,0x0747}, /* 0746 */ + {0x0748,0x0748,0x0748}, {0x0749,0x0749,0x0749}, /* 0748 */ + {0x074A,0x074A,0x074A}, {0x074B,0x074B,0x074B}, /* 074A */ + {0x074C,0x074C,0x074C}, {0x074D,0x074D,0x074D}, /* 074C */ + {0x074E,0x074E,0x074E}, {0x074F,0x074F,0x074F}, /* 074E */ + {0x0750,0x0750,0x0750}, {0x0751,0x0751,0x0751}, /* 0750 */ + {0x0752,0x0752,0x0752}, {0x0753,0x0753,0x0753}, /* 0752 */ + {0x0754,0x0754,0x0754}, {0x0755,0x0755,0x0755}, /* 0754 */ + {0x0756,0x0756,0x0756}, {0x0757,0x0757,0x0757}, /* 0756 */ + {0x0758,0x0758,0x0758}, {0x0759,0x0759,0x0759}, /* 0758 */ + {0x075A,0x075A,0x075A}, {0x075B,0x075B,0x075B}, /* 075A */ + {0x075C,0x075C,0x075C}, {0x075D,0x075D,0x075D}, /* 075C */ + {0x075E,0x075E,0x075E}, {0x075F,0x075F,0x075F}, 
/* 075E */ + {0x0760,0x0760,0x0760}, {0x0761,0x0761,0x0761}, /* 0760 */ + {0x0762,0x0762,0x0762}, {0x0763,0x0763,0x0763}, /* 0762 */ + {0x0764,0x0764,0x0764}, {0x0765,0x0765,0x0765}, /* 0764 */ + {0x0766,0x0766,0x0766}, {0x0767,0x0767,0x0767}, /* 0766 */ + {0x0768,0x0768,0x0768}, {0x0769,0x0769,0x0769}, /* 0768 */ + {0x076A,0x076A,0x076A}, {0x076B,0x076B,0x076B}, /* 076A */ + {0x076C,0x076C,0x076C}, {0x076D,0x076D,0x076D}, /* 076C */ + {0x076E,0x076E,0x076E}, {0x076F,0x076F,0x076F}, /* 076E */ + {0x0770,0x0770,0x0770}, {0x0771,0x0771,0x0771}, /* 0770 */ + {0x0772,0x0772,0x0772}, {0x0773,0x0773,0x0773}, /* 0772 */ + {0x0774,0x0774,0x0774}, {0x0775,0x0775,0x0775}, /* 0774 */ + {0x0776,0x0776,0x0776}, {0x0777,0x0777,0x0777}, /* 0776 */ + {0x0778,0x0778,0x0778}, {0x0779,0x0779,0x0779}, /* 0778 */ + {0x077A,0x077A,0x077A}, {0x077B,0x077B,0x077B}, /* 077A */ + {0x077C,0x077C,0x077C}, {0x077D,0x077D,0x077D}, /* 077C */ + {0x077E,0x077E,0x077E}, {0x077F,0x077F,0x077F}, /* 077E */ + {0x0780,0x0780,0x0780}, {0x0781,0x0781,0x0781}, /* 0780 */ + {0x0782,0x0782,0x0782}, {0x0783,0x0783,0x0783}, /* 0782 */ + {0x0784,0x0784,0x0784}, {0x0785,0x0785,0x0785}, /* 0784 */ + {0x0786,0x0786,0x0786}, {0x0787,0x0787,0x0787}, /* 0786 */ + {0x0788,0x0788,0x0788}, {0x0789,0x0789,0x0789}, /* 0788 */ + {0x078A,0x078A,0x078A}, {0x078B,0x078B,0x078B}, /* 078A */ + {0x078C,0x078C,0x078C}, {0x078D,0x078D,0x078D}, /* 078C */ + {0x078E,0x078E,0x078E}, {0x078F,0x078F,0x078F}, /* 078E */ + {0x0790,0x0790,0x0790}, {0x0791,0x0791,0x0791}, /* 0790 */ + {0x0792,0x0792,0x0792}, {0x0793,0x0793,0x0793}, /* 0792 */ + {0x0794,0x0794,0x0794}, {0x0795,0x0795,0x0795}, /* 0794 */ + {0x0796,0x0796,0x0796}, {0x0797,0x0797,0x0797}, /* 0796 */ + {0x0798,0x0798,0x0798}, {0x0799,0x0799,0x0799}, /* 0798 */ + {0x079A,0x079A,0x079A}, {0x079B,0x079B,0x079B}, /* 079A */ + {0x079C,0x079C,0x079C}, {0x079D,0x079D,0x079D}, /* 079C */ + {0x079E,0x079E,0x079E}, {0x079F,0x079F,0x079F}, /* 079E */ + {0x07A0,0x07A0,0x07A0}, 
{0x07A1,0x07A1,0x07A1}, /* 07A0 */ + {0x07A2,0x07A2,0x07A2}, {0x07A3,0x07A3,0x07A3}, /* 07A2 */ + {0x07A4,0x07A4,0x07A4}, {0x07A5,0x07A5,0x07A5}, /* 07A4 */ + {0x07A6,0x07A6,0x07A6}, {0x07A7,0x07A7,0x07A7}, /* 07A6 */ + {0x07A8,0x07A8,0x07A8}, {0x07A9,0x07A9,0x07A9}, /* 07A8 */ + {0x07AA,0x07AA,0x07AA}, {0x07AB,0x07AB,0x07AB}, /* 07AA */ + {0x07AC,0x07AC,0x07AC}, {0x07AD,0x07AD,0x07AD}, /* 07AC */ + {0x07AE,0x07AE,0x07AE}, {0x07AF,0x07AF,0x07AF}, /* 07AE */ + {0x07B0,0x07B0,0x07B0}, {0x07B1,0x07B1,0x07B1}, /* 07B0 */ + {0x07B2,0x07B2,0x07B2}, {0x07B3,0x07B3,0x07B3}, /* 07B2 */ + {0x07B4,0x07B4,0x07B4}, {0x07B5,0x07B5,0x07B5}, /* 07B4 */ + {0x07B6,0x07B6,0x07B6}, {0x07B7,0x07B7,0x07B7}, /* 07B6 */ + {0x07B8,0x07B8,0x07B8}, {0x07B9,0x07B9,0x07B9}, /* 07B8 */ + {0x07BA,0x07BA,0x07BA}, {0x07BB,0x07BB,0x07BB}, /* 07BA */ + {0x07BC,0x07BC,0x07BC}, {0x07BD,0x07BD,0x07BD}, /* 07BC */ + {0x07BE,0x07BE,0x07BE}, {0x07BF,0x07BF,0x07BF}, /* 07BE */ + {0x07C0,0x07C0,0x07C0}, {0x07C1,0x07C1,0x07C1}, /* 07C0 */ + {0x07C2,0x07C2,0x07C2}, {0x07C3,0x07C3,0x07C3}, /* 07C2 */ + {0x07C4,0x07C4,0x07C4}, {0x07C5,0x07C5,0x07C5}, /* 07C4 */ + {0x07C6,0x07C6,0x07C6}, {0x07C7,0x07C7,0x07C7}, /* 07C6 */ + {0x07C8,0x07C8,0x07C8}, {0x07C9,0x07C9,0x07C9}, /* 07C8 */ + {0x07CA,0x07CA,0x07CA}, {0x07CB,0x07CB,0x07CB}, /* 07CA */ + {0x07CC,0x07CC,0x07CC}, {0x07CD,0x07CD,0x07CD}, /* 07CC */ + {0x07CE,0x07CE,0x07CE}, {0x07CF,0x07CF,0x07CF}, /* 07CE */ + {0x07D0,0x07D0,0x07D0}, {0x07D1,0x07D1,0x07D1}, /* 07D0 */ + {0x07D2,0x07D2,0x07D2}, {0x07D3,0x07D3,0x07D3}, /* 07D2 */ + {0x07D4,0x07D4,0x07D4}, {0x07D5,0x07D5,0x07D5}, /* 07D4 */ + {0x07D6,0x07D6,0x07D6}, {0x07D7,0x07D7,0x07D7}, /* 07D6 */ + {0x07D8,0x07D8,0x07D8}, {0x07D9,0x07D9,0x07D9}, /* 07D8 */ + {0x07DA,0x07DA,0x07DA}, {0x07DB,0x07DB,0x07DB}, /* 07DA */ + {0x07DC,0x07DC,0x07DC}, {0x07DD,0x07DD,0x07DD}, /* 07DC */ + {0x07DE,0x07DE,0x07DE}, {0x07DF,0x07DF,0x07DF}, /* 07DE */ + {0x07E0,0x07E0,0x07E0}, {0x07E1,0x07E1,0x07E1}, /* 07E0 */ + 
{0x07E2,0x07E2,0x07E2}, {0x07E3,0x07E3,0x07E3}, /* 07E2 */ + {0x07E4,0x07E4,0x07E4}, {0x07E5,0x07E5,0x07E5}, /* 07E4 */ + {0x07E6,0x07E6,0x07E6}, {0x07E7,0x07E7,0x07E7}, /* 07E6 */ + {0x07E8,0x07E8,0x07E8}, {0x07E9,0x07E9,0x07E9}, /* 07E8 */ + {0x07EA,0x07EA,0x07EA}, {0x07EB,0x07EB,0x07EB}, /* 07EA */ + {0x07EC,0x07EC,0x07EC}, {0x07ED,0x07ED,0x07ED}, /* 07EC */ + {0x07EE,0x07EE,0x07EE}, {0x07EF,0x07EF,0x07EF}, /* 07EE */ + {0x07F0,0x07F0,0x07F0}, {0x07F1,0x07F1,0x07F1}, /* 07F0 */ + {0x07F2,0x07F2,0x07F2}, {0x07F3,0x07F3,0x07F3}, /* 07F2 */ + {0x07F4,0x07F4,0x07F4}, {0x07F5,0x07F5,0x07F5}, /* 07F4 */ + {0x07F6,0x07F6,0x07F6}, {0x07F7,0x07F7,0x07F7}, /* 07F6 */ + {0x07F8,0x07F8,0x07F8}, {0x07F9,0x07F9,0x07F9}, /* 07F8 */ + {0x07FA,0x07FA,0x07FA}, {0x07FB,0x07FB,0x07FB}, /* 07FA */ + {0x07FC,0x07FC,0x07FC}, {0x07FD,0x07FD,0x07FD}, /* 07FC */ + {0x07FE,0x07FE,0x07FE}, {0x07FF,0x07FF,0x07FF} /* 07FE */ +}; + static MY_UNICASE_CHARACTER plane1E[]={ {0x1E00,0x1E01,0x0041}, {0x1E00,0x1E01,0x0041}, {0x1E02,0x1E03,0x0042}, {0x1E02,0x1E03,0x0042}, @@ -1695,7 +1957,7 @@ static MY_UNICASE_CHARACTER planeFF[]={ MY_UNICASE_CHARACTER *my_unicase_default_pages[256]= { my_unicase_default_page00, - plane01, plane02, plane03, plane04, plane05, NULL, NULL, + plane01, plane02, plane03, plane04, plane05, plane06, plane07, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F, @@ -1742,7 +2004,7 @@ MY_UNICASE_INFO my_unicase_default= */ MY_UNICASE_CHARACTER *my_unicase_pages_mysql500[256]={ plane00_mysql500, - plane01, plane02, plane03, plane04, plane05, NULL, NULL, + plane01, plane02, plane03, plane04, plane05, plane06, plane07, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F, @@ -1929,7 +2191,7 @@ static MY_UNICASE_CHARACTER turk00[]= static MY_UNICASE_CHARACTER *my_unicase_pages_turkish[256]= 
{ - turk00, plane01, plane02, plane03, plane04, plane05, NULL, NULL, + turk00, plane01, plane02, plane03, plane04, plane05, plane06, plane07, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F, @@ -4333,7 +4595,7 @@ static MY_UNICASE_CHARACTER u520p104[]={ MY_UNICASE_CHARACTER *my_unicase_pages_unicode520[4352]= { - u520p00, u520p01, u520p02, u520p03, u520p04, u520p05, NULL, NULL, + u520p00, u520p01, u520p02, u520p03, u520p04, u520p05, plane06, plane07, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, u520p10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, u520p1D, u520p1E, u520p1F, @@ -5231,7 +5493,17 @@ static inline int my_weight_mb2_utf8mb3_general_ci(uchar b0, uchar b1) { my_wc_t wc= UTF8MB2_CODE(b0, b1); MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8]; - return (int) (page ? page[wc & 0xFF].sort : wc); + /* + 2-byte utf8 sequences encode Unicode characters up to U+07FF. + my_unicase_default_pages[N] has non-NULL page pointers + for all N in the range [0..7]. + - my_unicase_default_pages[0..5] point to real translation data + - my_unicase_default_pages[6..7] point to dummy pages + (without real translation). + By adding these dummy pages we can avoid testing 'page' against NULL. + This gives up to 20% performance improvement. 
+ */ + return (int) page[wc & 0xFF].sort; } @@ -5255,6 +5527,7 @@ static inline int my_weight_mb3_utf8mb3_general_ci(uchar b0, uchar b1, uchar b2) #define WEIGHT_MB1(x) my_weight_mb1_utf8mb3_general_ci(x) #define WEIGHT_MB2(x,y) my_weight_mb2_utf8mb3_general_ci(x,y) #define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8mb3_general_ci(x,y,z) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -5264,6 +5537,7 @@ static inline int my_weight_mb3_utf8mb3_general_ci(uchar b0, uchar b1, uchar b2) #define WEIGHT_MB1(x) my_weight_mb1_utf8mb3_general_ci(x) #define WEIGHT_MB2(x,y) my_weight_mb2_utf8mb3_general_ci(x,y) #define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8mb3_general_ci(x,y,z) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -5277,7 +5551,11 @@ static inline int my_weight_mb2_utf8mb3_general_mysql500_ci(uchar b0, uchar b1) { my_wc_t wc= UTF8MB2_CODE(b0, b1); MY_UNICASE_CHARACTER *page= my_unicase_pages_mysql500[wc >> 8]; - return (int) (page ? page[wc & 0xFF].sort : wc); + /* + `page` should never be NULL for 2-byte utf8 characters. + See comments in my_weight_mb2_utf8mb3_general_ci(). 
+ */ + return (int) page[wc & 0xFF].sort; } @@ -5301,6 +5579,7 @@ my_weight_mb3_utf8mb3_general_mysql500_ci(uchar b0, uchar b1, uchar b2) #define WEIGHT_MB1(x) my_weight_mb1_utf8mb3_general_mysql500_ci(x) #define WEIGHT_MB2(x,y) my_weight_mb2_utf8mb3_general_mysql500_ci(x,y) #define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8mb3_general_mysql500_ci(x,y,z) +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -5312,6 +5591,7 @@ my_weight_mb3_utf8mb3_general_mysql500_ci(uchar b0, uchar b1, uchar b2) #define WEIGHT_MB1(x) ((int) (uchar) (x)) #define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) #define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -5321,6 +5601,7 @@ my_weight_mb3_utf8mb3_general_mysql500_ci(uchar b0, uchar b1, uchar b2) #define WEIGHT_MB1(x) ((int) (uchar) (x)) #define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y)) #define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" /* @@ -7692,6 +7973,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), All non-BMP characters have the same weight. */ #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -7701,6 +7983,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), #define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1)) #define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2)) #define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" @@ -7715,6 +7998,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), All non-BMP characters have the same weight. 
*/ #define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER +#define STRCOLL_MB7_TOUPPER #include "strcoll.inl" @@ -7725,6 +8009,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)), #define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1)) #define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2)) #define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3)) +#define STRCOLL_MB7_BIN #include "strcoll.inl" diff --git a/strings/json_lib.c b/strings/json_lib.c index 6b2a6416952..6898e9741a8 100644 --- a/strings/json_lib.c +++ b/strings/json_lib.c @@ -951,7 +951,7 @@ int json_read_value(json_engine_t *j) { int t_next, c_len, res; - j->value_type= JSON_VALUE_UNINITALIZED; + j->value_type= JSON_VALUE_UNINITIALIZED; if (j->state == JST_KEY) { while (json_read_keyname_chr(j) == 0) {} @@ -1640,7 +1640,7 @@ int json_escape(CHARSET_INFO *str_cs, if (c_len < 0) { /* JSON buffer is depleted. */ - return -1; + return JSON_ERROR_OUT_OF_SPACE; } /* JSON charset cannot convert this character. */ @@ -1652,7 +1652,7 @@ int json_escape(CHARSET_INFO *str_cs, json+= c_len, json_end)) <= 0) { /* JSON buffer is depleted. */ - return -1; + return JSON_ERROR_OUT_OF_SPACE; } json+= c_len; @@ -1685,11 +1685,11 @@ int json_escape(CHARSET_INFO *str_cs, continue; } /* JSON buffer is depleted. */ - return -1; + return JSON_ERROR_OUT_OF_SPACE; } } else /* c_len == 0, an illegal symbol. */ - return -1; + return JSON_ERROR_ILLEGAL_SYMBOL; } return (int)(json - json_start); diff --git a/strings/json_normalize.c b/strings/json_normalize.c new file mode 100644 index 00000000000..0b7f172dae6 --- /dev/null +++ b/strings/json_normalize.c @@ -0,0 +1,852 @@ +/* Copyright (c) 2021 Eric Herman and MariaDB Foundation. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <my_global.h> +#include <json_lib.h> + +#ifndef PSI_JSON +#define PSI_JSON PSI_NOT_INSTRUMENTED +#endif + +#ifndef JSON_MALLOC_FLAGS +#define JSON_MALLOC_FLAGS MYF(MY_THREAD_SPECIFIC|MY_WME) +#endif + +/* +From the EXPIRED DRAFT JSON Canonical Form +https://datatracker.ietf.org/doc/html/draft-staykov-hu-json-canonical-form-00 + +2. JSON canonical form + + The canonical form is defined by the following rules: + * The document MUST be encoded in UTF-8 [UTF-8] + * Non-significant(1) whitespace characters MUST NOT be used + * Non-significant(1) line endings MUST NOT be used + * Entries (set of name/value pairs) in JSON objects MUST be sorted + lexicographically(2) by their names + * Arrays MUST preserve their initial ordering + + (1)As defined in JSON data-interchange format [JSON], JSON objects + consists of multiple "name"/"value" pairs and JSON arrays consists + of multiple "value" fields. Non-significant means not part of + "name" or "value". + + + (2)Lexicographic comparison, which orders strings from least to + greatest alphabetically based on the UCS (Unicode Character Set) + codepoint values. 
+*/ + + +struct json_norm_array { + DYNAMIC_ARRAY values; +}; + + +struct json_norm_object { + DYNAMIC_ARRAY kv_pairs; +}; + + +struct json_norm_value { + enum json_value_types type; + union { + DYNAMIC_STRING number; + LEX_STRING string; + struct json_norm_array array; + struct json_norm_object object; + } value; +}; + + +struct json_norm_kv { + LEX_STRING key; + struct json_norm_value value; +}; + + +static void * +json_norm_malloc(size_t size) +{ + return my_malloc(PSI_JSON, size, JSON_MALLOC_FLAGS); +} + + +int +json_norm_string_init(LEX_STRING *string, const char *str, size_t len) +{ + string->length= len + 1; + string->str= json_norm_malloc(string->length); + if (!string->str) + { + string->length= 0; + return 1; + } + strncpy(string->str, str, len); + string->str[len]= 0; + return 0; +} + + +void +json_norm_string_free(LEX_STRING *string) +{ + my_free(string->str); + string->str= NULL; + string->length= 0; +} + + +void +json_norm_number_free(DYNAMIC_STRING *number) +{ + dynstr_free(number); + number->length= 0; +} + + +int +json_normalize_number(DYNAMIC_STRING *out, const char *str, size_t str_len) +{ + int err= 0; + long int magnitude= 0; + int negative= 0; + size_t i= 0; + size_t j= 0; + size_t k= 0; + char *buf= NULL; + size_t buf_size = str_len + 1; + + buf= json_norm_malloc(buf_size); + if (!buf) + return 1; + + memset(buf, 0x00, buf_size); + + if (str[0] == '-') + { + negative= 1; + ++i; + } + + /* grab digits preceding the decimal */ + for (; i < str_len && str[i] != '.' && str[i] != 'e' && str[i] != 'E'; ++i) + buf[j++] = str[i]; + + magnitude = (long)(j - 1); + + /* skip the . 
*/ + if (str[i] == '.') + ++i; + + /* grab rest of digits before the E */ + for (; i < str_len && str[i] != 'e' && str[i] != 'E'; ++i) + buf[j++] = str[i]; + + /* trim trailing zeros */ + for (k = j - 1; k && buf[k] == '0'; --k, --j) + buf[k] = '\0'; + + /* trim the leading zeros */ + for (k = 0; buf[k] && buf[k] == '0'; ++k); + if (k) + { + memmove(buf, buf + k, j - k); + j = j - k; + buf[j] = '\0'; + magnitude -= (long)k; + } + + if (!j) + { + err= dynstr_append_mem(out, STRING_WITH_LEN("0.0E0")); + my_free(buf); + return err; + } + + if (negative) + err|= dynstr_append_mem(out, STRING_WITH_LEN("-")); + err|= dynstr_append_mem(out, buf, 1); + err|= dynstr_append_mem(out, STRING_WITH_LEN(".")); + if (j == 1) + err|= dynstr_append_mem(out, STRING_WITH_LEN("0")); + else + err|= dynstr_append(out, buf + 1); + + err|= dynstr_append_mem(out, STRING_WITH_LEN("E")); + + if (str[i] == 'e' || str[i] == 'E') + { + char *endptr = NULL; + /* skip the [eE] */ + ++i; + /* combine the exponent with current magnitude */ + magnitude += strtol(str + i, &endptr, 10); + } + snprintf(buf, buf_size, "%ld", magnitude); + err|= dynstr_append(out, buf); + + my_free(buf); + return err ? 
1 : 0; +} + + +static int +json_norm_object_append_key_value(struct json_norm_object *obj, + DYNAMIC_STRING *key, + struct json_norm_value *val) +{ + struct json_norm_kv pair; + int err= json_norm_string_init(&pair.key, key->str, key->length); + + if (err) + return 1; + + pair.value= *val; + + err|= insert_dynamic(&obj->kv_pairs, &pair); + if (err) + { + json_norm_string_free(&pair.key); + return 1; + } + + return 0; +} + + +static struct json_norm_kv* +json_norm_object_get_last_element(struct json_norm_object *obj) +{ + struct json_norm_kv *kv; + + DBUG_ASSERT(obj->kv_pairs.elements > 0); + kv= dynamic_element(&obj->kv_pairs, + obj->kv_pairs.elements - 1, + struct json_norm_kv*); + return kv; +} + + +static struct json_norm_value* +json_norm_array_get_last_element(struct json_norm_array *arr) +{ + struct json_norm_value *val; + + DBUG_ASSERT(arr->values.elements > 0); + val= dynamic_element(&arr->values, + arr->values.elements - 1, + struct json_norm_value*); + return val; +} + + +static int +json_norm_array_append_value(struct json_norm_array *arr, + struct json_norm_value *val) +{ + return insert_dynamic(&arr->values, val); +} + + +int +json_norm_init_dynamic_array(size_t element_size, void *where) +{ + const size_t init_alloc= 20; + const size_t alloc_increment= 20; + return my_init_dynamic_array(PSI_JSON, where, element_size, + init_alloc, alloc_increment, + JSON_MALLOC_FLAGS); +} + + +int +json_norm_value_object_init(struct json_norm_value *val) +{ + const size_t element_size= sizeof(struct json_norm_kv); + struct json_norm_object *obj= &val->value.object; + + val->type= JSON_VALUE_OBJECT; + + return json_norm_init_dynamic_array(element_size, &obj->kv_pairs); +} + + +int +json_norm_value_array_init(struct json_norm_value *val) +{ + const size_t element_size= sizeof(struct json_norm_value); + struct json_norm_array *array= &val->value.array; + + val->type= JSON_VALUE_ARRAY; + + return json_norm_init_dynamic_array(element_size, &array->values); +} + + +static 
int +json_norm_value_string_init(struct json_norm_value *val, + const char *str, size_t len) +{ + val->type= JSON_VALUE_STRING; + return json_norm_string_init(&val->value.string, str, len); +} + + +static int +json_norm_kv_comp(const struct json_norm_kv *a, + const struct json_norm_kv *b) +{ + return my_strnncoll(&my_charset_utf8mb4_bin, + (const uchar *)a->key.str, a->key.length, + (const uchar *)b->key.str, b->key.length); +} + + +static void +json_normalize_sort(struct json_norm_value *val) +{ + switch (val->type) { + case JSON_VALUE_OBJECT: + { + size_t i; + DYNAMIC_ARRAY *pairs= &val->value.object.kv_pairs; + for (i= 0; i < pairs->elements; ++i) + { + struct json_norm_kv *kv= dynamic_element(pairs, i, struct json_norm_kv*); + json_normalize_sort(&kv->value); + } + + my_qsort(dynamic_element(pairs, 0, struct json_norm_kv*), + pairs->elements, sizeof(struct json_norm_kv), + (qsort_cmp) json_norm_kv_comp); + break; + } + case JSON_VALUE_ARRAY: + { + /* Arrays in JSON must keep the order. Just recursively sort values. */ + size_t i; + DYNAMIC_ARRAY *values= &val->value.array.values; + for (i= 0; i < values->elements; ++i) + { + struct json_norm_value *value; + value= dynamic_element(values, i, struct json_norm_value*); + json_normalize_sort(value); + } + + break; + } + case JSON_VALUE_UNINITIALIZED: + DBUG_ASSERT(0); + break; + default: /* Nothing to do for other types. 
*/ + break; + } +} + + +static void +json_norm_value_free(struct json_norm_value *val) +{ + size_t i; + switch (val->type) { + case JSON_VALUE_OBJECT: + { + struct json_norm_object *obj= &val->value.object; + + DYNAMIC_ARRAY *pairs_arr= &obj->kv_pairs; + for (i= 0; i < pairs_arr->elements; ++i) + { + struct json_norm_kv *kv; + kv= dynamic_element(pairs_arr, i, struct json_norm_kv *); + json_norm_string_free(&kv->key); + json_norm_value_free(&kv->value); + } + delete_dynamic(pairs_arr); + break; + } + case JSON_VALUE_ARRAY: + { + struct json_norm_array *arr= &val->value.array; + + DYNAMIC_ARRAY *values_arr= &arr->values; + for (i= 0; i < arr->values.elements; ++i) + { + struct json_norm_value *jt_value; + jt_value= dynamic_element(values_arr, i, struct json_norm_value *); + json_norm_value_free(jt_value); + } + delete_dynamic(values_arr); + break; + } + case JSON_VALUE_STRING: + { + json_norm_string_free(&val->value.string); + break; + } + case JSON_VALUE_NUMBER: + json_norm_number_free(&val->value.number); + break; + case JSON_VALUE_NULL: + case JSON_VALUE_TRUE: + case JSON_VALUE_FALSE: + case JSON_VALUE_UNINITIALIZED: + break; + } + val->type= JSON_VALUE_UNINITIALIZED; +} + + +static int +json_norm_to_string(DYNAMIC_STRING *buf, struct json_norm_value *val) +{ + switch (val->type) + { + case JSON_VALUE_OBJECT: + { + size_t i; + struct json_norm_object *obj= &val->value.object; + DYNAMIC_ARRAY *pairs_arr= &obj->kv_pairs; + + if (dynstr_append_mem(buf, STRING_WITH_LEN("{"))) + return 1; + + for (i= 0; i < pairs_arr->elements; ++i) + { + struct json_norm_kv *kv; + kv= dynamic_element(pairs_arr, i, struct json_norm_kv *); + + if (dynstr_append_mem(buf, STRING_WITH_LEN("\"")) || + dynstr_append(buf, kv->key.str) || + dynstr_append_mem(buf, STRING_WITH_LEN("\":")) || + json_norm_to_string(buf, &kv->value)) + return 1; + + if (i != (pairs_arr->elements - 1)) + if (dynstr_append_mem(buf, STRING_WITH_LEN(","))) + return 1; + } + if (dynstr_append_mem(buf, 
STRING_WITH_LEN("}"))) + return 1; + break; + } + case JSON_VALUE_ARRAY: + { + size_t i; + struct json_norm_array *arr= &val->value.array; + DYNAMIC_ARRAY *values_arr= &arr->values; + + if (dynstr_append_mem(buf, STRING_WITH_LEN("["))) + return 1; + for (i= 0; i < values_arr->elements; ++i) + { + struct json_norm_value *jt_value; + jt_value= dynamic_element(values_arr, i, struct json_norm_value *); + + if (json_norm_to_string(buf, jt_value)) + return 1; + if (i != (values_arr->elements - 1)) + if (dynstr_append_mem(buf, STRING_WITH_LEN(","))) + return 1; + } + if (dynstr_append_mem(buf, STRING_WITH_LEN("]"))) + return 1; + break; + } + case JSON_VALUE_STRING: + { + if (dynstr_append(buf, val->value.string.str)) + return 1; + break; + } + case JSON_VALUE_NULL: + { + if (dynstr_append_mem(buf, STRING_WITH_LEN("null"))) + return 1; + break; + } + case JSON_VALUE_TRUE: + { + if (dynstr_append_mem(buf, STRING_WITH_LEN("true"))) + return 1; + break; + } + case JSON_VALUE_FALSE: + { + if (dynstr_append_mem(buf, STRING_WITH_LEN("false"))) + return 1; + break; + } + case JSON_VALUE_NUMBER: + { + if (dynstr_append(buf, val->value.number.str)) + return 1; + break; + } + case JSON_VALUE_UNINITIALIZED: + { + DBUG_ASSERT(0); + break; + } + } + return 0; +} + + +static int +json_norm_value_number_init(struct json_norm_value *val, + const char *number, size_t num_len) +{ + int err; + val->type= JSON_VALUE_NUMBER; + err= init_dynamic_string(&val->value.number, NULL, 0, 0); + if (err) + return 1; + err= json_normalize_number(&val->value.number, number, num_len); + if (err) + dynstr_free(&val->value.number); + return err; +} + + +static void +json_norm_value_null_init(struct json_norm_value *val) +{ + val->type= JSON_VALUE_NULL; +} + + +static void +json_norm_value_false_init(struct json_norm_value *val) +{ + val->type= JSON_VALUE_FALSE; +} + + +static void +json_norm_value_true_init(struct json_norm_value *val) +{ + val->type= JSON_VALUE_TRUE; +} + + +static int 
+json_norm_value_init(struct json_norm_value *val, json_engine_t *je) +{ + int err= 0; + switch (je->value_type) { + case JSON_VALUE_STRING: + { + const char *je_value_begin= (const char *)je->value_begin; + size_t je_value_len= (je->value_end - je->value_begin); + err= json_norm_value_string_init(val, je_value_begin, je_value_len); + break; + } + case JSON_VALUE_NULL: + { + json_norm_value_null_init(val); + break; + } + case JSON_VALUE_TRUE: + { + json_norm_value_true_init(val); + break; + } + case JSON_VALUE_FALSE: + { + json_norm_value_false_init(val); + break; + } + case JSON_VALUE_ARRAY: + { + err= json_norm_value_array_init(val); + break; + } + case JSON_VALUE_OBJECT: + { + err= json_norm_value_object_init(val); + break; + } + case JSON_VALUE_NUMBER: + { + const char *je_number_begin= (const char *)je->value_begin; + size_t je_number_len= (je->value_end - je->value_begin); + err= json_norm_value_number_init(val, je_number_begin, je_number_len); + break; + } + default: + DBUG_ASSERT(0); + return 1; + } + return err; +} + + +static int +json_norm_append_to_array(struct json_norm_value *val, + json_engine_t *je) +{ + int err= 0; + struct json_norm_value tmp; + + DBUG_ASSERT(val->type == JSON_VALUE_ARRAY); + DBUG_ASSERT(je->value_type != JSON_VALUE_UNINITIALIZED); + + err= json_norm_value_init(&tmp, je); + + if (err) + return 1; + + err= json_norm_array_append_value(&val->value.array, &tmp); + + if (err) + json_norm_value_free(&tmp); + + return err; +} + + +static int +json_norm_append_to_object(struct json_norm_value *val, + DYNAMIC_STRING *key, json_engine_t *je) +{ + int err= 0; + struct json_norm_value tmp; + + DBUG_ASSERT(val->type == JSON_VALUE_OBJECT); + DBUG_ASSERT(je->value_type != JSON_VALUE_UNINITIALIZED); + + err= json_norm_value_init(&tmp, je); + + if (err) + return 1; + + err= json_norm_object_append_key_value(&val->value.object, key, &tmp); + + if (err) + json_norm_value_free(&tmp); + + return err; +} + + +static int +json_norm_parse(struct 
json_norm_value *root, json_engine_t *je) +{ + size_t current; + struct json_norm_value *stack[JSON_DEPTH_LIMIT]; + int err= 0; + DYNAMIC_STRING key; + + err= init_dynamic_string(&key, NULL, 0, 0); + if (err) + goto json_norm_parse_end; + + memset(stack, 0x00, sizeof(stack)); + current= 0; + stack[current]= root; + + do { + switch (je->state) + { + case JST_KEY: + { + const uchar *key_start= je->s.c_str; + const uchar *key_end; + + DBUG_ASSERT(stack[current]->type == JSON_VALUE_OBJECT); + do + { + key_end= je->s.c_str; + } while (json_read_keyname_chr(je) == 0); + + /* we have the key name */ + /* reset the dynstr: */ + dynstr_trunc(&key, key.length); + dynstr_append_mem(&key, (char *)key_start, (key_end - key_start)); + + /* After reading the key, we have a follow-up value. */ + err= json_read_value(je); + if (err) + goto json_norm_parse_end; + + err= json_norm_append_to_object(stack[current], &key, je); + if (err) + goto json_norm_parse_end; + + if (je->value_type == JSON_VALUE_ARRAY || + je->value_type == JSON_VALUE_OBJECT) + { + struct json_norm_kv *kv; + + err= ((current + 1) == JSON_DEPTH_LIMIT); + if (err) + goto json_norm_parse_end; + + kv= json_norm_object_get_last_element(&stack[current]->value.object); + stack[++current]= &kv->value; + } + break; + } + case JST_VALUE: + { + struct json_norm_array *current_arr= &stack[current]->value.array; + err= json_read_value(je); + if (err) + goto json_norm_parse_end; + + DBUG_ASSERT(stack[current]->type == JSON_VALUE_ARRAY); + + err= json_norm_append_to_array(stack[current], je); + if (err) + goto json_norm_parse_end; + + if (je->value_type == JSON_VALUE_ARRAY || + je->value_type == JSON_VALUE_OBJECT) + { + + err= ((current + 1) == JSON_DEPTH_LIMIT); + if (err) + goto json_norm_parse_end; + + stack[++current]= json_norm_array_get_last_element(current_arr); + } + + break; + } + case JST_OBJ_START: + /* parser found an object (the '{' in JSON) */ + break; + case JST_OBJ_END: + /* parser found the end of the object 
(the '}' in JSON) */
+      /* pop stack */
+      --current;
+      break;
+    case JST_ARRAY_START:
+      /* parser found an array (the '[' in JSON) */
+      break;
+    case JST_ARRAY_END:
+      /* parser found the end of the array (the ']' in JSON) */
+      /* pop stack */
+      --current;
+      break;
+    };
+  } while (json_scan_next(je) == 0);
+
+json_norm_parse_end:
+  dynstr_free(&key);
+  return err;
+}
+
+
+/*
+  Build the normalization tree for the JSON text in [s, s + size).
+
+  The scanner state and *root are zeroed first, then the first value is
+  read and used to initialize *root. If that value is an object or an
+  array, json_norm_parse() fills in its children.
+
+  @param root  tree root to fill; it is reset here, previous content is
+               not released
+  @param s     JSON text, must not be NULL
+  @param size  length of s in bytes
+  @param cs    character set of s
+
+  @return 0 on success, non-zero if any step failed. On failure *root is
+          not released here; json_normalize() frees it with
+          json_norm_value_free().
+*/
+static int
+json_norm_build(struct json_norm_value *root,
+                const char *s, size_t size, CHARSET_INFO *cs)
+{
+  int err= 0;
+  json_engine_t je;
+
+  DBUG_ASSERT(s);
+  memset(&je, 0x00, sizeof(je));
+
+  memset(root, 0x00, sizeof(struct json_norm_value));
+  root->type= JSON_VALUE_UNINITIALIZED;
+
+  err= json_scan_start(&je, cs, (const uchar *)s, (const uchar *)(s + size));
+  if (err)
+    return err;
+
+  /*
+    Propagate the status of json_read_value() itself. Returning the status
+    left over from json_scan_start() would report a failed read as success
+    and leave the root uninitialized.
+  */
+  err= json_read_value(&je);
+  if (err)
+    return err;
+
+  /* Do not let json_norm_parse() overwrite a failed initialization. */
+  err= json_norm_value_init(root, &je);
+  if (err)
+    return err;
+
+  /* Scalars are complete after init; only containers have children. */
+  if (root->type == JSON_VALUE_OBJECT ||
+      root->type == JSON_VALUE_ARRAY)
+    err= json_norm_parse(root, &je);
+
+  return err;
+}
+
+
+/*
+  Produce the normalized form of a JSON text.
+
+  The input is converted to utf8mb4_bin (unless it already uses that
+  collation), validated with json_valid(), parsed into a tree with
+  json_norm_build(), sorted with json_normalize_sort() and written to
+  *result with json_norm_to_string().
+
+  @param result  output string, must not be NULL
+  @param s       JSON text
+  @param size    length of s in bytes
+  @param cs      character set of s
+
+  @return 0 on success, non-zero on error. When an error is reported
+          through the common exit path, *result is released with
+          dynstr_free(). The two early returns (allocation failure and
+          conversion error) leave *result untouched.
+*/
+int
+json_normalize(DYNAMIC_STRING *result,
+               const char *s, size_t size, CHARSET_INFO *cs)
+{
+  int err= 0;
+  uint convert_err= 0;
+  struct json_norm_value root;
+  char *s_utf8= NULL;
+  size_t in_size;
+  const char *in;
+
+  DBUG_ASSERT(result);
+
+  /* Keep root in a defined state so the exit path can always free it. */
+  memset(&root, 0x00, sizeof(root));
+  root.type = JSON_VALUE_UNINITIALIZED;
+
+  /*
+    Convert the incoming string to utf8mb4_bin before doing any other work.
+    According to JSON RFC 8259, between systems JSON must be UTF-8
+    https://datatracker.ietf.org/doc/html/rfc8259#section-8.1
+  */
+  if (cs == &my_charset_utf8mb4_bin)
+  {
+    in= s;
+    in_size= size;
+  }
+  else
+  {
+    /* Worst case: every input byte expands to mbmaxlen bytes, plus a NUL. */
+    in_size= (size * my_charset_utf8mb4_bin.mbmaxlen) + 1;
+    s_utf8= json_norm_malloc(in_size);
+    if (!s_utf8)
+      return 1;
+    memset(s_utf8, 0x00, in_size);
+    my_convert(s_utf8, (uint32)in_size, &my_charset_utf8mb4_bin,
+               s, (uint32)size, cs, &convert_err);
+    if (convert_err)
+    {
+      my_free(s_utf8);
+      return 1;
+    }
+    in= s_utf8;
+    /*
+      NOTE(review): the converted length is taken with strlen(), so the
+      text is cut at the first NUL byte. my_convert() presumably returns
+      the number of bytes written; confirm and consider using it instead.
+    */
+    in_size= strlen(s_utf8);
+  }
+
+
+  if (!json_valid(in, in_size, &my_charset_utf8mb4_bin))
+  {
+    err= 1;
+    goto json_normalize_end;
+  }
+
+  err= json_norm_build(&root, in, in_size, &my_charset_utf8mb4_bin);
+  if (err)
+    goto json_normalize_end;
+
+  json_normalize_sort(&root);
+
+  err= json_norm_to_string(result, &root);
+
+json_normalize_end:
+  json_norm_value_free(&root);
+  if (err)
+    dynstr_free(result);
+  if (s_utf8)
+    my_free(s_utf8);
+  return err;
+}
+
+
diff --git a/strings/strcoll.inl b/strings/strcoll.inl
index 50849c06e7d..eb5c6e3c717 100644
--- a/strings/strcoll.inl
+++ b/strings/strcoll.inl
@@ -16,6 +16,8 @@
 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
+#include "ctype-ascii.h"
+
 #ifndef MY_FUNCTION_NAME
 #error MY_FUNCTION_NAME is not defined
 #endif
@@ -40,6 +42,42 @@
 /*
+  For binary collations:
+  - on 32bit platforms perform only 4 byte optimization
+  - on 64bit platforms perform both 4 byte and 8 byte optimization
+*/
+#if defined(STRCOLL_MB7_BIN)
+#define MY_STRCOLL_MB7_4BYTES(a,b) my_strcoll_mb7_bin_4bytes((a),(b))
+#if SIZEOF_VOIDP == 8
+#define STRCOLL_MB7_8BYTES
+#define MY_STRCOLL_MB7_8BYTES(a,b) my_strcoll_mb7_bin_8bytes((a),(b))
+#endif /* Architecture test */
+#endif /* STRCOLL_MB7_BIN */
+
+
+/*
+  For case insensitive collations with trivial mapping from [a-z] to [A-Z]
+  perform optimization only on 64 bit platforms.
+ There is no sense to perform my_ascii_to_upper_magic_uint64() based + optimization on 32bit platforms. The idea of this optimization + is that it handles 8bytes at a time, using 64bit CPU registers. + Enabling this optimization on 32bit platform may only slow things down. +*/ +#if defined(STRCOLL_MB7_TOUPPER) +#if SIZEOF_VOIDP == 8 +#define MY_STRCOLL_MB7_4BYTES(a,b) my_strcoll_ascii_toupper_4bytes((a),(b)) +#define MY_STRCOLL_MB7_8BYTES(a,b) my_strcoll_ascii_toupper_8bytes((a),(b)) +#endif /* Architecture test */ +#endif /* STRCOLL_MB7_TOUPPER */ + + +/* + A helper macro to shift two pointers forward, to the given amount. +*/ +#define MY_STRING_SHIFT_PTR_PTR(a,b,len) do { a+= len; b+= len; } while(0) + + +/* Weight of an illegal byte, must follow these rules: 1. Must be greater than weight of any normal character in the collation. 2. Two different bad bytes must have different weights and must be @@ -182,7 +220,31 @@ MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)), { int a_weight, b_weight, res; uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); - uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); + uint b_wlen; + +#ifdef MY_STRCOLL_MB7_4BYTES + if (a_wlen == 1 && my_strcoll_ascii_4bytes_found(a, a_end, b, b_end)) + { + int res; +#ifdef MY_STRCOLL_MB7_8BYTES + /*TODO: a a loop here >='a' <='z' here, for automatic vectorization*/ + if (my_strcoll_ascii_4bytes_found(a + 4, a_end, b + 4, b_end)) + { + if ((res= MY_STRCOLL_MB7_8BYTES(a, b))) + return res; + MY_STRING_SHIFT_PTR_PTR(a, b, 8); + continue; + } +#endif + if ((res= MY_STRCOLL_MB7_4BYTES(a, b))) + return res; + MY_STRING_SHIFT_PTR_PTR(a, b, 4); + continue; + } +#endif /* MY_STRCOLL_MB7_4BYTES */ + + b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); + /* a_wlen b_wlen Comment ------ ------ ------- @@ -253,7 +315,30 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), { int a_weight, b_weight, res; uint a_wlen= 
MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); - uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); + uint b_wlen; + +#ifdef MY_STRCOLL_MB7_4BYTES + if (a_wlen == 1 && my_strcoll_ascii_4bytes_found(a, a_end, b, b_end)) + { + int res; +#ifdef MY_STRCOLL_MB7_8BYTES + if (my_strcoll_ascii_4bytes_found(a + 4, a_end, b + 4, b_end)) + { + if ((res= MY_STRCOLL_MB7_8BYTES(a, b))) + return res; + MY_STRING_SHIFT_PTR_PTR(a, b, 8); + continue; + } +#endif + if ((res= MY_STRCOLL_MB7_4BYTES(a, b))) + return res; + MY_STRING_SHIFT_PTR_PTR(a, b, 4); + continue; + } +#endif /* MY_STRCOLL_MB7_4BYTES */ + + b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); + if ((res= (a_weight - b_weight))) { /* @@ -286,7 +371,7 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), DBUG_ASSERT(0); return 0; } -#endif +#endif /* DEFINE_STRNNCOLLSP_NOPAD */ /** @@ -652,3 +737,8 @@ MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs, #undef DEFINE_STRNXFRM_UNICODE_BIN2 #undef DEFINE_STRNNCOLL #undef DEFINE_STRNNCOLLSP_NOPAD + +#undef STRCOLL_MB7_TOUPPER +#undef STRCOLL_MB7_BIN +#undef MY_STRCOLL_MB7_4BYTES +#undef MY_STRCOLL_MB7_8BYTES |