summaryrefslogtreecommitdiff
path: root/strings
diff options
context:
space:
mode:
Diffstat (limited to 'strings')
-rw-r--r--strings/CMakeLists.txt2
-rw-r--r--strings/ctype-ascii.h189
-rw-r--r--strings/ctype-big5.c2
-rw-r--r--strings/ctype-cp932.c4
-rw-r--r--strings/ctype-euc_kr.c4
-rw-r--r--strings/ctype-eucjpms.c4
-rw-r--r--strings/ctype-gb2312.c2
-rw-r--r--strings/ctype-gbk.c2
-rw-r--r--strings/ctype-sjis.c4
-rw-r--r--strings/ctype-uca-scanner_next.inl74
-rw-r--r--strings/ctype-uca.c81
-rw-r--r--strings/ctype-uca.inl1
-rw-r--r--strings/ctype-ujis.c4
-rw-r--r--strings/ctype-utf8.c297
-rw-r--r--strings/json_lib.c10
-rw-r--r--strings/json_normalize.c852
-rw-r--r--strings/strcoll.inl96
17 files changed, 1557 insertions, 71 deletions
diff --git a/strings/CMakeLists.txt b/strings/CMakeLists.txt
index 0e62f9e34ad..54612256adc 100644
--- a/strings/CMakeLists.txt
+++ b/strings/CMakeLists.txt
@@ -23,7 +23,7 @@ SET(STRINGS_SOURCES bchange.c bmove_upp.c ctype-big5.c ctype-bin.c ctype-cp932.c
str2int.c strcend.c strend.c strfill.c strmake.c strmov.c strnmov.c
strxmov.c strxnmov.c xml.c
strmov_overlapp.c
- my_strchr.c strcont.c strappend.c json_lib.c)
+ my_strchr.c strcont.c strappend.c json_lib.c json_normalize.c)
IF(NOT HAVE_STRNLEN)
# OSX below 10.7 did not have strnlen
diff --git a/strings/ctype-ascii.h b/strings/ctype-ascii.h
new file mode 100644
index 00000000000..540d01b1a0d
--- /dev/null
+++ b/strings/ctype-ascii.h
@@ -0,0 +1,189 @@
+#ifndef CTYPE_ASCII_INCLUDED
+#define CTYPE_ASCII_INCLUDED
+
+#include "myisampack.h"
+
+/*
+  Magic expression. It uses the fact that for any byte value X in
+  the range 0..31 (0x00..0x1F) the expression ((X+31) & 0x1F)*5 has
+  the bit 0x80 set only for the following six (out of 32) values:
+    0x00, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F.
+  These values are the offsets of the non-letter characters
+  inside the ASCII ranges [40..5F] and [60..7F].
+
+  The magic bit is set ('1') at the following offsets:
+  ----------------  --------------------------------
+  Magic bit         10000000000000000000000000011111
+  ASCII 0x00..0x1F  ................................  Control
+  ASCII 0x20..0x3F  ................................  Punctuation, digits
+  ASCII 0x40..0x5F  @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_
+  ASCII 0x60..0x7F  `abcdefghijklmnopqrstuvwxyz{|}~.
+  ----------------  --------------------------------
+  We shift the magic bit 0x80 right twice to make it 0x20.
+  So on the ranges [40..5F] and [60..7F] the expression
+  has the bit 0x20 set for all non-letter characters.
+  Note, other bits contain garbage.
+
+  Requirements:
+    All bytes must be in the range [00..7F],
+    to avoid overflow and carry to the next byte.
+*/
+#define MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(i) \
+ (((((i)+0x1F1F1F1F1F1F1F1FULL) & 0x1F1F1F1F1F1F1F1F) * 5) >> 2)
+
+
+/*
+  The following macro returns the bit 0x20 set to:
+  - 1 for input bytes in the ranges [60..7F] or [E0..FF]
+  - 0 otherwise (note, the other result bits contain garbage)
+  The bit 0x40, shifted right to the position 0x20, is ANDed with the
+  original bit 0x20: the result is set only if both input bits are set.
+  Hex      BinHi  BinLo
+  ----     -1--   ----
+  0x[4C]X  .10.   ....
+  0x[5D]X  .10.   ....
+  0x[6E]X  .11.   ....
+  0x[7F]X  .11.   ....
+*/
+#define MY_ASCII_20_IS_SET_IF_RANGE_60_7F_OR_E0_FF(i) (((i) >> 1) & ((i)))
+
+
+/*
+  The following macro evaluates to exactly 0x20 for all
+  lower case ASCII letters [a-z], and to 0x00 otherwise:
+
+  Value     Range       Character range                   Subrange
+  --------  --------    --------------------------------  -------
+  00000000  0x00..0x3F  Control, punctuation, digits
+  00000000  0x40..0x5F  @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_  letters A-Z
+  00000000  0x40..0x5F  @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_  non-letters
+  00100000  0x60..0x7F  `abcdefghijklmnopqrstuvwxyz{|}~.  letters a-z
+  00000000  0x60..0x7F  `abcdefghijklmnopqrstuvwxyz{|}~.  non-letters
+
+  Requirements:
+    All bytes must be in the range [00..7F].
+    See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC().
+*/
+
+#define MY_ASCII_20_IF_IS_LOWER_LETTER(i) \
+ (MY_ASCII_20_IS_SET_IF_RANGE_60_7F_OR_E0_FF(i) & \
+ ~MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC(i) & \
+ 0x2020202020202020)
+
+/*
+  Convert lower case ASCII letters to upper case by unsetting
+  the bit 0x20 with help of the magic expression.
+  The argument is evaluated several times: pass no side effects.
+  Requirements:
+    All bytes must be in the range [00..7F].
+    See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC()
+*/
+#define MY_ASCII_TOUPPER_MAGIC(i) \
+  ((i) ^ MY_ASCII_20_IF_IS_LOWER_LETTER(i))
+
+
+/*
+  Upper-case a string of 8 bytes packed into a uint64
+  algorithmically, all eight bytes at once.
+
+  Requirements:
+    All bytes must be in the range [00..0x7F].
+    See the comments in MY_ASCII_20_IS_SET_IF_NOT_LETTER_MAGIC().
+    The result on 8bit data is unpredictable!!!
+    It is up to the caller not to pass 8bit data.
+*/
+static inline ulonglong my_ascii_to_upper_magic_uint64(ulonglong i)
+{
+  return i ^ MY_ASCII_20_IF_IS_LOWER_LETTER(i);
+}
+
+
+/*
+  Check if both strings "a" and "b" have at least 4 bytes left and their
+  leading 4 bytes are 7bit. Lengths are tested as pointer differences,
+  as "a + 4" may be an invalid pointer if less than 4 bytes are left.
+*/
+static inline int
+my_strcoll_ascii_4bytes_found(const uchar *a, const uchar *ae,
+                              const uchar *b, const uchar *be)
+{
+  return ae - a >= 4 && be - b >= 4 &&
+         (uint4korr(b) & 0x80808080) == 0 &&
+         (uint4korr(a) & 0x80808080) == 0;
+}
+
+
+/*
+  Case insensitive comparison of the leading four 7bit ASCII bytes
+  of two strings: letters [a-z] are converted to upper case [A-Z] first.
+  Returns 0 on equality, -1 if "a" is smaller, +1 if "a" is greater.
+
+  Requirements:
+  - Both input strings must have at least four bytes, and
+  - The leading four bytes in both strings must be 7bit ASCII.
+  It is up to the caller to pass only strings that meet these
+  requirements. The result on 8-bit data is unpredictable, because
+  8-bit bytes may cause overflow in my_ascii_to_upper_magic_uint64().
+*/
+static inline int
+my_strcoll_ascii_toupper_4bytes(const uchar *a, const uchar *b)
+{
+  /* Pack both strings into one uint64 to convert them in a single call */
+  ulonglong both= my_ascii_to_upper_magic_uint64(
+                    (((ulonglong) mi_uint4korr(a)) << 32) | mi_uint4korr(b));
+  uint32 an= (uint32) (both >> 32), bn= (uint32) both;
+  return an == bn ? 0 : an < bn ? -1 : +1;
+}
+
+
+/*
+  Case insensitive comparison of the leading eight 7bit ASCII bytes
+  of two strings: letters [a-z] are converted to upper case [A-Z] first.
+
+  Requirements:
+  - Both input strings must have at least eight bytes, and
+  - The leading eight bytes in both strings must be 7bit ASCII.
+  See comments in my_strcoll_ascii_toupper_4bytes().
+*/
+static inline int
+my_strcoll_ascii_toupper_8bytes(const uchar *a, const uchar *b)
+{
+  /*
+    TODO:
+    Try to get advantage of SIMD instructions by massive comparison
+    (16 bytes at a time) of characters against (x>='a' && x<='z') using:
+    - either explicit intrinsics
+    - or a loop that can get vectorized automatically by some compilers.
+  */
+  const ulonglong an= my_ascii_to_upper_magic_uint64(mi_uint8korr(a));
+  const ulonglong bn= my_ascii_to_upper_magic_uint64(mi_uint8korr(b));
+  if (an != bn)
+    return an < bn ? -1 : +1;
+  return 0;
+}
+
+
+/*
+  Binary style comparison of the leading four 7bit ASCII bytes of two strings.
+*/
+static inline int
+my_strcoll_mb7_bin_4bytes(const uchar *a, const uchar *b)
+{
+  const uint32 an= mi_uint4korr(a);
+  const uint32 bn= mi_uint4korr(b);
+  return (an > bn) - (an < bn);
+}
+
+
+/*
+  Binary style comparison of the leading eight 7bit ASCII bytes of two strings.
+*/
+static inline int
+my_strcoll_mb7_bin_8bytes(const uchar *a, const uchar *b)
+{
+  const ulonglong an= mi_uint8korr(a);
+  const ulonglong bn= mi_uint8korr(b);
+  return (an > bn) - (an < bn);
+}
+
+#endif /* CTYPE_ASCII_INCLUDED */
diff --git a/strings/ctype-big5.c b/strings/ctype-big5.c
index d66a2bf8593..2491a5ff7ed 100644
--- a/strings/ctype-big5.c
+++ b/strings/ctype-big5.c
@@ -6691,6 +6691,7 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (big5code(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -6707,6 +6708,7 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _big5_nopad_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (big5code(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-cp932.c b/strings/ctype-cp932.c
index 9971750ca1c..af3de05509d 100644
--- a/strings/ctype-cp932.c
+++ b/strings/ctype-cp932.c
@@ -34639,6 +34639,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)),
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) sort_order_cp932[(uchar) (x)])
#define WEIGHT_MB2(x,y) (cp932code(x, y))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -34646,6 +34647,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)),
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) (uchar) (x))
#define WEIGHT_MB2(x,y) (cp932code(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -34654,6 +34656,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)),
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) sort_order_cp932[(uchar) (x)])
#define WEIGHT_MB2(x,y) (cp932code(x, y))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -34662,6 +34665,7 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)),
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) (uchar) (x))
#define WEIGHT_MB2(x,y) (cp932code(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-euc_kr.c b/strings/ctype-euc_kr.c
index 4d159b29494..1f62ebaf636 100644
--- a/strings/ctype-euc_kr.c
+++ b/strings/ctype-euc_kr.c
@@ -9932,12 +9932,14 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_korean_ci
#define WEIGHT_MB1(x) (sort_order_euc_kr[(uchar) (x)])
#define WEIGHT_MB2(x,y) (euckrcode(x, y))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (euckrcode(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -9945,6 +9947,7 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_korean_nopad_ci
#define WEIGHT_MB1(x) (sort_order_euc_kr[(uchar) (x)])
#define WEIGHT_MB2(x,y) (euckrcode(x, y))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -9952,6 +9955,7 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr_nopad_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (euckrcode(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-eucjpms.c b/strings/ctype-eucjpms.c
index 72b18b5ec76..ed48917e333 100644
--- a/strings/ctype-eucjpms.c
+++ b/strings/ctype-eucjpms.c
@@ -212,6 +212,7 @@ static const uchar sort_order_eucjpms[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -221,6 +222,7 @@ static const uchar sort_order_eucjpms[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -231,6 +233,7 @@ static const uchar sort_order_eucjpms[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -241,6 +244,7 @@ static const uchar sort_order_eucjpms[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-gb2312.c b/strings/ctype-gb2312.c
index 7b6b0b080f0..dd3581366fe 100644
--- a/strings/ctype-gb2312.c
+++ b/strings/ctype-gb2312.c
@@ -6344,6 +6344,7 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (gb2312code(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -6358,6 +6359,7 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312_nopad_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (gb2312code(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-gbk.c b/strings/ctype-gbk.c
index 2501c293fb2..2e72d5bd7a4 100644
--- a/strings/ctype-gbk.c
+++ b/strings/ctype-gbk.c
@@ -10625,6 +10625,7 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (gbkcode(x,y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -10640,6 +10641,7 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk_nopad_bin
#define WEIGHT_MB1(x) ((uchar) (x))
#define WEIGHT_MB2(x,y) (gbkcode(x,y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-sjis.c b/strings/ctype-sjis.c
index 313dfaa8f90..c3e64ce0d11 100644
--- a/strings/ctype-sjis.c
+++ b/strings/ctype-sjis.c
@@ -34027,6 +34027,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc,
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) sort_order_sjis[(uchar) (x)])
#define WEIGHT_MB2(x,y) (sjiscode(x, y))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -34034,6 +34035,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc,
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) (uchar) (x))
#define WEIGHT_MB2(x,y) (sjiscode(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -34042,6 +34044,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc,
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) sort_order_sjis[(uchar) (x)])
#define WEIGHT_MB2(x,y) (sjiscode(x, y))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -34050,6 +34053,7 @@ my_wc_to_printable_sjis(CHARSET_INFO *cs, my_wc_t wc,
#define WEIGHT_PAD_SPACE (256 * (int) ' ')
#define WEIGHT_MB1(x) (256 * (int) (uchar) (x))
#define WEIGHT_MB2(x,y) (sjiscode(x, y))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-uca-scanner_next.inl b/strings/ctype-uca-scanner_next.inl
index 79d25487b42..acab31f21ef 100644
--- a/strings/ctype-uca-scanner_next.inl
+++ b/strings/ctype-uca-scanner_next.inl
@@ -1,5 +1,5 @@
/* Copyright (c) 2004, 2013, Oracle and/or its affiliates.
- Copyright (c) 2009, 2021, MariaDB
+ Copyright (c) 2009, 2021, MariaDB
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
@@ -55,13 +55,8 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
#else
#define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION
#endif
- /*
- Check if the weights for the previous character have been
- already fully scanned. If yes, then get the next character and
- initialize wbeg and wlength to its weight string.
- */
-
- if (scanner->wbeg[0])
+ uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
+ if (weight)
{
/*
More weights left from the previous step.
@@ -69,7 +64,7 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
Return "0" as "nchars". The real nchars was set on a previous
iteration.
*/
- SCANNER_NEXT_RETURN(*scanner->wbeg++, 0);
+ SCANNER_NEXT_RETURN(weight, 0);
}
#ifdef SCANNER_NEXT_NCHARS
@@ -79,39 +74,44 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
#endif
{
const uint16 *wpage;
- my_wc_t wc[MY_UCA_MAX_CONTRACTION];
int mblen;
+ my_wc_t currwc= 0;
+ const uint16 *cweight;
/* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
/* Get next ASCII character */
if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
{
- wc[0]= scanner->sbeg[0];
+ currwc= scanner->sbeg[0];
scanner->sbeg+= 1;
#if MY_UCA_COMPILE_CONTRACTIONS
- if (my_uca_needs_context_handling(scanner->level, wc[0]))
+ if (my_uca_needs_context_handling(scanner->level, currwc))
{
- const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, wc,
+ const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, currwc,
LOCAL_MAX_CONTRACTION_LENGTH);
if (cnt)
- SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+ {
+ if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
+ SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+ continue; /* Ignorable contraction */
+ }
}
#endif
scanner->page= 0;
- scanner->code= (int) wc[0];
- scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
- if (scanner->wbeg[0])
- SCANNER_NEXT_RETURN(*scanner->wbeg++, ignorable_nchars + 1);
- continue;
+ scanner->code= (int) currwc;
+ cweight= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+ continue; /* Ignorable character */
}
else
#endif
/* Get next MB character */
- if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
- scanner->send)) <= 0))
+ if (((mblen= MY_MB_WC(scanner, &currwc, scanner->sbeg,
+ scanner->send)) <= 0))
{
if (scanner->sbeg >= scanner->send)
{
@@ -136,26 +136,29 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
}
scanner->sbeg+= mblen;
- if (wc[0] > scanner->level->maxchar)
+ if (currwc > scanner->level->maxchar)
{
- /* Return 0xFFFD as weight for all characters outside BMP */
- scanner->wbeg= nochar;
- SCANNER_NEXT_RETURN(0xFFFD, ignorable_nchars + 1);
+ SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner),
+ ignorable_nchars + 1);
}
#if MY_UCA_COMPILE_CONTRACTIONS
- if (my_uca_needs_context_handling(scanner->level, wc[0]))
+ if (my_uca_needs_context_handling(scanner->level, currwc))
{
- const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, wc,
+ const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, currwc,
LOCAL_MAX_CONTRACTION_LENGTH);
if (cnt)
- SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+ {
+ if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
+ SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
+ continue; /* Ignorable contraction */
+ }
}
#endif
/* Process single character */
- scanner->page= wc[0] >> 8;
- scanner->code= wc[0] & 0xFF;
+ scanner->page= currwc >> 8;
+ scanner->code= currwc & 0xFF;
/* If weight page for w[0] does not exist, then calculate algoritmically */
if (!(wpage= scanner->level->weights[scanner->page]))
@@ -163,14 +166,13 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
ignorable_nchars + 1);
/* Calculate pointer to w[0]'s weight, using page and offset */
- scanner->wbeg= wpage +
- scanner->code * scanner->level->lengths[scanner->page];
- if (scanner->wbeg[0])
- break;
- /* Skip ignorable character and continue the loop */
+ cweight= wpage + scanner->code * scanner->level->lengths[scanner->page];
+ if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
+ SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
+ continue; /* Ignorable character */
}
- SCANNER_NEXT_RETURN(*scanner->wbeg++, ignorable_nchars + 1);
+ SCANNER_NEXT_RETURN(0, 0); /* Not reachable */
}
#undef SCANNER_NEXT_NCHARS
diff --git a/strings/ctype-uca.c b/strings/ctype-uca.c
index 221f81e18b0..c5b6ad6cbb3 100644
--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
@@ -31181,6 +31181,33 @@ static const uint16 nochar[]= {0,0};
#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64
#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128
+
+static inline uint16
+my_uca_scanner_next_expansion_weight(my_uca_scanner *scanner)
+{
+  const uint16 weight= scanner->wbeg[0]; /* 0 means: nothing is pending */
+  scanner->wbeg+= (weight != 0);         /* step over a real weight only */
+  return weight;
+}
+
+
+static inline uint16
+my_uca_scanner_set_weight(my_uca_scanner *scanner, const uint16 *weight)
+{
+  scanner->wbeg= &weight[1];  /* the scanner continues after the first weight */
+  return weight[0];           /* the first weight goes to the caller */
+}
+
+
+static inline uint16
+my_uca_scanner_set_weight_outside_maxchar(my_uca_scanner *scanner)
+{
+  const uint16 weight= 0xFFFD;  /* the same weight for all such characters */
+  scanner->wbeg= nochar;        /* no more weights follow */
+  return weight;
+}
+
+
/********** Helper functions to handle contraction ************/
@@ -31364,7 +31391,7 @@ my_uca_can_be_contraction_part(const MY_CONTRACTIONS *c, my_wc_t wc, int flag)
@retval ptr - contraction weight array
*/
-uint16 *
+const uint16 *
my_uca_contraction2_weight(const MY_CONTRACTIONS *list, my_wc_t wc1, my_wc_t wc2)
{
MY_CONTRACTION *c, *last;
@@ -31449,14 +31476,30 @@ my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc)
@retval non-zero - strings are different
*/
-static int
-my_wmemcmp(my_wc_t *a, my_wc_t *b, size_t len)
+static inline int
+my_wmemcmp(const my_wc_t *a, const my_wc_t *b, size_t len)
{
return memcmp(a, b, len * sizeof(my_wc_t));
}
/*
+  Tell whether the MY_CONTRACTION item "c" is equal to
+  the wide string "wc" consisting of "len" characters.
+  Only true contractions can match here:
+  an item with a previous context never compares as equal.
+*/
+static inline my_bool
+my_uca_true_contraction_eq(const MY_CONTRACTION *c,
+                           const my_wc_t *wc, size_t len)
+{
+  if (c->with_context || (len < MY_UCA_MAX_CONTRACTION && c->ch[len] != 0))
+    return 0;
+  return my_wmemcmp(c->ch, wc, len) == 0;
+}
+
+
+/*
Return the number of characters in a contraction.
*/
static inline uint my_contraction_char_length(const MY_CONTRACTION *cnt)
@@ -31492,9 +31535,7 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
for (c= list->item, last= c + list->nitems; c < last; c++)
{
- if ((len >= MY_UCA_MAX_CONTRACTION || c->ch[len] == 0) &&
- !c->with_context &&
- !my_wmemcmp(c->ch, wc, len))
+ if (my_uca_true_contraction_eq(c, wc, len))
return c;
}
return NULL;
@@ -31518,12 +31559,15 @@ my_uca_contraction_find(const MY_CONTRACTIONS *list, my_wc_t *wc, size_t len)
*/
static const MY_CONTRACTION *
-my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc,
+my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t currwc,
size_t max_char_length)
{
size_t clen= 1;
int flag;
const uchar *s, *beg[MY_UCA_MAX_CONTRACTION];
+ my_wc_t wc[MY_UCA_MAX_CONTRACTION];
+ wc[0]= currwc;
+
memset((void*) beg, 0, sizeof(beg));
/* Scan all contraction candidates */
@@ -31549,7 +31593,6 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc,
(cnt= my_uca_contraction_find(&scanner->level->contractions,
wc, clen)))
{
- scanner->wbeg= cnt->weight + 1;
scanner->sbeg= beg[clen - 1];
return cnt;
}
@@ -31573,18 +31616,14 @@ my_uca_scanner_contraction_find(my_uca_scanner *scanner, my_wc_t *wc,
*/
static const MY_CONTRACTION *
-my_uca_previous_context_find(my_uca_scanner *scanner,
+my_uca_previous_context_find(const MY_CONTRACTIONS *list,
my_wc_t wc0, my_wc_t wc1)
{
- const MY_CONTRACTIONS *list= &scanner->level->contractions;
MY_CONTRACTION *c, *last;
for (c= list->item, last= c + list->nitems; c < last; c++)
{
if (c->with_context && wc0 == c->ch[0] && wc1 == c->ch[1])
- {
- scanner->wbeg= c->weight + 1;
return c;
- }
}
return NULL;
}
@@ -31610,10 +31649,11 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
@retval non null pointer - the address of MY_CONTRACTION found
*/
static inline const MY_CONTRACTION *
-my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc,
+my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t currwc,
size_t max_char_length)
{
const MY_CONTRACTION *cnt;
+ my_wc_t prevwc;
DBUG_ASSERT(scanner->level->contractions.nitems);
/*
If we have scanned a character which can have previous context,
@@ -31625,21 +31665,22 @@ my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc,
context at the moment. CLDR does not have longer sequences.
*/
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
- wc[0]) &&
+ currwc) &&
scanner->wbeg != nochar && /* if not the very first character */
my_uca_can_be_previous_context_head(&scanner->level->contractions,
- (wc[1]= ((scanner->page << 8) +
+ (prevwc= ((scanner->page << 8) +
scanner->code))) &&
- (cnt= my_uca_previous_context_find(scanner, wc[1], wc[0])))
+ (cnt= my_uca_previous_context_find(&scanner->level->contractions,
+ prevwc, currwc)))
{
scanner->page= scanner->code= 0; /* Clear for the next character */
return cnt;
}
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
- wc[0]))
+ currwc))
{
- /* Check if w[0] starts a contraction */
- if ((cnt= my_uca_scanner_contraction_find(scanner, wc, max_char_length)))
+ /* Check if currwc starts a contraction */
+ if ((cnt= my_uca_scanner_contraction_find(scanner, currwc, max_char_length)))
return cnt;
}
return NULL;
diff --git a/strings/ctype-uca.inl b/strings/ctype-uca.inl
index 1fc3480e5b5..6cf31ace11a 100644
--- a/strings/ctype-uca.inl
+++ b/strings/ctype-uca.inl
@@ -36,6 +36,7 @@
#error MY_UCA_COLL_INIT is not defined
#endif
+
#include "ctype-uca-scanner_next.inl"
#define SCANNER_NEXT_NCHARS
#include "ctype-uca-scanner_next.inl"
diff --git a/strings/ctype-ujis.c b/strings/ctype-ujis.c
index fb0ab7be6a6..adcd4825d88 100644
--- a/strings/ctype-ujis.c
+++ b/strings/ctype-ujis.c
@@ -211,6 +211,7 @@ static const uchar sort_order_ujis[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -220,6 +221,7 @@ static const uchar sort_order_ujis[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -230,6 +232,7 @@ static const uchar sort_order_ujis[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -240,6 +243,7 @@ static const uchar sort_order_ujis[]=
#define WEIGHT_MB2(x,y) ((((uint) (uchar)(x)) << 16) | \
(((uint) (uchar) (y)) << 8))
#define WEIGHT_MB3(x,y,z) (WEIGHT_MB2(x,y) | ((uint) (uchar) z))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 742eeb912e3..611684ff706 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1036,6 +1036,268 @@ static MY_UNICASE_CHARACTER plane05[]={
{0x05FE,0x05FE,0x05FE}, {0x05FF,0x05FF,0x05FF}
};
+static MY_UNICASE_CHARACTER plane06[]={ /* Dummy page for U+0600..U+06FF: all three fields of every entry equal the code point itself */
+ {0x0600,0x0600,0x0600}, {0x0601,0x0601,0x0601}, /* 0600 */
+ {0x0602,0x0602,0x0602}, {0x0603,0x0603,0x0603}, /* 0602 */
+ {0x0604,0x0604,0x0604}, {0x0605,0x0605,0x0605}, /* 0604 */
+ {0x0606,0x0606,0x0606}, {0x0607,0x0607,0x0607}, /* 0606 */
+ {0x0608,0x0608,0x0608}, {0x0609,0x0609,0x0609}, /* 0608 */
+ {0x060A,0x060A,0x060A}, {0x060B,0x060B,0x060B}, /* 060A */
+ {0x060C,0x060C,0x060C}, {0x060D,0x060D,0x060D}, /* 060C */
+ {0x060E,0x060E,0x060E}, {0x060F,0x060F,0x060F}, /* 060E */
+ {0x0610,0x0610,0x0610}, {0x0611,0x0611,0x0611}, /* 0610 */
+ {0x0612,0x0612,0x0612}, {0x0613,0x0613,0x0613}, /* 0612 */
+ {0x0614,0x0614,0x0614}, {0x0615,0x0615,0x0615}, /* 0614 */
+ {0x0616,0x0616,0x0616}, {0x0617,0x0617,0x0617}, /* 0616 */
+ {0x0618,0x0618,0x0618}, {0x0619,0x0619,0x0619}, /* 0618 */
+ {0x061A,0x061A,0x061A}, {0x061B,0x061B,0x061B}, /* 061A */
+ {0x061C,0x061C,0x061C}, {0x061D,0x061D,0x061D}, /* 061C */
+ {0x061E,0x061E,0x061E}, {0x061F,0x061F,0x061F}, /* 061E */
+ {0x0620,0x0620,0x0620}, {0x0621,0x0621,0x0621}, /* 0620 */
+ {0x0622,0x0622,0x0622}, {0x0623,0x0623,0x0623}, /* 0622 */
+ {0x0624,0x0624,0x0624}, {0x0625,0x0625,0x0625}, /* 0624 */
+ {0x0626,0x0626,0x0626}, {0x0627,0x0627,0x0627}, /* 0626 */
+ {0x0628,0x0628,0x0628}, {0x0629,0x0629,0x0629}, /* 0628 */
+ {0x062A,0x062A,0x062A}, {0x062B,0x062B,0x062B}, /* 062A */
+ {0x062C,0x062C,0x062C}, {0x062D,0x062D,0x062D}, /* 062C */
+ {0x062E,0x062E,0x062E}, {0x062F,0x062F,0x062F}, /* 062E */
+ {0x0630,0x0630,0x0630}, {0x0631,0x0631,0x0631}, /* 0630 */
+ {0x0632,0x0632,0x0632}, {0x0633,0x0633,0x0633}, /* 0632 */
+ {0x0634,0x0634,0x0634}, {0x0635,0x0635,0x0635}, /* 0634 */
+ {0x0636,0x0636,0x0636}, {0x0637,0x0637,0x0637}, /* 0636 */
+ {0x0638,0x0638,0x0638}, {0x0639,0x0639,0x0639}, /* 0638 */
+ {0x063A,0x063A,0x063A}, {0x063B,0x063B,0x063B}, /* 063A */
+ {0x063C,0x063C,0x063C}, {0x063D,0x063D,0x063D}, /* 063C */
+ {0x063E,0x063E,0x063E}, {0x063F,0x063F,0x063F}, /* 063E */
+ {0x0640,0x0640,0x0640}, {0x0641,0x0641,0x0641}, /* 0640 */
+ {0x0642,0x0642,0x0642}, {0x0643,0x0643,0x0643}, /* 0642 */
+ {0x0644,0x0644,0x0644}, {0x0645,0x0645,0x0645}, /* 0644 */
+ {0x0646,0x0646,0x0646}, {0x0647,0x0647,0x0647}, /* 0646 */
+ {0x0648,0x0648,0x0648}, {0x0649,0x0649,0x0649}, /* 0648 */
+ {0x064A,0x064A,0x064A}, {0x064B,0x064B,0x064B}, /* 064A */
+ {0x064C,0x064C,0x064C}, {0x064D,0x064D,0x064D}, /* 064C */
+ {0x064E,0x064E,0x064E}, {0x064F,0x064F,0x064F}, /* 064E */
+ {0x0650,0x0650,0x0650}, {0x0651,0x0651,0x0651}, /* 0650 */
+ {0x0652,0x0652,0x0652}, {0x0653,0x0653,0x0653}, /* 0652 */
+ {0x0654,0x0654,0x0654}, {0x0655,0x0655,0x0655}, /* 0654 */
+ {0x0656,0x0656,0x0656}, {0x0657,0x0657,0x0657}, /* 0656 */
+ {0x0658,0x0658,0x0658}, {0x0659,0x0659,0x0659}, /* 0658 */
+ {0x065A,0x065A,0x065A}, {0x065B,0x065B,0x065B}, /* 065A */
+ {0x065C,0x065C,0x065C}, {0x065D,0x065D,0x065D}, /* 065C */
+ {0x065E,0x065E,0x065E}, {0x065F,0x065F,0x065F}, /* 065E */
+ {0x0660,0x0660,0x0660}, {0x0661,0x0661,0x0661}, /* 0660 */
+ {0x0662,0x0662,0x0662}, {0x0663,0x0663,0x0663}, /* 0662 */
+ {0x0664,0x0664,0x0664}, {0x0665,0x0665,0x0665}, /* 0664 */
+ {0x0666,0x0666,0x0666}, {0x0667,0x0667,0x0667}, /* 0666 */
+ {0x0668,0x0668,0x0668}, {0x0669,0x0669,0x0669}, /* 0668 */
+ {0x066A,0x066A,0x066A}, {0x066B,0x066B,0x066B}, /* 066A */
+ {0x066C,0x066C,0x066C}, {0x066D,0x066D,0x066D}, /* 066C */
+ {0x066E,0x066E,0x066E}, {0x066F,0x066F,0x066F}, /* 066E */
+ {0x0670,0x0670,0x0670}, {0x0671,0x0671,0x0671}, /* 0670 */
+ {0x0672,0x0672,0x0672}, {0x0673,0x0673,0x0673}, /* 0672 */
+ {0x0674,0x0674,0x0674}, {0x0675,0x0675,0x0675}, /* 0674 */
+ {0x0676,0x0676,0x0676}, {0x0677,0x0677,0x0677}, /* 0676 */
+ {0x0678,0x0678,0x0678}, {0x0679,0x0679,0x0679}, /* 0678 */
+ {0x067A,0x067A,0x067A}, {0x067B,0x067B,0x067B}, /* 067A */
+ {0x067C,0x067C,0x067C}, {0x067D,0x067D,0x067D}, /* 067C */
+ {0x067E,0x067E,0x067E}, {0x067F,0x067F,0x067F}, /* 067E */
+ {0x0680,0x0680,0x0680}, {0x0681,0x0681,0x0681}, /* 0680 */
+ {0x0682,0x0682,0x0682}, {0x0683,0x0683,0x0683}, /* 0682 */
+ {0x0684,0x0684,0x0684}, {0x0685,0x0685,0x0685}, /* 0684 */
+ {0x0686,0x0686,0x0686}, {0x0687,0x0687,0x0687}, /* 0686 */
+ {0x0688,0x0688,0x0688}, {0x0689,0x0689,0x0689}, /* 0688 */
+ {0x068A,0x068A,0x068A}, {0x068B,0x068B,0x068B}, /* 068A */
+ {0x068C,0x068C,0x068C}, {0x068D,0x068D,0x068D}, /* 068C */
+ {0x068E,0x068E,0x068E}, {0x068F,0x068F,0x068F}, /* 068E */
+ {0x0690,0x0690,0x0690}, {0x0691,0x0691,0x0691}, /* 0690 */
+ {0x0692,0x0692,0x0692}, {0x0693,0x0693,0x0693}, /* 0692 */
+ {0x0694,0x0694,0x0694}, {0x0695,0x0695,0x0695}, /* 0694 */
+ {0x0696,0x0696,0x0696}, {0x0697,0x0697,0x0697}, /* 0696 */
+ {0x0698,0x0698,0x0698}, {0x0699,0x0699,0x0699}, /* 0698 */
+ {0x069A,0x069A,0x069A}, {0x069B,0x069B,0x069B}, /* 069A */
+ {0x069C,0x069C,0x069C}, {0x069D,0x069D,0x069D}, /* 069C */
+ {0x069E,0x069E,0x069E}, {0x069F,0x069F,0x069F}, /* 069E */
+ {0x06A0,0x06A0,0x06A0}, {0x06A1,0x06A1,0x06A1}, /* 06A0 */
+ {0x06A2,0x06A2,0x06A2}, {0x06A3,0x06A3,0x06A3}, /* 06A2 */
+ {0x06A4,0x06A4,0x06A4}, {0x06A5,0x06A5,0x06A5}, /* 06A4 */
+ {0x06A6,0x06A6,0x06A6}, {0x06A7,0x06A7,0x06A7}, /* 06A6 */
+ {0x06A8,0x06A8,0x06A8}, {0x06A9,0x06A9,0x06A9}, /* 06A8 */
+ {0x06AA,0x06AA,0x06AA}, {0x06AB,0x06AB,0x06AB}, /* 06AA */
+ {0x06AC,0x06AC,0x06AC}, {0x06AD,0x06AD,0x06AD}, /* 06AC */
+ {0x06AE,0x06AE,0x06AE}, {0x06AF,0x06AF,0x06AF}, /* 06AE */
+ {0x06B0,0x06B0,0x06B0}, {0x06B1,0x06B1,0x06B1}, /* 06B0 */
+ {0x06B2,0x06B2,0x06B2}, {0x06B3,0x06B3,0x06B3}, /* 06B2 */
+ {0x06B4,0x06B4,0x06B4}, {0x06B5,0x06B5,0x06B5}, /* 06B4 */
+ {0x06B6,0x06B6,0x06B6}, {0x06B7,0x06B7,0x06B7}, /* 06B6 */
+ {0x06B8,0x06B8,0x06B8}, {0x06B9,0x06B9,0x06B9}, /* 06B8 */
+ {0x06BA,0x06BA,0x06BA}, {0x06BB,0x06BB,0x06BB}, /* 06BA */
+ {0x06BC,0x06BC,0x06BC}, {0x06BD,0x06BD,0x06BD}, /* 06BC */
+ {0x06BE,0x06BE,0x06BE}, {0x06BF,0x06BF,0x06BF}, /* 06BE */
+ {0x06C0,0x06C0,0x06C0}, {0x06C1,0x06C1,0x06C1}, /* 06C0 */
+ {0x06C2,0x06C2,0x06C2}, {0x06C3,0x06C3,0x06C3}, /* 06C2 */
+ {0x06C4,0x06C4,0x06C4}, {0x06C5,0x06C5,0x06C5}, /* 06C4 */
+ {0x06C6,0x06C6,0x06C6}, {0x06C7,0x06C7,0x06C7}, /* 06C6 */
+ {0x06C8,0x06C8,0x06C8}, {0x06C9,0x06C9,0x06C9}, /* 06C8 */
+ {0x06CA,0x06CA,0x06CA}, {0x06CB,0x06CB,0x06CB}, /* 06CA */
+ {0x06CC,0x06CC,0x06CC}, {0x06CD,0x06CD,0x06CD}, /* 06CC */
+ {0x06CE,0x06CE,0x06CE}, {0x06CF,0x06CF,0x06CF}, /* 06CE */
+ {0x06D0,0x06D0,0x06D0}, {0x06D1,0x06D1,0x06D1}, /* 06D0 */
+ {0x06D2,0x06D2,0x06D2}, {0x06D3,0x06D3,0x06D3}, /* 06D2 */
+ {0x06D4,0x06D4,0x06D4}, {0x06D5,0x06D5,0x06D5}, /* 06D4 */
+ {0x06D6,0x06D6,0x06D6}, {0x06D7,0x06D7,0x06D7}, /* 06D6 */
+ {0x06D8,0x06D8,0x06D8}, {0x06D9,0x06D9,0x06D9}, /* 06D8 */
+ {0x06DA,0x06DA,0x06DA}, {0x06DB,0x06DB,0x06DB}, /* 06DA */
+ {0x06DC,0x06DC,0x06DC}, {0x06DD,0x06DD,0x06DD}, /* 06DC */
+ {0x06DE,0x06DE,0x06DE}, {0x06DF,0x06DF,0x06DF}, /* 06DE */
+ {0x06E0,0x06E0,0x06E0}, {0x06E1,0x06E1,0x06E1}, /* 06E0 */
+ {0x06E2,0x06E2,0x06E2}, {0x06E3,0x06E3,0x06E3}, /* 06E2 */
+ {0x06E4,0x06E4,0x06E4}, {0x06E5,0x06E5,0x06E5}, /* 06E4 */
+ {0x06E6,0x06E6,0x06E6}, {0x06E7,0x06E7,0x06E7}, /* 06E6 */
+ {0x06E8,0x06E8,0x06E8}, {0x06E9,0x06E9,0x06E9}, /* 06E8 */
+ {0x06EA,0x06EA,0x06EA}, {0x06EB,0x06EB,0x06EB}, /* 06EA */
+ {0x06EC,0x06EC,0x06EC}, {0x06ED,0x06ED,0x06ED}, /* 06EC */
+ {0x06EE,0x06EE,0x06EE}, {0x06EF,0x06EF,0x06EF}, /* 06EE */
+ {0x06F0,0x06F0,0x06F0}, {0x06F1,0x06F1,0x06F1}, /* 06F0 */
+ {0x06F2,0x06F2,0x06F2}, {0x06F3,0x06F3,0x06F3}, /* 06F2 */
+ {0x06F4,0x06F4,0x06F4}, {0x06F5,0x06F5,0x06F5}, /* 06F4 */
+ {0x06F6,0x06F6,0x06F6}, {0x06F7,0x06F7,0x06F7}, /* 06F6 */
+ {0x06F8,0x06F8,0x06F8}, {0x06F9,0x06F9,0x06F9}, /* 06F8 */
+ {0x06FA,0x06FA,0x06FA}, {0x06FB,0x06FB,0x06FB}, /* 06FA */
+ {0x06FC,0x06FC,0x06FC}, {0x06FD,0x06FD,0x06FD}, /* 06FC */
+ {0x06FE,0x06FE,0x06FE}, {0x06FF,0x06FF,0x06FF} /* 06FE */
+};
+
+static MY_UNICASE_CHARACTER plane07[]={ /* Dummy page for U+0700..U+07FF: all three fields of every entry equal the code point itself */
+ {0x0700,0x0700,0x0700}, {0x0701,0x0701,0x0701}, /* 0700 */
+ {0x0702,0x0702,0x0702}, {0x0703,0x0703,0x0703}, /* 0702 */
+ {0x0704,0x0704,0x0704}, {0x0705,0x0705,0x0705}, /* 0704 */
+ {0x0706,0x0706,0x0706}, {0x0707,0x0707,0x0707}, /* 0706 */
+ {0x0708,0x0708,0x0708}, {0x0709,0x0709,0x0709}, /* 0708 */
+ {0x070A,0x070A,0x070A}, {0x070B,0x070B,0x070B}, /* 070A */
+ {0x070C,0x070C,0x070C}, {0x070D,0x070D,0x070D}, /* 070C */
+ {0x070E,0x070E,0x070E}, {0x070F,0x070F,0x070F}, /* 070E */
+ {0x0710,0x0710,0x0710}, {0x0711,0x0711,0x0711}, /* 0710 */
+ {0x0712,0x0712,0x0712}, {0x0713,0x0713,0x0713}, /* 0712 */
+ {0x0714,0x0714,0x0714}, {0x0715,0x0715,0x0715}, /* 0714 */
+ {0x0716,0x0716,0x0716}, {0x0717,0x0717,0x0717}, /* 0716 */
+ {0x0718,0x0718,0x0718}, {0x0719,0x0719,0x0719}, /* 0718 */
+ {0x071A,0x071A,0x071A}, {0x071B,0x071B,0x071B}, /* 071A */
+ {0x071C,0x071C,0x071C}, {0x071D,0x071D,0x071D}, /* 071C */
+ {0x071E,0x071E,0x071E}, {0x071F,0x071F,0x071F}, /* 071E */
+ {0x0720,0x0720,0x0720}, {0x0721,0x0721,0x0721}, /* 0720 */
+ {0x0722,0x0722,0x0722}, {0x0723,0x0723,0x0723}, /* 0722 */
+ {0x0724,0x0724,0x0724}, {0x0725,0x0725,0x0725}, /* 0724 */
+ {0x0726,0x0726,0x0726}, {0x0727,0x0727,0x0727}, /* 0726 */
+ {0x0728,0x0728,0x0728}, {0x0729,0x0729,0x0729}, /* 0728 */
+ {0x072A,0x072A,0x072A}, {0x072B,0x072B,0x072B}, /* 072A */
+ {0x072C,0x072C,0x072C}, {0x072D,0x072D,0x072D}, /* 072C */
+ {0x072E,0x072E,0x072E}, {0x072F,0x072F,0x072F}, /* 072E */
+ {0x0730,0x0730,0x0730}, {0x0731,0x0731,0x0731}, /* 0730 */
+ {0x0732,0x0732,0x0732}, {0x0733,0x0733,0x0733}, /* 0732 */
+ {0x0734,0x0734,0x0734}, {0x0735,0x0735,0x0735}, /* 0734 */
+ {0x0736,0x0736,0x0736}, {0x0737,0x0737,0x0737}, /* 0736 */
+ {0x0738,0x0738,0x0738}, {0x0739,0x0739,0x0739}, /* 0738 */
+ {0x073A,0x073A,0x073A}, {0x073B,0x073B,0x073B}, /* 073A */
+ {0x073C,0x073C,0x073C}, {0x073D,0x073D,0x073D}, /* 073C */
+ {0x073E,0x073E,0x073E}, {0x073F,0x073F,0x073F}, /* 073E */
+ {0x0740,0x0740,0x0740}, {0x0741,0x0741,0x0741}, /* 0740 */
+ {0x0742,0x0742,0x0742}, {0x0743,0x0743,0x0743}, /* 0742 */
+ {0x0744,0x0744,0x0744}, {0x0745,0x0745,0x0745}, /* 0744 */
+ {0x0746,0x0746,0x0746}, {0x0747,0x0747,0x0747}, /* 0746 */
+ {0x0748,0x0748,0x0748}, {0x0749,0x0749,0x0749}, /* 0748 */
+ {0x074A,0x074A,0x074A}, {0x074B,0x074B,0x074B}, /* 074A */
+ {0x074C,0x074C,0x074C}, {0x074D,0x074D,0x074D}, /* 074C */
+ {0x074E,0x074E,0x074E}, {0x074F,0x074F,0x074F}, /* 074E */
+ {0x0750,0x0750,0x0750}, {0x0751,0x0751,0x0751}, /* 0750 */
+ {0x0752,0x0752,0x0752}, {0x0753,0x0753,0x0753}, /* 0752 */
+ {0x0754,0x0754,0x0754}, {0x0755,0x0755,0x0755}, /* 0754 */
+ {0x0756,0x0756,0x0756}, {0x0757,0x0757,0x0757}, /* 0756 */
+ {0x0758,0x0758,0x0758}, {0x0759,0x0759,0x0759}, /* 0758 */
+ {0x075A,0x075A,0x075A}, {0x075B,0x075B,0x075B}, /* 075A */
+ {0x075C,0x075C,0x075C}, {0x075D,0x075D,0x075D}, /* 075C */
+ {0x075E,0x075E,0x075E}, {0x075F,0x075F,0x075F}, /* 075E */
+ {0x0760,0x0760,0x0760}, {0x0761,0x0761,0x0761}, /* 0760 */
+ {0x0762,0x0762,0x0762}, {0x0763,0x0763,0x0763}, /* 0762 */
+ {0x0764,0x0764,0x0764}, {0x0765,0x0765,0x0765}, /* 0764 */
+ {0x0766,0x0766,0x0766}, {0x0767,0x0767,0x0767}, /* 0766 */
+ {0x0768,0x0768,0x0768}, {0x0769,0x0769,0x0769}, /* 0768 */
+ {0x076A,0x076A,0x076A}, {0x076B,0x076B,0x076B}, /* 076A */
+ {0x076C,0x076C,0x076C}, {0x076D,0x076D,0x076D}, /* 076C */
+ {0x076E,0x076E,0x076E}, {0x076F,0x076F,0x076F}, /* 076E */
+ {0x0770,0x0770,0x0770}, {0x0771,0x0771,0x0771}, /* 0770 */
+ {0x0772,0x0772,0x0772}, {0x0773,0x0773,0x0773}, /* 0772 */
+ {0x0774,0x0774,0x0774}, {0x0775,0x0775,0x0775}, /* 0774 */
+ {0x0776,0x0776,0x0776}, {0x0777,0x0777,0x0777}, /* 0776 */
+ {0x0778,0x0778,0x0778}, {0x0779,0x0779,0x0779}, /* 0778 */
+ {0x077A,0x077A,0x077A}, {0x077B,0x077B,0x077B}, /* 077A */
+ {0x077C,0x077C,0x077C}, {0x077D,0x077D,0x077D}, /* 077C */
+ {0x077E,0x077E,0x077E}, {0x077F,0x077F,0x077F}, /* 077E */
+ {0x0780,0x0780,0x0780}, {0x0781,0x0781,0x0781}, /* 0780 */
+ {0x0782,0x0782,0x0782}, {0x0783,0x0783,0x0783}, /* 0782 */
+ {0x0784,0x0784,0x0784}, {0x0785,0x0785,0x0785}, /* 0784 */
+ {0x0786,0x0786,0x0786}, {0x0787,0x0787,0x0787}, /* 0786 */
+ {0x0788,0x0788,0x0788}, {0x0789,0x0789,0x0789}, /* 0788 */
+ {0x078A,0x078A,0x078A}, {0x078B,0x078B,0x078B}, /* 078A */
+ {0x078C,0x078C,0x078C}, {0x078D,0x078D,0x078D}, /* 078C */
+ {0x078E,0x078E,0x078E}, {0x078F,0x078F,0x078F}, /* 078E */
+ {0x0790,0x0790,0x0790}, {0x0791,0x0791,0x0791}, /* 0790 */
+ {0x0792,0x0792,0x0792}, {0x0793,0x0793,0x0793}, /* 0792 */
+ {0x0794,0x0794,0x0794}, {0x0795,0x0795,0x0795}, /* 0794 */
+ {0x0796,0x0796,0x0796}, {0x0797,0x0797,0x0797}, /* 0796 */
+ {0x0798,0x0798,0x0798}, {0x0799,0x0799,0x0799}, /* 0798 */
+ {0x079A,0x079A,0x079A}, {0x079B,0x079B,0x079B}, /* 079A */
+ {0x079C,0x079C,0x079C}, {0x079D,0x079D,0x079D}, /* 079C */
+ {0x079E,0x079E,0x079E}, {0x079F,0x079F,0x079F}, /* 079E */
+ {0x07A0,0x07A0,0x07A0}, {0x07A1,0x07A1,0x07A1}, /* 07A0 */
+ {0x07A2,0x07A2,0x07A2}, {0x07A3,0x07A3,0x07A3}, /* 07A2 */
+ {0x07A4,0x07A4,0x07A4}, {0x07A5,0x07A5,0x07A5}, /* 07A4 */
+ {0x07A6,0x07A6,0x07A6}, {0x07A7,0x07A7,0x07A7}, /* 07A6 */
+ {0x07A8,0x07A8,0x07A8}, {0x07A9,0x07A9,0x07A9}, /* 07A8 */
+ {0x07AA,0x07AA,0x07AA}, {0x07AB,0x07AB,0x07AB}, /* 07AA */
+ {0x07AC,0x07AC,0x07AC}, {0x07AD,0x07AD,0x07AD}, /* 07AC */
+ {0x07AE,0x07AE,0x07AE}, {0x07AF,0x07AF,0x07AF}, /* 07AE */
+ {0x07B0,0x07B0,0x07B0}, {0x07B1,0x07B1,0x07B1}, /* 07B0 */
+ {0x07B2,0x07B2,0x07B2}, {0x07B3,0x07B3,0x07B3}, /* 07B2 */
+ {0x07B4,0x07B4,0x07B4}, {0x07B5,0x07B5,0x07B5}, /* 07B4 */
+ {0x07B6,0x07B6,0x07B6}, {0x07B7,0x07B7,0x07B7}, /* 07B6 */
+ {0x07B8,0x07B8,0x07B8}, {0x07B9,0x07B9,0x07B9}, /* 07B8 */
+ {0x07BA,0x07BA,0x07BA}, {0x07BB,0x07BB,0x07BB}, /* 07BA */
+ {0x07BC,0x07BC,0x07BC}, {0x07BD,0x07BD,0x07BD}, /* 07BC */
+ {0x07BE,0x07BE,0x07BE}, {0x07BF,0x07BF,0x07BF}, /* 07BE */
+ {0x07C0,0x07C0,0x07C0}, {0x07C1,0x07C1,0x07C1}, /* 07C0 */
+ {0x07C2,0x07C2,0x07C2}, {0x07C3,0x07C3,0x07C3}, /* 07C2 */
+ {0x07C4,0x07C4,0x07C4}, {0x07C5,0x07C5,0x07C5}, /* 07C4 */
+ {0x07C6,0x07C6,0x07C6}, {0x07C7,0x07C7,0x07C7}, /* 07C6 */
+ {0x07C8,0x07C8,0x07C8}, {0x07C9,0x07C9,0x07C9}, /* 07C8 */
+ {0x07CA,0x07CA,0x07CA}, {0x07CB,0x07CB,0x07CB}, /* 07CA */
+ {0x07CC,0x07CC,0x07CC}, {0x07CD,0x07CD,0x07CD}, /* 07CC */
+ {0x07CE,0x07CE,0x07CE}, {0x07CF,0x07CF,0x07CF}, /* 07CE */
+ {0x07D0,0x07D0,0x07D0}, {0x07D1,0x07D1,0x07D1}, /* 07D0 */
+ {0x07D2,0x07D2,0x07D2}, {0x07D3,0x07D3,0x07D3}, /* 07D2 */
+ {0x07D4,0x07D4,0x07D4}, {0x07D5,0x07D5,0x07D5}, /* 07D4 */
+ {0x07D6,0x07D6,0x07D6}, {0x07D7,0x07D7,0x07D7}, /* 07D6 */
+ {0x07D8,0x07D8,0x07D8}, {0x07D9,0x07D9,0x07D9}, /* 07D8 */
+ {0x07DA,0x07DA,0x07DA}, {0x07DB,0x07DB,0x07DB}, /* 07DA */
+ {0x07DC,0x07DC,0x07DC}, {0x07DD,0x07DD,0x07DD}, /* 07DC */
+ {0x07DE,0x07DE,0x07DE}, {0x07DF,0x07DF,0x07DF}, /* 07DE */
+ {0x07E0,0x07E0,0x07E0}, {0x07E1,0x07E1,0x07E1}, /* 07E0 */
+ {0x07E2,0x07E2,0x07E2}, {0x07E3,0x07E3,0x07E3}, /* 07E2 */
+ {0x07E4,0x07E4,0x07E4}, {0x07E5,0x07E5,0x07E5}, /* 07E4 */
+ {0x07E6,0x07E6,0x07E6}, {0x07E7,0x07E7,0x07E7}, /* 07E6 */
+ {0x07E8,0x07E8,0x07E8}, {0x07E9,0x07E9,0x07E9}, /* 07E8 */
+ {0x07EA,0x07EA,0x07EA}, {0x07EB,0x07EB,0x07EB}, /* 07EA */
+ {0x07EC,0x07EC,0x07EC}, {0x07ED,0x07ED,0x07ED}, /* 07EC */
+ {0x07EE,0x07EE,0x07EE}, {0x07EF,0x07EF,0x07EF}, /* 07EE */
+ {0x07F0,0x07F0,0x07F0}, {0x07F1,0x07F1,0x07F1}, /* 07F0 */
+ {0x07F2,0x07F2,0x07F2}, {0x07F3,0x07F3,0x07F3}, /* 07F2 */
+ {0x07F4,0x07F4,0x07F4}, {0x07F5,0x07F5,0x07F5}, /* 07F4 */
+ {0x07F6,0x07F6,0x07F6}, {0x07F7,0x07F7,0x07F7}, /* 07F6 */
+ {0x07F8,0x07F8,0x07F8}, {0x07F9,0x07F9,0x07F9}, /* 07F8 */
+ {0x07FA,0x07FA,0x07FA}, {0x07FB,0x07FB,0x07FB}, /* 07FA */
+ {0x07FC,0x07FC,0x07FC}, {0x07FD,0x07FD,0x07FD}, /* 07FC */
+ {0x07FE,0x07FE,0x07FE}, {0x07FF,0x07FF,0x07FF} /* 07FE */
+};
+
static MY_UNICASE_CHARACTER plane1E[]={
{0x1E00,0x1E01,0x0041}, {0x1E00,0x1E01,0x0041},
{0x1E02,0x1E03,0x0042}, {0x1E02,0x1E03,0x0042},
@@ -1695,7 +1957,7 @@ static MY_UNICASE_CHARACTER planeFF[]={
MY_UNICASE_CHARACTER *my_unicase_default_pages[256]=
{
my_unicase_default_page00,
- plane01, plane02, plane03, plane04, plane05, NULL, NULL,
+ plane01, plane02, plane03, plane04, plane05, plane06, plane07,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F,
@@ -1742,7 +2004,7 @@ MY_UNICASE_INFO my_unicase_default=
*/
MY_UNICASE_CHARACTER *my_unicase_pages_mysql500[256]={
plane00_mysql500,
- plane01, plane02, plane03, plane04, plane05, NULL, NULL,
+ plane01, plane02, plane03, plane04, plane05, plane06, plane07,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F,
@@ -1929,7 +2191,7 @@ static MY_UNICASE_CHARACTER turk00[]=
static MY_UNICASE_CHARACTER *my_unicase_pages_turkish[256]=
{
- turk00, plane01, plane02, plane03, plane04, plane05, NULL, NULL,
+ turk00, plane01, plane02, plane03, plane04, plane05, plane06, plane07,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, plane1E, plane1F,
@@ -4333,7 +4595,7 @@ static MY_UNICASE_CHARACTER u520p104[]={
MY_UNICASE_CHARACTER *my_unicase_pages_unicode520[4352]=
{
- u520p00, u520p01, u520p02, u520p03, u520p04, u520p05, NULL, NULL,
+ u520p00, u520p01, u520p02, u520p03, u520p04, u520p05, plane06, plane07,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
u520p10, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, u520p1D, u520p1E, u520p1F,
@@ -5231,7 +5493,17 @@ static inline int my_weight_mb2_utf8mb3_general_ci(uchar b0, uchar b1)
{
my_wc_t wc= UTF8MB2_CODE(b0, b1);
MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8];
- return (int) (page ? page[wc & 0xFF].sort : wc);
+ /*
+ 2-byte utf8 sequences encode Unicode characters up to U+07FF.
+ my_unicase_default_pages[N] has non-NULL page pointers
+ for all N in the range [0..7].
+ - my_unicase_default_pages[0..5] point to real translation data
+ - my_unicase_default_pages[6..7] point to dummy pages
+ (without real translation).
+ By adding these dummy pages we can avoid testing 'page' against NULL.
+ This gives up to 20% performance improvement.
+ */
+ return (int) page[wc & 0xFF].sort;
}
@@ -5255,6 +5527,7 @@ static inline int my_weight_mb3_utf8mb3_general_ci(uchar b0, uchar b1, uchar b2)
#define WEIGHT_MB1(x) my_weight_mb1_utf8mb3_general_ci(x)
#define WEIGHT_MB2(x,y) my_weight_mb2_utf8mb3_general_ci(x,y)
#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8mb3_general_ci(x,y,z)
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -5264,6 +5537,7 @@ static inline int my_weight_mb3_utf8mb3_general_ci(uchar b0, uchar b1, uchar b2)
#define WEIGHT_MB1(x) my_weight_mb1_utf8mb3_general_ci(x)
#define WEIGHT_MB2(x,y) my_weight_mb2_utf8mb3_general_ci(x,y)
#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8mb3_general_ci(x,y,z)
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -5277,7 +5551,11 @@ static inline int my_weight_mb2_utf8mb3_general_mysql500_ci(uchar b0, uchar b1)
{
my_wc_t wc= UTF8MB2_CODE(b0, b1);
MY_UNICASE_CHARACTER *page= my_unicase_pages_mysql500[wc >> 8];
- return (int) (page ? page[wc & 0xFF].sort : wc);
+ /*
+ `page` should never be NULL for 2-byte utf8 characters.
+ See comments in my_weight_mb2_utf8mb3_general_ci().
+ */
+ return (int) page[wc & 0xFF].sort;
}
@@ -5301,6 +5579,7 @@ my_weight_mb3_utf8mb3_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
#define WEIGHT_MB1(x) my_weight_mb1_utf8mb3_general_mysql500_ci(x)
#define WEIGHT_MB2(x,y) my_weight_mb2_utf8mb3_general_mysql500_ci(x,y)
#define WEIGHT_MB3(x,y,z) my_weight_mb3_utf8mb3_general_mysql500_ci(x,y,z)
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -5312,6 +5591,7 @@ my_weight_mb3_utf8mb3_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
#define WEIGHT_MB1(x) ((int) (uchar) (x))
#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y))
#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -5321,6 +5601,7 @@ my_weight_mb3_utf8mb3_general_mysql500_ci(uchar b0, uchar b1, uchar b2)
#define WEIGHT_MB1(x) ((int) (uchar) (x))
#define WEIGHT_MB2(x,y) ((int) UTF8MB2_CODE(x,y))
#define WEIGHT_MB3(x,y,z) ((int) UTF8MB3_CODE(x,y,z))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
/*
@@ -7692,6 +7973,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
All non-BMP characters have the same weight.
*/
#define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -7701,6 +7983,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1))
#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2))
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
@@ -7715,6 +7998,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
All non-BMP characters have the same weight.
*/
#define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
+#define STRCOLL_MB7_TOUPPER
#include "strcoll.inl"
@@ -7725,6 +8009,7 @@ my_charlen_utf8mb4(CHARSET_INFO *cs __attribute__((unused)),
#define WEIGHT_MB2(b0,b1) ((int) UTF8MB2_CODE(b0,b1))
#define WEIGHT_MB3(b0,b1,b2) ((int) UTF8MB3_CODE(b0,b1,b2))
#define WEIGHT_MB4(b0,b1,b2,b3) ((int) UTF8MB4_CODE(b0,b1,b2,b3))
+#define STRCOLL_MB7_BIN
#include "strcoll.inl"
diff --git a/strings/json_lib.c b/strings/json_lib.c
index 6b2a6416952..6898e9741a8 100644
--- a/strings/json_lib.c
+++ b/strings/json_lib.c
@@ -951,7 +951,7 @@ int json_read_value(json_engine_t *j)
{
int t_next, c_len, res;
- j->value_type= JSON_VALUE_UNINITALIZED;
+ j->value_type= JSON_VALUE_UNINITIALIZED;
if (j->state == JST_KEY)
{
while (json_read_keyname_chr(j) == 0) {}
@@ -1640,7 +1640,7 @@ int json_escape(CHARSET_INFO *str_cs,
if (c_len < 0)
{
/* JSON buffer is depleted. */
- return -1;
+ return JSON_ERROR_OUT_OF_SPACE;
}
/* JSON charset cannot convert this character. */
@@ -1652,7 +1652,7 @@ int json_escape(CHARSET_INFO *str_cs,
json+= c_len, json_end)) <= 0)
{
/* JSON buffer is depleted. */
- return -1;
+ return JSON_ERROR_OUT_OF_SPACE;
}
json+= c_len;
@@ -1685,11 +1685,11 @@ int json_escape(CHARSET_INFO *str_cs,
continue;
}
/* JSON buffer is depleted. */
- return -1;
+ return JSON_ERROR_OUT_OF_SPACE;
}
}
else /* c_len == 0, an illegal symbol. */
- return -1;
+ return JSON_ERROR_ILLEGAL_SYMBOL;
}
return (int)(json - json_start);
diff --git a/strings/json_normalize.c b/strings/json_normalize.c
new file mode 100644
index 00000000000..0b7f172dae6
--- /dev/null
+++ b/strings/json_normalize.c
@@ -0,0 +1,852 @@
+/* Copyright (c) 2021 Eric Herman and MariaDB Foundation.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
+
+#include <my_global.h>
+#include <json_lib.h>
+
+#ifndef PSI_JSON
+#define PSI_JSON PSI_NOT_INSTRUMENTED
+#endif
+
+#ifndef JSON_MALLOC_FLAGS
+#define JSON_MALLOC_FLAGS MYF(MY_THREAD_SPECIFIC|MY_WME)
+#endif
+
+/*
+From the EXPIRED DRAFT JSON Canonical Form
+https://datatracker.ietf.org/doc/html/draft-staykov-hu-json-canonical-form-00
+
+2. JSON canonical form
+
+ The canonical form is defined by the following rules:
+ * The document MUST be encoded in UTF-8 [UTF-8]
+ * Non-significant(1) whitespace characters MUST NOT be used
+ * Non-significant(1) line endings MUST NOT be used
+ * Entries (set of name/value pairs) in JSON objects MUST be sorted
+ lexicographically(2) by their names
+ * Arrays MUST preserve their initial ordering
+
+ (1)As defined in JSON data-interchange format [JSON], JSON objects
+ consists of multiple "name"/"value" pairs and JSON arrays consists
+ of multiple "value" fields. Non-significant means not part of
+ "name" or "value".
+
+
+ (2)Lexicographic comparison, which orders strings from least to
+ greatest alphabetically based on the UCS (Unicode Character Set)
+ codepoint values.
+*/
+
+
+struct json_norm_array { /* JSON array node; json_normalize_sort() never reorders its elements */
+ DYNAMIC_ARRAY values; /* elements are struct json_norm_value (sized in json_norm_value_array_init) */
+};
+
+
+struct json_norm_object { /* JSON object node; json_normalize_sort() orders its members by key */
+ DYNAMIC_ARRAY kv_pairs; /* elements are struct json_norm_kv (sized in json_norm_value_object_init) */
+};
+
+
+struct json_norm_value { /* One JSON value: 'type' selects the active member of 'value' */
+ enum json_value_types type; /* reset to JSON_VALUE_UNINITIALIZED by json_norm_value_free() */
+ union {
+ DYNAMIC_STRING number; /* used when type == JSON_VALUE_NUMBER */
+ LEX_STRING string; /* used when type == JSON_VALUE_STRING */
+ struct json_norm_array array; /* used when type == JSON_VALUE_ARRAY */
+ struct json_norm_object object; /* used when type == JSON_VALUE_OBJECT */
+ } value; /* null/true/false own no payload: json_norm_value_free() frees nothing for them */
+};
+
+
+struct json_norm_kv { /* One name/value member of a JSON object */
+ LEX_STRING key; /* heap copy made by json_norm_string_init(); its length counts the trailing NUL */
+ struct json_norm_value value; /* the member's value, stored by value */
+};
+
+
+/*
+  Allocate 'size' bytes for the JSON normalizer.
+
+  Single place that fixes the instrumentation key (PSI_JSON) and the
+  allocation flags (JSON_MALLOC_FLAGS) used by this file.
+  Returns whatever my_malloc() returns.
+*/
+static void *
+json_norm_malloc(size_t size)
+{
+  void *block= my_malloc(PSI_JSON, size, JSON_MALLOC_FLAGS);
+  return block;
+}
+
+
+/*
+  Fill 'string' with a freshly allocated, NUL-terminated copy of the
+  first 'len' bytes of 'str'.
+
+  On success string->length is len + 1, i.e. it counts the terminator.
+  On allocation failure 'string' is left as {NULL, 0}.
+
+  Returns 0 on success, 1 on allocation failure.
+*/
+int
+json_norm_string_init(LEX_STRING *string, const char *str, size_t len)
+{
+  const size_t alloc_size= len + 1;
+  char *copy= json_norm_malloc(alloc_size);
+
+  if (copy == NULL)
+  {
+    string->str= NULL;
+    string->length= 0;
+    return 1;
+  }
+
+  strncpy(copy, str, len);
+  copy[len]= 0;
+
+  string->str= copy;
+  string->length= alloc_size;
+  return 0;
+}
+
+
+/*
+  Release the buffer held by 'string' and leave it as {NULL, 0}.
+*/
+void
+json_norm_string_free(LEX_STRING *string)
+{
+  char *owned= string->str;
+
+  string->length= 0;
+  string->str= NULL;
+  my_free(owned);
+}
+
+
+/*
+  Release the text of a normalized number: hand the dynamic string to
+  dynstr_free(), then zero its recorded length.
+*/
+void
+json_norm_number_free(DYNAMIC_STRING *number)
+{
+  DYNAMIC_STRING *text= number;
+
+  dynstr_free(text);
+  text->length= 0;
+}
+
+
+/*
+  Append the canonical form of a JSON number to 'out'.
+
+  The canonical form is scientific notation with exactly one digit
+  before the decimal point, at least one digit after it, and a decimal
+  exponent, e.g. "123.45" -> "1.2345E2", "-0.5" -> "-5.0E-1",
+  "100" -> "1.0E2".  Any input without a non-zero digit (including "-0",
+  "-" and the empty string) is written as "0.0E0".
+
+  @param out      destination string, appended to
+  @param str      number text; only the first str_len bytes are read,
+                  so it does not have to be NUL-terminated
+  @param str_len  number of bytes in 'str'
+
+  @return 0 on success, 1 if allocation or any append failed
+*/
+int
+json_normalize_number(DYNAMIC_STRING *out, const char *str, size_t str_len)
+{
+  int err= 0;
+  long int magnitude= 0;
+  int negative= 0;
+  size_t i= 0;
+  size_t j= 0;
+  size_t k= 0;
+  char *buf= NULL;
+  const size_t buf_size= str_len + 1;
+  char magnitude_str[32];
+
+  buf= json_norm_malloc(buf_size);
+  if (!buf)
+    return 1;
+
+  /* Zero-fill so the collected digits are always NUL-terminated. */
+  memset(buf, 0x00, buf_size);
+
+  if (str_len && str[0] == '-')
+  {
+    negative= 1;
+    ++i;
+  }
+
+  /* grab digits preceding the decimal */
+  for (; i < str_len && str[i] != '.' && str[i] != 'e' && str[i] != 'E'; ++i)
+    buf[j++]= str[i];
+
+  /* Exponent the number has if the point follows the first digit. */
+  magnitude= (long) j - 1;
+
+  /* skip the . */
+  if (i < str_len && str[i] == '.')
+    ++i;
+
+  /* grab rest of digits before the E */
+  for (; i < str_len && str[i] != 'e' && str[i] != 'E'; ++i)
+    buf[j++]= str[i];
+
+  /*
+    Trim trailing zeros but always keep the first digit.  The test is on
+    j rather than on j - 1 so that the index cannot wrap around when no
+    digit at all was collected.
+  */
+  while (j > 1 && buf[j - 1] == '0')
+    buf[--j]= '\0';
+
+  /* trim the leading zeros; each one lowers the exponent by one */
+  for (k= 0; buf[k] == '0'; ++k);
+  if (k)
+  {
+    memmove(buf, buf + k, j - k);
+    j-= k;
+    buf[j]= '\0';
+    magnitude-= (long) k;
+  }
+
+  /* No significant digit left: the value is zero. */
+  if (!j)
+  {
+    err= dynstr_append_mem(out, STRING_WITH_LEN("0.0E0"));
+    my_free(buf);
+    return err ? 1 : 0;
+  }
+
+  if (negative)
+    err|= dynstr_append_mem(out, STRING_WITH_LEN("-"));
+  err|= dynstr_append_mem(out, buf, 1);
+  err|= dynstr_append_mem(out, STRING_WITH_LEN("."));
+  if (j == 1)
+    err|= dynstr_append_mem(out, STRING_WITH_LEN("0"));
+  else
+    err|= dynstr_append_mem(out, buf + 1, j - 1);
+
+  err|= dynstr_append_mem(out, STRING_WITH_LEN("E"));
+
+  if (i < str_len && (str[i] == 'e' || str[i] == 'E'))
+  {
+    size_t exp_len;
+    /* skip the [eE] */
+    ++i;
+    /*
+      The digits in buf have already been written to 'out', so buf is
+      reused for a NUL-terminated copy of the exponent text.  strtol()
+      then cannot read past str + str_len.
+    */
+    exp_len= str_len - i;
+    memcpy(buf, str + i, exp_len);
+    buf[exp_len]= '\0';
+    /* combine the exponent with current magnitude */
+    magnitude+= strtol(buf, NULL, 10);
+  }
+  snprintf(magnitude_str, sizeof(magnitude_str), "%ld", magnitude);
+  err|= dynstr_append(out, magnitude_str);
+
+  my_free(buf);
+  return err ? 1 : 0;
+}
+
+
+/*
+  Append a (key, value) member to 'obj'.
+
+  The key text is duplicated with json_norm_string_init(); '*val' is
+  copied by value into the new member, so buffers it refers to are
+  shared with the stored copy rather than duplicated.
+
+  Returns 0 on success, 1 if the key copy or the array insert failed.
+  When the insert fails the key copy is released again.
+*/
+static int
+json_norm_object_append_key_value(struct json_norm_object *obj,
+                                  DYNAMIC_STRING *key,
+                                  struct json_norm_value *val)
+{
+  struct json_norm_kv member;
+
+  if (json_norm_string_init(&member.key, key->str, key->length))
+    return 1;
+
+  member.value= *val;
+
+  if (insert_dynamic(&obj->kv_pairs, &member))
+  {
+    json_norm_string_free(&member.key);
+    return 1;
+  }
+
+  return 0;
+}
+
+
+/*
+  Return a pointer to the most recently appended member of 'obj'.
+  The object must hold at least one member (asserted in debug builds).
+*/
+static struct json_norm_kv*
+json_norm_object_get_last_element(struct json_norm_object *obj)
+{
+  DBUG_ASSERT(obj->kv_pairs.elements > 0);
+  return dynamic_element(&obj->kv_pairs,
+                         obj->kv_pairs.elements - 1,
+                         struct json_norm_kv*);
+}
+
+
+/*
+  Return a pointer to the most recently appended element of 'arr'.
+  The array must hold at least one element (asserted in debug builds).
+*/
+static struct json_norm_value*
+json_norm_array_get_last_element(struct json_norm_array *arr)
+{
+  DBUG_ASSERT(arr->values.elements > 0);
+  return dynamic_element(&arr->values,
+                         arr->values.elements - 1,
+                         struct json_norm_value*);
+}
+
+
+/*
+  Append a by-value copy of '*val' to the end of 'arr'.
+  Returns the result of insert_dynamic().
+*/
+static int
+json_norm_array_append_value(struct json_norm_array *arr,
+                             struct json_norm_value *val)
+{
+  DYNAMIC_ARRAY *elements= &arr->values;
+
+  return insert_dynamic(elements, val);
+}
+
+
+/*
+  Initialise the DYNAMIC_ARRAY at 'where' for elements of 'element_size'
+  bytes, with room for 20 elements up front and growth in steps of 20.
+  Returns the result of my_init_dynamic_array().
+*/
+int
+json_norm_init_dynamic_array(size_t element_size, void *where)
+{
+  const size_t initial_elements= 20;
+  const size_t growth_step= 20;
+
+  return my_init_dynamic_array(PSI_JSON, where, element_size,
+                               initial_elements, growth_step,
+                               JSON_MALLOC_FLAGS);
+}
+
+
+/*
+  Turn 'val' into an empty JSON object: tag it JSON_VALUE_OBJECT and
+  set up its member array for struct json_norm_kv elements.
+  Returns the result of json_norm_init_dynamic_array().
+*/
+int
+json_norm_value_object_init(struct json_norm_value *val)
+{
+  val->type= JSON_VALUE_OBJECT;
+
+  return json_norm_init_dynamic_array(sizeof(struct json_norm_kv),
+                                      &val->value.object.kv_pairs);
+}
+
+
+/*
+  Turn 'val' into an empty JSON array: tag it JSON_VALUE_ARRAY and set
+  up its element array for struct json_norm_value elements.
+  Returns the result of json_norm_init_dynamic_array().
+*/
+int
+json_norm_value_array_init(struct json_norm_value *val)
+{
+  val->type= JSON_VALUE_ARRAY;
+
+  return json_norm_init_dynamic_array(sizeof(struct json_norm_value),
+                                      &val->value.array.values);
+}
+
+
+/*
+  Turn 'val' into a JSON string holding a copy of the first 'len' bytes
+  of 'str'.  Returns the result of json_norm_string_init().
+*/
+static int
+json_norm_value_string_init(struct json_norm_value *val,
+                            const char *str, size_t len)
+{
+  LEX_STRING *target= &val->value.string;
+
+  val->type= JSON_VALUE_STRING;
+  return json_norm_string_init(target, str, len);
+}
+
+
+/*
+  Ordering of two object members by key, using the utf8mb4 binary
+  collation.  Passed to my_qsort() by json_normalize_sort().
+  Each key is compared over its full recorded length.
+*/
+static int
+json_norm_kv_comp(const struct json_norm_kv *a,
+                  const struct json_norm_kv *b)
+{
+  const uchar *lhs= (const uchar *) a->key.str;
+  const uchar *rhs= (const uchar *) b->key.str;
+
+  return my_strnncoll(&my_charset_utf8mb4_bin,
+                      lhs, a->key.length,
+                      rhs, b->key.length);
+}
+
+
+/*
+  Recursively put every object inside 'val' into key order.
+
+  Objects: each member value is processed first, then the members are
+  sorted with json_norm_kv_comp().  Arrays: element order is kept and
+  only the elements themselves are processed.  Other types are left
+  untouched; an uninitialized value trips a debug assertion.
+*/
+static void
+json_normalize_sort(struct json_norm_value *val)
+{
+  size_t idx;
+
+  if (val->type == JSON_VALUE_OBJECT)
+  {
+    DYNAMIC_ARRAY *members= &val->value.object.kv_pairs;
+    struct json_norm_kv *first= dynamic_element(members, 0,
+                                                struct json_norm_kv*);
+
+    for (idx= 0; idx < members->elements; ++idx)
+      json_normalize_sort(&first[idx].value);
+
+    my_qsort(first, members->elements, sizeof(struct json_norm_kv),
+             (qsort_cmp) json_norm_kv_comp);
+    return;
+  }
+
+  if (val->type == JSON_VALUE_ARRAY)
+  {
+    /* Arrays in JSON must keep the order. Just recursively sort values. */
+    DYNAMIC_ARRAY *elements= &val->value.array.values;
+    struct json_norm_value *first= dynamic_element(elements, 0,
+                                                   struct json_norm_value*);
+
+    for (idx= 0; idx < elements->elements; ++idx)
+      json_normalize_sort(&first[idx]);
+    return;
+  }
+
+  if (val->type == JSON_VALUE_UNINITIALIZED)
+  {
+    DBUG_ASSERT(0);
+  }
+  /* Nothing to do for other types. */
+}
+
+
+/*
+  Release everything owned by 'val' and mark it
+  JSON_VALUE_UNINITIALIZED.
+
+  Objects free each member's key and value and then the member array.
+  Arrays free each element and then the element array.  Strings and
+  numbers free their text.  null/true/false and uninitialized values
+  own nothing.
+*/
+static void
+json_norm_value_free(struct json_norm_value *val)
+{
+  size_t idx;
+
+  switch (val->type) {
+  case JSON_VALUE_OBJECT:
+  {
+    DYNAMIC_ARRAY *members= &val->value.object.kv_pairs;
+
+    for (idx= 0; idx < members->elements; ++idx)
+    {
+      struct json_norm_kv *member=
+        dynamic_element(members, idx, struct json_norm_kv *);
+
+      json_norm_string_free(&member->key);
+      json_norm_value_free(&member->value);
+    }
+    delete_dynamic(members);
+    break;
+  }
+  case JSON_VALUE_ARRAY:
+  {
+    DYNAMIC_ARRAY *elements= &val->value.array.values;
+
+    for (idx= 0; idx < elements->elements; ++idx)
+      json_norm_value_free(dynamic_element(elements, idx,
+                                           struct json_norm_value *));
+    delete_dynamic(elements);
+    break;
+  }
+  case JSON_VALUE_STRING:
+    json_norm_string_free(&val->value.string);
+    break;
+  case JSON_VALUE_NUMBER:
+    json_norm_number_free(&val->value.number);
+    break;
+  case JSON_VALUE_NULL:
+  case JSON_VALUE_TRUE:
+  case JSON_VALUE_FALSE:
+  case JSON_VALUE_UNINITIALIZED:
+    /* nothing owned */
+    break;
+  }
+
+  val->type= JSON_VALUE_UNINITIALIZED;
+}
+
+
+/*
+  Serialize a normalized JSON value tree by appending its text to buf.
+
+  Objects are written as {"key":value,...} and arrays as [value,...],
+  with no whitespace and a comma after every member except the last.
+  String and number payloads are appended verbatim (no quoting is added
+  here); null/true/false are written as their literals.
+
+  @return 0 on success, 1 if any append to buf failed.
+*/
+static int
+json_norm_to_string(DYNAMIC_STRING *buf, struct json_norm_value *val)
+{
+  switch (val->type)
+  {
+  case JSON_VALUE_OBJECT:
+  {
+    DYNAMIC_ARRAY *pairs= &val->value.object.kv_pairs;
+    size_t n;
+
+    if (dynstr_append_mem(buf, STRING_WITH_LEN("{")))
+      return 1;
+
+    for (n= 0; n < pairs->elements; ++n)
+    {
+      struct json_norm_kv *pair=
+        dynamic_element(pairs, n, struct json_norm_kv *);
+
+      if (dynstr_append_mem(buf, STRING_WITH_LEN("\"")) ||
+          dynstr_append(buf, pair->key.str) ||
+          dynstr_append_mem(buf, STRING_WITH_LEN("\":")) ||
+          json_norm_to_string(buf, &pair->value))
+        return 1;
+
+      /* Separator after every pair except the last one. */
+      if (n + 1 < pairs->elements &&
+          dynstr_append_mem(buf, STRING_WITH_LEN(",")))
+        return 1;
+    }
+
+    return dynstr_append_mem(buf, STRING_WITH_LEN("}")) ? 1 : 0;
+  }
+  case JSON_VALUE_ARRAY:
+  {
+    DYNAMIC_ARRAY *elems= &val->value.array.values;
+    size_t n;
+
+    if (dynstr_append_mem(buf, STRING_WITH_LEN("[")))
+      return 1;
+
+    for (n= 0; n < elems->elements; ++n)
+    {
+      struct json_norm_value *elem=
+        dynamic_element(elems, n, struct json_norm_value *);
+
+      if (json_norm_to_string(buf, elem))
+        return 1;
+
+      /* Separator after every element except the last one. */
+      if (n + 1 < elems->elements &&
+          dynstr_append_mem(buf, STRING_WITH_LEN(",")))
+        return 1;
+    }
+
+    return dynstr_append_mem(buf, STRING_WITH_LEN("]")) ? 1 : 0;
+  }
+  case JSON_VALUE_STRING:
+    return dynstr_append(buf, val->value.string.str) ? 1 : 0;
+  case JSON_VALUE_NUMBER:
+    return dynstr_append(buf, val->value.number.str) ? 1 : 0;
+  case JSON_VALUE_NULL:
+    return dynstr_append_mem(buf, STRING_WITH_LEN("null")) ? 1 : 0;
+  case JSON_VALUE_TRUE:
+    return dynstr_append_mem(buf, STRING_WITH_LEN("true")) ? 1 : 0;
+  case JSON_VALUE_FALSE:
+    return dynstr_append_mem(buf, STRING_WITH_LEN("false")) ? 1 : 0;
+  case JSON_VALUE_UNINITIALIZED:
+    /* Must never be serialized; in non-debug builds nothing is written. */
+    DBUG_ASSERT(0);
+    break;
+  }
+  return 0;
+}
+
+
+/*
+  Turn val into a number value holding the normalized form of the
+  num_len bytes at number.
+
+  The type tag is set to JSON_VALUE_NUMBER before any allocation.
+  If normalization fails, the number buffer is freed again.
+
+  @return 0 on success; 1 if the buffer could not be initialized,
+          otherwise the error returned by json_normalize_number().
+*/
+static int
+json_norm_value_number_init(struct json_norm_value *val,
+                            const char *number, size_t num_len)
+{
+  DYNAMIC_STRING *out= &val->value.number;
+  int rc;
+
+  val->type= JSON_VALUE_NUMBER;
+
+  if (init_dynamic_string(out, NULL, 0, 0))
+    return 1;
+
+  rc= json_normalize_number(out, number, num_len);
+  if (rc)
+    dynstr_free(out);
+
+  return rc;
+}
+
+
+/* Tag val as the JSON literal null; only the type field is written. */
+static void
+json_norm_value_null_init(struct json_norm_value *val)
+{
+  val->type= JSON_VALUE_NULL;
+}
+
+
+/* Tag val as the JSON literal false; only the type field is written. */
+static void
+json_norm_value_false_init(struct json_norm_value *val)
+{
+  val->type= JSON_VALUE_FALSE;
+}
+
+
+/* Tag val as the JSON literal true; only the type field is written. */
+static void
+json_norm_value_true_init(struct json_norm_value *val)
+{
+  val->type= JSON_VALUE_TRUE;
+}
+
+
+/*
+  Initialize val from the value the JSON engine has just read,
+  dispatching on je->value_type.
+
+  Strings and numbers are built from the byte range
+  [je->value_begin, je->value_end). Arrays and objects are only
+  initialized as containers here; no members are added.
+
+  @return 0 on success, otherwise the error of the type-specific
+          initializer; 1 for an unexpected value type.
+*/
+static int
+json_norm_value_init(struct json_norm_value *val, json_engine_t *je)
+{
+  switch (je->value_type) {
+  case JSON_VALUE_STRING:
+  {
+    const char *text= (const char *)je->value_begin;
+    size_t text_len= (je->value_end - je->value_begin);
+    return json_norm_value_string_init(val, text, text_len);
+  }
+  case JSON_VALUE_NUMBER:
+  {
+    const char *digits= (const char *)je->value_begin;
+    size_t digits_len= (je->value_end - je->value_begin);
+    return json_norm_value_number_init(val, digits, digits_len);
+  }
+  case JSON_VALUE_ARRAY:
+    return json_norm_value_array_init(val);
+  case JSON_VALUE_OBJECT:
+    return json_norm_value_object_init(val);
+  case JSON_VALUE_NULL:
+    json_norm_value_null_init(val);
+    return 0;
+  case JSON_VALUE_TRUE:
+    json_norm_value_true_init(val);
+    return 0;
+  case JSON_VALUE_FALSE:
+    json_norm_value_false_init(val);
+    return 0;
+  default:
+    DBUG_ASSERT(0);
+    return 1;
+  }
+}
+
+
+/*
+  Build a value from the engine's current value and append it to the
+  array held by val.
+
+  The new value is released again if the append fails.
+
+  @return 0 on success; 1 if the value could not be initialized,
+          otherwise the error of json_norm_array_append_value().
+*/
+static int
+json_norm_append_to_array(struct json_norm_value *val,
+                          json_engine_t *je)
+{
+  struct json_norm_value elem;
+  int rc;
+
+  DBUG_ASSERT(val->type == JSON_VALUE_ARRAY);
+  DBUG_ASSERT(je->value_type != JSON_VALUE_UNINITIALIZED);
+
+  if (json_norm_value_init(&elem, je))
+    return 1;
+
+  rc= json_norm_array_append_value(&val->value.array, &elem);
+  if (rc)
+    json_norm_value_free(&elem);
+
+  return rc;
+}
+
+
+/*
+  Build a value from the engine's current value and append it, paired
+  with key, to the object held by val.
+
+  The new value is released again if the append fails.
+
+  @return 0 on success; 1 if the value could not be initialized,
+          otherwise the error of json_norm_object_append_key_value().
+*/
+static int
+json_norm_append_to_object(struct json_norm_value *val,
+                           DYNAMIC_STRING *key, json_engine_t *je)
+{
+  struct json_norm_value member;
+  int rc;
+
+  DBUG_ASSERT(val->type == JSON_VALUE_OBJECT);
+  DBUG_ASSERT(je->value_type != JSON_VALUE_UNINITIALIZED);
+
+  if (json_norm_value_init(&member, je))
+    return 1;
+
+  rc= json_norm_object_append_key_value(&val->value.object, key, &member);
+  if (rc)
+    json_norm_value_free(&member);
+
+  return rc;
+}
+
+
+/*
+  Fill the container value root (object or array) by walking the JSON
+  engine's states until json_scan_next() stops returning 0.
+
+  An explicit stack of at most JSON_DEPTH_LIMIT containers replaces
+  recursion: stack[current] is the container members are currently
+  appended to. A nested array/object is pushed right after it has been
+  appended to its parent, and popped on JST_OBJ_END / JST_ARRAY_END.
+
+  @return 0 on success, non-zero on allocation failure, on a read error
+          from the engine, or when nesting would exceed JSON_DEPTH_LIMIT.
+*/
+static int
+json_norm_parse(struct json_norm_value *root, json_engine_t *je)
+{
+  size_t current;
+  struct json_norm_value *stack[JSON_DEPTH_LIMIT];
+  int err= 0;
+  DYNAMIC_STRING key;
+
+  err= init_dynamic_string(&key, NULL, 0, 0);
+  if (err)
+    goto json_norm_parse_end;
+
+  memset(stack, 0x00, sizeof(stack));
+  current= 0;
+  stack[current]= root;
+
+  do {
+    switch (je->state)
+    {
+    case JST_KEY:
+    {
+      const uchar *key_start= je->s.c_str;
+      const uchar *key_end;
+
+      DBUG_ASSERT(stack[current]->type == JSON_VALUE_OBJECT);
+      /* Remember the position before each read, so that key_end ends up
+         at the position from which the final (non-zero) read started. */
+      do
+      {
+        key_end= je->s.c_str;
+      } while (json_read_keyname_chr(je) == 0);
+
+      /* We have the key name: empty the reusable buffer and copy it in. */
+      dynstr_trunc(&key, key.length);
+      err= dynstr_append_mem(&key, (const char *)key_start,
+                             (size_t)(key_end - key_start));
+      if (err)
+        goto json_norm_parse_end;
+
+      /* After reading the key, we have a follow-up value. */
+      err= json_read_value(je);
+      if (err)
+        goto json_norm_parse_end;
+
+      err= json_norm_append_to_object(stack[current], &key, je);
+      if (err)
+        goto json_norm_parse_end;
+
+      if (je->value_type == JSON_VALUE_ARRAY ||
+          je->value_type == JSON_VALUE_OBJECT)
+      {
+        struct json_norm_kv *kv;
+
+        /* Refuse to push beyond the fixed-size stack. */
+        err= ((current + 1) == JSON_DEPTH_LIMIT);
+        if (err)
+          goto json_norm_parse_end;
+
+        /* Descend into the container that was just appended. */
+        kv= json_norm_object_get_last_element(&stack[current]->value.object);
+        stack[++current]= &kv->value;
+      }
+      break;
+    }
+    case JST_VALUE:
+    {
+      struct json_norm_array *current_arr= &stack[current]->value.array;
+
+      err= json_read_value(je);
+      if (err)
+        goto json_norm_parse_end;
+
+      DBUG_ASSERT(stack[current]->type == JSON_VALUE_ARRAY);
+
+      err= json_norm_append_to_array(stack[current], je);
+      if (err)
+        goto json_norm_parse_end;
+
+      if (je->value_type == JSON_VALUE_ARRAY ||
+          je->value_type == JSON_VALUE_OBJECT)
+      {
+        /* Refuse to push beyond the fixed-size stack. */
+        err= ((current + 1) == JSON_DEPTH_LIMIT);
+        if (err)
+          goto json_norm_parse_end;
+
+        /* Descend into the container that was just appended. */
+        stack[++current]= json_norm_array_get_last_element(current_arr);
+      }
+      break;
+    }
+    case JST_OBJ_START:
+      /* parser found an object (the '{' in JSON) */
+      break;
+    case JST_OBJ_END:
+      /* parser found the end of the object (the '}' in JSON) */
+      /*
+        Pop the stack.
+        NOTE(review): closing the root wraps `current` (size_t); this
+        presumably is the last state before the scan ends - confirm.
+      */
+      --current;
+      break;
+    case JST_ARRAY_START:
+      /* parser found an array (the '[' in JSON) */
+      break;
+    case JST_ARRAY_END:
+      /* parser found the end of the array (the ']' in JSON) */
+      /* Pop the stack (same note as for JST_OBJ_END). */
+      --current;
+      break;
+    }
+  } while (json_scan_next(je) == 0);
+
+json_norm_parse_end:
+  dynstr_free(&key);
+  return err;
+}
+
+
+/*
+  Build the normalized value tree for the JSON text [s, s + size) in
+  character set cs, storing it in root.
+
+  root is zeroed and tagged JSON_VALUE_UNINITIALIZED first, so the
+  caller can pass it to json_norm_value_free() whatever the outcome.
+  Scalars are complete after initialization; objects and arrays are
+  then filled by json_norm_parse().
+
+  @return 0 on success, non-zero if the engine could not start or read
+          the top-level value, or if building the tree failed.
+*/
+static int
+json_norm_build(struct json_norm_value *root,
+                const char *s, size_t size, CHARSET_INFO *cs)
+{
+  int err= 0;
+  json_engine_t je;
+
+  DBUG_ASSERT(s);
+  memset(&je, 0x00, sizeof(je));
+
+  memset(root, 0x00, sizeof(struct json_norm_value));
+  root->type= JSON_VALUE_UNINITIALIZED;
+
+  err= json_scan_start(&je, cs, (const uchar *)s, (const uchar *)(s + size));
+  if (err)
+    return err;
+
+  /* A failed read must be reported; root is still uninitialized here. */
+  if (json_read_value(&je))
+    return 1;
+
+  /* Do not descend into a container whose initialization failed. */
+  err= json_norm_value_init(root, &je);
+  if (err)
+    return err;
+
+  if (root->type == JSON_VALUE_OBJECT ||
+      root->type == JSON_VALUE_ARRAY)
+    err= json_norm_parse(root, &je);
+
+  return err;
+}
+
+
+/*
+  Write the normalized form of a JSON document to result.
+
+  The input is converted to utf8mb4_bin when it is in another character
+  set, validated, parsed into a value tree, sorted with
+  json_normalize_sort(), and serialized into result.
+
+  @param result  dynamic string the normalized text is appended to;
+                 freed with dynstr_free() on every error
+  @param s       JSON text
+  @param size    length of s in bytes
+  @param cs      character set of s
+
+  @return 0 on success, non-zero on allocation or conversion failure,
+          on invalid JSON, or on a failure while building or printing.
+*/
+int
+json_normalize(DYNAMIC_STRING *result,
+               const char *s, size_t size, CHARSET_INFO *cs)
+{
+  int err= 0;
+  uint convert_err= 0;
+  struct json_norm_value root;
+  char *s_utf8= NULL;
+  size_t in_size;
+  const char *in;
+
+  DBUG_ASSERT(result);
+
+  memset(&root, 0x00, sizeof(root));
+  root.type = JSON_VALUE_UNINITIALIZED;
+
+  /*
+    Convert the incoming string to utf8mb4_bin before doing any other work.
+    According to JSON RFC 8259, between systems JSON must be UTF-8
+    https://datatracker.ietf.org/doc/html/rfc8259#section-8.1
+  */
+  if (cs == &my_charset_utf8mb4_bin)
+  {
+    in= s;
+    in_size= size;
+  }
+  else
+  {
+    /* Worst-case converted size plus one byte kept as a terminator. */
+    size_t buf_size= (size * my_charset_utf8mb4_bin.mbmaxlen) + 1;
+
+    s_utf8= json_norm_malloc(buf_size);
+    if (!s_utf8)
+    {
+      err= 1;
+      goto json_normalize_end;
+    }
+    memset(s_utf8, 0x00, buf_size);
+
+    /*
+      Use the length reported by my_convert() rather than strlen(): an
+      embedded NUL byte must not silently truncate the document, or the
+      converted path would accept input the utf8mb4 path rejects.
+      The last buffer byte is excluded so that it always stays 0.
+    */
+    in_size= my_convert(s_utf8, (uint32)(buf_size - 1),
+                        &my_charset_utf8mb4_bin,
+                        s, (uint32)size, cs, &convert_err);
+    if (convert_err)
+    {
+      err= 1;
+      goto json_normalize_end;
+    }
+    in= s_utf8;
+  }
+
+  if (!json_valid(in, in_size, &my_charset_utf8mb4_bin))
+  {
+    err= 1;
+    goto json_normalize_end;
+  }
+
+  err= json_norm_build(&root, in, in_size, &my_charset_utf8mb4_bin);
+  if (err)
+    goto json_normalize_end;
+
+  json_normalize_sort(&root);
+
+  err= json_norm_to_string(result, &root);
+
+json_normalize_end:
+  /* Common cleanup for every exit path. */
+  json_norm_value_free(&root);
+  if (err)
+    dynstr_free(result);
+  if (s_utf8)
+    my_free(s_utf8);
+  return err;
+}
+
+
diff --git a/strings/strcoll.inl b/strings/strcoll.inl
index 50849c06e7d..eb5c6e3c717 100644
--- a/strings/strcoll.inl
+++ b/strings/strcoll.inl
@@ -16,6 +16,8 @@
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
*/
+#include "ctype-ascii.h"
+
#ifndef MY_FUNCTION_NAME
#error MY_FUNCTION_NAME is not defined
#endif
@@ -40,6 +42,42 @@
/*
+ For binary collations:
+ - on 32bit platforms perform only 4 byte optimization
+ - on 64bit platforms perform both 4 byte and 8 byte optimization
+*/
+#if defined(STRCOLL_MB7_BIN)
+#define MY_STRCOLL_MB7_4BYTES(a,b) my_strcoll_mb7_bin_4bytes((a),(b))
+#if SIZEOF_VOIDP == 8
+#define STRCOLL_MB7_8BYTES
+#define MY_STRCOLL_MB7_8BYTES(a,b) my_strcoll_mb7_bin_8bytes((a),(b))
+#endif /* Architecture test */
+#endif /* STRCOLL_MB7_BIN */
+
+
+/*
+ For case insensitive collations with trivial mapping from [a-z] to [A-Z]
+ perform optimization only on 64 bit platforms.
+ There is no sense to perform my_ascii_to_upper_magic_uint64() based
+ optimization on 32bit platforms. The idea of this optimization
+ is that it handles 8bytes at a time, using 64bit CPU registers.
+ Enabling this optimization on 32bit platform may only slow things down.
+*/
+#if defined(STRCOLL_MB7_TOUPPER)
+#if SIZEOF_VOIDP == 8
+#define MY_STRCOLL_MB7_4BYTES(a,b) my_strcoll_ascii_toupper_4bytes((a),(b))
+#define MY_STRCOLL_MB7_8BYTES(a,b) my_strcoll_ascii_toupper_8bytes((a),(b))
+#endif /* Architecture test */
+#endif /* STRCOLL_MB7_TOUPPER */
+
+
+/*
+ A helper macro to shift two pointers forward, to the given amount.
+*/
+#define MY_STRING_SHIFT_PTR_PTR(a,b,len) do { a+= len; b+= len; } while(0)
+
+
+/*
Weight of an illegal byte, must follow these rules:
1. Must be greater than weight of any normal character in the collation.
2. Two different bad bytes must have different weights and must be
@@ -182,7 +220,31 @@ MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)),
{
int a_weight, b_weight, res;
uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
- uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+ uint b_wlen;
+
+#ifdef MY_STRCOLL_MB7_4BYTES
+ if (a_wlen == 1 && my_strcoll_ascii_4bytes_found(a, a_end, b, b_end))
+ {
+ int res;
+#ifdef MY_STRCOLL_MB7_8BYTES
+ /*TODO: add a loop here (>='a' <='z'), for automatic vectorization*/
+ if (my_strcoll_ascii_4bytes_found(a + 4, a_end, b + 4, b_end))
+ {
+ if ((res= MY_STRCOLL_MB7_8BYTES(a, b)))
+ return res;
+ MY_STRING_SHIFT_PTR_PTR(a, b, 8);
+ continue;
+ }
+#endif
+ if ((res= MY_STRCOLL_MB7_4BYTES(a, b)))
+ return res;
+ MY_STRING_SHIFT_PTR_PTR(a, b, 4);
+ continue;
+ }
+#endif /* MY_STRCOLL_MB7_4BYTES */
+
+ b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+
/*
a_wlen b_wlen Comment
------ ------ -------
@@ -253,7 +315,30 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
{
int a_weight, b_weight, res;
uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
- uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+ uint b_wlen;
+
+#ifdef MY_STRCOLL_MB7_4BYTES
+ if (a_wlen == 1 && my_strcoll_ascii_4bytes_found(a, a_end, b, b_end))
+ {
+ int res;
+#ifdef MY_STRCOLL_MB7_8BYTES
+ if (my_strcoll_ascii_4bytes_found(a + 4, a_end, b + 4, b_end))
+ {
+ if ((res= MY_STRCOLL_MB7_8BYTES(a, b)))
+ return res;
+ MY_STRING_SHIFT_PTR_PTR(a, b, 8);
+ continue;
+ }
+#endif
+ if ((res= MY_STRCOLL_MB7_4BYTES(a, b)))
+ return res;
+ MY_STRING_SHIFT_PTR_PTR(a, b, 4);
+ continue;
+ }
+#endif /* MY_STRCOLL_MB7_4BYTES */
+
+ b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
+
if ((res= (a_weight - b_weight)))
{
/*
@@ -286,7 +371,7 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
DBUG_ASSERT(0);
return 0;
}
-#endif
+#endif /* DEFINE_STRNNCOLLSP_NOPAD */
/**
@@ -652,3 +737,8 @@ MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
#undef DEFINE_STRNXFRM_UNICODE_BIN2
#undef DEFINE_STRNNCOLL
#undef DEFINE_STRNNCOLLSP_NOPAD
+
+#undef STRCOLL_MB7_TOUPPER
+#undef STRCOLL_MB7_BIN
+#undef MY_STRCOLL_MB7_4BYTES
+#undef MY_STRCOLL_MB7_8BYTES