summaryrefslogtreecommitdiff
path: root/strings/ctype-utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'strings/ctype-utf8.c')
-rw-r--r--strings/ctype-utf8.c71
1 files changed, 31 insertions, 40 deletions
diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c
index 88bab1fac76..b96ca0e5bbe 100644
--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2404,46 +2404,33 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)),
my_wc_t wc, uchar *r, uchar *e)
{
- int count;
-
- if (r >= e)
- return MY_CS_TOOSMALL;
-
if (wc < 0x80)
- count = 1;
- else if (wc < 0x800)
- count = 2;
- else if (wc < 0x10000)
- count = 3;
-#ifdef UNICODE_32BIT
- else if (wc < 0x200000)
- count = 4;
- else if (wc < 0x4000000)
- count = 5;
- else if (wc <= 0x7fffffff)
- count = 6;
-#endif
- else return MY_CS_ILUNI;
-
- /*
- e is a character after the string r, not the last character of it.
- Because of it (r+count > e), not (r+count-1 >e )
- */
- if ( r+count > e )
- return MY_CS_TOOSMALLN(count);
-
- switch (count) {
- /* Fall through all cases!!! */
-#ifdef UNICODE_32BIT
- case 6: r[5] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
- case 5: r[4] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000;
- case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
-#endif
- case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
- case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
- case 1: r[0] = (uchar) wc;
+ {
+ if (r >= e)
+ return MY_CS_TOOSMALL;
+ *r= (uchar) wc;
+ return 1;
}
- return count;
+ if (wc < 0x800)
+ {
+ if (r + 2 > e)
+ return MY_CS_TOOSMALLN(2);
+ /* U+0080..U+07FF: 00000xxx.xxyyyyyy -> 110xxxxx 10yyyyyy */
+ *r++= (uchar) (0xC0 | (wc >> 6));
+ *r= (uchar) (0x80 | (wc & 0x3F));
+ return 2;
+ }
+ if (wc < 0x10000)
+ {
+ if (r + 3 > e)
+ return MY_CS_TOOSMALLN(3);
+ /* U+0800..U+FFFF: xxxxyyyy.yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz */
+ *r++= (uchar) (0xE0 | (wc >> 12));
+ *r++= (uchar) (0x80 | ((wc >> 6) & 0x3f));
+ *r= (uchar) (0x80 | (wc & 0x3f));
+ return 3;
+ }
+ return MY_CS_ILUNI;
}
@@ -4353,6 +4340,10 @@ static const char filename_safe_char[128]=
#define MY_FILENAME_ESCAPE '@'
+/*
+ Note that we cannot trust 'e' here: it may be fake,
+ see strconvert().
+*/
static int
my_mb_wc_filename(CHARSET_INFO *cs __attribute__((unused)),
my_wc_t *pwc, const uchar *s, const uchar *e)
@@ -4374,7 +4365,7 @@ my_mb_wc_filename(CHARSET_INFO *cs __attribute__((unused)),
return MY_CS_TOOSMALL3;
byte1= s[1];
- byte2= s[2];
+ byte2= byte1 ? s[2] : 0;
if (byte1 >= 0x30 && byte1 <= 0x7F &&
byte2 >= 0x30 && byte2 <= 0x7F)
@@ -4399,7 +4390,7 @@ my_mb_wc_filename(CHARSET_INFO *cs __attribute__((unused)),
(byte2= hexlo(byte2)) >= 0)
{
int byte3= hexlo(s[3]);
- int byte4= hexlo(s[4]);
+ int byte4= hexlo(s[3] ? s[4] : 0);
if (byte3 >=0 && byte4 >=0)
{
*pwc= (byte1 << 12) + (byte2 << 8) + (byte3 << 4) + byte4;