diff options
author | Heinrich Schuchardt <xypron.glpk@gmx.de> | 2018-08-31 21:31:27 +0200 |
---|---|---|
committer | Alexander Graf <agraf@suse.de> | 2018-09-23 21:55:29 +0200 |
commit | d8c28232c34d4be9984eda52d69d0920678d937d (patch) | |
tree | da6d8305f94829b36f163c331eb5e1bf8abc3280 /lib/charset.c | |
parent | 1dde0d57a5d1281aaa949e9bf7b5c476345a56ee (diff) | |
download | u-boot-d8c28232c34d4be9984eda52d69d0920678d937d.tar.gz |
lib: charset: utility functions for Unicode
utf8_get() - get next UTF-8 code point from buffer
utf8_put() - write UTF-8 code point to buffer
utf8_utf16_strnlen() - length of a utf-8 string after conversion to utf-16
utf8_utf16_strncpy() - copy a utf-8 string to utf-16
utf16_get() - get next UTF-16 code point from buffer
utf16_put() - write UTF-16 code point to buffer
utf16_strnlen() - number of codes points in a utf-16 string
utf16_utf8_strnlen() - length of a utf-16 string after conversion to utf-8
utf16_utf8_strncpy() - copy a utf-16 string to utf-8
Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
Signed-off-by: Alexander Graf <agraf@suse.de>
Diffstat (limited to 'lib/charset.c')
-rw-r--r-- | lib/charset.c | 236 |
1 files changed, 233 insertions, 3 deletions
diff --git a/lib/charset.c b/lib/charset.c index 8ff8d59957..e82622a7f8 100644 --- a/lib/charset.c +++ b/lib/charset.c @@ -8,9 +8,239 @@ #include <charset.h> #include <malloc.h> -/* - * utf8/utf16 conversion mostly lifted from grub - */ +s32 utf8_get(const char **src) +{ + s32 code = 0; + unsigned char c; + + if (!src || !*src) + return -1; + if (!**src) + return 0; + c = **src; + if (c >= 0x80) { + ++*src; + if (!**src) + return -1; + /* + * We do not expect a continuation byte (0x80 - 0xbf). + * 0x80 is coded as 0xc2 0x80, so we cannot have less then 0xc2 + * here. + * The highest code point is 0x10ffff which is coded as + * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4. + */ + if (c < 0xc2 || code > 0xf4) + return -1; + if (c >= 0xe0) { + if (c >= 0xf0) { + /* 0xf0 - 0xf4 */ + c &= 0x07; + code = c << 18; + c = **src; + ++*src; + if (!**src) + return -1; + if (c < 0x80 || c > 0xbf) + return -1; + c &= 0x3f; + } else { + /* 0xe0 - 0xef */ + c &= 0x0f; + } + code += c << 12; + if ((code >= 0xD800 && code <= 0xDFFF) || + code >= 0x110000) + return -1; + c = **src; + ++*src; + if (!**src) + return -1; + if (c < 0x80 || c > 0xbf) + return -1; + } + /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */ + c &= 0x3f; + code += c << 6; + c = **src; + if (c < 0x80 || c > 0xbf) + return -1; + c &= 0x3f; + } + code += c; + ++*src; + return code; +} + +int utf8_put(s32 code, char **dst) +{ + if (!dst || !*dst) + return -1; + if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) + return -1; + if (code <= 0x007F) { + **dst = code; + } else { + if (code <= 0x07FF) { + **dst = code >> 6 | 0xC0; + } else { + if (code < 0x10000) { + **dst = code >> 12 | 0xE0; + } else { + **dst = code >> 18 | 0xF0; + ++*dst; + **dst = (code >> 12 & 0x3F) | 0x80; + } + ++*dst; + **dst = (code >> 6 & 0x3F) | 0x80; + } + ++*dst; + **dst = (code & 0x3F) | 0x80; + } + ++*dst; + return 0; +} + +size_t utf8_utf16_strnlen(const char *src, size_t count) +{ + size_t len = 0; + + for (; *src && count; --count) { + s32 code = utf8_get(&src); + + if (!code) + break; + if (code < 0) { + /* Reserve space for a replacement character */ + len += 1; + } else if (code < 0x10000) { + len += 1; + } else { + len += 2; + } + } + return len; +} + +int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count) +{ + if (!src || !dst || !*dst) + return -1; + + for (; count && *src; --count) { + s32 code = utf8_get(&src); + + if (code < 0) + code = '?'; + utf16_put(code, dst); + } + **dst = 0; + return 0; +} + +s32 utf16_get(const u16 **src) +{ + s32 code, code2; + + if (!src || !*src) + return -1; + if (!**src) + return 0; + code = **src; + ++*src; + if (code >= 0xDC00 && code <= 0xDFFF) + return -1; + if (code >= 0xD800 && code <= 0xDBFF) { + if (!**src) + return -1; + code &= 0x3ff; + code <<= 10; + code += 0x10000; + code2 = **src; + ++*src; + if (code2 <= 0xDC00 || code2 >= 0xDFFF) + return -1; + code2 &= 0x3ff; + code += code2; + } + return code; +} + +int utf16_put(s32 code, u16 **dst) +{ + if (!dst || !*dst) + return -1; + if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000) + return -1; + if (code < 0x10000) { + **dst = code; + } else { + code -= 0x10000; + **dst = code >> 10 | 0xD800; + ++*dst; + **dst = (code & 0x3ff) | 0xDC00; + } + ++*dst; + return 0; +} + +size_t utf16_strnlen(const u16 *src, size_t count) +{ + size_t len = 0; + + for (; *src && count; --count) { + s32 code = utf16_get(&src); + + if (!code) + break; + /* + * In case of an illegal sequence still reserve space for a + * replacement character. + */ + ++len; + } + return len; +} + +size_t utf16_utf8_strnlen(const u16 *src, size_t count) +{ + size_t len = 0; + + for (; *src && count; --count) { + s32 code = utf16_get(&src); + + if (!code) + break; + if (code < 0) + /* Reserve space for a replacement character */ + len += 1; + else if (code < 0x80) + len += 1; + else if (code < 0x800) + len += 2; + else if (code < 0x10000) + len += 3; + else + len += 4; + } + return len; +} + +int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count) +{ + if (!src || !dst || !*dst) + return -1; + + for (; count && *src; --count) { + s32 code = utf16_get(&src); + + if (code < 0) + code = '?'; + utf8_put(code, dst); + } + **dst = 0; + return 0; +} + size_t u16_strlen(const u16 *in) { |