diff options
author | Louis Yung-Chieh Lo <yjlou@chromium.org> | 2010-11-19 09:58:47 +0800 |
---|---|---|
committer | Louis Yung-Chieh Lo <yjlou@chromium.org> | 2010-11-19 09:58:47 +0800 |
commit | 6965cbfed3352754f0ff9a270e3b330223b7154c (patch) | |
tree | f75b25ad312fcd504d4c5c0073a900e709e94a63 /cgpt | |
parent | 2b23c021f3137427a8b3f00e7702850be6e1c242 (diff) | |
download | vboot-6965cbfed3352754f0ff9a270e3b330223b7154c.tar.gz |
Implement CGPT label conversion between UTF8 and UTF16 correctly.
As a quick security fix, the original UTF8/UTF16 conversion supported only
the ASCII range. This CL extends the library to support multi-code-unit
conversion between UTF8 and UTF16. The UTF8/UTF16 encoded input is first
decoded to a code point, which is then encoded to UTF16/UTF8 respectively.
Bill, please review the UTF8/UTF16 conversion.
Peter, please comment if you have any security concerns.
Thanks.
Change-Id: I99c558ff27556e0b8635ba2b8d9925d042e75cb2
BUG=chromium-os:7542
TEST=RUNTESTS=1 emerge-x86-generic vboot_reference
Manually tested the following commands (intentionally mixing Chinese and ASCII):
export C=.../cgpt
export D=/tmp/hda
$C add $D -i 1 -l 批P踢T踢T許C夕C餐
$C find $D -l 批P踢T踢T許C夕C餐
$C show $D
$C add $D -i 1 -l 批P踢T踢T許C夕C餐
$C find $D -l 批P踢T踢T許C夕C餐
$C add $D -i 1 -l abc012
$C add $D -i 1 -l 是否看過坊間常見的許茹芸淚海慶功宴吃蓋飯第四集
$C add $D -i 1 -l 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ # ok and truncated
$C add $D -i 1 -l `printf "\xf4\x91\x81\x81"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xf4\x8f\xbf\xbf"`
$C add $D -i 1 -l `printf "\xf4\x8f\x44\x44"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xf4\x8f\xbf"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xf0\xbf\xbf\xbf"`
$C add $D -i 1 -l `printf "\xf0\xbf\xbf\x44"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xf0\x80\x80\x80"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xf0\x80\x84\x80"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xf0\x80\x90\x80"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xf0\x88\x80\x80"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xed\x80\x80"`
$C add $D -i 1 -l `printf "\xed\xa0\x80"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xe0\xbf\xbf"`
$C add $D -i 1 -l `printf "\xe0\xbf\x44"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xe0\x80\x80"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xe0\x90\x80"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xe0\xbf"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xd0\x80"`
$C add $D -i 1 -l `printf "\xd0\x11"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xd0"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\xc0\xaf"` # (EXPECT: failed)
$C add $D -i 1 -l `printf "\x80"` # (EXPECT: failed)
Review URL: http://codereview.chromium.org/5025003
Diffstat (limited to 'cgpt')
-rw-r--r-- | cgpt/cgpt.h | 18 | ||||
-rw-r--r-- | cgpt/cgpt_common.c | 197 | ||||
-rw-r--r-- | cgpt/cmd_add.c | 7 | ||||
-rw-r--r-- | cgpt/cmd_find.c | 8 |
4 files changed, 199 insertions, 31 deletions
diff --git a/cgpt/cgpt.h b/cgpt/cgpt.h index 9b0805cf..85702a4f 100644 --- a/cgpt/cgpt.h +++ b/cgpt/cgpt.h @@ -81,14 +81,22 @@ int WritePMBR(struct drive *drive); /* Convert possibly unterminated UTF16 string to UTF8. * Caller must prepare enough space for UTF8, which could be up to - * twice the number of UTF16 chars plus the terminating '\0'. + * twice the byte length of UTF16 string plus the terminating '\0'. + * + * Return: CGPT_OK --- all character are converted successfully. + * CGPT_FAILED --- convert error, i.e. output buffer is too short. */ -void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput, - uint8_t *utf8, unsigned int maxoutput); +int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput, + uint8_t *utf8, unsigned int maxoutput); + /* Convert null-terminated UTF8 string to UTF16. - * Caller must prepare enough space for UTF16, including a terminating 0x0000 + * Caller must prepare enough space for UTF16, which is the byte length of UTF8 + * plus the terminating 0x0000. + * + * Return: CGPT_OK --- all character are converted successfully. + * CGPT_FAILED --- convert error, i.e. output buffer is too short. */ -void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput); +int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput); /* Helper functions for supported GPT types. */ int ResolveType(const Guid *type, char *buf); diff --git a/cgpt/cgpt_common.c b/cgpt/cgpt_common.c index 0e466fdc..52cbe70c 100644 --- a/cgpt/cgpt_common.c +++ b/cgpt/cgpt_common.c @@ -350,56 +350,209 @@ void GuidToStr(const Guid *guid, char *str, unsigned int buflen) { /* Convert possibly unterminated UTF16 string to UTF8. * Caller must prepare enough space for UTF8, which could be up to - * twice the number of UTF16 chars plus the terminating '\0'. - * FIXME(wfrichar): The original implementation had security issues. As a - * temporary fix, I'm making this ONLY support ASCII codepoints. 
Bug 7542 - * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix - * this. + * twice the byte length of UTF16 string plus the terminating '\0'. + * See the following table for encoding lengths. + * + * Code point UTF16 UTF8 + * 0x0000-0x007F 2 bytes 1 byte + * 0x0080-0x07FF 2 bytes 2 bytes + * 0x0800-0xFFFF 2 bytes 3 bytes + * 0x10000-0x10FFFF 4 bytes 4 bytes + * + * This function uses a simple state meachine to convert UTF-16 char(s) to + * a code point. Once a code point is parsed out, the state machine throws + * out sequencial UTF-8 chars in one time. + * + * Return: CGPT_OK --- all character are converted successfully. + * CGPT_FAILED --- convert error, i.e. output buffer is too short. */ -void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput, - uint8_t *utf8, unsigned int maxoutput) +int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput, + uint8_t *utf8, unsigned int maxoutput) { size_t s16idx, s8idx; - uint32_t utfchar; + uint32_t code_point; + int code_point_ready = 1; // code point is ready to output. + int retval = CGPT_OK; if (!utf16 || !maxinput || !utf8 || !maxoutput) - return; + return CGPT_FAILED; maxoutput--; /* plan for termination now */ for (s16idx = s8idx = 0; s16idx < maxinput && utf16[s16idx] && maxoutput; - s16idx++, maxoutput--) { - utfchar = le16toh(utf16[s16idx]); - utf8[s8idx++] = utfchar & 0x7F; + s16idx++) { + uint16_t codeunit = le16toh(utf16[s16idx]); + + if (code_point_ready) { + if (codeunit >= 0xD800 && codeunit <= 0xDBFF) { + /* high surrogate, need the low surrogate. */ + code_point_ready = 0; + code_point = (codeunit & 0x03FF) + 0x0040; + } else { + /* BMP char, output it. */ + code_point = codeunit; + } + } else { + /* expect the low surrogate */ + if (codeunit >= 0xDC00 && codeunit <= 0xDFFF) { + code_point = (code_point << 10) | (codeunit & 0x03FF); + code_point_ready = 1; + } else { + /* the second code unit is NOT the low surrogate. Unexpected. 
*/ + retval = CGPT_FAILED; + break; + } + } + + /* If UTF code point is ready, output it. */ + if (code_point_ready) { + require(code_point <= 0x10FFFF); + if (code_point <= 0x7F && maxoutput >= 1) { + maxoutput -= 1; + utf8[s8idx++] = code_point & 0x7F; + } else if (code_point <= 0x7FF && maxoutput >= 2) { + maxoutput -= 2; + utf8[s8idx++] = 0xC0 | (code_point >> 6); + utf8[s8idx++] = 0x80 | (code_point & 0x3F); + } else if (code_point <= 0xFFFF && maxoutput >= 3) { + maxoutput -= 3; + utf8[s8idx++] = 0xE0 | (code_point >> 12); + utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F); + utf8[s8idx++] = 0x80 | (code_point & 0x3F); + } else if (code_point <= 0x10FFFF && maxoutput >= 4) { + maxoutput -= 4; + utf8[s8idx++] = 0xF0 | (code_point >> 18); + utf8[s8idx++] = 0x80 | ((code_point >> 12) & 0x3F); + utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F); + utf8[s8idx++] = 0x80 | (code_point & 0x3F); + } else { + /* buffer underrun */ + retval = CGPT_FAILED; + break; + } + } } utf8[s8idx++] = 0; + return retval; } /* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated. * Caller must prepare enough space for UTF16, including a terminating 0x0000. - * FIXME(wfrichar): The original implementation had security issues. As a - * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542 - * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix - * this. + * See the following table for encoding lengths. In any case, the caller + * just needs to prepare the byte length of UTF8 plus the terminating 0x0000. + * + * Code point UTF16 UTF8 + * 0x0000-0x007F 2 bytes 1 byte + * 0x0080-0x07FF 2 bytes 2 bytes + * 0x0800-0xFFFF 2 bytes 3 bytes + * 0x10000-0x10FFFF 4 bytes 4 bytes + * + * This function converts UTF8 chars to a code point first. Then, convrts it + * to UTF16 code unit(s). + * + * Return: CGPT_OK --- all character are converted successfully. + * CGPT_FAILED --- convert error, i.e. output buffer is too short. 
*/ -void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput) +int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput) { size_t s16idx, s8idx; - uint32_t utfchar; + uint32_t code_point = 0; + unsigned int expected_units = 1; + unsigned int decoded_units = 1; + int retval = CGPT_OK; if (!utf8 || !utf16 || !maxoutput) - return; + return CGPT_FAILED; maxoutput--; /* plan for termination */ for (s8idx = s16idx = 0; utf8[s8idx] && maxoutput; - s8idx++, maxoutput--) { - utfchar = utf8[s8idx]; - utf16[s16idx++] = utfchar & 0x7F; + s8idx++) { + uint8_t code_unit; + code_unit = utf8[s8idx]; + + if (expected_units != decoded_units) { + /* Trailing bytes of multi-byte character */ + if ((code_unit & 0xC0) == 0x80) { + code_point = (code_point << 6) | (code_unit & 0x3F); + ++decoded_units; + } else { + /* Unexpected code unit. */ + retval = CGPT_FAILED; + break; + } + } else { + /* parsing a new code point. */ + decoded_units = 1; + if (code_unit <= 0x7F) { + code_point = code_unit; + expected_units = 1; + } else if (code_unit <= 0xBF) { + /* 0x80-0xBF must NOT be the heading byte unit of a new code point. */ + retval = CGPT_FAILED; + break; + } else if (code_unit >= 0xC2 && code_unit <= 0xDF) { + code_point = code_unit & 0x1F; + expected_units = 2; + } else if (code_unit >= 0xE0 && code_unit <= 0xEF) { + code_point = code_unit & 0x0F; + expected_units = 3; + } else if (code_unit >= 0xF0 && code_unit <= 0xF4) { + code_point = code_unit & 0x07; + expected_units = 4; + } else { + /* illegal code unit: 0xC0-0xC1, 0xF5-0xFF */ + retval = CGPT_FAILED; + break; + } + } + + /* If no more unit is needed, output the UTF16 unit(s). */ + if (expected_units == decoded_units) { + /* Check if the encoding is the shortest possible UTF-8 sequence. 
*/ + switch (expected_units) { + case 2: + if (code_point <= 0x7F) retval = CGPT_FAILED; + break; + case 3: + if (code_point <= 0x7FF) retval = CGPT_FAILED; + break; + case 4: + if (code_point <= 0xFFFF) retval = CGPT_FAILED; + break; + } + if (retval == CGPT_FAILED) break; /* leave immediately */ + + if ((code_point <= 0xD7FF) || + (code_point >= 0xE000 && code_point <= 0xFFFF)) { + utf16[s16idx++] = code_point; + maxoutput -= 1; + } else if (code_point >= 0x10000 && code_point <= 0x10FFFF && + maxoutput >= 2) { + utf16[s16idx++] = 0xD800 | ((code_point >> 10) - 0x0040); + utf16[s16idx++] = 0xDC00 | (code_point & 0x03FF); + maxoutput -= 2; + } else { + /* Three possibilities fall into here. Both are failure cases. + * a. surrogate pair (non-BMP characters; 0xD800~0xDFFF) + * b. invalid code point > 0x10FFFF + * c. buffer underrun + */ + retval = CGPT_FAILED; + break; + } + } } + + /* A null-terminator shows up before the UTF8 sequence ends. */ + if (expected_units != decoded_units) { + retval = CGPT_FAILED; + } + utf16[s16idx++] = 0; + return retval; } struct { diff --git a/cgpt/cmd_add.c b/cgpt/cmd_add.c index dafcc50f..81b0dfa1 100644 --- a/cgpt/cmd_add.c +++ b/cgpt/cmd_add.c @@ -251,8 +251,11 @@ int cmd_add(int argc, char *argv[]) { if (set_unique) memcpy(&entry->unique, &unique_guid, sizeof(Guid)); if (label) { - UTF8ToUTF16((uint8_t *)label, entry->name, - sizeof(entry->name) / sizeof(entry->name[0])); + if (CGPT_OK != UTF8ToUTF16((uint8_t *)label, entry->name, + sizeof(entry->name) / sizeof(entry->name[0]))) { + Error("The label cannot be converted to UTF16.\n"); + goto bad; + } } if (set_raw) { entry->attrs.fields.gpt_att = raw_value; diff --git a/cgpt/cmd_find.c b/cgpt/cmd_find.c index 40f10ba0..43438ef7 100644 --- a/cgpt/cmd_find.c +++ b/cgpt/cmd_find.c @@ -181,8 +181,12 @@ static int do_search(char *filename) { (set_type && !memcmp(&type_guid, &entry->type, sizeof(Guid)))) { found = 1; } else if (set_label) { - UTF16ToUTF8(entry->name, 
sizeof(entry->name) / sizeof(entry->name[0]), - (uint8_t *)partlabel, sizeof(partlabel)); + if (CGPT_OK != UTF16ToUTF8(entry->name, + sizeof(entry->name) / sizeof(entry->name[0]), + (uint8_t *)partlabel, sizeof(partlabel))) { + Error("The label cannot be converted to UTF16, so abort.\n"); + return 0; + } if (!strncmp(label, partlabel, sizeof(partlabel))) { found = 1; } |