diff options
author | Nikita Popov <nikita.ppv@gmail.com> | 2017-08-03 22:14:00 +0200 |
---|---|---|
committer | Nikita Popov <nikita.ppv@gmail.com> | 2017-08-03 22:14:00 +0200 |
commit | 41e9ba6333ab58c9fe3eb8bd413cc3e0eca87be1 (patch) | |
tree | 2686eda5727a0d2955113d719ac60c69b6f31911 /ext | |
parent | fb9bf5b64b6c09b9d93bbd1dadd64884e0af66f3 (diff) | |
download | php-git-41e9ba6333ab58c9fe3eb8bd413cc3e0eca87be1.tar.gz |
Always use Unicode codepoints in mb_ord() and mb_chr()
Previously mb_chr() had two different encoding-dependent behaviors
(mb_ord() mirrored them in the decoding direction):
 * For "Unicode-encodings" it took a Unicode codepoint and returned
   its encoded representation.
 * Otherwise it returned a big-endian binary encoding of the passed
   integer.
Now the input to mb_chr() is always interpreted as a Unicode codepoint,
and mb_ord() always returns a Unicode codepoint, regardless of the
target encoding. If a big-endian binary encoding is what you want, you
don't need mbstring to implement that.
Diffstat (limited to 'ext')
-rw-r--r-- | ext/mbstring/mbstring.c | 117 | ||||
-rw-r--r-- | ext/mbstring/tests/mb_chr.phpt | 2 | ||||
-rw-r--r-- | ext/mbstring/tests/mb_ord.phpt | 2 |
3 files changed, 20 insertions, 101 deletions
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 9b9458afc4..aca182acf6 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -5065,8 +5065,6 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char* enum mbfl_no_encoding no_enc; char* ret; size_t ret_len; - const mbfl_encoding *encoding; - unsigned char char_len; zend_long cp; if (enc == NULL) { @@ -5080,52 +5078,20 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char* } } - if (php_mb_is_no_encoding_unicode(no_enc)) { - - ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len); - - if (ret == NULL) { - return -1; - } - - cp = (unsigned char) ret[0] << 24 | \ - (unsigned char) ret[1] << 16 | \ - (unsigned char) ret[2] << 8 | \ - (unsigned char) ret[3]; - - efree(ret); - - return cp; - - } else if (php_mb_is_unsupported_no_encoding(no_enc)) { + if (php_mb_is_unsupported_no_encoding(no_enc)) { php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc); return -1; } - ret = php_mb_convert_encoding(str, str_len, enc, enc, &ret_len); - + ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len); if (ret == NULL) { return -1; } - encoding = mbfl_no2encoding(no_enc); - char_len = php_mb_mbchar_bytes_ex(ret, encoding); - - if (char_len == 1) { - cp = (unsigned char) ret[0]; - } else if (char_len == 2) { - cp = ((unsigned char) ret[0] << 8) | \ - (unsigned char) ret[1]; - } else if (char_len == 3) { - cp = ((unsigned char) ret[0] << 16) | \ - ((unsigned char) ret[1] << 8) | \ - (unsigned char) ret[2]; - } else { - cp = ((unsigned char) ret[0] << 24) | \ - ((unsigned char) ret[1] << 16) | \ - ((unsigned char) ret[2] << 8) | \ - (unsigned char) ret[3]; - } + cp = (unsigned char) ret[0] << 24 | \ + (unsigned char) ret[1] << 16 | \ + (unsigned char) ret[2] << 8 | \ + (unsigned char) ret[3]; efree(ret); @@ -5217,77 +5183,30 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t 
*output_len return ret; - } else if (php_mb_is_no_encoding_unicode(no_enc)) { - - if (0 > cp || 0x10ffff < cp) { - - if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { - cp = MBSTRG(current_filter_illegal_substchar); - } else { - cp = 0x3f; - } - - } - - buf_len = 4; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = (cp >> 24) & 0xff; - buf[1] = (cp >> 16) & 0xff; - buf[2] = (cp >> 8) & 0xff; - buf[3] = cp & 0xff; - buf[4] = 0; - - ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len); - efree(buf); - - if (output_len) { - *output_len = ret_len; - } - - return ret; - } else if (php_mb_is_unsupported_no_encoding(no_enc)) { php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc); return NULL; } - if (0 > cp || cp > 0x100000000) { - if (no_enc == MBSTRG(current_internal_encoding)->no_encoding) { + if (0 > cp || 0x10ffff < cp) { + + if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { cp = MBSTRG(current_filter_illegal_substchar); } else { cp = 0x3f; } - } - if (cp < 0x100) { - buf_len = 1; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp; - buf[1] = 0; - } else if (cp < 0x10000) { - buf_len = 2; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp >> 8; - buf[1] = cp & 0xff; - buf[2] = 0; - } else if (cp < 0x1000000) { - buf_len = 3; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp >> 16; - buf[1] = (cp >> 8) & 0xff; - buf[2] = cp & 0xff; - buf[3] = 0; - } else { - buf_len = 4; - buf = (char *) safe_emalloc(buf_len, 1, 1); - buf[0] = cp >> 24; - buf[1] = (cp >> 16) & 0xff; - buf[2] = (cp >> 8) & 0xff; - buf[3] = cp & 0xff; - buf[4] = 0; } - ret = php_mb_convert_encoding(buf, buf_len, enc, enc, &ret_len); + buf_len = 4; + buf = (char *) safe_emalloc(buf_len, 1, 1); + buf[0] = (cp >> 24) & 0xff; + buf[1] = (cp >> 16) & 0xff; + buf[2] = (cp >> 8) & 0xff; + buf[3] = cp & 0xff; + buf[4] = 0; + + ret = php_mb_convert_encoding(buf, buf_len, enc, 
"UCS-4BE", &ret_len); efree(buf); if (output_len) { diff --git a/ext/mbstring/tests/mb_chr.phpt b/ext/mbstring/tests/mb_chr.phpt index 8ec35920c3..b99aa12b99 100644 --- a/ext/mbstring/tests/mb_chr.phpt +++ b/ext/mbstring/tests/mb_chr.phpt @@ -6,7 +6,7 @@ mb_chr() <?php var_dump( "\u{20bb7}" === mb_chr(0x20bb7), - "\x8f\xa1\xef" === mb_chr(0x8fa1ef, "EUC-JP-2004"), + "\x8f\xa1\xef" === mb_chr(0x50aa, "EUC-JP-2004"), "?" === mb_chr(0xd800) ); diff --git a/ext/mbstring/tests/mb_ord.phpt b/ext/mbstring/tests/mb_ord.phpt index e3f5343fd8..4bf0d0c0e4 100644 --- a/ext/mbstring/tests/mb_ord.phpt +++ b/ext/mbstring/tests/mb_ord.phpt @@ -7,7 +7,7 @@ mb_ord() var_dump( 0x20bb7 === mb_ord("\u{20bb7}"), 0x3f === mb_ord("\u{d800}"), - 0x8fa1ef === mb_ord("\x8f\xa1\xef", "EUC-JP-2004") + 0x50aa === mb_ord("\x8f\xa1\xef", "EUC-JP-2004") ); // Invalid |