summary | refs | log | tree | commit | diff
path: root/ext
diff options
context:
space:
mode:
author: Nikita Popov <nikita.ppv@gmail.com> 2017-08-03 22:14:00 +0200
committer: Nikita Popov <nikita.ppv@gmail.com> 2017-08-03 22:14:00 +0200
commit: 41e9ba6333ab58c9fe3eb8bd413cc3e0eca87be1 (patch)
tree: 2686eda5727a0d2955113d719ac60c69b6f31911 /ext
parent: fb9bf5b64b6c09b9d93bbd1dadd64884e0af66f3 (diff)
download: php-git-41e9ba6333ab58c9fe3eb8bd413cc3e0eca87be1.tar.gz
Always use Unicode codepoints in mb_ord() and mb_chr()
Previously mb_chr() had two different encoding-dependent behaviors:

 * For "Unicode-encodings" it took a Unicode codepoint and returned its
   encoded representation.
 * Otherwise it returned a big-endian binary encoding of the passed integer.

Now the input is always interpreted as a Unicode codepoint.

If a big-endian binary encoding is what you want, you don't need mbstring to
implement that.
Diffstat (limited to 'ext')
-rw-r--r-- ext/mbstring/mbstring.c 117
-rw-r--r-- ext/mbstring/tests/mb_chr.phpt 2
-rw-r--r-- ext/mbstring/tests/mb_ord.phpt 2
3 files changed, 20 insertions, 101 deletions
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 9b9458afc4..aca182acf6 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -5065,8 +5065,6 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char*
enum mbfl_no_encoding no_enc;
char* ret;
size_t ret_len;
- const mbfl_encoding *encoding;
- unsigned char char_len;
zend_long cp;
if (enc == NULL) {
@@ -5080,52 +5078,20 @@ static inline zend_long php_mb_ord(const char* str, size_t str_len, const char*
}
}
- if (php_mb_is_no_encoding_unicode(no_enc)) {
-
- ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len);
-
- if (ret == NULL) {
- return -1;
- }
-
- cp = (unsigned char) ret[0] << 24 | \
- (unsigned char) ret[1] << 16 | \
- (unsigned char) ret[2] << 8 | \
- (unsigned char) ret[3];
-
- efree(ret);
-
- return cp;
-
- } else if (php_mb_is_unsupported_no_encoding(no_enc)) {
+ if (php_mb_is_unsupported_no_encoding(no_enc)) {
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
return -1;
}
- ret = php_mb_convert_encoding(str, str_len, enc, enc, &ret_len);
-
+ ret = php_mb_convert_encoding(str, str_len, "UCS-4BE", enc, &ret_len);
if (ret == NULL) {
return -1;
}
- encoding = mbfl_no2encoding(no_enc);
- char_len = php_mb_mbchar_bytes_ex(ret, encoding);
-
- if (char_len == 1) {
- cp = (unsigned char) ret[0];
- } else if (char_len == 2) {
- cp = ((unsigned char) ret[0] << 8) | \
- (unsigned char) ret[1];
- } else if (char_len == 3) {
- cp = ((unsigned char) ret[0] << 16) | \
- ((unsigned char) ret[1] << 8) | \
- (unsigned char) ret[2];
- } else {
- cp = ((unsigned char) ret[0] << 24) | \
- ((unsigned char) ret[1] << 16) | \
- ((unsigned char) ret[2] << 8) | \
- (unsigned char) ret[3];
- }
+ cp = (unsigned char) ret[0] << 24 | \
+ (unsigned char) ret[1] << 16 | \
+ (unsigned char) ret[2] << 8 | \
+ (unsigned char) ret[3];
efree(ret);
@@ -5217,77 +5183,30 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
return ret;
- } else if (php_mb_is_no_encoding_unicode(no_enc)) {
-
- if (0 > cp || 0x10ffff < cp) {
-
- if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
- cp = MBSTRG(current_filter_illegal_substchar);
- } else {
- cp = 0x3f;
- }
-
- }
-
- buf_len = 4;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = (cp >> 24) & 0xff;
- buf[1] = (cp >> 16) & 0xff;
- buf[2] = (cp >> 8) & 0xff;
- buf[3] = cp & 0xff;
- buf[4] = 0;
-
- ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
- efree(buf);
-
- if (output_len) {
- *output_len = ret_len;
- }
-
- return ret;
-
} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
php_error_docref(NULL, E_WARNING, "Unsupported encoding \"%s\"", enc);
return NULL;
}
- if (0 > cp || cp > 0x100000000) {
- if (no_enc == MBSTRG(current_internal_encoding)->no_encoding) {
+ if (0 > cp || 0x10ffff < cp) {
+
+ if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
cp = MBSTRG(current_filter_illegal_substchar);
} else {
cp = 0x3f;
}
- }
- if (cp < 0x100) {
- buf_len = 1;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp;
- buf[1] = 0;
- } else if (cp < 0x10000) {
- buf_len = 2;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp >> 8;
- buf[1] = cp & 0xff;
- buf[2] = 0;
- } else if (cp < 0x1000000) {
- buf_len = 3;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp >> 16;
- buf[1] = (cp >> 8) & 0xff;
- buf[2] = cp & 0xff;
- buf[3] = 0;
- } else {
- buf_len = 4;
- buf = (char *) safe_emalloc(buf_len, 1, 1);
- buf[0] = cp >> 24;
- buf[1] = (cp >> 16) & 0xff;
- buf[2] = (cp >> 8) & 0xff;
- buf[3] = cp & 0xff;
- buf[4] = 0;
}
- ret = php_mb_convert_encoding(buf, buf_len, enc, enc, &ret_len);
+ buf_len = 4;
+ buf = (char *) safe_emalloc(buf_len, 1, 1);
+ buf[0] = (cp >> 24) & 0xff;
+ buf[1] = (cp >> 16) & 0xff;
+ buf[2] = (cp >> 8) & 0xff;
+ buf[3] = cp & 0xff;
+ buf[4] = 0;
+
+ ret = php_mb_convert_encoding(buf, buf_len, enc, "UCS-4BE", &ret_len);
efree(buf);
if (output_len) {
diff --git a/ext/mbstring/tests/mb_chr.phpt b/ext/mbstring/tests/mb_chr.phpt
index 8ec35920c3..b99aa12b99 100644
--- a/ext/mbstring/tests/mb_chr.phpt
+++ b/ext/mbstring/tests/mb_chr.phpt
@@ -6,7 +6,7 @@ mb_chr()
<?php
var_dump(
"\u{20bb7}" === mb_chr(0x20bb7),
- "\x8f\xa1\xef" === mb_chr(0x8fa1ef, "EUC-JP-2004"),
+ "\x8f\xa1\xef" === mb_chr(0x50aa, "EUC-JP-2004"),
"?" === mb_chr(0xd800)
);
diff --git a/ext/mbstring/tests/mb_ord.phpt b/ext/mbstring/tests/mb_ord.phpt
index e3f5343fd8..4bf0d0c0e4 100644
--- a/ext/mbstring/tests/mb_ord.phpt
+++ b/ext/mbstring/tests/mb_ord.phpt
@@ -7,7 +7,7 @@ mb_ord()
var_dump(
0x20bb7 === mb_ord("\u{20bb7}"),
0x3f === mb_ord("\u{d800}"),
- 0x8fa1ef === mb_ord("\x8f\xa1\xef", "EUC-JP-2004")
+ 0x50aa === mb_ord("\x8f\xa1\xef", "EUC-JP-2004")
);
// Invalid