diff options
| author | legale <legale.legale@gmail.com> | 2019-02-09 20:27:48 +0100 |
|---|---|---|
| committer | Nikita Popov <nikita.ppv@gmail.com> | 2019-02-12 16:42:51 +0100 |
| commit | d77ad27415a34e4f5908cb262567b7b6f0eca17f (patch) | |
| tree | cffd779071dfc568a5c4558dcd70a34bb635ea46 | |
| parent | 083cfc07cc13b1804ceeaa61f44319c598c05670 (diff) | |
| download | php-git-d77ad27415a34e4f5908cb262567b7b6f0eca17f.tar.gz | |
Implement mb_str_split()
RFC: https://wiki.php.net/rfc/mb_str_split
| -rw-r--r-- | UPGRADING | 5 | ||||
| -rw-r--r-- | ext/mbstring/mbstring.c | 170 | ||||
| -rw-r--r-- | ext/mbstring/mbstring.h | 1 | ||||
| -rw-r--r-- | ext/mbstring/tests/mb_str_split_jp.phpt | 76 | ||||
| -rw-r--r-- | ext/mbstring/tests/mb_str_split_ru.phpt | 75 | ||||
| -rw-r--r-- | ext/mbstring/tests/mb_str_split_utf8_utf16.phpt | 81 |
6 files changed, 408 insertions, 0 deletions
@@ -114,6 +114,11 @@ PHP 7.4 UPGRADE NOTES native variables and create/access data structures defined in C libraries. RFC: https://wiki.php.net/rfc/ffi +- Mbstring: + . Added mb_str_split() function, which provide the same functionality as + str_split(), but operating on code points rather than bytes. + RFC: https://wiki.php.net/rfc/mb_str_split + - OPcache: . Support for preloading code has been added. RFC: https://wiki.php.net/rfc/preload diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 3e292a0804..004a1d40d6 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -229,6 +229,12 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_output_handler, 0, 0, 2) ZEND_ARG_INFO(0, status) ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_str_split, 0, 0, 1) + ZEND_ARG_INFO(0, str) + ZEND_ARG_INFO(0, split_length) + ZEND_ARG_INFO(0, encoding) +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_strlen, 0, 0, 1) ZEND_ARG_INFO(0, str) ZEND_ARG_INFO(0, encoding) @@ -526,6 +532,7 @@ static const zend_function_entry mbstring_functions[] = { PHP_FE(mb_parse_str, arginfo_mb_parse_str) PHP_FE(mb_output_handler, arginfo_mb_output_handler) PHP_FE(mb_preferred_mime_name, arginfo_mb_preferred_mime_name) + PHP_FE(mb_str_split, arginfo_mb_str_split) PHP_FE(mb_strlen, arginfo_mb_strlen) PHP_FE(mb_strpos, arginfo_mb_strpos) PHP_FE(mb_strrpos, arginfo_mb_strrpos) @@ -2273,6 +2280,169 @@ PHP_FUNCTION(mb_output_handler) } /* }}} */ +/* {{{ proto array mb_str_split(string str [, int split_length] [, string encoding]) + Convert a multibyte string to an array. If split_length is specified, + break the string down into chunks each split_length characters long. 
*/ + +/* structure to pass split params to the callback */ +struct mbfl_split_params { + zval *return_value; /* php function return value structure pointer */ + mbfl_string *result_string; /* string to store result chunk */ + size_t mb_chunk_length; /* actual chunk length in chars */ + size_t split_length; /* split length in chars */ + mbfl_convert_filter *next_filter; /* widechar to encoding converter */ +}; + +/* callback function to fill split array */ +static int mbfl_split_output(int c, void *data) +{ + struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */ + + (*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */ + + if(params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */ + mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */ + mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */ + mbfl_string *chunk = params->result_string; + mbfl_memory_device_result(device, chunk); /* make chunk */ + add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */ + efree(chunk->val); + params->mb_chunk_length = 0; /* reset mb_chunk size */ + } + return 0; +} + +PHP_FUNCTION(mb_str_split) +{ + zend_string *str, *encoding = NULL; + size_t mb_len, chunks, chunk_len; + const char *p, *last; /* pointer for the string cursor and last string char */ + mbfl_string string, result_string; + const mbfl_encoding *mbfl_encoding; + zend_long split_length = 1; + + ZEND_PARSE_PARAMETERS_START(1, 3) + Z_PARAM_STR(str) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(split_length) + Z_PARAM_STR(encoding) + ZEND_PARSE_PARAMETERS_END(); + + if (split_length <= 0) { + php_error_docref(NULL, E_WARNING, "The length of each segment must be greater than zero"); + RETURN_FALSE; + } + + /* fill mbfl_string structure */ + 
string.val = (unsigned char *) ZSTR_VAL(str); + string.len = ZSTR_LEN(str); + string.no_language = MBSTRG(language); + string.encoding = php_mb_get_encoding(encoding); + if (!string.encoding) { + RETURN_FALSE; + } + + p = ZSTR_VAL(str); /* string cursor pointer */ + last = ZSTR_VAL(str) + ZSTR_LEN(str); /* last string char pointer */ + + mbfl_encoding = string.encoding; + + /* first scenario: 1,2,4-bytes fixed width encodings (head part) */ + if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */ + mb_len = string.len; + chunk_len = (size_t)split_length; /* chunk length in bytes */ + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */ + mb_len = string.len / 2; + chunk_len = split_length * 2; + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */ + mb_len = string.len / 4; + chunk_len = split_length * 4; + } else if (mbfl_encoding->mblen_table != NULL) { + /* second scenario: variable width encodings with length table */ + char unsigned const *mbtab = mbfl_encoding->mblen_table; + + /* assume that we have 1-bytes characters */ + array_init_size(return_value, (string.len + split_length) / split_length); /* round up */ + + while (p < last) { /* split cycle work until the cursor has reached the last byte */ + char const *chunk_p = p; /* chunk first byte pointer */ + chunk_len = 0; /* chunk length in bytes */ + for (zend_long char_count = 0; char_count < split_length && p < last; ++char_count) { + char unsigned const m = mbtab[*(const unsigned char *)p]; /* single character length table */ + chunk_len += m; + p += m; + } + if (p >= last) chunk_len -= p - last; /* check if chunk is in bounds */ + add_next_index_stringl(return_value, chunk_p, chunk_len); + } + return; + } else { + /* third scenario: other multibyte encodings */ + mbfl_convert_filter *filter, *decoder; + + /* assume that we have 1-bytes characters */ + array_init_size(return_value, (string.len + split_length) / 
split_length); /* round up */ + + /* decoder filter to decode wchar to encoding */ + mbfl_memory_device device; + mbfl_memory_device_init(&device, split_length + 1, 0); + + decoder = mbfl_convert_filter_new( + &mbfl_encoding_wchar, + string.encoding, + mbfl_memory_device_output, + NULL, + &device); + /* if something wrong with the decoded */ + if (decoder == NULL) { + RETURN_FALSE; + } + + /* wchar filter */ + mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */ + struct mbfl_split_params params = { /* init callback function params structure */ + .return_value = return_value, + .result_string = &result_string, + .mb_chunk_length = 0, + .split_length = (size_t)split_length, + .next_filter = decoder, + }; + + filter = mbfl_convert_filter_new( + string.encoding, + &mbfl_encoding_wchar, + mbfl_split_output, + NULL, + &params); + /* if something wrong with the filter */ + if (filter == NULL){ + mbfl_convert_filter_delete(decoder); /* this will free allocated memory for the decoded */ + RETURN_FALSE; + } + + while (p < last - 1) { /* cycle each byte except last with callback function */ + (*filter->filter_function)(*p++, filter); + } + params.mb_chunk_length = split_length - 1; /* force to finish current chunk */ + (*filter->filter_function)(*p++, filter); /*process last char */ + + mbfl_convert_filter_delete(decoder); + mbfl_convert_filter_delete(filter); + return; + } + + /* first scenario: 1,2,4-bytes fixed width encodings (tail part) */ + chunks = (mb_len + split_length - 1) / split_length; /* (round up idiom) */ + array_init_size(return_value, chunks); + if (chunks != 0) { + for (zend_long i = 0; i < chunks - 1; p += chunk_len, ++i) { + add_next_index_stringl(return_value, p, chunk_len); + } + add_next_index_stringl(return_value, p, last - p); + } +} +/* }}} */ + /* {{{ proto int mb_strlen(string str [, string encoding]) Get character numbers of a string */ PHP_FUNCTION(mb_strlen) diff --git a/ext/mbstring/mbstring.h 
b/ext/mbstring/mbstring.h index 37965ec289..7321525064 100644 --- a/ext/mbstring/mbstring.h +++ b/ext/mbstring/mbstring.h @@ -78,6 +78,7 @@ PHP_FUNCTION(mb_substitute_character); PHP_FUNCTION(mb_preferred_mime_name); PHP_FUNCTION(mb_parse_str); PHP_FUNCTION(mb_output_handler); +PHP_FUNCTION(mb_str_split); PHP_FUNCTION(mb_strlen); PHP_FUNCTION(mb_strpos); PHP_FUNCTION(mb_strrpos); diff --git a/ext/mbstring/tests/mb_str_split_jp.phpt b/ext/mbstring/tests/mb_str_split_jp.phpt new file mode 100644 index 0000000000..84f63030d6 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_jp.phpt @@ -0,0 +1,76 @@ +--TEST-- +mb_str_split() tests for the japanese language +--SKIPIF-- +<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?> +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- +<?php +ini_set('include_path','.'); +include_once('common.inc'); + +$string = "日本"; /* 2 chars */ +$len = 2; +$charset = [ + "BIG-5", + "EUC-JP", + "ISO-2022-JP", + "SJIS", + "UTF-16BE", + "UTF-16LE", + "UTF-32BE", + "UTF-32LE", + "UTF-8" +]; + + +foreach($charset as $cs){ + $enc = mb_convert_encoding($string, $cs, "UTF-8"); + $split = mb_str_split($enc, 1, $cs); + + /* check chunks number */ + for($i = 1; $i <= $len; ++$i){ + $ceil = ceil($len / $i); + $cnt = count(mb_str_split($enc,$i,$cs)); + if($ceil != $cnt){ + echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n"; + } + } + + /* check content */ + echo "$cs:"; + for($i = 0; $i < $len; ++$i){ + echo " " . 
unpack("H*", $split[$i])[1]; + } + echo "\n"; +} + +/* long string test */ +$size = 50000; +$long = str_repeat($string, $size); /* 50k x 2 chars = 1e5 chars */ +$enc = mb_convert_encoding($long, "ISO-2022-JP", "UTF-8"); +$array = mb_str_split($enc, $len, "ISO-2022-JP"); +$count = count($array); + +/* check array size */ +if($size !== $count) printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size); + +/* compare initial string and last array element after splitting */ +$enc = mb_convert_encoding($string, "ISO-2022-JP", "UTF-8"); +if(end($array) !== $enc){ + printf("Long string splitting error: + last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]); +} + +?> +--EXPECT-- +BIG-5: a4e9 a5bb +EUC-JP: c6fc cbdc +ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842 +SJIS: 93fa 967b +UTF-16BE: 65e5 672c +UTF-16LE: e565 2c67 +UTF-32BE: 000065e5 0000672c +UTF-32LE: e5650000 2c670000 +UTF-8: e697a5 e69cac diff --git a/ext/mbstring/tests/mb_str_split_ru.phpt b/ext/mbstring/tests/mb_str_split_ru.phpt new file mode 100644 index 0000000000..75e49275d7 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_ru.phpt @@ -0,0 +1,75 @@ +--TEST-- +mb_str_split() tests for the russian language +--SKIPIF-- +<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?> +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- +<?php +ini_set('include_path','.'); +include_once('common.inc'); + +$string = "рай рай рай "; /* 12 chars */ +$len = 12; +$charset = [ + "EUC-JP", + "CP866", + "KOI8-R", + "UTF-16BE", + "UTF-16LE", + "UTF-32BE", + "UTF-32LE", + "UTF-8" +]; + + +foreach($charset as $cs){ + $enc = mb_convert_encoding($string, $cs, "UTF-8"); + $split = mb_str_split($enc, 1, $cs); + + + /* check chunks number */ + for($i = 1; $i <= $len; ++$i){ + $ceil = ceil($len / $i); + $cnt = count(mb_str_split($enc,$i,$cs)); + if($ceil != $cnt){ + echo "$cs WRONG CHUNKS NUMBER: expected/actual: $ceil/$cnt\n"; + } + } 
+ + /* check content */ + echo "$cs:"; + for($i = 0; $i < $len; ++$i){ + echo " " . unpack("H*", $split[$i])[1]; + } + echo "\n"; +} + +/* long string test */ +$size = 25000; +$long = str_repeat($string, $size); /* 25k x 12 chars = 3e5 chars */ +$enc = mb_convert_encoding($long, "EUC-JP", "UTF-8"); +$array = mb_str_split($enc, $len, "EUC-JP"); +$count = count($array); + +/* check array size */ +if($size !== $count) printf("Long string splitting error: actual array size: %d expected: %d\n", $count, $size); + +/* compare initial string and last array element after splitting */ +$enc = mb_convert_encoding($string, "EUC-JP", "UTF-8"); +if(end($array) !== $enc){ + printf("Long string splitting error: + last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]); +} + +?> +--EXPECT-- +EUC-JP: a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 +CP866: e0 a0 a9 20 e0 a0 a9 20 e0 a0 a9 20 +KOI8-R: d2 c1 ca 20 d2 c1 ca 20 d2 c1 ca 20 +UTF-16BE: 0440 0430 0439 0020 0440 0430 0439 0020 0440 0430 0439 0020 +UTF-16LE: 4004 3004 3904 2000 4004 3004 3904 2000 4004 3004 3904 2000 +UTF-32BE: 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 +UTF-32LE: 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 +UTF-8: d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 diff --git a/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt b/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt new file mode 100644 index 0000000000..b8234bb322 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt @@ -0,0 +1,81 @@ +--TEST-- +mb_str_split() tests UTF-8 illegal chars & UTF-16 surrogate pairs +--SKIPIF-- +<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?> +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- +<?php +ini_set('include_path','.'); +include_once('common.inc'); + +/* 123 string and 4-bytes length 
character 0xf09280a9 */ +$utf8 = pack("H*", "313233f09280a9"); + +/* 123 string and 4-bytes length character 0xf09280a9 head without tail */ +$utf8_bad = pack("H*", "313233f092"); + +/* very first and very last utf-16 4-bytes characters */ +$utf16_first_be = pack("H*", "d800dc00"); +$utf16_first_le = pack("H*", "00d800dc"); + +$utf16_last_be = pack("H*", "dbffdfff"); +$utf16_last_le = pack("H*", "ffdbffdf"); +$utf16be_char_bad = pack("H*", "dc00dc00"); /* this char is illegal because it starts from low surrogate char */ +$utf16le_char_bad = pack("H*", "00dc00dc"); /* this char is illegal because it starts from low surrogate char */ + + +$utf16be = $utf16_first_be . $utf16_last_be; +$utf16le = $utf16_first_le . $utf16_last_le; + +$utf16be_bad = $utf16_first_be . $utf16be_char_bad; +$utf16le_bad = $utf16_first_le . $utf16le_char_bad; + +/* print each chunk as HEX string */ +echo "UTF-8:"; +foreach(mb_str_split($utf8, 2) as $chunk){ + printf(" l:%d v:%s", strlen($chunk), unpack("H*", $chunk)[1]); +} +echo PHP_EOL; + +echo "BAD UTF-8:"; +foreach(mb_str_split($utf8_bad, 2) as $chunk){ + printf(" l:%d v:%s", strlen($chunk), unpack("H*", $chunk)[1]); +} +echo PHP_EOL; + +echo "UTF-16BE:"; +foreach(mb_str_split($utf16be, 1, "UTF-16BE") as $chunk){ + printf(" l:%d v:%s", strlen($chunk), unpack("H*", $chunk)[1]); +} +echo PHP_EOL; + +echo "UTF-16LE:"; +foreach(mb_str_split($utf16le, 1, "UTF-16LE") as $chunk){ + printf(" l:%d v:%s", strlen($chunk), unpack("H*", $chunk)[1]); +} +echo PHP_EOL; + +echo "BAD UTF-16BE:"; +foreach(mb_str_split($utf16be_bad, 1, "UTF-16BE") as $chunk){ + printf(" l:%d v:%s", strlen($chunk), unpack("H*", $chunk)[1]); +} +echo PHP_EOL; + +echo "BAD UTF-16LE:"; +foreach(mb_str_split($utf16le_bad, 1, "UTF-16LE") as $chunk){ + printf(" l:%d v:%s", strlen($chunk), unpack("H*", $chunk)[1]); +} +echo PHP_EOL; + +?> +--EXPECT-- +UTF-8: l:2 v:3132 l:5 v:33f09280a9 +BAD UTF-8: l:2 v:3132 l:3 v:33f092 +UTF-16BE: l:4 v:d800dc00 l:4 v:dbffdfff +UTF-16LE: l:4 
v:00d800dc l:4 v:ffdbffdf +BAD UTF-16BE: l:4 v:d800dc00 l:2 v:003f l:2 v:003f +BAD UTF-16LE: l:4 v:00d800dc l:2 v:3f00 l:2 v:3f00 + + |
