summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Stanislav Malyshev <stas@php.net> 2016-11-27 15:36:29 -0800
committer Stanislav Malyshev <stas@php.net> 2016-11-27 15:36:29 -0800
commit f1a9851c3e4aa5606d209b44742dc7156b49f7b7 (patch)
tree 714c487dc87609d58f60c6840735675abac4cfd1
parent 1cb58ead7032a4fe11cda07181997705c37610f1 (diff)
parent 8856b3a63c6b3eefe83aecaa9e18afb640173708 (diff)
download php-git-f1a9851c3e4aa5606d209b44742dc7156b49f7b7.tar.gz
Merge branch 'PHP-5.6' into PHP-7.0
* PHP-5.6: Fix #68447: grapheme_extract take an extra trailing character
-rw-r--r-- ext/intl/grapheme/grapheme_string.c 78
-rw-r--r-- ext/intl/tests/bug68447.phpt 28
2 files changed, 56 insertions, 50 deletions
diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c
index 5687e3e260..f69500429d 100644
--- a/ext/intl/grapheme/grapheme_string.c
+++ b/ext/intl/grapheme/grapheme_string.c
@@ -676,8 +676,10 @@ PHP_FUNCTION(grapheme_stristr)
static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, prev_pos = 0;
- int ret_pos = 0, prev_ret_pos = 0;
+ int pos = 0;
+ int ret_pos = 0;
+ int break_pos, prev_break_pos;
+ int count = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@@ -686,23 +688,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
break;
}
- /* if we are beyond our limit, then the loop is done */
- if ( pos > csize ) {
- break;
- }
+ for ( break_pos = ret_pos; break_pos < pos; ) {
+ count++;
+ prev_break_pos = break_pos;
+ U8_FWD_1(pstr, break_pos, str_len);
- /* update our pointer in the original UTF-8 buffer by as many characters
- as ubrk_next iterated over */
-
- prev_ret_pos = ret_pos;
- U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
+ if ( prev_break_pos == break_pos ) {
+ /* something wrong - malformed utf8? */
+ csize = 0;
+ break;
+ }
+ }
- if ( prev_ret_pos == ret_pos ) {
- /* something wrong - malformed utf8? */
+ /* if we are beyond our limit, then the loop is done */
+ if ( count > csize ) {
break;
}
- prev_pos = pos;
+ ret_pos = break_pos;
}
return ret_pos;
@@ -713,8 +716,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, prev_pos = 0;
- int ret_pos = 0, prev_ret_pos = 0;
+ int pos = 0;
+ int ret_pos = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@@ -723,20 +726,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
break;
}
- prev_ret_pos = ret_pos;
- U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
-
- if ( ret_pos > bsize ) {
- ret_pos = prev_ret_pos;
- break;
- }
-
- if ( prev_ret_pos == ret_pos ) {
- /* something wrong - malformed utf8? */
+ if ( pos > bsize ) {
break;
}
- prev_pos = pos;
+ ret_pos = pos;
}
return ret_pos;
@@ -747,7 +741,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
static inline int32_t
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, next_pos = 0;
+ int next_pos = 0;
int ret_pos = 0;
while ( size ) {
@@ -756,16 +750,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst
if ( UBRK_DONE == next_pos ) {
break;
}
- pos = next_pos;
+ ret_pos = next_pos;
size--;
}
- /* pos is one past the last UChar - and represent the number of code units to
- advance in the utf-8 buffer
- */
-
- U8_FWD_N(pstr, ret_pos, str_len, pos);
-
return ret_pos;
}
/* }}} */
@@ -785,9 +773,8 @@ static grapheme_extract_iter grapheme_extract_iters[] = {
PHP_FUNCTION(grapheme_extract)
{
char *str, *pstr;
- UChar *ustr;
+ UText ut = UTEXT_INITIALIZER;
size_t str_len;
- int32_t ustr_len;
zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
zend_long lstart = 0; /* starting position in str in bytes */
int32_t start = 0;
@@ -871,21 +858,15 @@ PHP_FUNCTION(grapheme_extract)
RETURN_STRINGL(pstr, nsize);
}
- /* convert the strings to UTF-16. */
- ustr = NULL;
- ustr_len = 0;
status = U_ZERO_ERROR;
- intl_convert_utf8_to_utf16(&ustr, &ustr_len, pstr, str_len, &status );
+ utext_openUTF8(&ut, pstr, str_len, &status);
if ( U_FAILURE( status ) ) {
/* Set global error code. */
intl_error_set_code( NULL, status );
/* Set error messages. */
- intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
-
- if ( NULL != ustr )
- efree( ustr );
+ intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
RETURN_FALSE;
}
@@ -894,8 +875,7 @@ PHP_FUNCTION(grapheme_extract)
status = U_ZERO_ERROR;
bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
- ubrk_setText(bi, ustr, ustr_len, &status);
-
+ ubrk_setUText(bi, &ut, &status);
/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
can't back up. So, we will not do anything. */
@@ -903,9 +883,7 @@ PHP_FUNCTION(grapheme_extract)
/* it's ok to convert str_len to int32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
- if (ustr) {
- efree(ustr);
- }
+ utext_close(&ut);
ubrk_close(bi);
if ( NULL != next ) {
diff --git a/ext/intl/tests/bug68447.phpt b/ext/intl/tests/bug68447.phpt
new file mode 100644
index 0000000000..f320276df2
--- /dev/null
+++ b/ext/intl/tests/bug68447.phpt
@@ -0,0 +1,28 @@
+--TEST--
+Bug #68447: grapheme_extract take an extra trailing character
+--SKIPIF--
+<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?>
+--FILE--
+<?php
+$katsushikaku = "葛󠄁飾区";
+echo grapheme_extract($katsushikaku, 1) . "\n";
+
+$haiyore = "這󠄀いよれ";
+echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n";
+echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . "\n";
+echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n";
+echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n";
+echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n";
+--EXPECT--
+葛󠄁
+這󠄀
+這󠄀い
+
+這󠄀
+這󠄀い
+
+這󠄀
+這󠄀い