summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStanislav Malyshev <stas@php.net>2016-11-27 15:11:17 -0800
committerStanislav Malyshev <stas@php.net>2016-11-27 15:34:58 -0800
commit8856b3a63c6b3eefe83aecaa9e18afb640173708 (patch)
tree6bbe501c29719a4053a214478e44e2c3d96e2338
parent5049ef2f1c496c4964cd147e185c1f765ab0347b (diff)
parentdf683fa3b0a66d7626d6720858f309da9e7880e5 (diff)
downloadphp-git-8856b3a63c6b3eefe83aecaa9e18afb640173708.tar.gz
Merge branch 'pull-request/1974' into PHP-5.6
* pull-request/1974: Fix #68447: grapheme_extract take an extra trailing character
-rw-r--r--ext/intl/grapheme/grapheme_string.c85
-rw-r--r--ext/intl/tests/bug68447.phpt28
2 files changed, 60 insertions, 53 deletions
diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c
index 3ba9b51524..d298bab98b 100644
--- a/ext/intl/grapheme/grapheme_string.c
+++ b/ext/intl/grapheme/grapheme_string.c
@@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr)
static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, prev_pos = 0;
- int ret_pos = 0, prev_ret_pos = 0;
+ int pos = 0;
+ int ret_pos = 0;
+ int break_pos, prev_break_pos;
+ int count = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
break;
}
- /* if we are beyond our limit, then the loop is done */
- if ( pos > csize ) {
- break;
- }
+ for ( break_pos = ret_pos; break_pos < pos; ) {
+ count++;
+ prev_break_pos = break_pos;
+ U8_FWD_1(pstr, break_pos, str_len);
- /* update our pointer in the original UTF-8 buffer by as many characters
- as ubrk_next iterated over */
-
- prev_ret_pos = ret_pos;
- U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
+ if ( prev_break_pos == break_pos ) {
+ /* something wrong - malformed utf8? */
+ csize = 0;
+ break;
+ }
+ }
- if ( prev_ret_pos == ret_pos ) {
- /* something wrong - malformed utf8? */
+ /* if we are beyond our limit, then the loop is done */
+ if ( count > csize ) {
break;
}
- prev_pos = pos;
+ ret_pos = break_pos;
}
return ret_pos;
@@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char
static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, prev_pos = 0;
- int ret_pos = 0, prev_ret_pos = 0;
+ int pos = 0;
+ int ret_pos = 0;
while ( 1 ) {
pos = ubrk_next(bi);
@@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
break;
}
- prev_ret_pos = ret_pos;
- U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
-
- if ( ret_pos > bsize ) {
- ret_pos = prev_ret_pos;
- break;
- }
-
- if ( prev_ret_pos == ret_pos ) {
- /* something wrong - malformed utf8? */
+ if ( pos > bsize ) {
break;
}
- prev_pos = pos;
+ ret_pos = pos;
}
return ret_pos;
@@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char
static inline int32_t
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
{
- int pos = 0, next_pos = 0;
+ int next_pos = 0;
int ret_pos = 0;
while ( size ) {
@@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst
if ( UBRK_DONE == next_pos ) {
break;
}
- pos = next_pos;
+ ret_pos = next_pos;
size--;
}
- /* pos is one past the last UChar - and represent the number of code units to
- advance in the utf-8 buffer
- */
-
- U8_FWD_N(pstr, ret_pos, str_len, pos);
-
return ret_pos;
}
/* }}} */
@@ -810,11 +798,11 @@ static grapheme_extract_iter grapheme_extract_iters[] = {
Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)
{
- unsigned char *str, *pstr;
- UChar *ustr;
- int str_len, ustr_len;
- long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
- long lstart = 0; /* starting position in str in bytes */
+ char *str, *pstr;
+ UText ut = UTEXT_INITIALIZER;
+ size_t str_len;
+ zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
+ zend_long lstart = 0; /* starting position in str in bytes */
int32_t start = 0;
long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
UErrorCode status;
@@ -900,21 +888,15 @@ PHP_FUNCTION(grapheme_extract)
RETURN_STRINGL(((char *)pstr), nsize, 1);
}
- /* convert the strings to UTF-16. */
- ustr = NULL;
- ustr_len = 0;
status = U_ZERO_ERROR;
- intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
+ utext_openUTF8(&ut, pstr, str_len, &status);
if ( U_FAILURE( status ) ) {
/* Set global error code. */
intl_error_set_code( NULL, status TSRMLS_CC );
/* Set error messages. */
- intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
-
- if ( NULL != ustr )
- efree( ustr );
+ intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC );
RETURN_FALSE;
}
@@ -923,8 +905,7 @@ PHP_FUNCTION(grapheme_extract)
status = U_ZERO_ERROR;
bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
- ubrk_setText(bi, ustr, ustr_len, &status);
-
+ ubrk_setUText(bi, &ut, &status);
/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
can't back up. So, we will not do anything. */
@@ -932,9 +913,7 @@ PHP_FUNCTION(grapheme_extract)
ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
- if (ustr) {
- efree(ustr);
- }
+ utext_close(&ut);
ubrk_close(bi);
if ( NULL != next ) {
diff --git a/ext/intl/tests/bug68447.phpt b/ext/intl/tests/bug68447.phpt
new file mode 100644
index 0000000000..f320276df2
--- /dev/null
+++ b/ext/intl/tests/bug68447.phpt
@@ -0,0 +1,28 @@
+--TEST--
+Bug #68447: grapheme_extract take an extra trailing character
+--SKIPIF--
+<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?>
+--FILE--
+<?php
+$katsushikaku = "葛󠄁飾区";
+echo grapheme_extract($katsushikaku, 1) . "\n";
+
+$haiyore = "這󠄀いよれ";
+echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n";
+echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . "\n";
+echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n";
+echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n";
+echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n";
+echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n";
+--EXPECT--
+葛󠄁
+這󠄀
+這󠄀い
+
+這󠄀
+這󠄀い
+
+這󠄀
+這󠄀い