diff options
author | Stanislav Malyshev <stas@php.net> | 2016-11-27 15:11:17 -0800 |
---|---|---|
committer | Stanislav Malyshev <stas@php.net> | 2016-11-27 15:34:58 -0800 |
commit | 8856b3a63c6b3eefe83aecaa9e18afb640173708 (patch) | |
tree | 6bbe501c29719a4053a214478e44e2c3d96e2338 | |
parent | 5049ef2f1c496c4964cd147e185c1f765ab0347b (diff) | |
parent | df683fa3b0a66d7626d6720858f309da9e7880e5 (diff) | |
download | php-git-8856b3a63c6b3eefe83aecaa9e18afb640173708.tar.gz |
Merge branch 'pull-request/1974' into PHP-5.6
* pull-request/1974:
Fix #68447: grapheme_extract take an extra trailing character
-rw-r--r-- | ext/intl/grapheme/grapheme_string.c | 85 | ||||
-rw-r--r-- | ext/intl/tests/bug68447.phpt | 28 |
2 files changed, 60 insertions, 53 deletions
diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 3ba9b51524..d298bab98b 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -702,8 +702,10 @@ PHP_FUNCTION(grapheme_stristr) static inline int32_t grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len) { - int pos = 0, prev_pos = 0; - int ret_pos = 0, prev_ret_pos = 0; + int pos = 0; + int ret_pos = 0; + int break_pos, prev_break_pos; + int count = 0; while ( 1 ) { pos = ubrk_next(bi); @@ -712,23 +714,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char break; } - /* if we are beyond our limit, then the loop is done */ - if ( pos > csize ) { - break; - } + for ( break_pos = ret_pos; break_pos < pos; ) { + count++; + prev_break_pos = break_pos; + U8_FWD_1(pstr, break_pos, str_len); - /* update our pointer in the original UTF-8 buffer by as many characters - as ubrk_next iterated over */ - - prev_ret_pos = ret_pos; - U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); + if ( prev_break_pos == break_pos ) { + /* something wrong - malformed utf8? */ + csize = 0; + break; + } + } - if ( prev_ret_pos == ret_pos ) { - /* something wrong - malformed utf8? */ + /* if we are beyond our limit, then the loop is done */ + if ( count > csize ) { break; } - prev_pos = pos; + ret_pos = break_pos; } return ret_pos; @@ -739,8 +742,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char static inline int32_t grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len) { - int pos = 0, prev_pos = 0; - int ret_pos = 0, prev_ret_pos = 0; + int pos = 0; + int ret_pos = 0; while ( 1 ) { pos = ubrk_next(bi); @@ -749,20 +752,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char break; } - prev_ret_pos = ret_pos; - U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); - - if ( ret_pos > bsize ) { - ret_pos = prev_ret_pos; - break; - } - - if ( prev_ret_pos == ret_pos ) { - /* something wrong - malformed utf8? */ + if ( pos > bsize ) { break; } - prev_pos = pos; + ret_pos = pos; } return ret_pos; @@ -773,7 +767,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char static inline int32_t grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len) { - int pos = 0, next_pos = 0; + int next_pos = 0; int ret_pos = 0; while ( size ) { @@ -782,16 +776,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst if ( UBRK_DONE == next_pos ) { break; } - pos = next_pos; + ret_pos = next_pos; size--; } - /* pos is one past the last UChar - and represent the number of code units to - advance in the utf-8 buffer - */ - - U8_FWD_N(pstr, ret_pos, str_len, pos); - return ret_pos; } /* }}} */ @@ -810,11 +798,11 @@ static grapheme_extract_iter grapheme_extract_iters[] = { Function to extract a sequence of default grapheme clusters */ PHP_FUNCTION(grapheme_extract) { - unsigned char *str, *pstr; - UChar *ustr; - int str_len, ustr_len; - long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */ - long lstart = 0; /* starting position in str in bytes */ + char *str, *pstr; + UText ut = UTEXT_INITIALIZER; + size_t str_len; + zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */ + zend_long lstart = 0; /* starting position in str in bytes */ int32_t start = 0; long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT; UErrorCode status; @@ -900,21 +888,15 @@ PHP_FUNCTION(grapheme_extract) RETURN_STRINGL(((char *)pstr), nsize, 1); } - /* convert the strings to UTF-16. */ - ustr = NULL; - ustr_len = 0; status = U_ZERO_ERROR; - intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status ); + utext_openUTF8(&ut, pstr, str_len, &status); if ( U_FAILURE( status ) ) { /* Set global error code. */ intl_error_set_code( NULL, status TSRMLS_CC ); /* Set error messages. */ - intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC ); - - if ( NULL != ustr ) - efree( ustr ); + intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC ); RETURN_FALSE; } @@ -923,8 +905,7 @@ PHP_FUNCTION(grapheme_extract) status = U_ZERO_ERROR; bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC ); - ubrk_setText(bi, ustr, ustr_len, &status); - + ubrk_setUText(bi, &ut, &status); /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we can't back up. So, we will not do anything. */ @@ -932,9 +913,7 @@ PHP_FUNCTION(grapheme_extract) ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len); - if (ustr) { - efree(ustr); - } + utext_close(&ut); ubrk_close(bi); if ( NULL != next ) { diff --git a/ext/intl/tests/bug68447.phpt b/ext/intl/tests/bug68447.phpt new file mode 100644 index 0000000000..f320276df2 --- /dev/null +++ b/ext/intl/tests/bug68447.phpt @@ -0,0 +1,28 @@ +--TEST-- +Bug #68447: grapheme_extract take an extra trailing character +--SKIPIF-- +<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?> +--FILE-- +<?php +$katsushikaku = "葛󠄁飾区"; +echo grapheme_extract($katsushikaku, 1) . "\n"; + +$haiyore = "這󠄀いよれ"; +echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n"; +echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . "\n"; +echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n"; +echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n"; +echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n"; +echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n"; +echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n"; +echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n"; +--EXPECT-- +葛󠄁 +這󠄀 +這󠄀い + +這󠄀 +這󠄀い + +這󠄀 +這󠄀い |