diff options
author | Stanislav Malyshev <stas@php.net> | 2016-11-27 15:36:29 -0800 |
---|---|---|
committer | Stanislav Malyshev <stas@php.net> | 2016-11-27 15:36:29 -0800 |
commit | f1a9851c3e4aa5606d209b44742dc7156b49f7b7 (patch) | |
tree | 714c487dc87609d58f60c6840735675abac4cfd1 | |
parent | 1cb58ead7032a4fe11cda07181997705c37610f1 (diff) | |
parent | 8856b3a63c6b3eefe83aecaa9e18afb640173708 (diff) | |
download | php-git-f1a9851c3e4aa5606d209b44742dc7156b49f7b7.tar.gz |
Merge branch 'PHP-5.6' into PHP-7.0
* PHP-5.6:
Fix #68447: grapheme_extract take an extra trailing character
-rw-r--r-- | ext/intl/grapheme/grapheme_string.c | 78 |
-rw-r--r-- | ext/intl/tests/bug68447.phpt | 28 |
2 files changed, 56 insertions, 50 deletions
diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 5687e3e260..f69500429d 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -676,8 +676,10 @@ PHP_FUNCTION(grapheme_stristr) static inline int32_t grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len) { - int pos = 0, prev_pos = 0; - int ret_pos = 0, prev_ret_pos = 0; + int pos = 0; + int ret_pos = 0; + int break_pos, prev_break_pos; + int count = 0; while ( 1 ) { pos = ubrk_next(bi); @@ -686,23 +688,24 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char break; } - /* if we are beyond our limit, then the loop is done */ - if ( pos > csize ) { - break; - } + for ( break_pos = ret_pos; break_pos < pos; ) { + count++; + prev_break_pos = break_pos; + U8_FWD_1(pstr, break_pos, str_len); - /* update our pointer in the original UTF-8 buffer by as many characters - as ubrk_next iterated over */ - - prev_ret_pos = ret_pos; - U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); + if ( prev_break_pos == break_pos ) { + /* something wrong - malformed utf8? */ + csize = 0; + break; + } + } - if ( prev_ret_pos == ret_pos ) { - /* something wrong - malformed utf8? 
*/ + /* if we are beyond our limit, then the loop is done */ + if ( count > csize ) { break; } - prev_pos = pos; + ret_pos = break_pos; } return ret_pos; @@ -713,8 +716,8 @@ grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char static inline int32_t grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len) { - int pos = 0, prev_pos = 0; - int ret_pos = 0, prev_ret_pos = 0; + int pos = 0; + int ret_pos = 0; while ( 1 ) { pos = ubrk_next(bi); @@ -723,20 +726,11 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char break; } - prev_ret_pos = ret_pos; - U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos); - - if ( ret_pos > bsize ) { - ret_pos = prev_ret_pos; - break; - } - - if ( prev_ret_pos == ret_pos ) { - /* something wrong - malformed utf8? */ + if ( pos > bsize ) { break; } - prev_pos = pos; + ret_pos = pos; } return ret_pos; @@ -747,7 +741,7 @@ grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char static inline int32_t grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len) { - int pos = 0, next_pos = 0; + int next_pos = 0; int ret_pos = 0; while ( size ) { @@ -756,16 +750,10 @@ grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pst if ( UBRK_DONE == next_pos ) { break; } - pos = next_pos; + ret_pos = next_pos; size--; } - /* pos is one past the last UChar - and represent the number of code units to - advance in the utf-8 buffer - */ - - U8_FWD_N(pstr, ret_pos, str_len, pos); - return ret_pos; } /* }}} */ @@ -785,9 +773,8 @@ static grapheme_extract_iter grapheme_extract_iters[] = { PHP_FUNCTION(grapheme_extract) { char *str, *pstr; - UChar *ustr; + UText ut = UTEXT_INITIALIZER; size_t str_len; - int32_t ustr_len; zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */ zend_long lstart = 0; /* starting position 
in str in bytes */ int32_t start = 0; @@ -871,21 +858,15 @@ PHP_FUNCTION(grapheme_extract) RETURN_STRINGL(pstr, nsize); } - /* convert the strings to UTF-16. */ - ustr = NULL; - ustr_len = 0; status = U_ZERO_ERROR; - intl_convert_utf8_to_utf16(&ustr, &ustr_len, pstr, str_len, &status ); + utext_openUTF8(&ut, pstr, str_len, &status); if ( U_FAILURE( status ) ) { /* Set global error code. */ intl_error_set_code( NULL, status ); /* Set error messages. */ - intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); - - if ( NULL != ustr ) - efree( ustr ); + intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 ); RETURN_FALSE; } @@ -894,8 +875,7 @@ PHP_FUNCTION(grapheme_extract) status = U_ZERO_ERROR; bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status ); - ubrk_setText(bi, ustr, ustr_len, &status); - + ubrk_setUText(bi, &ut, &status); /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we can't back up. So, we will not do anything. */ @@ -903,9 +883,7 @@ PHP_FUNCTION(grapheme_extract) /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */ ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len); - if (ustr) { - efree(ustr); - } + utext_close(&ut); ubrk_close(bi); if ( NULL != next ) { diff --git a/ext/intl/tests/bug68447.phpt b/ext/intl/tests/bug68447.phpt new file mode 100644 index 0000000000..f320276df2 --- /dev/null +++ b/ext/intl/tests/bug68447.phpt @@ -0,0 +1,28 @@ +--TEST-- +Bug #68447: grapheme_extract take an extra trailing character +--SKIPIF-- +<?php if( !extension_loaded( 'intl' ) ) print 'skip'; ?> +--FILE-- +<?php +$katsushikaku = "葛󠄁飾区"; +echo grapheme_extract($katsushikaku, 1) . "\n"; + +$haiyore = "這󠄀いよれ"; +echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_COUNT) . "\n"; +echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_COUNT) . 
"\n"; +echo grapheme_extract($haiyore, 6, GRAPHEME_EXTR_MAXBYTES) . "\n"; +echo grapheme_extract($haiyore, 9, GRAPHEME_EXTR_MAXBYTES) . "\n"; +echo grapheme_extract($haiyore, 12, GRAPHEME_EXTR_MAXBYTES) . "\n"; +echo grapheme_extract($haiyore, 1, GRAPHEME_EXTR_MAXCHARS) . "\n"; +echo grapheme_extract($haiyore, 2, GRAPHEME_EXTR_MAXCHARS) . "\n"; +echo grapheme_extract($haiyore, 3, GRAPHEME_EXTR_MAXCHARS) . "\n"; +--EXPECT-- +葛󠄁 +這󠄀 +這󠄀い + +這󠄀 +這󠄀い + +這󠄀 +這󠄀い |