diff options
Diffstat (limited to 'Source/JavaScriptCore/icu/unicode/utf8.h')
-rw-r--r-- | Source/JavaScriptCore/icu/unicode/utf8.h | 271 |
1 files changed, 209 insertions, 62 deletions
diff --git a/Source/JavaScriptCore/icu/unicode/utf8.h b/Source/JavaScriptCore/icu/unicode/utf8.h index 6405795a5..21e5f3d04 100644 --- a/Source/JavaScriptCore/icu/unicode/utf8.h +++ b/Source/JavaScriptCore/icu/unicode/utf8.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2009, International Business Machines +* Copyright (C) 1999-2013, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -19,11 +19,9 @@ * \brief C API: 8-bit Unicode handling macros * * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. - * utf8.h is included by utf.h after unicode/umachine.h - * and some common definitions. * * For more information see utf.h and the ICU User Guide Strings chapter - * (http://icu-project.org/userguide/strings.html). + * (http://userguide.icu-project.org/strings). * * <em>Usage:</em> * ICU coding guidelines for if() statements should be followed when using these macros. @@ -34,7 +32,7 @@ #ifndef __UTF8_H__ #define __UTF8_H__ -/* utf.h must be included first. */ +#include "unicode/umachine.h" #ifndef __UTF_H__ # include "unicode/utf.h" #endif @@ -62,13 +60,41 @@ U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ utf8_countTrailBytes[256]; /** - * Count the trail bytes for a UTF-8 lead byte. + * Counts the trail bytes for a UTF-8 lead byte. + * Returns 0 for 0..0xbf as well as for 0xfe and 0xff. * * This is internal since it is not meant to be called directly by external clients; * however it is called by public macros in this file and thus must remain stable. + * + * Note: Beginning with ICU 50, the implementation uses a multi-condition expression + * which was shown in 2012 (on x86-64) to compile to fast, branch-free code. + * leadByte is evaluated multiple times. + * + * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: + * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) + * leadByte was evaluated exactly once. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. * @internal */ -#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) +#define U8_COUNT_TRAIL_BYTES(leadByte) \ + ((leadByte)<0xf0 ? \ + ((leadByte)>=0xc0)+((leadByte)>=0xe0) : \ + (leadByte)<0xfe ? 3+((leadByte)>=0xf8)+((leadByte)>=0xfc) : 0) + +/** + * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. + * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. + * leadByte might be evaluated multiple times. + * + * This is internal since it is not meant to be called directly by external clients; + * however it is called by public macros in this file and thus must remain stable. + * + * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. + * @internal + */ +#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ + (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0)) /** * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. @@ -206,24 +232,60 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * The offset may point to either the lead byte or one of the trail bytes * for a code point, in which case the macro will read all of the bytes * for the code point. + * + * The length can be negative for a NUL-terminated string. + * * If the offset points to an illegal UTF-8 byte sequence, then * c is set to a negative value. * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. * * @param s const uint8_t * string - * @param start starting string offset - * @param i string offset, must be start<=i<length - * @param length string length + * @param start int32_t starting string offset + * @param i int32_t string offset, must be start<=i<length + * @param length int32_t string length * @param c output UChar32 variable, set to <0 in case of an error * @see U8_GET_UNSAFE * @stable ICU 2.4 */ #define U8_GET(s, start, i, length, c) { \ - int32_t _u8_get_index=(int32_t)(i); \ + int32_t _u8_get_index=(i); \ U8_SET_CP_START(s, start, _u8_get_index); \ U8_NEXT(s, _u8_get_index, length, c); \ } +#ifndef U_HIDE_DRAFT_API +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to an illegal UTF-8 byte sequence, then + * c is set to U+FFFD. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. + * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_GET() if that distinction is important. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset + * @param i int32_t string offset, must be start<=i<length + * @param length int32_t string length + * @param c output UChar32 variable, set to U+FFFD in case of an error + * @see U8_GET + * @draft ICU 51 + */ +#define U8_GET_OR_FFFD(s, start, i, length, c) { \ + int32_t _u8_get_index=(i); \ + U8_SET_CP_START(s, start, _u8_get_index); \ + U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ +} +#endif /* U_HIDE_DRAFT_API */ + /* definitions with forward iteration --------------------------------------- */ /** @@ -245,19 +307,16 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); */ #define U8_NEXT_UNSAFE(s, i, c) { \ (c)=(uint8_t)(s)[(i)++]; \ - if((uint8_t)((c)-0xc0)<0x35) { \ - uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \ - U8_MASK_LEAD_BYTE(c, __count); \ - switch(__count) { \ - /* each following branch falls through to the next one */ \ - case 3: \ - (c)=((c)<<6)|((s)[(i)++]&0x3f); \ - case 2: \ - (c)=((c)<<6)|((s)[(i)++]&0x3f); \ - case 1: \ - (c)=((c)<<6)|((s)[(i)++]&0x3f); \ - /* no other branches to optimize switch() */ \ - break; \ + if((c)>=0x80) { \ + if((c)<0xe0) { \ + (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ + } else if((c)<0xf0) { \ + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ + (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ + (i)+=2; \ + } else { \ + (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ + (i)+=3; \ } \ } \ } @@ -268,14 +327,16 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * (Post-incrementing forward iteration.) * "Safe" macro, checks for illegal sequences and for string boundaries. * + * The length can be negative for a NUL-terminated string. + * * The offset may point to the lead byte of a multi-byte sequence, * in which case the macro will read the whole sequence. * If the offset points to a trail byte or an illegal UTF-8 sequence, then * c is set to a negative value. * * @param s const uint8_t * string - * @param i string offset, must be i<length - * @param length string length + * @param i int32_t string offset, must be i<length + * @param length int32_t string length * @param c output UChar32 variable, set to <0 in case of an error * @see U8_NEXT_UNSAFE * @stable ICU 2.4 @@ -286,7 +347,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); uint8_t __t1, __t2; \ if( /* handle U+1000..U+CFFF inline */ \ (0xe0<(c) && (c)<=0xec) && \ - (((i)+1)<(length)) && \ + (((i)+1)<(length) || (length)<0) && \ (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ ) { \ @@ -295,19 +356,70 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); (i)+=2; \ } else if( /* handle U+0080..U+07FF inline */ \ ((c)<0xe0 && (c)>=0xc2) && \ - ((i)<(length)) && \ + ((i)!=(length)) && \ (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ ) { \ - (c)=(UChar)((((c)&0x1f)<<6)|__t1); \ + (c)=(((c)&0x1f)<<6)|__t1; \ ++(i); \ - } else if(U8_IS_LEAD(c)) { \ + } else { \ /* function call for "complicated" and error cases */ \ - (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -1); \ + (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \ + } \ + } \ +} + +#ifndef U_HIDE_DRAFT_API +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead byte of a multi-byte sequence, + * in which case the macro will read the whole sequence. + * If the offset points to a trail byte or an illegal UTF-8 sequence, then + * c is set to U+FFFD. + * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_NEXT() if that distinction is important. + * + * @param s const uint8_t * string + * @param i int32_t string offset, must be i<length + * @param length int32_t string length + * @param c output UChar32 variable, set to U+FFFD in case of an error + * @see U8_NEXT + * @draft ICU 51 + */ +#define U8_NEXT_OR_FFFD(s, i, length, c) { \ + (c)=(uint8_t)(s)[(i)++]; \ + if((c)>=0x80) { \ + uint8_t __t1, __t2; \ + if( /* handle U+1000..U+CFFF inline */ \ + (0xe0<(c) && (c)<=0xec) && \ + (((i)+1)<(length) || (length)<0) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ + (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ + ) { \ + /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ + (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ + (i)+=2; \ + } else if( /* handle U+0080..U+07FF inline */ \ + ((c)<0xe0 && (c)>=0xc2) && \ + ((i)!=(length)) && \ + (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ + ) { \ + (c)=(((c)&0x1f)<<6)|__t1; \ + ++(i); \ } else { \ - (c)=U_SENTINEL; \ + /* function call for "complicated" and error cases */ \ + (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \ } \ } \ } +#endif /* U_HIDE_DRAFT_API */ /** * Append a code point to a string, overwriting 1 to 4 bytes. @@ -351,9 +463,9 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * then isError is set to TRUE. * * @param s const uint8_t * string buffer - * @param i string offset, must be i<capacity - * @param capacity size of the string buffer - * @param c code point to append + * @param i int32_t string offset, must be i<capacity + * @param capacity int32_t size of the string buffer + * @param c UChar32 code point to append * @param isError output UBool set to TRUE if an error occurs, otherwise not modified * @see U8_APPEND_UNSAFE * @stable ICU 2.4 @@ -369,7 +481,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ } else { \ - (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(capacity), c, &(isError)); \ + (i)=utf8_appendCharSafeBody(s, (i), (capacity), c, &(isError)); \ } \ } @@ -384,7 +496,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * @stable ICU 2.4 */ #define U8_FWD_1_UNSAFE(s, i) { \ - (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \ + (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \ } /** @@ -392,9 +504,11 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * (Post-incrementing iteration.) * "Safe" macro, checks for illegal sequences and for string boundaries. * + * The length can be negative for a NUL-terminated string. + * * @param s const uint8_t * string - * @param i string offset, must be i<length - * @param length string length + * @param i int32_t string offset, must be i<length + * @param length int32_t string length * @see U8_FWD_1_UNSAFE * @stable ICU 2.4 */ @@ -402,7 +516,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); uint8_t __b=(uint8_t)(s)[(i)++]; \ if(U8_IS_LEAD(__b)) { \ uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ - if((i)+__count>(length)) { \ + if((i)+__count>(length) && (length)>=0) { \ __count=(uint8_t)((length)-(i)); \ } \ while(__count>0 && U8_IS_TRAIL((s)[i])) { \ @@ -438,16 +552,18 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * (Post-incrementing iteration.) * "Safe" macro, checks for illegal sequences and for string boundaries. * + * The length can be negative for a NUL-terminated string. + * * @param s const uint8_t * string - * @param i string offset, must be i<length - * @param length string length + * @param i int32_t string offset, must be i<length + * @param length int32_t string length * @param n number of code points to skip * @see U8_FWD_N_UNSAFE * @stable ICU 2.4 */ #define U8_FWD_N(s, i, length, n) { \ int32_t __N=(n); \ - while(__N>0 && (i)<(length)) { \ + while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ U8_FWD_1(s, i, length); \ --__N; \ } \ @@ -479,14 +595,14 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * "Safe" macro, checks for illegal sequences and for string boundaries. * * @param s const uint8_t * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<=i + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<=i * @see U8_SET_CP_START_UNSAFE * @stable ICU 2.4 */ #define U8_SET_CP_START(s, start, i) { \ if(U8_IS_TRAIL((s)[(i)])) { \ - (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ + (i)=utf8_back1SafeBody(s, start, (i)); \ } \ } @@ -547,8 +663,8 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. * * @param s const uint8_t * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<i + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<i * @param c output UChar32 variable, set to <0 in case of an error * @see U8_PREV_UNSAFE * @stable ICU 2.4 @@ -556,13 +672,42 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); #define U8_PREV(s, start, i, c) { \ (c)=(uint8_t)(s)[--(i)]; \ if((c)>=0x80) { \ - if((c)<=0xbf) { \ - (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ - } else { \ - (c)=U_SENTINEL; \ - } \ + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ + } \ +} + +#ifndef U_HIDE_DRAFT_API +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. + * + * This macro does not distinguish between a real U+FFFD in the text + * and U+FFFD returned for an ill-formed sequence. + * Use U8_PREV() if that distinction is important. + * + * @param s const uint8_t * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<i + * @param c output UChar32 variable, set to U+FFFD in case of an error + * @see U8_PREV + * @draft ICU 51 + */ +#define U8_PREV_OR_FFFD(s, start, i, c) { \ + (c)=(uint8_t)(s)[--(i)]; \ + if((c)>=0x80) { \ + (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ } \ } +#endif /* U_HIDE_DRAFT_API */ /** * Move the string offset from one code point boundary to the previous one. @@ -586,14 +731,14 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * "Safe" macro, checks for illegal sequences and for string boundaries. * * @param s const uint8_t * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<i + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<i * @see U8_BACK_1_UNSAFE * @stable ICU 2.4 */ #define U8_BACK_1(s, start, i) { \ if(U8_IS_TRAIL((s)[--(i)])) { \ - (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ + (i)=utf8_back1SafeBody(s, start, (i)); \ } \ } @@ -626,8 +771,8 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * "Safe" macro, checks for illegal sequences and for string boundaries. * * @param s const uint8_t * string - * @param start index of the start of the string - * @param i string offset, must be start<i + * @param start int32_t index of the start of the string + * @param i int32_t string offset, must be start<i * @param n number of code points to skip * @see U8_BACK_N_UNSAFE * @stable ICU 2.4 @@ -666,15 +811,17 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); * The input offset may be the same as the string length. * "Safe" macro, checks for illegal sequences and for string boundaries. * + * The length can be negative for a NUL-terminated string. + * * @param s const uint8_t * string - * @param start starting string offset (usually 0) - * @param i string offset, must be start<=i<=length - * @param length string length + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, must be start<=i<=length + * @param length int32_t string length * @see U8_SET_CP_LIMIT_UNSAFE * @stable ICU 2.4 */ #define U8_SET_CP_LIMIT(s, start, i, length) { \ - if((start)<(i) && (i)<(length)) { \ + if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ U8_BACK_1(s, start, i); \ U8_FWD_1(s, i, length); \ } \ |