// ====================================================================== // // The actual conversion methods are covered by the copyright information // below. It is not the actual code provided by Unicode, Inc. but is an // ACE-ified and only slightly modified version. // Chad Elliott 4/28/2005 // // Copyright 2001-2004 Unicode, Inc. // // Limitations on Rights to Redistribute This Code // // Unicode, Inc. hereby grants the right to freely use the information // supplied in this file in the creation of products supporting the // Unicode Standard, and to make copies of this file in any form // for internal or external distribution as long as this notice // remains attached. // // ====================================================================== #include "ace/UTF16_Encoding_Converter.h" #if defined (ACE_USES_WCHAR) #include "ace/OS_NS_stdio.h" #include "ace/OS_Memory.h" #include "ace/Min_Max.h" #if !defined (__ACE_INLINE__) #include "ace/UTF16_Encoding_Converter.inl" #endif /* __ACE_INLINE__ */ ACE_BEGIN_VERSIONED_NAMESPACE_DECL static constexpr ACE_UINT32 halfShift = 10; static constexpr ACE_UINT32 halfBase = 0x00010000; static constexpr ACE_UINT32 halfMask = 0x000003FF; static constexpr ACE_UINT32 UNI_SUR_HIGH_START = 0x0000D800; static constexpr ACE_UINT32 UNI_SUR_HIGH_END = 0x0000DBFF; static constexpr ACE_UINT32 UNI_SUR_LOW_START = 0x0000DC00; static constexpr ACE_UINT32 UNI_SUR_LOW_END = 0x0000DFFF; static constexpr ACE_UINT32 UNI_REPLACEMENT_CHAR = 0x0000FFFD; static constexpr ACE_UINT32 UNI_MAX_BMP = 0x0000FFFF; static constexpr ACE_UINT32 UNI_MAX_UTF16 = 0x0010FFFF; // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed // into the first byte, depending on how many bytes follow. There are // as many entries in this table as there are UTF-8 sequence types. // (I.e., one byte sequence, two byte... etc.). Remember that sequencs // for *legal* UTF-8 will be 4 or fewer bytes total. static const ACE_Byte firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; // Index into the table below with the first byte of a UTF-8 sequence to // get the number of trailing bytes that are supposed to follow it. // Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is // left as-is for anyone who may want to do such conversion, which was // allowed in earlier algorithms. static const ACE_Byte trailingBytesForUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; // Magic values subtracted from a buffer value during UTF8 conversion. // This table contains as many values as there might be trailing bytes // in a UTF-8 sequence. static const ACE_UINT32 offsetsFromUTF8[6] = { 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; ACE_UTF16_Encoding_Converter::ACE_UTF16_Encoding_Converter (bool swap) : swap_ (swap) { } ACE_UTF16_Encoding_Converter::~ACE_UTF16_Encoding_Converter () { } ACE_UTF16_Encoding_Converter::Result ACE_UTF16_Encoding_Converter::to_utf8 (const void* source, size_t source_size, ACE_Byte* target, size_t target_size, bool strict) { static const ACE_UINT32 byteMask = 0xBF; static const ACE_UINT32 byteMark = 0x80; Result result = CONVERSION_OK; ACE_Byte* targetEnd = target + target_size; const ACE_UINT16* sourceStart = static_cast (source); const ACE_UINT16* sourceEnd = sourceStart + (source_size / sizeof (ACE_UINT16)); while (sourceStart < sourceEnd) { ACE_UINT16 nw = *sourceStart++; ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_WORD (nw) : nw); // If we have a surrogate pair, convert to ACE_UINT32 first. if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { // If the 16 bits following the high surrogate are in the // sourceStart buffer... if (sourceStart < sourceEnd) { ACE_UINT32 ch2 = (this->swap_ ? ACE_SWAP_WORD (*sourceStart) : *sourceStart); // If it's a low surrogate, convert to ACE_UINT32. if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase; ++sourceStart; } else if (strict) { // it's an unpaired high surrogate result = SOURCE_ILLEGAL; break; } } else { // We don't have the 16 bits following the high surrogate. result = SOURCE_EXHAUSTED; break; } } else if (strict) { // UTF-16 surrogate values are illegal in UTF-32 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { result = SOURCE_ILLEGAL; break; } } // Figure out how many bytes the result will require unsigned short bytesToWrite = 0; if (ch < 0x80) bytesToWrite = 1; else if (ch < 0x800) bytesToWrite = 2; else if (ch < 0x10000) bytesToWrite = 3; else if (ch < 0x110000) bytesToWrite = 4; else { bytesToWrite = 3; ch = UNI_REPLACEMENT_CHAR; } target += bytesToWrite; if (target > targetEnd) { result = TARGET_EXHAUSTED; break; } // NOTE: Everything falls through for efficiency purposes. switch (bytesToWrite) { case 4: *--target = (ACE_Byte)((ch | byteMark) & byteMask); ch >>= 6; case 3: *--target = (ACE_Byte)((ch | byteMark) & byteMask); ch >>= 6; case 2: *--target = (ACE_Byte)((ch | byteMark) & byteMask); ch >>= 6; case 1: *--target = (ACE_Byte)(ch | firstByteMark[bytesToWrite]); } target += bytesToWrite; } return result; } ACE_UTF16_Encoding_Converter::Result ACE_UTF16_Encoding_Converter::from_utf8 (const ACE_Byte* source, size_t source_size, void* target, size_t target_size, bool strict) { Result result = CONVERSION_OK; const ACE_Byte* sourceEnd = source + source_size; ACE_UINT16* targetStart = static_cast (target); ACE_UINT16* targetEnd = targetStart + target_size; while (source < sourceEnd) { ACE_UINT32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; if (source + extraBytesToRead >= sourceEnd) { result = SOURCE_EXHAUSTED; break; } // Do this check whether lenient or strict if (!this->is_legal_utf8 (source, extraBytesToRead + 1)) { result = SOURCE_ILLEGAL; break; } // The cases all fall through. See "Note A" below. switch (extraBytesToRead) { case 5: // remember, illegal UTF-8 ch += *source++; ch <<= 6; case 4: // remember, illegal UTF-8 ch += *source++; ch <<= 6; case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; case 0: ch += *source++; } ch -= offsetsFromUTF8[extraBytesToRead]; if (targetStart >= targetEnd) { result = TARGET_EXHAUSTED; break; } if (ch <= UNI_MAX_BMP) // Target is a character <= 0xFFFF { // UTF-16 surrogate values are illegal in UTF-32 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { if (strict) { result = SOURCE_ILLEGAL; break; } else { *targetStart++ = UNI_REPLACEMENT_CHAR; } } else { *targetStart++ = (ACE_UINT16)ch; } } else if (ch > UNI_MAX_UTF16) { if (strict) { result = SOURCE_ILLEGAL; break; } else { *targetStart++ = UNI_REPLACEMENT_CHAR; } } else { // targetStart is a character in range 0xFFFF - 0x10FFFF. if (targetStart + 1 >= targetEnd) { result = TARGET_EXHAUSTED; break; } ch -= halfBase; *targetStart++ = (ACE_UINT16)((ch >> halfShift) + UNI_SUR_HIGH_START); *targetStart++ = (ACE_UINT16)((ch & halfMask) + UNI_SUR_LOW_START); } } return result; } ACE_UTF16_Encoding_Converter* ACE_UTF16_Encoding_Converter::encoded (const ACE_Byte* source, size_t source_size) { static const size_t begin = 16; static const size_t converted = begin * 4; ACE_Byte target[converted]; ACE_UTF16_Encoding_Converter* converter = 0; ACE_NEW_RETURN (converter, ACE_UTF16_Encoding_Converter (false), 0); if (converter->to_utf8 (source, ACE_MIN (begin, source_size), target, converted) == CONVERSION_OK) { return converter; } else { delete converter; } return 0; } ACE_UINT32 ACE_UTF16_Encoding_Converter::get_UNI_SUR_HIGH_START () { return UNI_SUR_HIGH_START; } ACE_UINT32 ACE_UTF16_Encoding_Converter::get_UNI_SUR_LOW_END () { return UNI_SUR_LOW_END; } ACE_UINT32 ACE_UTF16_Encoding_Converter::get_UNI_REPLACEMENT_CHAR () { return UNI_REPLACEMENT_CHAR; } const ACE_Byte* ACE_UTF16_Encoding_Converter::get_first_byte_mark () { return firstByteMark; } const ACE_Byte* ACE_UTF16_Encoding_Converter::get_trailing_bytes_for_utf8 () { return trailingBytesForUTF8; } const ACE_UINT32* ACE_UTF16_Encoding_Converter::get_offsets_from_utf8 () { return offsetsFromUTF8; } ACE_END_VERSIONED_NAMESPACE_DECL #endif /* ACE_USES_WCHAR */