diff options
Diffstat (limited to 'chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc')
-rw-r--r-- | chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc | 502 |
1 files changed, 502 insertions, 0 deletions
diff --git a/chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc b/chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc new file mode 100644 index 00000000000..3a5a0d71b5d --- /dev/null +++ b/chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "third_party/blink/renderer/platform/wtf/text/text_codec_utf8.h" + +#include <memory> +#include "base/memory/ptr_util.h" +#include "third_party/blink/renderer/platform/wtf/text/character_names.h" +#include "third_party/blink/renderer/platform/wtf/text/cstring.h" +#include "third_party/blink/renderer/platform/wtf/text/string_buffer.h" +#include "third_party/blink/renderer/platform/wtf/text/text_codec_ascii_fast_path.h" + +namespace WTF { + +// We'll use nonCharacter* constants to signal invalid utf-8. +// The number in the name signals how many input bytes were invalid. +const int kNonCharacter1 = -1; +const int kNonCharacter2 = -2; +const int kNonCharacter3 = -3; + +bool IsNonCharacter(int character) { + return character >= kNonCharacter3 && character <= kNonCharacter1; +} + +std::unique_ptr<TextCodec> TextCodecUTF8::Create(const TextEncoding&, + const void*) { + return base::WrapUnique(new TextCodecUTF8()); +} + +void TextCodecUTF8::RegisterEncodingNames(EncodingNameRegistrar registrar) { + registrar("UTF-8", "UTF-8"); + + // Additional aliases that originally were present in the encoding + // table in WebKit on Macintosh, and subsequently added by + // TextCodecICU. Perhaps we can prove some are not used on the web + // and remove them. + registrar("unicode11utf8", "UTF-8"); + registrar("unicode20utf8", "UTF-8"); + registrar("utf8", "UTF-8"); + registrar("x-unicode20utf8", "UTF-8"); + + // Additional aliases present in the WHATWG Encoding Standard + // (http://encoding.spec.whatwg.org/) + // and Firefox (24), but not in ICU 4.6. + registrar("unicode-1-1-utf-8", "UTF-8"); +} + +void TextCodecUTF8::RegisterCodecs(TextCodecRegistrar registrar) { + registrar("UTF-8", Create, nullptr); +} + +static inline int NonASCIISequenceLength(uint8_t first_byte) { + static const uint8_t kLengths[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + return kLengths[first_byte]; +} + +static inline int DecodeNonASCIISequence(const uint8_t* sequence, + unsigned length) { + DCHECK(!IsASCII(sequence[0])); + if (length == 2) { + DCHECK_LE(sequence[0], 0xDF); + if (sequence[0] < 0xC2) + return kNonCharacter1; + if (sequence[1] < 0x80 || sequence[1] > 0xBF) + return kNonCharacter1; + return ((sequence[0] << 6) + sequence[1]) - 0x00003080; + } + if (length == 3) { + DCHECK_GE(sequence[0], 0xE0); + DCHECK_LE(sequence[0], 0xEF); + switch (sequence[0]) { + case 0xE0: + if (sequence[1] < 0xA0 || sequence[1] > 0xBF) + return kNonCharacter1; + break; + case 0xED: + if (sequence[1] < 0x80 || sequence[1] > 0x9F) + return kNonCharacter1; + break; + default: + if (sequence[1] < 0x80 || sequence[1] > 0xBF) + return kNonCharacter1; + } + if (sequence[2] < 0x80 || sequence[2] > 0xBF) + return kNonCharacter2; + return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - + 0x000E2080; + } + DCHECK_EQ(length, 4u); + DCHECK_GE(sequence[0], 0xF0); + DCHECK_LE(sequence[0], 0xF4); + switch (sequence[0]) { + case 0xF0: + if (sequence[1] < 0x90 || sequence[1] > 0xBF) + return kNonCharacter1; + break; + case 0xF4: + if (sequence[1] < 0x80 || sequence[1] > 0x8F) + return kNonCharacter1; + break; + default: + if (sequence[1] < 0x80 || sequence[1] > 0xBF) + return kNonCharacter1; + } + if (sequence[2] < 0x80 || sequence[2] > 0xBF) + return kNonCharacter2; + if (sequence[3] < 0x80 || sequence[3] > 0xBF) + return kNonCharacter3; + return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + + sequence[3]) - + 0x03C82080; +} + +static inline UChar* AppendCharacter(UChar* destination, int character) { + DCHECK(!IsNonCharacter(character)); + DCHECK(!U_IS_SURROGATE(character)); + if (U_IS_BMP(character)) { + *destination++ = static_cast<UChar>(character); + } else { + *destination++ = U16_LEAD(character); + *destination++ = U16_TRAIL(character); + } + return destination; +} + +void TextCodecUTF8::ConsumePartialSequenceBytes(int num_bytes) { + DCHECK_GE(partial_sequence_size_, num_bytes); + partial_sequence_size_ -= num_bytes; + memmove(partial_sequence_, partial_sequence_ + num_bytes, + partial_sequence_size_); +} + +void TextCodecUTF8::HandleError(int character, + UChar*& destination, + bool stop_on_error, + bool& saw_error) { + saw_error = true; + if (stop_on_error) + return; + // Each error generates a replacement character and consumes 1-3 bytes. + *destination++ = kReplacementCharacter; + DCHECK(IsNonCharacter(character)); + int num_bytes_consumed = -character; + DCHECK_GE(num_bytes_consumed, 1); + DCHECK_LE(num_bytes_consumed, 3); + ConsumePartialSequenceBytes(num_bytes_consumed); +} + +template <> +bool TextCodecUTF8::HandlePartialSequence<LChar>(LChar*& destination, + const uint8_t*& source, + const uint8_t* end, + bool flush, + bool, + bool&) { + DCHECK(partial_sequence_size_); + do { + if (IsASCII(partial_sequence_[0])) { + *destination++ = partial_sequence_[0]; + ConsumePartialSequenceBytes(1); + continue; + } + int count = NonASCIISequenceLength(partial_sequence_[0]); + if (!count) + return true; + + if (count > partial_sequence_size_) { + if (count - partial_sequence_size_ > end - source) { + if (!flush) { + // The new data is not enough to complete the sequence, so + // add it to the existing partial sequence. + memcpy(partial_sequence_ + partial_sequence_size_, source, + end - source); + partial_sequence_size_ += end - source; + return false; + } + // An incomplete partial sequence at the end is an error, but it will + // create a 16 bit string due to the replacementCharacter. Let the 16 + // bit path handle the error. + return true; + } + memcpy(partial_sequence_ + partial_sequence_size_, source, + count - partial_sequence_size_); + source += count - partial_sequence_size_; + partial_sequence_size_ = count; + } + int character = DecodeNonASCIISequence(partial_sequence_, count); + if (character & ~0xff) + return true; + + partial_sequence_size_ -= count; + *destination++ = static_cast<LChar>(character); + } while (partial_sequence_size_); + + return false; +} + +template <> +bool TextCodecUTF8::HandlePartialSequence<UChar>(UChar*& destination, + const uint8_t*& source, + const uint8_t* end, + bool flush, + bool stop_on_error, + bool& saw_error) { + DCHECK(partial_sequence_size_); + do { + if (IsASCII(partial_sequence_[0])) { + *destination++ = partial_sequence_[0]; + ConsumePartialSequenceBytes(1); + continue; + } + int count = NonASCIISequenceLength(partial_sequence_[0]); + if (!count) { + HandleError(kNonCharacter1, destination, stop_on_error, saw_error); + if (stop_on_error) + return false; + continue; + } + if (count > partial_sequence_size_) { + if (count - partial_sequence_size_ > end - source) { + if (!flush) { + // The new data is not enough to complete the sequence, so + // add it to the existing partial sequence. + memcpy(partial_sequence_ + partial_sequence_size_, source, + end - source); + partial_sequence_size_ += end - source; + return false; + } + // An incomplete partial sequence at the end is an error. + HandleError(kNonCharacter1, destination, stop_on_error, saw_error); + if (stop_on_error) + return false; + continue; + } + memcpy(partial_sequence_ + partial_sequence_size_, source, + count - partial_sequence_size_); + source += count - partial_sequence_size_; + partial_sequence_size_ = count; + } + int character = DecodeNonASCIISequence(partial_sequence_, count); + if (IsNonCharacter(character)) { + HandleError(character, destination, stop_on_error, saw_error); + if (stop_on_error) + return false; + continue; + } + + partial_sequence_size_ -= count; + destination = AppendCharacter(destination, character); + } while (partial_sequence_size_); + + return false; +} + +String TextCodecUTF8::Decode(const char* bytes, + size_t length, + FlushBehavior flush, + bool stop_on_error, + bool& saw_error) { + // Each input byte might turn into a character. + // That includes all bytes in the partial-sequence buffer because + // each byte in an invalid sequence will turn into a replacement character. + StringBuffer<LChar> buffer(partial_sequence_size_ + length); + + const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); + const uint8_t* end = source + length; + const uint8_t* aligned_end = AlignToMachineWord(end); + LChar* destination = buffer.Characters(); + + do { + if (partial_sequence_size_) { + // Explicitly copy destination and source pointers to avoid taking + // pointers to the local variables, which may harm code generation by + // disabling some optimizations in some compilers. + LChar* destination_for_handle_partial_sequence = destination; + const uint8_t* source_for_handle_partial_sequence = source; + if (HandlePartialSequence(destination_for_handle_partial_sequence, + source_for_handle_partial_sequence, end, flush, + stop_on_error, saw_error)) { + source = source_for_handle_partial_sequence; + goto upConvertTo16Bit; + } + destination = destination_for_handle_partial_sequence; + source = source_for_handle_partial_sequence; + if (partial_sequence_size_) + break; + } + + while (source < end) { + if (IsASCII(*source)) { + // Fast path for ASCII. Most UTF-8 text will be ASCII. + if (IsAlignedToMachineWord(source)) { + while (source < aligned_end) { + MachineWord chunk = + *reinterpret_cast_ptr<const MachineWord*>(source); + if (!IsAllASCII<LChar>(chunk)) + break; + CopyASCIIMachineWord(destination, source); + source += sizeof(MachineWord); + destination += sizeof(MachineWord); + } + if (source == end) + break; + if (!IsASCII(*source)) + continue; + } + *destination++ = *source++; + continue; + } + int count = NonASCIISequenceLength(*source); + int character; + if (count == 0) { + character = kNonCharacter1; + } else { + if (count > end - source) { + SECURITY_DCHECK(end - source < + static_cast<ptrdiff_t>(sizeof(partial_sequence_))); + DCHECK(!partial_sequence_size_); + partial_sequence_size_ = end - source; + memcpy(partial_sequence_, source, partial_sequence_size_); + source = end; + break; + } + character = DecodeNonASCIISequence(source, count); + } + if (IsNonCharacter(character)) { + saw_error = true; + if (stop_on_error) + break; + + goto upConvertTo16Bit; + } + if (character > 0xff) + goto upConvertTo16Bit; + + source += count; + *destination++ = static_cast<LChar>(character); + } + } while (flush && partial_sequence_size_); + + buffer.Shrink(destination - buffer.Characters()); + + return String::Adopt(buffer); + +upConvertTo16Bit: + StringBuffer<UChar> buffer16(partial_sequence_size_ + length); + + UChar* destination16 = buffer16.Characters(); + + // Copy the already converted characters + for (LChar* converted8 = buffer.Characters(); converted8 < destination;) + *destination16++ = *converted8++; + + do { + if (partial_sequence_size_) { + // Explicitly copy destination and source pointers to avoid taking + // pointers to the local variables, which may harm code generation by + // disabling some optimizations in some compilers. + UChar* destination_for_handle_partial_sequence = destination16; + const uint8_t* source_for_handle_partial_sequence = source; + HandlePartialSequence(destination_for_handle_partial_sequence, + source_for_handle_partial_sequence, end, flush, + stop_on_error, saw_error); + destination16 = destination_for_handle_partial_sequence; + source = source_for_handle_partial_sequence; + if (partial_sequence_size_) + break; + } + + while (source < end) { + if (IsASCII(*source)) { + // Fast path for ASCII. Most UTF-8 text will be ASCII. + if (IsAlignedToMachineWord(source)) { + while (source < aligned_end) { + MachineWord chunk = + *reinterpret_cast_ptr<const MachineWord*>(source); + if (!IsAllASCII<LChar>(chunk)) + break; + CopyASCIIMachineWord(destination16, source); + source += sizeof(MachineWord); + destination16 += sizeof(MachineWord); + } + if (source == end) + break; + if (!IsASCII(*source)) + continue; + } + *destination16++ = *source++; + continue; + } + int count = NonASCIISequenceLength(*source); + int character; + if (count == 0) { + character = kNonCharacter1; + } else { + if (count > end - source) { + SECURITY_DCHECK(end - source < + static_cast<ptrdiff_t>(sizeof(partial_sequence_))); + DCHECK(!partial_sequence_size_); + partial_sequence_size_ = end - source; + memcpy(partial_sequence_, source, partial_sequence_size_); + source = end; + break; + } + character = DecodeNonASCIISequence(source, count); + } + if (IsNonCharacter(character)) { + saw_error = true; + if (stop_on_error) + break; + // Each error generates one replacement character and consumes the + // 'largest subpart' of the incomplete character. + // Note that the nonCharacterX constants go from -1..-3 and contain + // the negative of number of bytes comprising the broken encoding + // detected. So subtracting c (when isNonCharacter(c)) adds the number + // of broken bytes. + *destination16++ = kReplacementCharacter; + source -= character; + continue; + } + source += count; + destination16 = AppendCharacter(destination16, character); + } + } while (flush && partial_sequence_size_); + + buffer16.Shrink(destination16 - buffer16.Characters()); + + return String::Adopt(buffer16); +} + +template <typename CharType> +CString TextCodecUTF8::EncodeCommon(const CharType* characters, size_t length) { + // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. + // BMP characters take only one UTF-16 code unit and can take up to 3 bytes + // (3x). + // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes + // (2x). + CHECK_LE(length, std::numeric_limits<size_t>::max() / 3); + Vector<uint8_t> bytes(length * 3); + + size_t i = 0; + size_t bytes_written = 0; + while (i < length) { + UChar32 character; + U16_NEXT(characters, i, length, character); + // U16_NEXT will simply emit a surrogate code point if an unmatched + // surrogate is encountered; we must convert it to a + // U+FFFD (REPLACEMENT CHARACTER) here. + if (0xD800 <= character && character <= 0xDFFF) + character = kReplacementCharacter; + U8_APPEND_UNSAFE(bytes.data(), bytes_written, character); + } + + return CString(reinterpret_cast<char*>(bytes.data()), bytes_written); +} + +CString TextCodecUTF8::Encode(const UChar* characters, + size_t length, + UnencodableHandling) { + return EncodeCommon(characters, length); +} + +CString TextCodecUTF8::Encode(const LChar* characters, + size_t length, + UnencodableHandling) { + return EncodeCommon(characters, length); +} + +} // namespace WTF |