summaryrefslogtreecommitdiff
path: root/chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc')
-rw-r--r--chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc502
1 files changed, 502 insertions, 0 deletions
diff --git a/chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc b/chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc
new file mode 100644
index 00000000000..3a5a0d71b5d
--- /dev/null
+++ b/chromium/third_party/blink/renderer/platform/wtf/text/text_codec_utf8.cc
@@ -0,0 +1,502 @@
+/*
+ * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "third_party/blink/renderer/platform/wtf/text/text_codec_utf8.h"
+
+#include <memory>
+#include "base/memory/ptr_util.h"
+#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
+#include "third_party/blink/renderer/platform/wtf/text/cstring.h"
+#include "third_party/blink/renderer/platform/wtf/text/string_buffer.h"
+#include "third_party/blink/renderer/platform/wtf/text/text_codec_ascii_fast_path.h"
+
+namespace WTF {
+
+// Negative sentinel values used in place of a decoded code point to signal
+// invalid UTF-8. The magnitude of each constant is the number of input bytes
+// that were found to be invalid.
+const int kNonCharacter1 = -1;
+const int kNonCharacter2 = -2;
+const int kNonCharacter3 = -3;
+
+// Returns true if |character| is one of the kNonCharacter* error sentinels
+// rather than a decoded code point.
+//
+// Declared static inline, like the other file-local helpers in this file, so
+// that this generically named function has internal linkage and cannot
+// collide with a definition in another translation unit.
+static inline bool IsNonCharacter(int character) {
+  return character >= kNonCharacter3 && character <= kNonCharacter1;
+}
+
+// Factory for the UTF-8 codec. Both arguments are ignored.
+std::unique_ptr<TextCodec> TextCodecUTF8::Create(const TextEncoding&,
+                                                 const void*) {
+  std::unique_ptr<TextCodec> codec = base::WrapUnique(new TextCodecUTF8());
+  return codec;
+}
+
+// Registers the canonical "UTF-8" name followed by every alias that maps to
+// it, by calling |registrar| once per name.
+void TextCodecUTF8::RegisterEncodingNames(EncodingNameRegistrar registrar) {
+  static const char* const kNames[] = {
+      // The canonical name is registered first, mapped to itself.
+      "UTF-8",
+
+      // Additional aliases that originally were present in the encoding
+      // table in WebKit on Macintosh, and subsequently added by
+      // TextCodecICU. Perhaps we can prove some are not used on the web
+      // and remove them.
+      "unicode11utf8",
+      "unicode20utf8",
+      "utf8",
+      "x-unicode20utf8",
+
+      // Additional aliases present in the WHATWG Encoding Standard
+      // (http://encoding.spec.whatwg.org/)
+      // and Firefox (24), but not in ICU 4.6.
+      "unicode-1-1-utf-8",
+  };
+
+  for (const char* name : kNames)
+    registrar(name, "UTF-8");
+}
+
+// Registers Create() as the factory for the "UTF-8" encoding name. nullptr is
+// passed as the registrar's third argument (presumably the opaque data pointer
+// later handed to the factory, which Create() ignores anyway).
+void TextCodecUTF8::RegisterCodecs(TextCodecRegistrar registrar) {
+ registrar("UTF-8", Create, nullptr);
+}
+
+// Maps the first byte of a UTF-8 sequence to the number of bytes the sequence
+// is expected to occupy. ASCII bytes (0x00-0x7F) and the bytes 0xF5-0xFF map
+// to 0. The whole range 0x80-0xDF maps to 2; that range includes continuation
+// bytes and 0xC0/0xC1, which DecodeNonASCIISequence() rejects afterwards.
+static inline int NonASCIISequenceLength(uint8_t first_byte) {
+  // One row per high nibble of |first_byte|.
+  static const uint8_t kLengthByFirstByte[256] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00-0x0F
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10-0x1F
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20-0x2F
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30-0x3F
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40-0x4F
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50-0x5F
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60-0x6F
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70-0x7F
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x80-0x8F
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0x90-0x9F
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xA0-0xAF
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xB0-0xBF
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xC0-0xCF
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // 0xD0-0xDF
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // 0xE0-0xEF
+      4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0-0xFF
+  };
+  const int sequence_length = kLengthByFirstByte[first_byte];
+  return sequence_length;
+}
+
+// Decodes the multi-byte UTF-8 sequence that starts at |sequence|. |length|
+// is the expected sequence length (2, 3 or 4) for sequence[0].
+//
+// Returns the decoded code point, or kNonCharacterN when the sequence is
+// invalid, where N is the index of the first byte that failed validation
+// (a bad lead byte or bad second byte both yield kNonCharacter1).
+static inline int DecodeNonASCIISequence(const uint8_t* sequence,
+                                         unsigned length) {
+  DCHECK(!IsASCII(sequence[0]));
+  const uint8_t lead = sequence[0];
+
+  if (length == 2) {
+    DCHECK_LE(lead, 0xDF);
+    // Lead bytes below 0xC2 are continuation bytes or would start an overlong
+    // encoding. The second byte is only inspected for an acceptable lead.
+    if (lead < 0xC2 || sequence[1] < 0x80 || sequence[1] > 0xBF)
+      return kNonCharacter1;
+    // The constant removes the 0xC0 and 0x80 marker bits in one step.
+    return ((lead << 6) + sequence[1]) - 0x00003080;
+  }
+
+  if (length == 3) {
+    DCHECK_GE(lead, 0xE0);
+    DCHECK_LE(lead, 0xEF);
+    // The permitted range of the second byte depends on the lead byte:
+    // 0xE0 must be followed by 0xA0-0xBF (anything lower is overlong) and
+    // 0xED by 0x80-0x9F (anything higher would decode to a surrogate).
+    uint8_t second_min = 0x80;
+    uint8_t second_max = 0xBF;
+    if (lead == 0xE0)
+      second_min = 0xA0;
+    else if (lead == 0xED)
+      second_max = 0x9F;
+    if (sequence[1] < second_min || sequence[1] > second_max)
+      return kNonCharacter1;
+    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+      return kNonCharacter2;
+    return ((lead << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
+  }
+
+  DCHECK_EQ(length, 4u);
+  DCHECK_GE(lead, 0xF0);
+  DCHECK_LE(lead, 0xF4);
+  // 0xF0 must be followed by 0x90-0xBF (anything lower is overlong) and 0xF4
+  // by 0x80-0x8F (anything higher would decode to a value above 0x10FFFF).
+  uint8_t second_min = 0x80;
+  uint8_t second_max = 0xBF;
+  if (lead == 0xF0)
+    second_min = 0x90;
+  else if (lead == 0xF4)
+    second_max = 0x8F;
+  if (sequence[1] < second_min || sequence[1] > second_max)
+    return kNonCharacter1;
+  if (sequence[2] < 0x80 || sequence[2] > 0xBF)
+    return kNonCharacter2;
+  if (sequence[3] < 0x80 || sequence[3] > 0xBF)
+    return kNonCharacter3;
+  return ((lead << 18) + (sequence[1] << 12) + (sequence[2] << 6) +
+          sequence[3]) -
+         0x03C82080;
+}
+
+// Writes |character| at |destination| as UTF-16: one code unit for a BMP
+// character, otherwise a lead/trail surrogate pair. Returns the position just
+// past the last code unit written.
+static inline UChar* AppendCharacter(UChar* destination, int character) {
+  DCHECK(!IsNonCharacter(character));
+  DCHECK(!U_IS_SURROGATE(character));
+  if (!U_IS_BMP(character)) {
+    destination[0] = U16_LEAD(character);
+    destination[1] = U16_TRAIL(character);
+    return destination + 2;
+  }
+  destination[0] = static_cast<UChar>(character);
+  return destination + 1;
+}
+
+// Drops the first |num_bytes| bytes of |partial_sequence_| and moves the
+// remaining bytes to the front of the buffer.
+void TextCodecUTF8::ConsumePartialSequenceBytes(int num_bytes) {
+  DCHECK_GE(partial_sequence_size_, num_bytes);
+  const auto bytes_left = partial_sequence_size_ - num_bytes;
+  partial_sequence_size_ = bytes_left;
+  // The source and destination ranges may overlap, hence memmove.
+  memmove(partial_sequence_, partial_sequence_ + num_bytes, bytes_left);
+}
+
+// Records a decoding error found in the partial-sequence buffer.
+// |character| is the kNonCharacterN sentinel describing the error.
+// |saw_error| is always set. Unless |stop_on_error| is set, one replacement
+// character is written to |destination| and the invalid bytes are removed
+// from the partial-sequence buffer.
+void TextCodecUTF8::HandleError(int character,
+                                UChar*& destination,
+                                bool stop_on_error,
+                                bool& saw_error) {
+  saw_error = true;
+  if (!stop_on_error) {
+    // Each error generates a replacement character and consumes 1-3 bytes.
+    *destination = kReplacementCharacter;
+    ++destination;
+    DCHECK(IsNonCharacter(character));
+    // The sentinel is the negated number of invalid bytes.
+    const int invalid_byte_count = -character;
+    DCHECK_GE(invalid_byte_count, 1);
+    DCHECK_LE(invalid_byte_count, 3);
+    ConsumePartialSequenceBytes(invalid_byte_count);
+  }
+}
+
+// 8-bit (LChar) flavor of partial-sequence handling. Tries to complete the
+// bytes buffered in |partial_sequence_| with input taken from |source|..|end|
+// and writes the decoded characters to |destination|.
+//
+// Returns true when the caller has to continue in the 16-bit path: the first
+// buffered byte cannot start a sequence, the sequence is still incomplete
+// while flushing, or the decoded value does not fit in 8 bits (which includes
+// the negative error sentinels). Returns false when the buffer was fully
+// consumed, or when all remaining input was appended to it while waiting for
+// more data.
+//
+// The two unnamed parameters (|stop_on_error| and |saw_error| in the UChar
+// specialization) are unused because this path never reports errors itself.
+template <>
+bool TextCodecUTF8::HandlePartialSequence<LChar>(LChar*& destination,
+ const uint8_t*& source,
+ const uint8_t* end,
+ bool flush,
+ bool,
+ bool&) {
+ DCHECK(partial_sequence_size_);
+ do {
+ if (IsASCII(partial_sequence_[0])) {
+ // A buffered ASCII byte is emitted unchanged.
+ *destination++ = partial_sequence_[0];
+ ConsumePartialSequenceBytes(1);
+ continue;
+ }
+ int count = NonASCIISequenceLength(partial_sequence_[0]);
+ // A zero length means this byte cannot start a sequence. That is an error,
+ // and errors are left to the 16-bit path.
+ if (!count)
+ return true;
+
+ if (count > partial_sequence_size_) {
+ // More bytes are needed than are buffered; check whether the new input
+ // can supply them.
+ if (count - partial_sequence_size_ > end - source) {
+ if (!flush) {
+ // The new data is not enough to complete the sequence, so
+ // add it to the existing partial sequence.
+ memcpy(partial_sequence_ + partial_sequence_size_, source,
+ end - source);
+ partial_sequence_size_ += end - source;
+ return false;
+ }
+ // An incomplete partial sequence at the end is an error, but it will
+ // create a 16 bit string due to kReplacementCharacter. Let the 16
+ // bit path handle the error.
+ return true;
+ }
+ // Top up the buffer to a full sequence and advance |source| past the
+ // bytes that were taken.
+ memcpy(partial_sequence_ + partial_sequence_size_, source,
+ count - partial_sequence_size_);
+ source += count - partial_sequence_size_;
+ partial_sequence_size_ = count;
+ }
+ int character = DecodeNonASCIISequence(partial_sequence_, count);
+ // Anything outside 0x00-0xFF, including the negative error sentinels,
+ // cannot be stored in an LChar.
+ if (character & ~0xff)
+ return true;
+
+ partial_sequence_size_ -= count;
+ *destination++ = static_cast<LChar>(character);
+ } while (partial_sequence_size_);
+
+ return false;
+}
+
+// 16-bit (UChar) flavor of partial-sequence handling. Tries to complete the
+// bytes buffered in |partial_sequence_| with input taken from |source|..|end|
+// and writes the decoded UTF-16 code units to |destination|.
+//
+// Invalid input is passed to HandleError(), which sets |saw_error| and,
+// unless |stop_on_error| is set, emits a replacement character and drops the
+// invalid bytes from the buffer. With |stop_on_error| the function returns
+// right after the first error and leaves the buffer untouched.
+//
+// Every path returns false; callers tell whether work remains by checking
+// |partial_sequence_size_|.
+template <>
+bool TextCodecUTF8::HandlePartialSequence<UChar>(UChar*& destination,
+ const uint8_t*& source,
+ const uint8_t* end,
+ bool flush,
+ bool stop_on_error,
+ bool& saw_error) {
+ DCHECK(partial_sequence_size_);
+ do {
+ if (IsASCII(partial_sequence_[0])) {
+ // A buffered ASCII byte is emitted unchanged.
+ *destination++ = partial_sequence_[0];
+ ConsumePartialSequenceBytes(1);
+ continue;
+ }
+ int count = NonASCIISequenceLength(partial_sequence_[0]);
+ if (!count) {
+ // This byte cannot start a sequence: a one-byte error.
+ HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
+ if (stop_on_error)
+ return false;
+ continue;
+ }
+ if (count > partial_sequence_size_) {
+ // More bytes are needed than are buffered; check whether the new input
+ // can supply them.
+ if (count - partial_sequence_size_ > end - source) {
+ if (!flush) {
+ // The new data is not enough to complete the sequence, so
+ // add it to the existing partial sequence.
+ memcpy(partial_sequence_ + partial_sequence_size_, source,
+ end - source);
+ partial_sequence_size_ += end - source;
+ return false;
+ }
+ // An incomplete partial sequence at the end is an error.
+ // Only the first byte is dropped; the loop then re-examines the bytes
+ // that remain in the buffer.
+ HandleError(kNonCharacter1, destination, stop_on_error, saw_error);
+ if (stop_on_error)
+ return false;
+ continue;
+ }
+ // Top up the buffer to a full sequence and advance |source| past the
+ // bytes that were taken.
+ memcpy(partial_sequence_ + partial_sequence_size_, source,
+ count - partial_sequence_size_);
+ source += count - partial_sequence_size_;
+ partial_sequence_size_ = count;
+ }
+ int character = DecodeNonASCIISequence(partial_sequence_, count);
+ if (IsNonCharacter(character)) {
+ // HandleError() drops only the invalid bytes; the rest of the buffer is
+ // decoded again on the next iteration.
+ HandleError(character, destination, stop_on_error, saw_error);
+ if (stop_on_error)
+ return false;
+ continue;
+ }
+
+ partial_sequence_size_ -= count;
+ destination = AppendCharacter(destination, character);
+ } while (partial_sequence_size_);
+
+ return false;
+}
+
+// Decodes |length| bytes of UTF-8 starting at |bytes|, continuing any
+// sequence that an earlier call left in |partial_sequence_|.
+//
+// Decoding starts in an 8-bit (LChar) buffer and stays there as long as every
+// character fits in 8 bits. The first character above 0xFF, or the first
+// error that needs a replacement character, jumps to |upConvertTo16Bit|. There
+// the characters produced so far are widened into a 16-bit buffer and
+// decoding resumes from the current |source| position.
+//
+// A sequence that is cut off by the end of the input is saved in
+// |partial_sequence_|. When |flush| evaluates to true the outer loops run
+// again so that those leftover bytes are processed before returning.
+//
+// |saw_error| is set to true when invalid input is found. With
+// |stop_on_error| decoding stops at that point and the characters decoded so
+// far are returned.
+String TextCodecUTF8::Decode(const char* bytes,
+ size_t length,
+ FlushBehavior flush,
+ bool stop_on_error,
+ bool& saw_error) {
+ // Each input byte might turn into a character.
+ // That includes all bytes in the partial-sequence buffer because
+ // each byte in an invalid sequence will turn into a replacement character.
+ StringBuffer<LChar> buffer(partial_sequence_size_ + length);
+
+ const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
+ const uint8_t* end = source + length;
+ const uint8_t* aligned_end = AlignToMachineWord(end);
+ LChar* destination = buffer.Characters();
+
+ do {
+ if (partial_sequence_size_) {
+ // Explicitly copy destination and source pointers to avoid taking
+ // pointers to the local variables, which may harm code generation by
+ // disabling some optimizations in some compilers.
+ LChar* destination_for_handle_partial_sequence = destination;
+ const uint8_t* source_for_handle_partial_sequence = source;
+ if (HandlePartialSequence(destination_for_handle_partial_sequence,
+ source_for_handle_partial_sequence, end, flush,
+ stop_on_error, saw_error)) {
+ // The 8-bit handler could not finish. Keep the |source| position it
+ // reached and let the 16-bit path deal with the buffered bytes.
+ source = source_for_handle_partial_sequence;
+ goto upConvertTo16Bit;
+ }
+ destination = destination_for_handle_partial_sequence;
+ source = source_for_handle_partial_sequence;
+ // Bytes still buffered here mean the input ran out before the sequence
+ // could be completed.
+ if (partial_sequence_size_)
+ break;
+ }
+
+ while (source < end) {
+ if (IsASCII(*source)) {
+ // Fast path for ASCII. Most UTF-8 text will be ASCII.
+ if (IsAlignedToMachineWord(source)) {
+ while (source < aligned_end) {
+ MachineWord chunk =
+ *reinterpret_cast_ptr<const MachineWord*>(source);
+ if (!IsAllASCII<LChar>(chunk))
+ break;
+ CopyASCIIMachineWord(destination, source);
+ source += sizeof(MachineWord);
+ destination += sizeof(MachineWord);
+ }
+ // Whatever the word loop left over is handled one byte at a time.
+ if (source == end)
+ break;
+ if (!IsASCII(*source))
+ continue;
+ }
+ *destination++ = *source++;
+ continue;
+ }
+ int count = NonASCIISequenceLength(*source);
+ int character;
+ if (count == 0) {
+ // This byte cannot start a sequence.
+ character = kNonCharacter1;
+ } else {
+ if (count > end - source) {
+ // The sequence is cut off by the end of the input; save what is
+ // there in the partial-sequence buffer.
+ SECURITY_DCHECK(end - source <
+ static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
+ DCHECK(!partial_sequence_size_);
+ partial_sequence_size_ = end - source;
+ memcpy(partial_sequence_, source, partial_sequence_size_);
+ source = end;
+ break;
+ }
+ character = DecodeNonASCIISequence(source, count);
+ }
+ if (IsNonCharacter(character)) {
+ saw_error = true;
+ if (stop_on_error)
+ break;
+
+ // |source| is not advanced, so the 16-bit path decodes this sequence
+ // again and emits the replacement character for it.
+ goto upConvertTo16Bit;
+ }
+ // Does not fit in an LChar; |source| still points at this sequence.
+ if (character > 0xff)
+ goto upConvertTo16Bit;
+
+ source += count;
+ *destination++ = static_cast<LChar>(character);
+ }
+ } while (flush && partial_sequence_size_);
+
+ buffer.Shrink(destination - buffer.Characters());
+
+ return String::Adopt(buffer);
+
+upConvertTo16Bit:
+ // Sized from the current partial-sequence size plus the whole input length.
+ StringBuffer<UChar> buffer16(partial_sequence_size_ + length);
+
+ UChar* destination16 = buffer16.Characters();
+
+ // Copy the already converted characters
+ for (LChar* converted8 = buffer.Characters(); converted8 < destination;)
+ *destination16++ = *converted8++;
+
+ do {
+ if (partial_sequence_size_) {
+ // Explicitly copy destination and source pointers to avoid taking
+ // pointers to the local variables, which may harm code generation by
+ // disabling some optimizations in some compilers.
+ UChar* destination_for_handle_partial_sequence = destination16;
+ const uint8_t* source_for_handle_partial_sequence = source;
+ HandlePartialSequence(destination_for_handle_partial_sequence,
+ source_for_handle_partial_sequence, end, flush,
+ stop_on_error, saw_error);
+ destination16 = destination_for_handle_partial_sequence;
+ source = source_for_handle_partial_sequence;
+ // Bytes still buffered here mean the handler stopped early, either
+ // waiting for more input or because of |stop_on_error|.
+ if (partial_sequence_size_)
+ break;
+ }
+
+ while (source < end) {
+ if (IsASCII(*source)) {
+ // Fast path for ASCII. Most UTF-8 text will be ASCII.
+ if (IsAlignedToMachineWord(source)) {
+ while (source < aligned_end) {
+ MachineWord chunk =
+ *reinterpret_cast_ptr<const MachineWord*>(source);
+ if (!IsAllASCII<LChar>(chunk))
+ break;
+ CopyASCIIMachineWord(destination16, source);
+ source += sizeof(MachineWord);
+ destination16 += sizeof(MachineWord);
+ }
+ // Whatever the word loop left over is handled one byte at a time.
+ if (source == end)
+ break;
+ if (!IsASCII(*source))
+ continue;
+ }
+ *destination16++ = *source++;
+ continue;
+ }
+ int count = NonASCIISequenceLength(*source);
+ int character;
+ if (count == 0) {
+ // This byte cannot start a sequence.
+ character = kNonCharacter1;
+ } else {
+ if (count > end - source) {
+ // The sequence is cut off by the end of the input; save what is
+ // there in the partial-sequence buffer.
+ SECURITY_DCHECK(end - source <
+ static_cast<ptrdiff_t>(sizeof(partial_sequence_)));
+ DCHECK(!partial_sequence_size_);
+ partial_sequence_size_ = end - source;
+ memcpy(partial_sequence_, source, partial_sequence_size_);
+ source = end;
+ break;
+ }
+ character = DecodeNonASCIISequence(source, count);
+ }
+ if (IsNonCharacter(character)) {
+ saw_error = true;
+ if (stop_on_error)
+ break;
+ // Each error generates one replacement character and consumes the
+ // 'largest subpart' of the incomplete character.
+ // Note that the kNonCharacterN constants go from -1..-3 and contain
+ // the negative of number of bytes comprising the broken encoding
+ // detected. So subtracting |character| (when
+ // IsNonCharacter(character)) adds the number of broken bytes.
+ *destination16++ = kReplacementCharacter;
+ source -= character;
+ continue;
+ }
+ source += count;
+ destination16 = AppendCharacter(destination16, character);
+ }
+ } while (flush && partial_sequence_size_);
+
+ buffer16.Shrink(destination16 - buffer16.Characters());
+
+ return String::Adopt(buffer16);
+}
+
+// Shared implementation of both Encode() overloads: converts the |length|
+// code units starting at |characters| to UTF-8.
+template <typename CharType>
+CString TextCodecUTF8::EncodeCommon(const CharType* characters, size_t length) {
+  // A UTF-16 code unit expands to at most 3 UTF-8 bytes. A BMP character is
+  // a single code unit and needs up to 3 bytes (3x), whereas a non-BMP
+  // character is two code units and needs up to 4 bytes (2x).
+  CHECK_LE(length, std::numeric_limits<size_t>::max() / 3);
+  Vector<uint8_t> utf8(length * 3);
+
+  size_t utf8_length = 0;
+  for (size_t index = 0; index < length;) {
+    UChar32 code_point;
+    // U16_NEXT advances |index| past the code unit(s) it reads.
+    U16_NEXT(characters, index, length, code_point);
+    // For an unmatched surrogate U16_NEXT simply yields the surrogate code
+    // point; it has to be turned into U+FFFD (REPLACEMENT CHARACTER) here.
+    if (U_IS_SURROGATE(code_point))
+      code_point = kReplacementCharacter;
+    U8_APPEND_UNSAFE(utf8.data(), utf8_length, code_point);
+  }
+
+  return CString(reinterpret_cast<char*>(utf8.data()), utf8_length);
+}
+
+// Encodes |length| UTF-16 code units as UTF-8 by forwarding to EncodeCommon(),
+// which replaces unmatched surrogates with U+FFFD. The UnencodableHandling
+// argument is ignored.
+CString TextCodecUTF8::Encode(const UChar* characters,
+ size_t length,
+ UnencodableHandling) {
+ return EncodeCommon(characters, length);
+}
+
+// Encodes |length| 8-bit characters as UTF-8 by forwarding to EncodeCommon().
+// The UnencodableHandling argument is ignored.
+CString TextCodecUTF8::Encode(const LChar* characters,
+ size_t length,
+ UnencodableHandling) {
+ return EncodeCommon(characters, length);
+}
+
+} // namespace WTF