diff options
Diffstat (limited to 'deps/v8/src/scanner-character-streams.cc')
-rw-r--r-- | deps/v8/src/scanner-character-streams.cc | 23 |
1 files changed, 20 insertions, 3 deletions
diff --git a/deps/v8/src/scanner-character-streams.cc b/deps/v8/src/scanner-character-streams.cc index d06f479f94..50c3955c1b 100644 --- a/deps/v8/src/scanner-character-streams.cc +++ b/deps/v8/src/scanner-character-streams.cc @@ -18,6 +18,10 @@ namespace { unsigned CopyCharsHelper(uint16_t* dest, unsigned length, const uint8_t* src, unsigned* src_pos, unsigned src_length, ScriptCompiler::StreamedSource::Encoding encoding) { + // It's possible that this will be called with length 0, but don't assume that + // the functions this calls handle it gracefully. + if (length == 0) return 0; + if (encoding == ScriptCompiler::StreamedSource::UTF8) { return v8::internal::Utf8ToUtf16CharacterStream::CopyChars( dest, length, src, src_pos, src_length); @@ -381,15 +385,22 @@ unsigned ExternalStreamingStream::FillBuffer(unsigned position) { void ExternalStreamingStream::HandleUtf8SplitCharacters( unsigned* data_in_buffer) { + // Note the following property of UTF-8 which makes this function possible: + // Given any byte, we can always read its local environment (in both + // directions) to find out the (possibly multi-byte) character it belongs + // to. Single byte characters are of the form 0b0XXXXXXX. The first byte of a + // multi-byte character is of the form 0b110XXXXX, 0b1110XXXX or + // 0b11110XXX. The continuation bytes are of the form 0b10XXXXXX. + // First check if we have leftover data from the last chunk. unibrow::uchar c; if (utf8_split_char_buffer_length_ > 0) { // Move the bytes which are part of the split character (which started in - // the previous chunk) into utf8_split_char_buffer_. + // the previous chunk) into utf8_split_char_buffer_. Note that the + // continuation bytes are of the form 0b10XXXXXX, thus c >> 6 == 2. while (current_data_offset_ < current_data_length_ && utf8_split_char_buffer_length_ < 4 && - (c = current_data_[current_data_offset_]) > - unibrow::Utf8::kMaxOneByteChar) { + (c = current_data_[current_data_offset_]) >> 6 == 2) { utf8_split_char_buffer_[utf8_split_char_buffer_length_] = c; ++utf8_split_char_buffer_length_; ++current_data_offset_; @@ -420,6 +431,12 @@ void ExternalStreamingStream::HandleUtf8SplitCharacters( utf8_split_char_buffer_length_ < 4) { --current_data_length_; ++utf8_split_char_buffer_length_; + if (c >= (3 << 6)) { + // 3 << 6 = 0b11000000; this is the first byte of the multi-byte + // character. No need to copy the previous characters into the conversion + // buffer (even if they're multi-byte). + break; + } } CHECK(utf8_split_char_buffer_length_ <= 4); for (unsigned i = 0; i < utf8_split_char_buffer_length_; ++i) { |