diff options
author | Felix Geisendörfer <felix@debuggable.com> | 2014-01-20 09:43:43 +0100 |
---|---|---|
committer | Timothy J Fontaine <tjfontaine@gmail.com> | 2014-06-06 15:04:39 -0700 |
commit | 11d21f5b17f27576cf4245e6ad05cfc66bccf2f8 (patch) | |
tree | ff404ce9bd80272bbd3bb669b0f0f7f647431ee3 | |
parent | 0dc2f4f82d0f3eae2ad5d36447c9d51a46bfc486 (diff) | |
download | node-11d21f5b17f27576cf4245e6ad05cfc66bccf2f8.tar.gz |
deps/v8: Apply REPLACE_INVALID_UTF8 patch
- https://codereview.chromium.org/121173009/
- https://code.google.com/p/v8/source/detail?r=18683
Note: The v8 test case did not apply cleanly, so it is missing from this
patch. I'm assuming this is not a problem as long as the v8 test suite is
not part of the node build / test system. If it is part of it, I'll fix it.
Otherwise the test case will be integrated once v8 is upgraded.
-rw-r--r-- | deps/v8/include/v8.h | 6 | ||||
-rw-r--r-- | deps/v8/src/api.cc | 37 | ||||
-rw-r--r-- | deps/v8/src/unicode-inl.h | 15 | ||||
-rw-r--r-- | deps/v8/src/unicode.h | 16 |
4 files changed, 57 insertions, 17 deletions
diff --git a/deps/v8/include/v8.h b/deps/v8/include/v8.h index 77ffb385a..868a4356a 100644 --- a/deps/v8/include/v8.h +++ b/deps/v8/include/v8.h @@ -1064,7 +1064,11 @@ class String : public Primitive { enum WriteOptions { NO_OPTIONS = 0, HINT_MANY_WRITES_EXPECTED = 1, - NO_NULL_TERMINATION = 2 + NO_NULL_TERMINATION = 2, + // Used by WriteUtf8 to replace orphan surrogate code units with the + // unicode replacement character. Needs to be set to guarantee valid UTF-8 + // output. + REPLACE_INVALID_UTF8 = 8 }; // 16-bit character codes. diff --git a/deps/v8/src/api.cc b/deps/v8/src/api.cc index 0d88047aa..b21b1e17a 100644 --- a/deps/v8/src/api.cc +++ b/deps/v8/src/api.cc @@ -3736,7 +3736,8 @@ static int RecursivelySerializeToUtf8(i::String* string, int end, int recursion_budget, int32_t previous_character, - int32_t* last_character) { + int32_t* last_character, + bool replace_invalid_utf8) { int utf8_bytes = 0; while (true) { if (string->IsAsciiRepresentation()) { @@ -3752,7 +3753,10 @@ static int RecursivelySerializeToUtf8(i::String* string, for (int i = start; i < end; i++) { uint16_t character = data[i]; current += - unibrow::Utf8::Encode(current, character, previous_character); + unibrow::Utf8::Encode(current, + character, + previous_character, + replace_invalid_utf8); previous_character = character; } *last_character = previous_character; @@ -3765,7 +3769,10 @@ static int RecursivelySerializeToUtf8(i::String* string, for (int i = start; i < end; i++) { uint16_t character = data[i]; current += - unibrow::Utf8::Encode(current, character, previous_character); + unibrow::Utf8::Encode(current, + character, + previous_character, + replace_invalid_utf8); previous_character = character; } *last_character = previous_character; @@ -3801,7 +3808,8 @@ static int RecursivelySerializeToUtf8(i::String* string, boundary, recursion_budget - 1, previous_character, - &previous_character); + &previous_character, + replace_invalid_utf8); if (extra_utf8_bytes < 0) return 
extra_utf8_bytes; buffer += extra_utf8_bytes; utf8_bytes += extra_utf8_bytes; @@ -3853,7 +3861,10 @@ int String::WriteUtf8(char* buffer, return len; } - if (capacity == -1 || capacity / 3 >= string_length) { + bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8); + int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize; + + if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) { int32_t previous = unibrow::Utf16::kNoPreviousCharacter; const int kMaxRecursion = 100; int utf8_bytes = @@ -3863,7 +3874,8 @@ int String::WriteUtf8(char* buffer, string_length, kMaxRecursion, previous, - &previous); + &previous, + replace_invalid_utf8); if (utf8_bytes >= 0) { // Success serializing with recursion. if ((options & NO_NULL_TERMINATION) == 0 && @@ -3908,7 +3920,10 @@ int String::WriteUtf8(char* buffer, int previous = unibrow::Utf16::kNoPreviousCharacter; for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) { i::uc32 c = write_input_buffer.GetNext(); - int written = unibrow::Utf8::Encode(buffer + pos, c, previous); + int written = unibrow::Utf8::Encode(buffer + pos, + c, + previous, + replace_invalid_utf8); pos += written; nchars++; previous = c; @@ -3920,14 +3935,16 @@ int String::WriteUtf8(char* buffer, char intermediate[unibrow::Utf8::kMaxEncodedSize]; for (; i < len && pos < capacity; i++) { i::uc32 c = write_input_buffer.GetNext(); - if (unibrow::Utf16::IsTrailSurrogate(c) && - unibrow::Utf16::IsLeadSurrogate(previous)) { + if (unibrow::Utf16::IsSurrogatePair(previous, c)) { // We can't use the intermediate buffer here because the encoding // of surrogate pairs is done under assumption that you can step // back and fix the UTF8 stream. Luckily we only need space for one // more byte, so there is always space. 
ASSERT(pos < capacity); - int written = unibrow::Utf8::Encode(buffer + pos, c, previous); + int written = unibrow::Utf8::Encode(buffer + pos, + c, + previous, + replace_invalid_utf8); ASSERT(written == 1); pos += written; nchars++; diff --git a/deps/v8/src/unicode-inl.h b/deps/v8/src/unicode-inl.h index 9c0ebf9e1..03b38a9ff 100644 --- a/deps/v8/src/unicode-inl.h +++ b/deps/v8/src/unicode-inl.h @@ -78,7 +78,10 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, } -unsigned Utf8::Encode(char* str, uchar c, int previous) { +unsigned Utf8::Encode(char* str, + uchar c, + int previous, + bool replace_invalid) { static const int kMask = ~(1 << 6); if (c <= kMaxOneByteChar) { str[0] = c; @@ -88,12 +91,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) { str[1] = 0x80 | (c & kMask); return 2; } else if (c <= kMaxThreeByteChar) { - if (Utf16::IsTrailSurrogate(c) && - Utf16::IsLeadSurrogate(previous)) { + if (Utf16::IsSurrogatePair(previous, c)) { const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; return Encode(str - kUnmatchedSize, Utf16::CombineSurrogatePair(previous, c), - Utf16::kNoPreviousCharacter) - kUnmatchedSize; + Utf16::kNoPreviousCharacter, + replace_invalid) - kUnmatchedSize; + } else if (replace_invalid && + (Utf16::IsLeadSurrogate(c) || + Utf16::IsTrailSurrogate(c))) { + c = kBadChar; } str[0] = 0xE0 | (c >> 12); str[1] = 0x80 | ((c >> 6) & kMask); diff --git a/deps/v8/src/unicode.h b/deps/v8/src/unicode.h index 94ab1b4c1..384326a83 100644 --- a/deps/v8/src/unicode.h +++ b/deps/v8/src/unicode.h @@ -117,6 +117,9 @@ class Buffer { class Utf16 { public: + static inline bool IsSurrogatePair(int lead, int trail) { + return IsLeadSurrogate(lead) && IsTrailSurrogate(trail); + } static inline bool IsLeadSurrogate(int code) { if (code == kNoPreviousCharacter) return false; return (code & 0xfc00) == 0xd800; @@ -152,13 +155,19 @@ class Utf16 { class Utf8 { public: static inline uchar Length(uchar chr, int previous); - static 
inline unsigned Encode( - char* out, uchar c, int previous); + static inline unsigned Encode(char* out, + uchar c, + int previous, + bool replace_invalid = false); static const byte* ReadBlock(Buffer<const char*> str, byte* buffer, unsigned capacity, unsigned* chars_read, unsigned* offset); static uchar CalculateValue(const byte* str, unsigned length, unsigned* cursor); + + + // The unicode replacement character, used to signal invalid unicode + // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding. static const uchar kBadChar = 0xFFFD; static const unsigned kMaxEncodedSize = 4; static const unsigned kMaxOneByteChar = 0x7f; @@ -170,6 +179,9 @@ class Utf8 { // that match are coded as a 4 byte UTF-8 sequence. static const unsigned kBytesSavedByCombiningSurrogates = 2; static const unsigned kSizeOfUnmatchedSurrogate = 3; + // The maximum size a single UTF-16 code unit may take up when encoded as + // UTF-8. + static const unsigned kMax16BitCodeUnitSize = 3; private: template <unsigned s> friend class Utf8InputBuffer; |