summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Felix Geisendörfer <felix@debuggable.com> 2014-01-20 09:43:43 +0100
committer: Timothy J Fontaine <tjfontaine@gmail.com> 2014-06-06 15:04:39 -0700
commit: 11d21f5b17f27576cf4245e6ad05cfc66bccf2f8 (patch)
tree: ff404ce9bd80272bbd3bb669b0f0f7f647431ee3
parent: 0dc2f4f82d0f3eae2ad5d36447c9d51a46bfc486 (diff)
download: node-11d21f5b17f27576cf4245e6ad05cfc66bccf2f8.tar.gz
deps/v8: Apply REPLACE_INVALID_UTF8 patch
- https://codereview.chromium.org/121173009/
- https://code.google.com/p/v8/source/detail?r=18683

Note: The v8 test case did not apply cleanly, so it is missing from this patch. I'm assuming this is not a problem as long as the v8 test suite is not part of the node build / test system. If it does turn out to be a problem, I'll fix it. Otherwise the test case will be integrated once v8 is upgraded.
-rw-r--r-- deps/v8/include/v8.h | 6
-rw-r--r-- deps/v8/src/api.cc | 37
-rw-r--r-- deps/v8/src/unicode-inl.h | 15
-rw-r--r-- deps/v8/src/unicode.h | 16
4 files changed, 57 insertions, 17 deletions
diff --git a/deps/v8/include/v8.h b/deps/v8/include/v8.h
index 77ffb385a..868a4356a 100644
--- a/deps/v8/include/v8.h
+++ b/deps/v8/include/v8.h
@@ -1064,7 +1064,11 @@ class String : public Primitive {
enum WriteOptions {
NO_OPTIONS = 0,
HINT_MANY_WRITES_EXPECTED = 1,
- NO_NULL_TERMINATION = 2
+ NO_NULL_TERMINATION = 2,
+ // Used by WriteUtf8 to replace orphan surrogate code units with the
+ // unicode replacement character. Needs to be set to guarantee valid UTF-8
+ // output.
+ REPLACE_INVALID_UTF8 = 8
};
// 16-bit character codes.
diff --git a/deps/v8/src/api.cc b/deps/v8/src/api.cc
index 0d88047aa..b21b1e17a 100644
--- a/deps/v8/src/api.cc
+++ b/deps/v8/src/api.cc
@@ -3736,7 +3736,8 @@ static int RecursivelySerializeToUtf8(i::String* string,
int end,
int recursion_budget,
int32_t previous_character,
- int32_t* last_character) {
+ int32_t* last_character,
+ bool replace_invalid_utf8) {
int utf8_bytes = 0;
while (true) {
if (string->IsAsciiRepresentation()) {
@@ -3752,7 +3753,10 @@ static int RecursivelySerializeToUtf8(i::String* string,
for (int i = start; i < end; i++) {
uint16_t character = data[i];
current +=
- unibrow::Utf8::Encode(current, character, previous_character);
+ unibrow::Utf8::Encode(current,
+ character,
+ previous_character,
+ replace_invalid_utf8);
previous_character = character;
}
*last_character = previous_character;
@@ -3765,7 +3769,10 @@ static int RecursivelySerializeToUtf8(i::String* string,
for (int i = start; i < end; i++) {
uint16_t character = data[i];
current +=
- unibrow::Utf8::Encode(current, character, previous_character);
+ unibrow::Utf8::Encode(current,
+ character,
+ previous_character,
+ replace_invalid_utf8);
previous_character = character;
}
*last_character = previous_character;
@@ -3801,7 +3808,8 @@ static int RecursivelySerializeToUtf8(i::String* string,
boundary,
recursion_budget - 1,
previous_character,
- &previous_character);
+ &previous_character,
+ replace_invalid_utf8);
if (extra_utf8_bytes < 0) return extra_utf8_bytes;
buffer += extra_utf8_bytes;
utf8_bytes += extra_utf8_bytes;
@@ -3853,7 +3861,10 @@ int String::WriteUtf8(char* buffer,
return len;
}
- if (capacity == -1 || capacity / 3 >= string_length) {
+ bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
+ int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
+
+ if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
int32_t previous = unibrow::Utf16::kNoPreviousCharacter;
const int kMaxRecursion = 100;
int utf8_bytes =
@@ -3863,7 +3874,8 @@ int String::WriteUtf8(char* buffer,
string_length,
kMaxRecursion,
previous,
- &previous);
+ &previous,
+ replace_invalid_utf8);
if (utf8_bytes >= 0) {
// Success serializing with recursion.
if ((options & NO_NULL_TERMINATION) == 0 &&
@@ -3908,7 +3920,10 @@ int String::WriteUtf8(char* buffer,
int previous = unibrow::Utf16::kNoPreviousCharacter;
for (i = 0; i < len && (capacity == -1 || pos < fast_end); i++) {
i::uc32 c = write_input_buffer.GetNext();
- int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
+ int written = unibrow::Utf8::Encode(buffer + pos,
+ c,
+ previous,
+ replace_invalid_utf8);
pos += written;
nchars++;
previous = c;
@@ -3920,14 +3935,16 @@ int String::WriteUtf8(char* buffer,
char intermediate[unibrow::Utf8::kMaxEncodedSize];
for (; i < len && pos < capacity; i++) {
i::uc32 c = write_input_buffer.GetNext();
- if (unibrow::Utf16::IsTrailSurrogate(c) &&
- unibrow::Utf16::IsLeadSurrogate(previous)) {
+ if (unibrow::Utf16::IsSurrogatePair(previous, c)) {
// We can't use the intermediate buffer here because the encoding
// of surrogate pairs is done under assumption that you can step
// back and fix the UTF8 stream. Luckily we only need space for one
// more byte, so there is always space.
ASSERT(pos < capacity);
- int written = unibrow::Utf8::Encode(buffer + pos, c, previous);
+ int written = unibrow::Utf8::Encode(buffer + pos,
+ c,
+ previous,
+ replace_invalid_utf8);
ASSERT(written == 1);
pos += written;
nchars++;
diff --git a/deps/v8/src/unicode-inl.h b/deps/v8/src/unicode-inl.h
index 9c0ebf9e1..03b38a9ff 100644
--- a/deps/v8/src/unicode-inl.h
+++ b/deps/v8/src/unicode-inl.h
@@ -78,7 +78,10 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
}
-unsigned Utf8::Encode(char* str, uchar c, int previous) {
+unsigned Utf8::Encode(char* str,
+ uchar c,
+ int previous,
+ bool replace_invalid) {
static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) {
str[0] = c;
@@ -88,12 +91,16 @@ unsigned Utf8::Encode(char* str, uchar c, int previous) {
str[1] = 0x80 | (c & kMask);
return 2;
} else if (c <= kMaxThreeByteChar) {
- if (Utf16::IsTrailSurrogate(c) &&
- Utf16::IsLeadSurrogate(previous)) {
+ if (Utf16::IsSurrogatePair(previous, c)) {
const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
return Encode(str - kUnmatchedSize,
Utf16::CombineSurrogatePair(previous, c),
- Utf16::kNoPreviousCharacter) - kUnmatchedSize;
+ Utf16::kNoPreviousCharacter,
+ replace_invalid) - kUnmatchedSize;
+ } else if (replace_invalid &&
+ (Utf16::IsLeadSurrogate(c) ||
+ Utf16::IsTrailSurrogate(c))) {
+ c = kBadChar;
}
str[0] = 0xE0 | (c >> 12);
str[1] = 0x80 | ((c >> 6) & kMask);
diff --git a/deps/v8/src/unicode.h b/deps/v8/src/unicode.h
index 94ab1b4c1..384326a83 100644
--- a/deps/v8/src/unicode.h
+++ b/deps/v8/src/unicode.h
@@ -117,6 +117,9 @@ class Buffer {
class Utf16 {
public:
+ static inline bool IsSurrogatePair(int lead, int trail) {
+ return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
+ }
static inline bool IsLeadSurrogate(int code) {
if (code == kNoPreviousCharacter) return false;
return (code & 0xfc00) == 0xd800;
@@ -152,13 +155,19 @@ class Utf16 {
class Utf8 {
public:
static inline uchar Length(uchar chr, int previous);
- static inline unsigned Encode(
- char* out, uchar c, int previous);
+ static inline unsigned Encode(char* out,
+ uchar c,
+ int previous,
+ bool replace_invalid = false);
static const byte* ReadBlock(Buffer<const char*> str, byte* buffer,
unsigned capacity, unsigned* chars_read, unsigned* offset);
static uchar CalculateValue(const byte* str,
unsigned length,
unsigned* cursor);
+
+
+ // The unicode replacement character, used to signal invalid unicode
+ // sequences (e.g. an orphan surrogate) when converting to a UTF-8 encoding.
static const uchar kBadChar = 0xFFFD;
static const unsigned kMaxEncodedSize = 4;
static const unsigned kMaxOneByteChar = 0x7f;
@@ -170,6 +179,9 @@ class Utf8 {
// that match are coded as a 4 byte UTF-8 sequence.
static const unsigned kBytesSavedByCombiningSurrogates = 2;
static const unsigned kSizeOfUnmatchedSurrogate = 3;
+ // The maximum size a single UTF-16 code unit may take up when encoded as
+ // UTF-8.
+ static const unsigned kMax16BitCodeUnitSize = 3;
private:
template <unsigned s> friend class Utf8InputBuffer;