/*
 * (C) 1999 Lars Knoll (knoll@kde.org)
 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights
 * reserved.
 * Copyright (C) 2007-2009 Torch Mobile, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"

// NOTE(review): the three system header names below were lost from this copy
// of the file (only bare "#include" tokens remained). They are restored from
// what the code uses (localeconv/setlocale, va_list); <algorithm> is the
// presumed third header -- confirm against upstream.
#include <locale.h>
#include <stdarg.h>
#include <string.h>

#include <algorithm>
#include <limits>

#include "base/callback.h"
#include "base/strings/string_util.h"
#include "build/build_config.h"
#include "third_party/blink/renderer/platform/wtf/dtoa.h"
#include "third_party/blink/renderer/platform/wtf/math_extras.h"
#include "third_party/blink/renderer/platform/wtf/size_assertions.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/case_map.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h"
#include "third_party/blink/renderer/platform/wtf/text/unicode.h"
#include "third_party/blink/renderer/platform/wtf/text/utf8.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"
#include "third_party/perfetto/include/perfetto/tracing/traced_value.h"

namespace WTF {

ASSERT_SIZE(String, void*);

// Construct a string with UTF-16 data.
String::String(const UChar* characters, unsigned length)
    : impl_(characters ? StringImpl::Create(characters, length) : nullptr) {}

// Construct a string with UTF-16 data, from a null-terminated source.
// A null |str| leaves the string null.
String::String(const UChar* str) {
  if (!str)
    return;
  impl_ = StringImpl::Create(str, LengthOfNullTerminatedString(str));
}

// Construct a string with latin1 data.
String::String(const LChar* characters, unsigned length)
    : impl_(characters ? StringImpl::Create(characters, length) : nullptr) {}

// Same as the LChar overload; the bytes are reinterpreted as LChar.
String::String(const char* characters, unsigned length)
    : impl_(characters ? StringImpl::Create(
                             reinterpret_cast<const LChar*>(characters),
                             length)
                       : nullptr) {}

#if defined(ARCH_CPU_64_BITS)
// size_t overload for 64-bit builds; the length is narrowed with SafeCast
// before delegating to the unsigned overload.
String::String(const char* characters, size_t length)
    : String(characters, SafeCast<unsigned>(length)) {}
#endif  // defined(ARCH_CPU_64_BITS)

// Compares the underlying impls of |a| and |b|.
int CodeUnitCompare(const String& a, const String& b) {
  return CodeUnitCompare(a.Impl(), b.Impl());
}

// Compares |a| with the C string |b| (reinterpreted as LChar), ignoring ASCII
// case.
int CodeUnitCompareIgnoringASCIICase(const String& a, const char* b) {
  return CodeUnitCompareIgnoringASCIICase(a.Impl(),
                                          reinterpret_cast<const LChar*>(b));
}

// Forwards to StringImpl::Find; a null string yields kNotFound.
// NOTE(review): the callback signature <bool(UChar)> was stripped from this
// copy and is restored here -- it must match the declaration in the header.
wtf_size_t String::Find(base::RepeatingCallback<bool(UChar)> match_callback,
                        wtf_size_t index) const {
  return impl_ ? impl_->Find(match_callback, index) : kNotFound;
}

// Returns 0 for a null string or an out-of-range index, otherwise forwards to
// StringImpl::CharacterStartingAt.
UChar32 String::CharacterStartingAt(unsigned i) const {
  if (!impl_ || i >= impl_->length())
    return 0;
  return impl_->CharacterStartingAt(i);
}

// Replaces an 8-bit impl with a 16-bit one. Null and already-16-bit strings
// are left untouched; an empty 8-bit string becomes the shared empty 16-bit
// impl.
void String::Ensure16Bit() {
  if (IsNull())
    return;
  if (!Is8Bit())
    return;
  if (unsigned length = this->length())
    impl_ = Make16BitFrom8BitSource(impl_->Characters8(), length).ReleaseImpl();
  else
    impl_ = StringImpl::empty16_bit_;
}

// Replaces the impl with its truncated form; no-op on a null string.
void String::Truncate(unsigned length) {
  if (impl_)
    impl_ = impl_->Truncate(length);
}

// Replaces the impl with one that has the given range removed; no-op on a
// null string.
void String::Remove(unsigned start, unsigned length_to_remove) {
  if (impl_)
    impl_ = impl_->Remove(start, length_to_remove);
}

// The following wrappers all return a default-constructed String when this
// string is null and otherwise forward to the same-named operation.

String String::Substring(unsigned pos, unsigned len) const {
  if (!impl_)
    return String();
  return impl_->Substring(pos, len);
}

String String::DeprecatedLower() const {
  if (!impl_)
    return String();
  return CaseMap::FastToLowerInvariant(impl_.get());
}

String String::LowerASCII() const {
  if (!impl_)
    return String();
  return impl_->LowerASCII();
}

String String::UpperASCII() const {
  if (!impl_)
    return String();
  return impl_->UpperASCII();
}

String String::StripWhiteSpace() const {
  if (!impl_)
    return String();
  return impl_->StripWhiteSpace();
}

String String::StripWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space) const {
  if (!impl_)
    return String();
  return impl_->StripWhiteSpace(is_white_space);
}

String String::SimplifyWhiteSpace(StripBehavior strip_behavior) const {
  if (!impl_)
    return String();
  return impl_->SimplifyWhiteSpace(strip_behavior);
}

String String::SimplifyWhiteSpace(IsWhiteSpaceFunctionPtr is_white_space,
                                  StripBehavior strip_behavior) const {
  if (!impl_)
    return String();
  return impl_->SimplifyWhiteSpace(is_white_space, strip_behavior);
}

String String::RemoveCharacters(CharacterMatchFunctionPtr find_match) const {
  if (!impl_)
    return String();
  return impl_->RemoveCharacters(find_match);
}

String String::FoldCase() const {
  if (!impl_)
    return String();
  return impl_->FoldCase();
}

// printf-style formatting into a String. Returns a default-constructed String
// if vsnprintf reports an error (negative length).
String String::Format(const char* format, ...) {
  // vsnprintf is locale sensitive when converting floats to strings
  // and we need it to always use a decimal point. Double check that
  // the locale is compatible, and also that it is the default "C"
  // locale so that we aren't just lucky. Android's locales work
  // differently so can't check the same way there.
  DCHECK_EQ(strcmp(localeconv()->decimal_point, "."), 0);
#if !defined(OS_ANDROID)
  DCHECK_EQ(strcmp(setlocale(LC_NUMERIC, nullptr), "C"), 0);
#endif  // !OS_ANDROID

  va_list args;

  // TODO(esprehn): base uses 1024, maybe we should use a bigger size too.
  static const unsigned kDefaultSize = 256;
  // NOTE(review): the Vector template arguments were stripped from this copy;
  // the element type is char (the buffer is handed to vsnprintf), the inline
  // capacity is presumed to be kDefaultSize -- confirm.
  Vector<char, kDefaultSize> buffer(kDefaultSize);

  va_start(args, format);
  int length = base::vsnprintf(buffer.data(), buffer.size(), format, args);
  va_end(args);

  // TODO(esprehn): This can only happen if there's an encoding error, what's
  // the locale set to inside blink? Can this happen? We should probably CHECK
  // instead.
  if (length < 0)
    return String();

  if (static_cast<unsigned>(length) >= buffer.size()) {
    // vsnprintf doesn't include the NUL terminator in the length so we need to
    // add space for it when growing.
    buffer.Grow(length + 1);

    // We need to call va_end() and then va_start() each time we use args, as
    // the contents of args is undefined after the call to vsnprintf according
    // to http://man.cx/snprintf(3)
    //
    // Not calling va_end/va_start here happens to work on lots of systems, but
    // fails e.g. on 64bit Linux.
    va_start(args, format);
    length = base::vsnprintf(buffer.data(), buffer.size(), format, args);
    va_end(args);
  }

  CHECK_LT(static_cast<unsigned>(length), buffer.size());
  return String(reinterpret_cast<const LChar*>(buffer.data()), length);
}

// Returns a double-quoted, escaped rendering of the string for debug output.
String String::EncodeForDebugging() const {
  // NOTE(review): angle-bracketed text was stripped throughout this copy of
  // the file; upstream may have returned a "<...>" placeholder here rather
  // than the empty string -- confirm before relying on this value.
  if (IsNull())
    return "";

  StringBuilder builder;
  builder.Append('"');
  for (unsigned index = 0; index < length(); ++index) {
    // Print shorthands for select cases.
    UChar character = (*impl_)[index];
    switch (character) {
      case '\t':
        builder.Append("\\t");
        break;
      case '\n':
        builder.Append("\\n");
        break;
      case '\r':
        builder.Append("\\r");
        break;
      case '"':
        builder.Append("\\\"");
        break;
      case '\\':
        builder.Append("\\\\");
        break;
      default:
        if (IsASCIIPrintable(character)) {
          builder.Append(static_cast<char>(character));
        } else {
          // Print "\uXXXX" for control or non-ASCII characters.
          builder.AppendFormat("\\u%04X", character);
        }
        break;
    }
  }
  builder.Append('"');
  return builder.ToString();
}

// Widens to double and uses the double overload.
String String::Number(float number) {
  return Number(static_cast<double>(number));
}

String String::Number(double number, unsigned precision) {
  NumberToStringBuffer buffer;
  return String(NumberToFixedPrecisionString(number, precision, buffer));
}

String String::NumberToStringECMAScript(double number) {
  NumberToStringBuffer buffer;
  return String(NumberToString(number, buffer));
}

String String::NumberToStringFixedWidth(double number,
                                        unsigned decimal_places) {
  NumberToStringBuffer buffer;
  return String(NumberToFixedWidthString(number, decimal_places, buffer));
}

// Number parsing. For a null string every parser below returns zero and, when
// |ok| is non-null, sets *ok to false; otherwise it forwards to StringImpl.

int String::ToIntStrict(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->ToInt(NumberParsingOptions::kStrict, ok);
}

unsigned String::ToUIntStrict(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->ToUInt(NumberParsingOptions::kStrict, ok);
}

unsigned String::HexToUIntStrict(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->HexToUIntStrict(ok);
}

uint64_t String::HexToUInt64Strict(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->HexToUInt64Strict(ok);
}

int64_t String::ToInt64Strict(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->ToInt64(NumberParsingOptions::kStrict, ok);
}

uint64_t String::ToUInt64Strict(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->ToUInt64(NumberParsingOptions::kStrict, ok);
}

int String::ToInt(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->ToInt(NumberParsingOptions::kLoose, ok);
}

unsigned String::ToUInt(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0;
  }
  return impl_->ToUInt(NumberParsingOptions::kLoose, ok);
}

double String::ToDouble(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0.0;
  }
  return impl_->ToDouble(ok);
}

float String::ToFloat(bool* ok) const {
  if (!impl_) {
    if (ok)
      *ok = false;
    return 0.0f;
  }
  return impl_->ToFloat(ok);
}

String String::IsolatedCopy() const {
  if (!impl_)
    return String();
  return impl_->IsolatedCopy();
}

// A null string is always considered safe; otherwise the impl decides.
bool String::IsSafeToSendToAnotherThread() const {
  return !impl_ || impl_->IsSafeToSendToAnotherThread();
}

// Splits on every occurrence of |separator|, replacing the contents of
// |result|. Empty pieces are kept only when |allow_empty_entries| is true.
void String::Split(const StringView& separator,
                   bool allow_empty_entries,
                   Vector<String>& result) const {
  result.clear();

  unsigned start_pos = 0;
  wtf_size_t end_pos;
  while ((end_pos = Find(separator, start_pos)) != kNotFound) {
    if (allow_empty_entries || start_pos != end_pos)
      result.push_back(Substring(start_pos, end_pos - start_pos));
    start_pos = end_pos + separator.length();
  }
  // Trailing piece after the last separator.
  if (allow_empty_entries || start_pos != length())
    result.push_back(Substring(start_pos));
}

// Single-character variant of Split().
void String::Split(UChar separator,
                   bool allow_empty_entries,
                   Vector<String>& result) const {
  result.clear();

  unsigned start_pos = 0;
  wtf_size_t end_pos;
  while ((end_pos = find(separator, start_pos)) != kNotFound) {
    if (allow_empty_entries || start_pos != end_pos)
      result.push_back(Substring(start_pos, end_pos - start_pos));
    start_pos = end_pos + 1;
  }
  // Trailing piece after the last separator.
  if (allow_empty_entries || start_pos != length())
    result.push_back(Substring(start_pos));
}

std::string String::Ascii() const {
  // Printable ASCII characters 32..127 and the null character are
  // preserved, characters outside of this range are converted to '?'.

  unsigned length = this->length();
  if (!length)
    return std::string();

  std::string ascii(length, '\0');
  if (this->Is8Bit()) {
    const LChar* characters = this->Characters8();

    for (unsigned i = 0; i < length; ++i) {
      LChar ch = characters[i];
      ascii[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
    }
    return ascii;
  }

  const UChar* characters = this->Characters16();
  for (unsigned i = 0; i < length; ++i) {
    UChar ch = characters[i];
    ascii[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : static_cast<char>(ch);
  }

  return ascii;
}

std::string String::Latin1() const {
  // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
  // preserved, characters outside of this range are converted to '?'.
  unsigned length = this->length();

  if (!length)
    return std::string();

  // 8-bit strings are copied byte for byte.
  if (Is8Bit()) {
    return std::string(reinterpret_cast<const char*>(this->Characters8()),
                       length);
  }

  const UChar* characters = this->Characters16();
  std::string latin1(length, '\0');
  for (unsigned i = 0; i < length; ++i) {
    UChar ch = characters[i];
    latin1[i] = ch > 0xff ? '?' : static_cast<char>(ch);
  }

  return latin1;
}

// Helper to write a three-byte UTF-8 code point to the buffer, caller must
// check room is available.
static inline void PutUTF8Triple(char*& buffer, UChar ch) {
  DCHECK_GE(ch, 0x0800);
  *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
  *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
  *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
}

// Converts the string to UTF-8 according to |mode|. Returns an empty
// std::string for an empty string, when length * 3 would overflow, or when a
// strict conversion fails.
std::string String::Utf8(UTF8ConversionMode mode) const {
  unsigned length = this->length();

  if (!length)
    return std::string();

  // Allocate a buffer big enough to hold all the characters
  // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
  // Optimization ideas, if we find this function is hot:
  //  * We could speculatively create a std::string to contain 'length'
  //    characters, and resize if necessary (i.e. if the buffer contains
  //    non-ascii characters). (Alternatively, scan the buffer first for
  //    ascii characters, so we know this will be sufficient).
  //  * We could allocate a std::string with an appropriate size to
  //    have a good chance of being able to write the string into the
  //    buffer without reallocing (say, 1.5 x length).
  if (length > std::numeric_limits<unsigned>::max() / 3)
    return std::string();
  // NOTE(review): the Vector template arguments were stripped from this copy;
  // the element type is char (see |buffer| below), the inline capacity of
  // 1024 is presumed -- confirm.
  Vector<char, 1024> buffer_vector(length * 3);

  char* buffer = buffer_vector.data();

  if (Is8Bit()) {
    const LChar* characters = this->Characters8();

    unicode::ConversionResult result =
        unicode::ConvertLatin1ToUTF8(&characters, characters + length, &buffer,
                                     buffer + buffer_vector.size());
    // (length * 3) should be sufficient for any conversion
    DCHECK_NE(result, unicode::kTargetExhausted);
  } else {
    const UChar* characters = this->Characters16();

    if (mode == kStrictUTF8ConversionReplacingUnpairedSurrogatesWithFFFD) {
      const UChar* characters_end = characters + length;
      char* buffer_end = buffer + buffer_vector.size();
      while (characters < characters_end) {
        // Use strict conversion to detect unpaired surrogates.
        unicode::ConversionResult result = unicode::ConvertUTF16ToUTF8(
            &characters, characters_end, &buffer, buffer_end, true);
        DCHECK_NE(result, unicode::kTargetExhausted);
        // Conversion fails when there is an unpaired surrogate.  Put
        // replacement character (U+FFFD) instead of the unpaired
        // surrogate.
        if (result != unicode::kConversionOK) {
          DCHECK_LE(0xD800, *characters);
          DCHECK_LE(*characters, 0xDFFF);
          // There should be room left, since one UChar hasn't been
          // converted.
          DCHECK_LE(buffer + 3, buffer_end);
          PutUTF8Triple(buffer, kReplacementCharacter);
          ++characters;
        }
      }
    } else {
      bool strict = mode == kStrictUTF8Conversion;
      unicode::ConversionResult result = unicode::ConvertUTF16ToUTF8(
          &characters, characters + length, &buffer,
          buffer + buffer_vector.size(), strict);
      // (length * 3) should be sufficient for any conversion
      DCHECK_NE(result, unicode::kTargetExhausted);

      // Only produced from strict conversion.
      if (result == unicode::kSourceIllegal) {
        DCHECK(strict);
        return std::string();
      }

      // Check for an unconverted high surrogate.
      if (result == unicode::kSourceExhausted) {
        if (strict)
          return std::string();
        // This should be one unpaired high surrogate. Treat it the same
        // way as an unpaired high surrogate would have been handled in
        // the middle of a string with non-strict conversion - which is
        // to say, simply encode it to UTF-8.
        DCHECK_EQ(characters + 1, this->Characters16() + length);
        DCHECK_GE(*characters, 0xD800);
        DCHECK_LE(*characters, 0xDBFF);
        // There should be room left, since one UChar hasn't been
        // converted.
        DCHECK_LE(buffer + 3, buffer + buffer_vector.size());
        PutUTF8Triple(buffer, *characters);
      }
    }
  }

  return std::string(buffer_vector.data(), buffer - buffer_vector.data());
}

// Builds an 8-bit string of |length| characters from 16-bit |source|.
// A zero length yields g_empty_string.
String String::Make8BitFrom16BitSource(const UChar* source, wtf_size_t length) {
  if (!length)
    return g_empty_string;

  LChar* destination;
  String result = String::CreateUninitialized(length, destination);

  CopyLCharsFromUCharSource(destination, source, length);

  return result;
}

// Builds a 16-bit string of |length| characters from 8-bit |source|.
// A zero length yields g_empty_string16_bit.
String String::Make16BitFrom8BitSource(const LChar* source, wtf_size_t length) {
  if (!length)
    return g_empty_string16_bit;

  UChar* destination;
  String result = String::CreateUninitialized(length, destination);

  StringImpl::CopyChars(destination, source, length);

  return result;
}

// Decodes |string_length| bytes of UTF-8. Returns a default-constructed
// String for a null pointer or when the conversion does not succeed, and
// g_empty_string for a zero length.
String String::FromUTF8(const LChar* string_start, size_t string_length) {
  wtf_size_t length = SafeCast<wtf_size_t>(string_length);

  if (!string_start)
    return String();

  if (!length)
    return g_empty_string;

  // Pure-ASCII input needs no conversion.
  ASCIIStringAttributes attributes = CharacterAttributes(string_start, length);
  if (attributes.contains_only_ascii)
    return StringImpl::Create(string_start, length, attributes);

  // NOTE(review): the Vector template arguments were stripped from this copy;
  // the element type is UChar (see |buffer_start| below), the inline capacity
  // of 1024 is presumed -- confirm.
  Vector<UChar, 1024> buffer(length);
  UChar* buffer_start = buffer.data();

  UChar* buffer_current = buffer_start;
  const char* string_current = reinterpret_cast<const char*>(string_start);
  if (unicode::ConvertUTF8ToUTF16(
          &string_current,
          reinterpret_cast<const char*>(string_start + length),
          &buffer_current,
          buffer_current + buffer.size()) != unicode::kConversionOK)
    return String();

  unsigned utf16_length =
      static_cast<unsigned>(buffer_current - buffer_start);
  DCHECK_LT(utf16_length, length);
  return StringImpl::Create(buffer_start, utf16_length);
}

// Null-terminated variant; a null pointer yields a default-constructed String.
String String::FromUTF8(const LChar* string) {
  if (!string)
    return String();
  return FromUTF8(string, strlen(reinterpret_cast<const char*>(string)));
}

String String::FromUTF8(base::StringPiece s) {
  return FromUTF8(reinterpret_cast<const LChar*>(s.data()), s.size());
}

// Tries UTF-8 first; if that yields a null String, the bytes are used as
// Latin-1 instead.
String String::FromUTF8WithLatin1Fallback(const LChar* string, size_t size) {
  String utf8 = FromUTF8(string, size);
  if (!utf8)
    return String(string, SafeCast<wtf_size_t>(size));
  return utf8;
}

// Streams the debug encoding of |string|, converted to UTF-8.
std::ostream& operator<<(std::ostream& out, const String& string) {
  return out << string.EncodeForDebugging().Utf8();
}

#ifndef NDEBUG
// Logs the string via DLOG(INFO).
void String::Show() const {
  DLOG(INFO) << *this;
}
#endif

// Writes the string to |context| as UTF-8.
void String::WriteIntoTracedValue(perfetto::TracedValue context) const {
  StringUTF8Adaptor adaptor(*this);
  std::move(context).WriteString(adaptor.data(), adaptor.size());
}

}  // namespace WTF