diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2017-06-27 06:07:23 +0000 |
commit | 1bf1084f2b10c3b47fd1a588d85d21ed0eb41d0c (patch) | |
tree | 46dcd36c86e7fbc6e5df36deb463b33e9967a6f7 /Source/WebCore/platform/URLParser.cpp | |
parent | 32761a6cee1d0dee366b885b7b9c777e67885688 (diff) | |
download | WebKitGtk-tarball-master.tar.gz |
webkitgtk-2.16.5HEADwebkitgtk-2.16.5master
Diffstat (limited to 'Source/WebCore/platform/URLParser.cpp')
-rw-r--r-- | Source/WebCore/platform/URLParser.cpp | 2920 |
1 files changed, 2920 insertions, 0 deletions
diff --git a/Source/WebCore/platform/URLParser.cpp b/Source/WebCore/platform/URLParser.cpp new file mode 100644 index 000000000..3f06d4298 --- /dev/null +++ b/Source/WebCore/platform/URLParser.cpp @@ -0,0 +1,2920 @@ +/* + * Copyright (C) 2016 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "URLParser.h" + +#include "Logging.h" +#include "RuntimeApplicationChecks.h" +#include <array> +#include <mutex> +#include <unicode/uidna.h> +#include <unicode/utypes.h> + +namespace WebCore { + +#define URL_PARSER_DEBUGGING 0 + +#if URL_PARSER_DEBUGGING +#define URL_PARSER_LOG(...) LOG(URLParser, __VA_ARGS__) +#else +#define URL_PARSER_LOG(...) 
+#endif + +template<typename CharacterType> +class CodePointIterator { +public: + ALWAYS_INLINE CodePointIterator() { } + ALWAYS_INLINE CodePointIterator(const CharacterType* begin, const CharacterType* end) + : m_begin(begin) + , m_end(end) + { + } + + ALWAYS_INLINE CodePointIterator(const CodePointIterator& begin, const CodePointIterator& end) + : CodePointIterator(begin.m_begin, end.m_begin) + { + ASSERT(end.m_begin >= begin.m_begin); + } + + ALWAYS_INLINE UChar32 operator*() const; + ALWAYS_INLINE CodePointIterator& operator++(); + + ALWAYS_INLINE bool operator==(const CodePointIterator& other) const + { + return m_begin == other.m_begin + && m_end == other.m_end; + } + ALWAYS_INLINE bool operator!=(const CodePointIterator& other) const { return !(*this == other); } + + ALWAYS_INLINE CodePointIterator& operator=(const CodePointIterator& other) + { + m_begin = other.m_begin; + m_end = other.m_end; + return *this; + } + + ALWAYS_INLINE bool atEnd() const + { + ASSERT(m_begin <= m_end); + return m_begin >= m_end; + } + + ALWAYS_INLINE size_t codeUnitsSince(const CharacterType* reference) const + { + ASSERT(m_begin >= reference); + return m_begin - reference; + } + + ALWAYS_INLINE size_t codeUnitsSince(const CodePointIterator& other) const + { + return codeUnitsSince(other.m_begin); + } + +private: + const CharacterType* m_begin { nullptr }; + const CharacterType* m_end { nullptr }; +}; + +template<> +ALWAYS_INLINE UChar32 CodePointIterator<LChar>::operator*() const +{ + ASSERT(!atEnd()); + return *m_begin; +} + +template<> +ALWAYS_INLINE auto CodePointIterator<LChar>::operator++() -> CodePointIterator& +{ + m_begin++; + return *this; +} + +template<> +ALWAYS_INLINE UChar32 CodePointIterator<UChar>::operator*() const +{ + ASSERT(!atEnd()); + UChar32 c; + U16_GET(m_begin, 0, 0, m_end - m_begin, c); + return c; +} + +template<> +ALWAYS_INLINE auto CodePointIterator<UChar>::operator++() -> CodePointIterator& +{ + unsigned i = 0; + size_t length = m_end - m_begin; + 
U16_FWD_1(m_begin, i, length); + m_begin += i; + return *this; +} + +ALWAYS_INLINE static void appendCodePoint(Vector<UChar>& destination, UChar32 codePoint) +{ + if (U_IS_BMP(codePoint)) { + destination.append(static_cast<UChar>(codePoint)); + return; + } + destination.reserveCapacity(destination.size() + 2); + destination.uncheckedAppend(U16_LEAD(codePoint)); + destination.uncheckedAppend(U16_TRAIL(codePoint)); +} + +enum URLCharacterClass { + UserInfo = 0x1, + Default = 0x2, + ForbiddenHost = 0x4, + QueryPercent = 0x8, + SlashQuestionOrHash = 0x10, + ValidScheme = 0x20, +}; + +static const uint8_t characterClassTable[256] = { + UserInfo | Default | QueryPercent | ForbiddenHost, // 0x0 + UserInfo | Default | QueryPercent, // 0x1 + UserInfo | Default | QueryPercent, // 0x2 + UserInfo | Default | QueryPercent, // 0x3 + UserInfo | Default | QueryPercent, // 0x4 + UserInfo | Default | QueryPercent, // 0x5 + UserInfo | Default | QueryPercent, // 0x6 + UserInfo | Default | QueryPercent, // 0x7 + UserInfo | Default | QueryPercent, // 0x8 + UserInfo | Default | QueryPercent | ForbiddenHost, // 0x9 + UserInfo | Default | QueryPercent | ForbiddenHost, // 0xA + UserInfo | Default | QueryPercent, // 0xB + UserInfo | Default | QueryPercent, // 0xC + UserInfo | Default | QueryPercent | ForbiddenHost, // 0xD + UserInfo | Default | QueryPercent, // 0xE + UserInfo | Default | QueryPercent, // 0xF + UserInfo | Default | QueryPercent, // 0x10 + UserInfo | Default | QueryPercent, // 0x11 + UserInfo | Default | QueryPercent, // 0x12 + UserInfo | Default | QueryPercent, // 0x13 + UserInfo | Default | QueryPercent, // 0x14 + UserInfo | Default | QueryPercent, // 0x15 + UserInfo | Default | QueryPercent, // 0x16 + UserInfo | Default | QueryPercent, // 0x17 + UserInfo | Default | QueryPercent, // 0x18 + UserInfo | Default | QueryPercent, // 0x19 + UserInfo | Default | QueryPercent, // 0x1A + UserInfo | Default | QueryPercent, // 0x1B + UserInfo | Default | QueryPercent, // 0x1C + 
UserInfo | Default | QueryPercent, // 0x1D + UserInfo | Default | QueryPercent, // 0x1E + UserInfo | Default | QueryPercent, // 0x1F + UserInfo | Default | QueryPercent | ForbiddenHost, // ' ' + 0, // '!' + UserInfo | Default | QueryPercent, // '"' + UserInfo | Default | QueryPercent | SlashQuestionOrHash | ForbiddenHost, // '#' + 0, // '$' + ForbiddenHost, // '%' + 0, // '&' + QueryPercent, // ''' + 0, // '(' + 0, // ')' + 0, // '*' + ValidScheme, // '+' + 0, // ',' + ValidScheme, // '-' + ValidScheme, // '.' + UserInfo | SlashQuestionOrHash | ForbiddenHost, // '/' + ValidScheme, // '0' + ValidScheme, // '1' + ValidScheme, // '2' + ValidScheme, // '3' + ValidScheme, // '4' + ValidScheme, // '5' + ValidScheme, // '6' + ValidScheme, // '7' + ValidScheme, // '8' + ValidScheme, // '9' + UserInfo | ForbiddenHost, // ':' + UserInfo, // ';' + UserInfo | Default | QueryPercent, // '<' + UserInfo, // '=' + UserInfo | Default | QueryPercent, // '>' + UserInfo | Default | SlashQuestionOrHash | ForbiddenHost, // '?' 
+ UserInfo | ForbiddenHost, // '@' + ValidScheme, // 'A' + ValidScheme, // 'B' + ValidScheme, // 'C' + ValidScheme, // 'D' + ValidScheme, // 'E' + ValidScheme, // 'F' + ValidScheme, // 'G' + ValidScheme, // 'H' + ValidScheme, // 'I' + ValidScheme, // 'J' + ValidScheme, // 'K' + ValidScheme, // 'L' + ValidScheme, // 'M' + ValidScheme, // 'N' + ValidScheme, // 'O' + ValidScheme, // 'P' + ValidScheme, // 'Q' + ValidScheme, // 'R' + ValidScheme, // 'S' + ValidScheme, // 'T' + ValidScheme, // 'U' + ValidScheme, // 'V' + ValidScheme, // 'W' + ValidScheme, // 'X' + ValidScheme, // 'Y' + ValidScheme, // 'Z' + UserInfo | ForbiddenHost, // '[' + UserInfo | SlashQuestionOrHash | ForbiddenHost, // '\\' + UserInfo | ForbiddenHost, // ']' + UserInfo, // '^' + 0, // '_' + UserInfo | Default, // '`' + ValidScheme, // 'a' + ValidScheme, // 'b' + ValidScheme, // 'c' + ValidScheme, // 'd' + ValidScheme, // 'e' + ValidScheme, // 'f' + ValidScheme, // 'g' + ValidScheme, // 'h' + ValidScheme, // 'i' + ValidScheme, // 'j' + ValidScheme, // 'k' + ValidScheme, // 'l' + ValidScheme, // 'm' + ValidScheme, // 'n' + ValidScheme, // 'o' + ValidScheme, // 'p' + ValidScheme, // 'q' + ValidScheme, // 'r' + ValidScheme, // 's' + ValidScheme, // 't' + ValidScheme, // 'u' + ValidScheme, // 'v' + ValidScheme, // 'w' + ValidScheme, // 'x' + ValidScheme, // 'y' + ValidScheme, // 'z' + UserInfo | Default, // '{' + UserInfo, // '|' + UserInfo | Default, // '}' + 0, // '~' + QueryPercent, // 0x7F + QueryPercent, // 0x80 + QueryPercent, // 0x81 + QueryPercent, // 0x82 + QueryPercent, // 0x83 + QueryPercent, // 0x84 + QueryPercent, // 0x85 + QueryPercent, // 0x86 + QueryPercent, // 0x87 + QueryPercent, // 0x88 + QueryPercent, // 0x89 + QueryPercent, // 0x8A + QueryPercent, // 0x8B + QueryPercent, // 0x8C + QueryPercent, // 0x8D + QueryPercent, // 0x8E + QueryPercent, // 0x8F + QueryPercent, // 0x90 + QueryPercent, // 0x91 + QueryPercent, // 0x92 + QueryPercent, // 0x93 + QueryPercent, // 0x94 + QueryPercent, 
// 0x95 + QueryPercent, // 0x96 + QueryPercent, // 0x97 + QueryPercent, // 0x98 + QueryPercent, // 0x99 + QueryPercent, // 0x9A + QueryPercent, // 0x9B + QueryPercent, // 0x9C + QueryPercent, // 0x9D + QueryPercent, // 0x9E + QueryPercent, // 0x9F + QueryPercent, // 0xA0 + QueryPercent, // 0xA1 + QueryPercent, // 0xA2 + QueryPercent, // 0xA3 + QueryPercent, // 0xA4 + QueryPercent, // 0xA5 + QueryPercent, // 0xA6 + QueryPercent, // 0xA7 + QueryPercent, // 0xA8 + QueryPercent, // 0xA9 + QueryPercent, // 0xAA + QueryPercent, // 0xAB + QueryPercent, // 0xAC + QueryPercent, // 0xAD + QueryPercent, // 0xAE + QueryPercent, // 0xAF + QueryPercent, // 0xB0 + QueryPercent, // 0xB1 + QueryPercent, // 0xB2 + QueryPercent, // 0xB3 + QueryPercent, // 0xB4 + QueryPercent, // 0xB5 + QueryPercent, // 0xB6 + QueryPercent, // 0xB7 + QueryPercent, // 0xB8 + QueryPercent, // 0xB9 + QueryPercent, // 0xBA + QueryPercent, // 0xBB + QueryPercent, // 0xBC + QueryPercent, // 0xBD + QueryPercent, // 0xBE + QueryPercent, // 0xBF + QueryPercent, // 0xC0 + QueryPercent, // 0xC1 + QueryPercent, // 0xC2 + QueryPercent, // 0xC3 + QueryPercent, // 0xC4 + QueryPercent, // 0xC5 + QueryPercent, // 0xC6 + QueryPercent, // 0xC7 + QueryPercent, // 0xC8 + QueryPercent, // 0xC9 + QueryPercent, // 0xCA + QueryPercent, // 0xCB + QueryPercent, // 0xCC + QueryPercent, // 0xCD + QueryPercent, // 0xCE + QueryPercent, // 0xCF + QueryPercent, // 0xD0 + QueryPercent, // 0xD1 + QueryPercent, // 0xD2 + QueryPercent, // 0xD3 + QueryPercent, // 0xD4 + QueryPercent, // 0xD5 + QueryPercent, // 0xD6 + QueryPercent, // 0xD7 + QueryPercent, // 0xD8 + QueryPercent, // 0xD9 + QueryPercent, // 0xDA + QueryPercent, // 0xDB + QueryPercent, // 0xDC + QueryPercent, // 0xDD + QueryPercent, // 0xDE + QueryPercent, // 0xDF + QueryPercent, // 0xE0 + QueryPercent, // 0xE1 + QueryPercent, // 0xE2 + QueryPercent, // 0xE3 + QueryPercent, // 0xE4 + QueryPercent, // 0xE5 + QueryPercent, // 0xE6 + QueryPercent, // 0xE7 + QueryPercent, // 0xE8 
+ QueryPercent, // 0xE9 + QueryPercent, // 0xEA + QueryPercent, // 0xEB + QueryPercent, // 0xEC + QueryPercent, // 0xED + QueryPercent, // 0xEE + QueryPercent, // 0xEF + QueryPercent, // 0xF0 + QueryPercent, // 0xF1 + QueryPercent, // 0xF2 + QueryPercent, // 0xF3 + QueryPercent, // 0xF4 + QueryPercent, // 0xF5 + QueryPercent, // 0xF6 + QueryPercent, // 0xF7 + QueryPercent, // 0xF8 + QueryPercent, // 0xF9 + QueryPercent, // 0xFA + QueryPercent, // 0xFB + QueryPercent, // 0xFC + QueryPercent, // 0xFD + QueryPercent, // 0xFE + QueryPercent, // 0xFF +}; + +template<typename CharacterType> ALWAYS_INLINE static bool isC0Control(CharacterType character) { return character <= 0x1F; } +template<typename CharacterType> ALWAYS_INLINE static bool isC0ControlOrSpace(CharacterType character) { return character <= 0x20; } +template<typename CharacterType> ALWAYS_INLINE static bool isTabOrNewline(CharacterType character) { return character <= 0xD && character >= 0x9 && character != 0xB && character != 0xC; } +template<typename CharacterType> ALWAYS_INLINE static bool isInSimpleEncodeSet(CharacterType character) { return character > 0x7E || isC0Control(character); } +template<typename CharacterType> ALWAYS_INLINE static bool isInDefaultEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & Default; } +template<typename CharacterType> ALWAYS_INLINE static bool isInUserInfoEncodeSet(CharacterType character) { return character > 0x7E || characterClassTable[character] & UserInfo; } +template<typename CharacterType> ALWAYS_INLINE static bool isPercentOrNonASCII(CharacterType character) { return !isASCII(character) || character == '%'; } +template<typename CharacterType> ALWAYS_INLINE static bool isSlashQuestionOrHash(CharacterType character) { return character <= '\\' && characterClassTable[character] & SlashQuestionOrHash; } +template<typename CharacterType> ALWAYS_INLINE static bool isValidSchemeCharacter(CharacterType character) { return 
character <= 'z' && characterClassTable[character] & ValidScheme; } +template<typename CharacterType> ALWAYS_INLINE static bool isForbiddenHostCodePoint(CharacterType character) { return character <= ']' && characterClassTable[character] & ForbiddenHost; } +static bool shouldPercentEncodeQueryByte(uint8_t byte) { return characterClassTable[byte] & QueryPercent; } + +template<typename CharacterType, URLParser::ReportSyntaxViolation reportSyntaxViolation> +ALWAYS_INLINE void URLParser::advance(CodePointIterator<CharacterType>& iterator, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition) +{ + ++iterator; + while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) { + if (reportSyntaxViolation == ReportSyntaxViolation::Yes) + syntaxViolation(iteratorForSyntaxViolationPosition); + ++iterator; + } +} + +template<typename CharacterType> +bool URLParser::takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType> iterator) +{ + if (iterator.atEnd()) + return false; + advance<CharacterType, ReportSyntaxViolation::No>(iterator); + if (iterator.atEnd()) + return false; + advance<CharacterType, ReportSyntaxViolation::No>(iterator); + return iterator.atEnd(); +} + +template<typename CharacterType> +ALWAYS_INLINE bool URLParser::isWindowsDriveLetter(CodePointIterator<CharacterType> iterator) +{ + if (iterator.atEnd() || !isASCIIAlpha(*iterator)) + return false; + advance<CharacterType, ReportSyntaxViolation::No>(iterator); + if (iterator.atEnd()) + return false; + if (*iterator == ':') + return true; + if (UNLIKELY(*iterator == '|')) + return true; + return false; +} + +ALWAYS_INLINE void URLParser::appendToASCIIBuffer(UChar32 codePoint) +{ + ASSERT(isASCII(codePoint)); + if (UNLIKELY(m_didSeeSyntaxViolation)) + m_asciiBuffer.append(codePoint); +} + +ALWAYS_INLINE void URLParser::appendToASCIIBuffer(const char* characters, size_t length) +{ + if (UNLIKELY(m_didSeeSyntaxViolation)) + m_asciiBuffer.append(characters, length); +} + +template<typename 
CharacterType> +void URLParser::appendWindowsDriveLetter(CodePointIterator<CharacterType>& iterator) +{ + ASSERT(isWindowsDriveLetter(iterator)); + appendToASCIIBuffer(*iterator); + advance(iterator); + ASSERT(!iterator.atEnd()); + ASSERT(*iterator == ':' || *iterator == '|'); + if (*iterator == '|') + syntaxViolation(iterator); + appendToASCIIBuffer(':'); + advance(iterator); +} + +bool URLParser::copyBaseWindowsDriveLetter(const URL& base) +{ + if (base.protocolIs("file")) { + RELEASE_ASSERT(base.m_portEnd < base.m_string.length()); + if (base.m_string.is8Bit()) { + const LChar* begin = base.m_string.characters8(); + CodePointIterator<LChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length()); + if (isWindowsDriveLetter(c)) { + appendWindowsDriveLetter(c); + return true; + } + } else { + const UChar* begin = base.m_string.characters16(); + CodePointIterator<UChar> c(begin + base.m_portEnd + 1, begin + base.m_string.length()); + if (isWindowsDriveLetter(c)) { + appendWindowsDriveLetter(c); + return true; + } + } + } + return false; +} + +template<typename CharacterType> +bool URLParser::shouldCopyFileURL(CodePointIterator<CharacterType> iterator) +{ + if (!isWindowsDriveLetter(iterator)) + return true; + if (iterator.atEnd()) + return false; + advance(iterator); + if (iterator.atEnd()) + return true; + advance(iterator); + if (iterator.atEnd()) + return true; + return !isSlashQuestionOrHash(*iterator); +} + +static void percentEncodeByte(uint8_t byte, Vector<LChar>& buffer) +{ + buffer.append('%'); + buffer.append(upperNibbleToASCIIHexDigit(byte)); + buffer.append(lowerNibbleToASCIIHexDigit(byte)); +} + +void URLParser::percentEncodeByte(uint8_t byte) +{ + ASSERT(m_didSeeSyntaxViolation); + appendToASCIIBuffer('%'); + appendToASCIIBuffer(upperNibbleToASCIIHexDigit(byte)); + appendToASCIIBuffer(lowerNibbleToASCIIHexDigit(byte)); +} + +const char replacementCharacterUTF8PercentEncoded[10] = "%EF%BF%BD"; +const size_t 
replacementCharacterUTF8PercentEncodedLength = sizeof(replacementCharacterUTF8PercentEncoded) - 1; + +template<bool(*isInCodeSet)(UChar32), typename CharacterType> +ALWAYS_INLINE void URLParser::utf8PercentEncode(const CodePointIterator<CharacterType>& iterator) +{ + ASSERT(!iterator.atEnd()); + UChar32 codePoint = *iterator; + if (LIKELY(isASCII(codePoint))) { + if (UNLIKELY(isInCodeSet(codePoint))) { + syntaxViolation(iterator); + percentEncodeByte(codePoint); + } else + appendToASCIIBuffer(codePoint); + return; + } + ASSERT_WITH_MESSAGE(isInCodeSet(codePoint), "isInCodeSet should always return true for non-ASCII characters"); + syntaxViolation(iterator); + + if (!U_IS_UNICODE_CHAR(codePoint)) { + appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength); + return; + } + + uint8_t buffer[U8_MAX_LENGTH]; + int32_t offset = 0; + U8_APPEND_UNSAFE(buffer, offset, codePoint); + for (int32_t i = 0; i < offset; ++i) + percentEncodeByte(buffer[i]); +} + +template<typename CharacterType> +ALWAYS_INLINE void URLParser::utf8QueryEncode(const CodePointIterator<CharacterType>& iterator) +{ + ASSERT(!iterator.atEnd()); + UChar32 codePoint = *iterator; + if (LIKELY(isASCII(codePoint))) { + if (UNLIKELY(shouldPercentEncodeQueryByte(codePoint))) { + syntaxViolation(iterator); + percentEncodeByte(codePoint); + } else + appendToASCIIBuffer(codePoint); + return; + } + + syntaxViolation(iterator); + + if (!U_IS_UNICODE_CHAR(codePoint)) { + appendToASCIIBuffer(replacementCharacterUTF8PercentEncoded, replacementCharacterUTF8PercentEncodedLength); + return; + } + + uint8_t buffer[U8_MAX_LENGTH]; + int32_t offset = 0; + U8_APPEND_UNSAFE(buffer, offset, codePoint); + for (int32_t i = 0; i < offset; ++i) { + auto byte = buffer[i]; + if (shouldPercentEncodeQueryByte(byte)) + percentEncodeByte(byte); + else + appendToASCIIBuffer(byte); + } +} + +template<typename CharacterType> +void URLParser::encodeQuery(const Vector<UChar>& source, const 
TextEncoding& encoding, CodePointIterator<CharacterType> iterator) +{ + // FIXME: It is unclear in the spec what to do when encoding fails. The behavior should be specified and tested. + CString encoded = encoding.encode(StringView(source.data(), source.size()), URLEncodedEntitiesForUnencodables); + const char* data = encoded.data(); + size_t length = encoded.length(); + + if (!length == !iterator.atEnd()) { + syntaxViolation(iterator); + return; + } + + size_t i = 0; + for (; i < length; ++i) { + ASSERT(!iterator.atEnd()); + uint8_t byte = data[i]; + if (UNLIKELY(byte != *iterator)) { + syntaxViolation(iterator); + break; + } + if (UNLIKELY(shouldPercentEncodeQueryByte(byte))) { + syntaxViolation(iterator); + break; + } + appendToASCIIBuffer(byte); + ++iterator; + } + while (!iterator.atEnd() && isTabOrNewline(*iterator)) + ++iterator; + ASSERT((i == length) == iterator.atEnd()); + for (; i < length; ++i) { + ASSERT(m_didSeeSyntaxViolation); + uint8_t byte = data[i]; + if (shouldPercentEncodeQueryByte(byte)) + percentEncodeByte(byte); + else + appendToASCIIBuffer(byte); + } +} + +std::optional<uint16_t> URLParser::defaultPortForProtocol(StringView scheme) +{ + static const uint16_t ftpPort = 21; + static const uint16_t gopherPort = 70; + static const uint16_t httpPort = 80; + static const uint16_t httpsPort = 443; + static const uint16_t wsPort = 80; + static const uint16_t wssPort = 443; + + auto length = scheme.length(); + if (!length) + return std::nullopt; + switch (scheme[0]) { + case 'w': + switch (length) { + case 2: + if (scheme[1] == 's') + return wsPort; + return std::nullopt; + case 3: + if (scheme[1] == 's' + && scheme[2] == 's') + return wssPort; + return std::nullopt; + default: + return false; + } + case 'h': + switch (length) { + case 4: + if (scheme[1] == 't' + && scheme[2] == 't' + && scheme[3] == 'p') + return httpPort; + return std::nullopt; + case 5: + if (scheme[1] == 't' + && scheme[2] == 't' + && scheme[3] == 'p' + && scheme[4] == 's') + 
return httpsPort; + return std::nullopt; + default: + return std::nullopt; + } + case 'g': + if (length == 6 + && scheme[1] == 'o' + && scheme[2] == 'p' + && scheme[3] == 'h' + && scheme[4] == 'e' + && scheme[5] == 'r') + return gopherPort; + return std::nullopt; + case 'f': + if (length == 3 + && scheme[1] == 't' + && scheme[2] == 'p') + return ftpPort; + return std::nullopt; + default: + return std::nullopt; + } +} + +enum class Scheme { + WS, + WSS, + File, + FTP, + Gopher, + HTTP, + HTTPS, + NonSpecial +}; + +ALWAYS_INLINE static Scheme scheme(StringView scheme) +{ + auto length = scheme.length(); + if (!length) + return Scheme::NonSpecial; + switch (scheme[0]) { + case 'f': + switch (length) { + case 3: + if (scheme[1] == 't' + && scheme[2] == 'p') + return Scheme::FTP; + return Scheme::NonSpecial; + case 4: + if (scheme[1] == 'i' + && scheme[2] == 'l' + && scheme[3] == 'e') + return Scheme::File; + return Scheme::NonSpecial; + default: + return Scheme::NonSpecial; + } + case 'g': + if (length == 6 + && scheme[1] == 'o' + && scheme[2] == 'p' + && scheme[3] == 'h' + && scheme[4] == 'e' + && scheme[5] == 'r') + return Scheme::Gopher; + return Scheme::NonSpecial; + case 'h': + switch (length) { + case 4: + if (scheme[1] == 't' + && scheme[2] == 't' + && scheme[3] == 'p') + return Scheme::HTTP; + return Scheme::NonSpecial; + case 5: + if (scheme[1] == 't' + && scheme[2] == 't' + && scheme[3] == 'p' + && scheme[4] == 's') + return Scheme::HTTPS; + return Scheme::NonSpecial; + default: + return Scheme::NonSpecial; + } + case 'w': + switch (length) { + case 2: + if (scheme[1] == 's') + return Scheme::WS; + return Scheme::NonSpecial; + case 3: + if (scheme[1] == 's' + && scheme[2] == 's') + return Scheme::WSS; + return Scheme::NonSpecial; + default: + return Scheme::NonSpecial; + } + default: + return Scheme::NonSpecial; + } +} + +enum class URLParser::URLPart { + SchemeEnd, + UserStart, + UserEnd, + PasswordEnd, + HostEnd, + PortEnd, + PathAfterLastSlash, + PathEnd, 
+ QueryEnd, + FragmentEnd, +}; + +size_t URLParser::urlLengthUntilPart(const URL& url, URLPart part) +{ + switch (part) { + case URLPart::FragmentEnd: + return url.m_fragmentEnd; + case URLPart::QueryEnd: + return url.m_queryEnd; + case URLPart::PathEnd: + return url.m_pathEnd; + case URLPart::PathAfterLastSlash: + return url.m_pathAfterLastSlash; + case URLPart::PortEnd: + return url.m_portEnd; + case URLPart::HostEnd: + return url.m_hostEnd; + case URLPart::PasswordEnd: + return url.m_passwordEnd; + case URLPart::UserEnd: + return url.m_userEnd; + case URLPart::UserStart: + return url.m_userStart; + case URLPart::SchemeEnd: + return url.m_schemeEnd; + } + ASSERT_NOT_REACHED(); + return 0; +} + +void URLParser::copyASCIIStringUntil(const String& string, size_t length) +{ + RELEASE_ASSERT(length <= string.length()); + if (string.isNull()) + return; + ASSERT(m_asciiBuffer.isEmpty()); + if (string.is8Bit()) + appendToASCIIBuffer(string.characters8(), length); + else { + const UChar* characters = string.characters16(); + for (size_t i = 0; i < length; ++i) { + UChar c = characters[i]; + ASSERT_WITH_SECURITY_IMPLICATION(isASCII(c)); + appendToASCIIBuffer(c); + } + } +} + +template<typename CharacterType> +void URLParser::copyURLPartsUntil(const URL& base, URLPart part, const CodePointIterator<CharacterType>& iterator, bool& isUTF8Encoding) +{ + syntaxViolation(iterator); + + m_asciiBuffer.clear(); + copyASCIIStringUntil(base.m_string, urlLengthUntilPart(base, part)); + switch (part) { + case URLPart::FragmentEnd: + RELEASE_ASSERT_NOT_REACHED(); + case URLPart::QueryEnd: + m_url.m_queryEnd = base.m_queryEnd; + FALLTHROUGH; + case URLPart::PathEnd: + m_url.m_pathEnd = base.m_pathEnd; + FALLTHROUGH; + case URLPart::PathAfterLastSlash: + m_url.m_pathAfterLastSlash = base.m_pathAfterLastSlash; + FALLTHROUGH; + case URLPart::PortEnd: + m_url.m_portEnd = base.m_portEnd; + FALLTHROUGH; + case URLPart::HostEnd: + m_url.m_hostEnd = base.m_hostEnd; + FALLTHROUGH; + case 
URLPart::PasswordEnd: + m_url.m_passwordEnd = base.m_passwordEnd; + FALLTHROUGH; + case URLPart::UserEnd: + m_url.m_userEnd = base.m_userEnd; + FALLTHROUGH; + case URLPart::UserStart: + m_url.m_userStart = base.m_userStart; + FALLTHROUGH; + case URLPart::SchemeEnd: + m_url.m_isValid = base.m_isValid; + m_url.m_protocolIsInHTTPFamily = base.m_protocolIsInHTTPFamily; + m_url.m_schemeEnd = base.m_schemeEnd; + } + switch (scheme(StringView(m_asciiBuffer.data(), m_url.m_schemeEnd))) { + case Scheme::WS: + case Scheme::WSS: + isUTF8Encoding = true; + m_urlIsSpecial = true; + return; + case Scheme::File: + m_urlIsFile = true; + FALLTHROUGH; + case Scheme::FTP: + case Scheme::Gopher: + case Scheme::HTTP: + case Scheme::HTTPS: + m_urlIsSpecial = true; + return; + case Scheme::NonSpecial: + m_urlIsSpecial = false; + isUTF8Encoding = true; + return; + } + ASSERT_NOT_REACHED(); +} + +static const char dotASCIICode[2] = {'2', 'e'}; + +template<typename CharacterType> +ALWAYS_INLINE bool URLParser::isSingleDotPathSegment(CodePointIterator<CharacterType> c) +{ + if (c.atEnd()) + return false; + if (*c == '.') { + advance<CharacterType, ReportSyntaxViolation::No>(c); + return c.atEnd() || isSlashQuestionOrHash(*c); + } + if (*c != '%') + return false; + advance<CharacterType, ReportSyntaxViolation::No>(c); + if (c.atEnd() || *c != dotASCIICode[0]) + return false; + advance<CharacterType, ReportSyntaxViolation::No>(c); + if (c.atEnd()) + return false; + if (toASCIILower(*c) == dotASCIICode[1]) { + advance<CharacterType, ReportSyntaxViolation::No>(c); + return c.atEnd() || isSlashQuestionOrHash(*c); + } + return false; +} + +template<typename CharacterType> +ALWAYS_INLINE bool URLParser::isDoubleDotPathSegment(CodePointIterator<CharacterType> c) +{ + if (c.atEnd()) + return false; + if (*c == '.') { + advance<CharacterType, ReportSyntaxViolation::No>(c); + return isSingleDotPathSegment(c); + } + if (*c != '%') + return false; + advance<CharacterType, ReportSyntaxViolation::No>(c); + 
if (c.atEnd() || *c != dotASCIICode[0]) + return false; + advance<CharacterType, ReportSyntaxViolation::No>(c); + if (c.atEnd()) + return false; + if (toASCIILower(*c) == dotASCIICode[1]) { + advance<CharacterType, ReportSyntaxViolation::No>(c); + return isSingleDotPathSegment(c); + } + return false; +} + +template<typename CharacterType> +void URLParser::consumeSingleDotPathSegment(CodePointIterator<CharacterType>& c) +{ + ASSERT(isSingleDotPathSegment(c)); + if (*c == '.') { + advance(c); + if (!c.atEnd()) { + if (*c == '/' || *c == '\\') + advance(c); + else + ASSERT(*c == '?' || *c == '#'); + } + } else { + ASSERT(*c == '%'); + advance(c); + ASSERT(*c == dotASCIICode[0]); + advance(c); + ASSERT(toASCIILower(*c) == dotASCIICode[1]); + advance(c); + if (!c.atEnd()) { + if (*c == '/' || *c == '\\') + advance(c); + else + ASSERT(*c == '?' || *c == '#'); + } + } +} + +template<typename CharacterType> +void URLParser::consumeDoubleDotPathSegment(CodePointIterator<CharacterType>& c) +{ + ASSERT(isDoubleDotPathSegment(c)); + if (*c == '.') + advance(c); + else { + ASSERT(*c == '%'); + advance(c); + ASSERT(*c == dotASCIICode[0]); + advance(c); + ASSERT(toASCIILower(*c) == dotASCIICode[1]); + advance(c); + } + consumeSingleDotPathSegment(c); +} + +bool URLParser::shouldPopPath(unsigned newPathAfterLastSlash) +{ + ASSERT(m_didSeeSyntaxViolation); + if (!m_urlIsFile) + return true; + + ASSERT(m_url.m_pathAfterLastSlash <= m_asciiBuffer.size()); + CodePointIterator<LChar> componentToPop(&m_asciiBuffer[newPathAfterLastSlash], &m_asciiBuffer[0] + m_url.m_pathAfterLastSlash); + if (newPathAfterLastSlash == m_url.m_portEnd + 1 && isWindowsDriveLetter(componentToPop)) + return false; + return true; +} + +void URLParser::popPath() +{ + ASSERT(m_didSeeSyntaxViolation); + if (m_url.m_pathAfterLastSlash > m_url.m_portEnd + 1) { + auto newPathAfterLastSlash = m_url.m_pathAfterLastSlash - 1; + if (m_asciiBuffer[newPathAfterLastSlash] == '/') + newPathAfterLastSlash--; + while 
(newPathAfterLastSlash > m_url.m_portEnd && m_asciiBuffer[newPathAfterLastSlash] != '/') + newPathAfterLastSlash--; + newPathAfterLastSlash++; + if (shouldPopPath(newPathAfterLastSlash)) + m_url.m_pathAfterLastSlash = newPathAfterLastSlash; + } + m_asciiBuffer.resize(m_url.m_pathAfterLastSlash); +} + +template<typename CharacterType> +void URLParser::syntaxViolation(const CodePointIterator<CharacterType>& iterator) +{ + if (m_didSeeSyntaxViolation) + return; + m_didSeeSyntaxViolation = true; + + ASSERT(m_asciiBuffer.isEmpty()); + size_t codeUnitsToCopy = iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin)); + RELEASE_ASSERT(codeUnitsToCopy <= m_inputString.length()); + m_asciiBuffer.reserveCapacity(m_inputString.length()); + for (size_t i = 0; i < codeUnitsToCopy; ++i) { + ASSERT(isASCII(m_inputString[i])); + m_asciiBuffer.uncheckedAppend(m_inputString[i]); + } +} + +void URLParser::failure() +{ + m_url.invalidate(); + m_url.m_string = m_inputString; +} + +template<typename CharacterType> +bool URLParser::checkLocalhostCodePoint(CodePointIterator<CharacterType>& iterator, UChar32 codePoint) +{ + if (iterator.atEnd() || toASCIILower(*iterator) != codePoint) + return false; + advance<CharacterType, ReportSyntaxViolation::No>(iterator); + return true; +} + +template<typename CharacterType> +bool URLParser::isAtLocalhost(CodePointIterator<CharacterType> iterator) +{ + if (!checkLocalhostCodePoint(iterator, 'l')) + return false; + if (!checkLocalhostCodePoint(iterator, 'o')) + return false; + if (!checkLocalhostCodePoint(iterator, 'c')) + return false; + if (!checkLocalhostCodePoint(iterator, 'a')) + return false; + if (!checkLocalhostCodePoint(iterator, 'l')) + return false; + if (!checkLocalhostCodePoint(iterator, 'h')) + return false; + if (!checkLocalhostCodePoint(iterator, 'o')) + return false; + if (!checkLocalhostCodePoint(iterator, 's')) + return false; + if (!checkLocalhostCodePoint(iterator, 't')) + return false; + return 
iterator.atEnd(); +} + +bool URLParser::isLocalhost(StringView view) +{ + if (view.is8Bit()) + return isAtLocalhost(CodePointIterator<LChar>(view.characters8(), view.characters8() + view.length())); + return isAtLocalhost(CodePointIterator<UChar>(view.characters16(), view.characters16() + view.length())); +} + +ALWAYS_INLINE StringView URLParser::parsedDataView(size_t start, size_t length) +{ + if (UNLIKELY(m_didSeeSyntaxViolation)) { + ASSERT(start + length <= m_asciiBuffer.size()); + return StringView(m_asciiBuffer.data() + start, length); + } + ASSERT(start + length <= m_inputString.length()); + return StringView(m_inputString).substring(start, length); +} + +ALWAYS_INLINE UChar URLParser::parsedDataView(size_t position) +{ + if (UNLIKELY(m_didSeeSyntaxViolation)) + return m_asciiBuffer[position]; + return m_inputString[position]; +} + +template<typename CharacterType> +ALWAYS_INLINE size_t URLParser::currentPosition(const CodePointIterator<CharacterType>& iterator) +{ + if (UNLIKELY(m_didSeeSyntaxViolation)) + return m_asciiBuffer.size(); + + return iterator.codeUnitsSince(reinterpret_cast<const CharacterType*>(m_inputBegin)); +} + +URLParser::URLParser(const String& input, const URL& base, const TextEncoding& encoding) + : m_inputString(input) +{ + if (input.isNull()) { + if (base.isValid() && !base.m_cannotBeABaseURL) { + m_url = base; + m_url.removeFragmentIdentifier(); + } + return; + } + + if (input.is8Bit()) { + m_inputBegin = input.characters8(); + parse(input.characters8(), input.length(), base, encoding); + } else { + m_inputBegin = input.characters16(); + parse(input.characters16(), input.length(), base, encoding); + } + + ASSERT(!m_url.m_isValid + || m_didSeeSyntaxViolation == (m_url.string() != input) + || (input.isAllSpecialCharacters<isC0ControlOrSpace>() + && m_url.m_string == base.m_string.left(base.m_queryEnd))); + ASSERT(internalValuesConsistent(m_url)); +#if !ASSERT_DISABLED + if (!m_didSeeSyntaxViolation) { + // Force a syntax violation at 
the beginning to make sure we get the same result. + URLParser parser(makeString(" ", input), base, encoding); + URL parsed = parser.result(); + if (parsed.isValid()) + ASSERT(allValuesEqual(parser.result(), m_url)); + } +#endif +} + +template<typename CharacterType> +void URLParser::parse(const CharacterType* input, const unsigned length, const URL& base, const TextEncoding& encoding) +{ + URL_PARSER_LOG("Parsing URL <%s> base <%s> encoding <%s>", String(input, length).utf8().data(), base.string().utf8().data(), encoding.name()); + m_url = { }; + ASSERT(m_asciiBuffer.isEmpty()); + + bool isUTF8Encoding = encoding == UTF8Encoding(); + Vector<UChar> queryBuffer; + + unsigned endIndex = length; + while (UNLIKELY(endIndex && isC0ControlOrSpace(input[endIndex - 1]))) { + syntaxViolation(CodePointIterator<CharacterType>(input, input)); + endIndex--; + } + CodePointIterator<CharacterType> c(input, input + endIndex); + CodePointIterator<CharacterType> authorityOrHostBegin; + CodePointIterator<CharacterType> queryBegin; + while (UNLIKELY(!c.atEnd() && isC0ControlOrSpace(*c))) { + syntaxViolation(c); + ++c; + } + auto beginAfterControlAndSpace = c; + + enum class State : uint8_t { + SchemeStart, + Scheme, + NoScheme, + SpecialRelativeOrAuthority, + PathOrAuthority, + Relative, + RelativeSlash, + SpecialAuthoritySlashes, + SpecialAuthorityIgnoreSlashes, + AuthorityOrHost, + Host, + File, + FileSlash, + FileHost, + PathStart, + Path, + CannotBeABaseURLPath, + UTF8Query, + NonUTF8Query, + Fragment, + }; + +#define LOG_STATE(x) URL_PARSER_LOG("State %s, code point %c, parsed data <%s> size %zu", x, *c, parsedDataView(0, currentPosition(c)).utf8().data(), currentPosition(c)) +#define LOG_FINAL_STATE(x) URL_PARSER_LOG("Final State: %s", x) + + State state = State::SchemeStart; + while (!c.atEnd()) { + if (UNLIKELY(isTabOrNewline(*c))) { + syntaxViolation(c); + ++c; + continue; + } + + switch (state) { + case State::SchemeStart: + LOG_STATE("SchemeStart"); + if (isASCIIAlpha(*c)) 
{ + if (UNLIKELY(isASCIIUpper(*c))) + syntaxViolation(c); + appendToASCIIBuffer(toASCIILower(*c)); + advance(c); + if (c.atEnd()) { + m_asciiBuffer.clear(); + state = State::NoScheme; + c = beginAfterControlAndSpace; + } + state = State::Scheme; + } else + state = State::NoScheme; + break; + case State::Scheme: + LOG_STATE("Scheme"); + if (isValidSchemeCharacter(*c)) { + if (UNLIKELY(isASCIIUpper(*c))) + syntaxViolation(c); + appendToASCIIBuffer(toASCIILower(*c)); + } else if (*c == ':') { + m_url.m_schemeEnd = currentPosition(c); + StringView urlScheme = parsedDataView(0, m_url.m_schemeEnd); + appendToASCIIBuffer(':'); + switch (scheme(urlScheme)) { + case Scheme::File: + m_urlIsSpecial = true; + m_urlIsFile = true; + state = State::File; + ++c; + break; + case Scheme::WS: + case Scheme::WSS: + isUTF8Encoding = true; + m_urlIsSpecial = true; + if (base.protocolIs(urlScheme)) + state = State::SpecialRelativeOrAuthority; + else + state = State::SpecialAuthoritySlashes; + ++c; + break; + case Scheme::HTTP: + case Scheme::HTTPS: + m_url.m_protocolIsInHTTPFamily = true; + FALLTHROUGH; + case Scheme::FTP: + case Scheme::Gopher: + m_urlIsSpecial = true; + if (base.protocolIs(urlScheme)) + state = State::SpecialRelativeOrAuthority; + else + state = State::SpecialAuthoritySlashes; + ++c; + break; + case Scheme::NonSpecial: + isUTF8Encoding = true; + auto maybeSlash = c; + advance(maybeSlash); + if (!maybeSlash.atEnd() && *maybeSlash == '/') { + appendToASCIIBuffer('/'); + c = maybeSlash; + state = State::PathOrAuthority; + ASSERT(*c == '/'); + ++c; + m_url.m_userStart = currentPosition(c); + } else { + ++c; + m_url.m_userStart = currentPosition(c); + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart; + m_url.m_cannotBeABaseURL = true; + state = State::CannotBeABaseURLPath; + } + break; + } + break; + } else { + 
m_asciiBuffer.clear(); + state = State::NoScheme; + c = beginAfterControlAndSpace; + break; + } + advance(c); + if (c.atEnd()) { + m_asciiBuffer.clear(); + state = State::NoScheme; + c = beginAfterControlAndSpace; + } + break; + case State::NoScheme: + LOG_STATE("NoScheme"); + if (!base.isValid() || (base.m_cannotBeABaseURL && *c != '#')) { + failure(); + return; + } + if (base.m_cannotBeABaseURL && *c == '#') { + copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding); + state = State::Fragment; + appendToASCIIBuffer('#'); + ++c; + break; + } + if (!base.protocolIs("file")) { + state = State::Relative; + break; + } + copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding); + appendToASCIIBuffer(':'); + state = State::File; + break; + case State::SpecialRelativeOrAuthority: + LOG_STATE("SpecialRelativeOrAuthority"); + if (*c == '/') { + appendToASCIIBuffer('/'); + advance(c); + if (c.atEnd()) { + failure(); + return; + } + if (*c == '/') { + appendToASCIIBuffer('/'); + state = State::SpecialAuthorityIgnoreSlashes; + ++c; + } else + state = State::RelativeSlash; + } else + state = State::Relative; + break; + case State::PathOrAuthority: + LOG_STATE("PathOrAuthority"); + if (*c == '/') { + appendToASCIIBuffer('/'); + state = State::AuthorityOrHost; + advance(c); + m_url.m_userStart = currentPosition(c); + authorityOrHostBegin = c; + } else { + ASSERT(parsedDataView(currentPosition(c) - 1) == '/'); + m_url.m_userStart = currentPosition(c) - 1; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + state = State::Path; + } + break; + case State::Relative: + LOG_STATE("Relative"); + switch (*c) { + case '/': + case '\\': + state = State::RelativeSlash; + ++c; + break; + case '?': + copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding); + appendToASCIIBuffer('?'); + ++c; + if (isUTF8Encoding) 
+ state = State::UTF8Query; + else { + queryBegin = c; + state = State::NonUTF8Query; + } + break; + case '#': + copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding); + appendToASCIIBuffer('#'); + state = State::Fragment; + ++c; + break; + default: + copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding); + if (currentPosition(c) && parsedDataView(currentPosition(c) - 1) != '/') { + appendToASCIIBuffer('/'); + m_url.m_pathAfterLastSlash = currentPosition(c); + } + state = State::Path; + break; + } + break; + case State::RelativeSlash: + LOG_STATE("RelativeSlash"); + if (*c == '/' || *c == '\\') { + ++c; + copyURLPartsUntil(base, URLPart::SchemeEnd, c, isUTF8Encoding); + appendToASCIIBuffer("://", 3); + if (m_urlIsSpecial) + state = State::SpecialAuthorityIgnoreSlashes; + else { + m_url.m_userStart = currentPosition(c); + state = State::AuthorityOrHost; + authorityOrHostBegin = c; + } + } else { + copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding); + appendToASCIIBuffer('/'); + m_url.m_pathAfterLastSlash = base.m_portEnd + 1; + state = State::Path; + } + break; + case State::SpecialAuthoritySlashes: + LOG_STATE("SpecialAuthoritySlashes"); + if (LIKELY(*c == '/' || *c == '\\')) { + if (UNLIKELY(*c == '\\')) + syntaxViolation(c); + appendToASCIIBuffer('/'); + advance(c); + if (LIKELY(!c.atEnd() && (*c == '/' || *c == '\\'))) { + if (UNLIKELY(*c == '\\')) + syntaxViolation(c); + ++c; + appendToASCIIBuffer('/'); + } else { + syntaxViolation(c); + appendToASCIIBuffer('/'); + } + } else { + syntaxViolation(c); + appendToASCIIBuffer("//", 2); + } + state = State::SpecialAuthorityIgnoreSlashes; + break; + case State::SpecialAuthorityIgnoreSlashes: + LOG_STATE("SpecialAuthorityIgnoreSlashes"); + if (*c == '/' || *c == '\\') { + syntaxViolation(c); + ++c; + } else { + m_url.m_userStart = currentPosition(c); + state = State::AuthorityOrHost; + authorityOrHostBegin = c; + } + break; + case State::AuthorityOrHost: + do { + 
LOG_STATE("AuthorityOrHost"); + if (*c == '@') { + auto lastAt = c; + auto findLastAt = c; + while (!findLastAt.atEnd()) { + URL_PARSER_LOG("Finding last @: %c", *findLastAt); + if (*findLastAt == '@') + lastAt = findLastAt; + bool isSlash = *findLastAt == '/' || (m_urlIsSpecial && *findLastAt == '\\'); + if (isSlash || *findLastAt == '?' || *findLastAt == '#') + break; + ++findLastAt; + } + parseAuthority(CodePointIterator<CharacterType>(authorityOrHostBegin, lastAt)); + c = lastAt; + advance(c); + authorityOrHostBegin = c; + state = State::Host; + m_hostHasPercentOrNonASCII = false; + break; + } + bool isSlash = *c == '/' || (m_urlIsSpecial && *c == '\\'); + if (isSlash || *c == '?' || *c == '#') { + auto iterator = CodePointIterator<CharacterType>(authorityOrHostBegin, c); + if (iterator.atEnd()) { + if (m_urlIsSpecial) + return failure(); + m_url.m_userEnd = currentPosition(c); + m_url.m_passwordEnd = m_url.m_userEnd; + m_url.m_hostEnd = m_url.m_userEnd; + m_url.m_portEnd = m_url.m_userEnd; + m_url.m_pathAfterLastSlash = m_url.m_userEnd; + } else { + m_url.m_userEnd = currentPosition(authorityOrHostBegin); + m_url.m_passwordEnd = m_url.m_userEnd; + if (!parseHostAndPort(iterator)) { + failure(); + return; + } + if (UNLIKELY(!isSlash)) { + if (m_urlIsSpecial) { + syntaxViolation(c); + appendToASCIIBuffer('/'); + } + m_url.m_pathAfterLastSlash = currentPosition(c); + } + } + state = State::Path; + break; + } + if (isPercentOrNonASCII(*c)) + m_hostHasPercentOrNonASCII = true; + ++c; + } while (!c.atEnd()); + break; + case State::Host: + do { + LOG_STATE("Host"); + if (*c == '/' || *c == '?' || *c == '#') { + if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) { + failure(); + return; + } + if (*c == '?' 
|| *c == '#') { + syntaxViolation(c); + appendToASCIIBuffer('/'); + m_url.m_pathAfterLastSlash = currentPosition(c); + } + state = State::Path; + break; + } + if (isPercentOrNonASCII(*c)) + m_hostHasPercentOrNonASCII = true; + ++c; + } while (!c.atEnd()); + break; + case State::File: + LOG_STATE("File"); + switch (*c) { + case '\\': + syntaxViolation(c); + FALLTHROUGH; + case '/': + appendToASCIIBuffer('/'); + state = State::FileSlash; + ++c; + break; + case '?': + syntaxViolation(c); + if (base.isValid() && base.protocolIs("file")) { + copyURLPartsUntil(base, URLPart::PathEnd, c, isUTF8Encoding); + appendToASCIIBuffer('?'); + ++c; + } else { + appendToASCIIBuffer("///?", 4); + ++c; + m_url.m_userStart = currentPosition(c) - 2; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + } + if (isUTF8Encoding) + state = State::UTF8Query; + else { + queryBegin = c; + state = State::NonUTF8Query; + } + break; + case '#': + syntaxViolation(c); + if (base.isValid() && base.protocolIs("file")) { + copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding); + appendToASCIIBuffer('#'); + } else { + appendToASCIIBuffer("///#", 4); + m_url.m_userStart = currentPosition(c) - 2; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + } + state = State::Fragment; + ++c; + break; + default: + syntaxViolation(c); + if (base.isValid() && base.protocolIs("file") && shouldCopyFileURL(c)) + copyURLPartsUntil(base, URLPart::PathAfterLastSlash, c, isUTF8Encoding); + else { + appendToASCIIBuffer("///", 3); + m_url.m_userStart = 
currentPosition(c) - 1; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + if (isWindowsDriveLetter(c)) + appendWindowsDriveLetter(c); + } + state = State::Path; + break; + } + break; + case State::FileSlash: + LOG_STATE("FileSlash"); + if (LIKELY(*c == '/' || *c == '\\')) { + if (UNLIKELY(*c == '\\')) + syntaxViolation(c); + appendToASCIIBuffer('/'); + advance(c); + m_url.m_userStart = currentPosition(c); + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + authorityOrHostBegin = c; + state = State::FileHost; + break; + } + syntaxViolation(c); + appendToASCIIBuffer("//", 2); + m_url.m_userStart = currentPosition(c) - 1; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + if (isWindowsDriveLetter(c)) { + appendWindowsDriveLetter(c); + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + } else if (copyBaseWindowsDriveLetter(base)) { + appendToASCIIBuffer('/'); + m_url.m_pathAfterLastSlash = m_url.m_userStart + 4; + } else + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + state = State::Path; + break; + case State::FileHost: + do { + LOG_STATE("FileHost"); + if (isSlashQuestionOrHash(*c)) { + bool windowsQuirk = takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c)) + && isWindowsDriveLetter(authorityOrHostBegin); + if (windowsQuirk) { + syntaxViolation(authorityOrHostBegin); + appendToASCIIBuffer('/'); + appendWindowsDriveLetter(authorityOrHostBegin); + } + if (windowsQuirk || authorityOrHostBegin == c) { + ASSERT(windowsQuirk || parsedDataView(currentPosition(c) - 1) == '/'); + if (UNLIKELY(*c == '?')) { + syntaxViolation(c); + 
appendToASCIIBuffer("/?", 2); + ++c; + if (isUTF8Encoding) + state = State::UTF8Query; + else { + queryBegin = c; + state = State::NonUTF8Query; + } + m_url.m_pathAfterLastSlash = currentPosition(c) - 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + break; + } + if (UNLIKELY(*c == '#')) { + syntaxViolation(c); + appendToASCIIBuffer("/#", 2); + ++c; + m_url.m_pathAfterLastSlash = currentPosition(c) - 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + state = State::Fragment; + break; + } + state = State::Path; + break; + } + if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) { + failure(); + return; + } + if (UNLIKELY(isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd)))) { + syntaxViolation(c); + m_asciiBuffer.shrink(m_url.m_passwordEnd); + m_url.m_hostEnd = currentPosition(c); + m_url.m_portEnd = m_url.m_hostEnd; + } + + state = State::PathStart; + break; + } + if (isPercentOrNonASCII(*c)) + m_hostHasPercentOrNonASCII = true; + ++c; + } while (!c.atEnd()); + break; + case State::PathStart: + LOG_STATE("PathStart"); + if (*c != '/' && *c != '\\') + ++c; + state = State::Path; + break; + case State::Path: + LOG_STATE("Path"); + if (*c == '/' || (m_urlIsSpecial && *c == '\\')) { + if (UNLIKELY(m_urlIsSpecial && *c == '\\')) + syntaxViolation(c); + appendToASCIIBuffer('/'); + ++c; + m_url.m_pathAfterLastSlash = currentPosition(c); + break; + } + if (UNLIKELY(currentPosition(c) && parsedDataView(currentPosition(c) - 1) == '/')) { + if (UNLIKELY(isDoubleDotPathSegment(c))) { + syntaxViolation(c); + consumeDoubleDotPathSegment(c); + popPath(); + break; + } + if (UNLIKELY(isSingleDotPathSegment(c))) { + syntaxViolation(c); + consumeSingleDotPathSegment(c); + break; + } + } + if (*c == '?') { + m_url.m_pathEnd = currentPosition(c); + appendToASCIIBuffer('?'); + ++c; + if (isUTF8Encoding) + state = State::UTF8Query; + else { + queryBegin = c; + state = 
State::NonUTF8Query; + } + break; + } + if (*c == '#') { + m_url.m_pathEnd = currentPosition(c); + m_url.m_queryEnd = m_url.m_pathEnd; + state = State::Fragment; + break; + } + utf8PercentEncode<isInDefaultEncodeSet>(c); + ++c; + break; + case State::CannotBeABaseURLPath: + LOG_STATE("CannotBeABaseURLPath"); + if (*c == '?') { + m_url.m_pathEnd = currentPosition(c); + appendToASCIIBuffer('?'); + ++c; + if (isUTF8Encoding) + state = State::UTF8Query; + else { + queryBegin = c; + state = State::NonUTF8Query; + } + } else if (*c == '#') { + m_url.m_pathEnd = currentPosition(c); + m_url.m_queryEnd = m_url.m_pathEnd; + state = State::Fragment; + } else if (*c == '/') { + appendToASCIIBuffer('/'); + ++c; + m_url.m_pathAfterLastSlash = currentPosition(c); + } else { + utf8PercentEncode<isInSimpleEncodeSet>(c); + ++c; + } + break; + case State::UTF8Query: + LOG_STATE("UTF8Query"); + ASSERT(queryBegin == CodePointIterator<CharacterType>()); + if (*c == '#') { + m_url.m_queryEnd = currentPosition(c); + state = State::Fragment; + break; + } + if (isUTF8Encoding) + utf8QueryEncode(c); + else + appendCodePoint(queryBuffer, *c); + ++c; + break; + case State::NonUTF8Query: + do { + LOG_STATE("NonUTF8Query"); + ASSERT(queryBegin != CodePointIterator<CharacterType>()); + if (*c == '#') { + encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c)); + m_url.m_queryEnd = currentPosition(c); + state = State::Fragment; + break; + } + appendCodePoint(queryBuffer, *c); + advance(c, queryBegin); + } while (!c.atEnd()); + break; + case State::Fragment: + URL_PARSER_LOG("State Fragment"); + utf8PercentEncode<isInSimpleEncodeSet>(c); + ++c; + break; + } + } + + switch (state) { + case State::SchemeStart: + LOG_FINAL_STATE("SchemeStart"); + if (!currentPosition(c) && base.isValid() && !base.m_cannotBeABaseURL) { + m_url = base; + m_url.removeFragmentIdentifier(); + return; + } + failure(); + return; + case State::Scheme: + LOG_FINAL_STATE("Scheme"); + failure(); + 
return; + case State::NoScheme: + LOG_FINAL_STATE("NoScheme"); + RELEASE_ASSERT_NOT_REACHED(); + case State::SpecialRelativeOrAuthority: + LOG_FINAL_STATE("SpecialRelativeOrAuthority"); + copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding); + m_url.m_fragmentEnd = m_url.m_queryEnd; + break; + case State::PathOrAuthority: + LOG_FINAL_STATE("PathOrAuthority"); + ASSERT(m_url.m_userStart); + ASSERT(m_url.m_userStart == currentPosition(c)); + ASSERT(parsedDataView(currentPosition(c) - 1) == '/'); + m_url.m_userStart--; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash; + break; + case State::Relative: + LOG_FINAL_STATE("Relative"); + RELEASE_ASSERT_NOT_REACHED(); + case State::RelativeSlash: + LOG_FINAL_STATE("RelativeSlash"); + copyURLPartsUntil(base, URLPart::PortEnd, c, isUTF8Encoding); + appendToASCIIBuffer('/'); + m_url.m_pathAfterLastSlash = base.m_portEnd + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash; + break; + case State::SpecialAuthoritySlashes: + LOG_FINAL_STATE("SpecialAuthoritySlashes"); + m_url.m_userStart = currentPosition(c); + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart; + m_url.m_pathEnd = m_url.m_userStart; + m_url.m_queryEnd = m_url.m_userStart; + m_url.m_fragmentEnd = m_url.m_userStart; + break; + case State::SpecialAuthorityIgnoreSlashes: + LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes"); + failure(); + return; + case State::AuthorityOrHost: + 
LOG_FINAL_STATE("AuthorityOrHost"); + m_url.m_userEnd = currentPosition(authorityOrHostBegin); + m_url.m_passwordEnd = m_url.m_userEnd; + if (authorityOrHostBegin.atEnd()) { + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathEnd = m_url.m_userStart; + } else if (!parseHostAndPort(authorityOrHostBegin)) { + failure(); + return; + } else { + if (m_urlIsSpecial) { + syntaxViolation(c); + appendToASCIIBuffer('/'); + m_url.m_pathEnd = m_url.m_portEnd + 1; + } else + m_url.m_pathEnd = m_url.m_portEnd; + } + m_url.m_pathAfterLastSlash = m_url.m_pathEnd; + m_url.m_queryEnd = m_url.m_pathEnd; + m_url.m_fragmentEnd = m_url.m_pathEnd; + break; + case State::Host: + LOG_FINAL_STATE("Host"); + if (!parseHostAndPort(authorityOrHostBegin)) { + failure(); + return; + } + if (m_urlIsSpecial) { + syntaxViolation(c); + appendToASCIIBuffer('/'); + m_url.m_pathEnd = m_url.m_portEnd + 1; + } else + m_url.m_pathEnd = m_url.m_portEnd; + m_url.m_pathAfterLastSlash = m_url.m_pathEnd; + m_url.m_queryEnd = m_url.m_pathEnd; + m_url.m_fragmentEnd = m_url.m_pathEnd; + break; + case State::File: + LOG_FINAL_STATE("File"); + if (base.isValid() && base.protocolIs("file")) { + copyURLPartsUntil(base, URLPart::QueryEnd, c, isUTF8Encoding); + m_url.m_fragmentEnd = m_url.m_queryEnd; + break; + } + syntaxViolation(c); + appendToASCIIBuffer("///", 3); + m_url.m_userStart = currentPosition(c) - 1; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash; + break; + case State::FileSlash: + LOG_FINAL_STATE("FileSlash"); + syntaxViolation(c); + m_url.m_userStart = currentPosition(c) + 1; 
+ appendToASCIIBuffer("//", 2); + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + if (copyBaseWindowsDriveLetter(base)) { + appendToASCIIBuffer('/'); + m_url.m_pathAfterLastSlash = m_url.m_userStart + 4; + } else + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash; + break; + case State::FileHost: + LOG_FINAL_STATE("FileHost"); + if (takesTwoAdvancesUntilEnd(CodePointIterator<CharacterType>(authorityOrHostBegin, c)) + && isWindowsDriveLetter(authorityOrHostBegin)) { + syntaxViolation(authorityOrHostBegin); + appendToASCIIBuffer('/'); + appendWindowsDriveLetter(authorityOrHostBegin); + m_url.m_pathAfterLastSlash = currentPosition(c); + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash; + break; + } + + if (authorityOrHostBegin == c) { + syntaxViolation(c); + appendToASCIIBuffer('/'); + m_url.m_userStart = currentPosition(c) - 1; + m_url.m_userEnd = m_url.m_userStart; + m_url.m_passwordEnd = m_url.m_userStart; + m_url.m_hostEnd = m_url.m_userStart; + m_url.m_portEnd = m_url.m_userStart; + m_url.m_pathAfterLastSlash = m_url.m_userStart + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash; + break; + } + + if (!parseHostAndPort(CodePointIterator<CharacterType>(authorityOrHostBegin, c))) { + failure(); + return; + } + + syntaxViolation(c); + if (isLocalhost(parsedDataView(m_url.m_passwordEnd, currentPosition(c) - m_url.m_passwordEnd))) { + m_asciiBuffer.shrink(m_url.m_passwordEnd); + m_url.m_hostEnd = currentPosition(c); + m_url.m_portEnd = m_url.m_hostEnd; + } + appendToASCIIBuffer('/'); + m_url.m_pathAfterLastSlash = 
m_url.m_portEnd + 1; + m_url.m_pathEnd = m_url.m_pathAfterLastSlash; + m_url.m_queryEnd = m_url.m_pathAfterLastSlash; + m_url.m_fragmentEnd = m_url.m_pathAfterLastSlash; + break; + case State::PathStart: + LOG_FINAL_STATE("PathStart"); + RELEASE_ASSERT_NOT_REACHED(); + case State::Path: + LOG_FINAL_STATE("Path"); + m_url.m_pathEnd = currentPosition(c); + m_url.m_queryEnd = m_url.m_pathEnd; + m_url.m_fragmentEnd = m_url.m_pathEnd; + break; + case State::CannotBeABaseURLPath: + LOG_FINAL_STATE("CannotBeABaseURLPath"); + m_url.m_pathEnd = currentPosition(c); + m_url.m_queryEnd = m_url.m_pathEnd; + m_url.m_fragmentEnd = m_url.m_pathEnd; + break; + case State::UTF8Query: + LOG_FINAL_STATE("UTF8Query"); + ASSERT(queryBegin == CodePointIterator<CharacterType>()); + m_url.m_queryEnd = currentPosition(c); + m_url.m_fragmentEnd = m_url.m_queryEnd; + break; + case State::NonUTF8Query: + LOG_FINAL_STATE("NonUTF8Query"); + ASSERT(queryBegin != CodePointIterator<CharacterType>()); + encodeQuery(queryBuffer, encoding, CodePointIterator<CharacterType>(queryBegin, c)); + m_url.m_queryEnd = currentPosition(c); + m_url.m_fragmentEnd = m_url.m_queryEnd; + break; + case State::Fragment: + LOG_FINAL_STATE("Fragment"); + m_url.m_fragmentEnd = currentPosition(c); + break; + } + + if (LIKELY(!m_didSeeSyntaxViolation)) { + m_url.m_string = m_inputString; + ASSERT(m_asciiBuffer.isEmpty()); + } else + m_url.m_string = String::adopt(WTFMove(m_asciiBuffer)); + m_url.m_isValid = true; + URL_PARSER_LOG("Parsed URL <%s>", m_url.m_string.utf8().data()); +} + +template<typename CharacterType> +void URLParser::parseAuthority(CodePointIterator<CharacterType> iterator) +{ + if (UNLIKELY(iterator.atEnd())) { + syntaxViolation(iterator); + m_url.m_userEnd = currentPosition(iterator); + m_url.m_passwordEnd = m_url.m_userEnd; + return; + } + for (; !iterator.atEnd(); advance(iterator)) { + if (*iterator == ':') { + m_url.m_userEnd = currentPosition(iterator); + auto iteratorAtColon = iterator; + 
++iterator; + bool tabOrNewlineAfterColon = false; + while (UNLIKELY(!iterator.atEnd() && isTabOrNewline(*iterator))) { + tabOrNewlineAfterColon = true; + ++iterator; + } + if (UNLIKELY(iterator.atEnd())) { + syntaxViolation(iteratorAtColon); + m_url.m_passwordEnd = m_url.m_userEnd; + if (m_url.m_userEnd > m_url.m_userStart) + appendToASCIIBuffer('@'); + return; + } + if (tabOrNewlineAfterColon) + syntaxViolation(iteratorAtColon); + appendToASCIIBuffer(':'); + break; + } + utf8PercentEncode<isInUserInfoEncodeSet>(iterator); + } + for (; !iterator.atEnd(); advance(iterator)) + utf8PercentEncode<isInUserInfoEncodeSet>(iterator); + m_url.m_passwordEnd = currentPosition(iterator); + if (!m_url.m_userEnd) + m_url.m_userEnd = m_url.m_passwordEnd; + appendToASCIIBuffer('@'); +} + +template<typename UnsignedIntegerType> +void URLParser::appendNumberToASCIIBuffer(UnsignedIntegerType number) +{ + LChar buf[sizeof(UnsignedIntegerType) * 3 + 1]; + LChar* end = buf + WTF_ARRAY_LENGTH(buf); + LChar* p = end; + do { + *--p = (number % 10) + '0'; + number /= 10; + } while (number); + appendToASCIIBuffer(p, end - p); +} + +void URLParser::serializeIPv4(IPv4Address address) +{ + appendNumberToASCIIBuffer<uint8_t>(address >> 24); + appendToASCIIBuffer('.'); + appendNumberToASCIIBuffer<uint8_t>(address >> 16); + appendToASCIIBuffer('.'); + appendNumberToASCIIBuffer<uint8_t>(address >> 8); + appendToASCIIBuffer('.'); + appendNumberToASCIIBuffer<uint8_t>(address); +} + +static size_t zeroSequenceLength(const std::array<uint16_t, 8>& address, size_t begin) +{ + size_t end = begin; + for (; end < 8; end++) { + if (address[end]) + break; + } + return end - begin; +} + +static std::optional<size_t> findLongestZeroSequence(const std::array<uint16_t, 8>& address) +{ + std::optional<size_t> longest; + size_t longestLength = 0; + for (size_t i = 0; i < 8; i++) { + size_t length = zeroSequenceLength(address, i); + if (length) { + if (length > 1 && (!longest || longestLength < length)) { + 
longest = i; + longestLength = length; + } + i += length; + } + } + return longest; +} + +void URLParser::serializeIPv6Piece(uint16_t piece) +{ + bool printed = false; + if (auto nibble0 = piece >> 12) { + appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble0)); + printed = true; + } + auto nibble1 = piece >> 8 & 0xF; + if (printed || nibble1) { + appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble1)); + printed = true; + } + auto nibble2 = piece >> 4 & 0xF; + if (printed || nibble2) + appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(nibble2)); + appendToASCIIBuffer(lowerNibbleToLowercaseASCIIHexDigit(piece & 0xF)); +} + +void URLParser::serializeIPv6(URLParser::IPv6Address address) +{ + appendToASCIIBuffer('['); + auto compressPointer = findLongestZeroSequence(address); + for (size_t piece = 0; piece < 8; piece++) { + if (compressPointer && compressPointer.value() == piece) { + ASSERT(!address[piece]); + if (piece) + appendToASCIIBuffer(':'); + else + appendToASCIIBuffer("::", 2); + while (piece < 8 && !address[piece]) + piece++; + if (piece == 8) + break; + } + serializeIPv6Piece(address[piece]); + if (piece < 7) + appendToASCIIBuffer(':'); + } + appendToASCIIBuffer(']'); +} + +enum class URLParser::IPv4PieceParsingError { + Failure, + Overflow, +}; + +template<typename CharacterType> +Expected<uint32_t, URLParser::IPv4PieceParsingError> URLParser::parseIPv4Piece(CodePointIterator<CharacterType>& iterator, bool& didSeeSyntaxViolation) +{ + enum class State : uint8_t { + UnknownBase, + Decimal, + OctalOrHex, + Octal, + Hex, + }; + State state = State::UnknownBase; + Checked<uint32_t, RecordOverflow> value = 0; + if (!iterator.atEnd() && *iterator == '.') + return makeUnexpected(IPv4PieceParsingError::Failure); + while (!iterator.atEnd()) { + if (isTabOrNewline(*iterator)) { + didSeeSyntaxViolation = true; + ++iterator; + continue; + } + if (*iterator == '.') { + ASSERT(!value.hasOverflowed()); + return value.unsafeGet(); + } + switch 
(state) { + case State::UnknownBase: + if (UNLIKELY(*iterator == '0')) { + ++iterator; + state = State::OctalOrHex; + break; + } + state = State::Decimal; + break; + case State::OctalOrHex: + didSeeSyntaxViolation = true; + if (*iterator == 'x' || *iterator == 'X') { + ++iterator; + state = State::Hex; + break; + } + state = State::Octal; + break; + case State::Decimal: + if (!isASCIIDigit(*iterator)) + return makeUnexpected(IPv4PieceParsingError::Failure); + value *= 10; + value += *iterator - '0'; + if (UNLIKELY(value.hasOverflowed())) + return makeUnexpected(IPv4PieceParsingError::Overflow); + ++iterator; + break; + case State::Octal: + ASSERT(didSeeSyntaxViolation); + if (*iterator < '0' || *iterator > '7') + return makeUnexpected(IPv4PieceParsingError::Failure); + value *= 8; + value += *iterator - '0'; + if (UNLIKELY(value.hasOverflowed())) + return makeUnexpected(IPv4PieceParsingError::Overflow); + ++iterator; + break; + case State::Hex: + ASSERT(didSeeSyntaxViolation); + if (!isASCIIHexDigit(*iterator)) + return makeUnexpected(IPv4PieceParsingError::Failure); + value *= 16; + value += toASCIIHexValue(*iterator); + if (UNLIKELY(value.hasOverflowed())) + return makeUnexpected(IPv4PieceParsingError::Overflow); + ++iterator; + break; + } + } + ASSERT(!value.hasOverflowed()); + return value.unsafeGet(); +} + +ALWAYS_INLINE static uint64_t pow256(size_t exponent) +{ + RELEASE_ASSERT(exponent <= 4); + uint64_t values[5] = {1, 256, 256 * 256, 256 * 256 * 256, 256ull * 256 * 256 * 256 }; + return values[exponent]; +} + +enum class URLParser::IPv4ParsingError { + Failure, + NotIPv4, +}; + +template<typename CharacterTypeForSyntaxViolation, typename CharacterType> +Expected<URLParser::IPv4Address, URLParser::IPv4ParsingError> URLParser::parseIPv4Host(const CodePointIterator<CharacterTypeForSyntaxViolation>& iteratorForSyntaxViolationPosition, CodePointIterator<CharacterType> iterator) +{ + Vector<Expected<uint32_t, URLParser::IPv4PieceParsingError>, 4> items; + bool 
didSeeSyntaxViolation = false; + if (!iterator.atEnd() && *iterator == '.') + return makeUnexpected(IPv4ParsingError::NotIPv4); + while (!iterator.atEnd()) { + if (isTabOrNewline(*iterator)) { + didSeeSyntaxViolation = true; + ++iterator; + continue; + } + if (items.size() >= 4) + return makeUnexpected(IPv4ParsingError::NotIPv4); + items.append(parseIPv4Piece(iterator, didSeeSyntaxViolation)); + if (!iterator.atEnd() && *iterator == '.') { + ++iterator; + if (iterator.atEnd()) + syntaxViolation(iteratorForSyntaxViolationPosition); + else if (*iterator == '.') + return makeUnexpected(IPv4ParsingError::NotIPv4); + } + } + if (!iterator.atEnd() || !items.size() || items.size() > 4) + return makeUnexpected(IPv4ParsingError::NotIPv4); + for (const auto& item : items) { + if (!item.hasValue() && item.error() == IPv4PieceParsingError::Failure) + return makeUnexpected(IPv4ParsingError::NotIPv4); + } + for (const auto& item : items) { + if (!item.hasValue() && item.error() == IPv4PieceParsingError::Overflow) + return makeUnexpected(IPv4ParsingError::Failure); + } + if (items.size() > 1) { + for (size_t i = 0; i < items.size() - 1; i++) { + if (items[i].value() > 255) + return makeUnexpected(IPv4ParsingError::Failure); + } + } + if (items[items.size() - 1].value() >= pow256(5 - items.size())) + return makeUnexpected(IPv4ParsingError::Failure); + + if (didSeeSyntaxViolation) + syntaxViolation(iteratorForSyntaxViolationPosition); + for (const auto& item : items) { + if (item.value() > 255) + syntaxViolation(iteratorForSyntaxViolationPosition); + } + + if (UNLIKELY(items.size() != 4)) + syntaxViolation(iteratorForSyntaxViolationPosition); + + IPv4Address ipv4 = items.takeLast().value(); + for (size_t counter = 0; counter < items.size(); ++counter) + ipv4 += items[counter].value() * pow256(3 - counter); + return ipv4; +} + +template<typename CharacterType> +std::optional<uint32_t> URLParser::parseIPv4PieceInsideIPv6(CodePointIterator<CharacterType>& iterator) +{ + if 
(iterator.atEnd()) + return std::nullopt; + uint32_t piece = 0; + bool leadingZeros = false; + size_t digitCount = 0; + while (!iterator.atEnd()) { + if (!isASCIIDigit(*iterator)) + return std::nullopt; + ++digitCount; + /* A second zero while the value is still zero means more than one leading zero: reject. */ if (!piece && *iterator == '0') { + if (leadingZeros) + return std::nullopt; + leadingZeros = true; + } + /* NOTE(review): the next statement repeats the assignment already made just above, and digitCount is incremented but never read in this function. */ if (!piece && *iterator == '0') + leadingZeros = true; + piece = piece * 10 + *iterator - '0'; + if (piece > 255) + return std::nullopt; + advance<CharacterType, ReportSyntaxViolation::No>(iterator); + if (iterator.atEnd()) + break; + if (*iterator == '.') + break; + } + /* A nonzero value written with a leading zero is rejected. */ if (piece && leadingZeros) + return std::nullopt; + return piece; +} + /* Parses exactly four dot-separated pieces with parseIPv4PieceInsideIPv6 and packs them, first piece most significant, into an IPv4Address. Returns nullopt if a piece fails, a separating dot is missing, or input remains after the fourth piece. */ +template<typename CharacterType> +std::optional<URLParser::IPv4Address> URLParser::parseIPv4AddressInsideIPv6(CodePointIterator<CharacterType> iterator) +{ + IPv4Address address = 0; + for (size_t i = 0; i < 4; ++i) { + if (std::optional<uint32_t> piece = parseIPv4PieceInsideIPv6(iterator)) + address = (address << 8) + piece.value(); + else + return std::nullopt; + if (i < 3) { + if (iterator.atEnd()) + return std::nullopt; + if (*iterator != '.') + return std::nullopt; + advance<CharacterType, ReportSyntaxViolation::No>(iterator); + } else if (!iterator.atEnd()) + return std::nullopt; + } + ASSERT(iterator.atEnd()); + return address; +} + /* Parses an IPv6 literal whose range starts at the opening bracket (asserted) and returns its eight 16-bit pieces, or nullopt on malformed input. At most one double-colon compression is accepted, and a dotted IPv4 address may supply the final two pieces. Syntax violations are reported at the start of the host. */ +template<typename CharacterType> +std::optional<URLParser::IPv6Address> URLParser::parseIPv6Host(CodePointIterator<CharacterType> c) +{ + ASSERT(*c == '['); + const auto hostBegin = c; + advance(c, hostBegin); + if (c.atEnd()) + return std::nullopt; + + IPv6Address address = {{0, 0, 0, 0, 0, 0, 0, 0}}; + size_t piecePointer = 0; + std::optional<size_t> compressPointer; + /* A leading colon is only valid as the first half of a double colon. */ + if (*c == ':') { + advance(c, hostBegin); + if (c.atEnd()) + return std::nullopt; + if (*c != ':') + return std::nullopt; + advance(c, hostBegin); + ++piecePointer; + compressPointer = piecePointer; + } + + while (!c.atEnd()) { + if (piecePointer == 8) + return std::nullopt; + if (*c == ':') { + if
(compressPointer) + return std::nullopt; + advance(c, hostBegin); + ++piecePointer; + compressPointer = piecePointer; + continue; + } + /* An embedded dotted IPv4 address is tried only at piece 6, or earlier once a compression has been seen. If it does not parse, the text is read as a hex piece below. */ if (piecePointer == 6 || (compressPointer && piecePointer < 6)) { + if (std::optional<IPv4Address> ipv4Address = parseIPv4AddressInsideIPv6(c)) { + if (compressPointer && piecePointer == 5) + return std::nullopt; + syntaxViolation(hostBegin); + address[piecePointer++] = ipv4Address.value() >> 16; + address[piecePointer++] = ipv4Address.value() & 0xFFFF; + c = { }; + break; + } + } + /* Read up to four hex digits into one 16-bit piece. */ uint16_t value = 0; + size_t length = 0; + bool leadingZeros = false; + for (; length < 4; length++) { + if (c.atEnd()) + break; + if (!isASCIIHexDigit(*c)) + break; + if (isASCIIUpper(*c)) + syntaxViolation(hostBegin); + if (*c == '0' && !length) + leadingZeros = true; + value = value * 0x10 + toASCIIHexValue(*c); + advance(c, hostBegin); + } + /* Leading zeros and multi-digit zero pieces are accepted but reported as syntax violations. */ + if (UNLIKELY((value && leadingZeros) || (!value && length > 1))) + syntaxViolation(hostBegin); + + address[piecePointer++] = value; + if (c.atEnd()) + break; + if (piecePointer == 8 || *c != ':') + return std::nullopt; + advance(c, hostBegin); + } + + if (!c.atEnd()) + return std::nullopt; + /* With a compression, swap the pieces parsed after the double colon to the end of the address so the zeros sit at the compression point; without one, all eight pieces must have been given. */ + if (compressPointer) { + size_t swaps = piecePointer - compressPointer.value(); + piecePointer = 7; + while (swaps) + std::swap(address[piecePointer--], address[compressPointer.value() + swaps-- - 1]); + } else if (piecePointer != 8) + return std::nullopt; + /* Report a syntax violation when the compression in the input is not where findLongestZeroSequence (plus one) would put it. */ + std::optional<size_t> possibleCompressPointer = findLongestZeroSequence(address); + if (possibleCompressPointer) + possibleCompressPointer.value()++; + if (UNLIKELY(compressPointer != possibleCompressPointer)) + syntaxViolation(hostBegin); + + return address; +} + /* Decodes percent triplets (a percent sign followed by two hex digits) in input into single bytes and copies every other byte unchanged. Each decoded triplet is reported as a syntax violation at iteratorForSyntaxViolationPosition. */ +template<typename CharacterType> +Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition) +{ + Vector<LChar, defaultInlineBufferSize> output; +
output.reserveInitialCapacity(length); + /* Decoding never produces more bytes than it consumes, so uncheckedAppend stays within the reserved capacity. */ + for (size_t i = 0; i < length; ++i) { + uint8_t byte = input[i]; + if (byte != '%') + output.uncheckedAppend(byte); + /* A triplet needs two more bytes; the length > 2 guard keeps the unsigned length - 2 from wrapping. */ else if (length > 2 && i < length - 2) { + if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) { + syntaxViolation(iteratorForSyntaxViolationPosition); + output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2])); + i += 2; + } else + output.uncheckedAppend(byte); + } else + output.uncheckedAppend(byte); + } + return output; +} + /* Same decoding as the three-argument percentDecode overload, without any syntax violation reporting. */ +Vector<LChar, URLParser::defaultInlineBufferSize> URLParser::percentDecode(const LChar* input, size_t length) +{ + Vector<LChar, defaultInlineBufferSize> output; + output.reserveInitialCapacity(length); + + for (size_t i = 0; i < length; ++i) { + uint8_t byte = input[i]; + if (byte != '%') + output.uncheckedAppend(byte); + else if (length > 2 && i < length - 2) { + if (isASCIIHexDigit(input[i + 1]) && isASCIIHexDigit(input[i + 2])) { + output.uncheckedAppend(toASCIIHexValue(input[i + 1], input[i + 2])); + i += 2; + } else + output.uncheckedAppend(byte); + } else + output.uncheckedAppend(byte); + } + return output; +} + /* Returns whether every character of string is ASCII, using the 8-bit or 16-bit character buffer as appropriate. The string must not be null (asserted). */ +ALWAYS_INLINE static bool containsOnlyASCII(const String& string) +{ + ASSERT(!string.isNull()); + if (string.is8Bit()) + return charactersAreAllASCII(string.characters8(), string.length()); + return charactersAreAllASCII(string.characters16(), string.length()); +} + /* Converts a host name to lowercase ASCII. All-ASCII input is lowercased character by character; any other input goes through uidna_nameToASCII into a fixed-size buffer. Returns nullopt when that conversion reports an error. A syntax violation is reported at iteratorForSyntaxViolationPosition when the result differs from the input. */ +template<typename CharacterType> +std::optional<Vector<LChar, URLParser::defaultInlineBufferSize>> URLParser::domainToASCII(const String& domain, const CodePointIterator<CharacterType>& iteratorForSyntaxViolationPosition) +{ + Vector<LChar, defaultInlineBufferSize> ascii; + if (containsOnlyASCII(domain)) { + size_t length = domain.length(); + if (domain.is8Bit()) { + const LChar* characters = domain.characters8(); + ascii.reserveInitialCapacity(length); + for (size_t i = 0; i < length; ++i) { + if (UNLIKELY(isASCIIUpper(characters[i]))) +
syntaxViolation(iteratorForSyntaxViolationPosition); + ascii.uncheckedAppend(toASCIILower(characters[i])); + } + } else { + /* Same lowercasing for the 16-bit representation; every character is ASCII here, so storing it in an LChar vector loses nothing. */ const UChar* characters = domain.characters16(); + ascii.reserveInitialCapacity(length); + for (size_t i = 0; i < length; ++i) { + if (UNLIKELY(isASCIIUpper(characters[i]))) + syntaxViolation(iteratorForSyntaxViolationPosition); + ascii.uncheckedAppend(toASCIILower(characters[i])); + } + } + return ascii; + } + /* Non-ASCII input: run the shared IDNA transcoder into a stack buffer of defaultInlineBufferSize code units. */ + UChar hostnameBuffer[defaultInlineBufferSize]; + UErrorCode error = U_ZERO_ERROR; + UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER; + int32_t numCharactersConverted = uidna_nameToASCII(&internationalDomainNameTranscoder(), StringView(domain).upconvertedCharacters(), domain.length(), hostnameBuffer, defaultInlineBufferSize, &processingDetails, &error); + ASSERT(numCharactersConverted <= static_cast<int32_t>(defaultInlineBufferSize)); + /* Accept the result only if neither the call itself nor the per-name processing details flagged an error. */ + if (U_SUCCESS(error) && !processingDetails.errors) { + for (int32_t i = 0; i < numCharactersConverted; ++i) { + ASSERT(isASCII(hostnameBuffer[i])); + ASSERT(!isASCIIUpper(hostnameBuffer[i])); + } + ascii.append(hostnameBuffer, numCharactersConverted); + if (domain != StringView(ascii.data(), ascii.size())) + syntaxViolation(iteratorForSyntaxViolationPosition); + return ascii; + } + + // FIXME: Check for U_BUFFER_OVERFLOW_ERROR and retry with an allocated buffer.
+ return std::nullopt; +} + /* Returns true if any byte of asciiDomain satisfies isForbiddenHostCodePoint. */ +bool URLParser::hasForbiddenHostCodePoint(const Vector<LChar, URLParser::defaultInlineBufferSize>& asciiDomain) +{ + for (size_t i = 0; i < asciiDomain.size(); ++i) { + if (isForbiddenHostCodePoint(asciiDomain[i])) + return true; + } + return false; +} + /* Parses the port that follows the colon at iterator (asserted) and consumes the rest of the range. Decimal digits are accumulated while tabs and newlines are skipped as syntax violations; any other character, or a value above the uint16_t maximum, returns false. An empty port is accepted as a syntax violation. A port equal to the default for the scheme is left out of the serialization; otherwise a colon and the number are appended. Sets m_url.m_portEnd on success. */ +template<typename CharacterType> +bool URLParser::parsePort(CodePointIterator<CharacterType>& iterator) +{ + ASSERT(*iterator == ':'); + auto colonIterator = iterator; + advance(iterator, colonIterator); + uint32_t port = 0; + if (UNLIKELY(iterator.atEnd())) { + m_url.m_portEnd = currentPosition(colonIterator); + syntaxViolation(colonIterator); + return true; + } + size_t digitCount = 0; + bool leadingZeros = false; + for (; !iterator.atEnd(); ++iterator) { + if (UNLIKELY(isTabOrNewline(*iterator))) { + syntaxViolation(colonIterator); + continue; + } + if (isASCIIDigit(*iterator)) { + if (*iterator == '0' && !digitCount) + leadingZeros = true; + ++digitCount; + port = port * 10 + *iterator - '0'; + /* Checked after every digit, so the 32-bit accumulator cannot overflow. */ if (port > std::numeric_limits<uint16_t>::max()) + return false; + } else + return false; + } + /* Leading zeros and a zero written with several digits are accepted but reported. */ + if (port && leadingZeros) + syntaxViolation(colonIterator); + + if (!port && digitCount > 1) + syntaxViolation(colonIterator); + + ASSERT(port == static_cast<uint16_t>(port)); + if (UNLIKELY(defaultPortForProtocol(parsedDataView(0, m_url.m_schemeEnd)) == static_cast<uint16_t>(port))) + syntaxViolation(colonIterator); + else { + appendToASCIIBuffer(':'); + ASSERT(port <= std::numeric_limits<uint16_t>::max()); + appendNumberToASCIIBuffer<uint16_t>(static_cast<uint16_t>(port)); + } + + m_url.m_portEnd = currentPosition(iterator); + return true; +} + /* Parses the host and optional port in iterator, appends their serialized form, and sets m_url.m_hostEnd and m_url.m_portEnd. Returns false for an empty range, a leading colon, or an invalid host or port. Cases are handled in this order: bracketed IPv6 literal, host of a non-special URL, host without percent signs or non-ASCII characters, and finally hosts that need percent-decoding and domainToASCII. */ +template<typename CharacterType> +bool URLParser::parseHostAndPort(CodePointIterator<CharacterType> iterator) +{ + if (iterator.atEnd()) + return false; + if (*iterator == ':') + return false; + if (*iterator == '[') { + /* Find the closing bracket; a literal without one is rejected. */ auto ipv6End = iterator; + while (!ipv6End.atEnd() && *ipv6End != ']') + ++ipv6End; + if (ipv6End.atEnd()) + return false; + if (auto
address = parseIPv6Host(CodePointIterator<CharacterType>(iterator, ipv6End))) { + serializeIPv6(address.value()); + /* ipv6End sits on the closing bracket; step past it and parse a port if a colon follows. */ if (!ipv6End.atEnd()) { + advance(ipv6End); + if (!ipv6End.atEnd() && *ipv6End == ':') { + m_url.m_hostEnd = currentPosition(ipv6End); + return parsePort(ipv6End); + } + m_url.m_hostEnd = currentPosition(ipv6End); + m_url.m_portEnd = m_url.m_hostEnd; + return true; + } + m_url.m_hostEnd = currentPosition(ipv6End); + return true; + } + return false; + } + /* Non-special URLs: the host is percent-encoded with utf8PercentEncode and returned from this branch, so the IPv4 and domainToASCII handling below is not applied. A percent sign is the one forbidden host code point allowed here. */ + if (!m_urlIsSpecial) { + for (; !iterator.atEnd(); ++iterator) { + if (UNLIKELY(isTabOrNewline(*iterator))) { + syntaxViolation(iterator); + continue; + } + if (*iterator == ':') + break; + if (UNLIKELY(isForbiddenHostCodePoint(*iterator) && *iterator != '%')) + return false; + utf8PercentEncode<isInSimpleEncodeSet>(iterator); + } + m_url.m_hostEnd = currentPosition(iterator); + if (iterator.atEnd()) { + m_url.m_portEnd = currentPosition(iterator); + return true; + } + return parsePort(iterator); + } + /* Fast path taken when m_hostHasPercentOrNonASCII is false: this branch skips percentDecode and domainToASCII. The first loop only finds the end of the host and rejects forbidden code points. */ + if (LIKELY(!m_hostHasPercentOrNonASCII)) { + auto hostIterator = iterator; + for (; !iterator.atEnd(); ++iterator) { + if (isTabOrNewline(*iterator)) + continue; + if (*iterator == ':') + break; + if (isForbiddenHostCodePoint(*iterator)) + return false; + } + auto address = parseIPv4Host(hostIterator, CodePointIterator<CharacterType>(hostIterator, iterator)); + if (address) { + serializeIPv4(address.value()); + m_url.m_hostEnd = currentPosition(iterator); + if (iterator.atEnd()) { + m_url.m_portEnd = currentPosition(iterator); + return true; + } + return parsePort(iterator); + } + /* Failure rejects the URL; NotIPv4 falls through and the host is appended lowercased as a name. */ if (address.error() == IPv4ParsingError::Failure) + return false; + for (; hostIterator != iterator; ++hostIterator) { + if (UNLIKELY(isTabOrNewline(*hostIterator))) { + syntaxViolation(hostIterator); + continue; + } + if (UNLIKELY(isASCIIUpper(*hostIterator))) + syntaxViolation(hostIterator); + appendToASCIIBuffer(toASCIILower(*hostIterator)); + } + m_url.m_hostEnd = currentPosition(iterator); + if (!hostIterator.atEnd())
+ return parsePort(hostIterator); + m_url.m_portEnd = currentPosition(iterator); + return true; + } + /* Slow path: UTF-8 encode the host up to the colon, percent-decode it, convert it with domainToASCII, then try IPv4 parsing on the ASCII result before appending it as a name. */ + const auto hostBegin = iterator; + + Vector<LChar, defaultInlineBufferSize> utf8Encoded; + for (; !iterator.atEnd(); ++iterator) { + if (UNLIKELY(isTabOrNewline(*iterator))) { + syntaxViolation(hostBegin); + continue; + } + if (*iterator == ':') + break; + if (UNLIKELY(!isASCII(*iterator))) + syntaxViolation(hostBegin); + + uint8_t buffer[U8_MAX_LENGTH]; + int32_t offset = 0; + UBool error = false; + U8_APPEND(buffer, offset, U8_MAX_LENGTH, *iterator, error); + ASSERT_WITH_SECURITY_IMPLICATION(offset <= static_cast<int32_t>(sizeof(buffer))); + // FIXME: Check error. + utf8Encoded.append(buffer, offset); + } + Vector<LChar, defaultInlineBufferSize> percentDecoded = percentDecode(utf8Encoded.data(), utf8Encoded.size(), hostBegin); + String domain = String::fromUTF8(percentDecoded.data(), percentDecoded.size()); + if (domain.isNull()) + return false; + if (domain != StringView(percentDecoded.data(), percentDecoded.size())) + syntaxViolation(hostBegin); + auto asciiDomain = domainToASCII(domain, hostBegin); + if (!asciiDomain || hasForbiddenHostCodePoint(asciiDomain.value())) + return false; + Vector<LChar, defaultInlineBufferSize>& asciiDomainValue = asciiDomain.value(); + const LChar* asciiDomainCharacters = asciiDomainValue.data(); + + auto address = parseIPv4Host(hostBegin, CodePointIterator<LChar>(asciiDomainValue.begin(), asciiDomainValue.end())); + if (address) { + serializeIPv4(address.value()); + m_url.m_hostEnd = currentPosition(iterator); + if (iterator.atEnd()) { + m_url.m_portEnd = currentPosition(iterator); + return true; + } + return parsePort(iterator); + } + if (address.error() == IPv4ParsingError::Failure) + return false; + + appendToASCIIBuffer(asciiDomainCharacters, asciiDomainValue.size()); + m_url.m_hostEnd = currentPosition(iterator); + if (!iterator.atEnd()) + return parsePort(iterator); + m_url.m_portEnd = currentPosition(iterator); + return
true; +} + /* Converts input to UTF-8 with StrictConversion, percent-decodes the bytes, and returns them interpreted as UTF-8. Returns nullopt when the strict conversion yields a null result. */ +std::optional<String> URLParser::formURLDecode(StringView input) +{ + auto utf8 = input.utf8(StrictConversion); + if (utf8.isNull()) + return std::nullopt; + auto percentDecoded = percentDecode(reinterpret_cast<const LChar*>(utf8.data()), utf8.length()); + return String::fromUTF8(percentDecoded.data(), percentDecoded.size()); +} + /* Splits input on ampersands and each piece at its first equals sign, replaces plus signs with spaces, and decodes both halves with formURLDecode. A piece without an equals sign gets an empty value. Pieces whose name or value fails to decode are dropped. */ +// https://url.spec.whatwg.org/#concept-urlencoded-parser +auto URLParser::parseURLEncodedForm(StringView input) -> URLEncodedForm +{ + URLEncodedForm output; + for (StringView bytes : input.split('&')) { + auto equalIndex = bytes.find('='); + if (equalIndex == notFound) { + auto name = formURLDecode(bytes.toString().replace('+', 0x20)); + if (name) + output.append({ name.value(), emptyString() }); + } else { + auto name = formURLDecode(bytes.substring(0, equalIndex).toString().replace('+', 0x20)); + auto value = formURLDecode(bytes.substring(equalIndex + 1).toString().replace('+', 0x20)); + if (name && value) + output.append({ name.value(), value.value() }); + } + } + return output; +} + /* Appends the form-urlencoded bytes of input to output. A space becomes a plus sign; asterisk, hyphen, dot, underscore, ASCII digits and ASCII letters are copied; every other byte goes through percentEncodeByte. */ +static void serializeURLEncodedForm(const String& input, Vector<LChar>& output) +{ + auto utf8 = input.utf8(StrictConversion); + const char* data = utf8.data(); + for (size_t i = 0; i < utf8.length(); ++i) { + const char byte = data[i]; + if (byte == 0x20) + output.append(0x2B); + else if (byte == 0x2A + || byte == 0x2D + || byte == 0x2E + || (byte >= 0x30 && byte <= 0x39) + || (byte >= 0x41 && byte <= 0x5A) + || byte == 0x5F + || (byte >= 0x61 && byte <= 0x7A)) // FIXME: Put these in the characterClassTable to avoid branches.
+ output.append(byte); + else + percentEncodeByte(byte, output); + } +} + /* Serializes tuples as key=value pairs joined by ampersands, encoding each key and value with serializeURLEncodedForm. The separator is added only once output is non-empty, so there is no leading ampersand. */ +String URLParser::serialize(const URLEncodedForm& tuples) +{ + Vector<LChar> output; + for (auto& tuple : tuples) { + if (!output.isEmpty()) + output.append('&'); + serializeURLEncodedForm(tuple.key, output); + output.append('='); + serializeURLEncodedForm(tuple.value, output); + } + return String::adopt(WTFMove(output)); +} + /* Returns the shared UTS46 transcoder, opened on first use through std::call_once with the flags listed below. Failure to open it is a release assertion. The encoder is never closed in this function. */ +const UIDNA& URLParser::internationalDomainNameTranscoder() +{ + static UIDNA* encoder; + static std::once_flag onceFlag; + std::call_once(onceFlag, [] { + UErrorCode error = U_ZERO_ERROR; + encoder = uidna_openUTS46(UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_NONTRANSITIONAL_TO_UNICODE | UIDNA_NONTRANSITIONAL_TO_ASCII, &error); + RELEASE_ASSERT(U_SUCCESS(error)); + RELEASE_ASSERT(encoder); + }); + return *encoder; +} + /* Returns whether two URLs have the same string and the same parsed flags and offsets. Both sets of values are passed to URL_PARSER_LOG first. */ +bool URLParser::allValuesEqual(const URL& a, const URL& b) +{ + // FIXME: m_cannotBeABaseURL is not compared because the old URL::parse did not use it, + // but once we get rid of URL::parse its value should be tested.
+ URL_PARSER_LOG("%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s", + a.m_isValid, + a.m_protocolIsInHTTPFamily, + a.m_schemeEnd, + a.m_userStart, + a.m_userEnd, + a.m_passwordEnd, + a.m_hostEnd, + a.m_portEnd, + a.m_pathAfterLastSlash, + a.m_pathEnd, + a.m_queryEnd, + a.m_fragmentEnd, + a.m_string.utf8().data(), + b.m_isValid, + b.m_protocolIsInHTTPFamily, + b.m_schemeEnd, + b.m_userStart, + b.m_userEnd, + b.m_passwordEnd, + b.m_hostEnd, + b.m_portEnd, + b.m_pathAfterLastSlash, + b.m_pathEnd, + b.m_queryEnd, + b.m_fragmentEnd, + b.m_string.utf8().data()); + /* Equal only if the serialized string and every flag and offset logged above match. */ + return a.m_string == b.m_string + && a.m_isValid == b.m_isValid + && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily + && a.m_schemeEnd == b.m_schemeEnd + && a.m_userStart == b.m_userStart + && a.m_userEnd == b.m_userEnd + && a.m_passwordEnd == b.m_passwordEnd + && a.m_hostEnd == b.m_hostEnd + && a.m_portEnd == b.m_portEnd + && a.m_pathAfterLastSlash == b.m_pathAfterLastSlash + && a.m_pathEnd == b.m_pathEnd + && a.m_queryEnd == b.m_queryEnd + && a.m_fragmentEnd == b.m_fragmentEnd; +} + /* Checks that the stored component offsets are non-decreasing in the order scheme, user, password, host, port, path, query, fragment, and that m_fragmentEnd equals the string length for a valid URL and is zero for an invalid one. */ +bool URLParser::internalValuesConsistent(const URL& url) +{ + return url.m_schemeEnd <= url.m_userStart + && url.m_userStart <= url.m_userEnd + && url.m_userEnd <= url.m_passwordEnd + && url.m_passwordEnd <= url.m_hostEnd + && url.m_hostEnd <= url.m_portEnd + && url.m_portEnd <= url.m_pathAfterLastSlash + && url.m_pathAfterLastSlash <= url.m_pathEnd + && url.m_pathEnd <= url.m_queryEnd + && url.m_queryEnd <= url.m_fragmentEnd + && (url.m_isValid ? url.m_fragmentEnd == url.m_string.length() : !url.m_fragmentEnd); + // FIXME: Why do we even store m_fragmentEnd? + // It should be able to be deduced from m_isValid and m_string.length() to save memory. +} + +} // namespace WebCore |