diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-10-26 13:57:00 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-11-02 11:31:01 +0000 |
commit | 1943b3c2a1dcee36c233724fc4ee7613d71b9cf6 (patch) | |
tree | 8c1b5f12357025c197da5427ae02cfdc2f3570d6 /chromium/third_party/blink/renderer/modules/url_pattern | |
parent | 21ba0c5d4bf8fba15dddd97cd693bad2358b77fd (diff) | |
download | qtwebengine-chromium-1943b3c2a1dcee36c233724fc4ee7613d71b9cf6.tar.gz |
BASELINE: Update Chromium to 94.0.4606.111
Change-Id: I924781584def20fc800bedf6ff41fdb96c438193
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/blink/renderer/modules/url_pattern')
23 files changed, 2010 insertions, 916 deletions
diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/BUILD.gn b/chromium/third_party/blink/renderer/modules/url_pattern/BUILD.gn index 6bc2956e0c0..dd47096a003 100644 --- a/chromium/third_party/blink/renderer/modules/url_pattern/BUILD.gn +++ b/chromium/third_party/blink/renderer/modules/url_pattern/BUILD.gn @@ -2,12 +2,19 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +import("//testing/libfuzzer/fuzzer_test.gni") import("//third_party/blink/renderer/modules/modules.gni") blink_modules_sources("url_pattern") { sources = [ "url_pattern.cc", "url_pattern.h", + "url_pattern_canon.cc", + "url_pattern_canon.h", + "url_pattern_component.cc", + "url_pattern_component.h", + "url_pattern_parser.cc", + "url_pattern_parser.h", ] public_deps = [ @@ -15,3 +22,14 @@ blink_modules_sources("url_pattern") { "//third_party/liburlpattern", ] } + +if (use_libfuzzer) { + fuzzer_test("url_pattern_fuzzer") { + sources = [ "url_pattern_fuzzer.cc" ] + deps = [ + "//third_party/blink/renderer/modules:modules", + "//third_party/blink/renderer/platform:blink_fuzzer_test_support", + ] + seed_corpus = "fuzzer_seed_corpus" + } +} diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/DEPS b/chromium/third_party/blink/renderer/modules/url_pattern/DEPS index f5a16143eb0..35dd35a70ba 100644 --- a/chromium/third_party/blink/renderer/modules/url_pattern/DEPS +++ b/chromium/third_party/blink/renderer/modules/url_pattern/DEPS @@ -5,4 +5,6 @@ include_rules = [ "+base/strings/string_util.h", "+third_party/liburlpattern", + "+url/url_canon.h", + "+url/url_util.h", ] diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/2dcf128c70be5016986fa5965a89eb839fd6cc3c b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/2dcf128c70be5016986fa5965a89eb839fd6cc3c new file mode 100644 index 00000000000..6d1f1cad6fd --- /dev/null +++ 
b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/2dcf128c70be5016986fa5965a89eb839fd6cc3c @@ -0,0 +1 @@ +https://example.com/count/([0-9]+) diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/345433f6443349a932caefebc1754a7da500a885 b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/345433f6443349a932caefebc1754a7da500a885 new file mode 100644 index 00000000000..9388cc3a9d0 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/345433f6443349a932caefebc1754a7da500a885 @@ -0,0 +1 @@ +https://example.com/foo diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/43e53712d966badcd72516f9a9df30486173d8bc b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/43e53712d966badcd72516f9a9df30486173d8bc new file mode 100644 index 00000000000..ee9ccfb46eb --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/43e53712d966badcd72516f9a9df30486173d8bc @@ -0,0 +1 @@ +https://example.com::port?/foo diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/4eba2aa9b6632b032ad9affab48eed570c3a7bec b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/4eba2aa9b6632b032ad9affab48eed570c3a7bec new file mode 100644 index 00000000000..f71ae5546df --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/4eba2aa9b6632b032ad9affab48eed570c3a7bec @@ -0,0 +1 @@ +https://:user::pass@example.com/foo diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/60aff90a381901bfbb4fd3d1753a5a8687842821 b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/60aff90a381901bfbb4fd3d1753a5a8687842821 new file mode 100644 index 00000000000..25978ff030a --- /dev/null +++ 
b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/60aff90a381901bfbb4fd3d1753a5a8687842821 @@ -0,0 +1 @@ +https://example.com/:product diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/97cccba9cd38cd5138376093447ea6382a4df220 b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/97cccba9cd38cd5138376093447ea6382a4df220 new file mode 100644 index 00000000000..581fd125184 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/97cccba9cd38cd5138376093447ea6382a4df220 @@ -0,0 +1 @@ +http{s}?://example.com/foo diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/b0c2fea6b0fbc79ebcebf728cf1c76d4b3509121 b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/b0c2fea6b0fbc79ebcebf728cf1c76d4b3509121 new file mode 100644 index 00000000000..f9d3f896b75 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/b0c2fea6b0fbc79ebcebf728cf1c76d4b3509121 @@ -0,0 +1 @@ +https://example.com/count/:value([0-9]+) diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/d376eb4568f7c9ab01409838be90c31db1a9e755 b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/d376eb4568f7c9ab01409838be90c31db1a9e755 new file mode 100644 index 00000000000..db759a3cbf4 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/d376eb4568f7c9ab01409838be90c31db1a9e755 @@ -0,0 +1 @@ +https://{:sub.}?example.com/:product?/index.html diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/dc9e31b18d4686a0f8dec64b5602d5a426ab0f44 b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/dc9e31b18d4686a0f8dec64b5602d5a426ab0f44 new file mode 100644 index 00000000000..f9354e7b6d0 --- /dev/null +++ 
b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/dc9e31b18d4686a0f8dec64b5602d5a426ab0f44 @@ -0,0 +1 @@ +https://example.com/foo?bar#baz diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/ecac354bd05b4e6328a498c43291ad2e129134fc b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/ecac354bd05b4e6328a498c43291ad2e129134fc new file mode 100644 index 00000000000..7d615d8c3b2 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/fuzzer_seed_corpus/ecac354bd05b4e6328a498c43291ad2e129134fc @@ -0,0 +1 @@ +https://{*.}example.com/foo diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/idls.gni b/chromium/third_party/blink/renderer/modules/url_pattern/idls.gni deleted file mode 100644 index 9f0ad7e912f..00000000000 --- a/chromium/third_party/blink/renderer/modules/url_pattern/idls.gni +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2020 The Chromium Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. 
- -modules_idl_files = [ "url_pattern.idl" ] - -modules_dictionary_idl_files = [ - "url_pattern_component_result.idl", - "url_pattern_init.idl", - "url_pattern_result.idl", -] - -modules_dependency_idl_files = [] diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.cc b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.cc index e0e2f943dfb..3abf9254028 100644 --- a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.cc +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.cc @@ -6,10 +6,13 @@ #include "base/strings/string_util.h" #include "third_party/blink/renderer/bindings/core/v8/script_regexp.h" -#include "third_party/blink/renderer/bindings/modules/v8/usv_string_or_url_pattern_init.h" #include "third_party/blink/renderer/bindings/modules/v8/v8_union_urlpatterninit_usvstring.h" #include "third_party/blink/renderer/bindings/modules/v8/v8_url_pattern_component_result.h" +#include "third_party/blink/renderer/bindings/modules/v8/v8_url_pattern_init.h" #include "third_party/blink/renderer/bindings/modules/v8/v8_url_pattern_result.h" +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_canon.h" +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_component.h" +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_parser.h" #include "third_party/blink/renderer/platform/bindings/exception_state.h" #include "third_party/blink/renderer/platform/weborigin/kurl.h" #include "third_party/blink/renderer/platform/weborigin/security_origin.h" @@ -17,88 +20,15 @@ #include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h" #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" #include "third_party/liburlpattern/pattern.h" +#include "third_party/liburlpattern/tokenize.h" namespace blink { -// A struct representing all the information needed to match a particular -// component of a URL. 
-class URLPattern::Component final - : public GarbageCollected<URLPattern::Component> { - public: - bool Match(StringView input, Vector<String>* group_list) const { - return regexp->Match(input, /*start_from=*/0, /*match_length=*/nullptr, - group_list) == 0; - } - - void Trace(Visitor* visitor) const { visitor->Trace(regexp); } - - // The parsed pattern. - liburlpattern::Pattern pattern; - - // The pattern compiled down to a js regular expression. - Member<ScriptRegexp> regexp; - - // The names to be applied to the regular expression capture groups. Note, - // liburlpattern regular expressions do not use named capture groups directly. - Vector<String> name_list; - - Component(liburlpattern::Pattern p, ScriptRegexp* r, Vector<String> n) - : pattern(p), regexp(r), name_list(std::move(n)) {} -}; +using url_pattern::Component; +using url_pattern::ValueType; namespace { -// The default pattern string for components that are not specified in the -// URLPattern constructor. -const char* kDefaultPattern = "*"; - -// The liburlpattern::Options to use for most component patterns. We -// default to strict mode and case sensitivity. In addition, most -// components have no concept of a delimiter or prefix character. -const liburlpattern::Options& DefaultOptions() { - DEFINE_THREAD_SAFE_STATIC_LOCAL(liburlpattern::Options, options, - ({.delimiter_list = "", - .prefix_list = "", - .sensitive = true, - .strict = true})); - return options; -} - -// The liburlpattern::Options to use for hostname patterns. This uses a -// "." delimiter controlling how far a named group like ":bar" will match -// by default. Note, hostnames are case insensitive but we require case -// sensitivity here. This assumes that the hostname values have already -// been normalized to lower case as in URL(). 
-const liburlpattern::Options& HostnameOptions() { - DEFINE_STATIC_LOCAL(liburlpattern::Options, options, - ({.delimiter_list = ".", - .prefix_list = "", - .sensitive = true, - .strict = true})); - return options; -} - -// The liburlpattern::Options to use for pathname patterns. This uses a -// "/" delimiter controlling how far a named group like ":bar" will match -// by default. It also configures "/" to be treated as an automatic -// prefix before groups. -const liburlpattern::Options& PathnameOptions() { - DEFINE_STATIC_LOCAL(liburlpattern::Options, options, - ({.delimiter_list = "/", - .prefix_list = "/", - .sensitive = true, - .strict = true})); - return options; -} - -// An enum indicating whether the associated component values be operated -// on are for patterns or URLs. Validation and canonicalization will -// do different things depending on the type. -enum class ValueType { - kPattern, - kURL, -}; - // Utility function to determine if a pathname is absolute or not. For // kURL values this mainly consists of a check for a leading slash. For // patterns we do some additional checking for escaped or grouped slashes. @@ -128,213 +58,6 @@ bool IsAbsolutePathname(const String& pathname, ValueType type) { return false; } -String StringFromCanonOutput(const url::CanonOutput& output, - const url::Component& component) { - return String::FromUTF8(output.data() + component.begin, component.len); -} - -std::string StdStringFromCanonOutput(const url::CanonOutput& output, - const url::Component& component) { - return std::string(output.data() + component.begin, component.len); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the protocol component. 
-absl::StatusOr<std::string> ProtocolEncodeCallback(absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component component; - - bool result = url::CanonicalizeScheme( - input.data(), url::Component(0, static_cast<int>(input.size())), - &canon_output, &component); - - if (!result) { - return absl::InvalidArgumentError("Invalid protocol '" + - std::string(input) + "'."); - } - - return StdStringFromCanonOutput(canon_output, component); -} - -// Utility function to canonicalize a protocol string. Throws an exception -// if the input is invalid. The canonicalization and/or validation will -// differ depending on whether |type| is kURL or kPattern. -String CanonicalizeProtocol(const String& input, - ValueType type, - ExceptionState& exception_state) { - if (type == ValueType::kPattern) { - // Canonicalization for patterns is handled during compilation via - // encoding callbacks. - return input; - } - - bool result = false; - url::RawCanonOutputT<char> canon_output; - url::Component component; - if (input.Is8Bit()) { - StringUTF8Adaptor utf8(input); - result = url::CanonicalizeScheme( - utf8.data(), url::Component(0, utf8.size()), &canon_output, &component); - } else { - result = url::CanonicalizeScheme(input.Characters16(), - url::Component(0, input.length()), - &canon_output, &component); - } - - if (!result) { - exception_state.ThrowTypeError("Invalid protocol '" + input + "'."); - return String(); - } - - return StringFromCanonOutput(canon_output, component); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the username component. 
-absl::StatusOr<std::string> UsernameEncodeCallback(absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component username_component; - url::Component password_component; - - bool result = url::CanonicalizeUserInfo( - input.data(), url::Component(0, static_cast<int>(input.size())), "", - url::Component(0, 0), &canon_output, &username_component, - &password_component); - - if (!result) { - return absl::InvalidArgumentError("Invalid username pattern '" + - std::string(input) + "'."); - } - - return StdStringFromCanonOutput(canon_output, username_component); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the password component. -absl::StatusOr<std::string> PasswordEncodeCallback(absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component username_component; - url::Component password_component; - - bool result = url::CanonicalizeUserInfo( - "", url::Component(0, 0), input.data(), - url::Component(0, static_cast<int>(input.size())), &canon_output, - &username_component, &password_component); - - if (!result) { - return absl::InvalidArgumentError("Invalid password pattern '" + - std::string(input) + "'."); - } - - return StdStringFromCanonOutput(canon_output, password_component); -} - -// Utility function to canonicalize username and/or password strings. Throws -// an exception if either is invalid. The canonicalization and/or validation -// will differ depending on whether |type| is kURL or kPattern. On success -// |username_out| and |password_out| will contain the canonical values. 
-void CanonicalizeUsernameAndPassword(const String& username, - const String& password, - ValueType type, - String& username_out, - String& password_out, - ExceptionState& exception_state) { - if (type == ValueType::kPattern) { - // Canonicalization for patterns is handled during compilation via - // encoding callbacks. - username_out = username; - password_out = password; - return; - } - - bool result = false; - url::RawCanonOutputT<char> canon_output; - url::Component username_component; - url::Component password_component; - - if (username && password && username.Is8Bit() && password.Is8Bit()) { - StringUTF8Adaptor username_utf8(username); - StringUTF8Adaptor password_utf8(password); - result = url::CanonicalizeUserInfo( - username_utf8.data(), url::Component(0, username_utf8.size()), - password_utf8.data(), url::Component(0, password_utf8.size()), - &canon_output, &username_component, &password_component); - - } else { - String username16(username); - String password16(password); - username16.Ensure16Bit(); - password16.Ensure16Bit(); - result = url::CanonicalizeUserInfo( - username16.Characters16(), url::Component(0, username16.length()), - password16.Characters16(), url::Component(0, password16.length()), - &canon_output, &username_component, &password_component); - } - - if (!result) { - exception_state.ThrowTypeError("Invalid username '" + username + - "' and/or password '" + password + "'."); - return; - } - - if (username_component.len != -1) - username_out = StringFromCanonOutput(canon_output, username_component); - if (password_component.len != -1) - password_out = StringFromCanonOutput(canon_output, password_component); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the hostname component. 
-absl::StatusOr<std::string> HostnameEncodeCallback(absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component component; - - bool result = url::CanonicalizeHost( - input.data(), url::Component(0, static_cast<int>(input.size())), - &canon_output, &component); - - if (!result) { - return absl::InvalidArgumentError("Invalid hostname pattern '" + - std::string(input) + "'."); - } - - return StdStringFromCanonOutput(canon_output, component); -} - -// Utility function to canonicalize a hostname string. Throws an exception -// if the input is invalid. The canonicalization and/or validation will -// differ depending on whether |type| is kURL or kPattern. -String CanonicalizeHostname(const String& input, - ValueType type, - ExceptionState& exception_state) { - if (type == ValueType::kPattern) { - // Canonicalization for patterns is handled during compilation via - // encoding callbacks. - return input; - } - - bool success = false; - String result = SecurityOrigin::CanonicalizeHost(input, &success); - if (!success) { - exception_state.ThrowTypeError("Invalid hostname '" + input + "'."); - return String(); - } - - return result; -} - // Utility function to determine if the default port for the given protocol // matches the given port number. bool IsProtocolDefaultPort(const String& protocol, const String& port) { @@ -353,260 +76,6 @@ bool IsProtocolDefaultPort(const String& protocol, const String& port) { return default_port != url::PORT_UNSPECIFIED && default_port == port_number; } -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the port component. 
-absl::StatusOr<std::string> PortEncodeCallback(absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component component; - - bool result = url::CanonicalizePort( - input.data(), url::Component(0, static_cast<int>(input.size())), - url::PORT_UNSPECIFIED, &canon_output, &component); - - if (!result) { - return absl::InvalidArgumentError("Invalid port pattern '" + - std::string(input) + "'."); - } - - return StdStringFromCanonOutput(canon_output, component); -} - -// Utility function to canonicalize a port string. Throws an exception -// if the input is invalid. The canonicalization and/or validation will -// differ depending on whether |type| is kURL or kPattern. The |protocol| -// must be provided in order to handle default ports correctly. -String CanonicalizePort(const String& input, - ValueType type, - const String& protocol, - ExceptionState& exception_state) { - if (type == ValueType::kPattern) { - // Canonicalization for patterns is handled during compilation via - // encoding callbacks. - return input; - } - - int default_port = url::PORT_UNSPECIFIED; - if (!input.IsEmpty()) { - StringUTF8Adaptor protocol_utf8(protocol); - default_port = - url::DefaultPortForScheme(protocol_utf8.data(), protocol_utf8.size()); - } - - // Since ports only consist of digits there should be no encoding needed. - // Therefore we directly use the UTF8 encoding version of CanonicalizePort(). - StringUTF8Adaptor utf8(input); - url::RawCanonOutputT<char> canon_output; - url::Component component; - if (!url::CanonicalizePort(utf8.data(), url::Component(0, utf8.size()), - default_port, &canon_output, &component)) { - exception_state.ThrowTypeError("Invalid port '" + input + "'."); - return String(); - } - - return component.len == -1 ? 
g_empty_string - : StringFromCanonOutput(canon_output, component); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the pathname component using "standard" URL -// behavior. -absl::StatusOr<std::string> StandardURLPathnameEncodeCallback( - absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component component; - - bool result = url::CanonicalizePartialPath( - input.data(), url::Component(0, static_cast<int>(input.size())), - &canon_output, &component); - - if (!result) { - return absl::InvalidArgumentError("Invalid pathname pattern '" + - std::string(input) + "'."); - } - - return StdStringFromCanonOutput(canon_output, component); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the pathname component using "path" URL -// behavior. This is like "cannot-be-a-base" URL behavior in the spec. -absl::StatusOr<std::string> PathURLPathnameEncodeCallback( - absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component component; - - url::CanonicalizePathURLPath( - input.data(), url::Component(0, static_cast<int>(input.size())), - &canon_output, &component); - - return StdStringFromCanonOutput(canon_output, component); -} - -// Utility function to canonicalize a pathname string. Throws an exception -// if the input is invalid. The canonicalization and/or validation will -// differ depending on whether |type| is kURL or kPattern. -String CanonicalizePathname(const String& protocol, - const String& input, - ValueType type, - ExceptionState& exception_state) { - if (type == ValueType::kPattern) { - // Canonicalization for patterns is handled during compilation via - // encoding callbacks. 
- return input; - } - - // Determine if we are using "standard" or "path" URL canonicalization - // for the pathname. In spec terms the "path" URL behavior corresponds - // to "cannot-be-a-base" URLs. We make this determination based on the - // protocol string since we cannot look at the number of slashes between - // components like the URL spec. If this is inadequate the developer - // can use the baseURL property to get more strict URL behavior. - // - // We default to "standard" URL behavior to match how the empty protocol - // string in the URLPattern constructor results in the pathname pattern - // getting "standard" URL canonicalization. - bool standard = false; - if (protocol.IsEmpty()) { - standard = true; - } else if (protocol.Is8Bit()) { - StringUTF8Adaptor utf8(protocol); - standard = url::IsStandard(utf8.data(), url::Component(0, utf8.size())); - } else { - standard = url::IsStandard(protocol.Characters16(), - url::Component(0, protocol.length())); - } - - // Do not enforce absolute pathnames here since we can't enforce it - // it consistently in the URLPattern constructor. This allows us to - // produce a match when the exact same fixed pathname string is passed - // to both the constructor and test()/exec(). Similarly, we use - // url::CanonicalizePartialPath() below instead of url::CanonicalizePath() - // to avoid pre-pending a slash at the start of the string. 
- - bool result = false; - url::RawCanonOutputT<char> canon_output; - url::Component component; - - const auto canonicalize_path = [&](const auto* data, int length) { - if (standard) { - return url::CanonicalizePartialPath(data, url::Component(0, length), - &canon_output, &component); - } - url::CanonicalizePathURLPath(data, url::Component(0, length), &canon_output, - &component); - return true; - }; - - if (input.Is8Bit()) { - StringUTF8Adaptor utf8(input); - result = canonicalize_path(utf8.data(), utf8.size()); - } else { - result = canonicalize_path(input.Characters16(), input.length()); - } - - if (!result) { - exception_state.ThrowTypeError("Invalid pathname '" + input + "'."); - return String(); - } - - return StringFromCanonOutput(canon_output, component); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the search component. -absl::StatusOr<std::string> SearchEncodeCallback(absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component component; - - url::CanonicalizeQuery(input.data(), - url::Component(0, static_cast<int>(input.size())), - /*converter=*/nullptr, &canon_output, &component); - - return StdStringFromCanonOutput(canon_output, component); -} - -// Utility function to canonicalize a search string. Throws an exception -// if the input is invalid. The canonicalization and/or validation will -// differ depending on whether |type| is kURL or kPattern. -String CanonicalizeSearch(const String& input, - ValueType type, - ExceptionState& exception_state) { - if (type == ValueType::kPattern) { - // Canonicalization for patterns is handled during compilation via - // encoding callbacks. 
- return input; - } - - url::RawCanonOutputT<char> canon_output; - url::Component component; - if (input.Is8Bit()) { - StringUTF8Adaptor utf8(input); - url::CanonicalizeQuery(utf8.data(), url::Component(0, utf8.size()), - /*converter=*/nullptr, &canon_output, &component); - } else { - url::CanonicalizeQuery(input.Characters16(), - url::Component(0, input.length()), - /*converter=*/nullptr, &canon_output, &component); - } - - return StringFromCanonOutput(canon_output, component); -} - -// A callback to be passed to the liburlpattern::Parse() method that performs -// validation and encoding for the hash component. -absl::StatusOr<std::string> HashEncodeCallback(absl::string_view input) { - if (input.empty()) - return std::string(); - - url::RawCanonOutputT<char> canon_output; - url::Component component; - - url::CanonicalizeRef(input.data(), - url::Component(0, static_cast<int>(input.size())), - &canon_output, &component); - - return StdStringFromCanonOutput(canon_output, component); -} - -// Utility function to canonicalize a hash string. Throws an exception -// if the input is invalid. The canonicalization and/or validation will -// differ depending on whether |type| is kURL or kPattern. -String CanonicalizeHash(const String& input, - ValueType type, - ExceptionState& exception_state) { - if (type == ValueType::kPattern) { - // Canonicalization for patterns is handled during compilation via - // encoding callbacks. 
- return input; - } - - url::RawCanonOutputT<char> canon_output; - url::Component component; - if (input.Is8Bit()) { - StringUTF8Adaptor utf8(input); - url::CanonicalizeRef(utf8.data(), url::Component(0, utf8.size()), - &canon_output, &component); - } else { - url::CanonicalizeRef(input.Characters16(), - url::Component(0, input.length()), &canon_output, - &component); - } - - return StringFromCanonOutput(canon_output, component); -} - // A utility method that takes a URLPatternInit, splits it apart, and applies // the individual component values in the given set of strings. The strings // are only applied if a value is present in the init structure. @@ -643,33 +112,37 @@ void ApplyInit(const URLPatternInit* init, port = base_url.Port() > 0 ? String::Number(base_url.Port()) : g_empty_string; pathname = base_url.GetPath() ? base_url.GetPath() : g_empty_string; - - // Do no propagate search or hash from the base URL. This matches the - // behavior when resolving a relative URL against a base URL. + search = base_url.Query() ? base_url.Query() : g_empty_string; + hash = base_url.HasFragmentIdentifier() ? base_url.FragmentIdentifier() + : g_empty_string; } // Apply the URLPatternInit component values on top of the default and // baseURL values. if (init->hasProtocol()) { - protocol = CanonicalizeProtocol(init->protocol(), type, exception_state); + protocol = url_pattern::CanonicalizeProtocol(init->protocol(), type, + exception_state); if (exception_state.HadException()) return; } if (init->hasUsername() || init->hasPassword()) { String init_username = init->hasUsername() ? init->username() : String(); String init_password = init->hasPassword() ? 
init->password() : String(); - CanonicalizeUsernameAndPassword(init_username, init_password, type, - username, password, exception_state); + url_pattern::CanonicalizeUsernameAndPassword(init_username, init_password, + type, username, password, + exception_state); if (exception_state.HadException()) return; } if (init->hasHostname()) { - hostname = CanonicalizeHostname(init->hostname(), type, exception_state); + hostname = url_pattern::CanonicalizeHostname(init->hostname(), type, + exception_state); if (exception_state.HadException()) return; } if (init->hasPort()) { - port = CanonicalizePort(init->port(), type, protocol, exception_state); + port = url_pattern::CanonicalizePort(init->port(), type, protocol, + exception_state); if (exception_state.HadException()) return; } @@ -689,17 +162,19 @@ void ApplyInit(const URLPatternInit* init, pathname = base_url.GetPath().Substring(0, slash_index + 1) + pathname; } } - pathname = CanonicalizePathname(protocol, pathname, type, exception_state); + pathname = url_pattern::CanonicalizePathname(protocol, pathname, type, + exception_state); if (exception_state.HadException()) return; } if (init->hasSearch()) { - search = CanonicalizeSearch(init->search(), type, exception_state); + search = + url_pattern::CanonicalizeSearch(init->search(), type, exception_state); if (exception_state.HadException()) return; } if (init->hasHash()) { - hash = CanonicalizeHash(init->hash(), type, exception_state); + hash = url_pattern::CanonicalizeHash(init->hash(), type, exception_state); if (exception_state.HadException()) return; } @@ -707,7 +182,52 @@ void ApplyInit(const URLPatternInit* init, } // namespace +URLPattern* URLPattern::Create(const V8URLPatternInput* input, + const String& base_url, + ExceptionState& exception_state) { + if (input->GetContentType() == + V8URLPatternInput::ContentType::kURLPatternInit) { + exception_state.ThrowTypeError( + "Invalid second argument baseURL '" + base_url + + "' provided with a URLPatternInit input. 
Use the " + "URLPatternInit.baseURL property instead."); + return nullptr; + } + + const auto& input_string = input->GetAsUSVString(); + + url_pattern::Parser parser(input_string); + parser.Parse(exception_state); + if (exception_state.HadException()) + return nullptr; + + URLPatternInit* init = parser.GetResult(); + if (!base_url && !init->hasProtocol()) { + exception_state.ThrowTypeError( + "Relative constructor string '" + input_string + + "' must have a base URL passed as the second argument."); + return nullptr; + } + + if (base_url) + init->setBaseURL(base_url); + + return Create(init, parser.GetProtocolComponent(), exception_state); +} + +URLPattern* URLPattern::Create(const V8URLPatternInput* input, + ExceptionState& exception_state) { + if (input->IsURLPatternInit()) { + return URLPattern::Create(input->GetAsURLPatternInit(), + /*precomputed_protocol_component=*/nullptr, + exception_state); + } + + return Create(input, /*base_url=*/String(), exception_state); +} + URLPattern* URLPattern::Create(const URLPatternInit* init, + Component* precomputed_protocol_component, ExceptionState& exception_state) { // Each component defaults to a wildcard matching any input. We use // the null string as a shorthand for the default. @@ -734,80 +254,54 @@ URLPattern* URLPattern::Create(const URLPatternInit* init, if (IsProtocolDefaultPort(protocol, port)) port = ""; - // Compile each component pattern into a Component structure that can - // be used for matching. Components that match any input may have a - // nullptr Component struct pointer. + // Compile each component pattern into a Component structure that + // can be used for matching. 
- auto* protocol_component = - CompilePattern(protocol, "protocol", ProtocolEncodeCallback, - DefaultOptions(), exception_state); + auto* protocol_component = precomputed_protocol_component; + if (!protocol_component) { + protocol_component = + Component::Compile(protocol, Component::Type::kProtocol, + /*protocol_component=*/nullptr, exception_state); + } if (exception_state.HadException()) return nullptr; auto* username_component = - CompilePattern(username, "username", UsernameEncodeCallback, - DefaultOptions(), exception_state); + Component::Compile(username, Component::Type::kUsername, + protocol_component, exception_state); if (exception_state.HadException()) return nullptr; auto* password_component = - CompilePattern(password, "password", PasswordEncodeCallback, - DefaultOptions(), exception_state); + Component::Compile(password, Component::Type::kPassword, + protocol_component, exception_state); if (exception_state.HadException()) return nullptr; auto* hostname_component = - CompilePattern(hostname, "hostname", HostnameEncodeCallback, - HostnameOptions(), exception_state); + Component::Compile(hostname, Component::Type::kHostname, + protocol_component, exception_state); if (exception_state.HadException()) return nullptr; - auto* port_component = CompilePattern(port, "port", PortEncodeCallback, - DefaultOptions(), exception_state); + auto* port_component = Component::Compile( + port, Component::Type::kPort, protocol_component, exception_state); if (exception_state.HadException()) return nullptr; - // Different types of URLs use different canonicalization for pathname. - // A "standard" URL flattens `.`/`..` and performs full percent encoding. - // A "path" URL does not flatten and uses a more lax percent encoding. 
- // The spec calls "path" URLs as "cannot-be-a-base-URL" URLs: - // - // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state - // - // We prefer "standard" URL here by checking to see if the protocol - // pattern matches any of the known standard protocol strings. So - // an exact pattern of `http` will match, but so will `http{s}?` and - // `*`. - // - // If the protocol pattern does not match any of the known standard URL - // protocols then we fall back to the "path" URL behavior. This will - // normally be triggered by `data`, `javascript`, `about`, etc. It - // will also be triggered for custom protocol strings. We favor "path" - // behavior here because its better to under canonicalize since the - // developer can always manually canonicalize the pathname for a custom - // protocol. - // - // ShouldTreatAsStandardURL can by a bit expensive, so only do it if we - // actually have a pathname pattern to compile. - liburlpattern::EncodeCallback pathname_encode = PathURLPathnameEncodeCallback; - if (!pathname.IsNull() && ShouldTreatAsStandardURL(protocol_component)) { - pathname_encode = StandardURLPathnameEncodeCallback; - } - auto* pathname_component = - CompilePattern(pathname, "pathname", pathname_encode, PathnameOptions(), - exception_state); + Component::Compile(pathname, Component::Type::kPathname, + protocol_component, exception_state); if (exception_state.HadException()) return nullptr; - auto* search_component = - CompilePattern(search, "search", SearchEncodeCallback, DefaultOptions(), - exception_state); + auto* search_component = Component::Compile( + search, Component::Type::kSearch, protocol_component, exception_state); if (exception_state.HadException()) return nullptr; - auto* hash_component = CompilePattern(hash, "hash", HashEncodeCallback, - DefaultOptions(), exception_state); + auto* hash_component = Component::Compile( + hash, Component::Type::kHash, protocol_component, exception_state); if (exception_state.HadException()) return 
nullptr; @@ -836,32 +330,20 @@ URLPattern::URLPattern(Component* protocol, hash_(hash) {} bool URLPattern::test( -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const V8URLPatternInput* input, -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - const USVStringOrURLPatternInit& input, -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const String& base_url, ExceptionState& exception_state) const { return Match(input, base_url, /*result=*/nullptr, exception_state); } bool URLPattern::test( -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const V8URLPatternInput* input, -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - const USVStringOrURLPatternInit& input, -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) ExceptionState& exception_state) const { return test(input, /*base_url=*/String(), exception_state); } URLPatternResult* URLPattern::exec( -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const V8URLPatternInput* input, -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - const USVStringOrURLPatternInit& input, -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const String& base_url, ExceptionState& exception_state) const { URLPatternResult* result = URLPatternResult::Create(); @@ -871,69 +353,71 @@ URLPatternResult* URLPattern::exec( } URLPatternResult* URLPattern::exec( -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const V8URLPatternInput* input, -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - const USVStringOrURLPatternInit& input, -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) ExceptionState& exception_state) const { return exec(input, /*base_url=*/String(), exception_state); } String URLPattern::protocol() const { - if (!protocol_) - return kDefaultPattern; - std::string result = protocol_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return protocol_->GeneratePatternString(); } String URLPattern::username() const { - if (!username_) - return kDefaultPattern; - std::string result = 
username_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return username_->GeneratePatternString(); } String URLPattern::password() const { - if (!password_) - return kDefaultPattern; - std::string result = password_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return password_->GeneratePatternString(); } String URLPattern::hostname() const { - if (!hostname_) - return kDefaultPattern; - std::string result = hostname_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return hostname_->GeneratePatternString(); } String URLPattern::port() const { - if (!port_) - return kDefaultPattern; - std::string result = port_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return port_->GeneratePatternString(); } String URLPattern::pathname() const { - if (!pathname_) - return kDefaultPattern; - std::string result = pathname_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return pathname_->GeneratePatternString(); } String URLPattern::search() const { - if (!search_) - return kDefaultPattern; - std::string result = search_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return search_->GeneratePatternString(); } String URLPattern::hash() const { - if (!hash_) - return kDefaultPattern; - std::string result = hash_->pattern.GeneratePatternString(); - return String::FromUTF8(result); + return hash_->GeneratePatternString(); +} + +// static +int URLPattern::compareComponent(const V8URLPatternComponent& component, + const URLPattern* left, + const URLPattern* right) { + switch (component.AsEnum()) { + case V8URLPatternComponent::Enum::kProtocol: + return url_pattern::Component::Compare(*left->protocol_, + *right->protocol_); + case V8URLPatternComponent::Enum::kUsername: + return url_pattern::Component::Compare(*left->username_, + *right->username_); + case V8URLPatternComponent::Enum::kPassword: + return 
url_pattern::Component::Compare(*left->password_, + *right->password_); + case V8URLPatternComponent::Enum::kHostname: + return url_pattern::Component::Compare(*left->hostname_, + *right->hostname_); + case V8URLPatternComponent::Enum::kPort: + return url_pattern::Component::Compare(*left->port_, *right->port_); + case V8URLPatternComponent::Enum::kPathname: + return url_pattern::Component::Compare(*left->pathname_, + *right->pathname_); + case V8URLPatternComponent::Enum::kSearch: + return url_pattern::Component::Compare(*left->search_, *right->search_); + case V8URLPatternComponent::Enum::kHash: + return url_pattern::Component::Compare(*left->hash_, *right->hash_); + } + NOTREACHED(); } void URLPattern::Trace(Visitor* visitor) const { @@ -948,87 +432,8 @@ void URLPattern::Trace(Visitor* visitor) const { ScriptWrappable::Trace(visitor); } -// static -URLPattern::Component* URLPattern::CompilePattern( - const String& pattern, - StringView component, - liburlpattern::EncodeCallback encode_callback, - const liburlpattern::Options& options, - ExceptionState& exception_state) { - // If the pattern is null then optimize by not compiling a pattern. Instead, - // a nullptr Component is interpreted as matching any input value. - if (pattern.IsNull()) - return nullptr; - - // Parse the pattern. - StringUTF8Adaptor utf8(pattern); - auto parse_result = - liburlpattern::Parse(absl::string_view(utf8.data(), utf8.size()), - std::move(encode_callback), options); - if (!parse_result.ok()) { - exception_state.ThrowTypeError("Invalid " + component + " pattern '" + - pattern + "'."); - return nullptr; - } - - // Extract a regular expression string from the parsed pattern. - std::vector<std::string> name_list; - std::string regexp_string = - parse_result.value().GenerateRegexString(&name_list); - - // Compile the regular expression to verify it is valid. - auto case_sensitive = options.sensitive ? 
WTF::kTextCaseSensitive - : WTF::kTextCaseASCIIInsensitive; - DCHECK(base::IsStringASCII(regexp_string)); - ScriptRegexp* regexp = MakeGarbageCollected<ScriptRegexp>( - String(regexp_string.data(), regexp_string.size()), case_sensitive, - kMultilineDisabled, ScriptRegexp::UTF16); - if (!regexp->IsValid()) { - // The regular expression failed to compile. This means that some - // custom regexp group within the pattern is illegal. Attempt to - // compile each regexp group individually in order to identify the - // culprit. - for (auto& part : parse_result.value().PartList()) { - if (part.type != liburlpattern::PartType::kRegex) - continue; - DCHECK(base::IsStringASCII(part.value)); - String group_value(part.value.data(), part.value.size()); - regexp = MakeGarbageCollected<ScriptRegexp>( - group_value, case_sensitive, kMultilineDisabled, ScriptRegexp::UTF16); - if (regexp->IsValid()) - continue; - exception_state.ThrowTypeError("Invalid " + component + " pattern '" + - pattern + - "'. Custom regular expression group '" + - group_value + "' is invalid."); - return nullptr; - } - // We couldn't find a bad regexp group, but we still have an overall - // error. This shouldn't happen, but we handle it anyway. - exception_state.ThrowTypeError("Invalid " + component + " pattern '" + - pattern + - "'. 
An unexpected error has occurred."); - return nullptr; - } - - Vector<String> wtf_name_list; - wtf_name_list.ReserveInitialCapacity( - static_cast<wtf_size_t>(name_list.size())); - for (const auto& name : name_list) { - wtf_name_list.push_back(String::FromUTF8(name.data(), name.size())); - } - - return MakeGarbageCollected<URLPattern::Component>( - std::move(parse_result.value()), std::move(regexp), - std::move(wtf_name_list)); -} - bool URLPattern::Match( -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const V8URLPatternInput* input, -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - const USVStringOrURLPatternInit& input, -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) const String& base_url, URLPatternResult* result, ExceptionState& exception_state) const { @@ -1043,84 +448,71 @@ bool URLPattern::Match( String search(g_empty_string); String hash(g_empty_string); - HeapVector<USVStringOrURLPatternInit> inputs; - - bool is_init = -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - input->GetContentType() == - V8URLPatternInput::ContentType::kURLPatternInit; -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - input.IsURLPatternInit(); -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - - if (is_init) { - if (base_url) { - exception_state.ThrowTypeError( - "Invalid second argument baseURL '" + base_url + - "' provided with a URLPatternInit input. Use the " - "URLPatternInit.baseURL property instead."); - return false; - } + HeapVector<Member<V8URLPatternInput>> inputs; - URLPatternInit* init = -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - input->GetAsURLPatternInit(); -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - input.GetAsURLPatternInit(); -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - - inputs.push_back(USVStringOrURLPatternInit::FromURLPatternInit(init)); - - // Layer the URLPatternInit values on top of the default empty strings. 
- ApplyInit(init, ValueType::kURL, protocol, username, password, hostname, - port, pathname, search, hash, exception_state); - if (exception_state.HadException()) { - // Treat exceptions simply as a failure to match. - exception_state.ClearException(); - return false; - } - } else { - KURL parsed_base_url(base_url); - if (base_url && !parsed_base_url.IsValid()) { - // Treat as failure to match, but don't throw an exception. - return false; - } + switch (input->GetContentType()) { + case V8URLPatternInput::ContentType::kURLPatternInit: { + if (base_url) { + exception_state.ThrowTypeError( + "Invalid second argument baseURL '" + base_url + + "' provided with a URLPatternInit input. Use the " + "URLPatternInit.baseURL property instead."); + return false; + } + + URLPatternInit* init = input->GetAsURLPatternInit(); + + inputs.push_back(MakeGarbageCollected<V8URLPatternInput>(init)); - const String& input_string = -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - input->GetAsUSVString(); -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - input.GetAsUSVString(); -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - - inputs.push_back(USVStringOrURLPatternInit::FromUSVString(input_string)); - if (base_url) - inputs.push_back(USVStringOrURLPatternInit::FromUSVString(base_url)); - - // The compile the input string as a fully resolved URL. - KURL url(parsed_base_url, input_string); - if (!url.IsValid() || url.IsEmpty()) { - // Treat as failure to match, but don't throw an exception. - return false; + // Layer the URLPatternInit values on top of the default empty strings. + ApplyInit(init, ValueType::kURL, protocol, username, password, hostname, + port, pathname, search, hash, exception_state); + if (exception_state.HadException()) { + // Treat exceptions simply as a failure to match. 
+ exception_state.ClearException(); + return false; + } + break; } + case V8URLPatternInput::ContentType::kUSVString: { + KURL parsed_base_url(base_url); + if (base_url && !parsed_base_url.IsValid()) { + // Treat as failure to match, but don't throw an exception. + return false; + } + + const String& input_string = input->GetAsUSVString(); - // Apply the parsed URL components on top of our defaults. - if (url.Protocol()) - protocol = url.Protocol(); - if (url.User()) - username = url.User(); - if (url.Pass()) - password = url.Pass(); - if (url.Host()) - hostname = url.Host(); - if (url.Port() > 0) - port = String::Number(url.Port()); - if (url.GetPath()) - pathname = url.GetPath(); - if (url.Query()) - search = url.Query(); - if (url.FragmentIdentifier()) - hash = url.FragmentIdentifier(); + inputs.push_back(MakeGarbageCollected<V8URLPatternInput>(input_string)); + if (base_url) + inputs.push_back(MakeGarbageCollected<V8URLPatternInput>(base_url)); + + // The compile the input string as a fully resolved URL. + KURL url(parsed_base_url, input_string); + if (!url.IsValid() || url.IsEmpty()) { + // Treat as failure to match, but don't throw an exception. + return false; + } + + // Apply the parsed URL components on top of our defaults. + if (url.Protocol()) + protocol = url.Protocol(); + if (url.User()) + username = url.User(); + if (url.Pass()) + password = url.Pass(); + if (url.Host()) + hostname = url.Host(); + if (url.Port() > 0) + port = String::Number(url.Port()); + if (url.GetPath()) + pathname = url.GetPath(); + if (url.Query()) + search = url.Query(); + if (url.FragmentIdentifier()) + hash = url.FragmentIdentifier(); + break; + } } Vector<String> protocol_group_list; @@ -1143,18 +535,25 @@ bool URLPattern::Match( auto* search_group_list_ref = result ? &search_group_list : nullptr; auto* hash_group_list_ref = result ? 
&hash_group_list : nullptr; + CHECK(protocol_); + CHECK(username_); + CHECK(password_); + CHECK(hostname_); + CHECK(port_); + CHECK(pathname_); + CHECK(search_); + CHECK(hash_); + // Each component of the pattern must match the corresponding component of - // the input. If a pattern Component is nullptr, then it matches any - // input and we can avoid running a real regular expression match. - bool matched = - (!protocol_ || protocol_->Match(protocol, protocol_group_list_ref)) && - (!username_ || username_->Match(username, username_group_list_ref)) && - (!password_ || password_->Match(password, password_group_list_ref)) && - (!hostname_ || hostname_->Match(hostname, hostname_group_list_ref)) && - (!port_ || port_->Match(port, port_group_list_ref)) && - (!pathname_ || pathname_->Match(pathname, pathname_group_list_ref)) && - (!search_ || search_->Match(search, search_group_list_ref)) && - (!hash_ || hash_->Match(hash, hash_group_list_ref)); + // the input. + bool matched = protocol_->Match(protocol, protocol_group_list_ref) && + username_->Match(username, username_group_list_ref) && + password_->Match(password, password_group_list_ref) && + hostname_->Match(hostname, hostname_group_list_ref) && + port_->Match(port, port_group_list_ref) && + pathname_->Match(pathname, pathname_group_list_ref) && + search_->Match(search, search_group_list_ref) && + hash_->Match(hash, hash_group_list_ref); if (!matched || !result) return matched; @@ -1162,55 +561,32 @@ bool URLPattern::Match( result->setInputs(std::move(inputs)); result->setProtocol( - MakeComponentResult(protocol_, protocol, protocol_group_list)); + MakeURLPatternComponentResult(protocol_, protocol, protocol_group_list)); result->setUsername( - MakeComponentResult(username_, username, username_group_list)); + MakeURLPatternComponentResult(username_, username, username_group_list)); result->setPassword( - MakeComponentResult(password_, password, password_group_list)); + MakeURLPatternComponentResult(password_, 
password, password_group_list)); result->setHostname( - MakeComponentResult(hostname_, hostname, hostname_group_list)); - result->setPort(MakeComponentResult(port_, port, port_group_list)); + MakeURLPatternComponentResult(hostname_, hostname, hostname_group_list)); + result->setPort(MakeURLPatternComponentResult(port_, port, port_group_list)); result->setPathname( - MakeComponentResult(pathname_, pathname, pathname_group_list)); - result->setSearch(MakeComponentResult(search_, search, search_group_list)); - result->setHash(MakeComponentResult(hash_, hash, hash_group_list)); + MakeURLPatternComponentResult(pathname_, pathname, pathname_group_list)); + result->setSearch( + MakeURLPatternComponentResult(search_, search, search_group_list)); + result->setHash(MakeURLPatternComponentResult(hash_, hash, hash_group_list)); return true; } // static -URLPatternComponentResult* URLPattern::MakeComponentResult( +URLPatternComponentResult* URLPattern::MakeURLPatternComponentResult( Component* component, const String& input, - const Vector<String>& group_list) { - Vector<std::pair<String, String>> groups; - if (!component) { - // When there is not Component we must act as if there was a default - // wildcard pattern with a group. The group includes the entire input. 
- groups.emplace_back("0", input); - } else { - DCHECK_EQ(component->name_list.size(), group_list.size()); - for (wtf_size_t i = 0; i < group_list.size(); ++i) { - groups.emplace_back(component->name_list[i], group_list[i]); - } - } - + const Vector<String>& group_values) { auto* result = URLPatternComponentResult::Create(); result->setInput(input); - result->setGroups(groups); + result->setGroups(component->MakeGroupList(group_values)); return result; } -bool URLPattern::ShouldTreatAsStandardURL(Component* protocol) { - if (!protocol) - return true; - const auto protocol_matches = [&](const std::string& scheme) { - DCHECK(base::IsStringASCII(scheme)); - return protocol->Match( - StringView(scheme.data(), static_cast<unsigned>(scheme.size())), - /*group_list=*/nullptr); - }; - return base::ranges::any_of(url::GetStandardSchemes(), protocol_matches); -} - } // namespace blink diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.h b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.h index 0609081ea0b..4dd1affda57 100644 --- a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.h +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.h @@ -1,4 +1,3 @@ -// Copyright 2020 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
@@ -7,27 +6,36 @@ #include "base/types/pass_key.h" #include "third_party/blink/renderer/bindings/modules/v8/v8_typedefs.h" +#include "third_party/blink/renderer/bindings/modules/v8/v8_url_pattern_component.h" +#include "third_party/blink/renderer/modules/modules_export.h" #include "third_party/blink/renderer/platform/bindings/script_wrappable.h" #include "third_party/liburlpattern/parse.h" -namespace liburlpattern { -struct Options; -} // namespace liburlpattern - namespace blink { class ExceptionState; class URLPatternComponentResult; class URLPatternInit; class URLPatternResult; -class USVStringOrURLPatternInit; -class URLPattern : public ScriptWrappable { +namespace url_pattern { +class Component; +} // namespace url_pattern + +class MODULES_EXPORT URLPattern : public ScriptWrappable { DEFINE_WRAPPERTYPEINFO(); - class Component; + using Component = url_pattern::Component; public: + static URLPattern* Create(const V8URLPatternInput* input, + const String& base_url, + ExceptionState& exception_state); + + static URLPattern* Create(const V8URLPatternInput* input, + ExceptionState& exception_state); + static URLPattern* Create(const URLPatternInit* init, + Component* precomputed_protocol_component, ExceptionState& exception_state); URLPattern(Component* protocol, @@ -40,33 +48,17 @@ class URLPattern : public ScriptWrappable { Component* hash, base::PassKey<URLPattern> key); -#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) bool test(const V8URLPatternInput* input, const String& base_url, ExceptionState& exception_state) const; bool test(const V8URLPatternInput* input, ExceptionState& exception_state) const; -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - bool test(const USVStringOrURLPatternInit& input, - const String& base_url, - ExceptionState& exception_state) const; - bool test(const USVStringOrURLPatternInit& input, - ExceptionState& exception_state) const; -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) -#if 
defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) URLPatternResult* exec(const V8URLPatternInput* input, const String& base_url, ExceptionState& exception_state) const; URLPatternResult* exec(const V8URLPatternInput* input, ExceptionState& exception_state) const; -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - URLPatternResult* exec(const USVStringOrURLPatternInit& input, - const String& base_url, - ExceptionState& exception_state) const; - URLPatternResult* exec(const USVStringOrURLPatternInit& input, - ExceptionState& exception_state) const; -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) String protocol() const; String username() const; @@ -77,51 +69,29 @@ class URLPattern : public ScriptWrappable { String search() const; String hash() const; + static int compareComponent(const V8URLPatternComponent& component, + const URLPattern* left, + const URLPattern* right); + void Trace(Visitor* visitor) const override; private: - // A utility function that takes a given |pattern| and compiles it into a - // Component structure. If the |pattern| matches the given |default_pattern| - // then nullptr may be returned without throwing an exception. In this case - // the Component is not constructed and the nullptr value should be treated as - // matching any input value for the component. The |component| string is used - // for exception messages. The |encode_callback| will be used to validate and - // encode plain text within the pattern during compilation. |options| control - // how the pattern is compiled. - static Component* CompilePattern( - const String& pattern, - StringView component, - liburlpattern::EncodeCallback encode_callback, - const liburlpattern::Options& options, - ExceptionState& exception_state); - // A utility function to determine if a given |input| matches the pattern // or not. Returns |true| if there is a match and |false| otherwise. If // |result| is not nullptr then the URLPatternResult contents will be filled. 
-#if defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) bool Match(const V8URLPatternInput* input, const String& base_url, URLPatternResult* result, ExceptionState& exception_state) const; -#else // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) - bool Match(const USVStringOrURLPatternInit& input, - const String& base_url, - URLPatternResult* result, - ExceptionState& exception_state) const; -#endif // defined(USE_BLINK_V8_BINDING_NEW_IDL_UNION) // A utility function that constructs a URLPatternComponentResult for - // a given |component|, |input|, and |group_list|. The |component| may - // be nullptr. - static URLPatternComponentResult* MakeComponentResult( + // a given |component|, |input|, and |group_list|. + static URLPatternComponentResult* MakeURLPatternComponentResult( Component* component, const String& input, - const Vector<String>& group_list); - - static bool ShouldTreatAsStandardURL(Component* protocol); + const Vector<String>& group_values); - // The compiled patterns for each URL component. If a Component member is - // nullptr then it should be treated as a wildcard matching any input. + // The compiled patterns for each URL component. 
Member<Component> protocol_; Member<Component> username_; Member<Component> password_; diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.idl b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.idl index f3a619f43fd..e2722f0119f 100644 --- a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.idl +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern.idl @@ -4,19 +4,22 @@ typedef (USVString or URLPatternInit) URLPatternInput; +enum URLPatternComponent { "protocol", "username", "password", "hostname", + "port", "pathname", "search", "hash" }; + // https://wicg.github.io/urlpattern/ [ - SecureContext, Exposed=(Window,Worker), RuntimeEnabled=URLPattern ] interface URLPattern { - [RaisesException] constructor(URLPatternInit init); + [RaisesException, Measure] + constructor(URLPatternInput input, optional USVString baseURL); - [RaisesException] + [RaisesException, Measure] boolean test(URLPatternInput input, optional USVString baseURL); - [RaisesException] - URLPatternResult exec(URLPatternInput input, optional USVString baseURL); + [RaisesException, Measure] + URLPatternResult? 
exec(URLPatternInput input, optional USVString baseURL); readonly attribute USVString protocol; readonly attribute USVString username; @@ -26,4 +29,8 @@ typedef (USVString or URLPatternInit) URLPatternInput; readonly attribute USVString pathname; readonly attribute USVString search; readonly attribute USVString hash; + + [RuntimeEnabled=URLPatternCompareComponent, Measure] + static short compareComponent(URLPatternComponent component, + URLPattern left, URLPattern right); }; diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_canon.cc b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_canon.cc new file mode 100644 index 00000000000..d05ff0666e7 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_canon.cc @@ -0,0 +1,464 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_canon.h" + +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_component.h" +#include "third_party/blink/renderer/platform/bindings/exception_state.h" +#include "third_party/blink/renderer/platform/weborigin/security_origin.h" +#include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h" +#include "url/url_canon.h" +#include "url/url_util.h" + +namespace blink { +namespace url_pattern { + +namespace { + +String MaybeStripPrefix(const String& value, StringView prefix) { + if (value.StartsWith(prefix)) + return value.Substring(1, value.length() - 1); + return value; +} + +String MaybeStripSuffix(const String& value, StringView suffix) { + if (value.EndsWith(suffix)) + return value.Substring(0, value.length() - 1); + return value; +} + +String StringFromCanonOutput(const url::CanonOutput& output, + const url::Component& component) { + return String::FromUTF8(output.data() + component.begin, 
component.len); +} + +std::string StdStringFromCanonOutput(const url::CanonOutput& output, + const url::Component& component) { + return std::string(output.data() + component.begin, component.len); +} + +} // anonymous namespace + +absl::StatusOr<std::string> ProtocolEncodeCallback(absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component component; + + bool result = url::CanonicalizeScheme( + input.data(), url::Component(0, static_cast<int>(input.size())), + &canon_output, &component); + + if (!result) { + return absl::InvalidArgumentError("Invalid protocol '" + + std::string(input) + "'."); + } + + return StdStringFromCanonOutput(canon_output, component); +} + +absl::StatusOr<std::string> UsernameEncodeCallback(absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component username_component; + url::Component password_component; + + bool result = url::CanonicalizeUserInfo( + input.data(), url::Component(0, static_cast<int>(input.size())), "", + url::Component(0, 0), &canon_output, &username_component, + &password_component); + + if (!result) { + return absl::InvalidArgumentError("Invalid username pattern '" + + std::string(input) + "'."); + } + + return StdStringFromCanonOutput(canon_output, username_component); +} + +absl::StatusOr<std::string> PasswordEncodeCallback(absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component username_component; + url::Component password_component; + + bool result = url::CanonicalizeUserInfo( + "", url::Component(0, 0), input.data(), + url::Component(0, static_cast<int>(input.size())), &canon_output, + &username_component, &password_component); + + if (!result) { + return absl::InvalidArgumentError("Invalid password pattern '" + + std::string(input) + "'."); + } + + return StdStringFromCanonOutput(canon_output, 
password_component); +} + +absl::StatusOr<std::string> HostnameEncodeCallback(absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component component; + + bool result = url::CanonicalizeHost( + input.data(), url::Component(0, static_cast<int>(input.size())), + &canon_output, &component); + + if (!result) { + return absl::InvalidArgumentError("Invalid hostname pattern '" + + std::string(input) + "'."); + } + + return StdStringFromCanonOutput(canon_output, component); +} + +absl::StatusOr<std::string> PortEncodeCallback(absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component component; + + bool result = url::CanonicalizePort( + input.data(), url::Component(0, static_cast<int>(input.size())), + url::PORT_UNSPECIFIED, &canon_output, &component); + + if (!result) { + return absl::InvalidArgumentError("Invalid port pattern '" + + std::string(input) + "'."); + } + + return StdStringFromCanonOutput(canon_output, component); +} + +absl::StatusOr<std::string> StandardURLPathnameEncodeCallback( + absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component component; + + bool result = url::CanonicalizePartialPath( + input.data(), url::Component(0, static_cast<int>(input.size())), + &canon_output, &component); + + if (!result) { + return absl::InvalidArgumentError("Invalid pathname pattern '" + + std::string(input) + "'."); + } + + return StdStringFromCanonOutput(canon_output, component); +} + +absl::StatusOr<std::string> PathURLPathnameEncodeCallback( + absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component component; + + url::CanonicalizePathURLPath( + input.data(), url::Component(0, static_cast<int>(input.size())), + &canon_output, &component); + + return 
StdStringFromCanonOutput(canon_output, component); +} + +absl::StatusOr<std::string> SearchEncodeCallback(absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component component; + + url::CanonicalizeQuery(input.data(), + url::Component(0, static_cast<int>(input.size())), + /*converter=*/nullptr, &canon_output, &component); + + return StdStringFromCanonOutput(canon_output, component); +} + +absl::StatusOr<std::string> HashEncodeCallback(absl::string_view input) { + if (input.empty()) + return std::string(); + + url::RawCanonOutputT<char> canon_output; + url::Component component; + + url::CanonicalizeRef(input.data(), + url::Component(0, static_cast<int>(input.size())), + &canon_output, &component); + + return StdStringFromCanonOutput(canon_output, component); +} + +String CanonicalizeProtocol(const String& input, + ValueType type, + ExceptionState& exception_state) { + // We allow the protocol input to optionally contain a ":" suffix. Strip + // this for both URL and pattern protocols. + String stripped = MaybeStripSuffix(input, ":"); + + if (type == ValueType::kPattern) { + // Canonicalization for patterns is handled during compilation via + // encoding callbacks. 
+ return stripped; + } + + bool result = false; + url::RawCanonOutputT<char> canon_output; + url::Component component; + if (stripped.Is8Bit()) { + StringUTF8Adaptor utf8(stripped); + result = url::CanonicalizeScheme( + utf8.data(), url::Component(0, utf8.size()), &canon_output, &component); + } else { + result = url::CanonicalizeScheme(stripped.Characters16(), + url::Component(0, stripped.length()), + &canon_output, &component); + } + + if (!result) { + exception_state.ThrowTypeError("Invalid protocol '" + stripped + "'."); + return String(); + } + + return StringFromCanonOutput(canon_output, component); +} + +void CanonicalizeUsernameAndPassword(const String& username, + const String& password, + ValueType type, + String& username_out, + String& password_out, + ExceptionState& exception_state) { + if (type == ValueType::kPattern) { + // Canonicalization for patterns is handled during compilation via + // encoding callbacks. + username_out = username; + password_out = password; + return; + } + + bool result = false; + url::RawCanonOutputT<char> canon_output; + url::Component username_component; + url::Component password_component; + + if (username && password && username.Is8Bit() && password.Is8Bit()) { + StringUTF8Adaptor username_utf8(username); + StringUTF8Adaptor password_utf8(password); + result = url::CanonicalizeUserInfo( + username_utf8.data(), url::Component(0, username_utf8.size()), + password_utf8.data(), url::Component(0, password_utf8.size()), + &canon_output, &username_component, &password_component); + + } else { + String username16(username); + String password16(password); + username16.Ensure16Bit(); + password16.Ensure16Bit(); + result = url::CanonicalizeUserInfo( + username16.Characters16(), url::Component(0, username16.length()), + password16.Characters16(), url::Component(0, password16.length()), + &canon_output, &username_component, &password_component); + } + + if (!result) { + exception_state.ThrowTypeError("Invalid username '" + username + 
+ "' and/or password '" + password + "'."); + return; + } + + if (username_component.len != -1) + username_out = StringFromCanonOutput(canon_output, username_component); + if (password_component.len != -1) + password_out = StringFromCanonOutput(canon_output, password_component); +} + +String CanonicalizeHostname(const String& input, + ValueType type, + ExceptionState& exception_state) { + if (type == ValueType::kPattern) { + // Canonicalization for patterns is handled during compilation via + // encoding callbacks. + return input; + } + + bool success = false; + String result = SecurityOrigin::CanonicalizeHost(input, &success); + if (!success) { + exception_state.ThrowTypeError("Invalid hostname '" + input + "'."); + return String(); + } + + return result; +} + +String CanonicalizePort(const String& input, + ValueType type, + const String& protocol, + ExceptionState& exception_state) { + if (type == ValueType::kPattern) { + // Canonicalization for patterns is handled during compilation via + // encoding callbacks. + return input; + } + + int default_port = url::PORT_UNSPECIFIED; + if (!input.IsEmpty()) { + StringUTF8Adaptor protocol_utf8(protocol); + default_port = + url::DefaultPortForScheme(protocol_utf8.data(), protocol_utf8.size()); + } + + // Since ports only consist of digits there should be no encoding needed. + // Therefore we directly use the UTF8 encoding version of CanonicalizePort(). + StringUTF8Adaptor utf8(input); + url::RawCanonOutputT<char> canon_output; + url::Component component; + if (!url::CanonicalizePort(utf8.data(), url::Component(0, utf8.size()), + default_port, &canon_output, &component)) { + exception_state.ThrowTypeError("Invalid port '" + input + "'."); + return String(); + } + + return component.len == -1 ? 
g_empty_string + : StringFromCanonOutput(canon_output, component); +} + +String CanonicalizePathname(const String& protocol, + const String& input, + ValueType type, + ExceptionState& exception_state) { + if (type == ValueType::kPattern) { + // Canonicalization for patterns is handled during compilation via + // encoding callbacks. + return input; + } + + // Determine if we are using "standard" or "path" URL canonicalization + // for the pathname. In spec terms the "path" URL behavior corresponds + // to "cannot-be-a-base" URLs. We make this determination based on the + // protocol string since we cannot look at the number of slashes between + // components like the URL spec. If this is inadequate the developer + // can use the baseURL property to get more strict URL behavior. + // + // We default to "standard" URL behavior to match how the empty protocol + // string in the URLPattern constructor results in the pathname pattern + // getting "standard" URL canonicalization. + bool standard = false; + if (protocol.IsEmpty()) { + standard = true; + } else if (protocol.Is8Bit()) { + StringUTF8Adaptor utf8(protocol); + standard = url::IsStandard(utf8.data(), url::Component(0, utf8.size())); + } else { + standard = url::IsStandard(protocol.Characters16(), + url::Component(0, protocol.length())); + } + + // Do not enforce absolute pathnames here since we can't enforce it + // consistently in the URLPattern constructor. This allows us to + // produce a match when the exact same fixed pathname string is passed + // to both the constructor and test()/exec(). Similarly, we use + // url::CanonicalizePartialPath() below instead of url::CanonicalizePath() + // to avoid pre-pending a slash at the start of the string. 
+ + bool result = false; + url::RawCanonOutputT<char> canon_output; + url::Component component; + + const auto canonicalize_path = [&](const auto* data, int length) { + if (standard) { + return url::CanonicalizePartialPath(data, url::Component(0, length), + &canon_output, &component); + } + url::CanonicalizePathURLPath(data, url::Component(0, length), &canon_output, + &component); + return true; + }; + + if (input.Is8Bit()) { + StringUTF8Adaptor utf8(input); + result = canonicalize_path(utf8.data(), utf8.size()); + } else { + result = canonicalize_path(input.Characters16(), input.length()); + } + + if (!result) { + exception_state.ThrowTypeError("Invalid pathname '" + input + "'."); + return String(); + } + + return StringFromCanonOutput(canon_output, component); +} + +String CanonicalizeSearch(const String& input, + ValueType type, + ExceptionState& exception_state) { + // We allow the search input to optionally contain a "?" prefix. Strip + // this for both URL and pattern search values. + String stripped = MaybeStripPrefix(input, "?"); + + if (type == ValueType::kPattern) { + // Canonicalization for patterns is handled during compilation via + // encoding callbacks. + return stripped; + } + + url::RawCanonOutputT<char> canon_output; + url::Component component; + if (stripped.Is8Bit()) { + StringUTF8Adaptor utf8(stripped); + url::CanonicalizeQuery(utf8.data(), url::Component(0, utf8.size()), + /*converter=*/nullptr, &canon_output, &component); + } else { + url::CanonicalizeQuery(stripped.Characters16(), + url::Component(0, stripped.length()), + /*converter=*/nullptr, &canon_output, &component); + } + + return StringFromCanonOutput(canon_output, component); +} + +String CanonicalizeHash(const String& input, + ValueType type, + ExceptionState& exception_state) { + // We allow the hash input to optionally contain a "#" prefix. Strip + // this for both URL and pattern hash values. 
+ String stripped = MaybeStripPrefix(input, "#"); + + if (type == ValueType::kPattern) { + // Canonicalization for patterns is handled during compilation via + // encoding callbacks. + return stripped; + } + + url::RawCanonOutputT<char> canon_output; + url::Component component; + if (stripped.Is8Bit()) { + StringUTF8Adaptor utf8(stripped); + url::CanonicalizeRef(utf8.data(), url::Component(0, utf8.size()), + &canon_output, &component); + } else { + url::CanonicalizeRef(stripped.Characters16(), + url::Component(0, stripped.length()), &canon_output, + &component); + } + + return StringFromCanonOutput(canon_output, component); +} + +} // namespace url_pattern +} // namespace blink diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_canon.h b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_canon.h new file mode 100644 index 00000000000..eb6f3b38582 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_canon.h @@ -0,0 +1,83 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_CANON_H_ +#define THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_CANON_H_ + +#include "third_party/abseil-cpp/absl/status/statusor.h" +#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" + +namespace blink { + +class ExceptionState; + +namespace url_pattern { + +// An enum indicating whether the associated component values to be operated +// on are for patterns or URLs. Validation and canonicalization will +// do different things depending on the type. +enum class ValueType { + kPattern, + kURL, +}; + +// The following functions are callbacks that may be passed to the +// liburlpattern::Parse() method. Each performs validation and encoding for +// a different URL component. 
+// +// Note that there are two different pathname callbacks for "standard" URLs +// like `https://foo` vs "path" URLs like `data:foo`. Select the correct +// callback by calling `ShouldTreatAsStandardURL()`. +absl::StatusOr<std::string> ProtocolEncodeCallback(absl::string_view input); +absl::StatusOr<std::string> UsernameEncodeCallback(absl::string_view input); +absl::StatusOr<std::string> PasswordEncodeCallback(absl::string_view input); +absl::StatusOr<std::string> HostnameEncodeCallback(absl::string_view input); +absl::StatusOr<std::string> PortEncodeCallback(absl::string_view input); +absl::StatusOr<std::string> StandardURLPathnameEncodeCallback( + absl::string_view input); +absl::StatusOr<std::string> PathURLPathnameEncodeCallback( + absl::string_view input); +absl::StatusOr<std::string> SearchEncodeCallback(absl::string_view input); +absl::StatusOr<std::string> HashEncodeCallback(absl::string_view input); + +// Utility functions to canonicalize different component strings. They will +// throw an exception if the input is invalid. The canonicalization and/or +// validation will only be applied if the `type` is kURL. These functions +// simply pass through the value when the `type` is kPattern. Encoding +// for patterns is handled later during compilation via the encode callbacks +// above. +// +// The result is returned, except for `CanonicalizeUsernameAndPassword` which +// uses separate out parameters for the resulting username and password. 
+String CanonicalizeProtocol(const String& input, + ValueType type, + ExceptionState& exception_state); +void CanonicalizeUsernameAndPassword(const String& username, + const String& password, + ValueType type, + String& username_out, + String& password_out, + ExceptionState& exception_state); +String CanonicalizeHostname(const String& input, + ValueType type, + ExceptionState& exception_state); +String CanonicalizePort(const String& input, + ValueType type, + const String& protocol, + ExceptionState& exception_state); +String CanonicalizePathname(const String& protocol, + const String& input, + ValueType type, + ExceptionState& exception_state); +String CanonicalizeSearch(const String& input, + ValueType type, + ExceptionState& exception_state); +String CanonicalizeHash(const String& input, + ValueType type, + ExceptionState& exception_state); + +} // namespace url_pattern +} // namespace blink + +#endif // THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_CANON_H_ diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_component.cc b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_component.cc new file mode 100644 index 00000000000..8e029755c12 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_component.cc @@ -0,0 +1,390 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_component.h" + +#include "base/ranges/algorithm.h" +#include "base/strings/string_util.h" +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_canon.h" +#include "third_party/blink/renderer/platform/bindings/exception_state.h" +#include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h" +#include "url/url_util.h" + +namespace blink { +namespace url_pattern { + +namespace { + +// Utility method to convert a type to a string. +StringView TypeToString(Component::Type type) { + switch (type) { + case Component::Type::kProtocol: + return "protocol"; + case Component::Type::kUsername: + return "username"; + case Component::Type::kPassword: + return "password"; + case Component::Type::kHostname: + return "hostname"; + case Component::Type::kPort: + return "port"; + case Component::Type::kPathname: + return "pathname"; + case Component::Type::kSearch: + return "search"; + case Component::Type::kHash: + return "hash"; + } + NOTREACHED(); +} + +// Utility method to get the correct encoding callback for a given type. +liburlpattern::EncodeCallback GetEncodeCallback(Component::Type type, + Component* protocol_component) { + switch (type) { + case Component::Type::kProtocol: + return ProtocolEncodeCallback; + case Component::Type::kUsername: + return UsernameEncodeCallback; + case Component::Type::kPassword: + return PasswordEncodeCallback; + case Component::Type::kHostname: + return HostnameEncodeCallback; + case Component::Type::kPort: + return PortEncodeCallback; + case Component::Type::kPathname: + // Different types of URLs use different canonicalization for pathname. + // A "standard" URL flattens `.`/`..` and performs full percent encoding. + // A "path" URL does not flatten and uses a more lax percent encoding. 
+ // The spec refers to "path" URLs as "cannot-be-a-base-URL" URLs: + // + // https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state + // + // We prefer "standard" URL here by checking to see if the protocol + // pattern matches any of the known standard protocol strings. So + // an exact pattern of `http` will match, but so will `http{s}?` and + // `*`. + // + // If the protocol pattern does not match any of the known standard URL + // protocols then we fall back to the "path" URL behavior. This will + // normally be triggered by `data`, `javascript`, `about`, etc. It + // will also be triggered for custom protocol strings. We favor "path" + // behavior here because it's better to under-canonicalize since the + // developer can always manually canonicalize the pathname for a custom + // protocol. + // + // ShouldTreatAsStandardURL can be a bit expensive, so only do it if we + // actually have a pathname pattern to compile. + CHECK(protocol_component); + if (protocol_component->ShouldTreatAsStandardURL()) + return StandardURLPathnameEncodeCallback; + else + return PathURLPathnameEncodeCallback; + case Component::Type::kSearch: + return SearchEncodeCallback; + case Component::Type::kHash: + return HashEncodeCallback; + } + NOTREACHED(); +} + +// Utility method to get the correct liburlpattern parse options for a given +// type. +const liburlpattern::Options& GetOptions(Component::Type type) { + using liburlpattern::Options; + + // The liburlpattern::Options to use for most component patterns. We + // default to strict mode and case sensitivity. In addition, most + // components have no concept of a delimiter or prefix character. + DEFINE_THREAD_SAFE_STATIC_LOCAL(Options, default_options, + ({.delimiter_list = "", + .prefix_list = "", + .sensitive = true, + .strict = true})); + + // The liburlpattern::Options to use for hostname patterns. This uses a + // "." delimiter controlling how far a named group like ":bar" will match + // by default. 
Note, hostnames are case insensitive but we require case + // sensitivity here. This assumes that the hostname values have already + // been normalized to lower case as in URL(). + DEFINE_THREAD_SAFE_STATIC_LOCAL(Options, hostname_options, + ({.delimiter_list = ".", + .prefix_list = "", + .sensitive = true, + .strict = true})); + + // The liburlpattern::Options to use for pathname patterns. This uses a + // "/" delimiter controlling how far a named group like ":bar" will match + // by default. It also configures "/" to be treated as an automatic + // prefix before groups. + DEFINE_THREAD_SAFE_STATIC_LOCAL(Options, pathname_options, + ({.delimiter_list = "/", + .prefix_list = "/", + .sensitive = true, + .strict = true})); + + switch (type) { + case Component::Type::kHostname: + return hostname_options; + case Component::Type::kPathname: + return pathname_options; + case Component::Type::kProtocol: + case Component::Type::kUsername: + case Component::Type::kPassword: + case Component::Type::kPort: + case Component::Type::kSearch: + case Component::Type::kHash: + return default_options; + } + NOTREACHED(); +} + +// Utility function to return a statically allocated Part list. +const std::vector<liburlpattern::Part>& GetWildcardOnlyPartList() { + using liburlpattern::Modifier; + using liburlpattern::Part; + using liburlpattern::PartType; + DEFINE_THREAD_SAFE_STATIC_LOCAL( + std::vector<Part>, instance, + ({Part(PartType::kFullWildcard, + /*name=*/"", + /*prefix=*/"", /*value=*/"", /*suffix=*/"", Modifier::kNone)})); + return instance; +} + +int ComparePart(const liburlpattern::Part& lh, const liburlpattern::Part& rh) { + // We prioritize PartType in the ordering so we can favor fixed text. The + // type ordering is: + // + // kFixed > kRegex > kSegmentWildcard > kFullWildcard. + // + // We considered kRegex greater than the wildcards because it is likely to be + // used for imposing some constraint and not just duplicating wildcard + // behavior. 
+ // + // This comparison depends on the PartType enum in liburlpattern having the + // correct corresponding numeric values. + // + // Next the Modifier is considered: + // + // kNone > kOneOrMore > kOptional > kZeroOrMore. + // + // The rationale here is that requiring the match group to exist is more + // restrictive than making it optional and requiring an exact count is more + // restrictive than repeating. + // + // This comparison depends on the Modifier enum in liburlpattern having the + // correct corresponding numeric values. + // + // Finally we lexicographically compare the text components from left to + // right; `prefix`, `value`, and `suffix`. It's ok to depend on simple + // byte-wise string comparison here because the values have all been URL + // encoded. This guarantees the strings contain only ASCII. + auto left = std::tie(lh.type, lh.modifier, lh.prefix, lh.value, lh.suffix); + auto right = std::tie(rh.type, rh.modifier, rh.prefix, rh.value, rh.suffix); + if (left < right) + return -1; + else if (left == right) + return 0; + else + return 1; +} + +// Utility method to compare two part lists. +int ComparePartList(const std::vector<liburlpattern::Part>& lh, + const std::vector<liburlpattern::Part>& rh) { + using liburlpattern::Modifier; + using liburlpattern::Part; + using liburlpattern::PartType; + + // Begin by comparing each Part in the lists with each other. If any + // are not equal, then we are done. + size_t i = 0; + for (; i < lh.size() && i < rh.size(); ++i) { + int r = ComparePart(lh[i], rh[i]); + if (r) + return r; + } + + // We reached the end of at least one of the lists without finding a + // difference. However, we must handle the case where one list is longer + // than the other. In this case we compare the next Part from the + // longer list to a synthetically created empty kFixed Part. This is + // necessary in order for "/foo/" to be considered more restrictive, and + // therefore greater, than "/foo/*". 
+ if (i == lh.size() && i != rh.size()) + return ComparePart(Part(PartType::kFixed, "", Modifier::kNone), rh[i]); + else if (i != lh.size() && i == rh.size()) + return ComparePart(lh[i], Part(PartType::kFixed, "", Modifier::kNone)); + + // No differences were found, so declare them equal. + return 0; +} + +} // anonymous namespace + +// static +Component* Component::Compile(const String& pattern, + Type type, + Component* protocol_component, + ExceptionState& exception_state) { + // If the pattern is null then return a special Component object that matches + // any input as if the pattern was `*`. + if (pattern.IsNull()) { + return MakeGarbageCollected<Component>(type, base::PassKey<Component>()); + } + + const liburlpattern::Options& options = GetOptions(type); + + // Parse the pattern. + StringUTF8Adaptor utf8(pattern); + auto parse_result = liburlpattern::Parse( + absl::string_view(utf8.data(), utf8.size()), + GetEncodeCallback(type, protocol_component), options); + if (!parse_result.ok()) { + exception_state.ThrowTypeError( + "Invalid " + TypeToString(type) + " pattern '" + pattern + "'. " + + String::FromUTF8(parse_result.status().message().data(), + parse_result.status().message().size())); + return nullptr; + } + + // Extract a regular expression string from the parsed pattern. + std::vector<std::string> name_list; + std::string regexp_string = + parse_result.value().GenerateRegexString(&name_list); + + // Compile the regular expression to verify it is valid. + auto case_sensitive = options.sensitive ? WTF::kTextCaseSensitive + : WTF::kTextCaseASCIIInsensitive; + DCHECK(base::IsStringASCII(regexp_string)); + ScriptRegexp* regexp = MakeGarbageCollected<ScriptRegexp>( + String(regexp_string.data(), regexp_string.size()), case_sensitive, + kMultilineDisabled, ScriptRegexp::UTF16); + if (!regexp->IsValid()) { + // The regular expression failed to compile. This means that some + // custom regexp group within the pattern is illegal. 
Attempt to + // compile each regexp group individually in order to identify the + // culprit. + for (auto& part : parse_result.value().PartList()) { + if (part.type != liburlpattern::PartType::kRegex) + continue; + DCHECK(base::IsStringASCII(part.value)); + String group_value(part.value.data(), part.value.size()); + regexp = MakeGarbageCollected<ScriptRegexp>( + group_value, case_sensitive, kMultilineDisabled, ScriptRegexp::UTF16); + if (regexp->IsValid()) + continue; + exception_state.ThrowTypeError("Invalid " + TypeToString(type) + + " pattern '" + pattern + + "'. Custom regular expression group '" + + group_value + "' is invalid."); + return nullptr; + } + // We couldn't find a bad regexp group, but we still have an overall + // error. This shouldn't happen, but we handle it anyway. + exception_state.ThrowTypeError("Invalid " + TypeToString(type) + + " pattern '" + pattern + + "'. An unexpected error has occurred."); + return nullptr; + } + + Vector<String> wtf_name_list; + wtf_name_list.ReserveInitialCapacity( + static_cast<wtf_size_t>(name_list.size())); + for (const auto& name : name_list) { + wtf_name_list.push_back(String::FromUTF8(name.data(), name.size())); + } + + return MakeGarbageCollected<Component>( + type, std::move(parse_result.value()), std::move(regexp), + std::move(wtf_name_list), base::PassKey<Component>()); +} + +// static +int Component::Compare(const Component& lh, const Component& rh) { + using liburlpattern::Modifier; + using liburlpattern::Part; + using liburlpattern::PartType; + + // If both the left and right components are empty wildcards, then they are + // effectively equal. + if (!lh.pattern_.has_value() && !rh.pattern_.has_value()) + return 0; + + // If one side has a real pattern and the other side is an empty component, + // then we have to compare to a part list with a single full wildcard. 
+ if (lh.pattern_.has_value() && !rh.pattern_.has_value()) { + return ComparePartList(lh.pattern_->PartList(), GetWildcardOnlyPartList()); + } + + if (!lh.pattern_.has_value() && rh.pattern_.has_value()) { + return ComparePartList(GetWildcardOnlyPartList(), rh.pattern_->PartList()); + } + + // Otherwise compare the part lists of the patterns on each side. + return ComparePartList(lh.pattern_->PartList(), rh.pattern_->PartList()); +} + +Component::Component(Type type, + liburlpattern::Pattern pattern, + ScriptRegexp* regexp, + Vector<String> name_list, + base::PassKey<Component> key) + : type_(type), + pattern_(std::move(pattern)), + regexp_(regexp), + name_list_(std::move(name_list)) {} + +Component::Component(Type type, base::PassKey<Component> key) + : type_(type), name_list_({"0"}) {} + +bool Component::Match(StringView input, Vector<String>* group_list) const { + if (regexp_) { + return regexp_->Match(input, /*start_from=*/0, /*match_length=*/nullptr, + group_list) == 0; + } else { + if (group_list) + group_list->push_back(input.ToString()); + return true; + } +} + +String Component::GeneratePatternString() const { + if (pattern_.has_value()) + return String::FromUTF8(pattern_->GeneratePatternString()); + else + return "*"; +} + +Vector<std::pair<String, String>> Component::MakeGroupList( + const Vector<String>& group_values) const { + DCHECK_EQ(name_list_.size(), group_values.size()); + Vector<std::pair<String, String>> result; + result.ReserveInitialCapacity(group_values.size()); + for (wtf_size_t i = 0; i < group_values.size(); ++i) { + result.emplace_back(name_list_[i], group_values[i]); + } + return result; +} + +bool Component::ShouldTreatAsStandardURL() const { + DCHECK(type_ == Type::kProtocol); + if (!pattern_.has_value()) + return true; + const auto protocol_matches = [&](const std::string& scheme) { + DCHECK(base::IsStringASCII(scheme)); + return Match( + StringView(scheme.data(), static_cast<unsigned>(scheme.size())), + /*group_list=*/nullptr); + }; 
+ return base::ranges::any_of(url::GetStandardSchemes(), protocol_matches); +} + +void Component::Trace(Visitor* visitor) const { + visitor->Trace(regexp_); +} + +} // namespace url_pattern +} // namespace blink diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_component.h b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_component.h new file mode 100644 index 00000000000..31ca11e1f62 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_component.h @@ -0,0 +1,113 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_COMPONENT_H_ +#define THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_COMPONENT_H_ + +#include "base/types/pass_key.h" +#include "third_party/abseil-cpp/absl/types/optional.h" +#include "third_party/blink/renderer/bindings/core/v8/script_regexp.h" +#include "third_party/blink/renderer/platform/heap/heap.h" +#include "third_party/blink/renderer/platform/heap/member.h" +#include "third_party/blink/renderer/platform/heap/trace_traits.h" +#include "third_party/blink/renderer/platform/wtf/vector.h" +#include "third_party/liburlpattern/parse.h" +#include "third_party/liburlpattern/pattern.h" + +namespace blink { + +class ExceptionState; + +namespace url_pattern { + +// A class representing all the information needed to match a particular +// component of a URL. +class Component final : public GarbageCollected<Component> { + public: + // Enumeration defining the different types of components. Each component + // type uses a slightly different kind of character encoding. In addition, + // different component types use different liburlpattern parse options. 
+ enum class Type { + kProtocol, + kUsername, + kPassword, + kHostname, + kPort, + kPathname, + kSearch, + kHash, + }; + + // A utility function that takes a given `pattern` and compiles it into a + // Component structure. If the `pattern` is null then an empty Component + // that matches any input, as if the pattern were `*`, is returned. If the + // `pattern` is invalid then an exception is thrown and nullptr is + // returned. The `type` specifies which URL component the pattern is being + // compiled for. This will select the correct encoding callback, + // liburlpattern options, and populate error messages with the correct + // component string. The `protocol_component` must be non-null when + // compiling a non-null kPathname pattern. + static Component* Compile(const String& pattern, + Type type, + Component* protocol_component, + ExceptionState& exception_state); + + // Compare the pattern strings in the two given components. This provides a + // mostly lexicographical ordering based on fixed text in the patterns. + // Matching groups and modifiers are treated such that more restrictive + // patterns are greater in value. Group names are not considered in the + // comparison. + static int Compare(const Component& lh, const Component& rh); + + // Constructs a Component with a real `pattern` that compiled to the given + // `regexp`. + Component(Type type, + liburlpattern::Pattern pattern, + ScriptRegexp* regexp, + Vector<String> name_list, + base::PassKey<Component> key); + + // Constructs an empty Component that matches any input as if it had the + // pattern `*`. + Component(Type type, base::PassKey<Component> key); + + // Match the given `input` against the component pattern. Returns `true` + // if there is a match. If `group_list` is not nullptr, then it will be + // populated with group values captured by the pattern. + bool Match(StringView input, Vector<String>* group_list) const; + + // Convert the compiled component pattern back into a pattern string. 
This + // will be functionally equivalent to the original, but may differ based on + // canonicalization that occurred during parsing. + String GeneratePatternString() const; + + // Combines the given list of group values with the group names specified in + // the original pattern. The return result is a vector of name:value tuples. + Vector<std::pair<String, String>> MakeGroupList( + const Vector<String>& group_values) const; + + // Method to determine if the URL associated with this component should be + // treated as a "standard" URL like `https://foo` vs a "path" URL like + // `data:foo`. This should only be called for kProtocol components. + bool ShouldTreatAsStandardURL() const; + + void Trace(Visitor* visitor) const; + + private: + const Type type_; + + // The parsed pattern. + const absl::optional<liburlpattern::Pattern> pattern_; + + // The pattern compiled down to a js regular expression. + const Member<ScriptRegexp> regexp_; + + // The names to be applied to the regular expression capture groups. Note, + // liburlpattern regular expressions do not use named capture groups directly. + const Vector<String> name_list_; +}; + +} // namespace url_pattern +} // namespace blink + +#endif // THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_COMPONENT_H_ diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_fuzzer.cc b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_fuzzer.cc new file mode 100644 index 00000000000..9760a6d6039 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_fuzzer.cc @@ -0,0 +1,29 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "third_party/blink/renderer/bindings/modules/v8/v8_union_urlpatterninit_usvstring.h" +#include "third_party/blink/renderer/modules/url_pattern/url_pattern.h" +#include "third_party/blink/renderer/platform/bindings/exception_state.h" +#include "third_party/blink/renderer/platform/bindings/v8_per_isolate_data.h" +#include "third_party/blink/renderer/platform/testing/blink_fuzzer_test_support.h" +#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" + +namespace blink { + +int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + static BlinkFuzzerTestSupport test_support = BlinkFuzzerTestSupport(); + DummyExceptionStateForTesting exception_state; + auto* input = MakeGarbageCollected<V8URLPatternInput>( + String::FromUTF8(reinterpret_cast<const char*>(data), size)); + URLPattern::Create(input, exception_state); + V8PerIsolateData::MainThreadIsolate()->RequestGarbageCollectionForTesting( + v8::Isolate::kFullGarbageCollection); + return 0; +} + +} // namespace blink + +// Unmangled entry point with the fuzzer signature. It only forwards the raw +// input bytes to blink::LLVMFuzzerTestOneInput() above. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + return blink::LLVMFuzzerTestOneInput(data, size); +} diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_parser.cc b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_parser.cc new file mode 100644 index 00000000000..22a34bb3b0d --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_parser.cc @@ -0,0 +1,453 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_parser.h" + +#include "base/notreached.h" +#include "third_party/blink/renderer/bindings/modules/v8/v8_url_pattern_init.h" +#include "third_party/blink/renderer/modules/url_pattern/url_pattern_component.h" +#include "third_party/blink/renderer/platform/bindings/exception_state.h" +#include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h" +#include "third_party/liburlpattern/tokenize.h" + +namespace blink { +namespace url_pattern { + +Parser::Parser(const String& input) : input_(input), utf8_(input) {} + +void Parser::Parse(ExceptionState& exception_state) { + DCHECK_EQ(state_, StringParseState::kInit); + DCHECK_EQ(token_index_, 0u); + + auto tokenize_result = + liburlpattern::Tokenize(absl::string_view(utf8_.data(), utf8_.size()), + liburlpattern::TokenizePolicy::kLenient); + if (!tokenize_result.ok()) { + // This should not happen with kLenient mode, but we handle it anyway. + exception_state.ThrowTypeError("Invalid input string '" + input_ + + "'. It unexpectedly fails to tokenize."); + return; + } + + token_list_ = std::move(tokenize_result.value()); + result_ = MakeGarbageCollected<URLPatternInit>(); + + // When constructing a pattern using structured input like + // `new URLPattern({ pathname: 'foo' })` any missing components will be + // defaulted to wildcards. In the constructor string case, however, all + // components are precisely defined as either empty string or a longer + // value. This is due to there being no way to simply "leave out" a + // component when writing a URL. The behavior also matches the URL + // constructor. + // + // To implement this we initialize components to the empty string in advance. + // + // We can't, however, do this immediately. We want to allow the baseURL to + // provide information for relative URLs, so we only want to set the default + // empty string values for components following the first component in the + // relative URL.
+ // + // We therefore wait to set the default component values until after we exit + // the kInit state and have determined if we are in relative or absolute mode. + + // Iterate through the list of tokens and update our state machine as we go. + for (; token_index_ < token_list_.size(); token_index_ += token_increment_) { + // Reset back to our default `token_increment_` value. + token_increment_ = 1; + + // All states must respect the end of the token list. The liburlpattern + // tokenizer guarantees that the last token will have the type `kEnd`. + if (token_list_[token_index_].type == liburlpattern::TokenType::kEnd) { + // If we failed to find a protocol terminator then we are still in + // relative mode. We now need to determine the first component of the + // relative URL. + if (state_ == StringParseState::kInit) { + // Reset back to the start of the input string. + Rewind(); + + // If the string begins with `?` then it's a relative search component. + // If it starts with `#` then it's a relative hash component. Otherwise + // it's a relative pathname. + // + // In each case we initialize any components following the initial + // component to be empty string. + if (IsHashPrefix()) { + ChangeState(StringParseState::kHash, Skip(1)); + } else if (IsSearchPrefix()) { + ChangeState(StringParseState::kSearch, Skip(1)); + result_->setHash(g_empty_string); + } else { + ChangeState(StringParseState::kPathname, Skip(0)); + result_->setSearch(g_empty_string); + result_->setHash(g_empty_string); + } + continue; + } + + // If we failed to find an `@`, then there is no username and password. + // We should rewind and process the data as a hostname. + else if (state_ == StringParseState::kAuthority) { + RewindAndSetState(StringParseState::kHostname); + continue; + } + + ChangeState(StringParseState::kDone, Skip(0)); + break; + } + + // In addition, all states must handle pattern groups. We do not permit + // a component to end in the middle of a pattern group.
Therefore we skip + // past any tokens that are within `{` and `}`. Note, the tokenizer + // handles grouping `(` and `)` and `:foo` groups for us automatically, so + // we don't need special code for them here. + if (group_depth_ > 0) { + if (IsGroupClose()) + group_depth_ -= 1; + else + continue; + } + + if (IsGroupOpen()) { + group_depth_ += 1; + continue; + } + + switch (state_) { + case StringParseState::kInit: + if (IsProtocolSuffix()) { + // We are in absolute mode and we know values will not be inherited + // from a base URL. Therefore initialize the rest of the components + // to the empty string. + result_->setUsername(g_empty_string); + result_->setPassword(g_empty_string); + result_->setHostname(g_empty_string); + result_->setPort(g_empty_string); + result_->setPathname(g_empty_string); + result_->setSearch(g_empty_string); + result_->setHash(g_empty_string); + + // Update the state to expect the start of an absolute URL. + RewindAndSetState(StringParseState::kProtocol); + } + break; + + case StringParseState::kProtocol: + // If we find the end of the protocol component... + if (IsProtocolSuffix()) { + // First we eagerly compile the protocol pattern and use it to + // compute if this entire URLPattern should be treated as a + // "standard" URL. If any of the special schemes, like `https`, + // match the protocol pattern then we treat it as standard. + ComputeShouldTreatAsStandardURL(exception_state); + if (exception_state.HadException()) + return; + + // Standard URLs default to `/` for the pathname. + if (should_treat_as_standard_url_) + result_->setPathname("/"); + + // By default we treat this as a "cannot-be-a-base-URL" or what chrome + // calls a "path" URL. In this case we go straight to the pathname + // component. The hostname and port are left with their default + // empty string values.
+ StringParseState next_state = StringParseState::kPathname; + Skip skip = Skip(1); + + // If there are authority slashes, like `https://`, then + // we must transition to the authority section of the URLPattern. + if (NextIsAuthoritySlashes()) { + next_state = StringParseState::kAuthority; + skip = Skip(3); + } + + // If there are no authority slashes, but the protocol is special + // then we still go to the authority section as this is a "standard" + // URL. This differs from the above case since we don't need to skip + // the extra slashes. + else if (should_treat_as_standard_url_) { + next_state = StringParseState::kAuthority; + } + + ChangeState(next_state, skip); + } + break; + + case StringParseState::kAuthority: + // Before going to the hostname state we must see if there is an + // identity of the form: + // + // <username>:<password>@<hostname> + // + // We check for this by looking for the `@` character. The username + // and password are themselves each optional, so the `:` may not be + // present. If we see the `@` we just go to the username state + // and let it proceed until it hits either the password separator + // or the `@` terminator. + if (IsIdentityTerminator()) + RewindAndSetState(StringParseState::kUsername); + + // Stop searching for the `@` character if we see the beginning + // of the pathname, search, or hash components. + else if (IsPathnameStart() || IsSearchPrefix() || IsHashPrefix()) + RewindAndSetState(StringParseState::kHostname); + break; + + case StringParseState::kUsername: + // If we find a `:` then transition to the password component state. + if (IsPasswordPrefix()) + ChangeState(StringParseState::kPassword, Skip(1)); + + // If we find a `@` then transition to the hostname component state. + else if (IsIdentityTerminator()) + ChangeState(StringParseState::kHostname, Skip(1)); + break; + + case StringParseState::kPassword: + // If we find a `@` then transition to the hostname component state.
+ if (IsIdentityTerminator()) + ChangeState(StringParseState::kHostname, Skip(1)); + break; + + case StringParseState::kHostname: + // If we find a `:` then we transition to the port component state. + if (IsPortPrefix()) + ChangeState(StringParseState::kPort, Skip(1)); + + // If we find a `/` then we transition to the pathname component state. + else if (IsPathnameStart()) + ChangeState(StringParseState::kPathname, Skip(0)); + + // If we find a `?` then we transition to the search component state. + else if (IsSearchPrefix()) + ChangeState(StringParseState::kSearch, Skip(1)); + + // If we find a `#` then we transition to the hash component state. + else if (IsHashPrefix()) + ChangeState(StringParseState::kHash, Skip(1)); + break; + + case StringParseState::kPort: + // If we find a `/` then we transition to the pathname component state. + if (IsPathnameStart()) + ChangeState(StringParseState::kPathname, Skip(0)); + // If we find a `?` then we transition to the search component state. + else if (IsSearchPrefix()) + ChangeState(StringParseState::kSearch, Skip(1)); + // If we find a `#` then we transition to the hash component state. + else if (IsHashPrefix()) + ChangeState(StringParseState::kHash, Skip(1)); + break; + case StringParseState::kPathname: + // If we find a `?` then we transition to the search component state. + if (IsSearchPrefix()) + ChangeState(StringParseState::kSearch, Skip(1)); + // If we find a `#` then we transition to the hash component state. + else if (IsHashPrefix()) + ChangeState(StringParseState::kHash, Skip(1)); + break; + case StringParseState::kSearch: + // If we find a `#` then we transition to the hash component state. + if (IsHashPrefix()) + ChangeState(StringParseState::kHash, Skip(1)); + break; + case StringParseState::kHash: + // Nothing to do here as we are just looking for the end.
+ break; + case StringParseState::kDone: + NOTREACHED(); + break; + }; + } +} + +void Parser::ChangeState(StringParseState new_state, Skip skip) { + // First we convert the tokens between `component_start_` and `token_index_` + // into a component pattern string. This is stored in the appropriate result + // property based on the current `state_`. + switch (state_) { + case StringParseState::kInit: + // No component to set when transitioning from this state. + break; + case StringParseState::kProtocol: + result_->setProtocol(MakeComponentString()); + break; + case StringParseState::kAuthority: + // No component to set when transitioning from this state. + break; + case StringParseState::kUsername: + result_->setUsername(MakeComponentString()); + break; + case StringParseState::kPassword: + result_->setPassword(MakeComponentString()); + break; + case StringParseState::kHostname: + result_->setHostname(MakeComponentString()); + break; + case StringParseState::kPort: + result_->setPort(MakeComponentString()); + break; + case StringParseState::kPathname: + result_->setPathname(MakeComponentString()); + break; + case StringParseState::kSearch: + result_->setSearch(MakeComponentString()); + break; + case StringParseState::kHash: + result_->setHash(MakeComponentString()); + break; + case StringParseState::kDone: + NOTREACHED(); + break; + } + + ChangeStateWithoutSettingComponent(new_state, skip); +} + +void Parser::ChangeStateWithoutSettingComponent(StringParseState new_state, + Skip skip) { + state_ = new_state; + + // Now update `component_start_` to point to the new component. The `skip` + // argument tells us how many tokens to ignore to get to the next start. + component_start_ = token_index_ + skip.value(); + + // Next, move the `token_index_` so that the top of the loop will begin + // parsing the new component. We adjust the `token_increment_` down to + // zero as the skip value already takes into account moving to the start + // of the next component.
+ token_index_ += skip.value(); + token_increment_ = 0; +} + +void Parser::Rewind() { + token_index_ = component_start_; + token_increment_ = 0; +} + +void Parser::RewindAndSetState(StringParseState new_state) { + Rewind(); + state_ = new_state; +} + +const liburlpattern::Token& Parser::SafeToken(size_t index) const { + if (index < token_list_.size()) + return token_list_[index]; + DCHECK(!token_list_.empty()); + DCHECK(token_list_.back().type == liburlpattern::TokenType::kEnd); + return token_list_.back(); +} + +bool Parser::IsNonSpecialPatternChar(size_t index, const char* value) const { + const liburlpattern::Token& token = SafeToken(index); + return token.value == value && + (token.type == liburlpattern::TokenType::kChar || + token.type == liburlpattern::TokenType::kEscapedChar || + token.type == liburlpattern::TokenType::kInvalidChar); +} + +bool Parser::IsProtocolSuffix() const { + return IsNonSpecialPatternChar(token_index_, ":"); +} + +bool Parser::NextIsAuthoritySlashes() const { + return IsNonSpecialPatternChar(token_index_ + 1, "/") && + IsNonSpecialPatternChar(token_index_ + 2, "/"); +} + +bool Parser::IsIdentityTerminator() const { + return IsNonSpecialPatternChar(token_index_, "@"); +} + +bool Parser::IsPasswordPrefix() const { + return IsNonSpecialPatternChar(token_index_, ":"); +} + +bool Parser::IsPortPrefix() const { + return IsNonSpecialPatternChar(token_index_, ":"); +} + +bool Parser::IsPathnameStart() const { + return IsNonSpecialPatternChar(token_index_, "/"); +} + +bool Parser::IsSearchPrefix() const { + if (IsNonSpecialPatternChar(token_index_, "?")) + return true; + + if (token_list_[token_index_].value != "?") + return false; + + // If we have a "?" that is not a normal character, then it must be an + // optional group modifier. + DCHECK_EQ(SafeToken(token_index_).type, + liburlpattern::TokenType::kOtherModifier); + + // We have a `?` tokenized as a modifier.
We only want to treat this as + // the search prefix if it would not normally be valid in a liburlpattern + // string. A modifier must follow a matching group. Therefore we inspect + // the preceding token to see if the `?` is immediately following a group + // construct. + // + // So if the string is: + // + // https://example.com/foo?bar + // + // Then we return true because the previous token is a `o` with type kChar. + // For the string: + // + // https://example.com/:name?bar + // + // Then we return false because the previous token is `:name` with type + // kName. If the developer intended this to be a search prefix then they + // would need to escape the question mark like `:name\\?bar`. + // + // Note, if `token_index_` is zero the index will wrap around and + // `SafeToken()` will return the kEnd token. This will correctly return true + // from this method as a pattern cannot normally begin with an unescaped `?`. + const auto& previous_token = SafeToken(token_index_ - 1); + return previous_token.type != liburlpattern::TokenType::kName && + previous_token.type != liburlpattern::TokenType::kRegex && + previous_token.type != liburlpattern::TokenType::kClose && + previous_token.type != liburlpattern::TokenType::kAsterisk; +} + +bool Parser::IsHashPrefix() const { + return IsNonSpecialPatternChar(token_index_, "#"); +} + +bool Parser::IsGroupOpen() const { + return token_list_[token_index_].type == liburlpattern::TokenType::kOpen; +} + +bool Parser::IsGroupClose() const { + return token_list_[token_index_].type == liburlpattern::TokenType::kClose; +} + +String Parser::MakeComponentString() const { + DCHECK_LT(token_index_, token_list_.size()); + const auto& token = token_list_[token_index_]; + + size_t component_char_start = SafeToken(component_start_).index; + + DCHECK_LE(component_char_start, utf8_.size()); + DCHECK_GE(token.index, component_char_start); + DCHECK(token.index < utf8_.size() || + (token.index == utf8_.size() && + token.type ==
liburlpattern::TokenType::kEnd)); + + return String::FromUTF8(utf8_.data() + component_char_start, + token.index - component_char_start); +} + +void Parser::ComputeShouldTreatAsStandardURL(ExceptionState& exception_state) { + DCHECK_EQ(state_, StringParseState::kProtocol); + protocol_component_ = + Component::Compile(MakeComponentString(), Component::Type::kProtocol, + /*protocol_component=*/nullptr, exception_state); + if (protocol_component_ && protocol_component_->ShouldTreatAsStandardURL()) + should_treat_as_standard_url_ = true; +} + +} // namespace url_pattern +} // namespace blink diff --git a/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_parser.h b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_parser.h new file mode 100644 index 00000000000..c3392018b79 --- /dev/null +++ b/chromium/third_party/blink/renderer/modules/url_pattern/url_pattern_parser.h @@ -0,0 +1,192 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_PARSER_H_ +#define THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_PARSER_H_ + +#include <vector> + +#include "base/types/strong_alias.h" +#include "third_party/blink/renderer/platform/wtf/allocator/allocator.h" +#include "third_party/blink/renderer/platform/wtf/text/string_utf8_adaptor.h" +#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h" + +namespace liburlpattern { +struct Token; +} // namespace liburlpattern + +namespace blink { + +class ExceptionState; +class URLPatternInit; + +namespace url_pattern { + +class Component; + +// A helper class to parse the first string passed to the URLPattern +// constructor. In general the parser works by using the liburlpattern +// tokenizer to first split up the input into pattern tokens.
It can +// then look through the tokens to find non-special characters that match +// the different URL component separators. Each component is then split +// off and stored in a `URLPatternInit` object that can be accessed via +// `GetResult()`. The intent is that this init object should then be +// processed as if it was passed into the constructor itself. +class Parser final { + STACK_ALLOCATED(); + + public: + explicit Parser(const String& input); + + // Attempt to parse the input string used to construct the Parser object. + // This method may only be called once. Any errors will be thrown on the + // given `exception_state`. Retrieve the parse result by calling + // `GetResult()`. A protocol component will also be eagerly compiled for + // absolute pattern strings. It is not compiled for relative pattern strings. + // The compiled protocol Component can be accessed by calling + // `GetProtocolComponent()`. + void Parse(ExceptionState& exception_state); + + // Return the parse result. Should only be called after `Parse()` succeeds. + URLPatternInit* GetResult() const { return result_; } + + // Return the protocol component if it was compiled as part of parsing the + // input string. This should only be called after `Parse()` succeeds. + // This will return nullptr if the input was a relative pattern string. + Component* GetProtocolComponent() const { return protocol_component_; } + + private: + enum class StringParseState { + kInit, + kProtocol, + kAuthority, + kUsername, + kPassword, + kHostname, + kPort, + kPathname, + kSearch, + kHash, + kDone, + }; + + using Skip = base::StrongAlias<class SkipTag, int>; + + // A utility function to move from the current `state_` to `new_state`. This + // method will populate the component string in `result_` corresponding to the + // current `state_` automatically.
It will also set `component_start_` and + // `token_index_` to point to the first token of the next section based on how + // many tokens the `skip` argument indicates should be ignored. + void ChangeState(StringParseState new_state, Skip skip); + + // A utility function to move to `new_state`. This is like `ChangeState()`, + // but does not automatically set the component string for the current state. + void ChangeStateWithoutSettingComponent(StringParseState new_state, + Skip skip); + + // Rewind the `token_index_` back to the current `component_start_`. + void Rewind(); + + // Like `Rewind()`, but also sets the state. This is used for cases where + // the parser needs to "look ahead" to determine what parse state to enter. + void RewindAndSetState(StringParseState new_state); + + // Attempt to access the Token at the given `index`. If the `index` is out + // of bounds for the `token_list_`, then the last Token in the list is + // returned. This will always be a `TokenType::kEnd` token. + const liburlpattern::Token& SafeToken(size_t index) const; + + // Returns true if the token at the given `index` is not a special pattern + // character and if it matches the given `value`. This simply checks that the + // token type is kChar, kEscapedChar, or kInvalidChar. + bool IsNonSpecialPatternChar(size_t index, const char* value) const; + + // Returns true if the current token is the protocol component suffix; + // e.g. ':'. + bool IsProtocolSuffix() const; + + // Returns true if the next two tokens are slashes; e.g. `//`. + bool NextIsAuthoritySlashes() const; + + // Returns true if the current token is the `@` character used + // to separate username and password from the hostname. + bool IsIdentityTerminator() const; + + // Returns true if the current token is the password prefix; e.g. `:`. + bool IsPasswordPrefix() const; + + // Returns true if the current token is the port prefix; e.g. `:`.
+ bool IsPortPrefix() const; + + // Returns true if the current token is the start of the pathname; e.g. `/`. + bool IsPathnameStart() const; + + // Returns true if the current token is the search component prefix; e.g. `?`. + // This also takes into account if this could be a valid pattern modifier by + // looking at the preceding tokens. + bool IsSearchPrefix() const; + + // Returns true if the current token is the hash component prefix; e.g. `#`. + bool IsHashPrefix() const; + + // These methods indicate if the current token is opening or closing a pattern + // grouping; e.g. `{` or `}`. + bool IsGroupOpen() const; + bool IsGroupClose() const; + + // This method returns a String consisting of the tokens between + // `component_start_` and the current `token_index_`. + String MakeComponentString() const; + + // Compiles the protocol component into `protocol_component_` and sets + // `should_treat_as_standard_url_` if it should be treated as a "standard + // URL". These URLs automatically append a `/` for the pathname if one is not + // specified. The `exception_state` is passed through to the compile step. + void ComputeShouldTreatAsStandardURL(ExceptionState& exception_state); + + // The input string to the parser. + const String input_; + + // UTF8 representation of `input_`. + const StringUTF8Adaptor utf8_; + + // As we parse the input string we populate a `URLPatternInit` dictionary + // with each component pattern. This is then the final result of the parse. + URLPatternInit* result_ = nullptr; + + // The compiled Component for the protocol. This is generated for absolute + // strings where we need to determine if the value should be treated as + // a "standard" URL. + Component* protocol_component_ = nullptr; + + // The list of Tokens produced by calling `liburlpattern::Tokenize()` on + // `input_`. + std::vector<liburlpattern::Token> token_list_; + + // The index of the first Token to include in the component string. + size_t component_start_ = 0; + + // The index of the current Token being considered.
+ size_t token_index_ = 0; + + // The value to add to `token_index_` on each turn through the parse + // loop. While typically this is `1`, it is also set to `0` at times for + // things like state transitions, etc. It is automatically reset back to + // `1` at the top of the parse loop. + size_t token_increment_ = 1; + + // The current nesting depth of `{ }` pattern groupings. + int group_depth_ = 0; + + // The current parse state. This should only be changed via `ChangeState()` + // or `RewindAndSetState()`. + StringParseState state_ = StringParseState::kInit; + + // True if we should apply parse rules as if this is a "standard" URL. If + // false then this is treated as a "not a base URL" or "path" URL. + bool should_treat_as_standard_url_ = false; +}; + +} // namespace url_pattern +} // namespace blink + +#endif // THIRD_PARTY_BLINK_RENDERER_MODULES_URL_PATTERN_URL_PATTERN_PARSER_H_ |