diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2022-02-04 17:20:24 +0100 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2022-02-12 08:15:25 +0000 |
commit | 8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b (patch) | |
tree | 788d8d7549712682703a0310ca4a0f0860d4802b /chromium/third_party/liburlpattern | |
parent | 606d85f2a5386472314d39923da28c70c60dc8e7 (diff) | |
download | qtwebengine-chromium-8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b.tar.gz |
BASELINE: Update Chromium to 98.0.4758.90
Change-Id: Ib7c41539bf8a8e0376bd639f27d68294de90f3c8
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/liburlpattern')
-rw-r--r-- | chromium/third_party/liburlpattern/BUILD.gn | 17 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/README.chromium | 1 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/parse.h | 8 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/parse_fuzzer.cc | 74 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/pattern.cc | 70 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/pattern.h | 4 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/pattern_unittest.cc | 53 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/tokenize.cc | 72 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/tokenize.h | 7 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/tokenize_unittest.cc | 38 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/utils.cc | 18 | ||||
-rw-r--r-- | chromium/third_party/liburlpattern/utils.h | 7 |
12 files changed, 314 insertions, 55 deletions
diff --git a/chromium/third_party/liburlpattern/BUILD.gn b/chromium/third_party/liburlpattern/BUILD.gn index e3af29d6036..00236696212 100644 --- a/chromium/third_party/liburlpattern/BUILD.gn +++ b/chromium/third_party/liburlpattern/BUILD.gn @@ -2,8 +2,13 @@ # Use of this source code is governed by an MIT-style license that can be # found in the LICENSE file or at https://opensource.org/licenses/MIT. +import("//testing/libfuzzer/fuzzer_test.gni") import("//testing/test.gni") +config("warnings") { + cflags = [ "-Wno-shadow" ] +} + component("liburlpattern") { defines = [ "IS_LIBURLPATTERN_IMPL" ] deps = [ @@ -12,6 +17,8 @@ component("liburlpattern") { "//third_party/icu:icu", ] + configs += [ ":warnings" ] + # Note, also update the local modifications in README.chromium. sources = [ "options.h", @@ -32,6 +39,7 @@ test("liburlpattern_unittests") { "//base/test:run_all_unittests", "//testing/gtest", "//third_party/abseil-cpp:absl", + "//third_party/icu:icu", ] # Note, also update the local modifications in README.chromium. @@ -43,3 +51,12 @@ test("liburlpattern_unittests") { ] testonly = true } + +fuzzer_test("liburlpattern_fuzzer") { + sources = [ "parse_fuzzer.cc" ] + deps = [ + ":liburlpattern", + "//base", + "//third_party/abseil-cpp:absl", + ] +} diff --git a/chromium/third_party/liburlpattern/README.chromium b/chromium/third_party/liburlpattern/README.chromium index 25629bf64b0..74b13056e75 100644 --- a/chromium/third_party/liburlpattern/README.chromium +++ b/chromium/third_party/liburlpattern/README.chromium @@ -22,6 +22,7 @@ third_party/liburlpattern/OWNERS third_party/liburlpattern/options.h third_party/liburlpattern/parse.cc third_party/liburlpattern/parse.h +third_party/liburlpattern/parse_fuzzer.cc third_party/liburlpattern/parse_unittest.cc third_party/liburlpattern/pattern.cc third_party/liburlpattern/pattern.h diff --git a/chromium/third_party/liburlpattern/parse.h b/chromium/third_party/liburlpattern/parse.h index add6eec22f3..9af64518cf4 100644 --- a/chromium/third_party/liburlpattern/parse.h +++ b/chromium/third_party/liburlpattern/parse.h @@ -29,10 +29,10 @@ class Pattern; typedef std::function<absl::StatusOr<std::string>(absl::string_view)> EncodeCallback; -// Parse a pattern string and return the result. The input |pattern| must -// consist of UTF-8 characters. Currently only group names may actually -// contain non-ASCII characters, however. Unicode characters in other parts -// of the pattern will cause an error to be returned. A |callback| must be +// Parse a pattern string and return the result. The parse will fail if the +// input |pattern| is not valid UTF-8. Currently only group names may actually +// contain non-ASCII characters, however. Unicode characters in other parts of +// the pattern will cause an error to be returned. A |callback| must be // provided to validate and encode plain text parts of the pattern. An // |options| value may be provided to override default behavior. COMPONENT_EXPORT(LIBURLPATTERN) diff --git a/chromium/third_party/liburlpattern/parse_fuzzer.cc b/chromium/third_party/liburlpattern/parse_fuzzer.cc new file mode 100644 index 00000000000..802c705f483 --- /dev/null +++ b/chromium/third_party/liburlpattern/parse_fuzzer.cc @@ -0,0 +1,74 @@ +// Copyright 2021 The Chromium Authors. All rights reserved. +// Use of this source code is governed by an MIT-style license that can be +// found in the LICENSE file. + +#include <stddef.h> +#include <stdint.h> + +#include <string> + +#include "base/check.h" +#include "base/check_op.h" +#include "base/containers/span.h" +#include "base/logging.h" +#include "base/strings/strcat.h" +#include "third_party/abseil-cpp/absl/status/statusor.h" +#include "third_party/abseil-cpp/absl/strings/str_format.h" +#include "third_party/abseil-cpp/absl/strings/string_view.h" +#include "third_party/abseil-cpp/absl/types/optional.h" +#include "third_party/liburlpattern/parse.h" +#include "third_party/liburlpattern/pattern.h" + +namespace liburlpattern { +namespace { +absl::StatusOr<std::string> PassThrough(absl::string_view input) { + return std::string(input); +} + +absl::optional<std::string> ParseAndCanonicalize(absl::string_view s) { + absl::StatusOr<Pattern> pattern = Parse(s, &PassThrough); + if (!pattern.ok()) { + LOG(INFO) << "Parse failed with status: " << pattern.status(); + return absl::nullopt; + } + return pattern->GeneratePatternString(); +} + +std::string FancyHexDump(base::StringPiece label, base::StringPiece data) { + std::string char_line, hex_line; + for (char c : data) { + char_line.append(absl::StrFormat("%4c", c)); + hex_line.append(absl::StrFormat("%4x", c)); + } + return base::StrCat({label, "\n", char_line, "\n", hex_line}); +} + +struct Environment { + Environment() { logging::SetMinLogLevel(logging::LOG_INFO); } +}; +} // namespace + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + static Environment env; + + // Make a copy of `data` on the heap to enable ASAN to catch OOB accesses. + std::string pattern_string(reinterpret_cast<const char*>(data), size); + + absl::optional<std::string> canonical = ParseAndCanonicalize(pattern_string); + if (!canonical) + return 0; + + // If `Pattern::GeneratePatternString()` generates canonical strings, + // recanonicalizing one of its outputs should always be a no-op. To test that + // property, let's check that `ParseAndCanonicalize()` is idempotent, i.e. + // that `canonical` is a fixed point of the function. + absl::optional<std::string> canonical2 = ParseAndCanonicalize(*canonical); + CHECK(canonical2) + << "Failed to parse canonical pattern from original input.\n" + << FancyHexDump("original : ", pattern_string) << "\n" + << FancyHexDump("canonical: ", *canonical); + + CHECK_EQ(*canonical, *canonical2); + return 0; +} +} // namespace liburlpattern diff --git a/chromium/third_party/liburlpattern/pattern.cc b/chromium/third_party/liburlpattern/pattern.cc index 9b39bc1f4c3..885417b10bc 100644 --- a/chromium/third_party/liburlpattern/pattern.cc +++ b/chromium/third_party/liburlpattern/pattern.cc @@ -7,6 +7,7 @@ #include "third_party/abseil-cpp/absl/base/macros.h" #include "third_party/abseil-cpp/absl/strings/str_format.h" +#include "third_party/icu/source/common/unicode/utf8.h" #include "third_party/liburlpattern/utils.h" namespace liburlpattern { @@ -73,6 +74,14 @@ Part::Part(PartType t, ABSL_ASSERT(value.empty()); } +bool Part::HasCustomName() const { + // Determine if the part name was custom, like `:foo`, or an + // automatically assigned numeric value. Since custom group + // names follow javascript identifier rules the first character + // cannot be a digit, so that is all we need to check here. + return !name.empty() && !std::isdigit(name[0]); +} + Pattern::Pattern(std::vector<Part> part_list, Options options, std::string segment_wildcard_regex) @@ -93,8 +102,9 @@ std::string Pattern::GeneratePatternString() const { } result.reserve(estimated_length); - for (const Part& part : part_list_) { - // + for (size_t i = 0; i < part_list_.size(); ++i) { + const Part& part = part_list_[i]; + if (part.type == PartType::kFixed) { // A simple fixed string part. if (part.modifier == Modifier::kNone) { @@ -111,20 +121,26 @@ std::string Pattern::GeneratePatternString() const { continue; } - // Determine if the part needs a grouping like `{ ... }`. This is only - // necessary when using a non-automatic prefix or any suffix. + bool custom_name = part.HasCustomName(); + + // Determine if the part needs a grouping like `{ ... }`. This is + // necessary when the group: + // + // 1. is using a non-automatic prefix or any suffix. + // 2. followed by a matching group that may be represented by a + // `(...)` expression. This is necessary to avoid the following `(...)` + // being mistakenly interpretted as the custom regexp for this + // named group; like `:foo(...)`. + const Part* next_part = + (i + 1) < part_list_.size() ? &part_list_[i + 1] : nullptr; bool needs_grouping = !part.suffix.empty() || (!part.prefix.empty() && (part.prefix.size() != 1 || - options_.prefix_list.find(part.prefix[0]) == std::string::npos)); - - // Determine if the part name was custom, like `:foo`, or an - // automatically assigned numeric value. Since custom group - // names follow javascript identifier rules the first character - // cannot be a digit, so that is all we need to check here. - ABSL_ASSERT(!part.name.empty()); - bool custom_name = !std::isdigit(part.name[0]); + options_.prefix_list.find(part.prefix[0]) == std::string::npos)) || + (custom_name && part.modifier == Modifier::kNone && next_part && + next_part->type != PartType::kFixed && next_part->prefix.empty() && + next_part->suffix.empty() && !next_part->HasCustomName()); // This is a full featured part. We must generate a string that looks // like: @@ -157,10 +173,17 @@ std::string Pattern::GeneratePatternString() const { result += ")"; } } else if (part.type == PartType::kFullWildcard) { - // We can only use the `*` wildcard card if the automatic - // numeric name is used for the group. A custom name - // requires the regexp `(.*)` explicitly. - if (!custom_name) { + const Part* last_part = i > 0 ? &part_list_[i - 1] : nullptr; + // We can only use the `*` wildcard card if we meet a number + // of conditions. We must use an explicit `(.*)` group if: + // + // 1. A custom name was used; e.g. `:foo(.*)`. + // 2. If the preceding group is a matching group without a modifier; e.g. + // `(foo)(.*)`. In that case we cannot emit the `*` shorthand without + // it being mistakenly interpreted as the modifier for the previous + // group. + if (!custom_name && (!last_part || last_part->type == PartType::kFixed || + last_part->modifier != Modifier::kNone)) { result += "*"; } else { result += "("; @@ -169,6 +192,21 @@ std::string Pattern::GeneratePatternString() const { } } + // If the matching group is a simple `:foo` custom name with the default + // segment wildcard, then we must check for a trailing suffix that could + // be interpreted as a trailing part of the name itself. In these cases + // we must escape the beginning of the suffix in order to separate it + // from the end of the custom name; e.g. `:foo\\bar` instead of `:foobar`. + if (part.type == PartType::kSegmentWildcard && custom_name && + !part.suffix.empty()) { + UChar32 codepoint = -1; + U8_GET(reinterpret_cast<const uint8_t*>(part.suffix.data()), 0, 0, + static_cast<int>(part.suffix.size()), codepoint); + if (IsNameCodepoint(codepoint, /*first_codepoint=*/false)) { + result += "\\"; + } + } + EscapePatternStringAndAppend(part.suffix, result); if (needs_grouping) diff --git a/chromium/third_party/liburlpattern/pattern.h b/chromium/third_party/liburlpattern/pattern.h index 67f46b1049f..f522f44c9d1 100644 --- a/chromium/third_party/liburlpattern/pattern.h +++ b/chromium/third_party/liburlpattern/pattern.h @@ -85,6 +85,10 @@ struct COMPONENT_EXPORT(LIBURLPATTERN) Part { std::string suffix, Modifier modifier); Part() = default; + + // Returns true if the `name` member is a custom name; e.g. for a `:foo` + // group. + bool HasCustomName() const; }; COMPONENT_EXPORT(LIBURLPATTERN) diff --git a/chromium/third_party/liburlpattern/pattern_unittest.cc b/chromium/third_party/liburlpattern/pattern_unittest.cc index 08e5f498cf8..74a53cdfc9e 100644 --- a/chromium/third_party/liburlpattern/pattern_unittest.cc +++ b/chromium/third_party/liburlpattern/pattern_unittest.cc @@ -330,6 +330,59 @@ TEST(PatternStringTest, RegexpEscapedPatternCharInSuffix) { RunPatternStringTest("/foo/{(foo)\\:bar}", "/foo/{(foo)\\:bar}"); } +TEST(PatternStringTest, RegexpFollowedByWildcard) { + RunPatternStringTest("(foo)(.*)", "(foo)(.*)"); +} + +TEST(PatternStringTest, RegexpWithOptionalModifierFollowedByWildcard) { + RunPatternStringTest("(foo)?(.*)", "(foo)?*"); +} + +TEST(PatternStringTest, RegexpWithSuffixModifierFollowedByWildcard) { + RunPatternStringTest("{(foo)a}(.*)", "{(foo)a}(.*)"); +} + +TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcard) { + RunPatternStringTest("{:foo}(.*)", "{:foo}(.*)"); +} + +TEST(PatternStringTest, NamedGroupInGroupingFollowedByRegexp) { + RunPatternStringTest("{:foo}(bar)", "{:foo}(bar)"); +} + +TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardInGrouping) { + RunPatternStringTest("{:foo}{(.*)}", "{:foo}(.*)"); +} + +TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithSuffix) { + RunPatternStringTest("{:foo}{(.*)bar}", ":foo{(.*)bar}"); +} + +TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithPrefix) { + RunPatternStringTest("{:foo}{bar(.*)}", ":foo{bar(.*)}"); +} + +TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithCustomName) { + RunPatternStringTest("{:foo}:bar(.*)", ":foo:bar(.*)"); +} + +TEST(PatternStringTest, + NamedGroupInGroupingWithOptionalModifierFollowedByWildcard) { + RunPatternStringTest("{:foo}?(.*)", ":foo?*"); +} + +TEST(PatternStringTest, NamedGroupWithEscapedValidNameSuffix) { + RunPatternStringTest("{:foo\\bar}", "{:foo\\bar}"); +} + +TEST(PatternStringTest, NamedGroupWithEscapedInvalidNameSuffix) { + RunPatternStringTest("{:foo\\.bar}", "{:foo.bar}"); +} + +TEST(PatternStringTest, NamedGroupWithCustomRegexpAndValidNameSuffix) { + RunPatternStringTest("{:foo(baz)bar}", "{:foo(baz)bar}"); +} + struct DirectMatchCase { absl::string_view input; bool expected_match = true; diff --git a/chromium/third_party/liburlpattern/tokenize.cc b/chromium/third_party/liburlpattern/tokenize.cc index ff70c58c4c9..cbc38dc2062 100644 --- a/chromium/third_party/liburlpattern/tokenize.cc +++ b/chromium/third_party/liburlpattern/tokenize.cc @@ -5,9 +5,11 @@ #include "third_party/liburlpattern/tokenize.h" +#include "base/compiler_specific.h" #include "third_party/abseil-cpp/absl/strings/str_format.h" #include "third_party/icu/source/common/unicode/uchar.h" #include "third_party/icu/source/common/unicode/utf8.h" +#include "third_party/liburlpattern/utils.h" // The following code is a translation from the path-to-regexp typescript at: // @@ -23,24 +25,6 @@ bool IsASCII(UChar32 c) { return c >= 0x00 && c <= 0x7f; } -bool IsNameCodepoint(UChar32 c, bool first_codepoint) { - // Require group names to follow the same character restrictions as - // javascript identifiers. This code originates from v8 at: - // - // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/strings/char-predicates.cc;l=17-34;drc=be014256adea1552d4a044ef80616cdab6a7d549 - // - // We deviate from js identifiers, however, in not support the backslash - // character. This is mainly used in js identifiers to allow escaped - // unicode sequences to be written in ascii. The js engine, however, - // should take care of this long before we reach this level of code. So - // we don't need to handle it here. - if (first_codepoint) { - return u_hasBinaryProperty(c, UCHAR_ID_START) || c == '$' || c == '_'; - } - return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == '$' || c == '_' || - c == 0x200c || c == 0x200d; -} - class Tokenizer { public: Tokenizer(absl::string_view pattern, TokenizePolicy policy) @@ -53,7 +37,10 @@ class Tokenizer { if (!status_.ok()) return std::move(status_); - NextAt(index_); + if (!NextAt(index_)) { + Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", index_)); + continue; + } if (codepoint_ == '*') { AddToken(TokenType::kAsterisk); continue; @@ -73,7 +60,12 @@ class Tokenizer { continue; } size_t escaped_i = next_index_; - Next(); + if (!Next()) { + Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", + next_index_)); + continue; + } + AddToken(TokenType::kEscapedChar, next_index_, escaped_i); continue; } @@ -94,7 +86,12 @@ class Tokenizer { // Iterate over codepoints until we find the first non-name codepoint. while (pos < pattern_.size()) { - NextAt(pos); + if (!status_.ok()) + return std::move(status_); + if (!NextAt(pos)) { + Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", pos)); + continue; + } if (!IsNameCodepoint(codepoint_, pos == name_start)) break; pos = next_index_; @@ -117,7 +114,11 @@ class Tokenizer { bool error = false; while (j < pattern_.size()) { - NextAt(j); + if (!NextAt(j)) { + Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", j)); + error = true; + break; + } if (!IsASCII(codepoint_)) { Error(absl::StrFormat( @@ -149,7 +150,12 @@ class Tokenizer { break; } size_t escaped_j = next_index_; - Next(); + if (!Next()) { + Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", + next_index_)); + error = true; + break; + } if (!IsASCII(codepoint_)) { Error(absl::StrFormat( "Invalid non-ASCII character 0x%02x at index %d.", @@ -177,7 +183,12 @@ class Tokenizer { break; } size_t tmp_j = next_index_; - Next(); + if (!Next()) { + Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", + next_index_)); + error = true; + break; + } // Require the the first character after an open paren is `?`. This // permits assertions, named capture groups, and non-capturing // groups. It blocks, however, unnamed capture groups. @@ -229,17 +240,20 @@ class Tokenizer { private: // Read the codepoint at `next_index_` in `pattern_` and store it in // `codepoint_`. In addition, `next_index_` is updated to the codepoint to be - // read next. - void Next() { + // read next. Returns true iff the codepoint was read successfully. On + // success, `codepoint_` is non-negative. + bool Next() WARN_UNUSED_RESULT { U8_NEXT(pattern_.data(), next_index_, pattern_.size(), codepoint_); + return codepoint_ >= 0; } // Read the codepoint at the specified `index` in `pattern_` and store it in // `codepoint_`. In addition, `next_index_` is updated to the codepoint to be - // read next. - void NextAt(size_t index) { + // read next. Returns true iff the codepoint was read successfully. On + // success, `codepoint_` is non-negative. + bool NextAt(size_t index) WARN_UNUSED_RESULT { next_index_ = index; - Next(); + return Next(); } // Append a Token to our list of the given `type` and with a value consisting diff --git a/chromium/third_party/liburlpattern/tokenize.h b/chromium/third_party/liburlpattern/tokenize.h index ba33741f7b0..0141bc37fb1 100644 --- a/chromium/third_party/liburlpattern/tokenize.h +++ b/chromium/third_party/liburlpattern/tokenize.h @@ -94,9 +94,10 @@ inline bool operator!=(const Token& lh, const Token& rh) { COMPONENT_EXPORT(LIBURLPATTERN) std::ostream& operator<<(std::ostream& o, Token token); -// Split the given input pattern string into a list of lexical tokens. Note, -// the generated Token objects simply reference positions within the input -// |pattern|. The |pattern| must be kept alive as long as the Token objects. +// Split the given input pattern string into a list of lexical tokens. +// Tokenizing will fail if |pattern| is not valid UTF-8. Note, the generated +// Token objects simply reference positions within the input |pattern|. The +// |pattern| must be kept alive as long as the Token objects. COMPONENT_EXPORT(LIBURLPATTERN) absl::StatusOr<std::vector<Token>> Tokenize( absl::string_view pattern, diff --git a/chromium/third_party/liburlpattern/tokenize_unittest.cc b/chromium/third_party/liburlpattern/tokenize_unittest.cc index ac37983488a..46900e40226 100644 --- a/chromium/third_party/liburlpattern/tokenize_unittest.cc +++ b/chromium/third_party/liburlpattern/tokenize_unittest.cc @@ -288,9 +288,9 @@ TEST(TokenizeTest, RegexWithTrailingEscapedChar) { } TEST(TokenizeTest, RegexWithEscapedInvalidChar) { - // Use a single byte invalid character since the escape only applies to the - // next byte character. - RunTokenizeTest("(\\\xff)", + // Use a valid UTF-8 sequence (encoding of U+00A2) that encodes a non-ASCII + // character. + RunTokenizeTest("(\\\xc2\xa2)", absl::InvalidArgumentError("Invalid non-ASCII character")); } @@ -477,4 +477,36 @@ TEST(TokenizeTest, LenientPolicyRegexWithCaptureGroup) { RunTokenizeTest("(foo(bar))", expected_tokens, TokenizePolicy::kLenient); } +TEST(TokenizeTest, InvalidUtf8) { + RunTokenizeTest("hello\xcdworld", absl::InvalidArgumentError( + "Invalid UTF-8 codepoint at index 5.")); +} + +TEST(TokenizeTest, InvalidUtf8Escaped) { + RunTokenizeTest( + "hello\\\xcdworld", + absl::InvalidArgumentError("Invalid UTF-8 codepoint at index 7.")); +} + +TEST(TokenizeTest, InvalidUtf8InName) { + RunTokenizeTest( + "/:foo:hello\xcdworld", + absl::InvalidArgumentError("Invalid UTF-8 codepoint at index 11.")); +} + +TEST(TokenizeTest, InvalidUtf8InRegexGroup) { + RunTokenizeTest("(foo\xcd)", absl::InvalidArgumentError( + "Invalid UTF-8 codepoint at index 4.")); +} + +TEST(TokenizeTest, InvalidUtf8EscapedInRegexGroup) { + RunTokenizeTest("(foo\\\xcd)", absl::InvalidArgumentError( + "Invalid UTF-8 codepoint at index 6.")); +} + +TEST(TokenizeTest, InvalidUtf8InNestedRegexGroup) { + RunTokenizeTest("(foo(\xcd))", absl::InvalidArgumentError( + "Invalid UTF-8 codepoint at index 6.")); +} + } // namespace liburlpattern diff --git a/chromium/third_party/liburlpattern/utils.cc b/chromium/third_party/liburlpattern/utils.cc index c9955e1d2e0..5f7f2d69227 100644 --- a/chromium/third_party/liburlpattern/utils.cc +++ b/chromium/third_party/liburlpattern/utils.cc @@ -52,4 +52,22 @@ std::string EscapeRegexpString(absl::string_view input) { return result; } +bool IsNameCodepoint(UChar32 c, bool first_codepoint) { + // Require group names to follow the same character restrictions as + // javascript identifiers. This code originates from v8 at: + // + // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/strings/char-predicates.cc;l=17-34;drc=be014256adea1552d4a044ef80616cdab6a7d549 + // + // We deviate from js identifiers, however, in not support the backslash + // character. This is mainly used in js identifiers to allow escaped + // unicode sequences to be written in ascii. The js engine, however, + // should take care of this long before we reach this level of code. So + // we don't need to handle it here. + if (first_codepoint) { + return u_hasBinaryProperty(c, UCHAR_ID_START) || c == '$' || c == '_'; + } + return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == '$' || c == '_' || + c == 0x200c || c == 0x200d; +} + } // namespace liburlpattern diff --git a/chromium/third_party/liburlpattern/utils.h b/chromium/third_party/liburlpattern/utils.h index 02826b5f321..201f7f99672 100644 --- a/chromium/third_party/liburlpattern/utils.h +++ b/chromium/third_party/liburlpattern/utils.h @@ -9,6 +9,7 @@ #include <string> #include "base/component_export.h" #include "third_party/abseil-cpp/absl/strings/string_view.h" +#include "third_party/icu/source/common/unicode/uchar.h" namespace liburlpattern { @@ -36,6 +37,12 @@ COMPONENT_EXPORT(LIBURLPATTERN) void EscapePatternStringAndAppend(absl::string_view input, std::string& append_target); +// Return `true` if the given codepoint `c` is valid for a `:foo` name. The +// `first_codepoint` argument can be set if this codepoint is intended to be +// the first codepoint in a name. If its false, then the codepoint is treated +// as a trailing character. +bool IsNameCodepoint(UChar32 c, bool first_codepoint); + } // namespace liburlpattern #endif // THIRD_PARTY_LIBURLPATTERN_UTILS_H_ |