summaryrefslogtreecommitdiff
path: root/chromium/third_party/liburlpattern
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2022-02-04 17:20:24 +0100
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2022-02-12 08:15:25 +0000
commit8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b (patch)
tree788d8d7549712682703a0310ca4a0f0860d4802b /chromium/third_party/liburlpattern
parent606d85f2a5386472314d39923da28c70c60dc8e7 (diff)
downloadqtwebengine-chromium-8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b.tar.gz
BASELINE: Update Chromium to 98.0.4758.90
Change-Id: Ib7c41539bf8a8e0376bd639f27d68294de90f3c8 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'chromium/third_party/liburlpattern')
-rw-r--r--chromium/third_party/liburlpattern/BUILD.gn17
-rw-r--r--chromium/third_party/liburlpattern/README.chromium1
-rw-r--r--chromium/third_party/liburlpattern/parse.h8
-rw-r--r--chromium/third_party/liburlpattern/parse_fuzzer.cc74
-rw-r--r--chromium/third_party/liburlpattern/pattern.cc70
-rw-r--r--chromium/third_party/liburlpattern/pattern.h4
-rw-r--r--chromium/third_party/liburlpattern/pattern_unittest.cc53
-rw-r--r--chromium/third_party/liburlpattern/tokenize.cc72
-rw-r--r--chromium/third_party/liburlpattern/tokenize.h7
-rw-r--r--chromium/third_party/liburlpattern/tokenize_unittest.cc38
-rw-r--r--chromium/third_party/liburlpattern/utils.cc18
-rw-r--r--chromium/third_party/liburlpattern/utils.h7
12 files changed, 314 insertions, 55 deletions
diff --git a/chromium/third_party/liburlpattern/BUILD.gn b/chromium/third_party/liburlpattern/BUILD.gn
index e3af29d6036..00236696212 100644
--- a/chromium/third_party/liburlpattern/BUILD.gn
+++ b/chromium/third_party/liburlpattern/BUILD.gn
@@ -2,8 +2,13 @@
# Use of this source code is governed by an MIT-style license that can be
# found in the LICENSE file or at https://opensource.org/licenses/MIT.
+import("//testing/libfuzzer/fuzzer_test.gni")
import("//testing/test.gni")
+# Compiler-flag config that passes `-Wno-shadow`. It is applied to the
+# `liburlpattern` component through `configs += [ ":warnings" ]`.
+config("warnings") {
+ cflags = [ "-Wno-shadow" ]
+}
+
component("liburlpattern") {
defines = [ "IS_LIBURLPATTERN_IMPL" ]
deps = [
@@ -12,6 +17,8 @@ component("liburlpattern") {
"//third_party/icu:icu",
]
+ configs += [ ":warnings" ]
+
# Note, also update the local modifications in README.chromium.
sources = [
"options.h",
@@ -32,6 +39,7 @@ test("liburlpattern_unittests") {
"//base/test:run_all_unittests",
"//testing/gtest",
"//third_party/abseil-cpp:absl",
+ "//third_party/icu:icu",
]
# Note, also update the local modifications in README.chromium.
@@ -43,3 +51,12 @@ test("liburlpattern_unittests") {
]
testonly = true
}
+
+# Fuzzer target built from parse_fuzzer.cc. It links against the
+# `liburlpattern` component plus //base and abseil. The `fuzzer_test` template
+# is presumably provided by the //testing/libfuzzer/fuzzer_test.gni import.
+fuzzer_test("liburlpattern_fuzzer") {
+ sources = [ "parse_fuzzer.cc" ]
+ deps = [
+ ":liburlpattern",
+ "//base",
+ "//third_party/abseil-cpp:absl",
+ ]
+}
diff --git a/chromium/third_party/liburlpattern/README.chromium b/chromium/third_party/liburlpattern/README.chromium
index 25629bf64b0..74b13056e75 100644
--- a/chromium/third_party/liburlpattern/README.chromium
+++ b/chromium/third_party/liburlpattern/README.chromium
@@ -22,6 +22,7 @@ third_party/liburlpattern/OWNERS
third_party/liburlpattern/options.h
third_party/liburlpattern/parse.cc
third_party/liburlpattern/parse.h
+third_party/liburlpattern/parse_fuzzer.cc
third_party/liburlpattern/parse_unittest.cc
third_party/liburlpattern/pattern.cc
third_party/liburlpattern/pattern.h
diff --git a/chromium/third_party/liburlpattern/parse.h b/chromium/third_party/liburlpattern/parse.h
index add6eec22f3..9af64518cf4 100644
--- a/chromium/third_party/liburlpattern/parse.h
+++ b/chromium/third_party/liburlpattern/parse.h
@@ -29,10 +29,10 @@ class Pattern;
typedef std::function<absl::StatusOr<std::string>(absl::string_view)>
EncodeCallback;
-// Parse a pattern string and return the result. The input |pattern| must
-// consist of UTF-8 characters. Currently only group names may actually
-// contain non-ASCII characters, however. Unicode characters in other parts
-// of the pattern will cause an error to be returned. A |callback| must be
+// Parse a pattern string and return the result. The parse will fail if the
+// input |pattern| is not valid UTF-8. Currently only group names may actually
+// contain non-ASCII characters, however. Unicode characters in other parts of
+// the pattern will cause an error to be returned. A |callback| must be
// provided to validate and encode plain text parts of the pattern. An
// |options| value may be provided to override default behavior.
COMPONENT_EXPORT(LIBURLPATTERN)
diff --git a/chromium/third_party/liburlpattern/parse_fuzzer.cc b/chromium/third_party/liburlpattern/parse_fuzzer.cc
new file mode 100644
index 00000000000..802c705f483
--- /dev/null
+++ b/chromium/third_party/liburlpattern/parse_fuzzer.cc
@@ -0,0 +1,74 @@
+// Copyright 2021 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "base/check.h"
+#include "base/check_op.h"
+#include "base/containers/span.h"
+#include "base/logging.h"
+#include "base/strings/strcat.h"
+#include "third_party/abseil-cpp/absl/status/statusor.h"
+#include "third_party/abseil-cpp/absl/strings/str_format.h"
+#include "third_party/abseil-cpp/absl/strings/string_view.h"
+#include "third_party/abseil-cpp/absl/types/optional.h"
+#include "third_party/liburlpattern/parse.h"
+#include "third_party/liburlpattern/pattern.h"
+
+namespace liburlpattern {
+namespace {
+// Encode callback handed to Parse(): returns a copy of `input` unchanged, so
+// the fuzzer performs no validation or encoding of its own.
+absl::StatusOr<std::string> PassThrough(absl::string_view input) {
+ return std::string(input);
+}
+
+// Parses `s` using the PassThrough encode callback and returns the pattern
+// string regenerated from the parsed result. When parsing fails, the failure
+// status is logged at INFO level and absl::nullopt is returned instead.
+absl::optional<std::string> ParseAndCanonicalize(absl::string_view s) {
+  absl::StatusOr<Pattern> parsed = Parse(s, &PassThrough);
+  if (parsed.ok())
+    return parsed->GeneratePatternString();
+
+  LOG(INFO) << "Parse failed with status: " << parsed.status();
+  return absl::nullopt;
+}
+
+// Builds a three-line debug dump: `label`, then every byte of `data` printed
+// as a character, then every byte printed in hexadecimal. Both renderings use
+// a minimum field width of 4 per byte.
+std::string FancyHexDump(base::StringPiece label, base::StringPiece data) {
+  std::string chars;
+  std::string hexes;
+  for (const char ch : data) {
+    chars += absl::StrFormat("%4c", ch);
+    hexes += absl::StrFormat("%4x", ch);
+  }
+  return base::StrCat({label, "\n", chars, "\n", hexes});
+}
+
+// Fuzzer setup helper: constructing an Environment sets the minimum log level
+// to LOG_INFO (presumably so the LOG(INFO) output from ParseAndCanonicalize()
+// is emitted).
+struct Environment {
+ Environment() { logging::SetMinLogLevel(logging::LOG_INFO); }
+};
+} // namespace
+
+// Fuzzer entry point. Treats the `size` bytes at `data` as a pattern string,
+// canonicalizes it with ParseAndCanonicalize(), and CHECKs that canonicalizing
+// the result a second time both succeeds and yields the identical string.
+// Inputs that fail to parse the first time are ignored. Always returns 0.
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ // Function-local static, so the Environment constructor runs only once.
+ static Environment env;
+
+ // Make a copy of `data` on the heap to enable ASAN to catch OOB accesses.
+ std::string pattern_string(reinterpret_cast<const char*>(data), size);
+
+ absl::optional<std::string> canonical = ParseAndCanonicalize(pattern_string);
+ if (!canonical)
+ return 0;
+
+ // If `Pattern::GeneratePatternString()` generates canonical strings,
+ // recanonicalizing one of its outputs should always be a no-op. To test that
+ // property, let's check that `ParseAndCanonicalize()` is idempotent, i.e.
+ // that `canonical` is a fixed point of the function.
+ absl::optional<std::string> canonical2 = ParseAndCanonicalize(*canonical);
+ // The generated string must itself be parseable; on failure dump both the
+ // original input and the generated string to aid debugging.
+ CHECK(canonical2)
+ << "Failed to parse canonical pattern from original input.\n"
+ << FancyHexDump("original : ", pattern_string) << "\n"
+ << FancyHexDump("canonical: ", *canonical);
+
+ CHECK_EQ(*canonical, *canonical2);
+ return 0;
+}
+} // namespace liburlpattern
diff --git a/chromium/third_party/liburlpattern/pattern.cc b/chromium/third_party/liburlpattern/pattern.cc
index 9b39bc1f4c3..885417b10bc 100644
--- a/chromium/third_party/liburlpattern/pattern.cc
+++ b/chromium/third_party/liburlpattern/pattern.cc
@@ -7,6 +7,7 @@
#include "third_party/abseil-cpp/absl/base/macros.h"
#include "third_party/abseil-cpp/absl/strings/str_format.h"
+#include "third_party/icu/source/common/unicode/utf8.h"
#include "third_party/liburlpattern/utils.h"
namespace liburlpattern {
@@ -73,6 +74,14 @@ Part::Part(PartType t,
ABSL_ASSERT(value.empty());
}
+// Returns true if `name` is a custom name, like `:foo`, rather than an
+// automatically assigned numeric value. Returns false when `name` is empty.
+bool Part::HasCustomName() const {
+  // Since custom group names follow javascript identifier rules the first
+  // character cannot be a digit, so that is all we need to check here.
+  //
+  // The byte is converted to `unsigned char` before it is handed to
+  // std::isdigit(): custom names may contain non-ASCII UTF-8 bytes, and
+  // passing a negative `char` value to std::isdigit() is undefined behavior.
+  return !name.empty() && !std::isdigit(static_cast<unsigned char>(name[0]));
+}
+
Pattern::Pattern(std::vector<Part> part_list,
Options options,
std::string segment_wildcard_regex)
@@ -93,8 +102,9 @@ std::string Pattern::GeneratePatternString() const {
}
result.reserve(estimated_length);
- for (const Part& part : part_list_) {
- //
+ for (size_t i = 0; i < part_list_.size(); ++i) {
+ const Part& part = part_list_[i];
+
if (part.type == PartType::kFixed) {
// A simple fixed string part.
if (part.modifier == Modifier::kNone) {
@@ -111,20 +121,26 @@ std::string Pattern::GeneratePatternString() const {
continue;
}
- // Determine if the part needs a grouping like `{ ... }`. This is only
- // necessary when using a non-automatic prefix or any suffix.
+ bool custom_name = part.HasCustomName();
+
+ // Determine if the part needs a grouping like `{ ... }`. This is
+ // necessary when the group:
+ //
+ // 1. is using a non-automatic prefix or any suffix.
+ // 2. is followed by a matching group that may be represented by a
+ // `(...)` expression. This is necessary to avoid the following `(...)`
+ // being mistakenly interpreted as the custom regexp for this
+ // named group; like `:foo(...)`.
+ const Part* next_part =
+ (i + 1) < part_list_.size() ? &part_list_[i + 1] : nullptr;
bool needs_grouping =
!part.suffix.empty() ||
(!part.prefix.empty() &&
(part.prefix.size() != 1 ||
- options_.prefix_list.find(part.prefix[0]) == std::string::npos));
-
- // Determine if the part name was custom, like `:foo`, or an
- // automatically assigned numeric value. Since custom group
- // names follow javascript identifier rules the first character
- // cannot be a digit, so that is all we need to check here.
- ABSL_ASSERT(!part.name.empty());
- bool custom_name = !std::isdigit(part.name[0]);
+ options_.prefix_list.find(part.prefix[0]) == std::string::npos)) ||
+ (custom_name && part.modifier == Modifier::kNone && next_part &&
+ next_part->type != PartType::kFixed && next_part->prefix.empty() &&
+ next_part->suffix.empty() && !next_part->HasCustomName());
// This is a full featured part. We must generate a string that looks
// like:
@@ -157,10 +173,17 @@ std::string Pattern::GeneratePatternString() const {
result += ")";
}
} else if (part.type == PartType::kFullWildcard) {
- // We can only use the `*` wildcard card if the automatic
- // numeric name is used for the group. A custom name
- // requires the regexp `(.*)` explicitly.
- if (!custom_name) {
+ const Part* last_part = i > 0 ? &part_list_[i - 1] : nullptr;
+ // We can only use the `*` wildcard if we meet a number
+ // of conditions. We must use an explicit `(.*)` group if:
+ //
+ // 1. A custom name was used; e.g. `:foo(.*)`.
+ // 2. If the preceding group is a matching group without a modifier; e.g.
+ // `(foo)(.*)`. In that case we cannot emit the `*` shorthand without
+ // it being mistakenly interpreted as the modifier for the previous
+ // group.
+ if (!custom_name && (!last_part || last_part->type == PartType::kFixed ||
+ last_part->modifier != Modifier::kNone)) {
result += "*";
} else {
result += "(";
@@ -169,6 +192,21 @@ std::string Pattern::GeneratePatternString() const {
}
}
+ // If the matching group is a simple `:foo` custom name with the default
+ // segment wildcard, then we must check for a trailing suffix that could
+ // be interpreted as a trailing part of the name itself. In these cases
+ // we must escape the beginning of the suffix in order to separate it
+ // from the end of the custom name; e.g. `:foo\\bar` instead of `:foobar`.
+ if (part.type == PartType::kSegmentWildcard && custom_name &&
+ !part.suffix.empty()) {
+ UChar32 codepoint = -1;
+ U8_GET(reinterpret_cast<const uint8_t*>(part.suffix.data()), 0, 0,
+ static_cast<int>(part.suffix.size()), codepoint);
+ if (IsNameCodepoint(codepoint, /*first_codepoint=*/false)) {
+ result += "\\";
+ }
+ }
+
EscapePatternStringAndAppend(part.suffix, result);
if (needs_grouping)
diff --git a/chromium/third_party/liburlpattern/pattern.h b/chromium/third_party/liburlpattern/pattern.h
index 67f46b1049f..f522f44c9d1 100644
--- a/chromium/third_party/liburlpattern/pattern.h
+++ b/chromium/third_party/liburlpattern/pattern.h
@@ -85,6 +85,10 @@ struct COMPONENT_EXPORT(LIBURLPATTERN) Part {
std::string suffix,
Modifier modifier);
Part() = default;
+
+ // Returns true if the `name` member is a custom name; e.g. for a `:foo`
+ // group.
+ bool HasCustomName() const;
};
COMPONENT_EXPORT(LIBURLPATTERN)
diff --git a/chromium/third_party/liburlpattern/pattern_unittest.cc b/chromium/third_party/liburlpattern/pattern_unittest.cc
index 08e5f498cf8..74a53cdfc9e 100644
--- a/chromium/third_party/liburlpattern/pattern_unittest.cc
+++ b/chromium/third_party/liburlpattern/pattern_unittest.cc
@@ -330,6 +330,59 @@ TEST(PatternStringTest, RegexpEscapedPatternCharInSuffix) {
RunPatternStringTest("/foo/{(foo)\\:bar}", "/foo/{(foo)\\:bar}");
}
+// The cases below cover pattern string generation for adjacent matching
+// groups and for named groups followed by suffix text.
+// NOTE(review): RunPatternStringTest is defined outside this hunk; presumably
+// the first argument is the input pattern and the second is the expected
+// generated pattern string -- confirm against the helper.
+
+// Unmodified regexp group followed by a wildcard: the expected output keeps
+// the explicit `(.*)` rather than the `*` shorthand.
+TEST(PatternStringTest, RegexpFollowedByWildcard) {
+ RunPatternStringTest("(foo)(.*)", "(foo)(.*)");
+}
+
+// With a `?` modifier on the first group, the expected output uses the `*`
+// shorthand for the wildcard.
+TEST(PatternStringTest, RegexpWithOptionalModifierFollowedByWildcard) {
+ RunPatternStringTest("(foo)?(.*)", "(foo)?*");
+}
+
+// The first group carries a suffix inside `{...}`; the expected output still
+// keeps the explicit `(.*)`.
+TEST(PatternStringTest, RegexpWithSuffixModifierFollowedByWildcard) {
+ RunPatternStringTest("{(foo)a}(.*)", "{(foo)a}(.*)");
+}
+
+// The `{...}` around `:foo` is expected to be preserved when a `(.*)` group
+// follows it.
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcard) {
+ RunPatternStringTest("{:foo}(.*)", "{:foo}(.*)");
+}
+
+// Same as above, but followed by a regexp group instead of a wildcard.
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByRegexp) {
+ RunPatternStringTest("{:foo}(bar)", "{:foo}(bar)");
+}
+
+// Braces around a bare wildcard are expected to be dropped, while the braces
+// around `:foo` are kept.
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardInGrouping) {
+ RunPatternStringTest("{:foo}{(.*)}", "{:foo}(.*)");
+}
+
+// When the following group has a suffix, the expected output drops the braces
+// around `:foo`.
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithSuffix) {
+ RunPatternStringTest("{:foo}{(.*)bar}", ":foo{(.*)bar}");
+}
+
+// When the following group has a prefix, the expected output drops the braces
+// around `:foo`.
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithPrefix) {
+ RunPatternStringTest("{:foo}{bar(.*)}", ":foo{bar(.*)}");
+}
+
+// When the following group has its own custom name, the expected output drops
+// the braces around `:foo`.
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithCustomName) {
+ RunPatternStringTest("{:foo}:bar(.*)", ":foo:bar(.*)");
+}
+
+// With a `?` modifier on the named group, the expected output drops the
+// braces and uses the `*` shorthand.
+TEST(PatternStringTest,
+ NamedGroupInGroupingWithOptionalModifierFollowedByWildcard) {
+ RunPatternStringTest("{:foo}?(.*)", ":foo?*");
+}
+
+// The suffix `bar` starts with a character that is valid in a name, so the
+// expected output keeps the backslash separating it from `:foo`.
+TEST(PatternStringTest, NamedGroupWithEscapedValidNameSuffix) {
+ RunPatternStringTest("{:foo\\bar}", "{:foo\\bar}");
+}
+
+// The suffix starts with `.`; the expected output omits the backslash.
+TEST(PatternStringTest, NamedGroupWithEscapedInvalidNameSuffix) {
+ RunPatternStringTest("{:foo\\.bar}", "{:foo.bar}");
+}
+
+// A custom regexp sits between the name and the suffix; the expected output
+// contains no added backslash.
+TEST(PatternStringTest, NamedGroupWithCustomRegexpAndValidNameSuffix) {
+ RunPatternStringTest("{:foo(baz)bar}", "{:foo(baz)bar}");
+}
+
struct DirectMatchCase {
absl::string_view input;
bool expected_match = true;
diff --git a/chromium/third_party/liburlpattern/tokenize.cc b/chromium/third_party/liburlpattern/tokenize.cc
index ff70c58c4c9..cbc38dc2062 100644
--- a/chromium/third_party/liburlpattern/tokenize.cc
+++ b/chromium/third_party/liburlpattern/tokenize.cc
@@ -5,9 +5,11 @@
#include "third_party/liburlpattern/tokenize.h"
+#include "base/compiler_specific.h"
#include "third_party/abseil-cpp/absl/strings/str_format.h"
#include "third_party/icu/source/common/unicode/uchar.h"
#include "third_party/icu/source/common/unicode/utf8.h"
+#include "third_party/liburlpattern/utils.h"
// The following code is a translation from the path-to-regexp typescript at:
//
@@ -23,24 +25,6 @@ bool IsASCII(UChar32 c) {
return c >= 0x00 && c <= 0x7f;
}
-bool IsNameCodepoint(UChar32 c, bool first_codepoint) {
- // Require group names to follow the same character restrictions as
- // javascript identifiers. This code originates from v8 at:
- //
- // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/strings/char-predicates.cc;l=17-34;drc=be014256adea1552d4a044ef80616cdab6a7d549
- //
- // We deviate from js identifiers, however, in not support the backslash
- // character. This is mainly used in js identifiers to allow escaped
- // unicode sequences to be written in ascii. The js engine, however,
- // should take care of this long before we reach this level of code. So
- // we don't need to handle it here.
- if (first_codepoint) {
- return u_hasBinaryProperty(c, UCHAR_ID_START) || c == '$' || c == '_';
- }
- return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == '$' || c == '_' ||
- c == 0x200c || c == 0x200d;
-}
-
class Tokenizer {
public:
Tokenizer(absl::string_view pattern, TokenizePolicy policy)
@@ -53,7 +37,10 @@ class Tokenizer {
if (!status_.ok())
return std::move(status_);
- NextAt(index_);
+ if (!NextAt(index_)) {
+ Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", index_));
+ continue;
+ }
if (codepoint_ == '*') {
AddToken(TokenType::kAsterisk);
continue;
@@ -73,7 +60,12 @@ class Tokenizer {
continue;
}
size_t escaped_i = next_index_;
- Next();
+ if (!Next()) {
+ Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
+ next_index_));
+ continue;
+ }
+
AddToken(TokenType::kEscapedChar, next_index_, escaped_i);
continue;
}
@@ -94,7 +86,12 @@ class Tokenizer {
// Iterate over codepoints until we find the first non-name codepoint.
while (pos < pattern_.size()) {
- NextAt(pos);
+ if (!status_.ok())
+ return std::move(status_);
+ if (!NextAt(pos)) {
+ Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", pos));
+ continue;
+ }
if (!IsNameCodepoint(codepoint_, pos == name_start))
break;
pos = next_index_;
@@ -117,7 +114,11 @@ class Tokenizer {
bool error = false;
while (j < pattern_.size()) {
- NextAt(j);
+ if (!NextAt(j)) {
+ Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", j));
+ error = true;
+ break;
+ }
if (!IsASCII(codepoint_)) {
Error(absl::StrFormat(
@@ -149,7 +150,12 @@ class Tokenizer {
break;
}
size_t escaped_j = next_index_;
- Next();
+ if (!Next()) {
+ Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
+ next_index_));
+ error = true;
+ break;
+ }
if (!IsASCII(codepoint_)) {
Error(absl::StrFormat(
"Invalid non-ASCII character 0x%02x at index %d.",
@@ -177,7 +183,12 @@ class Tokenizer {
break;
}
size_t tmp_j = next_index_;
- Next();
+ if (!Next()) {
+ Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
+ next_index_));
+ error = true;
+ break;
+ }
// Require that the first character after an open paren is `?`. This
// permits assertions, named capture groups, and non-capturing
// groups. It blocks, however, unnamed capture groups.
@@ -229,17 +240,20 @@ class Tokenizer {
private:
// Read the codepoint at `next_index_` in `pattern_` and store it in
// `codepoint_`. In addition, `next_index_` is updated to the codepoint to be
- // read next.
- void Next() {
+ // read next. Returns true iff the codepoint was read successfully. On
+ // success, `codepoint_` is non-negative.
+ bool Next() WARN_UNUSED_RESULT {
U8_NEXT(pattern_.data(), next_index_, pattern_.size(), codepoint_);
+ return codepoint_ >= 0;
}
// Read the codepoint at the specified `index` in `pattern_` and store it in
// `codepoint_`. In addition, `next_index_` is updated to the codepoint to be
- // read next.
- void NextAt(size_t index) {
+ // read next. Returns true iff the codepoint was read successfully. On
+ // success, `codepoint_` is non-negative.
+ bool NextAt(size_t index) WARN_UNUSED_RESULT {
next_index_ = index;
- Next();
+ return Next();
}
// Append a Token to our list of the given `type` and with a value consisting
diff --git a/chromium/third_party/liburlpattern/tokenize.h b/chromium/third_party/liburlpattern/tokenize.h
index ba33741f7b0..0141bc37fb1 100644
--- a/chromium/third_party/liburlpattern/tokenize.h
+++ b/chromium/third_party/liburlpattern/tokenize.h
@@ -94,9 +94,10 @@ inline bool operator!=(const Token& lh, const Token& rh) {
COMPONENT_EXPORT(LIBURLPATTERN)
std::ostream& operator<<(std::ostream& o, Token token);
-// Split the given input pattern string into a list of lexical tokens. Note,
-// the generated Token objects simply reference positions within the input
-// |pattern|. The |pattern| must be kept alive as long as the Token objects.
+// Split the given input pattern string into a list of lexical tokens.
+// Tokenizing will fail if |pattern| is not valid UTF-8. Note, the generated
+// Token objects simply reference positions within the input |pattern|. The
+// |pattern| must be kept alive as long as the Token objects.
COMPONENT_EXPORT(LIBURLPATTERN)
absl::StatusOr<std::vector<Token>> Tokenize(
absl::string_view pattern,
diff --git a/chromium/third_party/liburlpattern/tokenize_unittest.cc b/chromium/third_party/liburlpattern/tokenize_unittest.cc
index ac37983488a..46900e40226 100644
--- a/chromium/third_party/liburlpattern/tokenize_unittest.cc
+++ b/chromium/third_party/liburlpattern/tokenize_unittest.cc
@@ -288,9 +288,9 @@ TEST(TokenizeTest, RegexWithTrailingEscapedChar) {
}
TEST(TokenizeTest, RegexWithEscapedInvalidChar) {
- // Use a single byte invalid character since the escape only applies to the
- // next byte character.
- RunTokenizeTest("(\\\xff)",
+ // Use a valid UTF-8 sequence (encoding of U+00A2) that encodes a non-ASCII
+ // character.
+ RunTokenizeTest("(\\\xc2\xa2)",
absl::InvalidArgumentError("Invalid non-ASCII character"));
}
@@ -477,4 +477,36 @@ TEST(TokenizeTest, LenientPolicyRegexWithCaptureGroup) {
RunTokenizeTest("(foo(bar))", expected_tokens, TokenizePolicy::kLenient);
}
+// Each case below embeds the byte 0xCD followed by a byte that is not a UTF-8
+// continuation byte, and expects an "Invalid UTF-8 codepoint" error.
+// NOTE(review): RunTokenizeTest is defined outside this hunk; presumably it
+// tokenizes the first argument and compares the outcome to the second.
+//
+// In the unescaped cases the expected index is the byte offset of 0xCD. In the
+// escaped and nested-group cases the expected index is one past that offset.
+
+// 0xCD is at byte offset 5.
+TEST(TokenizeTest, InvalidUtf8) {
+ RunTokenizeTest("hello\xcdworld", absl::InvalidArgumentError(
+ "Invalid UTF-8 codepoint at index 5."));
+}
+
+// Backslash at offset 5, 0xCD at offset 6; the expected index is 7.
+TEST(TokenizeTest, InvalidUtf8Escaped) {
+ RunTokenizeTest(
+ "hello\\\xcdworld",
+ absl::InvalidArgumentError("Invalid UTF-8 codepoint at index 7."));
+}
+
+// 0xCD appears inside the second `:name` at byte offset 11.
+TEST(TokenizeTest, InvalidUtf8InName) {
+ RunTokenizeTest(
+ "/:foo:hello\xcdworld",
+ absl::InvalidArgumentError("Invalid UTF-8 codepoint at index 11."));
+}
+
+// 0xCD is at byte offset 4 inside a regexp group.
+TEST(TokenizeTest, InvalidUtf8InRegexGroup) {
+ RunTokenizeTest("(foo\xcd)", absl::InvalidArgumentError(
+ "Invalid UTF-8 codepoint at index 4."));
+}
+
+// Backslash at offset 4, 0xCD at offset 5; the expected index is 6.
+TEST(TokenizeTest, InvalidUtf8EscapedInRegexGroup) {
+ RunTokenizeTest("(foo\\\xcd)", absl::InvalidArgumentError(
+ "Invalid UTF-8 codepoint at index 6."));
+}
+
+// Inner `(` at offset 4, 0xCD at offset 5; the expected index is 6.
+TEST(TokenizeTest, InvalidUtf8InNestedRegexGroup) {
+ RunTokenizeTest("(foo(\xcd))", absl::InvalidArgumentError(
+ "Invalid UTF-8 codepoint at index 6."));
+}
+
} // namespace liburlpattern
diff --git a/chromium/third_party/liburlpattern/utils.cc b/chromium/third_party/liburlpattern/utils.cc
index c9955e1d2e0..5f7f2d69227 100644
--- a/chromium/third_party/liburlpattern/utils.cc
+++ b/chromium/third_party/liburlpattern/utils.cc
@@ -52,4 +52,22 @@ std::string EscapeRegexpString(absl::string_view input) {
return result;
}
+// Returns true when codepoint `c` may appear in a `:foo` group name. When
+// `first_codepoint` is true the rules for the leading codepoint of a name are
+// applied; otherwise the rules for the remaining codepoints are applied.
+bool IsNameCodepoint(UChar32 c, bool first_codepoint) {
+  // Group names follow the same character restrictions as javascript
+  // identifiers. This code originates from v8 at:
+  //
+  // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/strings/char-predicates.cc;l=17-34;drc=be014256adea1552d4a044ef80616cdab6a7d549
+  //
+  // We deviate from js identifiers, however, in not supporting the backslash
+  // character. It is mainly used in js identifiers to allow escaped unicode
+  // sequences to be written in ascii. The js engine, however, should take
+  // care of this long before we reach this level of code, so we don't need
+  // to handle it here.
+  const UProperty id_property =
+      first_codepoint ? UCHAR_ID_START : UCHAR_ID_CONTINUE;
+  if (u_hasBinaryProperty(c, id_property) || c == '$' || c == '_')
+    return true;
+
+  // U+200C and U+200D are accepted as well, but never as the first codepoint.
+  return !first_codepoint && (c == 0x200c || c == 0x200d);
+}
+
} // namespace liburlpattern
diff --git a/chromium/third_party/liburlpattern/utils.h b/chromium/third_party/liburlpattern/utils.h
index 02826b5f321..201f7f99672 100644
--- a/chromium/third_party/liburlpattern/utils.h
+++ b/chromium/third_party/liburlpattern/utils.h
@@ -9,6 +9,7 @@
#include <string>
#include "base/component_export.h"
#include "third_party/abseil-cpp/absl/strings/string_view.h"
+#include "third_party/icu/source/common/unicode/uchar.h"
namespace liburlpattern {
@@ -36,6 +37,12 @@ COMPONENT_EXPORT(LIBURLPATTERN)
void EscapePatternStringAndAppend(absl::string_view input,
std::string& append_target);
+// Return `true` if the given codepoint `c` is valid for a `:foo` name. The
+// `first_codepoint` argument can be set if this codepoint is intended to be
+// the first codepoint in a name. If it's false, then the codepoint is treated
+// as a trailing character.
+bool IsNameCodepoint(UChar32 c, bool first_codepoint);
+
} // namespace liburlpattern
#endif // THIRD_PARTY_LIBURLPATTERN_UTILS_H_