BASELINE: Update Chromium to 98.0.4758.90

Change-Id: Ib7c41539bf8a8e0376bd639f27d68294de90f3c8
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2022-02-04 17:20:24 +0100
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2022-02-12 08:15:25 +0000
commit: 8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b (patch)
tree: 788d8d7549712682703a0310ca4a0f0860d4802b /chromium/third_party/liburlpattern
parent: 606d85f2a5386472314d39923da28c70c60dc8e7 (diff)
download: qtwebengine-chromium-8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b.tar.gz
12 files changed, 314 insertions, 55 deletions
diff --git a/chromium/third_party/liburlpattern/BUILD.gn b/chromium/third_party/liburlpattern/BUILD.gn
index e3af29d6036..00236696212 100644
--- a/chromium/third_party/liburlpattern/BUILD.gn
+++ b/chromium/third_party/liburlpattern/BUILD.gn
@@ -2,8 +2,13 @@
 # Use of this source code is governed by an MIT-style license that can be
 # found in the LICENSE file or at https://opensource.org/licenses/MIT.
 
+import("//testing/libfuzzer/fuzzer_test.gni")
 import("//testing/test.gni")
 
+config("warnings") {
+  cflags = [ "-Wno-shadow" ]
+}
+
 component("liburlpattern") {
   defines = [ "IS_LIBURLPATTERN_IMPL" ]
   deps = [
@@ -12,6 +17,8 @@ component("liburlpattern") {
     "//third_party/icu:icu",
   ]
 
+  configs += [ ":warnings" ]
+
   # Note, also update the local modifications in README.chromium.
   sources = [
     "options.h",
@@ -32,6 +39,7 @@ test("liburlpattern_unittests") {
     "//base/test:run_all_unittests",
     "//testing/gtest",
     "//third_party/abseil-cpp:absl",
+    "//third_party/icu:icu",
   ]
 
   # Note, also update the local modifications in README.chromium.
@@ -43,3 +51,12 @@ test("liburlpattern_unittests") {
   ]
   testonly = true
 }
+
+fuzzer_test("liburlpattern_fuzzer") {
+  sources = [ "parse_fuzzer.cc" ]
+  deps = [
+    ":liburlpattern",
+    "//base",
+    "//third_party/abseil-cpp:absl",
+  ]
+}
diff --git a/chromium/third_party/liburlpattern/README.chromium b/chromium/third_party/liburlpattern/README.chromium
index 25629bf64b0..74b13056e75 100644
--- a/chromium/third_party/liburlpattern/README.chromium
+++ b/chromium/third_party/liburlpattern/README.chromium
@@ -22,6 +22,7 @@ third_party/liburlpattern/OWNERS
 third_party/liburlpattern/options.h
 third_party/liburlpattern/parse.cc
 third_party/liburlpattern/parse.h
+third_party/liburlpattern/parse_fuzzer.cc
 third_party/liburlpattern/parse_unittest.cc
 third_party/liburlpattern/pattern.cc
 third_party/liburlpattern/pattern.h
diff --git a/chromium/third_party/liburlpattern/parse.h b/chromium/third_party/liburlpattern/parse.h
index add6eec22f3..9af64518cf4 100644
--- a/chromium/third_party/liburlpattern/parse.h
+++ b/chromium/third_party/liburlpattern/parse.h
@@ -29,10 +29,10 @@ class Pattern;
 typedef std::function<absl::StatusOr<std::string>(absl::string_view)>
     EncodeCallback;
 
-// Parse a pattern string and return the result.  The input |pattern| must
-// consist of UTF-8 characters.  Currently only group names may actually
-// contain non-ASCII characters, however.  Unicode characters in other parts
-// of the pattern will cause an error to be returned.  A |callback| must be
+// Parse a pattern string and return the result.  The parse will fail if the
+// input |pattern| is not valid UTF-8.  Currently only group names may actually
+// contain non-ASCII characters, however.  Unicode characters in other parts of
+// the pattern will cause an error to be returned.  A |callback| must be
 // provided to validate and encode plain text parts of the pattern.  An
 // |options| value may be provided to override default behavior.
 COMPONENT_EXPORT(LIBURLPATTERN)
diff --git a/chromium/third_party/liburlpattern/parse_fuzzer.cc b/chromium/third_party/liburlpattern/parse_fuzzer.cc
new file mode 100644
index 00000000000..802c705f483
--- /dev/null
+++ b/chromium/third_party/liburlpattern/parse_fuzzer.cc
@@ -0,0 +1,74 @@
+// Copyright 2021 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by an MIT-style license that can be
+// found in the LICENSE file.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "base/check.h"
+#include "base/check_op.h"
+#include "base/containers/span.h"
+#include "base/logging.h"
+#include "base/strings/strcat.h"
+#include "third_party/abseil-cpp/absl/status/statusor.h"
+#include "third_party/abseil-cpp/absl/strings/str_format.h"
+#include "third_party/abseil-cpp/absl/strings/string_view.h"
+#include "third_party/abseil-cpp/absl/types/optional.h"
+#include "third_party/liburlpattern/parse.h"
+#include "third_party/liburlpattern/pattern.h"
+
+namespace liburlpattern {
+namespace {
+absl::StatusOr<std::string> PassThrough(absl::string_view input) {
+  return std::string(input);
+}
+
+absl::optional<std::string> ParseAndCanonicalize(absl::string_view s) {
+  absl::StatusOr<Pattern> pattern = Parse(s, &PassThrough);
+  if (!pattern.ok()) {
+    LOG(INFO) << "Parse failed with status: " << pattern.status();
+    return absl::nullopt;
+  }
+  return pattern->GeneratePatternString();
+}
+
+std::string FancyHexDump(base::StringPiece label, base::StringPiece data) {
+  std::string char_line, hex_line;
+  for (char c : data) {
+    char_line.append(absl::StrFormat("%4c", c));
+    hex_line.append(absl::StrFormat("%4x", c));
+  }
+  return base::StrCat({label, "\n", char_line, "\n", hex_line});
+}
+
+struct Environment {
+  Environment() { logging::SetMinLogLevel(logging::LOG_INFO); }
+};
+}  // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  static Environment env;
+
+  // Make a copy of `data` on the heap to enable ASAN to catch OOB accesses.
+  std::string pattern_string(reinterpret_cast<const char*>(data), size);
+
+  absl::optional<std::string> canonical = ParseAndCanonicalize(pattern_string);
+  if (!canonical)
+    return 0;
+
+  // If `Pattern::GeneratePatternString()` generates canonical strings,
+  // recanonicalizing one of its outputs should always be a no-op. To test that
+  // property, let's check that `ParseAndCanonicalize()` is idempotent, i.e.
+  // that `canonical` is a fixed point of the function.
+  absl::optional<std::string> canonical2 = ParseAndCanonicalize(*canonical);
+  CHECK(canonical2)
+      << "Failed to parse canonical pattern from original input.\n"
+      << FancyHexDump("original : ", pattern_string) << "\n"
+      << FancyHexDump("canonical: ", *canonical);
+
+  CHECK_EQ(*canonical, *canonical2);
+  return 0;
+}
+}  // namespace liburlpattern
diff --git a/chromium/third_party/liburlpattern/pattern.cc b/chromium/third_party/liburlpattern/pattern.cc
index 9b39bc1f4c3..885417b10bc 100644
--- a/chromium/third_party/liburlpattern/pattern.cc
+++ b/chromium/third_party/liburlpattern/pattern.cc
@@ -7,6 +7,7 @@
 
 #include "third_party/abseil-cpp/absl/base/macros.h"
 #include "third_party/abseil-cpp/absl/strings/str_format.h"
+#include "third_party/icu/source/common/unicode/utf8.h"
 #include "third_party/liburlpattern/utils.h"
 
 namespace liburlpattern {
@@ -73,6 +74,14 @@ Part::Part(PartType t,
     ABSL_ASSERT(value.empty());
 }
 
+bool Part::HasCustomName() const {
+  // A custom name (e.g. `:foo`) follows javascript identifier rules and so
+  // never starts with a digit; automatically assigned names are numeric.
+  // Names may begin with a non-ASCII UTF-8 byte, so convert to unsigned char
+  // before calling std::isdigit() to avoid undefined behavior.
+  return !name.empty() && !std::isdigit(static_cast<unsigned char>(name[0]));
+}
+
 Pattern::Pattern(std::vector<Part> part_list,
                  Options options,
                  std::string segment_wildcard_regex)
@@ -93,8 +102,9 @@ std::string Pattern::GeneratePatternString() const {
   }
   result.reserve(estimated_length);
 
-  for (const Part& part : part_list_) {
-    //
+  for (size_t i = 0; i < part_list_.size(); ++i) {
+    const Part& part = part_list_[i];
+
     if (part.type == PartType::kFixed) {
       // A simple fixed string part.
       if (part.modifier == Modifier::kNone) {
@@ -111,20 +121,26 @@ std::string Pattern::GeneratePatternString() const {
       continue;
     }
 
-    // Determine if the part needs a grouping like `{ ... }`.  This is only
-    // necessary when using a non-automatic prefix or any suffix.
+    bool custom_name = part.HasCustomName();
+
+    // Determine if the part needs a grouping like `{ ... }`.  This is
+    // necessary when the group:
+    //
+    // 1. is using a non-automatic prefix or any suffix.
+    // 2. is followed by a matching group that may be represented by a
+    //    `(...)` expression.  This is necessary to avoid the following `(...)`
+    //    being mistakenly interpreted as the custom regexp for this
+    //    named group; like `:foo(...)`.
+    const Part* next_part =
+        (i + 1) < part_list_.size() ? &part_list_[i + 1] : nullptr;
     bool needs_grouping =
         !part.suffix.empty() ||
         (!part.prefix.empty() &&
          (part.prefix.size() != 1 ||
-          options_.prefix_list.find(part.prefix[0]) == std::string::npos));
-
-    // Determine if the part name was custom, like `:foo`, or an
-    // automatically assigned numeric value.  Since custom group
-    // names follow javascript identifier rules the first character
-    // cannot be a digit, so that is all we need to check here.
-    ABSL_ASSERT(!part.name.empty());
-    bool custom_name = !std::isdigit(part.name[0]);
+          options_.prefix_list.find(part.prefix[0]) == std::string::npos)) ||
+        (custom_name && part.modifier == Modifier::kNone && next_part &&
+         next_part->type != PartType::kFixed && next_part->prefix.empty() &&
+         next_part->suffix.empty() && !next_part->HasCustomName());
 
     // This is a full featured part.  We must generate a string that looks
     // like:
@@ -157,10 +173,17 @@ std::string Pattern::GeneratePatternString() const {
         result += ")";
       }
     } else if (part.type == PartType::kFullWildcard) {
-      // We can only use the `*` wildcard card if the automatic
-      // numeric name is used for the group.  A custom name
-      // requires the regexp `(.*)` explicitly.
-      if (!custom_name) {
+      const Part* last_part = i > 0 ? &part_list_[i - 1] : nullptr;
+      // We can only use the `*` wildcard if we meet a number
+      // of conditions.  We must use an explicit `(.*)` group if:
+      //
+      // 1. A custom name was used; e.g. `:foo(.*)`.
+      // 2. If the preceding group is a matching group without a modifier; e.g.
+      //    `(foo)(.*)`.  In that case we cannot emit the `*` shorthand without
+      //    it being mistakenly interpreted as the modifier for the previous
+      //    group.
+      if (!custom_name && (!last_part || last_part->type == PartType::kFixed ||
+                           last_part->modifier != Modifier::kNone)) {
         result += "*";
       } else {
         result += "(";
@@ -169,6 +192,21 @@ std::string Pattern::GeneratePatternString() const {
       }
     }
 
+    // If the matching group is a simple `:foo` custom name with the default
+    // segment wildcard, then we must check for a trailing suffix that could
+    // be interpreted as a trailing part of the name itself.  In these cases
+    // we must escape the beginning of the suffix in order to separate it
+    // from the end of the custom name; e.g. `:foo\\bar` instead of `:foobar`.
+    if (part.type == PartType::kSegmentWildcard && custom_name &&
+        !part.suffix.empty()) {
+      UChar32 codepoint = -1;
+      U8_GET(reinterpret_cast<const uint8_t*>(part.suffix.data()), 0, 0,
+             static_cast<int>(part.suffix.size()), codepoint);
+      if (IsNameCodepoint(codepoint, /*first_codepoint=*/false)) {
+        result += "\\";
+      }
+    }
+
     EscapePatternStringAndAppend(part.suffix, result);
 
     if (needs_grouping)
diff --git a/chromium/third_party/liburlpattern/pattern.h b/chromium/third_party/liburlpattern/pattern.h
index 67f46b1049f..f522f44c9d1 100644
--- a/chromium/third_party/liburlpattern/pattern.h
+++ b/chromium/third_party/liburlpattern/pattern.h
@@ -85,6 +85,10 @@ struct COMPONENT_EXPORT(LIBURLPATTERN) Part {
        std::string suffix,
        Modifier modifier);
   Part() = default;
+
+  // Returns true if the `name` member is a custom name; e.g. for a `:foo`
+  // group.
+  bool HasCustomName() const;
 };
 
 COMPONENT_EXPORT(LIBURLPATTERN)
diff --git a/chromium/third_party/liburlpattern/pattern_unittest.cc b/chromium/third_party/liburlpattern/pattern_unittest.cc
index 08e5f498cf8..74a53cdfc9e 100644
--- a/chromium/third_party/liburlpattern/pattern_unittest.cc
+++ b/chromium/third_party/liburlpattern/pattern_unittest.cc
@@ -330,6 +330,59 @@ TEST(PatternStringTest, RegexpEscapedPatternCharInSuffix) {
   RunPatternStringTest("/foo/{(foo)\\:bar}", "/foo/{(foo)\\:bar}");
 }
 
+TEST(PatternStringTest, RegexpFollowedByWildcard) {
+  RunPatternStringTest("(foo)(.*)", "(foo)(.*)");
+}
+
+TEST(PatternStringTest, RegexpWithOptionalModifierFollowedByWildcard) {
+  RunPatternStringTest("(foo)?(.*)", "(foo)?*");
+}
+
+TEST(PatternStringTest, RegexpWithSuffixModifierFollowedByWildcard) {
+  RunPatternStringTest("{(foo)a}(.*)", "{(foo)a}(.*)");
+}
+
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcard) {
+  RunPatternStringTest("{:foo}(.*)", "{:foo}(.*)");
+}
+
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByRegexp) {
+  RunPatternStringTest("{:foo}(bar)", "{:foo}(bar)");
+}
+
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardInGrouping) {
+  RunPatternStringTest("{:foo}{(.*)}", "{:foo}(.*)");
+}
+
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithSuffix) {
+  RunPatternStringTest("{:foo}{(.*)bar}", ":foo{(.*)bar}");
+}
+
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithPrefix) {
+  RunPatternStringTest("{:foo}{bar(.*)}", ":foo{bar(.*)}");
+}
+
+TEST(PatternStringTest, NamedGroupInGroupingFollowedByWildcardWithCustomName) {
+  RunPatternStringTest("{:foo}:bar(.*)", ":foo:bar(.*)");
+}
+
+TEST(PatternStringTest,
+     NamedGroupInGroupingWithOptionalModifierFollowedByWildcard) {
+  RunPatternStringTest("{:foo}?(.*)", ":foo?*");
+}
+
+TEST(PatternStringTest, NamedGroupWithEscapedValidNameSuffix) {
+  RunPatternStringTest("{:foo\\bar}", "{:foo\\bar}");
+}
+
+TEST(PatternStringTest, NamedGroupWithEscapedInvalidNameSuffix) {
+  RunPatternStringTest("{:foo\\.bar}", "{:foo.bar}");
+}
+
+TEST(PatternStringTest, NamedGroupWithCustomRegexpAndValidNameSuffix) {
+  RunPatternStringTest("{:foo(baz)bar}", "{:foo(baz)bar}");
+}
+
 struct DirectMatchCase {
   absl::string_view input;
   bool expected_match = true;
diff --git a/chromium/third_party/liburlpattern/tokenize.cc b/chromium/third_party/liburlpattern/tokenize.cc
index ff70c58c4c9..cbc38dc2062 100644
--- a/chromium/third_party/liburlpattern/tokenize.cc
+++ b/chromium/third_party/liburlpattern/tokenize.cc
@@ -5,9 +5,11 @@
 
 #include "third_party/liburlpattern/tokenize.h"
 
+#include "base/compiler_specific.h"
 #include "third_party/abseil-cpp/absl/strings/str_format.h"
 #include "third_party/icu/source/common/unicode/uchar.h"
 #include "third_party/icu/source/common/unicode/utf8.h"
+#include "third_party/liburlpattern/utils.h"
 
 // The following code is a translation from the path-to-regexp typescript at:
 //
@@ -23,24 +25,6 @@ bool IsASCII(UChar32 c) {
   return c >= 0x00 && c <= 0x7f;
 }
 
-bool IsNameCodepoint(UChar32 c, bool first_codepoint) {
-  // Require group names to follow the same character restrictions as
-  // javascript identifiers.  This code originates from v8 at:
-  //
-  // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/strings/char-predicates.cc;l=17-34;drc=be014256adea1552d4a044ef80616cdab6a7d549
-  //
-  // We deviate from js identifiers, however, in not support the backslash
-  // character.  This is mainly used in js identifiers to allow escaped
-  // unicode sequences to be written in ascii.  The js engine, however,
-  // should take care of this long before we reach this level of code.  So
-  // we don't need to handle it here.
-  if (first_codepoint) {
-    return u_hasBinaryProperty(c, UCHAR_ID_START) || c == '$' || c == '_';
-  }
-  return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == '$' || c == '_' ||
-         c == 0x200c || c == 0x200d;
-}
-
 class Tokenizer {
  public:
   Tokenizer(absl::string_view pattern, TokenizePolicy policy)
@@ -53,7 +37,10 @@ class Tokenizer {
       if (!status_.ok())
         return std::move(status_);
 
-      NextAt(index_);
+      if (!NextAt(index_)) {
+        Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", index_));
+        continue;
+      }
       if (codepoint_ == '*') {
         AddToken(TokenType::kAsterisk);
         continue;
@@ -73,7 +60,12 @@ class Tokenizer {
           continue;
         }
         size_t escaped_i = next_index_;
-        Next();
+        if (!Next()) {
+          Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
+                                next_index_));
+          continue;
+        }
+
         AddToken(TokenType::kEscapedChar, next_index_, escaped_i);
         continue;
       }
@@ -94,7 +86,12 @@ class Tokenizer {
 
         // Iterate over codepoints until we find the first non-name codepoint.
         while (pos < pattern_.size()) {
-          NextAt(pos);
+          if (!status_.ok())
+            return std::move(status_);
+          if (!NextAt(pos)) {
+            Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", pos));
+            continue;
+          }
           if (!IsNameCodepoint(codepoint_, pos == name_start))
             break;
           pos = next_index_;
@@ -117,7 +114,11 @@ class Tokenizer {
         bool error = false;
 
         while (j < pattern_.size()) {
-          NextAt(j);
+          if (!NextAt(j)) {
+            Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", j));
+            error = true;
+            break;
+          }
 
           if (!IsASCII(codepoint_)) {
             Error(absl::StrFormat(
@@ -149,7 +150,12 @@ class Tokenizer {
               break;
             }
             size_t escaped_j = next_index_;
-            Next();
+            if (!Next()) {
+              Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
+                                    next_index_));
+              error = true;
+              break;
+            }
             if (!IsASCII(codepoint_)) {
               Error(absl::StrFormat(
                         "Invalid non-ASCII character 0x%02x at index %d.",
@@ -177,7 +183,12 @@ class Tokenizer {
               break;
             }
             size_t tmp_j = next_index_;
-            Next();
+            if (!Next()) {
+              Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
+                                    next_index_));
+              error = true;
+              break;
+            }
             // Require the the first character after an open paren is `?`.  This
             // permits assertions, named capture groups, and non-capturing
             // groups. It blocks, however, unnamed capture groups.
@@ -229,17 +240,20 @@ class Tokenizer {
  private:
   // Read the codepoint at `next_index_` in `pattern_` and store it in
   // `codepoint_`.  In addition, `next_index_` is updated to the codepoint to be
-  // read next.
-  void Next() {
+  // read next.  Returns true iff the codepoint was read successfully. On
+  // success, `codepoint_` is non-negative.
+  bool Next() WARN_UNUSED_RESULT {
     U8_NEXT(pattern_.data(), next_index_, pattern_.size(), codepoint_);
+    return codepoint_ >= 0;
   }
 
   // Read the codepoint at the specified `index` in `pattern_` and store it in
   // `codepoint_`.  In addition, `next_index_` is updated to the codepoint to be
-  // read next.
-  void NextAt(size_t index) {
+  // read next.  Returns true iff the codepoint was read successfully. On
+  // success, `codepoint_` is non-negative.
+  bool NextAt(size_t index) WARN_UNUSED_RESULT {
     next_index_ = index;
-    Next();
+    return Next();
   }
 
   // Append a Token to our list of the given `type` and with a value consisting
diff --git a/chromium/third_party/liburlpattern/tokenize.h b/chromium/third_party/liburlpattern/tokenize.h
index ba33741f7b0..0141bc37fb1 100644
--- a/chromium/third_party/liburlpattern/tokenize.h
+++ b/chromium/third_party/liburlpattern/tokenize.h
@@ -94,9 +94,10 @@ inline bool operator!=(const Token& lh, const Token& rh) {
 COMPONENT_EXPORT(LIBURLPATTERN)
 std::ostream& operator<<(std::ostream& o, Token token);
 
-// Split the given input pattern string into a list of lexical tokens. Note,
-// the generated Token objects simply reference positions within the input
-// |pattern|.  The |pattern| must be kept alive as long as the Token objects.
+// Split the given input pattern string into a list of lexical tokens.
+// Tokenizing will fail if |pattern| is not valid UTF-8.  Note, the generated
+// Token objects simply reference positions within the input |pattern|.  The
+// |pattern| must be kept alive as long as the Token objects.
 COMPONENT_EXPORT(LIBURLPATTERN)
 absl::StatusOr<std::vector<Token>> Tokenize(
     absl::string_view pattern,
diff --git a/chromium/third_party/liburlpattern/tokenize_unittest.cc b/chromium/third_party/liburlpattern/tokenize_unittest.cc
index ac37983488a..46900e40226 100644
--- a/chromium/third_party/liburlpattern/tokenize_unittest.cc
+++ b/chromium/third_party/liburlpattern/tokenize_unittest.cc
@@ -288,9 +288,9 @@ TEST(TokenizeTest, RegexWithTrailingEscapedChar) {
 }
 
 TEST(TokenizeTest, RegexWithEscapedInvalidChar) {
-  // Use a single byte invalid character since the escape only applies to the
-  // next byte character.
-  RunTokenizeTest("(\\\xff)",
+  // Use a valid UTF-8 sequence (encoding of U+00A2) that encodes a non-ASCII
+  // character.
+  RunTokenizeTest("(\\\xc2\xa2)",
                   absl::InvalidArgumentError("Invalid non-ASCII character"));
 }
 
@@ -477,4 +477,36 @@ TEST(TokenizeTest, LenientPolicyRegexWithCaptureGroup) {
   RunTokenizeTest("(foo(bar))", expected_tokens, TokenizePolicy::kLenient);
 }
 
+TEST(TokenizeTest, InvalidUtf8) {
+  RunTokenizeTest("hello\xcdworld", absl::InvalidArgumentError(
+                                        "Invalid UTF-8 codepoint at index 5."));
+}
+
+TEST(TokenizeTest, InvalidUtf8Escaped) {
+  RunTokenizeTest(
+      "hello\\\xcdworld",
+      absl::InvalidArgumentError("Invalid UTF-8 codepoint at index 7."));
+}
+
+TEST(TokenizeTest, InvalidUtf8InName) {
+  RunTokenizeTest(
+      "/:foo:hello\xcdworld",
+      absl::InvalidArgumentError("Invalid UTF-8 codepoint at index 11."));
+}
+
+TEST(TokenizeTest, InvalidUtf8InRegexGroup) {
+  RunTokenizeTest("(foo\xcd)", absl::InvalidArgumentError(
+                                   "Invalid UTF-8 codepoint at index 4."));
+}
+
+TEST(TokenizeTest, InvalidUtf8EscapedInRegexGroup) {
+  RunTokenizeTest("(foo\\\xcd)", absl::InvalidArgumentError(
+                                     "Invalid UTF-8 codepoint at index 6."));
+}
+
+TEST(TokenizeTest, InvalidUtf8InNestedRegexGroup) {
+  RunTokenizeTest("(foo(\xcd))", absl::InvalidArgumentError(
+                                     "Invalid UTF-8 codepoint at index 6."));
+}
+
 }  // namespace liburlpattern
diff --git a/chromium/third_party/liburlpattern/utils.cc b/chromium/third_party/liburlpattern/utils.cc
index c9955e1d2e0..5f7f2d69227 100644
--- a/chromium/third_party/liburlpattern/utils.cc
+++ b/chromium/third_party/liburlpattern/utils.cc
@@ -52,4 +52,22 @@ std::string EscapeRegexpString(absl::string_view input) {
   return result;
 }
 
+bool IsNameCodepoint(UChar32 c, bool first_codepoint) {
+  // Require group names to follow the same character restrictions as
+  // javascript identifiers.  This code originates from v8 at:
+  //
+  // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/strings/char-predicates.cc;l=17-34;drc=be014256adea1552d4a044ef80616cdab6a7d549
+  //
+  // We deviate from js identifiers, however, in not supporting the backslash
+  // character.  This is mainly used in js identifiers to allow escaped
+  // unicode sequences to be written in ascii.  The js engine, however,
+  // should take care of this long before we reach this level of code.  So
+  // we don't need to handle it here.
+  if (first_codepoint) {
+    return u_hasBinaryProperty(c, UCHAR_ID_START) || c == '$' || c == '_';
+  }
+  return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == '$' || c == '_' ||
+         c == 0x200c || c == 0x200d;
+}
+
 }  // namespace liburlpattern
diff --git a/chromium/third_party/liburlpattern/utils.h b/chromium/third_party/liburlpattern/utils.h
index 02826b5f321..201f7f99672 100644
--- a/chromium/third_party/liburlpattern/utils.h
+++ b/chromium/third_party/liburlpattern/utils.h
@@ -9,6 +9,7 @@
 #include <string>
 #include "base/component_export.h"
 #include "third_party/abseil-cpp/absl/strings/string_view.h"
+#include "third_party/icu/source/common/unicode/uchar.h"
 
 namespace liburlpattern {
 
@@ -36,6 +37,12 @@ COMPONENT_EXPORT(LIBURLPATTERN)
 void EscapePatternStringAndAppend(absl::string_view input,
                                   std::string& append_target);
 
+// Return `true` if the given codepoint `c` is valid for a `:foo` name.  The
+// `first_codepoint` argument can be set if this codepoint is intended to be
+// the first codepoint in a name.  If it's false, then the codepoint is treated
+// as a trailing character.
+bool IsNameCodepoint(UChar32 c, bool first_codepoint);
+
 }  // namespace liburlpattern
 
 #endif  // THIRD_PARTY_LIBURLPATTERN_UTILS_H_
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2022-02-04 17:20:24 +0100
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2022-02-12 08:15:25 +0000
commit	8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b (patch)
tree	788d8d7549712682703a0310ca4a0f0860d4802b /chromium/third_party/liburlpattern
parent	606d85f2a5386472314d39923da28c70c60dc8e7 (diff)
download	qtwebengine-chromium-8fa0776f1f79e91fc9c0b9c1ba11a0a29c05196b.tar.gz