[Backport] CVE-2019-13633 [1/2]

Block U+4E00 and U+3127 from IDN when used next to non-CJK characters. This CL blocks CJK unified ideograph 一 and Bopomofo letter I (ㄧ) from domain names if they are next to non-CJK characters. As a result, the domain will be shown as punycode. U+2F00 (Kangxi Radical One) is a similar character but it's normalized to U+4E00 and implicitly blocked. This change doesn't affect any popular domains. It also doesn't prevent attacks with pure CJK characters, unfortunately. Such attacks are more likely to be prevented by the lookalike domain warnings launched in M75. Bug: 863661 Change-Id: I600fef90a0a1ebb12b3c707fa529e4a5711b2c0c Commit-Queue: Mustafa Emre Acer <meacer@chromium.org> Reviewed-by: Tommy Li <tommycli@chromium.org> Reviewed-by: Daniel Cheng <dcheng@chromium.org> Cr-Commit-Position: refs/heads/master@{#670711} Reviewed-by: Michal Klocek <michal.klocek@qt.io>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2019-10-15 10:27:29 +0200
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2019-10-16 09:05:08 +0000
commit: ade14af90b01f12a904f4e8ed324bb2bc056e1ef (patch)
tree: 8788960e60b66c0f0b2150a3a1c0ecac5f0f04a4
parent: b1ce3367b4895c1f57de79ef081f4b7bbaa011f8 (diff)
download: qtwebengine-chromium-ade14af90b01f12a904f4e8ed324bb2bc056e1ef.tar.gz
2 files changed, 63 insertions, 27 deletions
diff --git a/chromium/components/url_formatter/idn_spoof_checker.cc b/chromium/components/url_formatter/idn_spoof_checker.cc
index 6029873b282..47e42faed22 100644
--- a/chromium/components/url_formatter/idn_spoof_checker.cc
+++ b/chromium/components/url_formatter/idn_spoof_checker.cc
@@ -305,44 +305,64 @@ bool IDNSpoofChecker::SafeToDisplayAsUnicode(base::StringPiece16 label,
   icu::RegexMatcher* dangerous_pattern =
       reinterpret_cast<icu::RegexMatcher*>(DangerousPatternTLS().Get());
   if (!dangerous_pattern) {
-    // Disallow the katakana no, so, zo, or n, as they may be mistaken for
-    // slashes when they're surrounded by non-Japanese scripts (i.e. scripts
-    // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a
-    // non-Japanese script on either side is disallowed, legitimate cases like
-    // '{vitamin in Katakana}b6' are blocked. Note that trying to block those
-    // characters when used alone as a label is futile because those cases
-    // would not reach here.
-    // Also disallow what used to be blocked by mixed-script-confusable (MSC)
-    // detection. ICU 58 does not detect MSC any more for a single input string.
-    // See http://bugs.icu-project.org/trac/ticket/12823 .
-    // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
-    // - Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-Katakana
-    //   Prolonged Sound) used out-of-context.
-    // - Dislallow U+30FD/E (Katakana iteration mark/voiced iteration mark)
-    //   unless they're preceded by a Katakana.
-    // - Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
-    //   (U+30D[8-A]) that look exactly like each other when they're used in a
-    //   label otherwise entirely in Katakna or Hiragana.
-    // - Disallow combining diacritical mark (U+0300-U+0339) after a non-LGC
-    //   character. Other combining diacritical marks are not in the allowed
-    //   character set.
-    // - Disallow dotless i (U+0131) followed by a combining mark.
-    // - Disallow combining Kana voiced sound marks.
-    // - Disallow U+0307 (dot above) after 'i', 'j', 'l' or dotless i (U+0131).
-    //   Dotless j (U+0237) is not in the allowed set to begin with.
+    // The parentheses in the below strings belong to the raw string sequence
+    // R"(...)". They are NOT part of the regular expression. Each sub
+    // regex is OR'ed with the | operator.
     dangerous_pattern = new icu::RegexMatcher(
         icu::UnicodeString(
+            // Disallow the katakana no (U+30ce), so (U+30bd), zo (U+30be), or
+            // n (U+30f3), as they may be mistaken for slashes when they're
+            // surrounded by non-Japanese scripts (i.e. scripts other than
+            // Katakana, Hiragana or Han). If {no, so, zo, n} next to a
+            // non-Japanese script on either side is disallowed, legitimate
+            // cases like '{vitamin in Katakana}b6' are blocked. Note that
+            // trying to block those characters when used alone as a label is
+            // futile because those cases would not reach here. Also disallow
+            // what used to be blocked by mixed-script-confusable (MSC)
+            // detection. ICU 58 does not detect MSC any more for a single input
+            // string. See http://bugs.icu-project.org/trac/ticket/12823 .
+            // TODO(jshin): adjust the pattern once the above ICU bug is fixed.
             R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
             R"([\u30ce\u30f3\u30bd\u30be])"
             R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"
-            R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"
+
+            // Disallow U+30FD (Katakana iteration mark) and U+30FE (Katakana
+            // voiced iteration mark) unless they're preceded by a Katakana.
             R"([^\p{scx=kana}][\u30fd\u30fe]|^[\u30fd\u30fe]|)"
+
+            // Disallow three Hiragana letters (U+307[8-A]) or Katakana letters
+            // (U+30D[8-A]) that look exactly like each other when they're used
+            // in a label otherwise entirely in Katakana or Hiragana.
             R"(^[\p{scx=kana}]+[\u3078-\u307a][\p{scx=kana}]+$|)"
             R"(^[\p{scx=hira}]+[\u30d8-\u30da][\p{scx=hira}]+$|)"
+
+            // Disallow U+30FB (Katakana Middle Dot) and U+30FC (Hiragana-
+            // Katakana Prolonged Sound) used out-of-context.
+            R"([^\p{scx=kana}\p{scx=hira}]\u30fc|^\u30fc|)"
             R"([a-z]\u30fb|\u30fb[a-z]|)"
+
+            // Disallow U+4E00 (CJK unified ideograph) and U+3127 (Bopomofo
+            // Letter I) unless they are next to Hiragana, Katakana or Han.
+            // U+2F00 (Kangxi Radical One) is similar, but it's normalized to
+            // U+4E00 so it's not explicitly checked here.
+            R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}])"
+            R"([\u4e00\u3127])"
+            R"([^\p{scx=kana}\p{scx=hira}\p{scx=hani}]|)"
+
+            // Disallow combining diacritical mark (U+0300-U+0339) after a
+            // non-LGC character. Other combining diacritical marks are not in
+            // the allowed character set.
             R"([^\p{scx=latn}\p{scx=grek}\p{scx=cyrl}][\u0300-\u0339]|)"
+
+            // Disallow dotless i (U+0131) followed by a combining mark.
             R"(\u0131[\u0300-\u0339]|)"
+
+            // Disallow combining Kana voiced sound marks.
             R"(\u3099|\u309A|)"
+
+            // Disallow U+0307 (dot above) after 'i', 'j', 'l' or dotless i
+            // (U+0131). Dotless j (U+0237) is not in the allowed set to begin
+            // with.
             R"([ijl]\u0307)",
             -1, US_INV),
         0, status);
@@ -357,8 +377,9 @@ std::string IDNSpoofChecker::GetSimilarTopDomain(base::StringPiece16 hostname) {
   for (const std::string& skeleton : GetSkeletons(hostname)) {
     DCHECK(!skeleton.empty());
     std::string matching_top_domain = LookupSkeletonInTopDomains(skeleton);
-    if (!matching_top_domain.empty())
+    if (!matching_top_domain.empty()) {
       return matching_top_domain;
+    }
   }
   return std::string();
 }
diff --git a/chromium/components/url_formatter/url_formatter_unittest.cc b/chromium/components/url_formatter/url_formatter_unittest.cc
index e935fe501d5..b55a37d27d4 100644
--- a/chromium/components/url_formatter/url_formatter_unittest.cc
+++ b/chromium/components/url_formatter/url_formatter_unittest.cc
@@ -1030,6 +1030,21 @@ const IDNTestCase idn_cases[] = {
 
     // Modifier-letter-voicing should be blocked (wwwˬtest.com).
     {"xn--wwwtest-2be.com", L"www\x02ectest.com", false},
+
+    // U+4E00 and U+3127 should be blocked when next to non-CJK.
+    {"xn--ipaddress-w75n.com", L"ip一address.com", false},
+    {"xn--ipaddress-wx5h.com", L"ipㄧaddress.com", false},
+    // These are allowed because 一 is not immediately next to non-CJK.
+    {"xn--gamer-fg1hz05u.com", L"一生gamer.com", true},
+    {"xn--gamer-kg1hy05u.com", L"gamer生一.com", true},
+    {"xn--4gqz91g.com", L"一猫.com", true},
+    {"xn--4fkv10r.com", L"ㄧ猫.com", true},
+    // U+4E00 with another ideograph.
+    {"xn--4gqc.com", L"一丁.com", true},
+
+    // Kana voiced sound marks are not allowed.
+    {"xn--google-1m4e.com", L"google\x3099.com", false},
+    {"xn--google-8m4e.com", L"google\x309A.com", false},
 };
 
 struct AdjustOffsetCase {
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2019-10-15 10:27:29 +0200
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2019-10-16 09:05:08 +0000
commit	ade14af90b01f12a904f4e8ed324bb2bc056e1ef (patch)
tree	8788960e60b66c0f0b2150a3a1c0ecac5f0f04a4
parent	b1ce3367b4895c1f57de79ef081f4b7bbaa011f8 (diff)
download	qtwebengine-chromium-ade14af90b01f12a904f4e8ed324bb2bc056e1ef.tar.gz