summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2018-01-29 11:16:14 +0100
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2018-02-02 10:38:42 +0000
commit2d07ab20669cfb33c08079c4661cf80cdcaa6f3e (patch)
tree7458c24433713b728cc6c2a3422f9b3865662bdb
parent5cb725e1b4b8e313c7d847ae46c1dbe3fbfaaa57 (diff)
downloadqtwebengine-chromium-2d07ab20669cfb33c08079c4661cf80cdcaa6f3e.tar.gz
[Backport] [M64 branch] Add a few more confusable map entries
1. Map Malayalam U+0D1F to 's'. 2. Map 'small-cap-like' Cyrillic letters to "look-alike" Latin lowercase letters. The characters in new confusable map entries are replaced by their Latin "look-alike" characters before the skeleton is calculated to compare with top domain names. TBR=jshin@chromium.org (cherry picked from commit b3f0207c14fccc11aaa9d4975ebe46554ad289cb) Bug: 784761,773930 Test: components_unittests --gtest_filter=*IDNToUni* Reviewed-on: https://chromium-review.googlesource.com/805214 Reviewed-by: Peter Kasting <pkasting@chromium.org> Commit-Queue: Jungshik Shin <jshin@chromium.org> Cr-Original-Commit-Position: refs/heads/master@{#521648} Reviewed-on: https://chromium-review.googlesource.com/852973 Reviewed-by: Jungshik Shin <jshin@chromium.org> Cr-Commit-Position: refs/branch-heads/3282@{#421} Cr-Branched-From: 5fdc0fab22ce7efd32532ee989b223fa12f8171e-refs/heads/master@{#520840} (CVE-2018-6042) Change-Id: Ie3bf95a49aacda093e5e8f91e44c0a8a7dda08f0 Reviewed-by: Michal Klocek <michal.klocek@qt.io>
-rw-r--r--chromium/components/url_formatter/idn_spoof_checker.cc32
-rw-r--r--chromium/components/url_formatter/idn_spoof_checker.h3
-rw-r--r--chromium/components/url_formatter/top_domains/alexa_domains.list3
-rw-r--r--chromium/components/url_formatter/top_domains/alexa_skeletons.gperf3
-rwxr-xr-xchromium/components/url_formatter/top_domains/make_alexa_top_list.py1
-rw-r--r--chromium/components/url_formatter/url_formatter_unittest.cc13
6 files changed, 46 insertions, 9 deletions
diff --git a/chromium/components/url_formatter/idn_spoof_checker.cc b/chromium/components/url_formatter/idn_spoof_checker.cc
index f4c28d87d6e..4f5a594b643 100644
--- a/chromium/components/url_formatter/idn_spoof_checker.cc
+++ b/chromium/components/url_formatter/idn_spoof_checker.cc
@@ -133,15 +133,31 @@ IDNSpoofChecker::IDNSpoofChecker() {
// Used for diacritics-removal before the skeleton calculation. Add
// "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark
- // removal; NFC". On top of that, supplement the Unicode confusable list by
- // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by
- // 'k', 'l' and 'n', respectively.
+ // removal; NFC".
// TODO(jshin): Revisit "ł > l; ø > o" mapping.
UParseError parse_error;
- transliterator_.reset(icu::Transliterator::createFromRules(
+ diacritic_remover_.reset(icu::Transliterator::createFromRules(
UNICODE_STRING_SIMPLE("DropAcc"),
icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;"
- " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"),
+ " ł > l; ø > o; đ > d;"),
+ UTRANS_FORWARD, parse_error, status));
+
+ // Supplement the Unicode confusable list by the following mapping.
+ // - U+04CF (ӏ) => l
+ // - {U+043A (к), U+0138(ĸ), U+03BA(κ)} => k
+ // - U+043F(п) => n
+ // - {U+0185 (ƅ), U+044C (ь)} => b
+ // - U+0432 (в) => b
+ // - U+043C (м) => m
+ // - U+043D (н) => h
+ // - U+0442 (т) => t
+ // - {U+0448 (ш), U+0449 (щ)} => w
+ // - U+0D1F (ട) => s
+ extra_confusable_mapper_.reset(icu::Transliterator::createFromRules(
+ UNICODE_STRING_SIMPLE("ExtraConf"),
+ icu::UnicodeString(
+ "ӏ > l; [кĸκ] > k; п > n; [ƅь] > b; в > b; м > m; н > h; "
+ "т > t; [шщ] > w; ട > s;"),
UTRANS_FORWARD, parse_error, status));
DCHECK(U_SUCCESS(status))
<< "Spoofchecker initalization failed due to an error: "
@@ -270,7 +286,8 @@ bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
// attached to non-LGC characters are already blocked.
if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) ==
ustr_host.length())
- transliterator_.get()->transliterate(ustr_host);
+ diacritic_remover_.get()->transliterate(ustr_host);
+ extra_confusable_mapper_.get()->transliterate(ustr_host);
UErrorCode status = U_ZERO_ERROR;
icu::UnicodeString ustr_skeleton;
@@ -279,8 +296,7 @@ bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) {
if (U_FAILURE(status))
return false;
std::string skeleton;
- ustr_skeleton.toUTF8String(skeleton);
- return LookupMatchInTopDomains(skeleton);
+ return LookupMatchInTopDomains(ustr_skeleton.toUTF8String(skeleton));
}
bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic(
diff --git a/chromium/components/url_formatter/idn_spoof_checker.h b/chromium/components/url_formatter/idn_spoof_checker.h
index 36d7c789693..02dbab36cac 100644
--- a/chromium/components/url_formatter/idn_spoof_checker.h
+++ b/chromium/components/url_formatter/idn_spoof_checker.h
@@ -67,7 +67,8 @@ class IDNSpoofChecker {
icu::UnicodeSet cyrillic_letters_;
icu::UnicodeSet cyrillic_letters_latin_alike_;
icu::UnicodeSet lgc_letters_n_ascii_;
- std::unique_ptr<icu::Transliterator> transliterator_;
+ std::unique_ptr<icu::Transliterator> diacritic_remover_;
+ std::unique_ptr<icu::Transliterator> extra_confusable_mapper_;
IDNSpoofChecker(const IDNSpoofChecker&) = delete;
void operator=(const IDNSpoofChecker&) = delete;
diff --git a/chromium/components/url_formatter/top_domains/alexa_domains.list b/chromium/components/url_formatter/top_domains/alexa_domains.list
index a76cc116ada..3fc6c60b0e1 100644
--- a/chromium/components/url_formatter/top_domains/alexa_domains.list
+++ b/chromium/components/url_formatter/top_domains/alexa_domains.list
@@ -9174,3 +9174,6 @@ lyft.com
digklmo68.com
digklmo68.co.uk
islkpx123.com
+os345.com
+woder.com
+wmhtb.com
diff --git a/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf b/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf
index 957c69b1e89..7182bf3f755 100644
--- a/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf
+++ b/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf
@@ -9183,4 +9183,7 @@ lyft.corn, 1
digklrno68.corn, 1
digklrno68.co.uk, 1
islkpxl23.corn, 1
+os345.corn, 1
+woder.corn, 1
+wrnhtb.corn, 1
%%
diff --git a/chromium/components/url_formatter/top_domains/make_alexa_top_list.py b/chromium/components/url_formatter/top_domains/make_alexa_top_list.py
index 20820e84c3e..79c4325c3c3 100755
--- a/chromium/components/url_formatter/top_domains/make_alexa_top_list.py
+++ b/chromium/components/url_formatter/top_domains/make_alexa_top_list.py
@@ -53,3 +53,4 @@ with open(alexa_out, 'w') as outfile, open(alexa10k_path, 'r') as infile:
# Add a few made-up domains for testing.
outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
outfile.write("islkpx123.com\n")
+ outfile.write("os345.com\nwoder.com\nwmhtb.com\n")
diff --git a/chromium/components/url_formatter/url_formatter_unittest.cc b/chromium/components/url_formatter/url_formatter_unittest.cc
index e865c7204e3..dc46a41ed63 100644
--- a/chromium/components/url_formatter/url_formatter_unittest.cc
+++ b/chromium/components/url_formatter/url_formatter_unittest.cc
@@ -406,6 +406,19 @@ const IDNTestCase idn_cases[] = {
L"123.com",
false},
+ // wmhtb.com
+ {"xn--l1acpvx.com", L"\x0448\x043c\x043d\x0442\x044c.com", false},
+ // щмнть.com
+ {"xn--l1acpzs.com", L"\x0449\x043c\x043d\x0442\x044c.com", false},
+ // шмнтв.com
+ {"xn--b1atdu1a.com", L"\x0448\x043c\x043d\x0442\x0432.com", false},
+ // ഠട345.com
+ {"xn--345-jtke.com", L"\x0d20\x0d1f" L"345.com", false},
+
+ // At one point the skeleton of 'w' was 'vv'; ensure that
+ // it's treated as 'w'.
+ {"xn--wder-qqa.com", L"w\x00f3" L"der.com", false},
+
// Mixed digits: the first two will also fail mixed script test
// Latin + ASCII digit + Deva digit
{"xn--asc1deva-j0q.co.in", L"asc1deva\x0967.co.in", false},