diff options
6 files changed, 46 insertions, 9 deletions
diff --git a/chromium/components/url_formatter/idn_spoof_checker.cc b/chromium/components/url_formatter/idn_spoof_checker.cc index f4c28d87d6e..4f5a594b643 100644 --- a/chromium/components/url_formatter/idn_spoof_checker.cc +++ b/chromium/components/url_formatter/idn_spoof_checker.cc @@ -133,15 +133,31 @@ IDNSpoofChecker::IDNSpoofChecker() { // Used for diacritics-removal before the skeleton calculation. Add // "ł > l; ø > o; đ > d" that are not handled by "NFD; Nonspacing mark - // removal; NFC". On top of that, supplement the Unicode confusable list by - // replacing {U+043A (к), U+0138(ĸ), U+03BA(κ)}, U+04CF (ӏ) and U+043F(п) by - // 'k', 'l' and 'n', respectively. + // removal; NFC". // TODO(jshin): Revisit "ł > l; ø > o" mapping. UParseError parse_error; - transliterator_.reset(icu::Transliterator::createFromRules( + diacritic_remover_.reset(icu::Transliterator::createFromRules( UNICODE_STRING_SIMPLE("DropAcc"), icu::UnicodeString("::NFD; ::[:Nonspacing Mark:] Remove; ::NFC;" - " ł > l; ø > o; đ > d; ӏ > l; [кĸκ] > k; п > n;"), + " ł > l; ø > o; đ > d;"), + UTRANS_FORWARD, parse_error, status)); + + // Supplement the Unicode confusable list by the following mapping. + // - U+04CF (ӏ) => l + // - {U+043A (к), U+0138(ĸ), U+03BA(κ)} => k + // - U+043F(п) => n + // - {U+0185 (ƅ), U+044C (ь)} => b + // - U+0432 (в) => b + // - U+043C (м) => m + // - U+043D (н) => h + // - U+0442 (т) => t + // - {U+0448 (ш), U+0449 (щ)} => w + // - U+0D1F (ട) => s + extra_confusable_mapper_.reset(icu::Transliterator::createFromRules( + UNICODE_STRING_SIMPLE("ExtraConf"), + icu::UnicodeString( + "ӏ > l; [кĸκ] > k; п > n; [ƅь] > b; в > b; м > m; н > h; " + "т > t; [шщ] > w; ട > s;"), UTRANS_FORWARD, parse_error, status)); DCHECK(U_SUCCESS(status)) << "Spoofchecker initalization failed due to an error: " @@ -270,7 +286,8 @@ bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) { // attached to non-LGC characters are already blocked. 
if (lgc_letters_n_ascii_.span(ustr_host, 0, USET_SPAN_CONTAINED) == ustr_host.length()) - transliterator_.get()->transliterate(ustr_host); + diacritic_remover_.get()->transliterate(ustr_host); + extra_confusable_mapper_.get()->transliterate(ustr_host); UErrorCode status = U_ZERO_ERROR; icu::UnicodeString ustr_skeleton; @@ -279,8 +296,7 @@ bool IDNSpoofChecker::SimilarToTopDomains(base::StringPiece16 hostname) { if (U_FAILURE(status)) return false; std::string skeleton; - ustr_skeleton.toUTF8String(skeleton); - return LookupMatchInTopDomains(skeleton); + return LookupMatchInTopDomains(ustr_skeleton.toUTF8String(skeleton)); } bool IDNSpoofChecker::IsMadeOfLatinAlikeCyrillic( diff --git a/chromium/components/url_formatter/idn_spoof_checker.h b/chromium/components/url_formatter/idn_spoof_checker.h index 36d7c789693..02dbab36cac 100644 --- a/chromium/components/url_formatter/idn_spoof_checker.h +++ b/chromium/components/url_formatter/idn_spoof_checker.h @@ -67,7 +67,8 @@ class IDNSpoofChecker { icu::UnicodeSet cyrillic_letters_; icu::UnicodeSet cyrillic_letters_latin_alike_; icu::UnicodeSet lgc_letters_n_ascii_; - std::unique_ptr<icu::Transliterator> transliterator_; + std::unique_ptr<icu::Transliterator> diacritic_remover_; + std::unique_ptr<icu::Transliterator> extra_confusable_mapper_; IDNSpoofChecker(const IDNSpoofChecker&) = delete; void operator=(const IDNSpoofChecker&) = delete; diff --git a/chromium/components/url_formatter/top_domains/alexa_domains.list b/chromium/components/url_formatter/top_domains/alexa_domains.list index a76cc116ada..3fc6c60b0e1 100644 --- a/chromium/components/url_formatter/top_domains/alexa_domains.list +++ b/chromium/components/url_formatter/top_domains/alexa_domains.list @@ -9174,3 +9174,6 @@ lyft.com digklmo68.com digklmo68.co.uk islkpx123.com +os345.com +woder.com +wmhtb.com diff --git a/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf b/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf index 
957c69b1e89..7182bf3f755 100644 --- a/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf +++ b/chromium/components/url_formatter/top_domains/alexa_skeletons.gperf @@ -9183,4 +9183,7 @@ lyft.corn, 1 digklrno68.corn, 1 digklrno68.co.uk, 1 islkpxl23.corn, 1 +os345.corn, 1 +woder.corn, 1 +wrnhtb.corn, 1 %% diff --git a/chromium/components/url_formatter/top_domains/make_alexa_top_list.py b/chromium/components/url_formatter/top_domains/make_alexa_top_list.py index 20820e84c3e..79c4325c3c3 100755 --- a/chromium/components/url_formatter/top_domains/make_alexa_top_list.py +++ b/chromium/components/url_formatter/top_domains/make_alexa_top_list.py @@ -53,3 +53,4 @@ with open(alexa_out, 'w') as outfile, open(alexa10k_path, 'r') as infile: # Add a few made-up domains for testing. outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n") outfile.write("islkpx123.com\n") + outfile.write("os345.com\nwoder.com\nwmhtb.com\n") diff --git a/chromium/components/url_formatter/url_formatter_unittest.cc b/chromium/components/url_formatter/url_formatter_unittest.cc index e865c7204e3..dc46a41ed63 100644 --- a/chromium/components/url_formatter/url_formatter_unittest.cc +++ b/chromium/components/url_formatter/url_formatter_unittest.cc @@ -406,6 +406,19 @@ const IDNTestCase idn_cases[] = { L"123.com", false}, + // wmhtb.com + {"xn--l1acpvx.com", L"\x0448\x043c\x043d\x0442\x044c.com", false}, + // щмнть.com + {"xn--l1acpzs.com", L"\x0449\x043c\x043d\x0442\x044c.com", false}, + // шмнтв.com + {"xn--b1atdu1a.com", L"\x0448\x043c\x043d\x0442\x0432.com", false}, + // ഠട345.com + {"xn--345-jtke.com", L"\x0d20\x0d1f" L"345.com", false}, + + // At one point the skeleton of 'w' was 'vv', ensure that + // it's treated as 'w'. + {"xn--wder-qqa.com", L"w\x00f3" L"der.com", false}, + // Mixed digits: the first two will also fail mixed script test // Latin + ASCII digit + Deva digit {"xn--asc1deva-j0q.co.in", L"asc1deva\x0967.co.in", false}, |