chromium/components/url_formatter/top_domains/make_alexa_top_list.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

#!/usr/bin/env python
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Generates alexa_domains.list from
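   src/tools/perf/page_sets/alexa1-10000-urls.json.  By default, all the domains
   extracted from the input will be recorded in alexa_domains.list in the script
   directory except for duplicates and domains in ccTLDs known to disallow
   non-ASCII Latin letters (cn,jp,kr,tw).  Host names with more than three
   labels are reduced to their last three labels.  A few popular domains (if
   missing from the input) and a few made-up test domains are appended at the
   end of the output.
   Optional command line arguments can be used to limit the output to the top N
   domains (N counts matching input entries before duplicates and excluded
   ccTLDs are dropped) and to specify an output file, which is resolved
   relative to the script directory:
     make_alexa_top_list.py [N [output_file]]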
"""

import re
import sys
import os

# Directory containing this script; the default input and output locations are
# derived from it.
script_dir = os.path.dirname(os.path.realpath(__file__))
alexa10k_path = os.path.join(script_dir, "..", "..", "..", "tools", "perf",
                             "page_sets", "alexa1-10000-urls.json")

# Optional argv[1]: maximum number of matching input entries to process.
max_num_domains = 10000 if len(sys.argv) < 2 else int(sys.argv[1])
# Optional argv[2]: output file name, joined onto the script directory.
alexa_out = os.path.join(script_dir, "alexa_domains.list") \
    if len(sys.argv) < 3 else os.path.join(script_dir, sys.argv[2])

# Captures the host of a quoted http(s) URL, dropping a leading "www." label.
# The dot after "www" is escaped so that hosts such as "www8.example.com" are
# not mistaken for a "www." prefix (an unescaped dot would consume "www8" and
# leave the malformed host ".example.com").
domain_extractor = re.compile(r'^.*"https?://(?:www\.)?([^/]*)/.*$')
# Matches hosts whose last label is one of the excluded ccTLDs.  The leading
# dot is escaped so that only a real ".cn"/".kr"/".jp"/".tw" suffix matches.
excluded_tld = re.compile(r'\.(cn|kr|jp|tw)$')
# Domains already written to the output; used to drop duplicates.
domains = set()
# Number of input lines that matched |domain_extractor| so far.  This counts
# duplicates and excluded hosts too, i.e. the limit applies to input entries.
n_domains = 0

with open(alexa_out, 'w') as outfile, open(alexa10k_path, 'r') as infile:
  for line in infile:
    # Once the limit is reached no further entry can be recorded, so stop
    # reading instead of scanning the rest of the input.
    if n_domains >= max_num_domains:
      break
    if line.startswith('#'):
      continue
    match = domain_extractor.match(line)
    if not match:
      continue
    n_domains += 1
    host = match.group(1)
    labels = host.split('.')
    # Keep at most the last three labels of the host name.
    domain = '.'.join(labels[-3:]) if len(labels) > 3 else host
    # The ccTLD check is done on the full host, the duplicate check on the
    # (possibly shortened) domain.
    if excluded_tld.search(host) or domain in domains:
      continue
    domains.add(domain)
    outfile.write(domain + "\n")

  # Add some popular domains if they're missing.
  # Note that this list bypasses the ccTLD filter above (e.g. 360.cn).
  # TODO(jshin): Find a way to update the list. (crbug.com/722022)
  for domain in ["gmail.com", "hotmail.com", "360.cn", "ntd.tv", "onclkds.com",
                 "uber.com", "lyft.com", "ok.ru"]:
    if domain not in domains:
      domains.add(domain)
      outfile.write(domain + "\n")

  # Add a few made-up domains for testing.
  outfile.write("# for testing\ndigklmo68.com\ndigklmo68.co.uk\n")
  outfile.write("islkpx123.com\n")