Diffstat (limited to 'chromium/docs/website/scripts/export.py')
-rwxr-xr-x  chromium/docs/website/scripts/export.py  |  457 ----------------
1 file changed, 0 insertions(+), 457 deletions(-)
diff --git a/chromium/docs/website/scripts/export.py b/chromium/docs/website/scripts/export.py
deleted file mode 100755
index 4e53aea81bd..00000000000
--- a/chromium/docs/website/scripts/export.py
+++ /dev/null
@@ -1,457 +0,0 @@
-#!/usr/bin/env vpython3
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Export www.chromium.org to local files.
-
-This script uses the Google GData and Google Sites APIs to extract the
-content from http://www.chromium.org/ and write it into local files
-that can be used to serve the same content.
-
-The APIs are documented at
-
-https://developers.google.com/sites/docs/1.0/developers_guide_protocol
-https://developers.google.com/gdata/docs/json
-
-Because www.chromium.org is a public site, this script requires no
-authentication to work.
-
-The export process converts the original content into sane, modern HTML
-as far as possible without significantly changing the appearance of any
-page, apart from a few minor exceptions.
-"""
-
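-# Example invocation (illustrative only; the flags are defined in main()
-# below and this particular path is hypothetical):
-#
-#   vpython3 scripts/export.py -j 8 /developers/design-documents
-#
-# Paths may be given with or without a leading slash; main() adds the
-# slash when it is missing.
-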
-import argparse
-import collections
-import io
-import json
-import os
-import pdb
-import sys
-import time
-import traceback
-import xml.etree.ElementTree as ET
-
-from urllib.parse import urlparse
-from urllib.request import urlopen
-from urllib.error import HTTPError, URLError
-
-import yaml
-
-import common
-import html2markdown
-
-
-def main():
- parser = argparse.ArgumentParser()
- parser.add_argument('--force', action='store_true',
- help='ignore updated timestamps in local cache')
- parser.add_argument('-j', '--jobs', type=int, default=common.cpu_count())
- parser.add_argument('-t', '--test', action='store_true')
- parser.add_argument('-r', '--raw', action='store_true')
- parser.add_argument('-v', '--verbose', action='count')
- parser.add_argument('--max_results', type=int, default=5000)
- parser.add_argument('--start-index', type=int, default=1)
- parser.add_argument('--paths-to-skip')
- parser.add_argument('--path-list')
- parser.add_argument('path', nargs='*')
- args = parser.parse_args()
-
- entries = _entries(args)
-
- if args.path:
- paths_to_export = ['%s%s' % ('/' if not path.startswith('/') else '',
- path)
- for path in args.path]
- elif args.path_list:
- paths_to_export = common.read_paths(args.path_list)
- else:
- paths_to_export = []
-
- if args.paths_to_skip:
- paths_to_skip = set(common.read_paths(args.paths_to_skip))
- else:
- paths_to_skip = set(
- common.read_paths(os.path.join(common.REPO_DIR,
- 'scripts', 'paths_to_skip.txt')))
-
- max_input_mtime = max(os.stat(__file__).st_mtime,
- os.stat(common.__file__).st_mtime,
- os.stat(html2markdown.__file__).st_mtime)
-
- updated = 0
- paths = []
-
- if args.test:
- entry = _find_entry_by_path(paths_to_export[0], entries)
- if entry:
- metadata = _metadata(entry, entries)
- path = _path(entry, entries)
- _ = _handle_entry(path,
- (entry, metadata, max_input_mtime, args.force,
- args.raw))
- content = common.read_text_file('%s%s/index.md' %
- (common.SITE_DIR, path))
- print(content)
- return 0
- else:
- print('%s not found' % paths_to_export[0])
- return 1
-
- q = common.JobQueue(_handle_entry, args.jobs)
-
- paths_to_export = set(paths_to_export)
- exported_pages = set()
- for i, entry in enumerate(list(entries.values())[:args.max_results]):
- if entry['kind'] in ('webpage', 'listpage',
- 'announcementspage', 'filecabinet'):
- metadata = _metadata(entry, entries)
- path = _path(entry, entries)
-
- if path in paths_to_skip:
- continue
- exported_pages.add(path)
- elif entry['kind'] == 'attachment':
- metadata = {}
- path = entry['url'].replace(
- 'https://sites.google.com/a/chromium.org/dev/', '/').rstrip('/')
- if path in paths_to_skip:
- continue
- else:
- continue
- if not paths_to_export or (path in paths_to_export):
- q.request(path, (entry, metadata, max_input_mtime, args.force,
- False))
-
- ret = 0
- for path, res, did_update in q.results():
- if res:
- ret = 1
- if did_update:
- updated += 1
-
- print('updated %d entries' % updated)
- return ret
-
-
-def _find_entry_by_path(path, entries):
- for entry in entries.values():
- if entry['kind'] not in ('webpage', 'listpage',
- 'announcementspage', 'filecabinet'):
- continue
- entry_path = _path(entry, entries)
- if entry_path == path:
- return entry
- return None
-
-
-def _handle_entry(task, obj):
- entry, metadata, max_input_mtime, force, raw = obj
- err = ''
- did_update = False
-
- if not task.startswith('/'):
- return 'malformed task', False
-
- yaml.SafeDumper.org_represent_str = yaml.SafeDumper.represent_str
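- # Keep a reference to the default string representer so repr_str() below
- # can fall back to it for strings that contain no newlines.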
-
- if task in (
- '/developers/jinja',
- '/developers/polymer-1-0',
- '/devtools/breakpoints-tutorial/index.html',
- '/devtools/breakpoints-tutorial/script.js',
- ):
- # TODO: Eleventy chokes on these files.
- return '', False
-
- def repr_str(dumper, data):
- if '\n' in data:
- return dumper.represent_scalar(u'tag:yaml.org,2002:str', data,
- style='|')
- return dumper.org_represent_str(data)
-
- yaml.add_representer(str, repr_str, Dumper=yaml.SafeDumper)
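- # Illustration (assumed output, not from the original script): with the
- # representer above, a metadata value containing newlines is dumped as a
- # YAML block scalar ("key: |" followed by indented lines) rather than as
- # a single escaped string.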
-
-
- mtime = _to_ts(entry['updated'])
- target_mtime = max(mtime, max_input_mtime)
- if entry['kind'] in ('webpage',
- 'listpage',
- 'announcementspage',
- 'filecabinet'):
- path = '%s%s/%s' % (common.SITE_DIR, task, 'index.md')
- if _needs_update(path, target_mtime, force):
- if raw:
- content = entry['content']
- else:
- content_sio = io.StringIO(entry['content'])
- md_sio = io.StringIO()
- md_sio.write('---\n')
- md_sio.write(yaml.safe_dump(metadata))
- md_sio.write('---\n\n')
- url_converter = _URLConverter()
- html2markdown.Convert(content_sio, md_sio, url_converter)
- if entry['kind'] == 'listpage':
- md_sio.write('\n\n')
- _write_listitems(md_sio, entry)
- content = md_sio.getvalue()
- content = content.replace(
- 'chromium.googlesource.com/chromium/src/+/master/',
- 'chromium.googlesource.com/chromium/src/+/HEAD/')
- content = content.replace(' \b\b\b\b', '')
-
- did_update = common.write_if_changed(path, content, mode='w')
- else:
- did_update = False
- elif entry['kind'] == 'listitem':
- # Handled as part of the corresponding 'listpage' entry.
- pass
- elif entry['kind'] == 'announcement':
- # TODO: implement me.
- pass
- elif entry['kind'] == 'attachment':
- path = '%s%s' % (common.SITE_DIR, task)
- path = path.replace(':', '_')
- path = path.replace('%20', ' ')
- path = path.replace('%2B', '+')
- if task in (
- '/developers/design-documents/network-stack/cookiemonster/CM-method-calls-new.png',
- '/developers/design-documents/cookie-split-loading/objects.png',
- ):
- # These are expected 404's that we ignore.
- did_update = False
- elif _needs_update(path, mtime, force):
- try:
- fp = urlopen(entry['url'])
- content = fp.read()
- did_update = common.write_if_changed(path, content)
- except (HTTPError, URLError, TimeoutError) as e:
- err = 'Error: %s' % e
-
- elif entry['kind'] == 'comment':
- # ignore comments in the migration
- pass
- elif entry['kind'] == 'tag':
- err = 'tag kind not implemented'
- else:
- err = 'unknown kind %s' % entry['kind']
-
- return err, did_update
-
-
-def _write_listitems(content, entry):
- if not entry['listitems']:
- return
-
- headers = entry['listitems'][0].keys()
- rows = sorted(entry['listitems'],
- key=lambda row: row.get('Release') or '')
-
- content.write('<table>\n')
- content.write(' <tr>\n')
- for header in headers:
- content.write(' <th>%s</th>\n' % header)
- content.write(' </tr>\n')
- for row in rows:
- content.write(' <tr>\n')
- for value in row.values():
- if value and value.startswith('<a xmlns='):
- value = value.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')
- content.write(' <td>%s</td>\n' % (value or ''))
- content.write(' </tr>\n')
- content.write('</table>\n')
-
-
-class _URLConverter:
- def Translate(self, href):
- if not href:
- return ''
-
- for path in common.alternates:
- if href.startswith(path):
- href = href.replace(path, '')
-
- if href.startswith('/_/rsrc'):
- href = '/' + '/'.join(href.split('/')[4:])
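- # For example (hypothetical URL): '/_/rsrc/1234567890/developers/foo.png'
- # becomes '/developers/foo.png' once the '/_/rsrc/<number>' prefix is
- # dropped above.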
-
- url = urlparse(href)
- if '?' in href and url.netloc == '':
- href = href[0:href.index('?')]
- if 'Screenshot' in href:
- head, tail = href.split('Screenshot')
- tail = tail.replace(':', '%3A')
- href = head + 'Screenshot' + tail
- return href
-
-
-def _path(entry, entries):
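- # Builds the full site path by walking parent_id links, e.g. (hypothetical
- # page names) a page 'gn' whose parent is 'developers' yields
- # '/developers/gn'.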
- path = entry['page_name']
- parent_id = entry.get('parent_id')
- while parent_id:
- path = entries[parent_id]['page_name'] + '/' + path
- parent_id = entries[parent_id].get('parent_id')
-
- path = ('/' + path).rstrip('/') or '/'
- return path
-
-
-def _metadata(entry, entries):
- metadata = {}
- metadata['page_name'] = entry['page_name']
- metadata['title'] = entry['title']
-
- crumbs = []
- parent_id = entry.get('parent_id')
- while parent_id:
- parent = entries[parent_id]
- path = _path(parent, entries)
- title = parent['title']
- crumbs = [[path, title]] + crumbs
- parent_id = parent.get('parent_id')
-
- metadata['breadcrumbs'] = crumbs
-
- if metadata['page_name'] in (
- 'chromium-projects',
- 'chromium',
- ):
- metadata['use_title_as_h1'] = False
-
- return metadata
-
-
-def _needs_update(path, mtime, force):
- if force:
- return True
- if os.path.exists(path):
- st = os.stat(path)
- return mtime > st.st_mtime
- return True
-
-
-def _entries(args):
- entries = {}
- parents = {}
-
- # Looks like Sites probably caps results at 500 entries per request,
- # even if we request more than that.
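- # The feed is therefore walked page by page: the first request uses
- # args.start_index, and each response's 'next' link (see _find_link())
- # provides the URL of the following page.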
- rownum = 0
- url = ('https://sites.google.com/feeds/content/chromium.org/dev'
- '?start-index=%d&max-results=%d&alt=json' %
- (args.start_index, 500 - rownum))
- doc, next_url = _fetch(url, args.force)
-
- for rownum, entry in enumerate(doc['feed']['entry'], start=1):
- row = _to_row(entry, rownum)
- entries[row['id']] = row
- if row.get('parent_id'):
- parents.setdefault(row['parent_id'], set()).add(row['id'])
- if args.verbose:
- print(' ... [%d]' % rownum)
- while next_url:
- doc, next_url = _fetch(next_url, args.force)
- for rownum, entry in enumerate(doc['feed']['entry'], start=rownum):
- row = _to_row(entry, rownum)
- entries[row['id']] = row
- if row.get('parent_id'):
- parents.setdefault(row['parent_id'], set()).add(row['id'])
- if args.verbose:
- print(' ... [%d]' % rownum)
-
- for entry_id, entry in entries.items():
- if entry['kind'] == 'listpage':
- entry['listitems'] = [entries[child_id]['fields'] for child_id
- in parents.get(entry_id, set())
- if entries[child_id]['kind'] == 'listitem']
-
- return entries
-
-
-def _fetch(url, force):
- path = url.replace('https://sites.google.com/feeds/', 'scripts/feeds/')
- if _needs_update(path, 0, force):
- fp = urlopen(url)
- content = fp.read()
- doc = json.loads(content)
- updated = _to_ts(doc['feed']['updated']['$t'])
- common.write_if_changed(path, content)
- else:
- with open(path) as fp:
- doc = json.load(fp)
- next_url = _find_link(doc['feed'], 'next')
- return doc, next_url
-
-
-def _find_link(doc, rel):
- for ent in doc['link']:
- if ent['rel'] == rel:
- return ent['href']
- return None
-
-
-def _to_row(entry, rownum):
- row = {
- 'rownum': rownum,
- 'content': entry.get('content', {}).get('$t'),
- 'id': _to_id(entry['id']['$t']),
- 'kind': entry['category'][0]['label'],
- 'published': entry['published']['$t'],
- 'updated': entry['updated']['$t'],
- }
-
- row['page_name'] = entry.get('sites$pageName', {}).get('$t')
- row['title'] = entry.get('title', {}).get('$t')
- row['alt_url'] = _find_link(entry, 'alternate')
-
- if row['kind'] == 'attachment':
- row['url'] = _find_link(entry, 'alternate')
- else:
- row['url'] = _find_link(entry, 'self')
-
- if row['kind'] == 'listitem':
- path = row['url'].replace('https://sites.google.com',
- os.path.join(common.REPO_DIR, 'scripts'))
- if os.path.exists(path):
- xml_content = common.read_text_file(path)
- else:
- print('fetching %s' % row['url'])
- with urlopen(row['url']) as fp:
- xml_content = fp.read()
- common.write_if_changed(path, xml_content)
-
- root = ET.fromstring(xml_content)
- fields = root.findall('{http://schemas.google.com/spreadsheets/2006}field')
- row['fields'] = collections.OrderedDict((el.attrib['name'], el.text) for el in fields)
-
- parent_url = _find_link(entry,
- 'http://schemas.google.com/sites/2008#parent')
- if parent_url:
- row['parent_id'] = _to_id(parent_url)
- return row
-
-
-def _to_id(url):
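- # The id is the last path segment of the feed URL, e.g. (made-up value)
- # '.../feeds/content/chromium.org/dev/123456' -> '123456'.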
- return url[url.rfind('/') + 1:]
-
-
-def _to_ts(iso_time):
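- # Parses the feed's ISO 8601 timestamps (e.g. '2021-03-01T12:00:00.000Z')
- # into seconds since the epoch; note that time.mktime() interprets the
- # parsed struct_time as local time.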
- return time.mktime(time.strptime(iso_time, '%Y-%m-%dT%H:%M:%S.%fZ'))
-
-if __name__ == '__main__':
- try:
- main()
- except Exception:
- extype, value, tb = sys.exc_info()
- traceback.print_exc()
- pdb.post_mortem(tb)