| author | Giampaolo Rodola <g.rodola@gmail.com> | 2017-05-12 17:35:43 +0200 |
|---|---|---|
| committer | Giampaolo Rodola <g.rodola@gmail.com> | 2017-05-12 17:35:43 +0200 |
| commit | e80a005de6de339e1abad140c0ed2cfa0ef7dbaf (patch) | |
| tree | 6d1ae61c6706912f9263b2838b0ac0c9b46f15d4 | /scripts/internal |
| parent | 4fdfd4a45bf9d2187efed4c9453607fe3ed5a6e0 (diff) | |
| download | psutil-e80a005de6de339e1abad140c0ed2cfa0ef7dbaf.tar.gz | |
refactor broken links script
Diffstat (limited to 'scripts/internal')
| -rwxr-xr-x | scripts/internal/check_broken_links.py | 46 |
1 file changed, 26 insertions, 20 deletions
```diff
diff --git a/scripts/internal/check_broken_links.py b/scripts/internal/check_broken_links.py
index a7c42e8d..2d2d9d30 100755
--- a/scripts/internal/check_broken_links.py
+++ b/scripts/internal/check_broken_links.py
@@ -52,8 +52,9 @@ import requests
 
 
 HERE = os.path.abspath(os.path.dirname(__file__))
-REGEX = r'(?:http|ftp|https)?://' \
-        r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+REGEX = re.compile(
+    r'(?:http|ftp|https)?://'
+    r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
 REQUEST_TIMEOUT = 10
 # There are some status codes sent by websites on HEAD request.
 # Like 503 by Microsoft, and 401 by Apple
@@ -76,39 +77,44 @@ def memoize(fun):
     return wrapper
 
 
-def get_urls_rst(filename, _regex=re.compile(REGEX)):
+def sanitize_url(url):
+    return \
+        url.strip('(').strip(')').strip('[').strip(']').strip('<').strip('>')
+
+
+def find_urls(s):
+    matches = REGEX.findall(s)
+    if matches:
+        return list(set([sanitize_url(x) for x in matches]))
+
+
+def get_urls_rst(filename):
     with open(filename) as f:
         text = f.read()
-    urls = _regex.findall(text)
-    # remove duplicates, list for sets are not iterable
-    urls = list(set(urls))
+    urls = find_urls(text)
     # HISTORY file has a lot of dead links.
-    if filename == 'HISTORY.rst':
+    if filename == 'HISTORY.rst' and urls:
         urls = [
             x for x in urls if
-            not x.startswith('https://github.com/giampaolo/psutil/issues/')]
-    # correct urls which are between < and/or >
-    for i, url in enumerate(urls):
-        urls[i] = re.sub("[\*<>\(\)\)]", '', url)
+            not x.startswith('https://github.com/giampaolo/psutil/issues')]
     return urls
 
 
-def get_urls_py(filename, _regex=re.compile(REGEX)):
+def get_urls_py(filename):
     with open(filename) as f:
         lines = f.readlines()
-    urls = set()
+    ret = set()
    for i, line in enumerate(lines):
-        line = line.strip()
-        match = _regex.findall(line)
-        if match:
-            url = match[0]
+        urls = find_urls(line)
+        if urls:
+            assert len(urls) == 1, urls
+            url = urls[0]
             if line.startswith('# '):
                 nextline = lines[i + 1].strip()
                 if re.match('^# .+', nextline):
                     url += nextline[1:].strip()
-            url = re.sub("[\*<>\(\)\)]", '', url)
-            urls.add(url)
-    return urls
+            ret.add(url)
+    return list(ret)
 
 
 def get_urls(filename):
```
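For reference, below is a self-contained sketch of the two helpers this commit introduces, mirroring the added lines in the diff above. The `__main__` block and its sample string are illustrative additions, not part of the psutil sources:

```python
import re

# Compiled once at module level, as in the new REGEX above.
REGEX = re.compile(
    r'(?:http|ftp|https)?://'
    r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def sanitize_url(url):
    # Drop wrapping punctuation that RST and Python sources put around links.
    return \
        url.strip('(').strip(')').strip('[').strip(']').strip('<').strip('>')


def find_urls(s):
    # Return de-duplicated, sanitized URLs, or None when there is no match.
    matches = REGEX.findall(s)
    if matches:
        return list(set([sanitize_url(x) for x in matches]))


if __name__ == '__main__':
    # Illustrative input only, not taken from the psutil sources.
    text = "see <https://example.com/docs> and (https://example.com/docs)"
    print(find_urls(text))  # -> ['https://example.com/docs']
```

Hoisting the compiled pattern to module level, instead of duplicating the `_regex=re.compile(REGEX)` default argument in both `get_urls_rst()` and `get_urls_py()`, gives both functions a single shared pattern, and `sanitize_url()` replaces the old `re.sub("[\*<>\(\)\)]", '', url)` cleanup pass.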
