summaryrefslogtreecommitdiff
path: root/tools/yelp-check.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/yelp-check.py')
-rw-r--r--tools/yelp-check.py1245
1 files changed, 0 insertions, 1245 deletions
diff --git a/tools/yelp-check.py b/tools/yelp-check.py
deleted file mode 100644
index a3eb486..0000000
--- a/tools/yelp-check.py
+++ /dev/null
@@ -1,1245 +0,0 @@
-#!/bin/python3
-#
-# yelp-check
-# Copyright (C) 2011-2020 Shaun McCance <shaunm@gnome.org>
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-import configparser
-import lxml.etree
-import os
-import sys
-import urllib.request
-import shutil
-import subprocess
-import tempfile
-import textwrap
-
-# FIXME: don't hardcode this
-DATADIR = '/usr/share/yelp-tools'
-
-XML_ID = '{http://www.w3.org/XML/1998/namespace}id'
-NAMESPACES = {
- 'mal': 'http://projectmallard.org/1.0/',
- 'cache': 'http://projectmallard.org/cache/1.0/',
- 'db': 'http://docbook.org/ns/docbook',
- 'e': 'http://projectmallard.org/experimental/',
- 'ui': 'http://projectmallard.org/ui/1.0/',
- 'uix': 'http://projectmallard.org/experimental/ui/',
- 'xlink': 'http://www.w3.org/1999/xlink'
- }
-
-def _stringify(el):
- ret = el.text or ''
- for ch in el:
- ret = ret + _stringify(ch)
- if el.tail is not None:
- ret = ret + el.tail
- return ret
-
-def get_format(node):
- ns = lxml.etree.QName(node).namespace
- if ns in (NAMESPACES['mal'], NAMESPACES['cache']):
- return 'mallard'
- elif ns == NAMESPACES['db']:
- return 'docbook5'
- elif ns is None:
- # For now, just assume no ns means docbook4
- return 'docbook4'
- else:
- return None
-
-class InputFile:
- def __init__(self, filepath, filename, sitedir=None):
- self.filepath = filepath
- self.filename = filename
- self.absfile = os.path.join(filepath, filename)
- self.absdir = os.path.dirname(self.absfile)
- self.sitedir = sitedir or ''
- self.sitefilename = self.sitedir + self.filename
-
-
-class Checker:
- name = None
- desc = None
- blurb = None
- formats = []
- arguments = []
- postblurb = None
- xinclude = True
- config = None
-
- def __init__(self, yelpcheck):
- self.yelpcheck = yelpcheck
- self.options = {}
- self.fileargs = []
- self.tmpdir = None
-
- def __del__(self):
- if self.tmpdir is not None:
- shutil.rmtree(self.tmpdir)
- self.tmpdir = None
-
- def parse_args(self, args):
- while len(args) > 0:
- argdef = None
- if args[0].startswith('--'):
- for arg_ in self.arguments:
- if args[0] == '--' + arg_[0]:
- argdef = arg_
- break
- if argdef is None:
- self.print_help()
- return 1
- elif args[0].startswith('-'):
- for arg_ in self.arguments:
- if args[0] == arg_[1]:
- argdef = arg_
- break
- if argdef is None:
- self.print_help()
- return 1
- if argdef is not None:
- takesarg = (argdef[2] is not None)
- if takesarg:
- if len(args) < 2:
- self.print_help()
- return 1
- self.options.setdefault(argdef[0], [])
- self.options[argdef[0]].append(args[1])
- args = args[2:]
- else:
- self.options[argdef[0]] = True
- args = args[1:]
- else:
- self.fileargs.append(args[0])
- args = args[1:]
- cfgfile = None
- if len(self.fileargs) > 0:
- cfgfile = os.path.join(os.path.dirname(self.fileargs[0]), '.yelp-tools.cfg')
- if not os.path.exists(cfgfile):
- cfgfile = None
- if cfgfile is None:
- cfgfile = os.path.join(os.getcwd(), '.yelp-tools.cfg')
- if os.path.exists(cfgfile):
- self.config = configparser.ConfigParser()
- try:
- self.config.read(cfgfile)
- except Exception as e:
- print(e, file=sys.stderr)
- sys.exit(1)
- return 0
-
- def get_option_bool(self, arg):
- if arg in self.options:
- return self.options[arg] == True
- if self.config is not None:
- val = self.config.get('check:' + self.name, arg, fallback=None)
- if val is not None:
- return (val == 'true')
- val = self.config.get('check', arg, fallback=None)
- if val is not None:
- return (val == 'true')
- val = self.config.get('default', arg, fallback=None)
- if val is not None:
- return (val == 'true')
- return False
-
- def get_option_str(self, arg):
- if arg in self.options:
- if isinstance(self.options[arg], list):
- return self.options[arg][-1]
- if self.config is not None:
- val = self.config.get('check:' + self.name, arg, fallback=None)
- if val is not None:
- return val
- val = self.config.get('check', arg, fallback=None)
- if val is not None:
- return val
- val = self.config.get('default', arg, fallback=None)
- if val is not None:
- return val
- return None
-
- def get_option_list(self, arg):
- if arg in self.options:
- if isinstance(self.options[arg], list):
- ret = []
- for opt in self.options[arg]:
- ret.extend(opt.replace(',', ' ').split())
- return ret
- if self.config is not None:
- val = self.config.get('check:' + self.name, arg, fallback=None)
- if val is not None:
- return val.replace(',', ' ').split()
- val = self.config.get('check', arg, fallback=None)
- if val is not None:
- return val.replace(',', ' ').split()
- val = self.config.get('default', arg, fallback=None)
- if val is not None:
- return val.replace(',', ' ').split()
- return None
-
- def iter_files(self, sitedir=None):
- issite = self.get_option_bool('site')
- if len(self.fileargs) == 0:
- self.fileargs.append('.')
- for filearg in self.fileargs:
- if os.path.isdir(filearg):
- if issite:
- for infile in self.iter_site(filearg, '/'):
- yield infile
- else:
- for fname in os.listdir(filearg):
- if fname.endswith('.page'):
- yield InputFile(filearg, fname)
- else:
- if issite:
- # FIXME: should do some normalization here, I guess.
- # It's hard to get this perfect without a defined start dir
- yield InputFile(os.getcwd(), filearg, '/' + os.path.dirname(filearg))
- else:
- yield InputFile(os.getcwd(), filearg)
-
- def iter_site(self, filepath, sitedir):
- for fname in os.listdir(filepath):
- newpath = os.path.join(filepath, fname)
- if os.path.isdir(newpath):
- # FIXME https://github.com/projectmallard/pintail/issues/36
- if fname == '__pintail__':
- continue
- for infile in self.iter_site(newpath, sitedir + fname + '/'):
- yield infile
- elif fname.endswith('.page'):
- yield InputFile(filepath, fname, sitedir)
-
- def get_xml(self, xmlfile):
- # FIXME: we can cache these if we add a feature to run multiple
- # checkers at once
- tree = lxml.etree.parse(xmlfile.absfile)
- if self.xinclude:
- lxml.etree.XInclude()(tree.getroot())
- return tree
-
- def create_tmpdir(self):
- if self.tmpdir is None:
- self.tmpdir = tempfile.mkdtemp()
-
- def print_help(self):
- print('Usage: yelp-check ' + self.name + ' [OPTIONS] [FILES]')
- print('Formats: ' + ' '.join(self.formats) + '\n')
- #FIXME: prettify names of formats
- if self.blurb is not None:
- print(self.blurb + '\n')
- print('Options:')
- maxarglen = 2
- args = []
- for arg in self.arguments:
- argkey = '--' + arg[0]
- if arg[1] is not None:
- argkey = arg[1] + ', ' + argkey
- if arg[2] is not None:
- argkey = argkey + ' ' + arg[2]
- args.append((argkey, arg[3]))
- for arg in args:
- maxarglen = max(maxarglen, len(arg[0]) + 1)
- for arg in args:
- print(' ' + (arg[0]).ljust(maxarglen) + ' ' + arg[1])
- if self.postblurb is not None:
- print(self.postblurb)
-
- def main(self, args):
- pass
-
-
-class HrefsChecker (Checker):
- name = 'hrefs'
- desc = 'Find broken external links in a document'
- blurb = ('Find broken href links in FILES in a Mallard document, or\n' +
- 'broken ulink or XLink links in FILES in a DocBook document.')
- formats = ['docbook4', 'docbook5', 'mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
- ('allow', None, 'URL', 'Allow URL or list of URLs without checking')
- ]
- postblurb = 'URL may be a comma- and/or space-separated list, or specified\nmultiple times.'
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- # safelisting URLs that we use as identifiers
- hrefs = {
- 'http://creativecommons.org/licenses/by-sa/3.0/': True,
- 'https://creativecommons.org/licenses/by-sa/3.0/': True,
- 'http://creativecommons.org/licenses/by-sa/3.0/us/': True,
- 'https://creativecommons.org/licenses/by-sa/3.0/us/': True
- }
- allow = self.get_option_list('allow')
- if allow is not None:
- for url in allow:
- hrefs[url] = True
- retcode = 0
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- for el in xml.xpath('//*[@href | @xlink:href | self::ulink/@url]',
- namespaces=NAMESPACES):
- href = el.get('href', None)
- if href is None:
- href = el.get('{www.w3.org/1999/xlink}href')
- if href is None:
- href = el.get('url')
- if href is None:
- continue
- if href.startswith('mailto:'):
- continue
- if href not in hrefs:
- try:
- req = urllib.request.urlopen(href)
- hrefs[href] = (req.status == 200)
- except Exception as e:
- hrefs[href] = False
- if not hrefs[href]:
- retcode = 1
- print(infile.sitefilename + ': ' + href)
-
- return retcode
-
-
-class IdsChecker (Checker):
- name = 'ids'
- desc = 'Find Mallard page IDs that do not match file names'
- blurb = ('Find pages in a Mallard document whose page ID does not match\n' +
- 'the base file name of the page file.')
- formats = ['mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site')
- ]
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- retcode = 0
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- isok = False
- pageid = None
- if infile.filename.endswith('.page'):
- try:
- pageid = xml.getroot().get('id')
- isok = (pageid == os.path.basename(infile.filename)[:-5])
- except:
- isok = False
- if not isok:
- retcode = 1
- print(infile.sitefilename + ': ' + (pageid or ''))
-
- return retcode
-
-
-class LinksChecker (Checker):
- name = 'links'
- desc = 'Find broken xref or linkend links in a document'
- blurb = ('Find broken xref links in FILES in a Mallard document,\n' +
- 'or broken linkend links in FILES in a DocBook document.')
- formats = ['docbook4', 'docbook5', 'mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
- ('cache', '-c', 'CACHE', 'Use the existing Mallard cache CACHE'),
- ('ignore', '-i', None, 'Ignore xrefs where href is present')
- ]
-
- def __init__(self, yelpcheck):
- super().__init__(yelpcheck)
- self.idstoxrefs = {}
- self.idstolinkends = {}
-
- def _accumulate_mal(self, node, pageid, sectid, xrefs, sitedir=None):
- thisid = node.get('id')
- if thisid is not None:
- if node.tag == '{' + NAMESPACES['mal'] + '}page':
- pageid = thisid
- else:
- sectid = thisid
- curid = pageid
- ignore = self.get_option_bool('ignore')
- if curid is not None:
- if sectid is not None:
- # id attrs in cache files are already fully formed
- if '#' in sectid:
- curid = sectid
- else:
- curid = curid + '#' + sectid
- if sitedir is not None:
- # id attrs in cache files already have sitedir prefixed
- if curid[0] != '/':
- curid = sitedir + curid
- self.idstoxrefs.setdefault(curid, [])
- if xrefs:
- xref = node.get('xref')
- if xref is not None:
- if not (ignore and (node.get('href') is not None)):
- self.idstoxrefs[curid].append(xref)
- for child in node:
- self._accumulate_mal(child, pageid, sectid, xrefs, sitedir)
-
- def _accumulate_db(self, node, nodeid):
- thisid = node.get('id')
- if thisid is None:
- thisid = node.get(XML_ID)
- if thisid is not None:
- nodeid = thisid
- self.idstolinkends.setdefault(nodeid, [])
- if nodeid is not None:
- linkend = node.get('linkend')
- if linkend is not None:
- self.idstolinkends[nodeid].append(linkend)
- for child in node:
- self._accumulate_db(child, nodeid)
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- retcode = 0
-
- cachefile = self.get_option_str('cache')
- if cachefile is not None:
- xml = self.get_xml(InputFile(os.getcwd(), cachefile))
- self._accumulate_mal(xml.getroot(), None, None, False)
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- format = get_format(xml.getroot())
- if format == 'mallard':
- self._accumulate_mal(xml.getroot(), None, None, True, infile.sitedir)
- elif format in ('docbook4', 'docbook5'):
- # For DocBook, we assume each filearg is its own document, so
- # we reset the dict each time and only check within the file.
- # Note that XInclude and SYSTEM includes DO happen first.
- self.idstolinkends = {}
- self._accumulate_db(xml.getroot(), None)
- for curid in self.idstolinkends:
- for linkend in self.idstolinkends[curid]:
- if linkend not in self.idstolinkends:
- print(curid + ': ' + linkend)
- retcode = 1
-
- for curid in self.idstoxrefs:
- for xref in self.idstoxrefs[curid]:
- checkref = xref
- if checkref[0] == '#':
- checkref = curid.split('#')[0] + checkref
- if curid[0] == '/' and checkref[0] != '/':
- checkref = curid[:curid.rfind('/')+1] + checkref
- if checkref not in self.idstoxrefs:
- print(curid + ': ' + xref)
- retcode = 1
-
- return retcode
-
-
-class MediaChecker (Checker):
- name = 'media'
- desc = 'Find broken references to media files'
- blurb = ('Find broken references to media files. In Mallard, this\n' +
- 'checks media and thumb elements. In DocBook, this checks\n' +
- 'audiodata, imagedata, and videodata elements.')
- formats = ['docbook4', 'docbook5', 'mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site')
- ]
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- retcode = 0
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- format = get_format(xml.getroot())
- srcs = []
- if format == 'mallard':
- for el in xml.xpath('//mal:media[@src] | //uix:thumb | //ui:thumb | //e:mouseover',
- namespaces=NAMESPACES):
- srcs.append(el.get('src'))
- elif format == 'docbook5':
- # FIXME: do we care about entityref?
- for el in xml.xpath('//db:audiodata | //db:imagedata | //db:videodata',
- namespaces=NAMESPACES):
- srcs.append(el.get('fileref'))
- elif format == 'docbook4':
- for el in xml.xpath('//audiodata | //imagedata | //videodata'):
- srcs.append(el.get('fileref'))
- for src in srcs:
- fsrc = os.path.join(infile.absdir, src)
- if not os.path.exists(fsrc):
- print(infile.sitefilename + ': ' + src)
- retcode = 1
-
- return retcode
-
-
-class OrphansChecker (Checker):
- name = 'orphans'
- desc = 'Find orphaned pages in a Mallard document'
- blurb = ('Locate orphaned pages among FILES in a Mallard document.\n' +
- 'Orphaned pages are any pages that cannot be reached by\n' +
- 'topic links alone from the index page.')
- formats = ['mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
- ('cache', '-c', 'CACHE', 'Use the existing Mallard cache CACHE')
- ]
-
- def __init__(self, yelpcheck):
- super().__init__(yelpcheck)
- self.guidelinks = {}
- self.sitesubdirs = set()
-
- def _collect_links(self, node, sitedir):
- pageid = node.get('id')
- if pageid[0] != '/':
- # id attrs in cache files already have sitedir prefixed
- pageid = sitedir + pageid
- else:
- sitedir = pageid[:pageid.rfind('/')+1]
- self.guidelinks.setdefault(pageid, set())
- # For the purposes of finding orphans, we'll just pretend that
- # all links to or from sections are just to or from pages.
- for el in node.xpath('//mal:info/mal:link[@type="guide"]',
- namespaces=NAMESPACES):
- xref = el.get('xref')
- if xref is None or xref == '':
- continue
- if xref[0] == '#':
- continue
- if '#' in xref:
- xref = xref[:xref.find('#')]
- if sitedir is not None and sitedir != '':
- if xref[0] != '/':
- xref = sitedir + xref
- self.guidelinks[pageid].add(xref)
- for el in node.xpath('//mal:info/mal:link[@type="topic"]',
- namespaces=NAMESPACES):
- xref = el.get('xref')
- if xref is None or xref == '':
- continue
- if xref[0] == '#':
- continue
- if '#' in xref:
- xref = xref[:xref.find('#')]
- if sitedir is not None and sitedir != '':
- if xref[0] != '/':
- xref = sitedir + xref
- self.guidelinks.setdefault(xref, set())
- self.guidelinks[xref].add(pageid)
- for el in node.xpath('//mal:links[@type="site-subdirs" or @type="site:subdirs"]',
- namespaces=NAMESPACES):
- self.sitesubdirs.add(pageid)
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- retcode = 0
-
- cachefile = self.get_option_str('cache')
- if cachefile is not None:
- xml = self.get_xml(InputFile(os.getcwd(), cachefile))
- for page in xml.getroot():
- if page.tag == '{' + NAMESPACES['mal'] + '}page':
- pageid = page.get('id')
- if pageid is None or pageid == '':
- continue
- self._collect_links(page, page.get('{http://projectmallard.org/site/1.0/}dir', ''))
-
- pageids = set()
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- pageid = xml.getroot().get('id')
- if pageid is None:
- continue
- pageids.add(infile.sitedir + pageid)
- self._collect_links(xml.getroot(), infile.sitedir)
-
- siteupdirs = {}
- for pageid in self.sitesubdirs:
- dirname = pageid[:pageid.rfind('/')+1]
- for subid in self.guidelinks:
- if subid.startswith(dirname):
- if subid.endswith('/index'):
- mid = subid[len(dirname):-6]
- if mid != '' and '/' not in mid:
- siteupdirs[subid] = pageid
-
- if self.get_option_bool('site'):
- okpages = set(['/index'])
- else:
- okpages = set(['index'])
- for pageid in sorted(pageids):
- if pageid in okpages:
- isok = True
- else:
- isok = False
- guides = [g for g in self.guidelinks[pageid]]
- if pageid in siteupdirs:
- updir = siteupdirs[pageid]
- if updir not in guides:
- guides.append(updir)
- cur = 0
- while cur < len(guides):
- if guides[cur] in okpages:
- isok = True
- break
- if guides[cur] in self.guidelinks:
- for guide in self.guidelinks[guides[cur]]:
- if guide not in guides:
- guides.append(guide)
- cur += 1
- if isok:
- okpages.add(pageid)
- else:
- print(pageid)
- retcode = 1
-
- return retcode
-
-
-class ValidateChecker (Checker):
- name = 'validate'
- desc = 'Validate files against a DTD or RNG'
- blurb = ('Validate FILES against the appropriate DTD or RNG.\n' +
- 'For Mallard pages, perform automatic RNG merging\n' +
- 'based on the version attribute.')
- formats = ['docbook4', 'docbook5', 'mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
- ('strict', None, None, 'Disallow unknown namespaces'),
- ('allow', None, 'NS', 'Explicitly allow namespace NS in strict mode'),
- ('jing', None, None, 'Use jing instead of xmllint for RNG validation')
- ]
- postblurb = 'NS may be a comma- and/or space-separated list, or specified\nmultiple times.'
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- retcode = 0
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- format = get_format(xml.getroot())
- command = None
- if format == 'mallard':
- version = xml.getroot().get('version')
- if version is None or version == '':
- tag = xml.getroot().tag
- if tag == '{' + NAMESPACES['mal'] + '}stack':
- # 1.2 isn't final yet as of 2020-01-09. Stacks will
- # likely be in 1.2, so we can assume at least that.
- version = '1.2'
- elif tag == '{' + NAMESPACES['cache'] + '}cache':
- version = 'cache/1.0'
- else:
- version = '1.0'
- self.create_tmpdir()
- rng = os.path.join(self.tmpdir,
- version.replace('/', '__').replace(' ', '__'))
- if not os.path.exists(rng):
- strict = 'true()' if self.get_option_bool('strict') else 'false()'
- allow = self.get_option_list('allow')
- if allow is None:
- allow = ''
- else:
- allow = ' '.join(allow)
- subprocess.call(['xsltproc', '-o', rng,
- '--param', 'rng.strict', strict,
- '--stringparam', 'rng.strict.allow', allow,
- os.path.join(DATADIR, 'xslt', 'mal-rng.xsl'),
- infile.absfile])
- if self.get_option_bool('jing'):
- command = ['jing', '-i', rng, infile.filename]
- else:
- command = ['xmllint', '--noout', '--xinclude', '--noent',
- '--relaxng', rng, infile.filename]
- elif format == 'docbook4':
- if xml.docinfo.doctype.startswith('<!DOCTYPE'):
- command = ['xmllint', '--noout', '--xinclude', '--noent',
- '--postvalid', infile.filename]
- else:
- command = ['xmllint', '--noout', '--xinclude', '--noent',
- '--dtdvalid',
- 'http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd',
- infile.filename]
- elif format == 'docbook5':
- version = xml.getroot().get('version')
- if version is None or version == '':
- version = '5.0'
- # Canonical URIs are http, but they 301 redirect to https. jing
- # can handle https fine, but not the redirect. And jing doesn't
- # look at catalogs. So just always feed jing an https URI.
- rnghttp = 'http://docbook.org/xml/' + version + '/rng/docbook.rng'
- rnghttps = 'https://docbook.org/xml/' + version + '/rng/docbook.rng'
- if self.get_option_bool('jing'):
- command = ['jing', '-i', rnghttps, infile.filename]
- else:
- # xmllint, on the other hand, does support catalogs. It also
- # doesn't do the redirect, but it wouldn't matter if it did
- # because it doesn't do https. So if the schema is available
- # locally in the catalog, hand xmllint the http URI so it
- # can use the local copy. Otherwise, we have to get curl
- # involved to do https.
- try:
- catfile = subprocess.check_output(['xmlcatalog',
- '/etc/xml/catalog',
- rnghttp],
- stderr=subprocess.DEVNULL,
- text=True)
- for catline in catfile.split('\n'):
- if catline.startswith('file://'):
- command = ['xmllint', '--noout', '--xinclude', '--noent',
- '--relaxng', rnghttp, infile.filename]
- except:
- pass
- if command is None:
- self.create_tmpdir()
- rngfile = os.path.join(self.tmpdir, 'docbook-' + version + '.rng')
- if not os.path.exists(rngfile):
- urllib.request.urlretrieve(rnghttps, rngfile)
- command = ['xmllint', '--noout', '--xinclude', '--noent',
- '--relaxng', rngfile, infile.filename]
- if command is not None:
- try:
- subprocess.check_output(command,
- cwd=infile.filepath,
- stderr=subprocess.STDOUT,
- text=True)
- except subprocess.CalledProcessError as e:
- retcode = e.returncode
- print(e.output)
- else:
- retcode = 1
-
- return retcode
-
-
-class CommentsChecker (Checker):
- name = 'comments'
- desc = 'Print the editorial comments in a document'
- blurb = ('Print the editorial comments in the files FILES, using the\n' +
- 'comment element in Mallard and the remark element in DocBook.')
- formats = ['docbook4', 'docbook5', 'mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site')
- ]
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- format = get_format(xml.getroot())
- if format == 'mallard':
- for el in xml.xpath('//mal:comment', namespaces=NAMESPACES):
- thisid = xml.getroot().get('id')
- par = el
- while par is not None:
- if par.tag == '{' + NAMESPACES['mal'] + '}section':
- sectid = par.get('id')
- if sectid is not None:
- thisid = thisid + '#' + sectid
- break
- par = par.getparent()
- print('Page: ' + infile.sitedir + thisid)
- for ch in el.xpath('mal:cite[1]', namespaces=NAMESPACES):
- name = _stringify(ch).strip()
- href = ch.get('href')
- if href is not None and href.startswith('mailto:'):
- name = name + ' <' + href[7:] + '>'
- print('From: ' + name)
- date = ch.get('date')
- if date is not None:
- print('Date: ' + date)
- print('')
- for ch in el:
- if isinstance(ch, lxml.etree._ProcessingInstruction):
- continue
- elif ch.tag == '{' + NAMESPACES['mal'] + '}cite':
- continue
- elif ch.tag in ('{' + NAMESPACES['mal'] + '}p',
- '{' + NAMESPACES['mal'] + '}title'):
- for s in _stringify(ch).strip().split('\n'):
- print(' ' + s.strip())
- print('')
- else:
- name = lxml.etree.QName(ch).localname
- print(' <' + name + '>...</' + name + '>\n')
- elif format in ('docbook4', 'docbook5'):
- if format == 'docbook4':
- dbxpath = '//remark'
- else:
- dbxpath = '//db:remark'
- for el in xml.xpath(dbxpath, namespaces=NAMESPACES):
- thisid = infile.filename
- par = el
- while par is not None:
- sectid = par.get('id')
- if sectid is None:
- sectid = par.get(XML_ID)
- if sectid is not None:
- thisid = thisid + '#' + sectid
- break
- par = par.getparent()
- print('Page: ' + thisid)
- flag = el.get('revisionflag')
- if flag is not None:
- print('Flag: ' + flag)
- print('')
- for s in _stringify(el).strip().split('\n'):
- print(' ' + s.strip())
- print('')
-
- return 0
-
-
-class LicenseChecker (Checker):
- name = 'license'
- desc = 'Report the license of Mallard pages'
- blurb = ('Report the license of the Mallard page files FILES. Each\n' +
- 'matching page is reporting along with its license, reported\n' +
- 'based on the href attribute of the license element. Common\n' +
- 'licenses use a shortened identifier. Pages with multiple\n' +
- 'licenses have the identifiers separated by spaces. Pages\n' +
- 'with no license element report \'none\'. Licenses with no\n' +
- 'href attribute are reported as \'unknown\'')
- formats = ['mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
- ('only', None, 'LICENSES', 'Only show pages whose license is in LICENSES'),
- ('except', None, 'LICENSES', 'Exclude pages whose license is in LICENSES'),
- ('totals', None, None, 'Show total counts for each license')
- ]
- postblurb = 'LICENSES may be a comma- and/or space-separated list, or specified\nmultiple times.'
-
- def get_license(self, href):
- if href is None:
- return 'unknown'
- elif (href.startswith('http://creativecommons.org/licenses/') or
- href.startswith('https://creativecommons.org/licenses/')):
- return 'cc-' + '-'.join([x for x in href.split('/') if x][3:])
- elif (href.startswith('http://www.gnu.org/licenses/') or
- href.startswith('https://www.gnu.org/licenses/')):
- return href.split('/')[-1].replace('.html', '')
- else:
- return 'unknown'
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- totals = {}
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- thisid = xml.getroot().get('id') or infile.filename
- licenses = []
- for el in xml.xpath('/mal:page/mal:info/mal:license',
- namespaces=NAMESPACES):
- licenses.append(self.get_license(el.get('href')))
- if len(licenses) == 0:
- licenses.append('none')
-
- only = self.get_option_list('only')
- if only is not None:
- skip = True
- for lic in licenses:
- if lic in only:
- skip = False
- if skip:
- continue
- cept = self.get_option_list('except')
- if cept is not None:
- skip = False
- for lic in licenses:
- if lic in cept:
- skip = True
- if skip:
- continue
-
- if self.get_option_bool('totals'):
- for lic in licenses:
- totals.setdefault(lic, 0)
- totals[lic] += 1
- else:
- print(infile.sitedir + thisid + ': ' + ' '.join(licenses))
-
- if self.get_option_bool('totals'):
- for lic in sorted(totals):
- print(lic + ': ' + str(totals[lic]))
-
- return 0
-
-
-class StatusChecker (Checker):
- name = 'status'
- desc = 'Report the status of Mallard pages'
- blurb = ('Report the status of the Mallard page files FILES. Each\n' +
- 'matching page is reporting along with its status.')
- formats = ['mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
- ('version', None, 'VER', 'Select revisions with the version attribute VER'),
- ('docversion', None, 'VER', 'Select revisions with the docversion attribute VER'),
- ('pkgversion', None, 'VER', 'Select revisions with the pkgversion attribute VER'),
- ('older', None, 'DATE', 'Only show pages older than DATE'),
- ('newer', None, 'DATE', 'Only show pages newer than DATE'),
- ('only', None, 'STATUSES', 'Only show pages whose status is in STATUSES'),
- ('except', None, 'STATUSES', 'Exclude pages whose status is in STATUSES'),
- ('totals', None, None, 'Show total counts for each status')
- ]
- postblurb = 'VER and STATUSES may be comma- and/or space-separated lists, or specified\nmultiple times.'
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- totals = {}
-
- checks = []
- ver = self.get_option_list('version')
- if ver is not None:
- checks.append(ver)
- ver = self.get_option_list('docversion')
- if ver is not None:
- checks.append(['doc:' + v for v in ver])
- ver = self.get_option_list('pkgversion')
- if ver is not None:
- checks.append(['pkg:' + v for v in ver])
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- pageid = xml.getroot().get('id')
- bestrev = None
- for rev in xml.xpath('/mal:page/mal:info/mal:revision', namespaces=NAMESPACES):
- revversion = (rev.get('version') or '').split()
- docversion = rev.get('docversion')
- if docversion is not None:
- revversion.append('doc:' + docversion)
- pkgversion = rev.get('pkgversion')
- if pkgversion is not None:
- revversion.append('pkg:' + pkgversion)
- revok = True
- for check in checks:
- checkok = False
- for v in check:
- if v in revversion:
- checkok = True
- break
- if not checkok:
- revok = False
- break
- if revok:
- if bestrev is None:
- bestrev = rev
- continue
- bestdate = bestrev.get('date')
- thisdate = rev.get('date')
- if bestdate is None:
- bestrev = rev
- elif thisdate is None:
- pass
- elif thisdate >= bestdate:
- bestrev = rev
- if bestrev is not None:
- status = bestrev.get('status') or 'none'
- date = bestrev.get('date') or None
- else:
- status = 'none'
- date = None
- older = self.get_option_str('older')
- if older is not None:
- if date is None or date >= older:
- continue
- newer = self.get_option_str('newer')
- if newer is not None:
- if date is None or date <= newer:
- continue
- only = self.get_option_list('only')
- if only is not None:
- if status not in only:
- continue
- cept = self.get_option_list('except')
- if cept is not None:
- if status in cept:
- continue
- if self.get_option_bool('totals'):
- totals.setdefault(status, 0)
- totals[status] += 1
- else:
- print(infile.sitedir + pageid + ': ' + status)
-
- if self.get_option_bool('totals'):
- for st in sorted(totals):
- print(st + ': ' + str(totals[st]))
-
- return 0
-
-
-class StyleChecker (Checker):
- name = 'style'
- desc = 'Report the style attribute of Mallard pages'
- blurb = ('Report the page style attribute of the Mallard page files\n' +
- 'FILES. Each matching page is reporting along with its status.')
- formats = ['mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
- ('only', None, 'STYLES', 'Only show pages whose style is in STATUSES'),
- ('except', None, 'STYLES', 'Exclude pages whose style is in STATUSES'),
- ('totals', None, None, 'Show total counts for each style')
- ]
- postblurb = 'STYLES may be comma- and/or space-separated lists, or specified\nmultiple times.'
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
- if 'help' in self.options:
- self.print_help()
- return 0
-
- totals = {}
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- thisid = xml.getroot().get('id')
- style = xml.getroot().get('style')
- if style is None:
- style = 'none'
- styles = style.split()
- # We'll set style to None if it doesn't meat the criteria
- only = self.get_option_list('only')
- if only is not None:
- if len(only) == 0:
- # We treat a blank --only as requesting pages with no style
- if style != 'none':
- style = None
- else:
- allow = False
- for st in styles:
- if st in only:
- allow = True
- break
- if not allow:
- style = None
- cept = self.get_option_list('except')
- if cept is not None:
- for st in styles:
- if st in cept:
- style = None
- break
- if self.get_option_bool('totals'):
- if style is not None:
- for st in styles:
- totals.setdefault(st, 0)
- totals[st] += 1
- else:
- if style is not None:
- print(infile.sitedir + thisid + ': ' + style)
-
- if self.get_option_bool('totals'):
- for st in sorted(totals):
- print(st + ': ' + str(totals[st]))
-
- return 0
-
-
-class CustomChecker(Checker):
- formats = ['docbook4', 'docbook5', 'mallard']
- arguments = [
- ('help', '-h', None, 'Show this help and exit'),
- ('site', '-s', None, 'Treat pages as belonging to a Mallard site')
- ]
-
- def __init__(self, name, yelpcheck):
- super().__init__(yelpcheck)
- self.name = name
-
- def main(self, args):
- if self.parse_args(args) != 0:
- return 1
-
- sect = 'check:' + self.name
- if sect not in self.config.sections():
- print('Unrecognized command: ' + self.name, file=sys.stderr)
- return 1
- self.blurb = self.config.get(sect, 'blurb', fallback=None)
- if self.blurb is not None:
- self.blurb = '\n'.join(textwrap.wrap(self.blurb))
-
- if 'help' in self.options:
- self.print_help()
- return 0
-
- assertexpr = self.config.get(sect, 'assert', fallback=None)
- if assertexpr is not None:
- return self.run_assert(assertexpr)
-
- print('No action found for command: ' + self.name, file=sys.stderr)
- return 1
-
- def run_assert(self, assertexpr):
- sect = 'check:' + self.name
- selectexpr = self.config.get(sect, 'select', fallback='/')
- message = self.config.get(sect, 'message', fallback='Assertion failed')
- self.xinclude = self.config.get(sect, 'xinclude', fallback='true') != 'false'
-
- namespaces = {}
- if 'namespaces' in self.config.sections():
- for ns in self.config.options('namespaces'):
- namespaces[ns] = self.config.get('namespaces', ns)
-
- for infile in self.iter_files():
- xml = self.get_xml(infile)
- thisid = xml.getroot().get('id') or infile.filename
- for root in xml.xpath(selectexpr, namespaces=namespaces):
- if not bool(root.xpath(assertexpr, namespaces=namespaces)):
- print(infile.sitedir + thisid + ': ' + message)
- # check if self.config has section check:self.name
- # check if section has select, assert, message
-
-
-class YelpCheck:
- def __init__(self):
- pass
-
- def main(self):
- if len(sys.argv) < 2:
- self.print_usage()
- return 1
-
- checker = None
- for cls in Checker.__subclasses__():
- if sys.argv[1] == cls.name:
- checker = cls(self)
-
- if checker is None:
- checker = CustomChecker(sys.argv[1], self)
-
- return checker.main(sys.argv[2:])
-
- def print_usage(self):
- print('Usage: yelp-check <COMMAND> [OPTIONS] [FILES]')
- namelen = 2
- checks = []
- reports = []
- others = []
- for cls in sorted(Checker.__subclasses__(), key=(lambda cls: cls.name or '')):
- if cls is CustomChecker:
- continue
- namelen = max(namelen, len(cls.name) + 2)
- if cls in (HrefsChecker, IdsChecker, LinksChecker,
- MediaChecker, OrphansChecker, ValidateChecker):
- checks.append(cls)
- elif cls in (CommentsChecker, LicenseChecker, StatusChecker,
- StyleChecker):
- reports.append(cls)
- else:
- others.append(cls)
- if len(checks) > 0:
- print('\nCheck commands:')
- for cls in checks:
- print(' ' + cls.name.ljust(namelen) + cls.desc)
- if len(reports) > 0:
- print('\nReport commands:')
- for cls in reports:
- print(' ' + cls.name.ljust(namelen) + cls.desc)
- if len(others) > 0:
- print('\nOther commands:')
- for cls in others:
- print(' ' + cls.name.ljust(namelen) + cls.desc)
- config = configparser.ConfigParser()
- try:
- config.read('.yelp-tools.cfg')
- except:
- return
- customs = []
- for sect in config.sections():
- if sect.startswith('check:'):
- name = sect[6:]
- skip = False
- for cls in Checker.__subclasses__():
- if name == cls.name:
- skip = True
- break
- if skip:
- continue
- if config.get(sect, 'assert', fallback=None) == None:
- continue
- desc = config.get(sect, 'desc', fallback='')
- namelen = max(namelen, len(name) + 2)
- customs.append((name, desc))
- if len(customs) > 0:
- print('\nCustom commands:')
- for name, desc in customs:
- print(' ' + name.ljust(namelen) + desc)
-
-
-if __name__ == '__main__':
- try:
- sys.exit(YelpCheck().main())
- except KeyboardInterrupt:
- sys.exit(1)