#!/usr/bin/python3
#
# yelp-check
# Copyright (C) 2011-2020 Shaun McCance <shaunm@gnome.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

import configparser
import lxml.etree
import os
import sys
import urllib.request
import shutil
import subprocess
import tempfile
import textwrap


# Base directory for installed data files; the checkers join it with
# 'xslt/...' to locate stylesheets. The '@DATADIR@' token looks like a
# build-time placeholder that gets substituted on install -- TODO confirm
# against the build system.
DATADIR = '@DATADIR@'

# Name of the xml:id attribute in the '{namespace-uri}local' form that lxml
# uses for attribute lookups.
XML_ID = '{http://www.w3.org/XML/1998/namespace}id'
# Prefix -> namespace URI map. It is passed as the namespaces argument of
# XPath queries and is also used to build '{uri}tag' names for comparisons.
NAMESPACES = {
    'mal':   'http://projectmallard.org/1.0/',
    'cache': 'http://projectmallard.org/cache/1.0/',
    'db':    'http://docbook.org/ns/docbook',
    'e':     'http://projectmallard.org/experimental/',
    'ui':    'http://projectmallard.org/ui/1.0/',
    'uix':   'http://projectmallard.org/experimental/ui/',
    'xlink': 'http://www.w3.org/1999/xlink'
    }

def _stringify(el):
    ret = el.text or ''
    for ch in el:
        ret = ret + _stringify(ch)
    if el.tail is not None:
        ret = ret + el.tail
    return ret

def get_format(node):
    """Guess the document format from the namespace of node.

    Returns 'mallard' for the Mallard and Mallard cache namespaces,
    'docbook5' for the DocBook namespace, 'docbook4' when the element has
    no namespace at all, and None for any other namespace.
    """
    namespace = lxml.etree.QName(node).namespace
    if namespace is None:
        # For now, just assume no ns means docbook4
        return 'docbook4'
    if namespace == NAMESPACES['db']:
        return 'docbook5'
    if namespace in (NAMESPACES['mal'], NAMESPACES['cache']):
        return 'mallard'
    return None

class InputFile:
    """Paths describing one input file.

    Attributes:
        filepath: directory the file name is relative to.
        filename: file name as given (may itself contain directories).
        absfile: filepath joined with filename.
        absdir: directory part of absfile.
        sitedir: site directory prefix, or '' when none was given.
        sitefilename: sitedir immediately followed by filename.
    """

    def __init__(self, filepath, filename, sitedir=None):
        joined = os.path.join(filepath, filename)
        prefix = sitedir if sitedir else ''

        self.filepath = filepath
        self.filename = filename
        self.absfile = joined
        self.absdir = os.path.dirname(joined)
        self.sitedir = prefix
        self.sitefilename = prefix + filename


class Checker:
    """Base class for the yelp-check subcommands.

    Subclasses set the class attributes below and implement main().

    Class attributes:
        name: subcommand name; also used for the 'check:NAME' config section.
        desc: one-line description.
        blurb: longer description shown by print_help() (falls back to desc).
        formats: names of the document formats the checker handles.
        arguments: list of option tuples
            (long name, short flag or None, value placeholder or None, help).
            An option with a value placeholder consumes the next argument.
        postblurb: optional text printed after the option list.
        xinclude: whether get_xml() processes XIncludes.
        config: ConfigParser loaded by parse_args(), or None.
    """

    name = None
    desc = None
    blurb = None
    formats = []
    arguments = []
    postblurb = None
    xinclude = True
    config = None

    # Config sections consulted after 'check:NAME', in order.
    _FALLBACK_SECTIONS = ('check', 'default')

    def __init__(self, yelpcheck):
        self.yelpcheck = yelpcheck
        self.options = {}     # long option name -> True, or list of values
        self.fileargs = []    # positional (non-option) arguments
        self.tmpdir = None    # created lazily by create_tmpdir()
        self.skipfiles = []   # file names excluded by the 'skip' option

    def __del__(self):
        # getattr guards against instances whose __init__ never completed;
        # ignore_errors keeps cleanup failures from raising inside __del__.
        tmpdir = getattr(self, 'tmpdir', None)
        if tmpdir is not None:
            shutil.rmtree(tmpdir, ignore_errors=True)
            self.tmpdir = None

    def _find_argdef(self, token):
        """Return the arguments entry matching option token, or None."""
        islong = token.startswith('--')
        for argdef in self.arguments:
            if islong:
                if token == '--' + argdef[0]:
                    return argdef
            elif token == argdef[1]:
                return argdef
        return None

    def parse_args(self, args):
        """Parse command-line args into self.options and self.fileargs.

        Also loads '.yelp-tools.cfg' (from the directory of the first file
        argument if present there, else from the current directory) into
        self.config and fills self.skipfiles from the 'skip' option.

        Returns 0 on success. On an unknown option or a missing option
        value, prints the help text and returns 1. Exits the process with
        status 1 if the config file cannot be read.
        """
        while len(args) > 0:
            argdef = None
            if args[0].startswith('-'):
                argdef = self._find_argdef(args[0])
                if argdef is None:
                    self.print_help()
                    return 1
            if argdef is None:
                self.fileargs.append(args[0])
                args = args[1:]
            elif argdef[2] is not None:
                # Option takes a value; repeated options accumulate.
                if len(args) < 2:
                    self.print_help()
                    return 1
                self.options.setdefault(argdef[0], []).append(args[1])
                args = args[2:]
            else:
                self.options[argdef[0]] = True
                args = args[1:]

        cfgfile = None
        if len(self.fileargs) > 0:
            candidate = os.path.join(os.path.dirname(self.fileargs[0]),
                                     '.yelp-tools.cfg')
            if os.path.exists(candidate):
                cfgfile = candidate
        if cfgfile is None:
            cfgfile = os.path.join(os.getcwd(), '.yelp-tools.cfg')
        if os.path.exists(cfgfile):
            self.config = configparser.ConfigParser()
            try:
                self.config.read(cfgfile)
            except Exception as e:
                print(e, file=sys.stderr)
                sys.exit(1)
        self.skipfiles = self.get_option_list('skip') or []
        return 0

    def _config_value(self, arg):
        """Return the config value for arg, or None if unset.

        Sections are tried in order: 'check:NAME', 'check', 'default'.
        """
        if self.config is None:
            return None
        sections = ('check:' + self.name,) + self._FALLBACK_SECTIONS
        for section in sections:
            val = self.config.get(section, arg, fallback=None)
            if val is not None:
                return val
        return None

    def get_option_bool(self, arg):
        """Return option arg as a bool.

        A command-line flag wins; otherwise the config value must be the
        exact string 'true'. Returns False when the option is unset.
        """
        if arg in self.options:
            return self.options[arg] is True
        return self._config_value(arg) == 'true'

    def get_option_str(self, arg):
        """Return option arg as a string, or None when unset.

        When given several times on the command line, the last value wins.
        """
        given = self.options.get(arg)
        if isinstance(given, list):
            return given[-1]
        return self._config_value(arg)

    def get_option_list(self, arg):
        """Return option arg as a list of strings, or None when unset.

        Values are split on commas and whitespace; repeated command-line
        occurrences are concatenated.
        """
        given = self.options.get(arg)
        if isinstance(given, list):
            ret = []
            for opt in given:
                ret.extend(opt.replace(',', ' ').split())
            return ret
        val = self._config_value(arg)
        if val is not None:
            return val.replace(',', ' ').split()
        return None

    @staticmethod
    def _site_file_parts(filearg):
        """Return (filepath, filename, sitedir) for a site-mode file argument.

        The argument is normalized and split so that filename is the bare
        file name and sitedir always ends in '/', matching what iter_site()
        produces for the same file.
        """
        # FIXME: It's hard to get this perfect without a defined start dir;
        # the current directory is taken as the site root.
        reldir, basename = os.path.split(os.path.normpath(filearg))
        sitedir = '/' + reldir.replace(os.sep, '/')
        if not sitedir.endswith('/'):
            sitedir = sitedir + '/'
        if reldir:
            filepath = os.path.join(os.getcwd(), reldir)
        else:
            filepath = os.getcwd()
        return (filepath, basename, sitedir)

    def iter_files(self, sitedir=None):
        """Yield an InputFile for every file selected by self.fileargs.

        With no file arguments, '.' is used. Directory arguments yield their
        '.page' files (recursively in site mode); other arguments are
        yielded as given. Names listed in self.skipfiles are skipped.
        """
        issite = self.get_option_bool('site')
        if len(self.fileargs) == 0:
            self.fileargs.append('.')
        for filearg in self.fileargs:
            if filearg in self.skipfiles:
                continue
            if os.path.isdir(filearg):
                if issite:
                    for infile in self.iter_site(filearg, '/'):
                        yield infile
                else:
                    for fname in os.listdir(filearg):
                        if fname in self.skipfiles:
                            continue
                        if fname.endswith('.page'):
                            yield InputFile(filearg, fname)
            elif issite:
                yield InputFile(*self._site_file_parts(filearg))
            else:
                yield InputFile(os.getcwd(), filearg)

    def iter_site(self, filepath, sitedir):
        """Recursively yield InputFiles for '.page' files under filepath.

        sitedir is the site directory prefix for filepath; each
        subdirectory name plus '/' is appended while descending.
        """
        for fname in os.listdir(filepath):
            newpath = os.path.join(filepath, fname)
            if os.path.isdir(newpath):
                # FIXME https://github.com/projectmallard/pintail/issues/36
                if fname == '__pintail__':
                    continue
                for infile in self.iter_site(newpath, sitedir + fname + '/'):
                    yield infile
            elif fname.endswith('.page'):
                yield InputFile(filepath, fname, sitedir)

    def get_xml(self, xmlfile):
        """Parse xmlfile.absfile and return the lxml tree.

        XIncludes are processed when self.xinclude is true.
        """
        # FIXME: we can cache these if we add a feature to run multiple
        # checkers at once
        tree = lxml.etree.parse(xmlfile.absfile)
        if self.xinclude:
            lxml.etree.XInclude()(tree.getroot())
        return tree

    def create_tmpdir(self):
        """Create self.tmpdir if it does not exist yet."""
        if self.tmpdir is None:
            self.tmpdir = tempfile.mkdtemp()

    def print_help(self):
        """Print usage, formats, description and the aligned option list."""
        print('Usage:   yelp-check ' + self.name + ' [OPTIONS] [FILES]')
        print('Formats: ' + ' '.join(self.formats) + '\n')
        #FIXME: prettify names of formats
        if self.blurb is not None:
            print(self.blurb + '\n')
        elif self.desc is not None:
            print(self.desc + '\n')
        print('Options:')
        rows = []
        for arg in self.arguments:
            argkey = '--' + arg[0]
            if arg[1] is not None:
                argkey = arg[1] + ', ' + argkey
            if arg[2] is not None:
                argkey = argkey + ' ' + arg[2]
            rows.append((argkey, arg[3]))
        # Column width: longest option text plus one, but at least 2.
        maxarglen = max([2] + [len(argkey) + 1 for argkey, _ in rows])
        for argkey, helptext in rows:
            print('  ' + argkey.ljust(maxarglen) + '  ' + helptext)
        if self.postblurb is not None:
            print(self.postblurb)

    def main(self, args):
        """Run the checker; subclasses override this. The base does nothing."""
        pass


class HrefsChecker (Checker):
    """Checker that reports external links that cannot be fetched.

    Looks at href, xlink:href and (on ulink elements) url attributes and
    tries to open each distinct URL once.
    """

    name = 'hrefs'
    desc = 'Find broken external links in a document'
    blurb = ('Find broken href links in FILES in a Mallard document, or\n' +
             'broken ulink or XLink links in FILES in a DocBook document.')
    formats = ['docbook4', 'docbook5', 'mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('allow', None, 'URL', 'Allow URL or list of URLs without checking'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]
    postblurb = 'URL may be a comma- and/or space-separated list, or specified\nmultiple times.'

    def main(self, args):
        """Check every link in the input files.

        Prints 'FILE: URL' for each broken link. Returns 0 if all links are
        fine (or help was shown), 1 otherwise or on bad arguments.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        # URL -> whether it is considered OK. Doubles as a cache so each
        # URL is fetched at most once.
        # safelisting URLs that we use as identifiers
        hrefs = {
             'http://creativecommons.org/licenses/by-sa/3.0/': True,
            'https://creativecommons.org/licenses/by-sa/3.0/': True,
             'http://creativecommons.org/licenses/by-sa/3.0/us/': True,
            'https://creativecommons.org/licenses/by-sa/3.0/us/': True
        }
        allow = self.get_option_list('allow')
        if allow is not None:
            for url in allow:
                hrefs[url] = True
        retcode = 0

        # Attribute name must carry the full namespace URI, or lxml never
        # finds xlink:href attributes.
        xlink_href = '{' + NAMESPACES['xlink'] + '}href'

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            for el in xml.xpath('//*[@href | @xlink:href | self::ulink/@url]',
                                namespaces=NAMESPACES):
                href = el.get('href', None)
                if href is None:
                    href = el.get(xlink_href)
                if href is None:
                    href = el.get('url')
                if href is None:
                    continue
                if href.startswith('mailto:'):
                    continue
                if href not in hrefs:
                    try:
                        # The with block closes the response once the
                        # status has been read.
                        with urllib.request.urlopen(href) as req:
                            hrefs[href] = (req.status == 200)
                    except Exception:
                        # Any failure to open the URL counts as broken.
                        hrefs[href] = False
                if not hrefs[href]:
                    retcode = 1
                    print(infile.sitefilename + ': ' + href)

        return retcode


class IdsChecker (Checker):
    """Checker that reports pages whose id differs from the file name."""

    name = 'ids'
    desc = 'Find Mallard page IDs that do not match file names'
    blurb = ('Find pages in a Mallard document whose page ID does not match\n' +
             'the base file name of the page file.')
    formats = ['mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]

    def main(self, args):
        """Compare each page's root id attribute with its base file name.

        Prints 'FILE: ID' for every mismatch; files not ending in '.page'
        are always reported. Returns 0 if everything matches (or help was
        shown), 1 otherwise or on bad arguments.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        retcode = 0
        suffix = '.page'

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            isok = False
            pageid = None
            if infile.filename.endswith(suffix):
                try:
                    pageid = xml.getroot().get('id')
                    basename = os.path.basename(infile.filename)
                    isok = (pageid == basename[:-len(suffix)])
                except Exception:
                    # Treat any lookup failure as a mismatch, but let
                    # KeyboardInterrupt and SystemExit propagate.
                    isok = False
            if not isok:
                retcode = 1
                print(infile.sitefilename + ': ' + (pageid or ''))

        return retcode


class LinksChecker (Checker):
    """Checker that reports xref (Mallard) or linkend (DocBook) links
    whose target id is not defined."""

    name = 'links'
    desc = 'Find broken xref or linkend links in a document'
    blurb = ('Find broken xref links in FILES in a Mallard document,\n' +
             'or broken linkend links in FILES in a DocBook document.')
    formats = ['docbook4', 'docbook5', 'mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('cache', '-c', 'CACHE', 'Use the existing Mallard cache CACHE'),
        ('ignore', '-i', None, 'Ignore xrefs where href is present'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]

    def __init__(self, yelpcheck):
        super().__init__(yelpcheck)
        # Mallard: full id ('page' or 'page#section', site-prefixed in
        # site mode) -> list of xref values found under that id.
        self.idstoxrefs = {}
        # DocBook: element id -> list of linkend values found under it.
        self.idstolinkends = {}

    def _accumulate_mal(self, node, pageid, sectid, xrefs, sitedir=None):
        """Record the ids (and optionally xrefs) in a Mallard subtree.

        pageid and sectid are the ids inherited from enclosing elements.
        When xrefs is false only ids are registered, which is how cache
        files are loaded. sitedir, if not None, is prefixed to ids that do
        not already start with '/'.
        """
        thisid = node.get('id')
        if thisid is not None:
            if node.tag == '{' + NAMESPACES['mal'] + '}page':
                pageid = thisid
            else:
                sectid = thisid
        curid = pageid
        if curid is not None:
            if sectid is not None:
                # id attrs in cache files are already fully formed
                if '#' in sectid:
                    curid = sectid
                else:
                    curid = curid + '#' + sectid
            if sitedir is not None:
                # id attrs in cache files already have sitedir prefixed
                # (startswith also copes with an empty id attribute)
                if not curid.startswith('/'):
                    curid = sitedir + curid
            self.idstoxrefs.setdefault(curid, [])
            if xrefs:
                xref = node.get('xref')
                if xref is not None:
                    ignore = self.get_option_bool('ignore')
                    if not (ignore and (node.get('href') is not None)):
                        self.idstoxrefs[curid].append(xref)
        for child in node:
            self._accumulate_mal(child, pageid, sectid, xrefs, sitedir)

    def _accumulate_db(self, node, nodeid):
        """Record the ids and linkends in a DocBook subtree.

        nodeid is the nearest id inherited from enclosing elements. A
        linkend is filed under that id; linkends with no enclosing id are
        not recorded.
        """
        thisid = node.get('id')
        if thisid is None:
            thisid = node.get(XML_ID)
        if thisid is not None:
            nodeid = thisid
            self.idstolinkends.setdefault(nodeid, [])
        if nodeid is not None:
            linkend = node.get('linkend')
            if linkend is not None:
                self.idstolinkends[nodeid].append(linkend)
        for child in node:
            self._accumulate_db(child, nodeid)

    def main(self, args):
        """Check links in the input files.

        DocBook files are checked one at a time and print 'ID: LINKEND'.
        Mallard pages are checked together after all files (and the
        optional cache) are read, and print 'ID: XREF'. Returns 0 if no
        broken link was found (or help was shown), 1 otherwise.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        retcode = 0

        cachefile = self.get_option_str('cache')
        if cachefile is not None:
            xml = self.get_xml(InputFile(os.getcwd(), cachefile))
            self._accumulate_mal(xml.getroot(), None, None, False)

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            fmt = get_format(xml.getroot())
            if fmt == 'mallard':
                self._accumulate_mal(xml.getroot(), None, None, True, infile.sitedir)
            elif fmt in ('docbook4', 'docbook5'):
                # For DocBook, we assume each filearg is its own document, so
                # we reset the dict each time and only check within the file.
                # Note that XInclude and SYSTEM includes DO happen first.
                self.idstolinkends = {}
                self._accumulate_db(xml.getroot(), None)
                for curid in self.idstolinkends:
                    for linkend in self.idstolinkends[curid]:
                        if linkend not in self.idstolinkends:
                            print(curid + ': ' + linkend)
                            retcode = 1

        for curid in self.idstoxrefs:
            for xref in self.idstoxrefs[curid]:
                checkref = xref
                # startswith (rather than [0]) so an empty xref or id is
                # reported as broken instead of raising IndexError.
                if checkref.startswith('#'):
                    # Section on the same page.
                    checkref = curid.split('#')[0] + checkref
                if curid.startswith('/') and not checkref.startswith('/'):
                    # Site mode: resolve relative to the linking page's dir.
                    checkref = curid[:curid.rfind('/')+1] + checkref
                if checkref not in self.idstoxrefs:
                    print(curid + ': ' + xref)
                    retcode = 1

        return retcode


class MediaChecker (Checker):
    """Checker that reports media references to files that do not exist."""

    name = 'media'
    desc = 'Find broken references to media files'
    blurb = ('Find broken references to media files. In Mallard, this\n' +
             'checks media and thumb elements. In DocBook, this checks\n' +
             'audiodata, imagedata, and videodata elements.')
    formats = ['docbook4', 'docbook5', 'mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]

    def main(self, args):
        """Check that every referenced media file exists on disk.

        References are resolved relative to the directory of the input
        file. Prints 'FILE: SRC' for each missing file. Returns 0 if none
        are missing (or help was shown), 1 otherwise or on bad arguments.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        retcode = 0

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            fmt = get_format(xml.getroot())
            attr = None
            elements = []
            if fmt == 'mallard':
                attr = 'src'
                elements = xml.xpath('//mal:media[@src] | //uix:thumb | //ui:thumb | //e:mouseover',
                                     namespaces=NAMESPACES)
            elif fmt == 'docbook5':
                # FIXME: do we care about entityref?
                attr = 'fileref'
                elements = xml.xpath('//db:audiodata | //db:imagedata | //db:videodata',
                                     namespaces=NAMESPACES)
            elif fmt == 'docbook4':
                attr = 'fileref'
                elements = xml.xpath('//audiodata | //imagedata | //videodata')
            for el in elements:
                src = el.get(attr)
                if src is None:
                    # Element has no file reference (e.g. imagedata using
                    # entityref); nothing to look up, and joining None
                    # would raise TypeError.
                    continue
                fsrc = os.path.join(infile.absdir, src)
                if not os.path.exists(fsrc):
                    print(infile.sitefilename + ': ' + src)
                    retcode = 1

        return retcode


class OrphansChecker (Checker):
    """Checker that reports pages not reachable from the index page."""

    name = 'orphans'
    desc = 'Find orphaned pages in a Mallard document'
    blurb = ('Locate orphaned pages among FILES in a Mallard document.\n' +
             'Orphaned pages are any pages that cannot be reached by\n' +
             'topic links alone from the index page.')
    formats = ['mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('cache', '-c', 'CACHE', 'Use the existing Mallard cache CACHE'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]

    def __init__(self, yelpcheck):
        super().__init__(yelpcheck)
        # page id -> set of page ids acting as its guides, i.e. pages it
        # has a guide link to or that have a topic link to it.
        self.guidelinks = {}
        # ids of pages containing a site-subdirs links element
        self.sitesubdirs = set()

    def _info_xrefs(self, node, linktype, sitedir):
        """Yield page-level xref targets of info links of type linktype.

        Only links inside node are considered. Same-page section links are
        dropped, section parts are stripped, and sitedir (if non-empty) is
        prefixed to targets that do not start with '/'.
        """
        # The leading '.' keeps the search inside node; in lxml an absolute
        # '//' path on an element searches the whole document, which for
        # cache files would attribute every page's links to every page.
        expr = './/mal:info/mal:link[@type="' + linktype + '"]'
        for el in node.xpath(expr, namespaces=NAMESPACES):
            xref = el.get('xref')
            if xref is None or xref == '':
                continue
            if xref[0] == '#':
                continue
            if '#' in xref:
                xref = xref[:xref.find('#')]
            if sitedir is not None and sitedir != '':
                if xref[0] != '/':
                    xref = sitedir + xref
            yield xref

    def _collect_links(self, node, sitedir):
        """Record the guide relationships declared by the page node.

        Updates self.guidelinks and self.sitesubdirs. Nodes without an id
        (or with an empty one) are ignored.
        """
        pageid = node.get('id')
        if pageid is None or pageid == '':
            return
        if pageid.startswith('/'):
            # id attrs in cache files already have sitedir prefixed
            sitedir = pageid[:pageid.rfind('/')+1]
        else:
            pageid = sitedir + pageid
        self.guidelinks.setdefault(pageid, set())
        # For the purposes of finding orphans, we'll just pretend that
        # all links to or from sections are just to or from pages.
        for xref in self._info_xrefs(node, 'guide', sitedir):
            self.guidelinks[pageid].add(xref)
        for xref in self._info_xrefs(node, 'topic', sitedir):
            # A topic link makes this page a guide of the target.
            self.guidelinks.setdefault(xref, set()).add(pageid)
        if node.xpath('.//mal:links[@type="site-subdirs" or @type="site:subdirs"]',
                      namespaces=NAMESPACES):
            self.sitesubdirs.add(pageid)

    def main(self, args):
        """Print the id of every input page that is an orphan.

        A page is fine if it is the index page ('index', or '/index' in
        site mode) or if following guide relationships upwards from it
        reaches a page already known to be fine. Returns 0 if there are no
        orphans (or help was shown), 1 otherwise or on bad arguments.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        retcode = 0

        cachefile = self.get_option_str('cache')
        if cachefile is not None:
            xml = self.get_xml(InputFile(os.getcwd(), cachefile))
            for page in xml.getroot():
                if page.tag == '{' + NAMESPACES['mal'] + '}page':
                    pageid = page.get('id')
                    if pageid is None or pageid == '':
                        continue
                    self._collect_links(page, page.get('{http://projectmallard.org/site/1.0/}dir', ''))

        pageids = set()
        for infile in self.iter_files():
            xml = self.get_xml(infile)
            pageid = xml.getroot().get('id')
            # Skip empty ids too, as is done for cache pages above.
            if pageid is None or pageid == '':
                continue
            pageids.add(infile.sitedir + pageid)
            self._collect_links(xml.getroot(), infile.sitedir)

        # Map each direct subdirectory index page to the page in the parent
        # directory that lists site subdirs, so it counts as a guide.
        siteupdirs = {}
        for pageid in self.sitesubdirs:
            dirname = pageid[:pageid.rfind('/')+1]
            for subid in self.guidelinks:
                if subid.startswith(dirname):
                    if subid.endswith('/index'):
                        mid = subid[len(dirname):-6]
                        if mid != '' and '/' not in mid:
                            siteupdirs[subid] = pageid

        if self.get_option_bool('site'):
            okpages = set(['/index'])
        else:
            okpages = set(['index'])
        for pageid in sorted(pageids):
            if pageid in okpages:
                isok = True
            else:
                isok = False
                # .get guards against ids that were registered under a
                # different key (e.g. a file page id starting with '/').
                guides = list(self.guidelinks.get(pageid, ()))
                if pageid in siteupdirs:
                    updir = siteupdirs[pageid]
                    if updir not in guides:
                        guides.append(updir)
                # Breadth-first walk up the guide graph; guides grows as
                # new ancestors are discovered.
                cur = 0
                while cur < len(guides):
                    if guides[cur] in okpages:
                        isok = True
                        break
                    if guides[cur] in self.guidelinks:
                        for guide in self.guidelinks[guides[cur]]:
                            if guide not in guides:
                                guides.append(guide)
                    cur += 1
            if isok:
                okpages.add(pageid)
            else:
                print(pageid)
                retcode = 1

        return retcode


class ValidateChecker (Checker):
    """Checker that validates files with xmllint or jing.

    Mallard pages are validated against a RELAX NG schema generated per
    version string with xsltproc; DocBook 4 against its DTD; DocBook 5
    against the docbook.org RELAX NG schema for its version.
    """

    name = 'validate'
    desc = 'Validate files against a DTD or RNG'
    blurb = ('Validate FILES against the appropriate DTD or RNG.\n' +
             'For Mallard pages, perform automatic RNG merging\n' +
             'based on the version attribute.')
    formats = ['docbook4', 'docbook5', 'mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('strict', None, None, 'Disallow unknown namespaces'),
        ('allow', None, 'NS', 'Explicitly allow namespace NS in strict mode'),
        ('jing', None, None, 'Use jing instead of xmllint for RNG validation'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]
    postblurb = 'NS may be a comma- and/or space-separated list, or specified\nmultiple times.'

    def main(self, args):
        """Validate every input file and print the validator output on failure.

        Returns 0 if the last failure-free run left no error recorded (or
        help was shown); otherwise the exit status of a failing validator,
        or 1 for bad arguments or a file of unrecognized format.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        retcode = 0

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            fmt = get_format(xml.getroot())
            command = None
            if fmt == 'mallard':
                version = xml.getroot().get('version')
                if version is None or version == '':
                    tag = xml.getroot().tag
                    if tag == '{' + NAMESPACES['mal'] + '}stack':
                        # 1.2 isn't final yet as of 2020-01-09. Stacks will
                        # likely be in 1.2, so we can assume at least that.
                        version = '1.2'
                    elif tag == '{' + NAMESPACES['cache'] + '}cache':
                        version = 'cache/1.0'
                    else:
                        version = '1.0'
                self.create_tmpdir()
                # One generated schema per distinct version string, reused
                # for later files with the same version.
                rng = os.path.join(self.tmpdir,
                                   version.replace('/', '__').replace(' ', '__'))
                if not os.path.exists(rng):
                    strict = 'true()' if self.get_option_bool('strict') else 'false()'
                    allow = self.get_option_list('allow')
                    if allow is None:
                        allow = ''
                    else:
                        allow = ' '.join(allow)
                    subprocess.call(['xsltproc', '-o', rng,
                                    '--param', 'rng.strict', strict,
                                    '--stringparam', 'rng.strict.allow', allow,
                                    os.path.join(DATADIR, 'xslt', 'mal-rng.xsl'),
                                    infile.absfile])
                if self.get_option_bool('jing'):
                    command = ['jing', '-i', rng, infile.filename]
                else:
                    command = ['xmllint', '--noout', '--xinclude', '--noent',
                               '--relaxng', rng, infile.filename]
            elif fmt == 'docbook4':
                if xml.docinfo.doctype.startswith('<!DOCTYPE'):
                    # The document names its own DTD; validate against it.
                    command = ['xmllint', '--noout', '--xinclude', '--noent',
                               '--postvalid', infile.filename]
                else:
                    command = ['xmllint', '--noout', '--xinclude', '--noent',
                               '--dtdvalid',
                               'http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd',
                               infile.filename]
            elif fmt == 'docbook5':
                version = xml.getroot().get('version')
                if version is None or version == '':
                    version = '5.0'
                # Canonical URIs are http, but they 301 redirect to https. jing
                # can handle https fine, but not the redirect. And jing doesn't
                # look at catalogs. So just always feed jing an https URI.
                rnghttp = 'http://docbook.org/xml/' + version + '/rng/docbook.rng'
                rnghttps = 'https://docbook.org/xml/' + version + '/rng/docbook.rng'
                if self.get_option_bool('jing'):
                    command = ['jing', '-i', rnghttps, infile.filename]
                else:
                    # xmllint, on the other hand, does support catalogs. It also
                    # doesn't do the redirect, but it wouldn't matter if it did
                    # because it doesn't do https. So if the schema is available
                    # locally in the catalog, hand xmllint the http URI so it
                    # can use the local copy. Otherwise, we have to get curl
                    # involved to do https.
                    try:
                        catfile = subprocess.check_output(['xmlcatalog',
                                                           '/etc/xml/catalog',
                                                           rnghttp],
                                                          stderr=subprocess.DEVNULL,
                                                          text=True)
                    except Exception:
                        # Best effort: a missing xmlcatalog or catalog entry
                        # just means falling back to the download below.
                        # KeyboardInterrupt/SystemExit still propagate.
                        catfile = ''
                    for catline in catfile.split('\n'):
                        if catline.startswith('file://'):
                            command = ['xmllint', '--noout', '--xinclude',  '--noent',
                                       '--relaxng', rnghttp, infile.filename]
                            break
                    if command is None:
                        self.create_tmpdir()
                        rngfile = os.path.join(self.tmpdir, 'docbook-' + version + '.rng')
                        if not os.path.exists(rngfile):
                            urllib.request.urlretrieve(rnghttps, rngfile)
                        command = ['xmllint', '--noout', '--xinclude',  '--noent',
                                   '--relaxng', rngfile, infile.filename]
            if command is not None:
                try:
                    # Run from the file's base directory because the
                    # command refers to it by infile.filename.
                    subprocess.check_output(command,
                                            cwd=infile.filepath,
                                            stderr=subprocess.STDOUT,
                                            text=True)
                except subprocess.CalledProcessError as e:
                    retcode = e.returncode
                    print(e.output)
            else:
                # Unrecognized document format.
                retcode = 1

        return retcode


class CommentsChecker (Checker):
    """Report command that prints the editorial comments in documents.

    Mallard pages are searched for comment elements and DocBook
    documents for remark elements. Each one is printed together with
    the page, and the nearest enclosing element carrying an ID, in
    which it was found.
    """
    name = 'comments'
    desc = 'Print the editorial comments in a document'
    blurb = ('Print the editorial comments in the files FILES, using the\n' +
             'comment element in Mallard and the remark element in DocBook.')
    formats = ['docbook4', 'docbook5', 'mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]

    def main(self, args):
        """Print the editorial comments of every input file.

        args is the list of command-line arguments for this command.
        Returns 1 when parse_args() reports a non-zero result and 0
        otherwise (including when only the help text was printed).
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            fmt = get_format(xml.getroot())
            if fmt == 'mallard':
                self._print_mallard_comments(infile, xml)
            elif fmt in ('docbook4', 'docbook5'):
                self._print_docbook_remarks(infile, xml, fmt)

        return 0

    def _print_mallard_comments(self, infile, xml):
        """Print each mal:comment element found in the Mallard tree xml."""
        mal = '{' + NAMESPACES['mal'] + '}'
        # Fall back to the file name so a page without an id attribute is
        # still reported instead of failing on None + str below.
        pageid = xml.getroot().get('id') or infile.filename
        for el in xml.xpath('//mal:comment', namespaces=NAMESPACES):
            # Qualify the page ID with the nearest enclosing section
            # that has an id attribute, if there is one.
            thisid = pageid
            par = el
            while par is not None:
                if par.tag == mal + 'section':
                    sectid = par.get('id')
                    if sectid is not None:
                        thisid = thisid + '#' + sectid
                        break
                par = par.getparent()
            print('Page:  ' + infile.sitedir + thisid)
            # Only the first cite child supplies the From/Date header.
            for cite in el.xpath('mal:cite[1]', namespaces=NAMESPACES):
                name = _stringify(cite).strip()
                href = cite.get('href')
                if href is not None and href.startswith('mailto:'):
                    # Strip the 'mailto:' scheme, leaving the address.
                    name = name + ' <' + href[7:] + '>'
                print('From:  ' + name)
                date = cite.get('date')
                if date is not None:
                    print('Date:  ' + date)
            print('')
            for ch in el:
                if not isinstance(ch.tag, str):
                    # Processing instructions, XML comments and entity
                    # references have a non-string tag. They carry no
                    # content to show, and QName() rejects them.
                    continue
                if ch.tag == mal + 'cite':
                    continue
                if ch.tag in (mal + 'p', mal + 'title'):
                    for s in _stringify(ch).strip().split('\n'):
                        print('  ' + s.strip())
                    print('')
                else:
                    # Other block content is only summarized by name.
                    name = lxml.etree.QName(ch).localname
                    print('  <' + name + '>...</' + name + '>\n')

    def _print_docbook_remarks(self, infile, xml, fmt):
        """Print each remark element found in the DocBook tree xml.

        fmt is 'docbook4' (elements in no namespace) or 'docbook5'
        (elements in the DocBook namespace).
        """
        if fmt == 'docbook4':
            dbxpath = '//remark'
        else:
            dbxpath = '//db:remark'
        for el in xml.xpath(dbxpath, namespaces=NAMESPACES):
            # Qualify the file name with the ID of the nearest element,
            # starting at the remark itself, that has id or xml:id.
            thisid = infile.filename
            par = el
            while par is not None:
                sectid = par.get('id')
                if sectid is None:
                    sectid = par.get(XML_ID)
                if sectid is not None:
                    thisid = thisid + '#' + sectid
                    break
                par = par.getparent()
            print('Page:  ' + thisid)
            flag = el.get('revisionflag')
            if flag is not None:
                print('Flag:  ' + flag)
            print('')
            for s in _stringify(el).strip().split('\n'):
                print('  ' + s.strip())
            print('')


class LicenseChecker (Checker):
    """Report command that prints the license identifiers of Mallard pages.

    Identifiers are derived from the href attribute of each license
    element in the page's info element.
    """
    name = 'license'
    desc = 'Report the license of Mallard pages'
    blurb = ('Report the license of the Mallard page files FILES. Each\n' +
             'matching page is reported along with its license, reported\n' +
             'based on the href attribute of the license element. Common\n' +
             'licenses use a shortened identifier. Pages with multiple\n' +
             'licenses have the identifiers separated by spaces. Pages\n' +
             'with no license element report \'none\'. Licenses with no\n' +
             'href attribute are reported as \'unknown\'')
    formats = ['mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('only', None, 'LICENSES', 'Only show pages whose license is in LICENSES'),
        ('except', None, 'LICENSES', 'Exclude pages whose license is in LICENSES'),
        ('totals', None, None, 'Show total counts for each license'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]
    postblurb = 'LICENSES may be a comma- and/or space-separated list, or specified\nmultiple times.'

    def get_license(self, href):
        """Return a short license identifier for the license URL href.

        Creative Commons URLs become 'cc-' followed by the path
        components after 'licenses' joined with '-' (for example
        'cc-by-sa-3.0'). GNU URLs become the last path component with
        '.html' removed. A missing href, an unrecognized URL, or a
        recognized URL that names no specific license yields 'unknown'.
        """
        if href is None:
            return 'unknown'
        if href.startswith(('http://creativecommons.org/licenses/',
                            'https://creativecommons.org/licenses/')):
            # Drop empty components (from '//' and any trailing '/'),
            # then skip the scheme, the host and 'licenses'.
            ident = '-'.join([x for x in href.split('/') if x][3:])
            return 'cc-' + ident if ident else 'unknown'
        if href.startswith(('http://www.gnu.org/licenses/',
                            'https://www.gnu.org/licenses/')):
            ident = href.split('/')[-1].replace('.html', '')
            # A URL ending in '/' has no file name to take the ID from.
            return ident or 'unknown'
        return 'unknown'

    def main(self, args):
        """Print the licenses of each page, or the totals per license.

        args is the list of command-line arguments for this command.
        Returns 1 when parse_args() reports a non-zero result and 0
        otherwise.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        # The options are the same for every file, so read them once.
        only = self.get_option_list('only')
        cept = self.get_option_list('except')
        show_totals = self.get_option_bool('totals')

        totals = {}

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            thisid = xml.getroot().get('id') or infile.filename
            licenses = [self.get_license(el.get('href'))
                        for el in xml.xpath('/mal:page/mal:info/mal:license',
                                            namespaces=NAMESPACES)]
            if len(licenses) == 0:
                licenses.append('none')

            # --only keeps a page if any of its licenses is listed;
            # --except drops a page if any of its licenses is listed.
            if only is not None and not any(lic in only for lic in licenses):
                continue
            if cept is not None and any(lic in cept for lic in licenses):
                continue

            if show_totals:
                for lic in licenses:
                    totals[lic] = totals.get(lic, 0) + 1
            else:
                print(infile.sitedir + thisid + ': ' + ' '.join(licenses))

        if show_totals:
            for lic in sorted(totals):
                print(lic + ': ' + str(totals[lic]))

        return 0


class StatusChecker (Checker):
    """Report command that prints the revision status of Mallard pages.

    The status is taken from the revision elements in the page's info
    element, optionally restricted to particular versions and dates.
    """
    name = 'status'
    desc = 'Report the status of Mallard pages'
    blurb = ('Report the status of the Mallard page files FILES. Each\n' +
             'matching page is reported along with its status.')
    formats = ['mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('version',    None, 'VER', 'Select revisions with the version attribute VER'),
        ('docversion', None, 'VER', 'Select revisions with the docversion attribute VER'),
        ('pkgversion', None, 'VER', 'Select revisions with the pkgversion attribute VER'),
        ('older',  None, 'DATE', 'Only show pages older than DATE'),
        ('newer',  None, 'DATE', 'Only show pages newer than DATE'),
        ('only',   None, 'STATUSES', 'Only show pages whose status is in STATUSES'),
        ('except', None, 'STATUSES', 'Exclude pages whose status is in STATUSES'),
        ('totals', None, None, 'Show total counts for each status'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]
    postblurb = 'VER and STATUSES may be comma- and/or space-separated lists, or specified\nmultiple times.'

    def main(self, args):
        """Print the status of each page, or the totals per status.

        args is the list of command-line arguments for this command.
        Returns 1 when parse_args() reports a non-zero result and 0
        otherwise.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        totals = {}

        # Each entry of checks is a list of acceptable version tokens.
        # A revision qualifies only if it matches at least one token
        # from every entry. The 'doc:' and 'pkg:' prefixes keep the
        # docversion and pkgversion values apart from plain versions.
        checks = []
        ver = self.get_option_list('version')
        if ver is not None:
            checks.append(ver)
        ver = self.get_option_list('docversion')
        if ver is not None:
            checks.append(['doc:' + v for v in ver])
        ver = self.get_option_list('pkgversion')
        if ver is not None:
            checks.append(['pkg:' + v for v in ver])

        # The remaining options are the same for every file.
        older = self.get_option_str('older')
        newer = self.get_option_str('newer')
        only = self.get_option_list('only')
        cept = self.get_option_list('except')
        show_totals = self.get_option_bool('totals')

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            # Fall back to the file name so a page without an id
            # attribute can still be printed below.
            pageid = xml.getroot().get('id') or infile.filename
            bestrev = None
            for rev in xml.xpath('/mal:page/mal:info/mal:revision', namespaces=NAMESPACES):
                revversion = (rev.get('version') or '').split()
                docversion = rev.get('docversion')
                if docversion is not None:
                    revversion.append('doc:' + docversion)
                pkgversion = rev.get('pkgversion')
                if pkgversion is not None:
                    revversion.append('pkg:' + pkgversion)
                if not all(any(v in revversion for v in check)
                           for check in checks):
                    continue
                if bestrev is None:
                    bestrev = rev
                    continue
                # Prefer the qualifying revision with the latest date.
                # Dates are compared as strings, and on a tie the later
                # revision in document order wins. A dated revision is
                # never replaced by an undated one.
                bestdate = bestrev.get('date')
                thisdate = rev.get('date')
                if bestdate is None:
                    bestrev = rev
                elif thisdate is None:
                    pass
                elif thisdate >= bestdate:
                    bestrev = rev
            if bestrev is not None:
                status = bestrev.get('status') or 'none'
                date = bestrev.get('date') or None
            else:
                status = 'none'
                date = None
            # Pages without a date never satisfy --older or --newer.
            if older is not None:
                if date is None or date >= older:
                    continue
            if newer is not None:
                if date is None or date <= newer:
                    continue
            if only is not None:
                if status not in only:
                    continue
            if cept is not None:
                if status in cept:
                    continue
            if show_totals:
                totals[status] = totals.get(status, 0) + 1
            else:
                print(infile.sitedir + pageid + ': ' + status)

        if show_totals:
            for st in sorted(totals):
                print(st + ': ' + str(totals[st]))

        return 0


class StyleChecker (Checker):
    """Report command that prints the style attribute of Mallard pages.

    The style attribute of the root element is treated as a
    whitespace-separated list of style names.
    """
    name = 'style'
    desc = 'Report the style attribute of Mallard pages'
    blurb = ('Report the page style attribute of the Mallard page files\n' +
             'FILES. Each matching page is reported along with its style.')
    formats = ['mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('only',   None, 'STYLES', 'Only show pages whose style is in STYLES'),
        ('except', None, 'STYLES', 'Exclude pages whose style is in STYLES'),
        ('totals', None, None, 'Show total counts for each style'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]
    postblurb = 'STYLES may be comma- and/or space-separated lists, or specified\nmultiple times.'

    def main(self, args):
        """Print the style of each page, or the totals per style.

        args is the list of command-line arguments for this command.
        Returns 1 when parse_args() reports a non-zero result and 0
        otherwise.
        """
        if self.parse_args(args) != 0:
            return 1
        if 'help' in self.options:
            self.print_help()
            return 0

        # The options are the same for every file, so read them once.
        only = self.get_option_list('only')
        cept = self.get_option_list('except')
        show_totals = self.get_option_bool('totals')

        totals = {}

        for infile in self.iter_files():
            xml = self.get_xml(infile)
            # Fall back to the file name so a page without an id
            # attribute can still be printed below.
            thisid = xml.getroot().get('id') or infile.filename
            style = xml.getroot().get('style')
            if style is None:
                style = 'none'
            styles = style.split()

            # Skip the page if it doesn't meet the criteria.
            if only is not None:
                if len(only) == 0:
                    # We treat a blank --only as requesting pages with no style
                    if style != 'none':
                        continue
                elif not any(st in only for st in styles):
                    continue
            if cept is not None and any(st in cept for st in styles):
                continue

            if show_totals:
                # A page with several styles counts once for each.
                for st in styles:
                    totals[st] = totals.get(st, 0) + 1
            else:
                print(infile.sitedir + thisid + ': ' + style)

        if show_totals:
            for st in sorted(totals):
                print(st + ': ' + str(totals[st]))

        return 0


class CustomChecker(Checker):
    """Checker for a command defined in a 'check:NAME' config section.

    The section may provide desc and blurb for the help output and an
    assert XPath expression, optionally with select, message and
    xinclude settings, that is evaluated against each input file.
    """
    formats = ['docbook4', 'docbook5', 'mallard']
    arguments = [
        ('help', '-h', None, 'Show this help and exit'),
        ('site', '-s', None, 'Treat pages as belonging to a Mallard site'),
        ('skip', None, 'FILES', 'List of files to skip')
    ]

    def __init__(self, name, yelpcheck):
        """Create a checker for the custom command called name."""
        super().__init__(yelpcheck)
        self.name = name

    def main(self, args):
        """Run the custom command and return its exit code.

        Returns 1 when parse_args() reports a non-zero result, when
        there is no 'check:NAME' section in the configuration, or when
        that section has no assert expression. Returns 0 after printing
        help, and otherwise the result of run_assert().
        """
        if self.parse_args(args) != 0:
            return 1

        sect = 'check:' + self.name
        if self.config is None or (sect not in self.config.sections()):
            print('Unrecognized command: ' + self.name, file=sys.stderr)
            return 1
        self.desc = self.config.get(sect, 'desc', fallback=None)
        self.blurb = self.config.get(sect, 'blurb', fallback=None)
        if self.blurb is not None:
            self.blurb = '\n'.join(textwrap.wrap(self.blurb))

        # Help is handled only after the section lookup because the
        # help text comes from the configured desc and blurb.
        if 'help' in self.options:
            self.print_help()
            return 0

        assertexpr = self.config.get(sect, 'assert', fallback=None)
        if assertexpr is not None:
            return self.run_assert(assertexpr)

        print('No action found for command: ' + self.name, file=sys.stderr)
        return 1

    def run_assert(self, assertexpr):
        """Evaluate assertexpr on the selected nodes of every input file.

        The select expression (default '/') picks the nodes to test and
        assertexpr is evaluated relative to each of them. The configured
        message is printed for every node where the result is false.
        Prefixes for both expressions come from the 'namespaces' config
        section.

        Returns 0 if every assertion held. Returns 1 if any failed, or
        if an expression was invalid or select did not yield element
        nodes; those cases also print an error to standard error.
        """
        sect = 'check:' + self.name
        selectexpr = self.config.get(sect, 'select', fallback='/')
        message = self.config.get(sect, 'message', fallback='Assertion failed')
        # Any value other than the literal 'false' enables XInclude.
        self.xinclude = self.config.get(sect, 'xinclude', fallback='true') != 'false'

        namespaces = {}
        if 'namespaces' in self.config.sections():
            for ns in self.config.options('namespaces'):
                namespaces[ns] = self.config.get('namespaces', ns)

        retcode = 0
        for infile in self.iter_files():
            xml = self.get_xml(infile)
            thisid = xml.getroot().get('id') or infile.filename
            # NOTE(review): lxml may return an empty list when the
            # document node itself ('/') is selected, in which case the
            # default select would check nothing. Confirm against lxml.
            try:
                roots = xml.xpath(selectexpr, namespaces=namespaces)
            except lxml.etree.XPathError as err:
                print('Invalid select expression for command ' + self.name +
                      ': ' + str(err), file=sys.stderr)
                return 1
            # Non-node-set results (numbers, booleans, strings) are not
            # lists and cannot serve as context nodes for the assertion.
            if not isinstance(roots, list):
                print('Select expression for command ' + self.name +
                      ' must select element nodes', file=sys.stderr)
                return 1
            for root in roots:
                # Attribute and text results have no xpath() method.
                if not lxml.etree.iselement(root):
                    print('Select expression for command ' + self.name +
                          ' must select element nodes', file=sys.stderr)
                    return 1
                try:
                    passed = bool(root.xpath(assertexpr, namespaces=namespaces))
                except lxml.etree.XPathError as err:
                    print('Invalid assert expression for command ' + self.name +
                          ': ' + str(err), file=sys.stderr)
                    return 1
                if not passed:
                    print(infile.sitedir + thisid + ': ' + message)
                    retcode = 1
        return retcode


class YelpCheck:
    """Command-line front end that dispatches to the Checker subclasses."""

    def __init__(self):
        pass

    def main(self):
        """Run the command named by sys.argv[1] and return its exit code.

        With no command on the command line, print the usage summary
        and return 1. A name that matches no built-in checker is handed
        to CustomChecker.
        """
        if len(sys.argv) < 2:
            self.print_usage()
            return 1

        checker = None
        for cls in Checker.__subclasses__():
            if sys.argv[1] == cls.name:
                checker = cls(self)

        if checker is None:
            checker = CustomChecker(sys.argv[1], self)

        return checker.main(sys.argv[2:])

    def print_usage(self):
        """Print the usage line and the list of available commands.

        Built-in commands are grouped into check, report and other
        commands. Custom commands are read from the 'check:NAME'
        sections of .yelp-tools.cfg in the current directory. Only
        those that define an assert expression and do not share a name
        with a built-in command are listed.
        """
        print('Usage: yelp-check <COMMAND> [OPTIONS] [FILES]')
        namelen = 2
        checks = []
        reports = []
        others = []
        for cls in sorted(Checker.__subclasses__(), key=(lambda cls: cls.name or '')):
            if cls is CustomChecker:
                continue
            namelen = max(namelen, len(cls.name) + 2)
            if cls in (HrefsChecker, IdsChecker, LinksChecker,
                       MediaChecker, OrphansChecker, ValidateChecker):
                checks.append(cls)
            elif cls in (CommentsChecker, LicenseChecker, StatusChecker,
                         StyleChecker):
                reports.append(cls)
            else:
                others.append(cls)
        for title, classes in (('\nCheck commands:', checks),
                               ('\nReport commands:', reports),
                               ('\nOther commands:', others)):
            if len(classes) > 0:
                print(title)
                for cls in classes:
                    print('  ' + cls.name.ljust(namelen) + cls.desc)
        config = configparser.ConfigParser()
        try:
            config.read('.yelp-tools.cfg')
        except Exception:
            # Listing custom commands is best effort: an unreadable or
            # malformed config file just means none are shown. Catching
            # Exception rather than everything lets KeyboardInterrupt
            # and SystemExit propagate.
            return
        customs = []
        for sect in config.sections():
            if not sect.startswith('check:'):
                continue
            name = sect[6:]
            # Sections named after a built-in command configure that
            # command rather than defining a new one.
            if any(name == cls.name for cls in Checker.__subclasses__()):
                continue
            if config.get(sect, 'assert', fallback=None) is None:
                continue
            desc = config.get(sect, 'desc', fallback='')
            namelen = max(namelen, len(name) + 2)
            customs.append((name, desc))
        if len(customs) > 0:
            print('\nCustom commands:')
            for name, desc in customs:
                print('  ' + name.ljust(namelen) + desc)


if __name__ == '__main__':
    # Run the tool and pass the return value of main() to sys.exit().
    # A keyboard interrupt ends the program with exit status 1 instead
    # of a traceback.
    try:
        status = YelpCheck().main()
    except KeyboardInterrupt:
        status = 1
    sys.exit(status)