""" python-creole ~~~~~~~~~~~~~ :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ import re import warnings from html.parser import HTMLParser from creole.html_tools.strip_html import strip_html from creole.parser.html_parser_config import BLOCK_TAGS, IGNORE_TAGS from creole.shared.document_tree import DebugList, DocNode # ------------------------------------------------------------------------------ block_re = re.compile(r''' ^
 \s* $
    (?P
        (\n|.)*?
    )
    ^
\s* $ [\s\n]* ''', re.VERBOSE | re.UNICODE | re.MULTILINE) inline_re = re.compile(r'''
    (?P
        (\n|.)*?
    )
    
''', re.VERBOSE | re.UNICODE) headline_tag_re = re.compile(r"h(\d)", re.UNICODE) # ------------------------------------------------------------------------------ class HtmlParser(HTMLParser): """ parse html code and create a document tree. >>> p = HtmlParser() >>> p.feed("

html code

") >>> p.debug() ________________________________________________________________________________ document tree: ================================================================================ p data: 'html ' strong data: 'code' ******************************************************************************** >>> p = HtmlParser() >>> p.feed("

html1 html2

") >>> p.debug() ________________________________________________________________________________ document tree: ================================================================================ p data: 'html1 ' script data: "var foo='BAR';" data: ' html2' ******************************************************************************** """ # placeholder html tag for pre cutout areas: _block_placeholder = "blockdata" _inline_placeholder = "inlinedata" def __init__(self, debug=False): super(HtmlParser, self).__init__(convert_charrefs=False) self.debugging = debug if self.debugging: warnings.warn( message="Html2Creole debug is on! warn every data append." ) self.result = DebugList(self) else: self.result = [] self.blockdata = [] self.root = DocNode("document", None) self.cur = self.root self.__list_level = 0 def _pre_cut(self, data, type, placeholder): if self.debugging: print(f"append blockdata: {data!r}") assert isinstance(data, str), "blockdata is not unicode" self.blockdata.append(data) id = len(self.blockdata) - 1 return f'<{placeholder} type="{type}" id="{id}" />' def _pre_pre_inline_cut(self, groups): return self._pre_cut(groups["pre_inline"], "pre", self._inline_placeholder) def _pre_pre_block_cut(self, groups): return self._pre_cut(groups["pre_block"], "pre", self._block_placeholder) def _pre_pass_block_cut(self, groups): content = groups["pass_block"].strip() return self._pre_cut(content, "pass", self._block_placeholder) _pre_pass_block_start_cut = _pre_pass_block_cut def _pre_cut_out(self, match): groups = match.groupdict() for name, text in groups.items(): if text is not None: if self.debugging: print("%15s: %r (%r)" % (name, text, match.group(0))) method = getattr(self, f'_pre_{name}_cut') return method(groups) # data = match.group("data") def feed(self, raw_data): assert isinstance(raw_data, str), "feed data must be unicode!" data = raw_data.strip() # cut out
 and  areas block tag areas
        data = block_re.sub(self._pre_cut_out, data)
        data = inline_re.sub(self._pre_cut_out, data)

        # Delete whitespace from html code
        data = strip_html(data)

        if self.debugging:
            print("_" * 79)
            print("raw data:")
            print(repr(raw_data))
            print(" -" * 40)
            print("cleaned data:")
            print(data)
            print("-" * 79)
#            print(clean_data.replace(">", ">\n"))
#            print("-"*79)

        HTMLParser.feed(self, data)

        return self.root

    # -------------------------------------------------------------------------

    def _upto(self, node, kinds):
        """
        Look up the tree to the first occurence
        of one of the listed kinds of nodes or root.
        Start at the node node.
        """
        while node is not None and node.parent is not None:
            node = node.parent
            if node.kind in kinds:
                break

        return node

    def _go_up(self):
        """
        Set ``self.cur`` to the closest ancestor whose kind is listed in
        ``BLOCK_TAGS`` or is ``"document"``, as found by ``_upto``.
        """
        stop_kinds = [*BLOCK_TAGS, "document"]
        self.cur = self._upto(self.cur, stop_kinds)
        self.debug_msg("go up to", self.cur)

    # -------------------------------------------------------------------------

    def handle_starttag(self, tag, attrs):
        """
        Create a tree node for an opening tag.

        Tags listed in ``IGNORE_TAGS`` are skipped. Headline tags become a
        ``"headline"`` node carrying the numeric level. ``ul``/``ol``/``li``
        nodes carry the current list nesting level. ``img`` and ``br`` are
        attached as children without becoming the current node; every other
        tag becomes the new current node.
        """
        self.debug_msg("starttag", f"{tag!r} atts: {attrs}")

        if tag in IGNORE_TAGS:
            return

        headline_match = headline_tag_re.match(tag)
        if headline_match is not None:
            level = int(headline_match.group(1))
            self.cur = DocNode("headline", self.cur, level=level)
            return

        if tag in ("img", "br"):
            # These tags may arrive as plain start tags instead of
            # start-end tags; attach the node but do not descend into it.
            DocNode(tag, self.cur, None, attrs)
            return

        if tag in ("ul", "ol"):
            # Opening a list increases the nesting depth.
            self.__list_level += 1

        if tag in ("li", "ul", "ol"):
            self.cur = DocNode(
                tag, self.cur, None, attrs, level=self.__list_level
            )
        else:
            self.cur = DocNode(tag, self.cur, None, attrs)

    def handle_data(self, data):
        """Attach a ``"data"`` node holding the text *data* to the current node."""
        self.debug_msg("data", repr(data))
        assert isinstance(data, str)
        DocNode("data", self.cur, content=data)

    def handle_charref(self, name):
        """Attach a ``"charref"`` node with *name* as content to the current node."""
        self.debug_msg("charref", repr(name))
        DocNode("charref", self.cur, content=name)

    def handle_entityref(self, name):
        """Attach an ``"entityref"`` node with *name* as content to the current node."""
        self.debug_msg("entityref", repr(name))
        DocNode("entityref", self.cur, content=name)

    def handle_startendtag(self, tag, attrs):
        """
        Create a tree node for a self-closing tag.

        Placeholder tags (``_block_placeholder`` / ``_inline_placeholder``)
        are turned into a ``"<tag>_<type>"`` node whose content is the entry
        of ``self.blockdata`` selected by the tag's ``id`` attribute. Any
        other tag is attached as an ordinary child node with its attributes.
        The current node is never changed.
        """
        self.debug_msg("startendtag", f"{tag!r} atts: {attrs}")
        attr_dict = dict(attrs)

        placeholders = (self._block_placeholder, self._inline_placeholder)
        if tag not in placeholders:
            DocNode(tag, self.cur, None, attrs)
            return

        block_index = int(attr_dict["id"])
        node_kind = f"{tag}_{attr_dict['type']}"
        DocNode(node_kind, self.cur, content=self.blockdata[block_index])

    def handle_endtag(self, tag):
        """
        Handle a closing tag by moving the current node back up the tree.

        End tags listed in ``IGNORE_TAGS`` are skipped. ``img`` and ``br``
        end tags are ignored as well, because ``handle_starttag`` never makes
        those nodes the current node. Closing ``ul``/``ol`` decreases the
        list nesting level. Tags listed in ``BLOCK_TAGS`` climb via
        ``_go_up``; all other tags step up exactly one level.
        """
        if tag in IGNORE_TAGS:
            return

        self.debug_msg("endtag", f"{tag!r}")

        # "img" and "br" are attached in handle_starttag without descending
        # into them, so an explicit end tag (e.g. "<img ...></img>") must not
        # pop the enclosing element.
        if tag in ("img", "br"):
            return

        self.debug_msg("starttag", "%r" % self.get_starttag_text())

        if tag in ("ul", "ol"):
            self.__list_level -= 1

        if tag in BLOCK_TAGS or self.cur is None:
            # _go_up passes a None current node through unchanged.
            self._go_up()
        else:
            self.cur = self.cur.parent

    # -------------------------------------------------------------------------

    def debug_msg(self, method, txt):
        if not self.debugging:
            return
        print("%-8s %8s: %s" % (self.getpos(), method, txt))

    def debug(self, start_node=None):
        """
        Display the current document tree
        """
        print("_" * 80)

        if start_node is None:
            start_node = self.root
            print("  document tree:")
        else:
            print(f"  tree from {start_node}:")

        print("=" * 80)

        def emit(node, ident=0):
            for child in node.children:
                txt = "%s%s" % (" " * ident, child.kind)

                if child.content:
                    txt += f": {child.content!r}"

                if child.attrs:
                    txt += f" - attrs: {child.attrs!r}"

                if child.level is not None:
                    txt += f" - level: {child.level!r}"

                print(txt)
                emit(child, ident + 4)
        emit(start_node)
        print("*" * 80)


if __name__ == '__main__':
    # Run the doctests embedded in this module and print the summary.
    from doctest import testmod
    print(testmod())

#    p = HtmlParser(debug=True)
#    p.feed("""\
# in span
# in code
# """)
#    p.debug()