""" Creole wiki markup parser See http://wikicreole.org/ for latest specs. Notes: * No markup allowed in headings. Creole 1.0 does not require us to support this. * No markup allowed in table headings. Creole 1.0 does not require us to support this. * No (non-bracketed) generic url recognition: this is "mission impossible" except if you want to risk lots of false positives. Only known protocols are recognized. * We do not allow ":" before "//" italic markup to avoid urls with unrecognized schemes (like wtf://server/path) triggering italic rendering for the rest of the paragraph. :copyleft: 2008-2020 by python-creole team, see AUTHORS for more details. :license: GNU GPL v3 or above, see LICENSE for more details. """ import re from pprint import pformat from creole.parser.creol2html_rules import INLINE_FLAGS, INLINE_RULES, BlockRules, InlineRules, SpecialRules from creole.shared.document_tree import DocNode class CreoleParser(object): """ Parse the raw text and create a document object that can be converted into output using Emitter. """ # For pre escaping, in creole 1.0 done with ~: pre_escape_re = re.compile( SpecialRules.pre_escape, re.MULTILINE | re.VERBOSE | re.UNICODE ) # for link descriptions: link_re = re.compile( '|'.join([InlineRules.image, InlineRules.linebreak, InlineRules.char]), re.VERBOSE | re.UNICODE ) # for list items: item_re = re.compile( SpecialRules.item, re.VERBOSE | re.UNICODE | re.MULTILINE ) # for table cells: cell_re = re.compile(SpecialRules.cell, re.VERBOSE | re.UNICODE) # For inline elements: inline_re = re.compile('|'.join(INLINE_RULES), INLINE_FLAGS) def __init__(self, raw, block_rules=None, blog_line_breaks=True, debug=False): assert isinstance(raw, str) self.raw = raw if block_rules is None: block_rules = BlockRules(blog_line_breaks=blog_line_breaks) self.blog_line_breaks = blog_line_breaks self.debug = debug # TODO: use logging # setup block element rules: self.block_re = re.compile('|'.join(block_rules.rules), block_rules.re_flags) self.root = DocNode('document', None) self.cur = self.root # The most recent document node self.text = None # The node to add inline characters to self.last_text_break = None # Last break node, inserted by _text_repl() # Filled with all macros that's in the text self.root.used_macros = set() # -------------------------------------------------------------------------- def cleanup_break(self, old_cur): """ remove unused end line breaks. Should be called before a new block element. e.g.:

line one
line two
<--- remove this br-tag

""" if self.cur.children: last_child = self.cur.children[-1] if last_child.kind == "break": del(self.cur.children[-1]) def _upto(self, node, kinds): """ Look up the tree to the first occurence of one of the listed kinds of nodes or root. Start at the node node. """ self.cleanup_break(node) # remove unused end line breaks. while node.parent is not None and node.kind not in kinds: node = node.parent return node def _upto_block(self): self.cur = self._upto(self.cur, ('document',)) # 'section', 'blockquote')) # __________________________________________________________________________ # The _*_repl methods called for matches in regexps. Sometimes the # same method needs several names, because of group names in regexps. def _text_repl(self, groups): # print("_text_repl()", self.cur.kind) # self.debug_groups(groups) if self.cur.kind in ('table', 'table_row', 'bullet_list', 'number_list'): self._upto_block() if self.cur.kind in ('document', 'section', 'blockquote'): self.cur = DocNode('paragraph', self.cur) text = groups.get('text', "") if groups.get('space') and self.cur.children: # use wikipedia style line breaks and seperate a new line with one space text = " " + text self.parse_inline(text) if groups.get('break') and self.cur.kind in ('paragraph', 'emphasis', 'strong', 'pre_inline'): self.last_text_break = DocNode('break', self.cur, "") self.text = None _break_repl = _text_repl _space_repl = _text_repl def _url_repl(self, groups): """Handle raw urls in text.""" if not groups.get('escaped_url'): # this url is NOT escaped target = groups.get('url_target', "") node = DocNode('link', self.cur) node.content = target DocNode('text', node, node.content) self.text = None else: # this url is escaped, we render it as text if self.text is None: self.text = DocNode('text', self.cur, "") self.text.content += groups.get('url_target') _url_target_repl = _url_repl _url_proto_repl = _url_repl _escaped_url_repl = _url_repl def _link_repl(self, groups): """Handle all kinds of links.""" target = groups.get('link_target', "") text = (groups.get('link_text', "") or "").strip() parent = self.cur self.cur = DocNode('link', self.cur) self.cur.content = target self.text = None re.sub(self.link_re, self._replace, text) self.cur = parent self.text = None _link_target_repl = _link_repl _link_text_repl = _link_repl # -------------------------------------------------------------------------- def _add_macro(self, groups, macro_type, name_key, args_key, text_key=None): """ generic method to handle the macro, used for all variants: inline, inline-tag, block """ # self.debug_groups(groups) assert macro_type in ("macro_inline", "macro_block") if text_key: macro_text = groups.get(text_key, "").strip() else: macro_text = None node = DocNode(macro_type, self.cur, macro_text) macro_name = groups[name_key] node.macro_name = macro_name self.root.used_macros.add(macro_name) node.macro_args = groups.get(args_key, "").strip() self.text = None def _macro_block_repl(self, groups): """ block macro, e.g: <> some lines <> """ self._upto_block() self.cur = self.root self._add_macro( groups, macro_type="macro_block", name_key="macro_block_start", args_key="macro_block_args", text_key="macro_block_text", ) _macro_block_start_repl = _macro_block_repl _macro_block_args_repl = _macro_block_repl _macro_block_text_repl = _macro_block_repl def _macro_tag_repl(self, groups): """ A single macro tag, e.g.: <> or <> """ self._add_macro( groups, macro_type="macro_inline", name_key="macro_tag_name", args_key="macro_tag_args", text_key=None, ) _macro_tag_name_repl = _macro_tag_repl _macro_tag_args_repl = _macro_tag_repl def _macro_inline_repl(self, groups): """ inline macro tag with data, e.g.: <>text<> """ self._add_macro( groups, macro_type="macro_inline", name_key="macro_inline_start", args_key="macro_inline_args", text_key="macro_inline_text", ) _macro_inline_start_repl = _macro_inline_repl _macro_inline_args_repl = _macro_inline_repl _macro_inline_text_repl = _macro_inline_repl # -------------------------------------------------------------------------- def _image_repl(self, groups): """Handles images and attachemnts included in the page.""" target = groups.get('image_target', "").strip() text = (groups.get('image_text', "") or "").strip() node = DocNode("image", self.cur, target) DocNode('text', node, text or node.content) self.text = None _image_target_repl = _image_repl _image_text_repl = _image_repl def _separator_repl(self, groups): self._upto_block() DocNode('separator', self.cur) def _item_repl(self, groups): """ List item """ bullet = groups.get('item_head', "") text = groups.get('item_text', "") if bullet[-1] == '#': kind = 'number_list' else: kind = 'bullet_list' level = len(bullet) - 1 lst = self.cur # Find a list of the same kind and level up the tree while ( lst and not ( lst.kind in ( 'number_list', 'bullet_list') and lst.level == level) and lst.kind not in ( 'document', 'section', 'blockquote')): lst = lst.parent if lst and lst.kind == kind: self.cur = lst else: # Create a new level of list self.cur = self._upto(self.cur, ('list_item', 'document', 'section', 'blockquote')) self.cur = DocNode(kind, self.cur) self.cur.level = level self.cur = DocNode('list_item', self.cur) self.cur.level = level + 1 self.parse_inline(text) self.text = None _item_text_repl = _item_repl _item_head_repl = _item_repl def _list_repl(self, groups): """ complete list """ self.item_re.sub(self._replace, groups["list"]) def _head_repl(self, groups): self._upto_block() node = DocNode('header', self.cur, groups['head_text'].strip()) node.level = len(groups['head_head']) self.text = None _head_head_repl = _head_repl _head_text_repl = _head_repl def _table_repl(self, groups): row = groups.get('table', '|').strip() self.cur = self._upto(self.cur, ( 'table', 'document', 'section', 'blockquote')) if self.cur.kind != 'table': self.cur = DocNode('table', self.cur) tb = self.cur tr = DocNode('table_row', tb) for m in self.cell_re.finditer(row): cell = m.group('cell') if cell: text = cell.strip() self.cur = DocNode('table_cell', tr) self.text = None else: text = m.group('head').strip('= ') self.cur = DocNode('table_head', tr) self.text = DocNode('text', self.cur, "") self.parse_inline(text) self.cur = tb self.text = None def _pre_block_repl(self, groups): self._upto_block() kind = groups.get('pre_block_kind', None) text = groups.get('pre_block_text', "") def remove_tilde(m): return m.group('indent') + m.group('rest') text = self.pre_escape_re.sub(remove_tilde, text) node = DocNode('pre_block', self.cur, text) node.sect = kind or '' self.text = None _pre_block_text_repl = _pre_block_repl _pre_block_head_repl = _pre_block_repl _pre_block_kind_repl = _pre_block_repl def _line_repl(self, groups): """ Transfer newline from the original markup into the html code """ self._upto_block() DocNode('line', self.cur, "") def _pre_inline_repl(self, groups): text = groups.get('pre_inline_text', "") DocNode('pre_inline', self.cur, text) self.text = None _pre_inline_text_repl = _pre_inline_repl _pre_inline_head_repl = _pre_inline_repl # -------------------------------------------------------------------------- def _inline_mark(self, groups, key): self.cur = DocNode(key, self.cur) self.text = None text = groups[f"{key}_text"] self.parse_inline(text) self.cur = self._upto(self.cur, (key,)).parent self.text = None # TODO: How can we generalize that: def _emphasis_repl(self, groups): self._inline_mark(groups, key='emphasis') _emphasis_text_repl = _emphasis_repl def _strong_repl(self, groups): self._inline_mark(groups, key='strong') _strong_text_repl = _strong_repl def _monospace_repl(self, groups): self._inline_mark(groups, key='monospace') _monospace_text_repl = _monospace_repl def _superscript_repl(self, groups): self._inline_mark(groups, key='superscript') _superscript_text_repl = _superscript_repl def _subscript_repl(self, groups): self._inline_mark(groups, key='subscript') _subscript_text_repl = _subscript_repl def _underline_repl(self, groups): self._inline_mark(groups, key='underline') _underline_text_repl = _underline_repl def _small_repl(self, groups): self._inline_mark(groups, key='small') _small_text_repl = _small_repl def _delete_repl(self, groups): self._inline_mark(groups, key='delete') _delete_text_repl = _delete_repl # -------------------------------------------------------------------------- def _linebreak_repl(self, groups): DocNode('break', self.cur, None) self.text = None def _escape_repl(self, groups): if self.text is None: self.text = DocNode('text', self.cur, "") self.text.content += groups.get('escaped_char', "") _escaped_char_repl = _escape_repl def _char_repl(self, groups): if self.text is None: self.text = DocNode('text', self.cur, "") self.text.content += groups.get('char', "") # -------------------------------------------------------------------------- def _replace(self, match): """Invoke appropriate _*_repl method. Called for every matched group.""" def debug(groups): data = dict([ group for group in groups.items() if group[1] is not None ]) print(pformat(data)) groups = match.groupdict() for name, text in groups.items(): if text is not None: if self.debug and name != "char": # TODO: use logging debug(groups) replace_method = getattr(self, f'_{name}_repl') replace_method(groups) return def parse_inline(self, raw): """Recognize inline elements inside blocks.""" re.sub(self.inline_re, self._replace, raw) def parse_block(self, raw): """Recognize block elements.""" re.sub(self.block_re, self._replace, raw) def parse(self): """Parse the text given as self.raw and return DOM tree.""" # convert all lineendings to \n text = self.raw.replace("\r\n", "\n").replace("\r", "\n") if self.debug: # TODO: use logging print(repr(text)) self.parse_block(text) return self.root # -------------------------------------------------------------------------- def debug_tree(self, start_node=None): """ Display the current document tree """ print("_" * 80) if start_node is None: start_node = self.root print(" document tree:") else: print(f" tree from {start_node}:") print("=" * 80) def emit(node, ident=0): for child in node.children: print("%s%s: %r" % (" " * ident, child.kind, child.content)) emit(child, ident + 4) emit(start_node) print("*" * 80) def debug_groups(self, groups): print("_" * 80) print(" debug groups:") for name, text in groups.items(): if text is not None: print("%15s: %r" % (name, text)) print("-" * 80) if __name__ == "__main__": import doctest print(doctest.testmod()) print("-" * 80) txt = """A <>bar1<> in a line...""" print(txt) print("-" * 80) blog_line_breaks = False p = CreoleParser(txt, blog_line_breaks=blog_line_breaks) document = p.parse() p.debug() def display_match(match): groups = match.groupdict() for name, text in groups.items(): if name != "char" and text is not None: print("%20s: %r" % (name, text)) parser = CreoleParser("", blog_line_breaks=blog_line_breaks) print("_" * 80) print("merged block rules test:") re.sub(parser.block_re, display_match, txt) print("_" * 80) print("merged inline rules test:") re.sub(parser.inline_re, display_match, txt) def test_single(rules, flags, txt): for rule in rules: rexp = re.compile(rule, flags) rexp.sub(display_match, txt) print("_" * 80) print("single block rules match test:") block_rules = BlockRules() test_single(block_rules.rules, block_rules.re_flags, txt) print("_" * 80) print("single inline rules match test:") test_single(INLINE_RULES, INLINE_FLAGS, txt) print("---END---")