diff options
Diffstat (limited to 'creole/parser/creol2html_rules.py')
-rw-r--r-- | creole/parser/creol2html_rules.py | 215 |
1 files changed, 111 insertions, 104 deletions
diff --git a/creole/parser/creol2html_rules.py b/creole/parser/creol2html_rules.py index 01bb08e..b507757 100644 --- a/creole/parser/creol2html_rules.py +++ b/creole/parser/creol2html_rules.py @@ -9,7 +9,7 @@ :license: GNU GPL v3 or above, see LICENSE for more details. """ -from __future__ import division, absolute_import, print_function, unicode_literals + import re @@ -18,217 +18,223 @@ class InlineRules(object): """ All inline rules """ - proto = r'http|https|ftp|nntp|news|mailto|telnet|file|irc' + + proto = r"http|https|ftp|nntp|news|mailto|telnet|file|irc" # New regex for finding uris, requires uri to free stand within whitespace or lineends. - url = r'''(?P<url> + url = ( + r"""(?P<url> (^ | (?<=\s)) (?P<escaped_url>~)? (?P<url_target> (?P<url_proto> %s )://[^$\s]+ ) - )''' % proto + )""" + % proto + ) # Original uri matching regex inherited from MoinMoin code. - #url = r'''(?P<url> - #(^ | (?<=\s | [.,:;!?()/=])) - #(?P<escaped_url>~)? - #(?P<url_target> (?P<url_proto> %s ):\S+? ) - #($ | (?=\s | [,.:;!?()] (\s | $))) - #)''' % proto - link = r'''(?P<link> + # url = r'''(?P<url> + # (^ | (?<=\s | [.,:;!?()/=])) + # (?P<escaped_url>~)? + # (?P<url_target> (?P<url_proto> %s ):\S+? ) + # ($ | (?=\s | [,.:;!?()] (\s | $))) + # )''' % proto + link = r"""(?P<link> \[\[ (?P<link_target>.+?) \s* ([|] \s* (?P<link_text>.+?) \s*)? ]] - )''' - -# link = r'''(?P<link1> -# \[\[ -# (?P<link_target1>.+?)\|(?P<link_text1>.+?) -# ]] -# )|(?P<link2> -# \[\[ -# (?P<link_target2> (%s)://[^ ]+) \s* (?P<link_text2>.+?) -# ]] -# )| -# \[\[(?P<internal_link>.+)\]\] -# ''' % proto + )""" + + # link = r'''(?P<link1> + # \[\[ + # (?P<link_target1>.+?)\|(?P<link_text1>.+?) + # ]] + # )|(?P<link2> + # \[\[ + # (?P<link_target2> (%s)://[^ ]+) \s* (?P<link_text2>.+?) + # ]] + # )| + # \[\[(?P<internal_link>.+)\]\] + # ''' % proto # image tag - image = r'''(?P<image> + image = r"""(?P<image> {{ (?P<image_target>.+?) \s* (\| \s* (?P<image_text>.+?) \s*)? }} - )(?i)''' - #-------------------------------------------------------------------------- + )(?i)""" + # -------------------------------------------------------------------------- # a macro like: <<macro>>text<</macro>> - macro_inline = r''' + macro_inline = r""" (?P<macro_inline> << \s* (?P<macro_inline_start>\w+) \s* (?P<macro_inline_args>.*?) \s* >> (?P<macro_inline_text>(.|\n)*?) <</ \s* (?P=macro_inline_start) \s* >> ) - ''' + """ # A single macro tag, like <<macro-a foo="bar">> or <<macro />> - macro_tag = r'''(?P<macro_tag> + macro_tag = r"""(?P<macro_tag> <<(?P<macro_tag_name> \w+) (?P<macro_tag_args>.*?) \s* /*>> - )''' + )""" - pre_inline = r'(?P<pre_inline> {{{ (?P<pre_inline_text>.*?) }}} )' + pre_inline = r"(?P<pre_inline> {{{ (?P<pre_inline_text>.*?) }}} )" # Basic text typefaces: - emphasis = r'(?P<emphasis>(?<!:)// (?P<emphasis_text>.+?) (?<!:)// )' + emphasis = r"(?P<emphasis>(?<!:)// (?P<emphasis_text>.+?) (?<!:)// )" # there must be no : in front of the // avoids italic rendering # in urls with unknown protocols - strong = r'(?P<strong>\*\* (?P<strong_text>.+?) \*\* )' + strong = r"(?P<strong>\*\* (?P<strong_text>.+?) \*\* )" # Creole 1.0 optional: - monospace = r'(?P<monospace> \#\# (?P<monospace_text>.+?) \#\# )' - superscript = r'(?P<superscript> \^\^ (?P<superscript_text>.+?) \^\^ )' - subscript = r'(?P<subscript> ,, (?P<subscript_text>.+?) ,, )' - underline = r'(?P<underline> __ (?P<underline_text>.+?) __ )' - delete = r'(?P<delete> ~~ (?P<delete_text>.+?) ~~ )' + monospace = r"(?P<monospace> \#\# (?P<monospace_text>.+?) \#\# )" + superscript = r"(?P<superscript> \^\^ (?P<superscript_text>.+?) \^\^ )" + subscript = r"(?P<subscript> ,, (?P<subscript_text>.+?) ,, )" + underline = r"(?P<underline> __ (?P<underline_text>.+?) __ )" + delete = r"(?P<delete> ~~ (?P<delete_text>.+?) ~~ )" # own additions: - small = r'(?P<small>-- (?P<small_text>.+?) -- )' - - linebreak = r'(?P<linebreak> \\\\ )' - escape = r'(?P<escape> ~ (?P<escaped_char>\S) )' - char = r'(?P<char> . )' - - - + small = r"(?P<small>-- (?P<small_text>.+?) -- )" + linebreak = r"(?P<linebreak> \\\\ )" + escape = r"(?P<escape> ~ (?P<escaped_char>\S) )" + char = r"(?P<char> . )" class BlockRules(object): """ All used block rules. """ -# macro_block = r'''(?P<macro_block> -# \s* << (?P<macro_block_start>\w+) \s* (?P<macro_block_args>.*?) >> -# (?P<macro_block_text>(.|\n)+?) -# <</(?P=macro_block_start)>> \s* -# )''' -# macro_block = r'''(?P<macro_block> -# <<(?P<macro_block_start>.*?)>> -# (?P<macro_block_text>.*?) -# <</.*?>> -# )''' - - macro_block = r''' + + # macro_block = r'''(?P<macro_block> + # \s* << (?P<macro_block_start>\w+) \s* (?P<macro_block_args>.*?) >> + # (?P<macro_block_text>(.|\n)+?) + # <</(?P=macro_block_start)>> \s* + # )''' + # macro_block = r'''(?P<macro_block> + # <<(?P<macro_block_start>.*?)>> + # (?P<macro_block_text>.*?) + # <</.*?>> + # )''' + + macro_block = r""" (?P<macro_block> << \s* (?P<macro_block_start>\w+) \s* (?P<macro_block_args>.*?) \s* >> (?P<macro_block_text>(.|\n)*?) <</ \s* (?P=macro_block_start) \s* >> ) - ''' + """ - line = r'''(?P<line> ^\s*$ )''' # empty line that separates paragraphs + line = r"""(?P<line> ^\s*$ )""" # empty line that separates paragraphs - head = r'''(?P<head> + head = r"""(?P<head> ^ (?P<head_head>=+) \s* (?P<head_text> .*? ) (=|\s)*?$ - )''' - separator = r'(?P<separator> ^ \s* ---- \s* $ )' # horizontal line + )""" + separator = r"(?P<separator> ^ \s* ---- \s* $ )" # horizontal line - pre_block = r'''(?P<pre_block> + pre_block = r"""(?P<pre_block> ^{{{ \s* $ (?P<pre_block_text> ([\#]!(?P<pre_block_kind>\w*?)(\s+.*)?$)? (.|\n)+? ) ^}}}) - ''' + """ # Matches the whole list, separate items are parsed later. The # list *must* start with a single bullet. - list = r'''(?P<list> + list = r"""(?P<list> ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $ ( \n[ \t]* [*\#]+.* $ )* - )''' + )""" - table = r'''^ \s*(?P<table> + table = r"""^ \s*(?P<table> [|].*? \s* [|]? - ) \s* $''' + ) \s* $""" re_flags = re.VERBOSE | re.UNICODE | re.MULTILINE def __init__(self, blog_line_breaks=True): if blog_line_breaks: - # use blog style line breaks (every line break would be converted into <br />) - self.text = r'(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?' + # use blog style line breaks (every line break would be converted into <br />) + self.text = r"(?P<text> .+ ) (?P<break> (?<!\\)$\n(?!\s*$) )?" else: # use wiki style line breaks, seperate lines with one space - self.text = r'(?P<space> (?<!\\)$\n(?!\s*$) )? (?P<text> .+ )' + self.text = r"(?P<space> (?<!\\)$\n(?!\s*$) )? (?P<text> .+ )" self.rules = ( self.macro_block, - self.line, self.head, self.separator, - self.pre_block, self.list, - self.table, self.text, + self.line, + self.head, + self.separator, + self.pre_block, + self.list, + self.table, + self.text, ) - - - class SpecialRules(object): """ re rules witch not directly used as inline/block rules. """ + # Matches single list items: - item = r'''^ \s* (?P<item> + item = r"""^ \s* (?P<item> (?P<item_head> [\#*]+) \s* (?P<item_text> .*?) - ) \s* $''' + ) \s* $""" # For splitting table cells: - cell = r''' + cell = r""" \| \s* ( (?P<head> [=][^|]+ ) | (?P<cell> ( %s | [^|])+ ) ) \s* - ''' % '|'.join([ - InlineRules.link, - InlineRules.macro_inline, InlineRules.macro_tag, - InlineRules.image, - InlineRules.pre_inline - ]) + """ % "|".join( + [InlineRules.link, InlineRules.macro_inline, InlineRules.macro_tag, InlineRules.image, InlineRules.pre_inline] + ) # For pre escaping, in creole 1.0 done with ~: - pre_escape = r' ^(?P<indent>\s*) ~ (?P<rest> \}\}\} \s*) $' + pre_escape = r" ^(?P<indent>\s*) ~ (?P<rest> \}\}\} \s*) $" INLINE_FLAGS = re.VERBOSE | re.UNICODE INLINE_RULES = ( - InlineRules.link, InlineRules.url, - InlineRules.macro_inline, InlineRules.macro_tag, - InlineRules.pre_inline, InlineRules.image, - - InlineRules.strong, InlineRules.emphasis, - InlineRules.monospace, InlineRules.underline, - InlineRules.superscript, InlineRules.subscript, - InlineRules.small, InlineRules.delete, - + InlineRules.link, + InlineRules.url, + InlineRules.macro_inline, + InlineRules.macro_tag, + InlineRules.pre_inline, + InlineRules.image, + InlineRules.strong, + InlineRules.emphasis, + InlineRules.monospace, + InlineRules.underline, + InlineRules.superscript, + InlineRules.subscript, + InlineRules.small, + InlineRules.delete, InlineRules.linebreak, - InlineRules.escape, InlineRules.char + InlineRules.escape, + InlineRules.char, ) def _verify_rules(rules, flags): """ Simple verify the rules -> try to compile it ;) - + >>> _verify_rules(INLINE_RULES, INLINE_FLAGS) Rule test ok. - - >>> block_rules = BlockRules() + + >>> block_rules = BlockRules() >>> _verify_rules(block_rules.rules, block_rules.re_flags) Rule test ok. """ @@ -236,12 +242,12 @@ def _verify_rules(rules, flags): rule_list = [] for rule in rules: try: -# print(rule) + # print(rule) re.compile(rule, flags) # Try to merge the rules. e.g. Check if group named double used. rule_list.append(rule) - re.compile('|'.join(rule_list), flags) + re.compile("|".join(rule_list), flags) except Exception as err: print(" *** Error with rule:") print(rule) @@ -252,6 +258,7 @@ def _verify_rules(rules, flags): if __name__ == "__main__": import doctest + print(doctest.testmod()) print("-" * 80) |