diff options
author | Jens Diemer <github.com@jensdiemer.de> | 2009-02-25 16:19:57 +0000 |
---|---|---|
committer | Jens Diemer <github.com@jensdiemer.de> | 2009-02-25 16:19:57 +0000 |
commit | 04d5a18abfb268b1c9267e61aa291c537a0c84e2 (patch) | |
tree | 2c1b87291d8d8e2f635ced0a40619e9cfb7a8472 /creole | |
parent | d58cfa6b1f68be9cfe398070f1ae74e906cf7c04 (diff) | |
download | creole-04d5a18abfb268b1c9267e61aa291c537a0c84e2.tar.gz |
* distinguish between inline and block macro/preformatted areas; fix for issue 1: http://code.google.com/p/python-creole/issues/detail?id=1
* add a test macro
* add different handlers for unknown tags in html2creole (raise, wrap in the <<html>> macro, or escape)
* add unittests
Diffstat (limited to 'creole')
-rw-r--r-- | creole/__init__.py | 7 | ||||
-rw-r--r-- | creole/creole.py | 187 | ||||
-rw-r--r-- | creole/creole2html.py | 14 | ||||
-rw-r--r-- | creole/default_macros.py | 10 | ||||
-rw-r--r-- | creole/html2creole.py | 165 |
5 files changed, 256 insertions, 127 deletions
diff --git a/creole/__init__.py b/creole/__init__.py index 8014515..9e3fed0 100644 --- a/creole/__init__.py +++ b/creole/__init__.py @@ -8,14 +8,13 @@ from html2creole import Html2CreoleParser, Html2CreoleEmitter -def creole2html(markup_string, **kwargs): +def creole2html(markup_string, debug=False, **kwargs): """ convert creole markup into html code >>> creole2html(u'This is **creole //markup//**!') u'<p>This is <strong>creole <i>markup</i></strong>!</p>\\n' """ - debug = kwargs.pop("debug", False) # Create document tree from creole markup document = Parser(markup_string).parse() if debug: @@ -26,7 +25,7 @@ def creole2html(markup_string, **kwargs): -def html2creole(html_string, debug=False): +def html2creole(html_string, debug=False, **kwargs): """ convert html code into creole markup @@ -40,7 +39,7 @@ def html2creole(html_string, debug=False): h2c.debug() # create creole markup from the document tree - emitter = Html2CreoleEmitter(document_tree, debug) + emitter = Html2CreoleEmitter(document_tree, debug=debug, **kwargs) return emitter.emit() diff --git a/creole/creole.py b/creole/creole.py index 02eb656..33c5011 100644 --- a/creole/creole.py +++ b/creole/creole.py @@ -80,19 +80,20 @@ class InlineRules: )(?i)''' #-------------------------------------------------------------------------- - macro = r'''(?P<macro> - <<(?P<macro_name> \w+) (?P<macro_args>.*?)>> - )''' - + # a macro like: <<macro>>text<</macro>> inline_macro = r''' (?P<inline_macro> - << \s* (?P<inline_macro_start>\w+) \s* (?P<inline_macro_args>.*?) \s* >> - (?P<inline_macro_text>(.|\n)*) - <</ \s* (?P=inline_macro_start) \s* >> + << \s* (?P<macro_inline_start>\w+) \s* (?P<macro_inline_args>.*?) \s* >> + (?P<macro_inline_text>(.|\n)*?) + <</ \s* (?P=macro_inline_start) \s* >> ) ''' + # A single macro tag, like <<macro-a foo="bar">> or <<macro />> + macro_tag = r'''(?P<macro_tag> + <<(?P<macro_tag_name> \w+) (?P<macro_tag_args>.*?) 
\s* /*>> + )''' - preformatted = r'(?P<preformatted> {{{ (?P<preformatted_text>.*?) }}} )' + pre_inline = r'(?P<pre_inline> {{{ (?P<pre_inline_text>.*?) }}} )' # Basic text typefaces: emph = r'(?P<emph> (?<!:)// )' # there must be no : in front of the // @@ -164,7 +165,7 @@ class BlockRules: macro_block = r''' (?P<macro_block> << \s* (?P<macro_block_start>\w+) \s* (?P<macro_block_args>.*?) \s* >> - (?P<macro_block_text>(.|\n)*) + (?P<macro_block_text>(.|\n)*?) <</ \s* (?P=macro_block_start) \s* >> ) ''' @@ -183,16 +184,15 @@ class BlockRules: =*$ )''' separator = r'(?P<separator> ^ \s* ---- \s* $ )' # horizontal line - pre = r'''(?P<pre> + + pre_block = r'''(?P<pre_block> ^{{{ \s* $ - (\n)? - (?P<pre_text> - ([\#]!(?P<pre_kind>\w*?)(\s+.*)?$)? + (?P<pre_block_text> + ([\#]!(?P<pre_block_kind>\w*?)(\s+.*)?$)? (.|\n)+? ) - (\n)? - ^}}} \s*$ - )''' + ^}}}) + ''' list = r'''(?P<list> ^ [ \t]* ([*][^*\#]|[\#][^\#*]).* $ ( \n[ \t]* [*\#]+.* $ )* @@ -227,8 +227,10 @@ class SpecialRules: (?P<cell> ( %s | [^|])+ ) ) \s* ''' % '|'.join([ - InlineRules.link, InlineRules.macro, InlineRules.image, - InlineRules.preformatted + InlineRules.link, + InlineRules.inline_macro, InlineRules.macro_tag, + InlineRules.image, + InlineRules.pre_inline ]) # For pre escaping, in creole 1.0 done with ~: @@ -242,16 +244,16 @@ BLOCK_RULES = ( BlockRules.pass_line, BlockRules.macro_block, BlockRules.html, - BlockRules.line, BlockRules.head, BlockRules.separator, BlockRules.pre, BlockRules.list, + BlockRules.line, BlockRules.head, BlockRules.separator, + BlockRules.pre_block, BlockRules.list, BlockRules.table, BlockRules.text, ) INLINE_FLAGS = re.VERBOSE | re.UNICODE INLINE_RULES = ( InlineRules.link, InlineRules.url, - InlineRules.macro, - InlineRules.inline_macro, - InlineRules.preformatted, InlineRules.image, + InlineRules.inline_macro, InlineRules.macro_tag, + InlineRules.pre_inline, InlineRules.image, InlineRules.pass_inline, InlineRules.strong, InlineRules.emph, @@ -405,7 +407,7 @@ class 
Parser: self.parse_inline(groups.get('text', u"")) if groups.get('break') and self.cur.kind in ('paragraph', - 'emphasis', 'strong', 'preformatted'): + 'emphasis', 'strong', 'pre_inline'): self.last_text_break = DocNode('break', self.cur, u"") self.text = None @@ -442,44 +444,80 @@ class Parser: self.text = None _link_target_repl = _link_repl _link_text_repl = _link_repl + + #-------------------------------------------------------------------------- + + def _add_macro(self, groups, macro_type, name_key, args_key, text_key=None): + """ + generic mathod to handle the macro, used for all variants: + inline, inline-tag, block + """ + #self.debug_groups(groups) + assert macro_type in ("macro_inline", "macro_block") + + if text_key: + macro_text = groups.get(text_key, u"").strip() + else: + macro_text = None + + node = DocNode(macro_type, self.cur, macro_text) + node.macro_name = groups[name_key] + node.macro_args = groups.get(args_key, u"").strip() - def _add_macro(self, macro_name, macro_args, macro_text=u""): -# self._upto_block() - node = DocNode("macro", self.cur, macro_text.strip()) - node.macro_name = macro_name - node.macro_args = macro_args.strip() self.text = None def _macro_block_repl(self, groups): - """Handles macros using the placeholder syntax.""" - #self.debug_groups(groups) + """ + block macro, e.g: + <<macro args="foo">> + some + lines + <</macro>> + """ self._upto_block() self.cur = self.root self._add_macro( - macro_name = groups['macro_block_start'], - macro_text = groups.get('macro_block_text', u""), - macro_args = groups.get('macro_block_args', u""), + groups, + macro_type = "macro_block", + name_key = "macro_block_start", + args_key = "macro_block_args", + text_key = "macro_block_text", ) - self.text = None _macro_block_start_repl = _macro_block_repl _macro_block_args_repl = _macro_block_repl _macro_block_text_repl = _macro_block_repl - def _macro_repl(self, groups): - """Handles macros using the placeholder syntax.""" - macro_name = 
groups.get('macro_name', u"") - macro_args = groups.get('macro_args', u"") - self._add_macro(macro_name, macro_args) - self.text = None + def _macro_tag_repl(self, groups): + """ + A single macro tag, e.g.: <<macro-a foo="bar">> or <<macro />> + """ + self._add_macro( + groups, + macro_type = "macro_inline", + name_key = "macro_tag_name", + args_key = "macro_tag_args", + text_key = None, + ) + _macro_tag_name_repl = _macro_tag_repl + _macro_tag_args_repl = _macro_tag_repl + + + def _macro_inline_repl(self, groups): + """ + inline macro tag with data, e.g.: <<macro>>text<</macro>> + """ + self._add_macro( + groups, + macro_type = "macro_inline", + name_key = "macro_inline_start", + args_key = "macro_inline_args", + text_key = "macro_inline_text", + ) + _macro_inline_start_repl = _macro_inline_repl + _macro_inline_args_repl = _macro_inline_repl + _macro_inline_text_repl = _macro_inline_repl -# text = (groups.get('macro_text', u"") or u"").strip() -# node = DocNode('macro', self.cur, name) -# node.args = groups.get('macro_args', u"") or '' -# DocNode('text', node, text or name) -# self.text = None - _macro_name_repl = _macro_repl - _macro_args_repl = _macro_repl -# _macro_text_repl = _macro_repl + #-------------------------------------------------------------------------- def _image_repl(self, groups): """Handles images and attachemnts included in the page.""" @@ -560,31 +598,31 @@ class Parser: self.cur = tb self.text = None - def _pre_repl(self, groups): + def _pre_block_repl(self, groups): self._upto_block() - kind = groups.get('pre_kind', None) - text = groups.get('pre_text', u"") + kind = groups.get('pre_block_kind', None) + text = groups.get('pre_block_text', u"") def remove_tilde(m): return m.group('indent') + m.group('rest') text = self.pre_escape_re.sub(remove_tilde, text) - node = DocNode('preformatted', self.cur, text) + node = DocNode('pre_block', self.cur, text) node.sect = kind or '' self.text = None - _pre_text_repl = _pre_repl - _pre_head_repl = 
_pre_repl - _pre_kind_repl = _pre_repl + _pre_block_text_repl = _pre_block_repl + _pre_block_head_repl = _pre_block_repl + _pre_block_kind_repl = _pre_block_repl def _line_repl(self, groups): """ Transfer newline from the original markup into the html code """ self._upto_block() DocNode('line', self.cur, u"") - def _preformatted_repl(self, groups): - text = groups.get('preformatted_text', u"") - DocNode('preformatted', self.cur, text) + def _pre_inline_repl(self, groups): + text = groups.get('pre_inline_text', u"") + DocNode('pre_inline', self.cur, text) self.text = None - _preformatted_text_repl = _preformatted_repl - _preformatted_head_repl = _preformatted_repl + _pre_inline_text_repl = _pre_inline_repl + _pre_inline_head_repl = _pre_inline_repl #-------------------------------------------------------------------------- @@ -721,16 +759,15 @@ class DocNode: def __repr__(self): return u"<DocNode %s: %r>" % (self.kind, self.content) -# def debug(self): -# raise -# print "_"*80 -# print "\tDocNode - debug:" -# print "str(): %s" % self -# print "attributes:" -# for i in dir(self): -# if i.startswith("_"): -# continue -# print "%20s: %r" % (i, getattr(self, i, "---")) + def debug(self): + print "_"*80 + print "\tDocNode - debug:" + print "str(): %s" % self + print "attributes:" + for i in dir(self): + if i.startswith("_"): + continue + print "%20s: %r" % (i, getattr(self, i, "---")) #------------------------------------------------------------------------------ @@ -741,12 +778,10 @@ if __name__=="__main__": doctest.testmod() print "doc test done." 
- txt = r"""Creole **<<html>>{...}<</html>>** code""" - txt = r"""foo -Y<<html>>the -code X<</html>>bar -Creole <<html>>{...}<</html>> code - """ + txt = r"""111 <<html>><X><</html>>foo<<html>></X><</html>> 222 + 333<<html>><X foo1="bar1"><</html>>foobar<<html>></X><</html>>444 + + 555<<html>><X /><</html>>666""" print "-"*80 p = Parser(txt) diff --git a/creole/creole2html.py b/creole/creole2html.py index 9a17642..07eac09 100644 --- a/creole/creole2html.py +++ b/creole/creole2html.py @@ -238,7 +238,12 @@ class HtmlEmitter: msg += " - returns: %r, type %r" % (result, type(result)) return self.error(msg) + if node.kind == "macro_block": + result += "\n" + return result + macro_inline_emit = macro_emit + macro_block_emit = macro_emit def break_emit(self, node): if node.parent.kind == "list_item": @@ -251,7 +256,11 @@ class HtmlEmitter: def line_emit(self, node): return u"\n" - def preformatted_emit(self, node): + def pre_block_emit(self, node): + """ pre block, with newline at the end """ + return u"<pre>%s</pre>\n" % self.html_escape(node.content) + def pre_inline_emit(self, node): + """ pre without newline at the end """ return u"<pre>%s</pre>" % self.html_escape(node.content) def pass_block_emit(self, node): @@ -301,8 +310,7 @@ class HtmlEmitter: return u"" if __name__=="__main__": - txt = r"""Creole **<<html>>{...}<</html>>** code""" - txt = r"""Creole {{{preprepre}}} c **od** e""" + txt = r"""<<html>>1<</html><<html>>2<</html>>""" print "-"*80 # from creole_alt.creole import Parser diff --git a/creole/default_macros.py b/creole/default_macros.py index 4e38a3a..07d8fb9 100644 --- a/creole/default_macros.py +++ b/creole/default_macros.py @@ -5,4 +5,12 @@ def html(args, text): Macro tag <<html>>...<</html>> Pass-trought for html code (or other stuff) """ - return text
\ No newline at end of file + return text + + + +def test_macro(args, text): + """ + a macro only for testing + """ + return u"[%s text: %s]" % (args, text) diff --git a/creole/html2creole.py b/creole/html2creole.py index 0bb3c8a..1d5517f 100644 --- a/creole/html2creole.py +++ b/creole/html2creole.py @@ -3,11 +3,13 @@ import re import inspect from HTMLParser import HTMLParser +from xml.sax.saxutils import escape from htmlentitydefs import entitydefs + BLOCK_TAGS = ( - "address", "blockquote", "center", "del", "dir", "div", "dl", "fieldset", + "address", "blockquote", "center", "dir", "div", "dl", "fieldset", "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "isindex", "menu", "noframes", "noscript", @@ -28,12 +30,11 @@ pass_block_re = r''' [\s\n]* )''' pre_block_re = r''' - [\s\n]* - <pre> + ^<pre> \s* $ (?P<pre_block> (\n|.)*? ) - </pre> + ^</pre> \s* $ [\s\n]* ''' block_re = re.compile( @@ -46,13 +47,6 @@ block_re = re.compile( #------------------------------------------------------------------------------ -#tt_block_re = r''' -# <tt> -# (?P<tt_block> -# (\n|.)*? -# ) -# </tt> -#''' inline_django_re = r''' (?P<django_tag> [\s\n]* @@ -60,9 +54,16 @@ inline_django_re = r''' [\s\n]* ) ''' +pre_inline_re = r''' + <pre> + (?P<pre_inline> + (\n|.)*? + ) + </pre> +''' inline_re = re.compile( '|'.join([ -# tt_block_re, + pre_inline_re, inline_django_re, ]), re.VERBOSE | re.UNICODE @@ -74,6 +75,12 @@ headline_tag_re = re.compile(r"h(\d)", re.UNICODE) + + + + + + class DocNode: """ A node in the document. @@ -91,6 +98,17 @@ class DocNode: self.content = content self.level = level + def get_attrs_as_string(self): + """ + FIXME! 
+ """ + attr_list = [] + for key, value in self.attrs.iteritems(): + if isinstance(value, unicode): + value = value.encode("utf-8") + attr_list.append("%s=%r" % (key, value)) + return " ".join(attr_list).replace("'", '"') + def __str__(self): return "<DocNode %s: %r>" % (self.kind, self.content) @@ -159,6 +177,9 @@ def strip_html(html_code): >>> strip_html(u'<p>a <unknown tag /> foobar </p>') u'<p>a <unknown tag /> foobar</p>' + + >>> strip_html(u'<p>a <pre> preformated area </pre> foo </p>') + u'<p>a<pre>preformated area</pre>foo</p>' """ def strip_tag(match): block = match.group(0) @@ -275,8 +296,8 @@ class Html2CreoleParser(HTMLParser): id = len(self.blockdata)-1 return '<%s type="%s" id="%s" />' % (placeholder, type, id) - def _pre_tt_block_cut(self, groups): - return self._pre_cut(groups["tt_block"], "tt", self._inline_placeholder) + def _pre_pre_inline_cut(self, groups): + return self._pre_cut(groups["pre_inline"], "pre", self._inline_placeholder) def _pre_pre_block_cut(self, groups): return self._pre_cut(groups["pre_block"], "pre", self._block_placeholder) @@ -479,14 +500,90 @@ def deentitfy(text): +#------------------------------------------------------------------------------ + +RAISE_UNKNOWN_NODES = 1 +HTML_MACRO_UNKNOWN_NODES = 2 +ESCAPE_UNKNOWN_NODES = 3 + class Html2CreoleEmitter(object): - def __init__(self, document_tree, debug=False): + + def __init__(self, document_tree, unknown_emit=ESCAPE_UNKNOWN_NODES, + debug=False): self.root = document_tree + + if unknown_emit == RAISE_UNKNOWN_NODES: + self.unknown_emit = self.raise_unknown_node + elif unknown_emit == HTML_MACRO_UNKNOWN_NODES: + self.unknown_emit = self.use_html_macro + elif unknown_emit == ESCAPE_UNKNOWN_NODES: + self.unknown_emit = self.escape_unknown_nodes + else: + raise AssertionError("wrong keyword argument 'unknown_emit'!") + self.debugging = debug self.__inner_list = "" self.__mask_linebreak = False #-------------------------------------------------------------------------- + + def 
raise_unknown_node(self, node): + """ + Raise NotImplementedError on unknown tags. + """ + raise NotImplementedError( + "Node from type '%s' is not implemented!" % node.kind + ) + + def use_html_macro(self, node): + """ + Use the <<html>> macro to mask unknown tags. + """ + #node.debug() + attrs = node.get_attrs_as_string() + if attrs: + attrs = " "+attrs + + tag_data = { + "tag": node.kind, + "attrs": attrs, + } + + content = self.emit_children(node) + if not content: + # single tag + return u"<<html>><%(tag)s%(attrs)s /><</html>>" % tag_data + + start_tag = u"<<html>><%(tag)s%(attrs)s><</html>>" % tag_data + end_tag = u"<<html>></%(tag)s><</html>>" % tag_data + + return start_tag + content + end_tag + + def escape_unknown_nodes(self, node): + """ + All unknown tags should be escaped. + """ + #node.debug() + attrs = node.get_attrs_as_string() + if attrs: + attrs = " "+attrs + + tag_data = { + "tag": node.kind, + "attrs": attrs, + } + + content = self.emit_children(node) + if not content: + # single tag + return escape(u"<%(tag)s%(attrs)s />" % tag_data) + + start_tag = escape(u"<%(tag)s%(attrs)s>" % tag_data) + end_tag = escape(u"</%(tag)s>" % tag_data) + + return start_tag + content + end_tag + + #-------------------------------------------------------------------------- def _escape_linebreaks(self, text): text = text.split("\n") @@ -496,15 +593,16 @@ class Html2CreoleEmitter(object): #-------------------------------------------------------------------------- def blockdata_pre_emit(self, node): + """ pre block -> with newline at the end """ return u"{{{%s}}}\n" % deentitfy(node.content) + def inlinedata_pre_emit(self, node): + """ a pre inline block -> no newline at the end """ + return u"{{{%s}}}" % deentitfy(node.content) def blockdata_pass_emit(self, node): return u"%s\n\n" % node.content return node.content - def inlinedata_tt_emit(self, node): - return u"{{{ %s }}}" % deentitfy(node.content) - def inlinedata_django_tag_emit(self, node): return node.content 
@@ -661,14 +759,6 @@ class Html2CreoleEmitter(object): def document_emit(self, node): return self.emit_children(node) - def default_emit(self, node): - """ - Fallback function for emit unknown nodes. - """ - raise NotImplementedError( - "Node from type '%s' is not implemented!" % node.kind - ) - def emit_children(self, node): """Emit all the children of a node.""" result = [] @@ -684,7 +774,7 @@ class Html2CreoleEmitter(object): self.debug_msg("emit_node", "%s: %r" % (node.kind, node.content)) method_name = "%s_emit" % node.kind - emit_method = getattr(self, method_name, self.default_emit) + emit_method = getattr(self, method_name, self.unknown_emit) content = emit_method(node) if not isinstance(content, unicode): @@ -717,24 +807,13 @@ if __name__ == '__main__': doctest.testmod() print "doc test done." -# import sys -# sys.exit() +# import sys;sys.exit() data = """ -<pre> jojo </pre> -<p>basics:<br /> -<strong><i>bold italics</i></strong><br /> -<i><strong>bold italics</strong></i><br /> -<i>This is <strong>also</strong> good.</i></p> - -<p>Creole 1.0 optional:<br /> -This is <tt>monospace</tt> text.<br /> -This is <sup>superscripted</sup> text.<br /> -This is <sub>subscripted</sub> text.<br /> -This is <u>underlined</u> text.</p> - -<p>own additions:<br /> -This is <small>small</small> and this <del>strikeout</del> ;)</p>""" +<p>111 <x>foo</x> 222<br /> +333<x foo1="bar1">foobar</x>444</p> + +<p>555<x />666</p>""" # print data.strip() h2c = Html2CreoleParser( |