diff options
author | Jens Diemer <github.com@jensdiemer.de> | 2009-03-19 21:31:06 +0000 |
---|---|---|
committer | Jens Diemer <github.com@jensdiemer.de> | 2009-03-19 21:31:06 +0000 |
commit | c9b7bada58682661ed41db7325ec28a56e20872f (patch) | |
tree | 257309a59a60393407890277425c58fe9d78c2d2 /creole | |
parent | 9d97c07534f59d95815e2a9c34b1b8fc63b0d57e (diff) | |
download | creole-c9b7bada58682661ed41db7325ec28a56e20872f.tar.gz |
handle html entities in <pre> blocks, too.
Diffstat (limited to 'creole')
-rw-r--r-- | creole/html2creole.py | 85 |
1 files changed, 60 insertions, 25 deletions
diff --git a/creole/html2creole.py b/creole/html2creole.py index 25ade6c..d3aa193 100644 --- a/creole/html2creole.py +++ b/creole/html2creole.py @@ -492,21 +492,63 @@ class Html2CreoleParser(HTMLParser): +entities_rules = '|'.join([ + r"(&\#(?P<number>\d+);)", + r"(&\#x(?P<hex>[a-fA-F0-9]+);)", + r"(&(?P<named>[a-zA-Z]+);)", +]) +#print entities_rules +entities_regex = re.compile( + entities_rules, re.VERBOSE | re.UNICODE | re.MULTILINE +) +class Deentity(object): + """ + replace html entity + + >>> d = Deentity() + >>> d.replace_all(u"-=[ >>>nice<<< ]=-") + u'-=[ >>>nice<<< ]=-' + + >>> d.replace_number("126") + u'~' + >>> d.replace_hex("7E") + u'~' + >>> d.replace_named("amp") + u'&' + """ + def replace_number(self, text): + """ unicode number entity """ + unicode_no = int(text) + return unichr(unicode_no) -entities_regex = re.compile(r"&([#\w]+);", re.UNICODE) + def replace_hex(self, text): + """ hex entity """ + unicode_no = int(text, 16) + return unichr(unicode_no) + def replace_named(self, text): + """ named entity """ + if text == "nbsp": + # Non breaking spaces is not in htmlentitydefs + return u" " + else: + character = entitydefs[text] + return unicode(character) -def deentitfy(text): - """ - >>> deentitfy("a text with >entity<!") - 'a text with >entity<!' - """ - def deentitfy(match): - entity = match.group(1) - return entitydefs[entity] + def replace_all(self, content): + """ replace all html entities form the given text. """ + def replace_entity(match): + groups = match.groupdict() + for name, text in groups.iteritems(): + if text is not None: + replace_method = getattr(self, 'replace_%s' % name) + return replace_method(text) - return entities_regex.sub(deentitfy, text) + # Should never happen: + raise RuntimeError("deentitfy re rules wrong!") + + return entities_regex.sub(replace_entity, content) @@ -532,6 +574,8 @@ class Html2CreoleEmitter(object): raise AssertionError("wrong keyword argument 'unknown_emit'!") self.debugging = debug + + self.deentity = Deentity() # for replacing html entities self.__inner_list = "" self.__mask_linebreak = False @@ -604,10 +648,10 @@ class Html2CreoleEmitter(object): def blockdata_pre_emit(self, node): """ pre block -> with newline at the end """ - return u"{{{%s}}}\n" % deentitfy(node.content) + return u"{{{%s}}}\n" % self.deentity.replace_all(node.content) def inlinedata_pre_emit(self, node): """ a pre inline block -> no newline at the end """ - return u"{{{%s}}}" % deentitfy(node.content) + return u"{{{%s}}}" % self.deentity.replace_all(node.content) def blockdata_pass_emit(self, node): return u"%s\n\n" % node.content @@ -628,19 +672,12 @@ class Html2CreoleEmitter(object): """ entity = node.content - if entity == "nbsp": - # Non breaking spaces - return u" " - try: - character = entitydefs[entity] + return self.deentity.replace_named(entity) except KeyError, err: if self.debugging: print "unknown html entity found: %r" % entity return "&%s" % entity # FIXME - - try: - return unicode(character) except UnicodeDecodeError, err: raise UnicodeError( "Error handling entity %r: %s" % (entity, err) @@ -655,12 +692,10 @@ class Html2CreoleEmitter(object): if entity.startswith("x"): # entity in hex hex_no = entity[1:] - unicode_no = int(hex_no, 16) + return self.deentity.replace_hex(hex_no) else: # entity as a unicode number - unicode_no = int(entity) - - return unichr(unicode_no) + return self.deentity.replace_number(entity) #-------------------------------------------------------------------------- @@ -854,7 +889,7 @@ if __name__ == '__main__': doctest.testmod() print "doc test done." - #import sys;sys.exit() + import sys;sys.exit() data = u"""<p>less-than sign: < < <<br/> greater-than sign: > > ></p> |