diff options
| author | JensDiemer <git@jensdiemer.de> | 2011-06-06 15:56:33 +0200 |
|---|---|---|
| committer | JensDiemer <git@jensdiemer.de> | 2011-06-06 15:56:33 +0200 |
| commit | b5d7c4221bb83a03febada1ae07a048707e838b7 (patch) | |
| tree | 6ab034fc161f4e69bb636c0b13b122149eaacf42 /creole/html_tools | |
| parent | 5549d55cf548f3b500f677638674655117dffa72 (diff) | |
| download | creole-b5d7c4221bb83a03febada1ae07a048707e838b7.tar.gz | |
v0.6 - *NEW*: html2textile converter (not completed and some API changed!)
Diffstat (limited to 'creole/html_tools')
| -rw-r--r-- | creole/html_tools/__init__.py | 0 | ||||
| -rw-r--r-- | creole/html_tools/deentity.py | 85 | ||||
| -rw-r--r-- | creole/html_tools/strip_html.py | 111 | ||||
| -rw-r--r-- | creole/html_tools/text_tools.py | 55 |
4 files changed, 251 insertions, 0 deletions
# diff --git a/creole/html_tools/__init__.py b/creole/html_tools/__init__.py
# new file mode 100644 (empty file)

# diff --git a/creole/html_tools/deentity.py b/creole/html_tools/deentity.py
# new file mode 100644
#!/usr/bin/env python
# coding: utf-8


"""
    python-creole utils
    ~~~~~~~~~~~~~~~~~~~


    :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
    :license: GNU GPL v3 or above, see LICENSE for more details.
"""


import re

try:
    import htmlentitydefs  # Python 2
except ImportError:
    from html import entities as htmlentitydefs  # Python 3

try:
    unichr
except NameError:
    unichr = chr  # Python 3: chr() already returns a unicode character


# One alternative per entity flavour. The name of the group that matched
# selects the Deentity.replace_<name>() method in Deentity.replace_all().
# "#" is escaped because the pattern is compiled with re.VERBOSE.
entities_rules = '|'.join([
    r"(&\#(?P<number>\d+);)",
    r"(&\#[xX](?P<hex>[a-fA-F0-9]+);)",
    # Entity names may contain digits (e.g. frac12, sup2, there4).
    r"(&(?P<named>[a-zA-Z][a-zA-Z0-9]*);)",
])
entities_regex = re.compile(
    entities_rules, re.VERBOSE | re.UNICODE | re.MULTILINE
)


class Deentity(object):
    """
    Replace html entities with the unicode characters they stand for.

    >>> d = Deentity()
    >>> d.replace_all(
    ...     u"-=[&nbsp;&gt;&gt;&gt;nice&lt;&lt;&lt;&nbsp;]=-"
    ... ) == u"-=[ >>>nice<<< ]=-"
    True

    uuml - latin small letter u with diaeresis:

    >>> d.replace_all(u"-=[M&uuml;hlheim]=-") == u"-=[M\\xfchlheim]=-"
    True

    Entities that can not be resolved are left untouched:

    >>> d.replace_all(u"AT&T; &frac12; &#X7e;") == u"AT&T; \\xbd ~"
    True

    >>> d.replace_number("126") == u"~"
    True
    >>> d.replace_hex("7E") == u"~"
    True
    >>> d.replace_named("amp") == u"&"
    True
    """
    def replace_number(self, text):
        """
        Decimal numeric entity: return the character for the code point
        given as decimal digits in `text`.

        Raises ValueError/OverflowError if the number is not a valid code
        point for this interpreter.
        """
        unicode_no = int(text)
        return unichr(unicode_no)

    def replace_hex(self, text):
        """
        Hex numeric entity: return the character for the code point given
        as hexadecimal digits in `text`.

        Raises ValueError/OverflowError if the number is not a valid code
        point for this interpreter.
        """
        unicode_no = int(text, 16)
        return unichr(unicode_no)

    def replace_named(self, text):
        """
        Named entity: return the character for the entity name `text`
        (without the leading "&" and the trailing ";").

        Raises KeyError if the name is not a known html entity.
        """
        if text == "nbsp":
            # nbsp is special-cased to a plain space instead of U+00A0,
            # which is what name2codepoint would give for it.
            return u" "
        else:
            codepoint = htmlentitydefs.name2codepoint[text]
            character = unichr(codepoint)
            return character

    def replace_all(self, content):
        """
        Replace all html entities from the given text and return the result.

        Entities that can not be resolved (unknown name, code point out of
        range) are kept unchanged instead of aborting the whole conversion.
        """
        def replace_entity(match):
            for name, text in match.groupdict().items():
                if text is not None:
                    replace_method = getattr(self, 'replace_%s' % name)
                    try:
                        return replace_method(text)
                    except (KeyError, ValueError, OverflowError):
                        # Not resolvable: keep the source text as it is.
                        return match.group(0)

            # Should never happen:
            raise RuntimeError("deentitfy re rules wrong!")

        return entities_regex.sub(replace_entity, content)


if __name__ == '__main__':
    import doctest
    print(doctest.testmod())


# diff --git a/creole/html_tools/strip_html.py b/creole/html_tools/strip_html.py
# new file mode 100644
#!/usr/bin/env python
# coding: utf-8


"""
    python-creole utils
    ~~~~~~~~~~~~~~~~~~~


    :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
    :license: GNU GPL v3 or above, see LICENSE for more details.
"""


import re

from creole.html_parser.config import BLOCK_TAGS


# Matches one html tag together with the whitespace around it.
# The tag name must not contain "/", otherwise "<br/>" would be seen as a
# start tag named "br/" instead of the closed tag "br".
strip_html_regex = re.compile(
    r"""
        \s*
        <
            (?P<end>/{0,1})       # end tag e.g.: </end>
            (?P<tag>[^ />]+)      # tag name
            .*?
            (?P<startend>/{0,1})  # closed tag e.g.: <closed />
        >
        \s*
    """,
    re.VERBOSE | re.MULTILINE | re.UNICODE
)


def strip_html(html_code):
    """
    Delete whitespace from html code. Doesn't recognize preformatted blocks!

    Whitespace around tags listed in BLOCK_TAGS is removed completely.
    Around all other tags at most one space is kept.

    >>> strip_html(u' <p> one \\n two </p>') == u'<p>one two</p>'
    True

    >>> strip_html(
    ...     u'<p><strong><i>bold italics</i></strong></p>'
    ... ) == u'<p><strong><i>bold italics</i></strong></p>'
    True

    >>> strip_html(
    ...     u'<li> Force <br /> \\n linebreak </li>'
    ... ) == u'<li>Force<br />linebreak</li>'
    True

    >>> strip_html(
    ...     u'<li> Force <br/> \\n linebreak </li>'
    ... ) == u'<li>Force<br/>linebreak</li>'
    True

    >>> strip_html(
    ...     u'one <i>two \\n <strong> \\n three \\n </strong></i>'
    ... ) == u'one <i>two <strong>three</strong> </i>'
    True

    >>> strip_html(
    ...     u'<p>a <unknown tag /> foobar </p>'
    ... ) == u'<p>a <unknown tag /> foobar</p>'
    True

    >>> strip_html(
    ...     u'<p>a <pre> preformated area </pre> foo </p>'
    ... ) == u'<p>a<pre>preformated area</pre>foo</p>'
    True

    FIXME: the whitespace between an inline end tag and a following block
    tag is not removed yet. The wanted result would be:

        strip_html(u'<strong>foo</strong>\\n<ul><li>one</li></ul>')
        -> u'<strong>foo</strong><ul><li>one</li></ul>'
    """

    def strip_tag(match):
        # The matched tag, including the whitespace around it.
        block = match.group(0)
        end_tag = match.group("end") == "/"
        startend_tag = match.group("startend") == "/"
        tag = match.group("tag")

        if tag in BLOCK_TAGS:
            return block.strip()

        space_start = block.startswith(" ")
        space_end = block.endswith(" ")

        result = block.strip()

        if end_tag:
            # It's a normal end tag e.g.: </strong>
            # Any surrounding space is moved behind the tag.
            if space_start or space_end:
                result += " "
        elif startend_tag:
            # It's a closed start tag e.g.: <br />

            if space_start:  # there was space before the tag
                result = " " + result

            if space_end:  # there was space after the tag
                result += " "
        else:
            # a start tag e.g.: <strong>
            # Any surrounding space is moved in front of the tag.
            if space_start or space_end:
                result = " " + result

        return result

    data = html_code.strip()
    # Join all lines with one space, so that the regex never sees a "\n".
    clean_data = " ".join([line.strip() for line in data.split("\n")])
    clean_data = strip_html_regex.sub(strip_tag, clean_data)
    return clean_data


if __name__ == '__main__':
    import doctest
    print(doctest.testmod())


# diff --git a/creole/html_tools/text_tools.py
# b/creole/html_tools/text_tools.py (new file mode 100644)
#!/usr/bin/env python
# coding: utf-8


"""
    python-creole utils
    ~~~~~~~~~~~~~~~~~~~


    :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
    :license: GNU GPL v3 or above, see LICENSE for more details.
"""


import re


# Splits a text into (leading whitespace, content, trailing whitespace).
# re.DOTALL lets the lazy content group span newlines inside the text.
space_re = re.compile(r"^(\s*)(.*?)(\s*)$", re.DOTALL)


def clean_whitespace(txt):
    """
    Special whitespaces cleanup

    Leading whitespace becomes one space if it contains a space, otherwise
    it is removed. Trailing whitespace becomes one newline if it contains
    a newline, else one space if it contains a space. Trailing whitespace
    with neither (e.g. only tabs) is kept as it is.

    >>> clean_whitespace(u"\\n\\nfoo bar\\n\\n") == u"foo bar\\n"
    True

    >>> clean_whitespace(u" foo bar \\n \\n") == u" foo bar\\n"
    True

    >>> clean_whitespace(u" \\n \\n foo bar ") == u" foo bar "
    True

    >>> clean_whitespace(u"foo bar") == u"foo bar"
    True
    """
    def cleanup(match):
        start, content, end = match.groups()

        start = " " if " " in start else ""

        if "\n" in end:
            end = "\n"
        elif " " in end:
            end = " "

        return start + content + end

    return space_re.sub(cleanup, txt)


if __name__ == '__main__':
    import doctest
    print(doctest.testmod())
