summaryrefslogtreecommitdiff
path: root/creole/html_tools
diff options
context:
space:
mode:
authorJensDiemer <git@jensdiemer.de>2011-06-06 15:56:33 +0200
committerJensDiemer <git@jensdiemer.de>2011-06-06 15:56:33 +0200
commitb5d7c4221bb83a03febada1ae07a048707e838b7 (patch)
tree6ab034fc161f4e69bb636c0b13b122149eaacf42 /creole/html_tools
parent5549d55cf548f3b500f677638674655117dffa72 (diff)
downloadcreole-b5d7c4221bb83a03febada1ae07a048707e838b7.tar.gz
v0.6 - *NEW*: html2textile converter (not completed and some API changed!)
Diffstat (limited to 'creole/html_tools')
-rw-r--r--creole/html_tools/__init__.py0
-rw-r--r--creole/html_tools/deentity.py85
-rw-r--r--creole/html_tools/strip_html.py111
-rw-r--r--creole/html_tools/text_tools.py55
4 files changed, 251 insertions, 0 deletions
diff --git a/creole/html_tools/__init__.py b/creole/html_tools/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/creole/html_tools/__init__.py
diff --git a/creole/html_tools/deentity.py b/creole/html_tools/deentity.py
new file mode 100644
index 0000000..980b218
--- /dev/null
+++ b/creole/html_tools/deentity.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+
+"""
+ python-creole utils
+ ~~~~~~~~~~~~~~~~~~~
+
+
+ :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :license: GNU GPL v3 or above, see LICENSE for more details.
+"""
+
+
+import re
+import htmlentitydefs
+
+
# One alternative per entity flavour. The payload of each alternative is
# captured in a named group; Deentity.replace_all() uses the group name to
# pick the matching ``replace_<name>`` method.
#
# "#" has to be escaped because the pattern is compiled with re.VERBOSE.
entities_rules = '|'.join([
    # decimal character reference, e.g.: &#62;
    r"(&\#(?P<number>\d+);)",
    # hexadecimal character reference, e.g.: &#x3E; or &#X3E;
    r"(&\#[xX](?P<hex>[a-fA-F0-9]+);)",
    # named entity, e.g.: &gt; - names may contain digits (&frac12;, &sup2;)
    r"(&(?P<named>[a-zA-Z][a-zA-Z0-9]*);)",
])
entities_regex = re.compile(
    entities_rules, re.VERBOSE | re.UNICODE | re.MULTILINE
)
+
+
class Deentity(object):
    """
    Replace html entities with the unicode characters they stand for.

    Decimal (``&#62;``), hexadecimal (``&#x3E;``) and named (``&gt;``)
    entities are handled. ``&nbsp;`` becomes a plain space. An entity that
    can not be resolved (unknown name, or a code point ``unichr`` rejects)
    is left in the text instead of raising an error.

    >>> d = Deentity()
    >>> d.replace_all(u"-=[&nbsp;&gt;&#62;&#x3E;nice&lt;&#60;&#x3C;&nbsp;]=-")
    u'-=[ >>>nice<<< ]=-'

    >>> d.replace_all(u"-=[M&uuml;hlheim]=-") # uuml - latin small letter u with diaeresis
    u'-=[M\\xfchlheim]=-'

    >>> d.replace_all(u"a &nosuchentity; b")
    u'a &nosuchentity; b'

    >>> d.replace_number("126")
    u'~'
    >>> d.replace_hex("7E")
    u'~'
    >>> d.replace_named("amp")
    u'&'
    >>> d.replace_named("nosuchentity")
    u'&nosuchentity;'
    """

    def _char_or(self, codepoint, fallback):
        """ Return the character for `codepoint` or `fallback` if invalid. """
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # unichr() rejects numbers that are no valid code point
            return fallback

    def replace_number(self, text):
        """
        unicode number entity

        `text` is the decimal payload, e.g. "126" for ``&#126;``. Returns
        the character, or the entity text if the number is no valid code
        point.
        """
        return self._char_or(int(text), u"&#%s;" % text)

    def replace_hex(self, text):
        """
        hex entity

        `text` is the hexadecimal payload, e.g. "7E" for ``&#x7E;``.
        Returns the character, or the entity text if the number is no
        valid code point.
        """
        return self._char_or(int(text, 16), u"&#x%s;" % text)

    def replace_named(self, text):
        """
        named entity

        `text` is the entity name without "&" and ";", e.g. "amp". Returns
        the character, or the entity text if the name is unknown.
        """
        if text == "nbsp":
            # htmlentitydefs maps nbsp to U+00A0 (no-break space); a plain
            # space is returned on purpose.
            return u" "

        try:
            codepoint = htmlentitydefs.name2codepoint[text]
        except KeyError:
            # Unknown name, e.g. the "&T;" in "AT&T;": keep it as it was.
            return u"&%s;" % text
        return unichr(codepoint)

    def replace_all(self, content):
        """ replace all html entities from the given text. """
        def replace_entity(match):
            # Exactly one of the named groups (number/hex/named) is set;
            # its name selects the replace_* method.
            for name, text in match.groupdict().items():
                if text is not None:
                    replace_method = getattr(self, 'replace_%s' % name)
                    return replace_method(text)

            # Should never happen:
            raise RuntimeError("deentitfy re rules wrong!")

        return entities_regex.sub(replace_entity, content)
+
+
if __name__ == '__main__':
    # Run the doctests of this module when it is executed as a script.
    import doctest
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(doctest.testmod())
diff --git a/creole/html_tools/strip_html.py b/creole/html_tools/strip_html.py
new file mode 100644
index 0000000..6f9a2c5
--- /dev/null
+++ b/creole/html_tools/strip_html.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+
+"""
+ python-creole utils
+ ~~~~~~~~~~~~~~~~~~~
+
+
+ :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :license: GNU GPL v3 or above, see LICENSE for more details.
+"""
+
+
+import re
+
+from creole.html_parser.config import BLOCK_TAGS
+
+
# Matches one html tag together with the whitespace around it.
strip_html_regex = re.compile(
    r"""
    \s*
    <
    (?P<end>/?)         # end tag e.g.: </end>
    (?P<tag>[^\s/>]+)   # tag name - stops at whitespace, slash or bracket,
                        # so <br/> gives the name br and not br/
    .*?
    (?P<startend>/?)    # closed tag e.g.: <closed />
    >
    \s*
    """,
    re.VERBOSE | re.MULTILINE | re.UNICODE
)
+
+
+
def strip_html(html_code):
    """
    Remove superfluous whitespace around the tags of the given html code.

    Every line is stripped and the lines are joined with a single space.
    After that the whitespace around tags listed in BLOCK_TAGS is removed
    completely; around all other tags it is reduced to a single space.
    Preformatted blocks are not recognized, their whitespace is changed too!

    >>> strip_html(u' <p> one \\n two </p>')
    u'<p>one two</p>'

    >>> strip_html(u'<p><strong><i>bold italics</i></strong></p>')
    u'<p><strong><i>bold italics</i></strong></p>'

    >>> strip_html(u'<li> Force <br /> \\n linebreak </li>')
    u'<li>Force<br />linebreak</li>'

    >>> strip_html(u'one <i>two \\n <strong> \\n three \\n </strong></i>')
    u'one <i>two <strong>three</strong> </i>'

    >>> strip_html(u'<p>a <unknown tag /> foobar </p>')
    u'<p>a <unknown tag /> foobar</p>'

    >>> strip_html(u'<p>a <pre> preformated area </pre> foo </p>')
    u'<p>a<pre>preformated area</pre>foo</p>'

    FIXME:
    >>> strip_html(u'<strong>foo</strong>\\n<ul><li>one</li></ul>')
    u'<strong>foo</strong><ul><li>one</li></ul>'
    """

    def tidy_tag(found):
        # The matched text is the tag plus the whitespace on both sides.
        raw = found.group(0)
        bare = raw.strip()

        if found.group("tag") in BLOCK_TAGS:
            # Block tags never keep whitespace around them.
            return bare

        leading = raw.startswith(" ")
        trailing = raw.endswith(" ")

        if found.group("end") == "/":
            # Normal end tag e.g.: </strong> - the space goes behind it.
            return bare + " " if (leading or trailing) else bare

        if found.group("startend") == "/":
            # Closed start tag e.g.: <br /> - each side keeps its own space.
            before = " " if leading else ""
            after = " " if trailing else ""
            return before + bare + after

        # Start tag e.g.: <strong> - the space goes in front of it.
        return " " + bare if (leading or trailing) else bare

    stripped_lines = [line.strip() for line in html_code.strip().split("\n")]
    return strip_html_regex.sub(tidy_tag, " ".join(stripped_lines))
+
+
if __name__ == '__main__':
    # Run the doctests of this module when it is executed as a script.
    import doctest
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(doctest.testmod())
diff --git a/creole/html_tools/text_tools.py b/creole/html_tools/text_tools.py
new file mode 100644
index 0000000..47ac474
--- /dev/null
+++ b/creole/html_tools/text_tools.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+
+"""
+ python-creole utils
+ ~~~~~~~~~~~~~~~~~~~
+
+
+ :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
+ :license: GNU GPL v3 or above, see LICENSE for more details.
+"""
+
+
+import re
+
+
# Splits a text into leading whitespace, content and trailing whitespace.
space_re = re.compile(r"^(\s*)(.*?)(\s*)$", re.DOTALL)


def clean_whitespace(txt):
    """
    Reduce the whitespace at the start and at the end of the given text.

    Leading whitespace becomes one space if it contains a space, otherwise
    it is removed. Trailing whitespace becomes one newline if it contains a
    newline, else one space if it contains a space; anything else is kept.

    >>> clean_whitespace(u"\\n\\nfoo bar\\n\\n")
    u'foo bar\\n'

    >>> clean_whitespace(u" foo bar \\n \\n")
    u' foo bar\\n'

    >>> clean_whitespace(u" \\n \\n foo bar ")
    u' foo bar '

    >>> clean_whitespace(u"foo bar")
    u'foo bar'
    """
    def normalize(found):
        leading, body, trailing = found.groups()

        prefix = " " if " " in leading else ""

        if "\n" in trailing:
            suffix = "\n"
        elif " " in trailing:
            suffix = " "
        else:
            # e.g. empty or only tabs: passed through untouched
            suffix = trailing

        return prefix + body + suffix

    return space_re.sub(normalize, txt)
+
+
if __name__ == '__main__':
    # Run the doctests of this module when it is executed as a script.
    import doctest
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(doctest.testmod())