"""
python-creole utils
~~~~~~~~~~~~~~~~~~~
:copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
import re
from creole.parser.html_parser_config import BLOCK_TAGS
# Matches one HTML tag together with the whitespace directly around it.
#
# Named groups (read by the tag callback used in strip_html):
#   end      -- "/" when the tag is an end tag, otherwise ""
#   tag      -- the tag name: everything up to the first space or ">"
#   startend -- "/" when a "/" directly precedes the closing ">", otherwise ""
#
# NOTE: because the tag name is matched greedily with [^ >]+, a tag written
# without a space before the slash (e.g. "<br/>") yields tag == "br/" and
# startend == "".
#
# The pattern has no ^/$ anchors, so re.MULTILINE does not change what it
# matches; the flags are kept as they were. "." does not match a newline
# here (no re.DOTALL).
strip_html_regex = re.compile(
    r"""
        \s*
        <
        (?P<end>/{0,1})       # "/" of an end tag, e.g.: </p>
        (?P<tag>[^ >]+)       # tag name
        .*?
        (?P<startend>/{0,1})  # "/" of a closed start tag, e.g.: <br />
        >
        \s*
    """,
    re.VERBOSE | re.MULTILINE | re.UNICODE
)
def _strip_tag(match):
    """
    Return the replacement text for one match of strip_html_regex.

    The match covers a single tag plus the whitespace directly before and
    after it. The whitespace is removed or reduced as follows:

    * tag name in BLOCK_TAGS: all surrounding whitespace is dropped.
    * end tag (group "end" is "/"): if there was whitespace on either side,
      a single space is put *after* the tag.
    * closed start tag (group "startend" is "/"): a single space is kept on
      each side that had whitespace.
    * any other (start) tag: if there was whitespace on either side, a
      single space is put *before* the tag.
    """
    block = match.group(0)

    if match.group("tag") in BLOCK_TAGS:
        return block.strip()

    # The regex consumes any whitespace (\s*) around the tag, so look for any
    # whitespace character here. Checking only for " " would silently delete
    # e.g. a tab between a word and an inline tag and glue the two together.
    space_start = block[:1].isspace()
    space_end = block[-1:].isspace()
    result = block.strip()

    if match.group("end") == "/":
        # It's a normal end tag
        if space_start or space_end:
            result += " "
    elif match.group("startend") == "/":
        # It's a closed start tag
        if space_start:  # there was whitespace before the tag
            result = " " + result
        if space_end:  # there was whitespace after the tag
            result += " "
    elif space_start or space_end:
        # It's a start tag
        result = " " + result

    return result


def strip_html(html_code):
    """
    Delete whitespace from html code. Doesn't recognize preformatted blocks!

    The input is stripped, every line is stripped and the lines are joined
    with a single space. After that the whitespace around every tag found by
    strip_html_regex is removed or reduced to one space (see _strip_tag).
    Text that contains no tags only gets the line handling:

    >>> strip_html('  one  \\n  two  ')
    'one two'

    Tags without whitespace around them are left untouched:

    >>> strip_html('<p><strong><i>bold italics</i></strong></p>')
    '<p><strong><i>bold italics</i></strong></p>'

    :param html_code: the html source as a string
    :return: the html source with the whitespace removed/reduced
    """
    data = html_code.strip()
    clean_data = " ".join(line.strip() for line in data.split("\n"))
    return strip_html_regex.sub(_strip_tag, clean_data)
if __name__ == '__main__':
    # Executed as a script: run the doctests of this module and print the
    # result object returned by doctest.testmod().
    import doctest
    results = doctest.testmod()
    print(results)