#!/usr/bin/env python
# coding: utf-8

"""
    python-creole utils
    ~~~~~~~~~~~~~~~~~~~

    :copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
    :license: GNU GPL v3 or above, see LICENSE for more details.
"""

import re

from creole.parser.html_parser_config import BLOCK_TAGS


# Matches a single tag together with the whitespace directly around it.
# The three named groups are read by strip_html() to classify the tag.
strip_html_regex = re.compile(
    r"""
        \s*
        <
            (?P<end>/{0,1})       # end tag e.g.: </end>
            (?P<tag>[^ >]+)       # tag name
            .*?
            (?P<startend>/{0,1})  # closed tag e.g.: <closed />
        >
        \s*
    """,
    re.VERBOSE | re.MULTILINE | re.UNICODE
)


def strip_html(html_code):
    """
    Delete whitespace from html code. Doesn't recognize preformatted blocks!

    The input is stripped, split on newlines, every line is stripped and
    the lines are joined with a single space. Afterwards the whitespace
    around each tag is normalized:

    * tags listed in BLOCK_TAGS lose all surrounding whitespace
    * any other end tag keeps at most one space *after* it
    * any other start tag keeps at most one space *before* it
    * any other closed tag (``<tag />``) keeps one space on each side that
      had a space before

    :param html_code: the html source as a string
    :return: the html source with the whitespace collapsed

    NOTE: the examples below assume that ``p``, ``li``, ``pre`` and ``br``
    are listed in BLOCK_TAGS while ``i``, ``strong`` and ``img`` are not.

    >>> strip_html(' <p>  one  \\n two  </p>')
    '<p>one two</p>'

    >>> strip_html('<p><strong><i>bold italics</i></strong></p>')
    '<p><strong><i>bold italics</i></strong></p>'

    >>> strip_html('<li>  Force  <br /> \\n linebreak </li>')
    '<li>Force<br />linebreak</li>'

    >>> strip_html('one  <i>two \\n <strong>   \\n  three  \\n  </strong></i>')
    'one <i>two <strong>three</strong> </i>'

    >>> strip_html('<p>a <unknown tag /> foobar  </p>')
    '<p>a <unknown tag /> foobar</p>'

    >>> strip_html('<p><pre> preformated area </pre> foo </p>')
    '<p><pre>preformated area</pre>foo</p>'

    >>> strip_html('<p>a <img src="/image.jpg" /> image.</p>')
    '<p>a <img src="/image.jpg" /> image.</p>'
    """

    def strip_tag(match):
        """Return the replacement for one tag and its surrounding whitespace."""
        block = match.group(0)
        result = block.strip()

        if match.group("tag") in BLOCK_TAGS:
            # Whitespace around block level tags is never significant.
            return result

        # Only a literal space counts here; newlines were already replaced
        # by spaces before the regex is applied.
        space_start = block.startswith(" ")
        space_end = block.endswith(" ")

        if match.group("end") == "/":
            # It's a normal end tag e.g.: </strong>
            if space_start or space_end:
                result += " "
        elif match.group("startend") == "/":
            # It's a closed start tag e.g.: <br />
            if space_start:  # there was space before the tag
                result = " " + result
            if space_end:  # there was space after the tag
                result += " "
        else:
            # a start tag e.g.: <strong>
            if space_start or space_end:
                result = " " + result

        return result

    data = html_code.strip()
    clean_data = " ".join([line.strip() for line in data.split("\n")])
    clean_data = strip_html_regex.sub(strip_tag, clean_data)
    return clean_data


if __name__ == '__main__':
    import doctest
    print(doctest.testmod())