blob: c5e79a216c3d536919029efe7d75098a0f36c661 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
#!/usr/bin/env python
# coding: utf-8
"""
python-creole utils
~~~~~~~~~~~~~~~~~~~
:copyleft: 2008-2011 by python-creole team, see AUTHORS for more details.
:license: GNU GPL v3 or above, see LICENSE for more details.
"""
import re
from creole.parser.html_parser_config import BLOCK_TAGS
# Matches a single HTML tag together with any whitespace directly around it.
# Named groups:
#   end      -- "/" for an end tag such as </p>, otherwise empty
#   tag      -- the tag name; it stops at whitespace, "/" or ">" so that a
#               compact self-closing tag like <br/> yields "br" (not "br/")
#   startend -- "/" for a self-closing tag such as <br /> or <br/>
strip_html_regex = re.compile(
    r"""
        \s*
        <
            (?P<end>/{0,1})       # end tag e.g.: </end>
            (?P<tag>[^\s/>]+)     # tag name
            .*?
            (?P<startend>/{0,1})  # closed tag e.g.: <closed />
        >
        \s*
    """,
    re.VERBOSE | re.MULTILINE | re.UNICODE
)
def strip_html(html_code):
    """
    Delete whitespace from html code. Doesn't recognize preformatted blocks!

    Every line is stripped and the lines are joined with a single space.
    Afterwards the whitespace around each tag is normalized:

    * tags listed in BLOCK_TAGS lose all surrounding whitespace
    * an end tag keeps one space *after* it, if there was whitespace on
      either side of it
    * a self-closing tag keeps one space on each side that had whitespace
    * a start tag keeps one space *before* it, if there was whitespace on
      either side of it

    :param html_code: the html source as a string
    :return: the html source with normalized whitespace

    >>> strip_html(' <p> one \\n two </p>')
    '<p>one two</p>'
    >>> strip_html('<p><strong><i>bold italics</i></strong></p>')
    '<p><strong><i>bold italics</i></strong></p>'
    >>> strip_html('<li> Force <br /> \\n linebreak </li>')
    '<li>Force<br />linebreak</li>'
    >>> strip_html('one <i>two \\n <strong> \\n three \\n </strong></i>')
    'one <i>two <strong>three</strong> </i>'
    >>> strip_html('<p>a <unknown tag /> foobar </p>')
    '<p>a <unknown tag /> foobar</p>'
    >>> strip_html('<p>a <pre> preformated area </pre> foo </p>')
    '<p>a<pre>preformated area</pre>foo</p>'
    >>> strip_html('<p>a <img src="/image.jpg" /> image.</p>')
    '<p>a <img src="/image.jpg" /> image.</p>'
    >>> strip_html('a\\t<i>b</i>')
    'a <i>b</i>'
    """
    def strip_tag(match):
        # Replacement callback: receives one tag plus the whitespace around
        # it and returns the tag with normalized spacing.
        block = match.group(0)

        if match.group("tag") in BLOCK_TAGS:
            # Block level tags never need whitespace around them.
            return block.strip()

        # The regex consumes *any* whitespace (\s*) around the tag, so test
        # for any whitespace character here, not only for a literal " ".
        # Otherwise e.g. a tab before an inline tag would vanish and glue
        # the neighbouring words together.
        space_start = block[:1].isspace()
        space_end = block[-1:].isspace()
        result = block.strip()

        if match.group("end") == "/":
            # It's a normal end tag e.g.: </strong>
            if space_start or space_end:
                result += " "
        elif match.group("startend") == "/":
            # It's a closed start tag e.g.: <br />
            if space_start:  # there was space before the tag
                result = " " + result
            if space_end:  # there was space after the tag
                result += " "
        else:
            # a start tag e.g.: <strong>
            if space_start or space_end:
                result = " " + result

        return result

    data = html_code.strip()
    # Collapse all line breaks (and the indentation around them) to one space.
    clean_data = " ".join(line.strip() for line in data.split("\n"))
    return strip_html_regex.sub(strip_tag, clean_data)
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module and
    # print whatever doctest.testmod() returns.
    import doctest

    summary = doctest.testmod()
    print(summary)
|