summaryrefslogtreecommitdiff
path: root/creole/parser/html_parser.py
diff options
context:
space:
mode:
Diffstat (limited to 'creole/parser/html_parser.py')
-rw-r--r--creole/parser/html_parser.py38
1 files changed, 22 insertions, 16 deletions
diff --git a/creole/parser/html_parser.py b/creole/parser/html_parser.py
index a3879a9..0b8e171 100644
--- a/creole/parser/html_parser.py
+++ b/creole/parser/html_parser.py
@@ -95,6 +95,7 @@ class HtmlParser(HTMLParser):
self.__list_level = 0
def _pre_cut(self, data, type, placeholder):
+ # TODO: Check if we have a code block, e.g.: "<pre><code>...</code></pre>"
if self.debugging:
print(f"append blockdata: {data!r}")
assert isinstance(data, str), "blockdata is not unicode"
@@ -125,27 +126,32 @@ class HtmlParser(HTMLParser):
# data = match.group("data")
- def feed(self, raw_data):
+ def feed(self, raw_data, preprocess=True) -> DocNode:
assert isinstance(raw_data, str), "feed data must be unicode!"
data = raw_data.strip()
-
- # cut out <pre> and <tt> areas block tag areas
- data = block_re.sub(self._pre_cut_out, data)
- data = inline_re.sub(self._pre_cut_out, data)
-
- # Delete whitespace from html code
- data = strip_html(data)
-
- if self.debugging:
+ if preprocess:
+ # cut out <pre> and <tt> areas block tag areas
+ data = block_re.sub(self._pre_cut_out, data)
+ data = inline_re.sub(self._pre_cut_out, data)
+
+ # Delete whitespace from html code
+ data = strip_html(data)
+
+ if self.debugging:
+ print("_" * 79)
+ print("raw data:")
+ print(repr(raw_data))
+ print(" -" * 40)
+ print("cleaned data:")
+ print(data)
+ print("-" * 79)
+ # print(data.replace(">", ">\n"))
+ # print("-"*79)
+ elif self.debugging:
print("_" * 79)
- print("raw data:")
- print(repr(raw_data))
- print(" -" * 40)
- print("cleaned data:")
+ print("data:")
print(data)
print("-" * 79)
-# print(clean_data.replace(">", ">\n"))
-# print("-"*79)
HTMLParser.feed(self, data)