diff options
Diffstat (limited to 'creole/parser/html_parser.py')
-rw-r--r-- | creole/parser/html_parser.py | 38 |
1 files changed, 22 insertions, 16 deletions
diff --git a/creole/parser/html_parser.py b/creole/parser/html_parser.py
index a3879a9..0b8e171 100644
--- a/creole/parser/html_parser.py
+++ b/creole/parser/html_parser.py
@@ -95,6 +95,7 @@ class HtmlParser(HTMLParser):
         self.__list_level = 0
 
     def _pre_cut(self, data, type, placeholder):
+        # TODO: Check if we have a code block, e.g.: "<pre><code>...</code></pre>"
         if self.debugging:
             print(f"append blockdata: {data!r}")
         assert isinstance(data, str), "blockdata is not unicode"
@@ -125,27 +126,32 @@ class HtmlParser(HTMLParser):
 
         # data = match.group("data")
 
-    def feed(self, raw_data):
+    def feed(self, raw_data, preprocess=True) -> DocNode:
         assert isinstance(raw_data, str), "feed data must be unicode!"
         data = raw_data.strip()
-
-        # cut out <pre> and <tt> areas block tag areas
-        data = block_re.sub(self._pre_cut_out, data)
-        data = inline_re.sub(self._pre_cut_out, data)
-
-        # Delete whitespace from html code
-        data = strip_html(data)
-
-        if self.debugging:
+        if preprocess:
+            # cut out <pre> and <tt> areas block tag areas
+            data = block_re.sub(self._pre_cut_out, data)
+            data = inline_re.sub(self._pre_cut_out, data)
+
+            # Delete whitespace from html code
+            data = strip_html(data)
+
+            if self.debugging:
+                print("_" * 79)
+                print("raw data:")
+                print(repr(raw_data))
+                print(" -" * 40)
+                print("cleaned data:")
+                print(data)
+                print("-" * 79)
+                # print(data.replace(">", ">\n"))
+                # print("-"*79)
+        elif self.debugging:
             print("_" * 79)
-            print("raw data:")
-            print(repr(raw_data))
-            print(" -" * 40)
-            print("cleaned data:")
+            print("data:")
             print(data)
             print("-" * 79)
-#            print(clean_data.replace(">", ">\n"))
-#            print("-"*79)
 
         HTMLParser.feed(self, data)
 