Fix issues related to hr tags

Ensure that the start/end tag handler does not include tags in the previous paragraph. Provide special handling for tags like hr that never have content. Use sets for the block tag lists, as set membership tests are much faster than list membership tests. Fixes #1053.
author: Isaac Muse <faceless.shop@gmail.com> 2020-10-24 19:34:51 -0600
committer: GitHub <noreply@github.com> 2020-10-24 21:34:51 -0400
commit: 11c9e179390ba4e3fbc5ed35b9af16ea93f7d5ca (patch)
tree: c9a2e7d7c1a349127c9ef2fd9b865e33e2f4fb41
parent: 18b17e1bf5efa22ed06f09df14cc4c3ff8d7b5f8 (diff)
download: python-markdown-11c9e179390ba4e3fbc5ed35b9af16ea93f7d5ca.tar.gz
5 files changed, 271 insertions, 6 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index a47afec..fd9ba20 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -8,6 +8,7 @@ Under development: version 3.3.3 (a bug-fix release).
 * Unify all block-level tags (#1047).
 * Fix issue where some empty elements would have text rendered as `None` when using `md_in_html` (#1049).
 * Avoid catastrophic backtracking in `hr` regex (#1055).
+* Fix `hr` HTML handling (#1053).
 
 Oct 19, 2020: version 3.3.2 (a bug-fix release).
 
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index a2137c7..eb8902e 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -30,15 +30,19 @@ class HTMLExtractorExtra(HTMLExtractor):
 
     def __init__(self, md, *args, **kwargs):
         # All block-level tags.
-        self.block_level_tags = md.block_level_elements.copy()
+        self.block_level_tags = set(md.block_level_elements.copy())
         # Block-level tags in which the content only gets span level parsing
-        self.span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+        self.span_tags = set(
+            ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+        )
         # Block-level tags which never get their content parsed.
-        self.raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
+        self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])
         # Block-level tags in which the content gets parsed as blocks
-        self.block_tags = [tag for tag in self.block_level_tags if tag not in self.span_tags + self.raw_tags]
         super().__init__(md, *args, **kwargs)
 
+        self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
+        self.span_and_blocks_tags = self.block_tags | self.span_tags
+
     def reset(self):
         """Reset this instance.  Loses all unprocessed data."""
         self.mdstack = []  # When markdown=1, stack contains a list of tags
@@ -71,10 +75,10 @@ class HTMLExtractorExtra(HTMLExtractor):
             # Only use the parent state if it is more restrictive than the markdown attribute.
             md_attr = parent_state
         if ((md_attr == '1' and tag in self.block_tags) or
-                (md_attr == 'block' and tag in self.span_tags + self.block_tags)):
+                (md_attr == 'block' and tag in self.span_and_blocks_tags)):
             return 'block'
         elif ((md_attr == '1' and tag in self.span_tags) or
-              (md_attr == 'span' and tag in self.span_tags + self.block_tags)):
+              (md_attr == 'span' and tag in self.span_and_blocks_tags)):
             return 'span'
         elif tag in self.block_level_tags:
             return 'off'
@@ -90,6 +94,18 @@ class HTMLExtractorExtra(HTMLExtractor):
         return value
 
     def handle_starttag(self, tag, attrs):
+        # Handle tags that should always be empty and do not specify a closing tag
+        if tag in self.empty_tags:
+            attrs = {key: value if value is not None else key for key, value in attrs}
+            if "markdown" in attrs:
+                attrs.pop('markdown')
+                element = etree.Element(tag, attrs)
+                data = etree.tostring(element, encoding='unicode', method='html')
+            else:
+                data = self.get_starttag_text()
+            self.handle_empty_tag(data, True)
+            return
+
         if tag in self.block_level_tags:
             # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
             # Convert to `{'checked': 'checked'}`.
@@ -161,6 +177,19 @@ class HTMLExtractorExtra(HTMLExtractor):
                 else:
                     self.handle_data(text)
 
+    def handle_startendtag(self, tag, attrs):
+        if tag in self.empty_tags:
+            attrs = {key: value if value is not None else key for key, value in attrs}
+            if "markdown" in attrs:
+                attrs.pop('markdown')
+                element = etree.Element(tag, attrs)
+                data = etree.tostring(element, encoding='unicode', method='html')
+            else:
+                data = self.get_starttag_text()
+        else:
+            data = self.get_starttag_text()
+        self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
+
     def handle_data(self, data):
         if self.inraw or not self.mdstack:
             super().handle_data(data)
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index 6776d34..fee9cd5 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -56,6 +56,10 @@ class HTMLExtractor(htmlparser.HTMLParser):
     def __init__(self, md, *args, **kwargs):
         if 'convert_charrefs' not in kwargs:
             kwargs['convert_charrefs'] = False
+
+        # Block tags that should contain no content (self closing)
+        self.empty_tags = set(['hr'])
+
         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md
@@ -120,6 +124,11 @@ class HTMLExtractor(htmlparser.HTMLParser):
             return '</{}>'.format(tag)
 
     def handle_starttag(self, tag, attrs):
+        # Handle tags that should always be empty and do not specify a closing tag
+        if tag in self.empty_tags:
+            self.handle_startendtag(tag, attrs)
+            return
+
         if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
             # Started a new raw block. Prepare stack.
             self.inraw = True
@@ -183,6 +192,10 @@ class HTMLExtractor(htmlparser.HTMLParser):
             else:
                 # More content exists after tag.
                 self.intail = True
+            item = self.cleandoc[-1] if self.cleandoc else ''
+            # If we only have one newline before block element, add another
+            if not item.endswith('\n\n') and item.endswith('\n'):
+                self.cleandoc.append('\n')
             self.cleandoc.append(self.md.htmlStash.store(data))
             # Insert blank line between this and next line.
             self.cleandoc.append('\n\n')
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
index 3fea766..589f682 100644
--- a/tests/test_syntax/blocks/test_html_blocks.py
+++ b/tests/test_syntax/blocks/test_html_blocks.py
@@ -1402,3 +1402,102 @@ class TestHTMLBlocks(TestCase):
                 """
             )
         )
+
+    def test_hr_only_start(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_self_close(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr/>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr/>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_start_and_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr></hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p></hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_only_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_hr_with_content(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
+        # Content is not allowed and will be treated as normal content between two hr tags.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr>
+                **content**
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><strong>content</strong>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py
index 865007f..824917c 100644
--- a/tests/test_syntax/extensions/test_md_in_html.py
+++ b/tests/test_syntax/extensions/test_md_in_html.py
@@ -893,6 +893,129 @@ class TestMdInHTML(TestCase):
             )
         )
 
+    def test_md1_hr_only_start(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1">
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_self_close(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1" />
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_start_and_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1"></hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p></hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_only_end(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_md1_hr_with_content(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
+        # Content is not allowed and will be treated as normal content between two hr tags
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr markdown="1">
+                **content**
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><strong>content</strong>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
+    def test_no_md1_hr_with_content(self):
+        # Browsers ignore ending hr tags, so we don't try to do anything to handle them specially.
+        # Content is not allowed and will be treated as normal content between two hr tags
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                *emphasis1*
+                <hr>
+                **content**
+                </hr>
+                *emphasis2*
+                """
+            ),
+            self.dedent(
+                """
+                <p><em>emphasis1</em></p>
+                <hr>
+                <p><strong>content</strong>
+                </hr>
+                <em>emphasis2</em></p>
+                """
+            )
+        )
+
     def test_md1_nested_abbr_ref(self):
         self.assertMarkdownRenders(
             self.dedent(
author	Isaac Muse <faceless.shop@gmail.com>	2020-10-24 19:34:51 -0600
committer	GitHub <noreply@github.com>	2020-10-24 21:34:51 -0400
commit	11c9e179390ba4e3fbc5ed35b9af16ea93f7d5ca (patch)
tree	c9a2e7d7c1a349127c9ef2fd9b865e33e2f4fb41
parent	18b17e1bf5efa22ed06f09df14cc4c3ff8d7b5f8 (diff)
download	python-markdown-11c9e179390ba4e3fbc5ed35b9af16ea93f7d5ca.tar.gz