Properly parse code spans in md_in_html (#1069)

This reverts part of 2766698 and re-implements handling of tails in the same manner as the core. Also, ensure line_offset doesn't raise an error on bad input (see #1066) and properly handle script tags in code spans (same as in the core). Fixes #1068.
author: Waylan Limberg <waylan.limberg@icloud.com> 2020-11-18 13:33:20 -0500
committer: GitHub <noreply@github.com> 2020-11-18 13:33:20 -0500
commit: 81cc5b8bf1ad2a44b0a042d059caab3ed802ed33 (patch)
tree: 78eeb3ceaeb05a4518fe45156e864e6e2c7877cf
parent: 447da662b0c9548941a44a911e45c7cf6ad32861 (diff)
download: python-markdown-81cc5b8bf1ad2a44b0a042d059caab3ed802ed33.tar.gz
4 files changed, 92 insertions, 15 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index 632449a..bce97da 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -3,6 +3,10 @@ title: Change Log
 Python-Markdown Change Log
 =========================
 
+Under development: version 3.3.4 (a bug-fix release).
+
+* Properly parse code spans in md_in_html (#1069).
+
 Oct 25, 2020: version 3.3.3 (a bug-fix release).
 
 * Unify all block-level tags (#1047).
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index eb8902e..b8848ef 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -19,7 +19,7 @@ from ..blockprocessors import BlockProcessor
 from ..preprocessors import Preprocessor
 from ..postprocessors import RawHtmlPostprocessor
 from .. import util
-from ..htmlparser import HTMLExtractor
+from ..htmlparser import HTMLExtractor, blank_line_re
 import xml.etree.ElementTree as etree
 
 
@@ -85,17 +85,9 @@ class HTMLExtractorExtra(HTMLExtractor):
         else:  # pragma: no cover
             return None
 
-    def at_line_start(self):
-        """At line start."""
-
-        value = super().at_line_start()
-        if not value and self.cleandoc and self.cleandoc[-1].endswith('\n'):
-            value = True
-        return value
-
     def handle_starttag(self, tag, attrs):
         # Handle tags that should always be empty and do not specify a closing tag
-        if tag in self.empty_tags:
+        if tag in self.empty_tags and (self.at_line_start() or self.intail):
             attrs = {key: value if value is not None else key for key, value in attrs}
             if "markdown" in attrs:
                 attrs.pop('markdown')
@@ -106,13 +98,12 @@ class HTMLExtractorExtra(HTMLExtractor):
             self.handle_empty_tag(data, True)
             return
 
-        if tag in self.block_level_tags:
+        if tag in self.block_level_tags and (self.at_line_start() or self.intail):
             # Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
             # Convert to `{'checked': 'checked'}`.
             attrs = {key: value if value is not None else key for key, value in attrs}
             state = self.get_state(tag, attrs)
-
-            if self.inraw or (state in [None, 'off'] and not self.mdstack) or not self.at_line_start():
+            if self.inraw or (state in [None, 'off'] and not self.mdstack):
                 # fall back to default behavior
                 attrs.pop('markdown', None)
                 super().handle_starttag(tag, attrs)
@@ -134,6 +125,9 @@ class HTMLExtractorExtra(HTMLExtractor):
                     self.handle_data(self.md.htmlStash.store(text))
                 else:
                     self.handle_data(text)
+                if tag in self.CDATA_CONTENT_ELEMENTS:
+                    # This is presumably a standalone tag in a code span (see #1036).
+                    self.clear_cdata_mode()
 
     def handle_endtag(self, tag):
         if tag in self.block_level_tags:
@@ -159,6 +153,11 @@ class HTMLExtractorExtra(HTMLExtractor):
                     self.cleandoc.append(self.md.htmlStash.store(element))
                     self.cleandoc.append('\n\n')
                     self.state = []
+                    # Check if element has a tail
+                    if not blank_line_re.match(
+                            self.rawdata[self.line_offset + self.offset + len(self.get_endtag_text(tag)):]):
+                        # More content exists after endtag.
+                        self.intail = True
             else:
                 # Treat orphan closing tag as a span level tag.
                 text = self.get_endtag_text(tag)
@@ -191,6 +190,8 @@ class HTMLExtractorExtra(HTMLExtractor):
         self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))
 
     def handle_data(self, data):
+        if self.intail and '\n' in data:
+            self.intail = False
         if self.inraw or not self.mdstack:
             super().handle_data(data)
         else:
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index fee9cd5..269d954 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -91,8 +91,14 @@ class HTMLExtractor(htmlparser.HTMLParser):
     @property
     def line_offset(self):
         """Returns char index in self.rawdata for the start of the current line. """
-        if self.lineno > 1:
-            return re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata).end()
+        if self.lineno > 1 and '\n' in self.rawdata:
+            m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
+            if m:
+                return m.end()
+            else:  # pragma: no cover
+                # Value of self.lineno must exceed total number of lines.
+                # Find index of beginning of last line.
+                return self.rawdata.rfind('\n')
         return 0
 
     def at_line_start(self):
diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py
index 824917c..7786b80 100644
--- a/tests/test_syntax/extensions/test_md_in_html.py
+++ b/tests/test_syntax/extensions/test_md_in_html.py
@@ -126,6 +126,72 @@ class TestMdInHTML(TestCase):
             )
         )
 
+    def test_md1_code_span(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                <div markdown="1">
+                `<h1>code span</h1>`
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <div>
+                <p><code>&lt;h1&gt;code span&lt;/h1&gt;</code></p>
+                </div>
+                """
+            )
+        )
+
+    def test_md1_code_span_oneline(self):
+        self.assertMarkdownRenders(
+            '<div markdown="1">`<h1>code span</h1>`</div>',
+            self.dedent(
+                """
+                <div>
+                <p><code>&lt;h1&gt;code span&lt;/h1&gt;</code></p>
+                </div>
+                """
+            )
+        )
+
+    def test_md1_code_span_unclosed(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                <div markdown="1">
+                `<p>`
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <div>
+                <p><code>&lt;p&gt;</code></p>
+                </div>
+                """
+            )
+        )
+
+    def test_md1_code_span_script_tag(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                <div markdown="1">
+                `<script>`
+                </div>
+                """
+            ),
+            self.dedent(
+                """
+                <div>
+                <p><code>&lt;script&gt;</code></p>
+                </div>
+                """
+            )
+        )
+
     def test_md1_div_blank_lines(self):
         self.assertMarkdownRenders(
             self.dedent(
author	Waylan Limberg <waylan.limberg@icloud.com>	2020-11-18 13:33:20 -0500
committer	GitHub <noreply@github.com>	2020-11-18 13:33:20 -0500
commit	81cc5b8bf1ad2a44b0a042d059caab3ed802ed33 (patch)
tree	78eeb3ceaeb05a4518fe45156e864e6e2c7877cf
parent	447da662b0c9548941a44a911e45c7cf6ad32861 (diff)
download	python-markdown-81cc5b8bf1ad2a44b0a042d059caab3ed802ed33.tar.gz