summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Waylan Limberg <waylan.limberg@icloud.com> 2020-11-23 13:11:21 -0500
committer: GitHub <noreply@github.com> 2020-11-23 13:11:21 -0500
commit: 1279074ea97807c0131a2b82893189bc07bf2dd8 (patch)
tree: f4087381979070955d9c441390ee0e123c4f39e5
parent: 82ac9056350e67411cdb1da34363950b1e18a271 (diff)
download: python-markdown-1279074ea97807c0131a2b82893189bc07bf2dd8.tar.gz
Properly parse unclosed tags in code spans
* fix unclosed pi in code span
* fix unclosed dec in code span
* fix unclosed tag in code span

Closes #1066.
-rw-r--r-- docs/change_log/index.md | 1
-rw-r--r-- markdown/extensions/md_in_html.py | 20
-rw-r--r-- markdown/htmlparser.py | 32
-rw-r--r-- tests/test_syntax/blocks/test_html_blocks.py | 105
4 files changed, 158 insertions, 0 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index 7edb2b9..0069c22 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -5,6 +5,7 @@ Python-Markdown Change Log
Under development: version 3.3.4 (a bug-fix release).
+* Properly parse unclosed tags in code spans (#1066).
* Properly parse processing instructions in md_in_html (#1070).
* Properly parse code spans in md_in_html (#1069).
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index 6d2a0e7..86cf00d 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -206,6 +206,26 @@ class HTMLExtractorExtra(HTMLExtractor):
else:
self.handle_data(self.md.htmlStash.store(data))
+ def parse_pi(self, i):
+ if self.at_line_start() or self.intail or self.mdstack:
+ # The same override exists in HTMLExtractor without the check
+ # for mdstack. Therefore, use HTMLExtractor's parent instead.
+ return super(HTMLExtractor, self).parse_pi(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<?')
+ return i + 2
+
+ def parse_html_declaration(self, i):
+ if self.at_line_start() or self.intail or self.mdstack:
+ # The same override exists in HTMLExtractor without the check
+ # for mdstack. Therefore, use HTMLExtractor's parent instead.
+ return super(HTMLExtractor, self).parse_html_declaration(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<!')
+ return i + 2
+
class HtmlBlockPreprocessor(Preprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index 269d954..c08856a 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -39,6 +39,22 @@ htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
+# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value.
+htmlparser.locatestarttagend_tolerant = re.compile(r"""
+ <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
+ (?:[\s/]* # optional whitespace before attribute name
+ (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
+ (?:\s*=+\s* # value indicator
+ (?:'[^']*' # LITA-enclosed value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^`>\s]* # bare value <= added backtick here
+ )
+ (?:\s*,)* # possibly followed by a comma
+ )?(?:\s|/(?!>))*
+ )*
+ )?
+ \s* # trailing whitespace
+""", re.VERBOSE)
# Match a blank line at the start of a block of text (two newlines).
# The newlines may be preceded by additional whitespace.
@@ -230,6 +246,22 @@ class HTMLExtractor(htmlparser.HTMLParser):
end = ']]>' if data.startswith('CDATA[') else ']>'
self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
+ def parse_pi(self, i):
+ if self.at_line_start() or self.intail:
+ return super().parse_pi(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<?')
+ return i + 2
+
+ def parse_html_declaration(self, i):
+ if self.at_line_start() or self.intail:
+ return super().parse_html_declaration(i)
+ # This is not the beginning of a raw block so treat as plain data
+ # and avoid consuming any tags which may follow (see #1066).
+ self.handle_data('<!')
+ return i + 2
+
# The rest has been copied from base class in standard lib to address #1036.
# As __startag_text is private, all references to it must be in this subclass.
# The last few lines of parse_starttag are reversed so that handle_starttag
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
index 589f682..0fdb3e5 100644
--- a/tests/test_syntax/blocks/test_html_blocks.py
+++ b/tests/test_syntax/blocks/test_html_blocks.py
@@ -663,6 +663,48 @@ class TestHTMLBlocks(TestCase):
'<p>&lt;foo</p>'
)
+ def test_raw_unclosed_tag_in_code_span(self):
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ `<div`.
+
+ <div>
+ hello
+ </div>
+ """
+ ),
+ self.dedent(
+ """
+ <p><code>&lt;div</code>.</p>
+ <div>
+ hello
+ </div>
+ """
+ )
+ )
+
+ def test_raw_unclosed_tag_in_code_span_space(self):
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ ` <div `.
+
+ <div>
+ hello
+ </div>
+ """
+ ),
+ self.dedent(
+ """
+ <p><code>&lt;div</code>.</p>
+ <div>
+ hello
+ </div>
+ """
+ )
+ )
+
def test_raw_attributes(self):
self.assertMarkdownRenders(
'<p id="foo", class="bar baz", style="margin: 15px; line-height: 1.5; text-align: center;">text</p>',
@@ -1073,6 +1115,27 @@ class TestHTMLBlocks(TestCase):
)
)
+ def test_raw_processing_instruction_code_span(self):
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ `<?php`
+
+ <div>
+ foo
+ </div>
+ """
+ ),
+ self.dedent(
+ """
+ <p><code>&lt;?php</code></p>
+ <div>
+ foo
+ </div>
+ """
+ )
+ )
+
def test_raw_declaration_one_line(self):
self.assertMarkdownRenders(
'<!DOCTYPE html>',
@@ -1110,6 +1173,27 @@ class TestHTMLBlocks(TestCase):
)
)
+ def test_raw_declaration_code_span(self):
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ `<!`
+
+ <div>
+ foo
+ </div>
+ """
+ ),
+ self.dedent(
+ """
+ <p><code>&lt;!</code></p>
+ <div>
+ foo
+ </div>
+ """
+ )
+ )
+
def test_raw_cdata_one_line(self):
self.assertMarkdownRenders(
'<![CDATA[ document.write(">"); ]]>',
@@ -1190,6 +1274,27 @@ class TestHTMLBlocks(TestCase):
)
)
+ def test_raw_cdata_code_span(self):
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ `<![`
+
+ <div>
+ foo
+ </div>
+ """
+ ),
+ self.dedent(
+ """
+ <p><code>&lt;![</code></p>
+ <div>
+ foo
+ </div>
+ """
+ )
+ )
+
def test_charref(self):
self.assertMarkdownRenders(
'&sect;',