diff options
author | Waylan Limberg <waylan.limberg@icloud.com> | 2020-11-23 13:11:21 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-11-23 13:11:21 -0500 |
commit | 1279074ea97807c0131a2b82893189bc07bf2dd8 (patch) | |
tree | f4087381979070955d9c441390ee0e123c4f39e5 | |
parent | 82ac9056350e67411cdb1da34363950b1e18a271 (diff) | |
download | python-markdown-1279074ea97807c0131a2b82893189bc07bf2dd8.tar.gz |
Properly parse unclosed tags in code spans
* fix unclosed processing instruction (`<?`) in code span
* fix unclosed declaration (`<!`) in code span
* fix unclosed tag in code span
Closes #1066.
-rw-r--r-- | docs/change_log/index.md | 1 | ||||
-rw-r--r-- | markdown/extensions/md_in_html.py | 20 | ||||
-rw-r--r-- | markdown/htmlparser.py | 32 | ||||
-rw-r--r-- | tests/test_syntax/blocks/test_html_blocks.py | 105 |
4 files changed, 158 insertions, 0 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md index 7edb2b9..0069c22 100644 --- a/docs/change_log/index.md +++ b/docs/change_log/index.md @@ -5,6 +5,7 @@ Python-Markdown Change Log Under development: version 3.3.4 (a bug-fix release). +* Properly parse unclosed tags in code spans (#1066). * Properly parse processing instructions in md_in_html (#1070). * Properly parse code spans in md_in_html (#1069). diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py index 6d2a0e7..86cf00d 100644 --- a/markdown/extensions/md_in_html.py +++ b/markdown/extensions/md_in_html.py @@ -206,6 +206,26 @@ class HTMLExtractorExtra(HTMLExtractor): else: self.handle_data(self.md.htmlStash.store(data)) + def parse_pi(self, i): + if self.at_line_start() or self.intail or self.mdstack: + # The same override exists in HTMLExtractor without the check + # for mdstack. Therefore, use HTMLExtractor's parent instead. + return super(HTMLExtractor, self).parse_pi(i) + # This is not the beginning of a raw block so treat as plain data + # and avoid consuming any tags which may follow (see #1066). + self.handle_data('<?') + return i + 2 + + def parse_html_declaration(self, i): + if self.at_line_start() or self.intail or self.mdstack: + # The same override exists in HTMLExtractor without the check + # for mdstack. Therefore, use HTMLExtractor's parent instead. + return super(HTMLExtractor, self).parse_html_declaration(i) + # This is not the beginning of a raw block so treat as plain data + # and avoid consuming any tags which may follow (see #1066). 
+ self.handle_data('<!') + return i + 2 + class HtmlBlockPreprocessor(Preprocessor): """Remove html blocks from the text and store them for later retrieval.""" diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py index 269d954..c08856a 100644 --- a/markdown/htmlparser.py +++ b/markdown/htmlparser.py @@ -39,6 +39,22 @@ htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);') # so the 'incomplete' functionality is unnecessary. As the entityref regex is run right before incomplete, # and the two regex are the same, then incomplete will simply never match and we avoid the logic within. htmlparser.incomplete = htmlparser.entityref +# Monkeypatch HTMLParser to not accept a backtick in a tag name, attribute name, or bare value. +htmlparser.locatestarttagend_tolerant = re.compile(r""" + <[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here + (?:[\s/]* # optional whitespace before attribute name + (?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here + (?:\s*=+\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^`>\s]* # bare value <= added backtick here + ) + (?:\s*,)* # possibly followed by a comma + )?(?:\s|/(?!>))* + )* + )? + \s* # trailing whitespace +""", re.VERBOSE) # Match a blank line at the start of a block of text (two newlines). # The newlines may be preceded by additional whitespace. @@ -230,6 +246,22 @@ class HTMLExtractor(htmlparser.HTMLParser): end = ']]>' if data.startswith('CDATA[') else ']>' self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True) + def parse_pi(self, i): + if self.at_line_start() or self.intail: + return super().parse_pi(i) + # This is not the beginning of a raw block so treat as plain data + # and avoid consuming any tags which may follow (see #1066). 
+ self.handle_data('<?') + return i + 2 + + def parse_html_declaration(self, i): + if self.at_line_start() or self.intail: + return super().parse_html_declaration(i) + # This is not the beginning of a raw block so treat as plain data + # and avoid consuming any tags which may follow (see #1066). + self.handle_data('<!') + return i + 2 + # The rest has been copied from base class in standard lib to address #1036. # As __startag_text is private, all references to it must be in this subclass. # The last few lines of parse_starttag are reversed so that handle_starttag diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py index 589f682..0fdb3e5 100644 --- a/tests/test_syntax/blocks/test_html_blocks.py +++ b/tests/test_syntax/blocks/test_html_blocks.py @@ -663,6 +663,48 @@ class TestHTMLBlocks(TestCase): '<p>&lt;foo</p>' ) + def test_raw_unclosed_tag_in_code_span(self): + self.assertMarkdownRenders( + self.dedent( + """ + `<div`. + + <div> + hello + </div> + """ + ), + self.dedent( + """ + <p><code>&lt;div</code>.</p> + <div> + hello + </div> + """ + ) + ) + + def test_raw_unclosed_tag_in_code_span_space(self): + self.assertMarkdownRenders( + self.dedent( + """ + ` <div `.
+ + <div> + hello + </div> + """ + ), + self.dedent( + """ + <p><code>&lt;div</code>.</p> + <div> + hello + </div> + """ + ) + ) + def test_raw_attributes(self): self.assertMarkdownRenders( '<p id="foo", class="bar baz", style="margin: 15px; line-height: 1.5; text-align: center;">text</p>', @@ -1073,6 +1115,27 @@ class TestHTMLBlocks(TestCase): ) ) + def test_raw_processing_instruction_code_span(self): + self.assertMarkdownRenders( + self.dedent( + """ + `<?php` + + <div> + foo + </div> + """ + ), + self.dedent( + """ + <p><code>&lt;?php</code></p> + <div> + foo + </div> + """ + ) + ) + def test_raw_declaration_one_line(self): self.assertMarkdownRenders( '<!DOCTYPE html>', @@ -1110,6 +1173,27 @@ class TestHTMLBlocks(TestCase): ) ) + def test_raw_declaration_code_span(self): + self.assertMarkdownRenders( + self.dedent( + """ + `<!` + + <div> + foo + </div> + """ + ), + self.dedent( + """ + <p><code>&lt;!</code></p> + <div> + foo + </div> + """ + ) + ) + def test_raw_cdata_one_line(self): self.assertMarkdownRenders( '<![CDATA[ document.write(">"); ]]>', @@ -1190,6 +1274,27 @@ class TestHTMLBlocks(TestCase): ) ) + def test_raw_cdata_code_span(self): + self.assertMarkdownRenders( + self.dedent( + """ + `<![` + + <div> + foo + </div> + """ + ), + self.dedent( + """ + <p><code>&lt;![</code></p> + <div> + foo + </div> + """ + ) + ) + def test_charref(self): self.assertMarkdownRenders( '&sect;', |