summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWaylan Limberg <waylan.limberg@icloud.com>2020-10-12 14:17:03 -0400
committerGitHub <noreply@github.com>2020-10-12 14:17:03 -0400
commit5fdf7d47aa90a0983fa356b577a2ff5e16e68147 (patch)
treeb0bcd2b3cdb1e26b7fafc00c4f044cffebf93656
parente02ed390666930ce8640d4cebcac51059e9a34d8 (diff)
downloadpython-markdown-5fdf7d47aa90a0983fa356b577a2ff5e16e68147.tar.gz
Correctly parse raw `script` and `style` tags. (#1038)
* Ensure unclosed script tags are parsed correctly by providing a workaround for https://bugs.python.org/issue41989. * Avoid cdata_mode outside of HTML blocks, such as in inline code spans. Fixes #1036.
-rw-r--r--docs/change_log/index.md1
-rw-r--r--markdown/htmlparser.py70
-rw-r--r--tests/test_syntax/blocks/test_html_blocks.py85
3 files changed, 156 insertions, 0 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index 5548644..3b2eea5 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -5,6 +5,7 @@ Python-Markdown Change Log
Under development: version 3.3.1 (a bug-fix release).
+* Correctly parse raw `script` and `style` tags (#1036).
* Ensure consistent class handling by `fenced_code` and `codehilite` (#1032).
Oct 6, 2020: version 3.3 ([Notes](release-3.3.md)).
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index f83ddea..6776d34 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -72,6 +72,13 @@ class HTMLExtractor(htmlparser.HTMLParser):
def close(self):
"""Handle any buffered data."""
super().close()
+ if len(self.rawdata):
+ # Temp fix for https://bugs.python.org/issue41989
+ # TODO: remove this when the bug is fixed in all supported Python versions.
+ if self.convert_charrefs and not self.cdata_elem: # pragma: no cover
+ self.handle_data(htmlparser.unescape(self.rawdata))
+ else:
+ self.handle_data(self.rawdata)
# Handle any unclosed tags.
if len(self._cache):
self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
@@ -124,6 +131,9 @@ class HTMLExtractor(htmlparser.HTMLParser):
self._cache.append(text)
else:
self.cleandoc.append(text)
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ # This is presumably a standalone tag in a code span (see #1036).
+ self.clear_cdata_mode()
def handle_endtag(self, tag):
text = self.get_endtag_text(tag)
@@ -200,3 +210,63 @@ class HTMLExtractor(htmlparser.HTMLParser):
def unknown_decl(self, data):
end = ']]>' if data.startswith('CDATA[') else ']>'
self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)
+
+ # The rest has been copied from base class in standard lib to address #1036.
+ # As __startag_text is private, all references to it must be in this subclass.
+ # The last few lines of parse_starttag are reversed so that handle_starttag
+ # can override cdata_mode in certain situations (in a code span).
+ __starttag_text = None
+
+ def get_starttag_text(self):
+ """Return full source of start tag: '<...>'."""
+ return self.__starttag_text
+
+ def parse_starttag(self, i): # pragma: no cover
+ self.__starttag_text = None
+ endpos = self.check_for_whole_start_tag(i)
+ if endpos < 0:
+ return endpos
+ rawdata = self.rawdata
+ self.__starttag_text = rawdata[i:endpos]
+
+ # Now parse the data between i+1 and j into a tag and attrs
+ attrs = []
+ match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
+ assert match, 'unexpected call to parse_starttag()'
+ k = match.end()
+ self.lasttag = tag = match.group(1).lower()
+ while k < endpos:
+ m = htmlparser.attrfind_tolerant.match(rawdata, k)
+ if not m:
+ break
+ attrname, rest, attrvalue = m.group(1, 2, 3)
+ if not rest:
+ attrvalue = None
+ elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
+ attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127
+ attrvalue = attrvalue[1:-1]
+ if attrvalue:
+ attrvalue = htmlparser.unescape(attrvalue)
+ attrs.append((attrname.lower(), attrvalue))
+ k = m.end()
+
+ end = rawdata[k:endpos].strip()
+ if end not in (">", "/>"):
+ lineno, offset = self.getpos()
+ if "\n" in self.__starttag_text:
+ lineno = lineno + self.__starttag_text.count("\n")
+ offset = len(self.__starttag_text) \
+ - self.__starttag_text.rfind("\n") # noqa: E127
+ else:
+ offset = offset + len(self.__starttag_text)
+ self.handle_data(rawdata[i:endpos])
+ return endpos
+ if end.endswith('/>'):
+ # XHTML-style empty tag: <span attr="value" />
+ self.handle_startendtag(tag, attrs)
+ else:
+ # *** set cdata_mode first so we can override it in handle_starttag (see #1036) ***
+ if tag in self.CDATA_CONTENT_ELEMENTS:
+ self.set_cdata_mode(tag)
+ self.handle_starttag(tag, attrs)
+ return endpos
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
index 0a2092d..3fea766 100644
--- a/tests/test_syntax/blocks/test_html_blocks.py
+++ b/tests/test_syntax/blocks/test_html_blocks.py
@@ -1317,3 +1317,88 @@ class TestHTMLBlocks(TestCase):
"""
)
)
+
+ def test_script_tags(self):
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ <script>
+ *random stuff* <div> &amp;
+ </script>
+
+ <style>
+ **more stuff**
+ </style>
+ """
+ ),
+ self.dedent(
+ """
+ <script>
+ *random stuff* <div> &amp;
+ </script>
+
+ <style>
+ **more stuff**
+ </style>
+ """
+ )
+ )
+
+ def test_unclosed_script_tag(self):
+ # Ensure we have a working fix for https://bugs.python.org/issue41989
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ <script>
+ *random stuff* <div> &amp;
+
+ Still part of the *script* tag
+ """
+ ),
+ self.dedent(
+ """
+ <script>
+ *random stuff* <div> &amp;
+
+ Still part of the *script* tag
+ """
+ )
+ )
+
+ def test_inline_script_tags(self):
+ # Ensure inline script tags doesn't cause the parser to eat content (see #1036).
+ self.assertMarkdownRenders(
+ self.dedent(
+ """
+ Text `<script>` more *text*.
+
+ <div>
+ *foo*
+ </div>
+
+ <div>
+
+ bar
+
+ </div>
+
+ A new paragraph with a closing `</script>` tag.
+ """
+ ),
+ self.dedent(
+ """
+ <p>Text <code>&lt;script&gt;</code> more <em>text</em>.</p>
+ <div>
+ *foo*
+ </div>
+
+ <div>
+
+ bar
+
+ </div>
+
+ <p>A new paragraph with a closing <code>&lt;/script&gt;</code> tag.</p>
+ """
+ )
+ )