summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Waylan Limberg <waylan.limberg@icloud.com> 2020-10-20 14:06:48 -0400
committer: GitHub <noreply@github.com> 2020-10-20 14:06:48 -0400
commit: 6b6cd8bc2f0a870ed309f8b8036492af535e75a1 (patch)
tree: 4641036a9df302c211f51a07971c4b483b777b8d
parent: 56b03b21f50d2b28b7ab87df7d8015e1f1b62184 (diff)
download: python-markdown-6b6cd8bc2f0a870ed309f8b8036492af535e75a1.tar.gz
Unify all block-level tags. (#1048)
The md_in_html extension now uses the list of block-level tags defined in the core. This ensures that the lists do not diverge and allows users and/or extensions to expand the list in the core and have that change affect the extension. Fixes #1047.
-rw-r--r-- docs/change_log/index.md | 4
-rw-r--r-- docs/extensions/md_in_html.md | 8
-rw-r--r-- markdown/core.py | 9
-rw-r--r-- markdown/extensions/md_in_html.py | 43
-rw-r--r-- markdown/util.py | 9
5 files changed, 37 insertions, 36 deletions
diff --git a/docs/change_log/index.md b/docs/change_log/index.md
index 47e8f9e..994e9a2 100644
--- a/docs/change_log/index.md
+++ b/docs/change_log/index.md
@@ -3,6 +3,10 @@ title: Change Log
Python-Markdown Change Log
=========================
+Under development: version 3.3.3 (a bug-fix release).
+
+* Unify all block-level tags (#1047).
+
Oct 19, 2020: version 3.3.2 (a bug-fix release).
* Properly parse inline HTML in md_in_html (#1040 & #1045).
diff --git a/docs/extensions/md_in_html.md b/docs/extensions/md_in_html.md
index ba4424b..978f5c3 100644
--- a/docs/extensions/md_in_html.md
+++ b/docs/extensions/md_in_html.md
@@ -25,10 +25,10 @@ The `markdown` attribute can be assigned one of three values: [`"1"`](#1), [`"bl
When the `markdown` attribute is set to `"1"`, then the parser will use the default behavior for that specific tag.
-The following tags have the `block` behavior by default: `address`, `article`, `aside`, `blockquote`, `body`,
-`colgroup`, `details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `iframe`, `header`, `hr`,
-`main`, `menu`, `nav`, `map`, `noscript`, `object`, `ol`, `section`, `table`, `tbody`, `thead`, `tfoot`, `tr`, and
-`ul`.
+The following tags have the `block` behavior by default: `article`, `aside`, `blockquote`, `body`, `colgroup`,
+`details`, `div`, `dl`, `fieldset`, `figcaption`, `figure`, `footer`, `form`, `group`, `header`, `hgroup`, `hr`,
+`iframe`, `main`, `map`, `menu`, `nav`, `noscript`, `object`, `ol`, `output`, `progress`, `section`, `table`,
+`tbody`, `tfoot`, `thead`, `tr`, `ul` and `video`.
For example, the following:
diff --git a/markdown/core.py b/markdown/core.py
index 79ca3f3..2f7f2d5 100644
--- a/markdown/core.py
+++ b/markdown/core.py
@@ -77,11 +77,12 @@ class Markdown:
# See https://w3c.github.io/html/grouping-content.html#the-p-element
'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
- 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre',
- 'section', 'table', 'ul',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
# Other elements which Markdown should not be mucking up the contents of.
- 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output',
- 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video'
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
]
self.registeredExtensions = []
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
index f635563..489c3fe 100644
--- a/markdown/extensions/md_in_html.py
+++ b/markdown/extensions/md_in_html.py
@@ -23,27 +23,22 @@ from ..htmlparser import HTMLExtractor
import xml.etree.ElementTree as etree
-# Block-level tags in which the content only gets span level parsing
-span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
-
-# Block-level tags in which the content gets parsed as blocks
-block_tags = [
- 'address', 'article', 'aside', 'blockquote', 'body', 'colgroup', 'details', 'div', 'dl', 'fieldset',
- 'figcaption', 'figure', 'footer', 'form', 'iframe', 'header', 'hr', 'main', 'menu', 'nav', 'map',
- 'noscript', 'object', 'ol', 'section', 'table', 'tbody', 'thead', 'tfoot', 'tr', 'ul'
-]
-
-# Block-level tags which never get their content parsed.
-raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
-
-block_level_tags = span_tags + block_tags + raw_tags
-
-
class HTMLExtractorExtra(HTMLExtractor):
"""
Override HTMLExtractor and create etree Elements for any elements which should have content parsed as Markdown.
"""
+ def __init__(self, md, *args, **kwargs):
+ # All block-level tags.
+ self.block_level_tags = md.block_level_elements.copy()
+ # Block-level tags in which the content only gets span level parsing
+ self.span_tags = ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'td', 'th']
+ # Block-level tags which never get their content parsed.
+ self.raw_tags = ['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea']
+ # Block-level tags in which the content gets parsed as blocks
+ self.block_tags = [tag for tag in self.block_level_tags if tag not in self.span_tags + self.raw_tags]
+ super().__init__(md, *args, **kwargs)
+
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.mdstack = [] # When markdown=1, stack contains a list of tags
@@ -75,13 +70,13 @@ class HTMLExtractorExtra(HTMLExtractor):
if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
# Only use the parent state if it is more restrictive than the markdown attribute.
md_attr = parent_state
- if ((md_attr == '1' and tag in block_tags) or
- (md_attr == 'block' and tag in span_tags + block_tags)):
+ if ((md_attr == '1' and tag in self.block_tags) or
+ (md_attr == 'block' and tag in self.span_tags + self.block_tags)):
return 'block'
- elif ((md_attr == '1' and tag in span_tags) or
- (md_attr == 'span' and tag in span_tags + block_tags)):
+ elif ((md_attr == '1' and tag in self.span_tags) or
+ (md_attr == 'span' and tag in self.span_tags + self.block_tags)):
return 'span'
- elif tag in block_level_tags:
+ elif tag in self.block_level_tags:
return 'off'
else: # pragma: no cover
return None
@@ -95,7 +90,7 @@ class HTMLExtractorExtra(HTMLExtractor):
return value
def handle_starttag(self, tag, attrs):
- if tag in block_level_tags:
+ if tag in self.block_level_tags:
# Valueless attr (ex: `<tag checked>`) results in `[('checked', None)]`.
# Convert to `{'checked': 'checked'}`.
attrs = {key: value if value is not None else key for key, value in attrs}
@@ -106,7 +101,7 @@ class HTMLExtractorExtra(HTMLExtractor):
attrs.pop('markdown', None)
super().handle_starttag(tag, attrs)
else:
- if 'p' in self.mdstack and tag in block_level_tags:
+ if 'p' in self.mdstack and tag in self.block_level_tags:
# Close unclosed 'p' tag
self.handle_endtag('p')
self.mdstate.append(state)
@@ -125,7 +120,7 @@ class HTMLExtractorExtra(HTMLExtractor):
self.handle_data(text)
def handle_endtag(self, tag):
- if tag in block_level_tags:
+ if tag in self.block_level_tags:
if self.inraw:
super().handle_endtag(tag)
elif tag in self.mdstack:
diff --git a/markdown/util.py b/markdown/util.py
index a49486b..2cb2317 100644
--- a/markdown/util.py
+++ b/markdown/util.py
@@ -58,11 +58,12 @@ BLOCK_LEVEL_ELEMENTS = [
# See https://w3c.github.io/html/grouping-content.html#the-p-element
'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
- 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre',
- 'section', 'table', 'ul',
+ 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
+ 'p', 'pre', 'section', 'table', 'ul',
# Other elements which Markdown should not be mucking up the contents of.
- 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output',
- 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video'
+ 'canvas', 'colgroup', 'dd', 'body', 'dt', 'group', 'iframe', 'li', 'legend',
+ 'math', 'map', 'noscript', 'output', 'object', 'option', 'progress', 'script',
+ 'style', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'video'
]
# Placeholders