summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2018-07-15 16:23:19 -0400
committer Leonard Richardson <leonardr@segfault.org> 2018-07-15 16:23:19 -0400
commit db0ef1662efba41a111861d652a248385f7baac9 (patch)
tree 7c839b4a67f56c8c569da5184c57a8d8363afde6
parent d62f4adb3831ab0a8535f97fa304c87e0d871177 (diff)
download beautifulsoup4-db0ef1662efba41a111861d652a248385f7baac9.tar.gz
It's possible for a TreeBuilder subclass to specify that void
elements should be represented as <element> rather than <element/>, by setting TreeBuilder.void_element_close_prefix to the empty string. [bug=1716272]
-rw-r--r-- NEWS.txt 5
-rw-r--r-- bs4/builder/__init__.py 25
-rw-r--r-- bs4/element.py 6
3 files changed, 31 insertions, 5 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 4788489..e22e88f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -14,6 +14,11 @@
* Stopped HTMLParser from raising an exception in very rare cases of
bad markup. [bug=1708831]
+* It's possible for a TreeBuilder subclass to specify that void
+ elements should be represented as "<element>" rather than
+ "<element/>", by setting TreeBuilder.void_element_close_prefix to
+ the empty string. [bug=1716272]
+
= 4.6.0 (20170507) =
* Added the `Tag.get_attribute_list` method, which acts like `Tag.get` for
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index fdb3362..d7166bb 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -93,7 +93,15 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
-
+ void_tags = None # There are no void tags.
+
+ # This string goes just before the end of the start tag for a
+ # void element.
+ #
+ # Leave this alone and you'll get tags like "<br/>". Change it to the
+ # empty string and you'll get tags like "<br>".
+ void_element_close_prefix = '/'
+
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
@@ -126,6 +134,17 @@ class TreeBuilder(object):
return True
return tag_name in self.empty_element_tags
+ def is_void(self, tag_name):
+ """Must a tag with this name be a void tag?
+
+ A void tag cannot have contents and is presented with neither
+ a closing tag nor a closing slash, e.g.:
+ <link href="foo">
+ """
+ if self.void_tags is None:
+ return False
+ return tag_name in self.void_tags
+
def feed(self, markup):
raise NotImplementedError()
@@ -235,11 +254,11 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
-
+
# These are from HTML4, removed in HTML5.
'spacer', 'frame'
])
-
+
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
diff --git a/bs4/element.py b/bs4/element.py
index 5ee9887..181f135 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -871,9 +871,11 @@ class Tag(PageElement):
if builder is not None:
builder.set_up_substitutions(self)
self.can_be_empty_element = builder.can_be_empty_element(name)
+ self.void_element_close_prefix = builder.void_element_close_prefix or ""
else:
self.can_be_empty_element = False
-
+ self.void_element_close_prefix = '/'
+
parserClass = _alias("parser_class") # BS3
def __copy__(self):
@@ -1173,7 +1175,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
- close = '/'
+ close = self.void_element_close_prefix
else:
closeTag = '</%s%s>' % (prefix, self.name)