summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2018-12-23 17:19:27 -0500
committerLeonard Richardson <leonardr@segfault.org>2018-12-23 17:19:27 -0500
commit03090b2f1e0ec5809d2371175291040f29805a59 (patch)
tree61e77323e8ae60c80d1dbbe34c6d83842c213800
parent82bcf5550438c10a1084f46052c84f858220ae1e (diff)
parent49356a1f4d1ef2f56efdc791869706a01ea36bd1 (diff)
downloadbeautifulsoup4-03090b2f1e0ec5809d2371175291040f29805a59.tar.gz
Merged in next_previous_fixes from Isaac Muse. [bug=1782928,1798699]
-rw-r--r--NEWS.txt11
-rw-r--r--bs4/__init__.py33
-rw-r--r--bs4/builder/_html5lib.py10
-rw-r--r--bs4/element.py14
-rw-r--r--bs4/tests/test_html5lib.py40
5 files changed, 86 insertions, 22 deletions
diff --git a/NEWS.txt b/NEWS.txt
index bd98b13..6e44504 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,14 @@
+= 4.7.0 (Unreleased)
+
+* Fix a number of problems with the tree builder that caused
+ trees that were superficially okay, but which fell apart when bits
+ were extracted. Patch by Isaac Muse. [bug=1782928]
+
+* Fixed a problem with the tree builder in which elements that
+ contained no content (such as empty comments and all-whitespace
+ elements) were not being treated as part of the tree. Patch by Isaac
+ Muse. [bug=1798699]
+
= 4.6.3 (20180812)
* Exactly the same as 4.6.2. Re-released to make the README file
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 470177f..087ee69 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -382,7 +382,7 @@ class BeautifulSoup(Tag):
def pushTag(self, tag):
#print "Push", tag.name
- if self.currentTag:
+ if self.currentTag is not None:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
@@ -421,15 +421,19 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
- parent = parent or self.currentTag
- previous_element = most_recent_element or self._most_recent_element
+ if parent is None:
+ parent = self.currentTag
+ if most_recent_element is not None:
+ previous_element = most_recent_element
+ else:
+ previous_element = self._most_recent_element
next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag):
next_element = o.next_element
next_sibling = o.next_sibling
previous_sibling = o.previous_sibling
- if not previous_element:
+ if previous_element is None:
previous_element = o.previous_element
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
@@ -437,7 +441,7 @@ class BeautifulSoup(Tag):
self._most_recent_element = o
parent.contents.append(o)
- if parent.next_sibling:
+ if parent.next_sibling is not None:
# This node is being inserted into an element that has
# already been parsed. Deal with any dangling references.
index = len(parent.contents)-1
@@ -457,6 +461,15 @@ class BeautifulSoup(Tag):
previous_sibling = None
else:
previous_element = previous_sibling = parent.contents[index-1]
+ previous = previous_element
+ while isinstance(previous, Tag):
+ if previous.contents:
+ previous.next_element = previous.contents[0]
+ previous = previous.contents[-1]
+ else:
+ break
+ previous_element = previous
+
if index == len(parent.contents)-1:
next_element = parent.next_sibling
next_sibling = None
@@ -464,16 +477,16 @@ class BeautifulSoup(Tag):
next_element = next_sibling = parent.contents[index+1]
o.previous_element = previous_element
- if previous_element:
+ if previous_element is not None:
previous_element.next_element = o
o.next_element = next_element
- if next_element:
+ if next_element is not None:
next_element.previous_element = o
o.next_sibling = next_sibling
- if next_sibling:
+ if next_sibling is not None:
next_sibling.previous_sibling = o
o.previous_sibling = previous_sibling
- if previous_sibling:
+ if previous_sibling is not None:
previous_sibling.next_sibling = o
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
@@ -520,7 +533,7 @@ class BeautifulSoup(Tag):
self.currentTag, self._most_recent_element)
if tag is None:
return tag
- if self._most_recent_element:
+ if self._most_recent_element is not None:
self._most_recent_element.next_element = tag
self._most_recent_element = tag
self.pushTag(tag)
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 5f54893..388c3d6 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -249,7 +249,7 @@ class Element(treebuilder_base.Node):
if not isinstance(child, basestring) and child.parent is not None:
node.element.extract()
- if (string_child and self.element.contents
+ if (string_child is not None and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
@@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
- if new_parents_last_descendant:
+ if new_parents_last_descendant is not None:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
- if new_parents_last_child:
+ if new_parents_last_child is not None:
new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the
@@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
- if new_parents_last_descendant_next_element:
+ if new_parents_last_descendant_next_element is not None:
# TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's
# just the other side of the previous line.
diff --git a/bs4/element.py b/bs4/element.py
index 886eb91..8059c43 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -256,26 +256,26 @@ class PageElement(object):
self.previous_element.next_element = self
self.next_element = next_element
- if self.next_element:
+ if self.next_element is not None:
self.next_element.previous_element = self
self.next_sibling = next_sibling
- if self.next_sibling:
+ if self.next_sibling is not None:
self.next_sibling.previous_sibling = self
- if (not previous_sibling
+ if (previous_sibling is None
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
- if previous_sibling:
+ if previous_sibling is not None:
self.previous_sibling.next_sibling = self
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
- if not self.parent:
+ if self.parent is None:
raise ValueError(
"Cannot replace one element with another when the"
"element to be replaced is not part of a tree.")
@@ -292,7 +292,7 @@ class PageElement(object):
def unwrap(self):
my_parent = self.parent
- if not self.parent:
+ if self.parent is None:
raise ValueError(
"Cannot replace an element with its contents when that"
"element is not part of a tree.")
@@ -340,7 +340,7 @@ class PageElement(object):
def _last_descendant(self, is_initialized=True, accept_self=True):
"Finds the last element beneath this object to be parsed."
- if is_initialized and self.next_sibling:
+ if is_initialized and self.next_sibling is not None:
last_child = self.next_sibling.previous_element
else:
last_child = self
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 0f89d62..3a04787 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -128,3 +128,43 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
markup = b"""<table><td></tbody>A"""
soup = self.soup(markup)
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
+
+ def test_extraction(self):
+ """
+ Test that extraction does not destroy the tree.
+
+ https://bugs.launchpad.net/beautifulsoup/+bug/1782928
+ """
+
+ markup = """
+<html><head></head>
+<style>
+</style><script></script><body><p>hello</p></body></html>
+"""
+ soup = self.soup(markup)
+ [s.extract() for s in soup('script')]
+ [s.extract() for s in soup('style')]
+
+ self.assertEqual(len(soup.find_all("p")), 1)
+
+ def test_empty_comment(self):
+ """
+ Test that empty comment does not break structure.
+
+ https://bugs.launchpad.net/beautifulsoup/+bug/1806598
+ """
+
+ markup = """
+<html>
+<body>
+<form>
+<!----><input type="text">
+</form>
+</body>
+</html>
+"""
+ soup = self.soup(markup)
+ inputs = []
+ for form in soup.find_all('form'):
+ inputs.extend(form.find_all('input'))
+ self.assertEqual(len(inputs), 1)