summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIsaac Muse <isaacmuse@gmail.com>2019-01-05 12:24:35 -0700
committerIsaac Muse <isaacmuse@gmail.com>2019-01-05 12:24:35 -0700
commit0a61c6604aea4553bef2f2b78f269ec36127e6ca (patch)
treecb7efbf3fefaecb97d65b6bb059dcda3dffd2a78
parent75bbc0579c25930bc59163c0f2f8f1fbf35f301e (diff)
downloadbeautifulsoup4-0a61c6604aea4553bef2f2b78f269ec36127e6ca.tar.gz
Fix for performance with the linkage fix.
The exact situations have been pinned down, and now solve current known issues without excessive and aggressive recursion.
-rw-r--r--bs4/__init__.py137
1 files changed, 40 insertions, 97 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index c58ff85..3e34f63 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -435,113 +435,56 @@ class BeautifulSoup(Tag):
if previous_element is None:
previous_element = o.previous_element
+ fix = parent.next_element is not None
+
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
self._most_recent_element = o
parent.contents.append(o)
# Check if we are inserting into an already parsed node.
- if parent.next_element is not None:
-
- # Check that links are proper across tag parent boundaries
- child = self._linkage_fixer(parent)
+ if fix:
+ self._linkage_fixer(parent)
- def _linkage_fixer(self, el, _recursive_call=False):
+ def _linkage_fixer(self, el):
"""Make sure linkage of this fragment is sound."""
- descendant = None
-
- # If element is document element,
- # it should have no previous element, previous sibling, or next sibling.
- if el.parent is None:
- if el.previous_element is not None:
- el.previous_element = None
- if el.previous_sibling is not None:
- el.previous_element = None
- if el.next_sibling is not None:
- el.next_sibling = None
-
- idx = 0
- child = None
- last_child = None
- last_idx = len(el.contents) - 1
- for child in el.contents:
- descendant = None
-
- # Parent should link next element to their first child
- # That child should have no previous sibling
- if idx == 0:
- if el.parent is not None:
- if el.next_element is not child:
- el.next_element = child
-
- if child.previous_element is not el:
- child.previous_element = el
-
- if child.previous_sibling is not None:
- child.previous_sibling = None
-
- # If not the first child, previous index should link as sibling to last index.
- # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
- else:
- if child.previous_sibling is not el.contents[idx - 1]:
- child.previous_sibling = el.contents[idx - 1]
- if el.contents[idx - 1].next_sibling is not child:
- el.contents[idx - 1].next_sibling = child
-
- if last_child is not None:
- if child.previous_element is not last_child:
- child.previous_element = last_child
- if last_child.next_element is not child:
- last_child.next_element = child
-
- # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
- if isinstance(child, Tag) and child.contents:
- descendant = self._linkage_fixer(child, True)
- # A bubbled up descendant should have no next siblings
- # as it is last in its content list.
- if descendant.next_sibling is not None:
- descendant.next_sibling = None
-
- # Mark last child as either the bubbled up descendant or the current child
- if descendant is not None:
- last_child = descendant
- else:
- last_child = child
-
- # If last child in list, there are no next siblings
- if idx == last_idx:
- if child.next_sibling is not None:
- child.next_sibling = None
- idx += 1
-
- # The child to return is either the last descendant (if available)
- # or the last processed child (if any). If neither is available,
- # the parent element is its own last descendant.
- child = descendant if descendant is not None else child
- if child is None:
- child = el
-
- # If not a recursive call, we are done processing this element.
+
+ first = el.contents[0]
+ child = el.contents[-1]
+ descendant = child
+
+ if child is first and el.parent is not None:
+ # Parent should be linked to first child
+ el.next_element = child
+ # We are no longer linked to whatever this element is
+ prev_el = child.previous_element
+ if prev_el is not None and prev_el is not el:
+ prev_el.next_element = None
+ # First child should be linked to the parent, and no previous siblings.
+ child.previous_element = el
+ child.previous_sibling = None
+
+ # We have no sibling as we've been appended as the last.
+ child.next_sibling = None
+
+ # This index is a tag, dig deeper for a "last descendant"
+ if isinstance(child, Tag) and child.contents:
+ descendant = child._last_descendant(False)
+
# As the final step, link last descendant. It should be linked
# to the parent's next sibling (if found), else walk up the chain
- # and find a parent with a sibling.
- if not _recursive_call and child is not None:
- child.next_element = None
- target = el
- while True:
- if target is None:
- break
- elif target.next_sibling is not None:
- child.next_element = target.next_sibling
- target.next_sibling.previous_element = child
- break
- target = target.parent
-
- # We are done, so nothing to return
- return None
- else:
- # Return the child to the recursive caller
- return child
+ # and find a parent with a sibling. It should have no next sibling.
+ descendant.next_element = None
+ descendant.next_sibling = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ descendant.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent