summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLeonard Richardson <leonardr@segfault.org>2018-07-28 18:23:00 -0400
committerLeonard Richardson <leonardr@segfault.org>2018-07-28 18:23:00 -0400
commit334e562e1f19b166cc5f3be986329d97e08df6d9 (patch)
treeeecc455c527d8a519ef47c77cb4e9579e75b2b62
parent37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (diff)
downloadbeautifulsoup4-334e562e1f19b166cc5f3be986329d97e08df6d9.tar.gz
When markup contains duplicate elements, a select() call that
includes multiple match clauses will match all relevant elements. [bug=1770596]
-rw-r--r--NEWS.txt4
-rw-r--r--bs4/element.py22
-rw-r--r--bs4/tests/test_lxml.py2
-rw-r--r--bs4/tests/test_tree.py14
4 files changed, 36 insertions, 6 deletions
diff --git a/NEWS.txt b/NEWS.txt
index acdcc04..3884687 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -20,6 +20,10 @@
* Improved the warning given when no parser is specified. [bug=1780571]
+* When markup contains duplicate elements, a select() call that
+ includes multiple match clauses will match all relevant
+ elements. [bug=1770596]
+
* Fixed code that was causing deprecation warnings in recent Python 3
versions. Includes a patch from Ville Skyttä. [bug=1778909] [bug=1689496]
diff --git a/bs4/element.py b/bs4/element.py
index c431a97..8383c3f 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1409,15 +1409,29 @@ class Tag(PageElement):
# Handle grouping selectors if ',' exists, ie: p,a
if ',' in selector:
context = []
- for partial_selector in selector.split(','):
- partial_selector = partial_selector.strip()
+ selectors = [x.strip() for x in selector.split(",")]
+
+ # If a selector is mentioned multiple times we don't want
+ # to use it more than once.
+ used_selectors = set()
+
+ # We also don't want to select the same element more than once,
+ # if it's matched by multiple selectors.
+ selected_object_ids = set()
+ for partial_selector in selectors:
if partial_selector == '':
raise ValueError('Invalid group selection syntax: %s' % selector)
+ if partial_selector in used_selectors:
+ continue
+ used_selectors.add(partial_selector)
candidates = self.select(partial_selector, limit=limit)
for candidate in candidates:
- if candidate not in context:
+ # This lets us distinguish between distinct tags that
+ # represent the same markup.
+ object_id = id(candidate)
+ if object_id not in selected_object_ids:
context.append(candidate)
-
+ selected_object_ids.add(object_id)
if limit and len(context) >= limit:
break
return context
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 23cbaef..8a8f690 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -46,7 +46,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
- def test_entities_in_original_document_encoding(self):
+ def test_entities_in_foreign_document_encoding(self):
# We can't implement this case correctly because by the time we
# hear about markup like "&#147;", it's been (incorrectly) converted into
# a string like u'\x93'
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index e5dcfa7..68887b4 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -2074,5 +2074,17 @@ class TestSoupSelector(TreeTest):
def test_multiple_select_nested(self):
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
-
+ def test_select_duplicate_elements(self):
+ # When markup contains duplicate elements, a multiple select
+ # will find all of them.
+ markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
+ soup = BeautifulSoup(markup, 'html.parser')
+ selected = soup.select(".c1, .c2")
+ self.assertEquals(3, len(selected))
+
+ # Verify that find_all finds the same elements, though because
+ # of an implementation detail it finds them in a different
+ # order.
+ for element in soup.find_all(class_=['c1', 'c2']):
+ assert element in selected