diff options
author | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 18:23:00 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2018-07-28 18:23:00 -0400 |
commit | 334e562e1f19b166cc5f3be986329d97e08df6d9 (patch) | |
tree | eecc455c527d8a519ef47c77cb4e9579e75b2b62 | |
parent | 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (diff) | |
download | beautifulsoup4-334e562e1f19b166cc5f3be986329d97e08df6d9.tar.gz |
When markup contains duplicate elements, a select() call that
includes multiple match clauses will match all relevant
elements. [bug=1770596]
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/element.py | 22 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 2 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 14 |
4 files changed, 36 insertions, 6 deletions
@@ -20,6 +20,10 @@ * Improved the warning given when no parser is specified. [bug=1780571] +* When markup contains duplicate elements, a select() call that + includes multiple match clauses will match all relevant + elements. [bug=1770596] + * Fixed code that was causing deprecation warnings in recent Python 3 versions. Includes a patch from Ville Skyttä. [bug=1778909] [bug=1689496] diff --git a/bs4/element.py b/bs4/element.py index c431a97..8383c3f 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1409,15 +1409,29 @@ class Tag(PageElement): # Handle grouping selectors if ',' exists, ie: p,a if ',' in selector: context = [] - for partial_selector in selector.split(','): - partial_selector = partial_selector.strip() + selectors = [x.strip() for x in selector.split(",")] + + # If a selector is mentioned multiple times we don't want + # to use it more than once. + used_selectors = set() + + # We also don't want to select the same element more than once, + # if it's matched by multiple selectors. + selected_object_ids = set() + for partial_selector in selectors: if partial_selector == '': raise ValueError('Invalid group selection syntax: %s' % selector) + if partial_selector in used_selectors: + continue + used_selectors.add(partial_selector) candidates = self.select(partial_selector, limit=limit) for candidate in candidates: - if candidate not in context: + # This lets us distinguish between distinct tags that + # represent the same markup. + object_id = id(candidate) + if object_id not in selected_object_ids: context.append(candidate) - + selected_object_ids.add(object_id) if limit and len(context) >= limit: break return context diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 23cbaef..8a8f690 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -46,7 +46,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertSoupEquals( "<p>foo�bar</p>", "<p>foobar</p>") - def test_entities_in_original_document_encoding(self): + def test_entities_in_foreign_document_encoding(self): # We can't implement this case correctly because by the time we # hear about markup like "“", it's been (incorrectly) converted into # a string like u'\x93' diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index e5dcfa7..68887b4 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -2074,5 +2074,17 @@ class TestSoupSelector(TreeTest): def test_multiple_select_nested(self): self.assertSelects('body > div > x, y > z', ['xid', 'zidb']) - + def test_select_duplicate_elements(self): + # When markup contains duplicate elements, a multiple select + # will find all of them. + markup = '<div class="c1"/><div class="c2"/><div class="c1"/>' + soup = BeautifulSoup(markup, 'html.parser') + selected = soup.select(".c1, .c2") + self.assertEquals(3, len(selected)) + + # Verify that find_all finds the same elements, though because + # of an implementation detail it finds them in a different + # order. + for element in soup.find_all(class_=['c1', 'c2']): + assert element in selected |