When markup contains duplicate elements, a select() call that
includes multiple match clauses will match all relevant elements. [bug=1770596]
author: Leonard Richardson <leonardr@segfault.org> 2018-07-28 18:23:00 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2018-07-28 18:23:00 -0400
commit: 334e562e1f19b166cc5f3be986329d97e08df6d9 (patch)
tree: eecc455c527d8a519ef47c77cb4e9579e75b2b62
parent: 37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (diff)
download: beautifulsoup4-334e562e1f19b166cc5f3be986329d97e08df6d9.tar.gz
4 files changed, 36 insertions, 6 deletions
diff --git a/NEWS.txt b/NEWS.txt
index acdcc04..3884687 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -20,6 +20,10 @@
 
 * Improved the warning given when no parser is specified. [bug=1780571]
 
+* When markup contains duplicate elements, a select() call that
+  includes multiple match clauses will match all relevant
+  elements. [bug=1770596]
+
 * Fixed code that was causing deprecation warnings in recent Python 3
   versions. Includes a patch from Ville Skyttä. [bug=1778909] [bug=1689496]
 
diff --git a/bs4/element.py b/bs4/element.py
index c431a97..8383c3f 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1409,15 +1409,29 @@ class Tag(PageElement):
         # Handle grouping selectors if ',' exists, ie: p,a
         if ',' in selector:
             context = []
-            for partial_selector in selector.split(','):
-                partial_selector = partial_selector.strip()
+            selectors = [x.strip() for x in selector.split(",")]
+
+            # If a selector is mentioned multiple times we don't want
+            # to use it more than once.
+            used_selectors = set()
+
+            # We also don't want to select the same element more than once,
+            # if it's matched by multiple selectors.
+            selected_object_ids = set()
+            for partial_selector in selectors:
                 if partial_selector == '':
                     raise ValueError('Invalid group selection syntax: %s' % selector)
+                if partial_selector in used_selectors:
+                    continue
+                used_selectors.add(partial_selector)
                 candidates = self.select(partial_selector, limit=limit)
                 for candidate in candidates:
-                    if candidate not in context:
+                    # This lets us distinguish between distinct tags that
+                    # represent the same markup.
+                    object_id = id(candidate)
+                    if object_id not in selected_object_ids:
                         context.append(candidate)
-
+                        selected_object_ids.add(object_id)
                 if limit and len(context) >= limit:
                     break
             return context
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 23cbaef..8a8f690 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -46,7 +46,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertSoupEquals(
             "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
 
-    def test_entities_in_original_document_encoding(self):
+    def test_entities_in_foreign_document_encoding(self):
         # We can't implement this case correctly because by the time we
         # hear about markup like "&#147;", it's been (incorrectly) converted into
         # a string like u'\x93'
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index e5dcfa7..68887b4 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -2074,5 +2074,17 @@ class TestSoupSelector(TreeTest):
     def test_multiple_select_nested(self):
         self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
 
-
+    def test_select_duplicate_elements(self):
+        # When markup contains duplicate elements, a multiple select
+        # will find all of them.
+        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
+        soup = BeautifulSoup(markup, 'html.parser')
+        selected = soup.select(".c1, .c2")
+        self.assertEquals(3, len(selected))
+
+        # Verify that find_all finds the same elements, though because
+        # of an implementation detail it finds them in a different
+        # order.
+        for element in soup.find_all(class_=['c1', 'c2']):
+            assert element in selected
author	Leonard Richardson <leonardr@segfault.org>	2018-07-28 18:23:00 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2018-07-28 18:23:00 -0400
commit	334e562e1f19b166cc5f3be986329d97e08df6d9 (patch)
tree	eecc455c527d8a519ef47c77cb4e9579e75b2b62
parent	37e4159cb49d2f7c8fdafa0268adca5a1e2017e4 (diff)
download	beautifulsoup4-334e562e1f19b166cc5f3be986329d97e08df6d9.tar.gz