summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Stefan Behnel <stefan_ml@behnel.de> 2013-04-28 15:40:09 +0200
committer Stefan Behnel <stefan_ml@behnel.de> 2013-04-28 15:40:09 +0200
commit 889172388ecb621f5739578ade35faa1879d66df (patch)
tree b1a09cb66c57f0828ac60c5f5e16078448f89b02
parent edd23c372e853a9a11f8a07cda11525f518dcbb1 (diff)
download python-lxml-889172388ecb621f5739578ade35faa1879d66df.tar.gz
avoid writing duplicate 'nofollow' argument into HTML 'rel' attributes while cleaning
-rw-r--r-- CHANGES.txt 5
-rw-r--r-- src/lxml/html/clean.py 11
-rw-r--r-- src/lxml/html/tests/test_clean.txt 10
3 files changed, 17 insertions, 9 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index d39614c7..d51486e8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -14,8 +14,11 @@ Bugs fixed
* LP#673205: Parsing from in-memory strings disabled network access in the
default parser and made subsequent attempts to parse from a URL fail.
+* LP#971754: lxml.html.clean appends 'nofollow' to 'rel' attributes instead
+ of overwriting the current value.
+
* LP#715687: lxml.html.clean no longer discards scripts that are explicitly
- allowed by the user provided whitelist.
+ allowed by the user provided whitelist. Patch by Christine Koppelt.
Other changes
-------------
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index e94eec26..11253817 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -399,9 +399,14 @@ class Cleaner(object):
if self.add_nofollow:
for el in _find_external_links(doc):
if not self.allow_follow(el):
- rel = 'nofollow'
- if el.get('rel'):
- rel = el.get('rel') + ' ' + rel
+ rel = el.get('rel')
+ if rel:
+ if ('nofollow' in rel
+ and ' nofollow ' in (' %s ' % rel)):
+ continue
+ rel = '%s nofollow' % rel
+ else:
+ rel = 'nofollow'
el.set('rel', rel)
def allow_follow(self, anchor):
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
index 21bd12c6..fb99c0d9 100644
--- a/src/lxml/html/tests/test_clean.txt
+++ b/src/lxml/html/tests/test_clean.txt
@@ -28,7 +28,7 @@
... </form>
... <a href="evil-site">spam spam SPAM!</a>
... <a href="http://example.com" rel="author">Author</a>
-... <a href="http://example.com">Text</a>
+... <a href="http://example.com" rel="nofollow">Text</a>
... <img src="evil!">
... </body>
... </html>'''
@@ -60,7 +60,7 @@
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
@@ -92,7 +92,7 @@
</form>
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
@@ -112,7 +112,7 @@
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>
@@ -154,7 +154,7 @@
Password:
<a href="evil-site">spam spam SPAM!</a>
<a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
<img src="evil!">
</body>
</html>