diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2013-04-28 15:40:09 +0200 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2013-04-28 15:40:09 +0200 |
commit | 889172388ecb621f5739578ade35faa1879d66df (patch) | |
tree | b1a09cb66c57f0828ac60c5f5e16078448f89b02 | |
parent | edd23c372e853a9a11f8a07cda11525f518dcbb1 (diff) | |
download | python-lxml-889172388ecb621f5739578ade35faa1879d66df.tar.gz |
avoid writing duplicate 'nofollow' argument into HTML 'rel' attributes while cleaning
-rw-r--r-- | CHANGES.txt | 5 | ||||
-rw-r--r-- | src/lxml/html/clean.py | 11 | ||||
-rw-r--r-- | src/lxml/html/tests/test_clean.txt | 10 |
3 files changed, 17 insertions, 9 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index d39614c7..d51486e8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -14,8 +14,11 @@ Bugs fixed
 * LP#673205: Parsing from in-memory strings disabled network access in the
   default parser and made subsequent attempts to parse from a URL fail.
 
+* LP#971754: lxml.html.clean appends 'nofollow' to 'rel' attributes instead
+  of overwriting the current value.
+
 * LP#715687: lxml.html.clean no longer discards scripts that are explicitly
-  allowed by the user provided whitelist.
+  allowed by the user provided whitelist. Patch by Christine Koppelt.
 
 Other changes
 -------------
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index e94eec26..11253817 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -399,9 +399,14 @@ class Cleaner(object):
         if self.add_nofollow:
             for el in _find_external_links(doc):
                 if not self.allow_follow(el):
-                    rel = 'nofollow'
-                    if el.get('rel'):
-                        rel = el.get('rel') + ' ' + rel
+                    rel = el.get('rel')
+                    if rel:
+                        if ('nofollow' in rel
+                                and ' nofollow ' in (' %s ' % rel)):
+                            continue
+                        rel = '%s nofollow' % rel
+                    else:
+                        rel = 'nofollow'
                     el.set('rel', rel)
 
     def allow_follow(self, anchor):
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
index 21bd12c6..fb99c0d9 100644
--- a/src/lxml/html/tests/test_clean.txt
+++ b/src/lxml/html/tests/test_clean.txt
@@ -28,7 +28,7 @@
 ... </form>
 ... <a href="evil-site">spam spam SPAM!</a>
 ... <a href="http://example.com" rel="author">Author</a>
-... <a href="http://example.com">Text</a>
+... <a href="http://example.com" rel="nofollow">Text</a>
 ... <img src="evil!">
 ... </body>
 ... </html>'''
@@ -60,7 +60,7 @@
 </form>
 <a href="evil-site">spam spam SPAM!</a>
 <a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
 <img src="evil!">
 </body>
 </html>
@@ -92,7 +92,7 @@
 </form>
 <a href="evil-site">spam spam SPAM!</a>
 <a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
 <img src="evil!">
 </body>
 </html>
@@ -112,7 +112,7 @@
 Password:
 <a href="evil-site">spam spam SPAM!</a>
 <a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
 <img src="evil!">
 </body>
 </html>
@@ -154,7 +154,7 @@
 Password:
 <a href="evil-site">spam spam SPAM!</a>
 <a href="http://example.com" rel="author">Author</a>
- <a href="http://example.com">Text</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
 <img src="evil!">
 </body>
 </html> |