diff options
author | scoder <stefan_ml@behnel.de> | 2013-04-27 05:17:06 -0700 |
---|---|---|
committer | scoder <stefan_ml@behnel.de> | 2013-04-27 05:17:06 -0700 |
commit | a790a62b4e07a6c17bad3e7dc9784050123b4a1f (patch) | |
tree | 19db55dcf653deb7e7e00878739bfd1229eb0397 | |
parent | 49268d013d37ad77b87660c11d0cd8120b592681 (diff) | |
parent | 2b0bdf759009abd954d4dfddb5f82ad1eecb085d (diff) | |
download | python-lxml-a790a62b4e07a6c17bad3e7dc9784050123b4a1f.tar.gz |
Merge pull request #115 from cko/Bug715687
fix for Bug #715687
-rw-r--r-- | src/lxml/html/clean.py | 3 | ||||
-rw-r--r-- | src/lxml/html/tests/test_clean.txt | 30 |
2 files changed, 30 insertions, 3 deletions
```diff
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index 4a6912f0..2fa7a2de 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -312,7 +312,8 @@ class Cleaner(object):
             for el in list(doc.iter('link')):
                 if 'stylesheet' in el.get('rel', '').lower():
                     # Note this kills alternate stylesheets as well
-                    el.drop_tree()
+                    if not self.allow_element(el):
+                        el.drop_tree()
         if self.meta:
             kill_tags.add('meta')
         if self.page_structure:
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
index a8e2959b..d87a6619 100644
--- a/src/lxml/html/tests/test_clean.txt
+++ b/src/lxml/html/tests/test_clean.txt
@@ -6,6 +6,8 @@
 ...   <head>
 ...     <script type="text/javascript" src="evil-site"></script>
 ...     <link rel="alternate" type="text/rss" src="evil-rss">
+...     <link rel="alternate" type="text/rss" href="http://example.com">
+...     <link rel="stylesheet" type="text/rss" href="http://example.com">
 ...     <style>
 ...       body {background-image: url(javascript:do_evil)};
 ...       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
@@ -34,6 +36,8 @@
   <head>
     <script type="text/javascript" src="evil-site"></script>
     <link rel="alternate" type="text/rss" src="evil-rss">
+    <link rel="alternate" type="text/rss" href="http://example.com">
+    <link rel="stylesheet" type="text/rss" href="http://example.com">
     <style>
       body {background-image: url(javascript:do_evil)};
       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
@@ -62,6 +66,8 @@
   <head>
     <script type="text/javascript" src="evil-site"></script>
     <link rel="alternate" type="text/rss" src="evil-rss">
+    <link rel="alternate" type="text/rss" href="http://example.com">
+    <link rel="stylesheet" type="text/rss" href="http://example.com">
     <style>
       body {background-image: url(javascript:do_evil)};
       div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
@@ -103,8 +109,7 @@
   </body>
 </html>
 
->>> print(Cleaner(style=True, links=True, add_nofollow=True,
-...               page_structure=False, safe_attrs_only=False).clean_html(doc))
+>>> print(Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
 <html>
   <head>
   </head>
@@ -120,3 +125,24 @@
     <img src="evil!">
   </body>
 </html>
+
+>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
+<html>
+  <head>
+    <link rel="alternate" type="text/rss" src="evil-rss">
+    <link rel="alternate" type="text/rss" href="http://example.com">
+    <link rel="stylesheet" type="text/rss" href="http://example.com">
+    <style>/* deleted */</style>
+  </head>
+  <body>
+    <a href="">a link</a>
+    <a href="">data</a>
+    <a href="#">another link</a>
+    <p>a paragraph</p>
+    <div>secret EVIL!</div>
+    of EVIL!
+    Password:
+    <a href="evil-site">spam spam SPAM!</a>
+    <img src="evil!">
+  </body>
+</html>
```