summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Christine Koppelt <ch.ko123@googlemail.com> 2013-04-27 12:22:16 +0200
committer: Christine Koppelt <ch.ko123@googlemail.com> 2013-04-27 12:44:14 +0200
commit: 2b0bdf759009abd954d4dfddb5f82ad1eecb085d (patch)
tree: 740f50fc2192a062f9053585cbb9f714f18bf253
parent: 524b97999c8e6cfdc80c32fc4e84c3cb685dbadc (diff)
download: python-lxml-2b0bdf759009abd954d4dfddb5f82ad1eecb085d.tar.gz
fix for Bug #715687 (Consider host_whitelist and whitelist_tags before deleting element)
-rw-r--r-- src/lxml/html/clean.py | 3
-rw-r--r-- src/lxml/html/tests/test_clean.txt | 30
2 files changed, 30 insertions, 3 deletions
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index 4a6912f0..2fa7a2de 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -312,7 +312,8 @@ class Cleaner(object):
for el in list(doc.iter('link')):
if 'stylesheet' in el.get('rel', '').lower():
# Note this kills alternate stylesheets as well
- el.drop_tree()
+ if not self.allow_element(el):
+ el.drop_tree()
if self.meta:
kill_tags.add('meta')
if self.page_structure:
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
index a8e2959b..d87a6619 100644
--- a/src/lxml/html/tests/test_clean.txt
+++ b/src/lxml/html/tests/test_clean.txt
@@ -6,6 +6,8 @@
... <head>
... <script type="text/javascript" src="evil-site"></script>
... <link rel="alternate" type="text/rss" src="evil-rss">
+... <link rel="alternate" type="text/rss" href="http://example.com">
+... <link rel="stylesheet" type="text/rss" href="http://example.com">
... <style>
... body {background-image: url(javascript:do_evil)};
... div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
@@ -34,6 +36,8 @@
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
+ <link rel="alternate" type="text/rss" href="http://example.com">
+ <link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
@@ -62,6 +66,8 @@
<head>
<script type="text/javascript" src="evil-site"></script>
<link rel="alternate" type="text/rss" src="evil-rss">
+ <link rel="alternate" type="text/rss" href="http://example.com">
+ <link rel="stylesheet" type="text/rss" href="http://example.com">
<style>
body {background-image: url(javascript:do_evil)};
div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
@@ -103,8 +109,7 @@
</body>
</html>
->>> print(Cleaner(style=True, links=True, add_nofollow=True,
-... page_structure=False, safe_attrs_only=False).clean_html(doc))
+>>> print(Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
<html>
<head>
</head>
@@ -120,3 +125,24 @@
<img src="evil!">
</body>
</html>
+
+>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
+<html>
+ <head>
+ <link rel="alternate" type="text/rss" src="evil-rss">
+ <link rel="alternate" type="text/rss" href="http://example.com">
+ <link rel="stylesheet" type="text/rss" href="http://example.com">
+ <style>/* deleted */</style>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="">data</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ of EVIL!
+ Password:
+ <a href="evil-site">spam spam SPAM!</a>
+ <img src="evil!">
+ </body>
+</html>