summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: scoder <stefan_ml@behnel.de> 2013-01-10 14:06:37 -0800
committer: scoder <stefan_ml@behnel.de> 2013-01-10 14:06:37 -0800
commit: 4b70bc172f3899b6804049b17d850774bdd0662a (patch)
tree: a6dd4321e18c34b7ef60949278f2615c95c510cc /src
parent: e01a81740c77ddd26f98b316cec4610b2914eb64 (diff)
parent: 559b6dd3dad6155273515f093b4cd9e9f45a1716 (diff)
download: python-lxml-4b70bc172f3899b6804049b17d850774bdd0662a.tar.gz
Merge pull request #89 from brightinteractive/specify_safe_attrs
allow the set of attributes considered safe by Cleaner to be overridden
Diffstat (limited to 'src')
-rw-r--r-- src/lxml/html/clean.py 10
-rw-r--r-- src/lxml/html/tests/test_clean.py 26
2 files changed, 34 insertions, 2 deletions
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index faf32898..4a6912f0 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -147,6 +147,10 @@ class Cleaner(object):
If true, only include 'safe' attributes (specifically the list
from the feedparser HTML sanitisation web site).
+ ``safe_attrs``:
+ A set of attribute names to override the default list of attributes
+ considered 'safe' (when safe_attrs_only=True).
+
``add_nofollow``:
If true, then any <a> tags will have ``rel="nofollow"`` added to them.
@@ -189,6 +193,7 @@ class Cleaner(object):
kill_tags = None
remove_unknown_tags = True
safe_attrs_only = True
+ safe_attrs = defs.safe_attrs
add_nofollow = False
host_whitelist = ()
whitelist_tags = set(['iframe', 'embed'])
@@ -247,14 +252,15 @@ class Cleaner(object):
if self.scripts:
kill_tags.add('script')
if self.safe_attrs_only:
- safe_attrs = set(defs.safe_attrs)
+ safe_attrs = set(self.safe_attrs)
for el in doc.iter():
attrib = el.attrib
for aname in attrib.keys():
if aname not in safe_attrs:
del attrib[aname]
if self.javascript:
- if not self.safe_attrs_only:
+ if not (self.safe_attrs_only and
+ self.safe_attrs == defs.safe_attrs):
# safe_attrs handles events attributes itself
for el in doc.iter():
attrib = el.attrib
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
index 0daa0d27..f799be92 100644
--- a/src/lxml/html/tests/test_clean.py
+++ b/src/lxml/html/tests/test_clean.py
@@ -34,6 +34,32 @@ class CleanerTest(unittest.TestCase):
self.assertEqual(12-5+1, len(list(result.iter())))
+ def test_safe_attrs_included(self):
+ html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
+
+ safe_attrs=set(lxml.html.defs.safe_attrs)
+ safe_attrs.add('style')
+
+ cleaner = Cleaner(
+ safe_attrs_only=True,
+ safe_attrs=safe_attrs)
+ result = cleaner.clean_html(html)
+
+ self.assertEqual(html, result)
+
+ def test_safe_attrs_excluded(self):
+ html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
+ expected = """<p><span>Cyan</span></p>"""
+
+ safe_attrs=set()
+
+ cleaner = Cleaner(
+ safe_attrs_only=True,
+ safe_attrs=safe_attrs)
+ result = cleaner.clean_html(html)
+
+ self.assertEqual(expected, result)
+
def test_suite():
suite = unittest.TestSuite()
if sys.version_info >= (2,4):