diff options
| author | scoder <stefan_ml@behnel.de> | 2013-01-10 14:06:37 -0800 |
|---|---|---|
| committer | scoder <stefan_ml@behnel.de> | 2013-01-10 14:06:37 -0800 |
| commit | 4b70bc172f3899b6804049b17d850774bdd0662a (patch) | |
| tree | a6dd4321e18c34b7ef60949278f2615c95c510cc /src | |
| parent | e01a81740c77ddd26f98b316cec4610b2914eb64 (diff) | |
| parent | 559b6dd3dad6155273515f093b4cd9e9f45a1716 (diff) | |
| download | python-lxml-4b70bc172f3899b6804049b17d850774bdd0662a.tar.gz | |
Merge pull request #89 from brightinteractive/specify_safe_attrs
allow the set of attributes considered safe by Cleaner to be overridden
Diffstat (limited to 'src')
| -rw-r--r-- | src/lxml/html/clean.py | 10 | ||||
| -rw-r--r-- | src/lxml/html/tests/test_clean.py | 26 |
2 files changed, 34 insertions, 2 deletions
```diff
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index faf32898..4a6912f0 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -147,6 +147,10 @@ class Cleaner(object):
         If true, only include 'safe' attributes (specifically the list
         from the feedparser HTML sanitisation web site).
 
+    ``safe_attrs``:
+        A set of attribute names to override the default list of attributes
+        considered 'safe' (when safe_attrs_only=True).
+
     ``add_nofollow``:
         If true, then any <a> tags will have ``rel="nofollow"`` added to them.
 
@@ -189,6 +193,7 @@ class Cleaner(object):
     kill_tags = None
     remove_unknown_tags = True
     safe_attrs_only = True
+    safe_attrs = defs.safe_attrs
     add_nofollow = False
     host_whitelist = ()
     whitelist_tags = set(['iframe', 'embed'])
@@ -247,14 +252,15 @@ class Cleaner(object):
         if self.scripts:
             kill_tags.add('script')
         if self.safe_attrs_only:
-            safe_attrs = set(defs.safe_attrs)
+            safe_attrs = set(self.safe_attrs)
             for el in doc.iter():
                 attrib = el.attrib
                 for aname in attrib.keys():
                     if aname not in safe_attrs:
                         del attrib[aname]
         if self.javascript:
-            if not self.safe_attrs_only:
+            if not (self.safe_attrs_only and
+                    self.safe_attrs == defs.safe_attrs):
                 # safe_attrs handles events attributes itself
                 for el in doc.iter():
                     attrib = el.attrib
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
index 0daa0d27..f799be92 100644
--- a/src/lxml/html/tests/test_clean.py
+++ b/src/lxml/html/tests/test_clean.py
@@ -34,6 +34,32 @@ class CleanerTest(unittest.TestCase):
 
         self.assertEqual(12-5+1, len(list(result.iter())))
 
+    def test_safe_attrs_included(self):
+        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
+
+        safe_attrs=set(lxml.html.defs.safe_attrs)
+        safe_attrs.add('style')
+
+        cleaner = Cleaner(
+            safe_attrs_only=True,
+            safe_attrs=safe_attrs)
+        result = cleaner.clean_html(html)
+
+        self.assertEqual(html, result)
+
+    def test_safe_attrs_excluded(self):
+        html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
+        expected = """<p><span>Cyan</span></p>"""
+
+        safe_attrs=set()
+
+        cleaner = Cleaner(
+            safe_attrs_only=True,
+            safe_attrs=safe_attrs)
+        result = cleaner.clean_html(html)
+
+        self.assertEqual(expected, result)
+
 def test_suite():
     suite = unittest.TestSuite()
     if sys.version_info >= (2,4):
```
