import re

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint  # Python 2

try:
    _to_chr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _to_chr = chr  # Python 3: chr() already handles every code point

# Group 1 captures the reference body between "&" and ";": a name ("amp"),
# a decimal code ("#123") or a hexadecimal code ("#x7B" / "#X7B").
entity_pattern = re.compile(r"&(\w+|#\d+|#[xX][a-fA-F0-9]+);")


def _decode_entity(match):
    """Return the replacement text for one ``entity_pattern`` match.

    A reference that cannot be decoded (unknown entity name, or a numeric
    code that is not a valid code point) is returned exactly as it appeared
    in the input.  Returning ``None`` here would make ``re.sub`` substitute
    an empty string and silently delete text from the document.
    """
    ref = match.group(1)
    if ref.startswith("#"):
        # The pattern guarantees at least one character after "#".
        if ref[1] in "xX":
            codepoint = int(ref[2:], 16)
        else:
            codepoint = int(ref[1:])
    else:
        codepoint = name2codepoint.get(ref)
        if codepoint is None:
            # Not an HTML entity name: leave "&name;" untouched.
            return match.group(0)
    try:
        return _to_chr(codepoint)
    except (ValueError, OverflowError):
        # Code point outside the range the interpreter can represent.
        return match.group(0)


def html_unquote(v):
    """
    Unquote entities in HTML.

    Named entities known to ``name2codepoint`` (``&lt;``, ``&euro;``,
    ``&sum;``, ...) as well as decimal (``&#123;``) and hexadecimal
    (``&#x7B;``) character references are replaced by the character they
    denote.  For example ``&lt;&gt;&amp;&quot;&#123;`` becomes ``<>&"{``.

    Anything that looks like a reference but cannot be decoded, such as
    ``&bogus;`` or ``&#x110000;``, is left in the text unchanged.

    :param v: text that may contain HTML entities.
    :returns: ``v`` with every decodable entity replaced.
    """
    return entity_pattern.sub(_decode_entity, v)