summary | refs | log | tree | commit | diff
path: root/requests
diff options
context:
space:
mode:
author: Jon Parise <jon@indelible.org> 2013-09-03 16:16:46 -0700
committer: Jon Parise <jon@indelible.org> 2013-09-03 16:16:46 -0700
commit: b9b5be7c4c78e02f83464d8e332a1bf819853b60 (patch)
tree: f52f6e6f409bacd22e0636c978fe46f38029e4a5 /requests
parent: d991fb032a7b92e6efe97a32f24b6e6d9ccfe84c (diff)
download: python-requests-b9b5be7c4c78e02f83464d8e332a1bf819853b60.tar.gz
Improved content encoding detection.
get_encodings_from_content() can now detect HTML in-document content encoding declarations in the following formats:

- HTML5
- HTML4
- XHTML 1.x served with text/html MIME type
- XHTML 1.x served as XML

Ref: http://www.w3.org/International/questions/qa-html-encoding-declarations
Diffstat (limited to 'requests')
-rw-r--r-- requests/utils.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/requests/utils.py b/requests/utils.py
index 37aa19e4..ac5f59d8 100644
--- a/requests/utils.py
+++ b/requests/utils.py
@@ -265,8 +265,12 @@ def get_encodings_from_content(content):
"""
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+ pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+ xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
- return charset_re.findall(content)
+ return (charset_re.findall(content) +
+ pragma_re.findall(content) +
+ xml_re.findall(content))
def get_encoding_from_headers(headers):