Improved content encoding detection.

get_encodings_from_content() can now detect HTML in-document content
encoding declarations in the following formats:

- HTML5
- HTML4
- XHTML 1.x served with text/html MIME type
- XHTML 1.x served as XML

Ref: http://www.w3.org/International/questions/qa-html-encoding-declarations
author: Jon Parise <jon@indelible.org> 2013-09-03 16:16:46 -0700
committer: Jon Parise <jon@indelible.org> 2013-09-03 16:16:46 -0700
commit: b9b5be7c4c78e02f83464d8e332a1bf819853b60 (patch)
tree: f52f6e6f409bacd22e0636c978fe46f38029e4a5 /requests
parent: d991fb032a7b92e6efe97a32f24b6e6d9ccfe84c (diff)
download: python-requests-b9b5be7c4c78e02f83464d8e332a1bf819853b60.tar.gz
1 file changed, 5 insertions, 1 deletion
diff --git a/requests/utils.py b/requests/utils.py
index 37aa19e4..ac5f59d8 100644
--- a/requests/utils.py
+++ b/requests/utils.py
@@ -265,8 +265,12 @@ def get_encodings_from_content(content):
     """
 
     charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
 
-    return charset_re.findall(content)
+    return (charset_re.findall(content) +
+            pragma_re.findall(content) +
+            xml_re.findall(content))
 
 
 def get_encoding_from_headers(headers):
author	Jon Parise <jon@indelible.org>	2013-09-03 16:16:46 -0700
committer	Jon Parise <jon@indelible.org>	2013-09-03 16:16:46 -0700
commit	b9b5be7c4c78e02f83464d8e332a1bf819853b60 (patch)
tree	f52f6e6f409bacd22e0636c978fe46f38029e4a5 /requests
parent	d991fb032a7b92e6efe97a32f24b6e6d9ccfe84c (diff)
download	python-requests-b9b5be7c4c78e02f83464d8e332a1bf819853b60.tar.gz