diff options
| author | Jon Parise <jon@indelible.org> | 2013-09-03 16:16:46 -0700 |
|---|---|---|
| committer | Jon Parise <jon@indelible.org> | 2013-09-03 16:16:46 -0700 |
| commit | b9b5be7c4c78e02f83464d8e332a1bf819853b60 (patch) | |
| tree | f52f6e6f409bacd22e0636c978fe46f38029e4a5 /requests | |
| parent | d991fb032a7b92e6efe97a32f24b6e6d9ccfe84c (diff) | |
| download | python-requests-b9b5be7c4c78e02f83464d8e332a1bf819853b60.tar.gz | |
Improved content encoding detection.
get_encodings_from_content() can now detect HTML in-document content
encoding declarations in the following formats:
- HTML5
- HTML4
- XHTML 1.x served with text/html MIME type
- XHTML 1.x served as XML
Ref: http://www.w3.org/International/questions/qa-html-encoding-declarations
Diffstat (limited to 'requests')
| -rw-r--r-- | requests/utils.py | 6 |
1 file changed, 5 insertions, 1 deletion
diff --git a/requests/utils.py b/requests/utils.py
index 37aa19e4..ac5f59d8 100644
--- a/requests/utils.py
+++ b/requests/utils.py
@@ -265,8 +265,12 @@ def get_encodings_from_content(content):
     """

     charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

-    return charset_re.findall(content)
+    return (charset_re.findall(content) +
+            pragma_re.findall(content) +
+            xml_re.findall(content))


 def get_encoding_from_headers(headers):
