summary | refs | log | tree | commit | diff
path: root/Lib
diff options
context:
space:
mode:
author: Eli Bendersky <eliben@gmail.com>, 2013-05-25 05:27:10 -0700
committer: Eli Bendersky <eliben@gmail.com>, 2013-05-25 05:27:10 -0700
commit: 7b3022f24f4bb8c64d02e103de3168e96729bdc2 (patch)
tree: 1d027a6e109d195fe7c6a1b3662ea4618c6fcb28 /Lib
parent: 19fef69b752d5a1e836ba5b552a8e68592503852 (diff)
parent: 6dc32b34ddfba0ddb990cbbb77cf8803879d20f9 (diff)
download: cpython-git-7b3022f24f4bb8c64d02e103de3168e96729bdc2.tar.gz
Issue #13612: handle unknown encodings without a buffer overflow.
This affects pyexpat and _elementtree. PyExpat_CAPI now exposes a new function - DefaultUnknownEncodingHandler. Based on a patch by Serhiy Storchaka.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/test/test_xml_etree.py | 92
1 files changed, 92 insertions, 0 deletions
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 2ea0058dfd..2ec3322cae 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -681,6 +681,98 @@ class ElementTreeTest(unittest.TestCase):
check("cp437", '\u221a')
check("mac-roman", '\u02da')
+ def xml(encoding):
+ return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
+ def bxml(encoding):
+ return xml(encoding).encode(encoding)
+ supported_encodings = [
+ 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+ 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+ 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+ 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+ 'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+ 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+ 'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
+ 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+ 'cp1257', 'cp1258',
+ 'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+ 'mac-roman', 'mac-turkish',
+ 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+ 'iso2022-jp-3', 'iso2022-jp-ext',
+ 'koi8-r', 'koi8-u',
+ 'hz', 'ptcp154',
+ ]
+ for encoding in supported_encodings:
+ self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+
+ unsupported_ascii_compatible_encodings = [
+ 'big5', 'big5hkscs',
+ 'cp932', 'cp949', 'cp950',
+ 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+ 'gb2312', 'gbk', 'gb18030',
+ 'iso2022-kr', 'johab',
+ 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+ 'utf-7',
+ ]
+ for encoding in unsupported_ascii_compatible_encodings:
+ self.assertRaises(ValueError, ET.XML, bxml(encoding))
+
+ unsupported_ascii_incompatible_encodings = [
+ 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+ 'utf_32', 'utf_32_be', 'utf_32_le',
+ ]
+ for encoding in unsupported_ascii_incompatible_encodings:
+ self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+
+ self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
+ self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
+ def xml(encoding):
+ return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
+ def bxml(encoding):
+ return xml(encoding).encode(encoding)
+ supported_encodings = [
+ 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+ 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+ 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+ 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+ 'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+ 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+ 'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
+ 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+ 'cp1257', 'cp1258',
+ 'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+ 'mac-roman', 'mac-turkish',
+ 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+ 'iso2022-jp-3', 'iso2022-jp-ext',
+ 'koi8-r', 'koi8-u',
+ 'hz', 'ptcp154',
+ ]
+ for encoding in supported_encodings:
+ self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+
+ unsupported_ascii_compatible_encodings = [
+ 'big5', 'big5hkscs',
+ 'cp932', 'cp949', 'cp950',
+ 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+ 'gb2312', 'gbk', 'gb18030',
+ 'iso2022-kr', 'johab',
+ 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+ 'utf-7',
+ ]
+ for encoding in unsupported_ascii_compatible_encodings:
+ self.assertRaises(ValueError, ET.XML, bxml(encoding))
+
+ unsupported_ascii_incompatible_encodings = [
+ 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+ 'utf_32', 'utf_32_be', 'utf_32_le',
+ ]
+ for encoding in unsupported_ascii_incompatible_encodings:
+ self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+
+ self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
+ self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
def test_methods(self):
# Test serialization methods.