SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support

decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
author: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 +0000
committer: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 +0000
commit: 69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree: 088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib/test/test_codecs.py
parent: a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download: cpython-git-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz
1 files changed, 117 insertions, 1 deletions
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index c428c615fd..524c247d8c 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3,7 +3,45 @@ import unittest
 import codecs
 import StringIO
 
-class UTF16Test(unittest.TestCase):
+class Queue(object):
+    """
+    queue: write bytes at one end, read bytes from the other end
+    """
+    def __init__(self):
+        self._buffer = ""
+
+    def write(self, chars):
+        self._buffer += chars
+
+    def read(self, size=-1):
+        if size<0:
+            s = self._buffer
+            self._buffer = ""
+            return s
+        else:
+            s = self._buffer[:size]
+            self._buffer = self._buffer[size:]
+            return s
+
+class PartialReadTest(unittest.TestCase):
+    def check_partial(self, encoding, input, partialresults):
+        # get a StreamReader for the encoding and feed the bytestring version
+        # of input to the reader byte by byte. Read every available from
+        # the StreamReader and check that the results equal the appropriate
+        # entries from partialresults.
+        q = Queue()
+        r = codecs.getreader(encoding)(q)
+        result = u""
+        for (c, partialresult) in zip(input.encode(encoding), partialresults):
+            q.write(c)
+            result += r.read()
+            self.assertEqual(result, partialresult)
+        # check that there's nothing left in the buffers
+        self.assertEqual(r.read(), u"")
+        self.assertEqual(r.bytebuffer, "")
+        self.assertEqual(r.charbuffer, u"")
+
+class UTF16Test(PartialReadTest):
 
     spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
     spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
@@ -23,6 +61,81 @@ class UTF16Test(unittest.TestCase):
         f = reader(s)
         self.assertEquals(f.read(), u"spamspam")
 
+    def test_partial(self):
+        self.check_partial(
+            "utf-16",
+            u"\x00\xff\u0100\uffff",
+            [
+                u"", # first byte of BOM read
+                u"", # second byte of BOM read => byteorder known
+                u"",
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+class UTF16LETest(PartialReadTest):
+
+    def test_partial(self):
+        self.check_partial(
+            "utf-16-le",
+            u"\x00\xff\u0100\uffff",
+            [
+                u"",
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+class UTF16BETest(PartialReadTest):
+
+    def test_partial(self):
+        self.check_partial(
+            "utf-16-be",
+            u"\x00\xff\u0100\uffff",
+            [
+                u"",
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100",
+                u"\x00\xff\u0100\uffff",
+            ]
+        )
+
+class UTF8Test(PartialReadTest):
+
+    def test_partial(self):
+        self.check_partial(
+            "utf-8",
+            u"\x00\xff\u07ff\u0800\uffff",
+            [
+                u"\x00",
+                u"\x00",
+                u"\x00\xff",
+                u"\x00\xff",
+                u"\x00\xff\u07ff",
+                u"\x00\xff\u07ff",
+                u"\x00\xff\u07ff",
+                u"\x00\xff\u07ff\u0800",
+                u"\x00\xff\u07ff\u0800",
+                u"\x00\xff\u07ff\u0800",
+                u"\x00\xff\u07ff\u0800\uffff",
+            ]
+        )
+
 class EscapeDecodeTest(unittest.TestCase):
     def test_empty_escape_decode(self):
         self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -348,6 +461,9 @@ class CodecsModuleTest(unittest.TestCase):
 def test_main():
     test_support.run_unittest(
         UTF16Test,
+        UTF16LETest,
+        UTF16BETest,
+        UTF8Test,
         EscapeDecodeTest,
         RecodingTest,
         PunycodeTest,
author	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 +0000
committer	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 +0000
commit	69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree	088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Lib/test/test_codecs.py
parent	a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download	cpython-git-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz