Issue #84: Unicode support in screen and ANSI.

This commit updates the screen and ANSI modules to support Unicode under Python 2.x. Under Python 3.x, it was already supported because strings are Unicode by default. Now, on both Python versions: - The constructors accept a codec name (defaults to 'latin-1') and a scheme for handling encoding/decoding errors (defaults to 'replace'). The codec may be set to None to inhibit encoding/decoding. - Unicode is now used internally for storing the screen contents. - Methods that accept input characters will, if passed input of type 'bytes' (or, under Python 2.x, 'str'), use the specified codec to decode the input, otherwise treating it as Unicode. - Methods that return screen contents now return Unicode, with the exception of __str__() under Python 2.x, and __bytes__() in all versions of Python, which return the screen contents encoded using the specified codec. These changes are designed to work only with Python 2.6, 2.7, and 3.3 and later, specifically versions that provide both b'' and u'' string literals. The check in ANSI for characters being printable is also removed, as this prevents non-ASCII characters being accepted, which is not compatible with the goal of adding Unicode support. This addresses issue #83.
author: David O'Shea <doshea@doshea-centos-x86-64.adl.quantum.com> 2014-07-08 10:40:27 +0930
committer: David O'Shea <doshea@doshea-centos-x86-64.adl.quantum.com> 2014-07-24 22:09:23 +0930
commit: db34b8fed2c21b8b790d2710fde1004d49c8f00e (patch)
tree: 589a5f4a902fc38570136c615686bd5bab8c4a68 /tests/test_ansi.py
parent: cfca6f309c1c8109bd3ad023c6382849fb5c6632 (diff)
download: pexpect-db34b8fed2c21b8b790d2710fde1004d49c8f00e.tar.gz
1 files changed, 52 insertions, 0 deletions
diff --git a/tests/test_ansi.py b/tests/test_ansi.py
index 3b8d6a9..48d92a8 100755
--- a/tests/test_ansi.py
+++ b/tests/test_ansi.py
@@ -21,6 +21,11 @@ PEXPECT LICENSE
 from pexpect import ANSI
 import unittest
 from . import PexpectTestCase
+import sys
+
+PY3 = (sys.version_info[0] >= 3)
+if PY3:
+    unicode = str
 
 write_target = 'I\'ve got a ferret sticking up my nose.                           \n' +\
 '(He\'s got a ferret sticking up his nose.)                        \n' +\
@@ -147,6 +152,53 @@ class ansiTestCase (PexpectTestCase.PexpectTestCase):
         assert str(s) == ('test                ')
         assert(s.state.memory == [s, '0', '1', '32', '45'])
 
+    def test_utf8_bytes(self):
+        """Test that when bytes are passed in containing UTF-8 encoded
+        characters, where the encoding of each character consists of
+        multiple bytes, the characters are correctly decoded.
+        Incremental decoding is also tested."""
+        s = ANSI.ANSI(2, 10, codec='utf-8')
+        # This is the UTF-8 encoding of the UCS character "HOURGLASS"
+        # followed by the UTF-8 encoding of the UCS character
+        # "KEYBOARD".  These characters can't be encoded in cp437 or
+        # latin-1.  The "KEYBOARD" character is split into two
+        # separate writes.
+        s.write(b'\xe2\x8c\x9b')
+        s.write(b'\xe2\x8c')
+        s.write(b'\xa8')
+        assert unicode(s) == u'\u231b\u2328        \n          '
+        assert bytes(s) == b'\xe2\x8c\x9b\xe2\x8c\xa8        \n          '
+        assert s.dump() == u'\u231b\u2328                  '
+        assert s.pretty() == u'+----------+\n|\u231b\u2328        |\n|          |\n+----------+\n'
+        assert s.get_abs(1, 1) == u'\u231b'
+        assert s.get_region(1, 1, 1, 5) == [u'\u231b\u2328   ']
+
+    def test_unicode(self):
+        """Test passing in of a unicode string."""
+        s = ANSI.ANSI(2, 10, codec="utf-8")
+        s.write(u'\u231b\u2328')
+        assert unicode(s) == u'\u231b\u2328        \n          '
+        assert bytes(s) == b'\xe2\x8c\x9b\xe2\x8c\xa8        \n          '
+        assert s.dump() == u'\u231b\u2328                  '
+        assert s.pretty() == u'+----------+\n|\u231b\u2328        |\n|          |\n+----------+\n'
+        assert s.get_abs(1, 1) == u'\u231b'
+        assert s.get_region(1, 1, 1, 5) == [u'\u231b\u2328   ']
+
+    def test_decode_error(self):
+        """Test that default handling of decode errors replaces the
+        invalid characters."""
+        s = ANSI.ANSI(2, 10, codec="ascii")
+        s.write(b'\xff') # a non-ASCII character
+        # In unicode, the non-ASCII character is replaced with
+        # REPLACEMENT CHARACTER.
+        assert unicode(s) == u'\ufffd         \n          '
+        assert bytes(s) == b'?         \n          '
+        assert s.dump() == u'\ufffd                   '
+        assert s.pretty() == u'+----------+\n|\ufffd         |\n|          |\n+----------+\n'
+        assert s.get_abs(1, 1) == u'\ufffd'
+        assert s.get_region(1, 1, 1, 5) == [u'\ufffd    ']
+
+
 if __name__ == '__main__':
     unittest.main()
author	David O'Shea <doshea@doshea-centos-x86-64.adl.quantum.com>	2014-07-08 10:40:27 +0930
committer	David O'Shea <doshea@doshea-centos-x86-64.adl.quantum.com>	2014-07-24 22:09:23 +0930
commit	db34b8fed2c21b8b790d2710fde1004d49c8f00e (patch)
tree	589a5f4a902fc38570136c615686bd5bab8c4a68 /tests/test_ansi.py
parent	cfca6f309c1c8109bd3ad023c6382849fb5c6632 (diff)
download	pexpect-db34b8fed2c21b8b790d2710fde1004d49c8f00e.tar.gz