diff options
| author | Serhiy Storchaka <storchaka@gmail.com> | 2015-10-02 13:13:14 +0300 | 
|---|---|---|
| committer | Serhiy Storchaka <storchaka@gmail.com> | 2015-10-02 13:13:14 +0300 | 
| commit | 58c8f2bb6de115b620cec3cf995f04005573765c (patch) | |
| tree | d5baea3027a00eb820002365608c30762d7da22e | |
| parent | b9d98d532cb9bdebff9854eaff91fea13769a595 (diff) | |
| parent | 28b21e50c8f1bc9f4524b02df75b83f3b5efacb4 (diff) | |
| download | cpython-git-58c8f2bb6de115b620cec3cf995f04005573765c.tar.gz | |
Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:
1. Non-ASCII bytes were accepted after a shift sequence.
2. A low surrogate could be emitted in case of an error in the high surrogate.
3. In some circumstances the '\xfd' character was produced instead of the
replacement character '\ufffd' (due to a bug in _PyUnicodeWriter).
| -rw-r--r-- | Lib/test/test_codecs.py | 60 | ||||
| -rw-r--r-- | Lib/test/test_unicode.py | 3 | ||||
| -rw-r--r-- | Misc/NEWS | 2 | ||||
| -rw-r--r-- | Objects/unicodeobject.c | 21 | 
4 files changed, 75 insertions, 11 deletions
| diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index a4a6f95ca2..cc1f11aeaf 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -903,6 +903,32 @@ class CP65001Test(ReadTest, unittest.TestCase):  class UTF7Test(ReadTest, unittest.TestCase):      encoding = "utf-7" +    def test_ascii(self): +        # Set D (directly encoded characters) +        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' +                 'abcdefghijklmnopqrstuvwxyz' +                 '0123456789' +                 '\'(),-./:?') +        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii')) +        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d) +        # Set O (optional direct characters) +        set_o = ' !"#$%&*;<=>@[]^_`{|}' +        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii')) +        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o) +        # + +        self.assertEqual('a+b'.encode(self.encoding), b'a+-b') +        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b') +        # White spaces +        ws = ' \t\n\r' +        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii')) +        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws) +        # Other ASCII characters +        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) - +                                     set(set_d + set_o + '+' + ws))) +        self.assertEqual(other_ascii.encode(self.encoding), +                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU' +                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-') +      def test_partial(self):          self.check_partial(              'a+-b\x00c\x80d\u0100e\U00010000f', @@ -944,7 +970,9 @@ class UTF7Test(ReadTest, unittest.TestCase):      def test_errors(self):          tests = [ +            (b'\xffb', '\ufffdb'),              (b'a\xffb', 'a\ufffdb'), +            (b'a\xff\xffb', 'a\ufffd\ufffdb'),        
      (b'a+IK', 'a\ufffd'),              (b'a+IK-b', 'a\ufffdb'),              (b'a+IK,b', 'a\ufffdb'), @@ -960,6 +988,8 @@ class UTF7Test(ReadTest, unittest.TestCase):              (b'a+//,+IKw-b', 'a\ufffd\u20acb'),              (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),              (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'), +            (b'a+IKw-b\xff', 'a\u20acb\ufffd'), +            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),          ]          for raw, expected in tests:              with self.subTest(raw=raw): @@ -971,8 +1001,36 @@ class UTF7Test(ReadTest, unittest.TestCase):          self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')          self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')          self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') +        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0') +        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-') +        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0') +        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0') +        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding), +                         b'+IKwgrNgB3KA-') +        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding), +                         '\u20ac\u20ac\U000104A0') +        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding), +                         '\u20ac\u20ac\U000104A0') -    test_lone_surrogates = None +    def test_lone_surrogates(self): +        tests = [ +            (b'a+2AE-b', 'a\ud801b'), +            (b'a+2AE\xffb', 'a\ufffdb'), +            (b'a+2AE', 'a\ufffd'), +            (b'a+2AEA-b', 'a\ufffdb'), +            (b'a+2AH-b', 'a\ufffdb'), +            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'), +            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'), +            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'), +            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'), +            (b'a+IKwgrNgB-b', 
'a\u20ac\u20ac\ud801b'), +            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'), +            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'), +            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'), +        ] +        for raw, expected in tests: +            with self.subTest(raw=raw): +                self.assertEqual(raw.decode('utf-7', 'replace'), expected)  class UTF16ExTest(unittest.TestCase): diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 3fcb590f69..1429a6d545 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1553,7 +1553,7 @@ class UnicodeTest(string_tests.CommonTest,          self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')          # Issue #2242: crash on some Windows/MSVC versions -        self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1') +        self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')          # Direct encoded characters          set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" @@ -1995,6 +1995,7 @@ class UnicodeTest(string_tests.CommonTest,          self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')          self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")          self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x') +        self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')          # Error handling (unknown character names)          self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx") @@ -11,6 +11,8 @@ Release date: TBA  Core and Builtins  ----------------- +- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data. +  - Issue #25280: Import trace messages emitted in verbose (-v) mode are no    longer formatted twice. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9223c9911e..168f9f9923 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4330,31 +4330,31 @@ PyUnicode_DecodeUTF7Stateful(const char *s,              }              else { /* now leaving a base-64 section */                  inShift = 0; -                s++; -                if (surrogate) { -                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) -                        goto onError; -                    surrogate = 0; -                }                  if (base64bits > 0) { /* left-over bits */                      if (base64bits >= 6) {                          /* We've seen at least one base-64 character */ +                        s++;                          errmsg = "partial character in shift sequence";                          goto utf7Error;                      }                      else {                          /* Some bits remain; they should be zero */                          if (base64buffer != 0) { +                            s++;                              errmsg = "non-zero padding bits in shift sequence";                              goto utf7Error;                          }                      }                  } -                if (ch != '-') { +                if (surrogate && DECODE_DIRECT(ch)) { +                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0) +                        goto onError; +                } +                surrogate = 0; +                if (ch == '-') {                      /* '-' is absorbed; other terminating                         characters are preserved */ -                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) -                        goto onError; +                    s++;                  }              }          } @@ -4368,6 +4368,7 @@ PyUnicode_DecodeUTF7Stateful(const char *s,              }              else { /* begin base64-encoded 
section */                  inShift = 1; +                surrogate = 0;                  shiftOutStart = writer.pos;                  base64bits = 0;                  base64buffer = 0; @@ -4399,6 +4400,7 @@ utf7Error:      if (inShift && !consumed) { /* in shift sequence, no more to follow */          /* if we're in an inconsistent state, that's an error */ +        inShift = 0;          if (surrogate ||                  (base64bits >= 6) ||                  (base64bits > 0 && base64buffer != 0)) { @@ -13291,6 +13293,7 @@ _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,          if (maxchar > writer->maxchar || writer->readonly) {              /* resize + widen */ +            maxchar = Py_MAX(maxchar, writer->maxchar);              newbuffer = PyUnicode_New(newlen, maxchar);              if (newbuffer == NULL)                  return -1; | 
