diff options
| author | Victor Stinner <victor.stinner@gmail.com> | 2012-11-12 01:23:15 +0100 | 
|---|---|---|
| committer | Victor Stinner <victor.stinner@gmail.com> | 2012-11-12 01:23:15 +0100 | 
| commit | e667e98faad93e76b8b569d853eb02d91591f5fb (patch) | |
| tree | 009075f72468af1156312858aada27dd389678f3 /Lib/test/support.py | |
| parent | 37bfa4e7ec5df1528aa208150933a3fc54508cf9 (diff) | |
| download | cpython-git-e667e98faad93e76b8b569d853eb02d91591f5fb.tar.gz | |
Issue #16218, #16444: Backport improvement on tests for non-ASCII characters
Diffstat (limited to 'Lib/test/support.py')
| -rw-r--r-- | Lib/test/support.py | 75 | 
1 files changed, 75 insertions, 0 deletions
| diff --git a/Lib/test/support.py b/Lib/test/support.py index c5640e0a08..d0a37ea926 100644 --- a/Lib/test/support.py +++ b/Lib/test/support.py @@ -603,6 +603,49 @@ else:  # module name.  TESTFN = "{}_{}_tmp".format(TESTFN, os.getpid()) +# FS_NONASCII: non-ASCII character encodable by os.fsencode(), +# or None if there is no such character. +FS_NONASCII = None +for character in ( +    # First try printable and common characters to have a readable filename. +    # For each character, the encoding list are just example of encodings able +    # to encode the character (the list is not exhaustive). + +    # U+00E6 (Latin Small Letter Ae): cp1252, iso-8859-1 +    '\u00E6', +    # U+0130 (Latin Capital Letter I With Dot Above): cp1254, iso8859_3 +    '\u0130', +    # U+0141 (Latin Capital Letter L With Stroke): cp1250, cp1257 +    '\u0141', +    # U+03C6 (Greek Small Letter Phi): cp1253 +    '\u03C6', +    # U+041A (Cyrillic Capital Letter Ka): cp1251 +    '\u041A', +    # U+05D0 (Hebrew Letter Alef): Encodable to cp424 +    '\u05D0', +    # U+060C (Arabic Comma): cp864, cp1006, iso8859_6, mac_arabic +    '\u060C', +    # U+062A (Arabic Letter Teh): cp720 +    '\u062A', +    # U+0E01 (Thai Character Ko Kai): cp874 +    '\u0E01', + +    # Then try more "special" characters. "special" because they may be +    # interpreted or displayed differently depending on the exact locale +    # encoding and the font. + +    # U+00A0 (No-Break Space) +    '\u00A0', +    # U+20AC (Euro Sign) +    '\u20AC', +): +    try: +        os.fsdecode(os.fsencode(character)) +    except UnicodeError: +        pass +    else: +        FS_NONASCII = character +        break  # TESTFN_UNICODE is a non-ascii filename  TESTFN_UNICODE = TESTFN + "-\xe0\xf2\u0258\u0141\u011f" @@ -647,6 +690,38 @@ elif sys.platform != 'darwin':          # the byte 0xff. Skip some unicode filename tests.          
pass +# TESTFN_UNDECODABLE is a filename (bytes type) that should *not* be able to be +# decoded from the filesystem encoding (in strict mode). It can be None if we +# cannot generate such filename (ex: the latin1 encoding can decode any byte +# sequence). On UNIX, TESTFN_UNDECODABLE can be decoded by os.fsdecode() thanks +# to the surrogateescape error handler (PEP 383), but not from the filesystem +# encoding in strict mode. +TESTFN_UNDECODABLE = None +for name in ( +    # b'\xff' is not decodable by os.fsdecode() with code page 932. Windows +    # accepts it to create a file or a directory, or don't accept to enter to +    # such directory (when the bytes name is used). So test b'\xe7' first: it is +    # not decodable from cp932. +    b'\xe7w\xf0', +    # undecodable from ASCII, UTF-8 +    b'\xff', +    # undecodable from iso8859-3, iso8859-6, iso8859-7, cp424, iso8859-8, cp856 +    # and cp857 +    b'\xae\xd5' +    # undecodable from UTF-8 (UNIX and Mac OS X) +    b'\xed\xb2\x80', b'\xed\xb4\x80', +): +    try: +        name.decode(TESTFN_ENCODING) +    except UnicodeDecodeError: +        TESTFN_UNDECODABLE = os.fsencode(TESTFN) + name +        break + +if FS_NONASCII: +    TESTFN_NONASCII = TESTFN + '-' + FS_NONASCII +else: +    TESTFN_NONASCII = None +  # Save the initial cwd  SAVEDCWD = os.getcwd() | 
