diff options
-rw-r--r-- | Doc/c-api/init_config.rst | 52 | ||||
-rw-r--r-- | Doc/library/sys.rst | 31 | ||||
-rw-r--r-- | Include/cpython/initconfig.h | 37 | ||||
-rw-r--r-- | Include/internal/pycore_fileutils.h | 2 | ||||
-rw-r--r-- | Include/pyport.h | 8 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst | 2 | ||||
-rw-r--r-- | Python/fileutils.c | 34 | ||||
-rw-r--r-- | Python/initconfig.c | 12 |
8 files changed, 88 insertions, 90 deletions
diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst index 37f5b9f880..92a6c3a56d 100644 --- a/Doc/c-api/init_config.rst +++ b/Doc/c-api/init_config.rst @@ -253,10 +253,16 @@ PyPreConfig See :c:member:`PyConfig.isolated`. - .. c:member:: int legacy_windows_fs_encoding (Windows only) + .. c:member:: int legacy_windows_fs_encoding - If non-zero, disable UTF-8 Mode, set the Python filesystem encoding to - ``mbcs``, set the filesystem error handler to ``replace``. + If non-zero: + + * Set :c:member:`PyPreConfig.utf8_mode` to ``0``, + * Set :c:member:`PyConfig.filesystem_encoding` to ``"mbcs"``, + * Set :c:member:`PyConfig.filesystem_errors` to ``"replace"``. + + Initialized from the :envvar:`PYTHONLEGACYWINDOWSFSENCODING` environment + variable value. Only available on Windows. ``#ifdef MS_WINDOWS`` macro can be used for Windows specific code. @@ -499,11 +505,47 @@ PyConfig .. c:member:: wchar_t* filesystem_encoding - Filesystem encoding, :func:`sys.getfilesystemencoding`. + Filesystem encoding: :func:`sys.getfilesystemencoding`. + + On macOS, Android and VxWorks: use ``"utf-8"`` by default. + + On Windows: use ``"utf-8"`` by default, or ``"mbcs"`` if + :c:member:`~PyPreConfig.legacy_windows_fs_encoding` of + :c:type:`PyPreConfig` is non-zero. + + Default encoding on other platforms: + + * ``"utf-8"`` if :c:member:`PyPreConfig.utf8_mode` is non-zero. + * ``"ascii"`` if Python detects that ``nl_langinfo(CODESET)`` announces + the ASCII encoding (or Roman8 encoding on HP-UX), whereas the + ``mbstowcs()`` function decodes from a different encoding (usually + Latin1). + * ``"utf-8"`` if ``nl_langinfo(CODESET)`` returns an empty string. + * Otherwise, use the LC_CTYPE locale encoding: + ``nl_langinfo(CODESET)`` result. + + At Python startup, the encoding name is normalized to the Python codec + name. For example, ``"ANSI_X3.4-1968"`` is replaced with ``"ascii"``. + + See also the :c:member:`~PyConfig.filesystem_errors` member. .. 
c:member:: wchar_t* filesystem_errors - Filesystem encoding errors, :func:`sys.getfilesystemencodeerrors`. + Filesystem error handler: :func:`sys.getfilesystemencodeerrors`. + + On Windows: use ``"surrogatepass"`` by default, or ``"replace"`` if + :c:member:`~PyPreConfig.legacy_windows_fs_encoding` of + :c:type:`PyPreConfig` is non-zero. + + On other platforms: use ``"surrogateescape"`` by default. + + Supported error handlers: + + * ``"strict"`` + * ``"surrogateescape"`` + * ``"surrogatepass"`` (only supported with the UTF-8 encoding) + + See also the :c:member:`~PyConfig.filesystem_encoding` member. .. c:member:: unsigned long hash_seed .. c:member:: int use_hash_seed diff --git a/Doc/library/sys.rst b/Doc/library/sys.rst index 468a30d326..2f0840e2a7 100644 --- a/Doc/library/sys.rst +++ b/Doc/library/sys.rst @@ -616,29 +616,20 @@ always available. .. function:: getfilesystemencoding() Return the name of the encoding used to convert between Unicode - filenames and bytes filenames. For best compatibility, str should be - used for filenames in all cases, although representing filenames as bytes - is also supported. Functions accepting or returning filenames should support - either str or bytes and internally convert to the system's preferred - representation. + filenames and bytes filenames. + + For best compatibility, str should be used for filenames in all cases, + although representing filenames as bytes is also supported. Functions + accepting or returning filenames should support either str or bytes and + internally convert to the system's preferred representation. This encoding is always ASCII-compatible. :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that the correct encoding and errors mode are used. - * In the UTF-8 mode, the encoding is ``utf-8`` on any platform. - - * On macOS, the encoding is ``'utf-8'``. - - * On Unix, the encoding is the locale encoding. 
- - * On Windows, the encoding may be ``'utf-8'`` or ``'mbcs'``, depending - on user configuration. - - * On Android, the encoding is ``'utf-8'``. - - * On VxWorks, the encoding is ``'utf-8'``. + The filesystem encoding is initialized from + :c:member:`PyConfig.filesystem_encoding`. .. versionchanged:: 3.2 :func:`getfilesystemencoding` result cannot be ``None`` anymore. @@ -660,6 +651,9 @@ always available. :func:`os.fsencode` and :func:`os.fsdecode` should be used to ensure that the correct encoding and errors mode are used. + The filesystem error handler is initialized from + :c:member:`PyConfig.filesystem_errors`. + .. versionadded:: 3.6 .. function:: getrefcount(object) @@ -1457,6 +1451,9 @@ always available. This is equivalent to defining the :envvar:`PYTHONLEGACYWINDOWSFSENCODING` environment variable before launching Python. + See also :func:`sys.getfilesystemencoding` and + :func:`sys.getfilesystemencodeerrors`. + .. availability:: Windows. .. versionadded:: 3.6 diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index bbe8387677..dd5ca6121c 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -156,36 +156,13 @@ typedef struct { /* Python filesystem encoding and error handler: sys.getfilesystemencoding() and sys.getfilesystemencodeerrors(). 
- Default encoding and error handler: - - * if Py_SetStandardStreamEncoding() has been called: they have the - highest priority; - * PYTHONIOENCODING environment variable; - * The UTF-8 Mode uses UTF-8/surrogateescape; - * If Python forces the usage of the ASCII encoding (ex: C locale - or POSIX locale on FreeBSD or HP-UX), use ASCII/surrogateescape; - * locale encoding: ANSI code page on Windows, UTF-8 on Android and - VxWorks, LC_CTYPE locale encoding on other platforms; - * On Windows, "surrogateescape" error handler; - * "surrogateescape" error handler if the LC_CTYPE locale is "C" or "POSIX"; - * "surrogateescape" error handler if the LC_CTYPE locale has been coerced - (PEP 538); - * "strict" error handler. - - Supported error handlers: "strict", "surrogateescape" and - "surrogatepass". The surrogatepass error handler is only supported - if Py_DecodeLocale() and Py_EncodeLocale() use directly the UTF-8 codec; - it's only used on Windows. - - initfsencoding() updates the encoding to the Python codec name. - For example, "ANSI_X3.4-1968" is replaced with "ascii". - - On Windows, sys._enablelegacywindowsfsencoding() sets the - encoding/errors to mbcs/replace at runtime. - - - See Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors. - */ + The Doc/c-api/init_config.rst documentation explains how Python selects + the filesystem encoding and error handler. + + _PyUnicode_InitEncodings() updates the encoding name to the Python codec + name. For example, "ANSI_X3.4-1968" is replaced with "ascii". It also + sets Py_FileSystemDefaultEncoding to filesystem_encoding and + sets Py_FileSystemDefaultEncodeErrors to filesystem_errors. 
*/ wchar_t *filesystem_encoding; wchar_t *filesystem_errors; diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h index 1ab554f945..9281f4eeb8 100644 --- a/Include/internal/pycore_fileutils.h +++ b/Include/internal/pycore_fileutils.h @@ -50,7 +50,7 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric( PyAPI_FUNC(void) _Py_closerange(int first, int last); -PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(const char **errmsg); +PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(void); PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void); #ifdef __cplusplus diff --git a/Include/pyport.h b/Include/pyport.h index 7137006870..79fc7c4a52 100644 --- a/Include/pyport.h +++ b/Include/pyport.h @@ -841,12 +841,16 @@ extern _invalid_parameter_handler _Py_silent_invalid_parameter_handler; #endif #if defined(__ANDROID__) || defined(__VXWORKS__) - /* Ignore the locale encoding: force UTF-8 */ + // Use UTF-8 as the locale encoding, ignore the LC_CTYPE locale. + // See _Py_GetLocaleEncoding(), PyUnicode_DecodeLocale() + // and PyUnicode_EncodeLocale(). # define _Py_FORCE_UTF8_LOCALE #endif #if defined(_Py_FORCE_UTF8_LOCALE) || defined(__APPLE__) - /* Use UTF-8 as filesystem encoding */ + // Use UTF-8 as the filesystem encoding. + // See PyUnicode_DecodeFSDefaultAndSize(), PyUnicode_EncodeFSDefault(), + // Py_DecodeLocale() and Py_EncodeLocale(). # define _Py_FORCE_UTF8_FS_ENCODING #endif diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst b/Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst new file mode 100644 index 0000000000..22e8c534ff --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-11-01-21-21-38.bpo-42236.MPx-NK.rst @@ -0,0 +1,2 @@ +If the ``nl_langinfo(CODESET)`` function returns an empty string, Python now +uses UTF-8 as the filesystem encoding. Patch by Victor Stinner. 
diff --git a/Python/fileutils.c b/Python/fileutils.c index 72cdee2a51..5177b37288 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -826,20 +826,15 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str, // - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android) // - Return "UTF-8" if the UTF-8 Mode is enabled // - On Windows, return the ANSI code page (ex: "cp1250") -// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string -// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS). +// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string. // - Otherwise, return nl_langinfo(CODESET). // -// Return NULL and set errmsg to an error message -// if nl_langinfo(CODESET) fails. -// -// Return NULL and set errmsg to NULL on memory allocation failure. +// Return NULL on memory allocation failure. // // See also config_get_locale_encoding() wchar_t* -_Py_GetLocaleEncoding(const char **errmsg) +_Py_GetLocaleEncoding(void) { - *errmsg = NULL; #ifdef _Py_FORCE_UTF8_LOCALE // On Android langinfo.h and CODESET are missing, // and UTF-8 is always used in mbstowcs() and wcstombs(). @@ -859,21 +854,14 @@ _Py_GetLocaleEncoding(const char **errmsg) #else const char *encoding = nl_langinfo(CODESET); if (!encoding || encoding[0] == '\0') { -#ifdef _Py_FORCE_UTF8_FS_ENCODING - // nl_langinfo() can return an empty string when the LC_CTYPE locale is - // not supported. Default to UTF-8 in that case, because UTF-8 is the - // default charset on macOS. + // Use UTF-8 if nl_langinfo() returns an empty string. It can happen on + // macOS if the LC_CTYPE locale is not supported. 
return _PyMem_RawWcsdup(L"UTF-8"); -#else - *errmsg = "failed to get the locale encoding: " - "nl_langinfo(CODESET) returns an empty string"; - return NULL; -#endif } wchar_t *wstr; int res = decode_current_locale(encoding, &wstr, NULL, - errmsg, _Py_ERROR_SURROGATEESCAPE); + NULL, _Py_ERROR_SURROGATEESCAPE); if (res < 0) { return NULL; } @@ -887,15 +875,9 @@ _Py_GetLocaleEncoding(const char **errmsg) PyObject * _Py_GetLocaleEncodingObject(void) { - const char *errmsg; - wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + wchar_t *encoding = _Py_GetLocaleEncoding(); if (encoding == NULL) { - if (errmsg != NULL) { - PyErr_SetString(PyExc_ValueError, errmsg); - } - else { - PyErr_NoMemory(); - } + PyErr_NoMemory(); return NULL; } diff --git a/Python/initconfig.c b/Python/initconfig.c index 56f4297ba9..d0ff888c7f 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -1318,7 +1318,7 @@ config_read_env_vars(PyConfig *config) #ifdef MS_WINDOWS _Py_get_env_flag(use_env, &config->legacy_windows_stdio, - "PYTHONLEGACYWINDOWSSTDIO"); + "PYTHONLEGACYWINDOWSSTDIO"); #endif if (config_get_env(config, "PYTHONDUMPREFS")) { @@ -1498,15 +1498,9 @@ static PyStatus config_get_locale_encoding(PyConfig *config, const PyPreConfig *preconfig, wchar_t **locale_encoding) { - const char *errmsg; - wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + wchar_t *encoding = _Py_GetLocaleEncoding(); if (encoding == NULL) { - if (errmsg != NULL) { - return _PyStatus_ERR(errmsg); - } - else { - return _PyStatus_NO_MEMORY(); - } + return _PyStatus_NO_MEMORY(); } PyStatus status = PyConfig_SetString(config, locale_encoding, encoding); PyMem_RawFree(encoding); |