From e662c398d87f136497f8ec672e83657ae3a599e0 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sun, 1 Nov 2020 23:07:23 +0100 Subject: bpo-42236: Use UTF-8 encoding if nl_langinfo(CODESET) fails (GH-23086) If the nl_langinfo(CODESET) function returns an empty string, Python now uses UTF-8 as the filesystem encoding. In May 2010 (commit b744ba1d14c5487576c95d0311e357b707600b47), I modified Python to log a warning and use UTF-8 as the filesystem encoding (instead of None) if nl_langinfo(CODESET) returns an empty string. In August 2020 (commit 94908bbc1503df830d1d615e7b57744ae1b41079), I modified Python startup to fail with a fatal error and a specific error message if nl_langinfo(CODESET) returns an empty string. The intent was to prevent guessing the encoding and also to investigate user configurations where this case happens. In 10 years (2010 to 2020), I saw zero user reports about the error message related to nl_langinfo(CODESET) returning an empty string. Today, UTF-8 has become the de facto standard and it's safe to make the assumption that the user expects UTF-8. For example, nl_langinfo(CODESET) can return an empty string on macOS if the LC_CTYPE locale is not supported, and UTF-8 is the default encoding on macOS. While this change is unlikely to affect anyone in practice, it should make UTF-8 lovers happy ;-) Also rewrite the documentation explaining how Python selects the filesystem encoding and error handler. 
--- Python/fileutils.c | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) (limited to 'Python/fileutils.c') diff --git a/Python/fileutils.c b/Python/fileutils.c index 72cdee2a51..5177b37288 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -826,20 +826,15 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str, // - Return "UTF-8" if _Py_FORCE_UTF8_LOCALE macro is defined (ex: on Android) // - Return "UTF-8" if the UTF-8 Mode is enabled // - On Windows, return the ANSI code page (ex: "cp1250") -// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string -// and if the _Py_FORCE_UTF8_FS_ENCODING macro is defined (ex: on macOS). +// - Return "UTF-8" if nl_langinfo(CODESET) returns an empty string. // - Otherwise, return nl_langinfo(CODESET). // -// Return NULL and set errmsg to an error message -// if nl_langinfo(CODESET) fails. -// -// Return NULL and set errmsg to NULL on memory allocation failure. +// Return NULL on memory allocation failure. // // See also config_get_locale_encoding() wchar_t* -_Py_GetLocaleEncoding(const char **errmsg) +_Py_GetLocaleEncoding(void) { - *errmsg = NULL; #ifdef _Py_FORCE_UTF8_LOCALE // On Android langinfo.h and CODESET are missing, // and UTF-8 is always used in mbstowcs() and wcstombs(). @@ -859,21 +854,14 @@ _Py_GetLocaleEncoding(const char **errmsg) #else const char *encoding = nl_langinfo(CODESET); if (!encoding || encoding[0] == '\0') { -#ifdef _Py_FORCE_UTF8_FS_ENCODING - // nl_langinfo() can return an empty string when the LC_CTYPE locale is - // not supported. Default to UTF-8 in that case, because UTF-8 is the - // default charset on macOS. + // Use UTF-8 if nl_langinfo() returns an empty string. It can happen on + // macOS if the LC_CTYPE locale is not supported. 
return _PyMem_RawWcsdup(L"UTF-8"); -#else - *errmsg = "failed to get the locale encoding: " - "nl_langinfo(CODESET) returns an empty string"; - return NULL; -#endif } wchar_t *wstr; int res = decode_current_locale(encoding, &wstr, NULL, - errmsg, _Py_ERROR_SURROGATEESCAPE); + NULL, _Py_ERROR_SURROGATEESCAPE); if (res < 0) { return NULL; } @@ -887,15 +875,9 @@ _Py_GetLocaleEncoding(const char **errmsg) PyObject * _Py_GetLocaleEncodingObject(void) { - const char *errmsg; - wchar_t *encoding = _Py_GetLocaleEncoding(&errmsg); + wchar_t *encoding = _Py_GetLocaleEncoding(); if (encoding == NULL) { - if (errmsg != NULL) { - PyErr_SetString(PyExc_ValueError, errmsg); - } - else { - PyErr_NoMemory(); - } + PyErr_NoMemory(); return NULL; } -- cgit v1.2.1