bpo-36775: _PyCoreConfig only uses wchar_t* (GH-13062)

_PyCoreConfig: Change the type of the filesystem_encoding, filesystem_errors, stdio_encoding and stdio_errors fields from char* to wchar_t*.

Changes:

* PyInterpreterState: replace fscodec_initialized (int) with the fs_codec structure.
* Add get_error_handler_wide() and unicode_encode_utf8() helper functions.
* Add error_handler parameter to unicode_encode_locale() and unicode_decode_locale().
* Remove _PyCoreConfig_SetString().
* Rename _PyCoreConfig_SetWideString() to _PyCoreConfig_SetString().
* Rename _PyCoreConfig_SetWideStringFromString() to _PyCoreConfig_DecodeLocale().
author: Victor Stinner <vstinner@redhat.com> 2019-05-02 18:56:30 (GMT)
committer: GitHub <noreply@github.com> 2019-05-02 18:56:30 (GMT)
commit: 709d23dee69e700b87d5a4cb59e149d0e1af7993 (patch)
tree: b06aafe79f83137a3c85649bcebf1fbfd2ea1240 /Objects/unicodeobject.c
parent: 6ae2bbbdfcb8969d1d362b17c2fbd5a684fa4f9d (diff)
download: cpython-709d23dee69e700b87d5a4cb59e149d0e1af7993.zip
cpython-709d23dee69e700b87d5a4cb59e149d0e1af7993.tar.gz
cpython-709d23dee69e700b87d5a4cb59e149d0e1af7993.tar.bz2
1 files changed, 242 insertions, 72 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5b6b241..4d86519e8 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -40,6 +40,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 #define PY_SSIZE_T_CLEAN
 #include "Python.h"
+#include "pycore_coreconfig.h"
 #include "pycore_fileutils.h"
 #include "pycore_object.h"
 #include "pycore_pylifecycle.h"
@@ -264,6 +265,13 @@ unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
 /* Forward declaration */
 static inline int
 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
+static PyObject *
+unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
+                    const char *errors);
+static PyObject *
+unicode_decode_utf8(const char *s, Py_ssize_t size,
+                    _Py_error_handler error_handler, const char *errors,
+                    Py_ssize_t *consumed);
 
 /* List of static strings. */
 static _Py_Identifier *static_strings = NULL;
@@ -388,6 +396,35 @@ _Py_GetErrorHandler(const char *errors)
     return _Py_ERROR_OTHER;
 }
 
+
+/* Map a wide-character error handler name to its _Py_error_handler value.
+
+   A NULL name is treated like L"strict".  A name that matches none of the
+   entries in the table below yields _Py_ERROR_OTHER. */
+static _Py_error_handler
+get_error_handler_wide(const wchar_t *errors)
+{
+    static const struct {
+        const wchar_t *name;
+        _Py_error_handler handler;
+    } known_handlers[] = {
+        {L"strict", _Py_ERROR_STRICT},
+        {L"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
+        {L"replace", _Py_ERROR_REPLACE},
+        {L"ignore", _Py_ERROR_IGNORE},
+        {L"backslashreplace", _Py_ERROR_BACKSLASHREPLACE},
+        {L"surrogatepass", _Py_ERROR_SURROGATEPASS},
+        {L"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
+    };
+    const size_t count = sizeof(known_handlers) / sizeof(known_handlers[0]);
+
+    if (errors == NULL) {
+        return _Py_ERROR_STRICT;
+    }
+    for (size_t i = 0; i < count; i++) {
+        if (wcscmp(errors, known_handlers[i].name) == 0) {
+            return known_handlers[i].handler;
+        }
+    }
+    return _Py_ERROR_OTHER;
+}
+
+
 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
    This function is kept for backward compatibility with the old API. */
 Py_UNICODE
@@ -3445,11 +3482,9 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
 
 
 static PyObject *
-unicode_encode_locale(PyObject *unicode, const char *errors,
+unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
                       int current_locale)
 {
-    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
-
     Py_ssize_t wlen;
     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
     if (wstr == NULL) {
@@ -3499,30 +3534,44 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
+/* Encode `unicode` through unicode_encode_locale() with current_locale=1.
+   The `errors` name is first converted to a _Py_error_handler value by
+   _Py_GetErrorHandler(). */
 PyObject *
 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
 {
-    return unicode_encode_locale(unicode, errors, 1);
+    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
+    return unicode_encode_locale(unicode, error_handler, 1);
 }
 
+/* Encode `unicode` using the filesystem encoding.
+
+   While interp->fs_codec.encoding is still NULL, the error handler is
+   derived from core_config.filesystem_errors and a C encoder is called
+   directly (unicode_encode_utf8() when _Py_FORCE_UTF8_FS_ENCODING is
+   defined, unicode_encode_locale() with current_locale=0 otherwise).
+   Once it is set, the values stored in interp->fs_codec are used. */
 PyObject *
 PyUnicode_EncodeFSDefault(PyObject *unicode)
 {
     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
-    const _PyCoreConfig *config = &interp->core_config;
 #ifdef _Py_FORCE_UTF8_FS_ENCODING
-    return _PyUnicode_AsUTF8String(unicode, config->filesystem_errors);
+    if (interp->fs_codec.encoding) {
+        /* fs_codec is populated: reuse its stored handler and errors. */
+        return unicode_encode_utf8(unicode,
+                                   interp->fs_codec.error_handler,
+                                   interp->fs_codec.errors);
+    }
+    else {
+        /* fs_codec is not set yet: compute the handler from the wide
+           string stored in the core configuration. */
+        const _PyCoreConfig *config = &interp->core_config;
+        _Py_error_handler errors;
+        errors = get_error_handler_wide(config->filesystem_errors);
+        assert(errors != _Py_ERROR_UNKNOWN);
+        return unicode_encode_utf8(unicode, errors, NULL);
+    }
 #else
     /* Bootstrap check: if the filesystem codec is implemented in Python, we
        cannot use it to encode and decode filenames before it is loaded. Load
        the Python codec requires to encode at least its own filename. Use the C
        implementation of the locale codec until the codec registry is
        initialized and the Python codec is loaded. See initfsencoding(). */
-    if (interp->fscodec_initialized) {
+    if (interp->fs_codec.encoding) {
         return PyUnicode_AsEncodedString(unicode,
-                                         config->filesystem_encoding,
-                                         config->filesystem_errors);
+                                         interp->fs_codec.encoding,
+                                         interp->fs_codec.errors);
     }
     else {
-        return unicode_encode_locale(unicode,
-                                     config->filesystem_errors, 0);
+        /* fs_codec is not set yet: compute the handler from the wide
+           string stored in the core configuration. */
+        const _PyCoreConfig *config = &interp->core_config;
+        _Py_error_handler errors;
+        errors = get_error_handler_wide(config->filesystem_errors);
+        assert(errors != _Py_ERROR_UNKNOWN);
+        return unicode_encode_locale(unicode, errors, 0);
     }
 #endif
 }
@@ -3663,11 +3712,9 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
 }
 
 static PyObject*
-unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
-                      int current_locale)
+unicode_decode_locale(const char *str, Py_ssize_t len,
+                      _Py_error_handler errors, int current_locale)
 {
-    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
-
     if (str[len] != '\0' || (size_t)len != strlen(str))  {
         PyErr_SetString(PyExc_ValueError, "embedded null byte");
         return NULL;
@@ -3677,7 +3724,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
     size_t wlen;
     const char *reason;
     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
-                                 current_locale, error_handler);
+                                 current_locale, errors);
     if (res != 0) {
         if (res == -2) {
             PyObject *exc;
@@ -3709,14 +3756,16 @@ PyObject*
 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
                               const char *errors)
 {
-    return unicode_decode_locale(str, len, errors, 1);
+    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
+    return unicode_decode_locale(str, len, error_handler, 1);
 }
 
+/* Decode the NUL-terminated string `str` through unicode_decode_locale()
+   with current_locale=1.  The length is measured with strlen() and the
+   `errors` name is converted by _Py_GetErrorHandler() before the call. */
 PyObject*
 PyUnicode_DecodeLocale(const char *str, const char *errors)
 {
     Py_ssize_t size = (Py_ssize_t)strlen(str);
-    return unicode_decode_locale(str, size, errors, 1);
+    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
+    return unicode_decode_locale(str, size, error_handler, 1);
 }
 
 
@@ -3730,23 +3779,36 @@ PyObject*
 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
 {
     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
-    const _PyCoreConfig *config = &interp->core_config;
 #ifdef _Py_FORCE_UTF8_FS_ENCODING
-    return PyUnicode_DecodeUTF8Stateful(s, size, config->filesystem_errors, NULL);
+    if (interp->fs_codec.encoding) {
+        return unicode_decode_utf8(s, size,
+                                   interp->fs_codec.error_handler,
+                                   interp->fs_codec.errors,
+                                   NULL);
+    }
+    else {
+        const _PyCoreConfig *config = &interp->core_config;
+        _Py_error_handler errors;
+        errors = get_error_handler_wide(config->filesystem_errors);
+        assert(errors != _Py_ERROR_UNKNOWN);
+        return unicode_decode_utf8(s, size, errors, NULL, NULL);
+    }
 #else
     /* Bootstrap check: if the filesystem codec is implemented in Python, we
        cannot use it to encode and decode filenames before it is loaded. Load
        the Python codec requires to encode at least its own filename. Use the C
        implementation of the locale codec until the codec registry is
        initialized and the Python codec is loaded. See initfsencoding(). */
-    if (interp->fscodec_initialized) {
+    if (interp->fs_codec.encoding) {
         return PyUnicode_Decode(s, size,
-                                config->filesystem_encoding,
-                                config->filesystem_errors);
+                                interp->fs_codec.encoding,
+                                interp->fs_codec.errors);
     }
     else {
-        return unicode_decode_locale(s, size,
-                                     config->filesystem_errors, 0);
+        const _PyCoreConfig *config = &interp->core_config;
+        _Py_error_handler errors;
+        errors = get_error_handler_wide(config->filesystem_errors);
+        return unicode_decode_locale(s, size, errors, 0);
     }
 #endif
 }
@@ -4810,11 +4872,10 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
     return p - start;
 }
 
-PyObject *
-PyUnicode_DecodeUTF8Stateful(const char *s,
-                             Py_ssize_t size,
-                             const char *errors,
-                             Py_ssize_t *consumed)
+static PyObject *
+unicode_decode_utf8(const char *s, Py_ssize_t size,
+                    _Py_error_handler error_handler, const char *errors,
+                    Py_ssize_t *consumed)
 {
     _PyUnicodeWriter writer;
     const char *starts = s;
@@ -4825,7 +4886,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
     const char *errmsg = "";
     PyObject *error_handler_obj = NULL;
     PyObject *exc = NULL;
-    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
 
     if (size == 0) {
         if (consumed)
@@ -4948,6 +5008,16 @@ onError:
 }
 
 
+/* Forward to unicode_decode_utf8(), passing _Py_ERROR_UNKNOWN as the error
+   handler together with the caller's `errors` and `consumed` arguments.
+   _Py_ERROR_UNKNOWN is the value the decoder body used to assign to its
+   local error_handler variable before that variable became a parameter. */
+PyObject *
+PyUnicode_DecodeUTF8Stateful(const char *s,
+                             Py_ssize_t size,
+                             const char *errors,
+                             Py_ssize_t *consumed)
+{
+    return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
+}
+
+
 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
    non-zero, use strict error handler otherwise.
 
@@ -5231,8 +5301,9 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
    maximum possible needed (4 result bytes per Unicode character), and return
    the excess memory at the end.
 */
-PyObject *
-_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
+static PyObject *
+unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
+                    const char *errors)
 {
     enum PyUnicode_Kind kind;
     void *data;
@@ -5260,15 +5331,22 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
     case PyUnicode_1BYTE_KIND:
         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
         assert(!PyUnicode_IS_ASCII(unicode));
-        return ucs1lib_utf8_encoder(unicode, data, size, errors);
+        return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
     case PyUnicode_2BYTE_KIND:
-        return ucs2lib_utf8_encoder(unicode, data, size, errors);
+        return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
     case PyUnicode_4BYTE_KIND:
-        return ucs4lib_utf8_encoder(unicode, data, size, errors);
+        return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
     }
 }
 
+/* Forward to unicode_encode_utf8(), passing _Py_ERROR_UNKNOWN as the error
+   handler together with the caller's `errors` string. */
 PyObject *
+_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
+{
+    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
+}
+
+
+PyObject *
 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
                      Py_ssize_t size,
                      const char *errors)
@@ -15575,12 +15653,35 @@ PyUnicode_AsUnicodeCopy(PyObject *unicode)
 }
 
 
-static char*
-get_codec_name(const char *encoding)
+/* Encode the wide string `wstr` to UTF-8 with _Py_EncodeUTF8Ex(), using
+   the strict error handler.
+
+   On success, store the encoded string in *str and return 0; the callers
+   in this file release it with PyMem_RawFree().
+   On failure, set a Python exception and return -1: RuntimeError naming
+   `name` when _Py_EncodeUTF8Ex() returns -2, MemoryError for any other
+   negative result. */
+static int
+encode_wstr_utf8(const wchar_t *wstr, char **str, const char *name)
 {
-    PyObject *codec, *name_obj = NULL;
+    int res;
+    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
+    if (res == -2) {
+        /* This helper encodes, so report an encoding failure, and raise a
+           real error class rather than a warning category. */
+        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
+        return -1;
+    }
+    if (res < 0) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    return 0;
+}
+
+
+/* Replace *config_encoding with the name reported by the Python codec that
+   _PyCodec_Lookup() finds for it.
+
+   On success, the previous *config_encoding string is released with
+   PyMem_RawFree(), *config_encoding points to a new PyMem_Raw copy of the
+   codec name, and 0 is returned.  On failure, *config_encoding is left
+   unchanged and -1 is returned.
+
+   NOTE(review): the label passed to encode_wstr_utf8() is always
+   "stdio_encoding", even when this function is called for the filesystem
+   encoding; consider passing the label in from the caller. */
+static int
+config_get_codec_name(wchar_t **config_encoding)
+{
+    char *encoding;
+    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
+        return -1;
+    }
+
+    PyObject *name_obj = NULL;
+    PyObject *codec = _PyCodec_Lookup(encoding);
+    PyMem_RawFree(encoding);
 
-    codec = _PyCodec_Lookup(encoding);
     if (!codec)
         goto error;
 
@@ -15590,71 +15691,107 @@ get_codec_name(const char *encoding)
         goto error;
     }
 
-    const char *name_utf8 = PyUnicode_AsUTF8(name_obj);
-    if (name_utf8 == NULL) {
+    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
+    /* Release the reference AND reset the pointer: the error label below
+       calls Py_XDECREF(name_obj), so a plain Py_DECREF() here would drop
+       the same reference twice on the two failure paths that follow. */
+    Py_CLEAR(name_obj);
+    if (wname == NULL) {
         goto error;
     }
 
-    char *name = _PyMem_RawStrdup(name_utf8);
-    Py_DECREF(name_obj);
-    if (name == NULL) {
+    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
+    if (raw_wname == NULL) {
+        PyMem_Free(wname);
         PyErr_NoMemory();
-        return NULL;
+        goto error;
     }
-    return name;
+
+    PyMem_RawFree(*config_encoding);
+    *config_encoding = raw_wname;
+
+    PyMem_Free(wname);
+    return 0;
 
 error:
     Py_XDECREF(codec);
     Py_XDECREF(name_obj);
-    return NULL;
+    return -1;
 }
 
 
+/* Normalize core_config.stdio_encoding in place through
+   config_get_codec_name().  Return _Py_INIT_OK() on success, or a
+   _Py_INIT_ERR() carrying a fixed message when that call fails. */
 static _PyInitError
 init_stdio_encoding(PyInterpreterState *interp)
 {
+    /* Update the stdio encoding to the normalized Python codec name. */
     _PyCoreConfig *config = &interp->core_config;
-
-    char *codec_name = get_codec_name(config->stdio_encoding);
-    if (codec_name == NULL) {
+    if (config_get_codec_name(&config->stdio_encoding) < 0) {
         return _Py_INIT_ERR("failed to get the Python codec name "
                             "of the stdio encoding");
     }
-    PyMem_RawFree(config->stdio_encoding);
-    config->stdio_encoding = codec_name;
     return _Py_INIT_OK();
 }
 
 
-static _PyInitError
-init_fs_encoding(PyInterpreterState *interp)
+/* Fill interp->fs_codec from core_config.filesystem_encoding and
+   core_config.filesystem_errors.
+
+   Both wide strings are encoded to UTF-8 with encode_wstr_utf8(); the
+   results replace interp->fs_codec.encoding and interp->fs_codec.errors
+   (the previous values are released with PyMem_RawFree()), and the parsed
+   error handler is stored in interp->fs_codec.error_handler.  The new
+   strings are then passed to _Py_SetFileSystemEncoding().
+
+   Return 0 on success.  On failure, set a Python exception and return -1;
+   if only the final _Py_SetFileSystemEncoding() call fails, fs_codec has
+   already been updated. */
+static int
+init_fs_codec(PyInterpreterState *interp)
 {
     _PyCoreConfig *config = &interp->core_config;
 
-    char *encoding = get_codec_name(config->filesystem_encoding);
-    if (encoding == NULL) {
-        /* Such error can only occurs in critical situations: no more
-           memory, import a module of the standard library failed, etc. */
-        return _Py_INIT_ERR("failed to get the Python codec "
-                            "of the filesystem encoding");
+    _Py_error_handler error_handler;
+    error_handler = get_error_handler_wide(config->filesystem_errors);
+    /* NOTE(review): get_error_handler_wide() as defined in this file
+       returns _Py_ERROR_OTHER for unrecognised names and never
+       _Py_ERROR_UNKNOWN, so this branch looks unreachable -- confirm. */
+    if (error_handler == _Py_ERROR_UNKNOWN) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "unknown filesystem error handler");
+        return -1;
     }
 
-    /* Update the filesystem encoding to the normalized Python codec name.
-       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
-       (Python codec name). */
-    PyMem_RawFree(config->filesystem_encoding);
-    config->filesystem_encoding = encoding;
+    char *encoding, *errors;
+    if (encode_wstr_utf8(config->filesystem_encoding,
+                         &encoding,
+                         "filesystem_encoding") < 0) {
+        return -1;
+    }
+
+    if (encode_wstr_utf8(config->filesystem_errors,
+                         &errors,
+                         "filesystem_errors") < 0) {
+        /* Do not leak the first string when the second one fails. */
+        PyMem_RawFree(encoding);
+        return -1;
+    }
+
+    PyMem_RawFree(interp->fs_codec.encoding);
+    interp->fs_codec.encoding = encoding;
+    PyMem_RawFree(interp->fs_codec.errors);
+    interp->fs_codec.errors = errors;
+    interp->fs_codec.error_handler = error_handler;
+
+    /* At this point, PyUnicode_EncodeFSDefault() and
+       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
+       the C implementation of the filesystem encoding. */
 
     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
        global configuration variables. */
-    if (_Py_SetFileSystemEncoding(config->filesystem_encoding,
-                                  config->filesystem_errors) < 0) {
-        return _Py_INIT_NO_MEMORY();
+    if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
+                                  interp->fs_codec.errors) < 0) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    return 0;
+}
+
+
+/* Normalize core_config.filesystem_encoding through
+   config_get_codec_name(), then populate interp->fs_codec with
+   init_fs_codec().  Return _Py_INIT_OK() when both steps succeed, or a
+   _Py_INIT_ERR() describing the step that failed. */
+static _PyInitError
+init_fs_encoding(PyInterpreterState *interp)
+{
+    /* Update the filesystem encoding to the normalized Python codec name.
+       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
+       (Python codec name). */
+    _PyCoreConfig *config = &interp->core_config;
+    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
+        return _Py_INIT_ERR("failed to get the Python codec "
+                            "of the filesystem encoding");
     }
 
-    /* PyUnicode can now use the Python codec rather than C implementation
-       for the filesystem encoding */
-    interp->fscodec_initialized = 1;
+    if (init_fs_codec(interp) < 0) {
+        return _Py_INIT_ERR("cannot initialize filesystem codec");
+    }
     return _Py_INIT_OK();
 }
 
@@ -15671,6 +15808,33 @@ _PyUnicode_InitEncodings(PyInterpreterState *interp)
 }
 
 
+#ifdef MS_WINDOWS
+/* Switch the filesystem encoding of the current interpreter's core
+   configuration to mbcs/replace (PEP 529), then rebuild interp->fs_codec.
+
+   Return the result of init_fs_codec() once the configuration has been
+   updated.  If duplicating either string fails, set MemoryError and
+   return -1 without touching the configuration. */
+int
+_PyUnicode_EnableLegacyWindowsFSEncoding(void)
+{
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+    _PyCoreConfig *core_config = &interp->core_config;
+
+    /* Duplicate both strings up front so that the configuration is only
+       modified once every allocation has succeeded. */
+    wchar_t *mbcs = _PyMem_RawWcsdup(L"mbcs");
+    wchar_t *replace = _PyMem_RawWcsdup(L"replace");
+    if (mbcs != NULL && replace != NULL) {
+        PyMem_RawFree(core_config->filesystem_encoding);
+        core_config->filesystem_encoding = mbcs;
+        PyMem_RawFree(core_config->filesystem_errors);
+        core_config->filesystem_errors = replace;
+
+        return init_fs_codec(interp);
+    }
+
+    /* At least one duplication failed: release whichever one succeeded. */
+    PyMem_RawFree(mbcs);
+    PyMem_RawFree(replace);
+    PyErr_NoMemory();
+    return -1;
+}
+#endif
+
+
 void
 _PyUnicode_Fini(void)
 {
@@ -15694,6 +15858,12 @@ _PyUnicode_Fini(void)
     }
     _PyUnicode_ClearStaticStrings();
     (void)PyUnicode_ClearFreeList();
+
+    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+    PyMem_RawFree(interp->fs_codec.encoding);
+    interp->fs_codec.encoding = NULL;
+    PyMem_RawFree(interp->fs_codec.errors);
+    interp->fs_codec.errors = NULL;
 }
author	Victor Stinner <vstinner@redhat.com>	2019-05-02 18:56:30 (GMT)
committer	GitHub <noreply@github.com>	2019-05-02 18:56:30 (GMT)
commit	709d23dee69e700b87d5a4cb59e149d0e1af7993 (patch)
tree	b06aafe79f83137a3c85649bcebf1fbfd2ea1240 /Objects/unicodeobject.c
parent	6ae2bbbdfcb8969d1d362b17c2fbd5a684fa4f9d (diff)
download	cpython-709d23dee69e700b87d5a4cb59e149d0e1af7993.zip cpython-709d23dee69e700b87d5a4cb59e149d0e1af7993.tar.gz cpython-709d23dee69e700b87d5a4cb59e149d0e1af7993.tar.bz2