From f3170ccef8809e4a3f82fe9f82dc7a4a486c28c1 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 15 Oct 2010 12:04:23 +0000 Subject: Use locale encoding if Py_FileSystemDefaultEncoding is not set * PyUnicode_EncodeFSDefault(), PyUnicode_DecodeFSDefaultAndSize() and PyUnicode_DecodeFSDefault() use the locale encoding instead of UTF-8 if Py_FileSystemDefaultEncoding is NULL * redecode_filenames() functions and _Py_code_object_list (issue #9630) are no longer needed: remove them --- Doc/c-api/unicode.rst | 9 +- Include/code.h | 9 +- Include/unicodeobject.h | 9 +- Misc/NEWS | 3 + Objects/codeobject.c | 13 --- Objects/object.c | 4 - Objects/unicodeobject.c | 40 ++++++-- Python/pythonrun.c | 258 ------------------------------------------------ 8 files changed, 48 insertions(+), 297 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index dac01a4..d9a48d6 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -415,7 +415,8 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function: Decode a string using :c:data:`Py_FileSystemDefaultEncoding` and the ``'surrogateescape'`` error handler, or ``'strict'`` on Windows. - If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to UTF-8. + If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the + locale encoding. .. versionchanged:: 3.2 Use ``'strict'`` error handler on Windows. @@ -426,7 +427,8 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function: Decode a null-terminated string using :c:data:`Py_FileSystemDefaultEncoding` and the ``'surrogateescape'`` error handler, or ``'strict'`` on Windows. - If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to UTF-8. + If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the + locale encoding. Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` if you know the string length. 
@@ -440,7 +442,8 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function: ``'surrogateescape'`` error handler, or ``'strict'`` on Windows, and return :class:`bytes`. - If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to UTF-8. + If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the + locale encoding. .. versionadded:: 3.2 diff --git a/Include/code.h b/Include/code.h index bdbfaba..11ecc95 100644 --- a/Include/code.h +++ b/Include/code.h @@ -72,7 +72,7 @@ PyAPI_DATA(PyTypeObject) PyCode_Type; PyAPI_FUNC(PyCodeObject *) PyCode_New( int, int, int, int, int, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *, - PyObject *, PyObject *, int, PyObject *); + PyObject *, PyObject *, int, PyObject *); /* same as struct above */ /* Creates a new empty code object with the specified source location. */ @@ -99,13 +99,6 @@ PyAPI_FUNC(int) _PyCode_CheckLineNumber(PyCodeObject* co, PyAPI_FUNC(PyObject*) PyCode_Optimize(PyObject *code, PyObject* consts, PyObject *names, PyObject *lineno_obj); -/* List of weak references to all code objects. The list is used by - initfsencoding() to redecode code filenames at startup if the filesystem - encoding changes. At initfsencoding() exit, the list is set to NULL and it - is no more used. */ - -extern PyObject *_Py_code_object_list; - #ifdef __cplusplus } #endif diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 39a6b2e..f61712b 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1193,7 +1193,8 @@ PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); /* Decode a null-terminated string using Py_FileSystemDefaultEncoding and the "surrogateescape" error handler. - If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. + If Py_FileSystemDefaultEncoding is not set, fall back to the locale + encoding. Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 
*/ @@ -1205,7 +1206,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( /* Decode a string using Py_FileSystemDefaultEncoding and the "surrogateescape" error handler. - If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. + If Py_FileSystemDefaultEncoding is not set, fall back to the locale + encoding. */ PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( @@ -1216,7 +1218,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the "surrogateescape" error handler, and return bytes. - If Py_FileSystemDefaultEncoding is not set, fall back to UTF-8. + If Py_FileSystemDefaultEncoding is not set, fall back to the locale + encoding. */ PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( diff --git a/Misc/NEWS b/Misc/NEWS index 38e68e3..4e66031 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.2 Beta 1? Core and Builtins ----------------- +- Use locale encoding instead of UTF-8 to encode and decode filenames if + Py_FileSystemDefaultEncoding is not set. + - Issue #10095: fp_setreadl() doesn't reopen the file, reuse instead the file descriptor. 
diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 470bf56..e24fc8d 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -5,8 +5,6 @@ #define NAME_CHARS \ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz" -PyObject *_Py_code_object_list = NULL; - /* all_name_chars(s): true iff all chars in s are valid NAME_CHARS */ static int @@ -111,17 +109,6 @@ PyCode_New(int argcount, int kwonlyargcount, co->co_lnotab = lnotab; co->co_zombieframe = NULL; co->co_weakreflist = NULL; - - if (_Py_code_object_list != NULL) { - int err; - PyObject *ref = PyWeakref_NewRef((PyObject*)co, NULL); - if (ref == NULL) - goto error; - err = PyList_Append(_Py_code_object_list, ref); - Py_DECREF(ref); - if (err) - goto error; - } } return co; diff --git a/Objects/object.c b/Objects/object.c index e322e53..ff3363f 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -1604,10 +1604,6 @@ _Py_ReadyTypes(void) if (PyType_Ready(&PyCode_Type) < 0) Py_FatalError("Can't initialize code type"); - _Py_code_object_list = PyList_New(0); - if (_Py_code_object_list == NULL) - Py_FatalError("Can't initialize code type"); - if (PyType_Ready(&PyFrame_Type) < 0) Py_FatalError("Can't initialize frame type"); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a18eeef..98427e3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1597,11 +1597,22 @@ PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode) "surrogateescape"); } else { - /* if you change the default encoding, update also - PyUnicode_DecodeFSDefaultAndSize() and redecode_filenames() */ - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - "surrogateescape"); + /* locale encoding with surrogateescape */ + wchar_t *wchar; + char *bytes; + PyObject *bytes_obj; + + wchar = PyUnicode_AsWideCharString(unicode, NULL); + if (wchar == NULL) + return NULL; + bytes = _Py_wchar2char(wchar); + PyMem_Free(wchar); + if (bytes == NULL) + return NULL; + + 
bytes_obj = PyBytes_FromString(bytes); + PyMem_Free(bytes); + return bytes_obj; } } @@ -1769,9 +1780,22 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) "surrogateescape"); } else { - /* if you change the default encoding, update also - PyUnicode_EncodeFSDefault() and redecode_filenames() */ - return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); + /* locale encoding with surrogateescape */ + wchar_t *wchar; + PyObject *unicode; + + if (s[size] != '\0' || size != strlen(s)) { + PyErr_SetString(PyExc_TypeError, "embedded NUL character"); + return NULL; + } + + wchar = _Py_char2wchar(s); + if (wchar == NULL) + return NULL; + + unicode = PyUnicode_FromWideChar(wchar, -1); + PyMem_Free(wchar); + return unicode; } } diff --git a/Python/pythonrun.c b/Python/pythonrun.c index 026fcfa..73fef75 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -719,259 +719,6 @@ initmain(void) } } -/* Redecode a filename from the default filesystem encoding (utf-8) to - 'new_encoding' encoding with 'errors' error handler */ -static PyObject* -redecode_filename(PyObject *file, const char *new_encoding, - const char *errors) -{ - PyObject *file_bytes, *new_file; - - file_bytes = PyUnicode_EncodeFSDefault(file); - if (file_bytes == NULL) - return NULL; - new_file = PyUnicode_Decode( - PyBytes_AsString(file_bytes), - PyBytes_GET_SIZE(file_bytes), - new_encoding, - errors); - Py_DECREF(file_bytes); - return new_file; -} - -/* Redecode a path list */ -static int -redecode_path_list(PyObject *paths, - const char *new_encoding, const char *errors) -{ - PyObject *filename, *new_filename; - Py_ssize_t i, size; - - size = PyList_Size(paths); - for (i=0; i < size; i++) { - filename = PyList_GetItem(paths, i); - if (filename == NULL) - return -1; - - new_filename = redecode_filename(filename, new_encoding, errors); - if (new_filename == NULL) - return -1; - if (PyList_SetItem(paths, i, new_filename)) { - Py_DECREF(new_filename); - return -1; - } - } - return 0; -} - -/* 
Redecode __file__ and __path__ attributes of sys.modules */ -static int -redecode_sys_modules(const char *new_encoding, const char *errors) -{ - PyInterpreterState *interp; - PyObject *modules, *values, *file, *new_file, *paths; - PyObject *iter = NULL, *module = NULL; - - interp = PyThreadState_GET()->interp; - modules = interp->modules; - - values = PyObject_CallMethod(modules, "values", ""); - if (values == NULL) - goto error; - - iter = PyObject_GetIter(values); - Py_DECREF(values); - if (iter == NULL) - goto error; - - while (1) - { - module = PyIter_Next(iter); - if (module == NULL) { - if (PyErr_Occurred()) - goto error; - else - break; - } - - file = PyModule_GetFilenameObject(module); - if (file != NULL) { - new_file = redecode_filename(file, new_encoding, errors); - Py_DECREF(file); - if (new_file == NULL) - goto error; - if (PyObject_SetAttrString(module, "__file__", new_file)) { - Py_DECREF(new_file); - goto error; - } - Py_DECREF(new_file); - } - else - PyErr_Clear(); - - paths = PyObject_GetAttrString(module, "__path__"); - if (paths != NULL) { - if (redecode_path_list(paths, new_encoding, errors)) - goto error; - } - else - PyErr_Clear(); - - Py_CLEAR(module); - } - Py_CLEAR(iter); - return 0; - -error: - Py_XDECREF(iter); - Py_XDECREF(module); - return -1; -} - -/* Redecode sys.path_importer_cache keys */ -static int -redecode_sys_path_importer_cache(const char *new_encoding, const char *errors) -{ - PyObject *path_importer_cache, *items, *item, *path, *importer, *new_path; - PyObject *new_cache = NULL, *iter = NULL; - - path_importer_cache = PySys_GetObject("path_importer_cache"); - if (path_importer_cache == NULL) - goto error; - - items = PyObject_CallMethod(path_importer_cache, "items", ""); - if (items == NULL) - goto error; - - iter = PyObject_GetIter(items); - Py_DECREF(items); - if (iter == NULL) - goto error; - - new_cache = PyDict_New(); - if (new_cache == NULL) - goto error; - - while (1) - { - item = PyIter_Next(iter); - if (item == 
NULL) { - if (PyErr_Occurred()) - goto error; - else - break; - } - path = PyTuple_GET_ITEM(item, 0); - importer = PyTuple_GET_ITEM(item, 1); - - new_path = redecode_filename(path, new_encoding, errors); - if (new_path == NULL) - goto error; - if (PyDict_SetItem(new_cache, new_path, importer)) { - Py_DECREF(new_path); - goto error; - } - Py_DECREF(new_path); - } - Py_CLEAR(iter); - if (PySys_SetObject("path_importer_cache", new_cache)) - goto error; - Py_CLEAR(new_cache); - return 0; - -error: - Py_XDECREF(iter); - Py_XDECREF(new_cache); - return -1; -} - -/* Redecode co_filename attribute of all code objects */ -static int -redecode_code_objects(const char *new_encoding, const char *errors) -{ - Py_ssize_t i, len; - PyCodeObject *co; - PyObject *ref, *new_file; - - len = Py_SIZE(_Py_code_object_list); - for (i=0; i < len; i++) { - ref = PyList_GET_ITEM(_Py_code_object_list, i); - co = (PyCodeObject *)PyWeakref_GetObject(ref); - if ((PyObject*)co == Py_None) - continue; - if (co == NULL) - return -1; - - new_file = redecode_filename(co->co_filename, new_encoding, errors); - if (new_file == NULL) - return -1; - Py_DECREF(co->co_filename); - co->co_filename = new_file; - } - Py_CLEAR(_Py_code_object_list); - return 0; -} - -/* Redecode the filenames of all modules (__file__ and __path__ attributes), - all code objects (co_filename attribute), sys.path, sys.meta_path, - sys.executable and sys.path_importer_cache (keys) when the filesystem - encoding changes from the default encoding (utf-8) to new_encoding */ -static int -redecode_filenames(const char *new_encoding) -{ - char *errors; - PyObject *paths, *executable, *new_executable; - - /* PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() do already - use utf-8 if Py_FileSystemDefaultEncoding is NULL */ - if (strcmp(new_encoding, "utf-8") == 0) - return 0; - - if (strcmp(new_encoding, "mbcs") != 0) - errors = "surrogateescape"; - else - errors = NULL; - - /* sys.modules */ - if 
(redecode_sys_modules(new_encoding, errors)) - return -1; - - /* sys.path and sys.meta_path */ - paths = PySys_GetObject("path"); - if (paths != NULL) { - if (redecode_path_list(paths, new_encoding, errors)) - return -1; - } - paths = PySys_GetObject("meta_path"); - if (paths != NULL) { - if (redecode_path_list(paths, new_encoding, errors)) - return -1; - } - - /* sys.executable */ - executable = PySys_GetObject("executable"); - if (executable == NULL) - return -1; - new_executable = redecode_filename(executable, new_encoding, errors); - if (new_executable == NULL) - return -1; - if (PySys_SetObject("executable", new_executable)) { - Py_DECREF(new_executable); - return -1; - } - Py_DECREF(new_executable); - - /* sys.path_importer_cache */ - if (redecode_sys_path_importer_cache(new_encoding, errors)) - return -1; - - /* code objects */ - if (redecode_code_objects(new_encoding, errors)) - return -1; - - return 0; -} - static void initfsencoding(void) { @@ -987,11 +734,8 @@ initfsencoding(void) stdin and stdout if these are terminals. */ codeset = get_codeset(); if (codeset != NULL) { - if (redecode_filenames(codeset)) - Py_FatalError("Py_Initialize: can't redecode filenames"); Py_FileSystemDefaultEncoding = codeset; Py_HasFileSystemDefaultEncoding = 0; - Py_CLEAR(_Py_code_object_list); return; } else { fprintf(stderr, "Unable to get the locale encoding:\n"); @@ -1004,8 +748,6 @@ initfsencoding(void) } #endif - Py_CLEAR(_Py_code_object_list); - /* the encoding is mbcs, utf-8 or ascii */ codec = _PyCodec_Lookup(Py_FileSystemDefaultEncoding); if (!codec) { -- cgit v0.12