diff options
author | Victor Stinner <vstinner@python.org> | 2020-02-05 16:39:57 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-05 16:39:57 (GMT) |
commit | bf305cc6f05948f264349a6a6c6fd7d49c1839d3 (patch) | |
tree | 29db3b1a8fb376e8cca2d847515acb47884136c5 /Objects | |
parent | 0e4e735d06967145b49fd00693627f3624991dbc (diff) | |
download | cpython-bf305cc6f05948f264349a6a6c6fd7d49c1839d3.zip cpython-bf305cc6f05948f264349a6a6c6fd7d49c1839d3.tar.gz cpython-bf305cc6f05948f264349a6a6c6fd7d49c1839d3.tar.bz2 |
Add PyInterpreterState.fs_codec.utf8 (GH-18367)
Add a fast-path for UTF-8 encoding in PyUnicode_EncodeFSDefault()
and PyUnicode_DecodeFSDefaultAndSize().
Add _PyUnicode_FiniEncodings() helper function for _PyUnicode_Fini().
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 93 |
1 file changed, 47 insertions, 46 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5f10437..7c8bc06 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3615,39 +3615,32 @@ PyObject * PyUnicode_EncodeFSDefault(PyObject *unicode) { PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); -#ifdef _Py_FORCE_UTF8_FS_ENCODING - if (interp->fs_codec.encoding) { + if (interp->fs_codec.utf8) { return unicode_encode_utf8(unicode, interp->fs_codec.error_handler, interp->fs_codec.errors); } - else { - const wchar_t *filesystem_errors = interp->config.filesystem_errors; - _Py_error_handler errors; - errors = get_error_handler_wide(filesystem_errors); - assert(errors != _Py_ERROR_UNKNOWN); - return unicode_encode_utf8(unicode, errors, NULL); - } -#else - /* Bootstrap check: if the filesystem codec is implemented in Python, we - cannot use it to encode and decode filenames before it is loaded. Load - the Python codec requires to encode at least its own filename. Use the C - implementation of the locale codec until the codec registry is - initialized and the Python codec is loaded. - See _PyUnicode_InitEncodings(). */ - if (interp->fs_codec.encoding) { +#ifndef _Py_FORCE_UTF8_FS_ENCODING + else if (interp->fs_codec.encoding) { return PyUnicode_AsEncodedString(unicode, interp->fs_codec.encoding, interp->fs_codec.errors); } +#endif else { + /* Before _PyUnicode_InitEncodings() is called, the Python codec + machinery is not ready and so cannot be used: + use wcstombs() in this case. 
*/ const wchar_t *filesystem_errors = interp->config.filesystem_errors; - _Py_error_handler errors; - errors = get_error_handler_wide(filesystem_errors); + assert(filesystem_errors != NULL); + _Py_error_handler errors = get_error_handler_wide(filesystem_errors); assert(errors != _Py_ERROR_UNKNOWN); +#ifdef _Py_FORCE_UTF8_FS_ENCODING + return unicode_encode_utf8(unicode, errors, NULL); +#else return unicode_encode_locale(unicode, errors, 0); - } #endif + } } PyObject * @@ -3857,39 +3850,33 @@ PyObject* PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) { PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); -#ifdef _Py_FORCE_UTF8_FS_ENCODING - if (interp->fs_codec.encoding) { + if (interp->fs_codec.utf8) { return unicode_decode_utf8(s, size, interp->fs_codec.error_handler, interp->fs_codec.errors, NULL); } - else { - const wchar_t *filesystem_errors = interp->config.filesystem_errors; - _Py_error_handler errors; - errors = get_error_handler_wide(filesystem_errors); - assert(errors != _Py_ERROR_UNKNOWN); - return unicode_decode_utf8(s, size, errors, NULL, NULL); - } -#else - /* Bootstrap check: if the filesystem codec is implemented in Python, we - cannot use it to encode and decode filenames before it is loaded. Load - the Python codec requires to encode at least its own filename. Use the C - implementation of the locale codec until the codec registry is - initialized and the Python codec is loaded. - See _PyUnicode_InitEncodings(). */ - if (interp->fs_codec.encoding) { +#ifndef _Py_FORCE_UTF8_FS_ENCODING + else if (interp->fs_codec.encoding) { return PyUnicode_Decode(s, size, interp->fs_codec.encoding, interp->fs_codec.errors); } +#endif else { + /* Before _PyUnicode_InitEncodings() is called, the Python codec + machinery is not ready and so cannot be used: + use mbstowcs() in this case. 
*/ const wchar_t *filesystem_errors = interp->config.filesystem_errors; - _Py_error_handler errors; - errors = get_error_handler_wide(filesystem_errors); + assert(filesystem_errors != NULL); + _Py_error_handler errors = get_error_handler_wide(filesystem_errors); + assert(errors != _Py_ERROR_UNKNOWN); +#ifdef _Py_FORCE_UTF8_FS_ENCODING + return unicode_decode_utf8(s, size, errors, NULL, NULL); +#else return unicode_decode_locale(s, size, errors, 0); - } #endif + } } @@ -15849,10 +15836,16 @@ init_fs_codec(PyInterpreterState *interp) PyMem_RawFree(interp->fs_codec.encoding); interp->fs_codec.encoding = encoding; + /* encoding has been normalized by init_fs_encoding() */ + interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0); PyMem_RawFree(interp->fs_codec.errors); interp->fs_codec.errors = errors; interp->fs_codec.error_handler = error_handler; +#ifdef _Py_FORCE_UTF8_FS_ENCODING + assert(interp->fs_codec.utf8 == 1); +#endif + /* At this point, PyUnicode_EncodeFSDefault() and PyUnicode_DecodeFSDefault() can now use the Python codec rather than the C implementation of the filesystem encoding. 
*/ @@ -15902,6 +15895,19 @@ _PyUnicode_InitEncodings(PyThreadState *tstate) } +static void +_PyUnicode_FiniEncodings(PyThreadState *tstate) +{ + PyInterpreterState *interp = tstate->interp; + PyMem_RawFree(interp->fs_codec.encoding); + interp->fs_codec.encoding = NULL; + interp->fs_codec.utf8 = 0; + PyMem_RawFree(interp->fs_codec.errors); + interp->fs_codec.errors = NULL; + interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN; +} + + #ifdef MS_WINDOWS int _PyUnicode_EnableLegacyWindowsFSEncoding(void) @@ -15954,12 +15960,7 @@ _PyUnicode_Fini(PyThreadState *tstate) _PyUnicode_ClearStaticStrings(); } - PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); - PyMem_RawFree(interp->fs_codec.encoding); - interp->fs_codec.encoding = NULL; - PyMem_RawFree(interp->fs_codec.errors); - interp->fs_codec.errors = NULL; - interp->config.filesystem_errors = (wchar_t *)_Py_ERROR_UNKNOWN; + _PyUnicode_FiniEncodings(tstate); } |