From 2cba6b85797ba60d67389126f184aad5c9e02ff3 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 10 Jan 2018 22:46:15 +0100 Subject: bpo-29240: readline now ignores the UTF-8 Mode (#5145) Add new functions ignoring the UTF-8 mode: * _Py_DecodeCurrentLocale() * _Py_EncodeCurrentLocale() * _PyUnicode_DecodeCurrentLocaleAndSize() * _PyUnicode_EncodeCurrentLocale() Modify the readline module to use these functions. Re-enable test_readline.test_nonascii(). --- Include/fileutils.h | 8 +++++ Include/unicodeobject.h | 10 ++++++ Lib/test/test_readline.py | 2 -- Modules/readline.c | 5 +-- Objects/unicodeobject.c | 62 ++++++++++++++++++++++++++++++------ Python/fileutils.c | 80 ++++++++++++++++++++++++++++++----------------- 6 files changed, 125 insertions(+), 42 deletions(-) diff --git a/Include/fileutils.h b/Include/fileutils.h index d027e18..2527d84 100644 --- a/Include/fileutils.h +++ b/Include/fileutils.h @@ -24,6 +24,14 @@ PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape( const char *s, Py_ssize_t size, size_t *p_wlen); + +PyAPI_FUNC(wchar_t *) _Py_DecodeCurrentLocale( + const char *arg, + size_t *size); + +PyAPI_FUNC(char*) _Py_EncodeCurrentLocale( + const wchar_t *text, + size_t *error_pos); #endif #ifndef Py_LIMITED_API diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 0274de6..576e7ad 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1810,6 +1810,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( PyObject *unicode, const char *errors ); + +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeCurrentLocaleAndSize( + const char *str, + Py_ssize_t len, + const char *errors); + +PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCurrentLocale( + PyObject *unicode, + const char *errors + ); #endif /* --- File system encoding ---------------------------------------------- */ diff --git a/Lib/test/test_readline.py b/Lib/test/test_readline.py index 28ea38b7..b4c25de 100644 --- a/Lib/test/test_readline.py +++ b/Lib/test/test_readline.py @@ -152,8 
+152,6 @@ print("History length:", readline.get_current_history_length()) output = run_pty(self.auto_history_script.format(False)) self.assertIn(b"History length: 0\r\n", output) - @unittest.skipIf(True, - "FIXME: test broken by bpo-29240") def test_nonascii(self): try: readline.add_history("\xEB\xEF") diff --git a/Modules/readline.c b/Modules/readline.c index 811fca8..8db4cfd 100644 --- a/Modules/readline.c +++ b/Modules/readline.c @@ -132,13 +132,14 @@ static PyModuleDef readlinemodule; static PyObject * encode(PyObject *b) { - return PyUnicode_EncodeLocale(b, "surrogateescape"); + return _PyUnicode_EncodeCurrentLocale(b, "surrogateescape"); } static PyObject * decode(const char *s) { - return PyUnicode_DecodeLocale(s, "surrogateescape"); + return _PyUnicode_DecodeCurrentLocaleAndSize(s, strlen(s), + "surrogateescape"); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 92a6ad6..1a230e0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3395,8 +3395,8 @@ locale_error_handler(const char *errors, int *surrogateescape) } } -PyObject * -PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) +static PyObject * +unicode_encode_locale(PyObject *unicode, const char *errors, int current_locale) { Py_ssize_t wlen, wlen2; wchar_t *wstr; @@ -3423,7 +3423,12 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) /* "surrogateescape" error handler */ char *str; - str = Py_EncodeLocale(wstr, &error_pos); + if (current_locale) { + str = _Py_EncodeCurrentLocale(wstr, &error_pos); + } + else { + str = Py_EncodeLocale(wstr, &error_pos); + } if (str == NULL) { if (error_pos == (size_t)-1) { PyErr_NoMemory(); @@ -3437,7 +3442,12 @@ PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) PyMem_Free(wstr); bytes = PyBytes_FromString(str); - PyMem_Free(str); + if (current_locale) { + PyMem_RawFree(str); + } + else { + PyMem_Free(str); + } } else { /* strict mode */ @@ -3503,6 +3513,18 @@ encode_error: } PyObject * 
+PyUnicode_EncodeLocale(PyObject *unicode, const char *errors) +{ + return unicode_encode_locale(unicode, errors, 0); +} + +PyObject * +_PyUnicode_EncodeCurrentLocale(PyObject *unicode, const char *errors) +{ + return unicode_encode_locale(unicode, errors, 1); +} + +PyObject * PyUnicode_EncodeFSDefault(PyObject *unicode) { #if defined(__APPLE__) @@ -3524,7 +3546,8 @@ PyUnicode_EncodeFSDefault(PyObject *unicode) Py_FileSystemDefaultEncodeErrors); } else { - return PyUnicode_EncodeLocale(unicode, Py_FileSystemDefaultEncodeErrors); + return unicode_encode_locale(unicode, + Py_FileSystemDefaultEncodeErrors, 0); } #endif } @@ -3695,9 +3718,9 @@ mbstowcs_errorpos(const char *str, size_t len) return 0; } -PyObject* -PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, - const char *errors) +static PyObject* +unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors, + int current_locale) { wchar_t smallbuf[256]; size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf); @@ -3719,7 +3742,12 @@ PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, if (surrogateescape) { /* "surrogateescape" error handler */ - wstr = Py_DecodeLocale(str, &wlen); + if (current_locale) { + wstr = _Py_DecodeCurrentLocale(str, &wlen); + } + else { + wstr = Py_DecodeLocale(str, &wlen); + } if (wstr == NULL) { if (wlen == (size_t)-1) PyErr_NoMemory(); @@ -3795,10 +3823,24 @@ decode_error: } PyObject* +PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, + const char *errors) +{ + return unicode_decode_locale(str, len, errors, 0); +} + +PyObject* +_PyUnicode_DecodeCurrentLocaleAndSize(const char *str, Py_ssize_t len, + const char *errors) +{ + return unicode_decode_locale(str, len, errors, 1); +} + +PyObject* PyUnicode_DecodeLocale(const char *str, const char *errors) { Py_ssize_t size = (Py_ssize_t)strlen(str); - return PyUnicode_DecodeLocaleAndSize(str, size, errors); + return unicode_decode_locale(str, size, errors, 0); } diff --git a/Python/fileutils.c 
b/Python/fileutils.c index 645a179..9275494 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -263,7 +263,7 @@ decode_ascii_surrogateescape(const char *arg, size_t *size) #if !defined(__APPLE__) && !defined(__ANDROID__) static wchar_t* -decode_locale(const char* arg, size_t *size) +decode_current_locale(const char* arg, size_t *size) { wchar_t *res; size_t argsize; @@ -380,32 +380,13 @@ oom: #endif -/* Decode a byte string from the locale encoding with the - surrogateescape error handler: undecodable bytes are decoded as characters - in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate - character, escape the bytes using the surrogateescape error handler instead - of decoding them. - - Return a pointer to a newly allocated wide character string, use - PyMem_RawFree() to free the memory. If size is not NULL, write the number of - wide characters excluding the null character into *size - - Return NULL on decoding error or memory allocation error. If *size* is not - NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on - decoding error. - - Decoding errors should never happen, unless there is a bug in the C - library. - - Use the Py_EncodeLocale() function to encode the character string back to a - byte string. 
*/ -wchar_t* -Py_DecodeLocale(const char* arg, size_t *size) +static wchar_t* +decode_locale(const char* arg, size_t *size, int ignore_utf8_mode) { #if defined(__APPLE__) || defined(__ANDROID__) return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); #else - if (Py_UTF8Mode == 1) { + if (!ignore_utf8_mode && Py_UTF8Mode == 1) { return _Py_DecodeUTF8_surrogateescape(arg, strlen(arg), size); } @@ -426,11 +407,45 @@ Py_DecodeLocale(const char* arg, size_t *size) } #endif - return decode_locale(arg, size); + return decode_current_locale(arg, size); #endif /* __APPLE__ or __ANDROID__ */ } +/* Decode a byte string from the locale encoding with the + surrogateescape error handler: undecodable bytes are decoded as characters + in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate + character, escape the bytes using the surrogateescape error handler instead + of decoding them. + + Return a pointer to a newly allocated wide character string, use + PyMem_RawFree() to free the memory. If size is not NULL, write the number of + wide characters excluding the null character into *size + + Return NULL on decoding error or memory allocation error. If *size* is not + NULL, *size is set to (size_t)-1 on memory error or set to (size_t)-2 on + decoding error. + + Decoding errors should never happen, unless there is a bug in the C + library. + + Use the Py_EncodeLocale() function to encode the character string back to a + byte string. 
*/ +wchar_t* +Py_DecodeLocale(const char* arg, size_t *size) +{ + return decode_locale(arg, size, 0); +} + + +/* Similar to Py_DecodeLocale() but ignore the UTF-8 mode */ +wchar_t* +_Py_DecodeCurrentLocale(const char* arg, size_t *size) +{ + return decode_locale(arg, size, 1); +} + + #if !defined(__APPLE__) && !defined(__ANDROID__) static char* encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) @@ -508,12 +523,13 @@ encode_current_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) #endif static char* -encode_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) +encode_locale(const wchar_t *text, size_t *error_pos, + int raw_malloc, int ignore_utf8_mode) { #if defined(__APPLE__) || defined(__ANDROID__) return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); #else /* __APPLE__ */ - if (Py_UTF8Mode == 1) { + if (!ignore_utf8_mode && Py_UTF8Mode == 1) { return _Py_EncodeUTF8_surrogateescape(text, error_pos, raw_malloc); } @@ -544,7 +560,7 @@ encode_locale(const wchar_t *text, size_t *error_pos, int raw_malloc) char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos) { - return encode_locale(text, error_pos, 0); + return encode_locale(text, error_pos, 0, 0); } @@ -553,7 +569,15 @@ Py_EncodeLocale(const wchar_t *text, size_t *error_pos) char* _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos) { - return encode_locale(text, error_pos, 1); + return encode_locale(text, error_pos, 1, 0); +} + + +/* Similar to _Py_EncodeLocaleRaw() but ignore the UTF-8 Mode */ +char* +_Py_EncodeCurrentLocale(const wchar_t *text, size_t *error_pos) +{ + return encode_locale(text, error_pos, 1, 1); } -- cgit v0.12