From f2ea71fcc8986101512265b685d8d3dfdf7b7bdb Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Sat, 17 Dec 2011 04:13:41 +0100 Subject: Issue #13560: Add PyUnicode_EncodeLocale() * Use PyUnicode_EncodeLocale() in time.strftime() if wcsftime() is not available * Document my last changes in Misc/NEWS --- Doc/c-api/unicode.rst | 25 +++++++- Include/unicodeobject.h | 12 +++- Misc/NEWS | 8 ++- Modules/timemodule.c | 2 +- Objects/unicodeobject.c | 167 ++++++++++++++++++++++++++++++++++++++---------- 5 files changed, 177 insertions(+), 37 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 0bf2eea..a6f3a69 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -713,7 +713,7 @@ system. bytes. If a byte sequence can be decoded as a surrogate character and *surrogateescape* is not equal to zero, the byte sequence is escaped using the ``'surrogateescape'`` error handler instead of being decoded. *str* - must end with a null character but cannot contain embedded null character. + must end with a null character but cannot contain embedded null characters. .. seealso:: @@ -732,6 +732,22 @@ system. .. versionadded:: 3.3 +.. c:function:: PyObject* PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape) + + Encode a Unicode object to the current locale encoding. The encoder is + strict if *surrogateescape* is equal to zero, otherwise it uses the + ``'surrogateescape'`` error handler (:pep:`383`). Return a :class:`bytes` + object. *unicode* cannot contain embedded null characters. + + .. seealso:: + + Use :c:func:`PyUnicode_EncodeFSDefault` to encode a string to + :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at + Python startup). + + .. versionadded:: 3.3 + + File System Encoding """""""""""""""""""" @@ -806,6 +822,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function: If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the locale encoding. + ..
seealso:: + + :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the + locale encoding and cannot be modified later. If you need to encode a + string to the current locale encoding, use + :c:func:`PyUnicode_EncodeLocale`. + .. versionadded:: 3.2 diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 5f073e0..8a23c7d 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1603,7 +1603,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( be decoded as a surrogate character and *surrogateescape* is not equal to zero, the byte sequence is escaped using the 'surrogateescape' error handler instead of being decoded. *str* must end with a null character but cannot - contain embedded null character. */ + contain embedded null characters. */ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( const char *str, @@ -1617,6 +1617,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( const char *str, int surrogateescape); +/* Encode a Unicode object to the current locale encoding. The encoder is + strict if *surrogateescape* is equal to zero, otherwise the + "surrogateescape" error handler is used. Return a bytes object. The string + cannot contain embedded null characters. */ + +PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( + PyObject *unicode, + int surrogateescape + ); + /* --- File system encoding ---------------------------------------------- */ /* ParseTuple converter: encode str objects to bytes using diff --git a/Misc/NEWS b/Misc/NEWS index 5be6990..51505d4 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -419,6 +419,10 @@ Core and Builtins Library ------- +- Issue #13560: Add PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize() + and PyUnicode_EncodeLocale() functions to the C API to decode/encode from/to + the current locale encoding. + - Issue #8373: The filesystem path of AF_UNIX sockets now uses the filesystem encoding and the surrogateescape error handler, rather than UTF-8. Patch by David Watson.
@@ -451,8 +455,8 @@ Library 'importlib.abc.PyPycLoader', 'nntplib.NNTP.xgtitle', 'nntplib.NNTP.xpath', and private attributes of 'smtpd.SMTPChannel'. -- Issue #5905: time.strftime() is now using the locale encoding, instead of - UTF-8, if the wcsftime() function is not available. +- Issue #5905, #13560: time.strftime() is now using the current locale + encoding, instead of UTF-8, if the wcsftime() function is not available. - Issue #8641: Update IDLE 3 syntax coloring to recognize b".." and not u"..". Patch by Tal Einat. diff --git a/Modules/timemodule.c b/Modules/timemodule.c index a46c4f1..ad1c54e 100644 --- a/Modules/timemodule.c +++ b/Modules/timemodule.c @@ -486,7 +486,7 @@ time_strftime(PyObject *self, PyObject *args) fmt = format; #else /* Convert the unicode string to an ascii one */ - format = PyUnicode_EncodeFSDefault(format_arg); + format = PyUnicode_EncodeLocale(format_arg, 1); if (format == NULL) return NULL; fmt = PyBytes_AS_STRING(format); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7444c8b..a2c3227 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3073,6 +3073,140 @@ PyUnicode_AsEncodedObject(PyObject *unicode, return NULL; } +static size_t +wcstombs_errorpos(const wchar_t *wstr) +{ + size_t len; +#if SIZEOF_WCHAR_T == 2 + wchar_t buf[3]; +#else + wchar_t buf[2]; +#endif + char outbuf[MB_LEN_MAX]; + const wchar_t *start, *previous; + int save_errno; + + save_errno = errno; +#if SIZEOF_WCHAR_T == 2 + buf[2] = 0; +#else + buf[1] = 0; +#endif + start = wstr; + while (*wstr != L'\0') + { + previous = wstr; +#if SIZEOF_WCHAR_T == 2 + if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0]) + && Py_UNICODE_IS_LOW_SURROGATE(wstr[1])) + { + buf[0] = wstr[0]; + buf[1] = wstr[1]; + wstr += 2; + } + else { + buf[0] = *wstr; + buf[1] = 0; + wstr++; + } +#else + buf[0] = *wstr; + wstr++; +#endif + len = wcstombs(outbuf, buf, sizeof(outbuf)); + if (len == (size_t)-1) { + errno = save_errno; + return previous - start; + } + } + + /* 
failed to find the unencodable character */ + errno = save_errno; + return 0; +} + +PyObject * +PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape) +{ + Py_ssize_t wlen, wlen2; + wchar_t *wstr; + PyObject *bytes = NULL; + char *errmsg; + PyObject *exc; + size_t error_pos; + + wstr = PyUnicode_AsWideCharString(unicode, &wlen); + if (wstr == NULL) + return NULL; + + wlen2 = wcslen(wstr); + if (wlen2 != wlen) { + PyMem_Free(wstr); + PyErr_SetString(PyExc_TypeError, "embedded null character"); + return NULL; + } + + if (surrogateescape) { + /* locale encoding with surrogateescape */ + char *str; + + str = _Py_wchar2char(wstr, &error_pos); + if (str == NULL) { + if (error_pos == (size_t)-1) { + PyErr_NoMemory(); + PyMem_Free(wstr); + return NULL; + } + else { + goto encode_error; + } + } + PyMem_Free(wstr); + + bytes = PyBytes_FromString(str); + PyMem_Free(str); + } + else { + size_t len, len2; + + len = wcstombs(NULL, wstr, 0); + if (len == (size_t)-1) { + error_pos = wcstombs_errorpos(wstr); + goto encode_error; + } + + bytes = PyBytes_FromStringAndSize(NULL, len); + if (bytes == NULL) { + PyMem_Free(wstr); + return NULL; + } + + len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1); + if (len2 == (size_t)-1 || len2 > len) { + error_pos = wcstombs_errorpos(wstr); + goto encode_error; + } + PyMem_Free(wstr); + } + return bytes; + +encode_error: + errmsg = strerror(errno); + assert(errmsg != NULL); + if (errmsg == NULL) + errmsg = "wcstombs() encountered an unencodable wide character"; + PyMem_Free(wstr); + Py_XDECREF(bytes); + + exc = NULL; + raise_encode_exception(&exc, + "locale", unicode, + error_pos, error_pos+1, + errmsg); + Py_XDECREF(exc); + return NULL; +} + PyObject * PyUnicode_EncodeFSDefault(PyObject *unicode) { @@ -3097,38 +3231,7 @@ PyUnicode_EncodeFSDefault(PyObject *unicode) "surrogateescape"); } else { - /* locale encoding with surrogateescape */ - wchar_t *wchar; - char *bytes; - PyObject *bytes_obj; - size_t error_pos; - - wchar = 
PyUnicode_AsWideCharString(unicode, NULL); - if (wchar == NULL) - return NULL; - bytes = _Py_wchar2char(wchar, &error_pos); - if (bytes == NULL) { - if (error_pos != (size_t)-1) { - char *errmsg = strerror(errno); - PyObject *exc = NULL; - if (errmsg == NULL) - errmsg = "Py_wchar2char() failed"; - raise_encode_exception(&exc, - "filesystemencoding", unicode, - error_pos, error_pos+1, - errmsg); - Py_XDECREF(exc); - } - else - PyErr_NoMemory(); - PyMem_Free(wchar); - return NULL; - } - PyMem_Free(wchar); - - bytes_obj = PyBytes_FromString(bytes); - PyMem_Free(bytes); - return bytes_obj; + return PyUnicode_EncodeLocale(unicode, 1); } #endif } -- cgit v0.12