summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@haypocalc.com>2011-12-17 03:13:41 (GMT)
committerVictor Stinner <victor.stinner@haypocalc.com>2011-12-17 03:13:41 (GMT)
commitf2ea71fcc8986101512265b685d8d3dfdf7b7bdb (patch)
tree6b5e8c37d0b73993e542181584f0af7b87482220
parent9987d9351ca06dbad3b66ab6da52ab4621955f31 (diff)
downloadcpython-f2ea71fcc8986101512265b685d8d3dfdf7b7bdb.zip
cpython-f2ea71fcc8986101512265b685d8d3dfdf7b7bdb.tar.gz
cpython-f2ea71fcc8986101512265b685d8d3dfdf7b7bdb.tar.bz2
Issue #13560: Add PyUnicode_EncodeLocale()
* Use PyUnicode_EncodeLocale() in time.strftime() if wcsftime() is not available
* Document my last changes in Misc/NEWS
-rw-r--r--Doc/c-api/unicode.rst25
-rw-r--r--Include/unicodeobject.h12
-rw-r--r--Misc/NEWS8
-rw-r--r--Modules/timemodule.c2
-rw-r--r--Objects/unicodeobject.c167
5 files changed, 177 insertions, 37 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 0bf2eea..a6f3a69 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -713,7 +713,7 @@ system.
bytes. If a byte sequence can be decoded as a surrogate character and
*surrogateescape* is not equal to zero, the byte sequence is escaped using
the ``'surrogateescape'`` error handler instead of being decoded. *str*
- must end with a null character but cannot contain embedded null character.
+ must end with a null character but cannot contain embedded null characters.
.. seealso::
@@ -732,6 +732,22 @@ system.
.. versionadded:: 3.3
+.. c:function:: PyObject* PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
+
+ Encode a Unicode object to the current locale encoding. The encoder is
+ strict if *surrogateescape* is equal to zero, otherwise it uses the
+ ``'surrogateescape'`` error handler (:pep:`383`). Return a :class:`bytes`
+ object. *unicode* cannot contain embedded null characters.
+
+ .. seealso::
+
+ Use :c:func:`PyUnicode_EncodeFSDefault` to encode a string to
+ :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
+ Python startup).
+
+ .. versionadded:: 3.3
+
+
File System Encoding
""""""""""""""""""""
@@ -806,6 +822,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
locale encoding.
+ .. seealso::
+
+ :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
+ locale encoding and cannot be modified later. If you need to encode a
+ string to the current locale encoding, use
+ :c:func:`PyUnicode_EncodeLocale`.
+
.. versionadded:: 3.2
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 5f073e0..8a23c7d 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1603,7 +1603,7 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
be decoded as a surrogate character and *surrogateescape* is not equal to
zero, the byte sequence is escaped using the 'surrogateescape' error handler
instead of being decoded. *str* must end with a null character but cannot
- contain embedded null character. */
+ contain embedded null characters. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
const char *str,
@@ -1617,6 +1617,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
const char *str,
int surrogateescape);
+/* Encode a Unicode object to the current locale encoding. The encoder is
+ strict if *surrogateescape* is equal to zero, otherwise the
+ "surrogateescape" error handler is used. Return a bytes object. The string
+ cannot contain embedded null characters. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
+ PyObject *unicode,
+ int surrogateescape
+ );
+
/* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using
diff --git a/Misc/NEWS b/Misc/NEWS
index 5be6990..51505d4 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -419,6 +419,10 @@ Core and Builtins
Library
-------
+- Issue #13560: Add PyUnicode_DecodeLocale(), PyUnicode_DecodeLocaleAndSize()
+ and PyUnicode_EncodeLocale() functions to the C API to decode/encode from/to
+ the current locale encoding.
+
- Issue #8373: The filesystem path of AF_UNIX sockets now uses the filesystem
encoding and the surrogateescape error handler, rather than UTF-8. Patch
by David Watson.
@@ -451,8 +455,8 @@ Library
'importlib.abc.PyPycLoader', 'nntplib.NNTP.xgtitle', 'nntplib.NNTP.xpath',
and private attributes of 'smtpd.SMTPChannel'.
-- Issue #5905: time.strftime() is now using the locale encoding, instead of
- UTF-8, if the wcsftime() function is not available.
+- Issue #5905, #13560: time.strftime() is now using the current locale
+ encoding, instead of UTF-8, if the wcsftime() function is not available.
- Issue #8641: Update IDLE 3 syntax coloring to recognize b".." and not u"..".
Patch by Tal Einat.
diff --git a/Modules/timemodule.c b/Modules/timemodule.c
index a46c4f1..ad1c54e 100644
--- a/Modules/timemodule.c
+++ b/Modules/timemodule.c
@@ -486,7 +486,7 @@ time_strftime(PyObject *self, PyObject *args)
fmt = format;
#else
/* Convert the unicode string to an ascii one */
- format = PyUnicode_EncodeFSDefault(format_arg);
+ format = PyUnicode_EncodeLocale(format_arg, 1);
if (format == NULL)
return NULL;
fmt = PyBytes_AS_STRING(format);
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7444c8b..a2c3227 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3073,6 +3073,140 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
return NULL;
}
+static size_t
+wcstombs_errorpos(const wchar_t *wstr)
+{
+ size_t len;
+#if SIZEOF_WCHAR_T == 2
+ wchar_t buf[3];
+#else
+ wchar_t buf[2];
+#endif
+ char outbuf[MB_LEN_MAX];
+ const wchar_t *start, *previous;
+ int save_errno;
+
+ save_errno = errno;
+#if SIZEOF_WCHAR_T == 2
+ buf[2] = 0;
+#else
+ buf[1] = 0;
+#endif
+ start = wstr;
+ while (*wstr != L'\0')
+ {
+ previous = wstr;
+#if SIZEOF_WCHAR_T == 2
+ if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
+ && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
+ {
+ buf[0] = wstr[0];
+ buf[1] = wstr[1];
+ wstr += 2;
+ }
+ else {
+ buf[0] = *wstr;
+ buf[1] = 0;
+ wstr++;
+ }
+#else
+ buf[0] = *wstr;
+ wstr++;
+#endif
+ len = wcstombs(outbuf, buf, sizeof(outbuf));
+ if (len == (size_t)-1) {
+ errno = save_errno;
+ return previous - start;
+ }
+ }
+
+ /* failed to find the unencodable character */
+ errno = save_errno;
+ return 0;
+}
+
+PyObject *
+PyUnicode_EncodeLocale(PyObject *unicode, int surrogateescape)
+{
+ Py_ssize_t wlen, wlen2;
+ wchar_t *wstr;
+ PyObject *bytes = NULL;
+ char *errmsg;
+ PyObject *exc;
+ size_t error_pos;
+
+ wstr = PyUnicode_AsWideCharString(unicode, &wlen);
+ if (wstr == NULL)
+ return NULL;
+
+ wlen2 = wcslen(wstr);
+ if (wlen2 != wlen) {
+ PyMem_Free(wstr);
+ PyErr_SetString(PyExc_TypeError, "embedded null character");
+ return NULL;
+ }
+
+ if (surrogateescape) {
+ /* locale encoding with surrogateescape */
+ char *str;
+
+ str = _Py_wchar2char(wstr, &error_pos);
+ if (str == NULL) {
+ if (error_pos == (size_t)-1) {
+ PyErr_NoMemory();
+ PyMem_Free(wstr);
+ return NULL;
+ }
+ else {
+ goto encode_error;
+ }
+ }
+ PyMem_Free(wstr);
+
+ bytes = PyBytes_FromString(str);
+ PyMem_Free(str);
+ }
+ else {
+ size_t len, len2;
+
+ len = wcstombs(NULL, wstr, 0);
+ if (len == (size_t)-1) {
+ error_pos = wcstombs_errorpos(wstr);
+ goto encode_error;
+ }
+
+ bytes = PyBytes_FromStringAndSize(NULL, len);
+ if (bytes == NULL) {
+ PyMem_Free(wstr);
+ return NULL;
+ }
+
+ len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
+ if (len2 == (size_t)-1 || len2 > len) {
+ error_pos = wcstombs_errorpos(wstr);
+ goto encode_error;
+ }
+ PyMem_Free(wstr);
+ }
+ return bytes;
+
+encode_error:
+ errmsg = strerror(errno);
+ assert(errmsg != NULL);
+ if (errmsg == NULL)
+ errmsg = "wcstombs() encountered an unencodable wide character";
+ PyMem_Free(wstr);
+ Py_XDECREF(bytes);
+
+ exc = NULL;
+ raise_encode_exception(&exc,
+ "locale", unicode,
+ error_pos, error_pos+1,
+ errmsg);
+ Py_XDECREF(exc);
+ return NULL;
+}
+
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
@@ -3097,38 +3231,7 @@ PyUnicode_EncodeFSDefault(PyObject *unicode)
"surrogateescape");
}
else {
- /* locale encoding with surrogateescape */
- wchar_t *wchar;
- char *bytes;
- PyObject *bytes_obj;
- size_t error_pos;
-
- wchar = PyUnicode_AsWideCharString(unicode, NULL);
- if (wchar == NULL)
- return NULL;
- bytes = _Py_wchar2char(wchar, &error_pos);
- if (bytes == NULL) {
- if (error_pos != (size_t)-1) {
- char *errmsg = strerror(errno);
- PyObject *exc = NULL;
- if (errmsg == NULL)
- errmsg = "Py_wchar2char() failed";
- raise_encode_exception(&exc,
- "filesystemencoding", unicode,
- error_pos, error_pos+1,
- errmsg);
- Py_XDECREF(exc);
- }
- else
- PyErr_NoMemory();
- PyMem_Free(wchar);
- return NULL;
- }
- PyMem_Free(wchar);
-
- bytes_obj = PyBytes_FromString(bytes);
- PyMem_Free(bytes);
- return bytes_obj;
+ return PyUnicode_EncodeLocale(unicode, 1);
}
#endif
}