summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Victor Stinner <victor.stinner@haypocalc.com> 2011-12-16 22:56:01 (GMT)
committer: Victor Stinner <victor.stinner@haypocalc.com> 2011-12-16 22:56:01 (GMT)
commit: af02e1c85a66009cdc645a64de7d7ee1335c8301 (patch)
tree: 5bc78c3a8628589cf5a4c246afc0076871d51c62
parent: 3607e3de278c89660f773064a94385066eebda1b (diff)
download: cpython-af02e1c85a66009cdc645a64de7d7ee1335c8301.zip
          cpython-af02e1c85a66009cdc645a64de7d7ee1335c8301.tar.gz
          cpython-af02e1c85a66009cdc645a64de7d7ee1335c8301.tar.bz2
Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale()

* PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() decode a string
  from the current locale encoding
* _Py_char2wchar() writes an "error code" in the size argument to indicate if
  the function failed because of memory allocation failure or because of a
  decoding error. The function doesn't write the error message directly to
  stderr.
* Fix time.strftime() (if wcsftime() is missing): decode strftime() result
  from the current locale encoding, not from the filesystem encoding.

-rw-r--r--  Doc/c-api/unicode.rst    40
-rw-r--r--  Include/unicodeobject.h  22
-rw-r--r--  Modules/_localemodule.c  57
-rw-r--r--  Modules/main.c           13
-rw-r--r--  Modules/timemodule.c      6
-rw-r--r--  Objects/unicodeobject.c  95
-rw-r--r--  Python/fileutils.c       25
7 files changed, 174 insertions, 84 deletions
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
index 81ed540..0bf2eea 100644
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python
throughout the interpreter whenever coercion to Unicode is needed.
+Locale Encoding
+"""""""""""""""
+
+The current locale encoding can be used to decode text from the operating
+system.
+
+.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape)
+
+ Decode a string from the current locale encoding. The decoder is strict if
+ *surrogateescape* is equal to zero, otherwise it uses the
+ ``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable
+ bytes. If a byte sequence can be decoded as a surrogate character and
+ *surrogateescape* is not equal to zero, the byte sequence is escaped using
+ the ``'surrogateescape'`` error handler instead of being decoded. *str*
+ must end with a null character but cannot contain embedded null character.
+
+ .. seealso::
+
+ Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from
+ :c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
+ Python startup).
+
+ .. versionadded:: 3.3
+
+
+.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape)
+
+ Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string
+ length using :c:func:`strlen`.
+
+ .. versionadded:: 3.3
+
+
File System Encoding
""""""""""""""""""""
@@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
locale encoding.
+ .. seealso::
+
+ :c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
+ locale encoding and cannot be modified later. If you need to decode a
+ string from the current locale encoding, use
+ :c:func:`PyUnicode_DecodeLocaleAndSize`.
+
.. versionchanged:: 3.2
Use ``'strict'`` error handler on Windows.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index cd35ae6..5f073e0 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
);
#endif
+/* --- Locale encoding --------------------------------------------------- */
+
+/* Decode a string from the current locale encoding. The decoder is strict if
+ *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
+ error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
+ be decoded as a surrogate character and *surrogateescape* is not equal to
+ zero, the byte sequence is escaped using the 'surrogateescape' error handler
+ instead of being decoded. *str* must end with a null character but cannot
+ contain embedded null character. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
+ const char *str,
+ Py_ssize_t len,
+ int surrogateescape);
+
+/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
+ length using strlen(). */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
+ const char *str,
+ int surrogateescape);
+
/* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c
index 9bba1b3..1cab7c0 100644
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales.");
static PyObject *Error;
-/* Convert a char* to a Unicode object according to the current locale */
-static PyObject*
-str2uni(const char* s)
-{
-#ifdef HAVE_BROKEN_MBSTOWCS
- size_t needed = strlen(s);
-#else
- size_t needed = mbstowcs(NULL, s, 0);
-#endif
- size_t res1;
- wchar_t smallbuf[30];
- wchar_t *dest;
- PyObject *res2;
- if (needed == (size_t)-1) {
- PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string");
- return NULL;
- }
- if (needed*sizeof(wchar_t) < sizeof(smallbuf))
- dest = smallbuf;
- else {
- dest = PyMem_Malloc((needed+1)*sizeof(wchar_t));
- if (!dest)
- return PyErr_NoMemory();
- }
- /* This shouldn't fail now */
- res1 = mbstowcs(dest, s, needed+1);
-#ifdef HAVE_BROKEN_MBSTOWCS
- assert(res1 != (size_t)-1);
-#else
- assert(res1 == needed);
-#endif
- res2 = PyUnicode_FromWideChar(dest, res1);
- if (dest != smallbuf)
- PyMem_Free(dest);
- return res2;
-}
-
/* support functions for formatting floating point numbers */
PyDoc_STRVAR(setlocale__doc__,
@@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "unsupported locale setting");
return NULL;
}
- result_object = str2uni(result);
+ result_object = PyUnicode_DecodeLocale(result, 0);
if (!result_object)
return NULL;
} else {
@@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
PyErr_SetString(Error, "locale query failed");
return NULL;
}
- result_object = str2uni(result);
+ result_object = PyUnicode_DecodeLocale(result, 0);
}
return result_object;
}
@@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self)
involved herein */
#define RESULT_STRING(s)\
- x = str2uni(l->s); \
+ x = PyUnicode_DecodeLocale(l->s, 0); \
if (!x) goto failed;\
PyDict_SetItemString(result, #s, x);\
Py_XDECREF(x)
@@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args)
instead of an empty string for nl_langinfo(ERA). */
const char *result = nl_langinfo(item);
result = result != NULL ? result : "";
- return str2uni(result);
+ return PyUnicode_DecodeLocale(result, 0);
}
PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant");
return NULL;
@@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args)
char *in;
if (!PyArg_ParseTuple(args, "s", &in))
return 0;
- return str2uni(gettext(in));
+ return PyUnicode_DecodeLocale(gettext(in), 0);
}
PyDoc_STRVAR(dgettext__doc__,
@@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args)
char *domain, *in;
if (!PyArg_ParseTuple(args, "zs", &domain, &in))
return 0;
- return str2uni(dgettext(domain, in));
+ return PyUnicode_DecodeLocale(dgettext(domain, in), 0);
}
PyDoc_STRVAR(dcgettext__doc__,
@@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args)
int category;
if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category))
return 0;
- return str2uni(dcgettext(domain,msgid,category));
+ return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0);
}
PyDoc_STRVAR(textdomain__doc__,
@@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
- return str2uni(domain);
+ return PyUnicode_DecodeLocale(domain, 0);
}
PyDoc_STRVAR(bindtextdomain__doc__,
@@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args)
PyErr_SetFromErrno(PyExc_OSError);
return NULL;
}
- result = str2uni(current_dirname);
+ result = PyUnicode_DecodeLocale(current_dirname, 0);
Py_XDECREF(dirname_bytes);
return result;
}
@@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
return NULL;
codeset = bind_textdomain_codeset(domain, codeset);
if (codeset)
- return str2uni(codeset);
+ return PyUnicode_DecodeLocale(codeset, 0);
Py_RETURN_NONE;
}
#endif
diff --git a/Modules/main.c b/Modules/main.c
index d4c3314..4899378 100644
--- a/Modules/main.c
+++ b/Modules/main.c
@@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv)
/* Use utf-8 on Mac OS X */
unicode = PyUnicode_FromString(p);
#else
- wchar_t *wchar;
- size_t len;
- wchar = _Py_char2wchar(p, &len);
- if (wchar == NULL)
- continue;
- unicode = PyUnicode_FromWideChar(wchar, len);
- PyMem_Free(wchar);
+ unicode = PyUnicode_DecodeLocale(p, 1);
#endif
- if (unicode == NULL)
+ if (unicode == NULL) {
+ /* ignore errors */
+ PyErr_Clear();
continue;
+ }
PySys_AddWarnOptionUnicode(unicode);
Py_DECREF(unicode);
}
diff --git a/Modules/timemodule.c b/Modules/timemodule.c
index 001b311..a46c4f1 100644
--- a/Modules/timemodule.c
+++ b/Modules/timemodule.c
@@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args)
#ifdef HAVE_WCSFTIME
ret = PyUnicode_FromWideChar(outbuf, buflen);
#else
- ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen);
+ ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1);
#endif
PyMem_Free(outbuf);
break;
@@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) {
#endif /* PYOS_OS2 */
#endif
PyModule_AddIntConstant(m, "daylight", daylight);
- otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0]));
- otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1]));
+ otz0 = PyUnicode_DecodeLocale(tzname[0], 1);
+ otz1 = PyUnicode_DecodeLocale(tzname[1], 1);
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
#ifdef HAVE_STRUCT_TM_TM_ZONE
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5758ffa..7444c8b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3235,6 +3235,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
}
PyObject*
+PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
+ int surrogateescape)
+{
+ wchar_t smallbuf[256];
+ size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
+ wchar_t *wstr;
+ size_t wlen, wlen2;
+ PyObject *unicode;
+
+ if (str[len] != '\0' || len != strlen(str)) {
+ PyErr_SetString(PyExc_TypeError, "embedded null character");
+ return NULL;
+ }
+
+ if (surrogateescape)
+ {
+ wstr = _Py_char2wchar(str, &wlen);
+ if (wstr == NULL) {
+ if (wlen == (size_t)-1)
+ PyErr_NoMemory();
+ else
+ PyErr_SetFromErrno(PyExc_OSError);
+ return NULL;
+ }
+
+ unicode = PyUnicode_FromWideChar(wstr, wlen);
+ PyMem_Free(wstr);
+ }
+ else {
+#ifndef HAVE_BROKEN_MBSTOWCS
+ wlen = mbstowcs(NULL, str, 0);
+#else
+ wlen = len;
+#endif
+ if (wlen == (size_t)-1) {
+ PyErr_SetFromErrno(PyExc_OSError);
+ return NULL;
+ }
+ if (wlen+1 <= smallbuf_len) {
+ wstr = smallbuf;
+ }
+ else {
+ if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
+ return PyErr_NoMemory();
+
+ wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
+ if (!wstr)
+ return PyErr_NoMemory();
+ }
+
+ /* This shouldn't fail now */
+ wlen2 = mbstowcs(wstr, str, wlen+1);
+ if (wlen2 == (size_t)-1) {
+ if (wstr != smallbuf)
+ PyMem_Free(wstr);
+ PyErr_SetFromErrno(PyExc_OSError);
+ return NULL;
+ }
+#ifdef HAVE_BROKEN_MBSTOWCS
+ assert(wlen2 == wlen);
+#endif
+ unicode = PyUnicode_FromWideChar(wstr, wlen2);
+ if (wstr != smallbuf)
+ PyMem_Free(wstr);
+ }
+ return unicode;
+}
+
+PyObject*
+PyUnicode_DecodeLocale(const char *str, int surrogateescape)
+{
+ Py_ssize_t size = (Py_ssize_t)strlen(str);
+ return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
+}
+
+
+PyObject*
PyUnicode_DecodeFSDefault(const char *s) {
Py_ssize_t size = (Py_ssize_t)strlen(s);
return PyUnicode_DecodeFSDefaultAndSize(s, size);
@@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
"surrogateescape");
}
else {
- /* locale encoding with surrogateescape */
- wchar_t *wchar;
- PyObject *unicode;
- size_t len;
-
- if (s[size] != '\0' || size != strlen(s)) {
- PyErr_SetString(PyExc_TypeError, "embedded NUL character");
- return NULL;
- }
-
- wchar = _Py_char2wchar(s, &len);
- if (wchar == NULL)
- return PyErr_NoMemory();
-
- unicode = PyUnicode_FromWideChar(wchar, len);
- PyMem_Free(wchar);
- return unicode;
+ return PyUnicode_DecodeLocaleAndSize(s, size, 1);
}
#endif
}
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 0afa415..0aad220 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -16,7 +16,9 @@
Return a pointer to a newly allocated wide character string (use
PyMem_Free() to free the memory) and write the number of written wide
characters excluding the null character into *size if size is not NULL, or
- NULL on error (conversion or memory allocation error).
+ NULL on error (decoding or memory allocation error). If size is not NULL,
+ *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
+ error.
Conversion errors should never happen, unless there is a bug in the C
library. */
@@ -82,8 +84,9 @@ _Py_char2wchar(const char* arg, size_t *size)
since we provide everything that we have -
unless there is a bug in the C library, or I
misunderstood how mbrtowc works. */
- fprintf(stderr, "unexpected mbrtowc result -2\n");
PyMem_Free(res);
+ if (size != NULL)
+ *size = (size_t)-2;
return NULL;
}
if (converted == (size_t)-1) {
@@ -112,7 +115,8 @@ _Py_char2wchar(const char* arg, size_t *size)
is ASCII (i.e. escape all bytes > 128. This will still roundtrip
correctly in the locale's charset, which must be an ASCII superset. */
res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
- if (!res) goto oom;
+ if (!res)
+ goto oom;
in = (unsigned char*)arg;
out = res;
while(*in)
@@ -126,7 +130,8 @@ _Py_char2wchar(const char* arg, size_t *size)
*size = out - res;
return res;
oom:
- fprintf(stderr, "out of memory\n");
+ if (size != NULL)
+ *size = (size_t)-1;
return NULL;
}
@@ -137,10 +142,10 @@ oom:
This function is the reverse of _Py_char2wchar().
Return a pointer to a newly allocated byte string (use PyMem_Free() to free
- the memory), or NULL on conversion or memory allocation error.
+ the memory), or NULL on encoding or memory allocation error.
If error_pos is not NULL: *error_pos is the index of the invalid character
- on conversion error, or (size_t)-1 otherwise. */
+ on encoding error, or (size_t)-1 otherwise. */
char*
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
{
@@ -328,7 +333,7 @@ _Py_fopen(PyObject *path, const char *mode)
#ifdef HAVE_READLINK
/* Read value of symbolic link. Encode the path to the locale encoding, decode
- the result from the locale encoding. */
+ the result from the locale encoding. Return -1 on error. */
int
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
@@ -372,7 +377,8 @@ _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
#ifdef HAVE_REALPATH
/* Return the canonicalized absolute pathname. Encode path to the locale
- encoding, decode the result from the locale encoding. */
+ encoding, decode the result from the locale encoding.
+ Return NULL on error. */
wchar_t*
_Py_wrealpath(const wchar_t *path,
@@ -410,7 +416,8 @@ _Py_wrealpath(const wchar_t *path,
#endif
/* Get the current directory. size is the buffer size in wide characters
- including the null character. Decode the path from the locale encoding. */
+ including the null character. Decode the path from the locale encoding.
+ Return NULL on error. */
wchar_t*
_Py_wgetcwd(wchar_t *buf, size_t size)