summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Victor Stinner <victor.stinner@gmail.com> 2017-12-21 14:45:16 (GMT)
committer: GitHub <noreply@github.com> 2017-12-21 14:45:16 (GMT)
commit: e47e698da6bd982da277960c14afa9d9939e3155 (patch)
tree: dd6a4bee16d8c66533f6a6c3f71f24f39caafd35
parent: fbd605151fcf2899b14575f4ddb9ce3c55e684ab (diff)
downloadcpython-e47e698da6bd982da277960c14afa9d9939e3155.zip
cpython-e47e698da6bd982da277960c14afa9d9939e3155.tar.gz
cpython-e47e698da6bd982da277960c14afa9d9939e3155.tar.bz2
bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)
Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead of using temporary unicode and bytes objects. So Py_EncodeLocale() doesn't use the Python C API anymore.
-rw-r--r-- Objects/unicodeobject.c 89
-rw-r--r-- Python/fileutils.c 42
2 files changed, 93 insertions, 38 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c8600a8..716e352 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
}
+/* UTF-8 encoder using the surrogateescape error handler.
+
+ text must be a NUL-terminated wide character string: its length is taken
+ with wcslen().
+
+ On success, return a pointer to a newly allocated, NUL-terminated character
+ string (use PyMem_Free() to free the memory). *error_pos is not written.
+
+ Surrogate characters in the range U+DC80..U+DCFF are written back as the
+ single raw byte 0x80..0xFF. Any other surrogate is an encoding failure.
+
+ On encoding failure, return NULL and write the position of the invalid
+ surrogate character (its index in text) into *error_pos (if error_pos is
+ not NULL).
+
+ On memory allocation failure, return NULL and write (size_t)-1 into
+ *error_pos (if error_pos is not NULL). */
+char*
+_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
+{
+ const Py_ssize_t max_char_size = 4; /* longest UTF-8 sequence written below */
+ Py_ssize_t len = wcslen(text);
+
+ assert(len >= 0); /* wcslen() returns size_t; checks the conversion to Py_ssize_t */
+
+ char *bytes;
+ if (len <= PY_SSIZE_T_MAX / max_char_size - 1) { /* so (len + 1) * max_char_size cannot overflow */
+ bytes = PyMem_Malloc((len + 1) * max_char_size); /* worst case: 4 bytes per character, plus the NUL */
+ }
+ else {
+ bytes = NULL; /* too long: reported below like an allocation failure */
+ }
+ if (bytes == NULL) {
+ if (error_pos != NULL) {
+ *error_pos = (size_t)-1;
+ }
+ return NULL;
+ }
+
+ char *p = bytes; /* write cursor into the output buffer */
+ Py_ssize_t i;
+ for (i = 0; i < len;) {
+ Py_UCS4 ch = text[i++]; /* i now points past the character being encoded */
+
+ if (ch < 0x80) {
+ /* 1 byte: ASCII */
+ *p++ = (char) ch;
+
+ }
+ else if (ch < 0x0800) {
+ /* 2-byte sequence: U+0080..U+07FF (Latin-1 and beyond) */
+ *p++ = (char)(0xc0 | (ch >> 6));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
+ else if (Py_UNICODE_IS_SURROGATE(ch)) {
+ /* surrogateescape error handler: only U+DC80..U+DCFF is accepted and
+    emitted as the raw byte 0x80..0xFF; any other surrogate is an error.
+    NOTE(review): text is read one wchar_t at a time, so if wchar_t is
+    16-bit a UTF-16 surrogate pair would presumably land here as two
+    lone surrogates and be rejected -- confirm whether pairs must be
+    joined first. */
+ if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+ if (error_pos != NULL) {
+ *error_pos = (size_t)i - 1; /* i was already advanced, so step back by one */
+ }
+ goto error;
+ }
+ *p++ = (char)(ch & 0xff);
+ }
+ else if (ch < 0x10000) { /* 3-byte sequence: remaining characters below U+10000 */
+ *p++ = (char)(0xe0 | (ch >> 12));
+ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
+ else { /* ch >= 0x10000 */
+ assert(ch <= MAX_UNICODE);
+ /* 4-byte sequence */
+ *p++ = (char)(0xf0 | (ch >> 18));
+ *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
+ }
+ *p++ = '\0';
+
+ size_t final_size = (p - bytes); /* bytes actually written, including the NUL */
+ char *bytes2 = PyMem_Realloc(bytes, final_size); /* shrink the worst-case buffer to fit */
+ if (bytes2 == NULL) {
+ if (error_pos != NULL) {
+ *error_pos = (size_t)-1;
+ }
+ goto error; /* bytes was not replaced, so the error path still frees it */
+ }
+ return bytes2;
+
+ error:
+ PyMem_Free(bytes);
+ return NULL;
+}
+
+
/* Primary internal function which creates utf8 encoded bytes objects.
Allocation strategy: if the string is short, convert into a stack buffer
diff --git a/Python/fileutils.c b/Python/fileutils.c
index c4d495d..eeb5f2e 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -22,6 +22,8 @@ extern int winerror_to_errno(int);
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
size_t *p_wlen);
+extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
+ size_t *error_pos);
#ifdef O_CLOEXEC
/* Does open() support the O_CLOEXEC flag? Possible values:
@@ -418,42 +420,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
#endif /* __APPLE__ or __ANDROID__ */
}
-static char*
-_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
-{ /* Helper removed by this commit. It encodes text to UTF-8 with the
-   "surrogateescape" error handler by going through a temporary unicode
-   object and a temporary bytes object, then copies the result into a
-   PyMem_Malloc() buffer that is returned to the caller. Returns NULL on
-   failure. */
- Py_ssize_t len;
- PyObject *unicode, *bytes = NULL;
- char *cpath;
-
- unicode = PyUnicode_FromWideChar(text, wcslen(text));
- if (unicode == NULL) {
- return NULL; /* this path neither writes *error_pos nor calls PyErr_Clear() */
- }
-
- bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
- Py_DECREF(unicode);
- if (bytes == NULL) {
- PyErr_Clear();
- if (error_pos != NULL) {
- *error_pos = (size_t)-1; /* always (size_t)-1: no character position is reported here */
- }
- return NULL;
- }
-
- len = PyBytes_GET_SIZE(bytes);
- cpath = PyMem_Malloc(len+1); /* +1 for the trailing NUL copied below */
- if (cpath == NULL) {
- PyErr_Clear();
- Py_DECREF(bytes);
- if (error_pos != NULL) {
- *error_pos = (size_t)-1;
- }
- return NULL;
- }
- memcpy(cpath, PyBytes_AsString(bytes), len + 1); /* copies len + 1 bytes from the bytes object's buffer */
- Py_DECREF(bytes);
- return cpath;
-}
#if !defined(__APPLE__) && !defined(__ANDROID__)
static char*
@@ -537,10 +503,10 @@ char*
Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
{
#if defined(__APPLE__) || defined(__ANDROID__)
- return _Py_EncodeLocaleUTF8(text, error_pos);
+ return _Py_EncodeUTF8_surrogateescape(text, error_pos);
#else /* __APPLE__ */
if (Py_UTF8Mode == 1) {
- return _Py_EncodeLocaleUTF8(text, error_pos);
+ return _Py_EncodeUTF8_surrogateescape(text, error_pos);
}
#ifndef MS_WINDOWS