diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2017-12-21 14:45:16 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-12-21 14:45:16 (GMT) |
commit | e47e698da6bd982da277960c14afa9d9939e3155 (patch) | |
tree | dd6a4bee16d8c66533f6a6c3f71f24f39caafd35 | |
parent | fbd605151fcf2899b14575f4ddb9ce3c55e684ab (diff) | |
download | cpython-e47e698da6bd982da277960c14afa9d9939e3155.zip cpython-e47e698da6bd982da277960c14afa9d9939e3155.tar.gz cpython-e47e698da6bd982da277960c14afa9d9939e3155.tar.bz2 |
bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)
Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead
of using temporary unicode and bytes objects. So Py_EncodeLocale()
doesn't use the Python C API anymore.
-rw-r--r-- | Objects/unicodeobject.c | 89 | ||||
-rw-r--r-- | Python/fileutils.c | 42 |
2 files changed, 93 insertions, 38 deletions
```diff
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c8600a8..716e352 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
 }
 
 
+/* UTF-8 encoder using the surrogateescape error handler .
+
+   On success, return a pointer to a newly allocated character string (use
+   PyMem_Free() to free the memory).
+
+   On encoding failure, return NULL and write the position of the invalid
+   surrogate character into *error_pos (if error_pos is set).
+
+   On memory allocation failure, return NULL and write (size_t)-1 into
+   *error_pos (if error_pos is set). */
+char*
+_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
+{
+    const Py_ssize_t max_char_size = 4;
+    Py_ssize_t len = wcslen(text);
+
+    assert(len >= 0);
+
+    char *bytes;
+    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
+        bytes = PyMem_Malloc((len + 1) * max_char_size);
+    }
+    else {
+        bytes = NULL;
+    }
+    if (bytes == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        return NULL;
+    }
+
+    char *p = bytes;
+    Py_ssize_t i;
+    for (i = 0; i < len;) {
+        Py_UCS4 ch = text[i++];
+
+        if (ch < 0x80) {
+            /* Encode ASCII */
+            *p++ = (char) ch;
+
+        }
+        else if (ch < 0x0800) {
+            /* Encode Latin-1 */
+            *p++ = (char)(0xc0 | (ch >> 6));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else if (Py_UNICODE_IS_SURROGATE(ch)) {
+            /* surrogateescape error handler */
+            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+                if (error_pos != NULL) {
+                    *error_pos = (size_t)i - 1;
+                }
+                goto error;
+            }
+            *p++ = (char)(ch & 0xff);
+        }
+        else if (ch < 0x10000) {
+            *p++ = (char)(0xe0 | (ch >> 12));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else {  /* ch >= 0x10000 */
+            assert(ch <= MAX_UNICODE);
+            /* Encode UCS4 Unicode ordinals */
+            *p++ = (char)(0xf0 | (ch >> 18));
+            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+    }
+    *p++ = '\0';
+
+    size_t final_size = (p - bytes);
+    char *bytes2 = PyMem_Realloc(bytes, final_size);
+    if (bytes2 == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        goto error;
+    }
+    return bytes2;
+
+ error:
+    PyMem_Free(bytes);
+    return NULL;
+}
+
+
 /* Primary internal function which creates utf8 encoded bytes objects.
 
    Allocation strategy:  if the string is short, convert into a stack buffer
diff --git a/Python/fileutils.c b/Python/fileutils.c
index c4d495d..eeb5f2e 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -22,6 +22,8 @@ extern int winerror_to_errno(int);
 
 extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
                                                size_t *p_wlen);
+extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
+                                            size_t *error_pos);
 
 #ifdef O_CLOEXEC
 /* Does open() support the O_CLOEXEC flag? Possible values:
@@ -418,42 +420,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
-static char*
-_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
-{
-    Py_ssize_t len;
-    PyObject *unicode, *bytes = NULL;
-    char *cpath;
-
-    unicode = PyUnicode_FromWideChar(text, wcslen(text));
-    if (unicode == NULL) {
-        return NULL;
-    }
-
-    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
-    Py_DECREF(unicode);
-    if (bytes == NULL) {
-        PyErr_Clear();
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
-    }
-
-    len = PyBytes_GET_SIZE(bytes);
-    cpath = PyMem_Malloc(len+1);
-    if (cpath == NULL) {
-        PyErr_Clear();
-        Py_DECREF(bytes);
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
-    }
-    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
-    Py_DECREF(bytes);
-    return cpath;
-}
 
 #if !defined(__APPLE__) && !defined(__ANDROID__)
 static char*
@@ -537,10 +503,10 @@ char*
 Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
 {
 #if defined(__APPLE__) || defined(__ANDROID__)
-    return _Py_EncodeLocaleUTF8(text, error_pos);
+    return _Py_EncodeUTF8_surrogateescape(text, error_pos);
 #else   /* __APPLE__ */
     if (Py_UTF8Mode == 1) {
-        return _Py_EncodeLocaleUTF8(text, error_pos);
+        return _Py_EncodeUTF8_surrogateescape(text, error_pos);
     }
 
 #ifndef MS_WINDOWS
```