diff options
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c8600a8..716e352 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) } +/* UTF-8 encoder using the surrogateescape error handler . + + On success, return a pointer to a newly allocated character string (use + PyMem_Free() to free the memory). + + On encoding failure, return NULL and write the position of the invalid + surrogate character into *error_pos (if error_pos is set). + + On memory allocation failure, return NULL and write (size_t)-1 into + *error_pos (if error_pos is set). */ +char* +_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos) +{ + const Py_ssize_t max_char_size = 4; + Py_ssize_t len = wcslen(text); + + assert(len >= 0); + + char *bytes; + if (len <= PY_SSIZE_T_MAX / max_char_size - 1) { + bytes = PyMem_Malloc((len + 1) * max_char_size); + } + else { + bytes = NULL; + } + if (bytes == NULL) { + if (error_pos != NULL) { + *error_pos = (size_t)-1; + } + return NULL; + } + + char *p = bytes; + Py_ssize_t i; + for (i = 0; i < len;) { + Py_UCS4 ch = text[i++]; + + if (ch < 0x80) { + /* Encode ASCII */ + *p++ = (char) ch; + + } + else if (ch < 0x0800) { + /* Encode Latin-1 */ + *p++ = (char)(0xc0 | (ch >> 6)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } + else if (Py_UNICODE_IS_SURROGATE(ch)) { + /* surrogateescape error handler */ + if (!(0xDC80 <= ch && ch <= 0xDCFF)) { + if (error_pos != NULL) { + *error_pos = (size_t)i - 1; + } + goto error; + } + *p++ = (char)(ch & 0xff); + } + else if (ch < 0x10000) { + *p++ = (char)(0xe0 | (ch >> 12)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } + else { /* ch >= 0x10000 */ + assert(ch <= MAX_UNICODE); + /* Encode UCS4 Unicode ordinals */ + *p++ = (char)(0xf0 | (ch >> 18)); + *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } + } + *p++ = '\0'; + + size_t final_size = (p - bytes); + char *bytes2 = PyMem_Realloc(bytes, final_size); + if (bytes2 == NULL) { + if (error_pos != NULL) { + *error_pos = (size_t)-1; + } + goto error; + } + return bytes2; + + error: + PyMem_Free(bytes); + return NULL; +} + + /* Primary internal function which creates utf8 encoded bytes objects. Allocation strategy: if the string is short, convert into a stack buffer |