summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2017-12-21 14:45:16 (GMT)
committerGitHub <noreply@github.com>2017-12-21 14:45:16 (GMT)
commite47e698da6bd982da277960c14afa9d9939e3155 (patch)
treedd6a4bee16d8c66533f6a6c3f71f24f39caafd35 /Objects
parentfbd605151fcf2899b14575f4ddb9ce3c55e684ab (diff)
downloadcpython-e47e698da6bd982da277960c14afa9d9939e3155.zip
cpython-e47e698da6bd982da277960c14afa9d9939e3155.tar.gz
cpython-e47e698da6bd982da277960c14afa9d9939e3155.tar.bz2
bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)
Py_EncodeLocale() now uses _Py_EncodeUTF8_surrogateescape(), instead of using temporary unicode and bytes objects. So Py_EncodeLocale() doesn't use the Python C API anymore.
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c89
1 files changed, 89 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c8600a8..716e352 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
}
+/* UTF-8 encoder using the surrogateescape error handler .
+
+ On success, return a pointer to a newly allocated character string (use
+ PyMem_Free() to free the memory).
+
+ On encoding failure, return NULL and write the position of the invalid
+ surrogate character into *error_pos (if error_pos is set).
+
+ On memory allocation failure, return NULL and write (size_t)-1 into
+ *error_pos (if error_pos is set). */
+char*
+_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
+{
+ const Py_ssize_t max_char_size = 4;
+ Py_ssize_t len = wcslen(text);
+
+ assert(len >= 0);
+
+ char *bytes;
+ if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {
+ bytes = PyMem_Malloc((len + 1) * max_char_size);
+ }
+ else {
+ bytes = NULL;
+ }
+ if (bytes == NULL) {
+ if (error_pos != NULL) {
+ *error_pos = (size_t)-1;
+ }
+ return NULL;
+ }
+
+ char *p = bytes;
+ Py_ssize_t i;
+ for (i = 0; i < len;) {
+ Py_UCS4 ch = text[i++];
+
+ if (ch < 0x80) {
+ /* Encode ASCII */
+ *p++ = (char) ch;
+
+ }
+ else if (ch < 0x0800) {
+ /* Encode Latin-1 */
+ *p++ = (char)(0xc0 | (ch >> 6));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
+ else if (Py_UNICODE_IS_SURROGATE(ch)) {
+ /* surrogateescape error handler */
+ if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+ if (error_pos != NULL) {
+ *error_pos = (size_t)i - 1;
+ }
+ goto error;
+ }
+ *p++ = (char)(ch & 0xff);
+ }
+ else if (ch < 0x10000) {
+ *p++ = (char)(0xe0 | (ch >> 12));
+ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
+ else { /* ch >= 0x10000 */
+ assert(ch <= MAX_UNICODE);
+ /* Encode UCS4 Unicode ordinals */
+ *p++ = (char)(0xf0 | (ch >> 18));
+ *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+ }
+ }
+ *p++ = '\0';
+
+ size_t final_size = (p - bytes);
+ char *bytes2 = PyMem_Realloc(bytes, final_size);
+ if (bytes2 == NULL) {
+ if (error_pos != NULL) {
+ *error_pos = (size_t)-1;
+ }
+ goto error;
+ }
+ return bytes2;
+
+ error:
+ PyMem_Free(bytes);
+ return NULL;
+}
+
+
/* Primary internal function which creates utf8 encoded bytes objects.
Allocation strategy: if the string is short, convert into a stack buffer