diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2016-09-07 00:04:34 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2016-09-07 00:04:34 (GMT) |
commit | 62ec3317d2b7f1be98ff49b2278538a9ae5a7049 (patch) | |
tree | 5ac3fa8ef949a33433962ffa80cc36ddc3a8e181 | |
parent | 9f2a92018279e24c7b688376c1ad5521bef800aa (diff) | |
download | cpython-62ec3317d2b7f1be98ff49b2278538a9ae5a7049.zip cpython-62ec3317d2b7f1be98ff49b2278538a9ae5a7049.tar.gz cpython-62ec3317d2b7f1be98ff49b2278538a9ae5a7049.tar.bz2 |
Optimize unicode_escape and raw_unicode_escape
Issue #16334. Patch written by Serhiy Storchaka.
-rw-r--r-- | Objects/unicodeobject.c | 688 |
1 files changed, 314 insertions, 374 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9ecf0e1..aa0402a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5846,61 +5846,6 @@ PyUnicode_AsUTF16String(PyObject *unicode) /* --- Unicode Escape Codec ----------------------------------------------- */ -/* Helper function for PyUnicode_DecodeUnicodeEscape, determines - if all the escapes in the string make it still a valid ASCII string. - Returns -1 if any escapes were found which cause the string to - pop out of ASCII range. Otherwise returns the length of the - required buffer to hold the string. - */ -static Py_ssize_t -length_of_escaped_ascii_string(const char *s, Py_ssize_t size) -{ - const unsigned char *p = (const unsigned char *)s; - const unsigned char *end = p + size; - Py_ssize_t length = 0; - - if (size < 0) - return -1; - - for (; p < end; ++p) { - if (*p > 127) { - /* Non-ASCII */ - return -1; - } - else if (*p != '\\') { - /* Normal character */ - ++length; - } - else { - /* Backslash-escape, check next char */ - ++p; - /* Escape sequence reaches till end of string or - non-ASCII follow-up. */ - if (p >= end || *p > 127) - return -1; - switch (*p) { - case '\n': - /* backslash + \n result in zero characters */ - break; - case '\\': case '\'': case '\"': - case 'b': case 'f': case 't': - case 'n': case 'r': case 'v': case 'a': - ++length; - break; - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - case 'x': case 'u': case 'U': case 'N': - /* these do not guarantee ASCII characters */ - return -1; - default: - /* count the backslash + the other character */ - length += 2; - } - } - } - return length; -} - static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject * @@ -5909,218 +5854,212 @@ PyUnicode_DecodeUnicodeEscape(const char *s, const char *errors) { const char *starts = s; - Py_ssize_t startinpos; - Py_ssize_t endinpos; _PyUnicodeWriter writer; const char *end; - char* message; - Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ PyObject *errorHandler = NULL; PyObject *exc = NULL; - Py_ssize_t len; - len = length_of_escaped_ascii_string(s, size); - if (len == 0) + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); - - /* After length_of_escaped_ascii_string() there are two alternatives, - either the string is pure ASCII with named escapes like \n, etc. - and we determined it's exact size (common case) - or it contains \x, \u, ... escape sequences. then we create a - legacy wchar string and resize it at the end of this function. */ - _PyUnicodeWriter_Init(&writer); - if (len > 0) { - writer.min_length = len; } - else { - /* Escaped strings will always be longer than the resulting - Unicode string, so we start with size here and then reduce the - length after conversion to the true value. - (but if the error callback returns a long replacement string - we'll have to allocate more space) */ - writer.min_length = size; + /* Escaped strings will always be longer than the resulting + Unicode string, so we start with size here and then reduce the + length after conversion to the true value. + (but if the error callback returns a long replacement string + we'll have to allocate more space) */ + _PyUnicodeWriter_Init(&writer); + writer.min_length = size; + if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { + goto onError; } - if (size == 0) - return _PyUnicodeWriter_Finish(&writer); end = s + size; - while (s < end) { - unsigned char c; - Py_UCS4 x; - int digits; + unsigned char c = (unsigned char) *s++; + Py_UCS4 ch; + int count; + Py_ssize_t startinpos; + Py_ssize_t endinpos; + const char *message; + +#define WRITE_ASCII_CHAR(ch) \ + do { \ + assert(ch <= 127); \ + assert(writer.pos < writer.size); \ + PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ + } while(0) + +#define WRITE_CHAR(ch) \ + do { \ + if (ch <= writer.maxchar) { \ + assert(writer.pos < writer.size); \ + PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ + } \ + else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ + goto onError; \ + } \ + } while(0) /* Non-escape characters are interpreted as Unicode ordinals */ - if (*s != '\\') { - x = (unsigned char)*s; - s++; - if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) - goto onError; + if (c != '\\') { + WRITE_CHAR(c); continue; } - startinpos = s-starts; + startinpos = s - starts - 1; /* \ - Escapes */ - s++; - c = *s++; - if (s > end) - c = '\0'; /* Invalid after \ */ + if (s >= end) { + message = "\\ at end of string"; + goto error; + } + c = (unsigned char) *s++; + assert(writer.pos < writer.size); switch (c) { /* \x escapes */ -#define WRITECHAR(ch) \ - do { \ - if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \ - goto onError; \ - } while(0) - - case '\n': break; - case '\\': WRITECHAR('\\'); break; - case '\'': WRITECHAR('\''); break; - case '\"': WRITECHAR('\"'); break; - case 'b': WRITECHAR('\b'); break; + case '\n': continue; + case '\\': WRITE_ASCII_CHAR('\\'); continue; + case '\'': WRITE_ASCII_CHAR('\''); continue; + case '\"': WRITE_ASCII_CHAR('\"'); continue; + case 'b': WRITE_ASCII_CHAR('\b'); continue; /* FF */ - case 'f': WRITECHAR('\014'); break; - case 't': WRITECHAR('\t'); break; - case 'n': WRITECHAR('\n'); break; - case 'r': WRITECHAR('\r'); break; + case 'f': WRITE_ASCII_CHAR('\014'); continue; + case 't': WRITE_ASCII_CHAR('\t'); continue; + case 'n': WRITE_ASCII_CHAR('\n'); continue; + case 'r': WRITE_ASCII_CHAR('\r'); continue; /* VT */ - case 'v': WRITECHAR('\013'); break; + case 'v': WRITE_ASCII_CHAR('\013'); continue; /* BEL, not classic C */ - case 'a': WRITECHAR('\007'); break; + case 'a': WRITE_ASCII_CHAR('\007'); continue; /* \OOO (octal) escapes */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': - x = s[-1] - '0'; + ch = c - '0'; if (s < end && '0' <= *s && *s <= '7') { - x = (x<<3) + *s++ - '0'; - if (s < end && '0' <= *s && *s <= '7') - x = (x<<3) + *s++ - '0'; + ch = (ch<<3) + *s++ - '0'; + if (s < end && '0' <= *s && *s <= '7') { + ch = (ch<<3) + *s++ - '0'; + } } - WRITECHAR(x); - break; + WRITE_CHAR(ch); + continue; /* hex escapes */ /* \xXX */ case 'x': - digits = 2; + count = 2; message = "truncated \\xXX escape"; goto hexescape; /* \uXXXX */ case 'u': - digits = 4; + count = 4; message = "truncated \\uXXXX escape"; goto hexescape; /* \UXXXXXXXX */ case 'U': - digits = 8; + count = 8; message = "truncated \\UXXXXXXXX escape"; hexescape: - chr = 0; - if (end - s < digits) { - /* count only hex digits */ - for (; s < end; ++s) { - c = (unsigned char)*s; - if (!Py_ISXDIGIT(c)) - goto error; + for (ch = 0; count && s < end; ++s, --count) { + c = (unsigned char)*s; + ch <<= 4; + if (c >= '0' && c <= '9') { + ch += c - '0'; + } + else if (c >= 'a' && c <= 'f') { + ch += c - ('a' - 10); + } + else if (c >= 'A' && c <= 'F') { + ch += c - ('A' - 10); + } + else { + break; } - goto error; } - for (; digits--; ++s) { - c = (unsigned char)*s; - if (!Py_ISXDIGIT(c)) - goto error; - chr = (chr<<4) & ~0xF; - if (c >= '0' && c <= '9') - chr += c - '0'; - else if (c >= 'a' && c <= 'f') - chr += 10 + c - 'a'; - else - chr += 10 + c - 'A'; + if (count) { + goto error; } - if (chr == 0xffffffff && PyErr_Occurred()) - /* _decoding_error will have already written into the - target buffer. */ - break; - store: - /* when we get here, chr is a 32-bit unicode character */ - message = "illegal Unicode character"; - if (chr > MAX_UNICODE) + + /* when we get here, ch is a 32-bit unicode character */ + if (ch > MAX_UNICODE) { + message = "illegal Unicode character"; goto error; - WRITECHAR(chr); - break; + } + + WRITE_CHAR(ch); + continue; /* \N{name} */ case 'N': - message = "malformed \\N character escape"; if (ucnhash_CAPI == NULL) { /* load the unicode data module */ ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( PyUnicodeData_CAPSULE_NAME, 1); - if (ucnhash_CAPI == NULL) - goto ucnhashError; + if (ucnhash_CAPI == NULL) { + PyErr_SetString( + PyExc_UnicodeError, + "\\N escapes not supported (can't load unicodedata module)" + ); + goto onError; + } } + + message = "malformed \\N character escape"; if (*s == '{') { - const char *start = s+1; + const char *start = ++s; + size_t namelen; /* look for the closing brace */ - while (*s != '}' && s < end) + while (s < end && *s != '}') s++; - if (s > start && s < end && *s == '}') { + namelen = s - start; + if (namelen && s < end) { /* found a name. look it up in the unicode database */ - message = "unknown Unicode character name"; s++; - if (s - start - 1 <= INT_MAX && - ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), - &chr, 0)) - goto store; + ch = 0xffffffff; /* in case 'getcode' messes up */ + if (namelen <= INT_MAX && + ucnhash_CAPI->getcode(NULL, start, (int)namelen, + &ch, 0)) { + assert(ch <= MAX_UNICODE); + WRITE_CHAR(ch); + continue; + } + message = "unknown Unicode character name"; } } goto error; default: - if (s > end) { - message = "\\ at end of string"; - s--; - goto error; - } - else { - WRITECHAR('\\'); - WRITECHAR((unsigned char)s[-1]); - } - break; + WRITE_ASCII_CHAR('\\'); + WRITE_CHAR(c); + continue; } - continue; error: endinpos = s-starts; + writer.min_length = end - s + writer.pos; if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, "unicodeescape", message, &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) + &writer)) { goto onError; - continue; + } + if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) { + goto onError; + } + +#undef WRITE_ASCII_CHAR +#undef WRITE_CHAR } -#undef WRITECHAR Py_XDECREF(errorHandler); Py_XDECREF(exc); return _PyUnicodeWriter_Finish(&writer); - ucnhashError: - PyErr_SetString( - PyExc_UnicodeError, - "\\N escapes not supported (can't load unicodedata module)" - ); - _PyUnicodeWriter_Dealloc(&writer); - Py_XDECREF(errorHandler); - Py_XDECREF(exc); - return NULL; - onError: _PyUnicodeWriter_Dealloc(&writer); Py_XDECREF(errorHandler); @@ -6139,10 +6078,11 @@ PyObject * PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { Py_ssize_t i, len; + PyObject *repr; char *p; - int kind; + enum PyUnicode_Kind kind; void *data; - _PyBytesWriter writer; + Py_ssize_t expandsize; /* Initial allocation is based on the longest-possible character escape. @@ -6156,62 +6096,71 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(unicode) == -1) + if (PyUnicode_READY(unicode) == -1) { return NULL; - - _PyBytesWriter_Init(&writer); + } len = PyUnicode_GET_LENGTH(unicode); + if (len == 0) { + return PyBytes_FromStringAndSize(NULL, 0); + } + kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); + /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 + bytes, and 1 byte characters 4. */ + expandsize = kind * 2 + 2; + if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) { + return PyErr_NoMemory(); + } + repr = PyBytes_FromStringAndSize(NULL, 2 + expandsize * len + 1); + if (repr == NULL) { + return NULL; + } - p = _PyBytesWriter_Alloc(&writer, len); - if (p == NULL) - goto error; - writer.overallocate = 1; - + p = PyBytes_AS_STRING(repr); for (i = 0; i < len; i++) { Py_UCS4 ch = PyUnicode_READ(kind, data, i); - /* Escape backslashes */ - if (ch == '\\') { - /* -1: subtract 1 preallocated byte */ - p = _PyBytesWriter_Prepare(&writer, p, 2-1); - if (p == NULL) - goto error; - - *p++ = '\\'; - *p++ = (char) ch; - continue; - } - - /* Map 21-bit characters to '\U00xxxxxx' */ - else if (ch >= 0x10000) { - assert(ch <= MAX_UNICODE); + /* U+0000-U+00ff range */ + if (ch < 0x100) { + if (ch >= ' ' && ch < 127) { + if (ch != '\\') { + /* Copy printable US ASCII as-is */ + *p++ = (char) ch; + } + /* Escape backslashes */ + else { + *p++ = '\\'; + *p++ = '\\'; + } + } - p = _PyBytesWriter_Prepare(&writer, p, 10-1); - if (p == NULL) - goto error; + /* Map special whitespace to '\t', \n', '\r' */ + else if (ch == '\t') { + *p++ = '\\'; + *p++ = 't'; + } + else if (ch == '\n') { + *p++ = '\\'; + *p++ = 'n'; + } + else if (ch == '\r') { + *p++ = '\\'; + *p++ = 'r'; + } - *p++ = '\\'; - *p++ = 'U'; - *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; - *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F]; - *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; - *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; - *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; - *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; - *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; - *p++ = Py_hexdigits[ch & 0x0000000F]; - continue; + /* Map non-printable US ASCII and 8-bit characters to '\xHH' */ + else { + *p++ = '\\'; + *p++ = 'x'; + *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; + *p++ = Py_hexdigits[ch & 0x000F]; + } } - - /* Map 16-bit characters to '\uxxxx' */ - if (ch >= 256) { - p = _PyBytesWriter_Prepare(&writer, p, 6-1); - if (p == NULL) - goto error; - + /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */ + else if (ch < 0x10000) { + /* U+0100-U+ffff */ *p++ = '\\'; *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0x000F]; @@ -6219,56 +6168,29 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; *p++ = Py_hexdigits[ch & 0x000F]; } + /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */ + else { - /* Map special whitespace to '\t', \n', '\r' */ - else if (ch == '\t') { - p = _PyBytesWriter_Prepare(&writer, p, 2-1); - if (p == NULL) - goto error; - - *p++ = '\\'; - *p++ = 't'; - } - else if (ch == '\n') { - p = _PyBytesWriter_Prepare(&writer, p, 2-1); - if (p == NULL) - goto error; - - *p++ = '\\'; - *p++ = 'n'; - } - else if (ch == '\r') { - p = _PyBytesWriter_Prepare(&writer, p, 2-1); - if (p == NULL) - goto error; - - *p++ = '\\'; - *p++ = 'r'; - } - - /* Map non-printable US ASCII to '\xhh' */ - else if (ch < ' ' || ch >= 0x7F) { - /* -1: subtract 1 preallocated byte */ - p = _PyBytesWriter_Prepare(&writer, p, 4-1); - if (p == NULL) - goto error; - + /* Make sure that the first two digits are zero */ + assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); *p++ = '\\'; - *p++ = 'x'; - *p++ = Py_hexdigits[(ch >> 4) & 0x000F]; - *p++ = Py_hexdigits[ch & 0x000F]; + *p++ = 'U'; + *p++ = '0'; + *p++ = '0'; + *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F]; + *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F]; + *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F]; + *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F]; + *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F]; + *p++ = Py_hexdigits[ch & 0x0000000F]; } - - /* Copy everything else as-is */ - else - *p++ = (char) ch; } - return _PyBytesWriter_Finish(&writer, p); - -error: - _PyBytesWriter_Dealloc(&writer); - return NULL; + assert(p - PyBytes_AS_STRING(repr) > 0); + if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { + return NULL; + } + return repr; } PyObject * @@ -6277,8 +6199,10 @@ PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, { PyObject *result; PyObject *tmp = PyUnicode_FromUnicode(s, size); - if (tmp == NULL) + if (tmp == NULL) { return NULL; + } + result = PyUnicode_AsUnicodeEscapeString(tmp); Py_DECREF(tmp); return result; @@ -6292,95 +6216,107 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, const char *errors) { const char *starts = s; - Py_ssize_t startinpos; - Py_ssize_t endinpos; _PyUnicodeWriter writer; const char *end; - const char *bs; PyObject *errorHandler = NULL; PyObject *exc = NULL; - if (size == 0) + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); + } /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the length after conversion to the true value. (But decoding error handler might have to resize the string) */ _PyUnicodeWriter_Init(&writer); - writer.min_length = size; + writer.min_length = size; + if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) { + goto onError; + } end = s + size; while (s < end) { - unsigned char c; - Py_UCS4 x; - int i; + unsigned char c = (unsigned char) *s++; + Py_UCS4 ch; int count; + Py_ssize_t startinpos; + Py_ssize_t endinpos; + const char *message; + +#define WRITE_CHAR(ch) \ + do { \ + if (ch <= writer.maxchar) { \ + assert(writer.pos < writer.size); \ + PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \ + } \ + else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \ + goto onError; \ + } \ + } while(0) /* Non-escape characters are interpreted as Unicode ordinals */ - if (*s != '\\') { - x = (unsigned char)*s++; - if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) - goto onError; + if (c != '\\' || s >= end) { + WRITE_CHAR(c); continue; } - startinpos = s-starts; - /* \u-escapes are only interpreted iff the number of leading - backslashes if odd */ - bs = s; - for (;s < end;) { - if (*s != '\\') - break; - x = (unsigned char)*s++; - if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) - goto onError; + c = (unsigned char) *s++; + if (c == 'u') { + count = 4; + message = "truncated \\uXXXX escape"; + } + else if (c == 'U') { + count = 8; + message = "truncated \\UXXXXXXXX escape"; } - if (((s - bs) & 1) == 0 || - s >= end || - (*s != 'u' && *s != 'U')) { + else { + assert(writer.pos < writer.size); + PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\'); + WRITE_CHAR(c); continue; } - writer.pos--; - count = *s=='u' ? 4 : 8; - s++; + startinpos = s - starts - 2; - /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ - for (x = 0, i = 0; i < count; ++i, ++s) { + /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */ + for (ch = 0; count && s < end; ++s, --count) { c = (unsigned char)*s; - if (!Py_ISXDIGIT(c)) { - endinpos = s-starts; - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "rawunicodeescape", "truncated \\uXXXX", - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; - goto nextByte; + ch <<= 4; + if (c >= '0' && c <= '9') { + ch += c - '0'; + } + else if (c >= 'a' && c <= 'f') { + ch += c - ('a' - 10); + } + else if (c >= 'A' && c <= 'F') { + ch += c - ('A' - 10); + } + else { + break; } - x = (x<<4) & ~0xF; - if (c >= '0' && c <= '9') - x += c - '0'; - else if (c >= 'a' && c <= 'f') - x += 10 + c - 'a'; - else - x += 10 + c - 'A'; } - if (x <= MAX_UNICODE) { - if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0) - goto onError; + if (!count) { + if (ch <= MAX_UNICODE) { + WRITE_CHAR(ch); + continue; + } + message = "\\Uxxxxxxxx out of range"; } - else { - endinpos = s-starts; - if (unicode_decode_call_errorhandler_writer( - errors, &errorHandler, - "rawunicodeescape", "\\Uxxxxxxxx out of range", - &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) - goto onError; + + endinpos = s-starts; + writer.min_length = end - s + writer.pos; + if (unicode_decode_call_errorhandler_writer( + errors, &errorHandler, + "rawunicodeescape", message, + &starts, &end, &startinpos, &endinpos, &exc, &s, + &writer)) { + goto onError; + } + if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0) { + goto onError; } - nextByte: - ; + +#undef WRITE_CHAR } Py_XDECREF(errorHandler); Py_XDECREF(exc); @@ -6391,83 +6327,87 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); return NULL; + } PyObject * PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) { + PyObject *repr; char *p; - Py_ssize_t pos; + Py_ssize_t expandsize, pos; int kind; void *data; Py_ssize_t len; - _PyBytesWriter writer; if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; } - if (PyUnicode_READY(unicode) == -1) + if (PyUnicode_READY(unicode) == -1) { return NULL; - - _PyBytesWriter_Init(&writer); - + } kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); len = PyUnicode_GET_LENGTH(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + return PyBytes_FromStringAndSize(data, len); + } - p = _PyBytesWriter_Alloc(&writer, len); - if (p == NULL) - goto error; - writer.overallocate = 1; + /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6 + bytes, and 1 byte characters 4. */ + expandsize = kind * 2 + 2; + if (len > PY_SSIZE_T_MAX / expandsize) { + return PyErr_NoMemory(); + } + repr = PyBytes_FromStringAndSize(NULL, expandsize * len); + if (repr == NULL) { + return NULL; + } + if (len == 0) { + return repr; + } + + p = PyBytes_AS_STRING(repr); for (pos = 0; pos < len; pos++) { Py_UCS4 ch = PyUnicode_READ(kind, data, pos); - /* Map 32-bit characters to '\Uxxxxxxxx' */ - if (ch >= 0x10000) { - assert(ch <= MAX_UNICODE); - - /* -1: subtract 1 preallocated byte */ - p = _PyBytesWriter_Prepare(&writer, p, 10-1); - if (p == NULL) - goto error; + /* U+0000-U+00ff range: Copy 8-bit characters as-is */ + if (ch < 0x100) { + *p++ = (char) ch; + } + /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */ + else if (ch < 0x10000) { *p++ = '\\'; - *p++ = 'U'; - *p++ = Py_hexdigits[(ch >> 28) & 0xf]; - *p++ = Py_hexdigits[(ch >> 24) & 0xf]; - *p++ = Py_hexdigits[(ch >> 20) & 0xf]; - *p++ = Py_hexdigits[(ch >> 16) & 0xf]; + *p++ = 'u'; *p++ = Py_hexdigits[(ch >> 12) & 0xf]; *p++ = Py_hexdigits[(ch >> 8) & 0xf]; *p++ = Py_hexdigits[(ch >> 4) & 0xf]; *p++ = Py_hexdigits[ch & 15]; } - /* Map 16-bit characters to '\uxxxx' */ - else if (ch >= 256) { - /* -1: subtract 1 preallocated byte */ - p = _PyBytesWriter_Prepare(&writer, p, 6-1); - if (p == NULL) - goto error; - + /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */ + else { + assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff); *p++ = '\\'; - *p++ = 'u'; + *p++ = 'U'; + *p++ = '0'; + *p++ = '0'; + *p++ = Py_hexdigits[(ch >> 20) & 0xf]; + *p++ = Py_hexdigits[(ch >> 16) & 0xf]; *p++ = Py_hexdigits[(ch >> 12) & 0xf]; *p++ = Py_hexdigits[(ch >> 8) & 0xf]; *p++ = Py_hexdigits[(ch >> 4) & 0xf]; *p++ = Py_hexdigits[ch & 15]; } - /* Copy everything else as-is */ - else - *p++ = (char) ch; } - return _PyBytesWriter_Finish(&writer, p); - -error: - _PyBytesWriter_Dealloc(&writer); - return NULL; + assert(p > PyBytes_AS_STRING(repr)); + if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) { + return NULL; + } + return repr; } PyObject * |