From 2ec8063cc960d32e244dc6a27567f66a447bbda3 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 14 Oct 2015 13:32:13 +0200 Subject: Modify _PyBytes_DecodeEscapeRecode() to use _PyBytesAPI * Don't overallocate by 400% when recode is needed: only overallocate on demand using _PyBytesWriter. * Use _PyLong_DigitValue to convert hexadecimal digit to int * Create _PyBytes_DecodeEscapeRecode() subfunction --- Include/longobject.h | 3 +- Objects/bytesobject.c | 131 ++++++++++++++++++++++++++++---------------------- 2 files changed, 75 insertions(+), 59 deletions(-) diff --git a/Include/longobject.h b/Include/longobject.h index 9574f05..eaf7a7e 100644 --- a/Include/longobject.h +++ b/Include/longobject.h @@ -65,7 +65,8 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void); # error "void* different in size from int, long and long long" #endif /* SIZEOF_VOID_P */ -/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */ +/* Used by Python/mystrtoul.c, _PyBytes_FromHex(), + _PyBytes_DecodeEscapeRecode(), etc. */ #ifndef Py_LIMITED_API PyAPI_DATA(unsigned char) _PyLong_DigitValue[256]; #endif diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 8810647..556b480 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1068,6 +1068,42 @@ bytes_dealloc(PyObject *op) the string is UTF-8 encoded and should be re-encoded in the specified encoding. */ +static char * +_PyBytes_DecodeEscapeRecode(const char **s, const char *end, + const char *errors, const char *recode_encoding, + _PyBytesWriter *writer, char *p) +{ + PyObject *u, *w; + const char* t; + + t = *s; + /* Decode non-ASCII bytes as UTF-8. */ + while (t < end && (*t & 0x80)) + t++; + u = PyUnicode_DecodeUTF8(*s, t - *s, errors); + if (u == NULL) + return NULL; + + /* Recode them in target encoding. */ + w = PyUnicode_AsEncodedString(u, recode_encoding, errors); + Py_DECREF(u); + if (w == NULL) + return NULL; + assert(PyBytes_Check(w)); + + /* Append bytes to output buffer. 
*/ + writer->min_size--; /* subtract 1 preallocated byte */ + p = _PyBytesWriter_WriteBytes(writer, p, + PyBytes_AS_STRING(w), + PyBytes_GET_SIZE(w)); + Py_DECREF(w); + if (p == NULL) + return NULL; + + *s = t; + return p; +} + PyObject *PyBytes_DecodeEscape(const char *s, Py_ssize_t len, const char *errors, @@ -1075,54 +1111,42 @@ PyObject *PyBytes_DecodeEscape(const char *s, const char *recode_encoding) { int c; - char *p, *buf; + char *p; const char *end; - PyObject *v; - Py_ssize_t newlen = recode_encoding ? 4*len:len; - v = PyBytes_FromStringAndSize((char *)NULL, newlen); - if (v == NULL) + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) return NULL; - p = buf = PyBytes_AsString(v); + writer.overallocate = 1; + end = s + len; while (s < end) { if (*s != '\\') { non_esc: - if (recode_encoding && (*s & 0x80)) { - PyObject *u, *w; - char *r; - const char* t; - Py_ssize_t rn; - t = s; - /* Decode non-ASCII bytes as UTF-8. */ - while (t < end && (*t & 0x80)) t++; - u = PyUnicode_DecodeUTF8(s, t - s, errors); - if(!u) goto failed; - - /* Recode them in target encoding. */ - w = PyUnicode_AsEncodedString( - u, recode_encoding, errors); - Py_DECREF(u); - if (!w) goto failed; - - /* Append bytes to output buffer. */ - assert(PyBytes_Check(w)); - r = PyBytes_AS_STRING(w); - rn = PyBytes_GET_SIZE(w); - Py_MEMCPY(p, r, rn); - p += rn; - Py_DECREF(w); - s = t; - } else { + if (!(recode_encoding && (*s & 0x80))) { *p++ = *s++; } + else { + /* non-ASCII character and need to recode */ + p = _PyBytes_DecodeEscapeRecode(&s, end, + errors, recode_encoding, + &writer, p); + if (p == NULL) + goto failed; + } continue; } + s++; - if (s==end) { + if (s == end) { PyErr_SetString(PyExc_ValueError, "Trailing \\ in string"); goto failed; } + switch (*s++) { /* XXX This assumes ASCII! 
*/ case '\n': break; @@ -1147,28 +1171,18 @@ PyObject *PyBytes_DecodeEscape(const char *s, *p++ = c; break; case 'x': - if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) { - unsigned int x = 0; - c = Py_CHARMASK(*s); - s++; - if (Py_ISDIGIT(c)) - x = c - '0'; - else if (Py_ISLOWER(c)) - x = 10 + c - 'a'; - else - x = 10 + c - 'A'; - x = x << 4; - c = Py_CHARMASK(*s); - s++; - if (Py_ISDIGIT(c)) - x += c - '0'; - else if (Py_ISLOWER(c)) - x += 10 + c - 'a'; - else - x += 10 + c - 'A'; - *p++ = x; - break; + if (s+1 < end) { + int digit1, digit2; + digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])]; + digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])]; + if (digit1 < 16 && digit2 < 16) { + *p++ = (unsigned char)((digit1 << 4) + digit2); + s += 2; + break; + } } + /* invalid hexadecimal digits */ + if (!errors || strcmp(errors, "strict") == 0) { PyErr_Format(PyExc_ValueError, "invalid \\x escape at position %d", @@ -1190,6 +1204,7 @@ PyObject *PyBytes_DecodeEscape(const char *s, if (s < end && Py_ISXDIGIT(s[0])) s++; /* and a hexdigit */ break; + default: *p++ = '\\'; s--; @@ -1197,11 +1212,11 @@ PyObject *PyBytes_DecodeEscape(const char *s, UTF-8 bytes may follow. */ } } - if (p-buf < newlen) - _PyBytes_Resize(&v, p - buf); - return v; + + return _PyBytesWriter_Finish(&writer, p); + failed: - Py_DECREF(v); + _PyBytesWriter_Dealloc(&writer); return NULL; } -- cgit v0.12