diff options
Diffstat (limited to 'Objects/bytesobject.c')
-rw-r--r-- | Objects/bytesobject.c | 1384 |
1 files changed, 919 insertions, 465 deletions
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 495c3eb..701ae9d 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -30,6 +30,10 @@ static PyBytesObject *nullstring; */ #define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1) +/* Forward declaration */ +Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer, + char *str); + /* For PyBytes_FromString(), the parameter `str' points to a null-terminated string containing exactly `size' bytes. @@ -174,190 +178,184 @@ PyBytes_FromString(const char *str) PyObject * PyBytes_FromFormatV(const char *format, va_list vargs) { - va_list count; - Py_ssize_t n = 0; - const char* f; char *s; - PyObject* string; + const char *f; + const char *p; + Py_ssize_t prec; + int longflag; + int size_tflag; + /* Longest 64-bit formatted numbers: + - "18446744073709551615\0" (21 bytes) + - "-9223372036854775808\0" (21 bytes) + Decimal takes the most space (it isn't enough for octal.) + + Longest 64-bit pointer representation: + "0xffffffffffffffff\0" (19 bytes). */ + char buffer[21]; + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + + s = _PyBytesWriter_Alloc(&writer, strlen(format)); + if (s == NULL) + return NULL; + writer.overallocate = 1; + +#define WRITE_BYTES(str) \ + do { \ + s = _PyBytesWriter_WriteBytes(&writer, s, (str), strlen(str)); \ + if (s == NULL) \ + goto error; \ + } while (0) - Py_VA_COPY(count, vargs); - /* step 1: figure out how large a buffer we need */ for (f = format; *f; f++) { - if (*f == '%') { - const char* p = f; - while (*++f && *f != '%' && !Py_ISALPHA(*f)) - ; - - /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since - * they don't affect the amount of space we reserve. - */ - if ((*f == 'l' || *f == 'z') && - (f[1] == 'd' || f[1] == 'u')) - ++f; - - switch (*f) { - case 'c': - { - int c = va_arg(count, int); - if (c < 0 || c > 255) { - PyErr_SetString(PyExc_OverflowError, - "PyBytes_FromFormatV(): %c format " - "expects an integer in range [0; 255]"); - return NULL; - } - n++; - break; + if (*f != '%') { + *s++ = *f; + continue; + } + + p = f++; + + /* ignore the width (ex: 10 in "%10s") */ + while (Py_ISDIGIT(*f)) + f++; + + /* parse the precision (ex: 10 in "%.10s") */ + prec = 0; + if (*f == '.') { + f++; + for (; Py_ISDIGIT(*f); f++) { + prec = (prec * 10) + (*f - '0'); } - case '%': - n++; - break; - case 'd': case 'u': case 'i': case 'x': - (void) va_arg(count, int); - /* 20 bytes is enough to hold a 64-bit - integer. Decimal takes the most space. - This isn't enough for octal. */ - n += 20; - break; - case 's': - s = va_arg(count, char*); - n += strlen(s); - break; - case 'p': - (void) va_arg(count, int); - /* maximum 64-bit pointer representation: - * 0xffffffffffffffff - * so 19 characters is enough. - * XXX I count 18 -- what's the extra for? - */ - n += 19; - break; - default: - /* if we stumble upon an unknown - formatting code, copy the rest of - the format string to the output - string. (we cannot just skip the - code, since there's no way to know - what's in the argument list) */ - n += strlen(p); - goto expand; + } + + while (*f && *f != '%' && !Py_ISALPHA(*f)) + f++; + + /* handle the long flag ('l'), but only for %ld and %lu. + others can be added when necessary. */ + longflag = 0; + if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { + longflag = 1; + ++f; + } + + /* handle the size_t flag ('z'). */ + size_tflag = 0; + if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { + size_tflag = 1; + ++f; + } + + /* substract bytes preallocated for the format string + (ex: 2 for "%s") */ + writer.min_size -= (f - p + 1); + + switch (*f) { + case 'c': + { + int c = va_arg(vargs, int); + if (c < 0 || c > 255) { + PyErr_SetString(PyExc_OverflowError, + "PyBytes_FromFormatV(): %c format " + "expects an integer in range [0; 255]"); + goto error; } - } else - n++; - } - expand: - /* step 2: fill the buffer */ - /* Since we've analyzed how much space we need for the worst case, - use sprintf directly instead of the slower PyOS_snprintf. */ - string = PyBytes_FromStringAndSize(NULL, n); - if (!string) - return NULL; + writer.min_size++; + *s++ = (unsigned char)c; + break; + } - s = PyBytes_AsString(string); + case 'd': + if (longflag) + sprintf(buffer, "%ld", va_arg(vargs, long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "d", + va_arg(vargs, Py_ssize_t)); + else + sprintf(buffer, "%d", va_arg(vargs, int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; - for (f = format; *f; f++) { - if (*f == '%') { - const char* p = f++; + case 'u': + if (longflag) + sprintf(buffer, "%lu", + va_arg(vargs, unsigned long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", + va_arg(vargs, size_t)); + else + sprintf(buffer, "%u", + va_arg(vargs, unsigned int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; + + case 'i': + sprintf(buffer, "%i", va_arg(vargs, int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; + + case 'x': + sprintf(buffer, "%x", va_arg(vargs, int)); + assert(strlen(buffer) < sizeof(buffer)); + WRITE_BYTES(buffer); + break; + + case 's': + { Py_ssize_t i; - int longflag = 0; - int size_tflag = 0; - /* parse the width.precision part (we're only - interested in the precision value, if any) */ - n = 0; - while (Py_ISDIGIT(*f)) - n = (n*10) + *f++ - '0'; - if (*f == '.') { - f++; - n = 0; - while (Py_ISDIGIT(*f)) - n = (n*10) + *f++ - '0'; - } - while (*f && *f != '%' && !Py_ISALPHA(*f)) - f++; - /* handle the long flag, but only for %ld and %lu. - others can be added when necessary. */ - if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { - longflag = 1; - ++f; - } - /* handle the size_t flag. */ - if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { - size_tflag = 1; - ++f; - } - switch (*f) { - case 'c': - { - int c = va_arg(vargs, int); - /* c has been checked for overflow in the first step */ - *s++ = (unsigned char)c; - break; + p = va_arg(vargs, const char*); + i = strlen(p); + if (prec > 0 && i > prec) + i = prec; + s = _PyBytesWriter_WriteBytes(&writer, s, p, i); + if (s == NULL) + goto error; + break; + } + + case 'p': + sprintf(buffer, "%p", va_arg(vargs, void*)); + assert(strlen(buffer) < sizeof(buffer)); + /* %p is ill-defined: ensure leading 0x. */ + if (buffer[1] == 'X') + buffer[1] = 'x'; + else if (buffer[1] != 'x') { + memmove(buffer+2, buffer, strlen(buffer)+1); + buffer[0] = '0'; + buffer[1] = 'x'; } - case 'd': - if (longflag) - sprintf(s, "%ld", va_arg(vargs, long)); - else if (size_tflag) - sprintf(s, "%" PY_FORMAT_SIZE_T "d", - va_arg(vargs, Py_ssize_t)); - else - sprintf(s, "%d", va_arg(vargs, int)); - s += strlen(s); - break; - case 'u': - if (longflag) - sprintf(s, "%lu", - va_arg(vargs, unsigned long)); - else if (size_tflag) - sprintf(s, "%" PY_FORMAT_SIZE_T "u", - va_arg(vargs, size_t)); - else - sprintf(s, "%u", - va_arg(vargs, unsigned int)); - s += strlen(s); - break; - case 'i': - sprintf(s, "%i", va_arg(vargs, int)); - s += strlen(s); - break; - case 'x': - sprintf(s, "%x", va_arg(vargs, int)); - s += strlen(s); - break; - case 's': - p = va_arg(vargs, char*); - i = strlen(p); - if (n > 0 && i > n) - i = n; - Py_MEMCPY(s, p, i); - s += i; - break; - case 'p': - sprintf(s, "%p", va_arg(vargs, void*)); - /* %p is ill-defined: ensure leading 0x. */ - if (s[1] == 'X') - s[1] = 'x'; - else if (s[1] != 'x') { - memmove(s+2, s, strlen(s)+1); - s[0] = '0'; - s[1] = 'x'; - } - s += strlen(s); - break; - case '%': - *s++ = '%'; - break; - default: - strcpy(s, p); - s += strlen(s); - goto end; + WRITE_BYTES(buffer); + break; + + case '%': + writer.min_size++; + *s++ = '%'; + break; + + default: + if (*f == 0) { + /* fix min_size if we reached the end of the format string */ + writer.min_size++; } - } else - *s++ = *f; + + /* invalid format string: copy unformatted string and exit */ + WRITE_BYTES(p); + return _PyBytesWriter_Finish(&writer, s); + } } - end: - _PyBytes_Resize(&string, s - PyBytes_AS_STRING(string)); - return string; +#undef WRITE_BYTES + + return _PyBytesWriter_Finish(&writer, s); + + error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } PyObject * @@ -409,12 +407,14 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) /* Returns a new reference to a PyBytes object, or NULL on failure. */ -static PyObject * -formatfloat(PyObject *v, int flags, int prec, int type) +static char* +formatfloat(PyObject *v, int flags, int prec, int type, + PyObject **p_result, _PyBytesWriter *writer, char *str) { char *p; PyObject *result; double x; + size_t len; x = PyFloat_AsDouble(v); if (x == -1.0 && PyErr_Occurred()) { @@ -431,9 +431,21 @@ formatfloat(PyObject *v, int flags, int prec, int type) if (p == NULL) return NULL; - result = PyBytes_FromStringAndSize(p, strlen(p)); + + len = strlen(p); + if (writer != NULL) { + str = _PyBytesWriter_Prepare(writer, str, len); + if (str == NULL) + return NULL; + Py_MEMCPY(str, p, len); + str += len; + return str; + } + + result = PyBytes_FromStringAndSize(p, len); PyMem_Free(p); - return result; + *p_result = result; + return str; } static PyObject * @@ -557,36 +569,36 @@ format_obj(PyObject *v, const char **pbuf, Py_ssize_t *plen) return NULL; } -/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) - - FORMATBUFLEN is the length of the buffer in which the ints & - chars are formatted. XXX This is a magic number. Each formatting - routine does bounds checking to ensure no overflow, but a better - solution may be to malloc a buffer of appropriate size for each - format. For now, the current solution is sufficient. -*/ -#define FORMATBUFLEN (size_t)120 +/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) */ PyObject * -_PyBytes_Format(PyObject *format, PyObject *args) +_PyBytes_FormatEx(const char *format, Py_ssize_t format_len, + PyObject *args, int use_bytearray) { - char *fmt, *res; + const char *fmt; + char *res; Py_ssize_t arglen, argidx; - Py_ssize_t reslen, rescnt, fmtcnt; + Py_ssize_t fmtcnt; int args_owned = 0; - PyObject *result; PyObject *dict = NULL; - if (format == NULL || !PyBytes_Check(format) || args == NULL) { + _PyBytesWriter writer; + + if (args == NULL) { PyErr_BadInternalCall(); return NULL; } - fmt = PyBytes_AS_STRING(format); - fmtcnt = PyBytes_GET_SIZE(format); - reslen = rescnt = fmtcnt + 100; - result = PyBytes_FromStringAndSize((char *)NULL, reslen); - if (result == NULL) + fmt = format; + fmtcnt = format_len; + + _PyBytesWriter_Init(&writer); + writer.use_bytearray = use_bytearray; + + res = _PyBytesWriter_Alloc(&writer, fmtcnt); + if (res == NULL) return NULL; - res = PyBytes_AsString(result); + if (!use_bytearray) + writer.overallocate = 1; + if (PyTuple_Check(args)) { arglen = PyTuple_GET_SIZE(args); argidx = 0; @@ -600,18 +612,23 @@ _PyBytes_Format(PyObject *format, PyObject *args) !PyByteArray_Check(args)) { dict = args; } + while (--fmtcnt >= 0) { if (*fmt != '%') { - if (--rescnt < 0) { - rescnt = fmtcnt + 100; - reslen += rescnt; - if (_PyBytes_Resize(&result, reslen)) - return NULL; - res = PyBytes_AS_STRING(result) - + reslen - rescnt; - --rescnt; - } - *res++ = *fmt++; + Py_ssize_t len; + char *pos; + + pos = strchr(fmt + 1, '%'); + if (pos != NULL) + len = pos - fmt; + else + len = format_len - (fmt - format); + assert(len != 0); + + Py_MEMCPY(res, fmt, len); + res += len; + fmt += len; + fmtcnt -= (len - 1); } else { /* Got a format specifier */ @@ -626,10 +643,14 @@ _PyBytes_Format(PyObject *format, PyObject *args) int sign; Py_ssize_t len = 0; char onechar; /* For byte_converter() */ + Py_ssize_t alloc; +#ifdef Py_DEBUG + char *before; +#endif fmt++; if (*fmt == '(') { - char *keystart; + const char *keystart; Py_ssize_t keylen; PyObject *key; int pcount = 1; @@ -673,6 +694,8 @@ _PyBytes_Format(PyObject *format, PyObject *args) arglen = -1; argidx = -2; } + + /* Parse flags. Example: "%+i" => flags=F_SIGN. */ while (--fmtcnt >= 0) { switch (c = *fmt++) { case '-': flags |= F_LJUST; continue; @@ -683,6 +706,8 @@ _PyBytes_Format(PyObject *format, PyObject *args) } break; } + + /* Parse width. Example: "%10s" => width=10 */ if (c == '*') { v = getnextarg(args, arglen, &argidx); if (v == NULL) @@ -717,6 +742,8 @@ _PyBytes_Format(PyObject *format, PyObject *args) width = width*10 + (c - '0'); } } + + /* Parse precision. Example: "%.3f" => prec=3 */ if (c == '.') { prec = 0; if (--fmtcnt >= 0) @@ -771,13 +798,19 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (v == NULL) goto error; } + + if (fmtcnt < 0) { + /* last writer: disable writer overallocation */ + writer.overallocate = 0; + } + sign = 0; fill = ' '; switch (c) { case '%': - pbuf = "%"; - len = 1; - break; + *res++ = '%'; + continue; + case 'r': // %r is only for 2/3 code; 3 only code should use %a case 'a': @@ -790,6 +823,7 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (prec >= 0 && len > prec) len = prec; break; + case 's': // %s is only for 2/3 code; 3 only code should use %b case 'b': @@ -799,12 +833,49 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (prec >= 0 && len > prec) len = prec; break; + case 'i': case 'd': case 'u': case 'o': case 'x': case 'X': + if (PyLong_CheckExact(v) + && width == -1 && prec == -1 + && !(flags & (F_SIGN | F_BLANK)) + && c != 'X') + { + /* Fast path */ + int alternate = flags & F_ALT; + int base; + + switch(c) + { + default: + assert(0 && "'type' not in [diuoxX]"); + case 'd': + case 'i': + case 'u': + base = 10; + break; + case 'o': + base = 8; + break; + case 'x': + case 'X': + base = 16; + break; + } + + /* Fast path */ + writer.min_size -= 2; /* size preallocated for "%d" */ + res = _PyLong_FormatBytesWriter(&writer, res, + v, base, alternate); + if (res == NULL) + goto error; + continue; + } + temp = formatlong(v, flags, prec, c); if (!temp) goto error; @@ -815,14 +886,25 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (flags & F_ZERO) fill = '0'; break; + case 'e': case 'E': case 'f': case 'F': case 'g': case 'G': - temp = formatfloat(v, flags, prec, c); - if (temp == NULL) + if (width == -1 && prec == -1 + && !(flags & (F_SIGN | F_BLANK))) + { + /* Fast path */ + writer.min_size -= 2; /* size preallocated for "%f" */ + res = formatfloat(v, flags, prec, c, NULL, &writer, res); + if (res == NULL) + goto error; + continue; + } + + if (!formatfloat(v, flags, prec, c, &temp, NULL, res)) goto error; pbuf = PyBytes_AS_STRING(temp); len = PyBytes_GET_SIZE(temp); @@ -830,21 +912,28 @@ _PyBytes_Format(PyObject *format, PyObject *args) if (flags & F_ZERO) fill = '0'; break; + case 'c': pbuf = &onechar; len = byte_converter(v, &onechar); if (!len) goto error; + if (width == -1) { + /* Fast path */ + *res++ = onechar; + continue; + } break; + default: PyErr_Format(PyExc_ValueError, "unsupported format character '%c' (0x%x) " "at index %zd", c, c, - (Py_ssize_t)(fmt - 1 - - PyBytes_AsString(format))); + (Py_ssize_t)(fmt - 1 - format)); goto error; } + if (sign) { if (*pbuf == '-' || *pbuf == '+') { sign = *pbuf++; @@ -859,29 +948,31 @@ _PyBytes_Format(PyObject *format, PyObject *args) } if (width < len) width = len; - if (rescnt - (sign != 0) < width) { - reslen -= rescnt; - rescnt = width + fmtcnt + 100; - reslen += rescnt; - if (reslen < 0) { - Py_DECREF(result); - Py_XDECREF(temp); - return PyErr_NoMemory(); - } - if (_PyBytes_Resize(&result, reslen)) { - Py_XDECREF(temp); - return NULL; - } - res = PyBytes_AS_STRING(result) - + reslen - rescnt; + + alloc = width; + if (sign != 0 && len == width) + alloc++; + /* 2: size preallocated for %s */ + if (alloc > 2) { + res = _PyBytesWriter_Prepare(&writer, res, alloc - 2); + if (res == NULL) + goto error; } +#ifdef Py_DEBUG + before = res; +#endif + + /* Write the sign if needed */ if (sign) { if (fill != ' ') *res++ = sign; - rescnt--; if (width > len) width--; } + + /* Write the numeric prefix for "x", "X" and "o" formats + if the alternate form is used. + For example, write "0x" for the "%#x" format. */ if ((flags & F_ALT) && (c == 'x' || c == 'X')) { assert(pbuf[0] == '0'); assert(pbuf[1] == c); @@ -889,18 +980,21 @@ _PyBytes_Format(PyObject *format, PyObject *args) *res++ = *pbuf++; *res++ = *pbuf++; } - rescnt -= 2; width -= 2; if (width < 0) width = 0; len -= 2; } + + /* Pad left with the fill character if needed */ if (width > len && !(flags & F_LJUST)) { - do { - --rescnt; - *res++ = fill; - } while (--width > len); + memset(res, fill, width - len); + res += (width - len); + width = len; } + + /* If padding with spaces: write sign if needed and/or numeric + prefix if the alternate form is used */ if (fill == ' ') { if (sign) *res++ = sign; @@ -912,13 +1006,17 @@ _PyBytes_Format(PyObject *format, PyObject *args) *res++ = *pbuf++; } } + + /* Copy bytes */ Py_MEMCPY(res, pbuf, len); res += len; - rescnt -= len; - while (--width >= len) { - --rescnt; - *res++ = ' '; + + /* Pad right with the fill character if needed */ + if (width > len) { + memset(res, ' ', width - len); + res += (width - len); } + if (dict && (argidx < arglen) && c != '%') { PyErr_SetString(PyExc_TypeError, "not all arguments converted during bytes formatting"); @@ -926,22 +1024,31 @@ _PyBytes_Format(PyObject *format, PyObject *args) goto error; } Py_XDECREF(temp); + +#ifdef Py_DEBUG + /* check that we computed the exact size for this write */ + assert((res - before) == alloc); +#endif } /* '%' */ + + /* If overallocation was disabled, ensure that it was the last + write. Otherwise, we missed an optimization */ + assert(writer.overallocate || fmtcnt < 0 || use_bytearray); } /* until end */ + if (argidx < arglen && !dict) { PyErr_SetString(PyExc_TypeError, "not all arguments converted during bytes formatting"); goto error; } + if (args_owned) { Py_DECREF(args); } - if (_PyBytes_Resize(&result, reslen - rescnt)) - return NULL; - return result; + return _PyBytesWriter_Finish(&writer, res); error: - Py_DECREF(result); + _PyBytesWriter_Dealloc(&writer); if (args_owned) { Py_DECREF(args); } @@ -961,6 +1068,42 @@ bytes_dealloc(PyObject *op) the string is UTF-8 encoded and should be re-encoded in the specified encoding. */ +static char * +_PyBytes_DecodeEscapeRecode(const char **s, const char *end, + const char *errors, const char *recode_encoding, + _PyBytesWriter *writer, char *p) +{ + PyObject *u, *w; + const char* t; + + t = *s; + /* Decode non-ASCII bytes as UTF-8. */ + while (t < end && (*t & 0x80)) + t++; + u = PyUnicode_DecodeUTF8(*s, t - *s, errors); + if (u == NULL) + return NULL; + + /* Recode them in target encoding. */ + w = PyUnicode_AsEncodedString(u, recode_encoding, errors); + Py_DECREF(u); + if (w == NULL) + return NULL; + assert(PyBytes_Check(w)); + + /* Append bytes to output buffer. */ + writer->min_size--; /* substract 1 preallocated byte */ + p = _PyBytesWriter_WriteBytes(writer, p, + PyBytes_AS_STRING(w), + PyBytes_GET_SIZE(w)); + Py_DECREF(w); + if (p == NULL) + return NULL; + + *s = t; + return p; +} + PyObject *PyBytes_DecodeEscape(const char *s, Py_ssize_t len, const char *errors, @@ -968,54 +1111,42 @@ PyObject *PyBytes_DecodeEscape(const char *s, const char *recode_encoding) { int c; - char *p, *buf; + char *p; const char *end; - PyObject *v; - Py_ssize_t newlen = recode_encoding ? 4*len:len; - v = PyBytes_FromStringAndSize((char *)NULL, newlen); - if (v == NULL) + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + + p = _PyBytesWriter_Alloc(&writer, len); + if (p == NULL) return NULL; - p = buf = PyBytes_AsString(v); + writer.overallocate = 1; + end = s + len; while (s < end) { if (*s != '\\') { non_esc: - if (recode_encoding && (*s & 0x80)) { - PyObject *u, *w; - char *r; - const char* t; - Py_ssize_t rn; - t = s; - /* Decode non-ASCII bytes as UTF-8. */ - while (t < end && (*t & 0x80)) t++; - u = PyUnicode_DecodeUTF8(s, t - s, errors); - if(!u) goto failed; - - /* Recode them in target encoding. */ - w = PyUnicode_AsEncodedString( - u, recode_encoding, errors); - Py_DECREF(u); - if (!w) goto failed; - - /* Append bytes to output buffer. */ - assert(PyBytes_Check(w)); - r = PyBytes_AS_STRING(w); - rn = PyBytes_GET_SIZE(w); - Py_MEMCPY(p, r, rn); - p += rn; - Py_DECREF(w); - s = t; - } else { + if (!(recode_encoding && (*s & 0x80))) { *p++ = *s++; } + else { + /* non-ASCII character and need to recode */ + p = _PyBytes_DecodeEscapeRecode(&s, end, + errors, recode_encoding, + &writer, p); + if (p == NULL) + goto failed; + } continue; } + s++; - if (s==end) { + if (s == end) { PyErr_SetString(PyExc_ValueError, "Trailing \\ in string"); goto failed; } + switch (*s++) { /* XXX This assumes ASCII! */ case '\n': break; @@ -1040,28 +1171,18 @@ PyObject *PyBytes_DecodeEscape(const char *s, *p++ = c; break; case 'x': - if (s+1 < end && Py_ISXDIGIT(s[0]) && Py_ISXDIGIT(s[1])) { - unsigned int x = 0; - c = Py_CHARMASK(*s); - s++; - if (Py_ISDIGIT(c)) - x = c - '0'; - else if (Py_ISLOWER(c)) - x = 10 + c - 'a'; - else - x = 10 + c - 'A'; - x = x << 4; - c = Py_CHARMASK(*s); - s++; - if (Py_ISDIGIT(c)) - x += c - '0'; - else if (Py_ISLOWER(c)) - x += 10 + c - 'a'; - else - x += 10 + c - 'A'; - *p++ = x; - break; + if (s+1 < end) { + int digit1, digit2; + digit1 = _PyLong_DigitValue[Py_CHARMASK(s[0])]; + digit2 = _PyLong_DigitValue[Py_CHARMASK(s[1])]; + if (digit1 < 16 && digit2 < 16) { + *p++ = (unsigned char)((digit1 << 4) + digit2); + s += 2; + break; + } } + /* invalid hexadecimal digits */ + if (!errors || strcmp(errors, "strict") == 0) { PyErr_Format(PyExc_ValueError, "invalid \\x escape at position %d", @@ -1083,6 +1204,7 @@ PyObject *PyBytes_DecodeEscape(const char *s, if (s < end && Py_ISXDIGIT(s[0])) s++; /* and a hexdigit */ break; + default: *p++ = '\\'; s--; @@ -1090,11 +1212,11 @@ PyObject *PyBytes_DecodeEscape(const char *s, UTF-8 bytes may follow. */ } } - if (p-buf < newlen) - _PyBytes_Resize(&v, p - buf); - return v; + + return _PyBytesWriter_Finish(&writer, p); + failed: - Py_DECREF(v); + _PyBytesWriter_Dealloc(&writer); return NULL; } @@ -1815,16 +1937,15 @@ bytes_find_internal(PyBytesObject *self, PyObject *args, int dir) ADJUST_INDICES(start, end, len); if (end - start < sub_len) res = -1; - else if (sub_len == 1 -#ifndef HAVE_MEMRCHR - && dir > 0 -#endif - ) { - unsigned char needle = *sub; - int mode = (dir > 0) ? FAST_SEARCH : FAST_RSEARCH; - res = stringlib_fastsearch_memchr_1char( - PyBytes_AS_STRING(self) + start, end - start, - needle, needle, mode); + else if (sub_len == 1) { + if (dir > 0) + res = stringlib_find_char( + PyBytes_AS_STRING(self) + start, end - start, + *sub); + else + res = stringlib_rfind_char( + PyBytes_AS_STRING(self) + start, end - start, + *sub); if (res >= 0) res += start; } @@ -2343,17 +2464,27 @@ replace_interleave(PyBytesObject *self, self_s = PyBytes_AS_STRING(self); result_s = PyBytes_AS_STRING(result); - /* TODO: special case single character, which doesn't need memcpy */ - - /* Lay the first one down (guaranteed this will occur) */ - Py_MEMCPY(result_s, to_s, to_len); - result_s += to_len; - count -= 1; - - for (i=0; i<count; i++) { - *result_s++ = *self_s++; + if (to_len > 1) { + /* Lay the first one down (guaranteed this will occur) */ Py_MEMCPY(result_s, to_s, to_len); result_s += to_len; + count -= 1; + + for (i = 0; i < count; i++) { + *result_s++ = *self_s++; + Py_MEMCPY(result_s, to_s, to_len); + result_s += to_len; + } + } + else { + result_s[0] = to_s[0]; + result_s += to_len; + count -= 1; + for (i = 0; i < count; i++) { + *result_s++ = *self_s++; + result_s[0] = to_s[0]; + result_s += to_len; + } } /* Copy the rest of the original string */ @@ -2975,22 +3106,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends) ); } -static int -hex_digit_to_int(Py_UCS4 c) -{ - if (c >= 128) - return -1; - if (Py_ISDIGIT(c)) - return c - '0'; - else { - if (Py_ISUPPER(c)) - c = Py_TOLOWER(c); - if (c >= 'a' && c <= 'f') - return c - 'a' + 10; - } - return -1; -} - /*[clinic input] @classmethod bytes.fromhex @@ -3008,47 +3123,83 @@ static PyObject * bytes_fromhex_impl(PyTypeObject *type, PyObject *string) /*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/ { - PyObject *newstring; + return _PyBytes_FromHex(string, 0); +} + +PyObject* +_PyBytes_FromHex(PyObject *string, int use_bytearray) +{ char *buf; - Py_ssize_t hexlen, byteslen, i, j; - int top, bot; - void *data; - unsigned int kind; + Py_ssize_t hexlen, invalid_char; + unsigned int top, bot; + Py_UCS1 *str, *end; + _PyBytesWriter writer; + + _PyBytesWriter_Init(&writer); + writer.use_bytearray = use_bytearray; assert(PyUnicode_Check(string)); if (PyUnicode_READY(string)) return NULL; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); hexlen = PyUnicode_GET_LENGTH(string); - byteslen = hexlen/2; /* This overestimates if there are spaces */ - newstring = PyBytes_FromStringAndSize(NULL, byteslen); - if (!newstring) + if (!PyUnicode_IS_ASCII(string)) { + void *data = PyUnicode_DATA(string); + unsigned int kind = PyUnicode_KIND(string); + Py_ssize_t i; + + /* search for the first non-ASCII character */ + for (i = 0; i < hexlen; i++) { + if (PyUnicode_READ(kind, data, i) >= 128) + break; + } + invalid_char = i; + goto error; + } + + assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND); + str = PyUnicode_1BYTE_DATA(string); + + /* This overestimates if there are spaces */ + buf = _PyBytesWriter_Alloc(&writer, hexlen / 2); + if (buf == NULL) return NULL; - buf = PyBytes_AS_STRING(newstring); - for (i = j = 0; i < hexlen; i += 2) { + + end = str + hexlen; + while (str < end) { /* skip over spaces in the input */ - while (PyUnicode_READ(kind, data, i) == ' ') - i++; - if (i >= hexlen) - break; - top = hex_digit_to_int(PyUnicode_READ(kind, data, i)); - bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1)); - if (top == -1 || bot == -1) { - PyErr_Format(PyExc_ValueError, - "non-hexadecimal number found in " - "fromhex() arg at position %zd", i); + if (*str == ' ') { + do { + str++; + } while (*str == ' '); + if (str >= end) + break; + } + + top = _PyLong_DigitValue[*str]; + if (top >= 16) { + invalid_char = str - PyUnicode_1BYTE_DATA(string); goto error; } - buf[j++] = (top << 4) + bot; + str++; + + bot = _PyLong_DigitValue[*str]; + if (bot >= 16) { + invalid_char = str - PyUnicode_1BYTE_DATA(string); + goto error; + } + str++; + + *buf++ = (unsigned char)((top << 4) + bot); } - if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0) - goto error; - return newstring; + + return _PyBytesWriter_Finish(&writer, buf); error: - Py_XDECREF(newstring); + PyErr_Format(PyExc_ValueError, + "non-hexadecimal number found in " + "fromhex() arg at position %zd", invalid_char); + _PyBytesWriter_Dealloc(&writer); return NULL; } @@ -3131,11 +3282,13 @@ bytes_methods[] = { }; static PyObject * -bytes_mod(PyObject *v, PyObject *w) +bytes_mod(PyObject *self, PyObject *arg) { - if (!PyBytes_Check(v)) + if (!PyBytes_Check(self)) { Py_RETURN_NOTIMPLEMENTED; - return _PyBytes_Format(v, w); + } + return _PyBytes_FormatEx(PyBytes_AS_STRING(self), PyBytes_GET_SIZE(self), + arg, 0); } static PyNumberMethods bytes_as_number = { @@ -3244,108 +3397,93 @@ bytes_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return PyBytes_FromObject(x); } -PyObject * -PyBytes_FromObject(PyObject *x) +static PyObject* +_PyBytes_FromBuffer(PyObject *x) { - PyObject *new, *it; - Py_ssize_t i, size; + PyObject *new; + Py_buffer view; - if (x == NULL) { - PyErr_BadInternalCall(); + if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0) return NULL; - } - if (PyBytes_CheckExact(x)) { - Py_INCREF(x); - return x; - } + new = PyBytes_FromStringAndSize(NULL, view.len); + if (!new) + goto fail; + if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval, + &view, view.len, 'C') < 0) + goto fail; + PyBuffer_Release(&view); + return new; - /* Use the modern buffer interface */ - if (PyObject_CheckBuffer(x)) { - Py_buffer view; - if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0) - return NULL; - new = PyBytes_FromStringAndSize(NULL, view.len); - if (!new) - goto fail; - if (PyBuffer_ToContiguous(((PyBytesObject *)new)->ob_sval, - &view, view.len, 'C') < 0) - goto fail; - PyBuffer_Release(&view); - return new; - fail: - Py_XDECREF(new); - PyBuffer_Release(&view); - return NULL; - } - if (PyUnicode_Check(x)) { - PyErr_SetString(PyExc_TypeError, - "cannot convert unicode object to bytes"); - return NULL; - } +fail: + Py_XDECREF(new); + PyBuffer_Release(&view); + return NULL; +} - if (PyList_CheckExact(x)) { - new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x)); - if (new == NULL) - return NULL; - for (i = 0; i < Py_SIZE(x); i++) { - Py_ssize_t value = PyNumber_AsSsize_t( - PyList_GET_ITEM(x, i), PyExc_ValueError); - if (value == -1 && PyErr_Occurred()) { - Py_DECREF(new); - return NULL; - } - if (value < 0 || value >= 256) { - PyErr_SetString(PyExc_ValueError, - "bytes must be in range(0, 256)"); - Py_DECREF(new); - return NULL; - } - ((PyBytesObject *)new)->ob_sval[i] = (char) value; - } - return new; - } - if (PyTuple_CheckExact(x)) { - new = PyBytes_FromStringAndSize(NULL, Py_SIZE(x)); - if (new == NULL) - return NULL; - for (i = 0; i < Py_SIZE(x); i++) { - Py_ssize_t value = PyNumber_AsSsize_t( - PyTuple_GET_ITEM(x, i), PyExc_ValueError); - if (value == -1 && PyErr_Occurred()) { - Py_DECREF(new); - return NULL; - } - if (value < 0 || value >= 256) { - PyErr_SetString(PyExc_ValueError, - "bytes must be in range(0, 256)"); - Py_DECREF(new); - return NULL; - } - ((PyBytesObject *)new)->ob_sval[i] = (char) value; - } - return new; - } +#define _PyBytes_FROM_LIST_BODY(x, GET_ITEM) \ + do { \ + PyObject *bytes; \ + Py_ssize_t i; \ + Py_ssize_t value; \ + char *str; \ + PyObject *item; \ + \ + bytes = PyBytes_FromStringAndSize(NULL, Py_SIZE(x)); \ + if (bytes == NULL) \ + return NULL; \ + str = ((PyBytesObject *)bytes)->ob_sval; \ + \ + for (i = 0; i < Py_SIZE(x); i++) { \ + item = GET_ITEM((x), i); \ + value = PyNumber_AsSsize_t(item, PyExc_ValueError); \ + if (value == -1 && PyErr_Occurred()) \ + goto error; \ + \ + if (value < 0 || value >= 256) { \ + PyErr_SetString(PyExc_ValueError, \ + "bytes must be in range(0, 256)"); \ + goto error; \ + } \ + *str++ = (char) value; \ + } \ + return bytes; \ + \ + error: \ + Py_DECREF(bytes); \ + return NULL; \ + } while (0) + +static PyObject* +_PyBytes_FromList(PyObject *x) +{ + _PyBytes_FROM_LIST_BODY(x, PyList_GET_ITEM); +} + +static PyObject* +_PyBytes_FromTuple(PyObject *x) +{ + _PyBytes_FROM_LIST_BODY(x, PyTuple_GET_ITEM); +} + +static PyObject * +_PyBytes_FromIterator(PyObject *it, PyObject *x) +{ + char *str; + Py_ssize_t i, size; + _PyBytesWriter writer; /* For iterator version, create a string object and resize as needed */ size = PyObject_LengthHint(x, 64); if (size == -1 && PyErr_Occurred()) return NULL; - /* Allocate an extra byte to prevent PyBytes_FromStringAndSize() from - returning a shared empty bytes string. This required because we - want to call _PyBytes_Resize() the returned object, which we can - only do on bytes objects with refcount == 1. */ - if (size == 0) - size = 1; - new = PyBytes_FromStringAndSize(NULL, size); - if (new == NULL) - return NULL; - assert(Py_REFCNT(new) == 1); - /* Get the iterator */ - it = PyObject_GetIter(x); - if (it == NULL) - goto error; + _PyBytesWriter_Init(&writer); + str = _PyBytesWriter_Alloc(&writer, size); + if (str == NULL) + return NULL; + writer.overallocate = 1; + size = writer.allocated; /* Run the iterator to exhaustion */ for (i = 0; ; i++) { @@ -3375,21 +3513,58 @@ PyBytes_FromObject(PyObject *x) /* Append the byte */ if (i >= size) { - size = 2 * size + 1; - if (_PyBytes_Resize(&new, size) < 0) - goto error; + str = _PyBytesWriter_Resize(&writer, str, size+1); + if (str == NULL) + return NULL; + size = writer.allocated; } - ((PyBytesObject *)new)->ob_sval[i] = (char) value; + *str++ = (char) value; } - _PyBytes_Resize(&new, i); - /* Clean up and return success */ - Py_DECREF(it); - return new; + return _PyBytesWriter_Finish(&writer, str); error: - Py_XDECREF(it); - Py_XDECREF(new); + _PyBytesWriter_Dealloc(&writer); + return NULL; +} + +PyObject * +PyBytes_FromObject(PyObject *x) +{ + PyObject *it, *result; + + if (x == NULL) { + PyErr_BadInternalCall(); + return NULL; + } + + if (PyBytes_CheckExact(x)) { + Py_INCREF(x); + return x; + } + + /* Use the modern buffer interface */ + if (PyObject_CheckBuffer(x)) + return _PyBytes_FromBuffer(x); + + if (PyList_CheckExact(x)) + return _PyBytes_FromList(x); + + if (PyTuple_CheckExact(x)) + return _PyBytes_FromTuple(x); + + if (!PyUnicode_Check(x)) { + it = PyObject_GetIter(x); + if (it != NULL) { + result = _PyBytes_FromIterator(it, x); + Py_DECREF(it); + return result; + } + } + + PyErr_Format(PyExc_TypeError, + "cannot convert '%.200s' object to bytes", + x->ob_type->tp_name); return NULL; } @@ -3740,3 +3915,282 @@ bytes_iter(PyObject *seq) _PyObject_GC_TRACK(it); return (PyObject *)it; } + + +/* _PyBytesWriter API */ + +#ifdef MS_WINDOWS + /* On Windows, overallocate by 50% is the best factor */ +# define OVERALLOCATE_FACTOR 2 +#else + /* On Linux, overallocate by 25% is the best factor */ +# define OVERALLOCATE_FACTOR 4 +#endif + +void +_PyBytesWriter_Init(_PyBytesWriter *writer) +{ + /* Set all attributes before small_buffer to 0 */ + memset(writer, 0, offsetof(_PyBytesWriter, small_buffer)); +#ifdef Py_DEBUG + memset(writer->small_buffer, 0xCB, sizeof(writer->small_buffer)); +#endif +} + +void +_PyBytesWriter_Dealloc(_PyBytesWriter *writer) +{ + Py_CLEAR(writer->buffer); +} + +Py_LOCAL_INLINE(char*) +_PyBytesWriter_AsString(_PyBytesWriter *writer) +{ + if (writer->use_small_buffer) { + assert(writer->buffer == NULL); + return writer->small_buffer; + } + else if (writer->use_bytearray) { + assert(writer->buffer != NULL); + return PyByteArray_AS_STRING(writer->buffer); + } + else { + assert(writer->buffer != NULL); + return PyBytes_AS_STRING(writer->buffer); + } +} + +Py_LOCAL_INLINE(Py_ssize_t) +_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str) +{ + char *start = _PyBytesWriter_AsString(writer); + assert(str != NULL); + assert(str >= start); + assert(str - start <= writer->allocated); + return str - start; +} + +Py_LOCAL_INLINE(void) +_PyBytesWriter_CheckConsistency(_PyBytesWriter *writer, char *str) +{ +#ifdef Py_DEBUG + char *start, *end; + + if (writer->use_small_buffer) { + assert(writer->buffer == NULL); + } + else { + assert(writer->buffer != NULL); + if (writer->use_bytearray) + assert(PyByteArray_CheckExact(writer->buffer)); + else + assert(PyBytes_CheckExact(writer->buffer)); + assert(Py_REFCNT(writer->buffer) == 1); + } + + if (writer->use_bytearray) { + /* bytearray has its own overallocation algorithm, + writer overallocation must be disabled */ + assert(!writer->overallocate); + } + + assert(0 <= writer->allocated); + assert(0 <= writer->min_size && writer->min_size <= writer->allocated); + /* the last byte must always be null */ + start = _PyBytesWriter_AsString(writer); + assert(start[writer->allocated] == 0); + + end = start + writer->allocated; + assert(str != NULL); + assert(start <= str && str <= end); +#endif +} + +void* +_PyBytesWriter_Resize(_PyBytesWriter *writer, void *str, Py_ssize_t size) +{ + Py_ssize_t allocated, pos; + + _PyBytesWriter_CheckConsistency(writer, str); + assert(writer->allocated < size); + + allocated = size; + if (writer->overallocate + && allocated <= (PY_SSIZE_T_MAX - allocated / OVERALLOCATE_FACTOR)) { + /* overallocate to limit the number of realloc() */ + allocated += allocated / OVERALLOCATE_FACTOR; + } + + pos = _PyBytesWriter_GetSize(writer, str); + if (!writer->use_small_buffer) { + if (writer->use_bytearray) { + if (PyByteArray_Resize(writer->buffer, allocated)) + goto error; + /* writer->allocated can be smaller than writer->buffer->ob_alloc, + but we cannot use ob_alloc because bytes may need to be moved + to use the whole buffer. bytearray uses an internal optimization + to avoid moving or copying bytes when bytes are removed at the + beginning (ex: del bytearray[:1]). */ + } + else { + if (_PyBytes_Resize(&writer->buffer, allocated)) + goto error; + } + } + else { + /* convert from stack buffer to bytes object buffer */ + assert(writer->buffer == NULL); + + if (writer->use_bytearray) + writer->buffer = PyByteArray_FromStringAndSize(NULL, allocated); + else + writer->buffer = PyBytes_FromStringAndSize(NULL, allocated); + if (writer->buffer == NULL) + goto error; + + if (pos != 0) { + char *dest; + if (writer->use_bytearray) + dest = PyByteArray_AS_STRING(writer->buffer); + else + dest = PyBytes_AS_STRING(writer->buffer); + Py_MEMCPY(dest, + writer->small_buffer, + pos); + } + + writer->use_small_buffer = 0; +#ifdef Py_DEBUG + memset(writer->small_buffer, 0xDB, sizeof(writer->small_buffer)); +#endif + } + writer->allocated = allocated; + + str = _PyBytesWriter_AsString(writer) + pos; + _PyBytesWriter_CheckConsistency(writer, str); + return str; + +error: + _PyBytesWriter_Dealloc(writer); + return NULL; +} + +void* +_PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size) +{ + Py_ssize_t new_min_size; + + _PyBytesWriter_CheckConsistency(writer, str); + assert(size >= 0); + + if (size == 0) { + /* nothing to do */ + return str; + } + + if (writer->min_size > PY_SSIZE_T_MAX - size) { + PyErr_NoMemory(); + _PyBytesWriter_Dealloc(writer); + return NULL; + } + new_min_size = writer->min_size + size; + + if (new_min_size > writer->allocated) + str = _PyBytesWriter_Resize(writer, str, new_min_size); + + writer->min_size = new_min_size; + return str; +} + +/* Allocate the buffer to write size bytes. + Return the pointer to the beginning of buffer data. + Raise an exception and return NULL on error. */ +void* +_PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size) +{ + /* ensure that _PyBytesWriter_Alloc() is only called once */ + assert(writer->min_size == 0 && writer->buffer == NULL); + assert(size >= 0); + + writer->use_small_buffer = 1; +#ifdef Py_DEBUG + writer->allocated = sizeof(writer->small_buffer) - 1; + /* In debug mode, don't use the full small buffer because it is less + efficient than bytes and bytearray objects to detect buffer underflow + and buffer overflow. Use 10 bytes of the small buffer to test also + code using the smaller buffer in debug mode. + + Don't modify the _PyBytesWriter structure (use a shorter small buffer) + in debug mode to also be able to detect stack overflow when running + tests in debug mode. The _PyBytesWriter is large (more than 512 bytes), + if Py_EnterRecursiveCall() is not used in deep C callback, we may hit a + stack overflow. */ + writer->allocated = Py_MIN(writer->allocated, 10); + /* _PyBytesWriter_CheckConsistency() requires the last byte to be 0, + to detect buffer overflow */ + writer->small_buffer[writer->allocated] = 0; +#else + writer->allocated = sizeof(writer->small_buffer); +#endif + return _PyBytesWriter_Prepare(writer, writer->small_buffer, size); +} + +PyObject * +_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str) +{ + Py_ssize_t size; + PyObject *result; + + _PyBytesWriter_CheckConsistency(writer, str); + + size = _PyBytesWriter_GetSize(writer, str); + if (size == 0 && !writer->use_bytearray) { + Py_CLEAR(writer->buffer); + /* Get the empty byte string singleton */ + result = PyBytes_FromStringAndSize(NULL, 0); + } + else if (writer->use_small_buffer) { + if (writer->use_bytearray) { + result = PyByteArray_FromStringAndSize(writer->small_buffer, size); + } + else { + result = PyBytes_FromStringAndSize(writer->small_buffer, size); + } + } + else { + result = writer->buffer; + writer->buffer = NULL; + + if (size != writer->allocated) { + if (writer->use_bytearray) { + if (PyByteArray_Resize(result, size)) { + Py_DECREF(result); + return NULL; + } + } + else { + if (_PyBytes_Resize(&result, size)) { + assert(result == NULL); + return NULL; + } + } + } + } + return result; +} + +void* +_PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *ptr, + const void *bytes, Py_ssize_t size) +{ + char *str = (char *)ptr; + + str = _PyBytesWriter_Prepare(writer, str, size); + if (str == NULL) + return NULL; + + Py_MEMCPY(str, bytes, size); + str += size; + + return str; +} |