diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2012-10-06 21:03:36 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2012-10-06 21:03:36 (GMT) |
commit | e215d960be3c5e1457920c452dc8f94ebf42b159 (patch) | |
tree | 6a48506e8fbe03543c0c258aab982c6a6af8558d | |
parent | 2a09b6e84955779dbc878c5a0679c41d84f5d021 (diff) | |
download | cpython-e215d960be3c5e1457920c452dc8f94ebf42b159.zip cpython-e215d960be3c5e1457920c452dc8f94ebf42b159.tar.gz cpython-e215d960be3c5e1457920c452dc8f94ebf42b159.tar.bz2 |
Issue #16147: Rewrite PyUnicode_FromFormatV() to use _PyUnicodeWriter API
* Simplify the code: replace 4 steps with one unique step using the
_PyUnicodeWriter API. PyUnicode_Format() has the same design. It avoids to
store intermediate results which require to allocate an array of pointers on
the heap.
* Use the _PyUnicodeWriter API for speed (and its convinient API):
overallocate the buffer to reduce the number of "realloc()"
* Implement "width" and "precision" in Python, don't rely on sprintf(). It
avoids to need of a temporary buffer allocated on the heap: only use a small
buffer allocated in the stack.
* Add _PyUnicodeWriter_WriteCstr() function
* Split PyUnicode_FromFormatV() into two functions: add
unicode_fromformat_arg().
* Inline parse_format_flags(): the format of an argument is now only parsed
once, it's no more needed to have a subfunction.
* Optimize PyUnicode_FromFormatV() for characters between two "%" arguments:
search the next "%" and copy the substring in one chunk, instead of copying
character per character.
-rw-r--r-- | Include/unicodeobject.h | 18 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 16 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 814 |
3 files changed, 364 insertions, 484 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 4152dd7..fa21c1c 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -933,12 +933,28 @@ PyAPI_FUNC(int) _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar); +/* Append a Unicode string. + Return 0 on success, raise an exception and return -1 on error. */ PyAPI_FUNC(int) -_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str); +_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, + PyObject *str /* Unicode string */ + ); +/* Append a latin1-encoded byte string. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, + const char *str, /* latin1-encoded byte string */ + Py_ssize_t len /* length in bytes */ + ); + +/* Get the value of the write as an Unicode string. Clear the + buffer of the writer. Raise an exception and return NULL + on error. */ PyAPI_FUNC(PyObject *) _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer); +/* Deallocate memory of a writer (clear its internal buffer). */ PyAPI_FUNC(void) _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer); #endif diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index b13a90a..42df721 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1769,6 +1769,22 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123') self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123') + # test padding (width and/or precision) + self.assertEqual(PyUnicode_FromFormat(b'%010i', c_int(123)), '123'.rjust(10, '0')) + self.assertEqual(PyUnicode_FromFormat(b'%100i', c_int(123)), '123'.rjust(100)) + self.assertEqual(PyUnicode_FromFormat(b'%.100i', c_int(123)), '123'.rjust(100, '0')) + self.assertEqual(PyUnicode_FromFormat(b'%100.80i', c_int(123)), '123'.rjust(80, '0').rjust(100)) + + self.assertEqual(PyUnicode_FromFormat(b'%010u', c_uint(123)), '123'.rjust(10, '0')) + self.assertEqual(PyUnicode_FromFormat(b'%100u', c_uint(123)), '123'.rjust(100)) + self.assertEqual(PyUnicode_FromFormat(b'%.100u', c_uint(123)), '123'.rjust(100, '0')) + self.assertEqual(PyUnicode_FromFormat(b'%100.80u', c_uint(123)), '123'.rjust(80, '0').rjust(100)) + + self.assertEqual(PyUnicode_FromFormat(b'%010x', c_int(0x123)), '123'.rjust(10, '0')) + self.assertEqual(PyUnicode_FromFormat(b'%100x', c_int(0x123)), '123'.rjust(100)) + self.assertEqual(PyUnicode_FromFormat(b'%.100x', c_int(0x123)), '123'.rjust(100, '0')) + self.assertEqual(PyUnicode_FromFormat(b'%100.80x', c_int(0x123)), '123'.rjust(80, '0').rjust(100)) + # test %A text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'") diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c3bf392..562efed 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2301,16 +2301,9 @@ PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) static void makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, - int zeropad, int width, int precision, char c) + char c) { *fmt++ = '%'; - if (width) { - if (zeropad) - *fmt++ = '0'; - fmt += sprintf(fmt, "%d", width); - } - if (precision) - fmt += sprintf(fmt, ".%d", precision); if (longflag) *fmt++ = 'l'; else if (longlongflag) { @@ -2335,44 +2328,59 @@ makefmt(char *fmt, int longflag, int longlongflag, int size_tflag, *fmt = '\0'; } -/* helper for PyUnicode_FromFormatV() */ +/* maximum number of characters required for output of %ld. 21 characters + allows for 64-bit integers (in decimal) and an optional sign. */ +#define MAX_LONG_CHARS 21 +/* maximum number of characters required for output of %lld. + We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, + plus 1 for the sign. 53/22 is an upper bound for log10(256). */ +#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) static const char* -parse_format_flags(const char *f, - int *p_width, int *p_precision, - int *p_longflag, int *p_longlongflag, int *p_size_tflag) +unicode_fromformat_arg(_PyUnicodeWriter *writer, + const char *f, va_list *vargs) { - int width, precision, longflag, longlongflag, size_tflag; + const char *p; + Py_ssize_t len; + int zeropad; + int width; + int precision; + int longflag; + int longlongflag; + int size_tflag; + int fill; + + p = f; + f++; + zeropad = (*f == '0'); /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ - f++; width = 0; - while (Py_ISDIGIT((unsigned)*f)) - width = (width*10) + *f++ - '0'; + while (Py_ISDIGIT((unsigned)*f)) { + width = (width*10) + (*f - '0'); + f++; + } precision = 0; if (*f == '.') { f++; - while (Py_ISDIGIT((unsigned)*f)) - precision = (precision*10) + *f++ - '0'; + while (Py_ISDIGIT((unsigned)*f)) { + precision = (precision*10) + (*f - '0'); + f++; + } if (*f == '%') { /* "%.3%s" => f points to "3" */ f--; } } if (*f == '\0') { - /* bogus format "%.1" => go backward, f points to "1" */ + /* bogus format "%.123" => go backward, f points to "3" */ f--; } - if (p_width != NULL) - *p_width = width; - if (p_precision != NULL) - *p_precision = precision; /* Handle %ld, %lu, %lld and %llu. */ longflag = 0; longlongflag = 0; size_tflag = 0; - if (*f == 'l') { if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') { longflag = 1; @@ -2391,494 +2399,321 @@ parse_format_flags(const char *f, size_tflag = 1; ++f; } - if (p_longflag != NULL) - *p_longflag = longflag; - if (p_longlongflag != NULL) - *p_longlongflag = longlongflag; - if (p_size_tflag != NULL) - *p_size_tflag = size_tflag; - return f; -} -/* maximum number of characters required for output of %ld. 21 characters - allows for 64-bit integers (in decimal) and an optional sign. */ -#define MAX_LONG_CHARS 21 -/* maximum number of characters required for output of %lld. - We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, - plus 1 for the sign. 53/22 is an upper bound for log10(256). */ -#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) + if (f[1] == '\0') + writer->overallocate = 0; -PyObject * -PyUnicode_FromFormatV(const char *format, va_list vargs) -{ - va_list count; - Py_ssize_t callcount = 0; - PyObject **callresults = NULL; - PyObject **callresult = NULL; - Py_ssize_t n = 0; - int width = 0; - int precision = 0; - int zeropad; - const char* f; - PyObject *string; - /* used by sprintf */ - char fmt[61]; /* should be enough for %0width.precisionlld */ - Py_UCS4 maxchar = 127; /* result is ASCII by default */ - Py_UCS4 argmaxchar; - Py_ssize_t numbersize = 0; - char *numberresults = NULL; - char *numberresult = NULL; - Py_ssize_t i; - int kind; - void *data; + switch (*f) { + case 'c': + { + int ordinal = va_arg(*vargs, int); + if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1) + return NULL; + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal); + writer->pos++; + break; + } - Py_VA_COPY(count, vargs); - /* step 1: count the number of %S/%R/%A/%s format specifications - * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ - * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the - * result in an array) - * also estimate a upper bound for all the number formats in the string, - * numbers will be formatted in step 3 and be kept in a '\0'-separated - * buffer before putting everything together. */ - for (f = format; *f; f++) { - if (*f == '%') { - int longlongflag; - /* skip width or width.precision (eg. "1.2" of "%1.2f") */ - f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL); - if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V') - ++callcount; + case 'i': + case 'd': + case 'u': + case 'x': + { + /* used by sprintf */ + char fmt[10]; /* should be enough for "%0lld\0" */ + char small_buffer[MAX_LONG_CHARS]; + char *buffer; + int err; + + if (sizeof(small_buffer) - 1 < precision) { + buffer = PyMem_Malloc(precision + 1); + if (buffer == NULL) { + PyErr_NoMemory(); + return NULL; + } + } + else + buffer = small_buffer; + + if (*f == 'u') { + makefmt(fmt, longflag, longlongflag, size_tflag, *f); - else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') { + if (longflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, unsigned long)); #ifdef HAVE_LONG_LONG - if (longlongflag) { - if (width < MAX_LONG_LONG_CHARS) - width = MAX_LONG_LONG_CHARS; - } - else + else if (longlongflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, unsigned PY_LONG_LONG)); +#endif + else if (size_tflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, size_t)); + else + len = sprintf(buffer, fmt, + va_arg(*vargs, unsigned int)); + } + else if (*f == 'x') { + makefmt(fmt, 0, 0, 0, 'x'); + len = sprintf(buffer, fmt, va_arg(*vargs, int)); + } + else { + makefmt(fmt, longflag, longlongflag, size_tflag, *f); + + if (longflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, long)); +#ifdef HAVE_LONG_LONG + else if (longlongflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, PY_LONG_LONG)); #endif - /* MAX_LONG_CHARS is enough to hold a 64-bit integer, - including sign. Decimal takes the most space. This - isn't enough for octal. If a width is specified we - need more (which we allocate later). */ - if (width < MAX_LONG_CHARS) - width = MAX_LONG_CHARS; - - /* account for the size + '\0' to separate numbers - inside of the numberresults buffer */ - numbersize += (width + 1); + else if (size_tflag) + len = sprintf(buffer, fmt, + va_arg(*vargs, Py_ssize_t)); + else + len = sprintf(buffer, fmt, + va_arg(*vargs, int)); + } + assert(len >= 0); + + err = 0; + if (precision < len) + precision = len; + if (width > precision) { + Py_UCS4 fillchar; + fill = width - precision; + fillchar = zeropad?'0':' '; + if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) != -1) { + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) + err = 1; } + else + err = 1; + if (!err) + writer->pos += fill; + } + if (!err && precision > len) { + fill = precision - len; + if (_PyUnicodeWriter_Prepare(writer, fill, '0') != -1) { + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) + err = 1; + } + else + err = 1; + if (!err) + writer->pos += fill; + } + if (!err) { + if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1) + err = 1; } - else if ((unsigned char)*f > 127) { - PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", - (unsigned char)*f); + + if (buffer != small_buffer) { + PyMem_Free(buffer); + buffer = small_buffer; + } + if (err) return NULL; + + break; + } + + case 'p': + { + char number[MAX_LONG_LONG_CHARS]; + + len = sprintf(number, "%p", va_arg(*vargs, void*)); + assert(len >= 0); + + /* %p is ill-defined: ensure leading 0x. */ + if (number[1] == 'X') + number[1] = 'x'; + else if (number[1] != 'x') { + memmove(number + 2, number, + strlen(number) + 1); + number[0] = '0'; + number[1] = 'x'; + len += 2; } + + if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1) + return NULL; + break; } - /* step 2: allocate memory for the results of - * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */ - if (callcount) { - callresults = PyObject_Malloc(sizeof(PyObject *) * callcount); - if (!callresults) { - PyErr_NoMemory(); + + case 's': + { + /* UTF-8 */ + const char *s = va_arg(*vargs, const char*); + PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); + if (!str) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + Py_DECREF(str); return NULL; } - callresult = callresults; + Py_DECREF(str); + break; } - /* step 2.5: allocate memory for the results of formating numbers */ - if (numbersize) { - numberresults = PyObject_Malloc(numbersize); - if (!numberresults) { - PyErr_NoMemory(); - goto fail; + + case 'U': + { + PyObject *obj = va_arg(*vargs, PyObject *); + assert(obj && _PyUnicode_CHECK(obj)); + + if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + return NULL; + break; + } + + case 'V': + { + PyObject *obj = va_arg(*vargs, PyObject *); + const char *str = va_arg(*vargs, const char *); + PyObject *str_obj; + assert(obj || str); + if (obj) { + assert(_PyUnicode_CHECK(obj)); + if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + return NULL; } - numberresult = numberresults; + else { + str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); + if (!str_obj) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) { + Py_DECREF(str_obj); + return NULL; + } + Py_DECREF(str_obj); + } + break; } - /* step 3: format numbers and figure out how large a buffer we need */ - for (f = format; *f; f++) { - if (*f == '%') { - const char* p; - int longflag; - int longlongflag; - int size_tflag; - int numprinted; + case 'S': + { + PyObject *obj = va_arg(*vargs, PyObject *); + PyObject *str; + assert(obj); + str = PyObject_Str(obj); + if (!str) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + Py_DECREF(str); + return NULL; + } + Py_DECREF(str); + break; + } - p = f; - zeropad = (f[1] == '0'); - f = parse_format_flags(f, &width, &precision, - &longflag, &longlongflag, &size_tflag); - switch (*f) { - case 'c': - { - Py_UCS4 ordinal = va_arg(count, int); - maxchar = MAX_MAXCHAR(maxchar, ordinal); - n++; - break; - } - case '%': - n++; - break; - case 'i': - case 'd': - makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, - width, precision, *f); - if (longflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, long)); -#ifdef HAVE_LONG_LONG - else if (longlongflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, PY_LONG_LONG)); -#endif - else if (size_tflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, Py_ssize_t)); - else - numprinted = sprintf(numberresult, fmt, - va_arg(count, int)); - n += numprinted; - /* advance by +1 to skip over the '\0' */ - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 'u': - makefmt(fmt, longflag, longlongflag, size_tflag, zeropad, - width, precision, 'u'); - if (longflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, unsigned long)); -#ifdef HAVE_LONG_LONG - else if (longlongflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, unsigned PY_LONG_LONG)); -#endif - else if (size_tflag) - numprinted = sprintf(numberresult, fmt, - va_arg(count, size_t)); - else - numprinted = sprintf(numberresult, fmt, - va_arg(count, unsigned int)); - n += numprinted; - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 'x': - makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x'); - numprinted = sprintf(numberresult, fmt, va_arg(count, int)); - n += numprinted; - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 'p': - numprinted = sprintf(numberresult, "%p", va_arg(count, void*)); - /* %p is ill-defined: ensure leading 0x. */ - if (numberresult[1] == 'X') - numberresult[1] = 'x'; - else if (numberresult[1] != 'x') { - memmove(numberresult + 2, numberresult, - strlen(numberresult) + 1); - numberresult[0] = '0'; - numberresult[1] = 'x'; - numprinted += 2; - } - n += numprinted; - numberresult += (numprinted + 1); - assert(*(numberresult - 1) == '\0'); - assert(*(numberresult - 2) != '\0'); - assert(numprinted >= 0); - assert(numberresult <= numberresults + numbersize); - break; - case 's': - { - /* UTF-8 */ - const char *s = va_arg(count, const char*); - PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); - if (!str) - goto fail; - /* since PyUnicode_DecodeUTF8 returns already flexible - unicode objects, there is no need to call ready on them */ - argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(str); - /* Remember the str and switch to the next slot */ - *callresult++ = str; - break; - } - case 'U': - { - PyObject *obj = va_arg(count, PyObject *); - assert(obj && _PyUnicode_CHECK(obj)); - if (PyUnicode_READY(obj) == -1) - goto fail; - argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(obj); - break; - } - case 'V': - { - PyObject *obj = va_arg(count, PyObject *); - const char *str = va_arg(count, const char *); - PyObject *str_obj; - assert(obj || str); - assert(!obj || _PyUnicode_CHECK(obj)); - if (obj) { - if (PyUnicode_READY(obj) == -1) - goto fail; - argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(obj); - *callresult++ = NULL; - } - else { - str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); - if (!str_obj) - goto fail; - if (PyUnicode_READY(str_obj) == -1) { - Py_DECREF(str_obj); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(str_obj); - *callresult++ = str_obj; - } - break; - } - case 'S': - { - PyObject *obj = va_arg(count, PyObject *); - PyObject *str; - assert(obj); - str = PyObject_Str(obj); - if (!str) - goto fail; - if (PyUnicode_READY(str) == -1) { - Py_DECREF(str); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(str); - /* Remember the str and switch to the next slot */ - *callresult++ = str; - break; - } - case 'R': - { - PyObject *obj = va_arg(count, PyObject *); - PyObject *repr; - assert(obj); - repr = PyObject_Repr(obj); - if (!repr) - goto fail; - if (PyUnicode_READY(repr) == -1) { - Py_DECREF(repr); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(repr); - /* Remember the repr and switch to the next slot */ - *callresult++ = repr; - break; - } - case 'A': - { - PyObject *obj = va_arg(count, PyObject *); - PyObject *ascii; - assert(obj); - ascii = PyObject_ASCII(obj); - if (!ascii) - goto fail; - if (PyUnicode_READY(ascii) == -1) { - Py_DECREF(ascii); - goto fail; - } - argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); - n += PyUnicode_GET_LENGTH(ascii); - /* Remember the repr and switch to the next slot */ - *callresult++ = ascii; - break; - } - default: - /* if we stumble upon an unknown - formatting code, copy the rest of - the format string to the output - string. (we cannot just skip the - code, since there's no way to know - what's in the argument list) */ - n += strlen(p); - goto expand; - } - } else - n++; - } - expand: - /* step 4: fill the buffer */ - /* Since we've analyzed how much space we need, - we don't have to resize the string. - There can be no errors beyond this point. */ - string = PyUnicode_New(n, maxchar); - if (!string) - goto fail; - kind = PyUnicode_KIND(string); - data = PyUnicode_DATA(string); - callresult = callresults; - numberresult = numberresults; + case 'R': + { + PyObject *obj = va_arg(*vargs, PyObject *); + PyObject *repr; + assert(obj); + repr = PyObject_Repr(obj); + if (!repr) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) { + Py_DECREF(repr); + return NULL; + } + Py_DECREF(repr); + break; + } + + case 'A': + { + PyObject *obj = va_arg(*vargs, PyObject *); + PyObject *ascii; + assert(obj); + ascii = PyObject_ASCII(obj); + if (!ascii) + return NULL; + if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { + Py_DECREF(ascii); + return NULL; + } + Py_DECREF(ascii); + break; + } - for (i = 0, f = format; *f; f++) { + case '%': + if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1) + return NULL; + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%'); + writer->pos++; + break; + + default: + /* if we stumble upon an unknown formatting code, copy the rest + of the format string to the output string. (we cannot just + skip the code, since there's no way to know what's in the + argument list) */ + len = strlen(p); + if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1) + return NULL; + f = p+len; + return f; + } + + f++; + return f; +} + +PyObject * +PyUnicode_FromFormatV(const char *format, va_list vargs) +{ + va_list vargs2; + const char *f; + _PyUnicodeWriter writer; + + _PyUnicodeWriter_Init(&writer, strlen(format) + 100); + + /* va_list may be an array (of 1 item) on some platforms (ex: AMD64). + Copy it to be able to pass a reference to a subfunction. */ + Py_VA_COPY(vargs2, vargs); + + for (f = format; *f; ) { if (*f == '%') { - const char* p; + f = unicode_fromformat_arg(&writer, f, &vargs2); + if (f == NULL) + goto fail; + } + else { + const char *p; + Py_ssize_t len; p = f; - f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL); - /* checking for == because the last argument could be a empty - string, which causes i to point to end, the assert at the end of - the loop */ - assert(i <= PyUnicode_GET_LENGTH(string)); - - switch (*f) { - case 'c': - { - const int ordinal = va_arg(vargs, int); - PyUnicode_WRITE(kind, data, i++, ordinal); - break; - } - case 'i': - case 'd': - case 'u': - case 'x': - case 'p': - { - Py_ssize_t len; - /* unused, since we already have the result */ - if (*f == 'p') - (void) va_arg(vargs, void *); - else - (void) va_arg(vargs, int); - /* extract the result from numberresults and append. */ - len = strlen(numberresult); - unicode_write_cstr(string, i, numberresult, len); - /* skip over the separating '\0' */ - i += len; - numberresult += len; - assert(*numberresult == '\0'); - numberresult++; - assert(numberresult <= numberresults + numbersize); - break; - } - case 's': - { - /* unused, since we already have the result */ - Py_ssize_t size; - (void) va_arg(vargs, char *); - size = PyUnicode_GET_LENGTH(*callresult); - assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); - i += size; - /* We're done with the unicode()/repr() => forget it */ - Py_DECREF(*callresult); - /* switch to next unicode()/repr() result */ - ++callresult; - break; - } - case 'U': - { - PyObject *obj = va_arg(vargs, PyObject *); - Py_ssize_t size; - assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); - size = PyUnicode_GET_LENGTH(obj); - _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); - i += size; - break; - } - case 'V': + do { - Py_ssize_t size; - PyObject *obj = va_arg(vargs, PyObject *); - va_arg(vargs, const char *); - if (obj) { - size = PyUnicode_GET_LENGTH(obj); - assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, obj, 0, size); - i += size; - } else { - size = PyUnicode_GET_LENGTH(*callresult); - assert(PyUnicode_KIND(*callresult) <= - PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); - i += size; - Py_DECREF(*callresult); + if ((unsigned char)*p > 127) { + PyErr_Format(PyExc_ValueError, + "PyUnicode_FromFormatV() expects an ASCII-encoded format " + "string, got a non-ASCII byte: 0x%02x", + (unsigned char)*p); + return NULL; } - ++callresult; - break; - } - case 'S': - case 'R': - case 'A': - { - Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); - /* unused, since we already have the result */ - (void) va_arg(vargs, PyObject *); - assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); - _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size); - i += size; - /* We're done with the unicode()/repr() => forget it */ - Py_DECREF(*callresult); - /* switch to next unicode()/repr() result */ - ++callresult; - break; + p++; } - case '%': - PyUnicode_WRITE(kind, data, i++, '%'); - break; - default: - { - Py_ssize_t len = strlen(p); - unicode_write_cstr(string, i, p, len); - i += len; - assert(i == PyUnicode_GET_LENGTH(string)); - goto end; - } - } - } - else { - assert(i < PyUnicode_GET_LENGTH(string)); - PyUnicode_WRITE(kind, data, i++, *f); + while (*p != '\0' && *p != '%'); + len = p - f; + + if (*p == '\0') + writer.overallocate = 0; + if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1) + goto fail; + unicode_write_cstr(writer.buffer, writer.pos, f, len); + writer.pos += len; + + f = p; } } - assert(i == PyUnicode_GET_LENGTH(string)); + return _PyUnicodeWriter_Finish(&writer); - end: - if (callresults) - PyObject_Free(callresults); - if (numberresults) - PyObject_Free(numberresults); - return unicode_result(string); fail: - if (callresults) { - PyObject **callresult2 = callresults; - while (callresult2 < callresult) { - Py_XDECREF(*callresult2); - ++callresult2; - } - PyObject_Free(callresults); - } - if (numberresults) - PyObject_Free(numberresults); + _PyUnicodeWriter_Dealloc(&writer); return NULL; } @@ -12962,6 +12797,19 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str) return 0; } +int +_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) +{ + Py_UCS4 maxchar; + + maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len); + if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) + return -1; + unicode_write_cstr(writer->buffer, writer->pos, str, len); + writer->pos += len; + return 0; +} + PyObject * _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer) { |