summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2012-10-06 21:03:36 (GMT)
committerVictor Stinner <victor.stinner@gmail.com>2012-10-06 21:03:36 (GMT)
commite215d960be3c5e1457920c452dc8f94ebf42b159 (patch)
tree6a48506e8fbe03543c0c258aab982c6a6af8558d
parent2a09b6e84955779dbc878c5a0679c41d84f5d021 (diff)
downloadcpython-e215d960be3c5e1457920c452dc8f94ebf42b159.zip
cpython-e215d960be3c5e1457920c452dc8f94ebf42b159.tar.gz
cpython-e215d960be3c5e1457920c452dc8f94ebf42b159.tar.bz2
Issue #16147: Rewrite PyUnicode_FromFormatV() to use _PyUnicodeWriter API
* Simplify the code: replace 4 steps with one unique step using the _PyUnicodeWriter API. PyUnicode_Format() has the same design. It avoids to store intermediate results which require to allocate an array of pointers on the heap. * Use the _PyUnicodeWriter API for speed (and its convinient API): overallocate the buffer to reduce the number of "realloc()" * Implement "width" and "precision" in Python, don't rely on sprintf(). It avoids to need of a temporary buffer allocated on the heap: only use a small buffer allocated in the stack. * Add _PyUnicodeWriter_WriteCstr() function * Split PyUnicode_FromFormatV() into two functions: add unicode_fromformat_arg(). * Inline parse_format_flags(): the format of an argument is now only parsed once, it's no more needed to have a subfunction. * Optimize PyUnicode_FromFormatV() for characters between two "%" arguments: search the next "%" and copy the substring in one chunk, instead of copying character per character.
-rw-r--r--Include/unicodeobject.h18
-rw-r--r--Lib/test/test_unicode.py16
-rw-r--r--Objects/unicodeobject.c814
3 files changed, 364 insertions, 484 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 4152dd7..fa21c1c 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -933,12 +933,28 @@ PyAPI_FUNC(int)
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
Py_ssize_t length, Py_UCS4 maxchar);
+/* Append a Unicode string.
+ Return 0 on success, raise an exception and return -1 on error. */
PyAPI_FUNC(int)
-_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str);
+_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
+ PyObject *str /* Unicode string */
+ );
+/* Append a latin1-encoded byte string.
+ Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer,
+ const char *str, /* latin1-encoded byte string */
+ Py_ssize_t len /* length in bytes */
+ );
+
+/* Get the value of the write as an Unicode string. Clear the
+ buffer of the writer. Raise an exception and return NULL
+ on error. */
PyAPI_FUNC(PyObject *)
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
+/* Deallocate memory of a writer (clear its internal buffer). */
PyAPI_FUNC(void)
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
#endif
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index b13a90a..42df721 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1769,6 +1769,22 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123')
self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123')
+ # test padding (width and/or precision)
+ self.assertEqual(PyUnicode_FromFormat(b'%010i', c_int(123)), '123'.rjust(10, '0'))
+ self.assertEqual(PyUnicode_FromFormat(b'%100i', c_int(123)), '123'.rjust(100))
+ self.assertEqual(PyUnicode_FromFormat(b'%.100i', c_int(123)), '123'.rjust(100, '0'))
+ self.assertEqual(PyUnicode_FromFormat(b'%100.80i', c_int(123)), '123'.rjust(80, '0').rjust(100))
+
+ self.assertEqual(PyUnicode_FromFormat(b'%010u', c_uint(123)), '123'.rjust(10, '0'))
+ self.assertEqual(PyUnicode_FromFormat(b'%100u', c_uint(123)), '123'.rjust(100))
+ self.assertEqual(PyUnicode_FromFormat(b'%.100u', c_uint(123)), '123'.rjust(100, '0'))
+ self.assertEqual(PyUnicode_FromFormat(b'%100.80u', c_uint(123)), '123'.rjust(80, '0').rjust(100))
+
+ self.assertEqual(PyUnicode_FromFormat(b'%010x', c_int(0x123)), '123'.rjust(10, '0'))
+ self.assertEqual(PyUnicode_FromFormat(b'%100x', c_int(0x123)), '123'.rjust(100))
+ self.assertEqual(PyUnicode_FromFormat(b'%.100x', c_int(0x123)), '123'.rjust(100, '0'))
+ self.assertEqual(PyUnicode_FromFormat(b'%100.80x', c_int(0x123)), '123'.rjust(80, '0').rjust(100))
+
# test %A
text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'")
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c3bf392..562efed 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2301,16 +2301,9 @@ PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
static void
makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
- int zeropad, int width, int precision, char c)
+ char c)
{
*fmt++ = '%';
- if (width) {
- if (zeropad)
- *fmt++ = '0';
- fmt += sprintf(fmt, "%d", width);
- }
- if (precision)
- fmt += sprintf(fmt, ".%d", precision);
if (longflag)
*fmt++ = 'l';
else if (longlongflag) {
@@ -2335,44 +2328,59 @@ makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
*fmt = '\0';
}
-/* helper for PyUnicode_FromFormatV() */
+/* maximum number of characters required for output of %ld. 21 characters
+ allows for 64-bit integers (in decimal) and an optional sign. */
+#define MAX_LONG_CHARS 21
+/* maximum number of characters required for output of %lld.
+ We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
+ plus 1 for the sign. 53/22 is an upper bound for log10(256). */
+#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
static const char*
-parse_format_flags(const char *f,
- int *p_width, int *p_precision,
- int *p_longflag, int *p_longlongflag, int *p_size_tflag)
+unicode_fromformat_arg(_PyUnicodeWriter *writer,
+ const char *f, va_list *vargs)
{
- int width, precision, longflag, longlongflag, size_tflag;
+ const char *p;
+ Py_ssize_t len;
+ int zeropad;
+ int width;
+ int precision;
+ int longflag;
+ int longlongflag;
+ int size_tflag;
+ int fill;
+
+ p = f;
+ f++;
+ zeropad = (*f == '0');
/* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
- f++;
width = 0;
- while (Py_ISDIGIT((unsigned)*f))
- width = (width*10) + *f++ - '0';
+ while (Py_ISDIGIT((unsigned)*f)) {
+ width = (width*10) + (*f - '0');
+ f++;
+ }
precision = 0;
if (*f == '.') {
f++;
- while (Py_ISDIGIT((unsigned)*f))
- precision = (precision*10) + *f++ - '0';
+ while (Py_ISDIGIT((unsigned)*f)) {
+ precision = (precision*10) + (*f - '0');
+ f++;
+ }
if (*f == '%') {
/* "%.3%s" => f points to "3" */
f--;
}
}
if (*f == '\0') {
- /* bogus format "%.1" => go backward, f points to "1" */
+ /* bogus format "%.123" => go backward, f points to "3" */
f--;
}
- if (p_width != NULL)
- *p_width = width;
- if (p_precision != NULL)
- *p_precision = precision;
/* Handle %ld, %lu, %lld and %llu. */
longflag = 0;
longlongflag = 0;
size_tflag = 0;
-
if (*f == 'l') {
if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
longflag = 1;
@@ -2391,494 +2399,321 @@ parse_format_flags(const char *f,
size_tflag = 1;
++f;
}
- if (p_longflag != NULL)
- *p_longflag = longflag;
- if (p_longlongflag != NULL)
- *p_longlongflag = longlongflag;
- if (p_size_tflag != NULL)
- *p_size_tflag = size_tflag;
- return f;
-}
-/* maximum number of characters required for output of %ld. 21 characters
- allows for 64-bit integers (in decimal) and an optional sign. */
-#define MAX_LONG_CHARS 21
-/* maximum number of characters required for output of %lld.
- We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
- plus 1 for the sign. 53/22 is an upper bound for log10(256). */
-#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
+ if (f[1] == '\0')
+ writer->overallocate = 0;
-PyObject *
-PyUnicode_FromFormatV(const char *format, va_list vargs)
-{
- va_list count;
- Py_ssize_t callcount = 0;
- PyObject **callresults = NULL;
- PyObject **callresult = NULL;
- Py_ssize_t n = 0;
- int width = 0;
- int precision = 0;
- int zeropad;
- const char* f;
- PyObject *string;
- /* used by sprintf */
- char fmt[61]; /* should be enough for %0width.precisionlld */
- Py_UCS4 maxchar = 127; /* result is ASCII by default */
- Py_UCS4 argmaxchar;
- Py_ssize_t numbersize = 0;
- char *numberresults = NULL;
- char *numberresult = NULL;
- Py_ssize_t i;
- int kind;
- void *data;
+ switch (*f) {
+ case 'c':
+ {
+ int ordinal = va_arg(*vargs, int);
+ if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
+ return NULL;
+ PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
+ writer->pos++;
+ break;
+ }
- Py_VA_COPY(count, vargs);
- /* step 1: count the number of %S/%R/%A/%s format specifications
- * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
- * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
- * result in an array)
- * also estimate a upper bound for all the number formats in the string,
- * numbers will be formatted in step 3 and be kept in a '\0'-separated
- * buffer before putting everything together. */
- for (f = format; *f; f++) {
- if (*f == '%') {
- int longlongflag;
- /* skip width or width.precision (eg. "1.2" of "%1.2f") */
- f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
- if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
- ++callcount;
+ case 'i':
+ case 'd':
+ case 'u':
+ case 'x':
+ {
+ /* used by sprintf */
+ char fmt[10]; /* should be enough for "%0lld\0" */
+ char small_buffer[MAX_LONG_CHARS];
+ char *buffer;
+ int err;
+
+ if (sizeof(small_buffer) - 1 < precision) {
+ buffer = PyMem_Malloc(precision + 1);
+ if (buffer == NULL) {
+ PyErr_NoMemory();
+ return NULL;
+ }
+ }
+ else
+ buffer = small_buffer;
+
+ if (*f == 'u') {
+ makefmt(fmt, longflag, longlongflag, size_tflag, *f);
- else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
+ if (longflag)
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
- if (longlongflag) {
- if (width < MAX_LONG_LONG_CHARS)
- width = MAX_LONG_LONG_CHARS;
- }
- else
+ else if (longlongflag)
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, unsigned PY_LONG_LONG));
+#endif
+ else if (size_tflag)
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, size_t));
+ else
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, unsigned int));
+ }
+ else if (*f == 'x') {
+ makefmt(fmt, 0, 0, 0, 'x');
+ len = sprintf(buffer, fmt, va_arg(*vargs, int));
+ }
+ else {
+ makefmt(fmt, longflag, longlongflag, size_tflag, *f);
+
+ if (longflag)
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, long));
+#ifdef HAVE_LONG_LONG
+ else if (longlongflag)
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, PY_LONG_LONG));
#endif
- /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
- including sign. Decimal takes the most space. This
- isn't enough for octal. If a width is specified we
- need more (which we allocate later). */
- if (width < MAX_LONG_CHARS)
- width = MAX_LONG_CHARS;
-
- /* account for the size + '\0' to separate numbers
- inside of the numberresults buffer */
- numbersize += (width + 1);
+ else if (size_tflag)
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, Py_ssize_t));
+ else
+ len = sprintf(buffer, fmt,
+ va_arg(*vargs, int));
+ }
+ assert(len >= 0);
+
+ err = 0;
+ if (precision < len)
+ precision = len;
+ if (width > precision) {
+ Py_UCS4 fillchar;
+ fill = width - precision;
+ fillchar = zeropad?'0':' ';
+ if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) != -1) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
+ err = 1;
}
+ else
+ err = 1;
+ if (!err)
+ writer->pos += fill;
+ }
+ if (!err && precision > len) {
+ fill = precision - len;
+ if (_PyUnicodeWriter_Prepare(writer, fill, '0') != -1) {
+ if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
+ err = 1;
+ }
+ else
+ err = 1;
+ if (!err)
+ writer->pos += fill;
+ }
+ if (!err) {
+ if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
+ err = 1;
}
- else if ((unsigned char)*f > 127) {
- PyErr_Format(PyExc_ValueError,
- "PyUnicode_FromFormatV() expects an ASCII-encoded format "
- "string, got a non-ASCII byte: 0x%02x",
- (unsigned char)*f);
+
+ if (buffer != small_buffer) {
+ PyMem_Free(buffer);
+ buffer = small_buffer;
+ }
+ if (err)
return NULL;
+
+ break;
+ }
+
+ case 'p':
+ {
+ char number[MAX_LONG_LONG_CHARS];
+
+ len = sprintf(number, "%p", va_arg(*vargs, void*));
+ assert(len >= 0);
+
+ /* %p is ill-defined: ensure leading 0x. */
+ if (number[1] == 'X')
+ number[1] = 'x';
+ else if (number[1] != 'x') {
+ memmove(number + 2, number,
+ strlen(number) + 1);
+ number[0] = '0';
+ number[1] = 'x';
+ len += 2;
}
+
+ if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
+ return NULL;
+ break;
}
- /* step 2: allocate memory for the results of
- * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
- if (callcount) {
- callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
- if (!callresults) {
- PyErr_NoMemory();
+
+ case 's':
+ {
+ /* UTF-8 */
+ const char *s = va_arg(*vargs, const char*);
+ PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
+ if (!str)
+ return NULL;
+ if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
+ Py_DECREF(str);
return NULL;
}
- callresult = callresults;
+ Py_DECREF(str);
+ break;
}
- /* step 2.5: allocate memory for the results of formating numbers */
- if (numbersize) {
- numberresults = PyObject_Malloc(numbersize);
- if (!numberresults) {
- PyErr_NoMemory();
- goto fail;
+
+ case 'U':
+ {
+ PyObject *obj = va_arg(*vargs, PyObject *);
+ assert(obj && _PyUnicode_CHECK(obj));
+
+ if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
+ return NULL;
+ break;
+ }
+
+ case 'V':
+ {
+ PyObject *obj = va_arg(*vargs, PyObject *);
+ const char *str = va_arg(*vargs, const char *);
+ PyObject *str_obj;
+ assert(obj || str);
+ if (obj) {
+ assert(_PyUnicode_CHECK(obj));
+ if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
+ return NULL;
}
- numberresult = numberresults;
+ else {
+ str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
+ if (!str_obj)
+ return NULL;
+ if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
+ Py_DECREF(str_obj);
+ return NULL;
+ }
+ Py_DECREF(str_obj);
+ }
+ break;
}
- /* step 3: format numbers and figure out how large a buffer we need */
- for (f = format; *f; f++) {
- if (*f == '%') {
- const char* p;
- int longflag;
- int longlongflag;
- int size_tflag;
- int numprinted;
+ case 'S':
+ {
+ PyObject *obj = va_arg(*vargs, PyObject *);
+ PyObject *str;
+ assert(obj);
+ str = PyObject_Str(obj);
+ if (!str)
+ return NULL;
+ if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
+ Py_DECREF(str);
+ return NULL;
+ }
+ Py_DECREF(str);
+ break;
+ }
- p = f;
- zeropad = (f[1] == '0');
- f = parse_format_flags(f, &width, &precision,
- &longflag, &longlongflag, &size_tflag);
- switch (*f) {
- case 'c':
- {
- Py_UCS4 ordinal = va_arg(count, int);
- maxchar = MAX_MAXCHAR(maxchar, ordinal);
- n++;
- break;
- }
- case '%':
- n++;
- break;
- case 'i':
- case 'd':
- makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
- width, precision, *f);
- if (longflag)
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, long));
-#ifdef HAVE_LONG_LONG
- else if (longlongflag)
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, PY_LONG_LONG));
-#endif
- else if (size_tflag)
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, Py_ssize_t));
- else
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, int));
- n += numprinted;
- /* advance by +1 to skip over the '\0' */
- numberresult += (numprinted + 1);
- assert(*(numberresult - 1) == '\0');
- assert(*(numberresult - 2) != '\0');
- assert(numprinted >= 0);
- assert(numberresult <= numberresults + numbersize);
- break;
- case 'u':
- makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
- width, precision, 'u');
- if (longflag)
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, unsigned long));
-#ifdef HAVE_LONG_LONG
- else if (longlongflag)
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, unsigned PY_LONG_LONG));
-#endif
- else if (size_tflag)
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, size_t));
- else
- numprinted = sprintf(numberresult, fmt,
- va_arg(count, unsigned int));
- n += numprinted;
- numberresult += (numprinted + 1);
- assert(*(numberresult - 1) == '\0');
- assert(*(numberresult - 2) != '\0');
- assert(numprinted >= 0);
- assert(numberresult <= numberresults + numbersize);
- break;
- case 'x':
- makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
- numprinted = sprintf(numberresult, fmt, va_arg(count, int));
- n += numprinted;
- numberresult += (numprinted + 1);
- assert(*(numberresult - 1) == '\0');
- assert(*(numberresult - 2) != '\0');
- assert(numprinted >= 0);
- assert(numberresult <= numberresults + numbersize);
- break;
- case 'p':
- numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
- /* %p is ill-defined: ensure leading 0x. */
- if (numberresult[1] == 'X')
- numberresult[1] = 'x';
- else if (numberresult[1] != 'x') {
- memmove(numberresult + 2, numberresult,
- strlen(numberresult) + 1);
- numberresult[0] = '0';
- numberresult[1] = 'x';
- numprinted += 2;
- }
- n += numprinted;
- numberresult += (numprinted + 1);
- assert(*(numberresult - 1) == '\0');
- assert(*(numberresult - 2) != '\0');
- assert(numprinted >= 0);
- assert(numberresult <= numberresults + numbersize);
- break;
- case 's':
- {
- /* UTF-8 */
- const char *s = va_arg(count, const char*);
- PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
- if (!str)
- goto fail;
- /* since PyUnicode_DecodeUTF8 returns already flexible
- unicode objects, there is no need to call ready on them */
- argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
- maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
- n += PyUnicode_GET_LENGTH(str);
- /* Remember the str and switch to the next slot */
- *callresult++ = str;
- break;
- }
- case 'U':
- {
- PyObject *obj = va_arg(count, PyObject *);
- assert(obj && _PyUnicode_CHECK(obj));
- if (PyUnicode_READY(obj) == -1)
- goto fail;
- argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
- maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
- n += PyUnicode_GET_LENGTH(obj);
- break;
- }
- case 'V':
- {
- PyObject *obj = va_arg(count, PyObject *);
- const char *str = va_arg(count, const char *);
- PyObject *str_obj;
- assert(obj || str);
- assert(!obj || _PyUnicode_CHECK(obj));
- if (obj) {
- if (PyUnicode_READY(obj) == -1)
- goto fail;
- argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
- maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
- n += PyUnicode_GET_LENGTH(obj);
- *callresult++ = NULL;
- }
- else {
- str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
- if (!str_obj)
- goto fail;
- if (PyUnicode_READY(str_obj) == -1) {
- Py_DECREF(str_obj);
- goto fail;
- }
- argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
- maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
- n += PyUnicode_GET_LENGTH(str_obj);
- *callresult++ = str_obj;
- }
- break;
- }
- case 'S':
- {
- PyObject *obj = va_arg(count, PyObject *);
- PyObject *str;
- assert(obj);
- str = PyObject_Str(obj);
- if (!str)
- goto fail;
- if (PyUnicode_READY(str) == -1) {
- Py_DECREF(str);
- goto fail;
- }
- argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
- maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
- n += PyUnicode_GET_LENGTH(str);
- /* Remember the str and switch to the next slot */
- *callresult++ = str;
- break;
- }
- case 'R':
- {
- PyObject *obj = va_arg(count, PyObject *);
- PyObject *repr;
- assert(obj);
- repr = PyObject_Repr(obj);
- if (!repr)
- goto fail;
- if (PyUnicode_READY(repr) == -1) {
- Py_DECREF(repr);
- goto fail;
- }
- argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
- maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
- n += PyUnicode_GET_LENGTH(repr);
- /* Remember the repr and switch to the next slot */
- *callresult++ = repr;
- break;
- }
- case 'A':
- {
- PyObject *obj = va_arg(count, PyObject *);
- PyObject *ascii;
- assert(obj);
- ascii = PyObject_ASCII(obj);
- if (!ascii)
- goto fail;
- if (PyUnicode_READY(ascii) == -1) {
- Py_DECREF(ascii);
- goto fail;
- }
- argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
- maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
- n += PyUnicode_GET_LENGTH(ascii);
- /* Remember the repr and switch to the next slot */
- *callresult++ = ascii;
- break;
- }
- default:
- /* if we stumble upon an unknown
- formatting code, copy the rest of
- the format string to the output
- string. (we cannot just skip the
- code, since there's no way to know
- what's in the argument list) */
- n += strlen(p);
- goto expand;
- }
- } else
- n++;
- }
- expand:
- /* step 4: fill the buffer */
- /* Since we've analyzed how much space we need,
- we don't have to resize the string.
- There can be no errors beyond this point. */
- string = PyUnicode_New(n, maxchar);
- if (!string)
- goto fail;
- kind = PyUnicode_KIND(string);
- data = PyUnicode_DATA(string);
- callresult = callresults;
- numberresult = numberresults;
+ case 'R':
+ {
+ PyObject *obj = va_arg(*vargs, PyObject *);
+ PyObject *repr;
+ assert(obj);
+ repr = PyObject_Repr(obj);
+ if (!repr)
+ return NULL;
+ if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
+ Py_DECREF(repr);
+ return NULL;
+ }
+ Py_DECREF(repr);
+ break;
+ }
+
+ case 'A':
+ {
+ PyObject *obj = va_arg(*vargs, PyObject *);
+ PyObject *ascii;
+ assert(obj);
+ ascii = PyObject_ASCII(obj);
+ if (!ascii)
+ return NULL;
+ if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
+ Py_DECREF(ascii);
+ return NULL;
+ }
+ Py_DECREF(ascii);
+ break;
+ }
- for (i = 0, f = format; *f; f++) {
+ case '%':
+ if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
+ return NULL;
+ PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
+ writer->pos++;
+ break;
+
+ default:
+ /* if we stumble upon an unknown formatting code, copy the rest
+ of the format string to the output string. (we cannot just
+ skip the code, since there's no way to know what's in the
+ argument list) */
+ len = strlen(p);
+ if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
+ return NULL;
+ f = p+len;
+ return f;
+ }
+
+ f++;
+ return f;
+}
+
+PyObject *
+PyUnicode_FromFormatV(const char *format, va_list vargs)
+{
+ va_list vargs2;
+ const char *f;
+ _PyUnicodeWriter writer;
+
+ _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
+
+ /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
+ Copy it to be able to pass a reference to a subfunction. */
+ Py_VA_COPY(vargs2, vargs);
+
+ for (f = format; *f; ) {
if (*f == '%') {
- const char* p;
+ f = unicode_fromformat_arg(&writer, f, &vargs2);
+ if (f == NULL)
+ goto fail;
+ }
+ else {
+ const char *p;
+ Py_ssize_t len;
p = f;
- f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
- /* checking for == because the last argument could be a empty
- string, which causes i to point to end, the assert at the end of
- the loop */
- assert(i <= PyUnicode_GET_LENGTH(string));
-
- switch (*f) {
- case 'c':
- {
- const int ordinal = va_arg(vargs, int);
- PyUnicode_WRITE(kind, data, i++, ordinal);
- break;
- }
- case 'i':
- case 'd':
- case 'u':
- case 'x':
- case 'p':
- {
- Py_ssize_t len;
- /* unused, since we already have the result */
- if (*f == 'p')
- (void) va_arg(vargs, void *);
- else
- (void) va_arg(vargs, int);
- /* extract the result from numberresults and append. */
- len = strlen(numberresult);
- unicode_write_cstr(string, i, numberresult, len);
- /* skip over the separating '\0' */
- i += len;
- numberresult += len;
- assert(*numberresult == '\0');
- numberresult++;
- assert(numberresult <= numberresults + numbersize);
- break;
- }
- case 's':
- {
- /* unused, since we already have the result */
- Py_ssize_t size;
- (void) va_arg(vargs, char *);
- size = PyUnicode_GET_LENGTH(*callresult);
- assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
- _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
- i += size;
- /* We're done with the unicode()/repr() => forget it */
- Py_DECREF(*callresult);
- /* switch to next unicode()/repr() result */
- ++callresult;
- break;
- }
- case 'U':
- {
- PyObject *obj = va_arg(vargs, PyObject *);
- Py_ssize_t size;
- assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
- size = PyUnicode_GET_LENGTH(obj);
- _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
- i += size;
- break;
- }
- case 'V':
+ do
{
- Py_ssize_t size;
- PyObject *obj = va_arg(vargs, PyObject *);
- va_arg(vargs, const char *);
- if (obj) {
- size = PyUnicode_GET_LENGTH(obj);
- assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
- _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
- i += size;
- } else {
- size = PyUnicode_GET_LENGTH(*callresult);
- assert(PyUnicode_KIND(*callresult) <=
- PyUnicode_KIND(string));
- _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
- i += size;
- Py_DECREF(*callresult);
+ if ((unsigned char)*p > 127) {
+ PyErr_Format(PyExc_ValueError,
+ "PyUnicode_FromFormatV() expects an ASCII-encoded format "
+ "string, got a non-ASCII byte: 0x%02x",
+ (unsigned char)*p);
+ return NULL;
}
- ++callresult;
- break;
- }
- case 'S':
- case 'R':
- case 'A':
- {
- Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
- /* unused, since we already have the result */
- (void) va_arg(vargs, PyObject *);
- assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
- _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
- i += size;
- /* We're done with the unicode()/repr() => forget it */
- Py_DECREF(*callresult);
- /* switch to next unicode()/repr() result */
- ++callresult;
- break;
+ p++;
}
- case '%':
- PyUnicode_WRITE(kind, data, i++, '%');
- break;
- default:
- {
- Py_ssize_t len = strlen(p);
- unicode_write_cstr(string, i, p, len);
- i += len;
- assert(i == PyUnicode_GET_LENGTH(string));
- goto end;
- }
- }
- }
- else {
- assert(i < PyUnicode_GET_LENGTH(string));
- PyUnicode_WRITE(kind, data, i++, *f);
+ while (*p != '\0' && *p != '%');
+ len = p - f;
+
+ if (*p == '\0')
+ writer.overallocate = 0;
+ if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
+ goto fail;
+ unicode_write_cstr(writer.buffer, writer.pos, f, len);
+ writer.pos += len;
+
+ f = p;
}
}
- assert(i == PyUnicode_GET_LENGTH(string));
+ return _PyUnicodeWriter_Finish(&writer);
- end:
- if (callresults)
- PyObject_Free(callresults);
- if (numberresults)
- PyObject_Free(numberresults);
- return unicode_result(string);
fail:
- if (callresults) {
- PyObject **callresult2 = callresults;
- while (callresult2 < callresult) {
- Py_XDECREF(*callresult2);
- ++callresult2;
- }
- PyObject_Free(callresults);
- }
- if (numberresults)
- PyObject_Free(numberresults);
+ _PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
@@ -12962,6 +12797,19 @@ _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
return 0;
}
+int
+_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
+{
+ Py_UCS4 maxchar;
+
+ maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
+ if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
+ return -1;
+ unicode_write_cstr(writer->buffer, writer->pos, str, len);
+ writer->pos += len;
+ return 0;
+}
+
PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
{