diff options
author | Eric Smith <eric@trueblade.com> | 2008-02-17 19:46:49 (GMT) |
---|---|---|
committer | Eric Smith <eric@trueblade.com> | 2008-02-17 19:46:49 (GMT) |
commit | a9f7d6248032c9572b4d2024a1be8bd2823af09f (patch) | |
tree | 5465a1051312055678248db0076d314924ee4ebc /Objects/stringlib | |
parent | e139688d34cc12b23d3a310f10d4f440f75f7d08 (diff) | |
download | cpython-a9f7d6248032c9572b4d2024a1be8bd2823af09f.zip cpython-a9f7d6248032c9572b4d2024a1be8bd2823af09f.tar.gz cpython-a9f7d6248032c9572b4d2024a1be8bd2823af09f.tar.bz2 |
Backport of PEP 3101, Advanced String Formatting, from py3k.
Highlights:
- Adding PyObject_Format.
- Adding string.Format class.
- Adding __format__ for str, unicode, int, long, float, datetime.
- Adding builtin format.
- Adding ''.format and u''.format.
- str/unicode fixups for formatters.
The files in Objects/stringlib that implement PEP 3101 (stringdefs.h,
unicodedefs.h, formatter.h, string_format.h) are identical in trunk
and py3k. Any changes from here on should be made to trunk, and
changes will propagate to py3k.
Diffstat (limited to 'Objects/stringlib')
-rw-r--r-- | Objects/stringlib/formatter.h | 980 | ||||
-rw-r--r-- | Objects/stringlib/string_format.h | 1214 | ||||
-rw-r--r-- | Objects/stringlib/stringdefs.h | 27 | ||||
-rw-r--r-- | Objects/stringlib/unicodedefs.h | 52 |
4 files changed, 2273 insertions, 0 deletions
diff --git a/Objects/stringlib/formatter.h b/Objects/stringlib/formatter.h new file mode 100644 index 0000000..39da6b3 --- /dev/null +++ b/Objects/stringlib/formatter.h @@ -0,0 +1,980 @@ +/* implements the string, long, and float formatters. that is, + string.__format__, etc. */ + +/* Before including this, you must include either: + stringlib/unicodedefs.h + stringlib/stringdefs.h + + Also, you should define the names: + FORMAT_STRING + FORMAT_LONG + FORMAT_FLOAT + to be whatever you want the public names of these functions to + be. These are the only non-static functions defined here. +*/ + +#define ALLOW_PARENS_FOR_SIGN 0 + +/* + get_integer consumes 0 or more decimal digit characters from an + input string, updates *result with the corresponding positive + integer, and returns the number of digits consumed. + + returns -1 on error. +*/ +static int +get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end, + Py_ssize_t *result) +{ + Py_ssize_t accumulator, digitval, oldaccumulator; + int numdigits; + accumulator = numdigits = 0; + for (;;(*ptr)++, numdigits++) { + if (*ptr >= end) + break; + digitval = STRINGLIB_TODECIMAL(**ptr); + if (digitval < 0) + break; + /* + This trick was copied from old Unicode format code. It's cute, + but would really suck on an old machine with a slow divide + implementation. Fortunately, in the normal case we do not + expect too many digits. 
+ */ + oldaccumulator = accumulator; + accumulator *= 10; + if ((accumulator+10)/10 != oldaccumulator+1) { + PyErr_Format(PyExc_ValueError, + "Too many decimal digits in format string"); + return -1; + } + accumulator += digitval; + } + *result = accumulator; + return numdigits; +} + +/************************************************************************/ +/*********** standard format specifier parsing **************************/ +/************************************************************************/ + +/* returns true if this character is a specifier alignment token */ +Py_LOCAL_INLINE(int) +is_alignment_token(STRINGLIB_CHAR c) +{ + switch (c) { + case '<': case '>': case '=': case '^': + return 1; + default: + return 0; + } +} + +/* returns true if this character is a sign element */ +Py_LOCAL_INLINE(int) +is_sign_element(STRINGLIB_CHAR c) +{ + switch (c) { + case ' ': case '+': case '-': +#if ALLOW_PARENS_FOR_SIGN + case '(': +#endif + return 1; + default: + return 0; + } +} + + +typedef struct { + STRINGLIB_CHAR fill_char; + STRINGLIB_CHAR align; + STRINGLIB_CHAR sign; + Py_ssize_t width; + Py_ssize_t precision; + STRINGLIB_CHAR type; +} InternalFormatSpec; + +/* + ptr points to the start of the format_spec, end points just past its end. + fills in format with the parsed information. + returns 1 on success, 0 on failure. 
+ if failure, sets the exception +*/ +static int +parse_internal_render_format_spec(PyObject *format_spec, + InternalFormatSpec *format, + char default_type) +{ + STRINGLIB_CHAR *ptr = STRINGLIB_STR(format_spec); + STRINGLIB_CHAR *end = ptr + STRINGLIB_LEN(format_spec); + + /* end-ptr is used throughout this code to specify the length of + the input string */ + + Py_ssize_t specified_width; + + format->fill_char = '\0'; + format->align = '\0'; + format->sign = '\0'; + format->width = -1; + format->precision = -1; + format->type = default_type; + + /* If the second char is an alignment token, + then parse the fill char */ + if (end-ptr >= 2 && is_alignment_token(ptr[1])) { + format->align = ptr[1]; + format->fill_char = ptr[0]; + ptr += 2; + } + else if (end-ptr >= 1 && is_alignment_token(ptr[0])) { + format->align = ptr[0]; + ptr++; + } + + /* Parse the various sign options */ + if (end-ptr >= 1 && is_sign_element(ptr[0])) { + format->sign = ptr[0]; + ptr++; +#if ALLOW_PARENS_FOR_SIGN + if (end-ptr >= 1 && ptr[0] == ')') { + ptr++; + } +#endif + } + + /* The special case for 0-padding (backwards compat) */ + if (format->fill_char == '\0' && end-ptr >= 1 && ptr[0] == '0') { + format->fill_char = '0'; + if (format->align == '\0') { + format->align = '='; + } + ptr++; + } + + /* XXX add error checking */ + specified_width = get_integer(&ptr, end, &format->width); + + /* if specified_width is 0, we didn't consume any characters for + the width. 
in that case, reset the width to -1, because + get_integer() will have set it to zero */ + if (specified_width == 0) { + format->width = -1; + } + + /* Parse field precision */ + if (end-ptr && ptr[0] == '.') { + ptr++; + + /* XXX add error checking */ + specified_width = get_integer(&ptr, end, &format->precision); + + /* not having a precision after a dot is an error */ + if (specified_width == 0) { + PyErr_Format(PyExc_ValueError, + "Format specifier missing precision"); + return 0; + } + + } + + /* Finally, parse the type field */ + + if (end-ptr > 1) { + /* invalid conversion spec */ + PyErr_Format(PyExc_ValueError, "Invalid conversion specification"); + return 0; + } + + if (end-ptr == 1) { + format->type = ptr[0]; + ptr++; + } + + return 1; +} + +#if defined FORMAT_FLOAT || defined FORMAT_LONG +/************************************************************************/ +/*********** common routines for numeric formatting *********************/ +/************************************************************************/ + +/* describes the layout for an integer, see the comment in + _calc_integer_widths() for details */ +typedef struct { + Py_ssize_t n_lpadding; + Py_ssize_t n_spadding; + Py_ssize_t n_rpadding; + char lsign; + Py_ssize_t n_lsign; + char rsign; + Py_ssize_t n_rsign; + Py_ssize_t n_total; /* just a convenience, it's derivable from the + other fields */ +} NumberFieldWidths; + +/* not all fields of format are used. for example, precision is + unused. should this take discrete params in order to be more clear + about what it does? or is passing a single format parameter easier + and more efficient enough to justify a little obfuscation? 
*/ +static void +calc_number_widths(NumberFieldWidths *r, STRINGLIB_CHAR actual_sign, + Py_ssize_t n_digits, const InternalFormatSpec *format) +{ + r->n_lpadding = 0; + r->n_spadding = 0; + r->n_rpadding = 0; + r->lsign = '\0'; + r->n_lsign = 0; + r->rsign = '\0'; + r->n_rsign = 0; + + /* the output will look like: + | | + | <lpadding> <lsign> <spadding> <digits> <rsign> <rpadding> | + | | + + lsign and rsign are computed from format->sign and the actual + sign of the number + + digits is already known + + the total width is either given, or computed from the + actual digits + + only one of lpadding, spadding, and rpadding can be non-zero, + and it's calculated from the width and other fields + */ + + /* compute the various parts we're going to write */ + if (format->sign == '+') { + /* always put a + or - */ + r->n_lsign = 1; + r->lsign = (actual_sign == '-' ? '-' : '+'); + } +#if ALLOW_PARENS_FOR_SIGN + else if (format->sign == '(') { + if (actual_sign == '-') { + r->n_lsign = 1; + r->lsign = '('; + r->n_rsign = 1; + r->rsign = ')'; + } + } +#endif + else if (format->sign == ' ') { + r->n_lsign = 1; + r->lsign = (actual_sign == '-' ? 
'-' : ' '); + } + else { + /* non specified, or the default (-) */ + if (actual_sign == '-') { + r->n_lsign = 1; + r->lsign = '-'; + } + } + + /* now the number of padding characters */ + if (format->width == -1) { + /* no padding at all, nothing to do */ + } + else { + /* see if any padding is needed */ + if (r->n_lsign + n_digits + r->n_rsign >= format->width) { + /* no padding needed, we're already bigger than the + requested width */ + } + else { + /* determine which of left, space, or right padding is + needed */ + Py_ssize_t padding = format->width - + (r->n_lsign + n_digits + r->n_rsign); + if (format->align == '<') + r->n_rpadding = padding; + else if (format->align == '>') + r->n_lpadding = padding; + else if (format->align == '^') { + r->n_lpadding = padding / 2; + r->n_rpadding = padding - r->n_lpadding; + } + else if (format->align == '=') + r->n_spadding = padding; + else + r->n_lpadding = padding; + } + } + r->n_total = r->n_lpadding + r->n_lsign + r->n_spadding + + n_digits + r->n_rsign + r->n_rpadding; +} + +/* fill in the non-digit parts of a numbers's string representation, + as determined in _calc_integer_widths(). returns the pointer to + where the digits go. 
*/ +static STRINGLIB_CHAR * +fill_number(STRINGLIB_CHAR *p_buf, const NumberFieldWidths *spec, + Py_ssize_t n_digits, STRINGLIB_CHAR fill_char) +{ + STRINGLIB_CHAR* p_digits; + + if (spec->n_lpadding) { + STRINGLIB_FILL(p_buf, fill_char, spec->n_lpadding); + p_buf += spec->n_lpadding; + } + if (spec->n_lsign == 1) { + *p_buf++ = spec->lsign; + } + if (spec->n_spadding) { + STRINGLIB_FILL(p_buf, fill_char, spec->n_spadding); + p_buf += spec->n_spadding; + } + p_digits = p_buf; + p_buf += n_digits; + if (spec->n_rsign == 1) { + *p_buf++ = spec->rsign; + } + if (spec->n_rpadding) { + STRINGLIB_FILL(p_buf, fill_char, spec->n_rpadding); + p_buf += spec->n_rpadding; + } + return p_digits; +} +#endif /* FORMAT_FLOAT || FORMAT_LONG */ + +/************************************************************************/ +/*********** string formatting ******************************************/ +/************************************************************************/ + +static PyObject * +format_string_internal(PyObject *value, const InternalFormatSpec *format) +{ + Py_ssize_t width; /* total field width */ + Py_ssize_t lpad; + STRINGLIB_CHAR *dst; + STRINGLIB_CHAR *src = STRINGLIB_STR(value); + Py_ssize_t len = STRINGLIB_LEN(value); + PyObject *result = NULL; + + /* sign is not allowed on strings */ + if (format->sign != '\0') { + PyErr_SetString(PyExc_ValueError, + "Sign not allowed in string format specifier"); + goto done; + } + + /* '=' alignment not allowed on strings */ + if (format->align == '=') { + PyErr_SetString(PyExc_ValueError, + "'=' alignment not allowed " + "in string format specifier"); + goto done; + } + + /* if precision is specified, output no more that format.precision + characters */ + if (format->precision >= 0 && len >= format->precision) { + len = format->precision; + } + + if (format->width >= 0) { + width = format->width; + + /* but use at least len characters */ + if (len > width) { + width = len; + } + } + else { + /* not specified, use all of the 
chars and no more */ + width = len; + } + + /* allocate the resulting string */ + result = STRINGLIB_NEW(NULL, width); + if (result == NULL) + goto done; + + /* now write into that space */ + dst = STRINGLIB_STR(result); + + /* figure out how much leading space we need, based on the + aligning */ + if (format->align == '>') + lpad = width - len; + else if (format->align == '^') + lpad = (width - len) / 2; + else + lpad = 0; + + /* if right aligning, increment the destination allow space on the + left */ + memcpy(dst + lpad, src, len * sizeof(STRINGLIB_CHAR)); + + /* do any padding */ + if (width > len) { + STRINGLIB_CHAR fill_char = format->fill_char; + if (fill_char == '\0') { + /* use the default, if not specified */ + fill_char = ' '; + } + + /* pad on left */ + if (lpad) + STRINGLIB_FILL(dst, fill_char, lpad); + + /* pad on right */ + if (width - len - lpad) + STRINGLIB_FILL(dst + len + lpad, fill_char, width - len - lpad); + } + +done: + return result; +} + + +/************************************************************************/ +/*********** long formatting ********************************************/ +/************************************************************************/ + +#if defined FORMAT_LONG || defined FORMAT_INT +typedef PyObject* +(*IntOrLongToString)(PyObject *value, int base); + +static PyObject * +format_int_or_long_internal(PyObject *value, const InternalFormatSpec *format, + IntOrLongToString tostring) +{ + PyObject *result = NULL; + PyObject *tmp = NULL; + STRINGLIB_CHAR *pnumeric_chars; + STRINGLIB_CHAR numeric_char; + STRINGLIB_CHAR sign = '\0'; + STRINGLIB_CHAR *p; + Py_ssize_t n_digits; /* count of digits need from the computed + string */ + Py_ssize_t n_leading_chars; + NumberFieldWidths spec; + long x; + + /* no precision allowed on integers */ + if (format->precision != -1) { + PyErr_SetString(PyExc_ValueError, + "Precision not allowed in integer format specifier"); + goto done; + } + + + /* special case for character 
formatting */ + if (format->type == 'c') { + /* error to specify a sign */ + if (format->sign != '\0') { + PyErr_SetString(PyExc_ValueError, + "Sign not allowed with integer" + " format specifier 'c'"); + goto done; + } + + /* taken from unicodeobject.c formatchar() */ + /* Integer input truncated to a character */ +/* XXX: won't work for int */ + x = PyLong_AsLong(value); + if (x == -1 && PyErr_Occurred()) + goto done; +#ifdef Py_UNICODE_WIDE + if (x < 0 || x > 0x10ffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x110000) " + "(wide Python build)"); + goto done; + } +#else + if (x < 0 || x > 0xffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x10000) " + "(narrow Python build)"); + goto done; + } +#endif + numeric_char = (STRINGLIB_CHAR)x; + pnumeric_chars = &numeric_char; + n_digits = 1; + } + else { + int base; + int leading_chars_to_skip; /* Number of characters added by + PyNumber_ToBase that we want to + skip over. */ + + /* Compute the base and how many characters will be added by + PyNumber_ToBase */ + switch (format->type) { + case 'b': + base = 2; + leading_chars_to_skip = 2; /* 0b */ + break; + case 'o': + base = 8; + leading_chars_to_skip = 2; /* 0o */ + break; + case 'x': + case 'X': + base = 16; + leading_chars_to_skip = 2; /* 0x */ + break; + default: /* shouldn't be needed, but stops a compiler warning */ + case 'd': + base = 10; + leading_chars_to_skip = 0; + break; + } + + /* Do the hard part, converting to a string in a given base */ + tmp = tostring(value, base); + if (tmp == NULL) + goto done; + + pnumeric_chars = STRINGLIB_STR(tmp); + n_digits = STRINGLIB_LEN(tmp); + + /* Remember not to modify what pnumeric_chars points to. it + might be interned. Only modify it after we copy it into a + newly allocated output buffer. */ + + /* Is a sign character present in the output? 
If so, remember it + and skip it */ + sign = pnumeric_chars[0]; + if (sign == '-') { + ++leading_chars_to_skip; + } + + /* Skip over the leading chars (0x, 0b, etc.) */ + n_digits -= leading_chars_to_skip; + pnumeric_chars += leading_chars_to_skip; + } + + /* Calculate the widths of the various leading and trailing parts */ + calc_number_widths(&spec, sign, n_digits, format); + + /* Allocate a new string to hold the result */ + result = STRINGLIB_NEW(NULL, spec.n_total); + if (!result) + goto done; + p = STRINGLIB_STR(result); + + /* Fill in the digit parts */ + n_leading_chars = spec.n_lpadding + spec.n_lsign + spec.n_spadding; + memmove(p + n_leading_chars, + pnumeric_chars, + n_digits * sizeof(STRINGLIB_CHAR)); + + /* if X, convert to uppercase */ + if (format->type == 'X') { + Py_ssize_t t; + for (t = 0; t < n_digits; t++) + p[t + n_leading_chars] = STRINGLIB_TOUPPER(p[t + n_leading_chars]); + } + + /* Fill in the non-digit parts */ + fill_number(p, &spec, n_digits, + format->fill_char == '\0' ? ' ' : format->fill_char); + +done: + Py_XDECREF(tmp); + return result; +} +#endif /* defined FORMAT_LONG || defined FORMAT_INT */ + +/************************************************************************/ +/*********** float formatting *******************************************/ +/************************************************************************/ + +#ifdef FORMAT_FLOAT +#if STRINGLIB_IS_UNICODE +/* taken from unicodeobject.c */ +static Py_ssize_t +strtounicode(Py_UNICODE *buffer, const char *charbuffer) +{ + register Py_ssize_t i; + Py_ssize_t len = strlen(charbuffer); + for (i = len - 1; i >= 0; i--) + buffer[i] = (Py_UNICODE) charbuffer[i]; + + return len; +} +#endif + +/* the callback function to call to do the actual float formatting. 
+ it matches the definition of PyOS_ascii_formatd */ +typedef char* +(*DoubleSnprintfFunction)(char *buffer, size_t buf_len, + const char *format, double d); + +/* just a wrapper to make PyOS_snprintf look like DoubleSnprintfFunction */ +static char* +snprintf_double(char *buffer, size_t buf_len, const char *format, double d) +{ + PyOS_snprintf(buffer, buf_len, format, d); + return NULL; +} + +/* see FORMATBUFLEN in unicodeobject.c */ +#define FLOAT_FORMATBUFLEN 120 + +/* much of this is taken from unicodeobject.c */ +/* use type instead of format->type, so that it can be overridden by + format_number() */ +static PyObject * +_format_float(STRINGLIB_CHAR type, PyObject *value, + const InternalFormatSpec *format, + DoubleSnprintfFunction snprintf) +{ + /* fmt = '%.' + `prec` + `type` + '%%' + worst case length = 2 + 10 (len of INT_MAX) + 1 + 2 = 15 (use 20)*/ + char fmt[20]; + + /* taken from unicodeobject.c */ + /* Worst case length calc to ensure no buffer overrun: + + 'g' formats: + fmt = %#.<prec>g + buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp + for any double rep.) + len = 1 + prec + 1 + 2 + 5 = 9 + prec + + 'f' formats: + buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) + len = 1 + 50 + 1 + prec = 52 + prec + + If prec=0 the effective precision is 1 (the leading digit is + always given), therefore increase the length by one. + + */ + char charbuf[FLOAT_FORMATBUFLEN]; + Py_ssize_t n_digits; + double x; + Py_ssize_t precision = format->precision; + PyObject *result = NULL; + STRINGLIB_CHAR sign; + char* trailing = ""; + STRINGLIB_CHAR *p; + NumberFieldWidths spec; + +#if STRINGLIB_IS_UNICODE + Py_UNICODE unicodebuf[FLOAT_FORMATBUFLEN]; +#endif + + /* first, do the conversion as 8-bit chars, using the platform's + snprintf. then, if needed, convert to unicode. 
*/ + + /* 'F' is the same as 'f', per the PEP */ + if (type == 'F') + type = 'f'; + + x = PyFloat_AsDouble(value); + + if (x == -1.0 && PyErr_Occurred()) + goto done; + + if (type == '%') { + type = 'f'; + x *= 100; + trailing = "%"; + } + + if (precision < 0) + precision = 6; + if (type == 'f' && (fabs(x) / 1e25) >= 1e25) + type = 'g'; + + /* cast "type", because if we're in unicode we need to pass a + 8-bit char. this is safe, because we've restricted what "type" + can be */ + PyOS_snprintf(fmt, sizeof(fmt), "%%.%" PY_FORMAT_SIZE_T "d%c", precision, + (char)type); + + /* call the passed in function to do the actual formatting */ + snprintf(charbuf, sizeof(charbuf), fmt, x); + + /* adding trailing to fmt with PyOS_snprintf doesn't work, not + sure why. we'll just concatentate it here, no harm done. we + know we can't have a buffer overflow from the fmt size + analysis */ + strcat(charbuf, trailing); + + /* rather than duplicate the code for snprintf for both unicode + and 8 bit strings, we just use the 8 bit version and then + convert to unicode in a separate code path. that's probably + the lesser of 2 evils. */ +#if STRINGLIB_IS_UNICODE + n_digits = strtounicode(unicodebuf, charbuf); + p = unicodebuf; +#else + /* compute the length. I believe this is done because the return + value from snprintf above is unreliable */ + n_digits = strlen(charbuf); + p = charbuf; +#endif + + /* is a sign character present in the output? if so, remember it + and skip it */ + sign = p[0]; + if (sign == '-') { + p++; + n_digits--; + } + + calc_number_widths(&spec, sign, n_digits, format); + + /* allocate a string with enough space */ + result = STRINGLIB_NEW(NULL, spec.n_total); + if (result == NULL) + goto done; + + /* fill in the non-digit parts */ + fill_number(STRINGLIB_STR(result), &spec, n_digits, + format->fill_char == '\0' ? 
' ' : format->fill_char); + + /* fill in the digit parts */ + memmove(STRINGLIB_STR(result) + + (spec.n_lpadding + spec.n_lsign + spec.n_spadding), + p, + n_digits * sizeof(STRINGLIB_CHAR)); + +done: + return result; +} + +static PyObject * +format_float_internal(PyObject *value, const InternalFormatSpec *format) +{ + if (format->type == 'n') + return _format_float('f', value, format, snprintf_double); + else + return _format_float(format->type, value, format, PyOS_ascii_formatd); +} +#endif /* FORMAT_FLOAT */ + +/************************************************************************/ +/*********** built in formatters ****************************************/ +/************************************************************************/ +#ifdef FORMAT_STRING +PyObject * +FORMAT_STRING(PyObject* value, PyObject* args) +{ + PyObject *format_spec; + PyObject *result = NULL; +#if PY_VERSION_HEX < 0x03000000 + PyObject *tmp = NULL; +#endif + InternalFormatSpec format; + + /* If 2.x, we accept either str or unicode, and try to convert it + to the right type. 
In 3.x, we insist on only unicode */ +#if PY_VERSION_HEX >= 0x03000000 + if (!PyArg_ParseTuple(args, STRINGLIB_PARSE_CODE ":__format__", + &format_spec)) + goto done; +#else + /* If 2.x, convert format_spec to the same type as value */ + /* This is to allow things like u''.format('') */ + if (!PyArg_ParseTuple(args, "O:__format__", &format_spec)) + goto done; + if (!(PyString_Check(format_spec) || PyUnicode_Check(format_spec))) { + PyErr_Format(PyExc_TypeError, "__format__ arg must be str " + "or unicode, not %s", Py_TYPE(format_spec)->tp_name); + goto done; + } + tmp = STRINGLIB_TOSTR(format_spec); + if (tmp == NULL) + goto done; + format_spec = tmp; +#endif + + /* check for the special case of zero length format spec, make + it equivalent to str(value) */ + if (STRINGLIB_LEN(format_spec) == 0) { + result = STRINGLIB_TOSTR(value); + goto done; + } + + + /* parse the format_spec */ + if (!parse_internal_render_format_spec(format_spec, &format, 's')) + goto done; + + /* type conversion? */ + switch (format.type) { + case 's': + /* no type conversion needed, already a string. 
do the formatting */ + result = format_string_internal(value, &format); + break; + default: + /* unknown */ + PyErr_Format(PyExc_ValueError, "Unknown conversion type %c", + format.type); + goto done; + } + +done: +#if PY_VERSION_HEX < 0x03000000 + Py_XDECREF(tmp); +#endif + return result; +} +#endif /* FORMAT_STRING */ + +#if defined FORMAT_LONG || defined FORMAT_INT +static PyObject* +format_int_or_long(PyObject* value, PyObject* args, IntOrLongToString tostring) +{ + PyObject *format_spec; + PyObject *result = NULL; + PyObject *tmp = NULL; + InternalFormatSpec format; + + if (!PyArg_ParseTuple(args, STRINGLIB_PARSE_CODE ":__format__", + &format_spec)) + goto done; + + /* check for the special case of zero length format spec, make + it equivalent to str(value) */ + if (STRINGLIB_LEN(format_spec) == 0) { + result = STRINGLIB_TOSTR(value); + goto done; + } + + /* parse the format_spec */ + if (!parse_internal_render_format_spec(format_spec, &format, 'd')) + goto done; + + /* type conversion? */ + switch (format.type) { + case 'b': + case 'c': + case 'd': + case 'o': + case 'x': + case 'X': + /* no type conversion needed, already an int (or long). do + the formatting */ + result = format_int_or_long_internal(value, &format, tostring); + break; + + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + case 'n': + case '%': + /* convert to float */ + tmp = PyNumber_Float(value); + if (tmp == NULL) + goto done; + result = format_float_internal(value, &format); + break; + + default: + /* unknown */ + PyErr_Format(PyExc_ValueError, "Unknown conversion type %c", + format.type); + goto done; + } + +done: + Py_XDECREF(tmp); + return result; +} +#endif /* FORMAT_LONG || defined FORMAT_INT */ + +#ifdef FORMAT_LONG +/* Need to define long_format as a function that will convert a long + to a string. In 3.0, _PyLong_Format has the correct signature. 
In + 2.x, we need to fudge a few parameters */ +#if PY_VERSION_HEX >= 0x03000000 +#define long_format _PyLong_Format +#else +static PyObject* +long_format(PyObject* value, int base) +{ + /* Convert to base, don't add trailing 'L', and use the new octal + format. We already know this is a long object */ + assert(PyLong_Check(value)); + /* convert to base, don't add 'L', and use the new octal format */ + return _PyLong_Format(value, base, 0, 1); +} +#endif + +PyObject * +FORMAT_LONG(PyObject* value, PyObject* args) +{ + return format_int_or_long(value, args, long_format); +} +#endif /* FORMAT_LONG */ + +#ifdef FORMAT_INT +/* this is only used for 2.x, not 3.0 */ +static PyObject* +int_format(PyObject* value, int base) +{ + /* Convert to base, and use the new octal format. We already + know this is an int object */ + assert(PyInt_Check(value)); + return _PyInt_Format((PyIntObject*)value, base, 1); +} + +PyObject * +FORMAT_INT(PyObject* value, PyObject* args) +{ + return format_int_or_long(value, args, int_format); +} +#endif /* FORMAT_INT */ + +#ifdef FORMAT_FLOAT +PyObject * +FORMAT_FLOAT(PyObject *value, PyObject *args) +{ + PyObject *format_spec; + PyObject *result = NULL; + InternalFormatSpec format; + + if (!PyArg_ParseTuple(args, STRINGLIB_PARSE_CODE ":__format__", &format_spec)) + goto done; + + /* check for the special case of zero length format spec, make + it equivalent to str(value) */ + if (STRINGLIB_LEN(format_spec) == 0) { + result = STRINGLIB_TOSTR(value); + goto done; + } + + /* parse the format_spec */ + if (!parse_internal_render_format_spec(format_spec, &format, 'g')) + goto done; + + /* type conversion? */ + switch (format.type) { + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + case 'n': + case '%': + /* no conversion, already a float. 
do the formatting */ + result = format_float_internal(value, &format); + break; + + default: + /* unknown */ + PyErr_Format(PyExc_ValueError, "Unknown conversion type %c", + format.type); + goto done; + } + +done: + return result; +} +#endif /* FORMAT_FLOAT */ diff --git a/Objects/stringlib/string_format.h b/Objects/stringlib/string_format.h new file mode 100644 index 0000000..70f8f13 --- /dev/null +++ b/Objects/stringlib/string_format.h @@ -0,0 +1,1214 @@ +/* + string_format.h -- implementation of string.format(). + + It uses the Objects/stringlib conventions, so that it can be + compiled for both unicode and string objects. +*/ + + +/* Defines for Python 2.6 compatability */ +#if PY_VERSION_HEX < 0x03000000 +#define PyLong_FromSsize_t _PyLong_FromSsize_t +#endif + +/* Defines for more efficiently reallocating the string buffer */ +#define INITIAL_SIZE_INCREMENT 100 +#define SIZE_MULTIPLIER 2 +#define MAX_SIZE_INCREMENT 3200 + + +/************************************************************************/ +/*********** Global data structures and forward declarations *********/ +/************************************************************************/ + +/* + A SubString consists of the characters between two string or + unicode pointers. +*/ +typedef struct { + STRINGLIB_CHAR *ptr; + STRINGLIB_CHAR *end; +} SubString; + + +/* forward declaration for recursion */ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, + int recursion_depth); + + + +/************************************************************************/ +/************************** Utility functions ************************/ +/************************************************************************/ + +/* fill in a SubString from a pointer and length */ +Py_LOCAL_INLINE(void) +SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len) +{ + str->ptr = p; + if (p == NULL) + str->end = NULL; + else + str->end = str->ptr + len; +} + +/* return a new string. 
if str->ptr is NULL, return None */ +Py_LOCAL_INLINE(PyObject *) +SubString_new_object(SubString *str) +{ + if (str->ptr == NULL) { + Py_INCREF(Py_None); + return Py_None; + } + return STRINGLIB_NEW(str->ptr, str->end - str->ptr); +} + +/* return a new string. if str->ptr is NULL, return None */ +Py_LOCAL_INLINE(PyObject *) +SubString_new_object_or_empty(SubString *str) +{ + if (str->ptr == NULL) { + return STRINGLIB_NEW(NULL, 0); + } + return STRINGLIB_NEW(str->ptr, str->end - str->ptr); +} + +/************************************************************************/ +/*********** Output string management functions ****************/ +/************************************************************************/ + +typedef struct { + STRINGLIB_CHAR *ptr; + STRINGLIB_CHAR *end; + PyObject *obj; + Py_ssize_t size_increment; +} OutputString; + +/* initialize an OutputString object, reserving size characters */ +static int +output_initialize(OutputString *output, Py_ssize_t size) +{ + output->obj = STRINGLIB_NEW(NULL, size); + if (output->obj == NULL) + return 0; + + output->ptr = STRINGLIB_STR(output->obj); + output->end = STRINGLIB_LEN(output->obj) + output->ptr; + output->size_increment = INITIAL_SIZE_INCREMENT; + + return 1; +} + +/* + output_extend reallocates the output string buffer. + It returns a status: 0 for a failed reallocation, + 1 for success. +*/ + +static int +output_extend(OutputString *output, Py_ssize_t count) +{ + STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj); + Py_ssize_t curlen = output->ptr - startptr; + Py_ssize_t maxlen = curlen + count + output->size_increment; + + if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0) + return 0; + startptr = STRINGLIB_STR(output->obj); + output->ptr = startptr + curlen; + output->end = startptr + maxlen; + if (output->size_increment < MAX_SIZE_INCREMENT) + output->size_increment *= SIZE_MULTIPLIER; + return 1; +} + +/* + output_data dumps characters into our output string + buffer. 
+ + In some cases, it has to reallocate the string. + + It returns a status: 0 for a failed reallocation, + 1 for success. +*/ +static int +output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count) +{ + if ((count > output->end - output->ptr) && !output_extend(output, count)) + return 0; + memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR)); + output->ptr += count; + return 1; +} + +/************************************************************************/ +/*********** Format string parsing -- integers and identifiers *********/ +/************************************************************************/ + +static Py_ssize_t +get_integer(const SubString *str) +{ + Py_ssize_t accumulator = 0; + Py_ssize_t digitval; + Py_ssize_t oldaccumulator; + STRINGLIB_CHAR *p; + + /* empty string is an error */ + if (str->ptr >= str->end) + return -1; + + for (p = str->ptr; p < str->end; p++) { + digitval = STRINGLIB_TODECIMAL(*p); + if (digitval < 0) + return -1; + /* + This trick was copied from old Unicode format code. It's cute, + but would really suck on an old machine with a slow divide + implementation. Fortunately, in the normal case we do not + expect too many digits. 
+ */ + oldaccumulator = accumulator; + accumulator *= 10; + if ((accumulator+10)/10 != oldaccumulator+1) { + PyErr_Format(PyExc_ValueError, + "Too many decimal digits in format string"); + return -1; + } + accumulator += digitval; + } + return accumulator; +} + +/************************************************************************/ +/******** Functions to get field objects and specification strings ******/ +/************************************************************************/ + +/* do the equivalent of obj.name */ +static PyObject * +getattr(PyObject *obj, SubString *name) +{ + PyObject *newobj; + PyObject *str = SubString_new_object(name); + if (str == NULL) + return NULL; + newobj = PyObject_GetAttr(obj, str); + Py_DECREF(str); + return newobj; +} + +/* do the equivalent of obj[idx], where obj is a sequence */ +static PyObject * +getitem_sequence(PyObject *obj, Py_ssize_t idx) +{ + return PySequence_GetItem(obj, idx); +} + +/* do the equivalent of obj[idx], where obj is not a sequence */ +static PyObject * +getitem_idx(PyObject *obj, Py_ssize_t idx) +{ + PyObject *newobj; + PyObject *idx_obj = PyLong_FromSsize_t(idx); + if (idx_obj == NULL) + return NULL; + newobj = PyObject_GetItem(obj, idx_obj); + Py_DECREF(idx_obj); + return newobj; +} + +/* do the equivalent of obj[name] */ +static PyObject * +getitem_str(PyObject *obj, SubString *name) +{ + PyObject *newobj; + PyObject *str = SubString_new_object(name); + if (str == NULL) + return NULL; + newobj = PyObject_GetItem(obj, str); + Py_DECREF(str); + return newobj; +} + +typedef struct { + /* the entire string we're parsing. we assume that someone else + is managing its lifetime, and that it will exist for the + lifetime of the iterator. 
can be empty */ + SubString str; + + /* pointer to where we are inside field_name */ + STRINGLIB_CHAR *ptr; +} FieldNameIterator; + + +static int +FieldNameIterator_init(FieldNameIterator *self, STRINGLIB_CHAR *ptr, + Py_ssize_t len) +{ + SubString_init(&self->str, ptr, len); + self->ptr = self->str.ptr; + return 1; +} + +static int +_FieldNameIterator_attr(FieldNameIterator *self, SubString *name) +{ + STRINGLIB_CHAR c; + + name->ptr = self->ptr; + + /* return everything until '.' or '[' */ + while (self->ptr < self->str.end) { + switch (c = *self->ptr++) { + case '[': + case '.': + /* backup so that we this character will be seen next time */ + self->ptr--; + break; + default: + continue; + } + break; + } + /* end of string is okay */ + name->end = self->ptr; + return 1; +} + +static int +_FieldNameIterator_item(FieldNameIterator *self, SubString *name) +{ + int bracket_seen = 0; + STRINGLIB_CHAR c; + + name->ptr = self->ptr; + + /* return everything until ']' */ + while (self->ptr < self->str.end) { + switch (c = *self->ptr++) { + case ']': + bracket_seen = 1; + break; + default: + continue; + } + break; + } + /* make sure we ended with a ']' */ + if (!bracket_seen) { + PyErr_SetString(PyExc_ValueError, "Missing ']' in format string"); + return 0; + } + + /* end of string is okay */ + /* don't include the ']' */ + name->end = self->ptr-1; + return 1; +} + +/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */ +static int +FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, + Py_ssize_t *name_idx, SubString *name) +{ + /* check at end of input */ + if (self->ptr >= self->str.end) + return 1; + + switch (*self->ptr++) { + case '.': + *is_attribute = 1; + if (_FieldNameIterator_attr(self, name) == 0) + return 0; + *name_idx = -1; + break; + case '[': + *is_attribute = 0; + if (_FieldNameIterator_item(self, name) == 0) + return 0; + *name_idx = get_integer(name); + break; + default: + /* interal error, can't get here */ + 
assert(0); + return 0; + } + + /* empty string is an error */ + if (name->ptr == name->end) { + PyErr_SetString(PyExc_ValueError, "Empty attribute in format string"); + return 0; + } + + return 2; +} + + +/* input: field_name + output: 'first' points to the part before the first '[' or '.' + 'first_idx' is -1 if 'first' is not an integer, otherwise + it's the value of first converted to an integer + 'rest' is an iterator to return the rest +*/ +static int +field_name_split(STRINGLIB_CHAR *ptr, Py_ssize_t len, SubString *first, + Py_ssize_t *first_idx, FieldNameIterator *rest) +{ + STRINGLIB_CHAR c; + STRINGLIB_CHAR *p = ptr; + STRINGLIB_CHAR *end = ptr + len; + + /* find the part up until the first '.' or '[' */ + while (p < end) { + switch (c = *p++) { + case '[': + case '.': + /* backup so that we this character is available to the + "rest" iterator */ + p--; + break; + default: + continue; + } + break; + } + + /* set up the return values */ + SubString_init(first, ptr, p - ptr); + FieldNameIterator_init(rest, p, end - p); + + /* see if "first" is an integer, in which case it's used as an index */ + *first_idx = get_integer(first); + + /* zero length string is an error */ + if (first->ptr >= first->end) { + PyErr_SetString(PyExc_ValueError, "empty field name"); + goto error; + } + + return 1; +error: + return 0; +} + + +/* + get_field_object returns the object inside {}, before the + format_spec. It handles getindex and getattr lookups and consumes + the entire input string. 
+*/ +static PyObject * +get_field_object(SubString *input, PyObject *args, PyObject *kwargs) +{ + PyObject *obj = NULL; + int ok; + int is_attribute; + SubString name; + SubString first; + Py_ssize_t index; + FieldNameIterator rest; + + if (!field_name_split(input->ptr, input->end - input->ptr, &first, + &index, &rest)) { + goto error; + } + + if (index == -1) { + /* look up in kwargs */ + PyObject *key = SubString_new_object(&first); + if (key == NULL) + goto error; + if ((kwargs == NULL) || (obj = PyDict_GetItem(kwargs, key)) == NULL) { + PyErr_SetObject(PyExc_KeyError, key); + Py_DECREF(key); + goto error; + } + Py_DECREF(key); + Py_INCREF(obj); + } + else { + /* look up in args */ + obj = PySequence_GetItem(args, index); + if (obj == NULL) + goto error; + } + + /* iterate over the rest of the field_name */ + while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index, + &name)) == 2) { + PyObject *tmp; + + if (is_attribute) + /* getattr lookup "." */ + tmp = getattr(obj, &name); + else + /* getitem lookup "[]" */ + if (index == -1) + tmp = getitem_str(obj, &name); + else + if (PySequence_Check(obj)) + tmp = getitem_sequence(obj, index); + else + /* not a sequence */ + tmp = getitem_idx(obj, index); + if (tmp == NULL) + goto error; + + /* assign to obj */ + Py_DECREF(obj); + obj = tmp; + } + /* end of iterator, this is the non-error case */ + if (ok == 1) + return obj; +error: + Py_XDECREF(obj); + return NULL; +} + +/************************************************************************/ +/***************** Field rendering functions **************************/ +/************************************************************************/ + +/* + render_field() is the main function in this section. It takes the + field object and field specification string generated by + get_field_and_spec, and renders the field into the output string. + + render_field calls fieldobj.__format__(format_spec) method, and + appends to the output. 
+*/ +static int +render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output) +{ + int ok = 0; + PyObject *result = NULL; + + /* we need to create an object out of the pointers we have */ + PyObject *format_spec_object = SubString_new_object_or_empty(format_spec); + if (format_spec_object == NULL) + goto done; + + result = PyObject_Format(fieldobj, format_spec_object); + if (result == NULL) + goto done; + + ok = output_data(output, + STRINGLIB_STR(result), STRINGLIB_LEN(result)); +done: + Py_DECREF(format_spec_object); + Py_XDECREF(result); + return ok; +} + +static int +parse_field(SubString *str, SubString *field_name, SubString *format_spec, + STRINGLIB_CHAR *conversion) +{ + STRINGLIB_CHAR c = 0; + + /* initialize these, as they may be empty */ + *conversion = '\0'; + SubString_init(format_spec, NULL, 0); + + /* search for the field name. it's terminated by the end of the + string, or a ':' or '!' */ + field_name->ptr = str->ptr; + while (str->ptr < str->end) { + switch (c = *(str->ptr++)) { + case ':': + case '!': + break; + default: + continue; + } + break; + } + + if (c == '!' 
|| c == ':') { + /* we have a format specifier and/or a conversion */ + /* don't include the last character */ + field_name->end = str->ptr-1; + + /* the format specifier is the rest of the string */ + format_spec->ptr = str->ptr; + format_spec->end = str->end; + + /* see if there's a conversion specifier */ + if (c == '!') { + /* there must be another character present */ + if (format_spec->ptr >= format_spec->end) { + PyErr_SetString(PyExc_ValueError, + "end of format while looking for conversion " + "specifier"); + return 0; + } + *conversion = *(format_spec->ptr++); + + /* if there is another character, it must be a colon */ + if (format_spec->ptr < format_spec->end) { + c = *(format_spec->ptr++); + if (c != ':') { + PyErr_SetString(PyExc_ValueError, + "expected ':' after format specifier"); + return 0; + } + } + } + + return 1; + + } + else { + /* end of string, there's no format_spec or conversion */ + field_name->end = str->ptr; + return 1; + } +} + +/************************************************************************/ +/******* Output string allocation and escape-to-markup processing ******/ +/************************************************************************/ + +/* MarkupIterator breaks the string into pieces of either literal + text, or things inside {} that need to be marked up. 
it is + designed to make it easy to wrap a Python iterator around it, for + use with the Formatter class */ + +typedef struct { + SubString str; +} MarkupIterator; + +static int +MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len) +{ + SubString_init(&self->str, ptr, len); + return 1; +} + +/* returns 0 on error, 1 on non-error termination, and 2 if it got a + string (or something to be expanded) */ +static int +MarkupIterator_next(MarkupIterator *self, SubString *literal, + SubString *field_name, SubString *format_spec, + STRINGLIB_CHAR *conversion, + int *format_spec_needs_expanding) +{ + int at_end; + STRINGLIB_CHAR c = 0; + STRINGLIB_CHAR *start; + int count; + Py_ssize_t len; + int markup_follows = 0; + + /* initialize all of the output variables */ + SubString_init(literal, NULL, 0); + SubString_init(field_name, NULL, 0); + SubString_init(format_spec, NULL, 0); + *conversion = '\0'; + *format_spec_needs_expanding = 0; + + /* No more input, end of iterator. This is the normal exit + path. */ + if (self->str.ptr >= self->str.end) + return 1; + + start = self->str.ptr; + + /* First read any literal text. Read until the end of string, an + escaped '{' or '}', or an unescaped '{'. In order to never + allocate memory and so I can just pass pointers around, if + there's an escaped '{' or '}' then we'll return the literal + including the brace, but no format object. The next time + through, we'll return the rest of the literal, skipping past + the second consecutive brace. 
*/ + while (self->str.ptr < self->str.end) { + switch (c = *(self->str.ptr++)) { + case '{': + case '}': + markup_follows = 1; + break; + default: + continue; + } + break; + } + + at_end = self->str.ptr >= self->str.end; + len = self->str.ptr - start; + + if ((c == '}') && (at_end || (c != *self->str.ptr))) { + PyErr_SetString(PyExc_ValueError, "Single '}' encountered " + "in format string"); + return 0; + } + if (at_end && c == '{') { + PyErr_SetString(PyExc_ValueError, "Single '{' encountered " + "in format string"); + return 0; + } + if (!at_end) { + if (c == *self->str.ptr) { + /* escaped } or {, skip it in the input. there is no + markup object following us, just this literal text */ + self->str.ptr++; + markup_follows = 0; + } + else + len--; + } + + /* record the literal text */ + literal->ptr = start; + literal->end = start + len; + + if (!markup_follows) + return 2; + + /* this is markup, find the end of the string by counting nested + braces. note that this prohibits escaped braces, so that + format_specs cannot have braces in them. */ + count = 1; + + start = self->str.ptr; + + /* we know we can't have a zero length string, so don't worry + about that case */ + while (self->str.ptr < self->str.end) { + switch (c = *(self->str.ptr++)) { + case '{': + /* the format spec needs to be recursively expanded. + this is an optimization, and not strictly needed */ + *format_spec_needs_expanding = 1; + count++; + break; + case '}': + count--; + if (count <= 0) { + /* we're done. 
parse and get out */ + SubString s; + + SubString_init(&s, start, self->str.ptr - 1 - start); + if (parse_field(&s, field_name, format_spec, conversion) == 0) + return 0; + + /* a zero length field_name is an error */ + if (field_name->ptr == field_name->end) { + PyErr_SetString(PyExc_ValueError, "zero length field name " + "in format"); + return 0; + } + + /* success */ + return 2; + } + break; + } + } + + /* end of string while searching for matching '}' */ + PyErr_SetString(PyExc_ValueError, "unmatched '{' in format"); + return 0; +} + + +/* do the !r or !s conversion on obj */ +static PyObject * +do_conversion(PyObject *obj, STRINGLIB_CHAR conversion) +{ + /* XXX in pre-3.0, do we need to convert this to unicode, since it + might have returned a string? */ + switch (conversion) { + case 'r': + return PyObject_Repr(obj); + case 's': + return STRINGLIB_TOSTR(obj); + default: + PyErr_Format(PyExc_ValueError, + "Unknown converion specifier %c", + conversion); + return NULL; + } +} + +/* given: + + {field_name!conversion:format_spec} + + compute the result and write it to output. + format_spec_needs_expanding is an optimization. if it's false, + just output the string directly, otherwise recursively expand the + format_spec string. 
*/

/* returns 1 on success, 0 on failure with an exception set by
   whichever step failed */
static int
output_markup(SubString *field_name, SubString *format_spec,
              int format_spec_needs_expanding, STRINGLIB_CHAR conversion,
              OutputString *output, PyObject *args, PyObject *kwargs,
              int recursion_depth)
{
    PyObject *value = NULL;          /* the object being formatted */
    PyObject *scratch = NULL;        /* conversion result / expanded spec */
    SubString expanded_spec;
    SubString *spec = format_spec;   /* the spec that is actually used */
    int status = 0;

    /* step 1: resolve field_name to an object */
    value = get_field_object(field_name, args, kwargs);
    if (value == NULL)
        goto cleanup;

    /* step 2: apply the optional !r / !s conversion */
    if (conversion != '\0') {
        scratch = do_conversion(value, conversion);
        if (scratch == NULL)
            goto cleanup;

        /* the converted object replaces the original one */
        Py_DECREF(value);
        value = scratch;
        scratch = NULL;
    }

    /* step 3: if the spec itself contains fields, expand it first */
    if (format_spec_needs_expanding) {
        scratch = build_string(format_spec, args, kwargs, recursion_depth-1);
        if (scratch == NULL)
            goto cleanup;

        /* expanded_spec points into scratch's buffer, so scratch has
           to stay alive until render_field has returned */
        SubString_init(&expanded_spec,
                       STRINGLIB_STR(scratch), STRINGLIB_LEN(scratch));
        spec = &expanded_spec;
    }

    /* step 4: format the object and append it to the output */
    if (render_field(value, spec, output) != 0)
        status = 1;

cleanup:
    Py_XDECREF(value);
    Py_XDECREF(scratch);

    return status;
}

/*
    do_markup is the top-level loop for the format() method.  It
    searches through the format string for escapes to markup codes, and
    calls other functions to move non-markup text to the output,
    and to perform the markup to the output.
+*/ +static int +do_markup(SubString *input, PyObject *args, PyObject *kwargs, + OutputString *output, int recursion_depth) +{ + MarkupIterator iter; + int format_spec_needs_expanding; + int result; + SubString literal; + SubString field_name; + SubString format_spec; + STRINGLIB_CHAR conversion; + + MarkupIterator_init(&iter, input->ptr, input->end - input->ptr); + while ((result = MarkupIterator_next(&iter, &literal, &field_name, + &format_spec, &conversion, + &format_spec_needs_expanding)) == 2) { + if (!output_data(output, literal.ptr, literal.end - literal.ptr)) + return 0; + if (field_name.ptr != field_name.end) + if (!output_markup(&field_name, &format_spec, + format_spec_needs_expanding, conversion, output, + args, kwargs, recursion_depth)) + return 0; + } + return result; +} + + +/* + build_string allocates the output string and then + calls do_markup to do the heavy lifting. +*/ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, + int recursion_depth) +{ + OutputString output; + PyObject *result = NULL; + Py_ssize_t count; + + output.obj = NULL; /* needed so cleanup code always works */ + + /* check the recursion level */ + if (recursion_depth <= 0) { + PyErr_SetString(PyExc_ValueError, + "Max string recursion exceeded"); + goto done; + } + + /* initial size is the length of the format string, plus the size + increment. 
seems like a reasonable default */ + if (!output_initialize(&output, + input->end - input->ptr + + INITIAL_SIZE_INCREMENT)) + goto done; + + if (!do_markup(input, args, kwargs, &output, recursion_depth)) { + goto done; + } + + count = output.ptr - STRINGLIB_STR(output.obj); + if (STRINGLIB_RESIZE(&output.obj, count) < 0) { + goto done; + } + + /* transfer ownership to result */ + result = output.obj; + output.obj = NULL; + +done: + Py_XDECREF(output.obj); + return result; +} + +/************************************************************************/ +/*********** main routine ***********************************************/ +/************************************************************************/ + +/* this is the main entry point */ +static PyObject * +do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) +{ + SubString input; + + /* PEP 3101 says only 2 levels, so that + "{0:{1}}".format('abc', 's') # works + "{0:{1:{2}}}".format('abc', 's', '') # fails + */ + int recursion_depth = 2; + + SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self)); + return build_string(&input, args, kwargs, recursion_depth); +} + + + +/************************************************************************/ +/*********** formatteriterator ******************************************/ +/************************************************************************/ + +/* This is used to implement string.Formatter.vparse(). It exists so + Formatter can share code with the built in unicode.format() method. + It's really just a wrapper around MarkupIterator that is callable + from Python. */ + +typedef struct { + PyObject_HEAD + + STRINGLIB_OBJECT *str; + + MarkupIterator it_markup; +} formatteriterobject; + +static void +formatteriter_dealloc(formatteriterobject *it) +{ + Py_XDECREF(it->str); + PyObject_FREE(it); +} + +/* returns a tuple: + (literal, field_name, format_spec, conversion) + + literal is any literal text to output. 
might be zero length + field_name is the string before the ':'. might be None + format_spec is the string after the ':'. mibht be None + conversion is either None, or the string after the '!' +*/ +static PyObject * +formatteriter_next(formatteriterobject *it) +{ + SubString literal; + SubString field_name; + SubString format_spec; + STRINGLIB_CHAR conversion; + int format_spec_needs_expanding; + int result = MarkupIterator_next(&it->it_markup, &literal, &field_name, + &format_spec, &conversion, + &format_spec_needs_expanding); + + /* all of the SubString objects point into it->str, so no + memory management needs to be done on them */ + assert(0 <= result && result <= 2); + if (result == 0 || result == 1) + /* if 0, error has already been set, if 1, iterator is empty */ + return NULL; + else { + PyObject *literal_str = NULL; + PyObject *field_name_str = NULL; + PyObject *format_spec_str = NULL; + PyObject *conversion_str = NULL; + PyObject *tuple = NULL; + int has_field = field_name.ptr != field_name.end; + + literal_str = SubString_new_object(&literal); + if (literal_str == NULL) + goto done; + + field_name_str = SubString_new_object(&field_name); + if (field_name_str == NULL) + goto done; + + /* if field_name is non-zero length, return a string for + format_spec (even if zero length), else return None */ + format_spec_str = (has_field ? 
+ SubString_new_object_or_empty : + SubString_new_object)(&format_spec); + if (format_spec_str == NULL) + goto done; + + /* if the conversion is not specified, return a None, + otherwise create a one length string with the conversion + character */ + if (conversion == '\0') { + conversion_str = Py_None; + Py_INCREF(conversion_str); + } + else + conversion_str = STRINGLIB_NEW(&conversion, 1); + if (conversion_str == NULL) + goto done; + + tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str, + conversion_str); + done: + Py_XDECREF(literal_str); + Py_XDECREF(field_name_str); + Py_XDECREF(format_spec_str); + Py_XDECREF(conversion_str); + return tuple; + } +} + +static PyMethodDef formatteriter_methods[] = { + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject PyFormatterIter_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + "formatteriterator", /* tp_name */ + sizeof(formatteriterobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)formatteriter_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)formatteriter_next, /* tp_iternext */ + formatteriter_methods, /* tp_methods */ + 0, +}; + +/* unicode_formatter_parser is used to implement + string.Formatter.vformat. it parses a string and returns tuples + describing the parsed elements. 
It's a wrapper around + stringlib/string_format.h's MarkupIterator */ +static PyObject * +formatter_parser(STRINGLIB_OBJECT *self) +{ + formatteriterobject *it; + + it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); + if (it == NULL) + return NULL; + + /* take ownership, give the object to the iterator */ + Py_INCREF(self); + it->str = self; + + /* initialize the contained MarkupIterator */ + MarkupIterator_init(&it->it_markup, + STRINGLIB_STR(self), + STRINGLIB_LEN(self)); + + return (PyObject *)it; +} + + +/************************************************************************/ +/*********** fieldnameiterator ******************************************/ +/************************************************************************/ + + +/* This is used to implement string.Formatter.vparse(). It parses the + field name into attribute and item values. It's a Python-callable + wrapper around FieldNameIterator */ + +typedef struct { + PyObject_HEAD + + STRINGLIB_OBJECT *str; + + FieldNameIterator it_field; +} fieldnameiterobject; + +static void +fieldnameiter_dealloc(fieldnameiterobject *it) +{ + Py_XDECREF(it->str); + PyObject_FREE(it); +} + +/* returns a tuple: + (is_attr, value) + is_attr is true if we used attribute syntax (e.g., '.foo') + false if we used index syntax (e.g., '[foo]') + value is an integer or string +*/ +static PyObject * +fieldnameiter_next(fieldnameiterobject *it) +{ + int result; + int is_attr; + Py_ssize_t idx; + SubString name; + + result = FieldNameIterator_next(&it->it_field, &is_attr, + &idx, &name); + if (result == 0 || result == 1) + /* if 0, error has already been set, if 1, iterator is empty */ + return NULL; + else { + PyObject* result = NULL; + PyObject* is_attr_obj = NULL; + PyObject* obj = NULL; + + is_attr_obj = PyBool_FromLong(is_attr); + if (is_attr_obj == NULL) + goto done; + + /* either an integer or a string */ + if (idx != -1) + obj = PyLong_FromSsize_t(idx); + else + obj = SubString_new_object(&name); + if (obj == 
NULL) + goto done; + + /* return a tuple of values */ + result = PyTuple_Pack(2, is_attr_obj, obj); + + done: + Py_XDECREF(is_attr_obj); + Py_XDECREF(obj); + return result; + } +} + +static PyMethodDef fieldnameiter_methods[] = { + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject PyFieldNameIter_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + "fieldnameiterator", /* tp_name */ + sizeof(fieldnameiterobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)fieldnameiter_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)fieldnameiter_next, /* tp_iternext */ + fieldnameiter_methods, /* tp_methods */ + 0}; + +/* unicode_formatter_field_name_split is used to implement + string.Formatter.vformat. it takes an PEP 3101 "field name", and + returns a tuple of (first, rest): "first", the part before the + first '.' or '['; and "rest", an iterator for the rest of the field + name. it's a wrapper around stringlib/string_format.h's + field_name_split. The iterator it returns is a + FieldNameIterator */ +static PyObject * +formatter_field_name_split(STRINGLIB_OBJECT *self) +{ + SubString first; + Py_ssize_t first_idx; + fieldnameiterobject *it; + + PyObject *first_obj = NULL; + PyObject *result = NULL; + + it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); + if (it == NULL) + return NULL; + + /* take ownership, give the object to the iterator. 
this is + just to keep the field_name alive */ + Py_INCREF(self); + it->str = self; + + if (!field_name_split(STRINGLIB_STR(self), + STRINGLIB_LEN(self), + &first, &first_idx, &it->it_field)) + goto done; + + /* first becomes an integer, if possible; else a string */ + if (first_idx != -1) + first_obj = PyLong_FromSsize_t(first_idx); + else + /* convert "first" into a string object */ + first_obj = SubString_new_object(&first); + if (first_obj == NULL) + goto done; + + /* return a tuple of values */ + result = PyTuple_Pack(2, first_obj, it); + +done: + Py_XDECREF(it); + Py_XDECREF(first_obj); + return result; +} diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h new file mode 100644 index 0000000..1e0df0f --- /dev/null +++ b/Objects/stringlib/stringdefs.h @@ -0,0 +1,27 @@ +#ifndef STRINGLIB_STRINGDEFS_H +#define STRINGLIB_STRINGDEFS_H + +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 0 + +#define STRINGLIB_OBJECT PyStringObject +#define STRINGLIB_CHAR char +#define STRINGLIB_TYPE_NAME "string" +#define STRINGLIB_PARSE_CODE "S" +#define STRINGLIB_EMPTY nullstring +#define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9')) +#define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? 
(x - '0') : -1) +#define STRINGLIB_TOUPPER toupper +#define STRINGLIB_TOLOWER tolower +#define STRINGLIB_FILL memset +#define STRINGLIB_STR PyString_AS_STRING +#define STRINGLIB_LEN PyString_GET_SIZE +#define STRINGLIB_NEW PyString_FromStringAndSize +#define STRINGLIB_RESIZE _PyString_Resize +#define STRINGLIB_CHECK PyString_Check +#define STRINGLIB_CMP memcmp +#define STRINGLIB_TOSTR PyObject_Str + +#endif /* !STRINGLIB_STRINGDEFS_H */ diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h new file mode 100644 index 0000000..f402a98 --- /dev/null +++ b/Objects/stringlib/unicodedefs.h @@ -0,0 +1,52 @@ +#ifndef STRINGLIB_UNICODEDEFS_H +#define STRINGLIB_UNICODEDEFS_H + +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 1 + +#define STRINGLIB_OBJECT PyUnicodeObject +#define STRINGLIB_CHAR Py_UNICODE +#define STRINGLIB_TYPE_NAME "unicode" +#define STRINGLIB_PARSE_CODE "U" +#define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER +#define STRINGLIB_TOLOWER Py_UNICODE_TOLOWER +#define STRINGLIB_FILL Py_UNICODE_FILL +#define STRINGLIB_STR PyUnicode_AS_UNICODE +#define STRINGLIB_LEN PyUnicode_GET_SIZE +#define STRINGLIB_NEW PyUnicode_FromUnicode +#define STRINGLIB_RESIZE PyUnicode_Resize +#define STRINGLIB_CHECK PyUnicode_Check + +#if PY_VERSION_HEX < 0x03000000 +#define STRINGLIB_TOSTR PyObject_Unicode +#else +#define STRINGLIB_TOSTR PyObject_Str +#endif + +#define STRINGLIB_WANT_CONTAINS_OBJ 1 + +/* STRINGLIB_CMP was defined as: + +Py_LOCAL_INLINE(int) +STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) +{ + if (str[0] != other[0]) + return 1; + return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE)); +} + +but unfortunately that gives a error if 
the function isn't used in a file that +includes this file. So, reluctantly convert it to a macro instead. */ + +#define STRINGLIB_CMP(str, other, len) \ + (((str)[0] != (other)[0]) ? \ + 1 : \ + memcmp((void*) (str), (void*) (other), (len) * sizeof(Py_UNICODE))) + + +#endif /* !STRINGLIB_UNICODEDEFS_H */ |