diff options
| author | Eric Smith <eric@trueblade.com> | 2008-02-17 19:46:49 (GMT) | 
|---|---|---|
| committer | Eric Smith <eric@trueblade.com> | 2008-02-17 19:46:49 (GMT) | 
| commit | a9f7d6248032c9572b4d2024a1be8bd2823af09f (patch) | |
| tree | 5465a1051312055678248db0076d314924ee4ebc /Objects/stringlib | |
| parent | e139688d34cc12b23d3a310f10d4f440f75f7d08 (diff) | |
| download | cpython-a9f7d6248032c9572b4d2024a1be8bd2823af09f.zip cpython-a9f7d6248032c9572b4d2024a1be8bd2823af09f.tar.gz cpython-a9f7d6248032c9572b4d2024a1be8bd2823af09f.tar.bz2  | |
Backport of PEP 3101, Advanced String Formatting, from py3k.
Highlights:
 - Adding PyObject_Format.
 - Adding string.Format class.
 - Adding __format__ for str, unicode, int, long, float, datetime.
 - Adding builtin format.
 - Adding ''.format and u''.format.
 - str/unicode fixups for formatters.
The files in Objects/stringlib that implement PEP 3101 (stringdefs.h,
unicodedefs.h, formatter.h, string_format.h) are identical in trunk
and py3k.  Any changes from here on should be made to trunk, and
changes will propagate to py3k.
Diffstat (limited to 'Objects/stringlib')
| -rw-r--r-- | Objects/stringlib/formatter.h | 980 | ||||
| -rw-r--r-- | Objects/stringlib/string_format.h | 1214 | ||||
| -rw-r--r-- | Objects/stringlib/stringdefs.h | 27 | ||||
| -rw-r--r-- | Objects/stringlib/unicodedefs.h | 52 | 
4 files changed, 2273 insertions, 0 deletions
diff --git a/Objects/stringlib/formatter.h b/Objects/stringlib/formatter.h new file mode 100644 index 0000000..39da6b3 --- /dev/null +++ b/Objects/stringlib/formatter.h @@ -0,0 +1,980 @@ +/* implements the string, long, and float formatters.  that is, +   string.__format__, etc. */ + +/* Before including this, you must include either: +   stringlib/unicodedefs.h +   stringlib/stringdefs.h + +   Also, you should define the names: +   FORMAT_STRING +   FORMAT_LONG +   FORMAT_FLOAT +   to be whatever you want the public names of these functions to +   be.  These are the only non-static functions defined here. +*/ + +#define ALLOW_PARENS_FOR_SIGN 0 + +/* +    get_integer consumes 0 or more decimal digit characters from an +    input string, updates *result with the corresponding positive +    integer, and returns the number of digits consumed. + +    returns -1 on error. +*/ +static int +get_integer(STRINGLIB_CHAR **ptr, STRINGLIB_CHAR *end, +                  Py_ssize_t *result) +{ +    Py_ssize_t accumulator, digitval, oldaccumulator; +    int numdigits; +    accumulator = numdigits = 0; +    for (;;(*ptr)++, numdigits++) { +        if (*ptr >= end) +            break; +        digitval = STRINGLIB_TODECIMAL(**ptr); +        if (digitval < 0) +            break; +        /* +           This trick was copied from old Unicode format code.  It's cute, +           but would really suck on an old machine with a slow divide +           implementation.  Fortunately, in the normal case we do not +           expect too many digits. 
+        */ +        oldaccumulator = accumulator; +        accumulator *= 10; +        if ((accumulator+10)/10 != oldaccumulator+1) { +            PyErr_Format(PyExc_ValueError, +                         "Too many decimal digits in format string"); +            return -1; +        } +        accumulator += digitval; +    } +    *result = accumulator; +    return numdigits; +} + +/************************************************************************/ +/*********** standard format specifier parsing **************************/ +/************************************************************************/ + +/* returns true if this character is a specifier alignment token */ +Py_LOCAL_INLINE(int) +is_alignment_token(STRINGLIB_CHAR c) +{ +    switch (c) { +    case '<': case '>': case '=': case '^': +        return 1; +    default: +        return 0; +    } +} + +/* returns true if this character is a sign element */ +Py_LOCAL_INLINE(int) +is_sign_element(STRINGLIB_CHAR c) +{ +    switch (c) { +    case ' ': case '+': case '-': +#if ALLOW_PARENS_FOR_SIGN +    case '(': +#endif +        return 1; +    default: +        return 0; +    } +} + + +typedef struct { +    STRINGLIB_CHAR fill_char; +    STRINGLIB_CHAR align; +    STRINGLIB_CHAR sign; +    Py_ssize_t width; +    Py_ssize_t precision; +    STRINGLIB_CHAR type; +} InternalFormatSpec; + +/* +  ptr points to the start of the format_spec, end points just past its end. +  fills in format with the parsed information. +  returns 1 on success, 0 on failure. 
+  if failure, sets the exception +*/ +static int +parse_internal_render_format_spec(PyObject *format_spec, +                                  InternalFormatSpec *format, +                                  char default_type) +{ +    STRINGLIB_CHAR *ptr = STRINGLIB_STR(format_spec); +    STRINGLIB_CHAR *end = ptr + STRINGLIB_LEN(format_spec); + +    /* end-ptr is used throughout this code to specify the length of +       the input string */ + +    Py_ssize_t specified_width; + +    format->fill_char = '\0'; +    format->align = '\0'; +    format->sign = '\0'; +    format->width = -1; +    format->precision = -1; +    format->type = default_type; + +    /* If the second char is an alignment token, +       then parse the fill char */ +    if (end-ptr >= 2 && is_alignment_token(ptr[1])) { +        format->align = ptr[1]; +        format->fill_char = ptr[0]; +        ptr += 2; +    } +    else if (end-ptr >= 1 && is_alignment_token(ptr[0])) { +        format->align = ptr[0]; +        ptr++; +    } + +    /* Parse the various sign options */ +    if (end-ptr >= 1 && is_sign_element(ptr[0])) { +        format->sign = ptr[0]; +        ptr++; +#if ALLOW_PARENS_FOR_SIGN +        if (end-ptr >= 1 && ptr[0] == ')') { +            ptr++; +        } +#endif +    } + +    /* The special case for 0-padding (backwards compat) */ +    if (format->fill_char == '\0' && end-ptr >= 1 && ptr[0] == '0') { +        format->fill_char = '0'; +        if (format->align == '\0') { +            format->align = '='; +        } +        ptr++; +    } + +    /* XXX add error checking */ +    specified_width = get_integer(&ptr, end, &format->width); + +    /* if specified_width is 0, we didn't consume any characters for +       the width. 
in that case, reset the width to -1, because +       get_integer() will have set it to zero */ +    if (specified_width == 0) { +        format->width = -1; +    } + +    /* Parse field precision */ +    if (end-ptr && ptr[0] == '.') { +        ptr++; + +        /* XXX add error checking */ +        specified_width = get_integer(&ptr, end, &format->precision); + +        /* not having a precision after a dot is an error */ +        if (specified_width == 0) { +            PyErr_Format(PyExc_ValueError, +                         "Format specifier missing precision"); +            return 0; +        } + +    } + +    /* Finally, parse the type field */ + +    if (end-ptr > 1) { +        /* invalid conversion spec */ +        PyErr_Format(PyExc_ValueError, "Invalid conversion specification"); +        return 0; +    } + +    if (end-ptr == 1) { +        format->type = ptr[0]; +        ptr++; +    } + +    return 1; +} + +#if defined FORMAT_FLOAT || defined FORMAT_LONG +/************************************************************************/ +/*********** common routines for numeric formatting *********************/ +/************************************************************************/ + +/* describes the layout for an integer, see the comment in +   _calc_integer_widths() for details */ +typedef struct { +    Py_ssize_t n_lpadding; +    Py_ssize_t n_spadding; +    Py_ssize_t n_rpadding; +    char lsign; +    Py_ssize_t n_lsign; +    char rsign; +    Py_ssize_t n_rsign; +    Py_ssize_t n_total; /* just a convenience, it's derivable from the +                           other fields */ +} NumberFieldWidths; + +/* not all fields of format are used.  for example, precision is +   unused.  should this take discrete params in order to be more clear +   about what it does?  or is passing a single format parameter easier +   and more efficient enough to justify a little obfuscation? 
*/ +static void +calc_number_widths(NumberFieldWidths *r, STRINGLIB_CHAR actual_sign, +                   Py_ssize_t n_digits, const InternalFormatSpec *format) +{ +    r->n_lpadding = 0; +    r->n_spadding = 0; +    r->n_rpadding = 0; +    r->lsign = '\0'; +    r->n_lsign = 0; +    r->rsign = '\0'; +    r->n_rsign = 0; + +    /* the output will look like: +       |                                                           | +       | <lpadding> <lsign> <spadding> <digits> <rsign> <rpadding> | +       |                                                           | + +       lsign and rsign are computed from format->sign and the actual +       sign of the number + +       digits is already known + +       the total width is either given, or computed from the +       actual digits + +       only one of lpadding, spadding, and rpadding can be non-zero, +       and it's calculated from the width and other fields +    */ + +    /* compute the various parts we're going to write */ +    if (format->sign == '+') { +        /* always put a + or - */ +        r->n_lsign = 1; +        r->lsign = (actual_sign == '-' ? '-' : '+'); +    } +#if ALLOW_PARENS_FOR_SIGN +    else if (format->sign == '(') { +        if (actual_sign == '-') { +            r->n_lsign = 1; +            r->lsign = '('; +            r->n_rsign = 1; +            r->rsign = ')'; +        } +    } +#endif +    else if (format->sign == ' ') { +        r->n_lsign = 1; +        r->lsign = (actual_sign == '-' ? 
'-' : ' '); +    } +    else { +        /* non specified, or the default (-) */ +        if (actual_sign == '-') { +            r->n_lsign = 1; +            r->lsign = '-'; +        } +    } + +    /* now the number of padding characters */ +    if (format->width == -1) { +        /* no padding at all, nothing to do */ +    } +    else { +        /* see if any padding is needed */ +        if (r->n_lsign + n_digits + r->n_rsign >= format->width) { +            /* no padding needed, we're already bigger than the +               requested width */ +        } +        else { +            /* determine which of left, space, or right padding is +               needed */ +            Py_ssize_t padding = format->width - +		                    (r->n_lsign + n_digits + r->n_rsign); +            if (format->align == '<') +                r->n_rpadding = padding; +            else if (format->align == '>') +                r->n_lpadding = padding; +            else if (format->align == '^') { +                r->n_lpadding = padding / 2; +                r->n_rpadding = padding - r->n_lpadding; +            } +            else if (format->align == '=') +                r->n_spadding = padding; +            else +                r->n_lpadding = padding; +        } +    } +    r->n_total = r->n_lpadding + r->n_lsign + r->n_spadding + +        n_digits + r->n_rsign + r->n_rpadding; +} + +/* fill in the non-digit parts of a numbers's string representation, +   as determined in _calc_integer_widths().  returns the pointer to +   where the digits go. 
*/ +static STRINGLIB_CHAR * +fill_number(STRINGLIB_CHAR *p_buf, const NumberFieldWidths *spec, +            Py_ssize_t n_digits, STRINGLIB_CHAR fill_char) +{ +    STRINGLIB_CHAR* p_digits; + +    if (spec->n_lpadding) { +        STRINGLIB_FILL(p_buf, fill_char, spec->n_lpadding); +        p_buf += spec->n_lpadding; +    } +    if (spec->n_lsign == 1) { +        *p_buf++ = spec->lsign; +    } +    if (spec->n_spadding) { +        STRINGLIB_FILL(p_buf, fill_char, spec->n_spadding); +        p_buf += spec->n_spadding; +    } +    p_digits = p_buf; +    p_buf += n_digits; +    if (spec->n_rsign == 1) { +        *p_buf++ = spec->rsign; +    } +    if (spec->n_rpadding) { +        STRINGLIB_FILL(p_buf, fill_char, spec->n_rpadding); +        p_buf += spec->n_rpadding; +    } +    return p_digits; +} +#endif /* FORMAT_FLOAT || FORMAT_LONG */ + +/************************************************************************/ +/*********** string formatting ******************************************/ +/************************************************************************/ + +static PyObject * +format_string_internal(PyObject *value, const InternalFormatSpec *format) +{ +    Py_ssize_t width; /* total field width */ +    Py_ssize_t lpad; +    STRINGLIB_CHAR *dst; +    STRINGLIB_CHAR *src = STRINGLIB_STR(value); +    Py_ssize_t len = STRINGLIB_LEN(value); +    PyObject *result = NULL; + +    /* sign is not allowed on strings */ +    if (format->sign != '\0') { +        PyErr_SetString(PyExc_ValueError, +                        "Sign not allowed in string format specifier"); +        goto done; +    } + +    /* '=' alignment not allowed on strings */ +    if (format->align == '=') { +        PyErr_SetString(PyExc_ValueError, +                        "'=' alignment not allowed " +                        "in string format specifier"); +        goto done; +    } + +    /* if precision is specified, output no more that format.precision +       characters */ +    if (format->precision 
>= 0 && len >= format->precision) { +        len = format->precision; +    } + +    if (format->width >= 0) { +        width = format->width; + +        /* but use at least len characters */ +        if (len > width) { +            width = len; +        } +    } +    else { +        /* not specified, use all of the chars and no more */ +        width = len; +    } + +    /* allocate the resulting string */ +    result = STRINGLIB_NEW(NULL, width); +    if (result == NULL) +        goto done; + +    /* now write into that space */ +    dst = STRINGLIB_STR(result); + +    /* figure out how much leading space we need, based on the +       aligning */ +    if (format->align == '>') +        lpad = width - len; +    else if (format->align == '^') +        lpad = (width - len) / 2; +    else +        lpad = 0; + +    /* if right aligning, increment the destination allow space on the +       left */ +    memcpy(dst + lpad, src, len * sizeof(STRINGLIB_CHAR)); + +    /* do any padding */ +    if (width > len) { +        STRINGLIB_CHAR fill_char = format->fill_char; +        if (fill_char == '\0') { +            /* use the default, if not specified */ +            fill_char = ' '; +        } + +        /* pad on left */ +        if (lpad) +            STRINGLIB_FILL(dst, fill_char, lpad); + +        /* pad on right */ +        if (width - len - lpad) +            STRINGLIB_FILL(dst + len + lpad, fill_char, width - len - lpad); +    } + +done: +    return result; +} + + +/************************************************************************/ +/*********** long formatting ********************************************/ +/************************************************************************/ + +#if defined FORMAT_LONG || defined FORMAT_INT +typedef PyObject* +(*IntOrLongToString)(PyObject *value, int base); + +static PyObject * +format_int_or_long_internal(PyObject *value, const InternalFormatSpec *format, +			    IntOrLongToString tostring) +{ +    PyObject *result = NULL; 
+    PyObject *tmp = NULL; +    STRINGLIB_CHAR *pnumeric_chars; +    STRINGLIB_CHAR numeric_char; +    STRINGLIB_CHAR sign = '\0'; +    STRINGLIB_CHAR *p; +    Py_ssize_t n_digits;       /* count of digits need from the computed +                                  string */ +    Py_ssize_t n_leading_chars; +    NumberFieldWidths spec; +    long x; + +    /* no precision allowed on integers */ +    if (format->precision != -1) { +        PyErr_SetString(PyExc_ValueError, +                        "Precision not allowed in integer format specifier"); +        goto done; +    } + + +    /* special case for character formatting */ +    if (format->type == 'c') { +        /* error to specify a sign */ +        if (format->sign != '\0') { +            PyErr_SetString(PyExc_ValueError, +                            "Sign not allowed with integer" +                            " format specifier 'c'"); +            goto done; +        } + +        /* taken from unicodeobject.c formatchar() */ +        /* Integer input truncated to a character */ +/* XXX: won't work for int */ +        x = PyLong_AsLong(value); +        if (x == -1 && PyErr_Occurred()) +            goto done; +#ifdef Py_UNICODE_WIDE +        if (x < 0 || x > 0x10ffff) { +            PyErr_SetString(PyExc_OverflowError, +                            "%c arg not in range(0x110000) " +                            "(wide Python build)"); +            goto done; +        } +#else +        if (x < 0 || x > 0xffff) { +            PyErr_SetString(PyExc_OverflowError, +                            "%c arg not in range(0x10000) " +                            "(narrow Python build)"); +            goto done; +        } +#endif +	numeric_char = (STRINGLIB_CHAR)x; +	pnumeric_chars = &numeric_char; +        n_digits = 1; +    } +    else { +        int base; +	int leading_chars_to_skip;  /* Number of characters added by +				       PyNumber_ToBase that we want to +				       skip over. 
*/ + +        /* Compute the base and how many characters will be added by +           PyNumber_ToBase */ +        switch (format->type) { +        case 'b': +            base = 2; +            leading_chars_to_skip = 2; /* 0b */ +            break; +        case 'o': +            base = 8; +            leading_chars_to_skip = 2; /* 0o */ +            break; +        case 'x': +        case 'X': +            base = 16; +            leading_chars_to_skip = 2; /* 0x */ +            break; +        default:  /* shouldn't be needed, but stops a compiler warning */ +        case 'd': +            base = 10; +            leading_chars_to_skip = 0; +            break; +        } + +        /* Do the hard part, converting to a string in a given base */ +	tmp = tostring(value, base); +        if (tmp == NULL) +            goto done; + +	pnumeric_chars = STRINGLIB_STR(tmp); +        n_digits = STRINGLIB_LEN(tmp); + +	/* Remember not to modify what pnumeric_chars points to.  it +	   might be interned.  Only modify it after we copy it into a +	   newly allocated output buffer. */ + +        /* Is a sign character present in the output?  If so, remember it +           and skip it */ +        sign = pnumeric_chars[0]; +        if (sign == '-') { +	    ++leading_chars_to_skip; +        } + +	/* Skip over the leading chars (0x, 0b, etc.) 
*/ +	n_digits -= leading_chars_to_skip; +	pnumeric_chars += leading_chars_to_skip; +    } + +    /* Calculate the widths of the various leading and trailing parts */ +    calc_number_widths(&spec, sign, n_digits, format); + +    /* Allocate a new string to hold the result */ +    result = STRINGLIB_NEW(NULL, spec.n_total); +    if (!result) +	goto done; +    p = STRINGLIB_STR(result); + +    /* Fill in the digit parts */ +    n_leading_chars = spec.n_lpadding + spec.n_lsign + spec.n_spadding; +    memmove(p + n_leading_chars, +	    pnumeric_chars, +	    n_digits * sizeof(STRINGLIB_CHAR)); + +    /* if X, convert to uppercase */ +    if (format->type == 'X') { +	Py_ssize_t t; +	for (t = 0; t < n_digits; t++) +	    p[t + n_leading_chars] = STRINGLIB_TOUPPER(p[t + n_leading_chars]); +    } + +    /* Fill in the non-digit parts */ +    fill_number(p, &spec, n_digits, +                format->fill_char == '\0' ? ' ' : format->fill_char); + +done: +    Py_XDECREF(tmp); +    return result; +} +#endif /* defined FORMAT_LONG || defined FORMAT_INT */ + +/************************************************************************/ +/*********** float formatting *******************************************/ +/************************************************************************/ + +#ifdef FORMAT_FLOAT +#if STRINGLIB_IS_UNICODE +/* taken from unicodeobject.c */ +static Py_ssize_t +strtounicode(Py_UNICODE *buffer, const char *charbuffer) +{ +    register Py_ssize_t i; +    Py_ssize_t len = strlen(charbuffer); +    for (i = len - 1; i >= 0; i--) +        buffer[i] = (Py_UNICODE) charbuffer[i]; + +    return len; +} +#endif + +/* the callback function to call to do the actual float formatting. 
+   it matches the definition of PyOS_ascii_formatd */ +typedef char* +(*DoubleSnprintfFunction)(char *buffer, size_t buf_len, +                          const char *format, double d); + +/* just a wrapper to make PyOS_snprintf look like DoubleSnprintfFunction */ +static char* +snprintf_double(char *buffer, size_t buf_len, const char *format, double d) +{ +    PyOS_snprintf(buffer, buf_len, format, d); +    return NULL; +} + +/* see FORMATBUFLEN in unicodeobject.c */ +#define FLOAT_FORMATBUFLEN 120 + +/* much of this is taken from unicodeobject.c */ +/* use type instead of format->type, so that it can be overridden by +   format_number() */ +static PyObject * +_format_float(STRINGLIB_CHAR type, PyObject *value, +              const InternalFormatSpec *format, +              DoubleSnprintfFunction snprintf) +{ +    /* fmt = '%.' + `prec` + `type` + '%%' +       worst case length = 2 + 10 (len of INT_MAX) + 1 + 2 = 15 (use 20)*/ +    char fmt[20]; + +    /* taken from unicodeobject.c */ +    /* Worst case length calc to ensure no buffer overrun: + +       'g' formats: +         fmt = %#.<prec>g +         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp +            for any double rep.) +         len = 1 + prec + 1 + 2 + 5 = 9 + prec + +       'f' formats: +         buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) +         len = 1 + 50 + 1 + prec = 52 + prec + +       If prec=0 the effective precision is 1 (the leading digit is +       always given), therefore increase the length by one. + +    */ +    char charbuf[FLOAT_FORMATBUFLEN]; +    Py_ssize_t n_digits; +    double x; +    Py_ssize_t precision = format->precision; +    PyObject *result = NULL; +    STRINGLIB_CHAR sign; +    char* trailing = ""; +    STRINGLIB_CHAR *p; +    NumberFieldWidths spec; + +#if STRINGLIB_IS_UNICODE +    Py_UNICODE unicodebuf[FLOAT_FORMATBUFLEN]; +#endif + +    /* first, do the conversion as 8-bit chars, using the platform's +       snprintf.  
then, if needed, convert to unicode. */ + +    /* 'F' is the same as 'f', per the PEP */ +    if (type == 'F') +        type = 'f'; + +    x = PyFloat_AsDouble(value); + +    if (x == -1.0 && PyErr_Occurred()) +        goto done; + +    if (type == '%') { +        type = 'f'; +        x *= 100; +        trailing = "%"; +    } + +    if (precision < 0) +        precision = 6; +    if (type == 'f' && (fabs(x) / 1e25) >= 1e25) +        type = 'g'; + +    /* cast "type", because if we're in unicode we need to pass a +       8-bit char.  this is safe, because we've restricted what "type" +       can be */ +    PyOS_snprintf(fmt, sizeof(fmt), "%%.%" PY_FORMAT_SIZE_T "d%c", precision, +		  (char)type); + +    /* call the passed in function to do the actual formatting */ +    snprintf(charbuf, sizeof(charbuf), fmt, x); + +    /* adding trailing to fmt with PyOS_snprintf doesn't work, not +       sure why.  we'll just concatentate it here, no harm done.  we +       know we can't have a buffer overflow from the fmt size +       analysis */ +    strcat(charbuf, trailing); + +    /* rather than duplicate the code for snprintf for both unicode +       and 8 bit strings, we just use the 8 bit version and then +       convert to unicode in a separate code path.  that's probably +       the lesser of 2 evils. */ +#if STRINGLIB_IS_UNICODE +    n_digits = strtounicode(unicodebuf, charbuf); +    p = unicodebuf; +#else +    /* compute the length.  I believe this is done because the return +       value from snprintf above is unreliable */ +    n_digits = strlen(charbuf); +    p = charbuf; +#endif + +    /* is a sign character present in the output?  
if so, remember it +       and skip it */ +    sign = p[0]; +    if (sign == '-') { +        p++; +        n_digits--; +    } + +    calc_number_widths(&spec, sign, n_digits, format); + +    /* allocate a string with enough space */ +    result = STRINGLIB_NEW(NULL, spec.n_total); +    if (result == NULL) +        goto done; + +    /* fill in the non-digit parts */ +    fill_number(STRINGLIB_STR(result), &spec, n_digits, +                format->fill_char == '\0' ? ' ' : format->fill_char); + +    /* fill in the digit parts */ +    memmove(STRINGLIB_STR(result) + +	       (spec.n_lpadding + spec.n_lsign + spec.n_spadding), +            p, +            n_digits * sizeof(STRINGLIB_CHAR)); + +done: +    return result; +} + +static PyObject * +format_float_internal(PyObject *value, const InternalFormatSpec *format) +{ +    if (format->type == 'n') +        return _format_float('f', value, format, snprintf_double); +    else +        return _format_float(format->type, value, format, PyOS_ascii_formatd); +} +#endif /* FORMAT_FLOAT */ + +/************************************************************************/ +/*********** built in formatters ****************************************/ +/************************************************************************/ +#ifdef FORMAT_STRING +PyObject * +FORMAT_STRING(PyObject* value, PyObject* args) +{ +    PyObject *format_spec; +    PyObject *result = NULL; +#if PY_VERSION_HEX < 0x03000000 +    PyObject *tmp = NULL; +#endif +    InternalFormatSpec format; + +    /* If 2.x, we accept either str or unicode, and try to convert it +       to the right type.  
In 3.x, we insist on only unicode */ +#if PY_VERSION_HEX >= 0x03000000 +    if (!PyArg_ParseTuple(args, STRINGLIB_PARSE_CODE ":__format__", +			  &format_spec)) +        goto done; +#else +    /* If 2.x, convert format_spec to the same type as value */ +    /* This is to allow things like u''.format('') */ +    if (!PyArg_ParseTuple(args, "O:__format__", &format_spec)) +        goto done; +    if (!(PyString_Check(format_spec) || PyUnicode_Check(format_spec))) { +        PyErr_Format(PyExc_TypeError, "__format__ arg must be str " +		     "or unicode, not %s", Py_TYPE(format_spec)->tp_name); +	goto done; +    } +    tmp = STRINGLIB_TOSTR(format_spec); +    if (tmp == NULL) +        goto done; +    format_spec = tmp; +#endif + +    /* check for the special case of zero length format spec, make +       it equivalent to str(value) */ +    if (STRINGLIB_LEN(format_spec) == 0) { +        result = STRINGLIB_TOSTR(value); +        goto done; +    } + + +    /* parse the format_spec */ +    if (!parse_internal_render_format_spec(format_spec, &format, 's')) +        goto done; + +    /* type conversion? */ +    switch (format.type) { +    case 's': +        /* no type conversion needed, already a string.  
do the formatting */ +        result = format_string_internal(value, &format); +        break; +    default: +        /* unknown */ +        PyErr_Format(PyExc_ValueError, "Unknown conversion type %c", +                     format.type); +        goto done; +    } + +done: +#if PY_VERSION_HEX < 0x03000000 +    Py_XDECREF(tmp); +#endif +    return result; +} +#endif /* FORMAT_STRING */ + +#if defined FORMAT_LONG || defined FORMAT_INT +static PyObject* +format_int_or_long(PyObject* value, PyObject* args, IntOrLongToString tostring) +{ +    PyObject *format_spec; +    PyObject *result = NULL; +    PyObject *tmp = NULL; +    InternalFormatSpec format; + +    if (!PyArg_ParseTuple(args, STRINGLIB_PARSE_CODE ":__format__", +			  &format_spec)) +        goto done; + +    /* check for the special case of zero length format spec, make +       it equivalent to str(value) */ +    if (STRINGLIB_LEN(format_spec) == 0) { +        result = STRINGLIB_TOSTR(value); +        goto done; +    } + +    /* parse the format_spec */ +    if (!parse_internal_render_format_spec(format_spec, &format, 'd')) +        goto done; + +    /* type conversion? */ +    switch (format.type) { +    case 'b': +    case 'c': +    case 'd': +    case 'o': +    case 'x': +    case 'X': +        /* no type conversion needed, already an int (or long).  
do +	   the formatting */ +	    result = format_int_or_long_internal(value, &format, tostring); +        break; + +    case 'e': +    case 'E': +    case 'f': +    case 'F': +    case 'g': +    case 'G': +    case 'n': +    case '%': +        /* convert to float */ +        tmp = PyNumber_Float(value); +        if (tmp == NULL) +            goto done; +        result = format_float_internal(value, &format); +        break; + +    default: +        /* unknown */ +        PyErr_Format(PyExc_ValueError, "Unknown conversion type %c", +                     format.type); +        goto done; +    } + +done: +    Py_XDECREF(tmp); +    return result; +} +#endif /* FORMAT_LONG || defined FORMAT_INT */ + +#ifdef FORMAT_LONG +/* Need to define long_format as a function that will convert a long +   to a string.  In 3.0, _PyLong_Format has the correct signature.  In +   2.x, we need to fudge a few parameters */ +#if PY_VERSION_HEX >= 0x03000000 +#define long_format _PyLong_Format +#else +static PyObject* +long_format(PyObject* value, int base) +{ +    /* Convert to base, don't add trailing 'L', and use the new octal +       format. We already know this is a long object */ +    assert(PyLong_Check(value)); +    /* convert to base, don't add 'L', and use the new octal format */ +    return _PyLong_Format(value, base, 0, 1); +} +#endif + +PyObject * +FORMAT_LONG(PyObject* value, PyObject* args) +{ +    return format_int_or_long(value, args, long_format); +} +#endif /* FORMAT_LONG */ + +#ifdef FORMAT_INT +/* this is only used for 2.x, not 3.0 */ +static PyObject* +int_format(PyObject* value, int base) +{ +    /* Convert to base, and use the new octal format. 
We already +       know this is an int object */ +    assert(PyInt_Check(value)); +    return _PyInt_Format((PyIntObject*)value, base, 1); +} + +PyObject * +FORMAT_INT(PyObject* value, PyObject* args) +{ +    return format_int_or_long(value, args, int_format); +} +#endif /* FORMAT_INT */ + +#ifdef FORMAT_FLOAT +PyObject * +FORMAT_FLOAT(PyObject *value, PyObject *args) +{ +    PyObject *format_spec; +    PyObject *result = NULL; +    InternalFormatSpec format; + +    if (!PyArg_ParseTuple(args, STRINGLIB_PARSE_CODE ":__format__", &format_spec)) +        goto done; + +    /* check for the special case of zero length format spec, make +       it equivalent to str(value) */ +    if (STRINGLIB_LEN(format_spec) == 0) { +        result = STRINGLIB_TOSTR(value); +        goto done; +    } + +    /* parse the format_spec */ +    if (!parse_internal_render_format_spec(format_spec, &format, 'g')) +        goto done; + +    /* type conversion? */ +    switch (format.type) { +    case 'e': +    case 'E': +    case 'f': +    case 'F': +    case 'g': +    case 'G': +    case 'n': +    case '%': +        /* no conversion, already a float.  do the formatting */ +        result = format_float_internal(value, &format); +        break; + +    default: +        /* unknown */ +        PyErr_Format(PyExc_ValueError, "Unknown conversion type %c", +                     format.type); +        goto done; +    } + +done: +    return result; +} +#endif /* FORMAT_FLOAT */ diff --git a/Objects/stringlib/string_format.h b/Objects/stringlib/string_format.h new file mode 100644 index 0000000..70f8f13 --- /dev/null +++ b/Objects/stringlib/string_format.h @@ -0,0 +1,1214 @@ +/* +    string_format.h -- implementation of string.format(). + +    It uses the Objects/stringlib conventions, so that it can be +    compiled for both unicode and string objects. 
+*/ + + +/* Defines for Python 2.6 compatability */ +#if PY_VERSION_HEX < 0x03000000 +#define PyLong_FromSsize_t _PyLong_FromSsize_t +#endif + +/* Defines for more efficiently reallocating the string buffer */ +#define INITIAL_SIZE_INCREMENT 100 +#define SIZE_MULTIPLIER 2 +#define MAX_SIZE_INCREMENT  3200 + + +/************************************************************************/ +/***********   Global data structures and forward declarations  *********/ +/************************************************************************/ + +/* +   A SubString consists of the characters between two string or +   unicode pointers. +*/ +typedef struct { +    STRINGLIB_CHAR *ptr; +    STRINGLIB_CHAR *end; +} SubString; + + +/* forward declaration for recursion */ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, +             int recursion_depth); + + + +/************************************************************************/ +/**************************  Utility  functions  ************************/ +/************************************************************************/ + +/* fill in a SubString from a pointer and length */ +Py_LOCAL_INLINE(void) +SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len) +{ +    str->ptr = p; +    if (p == NULL) +        str->end = NULL; +    else +        str->end = str->ptr + len; +} + +/* return a new string.  if str->ptr is NULL, return None */ +Py_LOCAL_INLINE(PyObject *) +SubString_new_object(SubString *str) +{ +    if (str->ptr == NULL) { +        Py_INCREF(Py_None); +        return Py_None; +    } +    return STRINGLIB_NEW(str->ptr, str->end - str->ptr); +} + +/* return a new string.  
if str->ptr is NULL, return an empty string */ +Py_LOCAL_INLINE(PyObject *) +SubString_new_object_or_empty(SubString *str) +{ +    if (str->ptr == NULL) { +        return STRINGLIB_NEW(NULL, 0); +    } +    return STRINGLIB_NEW(str->ptr, str->end - str->ptr); +} + +/************************************************************************/ +/***********    Output string management functions       ****************/ +/************************************************************************/ + +typedef struct { +    STRINGLIB_CHAR *ptr;        /* next position to be written */ +    STRINGLIB_CHAR *end;        /* one past the end of the buffer */ +    PyObject *obj;              /* string object that owns the buffer */ +    Py_ssize_t size_increment;  /* extra slack added by the next extend */ +} OutputString; + +/* initialize an OutputString object, reserving size characters. +   returns 0 if the string could not be created, 1 for success */ +static int +output_initialize(OutputString *output, Py_ssize_t size) +{ +    output->obj = STRINGLIB_NEW(NULL, size); +    if (output->obj == NULL) +        return 0; + +    output->ptr = STRINGLIB_STR(output->obj); +    output->end = STRINGLIB_LEN(output->obj) + output->ptr; +    output->size_increment = INITIAL_SIZE_INCREMENT; + +    return 1; +} + +/* +    output_extend reallocates the output string buffer. +    The new size is the current length plus count plus +    size_increment; size_increment is then multiplied by +    SIZE_MULTIPLIER while it is still below MAX_SIZE_INCREMENT. +    It returns a status:  0 for a failed reallocation, +    1 for success. +*/ + +static int +output_extend(OutputString *output, Py_ssize_t count) +{ +    STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj); +    Py_ssize_t curlen = output->ptr - startptr; +    Py_ssize_t maxlen = curlen + count + output->size_increment; + +    if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0) +        return 0; +    /* recompute the pointers from the (possibly new) object */ +    startptr = STRINGLIB_STR(output->obj); +    output->ptr = startptr + curlen; +    output->end = startptr + maxlen; +    if (output->size_increment < MAX_SIZE_INCREMENT) +        output->size_increment *= SIZE_MULTIPLIER; +    return 1; +} + +/* +    output_data dumps characters into our output string +    buffer. + +    In some cases, it has to reallocate the string. + +    It returns a status:  0 for a failed reallocation, +    1 for success. 
+*/ +static int +output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count) +{ +    if ((count > output->end - output->ptr) && !output_extend(output, count)) +        return 0; +    memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR)); +    output->ptr += count; +    return 1; +} + +/************************************************************************/ +/***********  Format string parsing -- integers and identifiers *********/ +/************************************************************************/ + +static Py_ssize_t +get_integer(const SubString *str) +{ +    Py_ssize_t accumulator = 0; +    Py_ssize_t digitval; +    Py_ssize_t oldaccumulator; +    STRINGLIB_CHAR *p; + +    /* empty string is an error */ +    if (str->ptr >= str->end) +        return -1; + +    for (p = str->ptr; p < str->end; p++) { +        digitval = STRINGLIB_TODECIMAL(*p); +        if (digitval < 0) +            return -1; +        /* +           This trick was copied from old Unicode format code.  It's cute, +           but would really suck on an old machine with a slow divide +           implementation.  Fortunately, in the normal case we do not +           expect too many digits. 
+        */ +        oldaccumulator = accumulator; +        accumulator *= 10; +        if ((accumulator+10)/10 != oldaccumulator+1) { +            PyErr_Format(PyExc_ValueError, +                         "Too many decimal digits in format string"); +            return -1; +        } +        accumulator += digitval; +    } +    return accumulator; +} + +/************************************************************************/ +/******** Functions to get field objects and specification strings ******/ +/************************************************************************/ + +/* do the equivalent of obj.name */ +static PyObject * +getattr(PyObject *obj, SubString *name) +{ +    PyObject *newobj; +    PyObject *str = SubString_new_object(name); +    if (str == NULL) +        return NULL; +    newobj = PyObject_GetAttr(obj, str); +    Py_DECREF(str); +    return newobj; +} + +/* do the equivalent of obj[idx], where obj is a sequence */ +static PyObject * +getitem_sequence(PyObject *obj, Py_ssize_t idx) +{ +    return PySequence_GetItem(obj, idx); +} + +/* do the equivalent of obj[idx], where obj is not a sequence */ +static PyObject * +getitem_idx(PyObject *obj, Py_ssize_t idx) +{ +    PyObject *newobj; +    PyObject *idx_obj = PyLong_FromSsize_t(idx); +    if (idx_obj == NULL) +        return NULL; +    newobj = PyObject_GetItem(obj, idx_obj); +    Py_DECREF(idx_obj); +    return newobj; +} + +/* do the equivalent of obj[name] */ +static PyObject * +getitem_str(PyObject *obj, SubString *name) +{ +    PyObject *newobj; +    PyObject *str = SubString_new_object(name); +    if (str == NULL) +        return NULL; +    newobj = PyObject_GetItem(obj, str); +    Py_DECREF(str); +    return newobj; +} + +typedef struct { +    /* the entire string we're parsing.  we assume that someone else +       is managing its lifetime, and that it will exist for the +       lifetime of the iterator.  
can be empty */ +    SubString str; + +    /* pointer to where we are inside field_name */ +    STRINGLIB_CHAR *ptr; +} FieldNameIterator; + + +/* point the iterator at the len characters starting at ptr. +   always returns 1 */ +static int +FieldNameIterator_init(FieldNameIterator *self, STRINGLIB_CHAR *ptr, +                       Py_ssize_t len) +{ +    SubString_init(&self->str, ptr, len); +    self->ptr = self->str.ptr; +    return 1; +} + +/* scan an attribute name: everything up to (but not including) the +   next '.' or '[', or the end of the string.  always returns 1 */ +static int +_FieldNameIterator_attr(FieldNameIterator *self, SubString *name) +{ +    STRINGLIB_CHAR c; + +    name->ptr = self->ptr; + +    /* return everything until '.' or '[' */ +    while (self->ptr < self->str.end) { +        switch (c = *self->ptr++) { +        case '[': +        case '.': +            /* back up so that this character will be seen next time */ +            self->ptr--; +            break; +        default: +            continue; +        } +        break; +    } +    /* end of string is okay */ +    name->end = self->ptr; +    return 1; +} + +/* scan an item key: everything up to the closing ']', which is +   consumed but not included in name.  returns 0 with ValueError set +   if no ']' is found, else 1 */ +static int +_FieldNameIterator_item(FieldNameIterator *self, SubString *name) +{ +    int bracket_seen = 0; +    STRINGLIB_CHAR c; + +    name->ptr = self->ptr; + +    /* return everything until ']' */ +    while (self->ptr < self->str.end) { +        switch (c = *self->ptr++) { +        case ']': +            bracket_seen = 1; +            break; +        default: +            continue; +        } +        break; +    } +    /* make sure we ended with a ']' */ +    if (!bracket_seen) { +        PyErr_SetString(PyExc_ValueError, "Missing ']' in format string"); +        return 0; +    } + +    /* self->ptr is now just past the ']'; don't include the ']' */ +    name->end = self->ptr-1; +    return 1; +} + +/* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */ +static int +FieldNameIterator_next(FieldNameIterator *self, int *is_attribute, +                       Py_ssize_t *name_idx, SubString *name) +{ +    /* check at end of input */ +    if (self->ptr >= self->str.end) +        return 1; + +    switch (*self->ptr++) 
{ +    case '.': +        *is_attribute = 1; +        if (_FieldNameIterator_attr(self, name) == 0) +            return 0; +        *name_idx = -1; +        break; +    case '[': +        *is_attribute = 0; +        if (_FieldNameIterator_item(self, name) == 0) +            return 0; +        *name_idx = get_integer(name); +        break; +    default: +        /* interal error, can't get here */ +        assert(0); +        return 0; +    } + +    /* empty string is an error */ +    if (name->ptr == name->end) { +        PyErr_SetString(PyExc_ValueError, "Empty attribute in format string"); +        return 0; +    } + +    return 2; +} + + +/* input: field_name +   output: 'first' points to the part before the first '[' or '.' +           'first_idx' is -1 if 'first' is not an integer, otherwise +                       it's the value of first converted to an integer +           'rest' is an iterator to return the rest +*/ +static int +field_name_split(STRINGLIB_CHAR *ptr, Py_ssize_t len, SubString *first, +                 Py_ssize_t *first_idx, FieldNameIterator *rest) +{ +    STRINGLIB_CHAR c; +    STRINGLIB_CHAR *p = ptr; +    STRINGLIB_CHAR *end = ptr + len; + +    /* find the part up until the first '.' 
or '[' */ +    while (p < end) { +        switch (c = *p++) { +        case '[': +        case '.': +            /* backup so that we this character is available to the +               "rest" iterator */ +            p--; +            break; +        default: +            continue; +        } +        break; +    } + +    /* set up the return values */ +    SubString_init(first, ptr, p - ptr); +    FieldNameIterator_init(rest, p, end - p); + +    /* see if "first" is an integer, in which case it's used as an index */ +    *first_idx = get_integer(first); + +    /* zero length string is an error */ +    if (first->ptr >= first->end) { +        PyErr_SetString(PyExc_ValueError, "empty field name"); +        goto error; +    } + +    return 1; +error: +    return 0; +} + + +/* +    get_field_object returns the object inside {}, before the +    format_spec.  It handles getindex and getattr lookups and consumes +    the entire input string. +*/ +static PyObject * +get_field_object(SubString *input, PyObject *args, PyObject *kwargs) +{ +    PyObject *obj = NULL; +    int ok; +    int is_attribute; +    SubString name; +    SubString first; +    Py_ssize_t index; +    FieldNameIterator rest; + +    if (!field_name_split(input->ptr, input->end - input->ptr, &first, +                          &index, &rest)) { +        goto error; +    } + +    if (index == -1) { +        /* look up in kwargs */ +        PyObject *key = SubString_new_object(&first); +        if (key == NULL) +            goto error; +        if ((kwargs == NULL) || (obj = PyDict_GetItem(kwargs, key)) == NULL) { +            PyErr_SetObject(PyExc_KeyError, key); +            Py_DECREF(key); +            goto error; +        } +        Py_DECREF(key); +        Py_INCREF(obj); +    } +    else { +        /* look up in args */ +        obj = PySequence_GetItem(args, index); +        if (obj == NULL) +            goto error; +    } + +    /* iterate over the rest of the field_name */ +    while ((ok = 
FieldNameIterator_next(&rest, &is_attribute, &index, +                                        &name)) == 2) { +        PyObject *tmp; + +        if (is_attribute) +            /* getattr lookup "." */ +            tmp = getattr(obj, &name); +        else +            /* getitem lookup "[]" */ +            if (index == -1) +                tmp = getitem_str(obj, &name); +            else +                if (PySequence_Check(obj)) +                    tmp = getitem_sequence(obj, index); +                else +                    /* not a sequence */ +                    tmp = getitem_idx(obj, index); +        if (tmp == NULL) +            goto error; + +        /* assign to obj */ +        Py_DECREF(obj); +        obj = tmp; +    } +    /* end of iterator, this is the non-error case */ +    if (ok == 1) +        return obj; +error: +    Py_XDECREF(obj); +    return NULL; +} + +/************************************************************************/ +/*****************  Field rendering functions  **************************/ +/************************************************************************/ + +/* +    render_field() is the main function in this section.  It takes the +    field object and field specification string generated by +    get_field_and_spec, and renders the field into the output string. + +    render_field calls fieldobj.__format__(format_spec) method, and +    appends to the output. 
+*/ +static int +render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output) +{ +    int ok = 0; +    PyObject *result = NULL; + +    /* we need to create an object out of the pointers we have */ +    PyObject *format_spec_object = SubString_new_object_or_empty(format_spec); +    if (format_spec_object == NULL) +        goto done; + +    result = PyObject_Format(fieldobj, format_spec_object); +    if (result == NULL) +        goto done; + +    ok = output_data(output, +                     STRINGLIB_STR(result), STRINGLIB_LEN(result)); +done: +    /* format_spec_object is NULL when its creation failed, so it +       needs the NULL-tolerant XDECREF just like result */ +    Py_XDECREF(format_spec_object); +    Py_XDECREF(result); +    return ok; +} + +static int +parse_field(SubString *str, SubString *field_name, SubString *format_spec, +            STRINGLIB_CHAR *conversion) +{ +    STRINGLIB_CHAR c = 0; + +    /* initialize these, as they may be empty */ +    *conversion = '\0'; +    SubString_init(format_spec, NULL, 0); + +    /* search for the field name.  it's terminated by the end of the +       string, or a ':' or '!' */ +    field_name->ptr = str->ptr; +    while (str->ptr < str->end) { +        switch (c = *(str->ptr++)) { +        case ':': +        case '!': +            break; +        default: +            continue; +        } +        break; +    } + +    if (c == '!' 
|| c == ':') { +        /* we have a format specifier and/or a conversion */ +        /* don't include the last character */ +        field_name->end = str->ptr-1; + +        /* the format specifier is the rest of the string */ +        format_spec->ptr = str->ptr; +        format_spec->end = str->end; + +        /* see if there's a conversion specifier */ +        if (c == '!') { +            /* there must be another character present */ +            if (format_spec->ptr >= format_spec->end) { +                PyErr_SetString(PyExc_ValueError, +                                "end of format while looking for conversion " +                                "specifier"); +                return 0; +            } +            *conversion = *(format_spec->ptr++); + +            /* if there is another character, it must be a colon.  the +               character just consumed was the conversion specifier, so +               that is what the message names */ +            if (format_spec->ptr < format_spec->end) { +                c = *(format_spec->ptr++); +                if (c != ':') { +                    PyErr_SetString(PyExc_ValueError, +                                    "expected ':' after conversion specifier"); +                    return 0; +                } +            } +        } + +        return 1; + +    } +    else { +        /* end of string, there's no format_spec or conversion */ +        field_name->end = str->ptr; +        return 1; +    } +} + +/************************************************************************/ +/******* Output string allocation and escape-to-markup processing  ******/ +/************************************************************************/ + +/* MarkupIterator breaks the string into pieces of either literal +   text, or things inside {} that need to be marked up.  
it is +   designed to make it easy to wrap a Python iterator around it, for +   use with the Formatter class */ + +typedef struct { +    SubString str; +} MarkupIterator; + +static int +MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len) +{ +    SubString_init(&self->str, ptr, len); +    return 1; +} + +/* returns 0 on error, 1 on non-error termination, and 2 if it got a +   string (or something to be expanded) */ +static int +MarkupIterator_next(MarkupIterator *self, SubString *literal, +                    SubString *field_name, SubString *format_spec, +                    STRINGLIB_CHAR *conversion, +                    int *format_spec_needs_expanding) +{ +    int at_end; +    STRINGLIB_CHAR c = 0; +    STRINGLIB_CHAR *start; +    int count; +    Py_ssize_t len; +    int markup_follows = 0; + +    /* initialize all of the output variables */ +    SubString_init(literal, NULL, 0); +    SubString_init(field_name, NULL, 0); +    SubString_init(format_spec, NULL, 0); +    *conversion = '\0'; +    *format_spec_needs_expanding = 0; + +    /* No more input, end of iterator.  This is the normal exit +       path. */ +    if (self->str.ptr >= self->str.end) +        return 1; + +    start = self->str.ptr; + +    /* First read any literal text. Read until the end of string, an +       escaped '{' or '}', or an unescaped '{'.  In order to never +       allocate memory and so I can just pass pointers around, if +       there's an escaped '{' or '}' then we'll return the literal +       including the brace, but no format object.  The next time +       through, we'll return the rest of the literal, skipping past +       the second consecutive brace. 
*/ +    while (self->str.ptr < self->str.end) { +        switch (c = *(self->str.ptr++)) { +        case '{': +        case '}': +            markup_follows = 1; +            break; +        default: +            continue; +        } +        break; +    } + +    at_end = self->str.ptr >= self->str.end; +    len = self->str.ptr - start; + +    if ((c == '}') && (at_end || (c != *self->str.ptr))) { +        PyErr_SetString(PyExc_ValueError, "Single '}' encountered " +                        "in format string"); +        return 0; +    } +    if (at_end && c == '{') { +        PyErr_SetString(PyExc_ValueError, "Single '{' encountered " +                        "in format string"); +        return 0; +    } +    if (!at_end) { +        if (c == *self->str.ptr) { +            /* escaped } or {, skip it in the input.  there is no +               markup object following us, just this literal text */ +            self->str.ptr++; +            markup_follows = 0; +        } +        else +            len--; +    } + +    /* record the literal text */ +    literal->ptr = start; +    literal->end = start + len; + +    if (!markup_follows) +        return 2; + +    /* this is markup, find the end of the string by counting nested +       braces.  note that this prohibits escaped braces, so that +       format_specs cannot have braces in them. */ +    count = 1; + +    start = self->str.ptr; + +    /* we know we can't have a zero length string, so don't worry +       about that case */ +    while (self->str.ptr < self->str.end) { +        switch (c = *(self->str.ptr++)) { +        case '{': +            /* the format spec needs to be recursively expanded. +               this is an optimization, and not strictly needed */ +            *format_spec_needs_expanding = 1; +            count++; +            break; +        case '}': +            count--; +            if (count <= 0) { +                /* we're done.  
parse and get out */ +                SubString s; + +                SubString_init(&s, start, self->str.ptr - 1 - start); +                if (parse_field(&s, field_name, format_spec, conversion) == 0) +                    return 0; + +                /* a zero length field_name is an error */ +                if (field_name->ptr == field_name->end) { +                    PyErr_SetString(PyExc_ValueError, "zero length field name " +                                    "in format"); +                    return 0; +                } + +                /* success */ +                return 2; +            } +            break; +        } +    } + +    /* end of string while searching for matching '}' */ +    PyErr_SetString(PyExc_ValueError, "unmatched '{' in format"); +    return 0; +} + + +/* do the !r or !s conversion on obj */ +static PyObject * +do_conversion(PyObject *obj, STRINGLIB_CHAR conversion) +{ +    /* XXX in pre-3.0, do we need to convert this to unicode, since it +       might have returned a string? */ +    switch (conversion) { +    case 'r': +        return PyObject_Repr(obj); +    case 's': +        return STRINGLIB_TOSTR(obj); +    default: +        PyErr_Format(PyExc_ValueError, +                     "Unknown converion specifier %c", +                     conversion); +        return NULL; +    } +} + +/* given: + +   {field_name!conversion:format_spec} + +   compute the result and write it to output. +   format_spec_needs_expanding is an optimization.  if it's false, +   just output the string directly, otherwise recursively expand the +   format_spec string. 
*/ + +static int +output_markup(SubString *field_name, SubString *format_spec, +              int format_spec_needs_expanding, STRINGLIB_CHAR conversion, +              OutputString *output, PyObject *args, PyObject *kwargs, +              int recursion_depth) +{ +    PyObject *tmp = NULL; +    PyObject *fieldobj = NULL; +    SubString expanded_format_spec; +    SubString *actual_format_spec; +    /* stays 0 (failure) unless every step below succeeds */ +    int result = 0; + +    /* convert field_name to an object */ +    fieldobj = get_field_object(field_name, args, kwargs); +    if (fieldobj == NULL) +        goto done; + +    if (conversion != '\0') { +        tmp = do_conversion(fieldobj, conversion); +        if (tmp == NULL) +            goto done; + +        /* do the assignment, transferring ownership: fieldobj = tmp */ +        Py_DECREF(fieldobj); +        fieldobj = tmp; +        tmp = NULL; +    } + +    /* if needed, recursively compute the format_spec */ +    if (format_spec_needs_expanding) { +        tmp = build_string(format_spec, args, kwargs, recursion_depth-1); +        if (tmp == NULL) +            goto done; + +        /* note that in the case we're expanding the format string, +           tmp must be kept around until after the call to +           render_field. */ +        SubString_init(&expanded_format_spec, +                       STRINGLIB_STR(tmp), STRINGLIB_LEN(tmp)); +        actual_format_spec = &expanded_format_spec; +    } +    else +        actual_format_spec = format_spec; + +    if (render_field(fieldobj, actual_format_spec, output) == 0) +        goto done; + +    result = 1; + +done: +    Py_XDECREF(fieldobj); +    Py_XDECREF(tmp); + +    return result; +} + +/* +    do_markup is the top-level loop for the format() method.  It +    searches through the format string for escapes to markup codes, and +    calls other functions to move non-markup text to the output, +    and to perform the markup to the output. 
+*/ +static int +do_markup(SubString *input, PyObject *args, PyObject *kwargs, +          OutputString *output, int recursion_depth) +{ +    MarkupIterator iter; +    int format_spec_needs_expanding; +    int result; +    SubString literal; +    SubString field_name; +    SubString format_spec; +    STRINGLIB_CHAR conversion; + +    MarkupIterator_init(&iter, input->ptr, input->end - input->ptr); +    while ((result = MarkupIterator_next(&iter, &literal, &field_name, +                                         &format_spec, &conversion, +                                         &format_spec_needs_expanding)) == 2) { +        if (!output_data(output, literal.ptr, literal.end - literal.ptr)) +            return 0; +        if (field_name.ptr != field_name.end) +            if (!output_markup(&field_name, &format_spec, +                               format_spec_needs_expanding, conversion, output, +                               args, kwargs, recursion_depth)) +                return 0; +    } +    return result; +} + + +/* +    build_string allocates the output string and then +    calls do_markup to do the heavy lifting. +*/ +static PyObject * +build_string(SubString *input, PyObject *args, PyObject *kwargs, +             int recursion_depth) +{ +    OutputString output; +    PyObject *result = NULL; +    Py_ssize_t count; + +    output.obj = NULL; /* needed so cleanup code always works */ + +    /* check the recursion level */ +    if (recursion_depth <= 0) { +        PyErr_SetString(PyExc_ValueError, +                        "Max string recursion exceeded"); +        goto done; +    } + +    /* initial size is the length of the format string, plus the size +       increment.  
seems like a reasonable default */ +    if (!output_initialize(&output, +                           input->end - input->ptr + +                           INITIAL_SIZE_INCREMENT)) +        goto done; + +    if (!do_markup(input, args, kwargs, &output, recursion_depth)) { +        goto done; +    } + +    count = output.ptr - STRINGLIB_STR(output.obj); +    if (STRINGLIB_RESIZE(&output.obj, count) < 0) { +        goto done; +    } + +    /* transfer ownership to result */ +    result = output.obj; +    output.obj = NULL; + +done: +    Py_XDECREF(output.obj); +    return result; +} + +/************************************************************************/ +/*********** main routine ***********************************************/ +/************************************************************************/ + +/* this is the main entry point */ +static PyObject * +do_string_format(PyObject *self, PyObject *args, PyObject *kwargs) +{ +    SubString input; + +    /* PEP 3101 says only 2 levels, so that +       "{0:{1}}".format('abc', 's')            # works +       "{0:{1:{2}}}".format('abc', 's', '')    # fails +    */ +    int recursion_depth = 2; + +    SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self)); +    return build_string(&input, args, kwargs, recursion_depth); +} + + + +/************************************************************************/ +/*********** formatteriterator ******************************************/ +/************************************************************************/ + +/* This is used to implement string.Formatter.vparse().  It exists so +   Formatter can share code with the built in unicode.format() method. +   It's really just a wrapper around MarkupIterator that is callable +   from Python. 
*/ + +typedef struct { +    PyObject_HEAD + +    /* the string being parsed; a reference is held until dealloc */ +    STRINGLIB_OBJECT *str; + +    MarkupIterator it_markup; +} formatteriterobject; + +/* release the reference to the parsed string and free the iterator */ +static void +formatteriter_dealloc(formatteriterobject *it) +{ +    Py_XDECREF(it->str); +    PyObject_FREE(it); +} + +/* returns a tuple: +   (literal, field_name, format_spec, conversion) + +   literal is any literal text to output.  might be zero length +   field_name is the string before the ':'.  might be None +   format_spec is the string after the ':'.  might be None +   conversion is either None, or the string after the '!' +*/ +static PyObject * +formatteriter_next(formatteriterobject *it) +{ +    SubString literal; +    SubString field_name; +    SubString format_spec; +    STRINGLIB_CHAR conversion; +    int format_spec_needs_expanding; +    int result = MarkupIterator_next(&it->it_markup, &literal, &field_name, +                                     &format_spec, &conversion, +                                     &format_spec_needs_expanding); + +    /* all of the SubString objects point into it->str, so no +       memory management needs to be done on them */ +    assert(0 <= result && result <= 2); +    if (result == 0 || result == 1) +        /* if 0, error has already been set, if 1, iterator is empty */ +        return NULL; +    else { +        PyObject *literal_str = NULL; +        PyObject *field_name_str = NULL; +        PyObject *format_spec_str = NULL; +        PyObject *conversion_str = NULL; +        PyObject *tuple = NULL; +        int has_field = field_name.ptr != field_name.end; + +        literal_str = SubString_new_object(&literal); +        if (literal_str == NULL) +            goto done; + +        field_name_str = SubString_new_object(&field_name); +        if (field_name_str == NULL) +            goto done; + +        /* if field_name is non-zero length, return a string for +           format_spec (even if zero length), else return None */ +        format_spec_str = (has_field ? 
+                           SubString_new_object_or_empty : +                           SubString_new_object)(&format_spec); +        if (format_spec_str == NULL) +            goto done; + +        /* if the conversion is not specified, return a None, +           otherwise create a one length string with the conversion +           character */ +        if (conversion == '\0') { +            conversion_str = Py_None; +            Py_INCREF(conversion_str); +        } +        else +	    conversion_str = STRINGLIB_NEW(&conversion, 1); +        if (conversion_str == NULL) +            goto done; + +        tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str, +                             conversion_str); +    done: +        Py_XDECREF(literal_str); +        Py_XDECREF(field_name_str); +        Py_XDECREF(format_spec_str); +        Py_XDECREF(conversion_str); +        return tuple; +    } +} + +static PyMethodDef formatteriter_methods[] = { +    {NULL,		NULL}		/* sentinel */ +}; + +static PyTypeObject PyFormatterIter_Type = { +    PyVarObject_HEAD_INIT(&PyType_Type, 0) +    "formatteriterator",		/* tp_name */ +    sizeof(formatteriterobject),	/* tp_basicsize */ +    0,					/* tp_itemsize */ +    /* methods */ +    (destructor)formatteriter_dealloc,	/* tp_dealloc */ +    0,					/* tp_print */ +    0,					/* tp_getattr */ +    0,					/* tp_setattr */ +    0,					/* tp_compare */ +    0,					/* tp_repr */ +    0,					/* tp_as_number */ +    0,					/* tp_as_sequence */ +    0,					/* tp_as_mapping */ +    0,					/* tp_hash */ +    0,					/* tp_call */ +    0,					/* tp_str */ +    PyObject_GenericGetAttr,		/* tp_getattro */ +    0,					/* tp_setattro */ +    0,					/* tp_as_buffer */ +    Py_TPFLAGS_DEFAULT,			/* tp_flags */ +    0,					/* tp_doc */ +    0,					/* tp_traverse */ +    0,					/* tp_clear */ +    0,					/* tp_richcompare */ +    0,					/* tp_weaklistoffset */ +    PyObject_SelfIter,			/* tp_iter */ +    (iternextfunc)formatteriter_next,	/* 
tp_iternext */ +    formatteriter_methods,		/* tp_methods */ +    0, +}; + +/* unicode_formatter_parser is used to implement +   string.Formatter.vformat.  it parses a string and returns tuples +   describing the parsed elements.  It's a wrapper around +   stringlib/string_format.h's MarkupIterator */ +static PyObject * +formatter_parser(STRINGLIB_OBJECT *self) +{ +    formatteriterobject *it; + +    it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); +    if (it == NULL) +        return NULL; + +    /* take ownership, give the object to the iterator */ +    Py_INCREF(self); +    it->str = self; + +    /* initialize the contained MarkupIterator */ +    MarkupIterator_init(&it->it_markup, +                        STRINGLIB_STR(self), +                        STRINGLIB_LEN(self)); + +    return (PyObject *)it; +} + + +/************************************************************************/ +/*********** fieldnameiterator ******************************************/ +/************************************************************************/ + + +/* This is used to implement string.Formatter.vparse().  It parses the +   field name into attribute and item values.  
It's a Python-callable +   wrapper around FieldNameIterator */ + +typedef struct { +    PyObject_HEAD + +    STRINGLIB_OBJECT *str; + +    FieldNameIterator it_field; +} fieldnameiterobject; + +static void +fieldnameiter_dealloc(fieldnameiterobject *it) +{ +    Py_XDECREF(it->str); +    PyObject_FREE(it); +} + +/* returns a tuple: +   (is_attr, value) +   is_attr is true if we used attribute syntax (e.g., '.foo') +              false if we used index syntax (e.g., '[foo]') +   value is an integer or string +*/ +static PyObject * +fieldnameiter_next(fieldnameiterobject *it) +{ +    int result; +    int is_attr; +    Py_ssize_t idx; +    SubString name; + +    result = FieldNameIterator_next(&it->it_field, &is_attr, +                                    &idx, &name); +    if (result == 0 || result == 1) +        /* if 0, error has already been set, if 1, iterator is empty */ +        return NULL; +    else { +        PyObject* result = NULL; +        PyObject* is_attr_obj = NULL; +        PyObject* obj = NULL; + +        is_attr_obj = PyBool_FromLong(is_attr); +        if (is_attr_obj == NULL) +            goto done; + +        /* either an integer or a string */ +        if (idx != -1) +            obj = PyLong_FromSsize_t(idx); +        else +            obj = SubString_new_object(&name); +        if (obj == NULL) +            goto done; + +        /* return a tuple of values */ +        result = PyTuple_Pack(2, is_attr_obj, obj); + +    done: +        Py_XDECREF(is_attr_obj); +        Py_XDECREF(obj); +        return result; +    } +} + +static PyMethodDef fieldnameiter_methods[] = { +    {NULL,		NULL}		/* sentinel */ +}; + +static PyTypeObject PyFieldNameIter_Type = { +    PyVarObject_HEAD_INIT(&PyType_Type, 0) +    "fieldnameiterator",		/* tp_name */ +    sizeof(fieldnameiterobject),	/* tp_basicsize */ +    0,					/* tp_itemsize */ +    /* methods */ +    (destructor)fieldnameiter_dealloc,	/* tp_dealloc */ +    0,					/* tp_print */ +    0,					/* tp_getattr */ + 
   0,					/* tp_setattr */ +    0,					/* tp_compare */ +    0,					/* tp_repr */ +    0,					/* tp_as_number */ +    0,					/* tp_as_sequence */ +    0,					/* tp_as_mapping */ +    0,					/* tp_hash */ +    0,					/* tp_call */ +    0,					/* tp_str */ +    PyObject_GenericGetAttr,		/* tp_getattro */ +    0,					/* tp_setattro */ +    0,					/* tp_as_buffer */ +    Py_TPFLAGS_DEFAULT,			/* tp_flags */ +    0,					/* tp_doc */ +    0,					/* tp_traverse */ +    0,					/* tp_clear */ +    0,					/* tp_richcompare */ +    0,					/* tp_weaklistoffset */ +    PyObject_SelfIter,			/* tp_iter */ +    (iternextfunc)fieldnameiter_next,	/* tp_iternext */ +    fieldnameiter_methods,		/* tp_methods */ +    0}; + +/* unicode_formatter_field_name_split is used to implement +   string.Formatter.vformat.  it takes an PEP 3101 "field name", and +   returns a tuple of (first, rest): "first", the part before the +   first '.' or '['; and "rest", an iterator for the rest of the field +   name.  it's a wrapper around stringlib/string_format.h's +   field_name_split.  The iterator it returns is a +   FieldNameIterator */ +static PyObject * +formatter_field_name_split(STRINGLIB_OBJECT *self) +{ +    SubString first; +    Py_ssize_t first_idx; +    fieldnameiterobject *it; + +    PyObject *first_obj = NULL; +    PyObject *result = NULL; + +    it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); +    if (it == NULL) +        return NULL; + +    /* take ownership, give the object to the iterator.  
this is +       just to keep the field_name alive */ +    Py_INCREF(self); +    it->str = self; + +    if (!field_name_split(STRINGLIB_STR(self), +                          STRINGLIB_LEN(self), +                          &first, &first_idx, &it->it_field)) +        goto done; + +    /* first becomes an integer, if possible; else a string */ +    if (first_idx != -1) +        first_obj = PyLong_FromSsize_t(first_idx); +    else +        /* convert "first" into a string object */ +        first_obj = SubString_new_object(&first); +    if (first_obj == NULL) +        goto done; + +    /* return a tuple of values */ +    result = PyTuple_Pack(2, first_obj, it); + +done: +    Py_XDECREF(it); +    Py_XDECREF(first_obj); +    return result; +} diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h new file mode 100644 index 0000000..1e0df0f --- /dev/null +++ b/Objects/stringlib/stringdefs.h @@ -0,0 +1,27 @@ +#ifndef STRINGLIB_STRINGDEFS_H +#define STRINGLIB_STRINGDEFS_H + +/* this is sort of a hack.  there's at least one place (formatting +   floats) where some stringlib code takes a different path if it's +   compiled as unicode. */ +#define STRINGLIB_IS_UNICODE     0 + +#define STRINGLIB_OBJECT         PyStringObject +#define STRINGLIB_CHAR           char +#define STRINGLIB_TYPE_NAME      "string" +#define STRINGLIB_PARSE_CODE     "S" +#define STRINGLIB_EMPTY          nullstring +#define STRINGLIB_ISDECIMAL(x)   ((x >= '0') && (x <= '9')) +#define STRINGLIB_TODECIMAL(x)   (STRINGLIB_ISDECIMAL(x) ? 
(x - '0') : -1) +#define STRINGLIB_TOUPPER        toupper +#define STRINGLIB_TOLOWER        tolower +#define STRINGLIB_FILL           memset +#define STRINGLIB_STR            PyString_AS_STRING +#define STRINGLIB_LEN            PyString_GET_SIZE +#define STRINGLIB_NEW            PyString_FromStringAndSize +#define STRINGLIB_RESIZE         _PyString_Resize +#define STRINGLIB_CHECK          PyString_Check +#define STRINGLIB_CMP            memcmp +#define STRINGLIB_TOSTR          PyObject_Str + +#endif /* !STRINGLIB_STRINGDEFS_H */ diff --git a/Objects/stringlib/unicodedefs.h b/Objects/stringlib/unicodedefs.h new file mode 100644 index 0000000..f402a98 --- /dev/null +++ b/Objects/stringlib/unicodedefs.h @@ -0,0 +1,52 @@ +#ifndef STRINGLIB_UNICODEDEFS_H +#define STRINGLIB_UNICODEDEFS_H + +/* this is sort of a hack.  there's at least one place (formatting +   floats) where some stringlib code takes a different path if it's +   compiled as unicode. */ +#define STRINGLIB_IS_UNICODE     1 + +#define STRINGLIB_OBJECT         PyUnicodeObject +#define STRINGLIB_CHAR           Py_UNICODE +#define STRINGLIB_TYPE_NAME      "unicode" +#define STRINGLIB_PARSE_CODE     "U" +#define STRINGLIB_EMPTY          unicode_empty +#define STRINGLIB_ISDECIMAL      Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL      Py_UNICODE_TODECIMAL +#define STRINGLIB_TOUPPER        Py_UNICODE_TOUPPER +#define STRINGLIB_TOLOWER        Py_UNICODE_TOLOWER +#define STRINGLIB_FILL           Py_UNICODE_FILL +#define STRINGLIB_STR            PyUnicode_AS_UNICODE +#define STRINGLIB_LEN            PyUnicode_GET_SIZE +#define STRINGLIB_NEW            PyUnicode_FromUnicode +#define STRINGLIB_RESIZE         PyUnicode_Resize +#define STRINGLIB_CHECK          PyUnicode_Check + +#if PY_VERSION_HEX < 0x03000000 +#define STRINGLIB_TOSTR          PyObject_Unicode +#else +#define STRINGLIB_TOSTR          PyObject_Str +#endif + +#define STRINGLIB_WANT_CONTAINS_OBJ 1 + +/* STRINGLIB_CMP was defined as: + 
+Py_LOCAL_INLINE(int) +STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len) +{ +    if (str[0] != other[0]) +        return 1; +    return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE)); +} + +but unfortunately that gives a error if the function isn't used in a file that +includes this file.  So, reluctantly convert it to a macro instead. */ + +#define STRINGLIB_CMP(str, other, len) \ +    (((str)[0] != (other)[0]) ? \ +     1 : \ +     memcmp((void*) (str), (void*) (other), (len) * sizeof(Py_UNICODE))) + + +#endif /* !STRINGLIB_UNICODEDEFS_H */  | 
