From e226b559637ca8a9972879b1ce33c9d99f449636 Mon Sep 17 00:00:00 2001 From: Eric Smith Date: Mon, 27 Aug 2007 11:28:18 +0000 Subject: PEP 3101: Removed _formatter_xxx routines from sysmodule, and made them unicode methods instead (per GvR suggestion). --- Include/unicodeobject.h | 3 - Lib/string.py | 10 +- Objects/unicodeobject.c | 2687 ++++++++++++++++++++++++----------------------- Python/sysmodule.c | 51 - 4 files changed, 1351 insertions(+), 1400 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 8184f3a..4d8e45a 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1437,9 +1437,6 @@ PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr( const Py_UNICODE *s, Py_UNICODE c ); -PyObject *_PyUnicode_FormatterIterator(PyObject *str); -PyObject *_PyUnicode_FormatterFieldNameSplit(PyObject *field_name); - #ifdef __cplusplus } #endif diff --git a/Lib/string.py b/Lib/string.py index edf5ae2..0663f31 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -200,10 +200,8 @@ class Template(metaclass=_TemplateMetaclass): # exposed here via the sys module. sys was chosen because it's always # available and doesn't have to be dynamically loaded. -# The overall parser is implemented in sys._formatter_parser. -# The field name parser is implemented in sys._formatter_field_name_split - -from sys import _formatter_parser, _formatter_field_name_split +# The overall parser is implemented in str._formatter_parser. +# The field name parser is implemented in str._formatter_field_name_split class Formatter: def format(self, format_string, *args, **kwargs): @@ -213,13 +211,13 @@ class Formatter: used_args = set() result = [] for (is_markup, literal, field_name, format_spec, conversion) in \ - _formatter_parser(format_string): + format_string._formatter_parser(): if is_markup: # given the field_name, find the object it references # split it into the first part, and and iterator that # looks over the rest - first, rest = _formatter_field_name_split(field_name) + first, rest = field_name._formatter_field_name_split() used_args.add(first) obj = self.get_value(first, args, kwargs) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 84f55b5..c46da45 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -598,7 +598,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) */ if ((*f == 'l' || *f == 'z') && (f[1] == 'd' || f[1] == 'u')) - ++f; + ++f; switch (*f) { case 'c': @@ -7981,1502 +7981,1509 @@ unicode__format__(PyObject *self, PyObject *args) } -static PyObject * -unicode_getnewargs(PyUnicodeObject *v) -{ - return Py_BuildValue("(u#)", v->str, v->length); -} - - -static PyMethodDef unicode_methods[] = { +/********************* Formatter Iterator ************************/ - /* Order is according to common usage: often used methods should - appear first, since lookup is done sequentially. */ +/* this is used to implement string.Formatter.vparse(). it exists so + Formatter can share code with the built in unicode.format() + method */ - {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, - {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, - {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, - {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, - {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, - {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, - {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, - {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, - {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, - {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, - {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, - {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, - {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, - {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, - {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, - {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, - {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__}, -/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ - {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, - {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, - {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, - {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, - {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, - {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, - {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, - {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, - {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, - {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, - {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, - {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, - {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, - {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, - {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, - {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, - {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, - {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, - {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, - {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, - {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, - {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, - {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, - {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, - {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, -#if 0 - {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, -#endif +typedef struct { + PyObject_HEAD -#if 0 - /* This one is just used for debugging the implementation. */ - {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, -#endif + PyUnicodeObject *str; - {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, - {NULL, NULL} -}; + MarkupIterator it_markup; +} formatteriterobject; -static PyObject * -unicode_mod(PyObject *v, PyObject *w) +static void +formatteriter_dealloc(formatteriterobject *it) { - if (!PyUnicode_Check(v)) { - Py_INCREF(Py_NotImplemented); - return Py_NotImplemented; - } - return PyUnicode_Format(v, w); + Py_XDECREF(it->str); + PyObject_FREE(it); } -static PyNumberMethods unicode_as_number = { - 0, /*nb_add*/ - 0, /*nb_subtract*/ - 0, /*nb_multiply*/ - unicode_mod, /*nb_remainder*/ -}; - -static PySequenceMethods unicode_as_sequence = { - (lenfunc) unicode_length, /* sq_length */ - PyUnicode_Concat, /* sq_concat */ - (ssizeargfunc) unicode_repeat, /* sq_repeat */ - (ssizeargfunc) unicode_getitem, /* sq_item */ - (ssizessizeargfunc) unicode_slice, /* sq_slice */ - 0, /* sq_ass_item */ - 0, /* sq_ass_slice */ - PyUnicode_Contains, /* sq_contains */ -}; - -static PyObject* -unicode_subscript(PyUnicodeObject* self, PyObject* item) +/* returns a tuple: + (is_markup, literal, field_name, format_spec, conversion) + if is_markup == True: + literal is None + field_name is the string before the ':' + format_spec is the string after the ':' + conversion is either None, or the string after the '!' + if is_markup == False: + literal is the literal string + field_name is None + format_spec is None + conversion is None +*/ +static PyObject * +formatteriter_next(formatteriterobject *it) { - if (PyIndex_Check(item)) { - Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); - if (i == -1 && PyErr_Occurred()) - return NULL; - if (i < 0) - i += PyUnicode_GET_SIZE(self); - return unicode_getitem(self, i); - } else if (PySlice_Check(item)) { - Py_ssize_t start, stop, step, slicelength, cur, i; - Py_UNICODE* source_buf; - Py_UNICODE* result_buf; - PyObject* result; - - if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), - &start, &stop, &step, &slicelength) < 0) { - return NULL; - } + SubString literal; + SubString field_name; + SubString format_spec; + Py_UNICODE conversion; + int is_markup; + int format_spec_needs_expanding; + int result = MarkupIterator_next(&it->it_markup, &is_markup, &literal, + &field_name, &format_spec, &conversion, + &format_spec_needs_expanding); - if (slicelength <= 0) { - return PyUnicode_FromUnicode(NULL, 0); + /* all of the SubString objects point into it->str, so no + memory management needs to be done on them */ + assert(0 <= result && result <= 2); + if (result == 0) { + /* error has already been set */ + return NULL; + } else if (result == 1) { + /* end of iterator */ + return NULL; } else { - source_buf = PyUnicode_AS_UNICODE((PyObject*)self); - result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength* - sizeof(Py_UNICODE)); - - if (result_buf == NULL) - return PyErr_NoMemory(); + PyObject *is_markup_bool = NULL; + PyObject *literal_str = NULL; + PyObject *field_name_str = NULL; + PyObject *format_spec_str = NULL; + PyObject *conversion_str = NULL; + PyObject *tuple = NULL; - for (cur = start, i = 0; i < slicelength; cur += step, i++) { - result_buf[i] = source_buf[cur]; - } + is_markup_bool = PyBool_FromLong(is_markup); + if (!is_markup_bool) + return NULL; - result = PyUnicode_FromUnicode(result_buf, slicelength); - PyMem_FREE(result_buf); - return result; - } - } else { - PyErr_SetString(PyExc_TypeError, "string indices must be integers"); - return NULL; - } -} + if (is_markup) { + /* field_name, format_spec, and conversion are + returned */ + literal_str = Py_None; + Py_INCREF(literal_str); -static PyMappingMethods unicode_as_mapping = { - (lenfunc)unicode_length, /* mp_length */ - (binaryfunc)unicode_subscript, /* mp_subscript */ - (objobjargproc)0, /* mp_ass_subscript */ -}; + field_name_str = SubString_new_object(&field_name); + if (field_name_str == NULL) + goto error; + format_spec_str = SubString_new_object(&format_spec); + if (format_spec_str == NULL) + goto error; -static int -unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags) -{ + /* if the conversion is not specified, return + a None, otherwise create a one length + string with the conversion characater */ + if (conversion == '\0') { + conversion_str = Py_None; + Py_INCREF(conversion_str); + } else + conversion_str = PyUnicode_FromUnicode(&conversion, + 1); + if (conversion_str == NULL) + goto error; + } else { + /* only literal is returned */ + literal_str = SubString_new_object(&literal); + if (literal_str == NULL) + goto error; - if (flags & PyBUF_CHARACTER) { - PyObject *str; - - str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); - if (str == NULL) return -1; - return PyBuffer_FillInfo(view, (void *)PyString_AS_STRING(str), - PyString_GET_SIZE(str), 1, flags); - } - else { - return PyBuffer_FillInfo(view, (void *)self->str, - PyUnicode_GET_DATA_SIZE(self), 1, flags); - } -} + field_name_str = Py_None; + format_spec_str = Py_None; + conversion_str = Py_None; + Py_INCREF(field_name_str); + Py_INCREF(format_spec_str); + Py_INCREF(conversion_str); + } + tuple = PyTuple_Pack(5, is_markup_bool, literal_str, + field_name_str, format_spec_str, + conversion_str); + error: + Py_XDECREF(is_markup_bool); + Py_XDECREF(literal_str); + Py_XDECREF(field_name_str); + Py_XDECREF(format_spec_str); + Py_XDECREF(conversion_str); + return tuple; + } +} -/* Helpers for PyUnicode_Format() */ +static PyMethodDef formatteriter_methods[] = { + {NULL, NULL} /* sentinel */ +}; -static PyObject * -getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) -{ - Py_ssize_t argidx = *p_argidx; - if (argidx < arglen) { - (*p_argidx)++; - if (arglen < 0) - return args; - else - return PyTuple_GetItem(args, argidx); - } - PyErr_SetString(PyExc_TypeError, - "not enough arguments for format string"); - return NULL; -} - -#define F_LJUST (1<<0) -#define F_SIGN (1<<1) -#define F_BLANK (1<<2) -#define F_ALT (1<<3) -#define F_ZERO (1<<4) +PyTypeObject PyFormatterIter_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + "formatteriterator", /* tp_name */ + sizeof(formatteriterobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)formatteriter_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)formatteriter_next, /* tp_iternext */ + formatteriter_methods, /* tp_methods */ + 0, +}; -static Py_ssize_t -strtounicode(Py_UNICODE *buffer, const char *charbuffer) +/* unicode_formatter_parser is used to implement + string.Formatter.vformat. it parses a string and returns tuples + describing the parsed elements. It's a wrapper around + stringlib/string_format.h's MarkupIterator */ +static PyObject * +unicode_formatter_parser(PyUnicodeObject *self) { - register Py_ssize_t i; - Py_ssize_t len = strlen(charbuffer); - for (i = len - 1; i >= 0; i--) - buffer[i] = (Py_UNICODE) charbuffer[i]; + formatteriterobject *it; - return len; -} + it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); + if (it == NULL) + return NULL; -static int -doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) -{ - Py_ssize_t result; + /* take ownership, give the object to the iterator */ + Py_INCREF(self); + it->str = self; - PyOS_ascii_formatd((char *)buffer, len, format, x); - result = strtounicode(buffer, (char *)buffer); - return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); + /* initialize the contained MarkupIterator */ + MarkupIterator_init(&it->it_markup, + PyUnicode_AS_UNICODE(self), + PyUnicode_GET_SIZE(self)); + + return (PyObject *)it; } +/***************** end Formatter Iterator ************************/ +/********************* FieldName Iterator ************************/ -static int -longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) -{ - Py_ssize_t result; +/* this is used to implement string.Formatter.vparse(). it parses + the field name into attribute and item values. */ - PyOS_snprintf((char *)buffer, len, format, x); - result = strtounicode(buffer, (char *)buffer); - return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); -} +typedef struct { + PyObject_HEAD -/* XXX To save some code duplication, formatfloat/long/int could have been - shared with stringobject.c, converting from 8-bit to Unicode after the - formatting is done. */ + PyUnicodeObject *str; -static int -formatfloat(Py_UNICODE *buf, - size_t buflen, - int flags, - int prec, - int type, - PyObject *v) + FieldNameIterator it_field; +} fieldnameiterobject; + +static void +fieldnameiter_dealloc(fieldnameiterobject *it) { - /* fmt = '%#.' + `prec` + `type` - worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ - char fmt[20]; - double x; + Py_XDECREF(it->str); + PyObject_FREE(it); +} - x = PyFloat_AsDouble(v); - if (x == -1.0 && PyErr_Occurred()) - return -1; - if (prec < 0) - prec = 6; - if (type == 'f' && (fabs(x) / 1e25) >= 1e25) - type = 'g'; - /* Worst case length calc to ensure no buffer overrun: +/* returns a tuple: + (is_attr, value) + is_attr is true if we used attribute syntax (e.g., '.foo') + false if we used index syntax (e.g., '[foo]') + value is an integer or string +*/ +static PyObject * +fieldnameiter_next(fieldnameiterobject *it) +{ + int result; + int is_attr; + Py_ssize_t idx; + SubString name; - 'g' formats: - fmt = %#.g - buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp - for any double rep.) - len = 1 + prec + 1 + 2 + 5 = 9 + prec + result = FieldNameIterator_next(&it->it_field, &is_attr, + &idx, &name); + if (result == 0 || result == 1) { + /* if 0, error has already been set, if 1, iterator is empty */ + return NULL; + } else { + PyObject* result = NULL; + PyObject* is_attr_obj = NULL; + PyObject* obj = NULL; - 'f' formats: - buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) - len = 1 + 50 + 1 + prec = 52 + prec + is_attr_obj = PyBool_FromLong(is_attr); + if (is_attr_obj == NULL) + goto error; - If prec=0 the effective precision is 1 (the leading digit is - always given), therefore increase the length by one. + /* either an integer or a string */ + if (idx != -1) + obj = PyInt_FromSsize_t(idx); + else + obj = STRINGLIB_NEW(name.ptr, name.end - name.ptr); + if (obj == NULL) + goto error; - */ - if (((type == 'g' || type == 'G') && - buflen <= (size_t)10 + (size_t)prec) || - (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { - PyErr_SetString(PyExc_OverflowError, - "formatted float is too long (precision too large?)"); - return -1; - } - PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", - (flags&F_ALT) ? "#" : "", - prec, type); - return doubletounicode(buf, buflen, fmt, x); -} + /* return a tuple of values */ + result = PyTuple_Pack(2, is_attr_obj, obj); + if (result == NULL) + goto error; -static PyObject* -formatlong(PyObject *val, int flags, int prec, int type) -{ - char *buf; - int len; - PyObject *str; /* temporary string object. */ - PyObject *result; + return result; - str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); - if (!str) - return NULL; - result = PyUnicode_FromStringAndSize(buf, len); - Py_DECREF(str); - return result; + error: + Py_XDECREF(result); + Py_XDECREF(is_attr_obj); + Py_XDECREF(obj); + return NULL; + } + return NULL; } -static int -formatint(Py_UNICODE *buf, - size_t buflen, - int flags, - int prec, - int type, - PyObject *v) +static PyMethodDef fieldnameiter_methods[] = { + {NULL, NULL} /* sentinel */ +}; + +static PyTypeObject PyFieldNameIter_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + "fieldnameiterator", /* tp_name */ + sizeof(fieldnameiterobject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)fieldnameiter_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)fieldnameiter_next, /* tp_iternext */ + fieldnameiter_methods, /* tp_methods */ + 0}; + +/* unicode_formatter_field_name_split is used to implement + string.Formatter.vformat. it takes an PEP 3101 "field name", and + returns a tuple of (first, rest): "first", the part before the + first '.' or '['; and "rest", an iterator for the rest of the field + name. it's a wrapper around stringlib/string_format.h's + field_name_split. The iterator it returns is a + FieldNameIterator */ +static PyObject * +unicode_formatter_field_name_split(PyUnicodeObject *self) { - /* fmt = '%#.' + `prec` + 'l' + `type` - * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) - * + 1 + 1 - * = 24 - */ - char fmt[64]; /* plenty big enough! */ - char *sign; - long x; + SubString first; + Py_ssize_t first_idx; + fieldnameiterobject *it; - x = PyInt_AsLong(v); - if (x == -1 && PyErr_Occurred()) - return -1; - if (x < 0 && type == 'u') { - type = 'd'; - } - if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) - sign = "-"; - else - sign = ""; - if (prec < 0) - prec = 1; + PyObject *first_obj = NULL; + PyObject *result = NULL; - /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) - * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 - */ - if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { - PyErr_SetString(PyExc_OverflowError, - "formatted integer is too long (precision too large?)"); - return -1; - } + it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); + if (it == NULL) + return NULL; - if ((flags & F_ALT) && - (type == 'x' || type == 'X' || type == 'o')) { - /* When converting under %#o, %#x or %#X, there are a number - * of issues that cause pain: - * - for %#o, we want a different base marker than C - * - when 0 is being converted, the C standard leaves off - * the '0x' or '0X', which is inconsistent with other - * %#x/%#X conversions and inconsistent with Python's - * hex() function - * - there are platforms that violate the standard and - * convert 0 with the '0x' or '0X' - * (Metrowerks, Compaq Tru64) - * - there are platforms that give '0x' when converting - * under %#X, but convert 0 in accordance with the - * standard (OS/2 EMX) - * - * We can achieve the desired consistency by inserting our - * own '0x' or '0X' prefix, and substituting %x/%X in place - * of %#x/%#X. - * - * Note that this is the same approach as used in - * formatint() in stringobject.c - */ - PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", - sign, type, prec, type); - } - else { - PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", - sign, (flags&F_ALT) ? "#" : "", - prec, type); - } - if (sign[0]) - return longtounicode(buf, buflen, fmt, -x); - else - return longtounicode(buf, buflen, fmt, x); -} + /* take ownership, give the object to the iterator. this is + just to keep the field_name alive */ + Py_INCREF(self); + it->str = self; -static int -formatchar(Py_UNICODE *buf, - size_t buflen, - PyObject *v) -{ - /* presume that the buffer is at least 2 characters long */ - if (PyUnicode_Check(v)) { - if (PyUnicode_GET_SIZE(v) != 1) - goto onError; - buf[0] = PyUnicode_AS_UNICODE(v)[0]; - } + if (!field_name_split(STRINGLIB_STR(self), + STRINGLIB_LEN(self), + &first, &first_idx, &it->it_field)) + goto error; - else if (PyString_Check(v)) { - if (PyString_GET_SIZE(v) != 1) - goto onError; - buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; - } + /* first becomes an integer, if possible, else a string */ + if (first_idx != -1) + first_obj = PyInt_FromSsize_t(first_idx); + else + /* convert "first" into a string object */ + first_obj = STRINGLIB_NEW(first.ptr, first.end - first.ptr); + if (first_obj == NULL) + goto error; - else { - /* Integer input truncated to a character */ - long x; - x = PyInt_AsLong(v); - if (x == -1 && PyErr_Occurred()) - goto onError; -#ifdef Py_UNICODE_WIDE - if (x < 0 || x > 0x10ffff) { - PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x110000) " - "(wide Python build)"); - return -1; - } -#else - if (x < 0 || x > 0xffff) { - PyErr_SetString(PyExc_OverflowError, - "%c arg not in range(0x10000) " - "(narrow Python build)"); - return -1; - } -#endif - buf[0] = (Py_UNICODE) x; - } - buf[1] = '\0'; - return 1; + /* return a tuple of values */ + result = PyTuple_Pack(2, first_obj, it); - onError: - PyErr_SetString(PyExc_TypeError, - "%c requires int or char"); - return -1; +error: + Py_XDECREF(it); + Py_XDECREF(first_obj); + return result; } +/***************** end FieldName Iterator ************************/ -/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) - - FORMATBUFLEN is the length of the buffer in which the floats, ints, & - chars are formatted. XXX This is a magic number. Each formatting - routine does bounds checking to ensure no overflow, but a better - solution may be to malloc a buffer of appropriate size for each - format. For now, the current solution is sufficient. -*/ -#define FORMATBUFLEN (size_t)120 -PyObject *PyUnicode_Format(PyObject *format, - PyObject *args) +static PyObject * +unicode_getnewargs(PyUnicodeObject *v) { - Py_UNICODE *fmt, *res; - Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; - int args_owned = 0; - PyUnicodeObject *result = NULL; - PyObject *dict = NULL; - PyObject *uformat; + return Py_BuildValue("(u#)", v->str, v->length); +} - if (format == NULL || args == NULL) { - PyErr_BadInternalCall(); - return NULL; - } - uformat = PyUnicode_FromObject(format); - if (uformat == NULL) - return NULL; - fmt = PyUnicode_AS_UNICODE(uformat); - fmtcnt = PyUnicode_GET_SIZE(uformat); - reslen = rescnt = fmtcnt + 100; - result = _PyUnicode_New(reslen); - if (result == NULL) - goto onError; - res = PyUnicode_AS_UNICODE(result); +static PyMethodDef unicode_methods[] = { - if (PyTuple_Check(args)) { - arglen = PyTuple_Size(args); - argidx = 0; - } - else { - arglen = -1; - argidx = -2; - } - if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) && - !PyObject_TypeCheck(args, &PyBaseString_Type)) - dict = args; + /* Order is according to common usage: often used methods should + appear first, since lookup is done sequentially. */ - while (--fmtcnt >= 0) { - if (*fmt != '%') { - if (--rescnt < 0) { - rescnt = fmtcnt + 100; - reslen += rescnt; - if (_PyUnicode_Resize(&result, reslen) < 0) - goto onError; - res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; - --rescnt; - } - *res++ = *fmt++; - } - else { - /* Got a format specifier */ - int flags = 0; - Py_ssize_t width = -1; - int prec = -1; - Py_UNICODE c = '\0'; - Py_UNICODE fill; - PyObject *v = NULL; - PyObject *temp = NULL; - Py_UNICODE *pbuf; - Py_UNICODE sign; - Py_ssize_t len; - Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ + {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, + {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, + {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, + {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, + {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, + {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, + {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, + {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, + {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, + {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, + {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, + {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, + {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, + {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, + {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, + {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, + {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__}, +/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ + {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, + {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, + {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, + {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, + {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, + {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, + {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, + {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, + {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, + {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, + {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, + {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, + {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, + {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, + {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, + {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, + {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, + {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, + {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, + {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, + {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, + {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, + {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, + {"format", (PyCFunction) unicode_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, + {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, + {"_formatter_field_name_split", (PyCFunction) unicode_formatter_field_name_split, METH_NOARGS}, + {"_formatter_parser", (PyCFunction) unicode_formatter_parser, METH_NOARGS}, +#if 0 + {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, +#endif - fmt++; - if (*fmt == '(') { - Py_UNICODE *keystart; - Py_ssize_t keylen; - PyObject *key; - int pcount = 1; +#if 0 + /* This one is just used for debugging the implementation. */ + {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, +#endif - if (dict == NULL) { - PyErr_SetString(PyExc_TypeError, - "format requires a mapping"); - goto onError; - } - ++fmt; - --fmtcnt; - keystart = fmt; - /* Skip over balanced parentheses */ - while (pcount > 0 && --fmtcnt >= 0) { - if (*fmt == ')') - --pcount; - else if (*fmt == '(') - ++pcount; - fmt++; - } - keylen = fmt - keystart - 1; - if (fmtcnt < 0 || pcount > 0) { - PyErr_SetString(PyExc_ValueError, - "incomplete format key"); - goto onError; - } -#if 0 - /* keys are converted to strings using UTF-8 and - then looked up since Python uses strings to hold - variables names etc. in its namespaces and we - wouldn't want to break common idioms. */ - key = PyUnicode_EncodeUTF8(keystart, - keylen, - NULL); -#else - key = PyUnicode_FromUnicode(keystart, keylen); -#endif - if (key == NULL) - goto onError; - if (args_owned) { - Py_DECREF(args); - args_owned = 0; - } - args = PyObject_GetItem(dict, key); - Py_DECREF(key); - if (args == NULL) { - goto onError; - } - args_owned = 1; - arglen = -1; - argidx = -2; - } - while (--fmtcnt >= 0) { - switch (c = *fmt++) { - case '-': flags |= F_LJUST; continue; - case '+': flags |= F_SIGN; continue; - case ' ': flags |= F_BLANK; continue; - case '#': flags |= F_ALT; continue; - case '0': flags |= F_ZERO; continue; - } - break; - } - if (c == '*') { - v = getnextarg(args, arglen, &argidx); - if (v == NULL) - goto onError; - if (!PyInt_Check(v)) { - PyErr_SetString(PyExc_TypeError, - "* wants int"); - goto onError; - } - width = PyInt_AsLong(v); - if (width == -1 && PyErr_Occurred()) - goto onError; - if (width < 0) { - flags |= F_LJUST; - width = -width; - } - if (--fmtcnt >= 0) - c = *fmt++; - } - else if (c >= '0' && c <= '9') { - width = c - '0'; - while (--fmtcnt >= 0) { - c = *fmt++; - if (c < '0' || c > '9') - break; - if ((width*10) / 10 != width) { - PyErr_SetString(PyExc_ValueError, - "width too big"); - goto onError; - } - width = width*10 + (c - '0'); - } - } - if (c == '.') { - prec = 0; - if (--fmtcnt >= 0) - c = *fmt++; - if (c == '*') { - v = getnextarg(args, arglen, &argidx); - if (v == NULL) - goto onError; - if (!PyInt_Check(v)) { - PyErr_SetString(PyExc_TypeError, - "* wants int"); - goto onError; - } - prec = PyInt_AsLong(v); - if (prec == -1 && PyErr_Occurred()) - goto onError; - if (prec < 0) - prec = 0; - if (--fmtcnt >= 0) - c = *fmt++; - } - else if (c >= '0' && c <= '9') { - prec = c - '0'; - while (--fmtcnt >= 0) { - c = Py_CHARMASK(*fmt++); - if (c < '0' || c > '9') - break; - if ((prec*10) / 10 != prec) { - PyErr_SetString(PyExc_ValueError, - "prec too big"); - goto onError; - } - prec = prec*10 + (c - '0'); - } - } - } /* prec */ - if (fmtcnt >= 0) { - if (c == 'h' || c == 'l' || c == 'L') { - if (--fmtcnt >= 0) - c = *fmt++; - } - } - if (fmtcnt < 0) { - PyErr_SetString(PyExc_ValueError, - "incomplete format"); - goto onError; - } - if (c != '%') { - v = getnextarg(args, arglen, &argidx); - if (v == NULL) - goto onError; - } - sign = 0; - fill = ' '; - switch (c) { - - case '%': - pbuf = formatbuf; - /* presume that buffer length is at least 1 */ - pbuf[0] = '%'; - len = 1; - break; - - case 's': - case 'r': - if (PyUnicode_Check(v) && c == 's') { - temp = v; - Py_INCREF(temp); - } - else { - PyObject *unicode; - if (c == 's') - temp = PyObject_Unicode(v); - else - temp = PyObject_Repr(v); - if (temp == NULL) - goto onError; - if (PyUnicode_Check(temp)) - /* nothing to do */; - else if (PyString_Check(temp)) { - /* convert to string to Unicode */ - unicode = PyUnicode_Decode(PyString_AS_STRING(temp), - PyString_GET_SIZE(temp), - NULL, - "strict"); - Py_DECREF(temp); - temp = unicode; - if (temp == NULL) - goto onError; - } - else { - Py_DECREF(temp); - PyErr_SetString(PyExc_TypeError, - "%s argument has non-string str()"); - goto onError; - } - } - pbuf = PyUnicode_AS_UNICODE(temp); - len = PyUnicode_GET_SIZE(temp); - if (prec >= 0 && len > prec) - len = prec; - break; - - case 'i': - case 'd': - case 'u': - case 'o': - case 'x': - case 'X': - if (c == 'i') - c = 'd'; - if (PyLong_Check(v)) { - temp = formatlong(v, flags, prec, c); - if (!temp) - goto onError; - pbuf = PyUnicode_AS_UNICODE(temp); - len = PyUnicode_GET_SIZE(temp); - sign = 1; - } - else { - pbuf = formatbuf; - len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), - flags, prec, c, v); - if (len < 0) - goto onError; - sign = 1; - } - if (flags & F_ZERO) - fill = '0'; - break; - - case 'e': - case 'E': - case 'f': - case 'F': - case 'g': - case 'G': - if (c == 'F') - c = 'f'; - pbuf = formatbuf; - len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), - flags, prec, c, v); - if (len < 0) - goto onError; - sign = 1; - if (flags & F_ZERO) - fill = '0'; - break; - - case 'c': - pbuf = formatbuf; - len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); - if (len < 0) - goto onError; - break; - - default: - PyErr_Format(PyExc_ValueError, - "unsupported format character '%c' (0x%x) " - "at index %zd", - (31<=c && c<=126) ? (char)c : '?', - (int)c, - (Py_ssize_t)(fmt - 1 - - PyUnicode_AS_UNICODE(uformat))); - goto onError; - } - if (sign) { - if (*pbuf == '-' || *pbuf == '+') { - sign = *pbuf++; - len--; - } - else if (flags & F_SIGN) - sign = '+'; - else if (flags & F_BLANK) - sign = ' '; - else - sign = 0; - } - if (width < len) - width = len; - if (rescnt - (sign != 0) < width) { - reslen -= rescnt; - rescnt = width + fmtcnt + 100; - reslen += rescnt; - if (reslen < 0) { - Py_XDECREF(temp); - PyErr_NoMemory(); - goto onError; - } - if (_PyUnicode_Resize(&result, reslen) < 0) { - Py_XDECREF(temp); - goto onError; - } - res = PyUnicode_AS_UNICODE(result) - + reslen - rescnt; - } - if (sign) { - if (fill != ' ') - *res++ = sign; - rescnt--; - if (width > len) - width--; - } - if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { - assert(pbuf[0] == '0'); - assert(pbuf[1] == c); - if (fill != ' ') { - *res++ = *pbuf++; - *res++ = *pbuf++; - } - rescnt -= 2; - width -= 2; - if (width < 0) - width = 0; - len -= 2; - } - if (width > len && !(flags & F_LJUST)) { - do { - --rescnt; - *res++ = fill; - } while (--width > len); - } - if (fill == ' ') { - if (sign) - *res++ = sign; - if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { - assert(pbuf[0] == '0'); - assert(pbuf[1] == c); - *res++ = *pbuf++; - *res++ = *pbuf++; - } - } - Py_UNICODE_COPY(res, pbuf, len); - res += len; - rescnt -= len; - while (--width >= len) { - --rescnt; - *res++ = ' '; - } - if (dict && (argidx < arglen) && c != '%') { - PyErr_SetString(PyExc_TypeError, - "not all arguments converted during string formatting"); - Py_XDECREF(temp); - goto onError; - } - Py_XDECREF(temp); - } /* '%' */ - } /* until end */ - if (argidx < arglen && !dict) { - PyErr_SetString(PyExc_TypeError, - "not all arguments converted during string formatting"); - goto onError; - } + {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, + {NULL, NULL} +}; - if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) - goto onError; - if (args_owned) { - Py_DECREF(args); - } - Py_DECREF(uformat); - return (PyObject *)result; +static PyObject * +unicode_mod(PyObject *v, PyObject *w) +{ + if (!PyUnicode_Check(v)) { + Py_INCREF(Py_NotImplemented); + return Py_NotImplemented; + } + return PyUnicode_Format(v, w); +} - onError: - Py_XDECREF(result); - Py_DECREF(uformat); - if (args_owned) { - Py_DECREF(args); +static PyNumberMethods unicode_as_number = { + 0, /*nb_add*/ + 0, /*nb_subtract*/ + 0, /*nb_multiply*/ + unicode_mod, /*nb_remainder*/ +}; + +static PySequenceMethods unicode_as_sequence = { + (lenfunc) unicode_length, /* sq_length */ + PyUnicode_Concat, /* sq_concat */ + (ssizeargfunc) unicode_repeat, /* sq_repeat */ + (ssizeargfunc) unicode_getitem, /* sq_item */ + (ssizessizeargfunc) unicode_slice, /* sq_slice */ + 0, /* sq_ass_item */ + 0, /* sq_ass_slice */ + PyUnicode_Contains, /* sq_contains */ +}; + +static PyObject* +unicode_subscript(PyUnicodeObject* self, PyObject* item) +{ + if (PyIndex_Check(item)) { + Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); + if (i == -1 && PyErr_Occurred()) + return NULL; + if (i < 0) + i += PyUnicode_GET_SIZE(self); + return unicode_getitem(self, i); + } else if (PySlice_Check(item)) { + Py_ssize_t start, stop, step, slicelength, cur, i; + Py_UNICODE* source_buf; + Py_UNICODE* result_buf; + PyObject* result; + + if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), + &start, &stop, &step, &slicelength) < 0) { + return NULL; + } + + if (slicelength <= 0) { + return PyUnicode_FromUnicode(NULL, 0); + } else { + source_buf = PyUnicode_AS_UNICODE((PyObject*)self); + result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength* + sizeof(Py_UNICODE)); + + if (result_buf == NULL) + return PyErr_NoMemory(); + + for (cur = start, i = 0; i < slicelength; cur += step, i++) { + result_buf[i] = source_buf[cur]; + } + + result = PyUnicode_FromUnicode(result_buf, slicelength); + PyMem_FREE(result_buf); + return result; + } + } else { + PyErr_SetString(PyExc_TypeError, "string indices must be integers"); + return NULL; } - return NULL; } -static PyBufferProcs unicode_as_buffer = { - (getbufferproc) unicode_buffer_getbuffer, - NULL, +static PyMappingMethods unicode_as_mapping = { + (lenfunc)unicode_length, /* mp_length */ + (binaryfunc)unicode_subscript, /* mp_subscript */ + (objobjargproc)0, /* mp_ass_subscript */ }; -static PyObject * -unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); -static PyObject * -unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +static int +unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags) { - PyObject *x = NULL; - static char *kwlist[] = {"object", "encoding", "errors", 0}; - char *encoding = NULL; - char *errors = NULL; - if (type != &PyUnicode_Type) - return unicode_subtype_new(type, args, kwds); - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", - kwlist, &x, &encoding, &errors)) - return NULL; - if (x == NULL) - return (PyObject *)_PyUnicode_New(0); - if (encoding == NULL && errors == NULL) - return PyObject_Unicode(x); - else - return PyUnicode_FromEncodedObject(x, encoding, errors); + if (flags & PyBUF_CHARACTER) { + PyObject *str; + + str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); + if (str == NULL) return -1; + return PyBuffer_FillInfo(view, (void *)PyString_AS_STRING(str), + PyString_GET_SIZE(str), 1, flags); + } + else { + return PyBuffer_FillInfo(view, (void *)self->str, + PyUnicode_GET_DATA_SIZE(self), 1, flags); + } } + +/* Helpers for PyUnicode_Format() */ + static PyObject * -unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) { - PyUnicodeObject *tmp, *pnew; - Py_ssize_t n; + Py_ssize_t argidx = *p_argidx; + if (argidx < arglen) { + (*p_argidx)++; + if (arglen < 0) + return args; + else + return PyTuple_GetItem(args, argidx); + } + PyErr_SetString(PyExc_TypeError, + "not enough arguments for format string"); + return NULL; +} - assert(PyType_IsSubtype(type, &PyUnicode_Type)); - tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); - if (tmp == NULL) - return NULL; - assert(PyUnicode_Check(tmp)); - pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); - if (pnew == NULL) { - Py_DECREF(tmp); - return NULL; - } - pnew->str = PyMem_NEW(Py_UNICODE, n+1); - if (pnew->str == NULL) { - _Py_ForgetReference((PyObject *)pnew); - PyObject_Del(pnew); - Py_DECREF(tmp); - return PyErr_NoMemory(); - } - Py_UNICODE_COPY(pnew->str, tmp->str, n+1); - pnew->length = n; - pnew->hash = tmp->hash; - Py_DECREF(tmp); - return (PyObject *)pnew; +#define F_LJUST (1<<0) +#define F_SIGN (1<<1) +#define F_BLANK (1<<2) +#define F_ALT (1<<3) +#define F_ZERO (1<<4) + +static Py_ssize_t +strtounicode(Py_UNICODE *buffer, const char *charbuffer) +{ + register Py_ssize_t i; + Py_ssize_t len = strlen(charbuffer); + for (i = len - 1; i >= 0; i--) + buffer[i] = (Py_UNICODE) charbuffer[i]; + + return len; } -PyDoc_STRVAR(unicode_doc, -"str(string [, encoding[, errors]]) -> object\n\ -\n\ -Create a new string object from the given encoded string.\n\ -encoding defaults to the current default string encoding.\n\ -errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); +static int +doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) +{ + Py_ssize_t result; -static PyObject *unicode_iter(PyObject *seq); + PyOS_ascii_formatd((char *)buffer, len, format, x); + result = strtounicode(buffer, (char *)buffer); + return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); +} -PyTypeObject PyUnicode_Type = { - PyVarObject_HEAD_INIT(&PyType_Type, 0) - "str", /* tp_name */ - sizeof(PyUnicodeObject), /* tp_size */ - 0, /* tp_itemsize */ - /* Slots */ - (destructor)unicode_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_compare */ - unicode_repr, /* tp_repr */ - &unicode_as_number, /* tp_as_number */ - &unicode_as_sequence, /* tp_as_sequence */ - &unicode_as_mapping, /* tp_as_mapping */ - (hashfunc) unicode_hash, /* tp_hash*/ - 0, /* tp_call*/ - (reprfunc) unicode_str, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - &unicode_as_buffer, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | - Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ - unicode_doc, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - PyUnicode_RichCompare, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - unicode_iter, /* tp_iter */ - 0, /* tp_iternext */ - unicode_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - &PyBaseString_Type, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - unicode_new, /* tp_new */ - PyObject_Del, /* tp_free */ -}; +static int +longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) +{ + Py_ssize_t result; -/* Initialize the Unicode implementation */ + PyOS_snprintf((char *)buffer, len, format, x); + result = strtounicode(buffer, (char *)buffer); + return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); +} -void _PyUnicode_Init(void) +/* XXX To save some code duplication, formatfloat/long/int could have been + shared with stringobject.c, converting from 8-bit to Unicode after the + formatting is done. */ + +static int +formatfloat(Py_UNICODE *buf, + size_t buflen, + int flags, + int prec, + int type, + PyObject *v) { - int i; + /* fmt = '%#.' + `prec` + `type` + worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ + char fmt[20]; + double x; - /* XXX - move this array to unicodectype.c ? */ - Py_UNICODE linebreak[] = { - 0x000A, /* LINE FEED */ - 0x000D, /* CARRIAGE RETURN */ - 0x001C, /* FILE SEPARATOR */ - 0x001D, /* GROUP SEPARATOR */ - 0x001E, /* RECORD SEPARATOR */ - 0x0085, /* NEXT LINE */ - 0x2028, /* LINE SEPARATOR */ - 0x2029, /* PARAGRAPH SEPARATOR */ - }; + x = PyFloat_AsDouble(v); + if (x == -1.0 && PyErr_Occurred()) + return -1; + if (prec < 0) + prec = 6; + if (type == 'f' && (fabs(x) / 1e25) >= 1e25) + type = 'g'; + /* Worst case length calc to ensure no buffer overrun: - /* Init the implementation */ - unicode_freelist = NULL; - unicode_freelist_size = 0; - unicode_empty = _PyUnicode_New(0); - if (!unicode_empty) - return; + 'g' formats: + fmt = %#.g + buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp + for any double rep.) + len = 1 + prec + 1 + 2 + 5 = 9 + prec - for (i = 0; i < 256; i++) - unicode_latin1[i] = NULL; - if (PyType_Ready(&PyUnicode_Type) < 0) - Py_FatalError("Can't initialize 'unicode'"); + 'f' formats: + buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) + len = 1 + 50 + 1 + prec = 52 + prec - /* initialize the linebreak bloom filter */ - bloom_linebreak = make_bloom_mask( - linebreak, sizeof(linebreak) / sizeof(linebreak[0]) - ); + If prec=0 the effective precision is 1 (the leading digit is + always given), therefore increase the length by one. - PyType_Ready(&EncodingMapType); + */ + if (((type == 'g' || type == 'G') && + buflen <= (size_t)10 + (size_t)prec) || + (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { + PyErr_SetString(PyExc_OverflowError, + "formatted float is too long (precision too large?)"); + return -1; + } + PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", + (flags&F_ALT) ? "#" : "", + prec, type); + return doubletounicode(buf, buflen, fmt, x); } -/* Finalize the Unicode implementation */ +static PyObject* +formatlong(PyObject *val, int flags, int prec, int type) +{ + char *buf; + int len; + PyObject *str; /* temporary string object. */ + PyObject *result; -void -_PyUnicode_Fini(void) + str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); + if (!str) + return NULL; + result = PyUnicode_FromStringAndSize(buf, len); + Py_DECREF(str); + return result; +} + +static int +formatint(Py_UNICODE *buf, + size_t buflen, + int flags, + int prec, + int type, + PyObject *v) { - PyUnicodeObject *u; - int i; + /* fmt = '%#.' + `prec` + 'l' + `type` + * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) + * + 1 + 1 + * = 24 + */ + char fmt[64]; /* plenty big enough! */ + char *sign; + long x; - Py_XDECREF(unicode_empty); - unicode_empty = NULL; + x = PyInt_AsLong(v); + if (x == -1 && PyErr_Occurred()) + return -1; + if (x < 0 && type == 'u') { + type = 'd'; + } + if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) + sign = "-"; + else + sign = ""; + if (prec < 0) + prec = 1; - for (i = 0; i < 256; i++) { - if (unicode_latin1[i]) { - Py_DECREF(unicode_latin1[i]); - unicode_latin1[i] = NULL; - } + /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) + * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 + */ + if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { + PyErr_SetString(PyExc_OverflowError, + "formatted integer is too long (precision too large?)"); + return -1; } - for (u = unicode_freelist; u != NULL;) { - PyUnicodeObject *v = u; - u = *(PyUnicodeObject **)u; - if (v->str) - PyMem_DEL(v->str); - Py_XDECREF(v->defenc); - PyObject_Del(v); + if ((flags & F_ALT) && + (type == 'x' || type == 'X' || type == 'o')) { + /* When converting under %#o, %#x or %#X, there are a number + * of issues that cause pain: + * - for %#o, we want a different base marker than C + * - when 0 is being converted, the C standard leaves off + * the '0x' or '0X', which is inconsistent with other + * %#x/%#X conversions and inconsistent with Python's + * hex() function + * - there are platforms that violate the standard and + * convert 0 with the '0x' or '0X' + * (Metrowerks, Compaq Tru64) + * - there are platforms that give '0x' when converting + * under %#X, but convert 0 in accordance with the + * standard (OS/2 EMX) + * + * We can achieve the desired consistency by inserting our + * own '0x' or '0X' prefix, and substituting %x/%X in place + * of %#x/%#X. + * + * Note that this is the same approach as used in + * formatint() in stringobject.c + */ + PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", + sign, type, prec, type); } - unicode_freelist = NULL; - unicode_freelist_size = 0; + else { + PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", + sign, (flags&F_ALT) ? "#" : "", + prec, type); + } + if (sign[0]) + return longtounicode(buf, buflen, fmt, -x); + else + return longtounicode(buf, buflen, fmt, x); } -void -PyUnicode_InternInPlace(PyObject **p) +static int +formatchar(Py_UNICODE *buf, + size_t buflen, + PyObject *v) { - register PyUnicodeObject *s = (PyUnicodeObject *)(*p); - PyObject *t; - if (s == NULL || !PyUnicode_Check(s)) - Py_FatalError( - "PyUnicode_InternInPlace: unicode strings only please!"); - /* If it's a subclass, we don't really know what putting - it in the interned dict might do. */ - if (!PyUnicode_CheckExact(s)) - return; - if (PyUnicode_CHECK_INTERNED(s)) - return; - if (interned == NULL) { - interned = PyDict_New(); - if (interned == NULL) { - PyErr_Clear(); /* Don't leave an exception */ - return; - } - } - /* It might be that the GetItem call fails even - though the key is present in the dictionary, - namely when this happens during a stack overflow. */ - Py_ALLOW_RECURSION - t = PyDict_GetItem(interned, (PyObject *)s); - Py_END_ALLOW_RECURSION + /* presume that the buffer is at least 2 characters long */ + if (PyUnicode_Check(v)) { + if (PyUnicode_GET_SIZE(v) != 1) + goto onError; + buf[0] = PyUnicode_AS_UNICODE(v)[0]; + } - if (t) { - Py_INCREF(t); - Py_DECREF(*p); - *p = t; - return; - } + else if (PyString_Check(v)) { + if (PyString_GET_SIZE(v) != 1) + goto onError; + buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; + } - PyThreadState_GET()->recursion_critical = 1; - if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { - PyErr_Clear(); - PyThreadState_GET()->recursion_critical = 0; - return; + else { + /* Integer input truncated to a character */ + long x; + x = PyInt_AsLong(v); + if (x == -1 && PyErr_Occurred()) + goto onError; +#ifdef Py_UNICODE_WIDE + if (x < 0 || x > 0x10ffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x110000) " + "(wide Python build)"); + return -1; } - PyThreadState_GET()->recursion_critical = 0; - /* The two references in interned are not counted by refcnt. - The deallocator will take care of this */ - Py_Refcnt(s) -= 2; - PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; +#else + if (x < 0 || x > 0xffff) { + PyErr_SetString(PyExc_OverflowError, + "%c arg not in range(0x10000) " + "(narrow Python build)"); + return -1; + } +#endif + buf[0] = (Py_UNICODE) x; + } + buf[1] = '\0'; + return 1; + + onError: + PyErr_SetString(PyExc_TypeError, + "%c requires int or char"); + return -1; } -void -PyUnicode_InternImmortal(PyObject **p) -{ - PyUnicode_InternInPlace(p); - if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { - PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; - Py_INCREF(*p); - } -} +/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...) -PyObject * -PyUnicode_InternFromString(const char *cp) -{ - PyObject *s = PyUnicode_FromString(cp); - if (s == NULL) - return NULL; - PyUnicode_InternInPlace(&s); - return s; -} + FORMATBUFLEN is the length of the buffer in which the floats, ints, & + chars are formatted. XXX This is a magic number. Each formatting + routine does bounds checking to ensure no overflow, but a better + solution may be to malloc a buffer of appropriate size for each + format. For now, the current solution is sufficient. +*/ +#define FORMATBUFLEN (size_t)120 -void _Py_ReleaseInternedUnicodeStrings(void) +PyObject *PyUnicode_Format(PyObject *format, + PyObject *args) { - PyObject *keys; - PyUnicodeObject *s; - Py_ssize_t i, n; - Py_ssize_t immortal_size = 0, mortal_size = 0; - - if (interned == NULL || !PyDict_Check(interned)) - return; - keys = PyDict_Keys(interned); - if (keys == NULL || !PyList_Check(keys)) { - PyErr_Clear(); - return; - } + Py_UNICODE *fmt, *res; + Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; + int args_owned = 0; + PyUnicodeObject *result = NULL; + PyObject *dict = NULL; + PyObject *uformat; - /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak - detector, interned unicode strings are not forcibly deallocated; - rather, we give them their stolen references back, and then clear - and DECREF the interned dict. */ + if (format == NULL || args == NULL) { + PyErr_BadInternalCall(); + return NULL; + } + uformat = PyUnicode_FromObject(format); + if (uformat == NULL) + return NULL; + fmt = PyUnicode_AS_UNICODE(uformat); + fmtcnt = PyUnicode_GET_SIZE(uformat); - n = PyList_GET_SIZE(keys); - fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", - n); - for (i = 0; i < n; i++) { - s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); - switch (s->state) { - case SSTATE_NOT_INTERNED: - /* XXX Shouldn't happen */ - break; - case SSTATE_INTERNED_IMMORTAL: - Py_Refcnt(s) += 1; - immortal_size += s->length; - break; - case SSTATE_INTERNED_MORTAL: - Py_Refcnt(s) += 2; - mortal_size += s->length; - break; - default: - Py_FatalError("Inconsistent interned string state."); - } - s->state = SSTATE_NOT_INTERNED; - } - fprintf(stderr, "total size of all interned strings: " - "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " - "mortal/immortal\n", mortal_size, immortal_size); - Py_DECREF(keys); - PyDict_Clear(interned); - Py_DECREF(interned); - interned = NULL; -} + reslen = rescnt = fmtcnt + 100; + result = _PyUnicode_New(reslen); + if (result == NULL) + goto onError; + res = PyUnicode_AS_UNICODE(result); + if (PyTuple_Check(args)) { + arglen = PyTuple_Size(args); + argidx = 0; + } + else { + arglen = -1; + argidx = -2; + } + if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) && + !PyObject_TypeCheck(args, &PyBaseString_Type)) + dict = args; -/********************* Formatter Iterator ************************/ + while (--fmtcnt >= 0) { + if (*fmt != '%') { + if (--rescnt < 0) { + rescnt = fmtcnt + 100; + reslen += rescnt; + if (_PyUnicode_Resize(&result, reslen) < 0) + goto onError; + res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; + --rescnt; + } + *res++ = *fmt++; + } + else { + /* Got a format specifier */ + int flags = 0; + Py_ssize_t width = -1; + int prec = -1; + Py_UNICODE c = '\0'; + Py_UNICODE fill; + PyObject *v = NULL; + PyObject *temp = NULL; + Py_UNICODE *pbuf; + Py_UNICODE sign; + Py_ssize_t len; + Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ -/* this is used to implement string.Formatter.vparse(). it exists so - Formatter can share code with the built in unicode.format() - method */ + fmt++; + if (*fmt == '(') { + Py_UNICODE *keystart; + Py_ssize_t keylen; + PyObject *key; + int pcount = 1; -typedef struct { - PyObject_HEAD + if (dict == NULL) { + PyErr_SetString(PyExc_TypeError, + "format requires a mapping"); + goto onError; + } + ++fmt; + --fmtcnt; + keystart = fmt; + /* Skip over balanced parentheses */ + while (pcount > 0 && --fmtcnt >= 0) { + if (*fmt == ')') + --pcount; + else if (*fmt == '(') + ++pcount; + fmt++; + } + keylen = fmt - keystart - 1; + if (fmtcnt < 0 || pcount > 0) { + PyErr_SetString(PyExc_ValueError, + "incomplete format key"); + goto onError; + } +#if 0 + /* keys are converted to strings using UTF-8 and + then looked up since Python uses strings to hold + variables names etc. in its namespaces and we + wouldn't want to break common idioms. */ + key = PyUnicode_EncodeUTF8(keystart, + keylen, + NULL); +#else + key = PyUnicode_FromUnicode(keystart, keylen); +#endif + if (key == NULL) + goto onError; + if (args_owned) { + Py_DECREF(args); + args_owned = 0; + } + args = PyObject_GetItem(dict, key); + Py_DECREF(key); + if (args == NULL) { + goto onError; + } + args_owned = 1; + arglen = -1; + argidx = -2; + } + while (--fmtcnt >= 0) { + switch (c = *fmt++) { + case '-': flags |= F_LJUST; continue; + case '+': flags |= F_SIGN; continue; + case ' ': flags |= F_BLANK; continue; + case '#': flags |= F_ALT; continue; + case '0': flags |= F_ZERO; continue; + } + break; + } + if (c == '*') { + v = getnextarg(args, arglen, &argidx); + if (v == NULL) + goto onError; + if (!PyInt_Check(v)) { + PyErr_SetString(PyExc_TypeError, + "* wants int"); + goto onError; + } + width = PyInt_AsLong(v); + if (width == -1 && PyErr_Occurred()) + goto onError; + if (width < 0) { + flags |= F_LJUST; + width = -width; + } + if (--fmtcnt >= 0) + c = *fmt++; + } + else if (c >= '0' && c <= '9') { + width = c - '0'; + while (--fmtcnt >= 0) { + c = *fmt++; + if (c < '0' || c > '9') + break; + if ((width*10) / 10 != width) { + PyErr_SetString(PyExc_ValueError, + "width too big"); + goto onError; + } + width = width*10 + (c - '0'); + } + } + if (c == '.') { + prec = 0; + if (--fmtcnt >= 0) + c = *fmt++; + if (c == '*') { + v = getnextarg(args, arglen, &argidx); + if (v == NULL) + goto onError; + if (!PyInt_Check(v)) { + PyErr_SetString(PyExc_TypeError, + "* wants int"); + goto onError; + } + prec = PyInt_AsLong(v); + if (prec == -1 && PyErr_Occurred()) + goto onError; + if (prec < 0) + prec = 0; + if (--fmtcnt >= 0) + c = *fmt++; + } + else if (c >= '0' && c <= '9') { + prec = c - '0'; + while (--fmtcnt >= 0) { + c = Py_CHARMASK(*fmt++); + if (c < '0' || c > '9') + break; + if ((prec*10) / 10 != prec) { + PyErr_SetString(PyExc_ValueError, + "prec too big"); + goto onError; + } + prec = prec*10 + (c - '0'); + } + } + } /* prec */ + if (fmtcnt >= 0) { + if (c == 'h' || c == 'l' || c == 'L') { + if (--fmtcnt >= 0) + c = *fmt++; + } + } + if (fmtcnt < 0) { + PyErr_SetString(PyExc_ValueError, + "incomplete format"); + goto onError; + } + if (c != '%') { + v = getnextarg(args, arglen, &argidx); + if (v == NULL) + goto onError; + } + sign = 0; + fill = ' '; + switch (c) { - /* we know this to be a unicode object, but since we just keep - it around to keep the object alive, having it as PyObject - is okay */ - PyObject *str; + case '%': + pbuf = formatbuf; + /* presume that buffer length is at least 1 */ + pbuf[0] = '%'; + len = 1; + break; - MarkupIterator it_markup; -} formatteriterobject; + case 's': + case 'r': + if (PyUnicode_Check(v) && c == 's') { + temp = v; + Py_INCREF(temp); + } + else { + PyObject *unicode; + if (c == 's') + temp = PyObject_Unicode(v); + else + temp = PyObject_Repr(v); + if (temp == NULL) + goto onError; + if (PyUnicode_Check(temp)) + /* nothing to do */; + else if (PyString_Check(temp)) { + /* convert to string to Unicode */ + unicode = PyUnicode_Decode(PyString_AS_STRING(temp), + PyString_GET_SIZE(temp), + NULL, + "strict"); + Py_DECREF(temp); + temp = unicode; + if (temp == NULL) + goto onError; + } + else { + Py_DECREF(temp); + PyErr_SetString(PyExc_TypeError, + "%s argument has non-string str()"); + goto onError; + } + } + pbuf = PyUnicode_AS_UNICODE(temp); + len = PyUnicode_GET_SIZE(temp); + if (prec >= 0 && len > prec) + len = prec; + break; -static void -formatteriter_dealloc(formatteriterobject *it) -{ - Py_XDECREF(it->str); - PyObject_FREE(it); -} + case 'i': + case 'd': + case 'u': + case 'o': + case 'x': + case 'X': + if (c == 'i') + c = 'd'; + if (PyLong_Check(v)) { + temp = formatlong(v, flags, prec, c); + if (!temp) + goto onError; + pbuf = PyUnicode_AS_UNICODE(temp); + len = PyUnicode_GET_SIZE(temp); + sign = 1; + } + else { + pbuf = formatbuf; + len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), + flags, prec, c, v); + if (len < 0) + goto onError; + sign = 1; + } + if (flags & F_ZERO) + fill = '0'; + break; -/* returns a tuple: - (is_markup, literal, field_name, format_spec, conversion) - if is_markup == True: - literal is None - field_name is the string before the ':' - format_spec is the string after the ':' - conversion is either None, or the string after the '!' - if is_markup == False: - literal is the literal string - field_name is None - format_spec is None - conversion is None -*/ -static PyObject * -formatteriter_next(formatteriterobject *it) -{ - SubString literal; - SubString field_name; - SubString format_spec; - Py_UNICODE conversion; - int is_markup; - int format_spec_needs_expanding; - int result = MarkupIterator_next(&it->it_markup, &is_markup, &literal, - &field_name, &format_spec, &conversion, - &format_spec_needs_expanding); + case 'e': + case 'E': + case 'f': + case 'F': + case 'g': + case 'G': + if (c == 'F') + c = 'f'; + pbuf = formatbuf; + len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), + flags, prec, c, v); + if (len < 0) + goto onError; + sign = 1; + if (flags & F_ZERO) + fill = '0'; + break; - /* all of the SubString objects point into it->str, so no - memory management needs to be done on them */ - assert(0 <= result && result <= 2); - if (result == 0) { - /* error has already been set */ - return NULL; - } else if (result == 1) { - /* end of iterator */ - return NULL; - } else { - PyObject *is_markup_bool = NULL; - PyObject *literal_str = NULL; - PyObject *field_name_str = NULL; - PyObject *format_spec_str = NULL; - PyObject *conversion_str = NULL; - PyObject *tuple = NULL; + case 'c': + pbuf = formatbuf; + len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); + if (len < 0) + goto onError; + break; - is_markup_bool = PyBool_FromLong(is_markup); - if (!is_markup_bool) - return NULL; + default: + PyErr_Format(PyExc_ValueError, + "unsupported format character '%c' (0x%x) " + "at index %zd", + (31<=c && c<=126) ? (char)c : '?', + (int)c, + (Py_ssize_t)(fmt - 1 - + PyUnicode_AS_UNICODE(uformat))); + goto onError; + } + if (sign) { + if (*pbuf == '-' || *pbuf == '+') { + sign = *pbuf++; + len--; + } + else if (flags & F_SIGN) + sign = '+'; + else if (flags & F_BLANK) + sign = ' '; + else + sign = 0; + } + if (width < len) + width = len; + if (rescnt - (sign != 0) < width) { + reslen -= rescnt; + rescnt = width + fmtcnt + 100; + reslen += rescnt; + if (reslen < 0) { + Py_XDECREF(temp); + PyErr_NoMemory(); + goto onError; + } + if (_PyUnicode_Resize(&result, reslen) < 0) { + Py_XDECREF(temp); + goto onError; + } + res = PyUnicode_AS_UNICODE(result) + + reslen - rescnt; + } + if (sign) { + if (fill != ' ') + *res++ = sign; + rescnt--; + if (width > len) + width--; + } + if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { + assert(pbuf[0] == '0'); + assert(pbuf[1] == c); + if (fill != ' ') { + *res++ = *pbuf++; + *res++ = *pbuf++; + } + rescnt -= 2; + width -= 2; + if (width < 0) + width = 0; + len -= 2; + } + if (width > len && !(flags & F_LJUST)) { + do { + --rescnt; + *res++ = fill; + } while (--width > len); + } + if (fill == ' ') { + if (sign) + *res++ = sign; + if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) { + assert(pbuf[0] == '0'); + assert(pbuf[1] == c); + *res++ = *pbuf++; + *res++ = *pbuf++; + } + } + Py_UNICODE_COPY(res, pbuf, len); + res += len; + rescnt -= len; + while (--width >= len) { + --rescnt; + *res++ = ' '; + } + if (dict && (argidx < arglen) && c != '%') { + PyErr_SetString(PyExc_TypeError, + "not all arguments converted during string formatting"); + Py_XDECREF(temp); + goto onError; + } + Py_XDECREF(temp); + } /* '%' */ + } /* until end */ + if (argidx < arglen && !dict) { + PyErr_SetString(PyExc_TypeError, + "not all arguments converted during string formatting"); + goto onError; + } - if (is_markup) { - /* field_name, format_spec, and conversion are - returned */ - literal_str = Py_None; - Py_INCREF(literal_str); + if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) + goto onError; + if (args_owned) { + Py_DECREF(args); + } + Py_DECREF(uformat); + return (PyObject *)result; - field_name_str = SubString_new_object(&field_name); - if (field_name_str == NULL) - goto error; + onError: + Py_XDECREF(result); + Py_DECREF(uformat); + if (args_owned) { + Py_DECREF(args); + } + return NULL; +} - format_spec_str = SubString_new_object(&format_spec); - if (format_spec_str == NULL) - goto error; +static PyBufferProcs unicode_as_buffer = { + (getbufferproc) unicode_buffer_getbuffer, + NULL, +}; - /* if the conversion is not specified, return - a None, otherwise create a one length - string with the conversion characater */ - if (conversion == '\0') { - conversion_str = Py_None; - Py_INCREF(conversion_str); - } else - conversion_str = PyUnicode_FromUnicode(&conversion, - 1); - if (conversion_str == NULL) - goto error; - } else { - /* only literal is returned */ - literal_str = SubString_new_object(&literal); - if (literal_str == NULL) - goto error; +static PyObject * +unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); - field_name_str = Py_None; - format_spec_str = Py_None; - conversion_str = Py_None; +static PyObject * +unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + PyObject *x = NULL; + static char *kwlist[] = {"object", "encoding", "errors", 0}; + char *encoding = NULL; + char *errors = NULL; - Py_INCREF(field_name_str); - Py_INCREF(format_spec_str); - Py_INCREF(conversion_str); - } - tuple = PyTuple_Pack(5, is_markup_bool, literal_str, - field_name_str, format_spec_str, - conversion_str); - error: - Py_XDECREF(is_markup_bool); - Py_XDECREF(literal_str); - Py_XDECREF(field_name_str); - Py_XDECREF(format_spec_str); - Py_XDECREF(conversion_str); - return tuple; - } + if (type != &PyUnicode_Type) + return unicode_subtype_new(type, args, kwds); + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", + kwlist, &x, &encoding, &errors)) + return NULL; + if (x == NULL) + return (PyObject *)_PyUnicode_New(0); + if (encoding == NULL && errors == NULL) + return PyObject_Unicode(x); + else + return PyUnicode_FromEncodedObject(x, encoding, errors); } -static PyMethodDef formatteriter_methods[] = { - {NULL, NULL} /* sentinel */ -}; - -PyTypeObject PyFormatterIter_Type = { - PyVarObject_HEAD_INIT(&PyType_Type, 0) - "formatteriterator", /* tp_name */ - sizeof(formatteriterobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - /* methods */ - (destructor)formatteriter_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - 0, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - PyObject_SelfIter, /* tp_iter */ - (iternextfunc)formatteriter_next, /* tp_iternext */ - formatteriter_methods, /* tp_methods */ - 0, -}; - -PyObject * -_PyUnicode_FormatterIterator(PyObject *str) +static PyObject * +unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { - formatteriterobject *it; + PyUnicodeObject *tmp, *pnew; + Py_ssize_t n; - assert(PyUnicode_Check(str)); - it = PyObject_New(formatteriterobject, &PyFormatterIter_Type); - if (it == NULL) + assert(PyType_IsSubtype(type, &PyUnicode_Type)); + tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); + if (tmp == NULL) + return NULL; + assert(PyUnicode_Check(tmp)); + pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); + if (pnew == NULL) { + Py_DECREF(tmp); return NULL; + } + pnew->str = PyMem_NEW(Py_UNICODE, n+1); + if (pnew->str == NULL) { + _Py_ForgetReference((PyObject *)pnew); + PyObject_Del(pnew); + Py_DECREF(tmp); + return PyErr_NoMemory(); + } + Py_UNICODE_COPY(pnew->str, tmp->str, n+1); + pnew->length = n; + pnew->hash = tmp->hash; + Py_DECREF(tmp); + return (PyObject *)pnew; +} - /* take ownership, give the object to the iterator */ - Py_INCREF(str); - it->str = str; +PyDoc_STRVAR(unicode_doc, +"str(string [, encoding[, errors]]) -> object\n\ +\n\ +Create a new string object from the given encoded string.\n\ +encoding defaults to the current default string encoding.\n\ +errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); - /* initialize the contained MarkupIterator */ - MarkupIterator_init(&it->it_markup, - PyUnicode_AS_UNICODE(str), - PyUnicode_GET_SIZE(str)); +static PyObject *unicode_iter(PyObject *seq); - return (PyObject *)it; -} +PyTypeObject PyUnicode_Type = { + PyVarObject_HEAD_INIT(&PyType_Type, 0) + "str", /* tp_name */ + sizeof(PyUnicodeObject), /* tp_size */ + 0, /* tp_itemsize */ + /* Slots */ + (destructor)unicode_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + unicode_repr, /* tp_repr */ + &unicode_as_number, /* tp_as_number */ + &unicode_as_sequence, /* tp_as_sequence */ + &unicode_as_mapping, /* tp_as_mapping */ + (hashfunc) unicode_hash, /* tp_hash*/ + 0, /* tp_call*/ + (reprfunc) unicode_str, /* tp_str */ + PyObject_GenericGetAttr, /* tp_getattro */ + 0, /* tp_setattro */ + &unicode_as_buffer, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | + Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ + unicode_doc, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + PyUnicode_RichCompare, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + unicode_iter, /* tp_iter */ + 0, /* tp_iternext */ + unicode_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + &PyBaseString_Type, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + unicode_new, /* tp_new */ + PyObject_Del, /* tp_free */ +}; -/********************* FieldName Iterator ************************/ +/* Initialize the Unicode implementation */ -/* this is used to implement string.Formatter.vparse(). it parses - the field name into attribute and item values. */ +void _PyUnicode_Init(void) +{ + int i; -typedef struct { - PyObject_HEAD + /* XXX - move this array to unicodectype.c ? */ + Py_UNICODE linebreak[] = { + 0x000A, /* LINE FEED */ + 0x000D, /* CARRIAGE RETURN */ + 0x001C, /* FILE SEPARATOR */ + 0x001D, /* GROUP SEPARATOR */ + 0x001E, /* RECORD SEPARATOR */ + 0x0085, /* NEXT LINE */ + 0x2028, /* LINE SEPARATOR */ + 0x2029, /* PARAGRAPH SEPARATOR */ + }; + + /* Init the implementation */ + unicode_freelist = NULL; + unicode_freelist_size = 0; + unicode_empty = _PyUnicode_New(0); + if (!unicode_empty) + return; - /* we know this to be a unicode object, but since we just keep - it around to keep the object alive, having it as PyObject - is okay */ - PyObject *str; + for (i = 0; i < 256; i++) + unicode_latin1[i] = NULL; + if (PyType_Ready(&PyUnicode_Type) < 0) + Py_FatalError("Can't initialize 'unicode'"); - FieldNameIterator it_field; -} fieldnameiterobject; + /* initialize the linebreak bloom filter */ + bloom_linebreak = make_bloom_mask( + linebreak, sizeof(linebreak) / sizeof(linebreak[0]) + ); -static void -fieldnameiter_dealloc(fieldnameiterobject *it) -{ - Py_XDECREF(it->str); - PyObject_FREE(it); + PyType_Ready(&EncodingMapType); } -/* returns a tuple: - (is_attr, value) - is_attr is true if we used attribute syntax (e.g., '.foo') - false if we used index syntax (e.g., '[foo]') - value is an integer or string -*/ -static PyObject * -fieldnameiter_next(fieldnameiterobject *it) +/* Finalize the Unicode implementation */ + +void +_PyUnicode_Fini(void) { - int result; - int is_attr; - Py_ssize_t idx; - SubString name; + PyUnicodeObject *u; + int i; - result = FieldNameIterator_next(&it->it_field, &is_attr, - &idx, &name); - if (result == 0 || result == 1) { - /* if 0, error has already been set, if 1, iterator is empty */ - return NULL; - } else { - PyObject* result = NULL; - PyObject* is_attr_obj = NULL; - PyObject* obj = NULL; + Py_XDECREF(unicode_empty); + unicode_empty = NULL; - is_attr_obj = PyBool_FromLong(is_attr); - if (is_attr_obj == NULL) - goto error; + for (i = 0; i < 256; i++) { + if (unicode_latin1[i]) { + Py_DECREF(unicode_latin1[i]); + unicode_latin1[i] = NULL; + } + } - /* either an integer or a string */ - if (idx != -1) - obj = PyInt_FromSsize_t(idx); - else - obj = STRINGLIB_NEW(name.ptr, name.end - name.ptr); - if (obj == NULL) - goto error; + for (u = unicode_freelist; u != NULL;) { + PyUnicodeObject *v = u; + u = *(PyUnicodeObject **)u; + if (v->str) + PyMem_DEL(v->str); + Py_XDECREF(v->defenc); + PyObject_Del(v); + } + unicode_freelist = NULL; + unicode_freelist_size = 0; +} - /* return a tuple of values */ - result = PyTuple_Pack(2, is_attr_obj, obj); - if (result == NULL) - goto error; +void +PyUnicode_InternInPlace(PyObject **p) +{ + register PyUnicodeObject *s = (PyUnicodeObject *)(*p); + PyObject *t; + if (s == NULL || !PyUnicode_Check(s)) + Py_FatalError( + "PyUnicode_InternInPlace: unicode strings only please!"); + /* If it's a subclass, we don't really know what putting + it in the interned dict might do. */ + if (!PyUnicode_CheckExact(s)) + return; + if (PyUnicode_CHECK_INTERNED(s)) + return; + if (interned == NULL) { + interned = PyDict_New(); + if (interned == NULL) { + PyErr_Clear(); /* Don't leave an exception */ + return; + } + } + /* It might be that the GetItem call fails even + though the key is present in the dictionary, + namely when this happens during a stack overflow. */ + Py_ALLOW_RECURSION + t = PyDict_GetItem(interned, (PyObject *)s); + Py_END_ALLOW_RECURSION - return result; + if (t) { + Py_INCREF(t); + Py_DECREF(*p); + *p = t; + return; + } - error: - Py_XDECREF(result); - Py_XDECREF(is_attr_obj); - Py_XDECREF(obj); - return NULL; - } - return NULL; + PyThreadState_GET()->recursion_critical = 1; + if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) { + PyErr_Clear(); + PyThreadState_GET()->recursion_critical = 0; + return; + } + PyThreadState_GET()->recursion_critical = 0; + /* The two references in interned are not counted by refcnt. + The deallocator will take care of this */ + Py_Refcnt(s) -= 2; + PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL; } -static PyMethodDef fieldnameiter_methods[] = { - {NULL, NULL} /* sentinel */ -}; - -static PyTypeObject PyFieldNameIter_Type = { - PyVarObject_HEAD_INIT(&PyType_Type, 0) - "fieldnameiterator", /* tp_name */ - sizeof(fieldnameiterobject), /* tp_basicsize */ - 0, /* tp_itemsize */ - /* methods */ - (destructor)fieldnameiter_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - PyObject_GenericGetAttr, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - 0, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - PyObject_SelfIter, /* tp_iter */ - (iternextfunc)fieldnameiter_next, /* tp_iternext */ - fieldnameiter_methods, /* tp_methods */ - 0}; +void +PyUnicode_InternImmortal(PyObject **p) +{ + PyUnicode_InternInPlace(p); + if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) { + PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL; + Py_INCREF(*p); + } +} PyObject * -_PyUnicode_FormatterFieldNameSplit(PyObject *field_name) +PyUnicode_InternFromString(const char *cp) { - SubString first; - Py_ssize_t first_idx; - fieldnameiterobject *it; - - PyObject *first_obj = NULL; - PyObject *result = NULL; - - assert(PyUnicode_Check(field_name)); - it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type); - if (it == NULL) - return NULL; - - /* take ownership, give the object to the iterator. this is - just to keep the field_name alive */ - Py_INCREF(field_name); - it->str = field_name; + PyObject *s = PyUnicode_FromString(cp); + if (s == NULL) + return NULL; + PyUnicode_InternInPlace(&s); + return s; +} - if (!field_name_split(STRINGLIB_STR(field_name), - STRINGLIB_LEN(field_name), - &first, &first_idx, &it->it_field)) - goto error; +void _Py_ReleaseInternedUnicodeStrings(void) +{ + PyObject *keys; + PyUnicodeObject *s; + Py_ssize_t i, n; + Py_ssize_t immortal_size = 0, mortal_size = 0; - /* first becomes an integer, if possible, else a string */ - if (first_idx != -1) - first_obj = PyInt_FromSsize_t(first_idx); - else - /* convert "first" into a string object */ - first_obj = STRINGLIB_NEW(first.ptr, first.end - first.ptr); - if (first_obj == NULL) - goto error; + if (interned == NULL || !PyDict_Check(interned)) + return; + keys = PyDict_Keys(interned); + if (keys == NULL || !PyList_Check(keys)) { + PyErr_Clear(); + return; + } - /* return a tuple of values */ - result = PyTuple_Pack(2, first_obj, it); + /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak + detector, interned unicode strings are not forcibly deallocated; + rather, we give them their stolen references back, and then clear + and DECREF the interned dict. */ -error: - Py_XDECREF(it); - Py_XDECREF(first_obj); - return result; + n = PyList_GET_SIZE(keys); + fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n", + n); + for (i = 0; i < n; i++) { + s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); + switch (s->state) { + case SSTATE_NOT_INTERNED: + /* XXX Shouldn't happen */ + break; + case SSTATE_INTERNED_IMMORTAL: + Py_Refcnt(s) += 1; + immortal_size += s->length; + break; + case SSTATE_INTERNED_MORTAL: + Py_Refcnt(s) += 2; + mortal_size += s->length; + break; + default: + Py_FatalError("Inconsistent interned string state."); + } + s->state = SSTATE_NOT_INTERNED; + } + fprintf(stderr, "total size of all interned strings: " + "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d " + "mortal/immortal\n", mortal_size, immortal_size); + Py_DECREF(keys); + PyDict_Clear(interned); + Py_DECREF(interned); + interned = NULL; } + /********************* Unicode Iterator **************************/ typedef struct { diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 3d9acd7..6ccd3e9 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -660,54 +660,6 @@ sys_current_frames(PyObject *self, PyObject *noargs) return _PyThread_CurrentFrames(); } -/* sys_formatter_iterator is used to implement - string.Formatter.vformat. it parses a string and returns tuples - describing the parsed elements. see unicodeobject.c's - _PyUnicode_FormatterIterator for details */ -static PyObject * -sys_formatter_iterator(PyObject *self, PyObject *args) -{ - /* in 2.6, check type and dispatch to unicode or string - accordingly */ - PyObject *str; - - if (!PyArg_ParseTuple(args, "O:_formatter_iterator", &str)) - return NULL; - - if (!PyUnicode_Check(str)) { - PyErr_SetString(PyExc_TypeError, - "_formatter_iterator expects unicode object"); - return NULL; - } - - return _PyUnicode_FormatterIterator(str); -} - -/* sys_formatter_field_name_split is used to implement - string.Formatter.vformat. it takes an PEP 3101 "field name", and - returns a tuple of (first, rest): "first", the part before the - first '.' or '['; and "rest", an iterator for the rest of the field - name. see unicodeobjects' _PyUnicode_FormatterFieldNameSplit for - details */ -static PyObject * -sys_formatter_field_name_split(PyObject *self, PyObject *args) -{ - PyObject *field_name; - - if (!PyArg_ParseTuple(args, "O:_formatter_field_name_split", - &field_name)) - return NULL; - - if (!PyUnicode_Check(field_name)) { - PyErr_SetString(PyExc_TypeError, "_formatter_field_name_split " - "expects unicode object"); - return NULL; - } - - return _PyUnicode_FormatterFieldNameSplit(field_name); -} - - PyDoc_STRVAR(call_tracing_doc, "call_tracing(func, args) -> object\n\ \n\ @@ -772,9 +724,6 @@ static PyMethodDef sys_methods[] = { callstats_doc}, {"_current_frames", sys_current_frames, METH_NOARGS, current_frames_doc}, - {"_formatter_parser", sys_formatter_iterator, METH_VARARGS}, - {"_formatter_field_name_split", sys_formatter_field_name_split, - METH_VARARGS}, {"displayhook", sys_displayhook, METH_O, displayhook_doc}, {"exc_info", sys_exc_info, METH_NOARGS, exc_info_doc}, {"excepthook", sys_excepthook, METH_VARARGS, excepthook_doc}, -- cgit v0.12