diff options
author | Walter Dörwald <walter@livinglogic.de> | 2007-05-18 16:29:38 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2007-05-18 16:29:38 (GMT) |
commit | d2034310d66b9d387b252972852537c0b592f141 (patch) | |
tree | 77d03818fd4896b2dc1ea3eb87202bf1d82d8866 /Objects | |
parent | 5550731d9cf5bca2379b15d5238ee5a39ebc6ce3 (diff) | |
download | cpython-d2034310d66b9d387b252972852537c0b592f141.zip cpython-d2034310d66b9d387b252972852537c0b592f141.tar.gz cpython-d2034310d66b9d387b252972852537c0b592f141.tar.bz2 |
Add 'U'/'U#' format characters to Py_BuildValue (and thus
to PyObject_CallFunction()) that take a char * (and a size
in the case of 'U#') and create a unicode object out of it.
Add functions PyUnicode_FromFormat() and PyUnicode_FromFormatV()
that work similarly to PyString_FromFormat(), but create a unicode
object (a %U format character has also been added; it takes
a PyObject *, which must point to a unicode object).
Change the encoding and reason attributes of UnicodeEncodeError,
UnicodeDecodeError and UnicodeTranslateError to be unicode
objects.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/exceptions.c | 117 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 236 |
2 files changed, 280 insertions, 73 deletions
diff --git a/Objects/exceptions.c b/Objects/exceptions.c index fabf359..36e3795 100644 --- a/Objects/exceptions.c +++ b/Objects/exceptions.c @@ -1187,35 +1187,6 @@ set_ssize_t(PyObject **attr, Py_ssize_t value) } static PyObject * -get_string(PyObject *attr, const char *name) -{ - if (!attr) { - PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name); - return NULL; - } - - if (!PyString_Check(attr)) { - PyErr_Format(PyExc_TypeError, "%.200s attribute must be str", name); - return NULL; - } - Py_INCREF(attr); - return attr; -} - - -static int -set_string(PyObject **attr, const char *value) -{ - PyObject *obj = PyString_FromString(value); - if (!obj) - return -1; - Py_CLEAR(*attr); - *attr = obj; - return 0; -} - - -static PyObject * get_bytes(PyObject *attr, const char *name) { if (!attr) { @@ -1248,16 +1219,27 @@ get_unicode(PyObject *attr, const char *name) return attr; } +static int +set_unicodefromstring(PyObject **attr, const char *value) +{ + PyObject *obj = PyUnicode_FromString(value); + if (!obj) + return -1; + Py_CLEAR(*attr); + *attr = obj; + return 0; +} + PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); + return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); } PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); + return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); } PyObject * @@ -1416,42 +1398,45 @@ PyUnicodeTranslateError_SetEnd(PyObject *exc, Py_ssize_t end) PyObject * PyUnicodeEncodeError_GetReason(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason"); + return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason"); } PyObject * PyUnicodeDecodeError_GetReason(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason"); + return get_unicode(((PyUnicodeErrorObject 
*)exc)->reason, "reason"); } PyObject * PyUnicodeTranslateError_GetReason(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason"); + return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason"); } int PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason) { - return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason); + return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason, + reason); } int PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason) { - return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason); + return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason, + reason); } int PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason) { - return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason); + return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason, + reason); } @@ -1466,11 +1451,11 @@ UnicodeError_init(PyUnicodeErrorObject *self, PyObject *args, PyObject *kwds, Py_CLEAR(self->reason); if (!PyArg_ParseTuple(args, "O!O!O!O!O!", - &PyString_Type, &self->encoding, + &PyUnicode_Type, &self->encoding, objecttype, &self->object, &PyLong_Type, &self->start, &PyLong_Type, &self->end, - &PyString_Type, &self->reason)) { + &PyUnicode_Type, &self->reason)) { self->encoding = self->object = self->start = self->end = self->reason = NULL; return -1; @@ -1564,20 +1549,20 @@ UnicodeEncodeError_str(PyObject *self) PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar); else PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar); - return PyString_FromFormat( - "'%.400s' codec can't encode character u'\\%s' in position %zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't encode character u'\\%s' in position %zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, badchar_str, start, - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + 
((PyUnicodeErrorObject *)self)->reason ); } - return PyString_FromFormat( - "'%.400s' codec can't encode characters in position %zd-%zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't encode characters in position %zd-%zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, start, (end-1), - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } @@ -1601,7 +1586,7 @@ PyUnicodeEncodeError_Create( const char *encoding, const Py_UNICODE *object, Py_ssize_t length, Py_ssize_t start, Py_ssize_t end, const char *reason) { - return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#nns", + return PyObject_CallFunction(PyExc_UnicodeEncodeError, "Uu#nnU", encoding, object, length, start, end, reason); } @@ -1626,30 +1611,30 @@ UnicodeDecodeError_str(PyObject *self) Py_ssize_t end = 0; if (PyUnicodeDecodeError_GetStart(self, &start)) - return NULL; + return NULL; if (PyUnicodeDecodeError_GetEnd(self, &end)) - return NULL; + return NULL; if (end==start+1) { /* FromFormat does not support %02x, so format that separately */ char byte[4]; PyOS_snprintf(byte, sizeof(byte), "%02x", ((int)PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[start])&0xff); - return PyString_FromFormat( - "'%.400s' codec can't decode byte 0x%s in position %zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't decode byte 0x%s in position %zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, byte, start, - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } - return PyString_FromFormat( - "'%.400s' codec can't decode bytes in position %zd-%zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't decode bytes in position %zd-%zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, 
start, (end-1), - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } @@ -1676,7 +1661,7 @@ PyUnicodeDecodeError_Create( assert(length < INT_MAX); assert(start < INT_MAX); assert(end < INT_MAX); - return PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", + return PyObject_CallFunction(PyExc_UnicodeDecodeError, "Uy#nnU", encoding, object, length, start, end, reason); } @@ -1701,7 +1686,7 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args, &PyUnicode_Type, &self->object, &PyLong_Type, &self->start, &PyLong_Type, &self->end, - &PyString_Type, &self->reason)) { + &PyUnicode_Type, &self->reason)) { self->object = self->start = self->end = self->reason = NULL; return -1; } @@ -1736,18 +1721,18 @@ UnicodeTranslateError_str(PyObject *self) PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar); else PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar); - return PyString_FromFormat( - "can't translate character u'\\%s' in position %zd: %.400s", + return PyUnicode_FromFormat( + "can't translate character u'\\%s' in position %zd: %U", badchar_str, start, - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } - return PyString_FromFormat( - "can't translate characters in position %zd-%zd: %.400s", + return PyUnicode_FromFormat( + "can't translate characters in position %zd-%zd: %U", start, (end-1), - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7e455a5..e77b65d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -393,15 +393,9 @@ PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, return (PyObject *)unicode; } -PyObject *PyUnicode_FromString(const char *u) +PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) { PyUnicodeObject *unicode; - size_t size = 
strlen(u); - if (size > PY_SSIZE_T_MAX) { - PyErr_SetString(PyExc_OverflowError, "input too long"); - return NULL; - } - /* If the Unicode data is known at construction time, we can apply some optimizations which share commonly used objects. */ if (u != NULL) { @@ -441,6 +435,17 @@ PyObject *PyUnicode_FromString(const char *u) return (PyObject *)unicode; } +PyObject *PyUnicode_FromString(const char *u) +{ + size_t size = strlen(u); + if (size > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, "input too long"); + return NULL; + } + + return PyUnicode_FromStringAndSize(u, size); +} + #ifdef HAVE_WCHAR_H PyObject *PyUnicode_FromWideChar(register const wchar_t *w, @@ -473,6 +478,223 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w, return (PyObject *)unicode; } +#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} + +PyObject * +PyUnicode_FromFormatV(const char *format, va_list vargs) +{ + va_list count; + Py_ssize_t n = 0; + const char* f; + Py_UNICODE *s; + PyObject *string; + /* used by sprintf */ + char buffer[21]; + const char *copy; + +#ifdef VA_LIST_IS_ARRAY + Py_MEMCPY(count, vargs, sizeof(va_list)); +#else +#ifdef __va_copy + __va_copy(count, vargs); +#else + count = vargs; +#endif +#endif + /* step 1: figure out how large a buffer we need */ + for (f = format; *f; f++) { + if (*f == '%') { + const char* p = f; + while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) + ; + + /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since + * they don't affect the amount of space we reserve. + */ + if ((*f == 'l' || *f == 'z') && + (f[1] == 'd' || f[1] == 'u')) + ++f; + + switch (*f) { + case 'c': + (void)va_arg(count, int); + /* fall through... */ + case '%': + n++; + break; + case 'd': case 'u': case 'i': case 'x': + (void) va_arg(count, int); + /* 20 bytes is enough to hold a 64-bit + integer. Decimal takes the most space. + This isn't enough for octal. 
*/ + n += 20; + break; + case 's': + n += strlen(va_arg(count, char*)); + break; + case 'U': + { + PyObject *obj = va_arg(count, PyObject *); + assert(obj && PyUnicode_Check(obj)); + n += PyUnicode_GET_SIZE(obj); + break; + } + case 'p': + (void) va_arg(count, int); + /* maximum 64-bit pointer representation: + * 0xffffffffffffffff + * so 19 characters is enough. + * XXX I count 18 -- what's the extra for? + */ + n += 19; + break; + default: + /* if we stumble upon an unknown + formatting code, copy the rest of + the format string to the output + string. (we cannot just skip the + code, since there's no way to know + what's in the argument list) */ + n += strlen(p); + goto expand; + } + } else + n++; + } + expand: + /* step 2: fill the buffer */ + /* Since we've analyzed how much space we need for the worst case, + we don't have to resize the string. */ + string = PyUnicode_FromUnicode(NULL, n); + if (!string) + return NULL; + + s = PyUnicode_AS_UNICODE(string); + + for (f = format; *f; f++) { + if (*f == '%') { + const char* p = f++; + int longflag = 0; + int size_tflag = 0; + /* parse the width.precision part (we're only + interested in the precision value, if any) */ + n = 0; + while (isdigit(Py_CHARMASK(*f))) + n = (n*10) + *f++ - '0'; + if (*f == '.') { + f++; + n = 0; + while (isdigit(Py_CHARMASK(*f))) + n = (n*10) + *f++ - '0'; + } + while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f))) + f++; + /* handle the long flag, but only for %ld and %lu. + others can be added when necessary. */ + if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { + longflag = 1; + ++f; + } + /* handle the size_t flag. 
*/ + if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { + size_tflag = 1; + ++f; + } + + switch (*f) { + case 'c': + *s++ = va_arg(vargs, int); + break; + case 'd': + if (longflag) + sprintf(buffer, "%ld", va_arg(vargs, long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "d", + va_arg(vargs, Py_ssize_t)); + else + sprintf(buffer, "%d", va_arg(vargs, int)); + appendstring(buffer); + break; + case 'u': + if (longflag) + sprintf(buffer, "%lu", + va_arg(vargs, unsigned long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", + va_arg(vargs, size_t)); + else + sprintf(buffer, "%u", + va_arg(vargs, unsigned int)); + appendstring(buffer); + break; + case 'i': + sprintf(buffer, "%i", va_arg(vargs, int)); + appendstring(buffer); + break; + case 'x': + sprintf(buffer, "%x", va_arg(vargs, int)); + appendstring(buffer); + break; + case 's': + p = va_arg(vargs, char*); + appendstring(p); + break; + case 'U': + { + PyObject *obj = va_arg(vargs, PyObject *); + Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj); + Py_ssize_t usize = PyUnicode_GET_SIZE(obj); + Py_ssize_t upos; + for (upos = 0; upos<usize;) + *s++ = ucopy[upos++]; + break; + } + case 'p': + sprintf(buffer, "%p", va_arg(vargs, void*)); + /* %p is ill-defined: ensure leading 0x. */ + if (buffer[1] == 'X') + buffer[1] = 'x'; + else if (buffer[1] != 'x') { + memmove(buffer+2, buffer, strlen(buffer)+1); + buffer[0] = '0'; + buffer[1] = 'x'; + } + appendstring(buffer); + break; + case '%': + *s++ = '%'; + break; + default: + appendstring(p); + goto end; + } + } else + *s++ = *f; + } + + end: + _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); + return string; +} + +#undef appendstring + +PyObject * +PyUnicode_FromFormat(const char *format, ...) 
+{ + PyObject* ret; + va_list vargs; + +#ifdef HAVE_STDARG_PROTOTYPES + va_start(vargs, format); +#else + va_start(vargs); +#endif + ret = PyUnicode_FromFormatV(format, vargs); + va_end(vargs); + return ret; +} + Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, wchar_t *w, Py_ssize_t size) |