From d2034310d66b9d387b252972852537c0b592f141 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Fri, 18 May 2007 16:29:38 +0000 Subject: Add 'U'/'U#' format characters to Py_BuildValue (and thus to PyObject_CallFunction()) that take a char * (and a size in the case of 'U#') and create a unicode object out of it. Add functions PyUnicode_FromFormat() and PyUnicode_FromFormatV() that work similarly to PyString_FromFormat(), but create a unicode object (also a %U format character has been added, that takes a PyObject *, which must point to a unicode object). Change the encoding and reason attributes of UnicodeEncodeError, UnicodeDecodeError and UnicodeTranslateError to be unicode objects. --- Doc/api/utilities.tex | 9 ++ Include/unicodeobject.h | 16 ++- Lib/test/test_codeccallbacks.py | 78 ++++++------- Objects/exceptions.c | 117 +++++++++----------- Objects/unicodeobject.c | 236 ++++++++++++++++++++++++++++++++++++++-- Python/modsupport.c | 33 ++++++ 6 files changed, 376 insertions(+), 113 deletions(-) diff --git a/Doc/api/utilities.tex b/Doc/api/utilities.tex index fb9c909..968ce4f 100644 --- a/Doc/api/utilities.tex +++ b/Doc/api/utilities.tex @@ -848,6 +848,15 @@ PyArg_ParseTuple(args, "O|O:ref", &object, &callback) to a Python Unicode object. If the Unicode buffer pointer is \NULL, the length is ignored and \code{None} is returned. + \item[\samp{U} (string) {[char *]}] + Convert a null-terminated C string to a Python unicode object. + If the C string pointer is \NULL, \code{None} is used. + + \item[\samp{U\#} (string) {[char *, int]}] + Convert a C string and its length to a Python unicode object. + If the C string pointer is \NULL, the length is ignored and \code{None} + is returned. + \item[\samp{i} (integer) {[int]}] Convert a plain C \ctype{int} to a Python integer object. 
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 9d0cabf..2e27d74 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -173,7 +173,10 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode # define PyUnicode_FromString PyUnicodeUCS2_FromString +# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize +# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV +# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding # define PyUnicode_GetMax PyUnicodeUCS2_GetMax # define PyUnicode_GetSize PyUnicodeUCS2_GetSize @@ -252,6 +255,9 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode # define PyUnicode_FromString PyUnicodeUCS4_FromString +# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize +# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV +# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding # define PyUnicode_GetMax PyUnicodeUCS4_GetMax @@ -429,6 +435,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode( Py_ssize_t size /* size of buffer */ ); +/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */ +PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( + const char *u, /* char buffer */ + Py_ssize_t size /* size of buffer */ + ); + /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated Latin-1 encoded bytes */ PyAPI_FUNC(PyObject*) PyUnicode_FromString( @@ -510,6 +522,9 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromObject( register PyObject *obj /* Object */ ); +PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, 
va_list); +PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...); + /* --- wchar_t support for platforms which support it --------------------- */ #ifdef HAVE_WCHAR_H diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 911496d..4981d54 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -21,43 +21,43 @@ class PosReturn: # A UnicodeEncodeError object with a bad start attribute class BadStartUnicodeEncodeError(UnicodeEncodeError): def __init__(self): - UnicodeEncodeError.__init__(self, str8("ascii"), "", 0, 1, str8("bad")) + UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") self.start = [] # A UnicodeEncodeError object with a bad object attribute class BadObjectUnicodeEncodeError(UnicodeEncodeError): def __init__(self): - UnicodeEncodeError.__init__(self, str8("ascii"), "", 0, 1, str8("bad")) + UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") self.object = [] # A UnicodeDecodeError object without an end attribute class NoEndUnicodeDecodeError(UnicodeDecodeError): def __init__(self): - UnicodeDecodeError.__init__(self, str8("ascii"), b"", 0, 1, str8("bad")) + UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad") del self.end # A UnicodeDecodeError object with a bad object attribute class BadObjectUnicodeDecodeError(UnicodeDecodeError): def __init__(self): - UnicodeDecodeError.__init__(self, str8("ascii"), b"", 0, 1, str8("bad")) + UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad") self.object = [] # A UnicodeTranslateError object without a start attribute class NoStartUnicodeTranslateError(UnicodeTranslateError): def __init__(self): - UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad")) + UnicodeTranslateError.__init__(self, "", 0, 1, "bad") del self.start # A UnicodeTranslateError object without an end attribute class NoEndUnicodeTranslateError(UnicodeTranslateError): def __init__(self): - UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad")) 
+ UnicodeTranslateError.__init__(self, "", 0, 1, "bad") del self.end # A UnicodeTranslateError object without an object attribute class NoObjectUnicodeTranslateError(UnicodeTranslateError): def __init__(self): - UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad")) + UnicodeTranslateError.__init__(self, "", 0, 1, "bad") del self.object class CodecCallbackTest(unittest.TestCase): @@ -328,73 +328,73 @@ class CodecCallbackTest(unittest.TestCase): def test_unicodeencodeerror(self): self.check_exceptionobjectargs( UnicodeEncodeError, - [str8("ascii"), "g\xfcrk", 1, 2, str8("ouch")], + ["ascii", "g\xfcrk", 1, 2, "ouch"], "'ascii' codec can't encode character u'\\xfc' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, - [str8("ascii"), "g\xfcrk", 1, 4, str8("ouch")], + ["ascii", "g\xfcrk", 1, 4, "ouch"], "'ascii' codec can't encode characters in position 1-3: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, - [str8("ascii"), "\xfcx", 0, 1, str8("ouch")], + ["ascii", "\xfcx", 0, 1, "ouch"], "'ascii' codec can't encode character u'\\xfc' in position 0: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, - [str8("ascii"), "\u0100x", 0, 1, str8("ouch")], + ["ascii", "\u0100x", 0, 1, "ouch"], "'ascii' codec can't encode character u'\\u0100' in position 0: ouch" ) self.check_exceptionobjectargs( UnicodeEncodeError, - [str8("ascii"), "\uffffx", 0, 1, str8("ouch")], + ["ascii", "\uffffx", 0, 1, "ouch"], "'ascii' codec can't encode character u'\\uffff' in position 0: ouch" ) if sys.maxunicode > 0xffff: self.check_exceptionobjectargs( UnicodeEncodeError, - [str8("ascii"), "\U00010000x", 0, 1, str8("ouch")], + ["ascii", "\U00010000x", 0, 1, "ouch"], "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch" ) def test_unicodedecodeerror(self): self.check_exceptionobjectargs( UnicodeDecodeError, - [str8("ascii"), b"g\xfcrk", 1, 2, str8("ouch")], + ["ascii", b"g\xfcrk", 1, 2, "ouch"], "'ascii' codec can't decode byte 0xfc 
in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeDecodeError, - [str8("ascii"), b"g\xfcrk", 1, 3, str8("ouch")], + ["ascii", b"g\xfcrk", 1, 3, "ouch"], "'ascii' codec can't decode bytes in position 1-2: ouch" ) def test_unicodetranslateerror(self): self.check_exceptionobjectargs( UnicodeTranslateError, - ["g\xfcrk", 1, 2, str8("ouch")], + ["g\xfcrk", 1, 2, "ouch"], "can't translate character u'\\xfc' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeTranslateError, - ["g\u0100rk", 1, 2, str8("ouch")], + ["g\u0100rk", 1, 2, "ouch"], "can't translate character u'\\u0100' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeTranslateError, - ["g\uffffrk", 1, 2, str8("ouch")], + ["g\uffffrk", 1, 2, "ouch"], "can't translate character u'\\uffff' in position 1: ouch" ) if sys.maxunicode > 0xffff: self.check_exceptionobjectargs( UnicodeTranslateError, - ["g\U00010000rk", 1, 2, str8("ouch")], + ["g\U00010000rk", 1, 2, "ouch"], "can't translate character u'\\U00010000' in position 1: ouch" ) self.check_exceptionobjectargs( UnicodeTranslateError, - ["g\xfcrk", 1, 3, str8("ouch")], + ["g\xfcrk", 1, 3, "ouch"], "can't translate characters in position 1-2: ouch" ) @@ -416,7 +416,7 @@ class CodecCallbackTest(unittest.TestCase): self.assertRaises( UnicodeEncodeError, codecs.strict_errors, - UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch")) + UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch") ) def test_badandgoodignoreexceptions(self): @@ -435,17 +435,17 @@ class CodecCallbackTest(unittest.TestCase): # If the correct exception is passed in, "ignore" returns an empty replacement self.assertEquals( codecs.ignore_errors( - UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")), ("", 1) ) self.assertEquals( codecs.ignore_errors( - UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))), + UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")), ("", 1) ) 
self.assertEquals( codecs.ignore_errors( - UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))), + UnicodeTranslateError("\u3042", 0, 1, "ouch")), ("", 1) ) @@ -475,17 +475,17 @@ class CodecCallbackTest(unittest.TestCase): # With the correct exception, "replace" returns an "?" or "\ufffd" replacement self.assertEquals( codecs.replace_errors( - UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")), ("?", 1) ) self.assertEquals( codecs.replace_errors( - UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))), + UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")), ("\ufffd", 1) ) self.assertEquals( codecs.replace_errors( - UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))), + UnicodeTranslateError("\u3042", 0, 1, "ouch")), ("\ufffd", 1) ) @@ -506,19 +506,19 @@ class CodecCallbackTest(unittest.TestCase): self.assertRaises( TypeError, codecs.xmlcharrefreplace_errors, - UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch")) + UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch") ) self.assertRaises( TypeError, codecs.xmlcharrefreplace_errors, - UnicodeTranslateError("\u3042", 0, 1, str8("ouch")) + UnicodeTranslateError("\u3042", 0, 1, "ouch") ) # Use the correct exception cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 0x3042) s = "".join(chr(c) for c in cs) self.assertEquals( codecs.xmlcharrefreplace_errors( - UnicodeEncodeError(str8("ascii"), s, 0, len(s), str8("ouch")) + UnicodeEncodeError("ascii", s, 0, len(s), "ouch") ), ("".join("&#%d;" % ord(c) for c in s), len(s)) ) @@ -540,48 +540,48 @@ class CodecCallbackTest(unittest.TestCase): self.assertRaises( TypeError, codecs.backslashreplace_errors, - UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch")) + UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch") ) self.assertRaises( TypeError, codecs.backslashreplace_errors, - UnicodeTranslateError("\u3042", 0, 1, str8("ouch")) + UnicodeTranslateError("\u3042", 0, 1, "ouch") ) # Use 
the correct exception self.assertEquals( codecs.backslashreplace_errors( - UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")), ("\\u3042", 1) ) self.assertEquals( codecs.backslashreplace_errors( - UnicodeEncodeError(str8("ascii"), "\x00", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")), ("\\x00", 1) ) self.assertEquals( codecs.backslashreplace_errors( - UnicodeEncodeError(str8("ascii"), "\xff", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")), ("\\xff", 1) ) self.assertEquals( codecs.backslashreplace_errors( - UnicodeEncodeError(str8("ascii"), "\u0100", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")), ("\\u0100", 1) ) self.assertEquals( codecs.backslashreplace_errors( - UnicodeEncodeError(str8("ascii"), "\uffff", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), ("\\uffff", 1) ) if sys.maxunicode>0xffff: self.assertEquals( codecs.backslashreplace_errors( - UnicodeEncodeError(str8("ascii"), "\U00010000", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")), ("\\U00010000", 1) ) self.assertEquals( codecs.backslashreplace_errors( - UnicodeEncodeError(str8("ascii"), "\U0010ffff", 0, 1, str8("ouch"))), + UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")), ("\\U0010ffff", 1) ) diff --git a/Objects/exceptions.c b/Objects/exceptions.c index fabf359..36e3795 100644 --- a/Objects/exceptions.c +++ b/Objects/exceptions.c @@ -1187,35 +1187,6 @@ set_ssize_t(PyObject **attr, Py_ssize_t value) } static PyObject * -get_string(PyObject *attr, const char *name) -{ - if (!attr) { - PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name); - return NULL; - } - - if (!PyString_Check(attr)) { - PyErr_Format(PyExc_TypeError, "%.200s attribute must be str", name); - return NULL; - } - Py_INCREF(attr); - return attr; -} - - -static int -set_string(PyObject **attr, const 
char *value) -{ - PyObject *obj = PyString_FromString(value); - if (!obj) - return -1; - Py_CLEAR(*attr); - *attr = obj; - return 0; -} - - -static PyObject * get_bytes(PyObject *attr, const char *name) { if (!attr) { @@ -1248,16 +1219,27 @@ get_unicode(PyObject *attr, const char *name) return attr; } +static int +set_unicodefromstring(PyObject **attr, const char *value) +{ + PyObject *obj = PyUnicode_FromString(value); + if (!obj) + return -1; + Py_CLEAR(*attr); + *attr = obj; + return 0; +} + PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); + return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); } PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); + return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding"); } PyObject * @@ -1416,42 +1398,45 @@ PyUnicodeTranslateError_SetEnd(PyObject *exc, Py_ssize_t end) PyObject * PyUnicodeEncodeError_GetReason(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason"); + return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason"); } PyObject * PyUnicodeDecodeError_GetReason(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason"); + return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason"); } PyObject * PyUnicodeTranslateError_GetReason(PyObject *exc) { - return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason"); + return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason"); } int PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason) { - return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason); + return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason, + reason); } int PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason) { - return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason); + 
return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason, + reason); } int PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason) { - return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason); + return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason, + reason); } @@ -1466,11 +1451,11 @@ UnicodeError_init(PyUnicodeErrorObject *self, PyObject *args, PyObject *kwds, Py_CLEAR(self->reason); if (!PyArg_ParseTuple(args, "O!O!O!O!O!", - &PyString_Type, &self->encoding, + &PyUnicode_Type, &self->encoding, objecttype, &self->object, &PyLong_Type, &self->start, &PyLong_Type, &self->end, - &PyString_Type, &self->reason)) { + &PyUnicode_Type, &self->reason)) { self->encoding = self->object = self->start = self->end = self->reason = NULL; return -1; @@ -1564,20 +1549,20 @@ UnicodeEncodeError_str(PyObject *self) PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar); else PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar); - return PyString_FromFormat( - "'%.400s' codec can't encode character u'\\%s' in position %zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't encode character u'\\%s' in position %zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, badchar_str, start, - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } - return PyString_FromFormat( - "'%.400s' codec can't encode characters in position %zd-%zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't encode characters in position %zd-%zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, start, (end-1), - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } @@ -1601,7 +1586,7 @@ PyUnicodeEncodeError_Create( const char *encoding, const Py_UNICODE *object, Py_ssize_t length, Py_ssize_t start, 
Py_ssize_t end, const char *reason) { - return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#nns", + return PyObject_CallFunction(PyExc_UnicodeEncodeError, "Uu#nnU", encoding, object, length, start, end, reason); } @@ -1626,30 +1611,30 @@ UnicodeDecodeError_str(PyObject *self) Py_ssize_t end = 0; if (PyUnicodeDecodeError_GetStart(self, &start)) - return NULL; + return NULL; if (PyUnicodeDecodeError_GetEnd(self, &end)) - return NULL; + return NULL; if (end==start+1) { /* FromFormat does not support %02x, so format that separately */ char byte[4]; PyOS_snprintf(byte, sizeof(byte), "%02x", ((int)PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[start])&0xff); - return PyString_FromFormat( - "'%.400s' codec can't decode byte 0x%s in position %zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't decode byte 0x%s in position %zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, byte, start, - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } - return PyString_FromFormat( - "'%.400s' codec can't decode bytes in position %zd-%zd: %.400s", - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding), + return PyUnicode_FromFormat( + "'%U' codec can't decode bytes in position %zd-%zd: %U", + ((PyUnicodeErrorObject *)self)->encoding, start, (end-1), - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } @@ -1676,7 +1661,7 @@ PyUnicodeDecodeError_Create( assert(length < INT_MAX); assert(start < INT_MAX); assert(end < INT_MAX); - return PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", + return PyObject_CallFunction(PyExc_UnicodeDecodeError, "Uy#nnU", encoding, object, length, start, end, reason); } @@ -1701,7 +1686,7 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args, &PyUnicode_Type, &self->object, &PyLong_Type, &self->start, &PyLong_Type, &self->end, 
- &PyString_Type, &self->reason)) { + &PyUnicode_Type, &self->reason)) { self->object = self->start = self->end = self->reason = NULL; return -1; } @@ -1736,18 +1721,18 @@ UnicodeTranslateError_str(PyObject *self) PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar); else PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar); - return PyString_FromFormat( - "can't translate character u'\\%s' in position %zd: %.400s", + return PyUnicode_FromFormat( + "can't translate character u'\\%s' in position %zd: %U", badchar_str, start, - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } - return PyString_FromFormat( - "can't translate characters in position %zd-%zd: %.400s", + return PyUnicode_FromFormat( + "can't translate characters in position %zd-%zd: %U", start, (end-1), - PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason) + ((PyUnicodeErrorObject *)self)->reason ); } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7e455a5..e77b65d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -393,15 +393,9 @@ PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, return (PyObject *)unicode; } -PyObject *PyUnicode_FromString(const char *u) +PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) { PyUnicodeObject *unicode; - size_t size = strlen(u); - if (size > PY_SSIZE_T_MAX) { - PyErr_SetString(PyExc_OverflowError, "input too long"); - return NULL; - } - /* If the Unicode data is known at construction time, we can apply some optimizations which share commonly used objects. 
*/ if (u != NULL) { @@ -441,6 +435,17 @@ PyObject *PyUnicode_FromString(const char *u) return (PyObject *)unicode; } +PyObject *PyUnicode_FromString(const char *u) +{ + size_t size = strlen(u); + if (size > PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, "input too long"); + return NULL; + } + + return PyUnicode_FromStringAndSize(u, size); +} + #ifdef HAVE_WCHAR_H PyObject *PyUnicode_FromWideChar(register const wchar_t *w, @@ -473,6 +478,223 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w, return (PyObject *)unicode; } +#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} + +PyObject * +PyUnicode_FromFormatV(const char *format, va_list vargs) +{ + va_list count; + Py_ssize_t n = 0; + const char* f; + Py_UNICODE *s; + PyObject *string; + /* used by sprintf */ + char buffer[21]; + const char *copy; + +#ifdef VA_LIST_IS_ARRAY + Py_MEMCPY(count, vargs, sizeof(va_list)); +#else +#ifdef __va_copy + __va_copy(count, vargs); +#else + count = vargs; +#endif +#endif + /* step 1: figure out how large a buffer we need */ + for (f = format; *f; f++) { + if (*f == '%') { + const char* p = f; + while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f))) + ; + + /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since + * they don't affect the amount of space we reserve. + */ + if ((*f == 'l' || *f == 'z') && + (f[1] == 'd' || f[1] == 'u')) + ++f; + + switch (*f) { + case 'c': + (void)va_arg(count, int); + /* fall through... */ + case '%': + n++; + break; + case 'd': case 'u': case 'i': case 'x': + (void) va_arg(count, int); + /* 20 bytes is enough to hold a 64-bit + integer. Decimal takes the most space. + This isn't enough for octal. 
*/ + n += 20; + break; + case 's': + n += strlen(va_arg(count, char*)); + break; + case 'U': + { + PyObject *obj = va_arg(count, PyObject *); + assert(obj && PyUnicode_Check(obj)); + n += PyUnicode_GET_SIZE(obj); + break; + } + case 'p': + (void) va_arg(count, int); + /* maximum 64-bit pointer representation: + * 0xffffffffffffffff + * so 19 characters is enough. + * XXX I count 18 -- what's the extra for? + */ + n += 19; + break; + default: + /* if we stumble upon an unknown + formatting code, copy the rest of + the format string to the output + string. (we cannot just skip the + code, since there's no way to know + what's in the argument list) */ + n += strlen(p); + goto expand; + } + } else + n++; + } + expand: + /* step 2: fill the buffer */ + /* Since we've analyzed how much space we need for the worst case, + we don't have to resize the string. */ + string = PyUnicode_FromUnicode(NULL, n); + if (!string) + return NULL; + + s = PyUnicode_AS_UNICODE(string); + + for (f = format; *f; f++) { + if (*f == '%') { + const char* p = f++; + int longflag = 0; + int size_tflag = 0; + /* parse the width.precision part (we're only + interested in the precision value, if any) */ + n = 0; + while (isdigit(Py_CHARMASK(*f))) + n = (n*10) + *f++ - '0'; + if (*f == '.') { + f++; + n = 0; + while (isdigit(Py_CHARMASK(*f))) + n = (n*10) + *f++ - '0'; + } + while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f))) + f++; + /* handle the long flag, but only for %ld and %lu. + others can be added when necessary. */ + if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { + longflag = 1; + ++f; + } + /* handle the size_t flag. 
*/ + if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { + size_tflag = 1; + ++f; + } + + switch (*f) { + case 'c': + *s++ = va_arg(vargs, int); + break; + case 'd': + if (longflag) + sprintf(buffer, "%ld", va_arg(vargs, long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "d", + va_arg(vargs, Py_ssize_t)); + else + sprintf(buffer, "%d", va_arg(vargs, int)); + appendstring(buffer); + break; + case 'u': + if (longflag) + sprintf(buffer, "%lu", + va_arg(vargs, unsigned long)); + else if (size_tflag) + sprintf(buffer, "%" PY_FORMAT_SIZE_T "u", + va_arg(vargs, size_t)); + else + sprintf(buffer, "%u", + va_arg(vargs, unsigned int)); + appendstring(buffer); + break; + case 'i': + sprintf(buffer, "%i", va_arg(vargs, int)); + appendstring(buffer); + break; + case 'x': + sprintf(buffer, "%x", va_arg(vargs, int)); + appendstring(buffer); + break; + case 's': + p = va_arg(vargs, char*); + appendstring(p); + break; + case 'U': + { + PyObject *obj = va_arg(vargs, PyObject *); + Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj); + Py_ssize_t usize = PyUnicode_GET_SIZE(obj); + Py_ssize_t upos; + for (upos = 0; upos PY_SSIZE_T_MAX) { + PyErr_SetString(PyExc_OverflowError, + "string too long for Python string"); + return NULL; + } + n = (Py_ssize_t)m; + } + v = PyUnicode_FromStringAndSize(str, n); + } + return v; + } + case 'y': { PyObject *v; -- cgit v0.12