summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWalter Dörwald <walter@livinglogic.de>2007-05-18 16:29:38 (GMT)
committerWalter Dörwald <walter@livinglogic.de>2007-05-18 16:29:38 (GMT)
commitd2034310d66b9d387b252972852537c0b592f141 (patch)
tree77d03818fd4896b2dc1ea3eb87202bf1d82d8866
parent5550731d9cf5bca2379b15d5238ee5a39ebc6ce3 (diff)
downloadcpython-d2034310d66b9d387b252972852537c0b592f141.zip
cpython-d2034310d66b9d387b252972852537c0b592f141.tar.gz
cpython-d2034310d66b9d387b252972852537c0b592f141.tar.bz2
Add 'U'/'U#' format characters to Py_BuildValue (and thus
to PyObject_CallFunction()) that take a char * (and a size in the case of 'U#') and create a unicode object from it. Add the functions PyUnicode_FromFormat() and PyUnicode_FromFormatV(), which work similarly to PyString_FromFormat() but create a unicode object; they also accept a new %U format character that takes a PyObject *, which must point to a unicode object. Change the encoding and reason attributes of UnicodeEncodeError, UnicodeDecodeError and UnicodeTranslateError to be unicode objects.
-rw-r--r--Doc/api/utilities.tex9
-rw-r--r--Include/unicodeobject.h16
-rw-r--r--Lib/test/test_codeccallbacks.py78
-rw-r--r--Objects/exceptions.c117
-rw-r--r--Objects/unicodeobject.c236
-rw-r--r--Python/modsupport.c33
6 files changed, 376 insertions, 113 deletions
diff --git a/Doc/api/utilities.tex b/Doc/api/utilities.tex
index fb9c909..968ce4f 100644
--- a/Doc/api/utilities.tex
+++ b/Doc/api/utilities.tex
@@ -848,6 +848,15 @@ PyArg_ParseTuple(args, "O|O:ref", &object, &callback)
to a Python Unicode object. If the Unicode buffer pointer
is \NULL, the length is ignored and \code{None} is returned.
+ \item[\samp{U} (string) {[char *]}]
+ Convert a null-terminated C string to a Python unicode object.
+ If the C string pointer is \NULL, \code{None} is used.
+
+ \item[\samp{U\#} (string) {[char *, int]}]
+ Convert a C string and its length to a Python unicode object.
+ If the C string pointer is \NULL, the length is ignored and \code{None}
+ is returned.
+
\item[\samp{i} (integer) {[int]}]
Convert a plain C \ctype{int} to a Python integer object.
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 9d0cabf..2e27d74 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -173,7 +173,9 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
# define PyUnicode_FromString PyUnicodeUCS2_FromString
-# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
+# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
+# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
+# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
@@ -252,6 +254,9 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
# define PyUnicode_FromString PyUnicodeUCS4_FromString
+# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
+# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
+# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
@@ -429,6 +434,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
Py_ssize_t size /* size of buffer */
);
+/* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
+PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
+ const char *u, /* char buffer */
+ Py_ssize_t size /* size of buffer */
+ );
+
/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
Latin-1 encoded bytes */
PyAPI_FUNC(PyObject*) PyUnicode_FromString(
@@ -510,6 +521,9 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
register PyObject *obj /* Object */
);
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
+
/* --- wchar_t support for platforms which support it --------------------- */
#ifdef HAVE_WCHAR_H
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 911496d..4981d54 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -21,43 +21,43 @@ class PosReturn:
# A UnicodeEncodeError object with a bad start attribute
class BadStartUnicodeEncodeError(UnicodeEncodeError):
def __init__(self):
- UnicodeEncodeError.__init__(self, str8("ascii"), "", 0, 1, str8("bad"))
+ UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
self.start = []
# A UnicodeEncodeError object with a bad object attribute
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
def __init__(self):
- UnicodeEncodeError.__init__(self, str8("ascii"), "", 0, 1, str8("bad"))
+ UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
self.object = []
# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
def __init__(self):
- UnicodeDecodeError.__init__(self, str8("ascii"), b"", 0, 1, str8("bad"))
+ UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
del self.end
# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
def __init__(self):
- UnicodeDecodeError.__init__(self, str8("ascii"), b"", 0, 1, str8("bad"))
+ UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
self.object = []
# A UnicodeTranslateError object without a start attribute
class NoStartUnicodeTranslateError(UnicodeTranslateError):
def __init__(self):
- UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad"))
+ UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
del self.start
# A UnicodeTranslateError object without an end attribute
class NoEndUnicodeTranslateError(UnicodeTranslateError):
def __init__(self):
- UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad"))
+ UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
del self.end
# A UnicodeTranslateError object without an object attribute
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
def __init__(self):
- UnicodeTranslateError.__init__(self, "", 0, 1, str8("bad"))
+ UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
del self.object
class CodecCallbackTest(unittest.TestCase):
@@ -328,73 +328,73 @@ class CodecCallbackTest(unittest.TestCase):
def test_unicodeencodeerror(self):
self.check_exceptionobjectargs(
UnicodeEncodeError,
- [str8("ascii"), "g\xfcrk", 1, 2, str8("ouch")],
+ ["ascii", "g\xfcrk", 1, 2, "ouch"],
"'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
)
self.check_exceptionobjectargs(
UnicodeEncodeError,
- [str8("ascii"), "g\xfcrk", 1, 4, str8("ouch")],
+ ["ascii", "g\xfcrk", 1, 4, "ouch"],
"'ascii' codec can't encode characters in position 1-3: ouch"
)
self.check_exceptionobjectargs(
UnicodeEncodeError,
- [str8("ascii"), "\xfcx", 0, 1, str8("ouch")],
+ ["ascii", "\xfcx", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
)
self.check_exceptionobjectargs(
UnicodeEncodeError,
- [str8("ascii"), "\u0100x", 0, 1, str8("ouch")],
+ ["ascii", "\u0100x", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
)
self.check_exceptionobjectargs(
UnicodeEncodeError,
- [str8("ascii"), "\uffffx", 0, 1, str8("ouch")],
+ ["ascii", "\uffffx", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
)
if sys.maxunicode > 0xffff:
self.check_exceptionobjectargs(
UnicodeEncodeError,
- [str8("ascii"), "\U00010000x", 0, 1, str8("ouch")],
+ ["ascii", "\U00010000x", 0, 1, "ouch"],
"'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
)
def test_unicodedecodeerror(self):
self.check_exceptionobjectargs(
UnicodeDecodeError,
- [str8("ascii"), b"g\xfcrk", 1, 2, str8("ouch")],
+ ["ascii", b"g\xfcrk", 1, 2, "ouch"],
"'ascii' codec can't decode byte 0xfc in position 1: ouch"
)
self.check_exceptionobjectargs(
UnicodeDecodeError,
- [str8("ascii"), b"g\xfcrk", 1, 3, str8("ouch")],
+ ["ascii", b"g\xfcrk", 1, 3, "ouch"],
"'ascii' codec can't decode bytes in position 1-2: ouch"
)
def test_unicodetranslateerror(self):
self.check_exceptionobjectargs(
UnicodeTranslateError,
- ["g\xfcrk", 1, 2, str8("ouch")],
+ ["g\xfcrk", 1, 2, "ouch"],
"can't translate character u'\\xfc' in position 1: ouch"
)
self.check_exceptionobjectargs(
UnicodeTranslateError,
- ["g\u0100rk", 1, 2, str8("ouch")],
+ ["g\u0100rk", 1, 2, "ouch"],
"can't translate character u'\\u0100' in position 1: ouch"
)
self.check_exceptionobjectargs(
UnicodeTranslateError,
- ["g\uffffrk", 1, 2, str8("ouch")],
+ ["g\uffffrk", 1, 2, "ouch"],
"can't translate character u'\\uffff' in position 1: ouch"
)
if sys.maxunicode > 0xffff:
self.check_exceptionobjectargs(
UnicodeTranslateError,
- ["g\U00010000rk", 1, 2, str8("ouch")],
+ ["g\U00010000rk", 1, 2, "ouch"],
"can't translate character u'\\U00010000' in position 1: ouch"
)
self.check_exceptionobjectargs(
UnicodeTranslateError,
- ["g\xfcrk", 1, 3, str8("ouch")],
+ ["g\xfcrk", 1, 3, "ouch"],
"can't translate characters in position 1-2: ouch"
)
@@ -416,7 +416,7 @@ class CodecCallbackTest(unittest.TestCase):
self.assertRaises(
UnicodeEncodeError,
codecs.strict_errors,
- UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))
+ UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
)
def test_badandgoodignoreexceptions(self):
@@ -435,17 +435,17 @@ class CodecCallbackTest(unittest.TestCase):
# If the correct exception is passed in, "ignore" returns an empty replacement
self.assertEquals(
codecs.ignore_errors(
- UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
("", 1)
)
self.assertEquals(
codecs.ignore_errors(
- UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))),
+ UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")),
("", 1)
)
self.assertEquals(
codecs.ignore_errors(
- UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))),
+ UnicodeTranslateError("\u3042", 0, 1, "ouch")),
("", 1)
)
@@ -475,17 +475,17 @@ class CodecCallbackTest(unittest.TestCase):
# With the correct exception, "replace" returns an "?" or "\ufffd" replacement
self.assertEquals(
codecs.replace_errors(
- UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
("?", 1)
)
self.assertEquals(
codecs.replace_errors(
- UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))),
+ UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")),
("\ufffd", 1)
)
self.assertEquals(
codecs.replace_errors(
- UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))),
+ UnicodeTranslateError("\u3042", 0, 1, "ouch")),
("\ufffd", 1)
)
@@ -506,19 +506,19 @@ class CodecCallbackTest(unittest.TestCase):
self.assertRaises(
TypeError,
codecs.xmlcharrefreplace_errors,
- UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))
+ UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")
)
self.assertRaises(
TypeError,
codecs.xmlcharrefreplace_errors,
- UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))
+ UnicodeTranslateError("\u3042", 0, 1, "ouch")
)
# Use the correct exception
cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 0x3042)
s = "".join(chr(c) for c in cs)
self.assertEquals(
codecs.xmlcharrefreplace_errors(
- UnicodeEncodeError(str8("ascii"), s, 0, len(s), str8("ouch"))
+ UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
),
("".join("&#%d;" % ord(c) for c in s), len(s))
)
@@ -540,48 +540,48 @@ class CodecCallbackTest(unittest.TestCase):
self.assertRaises(
TypeError,
codecs.backslashreplace_errors,
- UnicodeDecodeError(str8("ascii"), b"\xff", 0, 1, str8("ouch"))
+ UnicodeDecodeError("ascii", b"\xff", 0, 1, "ouch")
)
self.assertRaises(
TypeError,
codecs.backslashreplace_errors,
- UnicodeTranslateError("\u3042", 0, 1, str8("ouch"))
+ UnicodeTranslateError("\u3042", 0, 1, "ouch")
)
# Use the correct exception
self.assertEquals(
codecs.backslashreplace_errors(
- UnicodeEncodeError(str8("ascii"), "\u3042", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
("\\u3042", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
- UnicodeEncodeError(str8("ascii"), "\x00", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")),
("\\x00", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
- UnicodeEncodeError(str8("ascii"), "\xff", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")),
("\\xff", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
- UnicodeEncodeError(str8("ascii"), "\u0100", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")),
("\\u0100", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
- UnicodeEncodeError(str8("ascii"), "\uffff", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
("\\uffff", 1)
)
if sys.maxunicode>0xffff:
self.assertEquals(
codecs.backslashreplace_errors(
- UnicodeEncodeError(str8("ascii"), "\U00010000", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")),
("\\U00010000", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
- UnicodeEncodeError(str8("ascii"), "\U0010ffff", 0, 1, str8("ouch"))),
+ UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")),
("\\U0010ffff", 1)
)
diff --git a/Objects/exceptions.c b/Objects/exceptions.c
index fabf359..36e3795 100644
--- a/Objects/exceptions.c
+++ b/Objects/exceptions.c
@@ -1187,35 +1187,6 @@ set_ssize_t(PyObject **attr, Py_ssize_t value)
}
static PyObject *
-get_string(PyObject *attr, const char *name)
-{
- if (!attr) {
- PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name);
- return NULL;
- }
-
- if (!PyString_Check(attr)) {
- PyErr_Format(PyExc_TypeError, "%.200s attribute must be str", name);
- return NULL;
- }
- Py_INCREF(attr);
- return attr;
-}
-
-
-static int
-set_string(PyObject **attr, const char *value)
-{
- PyObject *obj = PyString_FromString(value);
- if (!obj)
- return -1;
- Py_CLEAR(*attr);
- *attr = obj;
- return 0;
-}
-
-
-static PyObject *
get_bytes(PyObject *attr, const char *name)
{
if (!attr) {
@@ -1248,16 +1219,27 @@ get_unicode(PyObject *attr, const char *name)
return attr;
}
+/* Store a unicode object built from the C string `value` in *attr.
+   Returns 0 on success. Returns -1, leaving *attr unchanged, when
+   PyUnicode_FromString() fails. */
+static int
+set_unicodefromstring(PyObject **attr, const char *value)
+{
+    PyObject *replacement = PyUnicode_FromString(value);
+
+    if (replacement == NULL)
+        return -1;
+    /* The previous value is released only once the new one exists. */
+    Py_CLEAR(*attr);
+    *attr = replacement;
+    return 0;
+}
+
PyObject *
PyUnicodeEncodeError_GetEncoding(PyObject *exc)
{
- return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
+ return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
}
PyObject *
PyUnicodeDecodeError_GetEncoding(PyObject *exc)
{
- return get_string(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
+ return get_unicode(((PyUnicodeErrorObject *)exc)->encoding, "encoding");
}
PyObject *
@@ -1416,42 +1398,45 @@ PyUnicodeTranslateError_SetEnd(PyObject *exc, Py_ssize_t end)
PyObject *
PyUnicodeEncodeError_GetReason(PyObject *exc)
{
- return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason");
+ return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason");
}
PyObject *
PyUnicodeDecodeError_GetReason(PyObject *exc)
{
- return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason");
+ return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason");
}
PyObject *
PyUnicodeTranslateError_GetReason(PyObject *exc)
{
- return get_string(((PyUnicodeErrorObject *)exc)->reason, "reason");
+ return get_unicode(((PyUnicodeErrorObject *)exc)->reason, "reason");
}
int
PyUnicodeEncodeError_SetReason(PyObject *exc, const char *reason)
{
- return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason);
+ return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason,
+ reason);
}
int
PyUnicodeDecodeError_SetReason(PyObject *exc, const char *reason)
{
- return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason);
+ return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason,
+ reason);
}
int
PyUnicodeTranslateError_SetReason(PyObject *exc, const char *reason)
{
- return set_string(&((PyUnicodeErrorObject *)exc)->reason, reason);
+ return set_unicodefromstring(&((PyUnicodeErrorObject *)exc)->reason,
+ reason);
}
@@ -1466,11 +1451,11 @@ UnicodeError_init(PyUnicodeErrorObject *self, PyObject *args, PyObject *kwds,
Py_CLEAR(self->reason);
if (!PyArg_ParseTuple(args, "O!O!O!O!O!",
- &PyString_Type, &self->encoding,
+ &PyUnicode_Type, &self->encoding,
objecttype, &self->object,
&PyLong_Type, &self->start,
&PyLong_Type, &self->end,
- &PyString_Type, &self->reason)) {
+ &PyUnicode_Type, &self->reason)) {
self->encoding = self->object = self->start = self->end =
self->reason = NULL;
return -1;
@@ -1564,20 +1549,20 @@ UnicodeEncodeError_str(PyObject *self)
PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar);
else
PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar);
- return PyString_FromFormat(
- "'%.400s' codec can't encode character u'\\%s' in position %zd: %.400s",
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+ return PyUnicode_FromFormat(
+ "'%U' codec can't encode character u'\\%s' in position %zd: %U",
+ ((PyUnicodeErrorObject *)self)->encoding,
badchar_str,
start,
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+ ((PyUnicodeErrorObject *)self)->reason
);
}
- return PyString_FromFormat(
- "'%.400s' codec can't encode characters in position %zd-%zd: %.400s",
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+ return PyUnicode_FromFormat(
+ "'%U' codec can't encode characters in position %zd-%zd: %U",
+ ((PyUnicodeErrorObject *)self)->encoding,
start,
(end-1),
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+ ((PyUnicodeErrorObject *)self)->reason
);
}
@@ -1601,7 +1586,7 @@ PyUnicodeEncodeError_Create(
const char *encoding, const Py_UNICODE *object, Py_ssize_t length,
Py_ssize_t start, Py_ssize_t end, const char *reason)
{
- return PyObject_CallFunction(PyExc_UnicodeEncodeError, "su#nns",
+ return PyObject_CallFunction(PyExc_UnicodeEncodeError, "Uu#nnU",
encoding, object, length, start, end, reason);
}
@@ -1626,30 +1611,30 @@ UnicodeDecodeError_str(PyObject *self)
Py_ssize_t end = 0;
if (PyUnicodeDecodeError_GetStart(self, &start))
- return NULL;
+ return NULL;
if (PyUnicodeDecodeError_GetEnd(self, &end))
- return NULL;
+ return NULL;
if (end==start+1) {
/* FromFormat does not support %02x, so format that separately */
char byte[4];
PyOS_snprintf(byte, sizeof(byte), "%02x",
((int)PyBytes_AS_STRING(((PyUnicodeErrorObject *)self)->object)[start])&0xff);
- return PyString_FromFormat(
- "'%.400s' codec can't decode byte 0x%s in position %zd: %.400s",
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+ return PyUnicode_FromFormat(
+ "'%U' codec can't decode byte 0x%s in position %zd: %U",
+ ((PyUnicodeErrorObject *)self)->encoding,
byte,
start,
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+ ((PyUnicodeErrorObject *)self)->reason
);
}
- return PyString_FromFormat(
- "'%.400s' codec can't decode bytes in position %zd-%zd: %.400s",
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->encoding),
+ return PyUnicode_FromFormat(
+ "'%U' codec can't decode bytes in position %zd-%zd: %U",
+ ((PyUnicodeErrorObject *)self)->encoding,
start,
(end-1),
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+ ((PyUnicodeErrorObject *)self)->reason
);
}
@@ -1676,7 +1661,7 @@ PyUnicodeDecodeError_Create(
assert(length < INT_MAX);
assert(start < INT_MAX);
assert(end < INT_MAX);
- return PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
+ return PyObject_CallFunction(PyExc_UnicodeDecodeError, "Uy#nnU",
encoding, object, length, start, end, reason);
}
@@ -1701,7 +1686,7 @@ UnicodeTranslateError_init(PyUnicodeErrorObject *self, PyObject *args,
&PyUnicode_Type, &self->object,
&PyLong_Type, &self->start,
&PyLong_Type, &self->end,
- &PyString_Type, &self->reason)) {
+ &PyUnicode_Type, &self->reason)) {
self->object = self->start = self->end = self->reason = NULL;
return -1;
}
@@ -1736,18 +1721,18 @@ UnicodeTranslateError_str(PyObject *self)
PyOS_snprintf(badchar_str, sizeof(badchar_str), "u%04x", badchar);
else
PyOS_snprintf(badchar_str, sizeof(badchar_str), "U%08x", badchar);
- return PyString_FromFormat(
- "can't translate character u'\\%s' in position %zd: %.400s",
+ return PyUnicode_FromFormat(
+ "can't translate character u'\\%s' in position %zd: %U",
badchar_str,
start,
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+ ((PyUnicodeErrorObject *)self)->reason
);
}
- return PyString_FromFormat(
- "can't translate characters in position %zd-%zd: %.400s",
+ return PyUnicode_FromFormat(
+ "can't translate characters in position %zd-%zd: %U",
start,
(end-1),
- PyString_AS_STRING(((PyUnicodeErrorObject *)self)->reason)
+ ((PyUnicodeErrorObject *)self)->reason
);
}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7e455a5..e77b65d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -393,15 +393,9 @@ PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
return (PyObject *)unicode;
}
-PyObject *PyUnicode_FromString(const char *u)
+PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
PyUnicodeObject *unicode;
- size_t size = strlen(u);
- if (size > PY_SSIZE_T_MAX) {
- PyErr_SetString(PyExc_OverflowError, "input too long");
- return NULL;
- }
-
/* If the Unicode data is known at construction time, we can apply
some optimizations which share commonly used objects. */
if (u != NULL) {
@@ -441,6 +435,17 @@ PyObject *PyUnicode_FromString(const char *u)
return (PyObject *)unicode;
}
+/* Create a unicode object from the NUL-terminated string `u` by measuring
+   it with strlen() and delegating to PyUnicode_FromStringAndSize().
+   Sets OverflowError and returns NULL if the length exceeds
+   PY_SSIZE_T_MAX. */
+PyObject *PyUnicode_FromString(const char *u)
+{
+    size_t length = strlen(u);
+
+    if (length <= PY_SSIZE_T_MAX)
+        return PyUnicode_FromStringAndSize(u, length);
+
+    PyErr_SetString(PyExc_OverflowError, "input too long");
+    return NULL;
+}
+
#ifdef HAVE_WCHAR_H
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
@@ -473,6 +478,223 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
return (PyObject *)unicode;
}
+/* Append the NUL-terminated byte string `string` to the output at `s`.
+   Each byte is widened through unsigned char so that bytes >= 0x80 are
+   not sign-extended on platforms where plain char is signed. */
+#define appendstring(string) {for (copy = string;*copy;) *s++ = (unsigned char)*copy++;}
+
+/* Build a unicode object from a printf-style `format` and argument list.
+
+   Supported conversions: %c, %d, %u, %i, %x, %s (char *), %U (PyObject *
+   that must be a unicode object), %p and %%.  The 'l' and 'z' length
+   modifiers are honoured for %d and %u only.  Width and precision are
+   parsed but not applied.  An unknown conversion causes the rest of the
+   format string to be copied to the output verbatim.
+
+   The function works in two passes: the first walks a copy of the
+   argument list to compute a worst-case output length, the second fills
+   a buffer of that length and then shrinks it to the size actually used.
+
+   Returns a new unicode object, or NULL if the buffer cannot be
+   allocated. */
+PyObject *
+PyUnicode_FromFormatV(const char *format, va_list vargs)
+{
+    va_list count;
+    Py_ssize_t n = 0;
+    const char* f;
+    Py_UNICODE *s;
+    PyObject *string;
+    /* used by sprintf */
+    char buffer[21];
+    const char *copy;
+
+#ifdef VA_LIST_IS_ARRAY
+    Py_MEMCPY(count, vargs, sizeof(va_list));
+#else
+#ifdef __va_copy
+    __va_copy(count, vargs);
+#else
+    count = vargs;
+#endif
+#endif
+    /* step 1: figure out how large a buffer we need */
+    for (f = format; *f; f++) {
+        if (*f == '%') {
+            const char* p = f;
+            int longflag = 0;
+            int size_tflag = 0;
+            while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
+                ;
+
+            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu}.  They don't
+             * affect the amount of space we reserve, but they do
+             * determine which argument type must be consumed below.
+             */
+            if ((*f == 'l' || *f == 'z') &&
+                (f[1] == 'd' || f[1] == 'u')) {
+                if (*f == 'l')
+                    longflag = 1;
+                else
+                    size_tflag = 1;
+                ++f;
+            }
+
+            switch (*f) {
+            case 'c':
+                (void)va_arg(count, int);
+                /* fall through... */
+            case '%':
+                n++;
+                break;
+            case 'd': case 'u':
+                /* Consume an argument of the same width that step 2
+                   will read, so `count` stays in sync with `vargs`. */
+                if (longflag)
+                    (void) va_arg(count, long);
+                else if (size_tflag)
+                    (void) va_arg(count, Py_ssize_t);
+                else
+                    (void) va_arg(count, int);
+                /* 20 bytes is enough to hold a 64-bit
+                   integer.  Decimal takes the most space.
+                   This isn't enough for octal. */
+                n += 20;
+                break;
+            case 'i': case 'x':
+                (void) va_arg(count, int);
+                n += 20;
+                break;
+            case 's':
+                n += strlen(va_arg(count, char*));
+                break;
+            case 'U':
+            {
+                PyObject *obj = va_arg(count, PyObject *);
+                assert(obj && PyUnicode_Check(obj));
+                n += PyUnicode_GET_SIZE(obj);
+                break;
+            }
+            case 'p':
+                /* Step 2 reads this argument as void *, so the same
+                   type must be consumed here. */
+                (void) va_arg(count, void *);
+                /* maximum 64-bit pointer representation:
+                 * 0xffffffffffffffff
+                 * which is 18 characters; 19 are reserved, leaving
+                 * one spare.
+                 */
+                n += 19;
+                break;
+            default:
+                /* if we stumble upon an unknown
+                   formatting code, copy the rest of
+                   the format string to the output
+                   string. (we cannot just skip the
+                   code, since there's no way to know
+                   what's in the argument list) */
+                n += strlen(p);
+                goto expand;
+            }
+        } else
+            n++;
+    }
+ expand:
+    /* step 2: fill the buffer */
+    /* Since we've analyzed how much space we need for the worst case,
+       we don't have to resize the string. */
+    string = PyUnicode_FromUnicode(NULL, n);
+    if (!string)
+        return NULL;
+
+    s = PyUnicode_AS_UNICODE(string);
+
+    for (f = format; *f; f++) {
+        if (*f == '%') {
+            const char* p = f++;
+            int longflag = 0;
+            int size_tflag = 0;
+            /* parse the width.precision part (we're only
+               interested in the precision value, if any) */
+            n = 0;
+            while (isdigit(Py_CHARMASK(*f)))
+                n = (n*10) + *f++ - '0';
+            if (*f == '.') {
+                f++;
+                n = 0;
+                while (isdigit(Py_CHARMASK(*f)))
+                    n = (n*10) + *f++ - '0';
+            }
+            while (*f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
+                f++;
+            /* handle the long flag, but only for %ld and %lu.
+               others can be added when necessary. */
+            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
+                longflag = 1;
+                ++f;
+            }
+            /* handle the size_t flag. */
+            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
+                size_tflag = 1;
+                ++f;
+            }
+
+            switch (*f) {
+            case 'c':
+                *s++ = va_arg(vargs, int);
+                break;
+            case 'd':
+                if (longflag)
+                    sprintf(buffer, "%ld", va_arg(vargs, long));
+                else if (size_tflag)
+                    sprintf(buffer, "%" PY_FORMAT_SIZE_T "d",
+                            va_arg(vargs, Py_ssize_t));
+                else
+                    sprintf(buffer, "%d", va_arg(vargs, int));
+                appendstring(buffer);
+                break;
+            case 'u':
+                if (longflag)
+                    sprintf(buffer, "%lu",
+                            va_arg(vargs, unsigned long));
+                else if (size_tflag)
+                    sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
+                            va_arg(vargs, size_t));
+                else
+                    sprintf(buffer, "%u",
+                            va_arg(vargs, unsigned int));
+                appendstring(buffer);
+                break;
+            case 'i':
+                sprintf(buffer, "%i", va_arg(vargs, int));
+                appendstring(buffer);
+                break;
+            case 'x':
+                sprintf(buffer, "%x", va_arg(vargs, int));
+                appendstring(buffer);
+                break;
+            case 's':
+                p = va_arg(vargs, char*);
+                appendstring(p);
+                break;
+            case 'U':
+            {
+                PyObject *obj = va_arg(vargs, PyObject *);
+                Py_UNICODE *ucopy = PyUnicode_AS_UNICODE(obj);
+                Py_ssize_t usize = PyUnicode_GET_SIZE(obj);
+                Py_ssize_t upos;
+                for (upos = 0; upos<usize;)
+                    *s++ = ucopy[upos++];
+                break;
+            }
+            case 'p':
+                sprintf(buffer, "%p", va_arg(vargs, void*));
+                /* %p is ill-defined:  ensure leading 0x. */
+                if (buffer[1] == 'X')
+                    buffer[1] = 'x';
+                else if (buffer[1] != 'x') {
+                    memmove(buffer+2, buffer, strlen(buffer)+1);
+                    buffer[0] = '0';
+                    buffer[1] = 'x';
+                }
+                appendstring(buffer);
+                break;
+            case '%':
+                *s++ = '%';
+                break;
+            default:
+                /* unknown code: emit the remainder of the format
+                   (starting at its '%') and stop. */
+                appendstring(p);
+                goto end;
+            }
+        } else
+            *s++ = *f;
+    }
+
+ end:
+    /* Trim the worst-case allocation down to what was written. */
+    _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
+    return string;
+}
+
+#undef appendstring
+
+/* Variadic front end for PyUnicode_FromFormatV(): collects the arguments
+   following `format` into a va_list, forwards them, and returns whatever
+   PyUnicode_FromFormatV() returns. */
+PyObject *
+PyUnicode_FromFormat(const char *format, ...)
+{
+    va_list vargs;
+    PyObject *result;
+
+#ifdef HAVE_STDARG_PROTOTYPES
+    va_start(vargs, format);
+#else
+    va_start(vargs);
+#endif
+    result = PyUnicode_FromFormatV(format, vargs);
+    va_end(vargs);
+
+    return result;
+}
+
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
wchar_t *w,
Py_ssize_t size)
diff --git a/Python/modsupport.c b/Python/modsupport.c
index 8f600dc..a272ce3 100644
--- a/Python/modsupport.c
+++ b/Python/modsupport.c
@@ -424,6 +424,39 @@ do_mkvalue(const char **p_format, va_list *p_va, int flags)
return v;
}
+ case 'U':
+ {
+ PyObject *v;
+ char *str = va_arg(*p_va, char *);
+ Py_ssize_t n;
+ if (**p_format == '#') {
+ ++*p_format;
+ if (flags & FLAG_SIZE_T)
+ n = va_arg(*p_va, Py_ssize_t);
+ else
+ n = va_arg(*p_va, int);
+ }
+ else
+ n = -1;
+ if (str == NULL) {
+ v = Py_None;
+ Py_INCREF(v);
+ }
+ else {
+ if (n < 0) {
+ size_t m = strlen(str);
+ if (m > PY_SSIZE_T_MAX) {
+ PyErr_SetString(PyExc_OverflowError,
+ "string too long for Python string");
+ return NULL;
+ }
+ n = (Py_ssize_t)m;
+ }
+ v = PyUnicode_FromStringAndSize(str, n);
+ }
+ return v;
+ }
+
case 'y':
{
PyObject *v;