summary | refs | log | tree | commit | diff | stats
path: root/Objects
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2007-08-05 20:26:11 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2007-08-05 20:26:11 (GMT)
commit: 9c121069d3a61868f4586ad2ba2e5435a82af061 (patch)
tree: 2b855fe92ed298ec849c14a4f01a9c0402a6fff7 /Objects
parent: 64ce5052e1c2495bcbc78f732e8ece2f4c8375ac (diff)
download: cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.zip
cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.tar.gz
cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.tar.bz2
Change PyUnicode_FromString[AndSize] to expect UTF-8.
Diffstat (limited to 'Objects')
-rw-r--r-- Objects/bytesobject.c 12
-rw-r--r-- Objects/unicodeobject.c 20
2 files changed, 15 insertions, 17 deletions
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 25f7763..47ee8a4 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -2724,11 +2724,13 @@ PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
static PyObject *
bytes_reduce(PyBytesObject *self)
{
- return Py_BuildValue("(O(s#s))",
- Py_Type(self),
- self->ob_bytes == NULL ? "" : self->ob_bytes,
- Py_Size(self),
- "latin-1");
+ PyObject *latin1;
+ if (self->ob_bytes)
+ latin1 = PyUnicode_DecodeLatin1(self->ob_bytes,
+ Py_Size(self), NULL);
+ else
+ latin1 = PyUnicode_FromString("");
+ return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
}
static PySequenceMethods bytes_as_sequence = {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d1b5747..27fedca 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
PyUnicodeObject *unicode;
/* If the Unicode data is known at construction time, we can apply
- some optimizations which share commonly used objects. */
+ some optimizations which share commonly used objects.
+ Also, this means the input must be UTF-8, so fall back to the
+ UTF-8 decoder at the end. */
if (u != NULL) {
/* Optimization for empty strings */
@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
return (PyObject *)unicode_empty;
}
- /* Single characters are shared when using this constructor */
- if (size == 1) {
+ /* Single characters are shared when using this constructor.
+ Restrict to ASCII, since the input must be UTF-8. */
+ if (size == 1 && Py_CHARMASK(*u) < 128) {
unicode = unicode_latin1[Py_CHARMASK(*u)];
if (!unicode) {
unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Py_INCREF(unicode);
return (PyObject *)unicode;
}
+
+ return PyUnicode_DecodeUTF8(u, size, NULL);
}
unicode = _PyUnicode_New(size);
if (!unicode)
return NULL;
- /* Copy the Unicode data into the new object */
- if (u != NULL) {
- Py_UNICODE *p = unicode->str;
- while (size--)
- *p++ = Py_CHARMASK(*u++);
- /* Don't need to write trailing 0 because
- that's already done by _PyUnicode_New */
- }
-
return (PyObject *)unicode;
}