diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2007-08-05 20:26:11 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2007-08-05 20:26:11 (GMT) |
commit | 9c121069d3a61868f4586ad2ba2e5435a82af061 (patch) | |
tree | 2b855fe92ed298ec849c14a4f01a9c0402a6fff7 /Objects | |
parent | 64ce5052e1c2495bcbc78f732e8ece2f4c8375ac (diff) | |
download | cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.zip cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.tar.gz cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.tar.bz2 |
Change PyUnicode_FromString[AndSize] to expect UTF-8.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/bytesobject.c | 12 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 20 |
2 files changed, 15 insertions, 17 deletions
```diff
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 25f7763..47ee8a4 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -2724,11 +2724,13 @@ PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
 static PyObject *
 bytes_reduce(PyBytesObject *self)
 {
-    return Py_BuildValue("(O(s#s))",
-                         Py_Type(self),
-                         self->ob_bytes == NULL ? "" : self->ob_bytes,
-                         Py_Size(self),
-                         "latin-1");
+    PyObject *latin1;
+    if (self->ob_bytes)
+        latin1 = PyUnicode_DecodeLatin1(self->ob_bytes,
+                                        Py_Size(self), NULL);
+    else
+        latin1 = PyUnicode_FromString("");
+    return Py_BuildValue("(O(Ns))", Py_Type(self), latin1, "latin-1");
 }
 
 static PySequenceMethods bytes_as_sequence = {
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d1b5747..27fedca 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 {
     PyUnicodeObject *unicode;
     /* If the Unicode data is known at construction time, we can apply
-       some optimizations which share commonly used objects. */
+       some optimizations which share commonly used objects.
+       Also, this means the input must be UTF-8, so fall back to the
+       UTF-8 decoder at the end. */
     if (u != NULL) {
 
         /* Optimization for empty strings */
@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
             return (PyObject *)unicode_empty;
         }
 
-        /* Single characters are shared when using this constructor */
-        if (size == 1) {
+        /* Single characters are shared when using this constructor.
+           Restrict to ASCII, since the input must be UTF-8. */
+        if (size == 1 && Py_CHARMASK(*u) < 128) {
             unicode = unicode_latin1[Py_CHARMASK(*u)];
             if (!unicode) {
                 unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
             Py_INCREF(unicode);
             return (PyObject *)unicode;
         }
+
+        return PyUnicode_DecodeUTF8(u, size, NULL);
     }
 
     unicode = _PyUnicode_New(size);
     if (!unicode)
         return NULL;
 
-    /* Copy the Unicode data into the new object */
-    if (u != NULL) {
-        Py_UNICODE *p = unicode->str;
-        while (size--)
-            *p++ = Py_CHARMASK(*u++);
-        /* Don't need to write trailing 0 because
-           that's already done by _PyUnicode_New */
-    }
-
     return (PyObject *)unicode;
 }
 
```