summary | refs | log | tree | commit | diff | stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2007-08-05 20:26:11 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2007-08-05 20:26:11 (GMT)
commit: 9c121069d3a61868f4586ad2ba2e5435a82af061 (patch)
tree: 2b855fe92ed298ec849c14a4f01a9c0402a6fff7 /Objects/unicodeobject.c
parent: 64ce5052e1c2495bcbc78f732e8ece2f4c8375ac (diff)
download: cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.zip
cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.tar.gz
cpython-9c121069d3a61868f4586ad2ba2e5435a82af061.tar.bz2
Change PyUnicode_FromString[AndSize] to expect UTF-8.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 20
1 file changed, 8 insertions, 12 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d1b5747..27fedca 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -427,7 +427,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
PyUnicodeObject *unicode;
/* If the Unicode data is known at construction time, we can apply
- some optimizations which share commonly used objects. */
+ some optimizations which share commonly used objects.
+ Also, this means the input must be UTF-8, so fall back to the
+ UTF-8 decoder at the end. */
if (u != NULL) {
/* Optimization for empty strings */
@@ -436,8 +438,9 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
return (PyObject *)unicode_empty;
}
- /* Single characters are shared when using this constructor */
- if (size == 1) {
+ /* Single characters are shared when using this constructor.
+ Restrict to ASCII, since the input must be UTF-8. */
+ if (size == 1 && Py_CHARMASK(*u) < 128) {
unicode = unicode_latin1[Py_CHARMASK(*u)];
if (!unicode) {
unicode = _PyUnicode_New(1);
@@ -449,21 +452,14 @@ PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Py_INCREF(unicode);
return (PyObject *)unicode;
}
+
+ return PyUnicode_DecodeUTF8(u, size, NULL);
}
unicode = _PyUnicode_New(size);
if (!unicode)
return NULL;
- /* Copy the Unicode data into the new object */
- if (u != NULL) {
- Py_UNICODE *p = unicode->str;
- while (size--)
- *p++ = Py_CHARMASK(*u++);
- /* Don't need to write trailing 0 because
- that's already done by _PyUnicode_New */
- }
-
return (PyObject *)unicode;
}