Unicode: optimize creation of 1-character strings

author: Victor Stinner <victor.stinner@gmail.com> 2012-05-03 00:17:04 (GMT)
committer: Victor Stinner <victor.stinner@gmail.com> 2012-05-03 00:17:04 (GMT)
commit: b6cd014d75394ca48e6113d614d36ed16fbe7b13 (patch)
tree: cf5b9082e64ee3fc8cde78204d836a781cd762d4 /Objects
parent: bff7c9683442a6297bda2fb1ebedf73c9e4a265f (diff)
download: cpython-b6cd014d75394ca48e6113d614d36ed16fbe7b13.zip
cpython-b6cd014d75394ca48e6113d614d36ed16fbe7b13.tar.gz
cpython-b6cd014d75394ca48e6113d614d36ed16fbe7b13.tar.bz2
1 file changed, 50 insertions, 8 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index e22fcfd..f427fd3 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1919,8 +1919,18 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
         return unicode_empty;
     }
     assert(size > 0);
-    if (size == 1 && u[0] < 256)
-        return get_latin1_char((unsigned char)u[0]);
+    if (size == 1) {
+        Py_UCS4 ch = u[0];
+        if (ch < 256)
+            return get_latin1_char((unsigned char)ch);
+
+        res = PyUnicode_New(1, ch);
+        if (res == NULL)
+            return NULL;
+        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        return res;
+    }
 
     max_char = ucs2lib_find_max_char(u, u + size);
     res = PyUnicode_New(size, max_char);
@@ -1947,8 +1957,18 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
         return unicode_empty;
     }
     assert(size > 0);
-    if (size == 1 && u[0] < 256)
-        return get_latin1_char((unsigned char)u[0]);
+    if (size == 1) {
+        Py_UCS4 ch = u[0];
+        if (ch < 256)
+            return get_latin1_char((unsigned char)ch);
+
+        res = PyUnicode_New(1, ch);
+        if (res == NULL)
+            return NULL;
+        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        return res;
+    }
 
     max_char = ucs4lib_find_max_char(u, u + size);
     res = PyUnicode_New(size, max_char);
@@ -11368,10 +11388,33 @@ unicode_find(PyObject *self, PyObject *args)
 static PyObject *
 unicode_getitem(PyObject *self, Py_ssize_t index)
 {
-    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
-    if (ch == (Py_UCS4)-1)
+    void *data;
+    enum PyUnicode_Kind kind;
+    Py_UCS4 ch;
+    PyObject *res;
+
+    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
+        PyErr_SetString(PyExc_IndexError, "string index out of range");
         return NULL;
-    return PyUnicode_FromOrdinal(ch);
+    }
+    kind = PyUnicode_KIND(self);
+    data = PyUnicode_DATA(self);
+    ch = PyUnicode_READ(kind, data, index);
+    if (ch < 256)
+        return get_latin1_char(ch);
+
+    res = PyUnicode_New(1, ch);
+    if (res == NULL)
+        return NULL;
+    kind = PyUnicode_KIND(res);
+    data = PyUnicode_DATA(res);
+    PyUnicode_WRITE(kind, data, 0, ch);
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    return res;
 }
 
 /* Believe it or not, this produces the same value for ASCII strings
@@ -12039,7 +12082,6 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
     }
 
     if (PyUnicode_IS_ASCII(self)) {
-        kind = PyUnicode_KIND(self);
         data = PyUnicode_1BYTE_DATA(self);
         return unicode_fromascii(data + start, length);
     }
author	Victor Stinner <victor.stinner@gmail.com>	2012-05-03 00:17:04 (GMT)
committer	Victor Stinner <victor.stinner@gmail.com>	2012-05-03 00:17:04 (GMT)
commit	b6cd014d75394ca48e6113d614d36ed16fbe7b13 (patch)
tree	cf5b9082e64ee3fc8cde78204d836a781cd762d4 /Objects
parent	bff7c9683442a6297bda2fb1ebedf73c9e4a265f (diff)
download	cpython-b6cd014d75394ca48e6113d614d36ed16fbe7b13.zip cpython-b6cd014d75394ca48e6113d614d36ed16fbe7b13.tar.gz cpython-b6cd014d75394ca48e6113d614d36ed16fbe7b13.tar.bz2