summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Victor Stinner <victor.stinner@haypocalc.com> 2011-10-01 14:16:43 (GMT)
committer Victor Stinner <victor.stinner@haypocalc.com> 2011-10-01 14:16:43 (GMT)
commit 07ac3ebd7ba6b42ba1adb07ec6075eb916d167b1 (patch)
tree 5318d1d3c233f80d486690dc242c3c365ef3cbaa
parent e90fe6a8f4a6cc6504387a77720ef7d1316f8086 (diff)
download cpython-07ac3ebd7ba6b42ba1adb07ec6075eb916d167b1.zip
cpython-07ac3ebd7ba6b42ba1adb07ec6075eb916d167b1.tar.gz
cpython-07ac3ebd7ba6b42ba1adb07ec6075eb916d167b1.tar.bz2
Optimize unicode_subtype_new(): don't encode to wchar_t and decode from wchar_t
Rewrite unicode_subtype_new(): directly allocate a character buffer of the right kind (1, 2 or 4 bytes per character).
-rw-r--r-- Lib/test/test_unicode.py 11
-rw-r--r-- Objects/unicodeobject.c 119
2 files changed, 84 insertions, 46 deletions
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index a527dff..4f6f132 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1010,10 +1010,13 @@ class UnicodeTest(string_tests.CommonTest,
class UnicodeSubclass(str):
pass
- self.assertEqual(
- str(UnicodeSubclass('unicode subclass becomes unicode')),
- 'unicode subclass becomes unicode'
- )
+ for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
+ subclass = UnicodeSubclass(text)
+ self.assertEqual(str(subclass), text)
+ self.assertEqual(len(subclass), len(text))
+ if text == 'ascii':
+ self.assertEqual(subclass.encode('ascii'), b'ascii')
+ self.assertEqual(subclass.encode('utf-8'), b'ascii')
self.assertEqual(
str('strings are converted to unicode'),
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index cd58de6..fe91975 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12410,56 +12410,104 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+/* Create an instance of a str subtype.  An exact str is first built from
+ args/kwds by unicode_new(); the subtype object is then allocated with
+ type->tp_alloc() and given its own copy of the character buffer.
+ Return the new object, or NULL on failure (PyErr_NoMemory() is raised for
+ the size overflow and buffer allocation failures). */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
- PyUnicodeObject *tmp, *pnew;
- Py_ssize_t n;
- PyObject *err = NULL;
+ PyUnicodeObject *unicode, *self;
+ Py_ssize_t length, char_size;
+ int share_wstr, share_utf8;
+ unsigned int kind;
+ void *data;
assert(PyType_IsSubtype(type, &PyUnicode_Type));
- tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
- if (tmp == NULL)
- return NULL;
- assert(PyUnicode_Check(tmp));
- // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
- // it seems kind of strange that tp_alloc gets passed the size
- // of the unicode string because there will follow another
- // malloc.
- pnew = (PyUnicodeObject *) type->tp_alloc(type,
- n = PyUnicode_GET_SIZE(tmp));
- if (pnew == NULL) {
- Py_DECREF(tmp);
- return NULL;
- }
- _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
- if (_PyUnicode_WSTR(pnew) == NULL) {
- err = PyErr_NoMemory();
+
+ unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
+ if (unicode == NULL)
+ return NULL;
+ assert(PyUnicode_Check(unicode));
+ if (PyUnicode_READY(unicode) == -1) {
+ /* We own the reference returned by unicode_new(): drop it here, as every other exit path does. */
+ Py_DECREF(unicode);
+ return NULL;
+ }
+
+ self = (PyUnicodeObject *) type->tp_alloc(type, 0);
+ if (self == NULL) {
+ Py_DECREF(unicode);
+ return NULL;
+ }
+ kind = PyUnicode_KIND(unicode);
+ length = PyUnicode_GET_LENGTH(unicode);
+
+ _PyUnicode_LENGTH(self) = length;
+ _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
+ _PyUnicode_STATE(self).interned = 0;
+ _PyUnicode_STATE(self).kind = kind;
+ _PyUnicode_STATE(self).compact = 0;
+ _PyUnicode_STATE(self).ascii = 0;
+ _PyUnicode_STATE(self).ready = 1;
+ _PyUnicode_WSTR(self) = NULL;
+ _PyUnicode_UTF8_LENGTH(self) = 0;
+ _PyUnicode_UTF8(self) = NULL;
+ _PyUnicode_WSTR_LENGTH(self) = 0;
+ self->data.any = NULL;
+
+ /* Decide whether the UTF-8 or wchar_t representation can simply point at
+ the character buffer allocated below: UTF-8 when every character is
+ below 128, wchar_t when sizeof(wchar_t) equals the character size. */
+ share_utf8 = 0;
+ share_wstr = 0;
+ if (kind == PyUnicode_1BYTE_KIND) {
+ char_size = 1;
+ if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
+ share_utf8 = 1;
+ }
+ else if (kind == PyUnicode_2BYTE_KIND) {
+ char_size = 2;
+ if (sizeof(wchar_t) == 2)
+ share_wstr = 1;
+ }
+ else {
+ assert(kind == PyUnicode_4BYTE_KIND);
+ char_size = 4;
+ if (sizeof(wchar_t) == 4)
+ share_wstr = 1;
+ }
+
+ /* Ensure we won't overflow the length. */
+ if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
+ PyErr_NoMemory();
goto onError;
}
- Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
- _PyUnicode_WSTR_LENGTH(pnew) = n;
- _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
- _PyUnicode_STATE(pnew).interned = 0;
- _PyUnicode_STATE(pnew).kind = 0;
- _PyUnicode_STATE(pnew).compact = 0;
- _PyUnicode_STATE(pnew).ready = 0;
- _PyUnicode_STATE(pnew).ascii = 0;
- pnew->data.any = NULL;
- _PyUnicode_LENGTH(pnew) = 0;
- pnew->_base.utf8 = NULL;
- pnew->_base.utf8_length = 0;
-
- if (PyUnicode_READY(pnew) == -1) {
- PyObject_FREE(_PyUnicode_WSTR(pnew));
+ data = PyObject_MALLOC((length + 1) * char_size);
+ if (data == NULL) {
+ PyErr_NoMemory();
goto onError;
}
- Py_DECREF(tmp);
- return (PyObject *)pnew;
+ self->data.any = data;
+ if (share_utf8) {
+ _PyUnicode_UTF8_LENGTH(self) = length;
+ _PyUnicode_UTF8(self) = data;
+ }
+ if (share_wstr) {
+ _PyUnicode_WSTR_LENGTH(self) = length;
+ _PyUnicode_WSTR(self) = (wchar_t *)data;
+ }
+
+ /* Copy length + 1 characters so the extra trailing character of the
+ source buffer (room for it was allocated above) comes along too. */
+ Py_MEMCPY(data, PyUnicode_DATA(unicode),
+ PyUnicode_KIND_SIZE(kind, length + 1));
+ Py_DECREF(unicode);
+ return (PyObject *)self;
- onError:
- _Py_ForgetReference((PyObject *)pnew);
- PyObject_Del(pnew);
- Py_DECREF(tmp);
- return err;
+onError:
+ Py_DECREF(unicode);
+ Py_DECREF(self);
+ return NULL;
}
PyDoc_STRVAR(unicode_doc,