diff options
-rw-r--r-- | Doc/api/api.tex | 4 | ||||
-rw-r--r-- | Include/unicodeobject.h | 8 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 184 |
3 files changed, 142 insertions, 54 deletions
diff --git a/Doc/api/api.tex b/Doc/api/api.tex index 58188b5..0d7f6f2 100644 --- a/Doc/api/api.tex +++ b/Doc/api/api.tex @@ -2457,7 +2457,9 @@ use these APIs: Create a Unicode Object from the Py_UNICODE buffer \var{u} of the given size. \var{u} may be \NULL{} which causes the contents to be undefined. It is the user's responsibility to fill in the needed data. -The buffer is copied into the new object. +The buffer is copied into the new object. If the buffer is not \NULL{}, +the return value might be a shared object. Therefore, modification of +the resulting Unicode Object is only allowed when \var{u} is \NULL{}. \end{cfuncdesc} \begin{cfuncdesc}{Py_UNICODE*}{PyUnicode_AsUnicode}{PyObject *unicode} diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index e88b8ed..988ea1b 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -239,8 +239,12 @@ extern DL_IMPORT(PyTypeObject) PyUnicode_Type; /* --- Plain Py_UNICODE --------------------------------------------------- */ /* Create a Unicode Object from the Py_UNICODE buffer u of the given - size. u may be NULL which causes the contents to be undefined. It - is the user's responsibility to fill in the needed data. + size. + + u may be NULL which causes the contents to be undefined. It is the + user's responsibility to fill in the needed data afterwards. Note + that modifying the Unicode object contents after construction is + only allowed if u was set to NULL. The buffer is copied into the new object. */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b3c8ba4..1d72c0d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -83,13 +83,17 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -/* The empty Unicode object */ -static PyUnicodeObject *unicode_empty; - /* Free list for Unicode objects */ static PyUnicodeObject *unicode_freelist; static int unicode_freelist_size; +/* The empty Unicode object is shared to improve performance. */ +static PyUnicodeObject *unicode_empty; + +/* Single character Unicode strings in the Latin-1 range are being + shared as well. */ +static PyUnicodeObject *unicode_latin1[256]; + /* Default encoding to use and assume when NULL is passed as encoding parameter; it is initialized by _PyUnicode_Init(). @@ -97,13 +101,12 @@ static int unicode_freelist_size; PyUnicode_GetDefaultEncoding() APIs to access this global. */ - static char unicode_default_encoding[100]; /* --- Unicode Object ----------------------------------------------------- */ static -int _PyUnicode_Resize(register PyUnicodeObject *unicode, +int unicode_resize(register PyUnicodeObject *unicode, int length) { void *oldstr; @@ -112,10 +115,15 @@ int _PyUnicode_Resize(register PyUnicodeObject *unicode, if (unicode->length == length) goto reset; - /* Resizing unicode_empty is not allowed. */ - if (unicode == unicode_empty) { + /* Resizing shared object (unicode_empty or single character + objects) in-place is not allowed. Use PyUnicode_Resize() + instead ! */ + if (unicode == unicode_empty || + (unicode->length == 1 && + unicode->str[0] < 256 && + unicode_latin1[unicode->str[0]] == unicode)) { PyErr_SetString(PyExc_SystemError, - "can't resize empty unicode object"); + "can't resize shared unicode objects"); return -1; } @@ -142,23 +150,6 @@ int _PyUnicode_Resize(register PyUnicodeObject *unicode, return 0; } -int PyUnicode_Resize(PyObject **unicode, - int length) -{ - PyUnicodeObject *v; - - if (unicode == NULL) { - PyErr_BadInternalCall(); - return -1; - } - v = (PyUnicodeObject *)*unicode; - if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { - PyErr_BadInternalCall(); - return -1; - } - return _PyUnicode_Resize(v, length); -} - /* We allocate one more byte to make sure the string is Ux0000 terminated -- XXX is this needed ? @@ -187,7 +178,7 @@ PyUnicodeObject *_PyUnicode_New(int length) /* Keep-Alive optimization: we only upsize the buffer, never downsize it. */ if ((unicode->length < length) && - _PyUnicode_Resize(unicode, length)) { + unicode_resize(unicode, length)) { PyMem_DEL(unicode->str); goto onError; } @@ -246,18 +237,83 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode) } } +int PyUnicode_Resize(PyObject **unicode, + int length) +{ + register PyUnicodeObject *v; + + /* Argument checks */ + if (unicode == NULL) { + PyErr_BadInternalCall(); + return -1; + } + v = (PyUnicodeObject *)*unicode; + if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) { + PyErr_BadInternalCall(); + return -1; + } + + /* Resizing unicode_empty and single character objects is not + possible since these are being shared. We simply return a fresh + copy with the same Unicode content. */ + if (v->length != length && + (v == unicode_empty || v->length == 1)) { + PyUnicodeObject *w = _PyUnicode_New(length); + if (w == NULL) + return -1; + Py_UNICODE_COPY(w->str, v->str, + length < v->length ? length : v->length); + *unicode = (PyObject *)w; + return 0; + } + + /* Note that we don't have to modify *unicode for unshared Unicode + objects, since we can modify them in-place. */ + return unicode_resize(v, length); +} + +/* Internal API for use in unicodeobject.c only ! */ +#define _PyUnicode_Resize(unicodevar, length) \ + PyUnicode_Resize(((PyObject **)(unicodevar)), length) + PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, int size) { PyUnicodeObject *unicode; + /* If the Unicode data is known at construction time, we can apply + some optimizations which share commonly used objects. */ + if (u != NULL) { + + /* Optimization for empty strings */ + if (size == 0 && unicode_empty != NULL) { + Py_INCREF(unicode_empty); + return (PyObject *)unicode_empty; + } + + /* Single character Unicode objects in the Latin-1 range are + shared when using this constructor */ + if (size == 1 && *u < 256) { + unicode = unicode_latin1[*u]; + if (!unicode) { + unicode = _PyUnicode_New(1); + unicode->str[0] = *u; + if (!unicode) + return NULL; + unicode_latin1[*u] = unicode; + } + Py_INCREF(unicode); + return (PyObject *)unicode; + } + } + unicode = _PyUnicode_New(size); if (!unicode) return NULL; /* Copy the Unicode data into the new object */ if (u != NULL) - memcpy(unicode->str, u, size * sizeof(Py_UNICODE)); + Py_UNICODE_COPY(unicode->str, u, size); return (PyObject *)unicode; } @@ -748,7 +804,7 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, } /* Adjust length */ - if (_PyUnicode_Resize(unicode, p - unicode->str)) + if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; return (PyObject *)unicode; @@ -1008,7 +1064,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, *byteorder = bo; /* Adjust length */ - if (_PyUnicode_Resize(unicode, p - unicode->str)) + if (_PyUnicode_Resize(&unicode, p - unicode->str)) goto onError; return (PyObject *)unicode; @@ -1048,7 +1104,7 @@ PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s, byteorder == 1 #endif ) - memcpy(p, s, size * sizeof(Py_UNICODE)); + Py_UNICODE_COPY(p, s, size); else while (size-- > 0) { Py_UNICODE ch = *s++; @@ -1263,7 +1319,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, break; } } - if (_PyUnicode_Resize(v, (int)(p - buf))) + if (_PyUnicode_Resize(&v, (int)(p - buf))) goto onError; return (PyObject *)v; @@ -1451,7 +1507,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, s += i; *p++ = x; } - if (_PyUnicode_Resize(v, (int)(p - buf))) + if (_PyUnicode_Resize(&v, (int)(p - buf))) goto onError; return (PyObject *)v; @@ -1522,6 +1578,11 @@ PyObject *PyUnicode_DecodeLatin1(const char *s, Py_UNICODE *p; /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ + if (size == 1 && *(unsigned char*)s < 256) { + Py_UNICODE r = *(unsigned char*)s; + return PyUnicode_FromUnicode(&r, 1); + } + v = _PyUnicode_New(size); if (v == NULL) goto onError; @@ -1654,6 +1715,11 @@ PyObject *PyUnicode_DecodeASCII(const char *s, Py_UNICODE *p; /* ASCII is equivalent to the first 128 ordinals in Unicode. */ + if (size == 1 && *(unsigned char*)s < 128) { + Py_UNICODE r = *(unsigned char*)s; + return PyUnicode_FromUnicode(&r, 1); + } + v = _PyUnicode_New(size); if (v == NULL) goto onError; @@ -1671,7 +1737,7 @@ PyObject *PyUnicode_DecodeASCII(const char *s, goto onError; } if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) - if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)))) + if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; return (PyObject *)v; @@ -1926,7 +1992,8 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, int needed = (targetsize - extrachars) + \ (targetsize << 2); extrachars += needed; - if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) { + if (_PyUnicode_Resize(&v, + PyUnicode_GET_SIZE(v) + needed)) { Py_DECREF(x); goto onError; } @@ -1950,7 +2017,7 @@ PyObject *PyUnicode_DecodeCharmap(const char *s, Py_DECREF(x); } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) - if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)))) + if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; return (PyObject *)v; @@ -2068,9 +2135,7 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, } s = PyString_AS_STRING(v) + oldpos; } - memcpy(s, - PyString_AS_STRING(x), - targetsize); + memcpy(s, PyString_AS_STRING(x), targetsize); s += targetsize; extrachars -= targetsize; } @@ -2209,7 +2274,7 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s, Py_DECREF(x); } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) - if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)))) + if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v)))) goto onError; done: @@ -2506,10 +2571,12 @@ PyObject *fixup(PyUnicodeObject *self, PyUnicodeObject *u; - u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str, - self->length); + u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); if (u == NULL) return NULL; + + Py_UNICODE_COPY(u->str, self->str, self->length); + if (!fixfct(u)) { /* fixfct should return TRUE if it modified the buffer. If FALSE, return a reference to the original buffer instead @@ -2698,22 +2765,22 @@ PyObject *PyUnicode_Join(PyObject *separator, } itemlen = PyUnicode_GET_SIZE(item); while (reslen + itemlen + seplen >= sz) { - if (_PyUnicode_Resize(res, sz*2)) + if (_PyUnicode_Resize(&res, sz*2)) goto onError; sz *= 2; p = PyUnicode_AS_UNICODE(res) + reslen; } if (i > 0) { - memcpy(p, sep, seplen * sizeof(Py_UNICODE)); + Py_UNICODE_COPY(p, sep, seplen); p += seplen; reslen += seplen; } - memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE)); + Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen); p += itemlen; reslen += itemlen; Py_DECREF(item); } - if (_PyUnicode_Resize(res, reslen)) + if (_PyUnicode_Resize(&res, reslen)) goto onError; Py_XDECREF(separator); @@ -3001,10 +3068,12 @@ PyObject *replace(PyUnicodeObject *self, Py_UNICODE u2 = str2->str[0]; u = (PyUnicodeObject*) PyUnicode_FromUnicode( - self->str, + NULL, self->length ); - if (u) + if (u != NULL) { + Py_UNICODE_COPY(u->str, self->str, + self->length); for (i = 0; i < u->length; i++) if (u->str[i] == u1) { if (--maxcount < 0) @@ -3012,6 +3081,7 @@ PyObject *replace(PyUnicodeObject *self, u->str[i] = u2; } } + } } else { int n, i; @@ -4778,7 +4848,7 @@ PyObject *PyUnicode_Format(PyObject *format, if (--rescnt < 0) { rescnt = fmtcnt + 100; reslen += rescnt; - if (_PyUnicode_Resize(result, reslen) < 0) + if (_PyUnicode_Resize(&result, reslen) < 0) return NULL; res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; --rescnt; @@ -5069,7 +5139,7 @@ PyObject *PyUnicode_Format(PyObject *format, reslen -= rescnt; rescnt = width + fmtcnt + 100; reslen += rescnt; - if (_PyUnicode_Resize(result, reslen) < 0) + if (_PyUnicode_Resize(&result, reslen) < 0) return NULL; res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; @@ -5110,7 +5180,7 @@ PyObject *PyUnicode_Format(PyObject *format, *res++ = *pbuf++; } } - memcpy(res, pbuf, len * sizeof(Py_UNICODE)); + Py_UNICODE_COPY(res, pbuf, len); res += len; rescnt -= len; while (--width >= len) { @@ -5135,7 +5205,7 @@ PyObject *PyUnicode_Format(PyObject *format, Py_DECREF(args); } Py_DECREF(uformat); - if (_PyUnicode_Resize(result, reslen - rescnt)) + if (_PyUnicode_Resize(&result, reslen - rescnt)) goto onError; return (PyObject *)result; @@ -5184,6 +5254,8 @@ PyTypeObject PyUnicode_Type = { void _PyUnicode_Init(void) { + int i; + /* Doublecheck the configuration... */ if (sizeof(Py_UNICODE) != 2) Py_FatalError("Unicode configuration error: " @@ -5194,6 +5266,8 @@ void _PyUnicode_Init(void) unicode_freelist_size = 0; unicode_empty = _PyUnicode_New(0); strcpy(unicode_default_encoding, "ascii"); + for (i = 0; i < 256; i++) + unicode_latin1[i] = NULL; } /* Finalize the Unicode implementation */ @@ -5202,10 +5276,18 @@ void _PyUnicode_Fini(void) { PyUnicodeObject *u; + int i; Py_XDECREF(unicode_empty); unicode_empty = NULL; + for (i = 0; i < 256; i++) { + if (unicode_latin1[i]) { + Py_DECREF(unicode_latin1[i]); + unicode_latin1[i] = NULL; + } + } + for (u = unicode_freelist; u != NULL;) { PyUnicodeObject *v = u; u = *(PyUnicodeObject **)u; |