summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorWalter Dörwald <walter@livinglogic.de>2007-05-25 13:52:07 (GMT)
committerWalter Dörwald <walter@livinglogic.de>2007-05-25 13:52:07 (GMT)
commit1680713e524016d93a94114c4a874ad71a090b95 (patch)
treeef1f75a1a9748b50ab4a4e66d4c81662062546f7 /Objects
parent34a042d301d6ab88645046a6dfa6c38265ca4b39 (diff)
downloadcpython-1680713e524016d93a94114c4a874ad71a090b95.zip
cpython-1680713e524016d93a94114c4a874ad71a090b95.tar.gz
cpython-1680713e524016d93a94114c4a874ad71a090b95.tar.bz2
Add interning of unicode strings by copying the functionality from
stringobject.c. Intern "True" and "False" in bool_repr() again as it was in the 8bit string era.
Diffstat (limited to 'Objects')
-rw-r--r--Objects/boolobject.c4
-rw-r--r--Objects/unicodeobject.c139
2 files changed, 141 insertions, 2 deletions
diff --git a/Objects/boolobject.c b/Objects/boolobject.c
index 0a9f958..b0170f6 100644
--- a/Objects/boolobject.c
+++ b/Objects/boolobject.c
@@ -24,10 +24,10 @@ bool_repr(PyObject *self)
if (self == Py_True)
s = true_str ? true_str :
- (true_str = PyUnicode_FromString("True"));
+ (true_str = PyUnicode_InternFromString("True"));
else
s = false_str ? false_str :
- (false_str = PyUnicode_FromString("False"));
+ (false_str = PyUnicode_InternFromString("False"));
Py_XINCREF(s);
return s;
}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 999b166..854310b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -92,6 +92,16 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
extern "C" {
#endif
+/* This dictionary holds all interned unicode strings. Note that references
+ to strings in this dictionary are *not* counted in the string's ob_refcnt.
+ When the interned string reaches a refcnt of 0 the string deallocation
+ function will delete the reference from this dictionary.
+
+ Another way to look at this is that to say that the actual reference
+ count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
+*/
+static PyObject *interned;
+
/* Free list for Unicode objects */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;
@@ -276,6 +286,7 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
unicode->str[length] = 0;
unicode->length = length;
unicode->hash = -1;
+ unicode->state = 0;
unicode->defenc = NULL;
return unicode;
@@ -288,6 +299,25 @@ PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
static
void unicode_dealloc(register PyUnicodeObject *unicode)
{
+ switch (PyUnicode_CHECK_INTERNED(unicode)) {
+ case SSTATE_NOT_INTERNED:
+ break;
+
+ case SSTATE_INTERNED_MORTAL:
+ /* revive dead object temporarily for DelItem */
+ unicode->ob_refcnt = 3;
+ if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
+ Py_FatalError(
+ "deletion of interned unicode string failed");
+ break;
+
+ case SSTATE_INTERNED_IMMORTAL:
+ Py_FatalError("Immortal interned unicode string died.");
+
+ default:
+ Py_FatalError("Inconsistent interned unicode string state.");
+ }
+
if (PyUnicode_CheckExact(unicode) &&
unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
/* Keep-Alive optimization */
@@ -8564,6 +8594,115 @@ _PyUnicode_Fini(void)
unicode_freelist_size = 0;
}
+void
+PyUnicode_InternInPlace(PyObject **p)
+{
+ register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
+ PyObject *t;
+ if (s == NULL || !PyUnicode_Check(s))
+ Py_FatalError(
+ "PyUnicode_InternInPlace: unicode strings only please!");
+ /* If it's a subclass, we don't really know what putting
+ it in the interned dict might do. */
+ if (!PyUnicode_CheckExact(s))
+ return;
+ if (PyUnicode_CHECK_INTERNED(s))
+ return;
+ if (interned == NULL) {
+ interned = PyDict_New();
+ if (interned == NULL) {
+ PyErr_Clear(); /* Don't leave an exception */
+ return;
+ }
+ }
+ t = PyDict_GetItem(interned, (PyObject *)s);
+ if (t) {
+ Py_INCREF(t);
+ Py_DECREF(*p);
+ *p = t;
+ return;
+ }
+
+ if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
+ PyErr_Clear();
+ return;
+ }
+ /* The two references in interned are not counted by refcnt.
+ The deallocator will take care of this */
+ s->ob_refcnt -= 2;
+ PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
+}
+
+void
+PyUnicode_InternImmortal(PyObject **p)
+{
+ PyUnicode_InternInPlace(p);
+ if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
+ PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
+ Py_INCREF(*p);
+ }
+}
+
+PyObject *
+PyUnicode_InternFromString(const char *cp)
+{
+ PyObject *s = PyUnicode_FromString(cp);
+ if (s == NULL)
+ return NULL;
+ PyUnicode_InternInPlace(&s);
+ return s;
+}
+
+void _Py_ReleaseInternedUnicodeStrings(void)
+{
+ PyObject *keys;
+ PyUnicodeObject *s;
+ Py_ssize_t i, n;
+ Py_ssize_t immortal_size = 0, mortal_size = 0;
+
+ if (interned == NULL || !PyDict_Check(interned))
+ return;
+ keys = PyDict_Keys(interned);
+ if (keys == NULL || !PyList_Check(keys)) {
+ PyErr_Clear();
+ return;
+ }
+
+ /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
+ detector, interned unicode strings are not forcibly deallocated;
+ rather, we give them their stolen references back, and then clear
+ and DECREF the interned dict. */
+
+ n = PyList_GET_SIZE(keys);
+ fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
+ n);
+ for (i = 0; i < n; i++) {
+ s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
+ switch (s->state) {
+ case SSTATE_NOT_INTERNED:
+ /* XXX Shouldn't happen */
+ break;
+ case SSTATE_INTERNED_IMMORTAL:
+ s->ob_refcnt += 1;
+ immortal_size += s->length;
+ break;
+ case SSTATE_INTERNED_MORTAL:
+ s->ob_refcnt += 2;
+ mortal_size += s->length;
+ break;
+ default:
+ Py_FatalError("Inconsistent interned string state.");
+ }
+ s->state = SSTATE_NOT_INTERNED;
+ }
+ fprintf(stderr, "total size of all interned strings: "
+ "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
+ "mortal/immortal\n", mortal_size, immortal_size);
+ Py_DECREF(keys);
+ PyDict_Clear(interned);
+ Py_DECREF(interned);
+ interned = NULL;
+}
/********************* Unicode Iterator **************************/