11 files changed, 397 insertions, 2 deletions
diff --git a/Doc/library/gc.rst b/Doc/library/gc.rst
index 7c425e3..6929a3d 100644
--- a/Doc/library/gc.rst
+++ b/Doc/library/gc.rst
@@ -129,6 +129,31 @@ The :mod:`gc` module provides the following functions:
    from an argument, that integer object may or may not appear in the result list.
 
 
+.. function:: is_tracked(obj)
+
+   Returns True if the object is currently tracked by the garbage collector,
+   False otherwise.  As a general rule, instances of atomic types aren't
+   tracked and instances of non-atomic types (containers, user-defined
+   objects...) are.  However, some type-specific optimizations can be present
+   in order to suppress the garbage collector footprint of simple instances
+   (e.g. dicts containing only atomic keys and values)::
+
+      >>> gc.is_tracked(0)
+      False
+      >>> gc.is_tracked("a")
+      False
+      >>> gc.is_tracked([])
+      True
+      >>> gc.is_tracked({})
+      False
+      >>> gc.is_tracked({"a": 1})
+      False
+      >>> gc.is_tracked({"a": []})
+      True
+
+   .. versionadded:: 2.7
+
+
 The following variable is provided for read-only access (you can mutate its
 value but should not rebind it):
 
diff --git a/Include/dictobject.h b/Include/dictobject.h
index 3d9d5e1..d8c409e 100644
--- a/Include/dictobject.h
+++ b/Include/dictobject.h
@@ -125,6 +125,7 @@ PyAPI_FUNC(PyObject *) PyDict_Copy(PyObject *mp);
 PyAPI_FUNC(int) PyDict_Contains(PyObject *mp, PyObject *key);
 PyAPI_FUNC(int) _PyDict_Contains(PyObject *mp, PyObject *key, long hash);
 PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
+PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
 
 /* PyDict_Update(mp, other) is equivalent to PyDict_Merge(mp, other, 1). */
 PyAPI_FUNC(int) PyDict_Update(PyObject *mp, PyObject *other);
diff --git a/Include/objimpl.h b/Include/objimpl.h
index 20d9c24..5a27382 100644
--- a/Include/objimpl.h
+++ b/Include/objimpl.h
@@ -282,6 +282,17 @@ extern PyGC_Head *_PyGC_generation0;
 	g->gc.gc_next = NULL; \
     } while (0);
 
+/* True if the object is currently tracked by the GC. */
+#define _PyObject_GC_IS_TRACKED(o) \
+	((_Py_AS_GC(o))->gc.gc_refs != _PyGC_REFS_UNTRACKED)
+ 
+/* True if the object may be tracked by the GC in the future, or already is.
+   This can be useful to implement some optimizations. */
+#define _PyObject_GC_MAY_BE_TRACKED(obj) \
+	(PyObject_IS_GC(obj) && \
+		(!PyTuple_CheckExact(obj) || _PyObject_GC_IS_TRACKED(obj)))
+
+
 PyAPI_FUNC(PyObject *) _PyObject_GC_Malloc(size_t);
 PyAPI_FUNC(PyObject *) _PyObject_GC_New(PyTypeObject *);
 PyAPI_FUNC(PyVarObject *) _PyObject_GC_NewVar(PyTypeObject *, Py_ssize_t);
diff --git a/Include/tupleobject.h b/Include/tupleobject.h
index 7a887d1..19fe7a5 100644
--- a/Include/tupleobject.h
+++ b/Include/tupleobject.h
@@ -45,6 +45,7 @@ PyAPI_FUNC(int) PyTuple_SetItem(PyObject *, Py_ssize_t, PyObject *);
 PyAPI_FUNC(PyObject *) PyTuple_GetSlice(PyObject *, Py_ssize_t, Py_ssize_t);
 PyAPI_FUNC(int) _PyTuple_Resize(PyObject **, Py_ssize_t);
 PyAPI_FUNC(PyObject *) PyTuple_Pack(Py_ssize_t, ...);
+PyAPI_FUNC(void) _PyTuple_MaybeUntrack(PyObject *);
 
 /* Macro, trading safety for speed */
 #define PyTuple_GET_ITEM(op, i) (((PyTupleObject *)(op))->ob_item[i])
diff --git a/Lib/test/test_dict.py b/Lib/test/test_dict.py
index 308143d..1c9bca8 100644
--- a/Lib/test/test_dict.py
+++ b/Lib/test/test_dict.py
@@ -665,6 +665,104 @@ class DictTest(unittest.TestCase):
             gc.collect()
             self.assert_(ref() is None, "Cycle was not collected")
 
+    def _not_tracked(self, t):
+        # Nested containers can take several collections to untrack
+        gc.collect()
+        gc.collect()
+        self.assertFalse(gc.is_tracked(t), t)
+
+    def _tracked(self, t):
+        self.assertTrue(gc.is_tracked(t), t)
+        gc.collect()
+        gc.collect()
+        self.assertTrue(gc.is_tracked(t), t)
+
+    def test_track_literals(self):
+        # Test GC-optimization of dict literals
+        x, y, z, w = 1.5, "a", (1, None), []
+
+        self._not_tracked({})
+        self._not_tracked({x:(), y:x, z:1})
+        self._not_tracked({1: "a", "b": 2})
+        self._not_tracked({1: 2, (None, True, False, ()): int})
+        self._not_tracked({1: object()})
+
+        # Dicts with mutable elements are always tracked, even if those
+        # elements are not tracked right now.
+        self._tracked({1: []})
+        self._tracked({1: ([],)})
+        self._tracked({1: {}})
+        self._tracked({1: set()})
+
+    def test_track_dynamic(self):
+        # Test GC-optimization of dynamically-created dicts
+        class MyObject(object):
+            pass
+        x, y, z, w, o = 1.5, "a", (1, object()), [], MyObject()
+
+        d = dict()
+        self._not_tracked(d)
+        d[1] = "a"
+        self._not_tracked(d)
+        d[y] = 2
+        self._not_tracked(d)
+        d[z] = 3
+        self._not_tracked(d)
+        self._not_tracked(d.copy())
+        d[4] = w
+        self._tracked(d)
+        self._tracked(d.copy())
+        d[4] = None
+        self._not_tracked(d)
+        self._not_tracked(d.copy())
+
+        # dd isn't tracked right now, but it may mutate and therefore d
+        # which contains it must be tracked.
+        d = dict()
+        dd = dict()
+        d[1] = dd
+        self._not_tracked(dd)
+        self._tracked(d)
+        dd[1] = d
+        self._tracked(dd)
+
+        d = dict.fromkeys([x, y, z])
+        self._not_tracked(d)
+        dd = dict()
+        dd.update(d)
+        self._not_tracked(dd)
+        d = dict.fromkeys([x, y, z, o])
+        self._tracked(d)
+        dd = dict()
+        dd.update(d)
+        self._tracked(dd)
+
+        d = dict(x=x, y=y, z=z)
+        self._not_tracked(d)
+        d = dict(x=x, y=y, z=z, w=w)
+        self._tracked(d)
+        d = dict()
+        d.update(x=x, y=y, z=z)
+        self._not_tracked(d)
+        d.update(w=w)
+        self._tracked(d)
+
+        d = dict([(x, y), (z, 1)])
+        self._not_tracked(d)
+        d = dict([(x, y), (z, w)])
+        self._tracked(d)
+        d = dict()
+        d.update([(x, y), (z, 1)])
+        self._not_tracked(d)
+        d.update([(x, y), (z, w)])
+        self._tracked(d)
+
+    def test_track_subtypes(self):
+        # Dict subtypes are always tracked
+        class MyDict(dict):
+            pass
+        self._tracked(MyDict())
+
 
 from test import mapping_tests
 
diff --git a/Lib/test/test_gc.py b/Lib/test/test_gc.py
index 414e17a..2262b36 100644
--- a/Lib/test/test_gc.py
+++ b/Lib/test/test_gc.py
@@ -415,6 +415,33 @@ class GCTests(unittest.TestCase):
 
         self.assertEqual(gc.get_referents(1, 'a', 4j), [])
 
+    def test_is_tracked(self):
+        # Atomic built-in types are not tracked, user-defined objects and
+        # mutable containers are.
+        # NOTE: types with special optimizations (e.g. tuple) have tests
+        # in their own test files instead.
+        self.assertFalse(gc.is_tracked(None))
+        self.assertFalse(gc.is_tracked(1))
+        self.assertFalse(gc.is_tracked(1.0))
+        self.assertFalse(gc.is_tracked(1.0 + 5.0j))
+        self.assertFalse(gc.is_tracked(True))
+        self.assertFalse(gc.is_tracked(False))
+        self.assertFalse(gc.is_tracked(b"a"))
+        self.assertFalse(gc.is_tracked("a"))
+        self.assertFalse(gc.is_tracked(bytearray(b"a")))
+        self.assertFalse(gc.is_tracked(type))
+        self.assertFalse(gc.is_tracked(int))
+        self.assertFalse(gc.is_tracked(object))
+        self.assertFalse(gc.is_tracked(object()))
+
+        class UserClass:
+            pass
+        self.assertTrue(gc.is_tracked(gc))
+        self.assertTrue(gc.is_tracked(UserClass))
+        self.assertTrue(gc.is_tracked(UserClass()))
+        self.assertTrue(gc.is_tracked([]))
+        self.assertTrue(gc.is_tracked(set()))
+
     def test_bug1055820b(self):
         # Corresponds to temp2b.py in the bug report.
 
diff --git a/Lib/test/test_tuple.py b/Lib/test/test_tuple.py
index de08131..f82af31 100644
--- a/Lib/test/test_tuple.py
+++ b/Lib/test/test_tuple.py
@@ -1,5 +1,7 @@
 from test import support, seq_tests
 
+import gc
+
 class TupleTest(seq_tests.CommonTest):
     type2test = tuple
 
@@ -82,6 +84,69 @@ class TupleTest(seq_tests.CommonTest):
         self.assertEqual(repr(a0), "()")
         self.assertEqual(repr(a2), "(0, 1, 2)")
 
+    def _not_tracked(self, t):
+        # Nested tuples can take several collections to untrack
+        gc.collect()
+        gc.collect()
+        self.assertFalse(gc.is_tracked(t), t)
+
+    def _tracked(self, t):
+        self.assertTrue(gc.is_tracked(t), t)
+        gc.collect()
+        gc.collect()
+        self.assertTrue(gc.is_tracked(t), t)
+
+    def test_track_literals(self):
+        # Test GC-optimization of tuple literals
+        x, y, z = 1.5, "a", []
+
+        self._not_tracked(())
+        self._not_tracked((1,))
+        self._not_tracked((1, 2))
+        self._not_tracked((1, 2, "a"))
+        self._not_tracked((1, 2, (None, True, False, ()), int))
+        self._not_tracked((object(),))
+        self._not_tracked(((1, x), y, (2, 3)))
+
+        # Tuples with mutable elements are always tracked, even if those
+        # elements are not tracked right now.
+        self._tracked(([],))
+        self._tracked(([1],))
+        self._tracked(({},))
+        self._tracked((set(),))
+        self._tracked((x, y, z))
+
+    def check_track_dynamic(self, tp, always_track):
+        x, y, z = 1.5, "a", []
+
+        check = self._tracked if always_track else self._not_tracked
+        check(tp())
+        check(tp([]))
+        check(tp(set()))
+        check(tp([1, x, y]))
+        check(tp(obj for obj in [1, x, y]))
+        check(tp(set([1, x, y])))
+        check(tp(tuple([obj]) for obj in [1, x, y]))
+        check(tuple(tp([obj]) for obj in [1, x, y]))
+
+        self._tracked(tp([z]))
+        self._tracked(tp([[x, y]]))
+        self._tracked(tp([{x: y}]))
+        self._tracked(tp(obj for obj in [x, y, z]))
+        self._tracked(tp(tuple([obj]) for obj in [x, y, z]))
+        self._tracked(tuple(tp([obj]) for obj in [x, y, z]))
+
+    def test_track_dynamic(self):
+        # Test GC-optimization of dynamically constructed tuples.
+        self.check_track_dynamic(tuple, False)
+
+    def test_track_subtypes(self):
+        # Tuple subtypes must always be tracked
+        class MyTuple(tuple):
+            pass
+        self.check_track_dynamic(MyTuple, True)
+
+
 def test_main():
     support.run_unittest(TupleTest)
 
diff --git a/Misc/NEWS b/Misc/NEWS
index 8c752a7..c690dea 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,11 @@ What's New in Python 3.1 alpha 2?
 Core and Builtins
 -----------------
 
+- Issue #4688: Add a heuristic so that tuples and dicts containing only
+  untrackable objects are not tracked by the garbage collector. This can
+  reduce the size of collections and therefore the garbage collection overhead
+  on long-running programs, depending on their particular use of datatypes.
+
 - Issue #5512: Rewrite PyLong long division algorithm (x_divrem) to
   improve its performance.  Long divisions and remainder operations
   are now between 50% and 150% faster.
diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c
index 2474721..d4a0900 100644
--- a/Modules/gcmodule.c
+++ b/Modules/gcmodule.c
@@ -433,7 +433,13 @@ move_unreachable(PyGC_Head *young, PyGC_Head *unreachable)
                         (void) traverse(op,
                                         (visitproc)visit_reachable,
                                         (void *)young);
-                        next = gc->gc.gc_next;
+			next = gc->gc.gc_next;
+			if (PyTuple_CheckExact(op)) {
+				_PyTuple_MaybeUntrack(op);
+			}
+			else if (PyDict_CheckExact(op)) {
+				_PyDict_MaybeUntrack(op);
+			}
 		}
 		else {
 			/* This *may* be unreachable.  To make progress,
@@ -1229,6 +1235,26 @@ gc_get_objects(PyObject *self, PyObject *noargs)
 	return result;
 }
 
+PyDoc_STRVAR(gc_is_tracked__doc__,
+"is_tracked(obj) -> bool\n"
+"\n"
+"Returns true if the object is tracked by the garbage collector.\n"
+"Simple atomic objects will return false.\n"
+);
+
+static PyObject *
+gc_is_tracked(PyObject *self, PyObject *obj)
+{
+	PyObject *result;
+	
+	if (PyObject_IS_GC(obj) && IS_TRACKED(obj))
+		result = Py_True;
+	else
+		result = Py_False;
+	Py_INCREF(result);
+	return result;
+}
+
 
 PyDoc_STRVAR(gc__doc__,
 "This module provides access to the garbage collector for reference cycles.\n"
@@ -1243,6 +1269,7 @@ PyDoc_STRVAR(gc__doc__,
 "set_threshold() -- Set the collection thresholds.\n"
 "get_threshold() -- Return the current the collection thresholds.\n"
 "get_objects() -- Return a list of all objects tracked by the collector.\n"
+"is_tracked() -- Returns true if a given object is tracked.\n"
 "get_referrers() -- Return the list of objects that refer to an object.\n"
 "get_referents() -- Return the list of objects that an object refers to.\n");
 
@@ -1258,6 +1285,7 @@ static PyMethodDef GcMethods[] = {
 	{"collect",	   (PyCFunction)gc_collect,
          	METH_VARARGS | METH_KEYWORDS,           gc_collect__doc__},
 	{"get_objects",    gc_get_objects,METH_NOARGS,  gc_get_objects__doc__},
+	{"is_tracked",     gc_is_tracked, METH_O,       gc_is_tracked__doc__},
 	{"get_referrers",  gc_get_referrers, METH_VARARGS,
 		gc_get_referrers__doc__},
 	{"get_referents",  gc_get_referents, METH_VARARGS,
diff --git a/Objects/dictobject.c b/Objects/dictobject.c
index e235993..e005f8e 100644
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@@ -181,6 +181,24 @@ show_alloc(void)
 }
 #endif
 
+/* Debug statistic to count GC tracking of dicts */
+#ifdef SHOW_TRACK_COUNT
+static Py_ssize_t count_untracked = 0;
+static Py_ssize_t count_tracked = 0;
+
+static void
+show_track(void)
+{
+	fprintf(stderr, "Dicts created: %" PY_FORMAT_SIZE_T "d\n",
+		count_tracked + count_untracked);
+	fprintf(stderr, "Dicts tracked by the GC: %" PY_FORMAT_SIZE_T
+		"d\n", count_tracked);
+	fprintf(stderr, "%.2f%% dict tracking rate\n\n",
+		(100.0*count_tracked/(count_untracked+count_tracked)));
+}
+#endif
+
+
 /* Initialization macros.
    There are two ways to create a dict:  PyDict_New() is the main C API
    function, and the tp_new slot maps to dict_new().  In the latter case we
@@ -234,6 +252,9 @@ PyDict_New(void)
 #ifdef SHOW_ALLOC_COUNT
 		Py_AtExit(show_alloc);
 #endif
+#ifdef SHOW_TRACK_COUNT
+		Py_AtExit(show_track);
+#endif
 	}
 	if (numfree) {
 		mp = free_list[--numfree];
@@ -263,10 +284,12 @@ PyDict_New(void)
 #endif
 	}
 	mp->ma_lookup = lookdict_unicode;
+#ifdef SHOW_TRACK_COUNT
+	count_untracked++;
+#endif
 #ifdef SHOW_CONVERSION_COUNTS
 	++created;
 #endif
-	_PyObject_GC_TRACK(mp);
 	return (PyObject *)mp;
 }
 
@@ -435,6 +458,52 @@ lookdict_unicode(PyDictObject *mp, PyObject *key, register long hash)
 	return 0;
 }
 
+#ifdef SHOW_TRACK_COUNT
+#define INCREASE_TRACK_COUNT \
+	(count_tracked++, count_untracked--);
+#define DECREASE_TRACK_COUNT \
+	(count_tracked--, count_untracked++);
+#else
+#define INCREASE_TRACK_COUNT
+#define DECREASE_TRACK_COUNT
+#endif
+
+#define MAINTAIN_TRACKING(mp, key, value) \
+	do { \
+		if (!_PyObject_GC_IS_TRACKED(mp)) { \
+			if (_PyObject_GC_MAY_BE_TRACKED(key) || \
+				_PyObject_GC_MAY_BE_TRACKED(value)) { \
+				_PyObject_GC_TRACK(mp); \
+				INCREASE_TRACK_COUNT \
+			} \
+		} \
+	} while(0)
+
+void
+_PyDict_MaybeUntrack(PyObject *op)
+{
+	PyDictObject *mp;
+	PyObject *value;
+	Py_ssize_t mask, i;
+	PyDictEntry *ep;
+
+	if (!PyDict_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
+		return;
+	
+	mp = (PyDictObject *) op;
+	ep = mp->ma_table;
+	mask = mp->ma_mask;
+	for (i = 0; i <= mask; i++) {
+		if ((value = ep[i].me_value) == NULL)
+			continue;
+		if (_PyObject_GC_MAY_BE_TRACKED(value) ||
+			_PyObject_GC_MAY_BE_TRACKED(ep[i].me_key))
+			return;
+	}
+	_PyObject_GC_UNTRACK(op);
+}
+
+
 /*
 Internal routine to insert a new item into the table.
 Used both by the internal resize routine and by the public insert routine.
@@ -455,6 +524,7 @@ insertdict(register PyDictObject *mp, PyObject *key, long hash, PyObject *value)
 		Py_DECREF(value);
 		return -1;
 	}
+	MAINTAIN_TRACKING(mp, key, value);
 	if (ep->me_value != NULL) {
 		old_value = ep->me_value;
 		ep->me_value = value;
@@ -494,6 +564,7 @@ insertdict_clean(register PyDictObject *mp, PyObject *key, long hash,
 	PyDictEntry *ep0 = mp->ma_table;
 	register PyDictEntry *ep;
 
+	MAINTAIN_TRACKING(mp, key, value);
 	i = hash & mask;
 	ep = &ep0[i];
 	for (perturb = hash; ep->me_key != NULL; perturb >>= PERTURB_SHIFT) {
@@ -1993,9 +2064,18 @@ dict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 		assert(d->ma_table == NULL && d->ma_fill == 0 && d->ma_used == 0);
 		INIT_NONZERO_DICT_SLOTS(d);
 		d->ma_lookup = lookdict_unicode;
+		/* The object has been implicitely tracked by tp_alloc */
+		if (type == &PyDict_Type)
+			_PyObject_GC_UNTRACK(d);
 #ifdef SHOW_CONVERSION_COUNTS
 		++created;
 #endif
+#ifdef SHOW_TRACK_COUNT
+		if (_PyObject_GC_IS_TRACKED(d))
+			count_tracked++;
+		else
+			count_untracked++;
+#endif
 	}
 	return self;
 }
diff --git a/Objects/tupleobject.c b/Objects/tupleobject.c
index 8828a2d..11be0e1 100644
--- a/Objects/tupleobject.c
+++ b/Objects/tupleobject.c
@@ -23,11 +23,36 @@ Py_ssize_t fast_tuple_allocs;
 Py_ssize_t tuple_zero_allocs;
 #endif
 
+/* Debug statistic to count GC tracking of tuples.
+   Please note that tuples are only untracked when considered by the GC, and
+   many of them will be dead before. Therefore, a tracking rate close to 100%
+   does not necessarily prove that the heuristic is inefficient.
+*/
+#ifdef SHOW_TRACK_COUNT
+static Py_ssize_t count_untracked = 0;
+static Py_ssize_t count_tracked = 0;
+
+static void
+show_track(void)
+{
+	fprintf(stderr, "Tuples created: %" PY_FORMAT_SIZE_T "d\n",
+		count_tracked + count_untracked);
+	fprintf(stderr, "Tuples tracked by the GC: %" PY_FORMAT_SIZE_T
+		"d\n", count_tracked);
+	fprintf(stderr, "%.2f%% tuple tracking rate\n\n",
+		(100.0*count_tracked/(count_untracked+count_tracked)));
+}
+#endif
+
+
 PyObject *
 PyTuple_New(register Py_ssize_t size)
 {
 	register PyTupleObject *op;
 	Py_ssize_t i;
+#ifdef SHOW_TRACK_COUNT
+	count_tracked++;
+#endif
 	if (size < 0) {
 		PyErr_BadInternalCall();
 		return NULL;
@@ -131,6 +156,32 @@ PyTuple_SetItem(register PyObject *op, register Py_ssize_t i, PyObject *newitem)
 	return 0;
 }
 
+void
+_PyTuple_MaybeUntrack(PyObject *op)
+{
+	PyTupleObject *t;
+	Py_ssize_t i, n;
+	
+	if (!PyTuple_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
+		return;
+	t = (PyTupleObject *) op;
+	n = Py_SIZE(t);
+	for (i = 0; i < n; i++) {
+		PyObject *elt = PyTuple_GET_ITEM(t, i);
+		/* Tuple with NULL elements aren't
+		   fully constructed, don't untrack
+		   them yet. */
+		if (!elt ||
+			_PyObject_GC_MAY_BE_TRACKED(elt))
+			return;
+	}
+#ifdef SHOW_TRACK_COUNT
+	count_tracked--;
+	count_untracked++;
+#endif
+	_PyObject_GC_UNTRACK(op);
+}
+
 PyObject *
 PyTuple_Pack(Py_ssize_t n, ...)
 {
@@ -855,6 +906,9 @@ PyTuple_Fini(void)
 
 	(void)PyTuple_ClearFreeList();
 #endif
+#ifdef SHOW_TRACK_COUNT
+	show_track();
+#endif
 }
 
 /*********************** Tuple Iterator **************************/