From 4c5ecb7cbbb7b20e8c643addc092edf72e753e16 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Fri, 9 Jan 2009 21:40:55 +0000
Subject: =?UTF-8?q?Issue=20#4074:=20Change=20the=20criteria=20for=20doing?=
 =?UTF-8?q?=20a=20full=20garbage=20collection=20(i.e.=20collecting=20the?=
 =?UTF-8?q?=20oldest=20generation)=20so=20that=20allocating=20lots=20of=20?=
 =?UTF-8?q?objects=20without=20destroying=20them=20does=20not=20show=20qua?=
 =?UTF-8?q?dratic=20performance.=20Based=20on=20a=20proposal=20by=20Martin?=
 =?UTF-8?q?=20von=20L=C3=B6wis=20at=20http://mail.python.org/pipermail/pyt?=
 =?UTF-8?q?hon-dev/2008-June/080579.html.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Misc/NEWS          |  6 +++++
 Modules/gcmodule.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/Misc/NEWS b/Misc/NEWS
index 7a903d2..fdb73d3 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,12 @@ What's New in Python 2.7 alpha 1
 Core and Builtins
 -----------------
 
+- Issue #4074: Change the criteria for doing a full garbage collection (i.e.
+  collecting the oldest generation) so that allocating lots of objects without
+  destroying them does not show quadratic performance. Based on a proposal by
+  Martin von Löwis at
+  http://mail.python.org/pipermail/python-dev/2008-June/080579.html.
+
 - Issue #4850: Change COUNT_ALLOCS variables to Py_ssize_t.
 
 - Issue #1180193: When importing a module from a .pyc (or .pyo) file with
diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c
index c7426a5..9b47819 100644
--- a/Modules/gcmodule.c
+++ b/Modules/gcmodule.c
@@ -63,6 +63,55 @@ static PyObject *gc_str = NULL;
 /* Python string used to look for __del__ attribute. */
 static PyObject *delstr = NULL;
 
+/* This is the number of objects who survived the last full collection. It
+   approximates the number of long lived objects tracked by the GC.
+
+   (by "full collection", we mean a collection of the oldest generation).
+*/
+static Py_ssize_t long_lived_total = 0;
+
+/* This is the number of objects who survived all "non-full" collections,
+   and are awaiting to undergo a full collection for the first time.
+
+*/
+static Py_ssize_t long_lived_pending = 0;
+
+/*
+   NOTE: about the counting of long-lived objects.
+
+   To limit the cost of garbage collection, there are two strategies;
+     - make each collection faster, e.g. by scanning fewer objects
+     - do less collections
+   This heuristic is about the latter strategy.
+
+   In addition to the various configurable thresholds, we only trigger a
+   full collection if the ratio
+        long_lived_pending / long_lived_total
+   is above a given value (hardwired to 25%).
+
+   The reason is that, while "non-full" collections (i.e., collections of
+   the young and middle generations) will always examine roughly the same
+   number of objects -- determined by the aforementioned thresholds --,
+   the cost of a full collection is proportional to the total number of
+   long-lived objects, which is virtually unbounded.
+
+   Indeed, it has been remarked that doing a full collection every
+   <constant number> of object creations entails a dramatic performance
+   degradation in workloads which consist in creating and storing lots of
+   long-lived objects (e.g. building a large list of GC-tracked objects would
+   show quadratic performance, instead of linear as expected: see issue #4074).
+
+   Using the above ratio, instead, yields amortized linear performance in
+   the total number of objects (the effect of which can be summarized
+   thusly: "each full garbage collection is more and more costly as the
+   number of objects grows, but we do fewer and fewer of them").
+
+   This heuristic was suggested by Martin von Löwis on python-dev in
+   June 2008. His original analysis and proposal can be found at:
+   http://mail.python.org/pipermail/python-dev/2008-June/080579.html
+*/
+
+
 /* set for debugging information */
 #define DEBUG_STATS             (1<<0) /* print collection statistics */
 #define DEBUG_COLLECTABLE       (1<<1) /* print collectable objects */
@@ -826,8 +875,16 @@ collect(int generation)
         move_unreachable(young, &unreachable);
 
         /* Move reachable objects to next generation. */
-        if (young != old)
+        if (young != old) {
+                if (generation == NUM_GENERATIONS - 2) {
+                        long_lived_pending += gc_list_size(young);
+                }
                 gc_list_merge(young, old);
+        }
+        else {
+                long_lived_pending = 0;
+                long_lived_total = gc_list_size(young);
+        }
 
         /* All objects in unreachable are trash, but objects reachable from
          * finalizers can't safely be deleted. Python programmers should take
@@ -921,6 +978,13 @@ collect_generations(void)
          * generations younger than it will be collected. */
         for (i = NUM_GENERATIONS-1; i >= 0; i--) {
                 if (generations[i].count > generations[i].threshold) {
+                        /* Avoid quadratic performance degradation in number
+                           of tracked objects. See comments at the beginning
+                           of this file, and issue #4074.
+                        */
+                        if (i == NUM_GENERATIONS - 1
+                            && long_lived_pending < long_lived_total / 4)
+                                continue;
                         n = collect(i);
                         break;
                 }
--
cgit v0.12
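
For illustration, a minimal Python sketch of the workload the NEWS entry and the
comment above describe -- building ever-larger lists of GC-tracked objects with
the collector left enabled. Before this change the build time grew roughly
quadratically in the number of objects, because each full collection re-scanned
every surviving long-lived object; with the 25% ratio check it grows roughly
linearly. The list sizes and the use of time.time() below are arbitrary choices
for demonstration, not taken from the issue or the commit.

    import gc
    import time

    def build(n):
        # Each empty inner list is a GC-tracked container; keeping them all
        # alive in the outer list makes them "long lived" from the
        # collector's point of view.
        return [[] for _ in range(n)]

    if __name__ == "__main__":
        for n in (100000, 200000, 400000, 800000):
            gc.collect()          # start each run from a clean oldest generation
            start = time.time()
            data = build(n)
            elapsed = time.time() - start
            # With the 25% heuristic, elapsed should scale roughly linearly
            # with n rather than quadratically.
            print("n=%7d  built in %.3f s  gc counts=%s"
                  % (n, elapsed, gc.get_count()))
            del data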