summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaymond Hettinger <python@rcn.com>2009-01-12 22:58:41 (GMT)
committerRaymond Hettinger <python@rcn.com>2009-01-12 22:58:41 (GMT)
commitf94d7fa5fb90df0163cffca2864885a7da49d4f6 (patch)
tree0645074534eff18800e28fd723c18d94a355ce05
parent99234c5c7466845316ca5e33d86226a1d1fe227b (diff)
downloadcpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.zip
cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.tar.gz
cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.tar.bz2
Issue 1696199: Add collections.Counter().
-rw-r--r--Doc/library/collections.rst128
-rw-r--r--Lib/collections.py201
-rw-r--r--Lib/test/test_collections.py100
-rw-r--r--Misc/NEWS3
4 files changed, 430 insertions, 2 deletions
diff --git a/Doc/library/collections.rst b/Doc/library/collections.rst
index 2725d68..4aced1e 100644
--- a/Doc/library/collections.rst
+++ b/Doc/library/collections.rst
@@ -152,6 +152,134 @@ Notes on using :class:`Set` and :class:`MutableSet` as a mixin:
(For more about ABCs, see the :mod:`abc` module and :pep:`3119`.)
+.. _counter-objects:
+
+:class:`Counter` objects
+------------------------
+
+A counter tool is provided to support convenient and rapid tallies.
+For example::
+
+ # Tally repeated words in a list
+ >>> words = ['red', 'blue', 'red', 'green', 'blue', blue']
+ >>> cnt = Counter()
+ >>> for word in words:
+ ... cnt[word] += 1
+ >>> cnt
+ Counter(items=[('blue', 3), ('red', 2), ('green', 1)])
+
+ # Find the ten most common words in Hamlet
+ >>> import re
+ >>> words = re.findall('\w+', open('hamlet.txt').read().lower())
+ >>> Counter(hamlet_words).most_common(10)
+ [('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i', 631),
+ ('you', 554), ('a', 546), ('my', 514), ('hamlet', 471), ('in', 451)]
+
+.. class:: Counter([iterable[, items]])
+
+ A :class:`Counter` is a :class:`dict` subclass for counting hashable items.
+ Elements are stored as dictionary keys and their counts are stored as
+ dictionary values. Counts are allowed to be any integer value including
+ zero or negative counts. The :class:`Counter` class is similar to bags
+ or multisets in other languages.
+
+ Elements are counted from the *iterable* if given. Also, the counts
+ can be initialized from an *items* list of *(element, count)* pairs.
+ If provided, *items* must be a keyword argument::
+
+ >>> c = Counter() # a new, empty counter
+ >>> c = Counter('gallahad') # a new counter from an iterable
+ >>> c = Counter(items=[('a', 4), ('b', 2)]) # a new counter from an items list
+
+ The returned object has a dictionary style interface except that it returns
+ a zero count for missing items (instead of raising a :exc:`KeyError` like a
+ dictionary would)::
+
+ >>> c = Counter(['if', 'your', 'peril', 'be'])
+ >>> c['questions'] # count of a missing element is zero
+ 0
+
+ Assigning a count of zero or reducing the count to zero leaves the
+ element in the dictionary. Use ``del`` to remove the entry entirely:
+
+ >>> c = Counter(['arthur', 'gwain'])
+ >>> c['arthur'] = 0 # set the count of "arthur" to zero
+ >>> 'arthur' in c # but "arthur" is still in the counter
+ True
+ >>> del c['arthur'] # del will completely remove the entry
+ >>> 'arthur' in c
+ False
+
+ .. versionadded:: 2.7
+
+
+ Counter objects support two methods beyond those available for all
+ dictionaries:
+
+ .. method:: elements()
+
+ Return an iterator over elements repeating each as many times as its count.
+ Elements are returned in arbitrary order. If an element's count has been
+ set to zero or a negative number, :meth:`elements` will ignore it.
+
+ >>> c = Counter(items=[('a', 4), ('b', 2), ('d', 0), ('e', -2)])
+ >>> list(c.elements())
+ ['a', 'a', 'a', 'a', 'b', 'b']
+
+ .. method:: most_common([n])
+
+ Return a list of the *n* most common elements and their counts from
+ the most common to the least. If *n* is not specified or is ``None``,
+ return a list of all element counts in decreasing order of frequency.
+ Elements with equal counts are ordered arbitrarily::
+
+ >>> Counter('abracadabra').most_common(3)
+ [('a', 5), ('r', 2), ('b', 2)]
+
+ The usual dictionary methods are available for :class:`Counter` objects.
+ All of those work the same as they do for dictionaries except for two
+ which work differently for counters.
+
+ .. method:: fromkeys(iterable)
+
+ There is no equivalent class method for :class:`Counter` objects.
+ Raises a :exc:`NotImplementedError` when called.
+
+ .. method:: update(mapping)
+
+ Like :meth:`dict.update` but adds-in counts instead of replacing them.
+ Used for combining two independent counts. Accepts a *mapping* object
+ which can be another counter or can be a :class:`dict` that maps
+ elements to element counts::
+
+ >>> c = Counter('which') # count letters in a word
+ >>> d = Counter('witch') # count letters in another word
+ >>> c.update(d) # add counts from d to those in c
+ >>> c['h'] # count of 'h' is now three
+ 3
+ >>> c.update(Counter('watch')) # add in letters from another word
+ >>> c['h'] # count of 'h' is now four
+ 4
+
+
+.. seealso::
+
+ `Multisets <http://en.wikipedia.org/wiki/Multiset>`_
+ in the Wikipedia
+
+ `Bag <http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html>`_
+ a Smalltalk class
+
+ `C++ multisets <http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm>`_
+ a tutorial with standalone examples
+
+ `Bag class <http://code.activestate.com/recipes/259174/>`_
+ an early Python recipe
+
+ Use cases for multisets and mathematical operations on multisets.
+ Knuth, Donald. The Art of Computer Programming Volume II,
+ Section 4.6.3, Exercise 19.
+
.. _deque-objects:
diff --git a/Lib/collections.py b/Lib/collections.py
index ace2b2a..ff49844 100644
--- a/Lib/collections.py
+++ b/Lib/collections.py
@@ -9,6 +9,11 @@ from _collections import deque, defaultdict
from operator import itemgetter as _itemgetter
from keyword import iskeyword as _iskeyword
import sys as _sys
+import heapq as _heapq
+import itertools as _itertools
+
+########################################################################
+### namedtuple #######################################################
def namedtuple(typename, field_names, verbose=False):
"""Returns a new subclass of tuple with named fields.
@@ -108,7 +113,160 @@ def namedtuple(typename, field_names, verbose=False):
return result
+########################################################################
+### Counter ##########################################################
+
+class Counter(dict):
+ '''Dict subclass for counting hashable items. Sometimes called a bag
+ or multiset. Elements are stored as dictionary keys and their counts
+ are stored as dictionary values.
+
+ >>> c = Counter('abracadabra') # count elements from a string
+
+ >>> c.most_common(3) # three most common elements
+ [('a', 5), ('r', 2), ('b', 2)]
+ >>> sorted(c) # list all unique elements
+ ['a', 'b', 'c', 'd', 'r']
+ >>> ''.join(sorted(c.elements())) # list elements with repetitions
+ 'aaaaabbcdrr'
+ >>> sum(c.values()) # total of all counts
+ 11
+
+ >>> c['a'] # count of letter 'a'
+ 5
+ >>> for elem in 'shazam': # update counts from an iterable
+ ... c[elem] += 1 # by adding 1 to each element's count
+ >>> c['a'] # now there are seven 'a'
+ 7
+ >>> del c['r'] # remove all 'r'
+ >>> c['r'] # now there are zero 'r'
+ 0
+
+ >>> d = Counter('simsalabim') # make another counter
+ >>> c.update(d) # add in the second counter
+ >>> c['a'] # now there are nine 'a'
+ 9
+
+ >>> c.clear() # empty the counter
+ >>> c
+ Counter()
+
+ Note: If a count is set to zero or reduced to zero, it will remain
+ in the counter until the entry is deleted or the counter is cleared:
+
+ >>> c = Counter('aaabbc')
+ >>> c['b'] -= 2 # reduce the count of 'b' by two
+ >>> c.most_common() # 'b' is still in, but its count is zero
+ [('a', 3), ('c', 1), ('b', 0)]
+
+ '''
+ # References:
+ # http://en.wikipedia.org/wiki/Multiset
+ # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
+ # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
+ # http://code.activestate.com/recipes/259174/
+ # Knuth, TAOCP Vol. II section 4.6.3
+
+ def __init__(self, iterable=None, items=None):
+ '''Create a new, empty Counter object. And if given, count elements
+ from an input iterable. Or, initialize the count from an items list
+ of (element, count) pairs.
+
+ >>> c = Counter('hocus pocus') # count elements in an iterable
+ >>> c = Counter(items=[('a', 4), ('b', 2)]) # take counts from an items list
+
+ '''
+ if iterable is not None:
+ for elem in iterable:
+ self[elem] += 1
+ if items is not None:
+ for elem, count in items:
+ self[elem] += count
+
+ def __missing__(self, key):
+ 'The count of elements not in the Counter is zero.'
+ # Needed so that self[missing_item] does not raise KeyError
+ return 0
+
+ def most_common(self, n=None):
+ '''List the n most common elements and their counts from the most
+ common to the least. If n is None, then list all element counts.
+
+ >>> Counter('abracadabra').most_common(3)
+ [('a', 5), ('r', 2), ('b', 2)]
+
+ '''
+ # Emulate Bag.sortedByCount from Smalltalk
+ if n is None:
+ return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
+ return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
+ def elements(self):
+ '''Iterator over elements repeating each as many times as its count.
+
+ >>> c = Counter('ABCABC')
+ >>> sorted(c.elements())
+ ['A', 'A', 'B', 'B', 'C', 'C']
+
+ # Knuth's example of prime factors of 1836: 2**2 * 3**3 * 17**1
+ >>> import operator
+ >>> prime_factors = Counter(items=[(2,2), (3,3), (17,1)])
+ >>> sorted(prime_factors.elements()) # list individual factors
+ [2, 2, 3, 3, 3, 17]
+ >>> reduce(operator.mul, prime_factors.elements(), 1) # multiply them
+ 1836
+
+ Note, if an element's count has been set to zero or a negative number,
+ elements() will ignore it.
+
+ '''
+ # Emulate Bag.do from Smalltalk and Multiset.begin from C++.
+ return _itertools.chain.from_iterable(
+ _itertools.starmap(_itertools.repeat,
+ self.iteritems()))
+
+ # Override dict methods where necessary
+
+ @classmethod
+ def fromkeys(cls, iterable, v=None):
+ # There is no equivalent method for counters because setting v=1
+ # means that no element can have a count greater than one.
+ raise NotImplementedError(
+ 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
+
+ def update(self, mapping):
+ '''Like dict.update() but add counts instead of replacing them.
+
+ Source can be another dictionary or a Counter.instance().
+
+ >>> c = Counter('which')
+ >>> d = Counter('witch')
+ >>> c.update(d) # Add counts from d to those in c
+ >>> c['h'] # Count of 'h' is now three
+ 3
+
+ '''
+ # The regular dict.update() operation makes no sense here because the
+ # replace behavior results in the some of original untouched counts
+ # being mixed-in with all of the other counts for a mismash that
+ # doesn't have a straight-forward interpretation in most counting
+ # contexts. Instead, we look to Knuth for suggested operations on
+ # multisets and implement the union-add operation discussed in
+ # TAOCP Volume II section 4.6.3 exercise 19. The Wikipedia entry for
+ # multisets calls that operation a sum or join.
+ for elem, count in mapping.iteritems():
+ self[elem] += count
+
+ def copy(self):
+ 'Like dict.copy() but returns a Counter instance instead of a dict.'
+ c = Counter()
+ c.update(self)
+ return c
+
+ def __repr__(self):
+ if not self:
+ return '%s()' % self.__class__.__name__
+ return '%s(items=%r)' % (self.__class__.__name__, self.most_common())
@@ -143,6 +301,49 @@ if __name__ == '__main__':
Point3D = namedtuple('Point3D', Point._fields + ('z',))
print Point3D.__doc__
+ # Check that counters are copyable, deepcopyable, picklable, and have a
+ # repr/eval round-trip
+ import copy
+ words = Counter('which witch had which witches wrist watch'.split())
+ update_test = Counter()
+ update_test.update(words)
+ for i, dup in enumerate([
+ words.copy(),
+ copy.copy(words),
+ copy.deepcopy(words),
+ loads(dumps(words, 0)),
+ loads(dumps(words, 1)),
+ loads(dumps(words, 2)),
+ loads(dumps(words, -1)),
+ eval(repr(words)),
+ update_test,
+ ]):
+ msg = (i, dup, words)
+ assert dup is not words, msg
+ assert dup == words, msg
+ assert len(dup) == len(words), msg
+ assert type(dup) == type(words), msg
+
+ # Verify that counters are unhashable
+ try:
+ hash(words)
+ except TypeError:
+ pass
+ else:
+ print 'Failed hashing test'
+
+ # Verify that Counter.fromkeys() is disabled
+ try:
+ Counter.fromkeys('razmataz')
+ except NotImplementedError:
+ pass
+ else:
+ print 'Failed fromkeys() test'
+
+ # Check ABCs
+ assert issubclass(Counter, Mapping)
+ assert isinstance(Counter('asdfasdf'), Mapping)
+
import doctest
TestResults = namedtuple('TestResults', 'failed attempted')
print TestResults(*doctest.testmod())
diff --git a/Lib/test/test_collections.py b/Lib/test/test_collections.py
index 7dffd73..00882e2 100644
--- a/Lib/test/test_collections.py
+++ b/Lib/test/test_collections.py
@@ -1,6 +1,6 @@
import unittest, doctest
from test import test_support
-from collections import namedtuple
+from collections import namedtuple, Counter, Mapping
import pickle, cPickle, copy
from collections import Hashable, Iterable, Iterator
from collections import Sized, Container, Callable
@@ -346,11 +346,107 @@ class TestCollectionABCs(unittest.TestCase):
self.failUnless(issubclass(sample, MutableSequence))
self.failIf(issubclass(basestring, MutableSequence))
+class TestCounter(unittest.TestCase):
+
+ def test_basics(self):
+ c = Counter('abcaba')
+ self.assert_(isinstance(c, dict))
+ self.assert_(isinstance(c, Mapping))
+ self.assert_(issubclass(Counter, dict))
+ self.assert_(issubclass(Counter, Mapping))
+ self.assertEqual(len(c), 3)
+ self.assertEqual(sum(c.values()), 6)
+ self.assertEqual(sorted(c.values()), [1, 2, 3])
+ self.assertEqual(sorted(c.keys()), ['a', 'b', 'c'])
+ self.assertEqual(sorted(c), ['a', 'b', 'c'])
+ self.assertEqual(sorted(c.items()),
+ [('a', 3), ('b', 2), ('c', 1)])
+ self.assertEqual(c['b'], 2)
+ self.assertEqual(c['z'], 0)
+ self.assertEqual(c.has_key('c'), True)
+ self.assertEqual(c.has_key('z'), False)
+ self.assertEqual(c.__contains__('c'), True)
+ self.assertEqual(c.__contains__('z'), False)
+ self.assertEqual(c.get('b', 10), 2)
+ self.assertEqual(c.get('z', 10), 10)
+ self.assertEqual(c, dict(a=3, b=2, c=1))
+ self.assertEqual(repr(c),
+ "Counter(items=[('a', 3), ('b', 2), ('c', 1)])")
+ self.assertEqual(c.most_common(), [('a', 3), ('b', 2), ('c', 1)])
+ for i in range(5):
+ self.assertEqual(c.most_common(i),
+ [('a', 3), ('b', 2), ('c', 1)][:i])
+ self.assertEqual(''.join(sorted(c.elements())), 'aaabbc')
+ c['a'] += 1 # increment an existing value
+ c['b'] -= 2 # sub existing value to zero
+ del c['c'] # remove an entry
+ c['d'] -= 2 # sub from a missing value
+ c['e'] = -5 # directly assign a missing value
+ c['f'] += 4 # add to a missing value
+ self.assertEqual(c, dict(a=4, b=0, d=-2, e=-5, f=4))
+ self.assertEqual(''.join(sorted(c.elements())), 'aaaaffff')
+ self.assertEqual(c.pop('f'), 4)
+ self.assertEqual('f' in c, False)
+ for i in range(3):
+ elem, cnt = c.popitem()
+ self.assertEqual(elem in c, False)
+ c.clear()
+ self.assertEqual(c, {})
+ self.assertEqual(repr(c), 'Counter()')
+ self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc')
+ self.assertRaises(TypeError, hash, c)
+ c.update(dict(a=5, b=3, c=1))
+ c.update(Counter(items=[('a', 50), ('b', 30)]))
+ c.__init__(items=[('a', 500), ('b', 300)])
+ c.__init__('cdc')
+ self.assertEqual(c, dict(a=555, b=333, c=3, d=1))
+ self.assertEqual(c.setdefault('d', 5), 1)
+ self.assertEqual(c['d'], 1)
+ self.assertEqual(c.setdefault('e', 5), 5)
+ self.assertEqual(c['e'], 5)
+
+ def test_copying(self):
+ # Check that counters are copyable, deepcopyable, picklable, and
+ #have a repr/eval round-trip
+ words = Counter('which witch had which witches wrist watch'.split())
+ update_test = Counter()
+ update_test.update(words)
+ for i, dup in enumerate([
+ words.copy(),
+ copy.copy(words),
+ copy.deepcopy(words),
+ pickle.loads(pickle.dumps(words, 0)),
+ pickle.loads(pickle.dumps(words, 1)),
+ pickle.loads(pickle.dumps(words, 2)),
+ pickle.loads(pickle.dumps(words, -1)),
+ cPickle.loads(cPickle.dumps(words, 0)),
+ cPickle.loads(cPickle.dumps(words, 1)),
+ cPickle.loads(cPickle.dumps(words, 2)),
+ cPickle.loads(cPickle.dumps(words, -1)),
+ eval(repr(words)),
+ update_test,
+ ]):
+ msg = (i, dup, words)
+ self.assert_(dup is not words)
+ self.assertEquals(dup, words)
+ self.assertEquals(len(dup), len(words))
+ self.assertEquals(type(dup), type(words))
+
+ def test_conversions(self):
+ # Convert to: set, list, dict
+ s = 'she sells sea shells by the sea shore'
+ self.assertEqual(sorted(Counter(s).elements()), sorted(s))
+ self.assertEqual(sorted(Counter(s)), sorted(set(s)))
+ self.assertEqual(dict(Counter(s)), dict(Counter(s).items()))
+ self.assertEqual(set(Counter(s)), set(s))
+
+
import doctest, collections
def test_main(verbose=None):
NamedTupleDocs = doctest.DocTestSuite(module=collections)
- test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs, TestCollectionABCs]
+ test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs,
+ TestCollectionABCs, TestCounter]
test_support.run_unittest(*test_classes)
test_support.run_doctest(collections, verbose)
diff --git a/Misc/NEWS b/Misc/NEWS
index 7732ba9..6a6f1e5 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -128,6 +128,9 @@ Core and Builtins
Library
-------
+- Issue #1696199: Add collections.Counter() for rapid and convenient
+ counting.
+
- Issue #3860: GzipFile and BZ2File now support the context manager protocol.
- Issue #4272: Add an optional argument to the GzipFile constructor to override