Issue 1696199: Add collections.Counter().

author: Raymond Hettinger <python@rcn.com> 2009-01-12 22:58:41 (GMT)
committer: Raymond Hettinger <python@rcn.com> 2009-01-12 22:58:41 (GMT)
commit: f94d7fa5fb90df0163cffca2864885a7da49d4f6 (patch)
tree: 0645074534eff18800e28fd723c18d94a355ce05
parent: 99234c5c7466845316ca5e33d86226a1d1fe227b (diff)
download: cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.zip
cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.tar.gz
cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.tar.bz2
4 files changed, 430 insertions, 2 deletions
diff --git a/Doc/library/collections.rst b/Doc/library/collections.rst
index 2725d68..4aced1e 100644
--- a/Doc/library/collections.rst
+++ b/Doc/library/collections.rst
@@ -152,6 +152,134 @@ Notes on using :class:`Set` and :class:`MutableSet` as a mixin:
 (For more about ABCs, see the :mod:`abc` module and :pep:`3119`.)
 
 
+.. _counter-objects:
+
+:class:`Counter` objects
+------------------------
+
+A counter tool is provided to support convenient and rapid tallies.
+For example::
+
+    # Tally repeated words in a list
+    >>> words = ['red', 'blue', 'red', 'green', 'blue', blue']
+    >>> cnt = Counter()
+    >>> for word in words:
+    ...     cnt[word] += 1
+    >>> cnt
+    Counter(items=[('blue', 3), ('red', 2), ('green', 1)])
+
+    # Find the ten most common words in Hamlet
+    >>> import re
+    >>> words = re.findall('\w+', open('hamlet.txt').read().lower())
+    >>> Counter(hamlet_words).most_common(10)
+    [('the', 1143), ('and', 966), ('to', 762), ('of', 669), ('i', 631),
+     ('you', 554),  ('a', 546), ('my', 514), ('hamlet', 471), ('in', 451)]
+
+.. class:: Counter([iterable[, items]])
+
+   A :class:`Counter` is a :class:`dict` subclass for counting hashable items.
+   Elements are stored as dictionary keys and their counts are stored as
+   dictionary values.  Counts are allowed to be any integer value including
+   zero or negative counts.  The :class:`Counter` class is similar to bags
+   or multisets in other languages.
+
+   Elements are counted from the *iterable* if given.  Also, the counts
+   can be initialized from an *items* list of *(element, count)* pairs.
+   If provided, *items* must be a keyword argument::
+
+       >>> c = Counter()                            # a new, empty counter
+       >>> c = Counter('gallahad')                  # a new counter from an iterable
+       >>> c = Counter(items=[('a', 4), ('b', 2)])  # a new counter from an items list
+
+   The returned object has a dictionary style interface except that it returns
+   a zero count for missing items (instead of raising a :exc:`KeyError` like a
+   dictionary would)::
+
+        >>> c = Counter(['if', 'your', 'peril', 'be'])
+        >>> c['questions']                          # count of a missing element is zero
+        0
+
+   Assigning a count of zero or reducing the count to zero leaves the
+   element in the dictionary.  Use ``del`` to remove the entry entirely:
+
+        >>> c = Counter(['arthur', 'gwain'])
+        >>> c['arthur'] = 0                         # set the count of "arthur" to zero
+        >>> 'arthur' in c                           # but "arthur" is still in the counter
+        True
+        >>> del c['arthur']                         # del will completely remove the entry
+        >>> 'arthur' in c
+        False
+
+   .. versionadded:: 2.7
+
+
+   Counter objects support two methods beyond those available for all
+   dictionaries:
+
+   .. method:: elements()
+
+      Return an iterator over elements repeating each as many times as its count.
+      Elements are returned in arbitrary order.  If an element's count has been
+      set to zero or a negative number, :meth:`elements` will ignore it.
+
+            >>> c = Counter(items=[('a', 4), ('b', 2), ('d', 0), ('e', -2)])
+            >>> list(c.elements())
+            ['a', 'a', 'a', 'a', 'b', 'b']
+
+   .. method:: most_common([n])
+
+      Return a list of the *n* most common elements and their counts from
+      the most common to the least.  If *n* is not specified or is ``None``,
+      return a list of all element counts in decreasing order of frequency.
+      Elements with equal counts are ordered arbitrarily::
+
+            >>> Counter('abracadabra').most_common(3)
+            [('a', 5), ('r', 2), ('b', 2)]
+
+   The usual dictionary methods are available for :class:`Counter` objects.
+   All of those work the same as they do for dictionaries except for two
+   which work differently for counters.
+
+   .. method:: fromkeys(iterable)
+
+       There is no equivalent class method for :class:`Counter` objects.
+       Raises a :exc:`NotImplementedError` when called.
+
+   .. method:: update(mapping)
+
+       Like :meth:`dict.update` but adds-in counts instead of replacing them.
+       Used for combining two independent counts.  Accepts a *mapping* object
+       which can be another counter or can be a :class:`dict` that maps
+       elements to element counts::
+
+            >>> c = Counter('which')        # count letters in a word
+            >>> d = Counter('witch')        # count letters in another word
+            >>> c.update(d)                 # add counts from d to those in c
+            >>> c['h']                      # count of 'h' is now three
+            3
+            >>> c.update(Counter('watch'))  # add in letters from another word
+            >>> c['h']                      # count of 'h' is now four
+            4
+
+
+.. seealso::
+
+   `Multisets <http://en.wikipedia.org/wiki/Multiset>`_
+       in the Wikipedia
+
+   `Bag <http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html>`_
+       a Smalltalk class
+
+   `C++ multisets <http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm>`_
+       a tutorial with standalone examples
+
+   `Bag class <http://code.activestate.com/recipes/259174/>`_
+       an early Python recipe
+
+   Use cases for multisets and mathematical operations on multisets.
+       Knuth, Donald. The Art of Computer Programming Volume II,
+       Section 4.6.3, Exercise 19.
+
 
 .. _deque-objects:
 
diff --git a/Lib/collections.py b/Lib/collections.py
index ace2b2a..ff49844 100644
--- a/Lib/collections.py
+++ b/Lib/collections.py
@@ -9,6 +9,11 @@ from _collections import deque, defaultdict
 from operator import itemgetter as _itemgetter
 from keyword import iskeyword as _iskeyword
 import sys as _sys
+import heapq as _heapq
+import itertools as _itertools
+
+########################################################################
+###  namedtuple  #######################################################
 
 def namedtuple(typename, field_names, verbose=False):
     """Returns a new subclass of tuple with named fields.
@@ -108,7 +113,160 @@ def namedtuple(typename, field_names, verbose=False):
     return result
 
 
+########################################################################
+###  Counter  ##########################################################
+
+class Counter(dict):
+    '''Dict subclass for counting hashable items.  Sometimes called a bag
+    or multiset.  Elements are stored as dictionary keys and their counts
+    are stored as dictionary values.
+
+    >>> c = Counter('abracadabra')      # count elements from a string
+
+    >>> c.most_common(3)                # three most common elements
+    [('a', 5), ('r', 2), ('b', 2)]
+    >>> sorted(c)                       # list all unique elements
+    ['a', 'b', 'c', 'd', 'r']
+    >>> ''.join(sorted(c.elements()))   # list elements with repetitions
+    'aaaaabbcdrr'
+    >>> sum(c.values())                 # total of all counts
+    11
+
+    >>> c['a']                          # count of letter 'a'
+    5
+    >>> for elem in 'shazam':           # update counts from an iterable
+    ...     c[elem] += 1                # by adding 1 to each element's count
+    >>> c['a']                          # now there are seven 'a'
+    7
+    >>> del c['r']                      # remove all 'r'
+    >>> c['r']                          # now there are zero 'r'
+    0
+
+    >>> d = Counter('simsalabim')       # make another counter
+    >>> c.update(d)                     # add in the second counter
+    >>> c['a']                          # now there are nine 'a'
+    9
+
+    >>> c.clear()                       # empty the counter
+    >>> c
+    Counter()
+
+    Note:  If a count is set to zero or reduced to zero, it will remain
+    in the counter until the entry is deleted or the counter is cleared:
+
+    >>> c = Counter('aaabbc')
+    >>> c['b'] -= 2                     # reduce the count of 'b' by two
+    >>> c.most_common()                 # 'b' is still in, but its count is zero
+    [('a', 3), ('c', 1), ('b', 0)]
+
+    '''
+    # References:
+    #   http://en.wikipedia.org/wiki/Multiset
+    #   http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html
+    #   http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm
+    #   http://code.activestate.com/recipes/259174/
+    #   Knuth, TAOCP Vol. II section 4.6.3
+
+    def __init__(self, iterable=None, items=None):
+        '''Create a new, empty Counter object.  And if given, count elements
+        from an input iterable.  Or, initialize the count from an items list
+        of (element, count) pairs.
+
+        >>> c = Counter('hocus pocus')              # count elements in an iterable
+        >>> c = Counter(items=[('a', 4), ('b', 2)]) # take counts from an items list
+
+        '''
+        if iterable is not None:
+            for elem in iterable:
+                self[elem] += 1
+        if items is not None:
+            for elem, count in items:
+                self[elem] += count
+
+    def __missing__(self, key):
+        'The count of elements not in the Counter is zero.'
+        # Needed so that self[missing_item] does not raise KeyError
+        return 0
+
+    def most_common(self, n=None):
+        '''List the n most common elements and their counts from the most
+        common to the least.  If n is None, then list all element counts.
+
+        >>> Counter('abracadabra').most_common(3)
+        [('a', 5), ('r', 2), ('b', 2)]
+
+        '''
+        # Emulate Bag.sortedByCount from Smalltalk
+        if n is None:
+            return sorted(self.iteritems(), key=_itemgetter(1), reverse=True)
+        return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1))
 
+    def elements(self):
+        '''Iterator over elements repeating each as many times as its count.
+
+        >>> c = Counter('ABCABC')
+        >>> sorted(c.elements())
+        ['A', 'A', 'B', 'B', 'C', 'C']
+
+        # Knuth's example of prime factors of 1836:  2**2 * 3**3 * 17**1
+        >>> import operator
+        >>> prime_factors = Counter(items=[(2,2), (3,3), (17,1)])
+        >>> sorted(prime_factors.elements())         # list individual factors
+        [2, 2, 3, 3, 3, 17]
+        >>> reduce(operator.mul, prime_factors.elements(), 1)  # multiply them
+        1836
+
+        Note, if an element's count has been set to zero or a negative number,
+        elements() will ignore it.
+
+        '''
+        # Emulate Bag.do from Smalltalk and Multiset.begin from C++.
+        return _itertools.chain.from_iterable(
+                                    _itertools.starmap(_itertools.repeat,
+                                                       self.iteritems()))
+
+    # Override dict methods where necessary
+
+    @classmethod
+    def fromkeys(cls, iterable, v=None):
+        # There is no equivalent method for counters because setting v=1
+        # means that no element can have a count greater than one.
+        raise NotImplementedError(
+            'Counter.fromkeys() is undefined.  Use Counter(iterable) instead.')
+
+    def update(self, mapping):
+        '''Like dict.update() but add counts instead of replacing them.
+
+        Source can be another dictionary or a Counter.instance().
+
+        >>> c = Counter('which')
+        >>> d = Counter('witch')
+        >>> c.update(d)                 # Add counts from d to those in c
+        >>> c['h']                      # Count of 'h' is now three
+        3
+
+        '''
+        # The regular dict.update() operation makes no sense here because the
+        # replace behavior results in the some of original untouched counts
+        # being mixed-in with all of the other counts for a mismash that
+        # doesn't have a straight-forward interpretation in most counting
+        # contexts.  Instead, we look to Knuth for suggested operations on
+        # multisets and implement the union-add operation discussed in
+        # TAOCP Volume II section 4.6.3 exercise 19.  The Wikipedia entry for
+        # multisets calls that operation a sum or join.
+        for elem, count in mapping.iteritems():
+            self[elem] += count
+
+    def copy(self):
+        'Like dict.copy() but returns a Counter instance instead of a dict.'
+        c = Counter()
+        c.update(self)
+        return c
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % self.__class__.__name__
+        return '%s(items=%r)' % (self.__class__.__name__, self.most_common())
 
 
 
@@ -143,6 +301,49 @@ if __name__ == '__main__':
     Point3D = namedtuple('Point3D', Point._fields + ('z',))
     print Point3D.__doc__
 
+    # Check that counters are copyable, deepcopyable, picklable, and have a
+    # repr/eval round-trip
+    import copy
+    words = Counter('which witch had which witches wrist watch'.split())
+    update_test = Counter()
+    update_test.update(words)
+    for i, dup in enumerate([
+                words.copy(),
+                copy.copy(words),
+                copy.deepcopy(words),
+                loads(dumps(words, 0)),
+                loads(dumps(words, 1)),
+                loads(dumps(words, 2)),
+                loads(dumps(words, -1)),
+                eval(repr(words)),
+                update_test,
+                ]):
+        msg = (i, dup, words)
+        assert dup is not words, msg
+        assert dup == words, msg
+        assert len(dup) == len(words), msg
+        assert type(dup) == type(words), msg
+
+    # Verify that counters are unhashable
+    try:
+        hash(words)
+    except TypeError:
+        pass
+    else:
+        print 'Failed hashing test'
+
+    # Verify that Counter.fromkeys() is disabled
+    try:
+        Counter.fromkeys('razmataz')
+    except NotImplementedError:
+        pass
+    else:
+        print 'Failed fromkeys() test'
+
+    # Check ABCs
+    assert issubclass(Counter, Mapping)
+    assert isinstance(Counter('asdfasdf'), Mapping)
+
     import doctest
     TestResults = namedtuple('TestResults', 'failed attempted')
     print TestResults(*doctest.testmod())
diff --git a/Lib/test/test_collections.py b/Lib/test/test_collections.py
index 7dffd73..00882e2 100644
--- a/Lib/test/test_collections.py
+++ b/Lib/test/test_collections.py
@@ -1,6 +1,6 @@
 import unittest, doctest
 from test import test_support
-from collections import namedtuple
+from collections import namedtuple, Counter, Mapping
 import pickle, cPickle, copy
 from collections import Hashable, Iterable, Iterator
 from collections import Sized, Container, Callable
@@ -346,11 +346,107 @@ class TestCollectionABCs(unittest.TestCase):
             self.failUnless(issubclass(sample, MutableSequence))
         self.failIf(issubclass(basestring, MutableSequence))
 
+class TestCounter(unittest.TestCase):
+
+    def test_basics(self):
+        c = Counter('abcaba')
+        self.assert_(isinstance(c, dict))
+        self.assert_(isinstance(c, Mapping))
+        self.assert_(issubclass(Counter, dict))
+        self.assert_(issubclass(Counter, Mapping))
+        self.assertEqual(len(c), 3)
+        self.assertEqual(sum(c.values()), 6)
+        self.assertEqual(sorted(c.values()), [1, 2, 3])
+        self.assertEqual(sorted(c.keys()), ['a', 'b', 'c'])
+        self.assertEqual(sorted(c), ['a', 'b', 'c'])
+        self.assertEqual(sorted(c.items()),
+                         [('a', 3), ('b', 2), ('c', 1)])
+        self.assertEqual(c['b'], 2)
+        self.assertEqual(c['z'], 0)
+        self.assertEqual(c.has_key('c'), True)
+        self.assertEqual(c.has_key('z'), False)
+        self.assertEqual(c.__contains__('c'), True)
+        self.assertEqual(c.__contains__('z'), False)
+        self.assertEqual(c.get('b', 10), 2)
+        self.assertEqual(c.get('z', 10), 10)
+        self.assertEqual(c, dict(a=3, b=2, c=1))
+        self.assertEqual(repr(c),
+                         "Counter(items=[('a', 3), ('b', 2), ('c', 1)])")
+        self.assertEqual(c.most_common(), [('a', 3), ('b', 2), ('c', 1)])
+        for i in range(5):
+            self.assertEqual(c.most_common(i),
+                             [('a', 3), ('b', 2), ('c', 1)][:i])
+        self.assertEqual(''.join(sorted(c.elements())), 'aaabbc')
+        c['a'] += 1         # increment an existing value
+        c['b'] -= 2         # sub existing value to zero
+        del c['c']          # remove an entry
+        c['d'] -= 2         # sub from a missing value
+        c['e'] = -5         # directly assign a missing value
+        c['f'] += 4         # add to a missing value
+        self.assertEqual(c, dict(a=4, b=0, d=-2, e=-5, f=4))
+        self.assertEqual(''.join(sorted(c.elements())), 'aaaaffff')
+        self.assertEqual(c.pop('f'), 4)
+        self.assertEqual('f' in c, False)
+        for i in range(3):
+            elem, cnt = c.popitem()
+            self.assertEqual(elem in c, False)
+        c.clear()
+        self.assertEqual(c, {})
+        self.assertEqual(repr(c), 'Counter()')
+        self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc')
+        self.assertRaises(TypeError, hash, c)
+        c.update(dict(a=5, b=3, c=1))
+        c.update(Counter(items=[('a', 50), ('b', 30)]))
+        c.__init__(items=[('a', 500), ('b', 300)])
+        c.__init__('cdc')
+        self.assertEqual(c, dict(a=555, b=333, c=3, d=1))
+        self.assertEqual(c.setdefault('d', 5), 1)
+        self.assertEqual(c['d'], 1)
+        self.assertEqual(c.setdefault('e', 5), 5)
+        self.assertEqual(c['e'], 5)
+
+    def test_copying(self):
+        # Check that counters are copyable, deepcopyable, picklable, and
+        #have a repr/eval round-trip
+        words = Counter('which witch had which witches wrist watch'.split())
+        update_test = Counter()
+        update_test.update(words)
+        for i, dup in enumerate([
+                    words.copy(),
+                    copy.copy(words),
+                    copy.deepcopy(words),
+                    pickle.loads(pickle.dumps(words, 0)),
+                    pickle.loads(pickle.dumps(words, 1)),
+                    pickle.loads(pickle.dumps(words, 2)),
+                    pickle.loads(pickle.dumps(words, -1)),
+                    cPickle.loads(cPickle.dumps(words, 0)),
+                    cPickle.loads(cPickle.dumps(words, 1)),
+                    cPickle.loads(cPickle.dumps(words, 2)),
+                    cPickle.loads(cPickle.dumps(words, -1)),
+                    eval(repr(words)),
+                    update_test,
+                    ]):
+            msg = (i, dup, words)
+            self.assert_(dup is not words)
+            self.assertEquals(dup, words)
+            self.assertEquals(len(dup), len(words))
+            self.assertEquals(type(dup), type(words))
+
+    def test_conversions(self):
+        # Convert to: set, list, dict
+        s = 'she sells sea shells by the sea shore'
+        self.assertEqual(sorted(Counter(s).elements()), sorted(s))
+        self.assertEqual(sorted(Counter(s)), sorted(set(s)))
+        self.assertEqual(dict(Counter(s)), dict(Counter(s).items()))
+        self.assertEqual(set(Counter(s)), set(s))
+
+
 import doctest, collections
 
 def test_main(verbose=None):
     NamedTupleDocs = doctest.DocTestSuite(module=collections)
-    test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs, TestCollectionABCs]
+    test_classes = [TestNamedTuple, NamedTupleDocs, TestOneTrickPonyABCs,
+                    TestCollectionABCs, TestCounter]
     test_support.run_unittest(*test_classes)
     test_support.run_doctest(collections, verbose)
 
diff --git a/Misc/NEWS b/Misc/NEWS
index 7732ba9..6a6f1e5 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -128,6 +128,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #1696199:  Add collections.Counter() for rapid and convenient
+  counting.
+
 - Issue #3860: GzipFile and BZ2File now support the context manager protocol.
 
 - Issue #4272: Add an optional argument to the GzipFile constructor to override
author	Raymond Hettinger <python@rcn.com>	2009-01-12 22:58:41 (GMT)
committer	Raymond Hettinger <python@rcn.com>	2009-01-12 22:58:41 (GMT)
commit	f94d7fa5fb90df0163cffca2864885a7da49d4f6 (patch)
tree	0645074534eff18800e28fd723c18d94a355ce05
parent	99234c5c7466845316ca5e33d86226a1d1fe227b (diff)
download	cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.zip cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.tar.gz cpython-f94d7fa5fb90df0163cffca2864885a7da49d4f6.tar.bz2