diff options
author | Raymond Hettinger <python@rcn.com> | 2009-01-14 02:20:07 (GMT) |
---|---|---|
committer | Raymond Hettinger <python@rcn.com> | 2009-01-14 02:20:07 (GMT) |
commit | b8baf6360f7e93f5caa2a20292a868f890056215 (patch) | |
tree | 02aa4cb923d36aa3b72e5d9657c1bd509e1c8b4e /Lib/collections.py | |
parent | 3bc0f7ebeed6b165c2694845256cf0f62e8fddfe (diff) | |
download | cpython-b8baf6360f7e93f5caa2a20292a868f890056215.zip cpython-b8baf6360f7e93f5caa2a20292a868f890056215.tar.gz cpython-b8baf6360f7e93f5caa2a20292a868f890056215.tar.bz2 |
Issue #1696199: Add collections.Counter().
Forward port from Py2.7.
Diffstat (limited to 'Lib/collections.py')
-rw-r--r-- | Lib/collections.py | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/Lib/collections.py b/Lib/collections.py index c3faa9a..6c1abce 100644 --- a/Lib/collections.py +++ b/Lib/collections.py @@ -10,6 +10,9 @@ from _collections import deque, defaultdict from operator import itemgetter as _itemgetter from keyword import iskeyword as _iskeyword import sys as _sys +import heapq as _heapq +from itertools import repeat as _repeat, chain as _chain, starmap as _starmap + ################################################################################ ### namedtuple @@ -113,6 +116,162 @@ def namedtuple(typename, field_names, verbose=False): return result +######################################################################## +### Counter +######################################################################## + +class Counter(dict): + '''Dict subclass for counting hashable items. Sometimes called a bag + or multiset. Elements are stored as dictionary keys and their counts + are stored as dictionary values. + + >>> c = Counter('abracadabra') # count elements from a string + + >>> c.most_common(3) # three most common elements + [('a', 5), ('r', 2), ('b', 2)] + >>> sorted(c) # list all unique elements + ['a', 'b', 'c', 'd', 'r'] + >>> ''.join(sorted(c.elements())) # list elements with repetitions + 'aaaaabbcdrr' + >>> sum(c.values()) # total of all counts + 11 + + >>> c['a'] # count of letter 'a' + 5 + >>> for elem in 'shazam': # update counts from an iterable + ... c[elem] += 1 # by adding 1 to each element's count + >>> c['a'] # now there are seven 'a' + 7 + >>> del c['r'] # remove all 'r' + >>> c['r'] # now there are zero 'r' + 0 + + >>> d = Counter('simsalabim') # make another counter + >>> c.update(d) # add in the second counter + >>> c['a'] # now there are nine 'a' + 9 + + >>> c.clear() # empty the counter + >>> c + Counter() + + Note: If a count is set to zero or reduced to zero, it will remain + in the counter until the entry is deleted or the counter is cleared: + + >>> c = Counter('aaabbc') + >>> c['b'] -= 2 # reduce the count of 'b' by two + >>> c.most_common() # 'b' is still in, but its count is zero + [('a', 3), ('c', 1), ('b', 0)] + + ''' + # References: + # http://en.wikipedia.org/wiki/Multiset + # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html + # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm + # http://code.activestate.com/recipes/259174/ + # Knuth, TAOCP Vol. II section 4.6.3 + + def __init__(self, iterable=None): + '''Create a new, empty Counter object. And if given, count elements + from an input iterable. Or, initialize the count from another mapping + of elements to their counts. + + >>> c = Counter() # a new, empty counter + >>> c = Counter('gallahad') # a new counter from an iterable + >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping + + ''' + self.update(iterable) + + def __missing__(self, key): + 'The count of elements not in the Counter is zero.' + # Needed so that self[missing_item] does not raise KeyError + return 0 + + def most_common(self, n=None): + '''List the n most common elements and their counts from the most + common to the least. If n is None, then list all element counts. + + >>> Counter('abracadabra').most_common(3) + [('a', 5), ('r', 2), ('b', 2)] + + ''' + # Emulate Bag.sortedByCount from Smalltalk + if n is None: + return sorted(self.items(), key=_itemgetter(1), reverse=True) + return _heapq.nlargest(n, self.items(), key=_itemgetter(1)) + + def elements(self): + '''Iterator over elements repeating each as many times as its count. + + >>> c = Counter('ABCABC') + >>> sorted(c.elements()) + ['A', 'A', 'B', 'B', 'C', 'C'] + + # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1 + >>> prime_factors = Counter({2: 2, 3: 3, 17: 1}) + >>> product = 1 + >>> for factor in prime_factors.elements(): # loop over factors + ... product *= factor # and multiply them + >>> product + 1836 + + Note, if an element's count has been set to zero or is a negative + number, elements() will ignore it. + + ''' + # Emulate Bag.do from Smalltalk and Multiset.begin from C++. + return _chain.from_iterable(_starmap(_repeat, self.items())) + + # Override dict methods where necessary + + @classmethod + def fromkeys(cls, iterable, v=None): + # There is no equivalent method for counters because setting v=1 + # means that no element can have a count greater than one. + raise NotImplementedError( + 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') + + def update(self, iterable=None): + '''Like dict.update() but add counts instead of replacing them. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.update('witch') # add elements from another iterable + >>> d = Counter('watch') + >>> c.update(d) # add elements from another counter + >>> c['h'] # four 'h' in which, witch, and watch + 4 + + ''' + # The regular dict.update() operation makes no sense here because the + # replace behavior results in the some of original untouched counts + # being mixed-in with all of the other counts for a mismash that + # doesn't have a straight-forward interpretation in most counting + # contexts. Instead, we look to Knuth for suggested operations on + # multisets and implement the union-add operation discussed in + # TAOCP Volume II section 4.6.3 exercise 19. The Wikipedia entry for + # multisets calls that operation a sum or join. + + if iterable is not None: + if isinstance(iterable, Mapping): + for elem, count in iterable.items(): + self[elem] += count + else: + for elem in iterable: + self[elem] += 1 + + def copy(self): + 'Like dict.copy() but returns a Counter instance instead of a dict.' + return Counter(self) + + def __repr__(self): + if not self: + return '%s()' % self.__class__.__name__ + items = ', '.join(map('%r: %r'.__mod__, self.most_common())) + return '%s({%s})' % (self.__class__.__name__, items) + ################################################################################ ### UserDict |