From 4d2073a0731f14be65e6a033685923bb1effe33d Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 20 Jan 2009 03:41:22 +0000 Subject: Forward port r68792 and r68789 putting Counter in __all__ and adding Counter buildouts. --- Doc/library/collections.rst | 37 ++++++++++++++-- Lib/collections.py | 100 +++++++++++++++++++++++++++++++++++++++---- Lib/test/test_collections.py | 46 +++++++++++++++++++- 3 files changed, 169 insertions(+), 14 deletions(-) diff --git a/Doc/library/collections.rst b/Doc/library/collections.rst index 816d814..0984751 100644 --- a/Doc/library/collections.rst +++ b/Doc/library/collections.rst @@ -177,6 +177,7 @@ For example:: >>> c = Counter() # a new, empty counter >>> c = Counter('gallahad') # a new counter from an iterable >>> c = Counter({'red': 4, 'blue': 2}) # a new counter from a mapping + >>> c = Counter(spam=8, eggs=1) # a new counter from keyword args The returned object has a dictionary style interface except that it returns a zero count for missing items (instead of raising a :exc:`KeyError` like a @@ -207,7 +208,7 @@ For example:: Elements are returned in arbitrary order. If an element's count has been set to zero or a negative number, :meth:`elements` will ignore it. - >>> c = Counter({'a': 4, 'b': 2, 'd': 0, 'e': -2}) + >>> c = Counter(a=4, b=2, c=0, d=-2) >>> list(c.elements()) ['a', 'a', 'a', 'a', 'b', 'b'] @@ -232,10 +233,10 @@ For example:: .. method:: update([iterable-or-mapping]) - Like :meth:`dict.update` but adds-in counts instead of replacing them. - Elements are counted from an *iterable* or added-in from another - *mapping* (or counter):: + *mapping* (or counter). 
Like :meth:`dict.update` but adds-in counts + instead of replacing them, and the *iterable* is expected to be a + sequence of elements, not a sequence of ``(key, value)`` pairs:: >>> c = Counter('which') >>> c.update('witch') # add elements from another iterable @@ -255,6 +256,34 @@ Common patterns for working with :class:`Counter` objects:: Counter(dict(list_of_pairs)) # convert from a list of (elem, cnt) pairs c.most_common()[:-n:-1] # n least common elements +Several multiset mathematical operations are provided for combining +:class:`Counter` objects. Multisets are like regular sets but allowed to +contain repeated elements (with counts of one or more). Addition and +subtraction combine counters by adding or subtracting the counts of +corresponding elements. Intersection and union return the minimum and maximum +of corresponding counts:: + + >>> c = Counter({'a': 3, 'b': 1}) + >>> d = Counter({'a': 1, 'b': 2}) + >>> c + d # add two counters together: c[x] + d[x] + Counter({'a': 4, 'b': 3}) + >>> c - d # subtract (keeping only positive counts) + Counter({'a': 2}) + >>> c & d # intersection: min(c[x], d[x]) + Counter({'a': 1, 'b': 1}) + >>> c | d # union: max(c[x], d[x]) + Counter({'a': 3, 'b': 2}) + +All four multiset operations produce only positive counts (negative and zero +results are skipped). If inputs include negative counts, addition will sum +both counts and then exclude non-positive results. 
The other three operations +are undefined for negative inputs:: + + >>> e = Counter(a=8, b=-2, c=0) + >>> e += Counter() # remove zero and negative counts + >>> e + Counter({'a': 8}) + **References**: * Wikipedia entry for `Multisets <http://en.wikipedia.org/wiki/Multiset>`_ diff --git a/Lib/collections.py b/Lib/collections.py index 6c1abce..45558f9 100644 --- a/Lib/collections.py +++ b/Lib/collections.py @@ -1,5 +1,5 @@ __all__ = ['deque', 'defaultdict', 'namedtuple', 'UserDict', 'UserList', - 'UserString'] + 'UserString', 'Counter'] # For bootstrapping reasons, the collection ABCs are defined in _abcoll.py. # They should however be considered an integral part of collections.py. from _abcoll import * @@ -171,7 +171,7 @@ class Counter(dict): # http://code.activestate.com/recipes/259174/ # Knuth, TAOCP Vol. II section 4.6.3 - def __init__(self, iterable=None): + def __init__(self, iterable=None, **kwds): '''Create a new, empty Counter object. And if given, count elements from an input iterable. Or, initialize the count from another mapping of elements to their counts. @@ -179,9 +179,10 @@ class Counter(dict): >>> c = Counter() # a new, empty counter >>> c = Counter('gallahad') # a new counter from an iterable >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping + >>> c = Counter(a=4, b=2) # a new counter from keyword args ''' - self.update(iterable) + self.update(iterable, **kwds) def __missing__(self, key): 'The count of elements not in the Counter is zero.' @@ -232,7 +233,7 @@ class Counter(dict): raise NotImplementedError( 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') - def update(self, iterable=None): + def update(self, iterable=None, **kwds): '''Like dict.update() but add counts instead of replacing them. Source can be an iterable, a dictionary, or another Counter instance. 
@@ -249,10 +250,8 @@ class Counter(dict): # replace behavior results in the some of original untouched counts # being mixed-in with all of the other counts for a mismash that # doesn't have a straight-forward interpretation in most counting - # contexts. Instead, we look to Knuth for suggested operations on - # multisets and implement the union-add operation discussed in - # TAOCP Volume II section 4.6.3 exercise 19. The Wikipedia entry for - # multisets calls that operation a sum or join. + # contexts. Instead, we implement straight-addition. Both the inputs + # and outputs are allowed to contain zero and negative counts. if iterable is not None: if isinstance(iterable, Mapping): @@ -261,17 +260,102 @@ class Counter(dict): else: for elem in iterable: self[elem] += 1 + if kwds: + self.update(kwds) def copy(self): 'Like dict.copy() but returns a Counter instance instead of a dict.' return Counter(self) + def __delitem__(self, elem): + 'Like dict.__delitem__() but does not raise KeyError for missing values.' + if elem in self: + dict.__delitem__(self, elem) + def __repr__(self): if not self: return '%s()' % self.__class__.__name__ items = ', '.join(map('%r: %r'.__mod__, self.most_common())) return '%s({%s})' % (self.__class__.__name__, items) + # Multiset-style mathematical operations discussed in: + # Knuth TAOCP Volume II section 4.6.3 exercise 19 + # and at http://en.wikipedia.org/wiki/Multiset + # + # Results are undefined when inputs contain negative counts. + # Outputs guaranteed to only include positive counts. + # + # To strip negative and zero counts, add-in an empty counter: + # c += Counter() + + def __add__(self, other): + '''Add counts from two counters. 
+ + >>> Counter('abbb') + Counter('bcc') + Counter({'b': 4, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem in set(self) | set(other): + newcount = self[elem] + other[elem] + if newcount > 0: + result[elem] = newcount + return result + + def __sub__(self, other): + ''' Subtract count, but keep only results with positive counts. + + >>> Counter('abbbc') - Counter('bccd') + Counter({'b': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + newcount = count - other[elem] + if newcount > 0: + result[elem] = newcount + return result + + def __or__(self, other): + '''Union is the maximum of value in either of the input counters. + + >>> Counter('abbb') | Counter('bcc') + Counter({'b': 3, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + _max = max + result = Counter() + for elem in set(self) | set(other): + newcount = _max(self[elem], other[elem]) + if newcount > 0: + result[elem] = newcount + return result + + def __and__(self, other): + ''' Intersection is the minimum of corresponding counts. 
+ + >>> Counter('abbb') & Counter('bcc') + Counter({'b': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + _min = min + result = Counter() + if len(self) < len(other): + self, other = other, self + for elem in filter(self.__contains__, other): + newcount = _min(self[elem], other[elem]) + if newcount > 0: + result[elem] = newcount + return result + ################################################################################ ### UserDict diff --git a/Lib/test/test_collections.py b/Lib/test/test_collections.py index 153059a..6630c4e 100644 --- a/Lib/test/test_collections.py +++ b/Lib/test/test_collections.py @@ -4,6 +4,8 @@ import unittest, doctest from test import support from collections import namedtuple, Counter, Mapping import pickle, copy +from random import randrange +import operator from collections import Hashable, Iterable, Iterator from collections import Sized, Container, Callable from collections import Set, MutableSet @@ -361,6 +363,8 @@ class TestCounter(unittest.TestCase): def test_basics(self): c = Counter('abcaba') + self.assertEqual(c, Counter({'a':3 , 'b': 2, 'c': 1})) + self.assertEqual(c, Counter(a=3, b=2, c=1)) self.assert_(isinstance(c, dict)) self.assert_(isinstance(c, Mapping)) self.assert_(issubclass(Counter, dict)) @@ -388,6 +392,7 @@ class TestCounter(unittest.TestCase): c['a'] += 1 # increment an existing value c['b'] -= 2 # sub existing value to zero del c['c'] # remove an entry + del c['c'] # make sure that del doesn't raise KeyError c['d'] -= 2 # sub from a missing value c['e'] = -5 # directly assign a missing value c['f'] += 4 # add to a missing value @@ -403,7 +408,8 @@ class TestCounter(unittest.TestCase): self.assertEqual(repr(c), 'Counter()') self.assertRaises(NotImplementedError, Counter.fromkeys, 'abc') self.assertRaises(TypeError, hash, c) - c.update(dict(a=5, b=3, c=1)) + c.update(dict(a=5, b=3)) + c.update(c=1) c.update(Counter('a' * 50 + 'b' * 30)) c.update() # test case with no args 
c.__init__('a' * 500 + 'b' * 300) @@ -447,7 +453,43 @@ class TestCounter(unittest.TestCase): self.assertEqual(dict(Counter(s)), dict(Counter(s).items())) self.assertEqual(set(Counter(s)), set(s)) - + def test_multiset_operations(self): + # Verify that adding a zero counter will strip zeros and negatives + c = Counter(a=10, b=-2, c=0) + Counter() + self.assertEqual(dict(c), dict(a=10)) + + elements = 'abcd' + for i in range(1000): + # test random pairs of multisets + p = Counter(dict((elem, randrange(-2,4)) for elem in elements)) + q = Counter(dict((elem, randrange(-2,4)) for elem in elements)) + for counterop, numberop, defneg in [ + (Counter.__add__, lambda x, y: x+y if x+y>0 else 0, True), + (Counter.__sub__, lambda x, y: x-y if x-y>0 else 0, False), + (Counter.__or__, max, False), + (Counter.__and__, min, False), + ]: + result = counterop(p, q) + for x in elements: + # all except __add__ are undefined for negative inputs + if defneg or (p[x] >= 0 and q[x] >= 0): + self.assertEqual(numberop(p[x], q[x]), result[x]) + # verify that results exclude non-positive counts + self.assert_(x>0 for x in result.values()) + + elements = 'abcdef' + for i in range(100): + # verify that random multisets with no repeats are exactly like sets + p = Counter(dict((elem, randrange(0, 2)) for elem in elements)) + q = Counter(dict((elem, randrange(0, 2)) for elem in elements)) + for counterop, setop in [ + (Counter.__sub__, set.__sub__), + (Counter.__or__, set.__or__), + (Counter.__and__, set.__and__), + ]: + counter_result = counterop(p, q) + set_result = setop(set(p.elements()), set(q.elements())) + self.assertEqual(counter_result, dict.fromkeys(set_result, 1)) import doctest, collections -- cgit v0.12