From 05dadcfb28b815c9558fe2a6a74cd3ce7df62577 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 16 Dec 2014 18:00:56 +0200 Subject: Issue #19858: pickletools.optimize() now aware of the MEMOIZE opcode, can produce more compact result and no longer produces invalid output if input data contains MEMOIZE opcodes together with PUT or BINPUT opcodes. --- Lib/pickletools.py | 69 +++++++++++++++++++++++++++++--------------- Lib/test/test_pickletools.py | 43 +++++++++++++++++++++++++++ Misc/NEWS | 4 +++ 3 files changed, 92 insertions(+), 24 deletions(-) diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 71c2aa1..6b86723 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -2282,40 +2282,61 @@ def genops(pickle): def optimize(p): 'Optimize a pickle string by removing unused PUT opcodes' - not_a_put = object() - gets = { not_a_put } # set of args used by a GET opcode - opcodes = [] # (startpos, stoppos, putid) + put = 'PUT' + get = 'GET' + oldids = set() # set of all PUT ids + newids = {} # set of ids used by a GET opcode + opcodes = [] # (op, idx) or (pos, end_pos) proto = 0 + protoheader = b'' for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True): if 'PUT' in opcode.name: - opcodes.append((pos, end_pos, arg)) + oldids.add(arg) + opcodes.append((put, arg)) + elif opcode.name == 'MEMOIZE': + idx = len(oldids) + oldids.add(idx) + opcodes.append((put, idx)) elif 'FRAME' in opcode.name: pass - else: - if 'GET' in opcode.name: - gets.add(arg) - elif opcode.name == 'PROTO': - assert pos == 0, pos + elif 'GET' in opcode.name: + if opcode.proto > proto: + proto = opcode.proto + newids[arg] = None + opcodes.append((get, arg)) + elif opcode.name == 'PROTO': + if arg > proto: proto = arg - opcodes.append((pos, end_pos, not_a_put)) - prevpos, prevarg = pos, None + if pos == 0: + protoheader = p[pos: end_pos] + else: + opcodes.append((pos, end_pos)) + else: + opcodes.append((pos, end_pos)) + del oldids # Copy the opcodes except for PUTS without a corresponding GET out = io.BytesIO() - opcodes = iter(opcodes) - if proto >= 2: - # Write the PROTO header before any framing - start, stop, _ = next(opcodes) - out.write(p[start:stop]) - buf = pickle._Framer(out.write) + # Write the PROTO header before any framing + out.write(protoheader) + pickler = pickle._Pickler(out, proto) if proto >= 4: - buf.start_framing() - for start, stop, putid in opcodes: - if putid in gets: - buf.commit_frame() - buf.write(p[start:stop]) - if proto >= 4: - buf.end_framing() + pickler.framer.start_framing() + idx = 0 + for op, arg in opcodes: + if op is put: + if arg not in newids: + continue + data = pickler.put(idx) + newids[arg] = idx + idx += 1 + elif op is get: + data = pickler.get(newids[arg]) + else: + data = p[op:arg] + pickler.framer.commit_frame() + pickler.write(data) + pickler.framer.end_framing() return out.getvalue() ############################################################################## diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index d37ac26..bbe6875 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -1,3 +1,4 @@ +import struct import pickle import pickletools from test import support @@ -15,6 +16,48 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests): # Test relies on precise output of dumps() test_pickle_to_2x = None + def test_optimize_long_binget(self): + data = [str(i) for i in range(257)] + data.append(data[-1]) + for proto in range(pickle.HIGHEST_PROTOCOL + 1): + pickled = pickle.dumps(data, proto) + unpickled = pickle.loads(pickled) + self.assertEqual(unpickled, data) + self.assertIs(unpickled[-1], unpickled[-2]) + + pickled2 = pickletools.optimize(pickled) + unpickled2 = pickle.loads(pickled2) + self.assertEqual(unpickled2, data) + self.assertIs(unpickled2[-1], unpickled2[-2]) + self.assertNotIn(pickle.LONG_BINGET, pickled2) + self.assertNotIn(pickle.LONG_BINPUT, pickled2) + + def test_optimize_binput_and_memoize(self): + pickled = (b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00' + b']\x94(\x8c\x04spamq\x01\x8c\x03ham\x94h\x02e.') + # 0: \x80 PROTO 4 + # 2: \x95 FRAME 21 + # 11: ] EMPTY_LIST + # 12: \x94 MEMOIZE + # 13: ( MARK + # 14: \x8c SHORT_BINUNICODE 'spam' + # 20: q BINPUT 1 + # 22: \x8c SHORT_BINUNICODE 'ham' + # 27: \x94 MEMOIZE + # 28: h BINGET 2 + # 30: e APPENDS (MARK at 13) + # 31: . STOP + self.assertIn(pickle.BINPUT, pickled) + unpickled = pickle.loads(pickled) + self.assertEqual(unpickled, ['spam', 'ham', 'ham']) + self.assertIs(unpickled[1], unpickled[2]) + + pickled2 = pickletools.optimize(pickled) + unpickled2 = pickle.loads(pickled2) + self.assertEqual(unpickled2, ['spam', 'ham', 'ham']) + self.assertIs(unpickled2[1], unpickled2[2]) + self.assertNotIn(pickle.BINPUT, pickled2) + def test_main(): support.run_unittest(OptimizedPickleTests) diff --git a/Misc/NEWS b/Misc/NEWS index 93ae79c..ae37fd1 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -41,6 +41,10 @@ Core and Builtins Library ------- +- Issue #19858: pickletools.optimize() now aware of the MEMOIZE opcode, can + produce more compact result and no longer produces invalid output if input + data contains MEMOIZE opcodes together with PUT or BINPUT opcodes. + - Issue #22095: Fixed HTTPConnection.set_tunnel with default port. The port value in the host header was set to "None". Patch by Demian Brecht. -- cgit v0.12