summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/pickletools.py 69
-rw-r--r-- Lib/test/test_pickletools.py 43
-rw-r--r-- Misc/NEWS 4
3 files changed, 92 insertions, 24 deletions
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index 71c2aa1..6b86723 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -2282,40 +2282,61 @@ def genops(pickle):
def optimize(p):
'Optimize a pickle string by removing unused PUT opcodes'
- not_a_put = object()
- gets = { not_a_put } # set of args used by a GET opcode
- opcodes = [] # (startpos, stoppos, putid)
+ put = 'PUT'
+ get = 'GET'
+ oldids = set() # set of all PUT ids
+ newids = {} # maps each id used by a GET opcode to its renumbered id
+ opcodes = [] # (op, idx) or (pos, end_pos)
proto = 0
+ protoheader = b''
for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
if 'PUT' in opcode.name:
- opcodes.append((pos, end_pos, arg))
+ oldids.add(arg)
+ opcodes.append((put, arg))
+ elif opcode.name == 'MEMOIZE':
+ idx = len(oldids)
+ oldids.add(idx)
+ opcodes.append((put, idx))
elif 'FRAME' in opcode.name:
pass
- else:
- if 'GET' in opcode.name:
- gets.add(arg)
- elif opcode.name == 'PROTO':
- assert pos == 0, pos
+ elif 'GET' in opcode.name:
+ if opcode.proto > proto:
+ proto = opcode.proto
+ newids[arg] = None
+ opcodes.append((get, arg))
+ elif opcode.name == 'PROTO':
+ if arg > proto:
proto = arg
- opcodes.append((pos, end_pos, not_a_put))
- prevpos, prevarg = pos, None
+ if pos == 0:
+ protoheader = p[pos: end_pos]
+ else:
+ opcodes.append((pos, end_pos))
+ else:
+ opcodes.append((pos, end_pos))
+ del oldids
# Copy the opcodes except for PUTS without a corresponding GET
out = io.BytesIO()
- opcodes = iter(opcodes)
- if proto >= 2:
- # Write the PROTO header before any framing
- start, stop, _ = next(opcodes)
- out.write(p[start:stop])
- buf = pickle._Framer(out.write)
+ # Write the PROTO header before any framing
+ out.write(protoheader)
+ pickler = pickle._Pickler(out, proto)
if proto >= 4:
- buf.start_framing()
- for start, stop, putid in opcodes:
- if putid in gets:
- buf.commit_frame()
- buf.write(p[start:stop])
- if proto >= 4:
- buf.end_framing()
+ pickler.framer.start_framing()
+ idx = 0
+ for op, arg in opcodes:
+ if op is put:
+ if arg not in newids:
+ continue
+ data = pickler.put(idx)
+ newids[arg] = idx
+ idx += 1
+ elif op is get:
+ data = pickler.get(newids[arg])
+ else:
+ data = p[op:arg]
+ pickler.framer.commit_frame()
+ pickler.write(data)
+ pickler.framer.end_framing()
return out.getvalue()
##############################################################################
diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py
index d37ac26..bbe6875 100644
--- a/Lib/test/test_pickletools.py
+++ b/Lib/test/test_pickletools.py
@@ -1,3 +1,4 @@
+import struct
import pickle
import pickletools
from test import support
@@ -15,6 +16,48 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests):
# Test relies on precise output of dumps()
test_pickle_to_2x = None
+ def test_optimize_long_binget(self):
+ data = [str(i) for i in range(257)]
+ data.append(data[-1])
+ for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+ pickled = pickle.dumps(data, proto)
+ unpickled = pickle.loads(pickled)
+ self.assertEqual(unpickled, data)
+ self.assertIs(unpickled[-1], unpickled[-2])
+
+ pickled2 = pickletools.optimize(pickled)
+ unpickled2 = pickle.loads(pickled2)
+ self.assertEqual(unpickled2, data)
+ self.assertIs(unpickled2[-1], unpickled2[-2])
+ self.assertNotIn(pickle.LONG_BINGET, pickled2)
+ self.assertNotIn(pickle.LONG_BINPUT, pickled2)
+
+ def test_optimize_binput_and_memoize(self):
+ pickled = (b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00'
+ b']\x94(\x8c\x04spamq\x01\x8c\x03ham\x94h\x02e.')
+ # 0: \x80 PROTO 4
+ # 2: \x95 FRAME 21
+ # 11: ] EMPTY_LIST
+ # 12: \x94 MEMOIZE
+ # 13: ( MARK
+ # 14: \x8c SHORT_BINUNICODE 'spam'
+ # 20: q BINPUT 1
+ # 22: \x8c SHORT_BINUNICODE 'ham'
+ # 27: \x94 MEMOIZE
+ # 28: h BINGET 2
+ # 30: e APPENDS (MARK at 13)
+ # 31: . STOP
+ self.assertIn(pickle.BINPUT, pickled)
+ unpickled = pickle.loads(pickled)
+ self.assertEqual(unpickled, ['spam', 'ham', 'ham'])
+ self.assertIs(unpickled[1], unpickled[2])
+
+ pickled2 = pickletools.optimize(pickled)
+ unpickled2 = pickle.loads(pickled2)
+ self.assertEqual(unpickled2, ['spam', 'ham', 'ham'])
+ self.assertIs(unpickled2[1], unpickled2[2])
+ self.assertNotIn(pickle.BINPUT, pickled2)
+
def test_main():
support.run_unittest(OptimizedPickleTests)
diff --git a/Misc/NEWS b/Misc/NEWS
index acb99a4..437753b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -196,6 +196,10 @@ Core and Builtins
Library
-------
+- Issue #19858: pickletools.optimize() is now aware of the MEMOIZE opcode, can
+ produce a more compact result, and no longer produces invalid output if the
+ input data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.
+
- Issue #22095: Fixed HTTPConnection.set_tunnel with default port. The port
value in the host header was set to "None". Patch by Demian Brecht.