summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTim Peters <tim.peters@gmail.com>2003-01-29 20:12:21 (GMT)
committerTim Peters <tim.peters@gmail.com>2003-01-29 20:12:21 (GMT)
commitc1c2b3e0e2af6eb05346b4908f4c4d56618fb7b6 (patch)
treea966c28559109bf571bc8d82409ad8af4cbfea67
parent5d9113d8be81596bc93f2b1a37f57e5110d39a77 (diff)
downloadcpython-c1c2b3e0e2af6eb05346b4908f4c4d56618fb7b6.zip
cpython-c1c2b3e0e2af6eb05346b4908f4c4d56618fb7b6.tar.gz
cpython-c1c2b3e0e2af6eb05346b4908f4c4d56618fb7b6.tar.bz2
dis(): This had a problem with proto 0 pickles, in that POP sometimes
popped a MARK, but without stack emulation the disassembler couldn't know that, and subsequent indentation got hosed. Now the disassembler does do enough stack emulation to catch this. While I was at it, also added lots of sanity checks for other stack operations, and correct use of the memo. This goes (I think) a long way toward being a "pickle verifier" now too.
-rw-r--r--Lib/pickletools.py130
1 files changed, 113 insertions, 17 deletions
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index dd88a0e..31c62b6 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -13,10 +13,11 @@ dis(pickle, out=None, indentlevel=4)
# Other ideas:
#
# - A pickle verifier: read a pickle and check it exhaustively for
-# well-formedness.
+# well-formedness. dis() does a lot of this already.
#
# - A protocol identifier: examine a pickle and return its protocol number
# (== the highest .proto attr value among all the opcodes in the pickle).
+# dis() already prints this info at the end.
#
# - A pickle optimizer: for example, tuple-building code is sometimes more
# elaborate than necessary, catering for the possibility that the tuple
@@ -712,6 +713,9 @@ class StackObject(object):
assert isinstance(doc, str)
self.doc = doc
+ def __repr__(self):
+ return self.name
+
pyint = StackObject(
name='int',
@@ -1858,10 +1862,33 @@ def dis(pickle, out=None, indentlevel=4):
Optional arg indentlevel is the number of blanks by which to indent
a new MARK level. It defaults to 4.
+
+ In addition to printing the disassembly, some sanity checks are made:
+
+ + All embedded opcode arguments "make sense".
+
+ + Explicit and implicit pop operations have enough items on the stack.
+
+ + When an opcode implicitly refers to a markobject, a markobject is
+ actually on the stack.
+
+ + A memo entry isn't referenced before it's defined.
+
+ + The markobject isn't stored in the memo.
+
+ + A memo entry isn't redefined.
"""
- markstack = []
+ # Most of the hair here is for sanity checks, but most of it is needed
+ # anyway to detect when a protocol 0 POP takes a MARK off the stack
+ # (which in turn is needed to indent MARK blocks correctly).
+
+ stack = [] # crude emulation of unpickler stack
+ memo = {} # crude emulation of unpicker memo
+ maxproto = -1 # max protocol number seen
+ markstack = [] # bytecode positions of MARK opcodes
indentchunk = ' ' * indentlevel
+ errormsg = None
for opcode, arg, pos in genops(pickle):
if pos is not None:
print >> out, "%5d:" % pos,
@@ -1870,12 +1897,54 @@ def dis(pickle, out=None, indentlevel=4):
indentchunk * len(markstack),
opcode.name)
+ maxproto = max(maxproto, opcode.proto)
+
+ # See whether a MARK should be popped.
+ before = opcode.stack_before # don't mutate
+ after = opcode.stack_after # don't mutate
markmsg = None
- if markstack and markobject in opcode.stack_before:
- assert markobject not in opcode.stack_after
- markpos = markstack.pop()
- if markpos is not None:
- markmsg = "(MARK at %d)" % markpos
+ if markobject in before or (opcode.name == "POP" and
+ stack and
+ stack[-1] is markobject):
+ assert markobject not in after
+ if markstack:
+ markpos = markstack.pop()
+ if markpos is None:
+ markmsg = "(MARK at unknown opcode offset)"
+ else:
+ markmsg = "(MARK at %d)" % markpos
+ # Pop everything at and after the topmost markobject.
+ while stack[-1] is not markobject:
+ stack.pop()
+ stack.pop()
+ # Remove markobject stuff from stack_before.
+ try:
+ i = before.index(markobject)
+ before = before[:i]
+ except ValueError:
+ assert opcode.name == "POP"
+ assert len(before) == 1
+ before = [] # stop code later from popping again
+ else:
+ errormsg = markmsg = "no MARK exists on stack"
+
+ # Check for correct memo usage.
+ if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT"):
+ if arg in memo:
+ errormsg = "memo key %r already defined" % arg
+ elif not stack:
+ errormsg = "stack is empty -- can't store into memo"
+ elif stack[-1] is markobject:
+ errormsg = "can't store markobject in the memo"
+ else:
+ memo[arg] = stack[-1]
+
+ elif opcode.name in ("GET", "BINGET", "LONG_BINGET"):
+ if arg in memo:
+ assert len(after) == 1
+ after = [memo[arg]] # for better stack emulation
+ else:
+ errormsg = "memo key %r has never been stored into" % arg
if arg is not None or markmsg:
# make a mild effort to align arguments
@@ -1886,10 +1955,27 @@ def dis(pickle, out=None, indentlevel=4):
line += ' ' + markmsg
print >> out, line
- if markobject in opcode.stack_after:
+ if errormsg:
+ # Note that we delayed complaining until the offending opcode
+ # was printed.
+ raise ValueError(errormsg)
+
+ # Emulate the stack effects.
+ n = len(before)
+ if len(stack) < n:
+ raise ValueError("tried to pop %d items from stack with "
+ "only %d items" % (n, len(stack)))
+ if n:
+ del stack[-n:]
+ if markobject in after:
assert markobject not in opcode.stack_before
markstack.append(pos)
+ stack.extend(after)
+
+ print >> out, "highest protocol among opcodes =", maxproto
+ if stack:
+ raise ValueError("stack not empty after STOP: %r" % stack)
_dis_test = r"""
>>> import pickle
@@ -1919,6 +2005,7 @@ _dis_test = r"""
48: s SETITEM
49: a APPEND
50: . STOP
+highest protocol among opcodes = 0
Try again with a "binary" pickle.
@@ -1943,6 +2030,7 @@ Try again with a "binary" pickle.
36: s SETITEM
37: e APPENDS (MARK at 3)
38: . STOP
+highest protocol among opcodes = 1
Exercise the INST/OBJ/BUILD family.
@@ -1951,6 +2039,7 @@ Exercise the INST/OBJ/BUILD family.
0: c GLOBAL 'random random'
15: p PUT 0
18: . STOP
+highest protocol among opcodes = 0
>>> x = [pickle.PicklingError()] * 2
>>> dis(pickle.dumps(x, 0))
@@ -1973,6 +2062,7 @@ Exercise the INST/OBJ/BUILD family.
52: g GET 1
55: a APPEND
56: . STOP
+highest protocol among opcodes = 0
>>> dis(pickle.dumps(x, 1))
0: ] EMPTY_LIST
@@ -1993,6 +2083,7 @@ Exercise the INST/OBJ/BUILD family.
46: h BINGET 2
48: e APPENDS (MARK at 3)
49: . STOP
+highest protocol among opcodes = 1
Try "the canonical" recursive-object test.
@@ -2017,6 +2108,8 @@ True
10: p PUT 1
13: a APPEND
14: . STOP
+highest protocol among opcodes = 0
+
>>> dis(pickle.dumps(L, 1))
0: ] EMPTY_LIST
1: q BINPUT 0
@@ -2026,13 +2119,11 @@ True
7: q BINPUT 1
9: a APPEND
10: . STOP
+highest protocol among opcodes = 1
-The protocol 0 pickle of the tuple causes the disassembly to get confused,
-as it doesn't realize that the POP opcode at 16 gets rid of the MARK at 0
-(so the output remains indented until the end). The protocol 1 pickle
-doesn't trigger this glitch, because the disassembler realizes that
-POP_MARK gets rid of the MARK. Doing a better job on the protocol 0
-pickle would require the disassembler to emulate the stack.
+Note that, in the protocol 0 pickle of the recursive tuple, the disassembler
+has to emulate the stack in order to realize that the POP opcode at 16 gets
+rid of the MARK at 0.
>>> dis(pickle.dumps(T, 0))
0: ( MARK
@@ -2045,9 +2136,11 @@ pickle would require the disassembler to emulate the stack.
11: p PUT 1
14: a APPEND
15: 0 POP
- 16: 0 POP
- 17: g GET 1
- 20: . STOP
+ 16: 0 POP (MARK at 0)
+ 17: g GET 1
+ 20: . STOP
+highest protocol among opcodes = 0
+
>>> dis(pickle.dumps(T, 1))
0: ( MARK
1: ] EMPTY_LIST
@@ -2060,6 +2153,7 @@ pickle would require the disassembler to emulate the stack.
11: 1 POP_MARK (MARK at 0)
12: h BINGET 1
14: . STOP
+highest protocol among opcodes = 1
Try protocol 2.
@@ -2072,6 +2166,7 @@ Try protocol 2.
8: q BINPUT 1
10: a APPEND
11: . STOP
+highest protocol among opcodes = 2
>>> dis(pickle.dumps(T, 2))
0: \x80 PROTO 2
@@ -2084,6 +2179,7 @@ Try protocol 2.
11: 0 POP
12: h BINGET 1
14: . STOP
+highest protocol among opcodes = 2
"""
__test__ = {'disassembler_test': _dis_test,