diff options
-rw-r--r-- | Lib/pickle.py | 64 | ||||
-rw-r--r-- | Lib/pickletools.py | 109 | ||||
-rw-r--r-- | Lib/test/pickletester.py | 21 | ||||
-rw-r--r-- | Lib/test/test_pickle.py | 12 | ||||
-rw-r--r-- | Lib/test/test_pickletools.py | 2 | ||||
-rw-r--r-- | Misc/NEWS | 12 |
6 files changed, 165 insertions, 55 deletions
diff --git a/Lib/pickle.py b/Lib/pickle.py index e3c112f..ea8d8b5 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -42,19 +42,22 @@ __all__ = ["PickleError", "PicklingError", "UnpicklingError", "Pickler", bytes_types = (bytes, bytearray, memoryview) # These are purely informational; no code uses these. -format_version = "2.0" # File format version we write +format_version = "3.0" # File format version we write compatible_formats = ["1.0", # Original protocol 0 "1.1", # Protocol 0 with INST added "1.2", # Original protocol 1 "1.3", # Protocol 1 with BINFLOAT added "2.0", # Protocol 2 + "3.0", # Protocol 3 ] # Old format versions we can read # This is the highest protocol number we know how to read. -HIGHEST_PROTOCOL = 2 +HIGHEST_PROTOCOL = 3 # The protocol we write by default. May be less than HIGHEST_PROTOCOL. -DEFAULT_PROTOCOL = 2 +# We intentionally write a protocol that Python 2.x cannot read; +# there are too many issues with that. +DEFAULT_PROTOCOL = 3 # Why use struct.pack() for pickling but marshal.loads() for # unpickling? 
struct.pack() is 40% faster than marshal.dumps(), but @@ -161,6 +164,10 @@ LONG4 = b'\x8b' # push really big long _tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3] +# Protocol 3 (Python 3.x) + +BINBYTES = b'B' # push bytes; counted binary string argument +SHORT_BINBYTES = b'C' # " " ; " " " " < 256 bytes __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)]) @@ -494,20 +501,19 @@ class Pickler: self.write(FLOAT + repr(obj).encode("ascii") + b'\n') dispatch[float] = save_float - def save_string(self, obj, pack=struct.pack): - if self.bin: - n = len(obj) - if n < 256: - self.write(SHORT_BINSTRING + bytes([n]) + bytes(obj)) - else: - self.write(BINSTRING + pack("<i", n) + bytes(obj)) + def save_bytes(self, obj, pack=struct.pack): + if self.proto < 3: + self.save_reduce(bytes, (list(obj),)) + return + n = len(obj) + if n < 256: + self.write(SHORT_BINBYTES + bytes([n]) + bytes(obj)) else: - # Strip leading 'b' due to repr() of bytes() returning b'...' - self.write(STRING + repr(obj).lstrip("b").encode("ascii") + b'\n') + self.write(BINBYTES + pack("<i", n) + bytes(obj)) self.memoize(obj) - dispatch[bytes] = save_string + dispatch[bytes] = save_bytes - def save_unicode(self, obj, pack=struct.pack): + def save_str(self, obj, pack=struct.pack): if self.bin: encoded = obj.encode('utf-8') n = len(encoded) @@ -518,7 +524,7 @@ class Pickler: self.write(UNICODE + bytes(obj.encode('raw-unicode-escape')) + b'\n') self.memoize(obj) - dispatch[str] = save_unicode + dispatch[str] = save_str def save_tuple(self, obj): write = self.write @@ -775,7 +781,7 @@ def whichmodule(func, funcname): class Unpickler: - def __init__(self, file): + def __init__(self, file, *, encoding="ASCII", errors="strict"): """This takes a binary file for reading a pickle data stream. 
The protocol version of the pickle is detected automatically, so no @@ -787,10 +793,16 @@ class Unpickler: Thus file-like object can be a binary file object opened for reading, a BytesIO object, or any other custom object that meets this interface. + + Optional keyword arguments are encoding and errors, which are + used to decode 8-bit string instances pickled by Python 2.x. + These default to 'ASCII' and 'strict', respectively. """ self.readline = file.readline self.read = file.read self.memo = {} + self.encoding = encoding + self.errors = errors def load(self): """Read a pickled object representation from the open file. @@ -831,7 +843,7 @@ class Unpickler: def load_proto(self): proto = ord(self.read(1)) - if not 0 <= proto <= 2: + if not 0 <= proto <= HIGHEST_PROTOCOL: raise ValueError("unsupported pickle protocol: %d" % proto) dispatch[PROTO[0]] = load_proto @@ -924,9 +936,16 @@ class Unpickler: def load_binstring(self): len = mloads(b'i' + self.read(4)) - self.append(self.read(len)) + data = self.read(len) + value = str(data, self.encoding, self.errors) + self.append(value) dispatch[BINSTRING[0]] = load_binstring + def load_binbytes(self): + len = mloads(b'i' + self.read(4)) + self.append(self.read(len)) + dispatch[BINBYTES[0]] = load_binbytes + def load_unicode(self): self.append(str(self.readline()[:-1], 'raw-unicode-escape')) dispatch[UNICODE[0]] = load_unicode @@ -938,9 +957,16 @@ class Unpickler: def load_short_binstring(self): len = ord(self.read(1)) - self.append(bytes(self.read(len))) + data = bytes(self.read(len)) + value = str(data, self.encoding, self.errors) + self.append(value) dispatch[SHORT_BINSTRING[0]] = load_short_binstring + def load_short_binbytes(self): + len = ord(self.read(1)) + self.append(bytes(self.read(len))) + dispatch[SHORT_BINBYTES[0]] = load_short_binbytes + def load_tuple(self): k = self.marker() self.stack[k:] = [tuple(self.stack[k+1:])] diff --git a/Lib/pickletools.py b/Lib/pickletools.py index ca09c03..37dad9b 100644 --- 
a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -746,6 +746,11 @@ pyfloat = StackObject( doc="A Python float object.") pystring = StackObject( + name='string', + obtype=bytes, + doc="A Python (8-bit) string object.") + +pybytes = StackObject( name='bytes', obtype=bytes, doc="A Python bytes object.") @@ -753,7 +758,7 @@ pystring = StackObject( pyunicode = StackObject( name='str', obtype=str, - doc="A Python string object.") + doc="A Python (Unicode) string object.") pynone = StackObject( name="None", @@ -868,7 +873,7 @@ class OpcodeInfo(object): assert isinstance(x, StackObject) self.stack_after = stack_after - assert isinstance(proto, int) and 0 <= proto <= 2 + assert isinstance(proto, int) and 0 <= proto <= 3 self.proto = proto assert isinstance(doc, str) @@ -995,7 +1000,9 @@ opcodes = [ The argument is a repr-style string, with bracketing quote characters, and perhaps embedded escapes. The argument extends until the next - newline character. + newline character. (Actually, they are decoded into a str instance + using the encoding given to the Unpickler constructor, or the default, + 'ASCII'.) """), I(name='BINSTRING', @@ -1008,7 +1015,9 @@ opcodes = [ There are two arguments: the first is a 4-byte little-endian signed int giving the number of bytes in the string, and the second is that many - bytes, which are taken literally as the string content. + bytes, which are taken literally as the string content. (Actually, + they are decoded into a str instance using the encoding given to the + Unpickler constructor, or the default, 'ASCII'.) """), I(name='SHORT_BINSTRING', @@ -1021,6 +1030,36 @@ opcodes = [ There are two arguments: the first is a 1-byte unsigned int giving the number of bytes in the string, and the second is that many bytes, + which are taken literally as the string content. (Actually, they + are decoded into a str instance using the encoding given to the + Unpickler constructor, or the default, 'ASCII'.)
+ """), + + # Bytes (protocol 3 only; older protocols don't support bytes at all) + + I(name='BINBYTES', + code='B', + arg=string4, + stack_before=[], + stack_after=[pybytes], + proto=3, + doc="""Push a Python bytes object. + + There are two arguments: the first is a 4-byte little-endian signed int + giving the number of bytes in the string, and the second is that many + bytes, which are taken literally as the bytes content. + """), + + I(name='SHORT_BINBYTES', + code='C', + arg=string1, + stack_before=[], + stack_after=[pybytes], + proto=1, + doc="""Push a Python bytes object. + + There are two arguments: the first is a 1-byte unsigned int giving + the number of bytes in the string, and the second is that many bytes, which are taken literally as the string content. """), @@ -2006,9 +2045,9 @@ class _Example: _dis_test = r""" >>> import pickle ->>> x = [1, 2, (3, 4), {bytes(b'abc'): "def"}] ->>> pkl = pickle.dumps(x, 0) ->>> dis(pkl) +>>> x = [1, 2, (3, 4), {b'abc': "def"}] +>>> pkl0 = pickle.dumps(x, 0) +>>> dis(pkl0) 0: ( MARK 1: l LIST (MARK at 0) 2: p PUT 0 @@ -2025,19 +2064,32 @@ _dis_test = r""" 25: ( MARK 26: d DICT (MARK at 25) 27: p PUT 2 - 30: S STRING 'abc' - 37: p PUT 3 - 40: V UNICODE 'def' - 45: p PUT 4 - 48: s SETITEM - 49: a APPEND - 50: . STOP + 30: c GLOBAL 'builtins bytes' + 46: p PUT 3 + 49: ( MARK + 50: ( MARK + 51: l LIST (MARK at 50) + 52: p PUT 4 + 55: L LONG 97 + 59: a APPEND + 60: L LONG 98 + 64: a APPEND + 65: L LONG 99 + 69: a APPEND + 70: t TUPLE (MARK at 49) + 71: p PUT 5 + 74: R REDUCE + 75: V UNICODE 'def' + 80: p PUT 6 + 83: s SETITEM + 84: a APPEND + 85: . STOP highest protocol among opcodes = 0 Try again with a "binary" pickle. ->>> pkl = pickle.dumps(x, 1) ->>> dis(pkl) +>>> pkl1 = pickle.dumps(x, 1) +>>> dis(pkl1) 0: ] EMPTY_LIST 1: q BINPUT 0 3: ( MARK @@ -2050,13 +2102,24 @@ Try again with a "binary" pickle.
14: q BINPUT 1 16: } EMPTY_DICT 17: q BINPUT 2 - 19: U SHORT_BINSTRING 'abc' - 24: q BINPUT 3 - 26: X BINUNICODE 'def' - 34: q BINPUT 4 - 36: s SETITEM - 37: e APPENDS (MARK at 3) - 38: . STOP + 19: c GLOBAL 'builtins bytes' + 35: q BINPUT 3 + 37: ( MARK + 38: ] EMPTY_LIST + 39: q BINPUT 4 + 41: ( MARK + 42: K BININT1 97 + 44: K BININT1 98 + 46: K BININT1 99 + 48: e APPENDS (MARK at 41) + 49: t TUPLE (MARK at 37) + 50: q BINPUT 5 + 52: R REDUCE + 53: X BINUNICODE 'def' + 61: q BINPUT 6 + 63: s SETITEM + 64: e APPENDS (MARK at 3) + 65: . STOP highest protocol among opcodes = 1 Exercise the INST/OBJ/BUILD family. diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 5ac2bdc..0230f3c 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -490,6 +490,12 @@ class AbstractPickleTests(unittest.TestCase): u2 = self.loads(p) self.assertEqual(u2, u) + def test_bytes(self): + for proto in protocols: + for u in b'', b'xyz', b'xyz'*100: + p = self.dumps(u) + self.assertEqual(self.loads(p), u) + def test_ints(self): import sys for proto in protocols: @@ -532,8 +538,8 @@ class AbstractPickleTests(unittest.TestCase): @run_with_locale('LC_ALL', 'de_DE', 'fr_FR') def test_float_format(self): - # make sure that floats are formatted locale independent - self.assertEqual(self.dumps(1.2)[0:3], b'F1.') + # make sure that floats are formatted locale independent with proto 0 + self.assertEqual(self.dumps(1.2, 0)[0:3], b'F1.') def test_reduce(self): pass @@ -624,6 +630,12 @@ class AbstractPickleTests(unittest.TestCase): (2, 2): pickle.TUPLE2, (2, 3): pickle.TUPLE3, (2, 4): pickle.TUPLE, + + (3, 0): pickle.EMPTY_TUPLE, + (3, 1): pickle.TUPLE1, + (3, 2): pickle.TUPLE2, + (3, 3): pickle.TUPLE3, + (3, 4): pickle.TUPLE, } a = () b = (1,) @@ -643,14 +655,17 @@ class AbstractPickleTests(unittest.TestCase): expected_opcode = {(0, None): pickle.NONE, (1, None): pickle.NONE, (2, None): pickle.NONE, + (3, None): pickle.NONE, (0, True): pickle.INT, (1, True): 
pickle.INT, (2, True): pickle.NEWTRUE, + (3, True): pickle.NEWTRUE, (0, False): pickle.INT, (1, False): pickle.INT, (2, False): pickle.NEWFALSE, + (3, False): pickle.NEWFALSE, } for proto in protocols: for x in None, False, True: @@ -955,7 +970,7 @@ class AbstractPickleModuleTests(unittest.TestCase): def test_highest_protocol(self): # Of course this needs to be changed when HIGHEST_PROTOCOL changes. - self.assertEqual(self.module.HIGHEST_PROTOCOL, 2) + self.assertEqual(self.module.HIGHEST_PROTOCOL, 3) def test_callapi(self): from io import BytesIO diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index aa09a6a..67d83c7c 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -12,23 +12,19 @@ class PickleTests(AbstractPickleTests, AbstractPickleModuleTests): module = pickle error = KeyError - def dumps(self, arg, proto=0, fast=0): - # Ignore fast + def dumps(self, arg, proto=None): return pickle.dumps(arg, proto) def loads(self, buf): - # Ignore fast return pickle.loads(buf) class PicklerTests(AbstractPickleTests): error = KeyError - def dumps(self, arg, proto=0, fast=0): + def dumps(self, arg, proto=None): f = io.BytesIO() p = pickle.Pickler(f, proto) - if fast: - p.fast = fast p.dump(arg) f.seek(0) return bytes(f.read()) @@ -40,14 +36,12 @@ class PicklerTests(AbstractPickleTests): class PersPicklerTests(AbstractPersistentPicklerTests): - def dumps(self, arg, proto=0, fast=0): + def dumps(self, arg, proto=None): class PersPickler(pickle.Pickler): def persistent_id(subself, obj): return self.persistent_id(obj) f = io.BytesIO() p = PersPickler(f, proto) - if fast: - p.fast = fast p.dump(arg) f.seek(0) return f.read() diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index 3e5b35a..932dcd1 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -6,7 +6,7 @@ from test.pickletester import AbstractPickleModuleTests class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests): - def 
dumps(self, arg, proto=0, fast=0): + def dumps(self, arg, proto=None): return pickletools.optimize(pickle.dumps(arg, proto)) def loads(self, buf): @@ -23,6 +23,18 @@ Extension Modules Library ------- +- A new pickle protocol (protocol 3) is added with explicit support + for bytes. This is the default protocol. It intentionally cannot + be unpickled by Python 2.x. + +- When a pickle written by Python 2.x contains an (8-bit) str + instance, this is now decoded to a (Unicode) str instance. The + encoding used to do this defaults to ASCII, but can be overridden + via two new keyword arguments to the Unpickler class. Previously + this would create bytes instances, which is usually wrong: str + instances are often used to pickle attribute names etc., and text is + more common than binary data anyway. + - Default to ASCII as the locale.getpreferredencoding, if the POSIX system doesn't support CODESET and LANG isn't set or doesn't allow deduction of an encoding. |