From f41698169198b32eecd60337a9437ea8c1714380 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 17 Mar 2008 22:56:06 +0000 Subject: - A new pickle protocol (protocol 3) is added with explicit support for bytes. This is the default protocol. It intentionally cannot be unpickled by Python 2.x. - When a pickle written by Python 2.x contains an (8-bit) str instance, this is now decoded to a (Unicode) str instance. The encoding used to do this defaults to ASCII, but can be overridden via two new keyword arguments to the Unpickler class. Previously this would create bytes instances, which is usually wrong: str instances are often used to pickle attribute names etc., and text is more common than binary data anyway. --- Lib/pickle.py | 64 +++++++++++++++++-------- Lib/pickletools.py | 109 ++++++++++++++++++++++++++++++++++--------- Lib/test/pickletester.py | 21 +++++++-- Lib/test/test_pickle.py | 12 ++--- Lib/test/test_pickletools.py | 2 +- Misc/NEWS | 12 +++++ 6 files changed, 165 insertions(+), 55 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index e3c112f..ea8d8b5 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -42,19 +42,22 @@ __all__ = ["PickleError", "PicklingError", "UnpicklingError", "Pickler", bytes_types = (bytes, bytearray, memoryview) # These are purely informational; no code uses these. -format_version = "2.0" # File format version we write +format_version = "3.0" # File format version we write compatible_formats = ["1.0", # Original protocol 0 "1.1", # Protocol 0 with INST added "1.2", # Original protocol 1 "1.3", # Protocol 1 with BINFLOAT added "2.0", # Protocol 2 + "3.0", # Protocol 3 ] # Old format versions we can read # This is the highest protocol number we know how to read. -HIGHEST_PROTOCOL = 2 +HIGHEST_PROTOCOL = 3 # The protocol we write by default. May be less than HIGHEST_PROTOCOL. -DEFAULT_PROTOCOL = 2 +# We intentionally write a protocol that Python 2.x cannot read; +# there are too many issues with that. +DEFAULT_PROTOCOL = 3 # Why use struct.pack() for pickling but marshal.loads() for # unpickling? struct.pack() is 40% faster than marshal.dumps(), but @@ -161,6 +164,10 @@ LONG4 = b'\x8b' # push really big long _tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3] +# Protocol 3 (Python 3.x) + +BINBYTES = b'B' # push bytes; counted binary string argument +SHORT_BINBYTES = b'C' # " " ; " " " " < 256 bytes __all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)]) @@ -494,20 +501,19 @@ class Pickler: self.write(FLOAT + repr(obj).encode("ascii") + b'\n') dispatch[float] = save_float - def save_string(self, obj, pack=struct.pack): - if self.bin: - n = len(obj) - if n < 256: - self.write(SHORT_BINSTRING + bytes([n]) + bytes(obj)) - else: - self.write(BINSTRING + pack(">> import pickle ->>> x = [1, 2, (3, 4), {bytes(b'abc'): "def"}] ->>> pkl = pickle.dumps(x, 0) ->>> dis(pkl) +>>> x = [1, 2, (3, 4), {b'abc': "def"}] +>>> pkl0 = pickle.dumps(x, 0) +>>> dis(pkl0) 0: ( MARK 1: l LIST (MARK at 0) 2: p PUT 0 @@ -2025,19 +2064,32 @@ _dis_test = r""" 25: ( MARK 26: d DICT (MARK at 25) 27: p PUT 2 - 30: S STRING 'abc' - 37: p PUT 3 - 40: V UNICODE 'def' - 45: p PUT 4 - 48: s SETITEM - 49: a APPEND - 50: . STOP + 30: c GLOBAL 'builtins bytes' + 46: p PUT 3 + 49: ( MARK + 50: ( MARK + 51: l LIST (MARK at 50) + 52: p PUT 4 + 55: L LONG 97 + 59: a APPEND + 60: L LONG 98 + 64: a APPEND + 65: L LONG 99 + 69: a APPEND + 70: t TUPLE (MARK at 49) + 71: p PUT 5 + 74: R REDUCE + 75: V UNICODE 'def' + 80: p PUT 6 + 83: s SETITEM + 84: a APPEND + 85: . STOP highest protocol among opcodes = 0 Try again with a "binary" pickle. ->>> pkl = pickle.dumps(x, 1) ->>> dis(pkl) +>>> pkl1 = pickle.dumps(x, 1) +>>> dis(pkl1) 0: ] EMPTY_LIST 1: q BINPUT 0 3: ( MARK @@ -2050,13 +2102,24 @@ Try again with a "binary" pickle. 14: q BINPUT 1 16: } EMPTY_DICT 17: q BINPUT 2 - 19: U SHORT_BINSTRING 'abc' - 24: q BINPUT 3 - 26: X BINUNICODE 'def' - 34: q BINPUT 4 - 36: s SETITEM - 37: e APPENDS (MARK at 3) - 38: . STOP + 19: c GLOBAL 'builtins bytes' + 35: q BINPUT 3 + 37: ( MARK + 38: ] EMPTY_LIST + 39: q BINPUT 4 + 41: ( MARK + 42: K BININT1 97 + 44: K BININT1 98 + 46: K BININT1 99 + 48: e APPENDS (MARK at 41) + 49: t TUPLE (MARK at 37) + 50: q BINPUT 5 + 52: R REDUCE + 53: X BINUNICODE 'def' + 61: q BINPUT 6 + 63: s SETITEM + 64: e APPENDS (MARK at 3) + 65: . STOP highest protocol among opcodes = 1 Exercise the INST/OBJ/BUILD family. diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 5ac2bdc..0230f3c 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -490,6 +490,12 @@ class AbstractPickleTests(unittest.TestCase): u2 = self.loads(p) self.assertEqual(u2, u) + def test_bytes(self): + for proto in protocols: + for u in b'', b'xyz', b'xyz'*100: + p = self.dumps(u) + self.assertEqual(self.loads(p), u) + def test_ints(self): import sys for proto in protocols: @@ -532,8 +538,8 @@ class AbstractPickleTests(unittest.TestCase): @run_with_locale('LC_ALL', 'de_DE', 'fr_FR') def test_float_format(self): - # make sure that floats are formatted locale independent - self.assertEqual(self.dumps(1.2)[0:3], b'F1.') + # make sure that floats are formatted locale independent with proto 0 + self.assertEqual(self.dumps(1.2, 0)[0:3], b'F1.') def test_reduce(self): pass @@ -624,6 +630,12 @@ class AbstractPickleTests(unittest.TestCase): (2, 2): pickle.TUPLE2, (2, 3): pickle.TUPLE3, (2, 4): pickle.TUPLE, + + (3, 0): pickle.EMPTY_TUPLE, + (3, 1): pickle.TUPLE1, + (3, 2): pickle.TUPLE2, + (3, 3): pickle.TUPLE3, + (3, 4): pickle.TUPLE, } a = () b = (1,) @@ -643,14 +655,17 @@ class AbstractPickleTests(unittest.TestCase): expected_opcode = {(0, None): pickle.NONE, (1, None): pickle.NONE, (2, None): pickle.NONE, + (3, None): pickle.NONE, (0, True): pickle.INT, (1, True): pickle.INT, (2, True): pickle.NEWTRUE, + (3, True): pickle.NEWTRUE, (0, False): pickle.INT, (1, False): pickle.INT, (2, False): pickle.NEWFALSE, + (3, False): pickle.NEWFALSE, } for proto in protocols: for x in None, False, True: @@ -955,7 +970,7 @@ class AbstractPickleModuleTests(unittest.TestCase): def test_highest_protocol(self): # Of course this needs to be changed when HIGHEST_PROTOCOL changes. - self.assertEqual(self.module.HIGHEST_PROTOCOL, 2) + self.assertEqual(self.module.HIGHEST_PROTOCOL, 3) def test_callapi(self): from io import BytesIO diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index aa09a6a..67d83c7c 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -12,23 +12,19 @@ class PickleTests(AbstractPickleTests, AbstractPickleModuleTests): module = pickle error = KeyError - def dumps(self, arg, proto=0, fast=0): - # Ignore fast + def dumps(self, arg, proto=None): return pickle.dumps(arg, proto) def loads(self, buf): - # Ignore fast return pickle.loads(buf) class PicklerTests(AbstractPickleTests): error = KeyError - def dumps(self, arg, proto=0, fast=0): + def dumps(self, arg, proto=None): f = io.BytesIO() p = pickle.Pickler(f, proto) - if fast: - p.fast = fast p.dump(arg) f.seek(0) return bytes(f.read()) @@ -40,14 +36,12 @@ class PicklerTests(AbstractPickleTests): class PersPicklerTests(AbstractPersistentPicklerTests): - def dumps(self, arg, proto=0, fast=0): + def dumps(self, arg, proto=None): class PersPickler(pickle.Pickler): def persistent_id(subself, obj): return self.persistent_id(obj) f = io.BytesIO() p = PersPickler(f, proto) - if fast: - p.fast = fast p.dump(arg) f.seek(0) return f.read() diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index 3e5b35a..932dcd1 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -6,7 +6,7 @@ from test.pickletester import AbstractPickleModuleTests class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests): - def dumps(self, arg, proto=0, fast=0): + def dumps(self, arg, proto=None): return pickletools.optimize(pickle.dumps(arg, proto)) def loads(self, buf): diff --git a/Misc/NEWS b/Misc/NEWS index 65643bf..db686cd 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -23,6 +23,18 @@ Extension Modules Library ------- +- A new pickle protocol (protocol 3) is added with explicit support + for bytes. This is the default protocol. It intentionally cannot + be unpickled by Python 2.x. + +- When a pickle written by Python 2.x contains an (8-bit) str + instance, this is now decoded to a (Unicode) str instance. The + encoding used to do this defaults to ASCII, but can be overridden + via two new keyword arguments to the Unpickler class. Previously + this would create bytes instances, which is usually wrong: str + instances are often used to pickle attribute names etc., and text is + more common than binary data anyway. + - Default to ASCII as the locale.getpreferredencoding, if the POSIX system doesn't support CODESET and LANG isn't set or doesn't allow deduction of an encoding. -- cgit v0.12