summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Lib/pickle.py64
-rw-r--r--Lib/pickletools.py109
-rw-r--r--Lib/test/pickletester.py21
-rw-r--r--Lib/test/test_pickle.py12
-rw-r--r--Lib/test/test_pickletools.py2
-rw-r--r--Misc/NEWS12
6 files changed, 165 insertions, 55 deletions
diff --git a/Lib/pickle.py b/Lib/pickle.py
index e3c112f..ea8d8b5 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -42,19 +42,22 @@ __all__ = ["PickleError", "PicklingError", "UnpicklingError", "Pickler",
bytes_types = (bytes, bytearray, memoryview)
# These are purely informational; no code uses these.
-format_version = "2.0" # File format version we write
+format_version = "3.0" # File format version we write
compatible_formats = ["1.0", # Original protocol 0
"1.1", # Protocol 0 with INST added
"1.2", # Original protocol 1
"1.3", # Protocol 1 with BINFLOAT added
"2.0", # Protocol 2
+ "3.0", # Protocol 3
] # Old format versions we can read
# This is the highest protocol number we know how to read.
-HIGHEST_PROTOCOL = 2
+HIGHEST_PROTOCOL = 3
# The protocol we write by default. May be less than HIGHEST_PROTOCOL.
-DEFAULT_PROTOCOL = 2
+# We intentionally write a protocol that Python 2.x cannot read;
+# there are too many issues with that.
+DEFAULT_PROTOCOL = 3
# Why use struct.pack() for pickling but marshal.loads() for
# unpickling? struct.pack() is 40% faster than marshal.dumps(), but
@@ -161,6 +164,10 @@ LONG4 = b'\x8b' # push really big long
_tuplesize2code = [EMPTY_TUPLE, TUPLE1, TUPLE2, TUPLE3]
+# Protocol 3 (Python 3.x)
+
+BINBYTES = b'B' # push bytes; counted binary string argument
+SHORT_BINBYTES = b'C' # " " ; " " " " < 256 bytes
__all__.extend([x for x in dir() if re.match("[A-Z][A-Z0-9_]+$",x)])
@@ -494,20 +501,19 @@ class Pickler:
self.write(FLOAT + repr(obj).encode("ascii") + b'\n')
dispatch[float] = save_float
- def save_string(self, obj, pack=struct.pack):
- if self.bin:
- n = len(obj)
- if n < 256:
- self.write(SHORT_BINSTRING + bytes([n]) + bytes(obj))
- else:
- self.write(BINSTRING + pack("<i", n) + bytes(obj))
+ def save_bytes(self, obj, pack=struct.pack):
+ if self.proto < 3:
+ self.save_reduce(bytes, (list(obj),))
+ return
+ n = len(obj)
+ if n < 256:
+ self.write(SHORT_BINBYTES + bytes([n]) + bytes(obj))
else:
- # Strip leading 'b' due to repr() of bytes() returning b'...'
- self.write(STRING + repr(obj).lstrip("b").encode("ascii") + b'\n')
+ self.write(BINBYTES + pack("<i", n) + bytes(obj))
self.memoize(obj)
- dispatch[bytes] = save_string
+ dispatch[bytes] = save_bytes
- def save_unicode(self, obj, pack=struct.pack):
+ def save_str(self, obj, pack=struct.pack):
if self.bin:
encoded = obj.encode('utf-8')
n = len(encoded)
@@ -518,7 +524,7 @@ class Pickler:
self.write(UNICODE + bytes(obj.encode('raw-unicode-escape')) +
b'\n')
self.memoize(obj)
- dispatch[str] = save_unicode
+ dispatch[str] = save_str
def save_tuple(self, obj):
write = self.write
@@ -775,7 +781,7 @@ def whichmodule(func, funcname):
class Unpickler:
- def __init__(self, file):
+ def __init__(self, file, *, encoding="ASCII", errors="strict"):
"""This takes a binary file for reading a pickle data stream.
The protocol version of the pickle is detected automatically, so no
@@ -787,10 +793,16 @@ class Unpickler:
Thus file-like object can be a binary file object opened for
reading, a BytesIO object, or any other custom object that
meets this interface.
+
+ Optional keyword arguments are encoding and errors, which are
+ used to decode 8-bit string instances pickled by Python 2.x.
+ These default to 'ASCII' and 'strict', respectively.
"""
self.readline = file.readline
self.read = file.read
self.memo = {}
+ self.encoding = encoding
+ self.errors = errors
def load(self):
"""Read a pickled object representation from the open file.
@@ -831,7 +843,7 @@ class Unpickler:
def load_proto(self):
proto = ord(self.read(1))
- if not 0 <= proto <= 2:
+ if not 0 <= proto <= HIGHEST_PROTOCOL:
raise ValueError("unsupported pickle protocol: %d" % proto)
dispatch[PROTO[0]] = load_proto
@@ -924,9 +936,16 @@ class Unpickler:
def load_binstring(self):
len = mloads(b'i' + self.read(4))
- self.append(self.read(len))
+ data = self.read(len)
+ value = str(data, self.encoding, self.errors)
+ self.append(value)
dispatch[BINSTRING[0]] = load_binstring
+ def load_binbytes(self):
+ len = mloads(b'i' + self.read(4))
+ self.append(self.read(len))
+ dispatch[BINBYTES[0]] = load_binbytes
+
def load_unicode(self):
self.append(str(self.readline()[:-1], 'raw-unicode-escape'))
dispatch[UNICODE[0]] = load_unicode
@@ -938,9 +957,16 @@ class Unpickler:
def load_short_binstring(self):
len = ord(self.read(1))
- self.append(bytes(self.read(len)))
+ data = bytes(self.read(len))
+ value = str(data, self.encoding, self.errors)
+ self.append(value)
dispatch[SHORT_BINSTRING[0]] = load_short_binstring
+ def load_short_binbytes(self):
+ len = ord(self.read(1))
+ self.append(bytes(self.read(len)))
+ dispatch[SHORT_BINBYTES[0]] = load_short_binbytes
+
def load_tuple(self):
k = self.marker()
self.stack[k:] = [tuple(self.stack[k+1:])]
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index ca09c03..37dad9b 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -746,6 +746,11 @@ pyfloat = StackObject(
doc="A Python float object.")
pystring = StackObject(
+ name='string',
+ obtype=bytes,
+ doc="A Python (8-bit) string object.")
+
+pybytes = StackObject(
name='bytes',
obtype=bytes,
doc="A Python bytes object.")
@@ -753,7 +758,7 @@ pystring = StackObject(
pyunicode = StackObject(
name='str',
obtype=str,
- doc="A Python string object.")
+ doc="A Python (Unicode) string object.")
pynone = StackObject(
name="None",
@@ -868,7 +873,7 @@ class OpcodeInfo(object):
assert isinstance(x, StackObject)
self.stack_after = stack_after
- assert isinstance(proto, int) and 0 <= proto <= 2
+ assert isinstance(proto, int) and 0 <= proto <= 3
self.proto = proto
assert isinstance(doc, str)
@@ -995,7 +1000,9 @@ opcodes = [
The argument is a repr-style string, with bracketing quote characters,
and perhaps embedded escapes. The argument extends until the next
- newline character.
+ newline character. (Actually, they are decoded into a str instance
+ using the encoding given to the Unpickler constructor, or the default,
+ 'ASCII'.)
"""),
I(name='BINSTRING',
@@ -1008,7 +1015,9 @@ opcodes = [
There are two arguments: the first is a 4-byte little-endian signed int
giving the number of bytes in the string, and the second is that many
- bytes, which are taken literally as the string content.
+ bytes, which are taken literally as the string content. (Actually,
+ they are decoded into a str instance using the encoding given to the
+ Unpickler constructor, or the default, 'ASCII'.)
"""),
I(name='SHORT_BINSTRING',
@@ -1021,6 +1030,36 @@ opcodes = [
There are two arguments: the first is a 1-byte unsigned int giving
the number of bytes in the string, and the second is that many bytes,
+ which are taken literally as the string content. (Actually, they
+ are decoded into a str instance using the encoding given to the
+ Unpickler constructor, or the default, 'ASCII'.)
+ """),
+
+ # Bytes (protocol 3 only; older protocols don't support bytes at all)
+
+ I(name='BINBYTES',
+ code='B',
+ arg=string4,
+ stack_before=[],
+ stack_after=[pybytes],
+ proto=3,
+ doc="""Push a Python bytes object.
+
+ There are two arguments: the first is a 4-byte little-endian signed int
+ giving the number of bytes in the string, and the second is that many
+ bytes, which are taken literally as the bytes content.
+ """),
+
+ I(name='SHORT_BINBYTES',
+ code='C',
+ arg=string1,
+ stack_before=[],
+ stack_after=[pybytes],
+ proto=3,
+ doc="""Push a Python bytes object.
+
+ There are two arguments: the first is a 1-byte unsigned int giving
+ the number of bytes in the string, and the second is that many bytes,
which are taken literally as the string content.
"""),
@@ -2006,9 +2045,9 @@ class _Example:
_dis_test = r"""
>>> import pickle
->>> x = [1, 2, (3, 4), {bytes(b'abc'): "def"}]
->>> pkl = pickle.dumps(x, 0)
->>> dis(pkl)
+>>> x = [1, 2, (3, 4), {b'abc': "def"}]
+>>> pkl0 = pickle.dumps(x, 0)
+>>> dis(pkl0)
0: ( MARK
1: l LIST (MARK at 0)
2: p PUT 0
@@ -2025,19 +2064,32 @@ _dis_test = r"""
25: ( MARK
26: d DICT (MARK at 25)
27: p PUT 2
- 30: S STRING 'abc'
- 37: p PUT 3
- 40: V UNICODE 'def'
- 45: p PUT 4
- 48: s SETITEM
- 49: a APPEND
- 50: . STOP
+ 30: c GLOBAL 'builtins bytes'
+ 46: p PUT 3
+ 49: ( MARK
+ 50: ( MARK
+ 51: l LIST (MARK at 50)
+ 52: p PUT 4
+ 55: L LONG 97
+ 59: a APPEND
+ 60: L LONG 98
+ 64: a APPEND
+ 65: L LONG 99
+ 69: a APPEND
+ 70: t TUPLE (MARK at 49)
+ 71: p PUT 5
+ 74: R REDUCE
+ 75: V UNICODE 'def'
+ 80: p PUT 6
+ 83: s SETITEM
+ 84: a APPEND
+ 85: . STOP
highest protocol among opcodes = 0
Try again with a "binary" pickle.
->>> pkl = pickle.dumps(x, 1)
->>> dis(pkl)
+>>> pkl1 = pickle.dumps(x, 1)
+>>> dis(pkl1)
0: ] EMPTY_LIST
1: q BINPUT 0
3: ( MARK
@@ -2050,13 +2102,24 @@ Try again with a "binary" pickle.
14: q BINPUT 1
16: } EMPTY_DICT
17: q BINPUT 2
- 19: U SHORT_BINSTRING 'abc'
- 24: q BINPUT 3
- 26: X BINUNICODE 'def'
- 34: q BINPUT 4
- 36: s SETITEM
- 37: e APPENDS (MARK at 3)
- 38: . STOP
+ 19: c GLOBAL 'builtins bytes'
+ 35: q BINPUT 3
+ 37: ( MARK
+ 38: ] EMPTY_LIST
+ 39: q BINPUT 4
+ 41: ( MARK
+ 42: K BININT1 97
+ 44: K BININT1 98
+ 46: K BININT1 99
+ 48: e APPENDS (MARK at 41)
+ 49: t TUPLE (MARK at 37)
+ 50: q BINPUT 5
+ 52: R REDUCE
+ 53: X BINUNICODE 'def'
+ 61: q BINPUT 6
+ 63: s SETITEM
+ 64: e APPENDS (MARK at 3)
+ 65: . STOP
highest protocol among opcodes = 1
Exercise the INST/OBJ/BUILD family.
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 5ac2bdc..0230f3c 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -490,6 +490,12 @@ class AbstractPickleTests(unittest.TestCase):
u2 = self.loads(p)
self.assertEqual(u2, u)
+ def test_bytes(self):
+ for proto in protocols:
+ for u in b'', b'xyz', b'xyz'*100:
+ p = self.dumps(u, proto)
+ self.assertEqual(self.loads(p), u)
+
def test_ints(self):
import sys
for proto in protocols:
@@ -532,8 +538,8 @@ class AbstractPickleTests(unittest.TestCase):
@run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
def test_float_format(self):
- # make sure that floats are formatted locale independent
- self.assertEqual(self.dumps(1.2)[0:3], b'F1.')
+ # make sure that floats are formatted locale independent with proto 0
+ self.assertEqual(self.dumps(1.2, 0)[0:3], b'F1.')
def test_reduce(self):
pass
@@ -624,6 +630,12 @@ class AbstractPickleTests(unittest.TestCase):
(2, 2): pickle.TUPLE2,
(2, 3): pickle.TUPLE3,
(2, 4): pickle.TUPLE,
+
+ (3, 0): pickle.EMPTY_TUPLE,
+ (3, 1): pickle.TUPLE1,
+ (3, 2): pickle.TUPLE2,
+ (3, 3): pickle.TUPLE3,
+ (3, 4): pickle.TUPLE,
}
a = ()
b = (1,)
@@ -643,14 +655,17 @@ class AbstractPickleTests(unittest.TestCase):
expected_opcode = {(0, None): pickle.NONE,
(1, None): pickle.NONE,
(2, None): pickle.NONE,
+ (3, None): pickle.NONE,
(0, True): pickle.INT,
(1, True): pickle.INT,
(2, True): pickle.NEWTRUE,
+ (3, True): pickle.NEWTRUE,
(0, False): pickle.INT,
(1, False): pickle.INT,
(2, False): pickle.NEWFALSE,
+ (3, False): pickle.NEWFALSE,
}
for proto in protocols:
for x in None, False, True:
@@ -955,7 +970,7 @@ class AbstractPickleModuleTests(unittest.TestCase):
def test_highest_protocol(self):
# Of course this needs to be changed when HIGHEST_PROTOCOL changes.
- self.assertEqual(self.module.HIGHEST_PROTOCOL, 2)
+ self.assertEqual(self.module.HIGHEST_PROTOCOL, 3)
def test_callapi(self):
from io import BytesIO
diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py
index aa09a6a..67d83c7c 100644
--- a/Lib/test/test_pickle.py
+++ b/Lib/test/test_pickle.py
@@ -12,23 +12,19 @@ class PickleTests(AbstractPickleTests, AbstractPickleModuleTests):
module = pickle
error = KeyError
- def dumps(self, arg, proto=0, fast=0):
- # Ignore fast
+ def dumps(self, arg, proto=None):
return pickle.dumps(arg, proto)
def loads(self, buf):
- # Ignore fast
return pickle.loads(buf)
class PicklerTests(AbstractPickleTests):
error = KeyError
- def dumps(self, arg, proto=0, fast=0):
+ def dumps(self, arg, proto=None):
f = io.BytesIO()
p = pickle.Pickler(f, proto)
- if fast:
- p.fast = fast
p.dump(arg)
f.seek(0)
return bytes(f.read())
@@ -40,14 +36,12 @@ class PicklerTests(AbstractPickleTests):
class PersPicklerTests(AbstractPersistentPicklerTests):
- def dumps(self, arg, proto=0, fast=0):
+ def dumps(self, arg, proto=None):
class PersPickler(pickle.Pickler):
def persistent_id(subself, obj):
return self.persistent_id(obj)
f = io.BytesIO()
p = PersPickler(f, proto)
- if fast:
- p.fast = fast
p.dump(arg)
f.seek(0)
return f.read()
diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py
index 3e5b35a..932dcd1 100644
--- a/Lib/test/test_pickletools.py
+++ b/Lib/test/test_pickletools.py
@@ -6,7 +6,7 @@ from test.pickletester import AbstractPickleModuleTests
class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests):
- def dumps(self, arg, proto=0, fast=0):
+ def dumps(self, arg, proto=None):
return pickletools.optimize(pickle.dumps(arg, proto))
def loads(self, buf):
diff --git a/Misc/NEWS b/Misc/NEWS
index 65643bf..db686cd 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -23,6 +23,18 @@ Extension Modules
Library
-------
+- A new pickle protocol (protocol 3) is added with explicit support
+ for bytes. This is the default protocol. It intentionally cannot
+ be unpickled by Python 2.x.
+
+- When a pickle written by Python 2.x contains an (8-bit) str
+ instance, this is now decoded to a (Unicode) str instance. The
+ encoding used to do this defaults to ASCII, but can be overridden
+ via two new keyword arguments to the Unpickler class. Previously
+ this would create bytes instances, which is usually wrong: str
+ instances are often used to pickle attribute names etc., and text is
+ more common than binary data anyway.
+
- Default to ASCII as the locale.getpreferredencoding, if the POSIX
system doesn't support CODESET and LANG isn't set or doesn't
allow deduction of an encoding.