summaryrefslogtreecommitdiffstats
path: root/Lib
diff options
context:
space:
mode:
Diffstat (limited to 'Lib')
-rw-r--r--Lib/pickle.py71
-rw-r--r--Lib/pickletools.py185
-rw-r--r--Lib/test/pickletester.py30
-rw-r--r--Lib/test/test_pickle.py4
4 files changed, 167 insertions, 123 deletions
diff --git a/Lib/pickle.py b/Lib/pickle.py
index c57149a..9cd0132 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -348,24 +348,25 @@ class _Pickler:
def __init__(self, file, protocol=None, *, fix_imports=True):
"""This takes a binary file for writing a pickle data stream.
- The optional protocol argument tells the pickler to use the
+ The optional *protocol* argument tells the pickler to use the
given protocol; supported protocols are 0, 1, 2, 3 and 4. The
- default protocol is 3; a backward-incompatible protocol designed for
- Python 3.
+ default protocol is 3; a backward-incompatible protocol designed
+ for Python 3.
Specifying a negative protocol version selects the highest
protocol version supported. The higher the protocol used, the
more recent the version of Python needed to read the pickle
produced.
- The file argument must have a write() method that accepts a single
- bytes argument. It can thus be a file object opened for binary
- writing, a io.BytesIO instance, or any other custom object that
- meets this interface.
+ The *file* argument must have a write() method that accepts a
+ single bytes argument. It can thus be a file object opened for
+ binary writing, an io.BytesIO instance, or any other custom
+ object that meets this interface.
- If fix_imports is True and protocol is less than 3, pickle will try to
- map the new Python 3 names to the old module names used in Python 2,
- so that the pickle data stream is readable with Python 2.
+ If *fix_imports* is True and *protocol* is less than 3, pickle
+ will try to map the new Python 3 names to the old module names
+ used in Python 2, so that the pickle data stream is readable
+ with Python 2.
"""
if protocol is None:
protocol = DEFAULT_PROTOCOL
@@ -389,10 +390,9 @@ class _Pickler:
"""Clears the pickler's "memo".
The memo is the data structure that remembers which objects the
- pickler has already seen, so that shared or recursive objects are
- pickled by reference and not by value. This method is useful when
- re-using picklers.
-
+ pickler has already seen, so that shared or recursive objects
+ are pickled by reference and not by value. This method is
+ useful when re-using picklers.
"""
self.memo.clear()
@@ -975,8 +975,14 @@ class _Unpickler:
encoding="ASCII", errors="strict"):
"""This takes a binary file for reading a pickle data stream.
- The protocol version of the pickle is detected automatically, so no
- proto argument is needed.
+ The protocol version of the pickle is detected automatically, so
+ no proto argument is needed.
+
+ The argument *file* must have two methods, a read() method that
+ takes an integer argument, and a readline() method that requires
+ no arguments. Both methods should return bytes. Thus *file*
+ can be a binary file object opened for reading, an io.BytesIO
+ object, or any other custom object that meets this interface.
The file-like object must have two methods, a read() method
that takes an integer argument, and a readline() method that
@@ -985,13 +991,14 @@ class _Unpickler:
reading, a BytesIO object, or any other custom object that
meets this interface.
- Optional keyword arguments are *fix_imports*, *encoding* and *errors*,
- which are used to control compatiblity support for pickle stream
- generated by Python 2.x. If *fix_imports* is True, pickle will try to
- map the old Python 2.x names to the new names used in Python 3.x. The
- *encoding* and *errors* tell pickle how to decode 8-bit string
- instances pickled by Python 2.x; these default to 'ASCII' and
- 'strict', respectively.
+ Optional keyword arguments are *fix_imports*, *encoding* and
+ *errors*, which are used to control compatibility support for
+ pickle streams generated by Python 2. If *fix_imports* is True,
+ pickle will try to map the old Python 2 names to the new names
+ used in Python 3. The *encoding* and *errors* tell pickle how
+ to decode 8-bit string instances pickled by Python 2; these
+ default to 'ASCII' and 'strict', respectively. *encoding* can be
+ 'bytes' to read these 8-bit string instances as bytes objects.
"""
self._file_readline = file.readline
self._file_read = file.read
@@ -1139,6 +1146,15 @@ class _Unpickler:
self.append(unpack('>d', self.read(8))[0])
dispatch[BINFLOAT[0]] = load_binfloat
+ def _decode_string(self, value):
+ # Used to allow strings from Python 2 to be decoded either as
+ # bytes or Unicode strings. This should be used only with the
+ # STRING, BINSTRING and SHORT_BINSTRING opcodes.
+ #
+ # The special encoding name "bytes" is handled here rather than
+ # being passed to decode(): it returns the value unchanged. Any
+ # other encoding is passed to value.decode() with self.errors.
+ if self.encoding == "bytes":
+ return value
+ else:
+ return value.decode(self.encoding, self.errors)
+
def load_string(self):
data = self.readline()[:-1]
# Strip outermost quotes
@@ -1146,8 +1162,7 @@ class _Unpickler:
data = data[1:-1]
else:
raise UnpicklingError("the STRING opcode argument must be quoted")
- self.append(codecs.escape_decode(data)[0]
- .decode(self.encoding, self.errors))
+ self.append(self._decode_string(codecs.escape_decode(data)[0]))
dispatch[STRING[0]] = load_string
def load_binstring(self):
@@ -1156,8 +1171,7 @@ class _Unpickler:
if len < 0:
raise UnpicklingError("BINSTRING pickle has negative byte count")
data = self.read(len)
- value = str(data, self.encoding, self.errors)
- self.append(value)
+ self.append(self._decode_string(data))
dispatch[BINSTRING[0]] = load_binstring
def load_binbytes(self):
@@ -1191,8 +1205,7 @@ class _Unpickler:
def load_short_binstring(self):
len = self.read(1)[0]
data = self.read(len)
- value = str(data, self.encoding, self.errors)
- self.append(value)
+ self.append(self._decode_string(data))
dispatch[SHORT_BINSTRING[0]] = load_short_binstring
def load_short_binbytes(self):
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index a2480f6..71c2aa1 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -969,113 +969,107 @@ class StackObject(object):
return self.name
-pyint = StackObject(
- name='int',
- obtype=int,
- doc="A short (as opposed to long) Python integer object.")
-
-pylong = StackObject(
- name='long',
- obtype=int,
- doc="A long (as opposed to short) Python integer object.")
+pyint = pylong = StackObject(
+ name='int',
+ obtype=int,
+ doc="A Python integer object.")
pyinteger_or_bool = StackObject(
- name='int_or_bool',
- obtype=(int, bool),
- doc="A Python integer object (short or long), or "
- "a Python bool.")
+ name='int_or_bool',
+ obtype=(int, bool),
+ doc="A Python integer or boolean object.")
pybool = StackObject(
- name='bool',
- obtype=(bool,),
- doc="A Python bool object.")
+ name='bool',
+ obtype=bool,
+ doc="A Python boolean object.")
pyfloat = StackObject(
- name='float',
- obtype=float,
- doc="A Python float object.")
+ name='float',
+ obtype=float,
+ doc="A Python float object.")
-pystring = StackObject(
- name='string',
- obtype=bytes,
- doc="A Python (8-bit) string object.")
+pybytes_or_str = pystring = StackObject(
+ name='bytes_or_str',
+ obtype=(bytes, str),
+ doc="A Python bytes or (Unicode) string object.")
pybytes = StackObject(
- name='bytes',
- obtype=bytes,
- doc="A Python bytes object.")
+ name='bytes',
+ obtype=bytes,
+ doc="A Python bytes object.")
pyunicode = StackObject(
- name='str',
- obtype=str,
- doc="A Python (Unicode) string object.")
+ name='str',
+ obtype=str,
+ doc="A Python (Unicode) string object.")
pynone = StackObject(
- name="None",
- obtype=type(None),
- doc="The Python None object.")
+ name="None",
+ obtype=type(None),
+ doc="The Python None object.")
pytuple = StackObject(
- name="tuple",
- obtype=tuple,
- doc="A Python tuple object.")
+ name="tuple",
+ obtype=tuple,
+ doc="A Python tuple object.")
pylist = StackObject(
- name="list",
- obtype=list,
- doc="A Python list object.")
+ name="list",
+ obtype=list,
+ doc="A Python list object.")
pydict = StackObject(
- name="dict",
- obtype=dict,
- doc="A Python dict object.")
+ name="dict",
+ obtype=dict,
+ doc="A Python dict object.")
pyset = StackObject(
- name="set",
- obtype=set,
- doc="A Python set object.")
+ name="set",
+ obtype=set,
+ doc="A Python set object.")
pyfrozenset = StackObject(
- name="frozenset",
- obtype=set,
- doc="A Python frozenset object.")
+ name="frozenset",
+ # The described object is a frozenset, so obtype must be frozenset
+ # (not set) to agree with the descriptor's name and doc.
+ obtype=frozenset,
+ doc="A Python frozenset object.")
anyobject = StackObject(
- name='any',
- obtype=object,
- doc="Any kind of object whatsoever.")
+ name='any',
+ obtype=object,
+ doc="Any kind of object whatsoever.")
markobject = StackObject(
- name="mark",
- obtype=StackObject,
- doc="""'The mark' is a unique object.
-
- Opcodes that operate on a variable number of objects
- generally don't embed the count of objects in the opcode,
- or pull it off the stack. Instead the MARK opcode is used
- to push a special marker object on the stack, and then
- some other opcodes grab all the objects from the top of
- the stack down to (but not including) the topmost marker
- object.
- """)
+ name="mark",
+ obtype=StackObject,
+ doc="""'The mark' is a unique object.
+
+Opcodes that operate on a variable number of objects
+generally don't embed the count of objects in the opcode,
+or pull it off the stack. Instead the MARK opcode is used
+to push a special marker object on the stack, and then
+some other opcodes grab all the objects from the top of
+the stack down to (but not including) the topmost marker
+object.
+""")
stackslice = StackObject(
- name="stackslice",
- obtype=StackObject,
- doc="""An object representing a contiguous slice of the stack.
+ name="stackslice",
+ obtype=StackObject,
+ doc="""An object representing a contiguous slice of the stack.
- This is used in conjunction with markobject, to represent all
- of the stack following the topmost markobject. For example,
- the POP_MARK opcode changes the stack from
+This is used in conjunction with markobject, to represent all
+of the stack following the topmost markobject. For example,
+the POP_MARK opcode changes the stack from
- [..., markobject, stackslice]
- to
- [...]
+ [..., markobject, stackslice]
+to
+ [...]
- No matter how many object are on the stack after the topmost
- markobject, POP_MARK gets rid of all of them (including the
- topmost markobject too).
- """)
+No matter how many objects are on the stack after the topmost
+markobject, POP_MARK gets rid of all of them (including the
+topmost markobject too).
+""")
##############################################################################
# Descriptors for pickle opcodes.
@@ -1212,7 +1206,7 @@ opcodes = [
code='L',
arg=decimalnl_long,
stack_before=[],
- stack_after=[pylong],
+ stack_after=[pyint],
proto=0,
doc="""Push a long integer.
@@ -1230,7 +1224,7 @@ opcodes = [
code='\x8a',
arg=long1,
stack_before=[],
- stack_after=[pylong],
+ stack_after=[pyint],
proto=2,
doc="""Long integer using one-byte length.
@@ -1241,7 +1235,7 @@ opcodes = [
code='\x8b',
arg=long4,
stack_before=[],
- stack_after=[pylong],
+ stack_after=[pyint],
proto=2,
doc="""Long integer using found-byte length.
@@ -1254,45 +1248,50 @@ opcodes = [
code='S',
arg=stringnl,
stack_before=[],
- stack_after=[pystring],
+ stack_after=[pybytes_or_str],
proto=0,
doc="""Push a Python string object.
The argument is a repr-style string, with bracketing quote characters,
and perhaps embedded escapes. The argument extends until the next
- newline character. (Actually, they are decoded into a str instance
+ newline character. These are usually decoded into a str instance
using the encoding given to the Unpickler constructor. or the default,
- 'ASCII'.)
+ 'ASCII'. If the encoding given was 'bytes' however, they will be
+ decoded as a bytes object instead.
"""),
I(name='BINSTRING',
code='T',
arg=string4,
stack_before=[],
- stack_after=[pystring],
+ stack_after=[pybytes_or_str],
proto=1,
doc="""Push a Python string object.
- There are two arguments: the first is a 4-byte little-endian signed int
- giving the number of bytes in the string, and the second is that many
- bytes, which are taken literally as the string content. (Actually,
- they are decoded into a str instance using the encoding given to the
- Unpickler constructor. or the default, 'ASCII'.)
+ There are two arguments: the first is a 4-byte little-endian
+ signed int giving the number of bytes in the string, and the
+ second is that many bytes, which are taken literally as the string
+ content. These are usually decoded into a str instance using the
+ encoding given to the Unpickler constructor, or the default,
+ 'ASCII'. If the encoding given was 'bytes' however, they will be
+ decoded as a bytes object instead.
"""),
I(name='SHORT_BINSTRING',
code='U',
arg=string1,
stack_before=[],
- stack_after=[pystring],
+ stack_after=[pybytes_or_str],
proto=1,
doc="""Push a Python string object.
- There are two arguments: the first is a 1-byte unsigned int giving
- the number of bytes in the string, and the second is that many bytes,
- which are taken literally as the string content. (Actually, they
- are decoded into a str instance using the encoding given to the
- Unpickler constructor. or the default, 'ASCII'.)
+ There are two arguments: the first is a 1-byte unsigned int giving
+ the number of bytes in the string, and the second is that many
+ bytes, which are taken literally as the string content. These are
+ usually decoded into a str instance using the encoding given to
+ the Unpickler constructor, or the default, 'ASCII'. If the
+ encoding given was 'bytes' however, they will be decoded as a
+ bytes object instead.
"""),
# Bytes (protocol 3 only; older protocols don't support bytes at all)
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 040c26f..05befbf 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -1305,6 +1305,35 @@ class AbstractPickleTests(unittest.TestCase):
dumped = self.dumps(set([3]), 2)
self.assertEqual(dumped, DATA6)
+ def test_load_python2_str_as_bytes(self):
+ # With encoding="bytes", Python 2 str pickles (the 'S' and 'U'
+ # opcodes below) must load as bytes, including non-ASCII bytes.
+ # From Python 2: pickle.dumps('a\x00\xa0', protocol=0)
+ self.assertEqual(self.loads(b"S'a\\x00\\xa0'\n.",
+ encoding="bytes"), b'a\x00\xa0')
+ # From Python 2: pickle.dumps('a\x00\xa0', protocol=1)
+ self.assertEqual(self.loads(b'U\x03a\x00\xa0.',
+ encoding="bytes"), b'a\x00\xa0')
+ # From Python 2: pickle.dumps('a\x00\xa0', protocol=2)
+ self.assertEqual(self.loads(b'\x80\x02U\x03a\x00\xa0.',
+ encoding="bytes"), b'a\x00\xa0')
+
+ def test_load_python2_unicode_as_str(self):
+ # encoding="bytes" must not change how Python 2 unicode pickles
+ # (the 'V' and 'X' opcodes below) load: the result is still str.
+ # From Python 2: pickle.dumps(u'π', protocol=0)
+ self.assertEqual(self.loads(b'V\\u03c0\n.',
+ encoding='bytes'), 'π')
+ # From Python 2: pickle.dumps(u'π', protocol=1)
+ self.assertEqual(self.loads(b'X\x02\x00\x00\x00\xcf\x80.',
+ encoding="bytes"), 'π')
+ # From Python 2: pickle.dumps(u'π', protocol=2)
+ self.assertEqual(self.loads(b'\x80\x02X\x02\x00\x00\x00\xcf\x80.',
+ encoding="bytes"), 'π')
+
+ def test_load_long_python2_str_as_bytes(self):
+ # BINSTRING carries a 4-byte length; 300 bytes is too long for
+ # the 1-byte length of SHORT_BINSTRING, so this covers BINSTRING.
+ # From Python 2: pickle.dumps('x' * 300, protocol=1)
+ self.assertEqual(self.loads(pickle.BINSTRING +
+ struct.pack("<I", 300) +
+ b'x' * 300 + pickle.STOP,
+ encoding='bytes'), b'x' * 300)
+
def test_large_pickles(self):
# Test the correctness of internal buffering routines when handling
# large data.
@@ -1566,7 +1595,6 @@ class AbstractPickleTests(unittest.TestCase):
unpickled = self.loads(self.dumps(method, proto))
self.assertEqual(method(obj), unpickled(obj))
-
def test_c_methods(self):
global Subclass
class Subclass(tuple):
diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py
index fbe96ac..0b2fe1e 100644
--- a/Lib/test/test_pickle.py
+++ b/Lib/test/test_pickle.py
@@ -83,13 +83,17 @@ class PyPicklerUnpicklerObjectTests(AbstractPicklerUnpicklerObjectTests):
class PyDispatchTableTests(AbstractDispatchTableTests):
+
pickler_class = pickle._Pickler
+
def get_dispatch_table(self):
return pickle.dispatch_table.copy()
class PyChainDispatchTableTests(AbstractDispatchTableTests):
+
pickler_class = pickle._Pickler
+
def get_dispatch_table(self):
return collections.ChainMap({}, pickle.dispatch_table)