diff options
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/pickle.py | 71 | ||||
-rw-r--r-- | Lib/pickletools.py | 185 | ||||
-rw-r--r-- | Lib/test/pickletester.py | 30 | ||||
-rw-r--r-- | Lib/test/test_pickle.py | 4 |
4 files changed, 167 insertions, 123 deletions
diff --git a/Lib/pickle.py b/Lib/pickle.py index c57149a..9cd0132 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -348,24 +348,25 @@ class _Pickler: def __init__(self, file, protocol=None, *, fix_imports=True): """This takes a binary file for writing a pickle data stream. - The optional protocol argument tells the pickler to use the + The optional *protocol* argument tells the pickler to use the given protocol; supported protocols are 0, 1, 2, 3 and 4. The - default protocol is 3; a backward-incompatible protocol designed for - Python 3. + default protocol is 3; a backward-incompatible protocol designed + for Python 3. Specifying a negative protocol version selects the highest protocol version supported. The higher the protocol used, the more recent the version of Python needed to read the pickle produced. - The file argument must have a write() method that accepts a single - bytes argument. It can thus be a file object opened for binary - writing, a io.BytesIO instance, or any other custom object that - meets this interface. + The *file* argument must have a write() method that accepts a + single bytes argument. It can thus be a file object opened for + binary writing, a io.BytesIO instance, or any other custom + object that meets this interface. - If fix_imports is True and protocol is less than 3, pickle will try to - map the new Python 3 names to the old module names used in Python 2, - so that the pickle data stream is readable with Python 2. + If *fix_imports* is True and *protocol* is less than 3, pickle + will try to map the new Python 3 names to the old module names + used in Python 2, so that the pickle data stream is readable + with Python 2. """ if protocol is None: protocol = DEFAULT_PROTOCOL @@ -389,10 +390,9 @@ class _Pickler: """Clears the pickler's "memo". The memo is the data structure that remembers which objects the - pickler has already seen, so that shared or recursive objects are - pickled by reference and not by value. This method is useful when - re-using picklers. - + pickler has already seen, so that shared or recursive objects + are pickled by reference and not by value. This method is + useful when re-using picklers. """ self.memo.clear() @@ -975,8 +975,14 @@ class _Unpickler: encoding="ASCII", errors="strict"): """This takes a binary file for reading a pickle data stream. - The protocol version of the pickle is detected automatically, so no - proto argument is needed. + The protocol version of the pickle is detected automatically, so + no proto argument is needed. + + The argument *file* must have two methods, a read() method that + takes an integer argument, and a readline() method that requires + no arguments. Both methods should return bytes. Thus *file* + can be a binary file object opened for reading, a io.BytesIO + object, or any other custom object that meets this interface. The file-like object must have two methods, a read() method that takes an integer argument, and a readline() method that @@ -985,13 +991,14 @@ class _Unpickler: reading, a BytesIO object, or any other custom object that meets this interface. - Optional keyword arguments are *fix_imports*, *encoding* and *errors*, - which are used to control compatiblity support for pickle stream - generated by Python 2.x. If *fix_imports* is True, pickle will try to - map the old Python 2.x names to the new names used in Python 3.x. The - *encoding* and *errors* tell pickle how to decode 8-bit string - instances pickled by Python 2.x; these default to 'ASCII' and - 'strict', respectively. + Optional keyword arguments are *fix_imports*, *encoding* and + *errors*, which are used to control compatiblity support for + pickle stream generated by Python 2. If *fix_imports* is True, + pickle will try to map the old Python 2 names to the new names + used in Python 3. The *encoding* and *errors* tell pickle how + to decode 8-bit string instances pickled by Python 2; these + default to 'ASCII' and 'strict', respectively. *encoding* can be + 'bytes' to read theses 8-bit string instances as bytes objects. """ self._file_readline = file.readline self._file_read = file.read @@ -1139,6 +1146,15 @@ class _Unpickler: self.append(unpack('>d', self.read(8))[0]) dispatch[BINFLOAT[0]] = load_binfloat + def _decode_string(self, value): + # Used to allow strings from Python 2 to be decoded either as + # bytes or Unicode strings. This should be used only with the + # STRING, BINSTRING and SHORT_BINSTRING opcodes. + if self.encoding == "bytes": + return value + else: + return value.decode(self.encoding, self.errors) + def load_string(self): data = self.readline()[:-1] # Strip outermost quotes @@ -1146,8 +1162,7 @@ class _Unpickler: data = data[1:-1] else: raise UnpicklingError("the STRING opcode argument must be quoted") - self.append(codecs.escape_decode(data)[0] - .decode(self.encoding, self.errors)) + self.append(self._decode_string(codecs.escape_decode(data)[0])) dispatch[STRING[0]] = load_string def load_binstring(self): @@ -1156,8 +1171,7 @@ class _Unpickler: if len < 0: raise UnpicklingError("BINSTRING pickle has negative byte count") data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self._decode_string(data)) dispatch[BINSTRING[0]] = load_binstring def load_binbytes(self): @@ -1191,8 +1205,7 @@ class _Unpickler: def load_short_binstring(self): len = self.read(1)[0] data = self.read(len) - value = str(data, self.encoding, self.errors) - self.append(value) + self.append(self._decode_string(data)) dispatch[SHORT_BINSTRING[0]] = load_short_binstring def load_short_binbytes(self): diff --git a/Lib/pickletools.py b/Lib/pickletools.py index a2480f6..71c2aa1 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -969,113 +969,107 @@ class StackObject(object): return self.name -pyint = StackObject( - name='int', - obtype=int, - doc="A short (as opposed to long) Python integer object.") - -pylong = StackObject( - name='long', - obtype=int, - doc="A long (as opposed to short) Python integer object.") +pyint = pylong = StackObject( + name='int', + obtype=int, + doc="A Python integer object.") pyinteger_or_bool = StackObject( - name='int_or_bool', - obtype=(int, bool), - doc="A Python integer object (short or long), or " - "a Python bool.") + name='int_or_bool', + obtype=(int, bool), + doc="A Python integer or boolean object.") pybool = StackObject( - name='bool', - obtype=(bool,), - doc="A Python bool object.") + name='bool', + obtype=bool, + doc="A Python boolean object.") pyfloat = StackObject( - name='float', - obtype=float, - doc="A Python float object.") + name='float', + obtype=float, + doc="A Python float object.") -pystring = StackObject( - name='string', - obtype=bytes, - doc="A Python (8-bit) string object.") +pybytes_or_str = pystring = StackObject( + name='bytes_or_str', + obtype=(bytes, str), + doc="A Python bytes or (Unicode) string object.") pybytes = StackObject( - name='bytes', - obtype=bytes, - doc="A Python bytes object.") + name='bytes', + obtype=bytes, + doc="A Python bytes object.") pyunicode = StackObject( - name='str', - obtype=str, - doc="A Python (Unicode) string object.") + name='str', + obtype=str, + doc="A Python (Unicode) string object.") pynone = StackObject( - name="None", - obtype=type(None), - doc="The Python None object.") + name="None", + obtype=type(None), + doc="The Python None object.") pytuple = StackObject( - name="tuple", - obtype=tuple, - doc="A Python tuple object.") + name="tuple", + obtype=tuple, + doc="A Python tuple object.") pylist = StackObject( - name="list", - obtype=list, - doc="A Python list object.") + name="list", + obtype=list, + doc="A Python list object.") pydict = StackObject( - name="dict", - obtype=dict, - doc="A Python dict object.") + name="dict", + obtype=dict, + doc="A Python dict object.") pyset = StackObject( - name="set", - obtype=set, - doc="A Python set object.") + name="set", + obtype=set, + doc="A Python set object.") pyfrozenset = StackObject( - name="frozenset", - obtype=set, - doc="A Python frozenset object.") + name="frozenset", + obtype=set, + doc="A Python frozenset object.") anyobject = StackObject( - name='any', - obtype=object, - doc="Any kind of object whatsoever.") + name='any', + obtype=object, + doc="Any kind of object whatsoever.") markobject = StackObject( - name="mark", - obtype=StackObject, - doc="""'The mark' is a unique object. - - Opcodes that operate on a variable number of objects - generally don't embed the count of objects in the opcode, - or pull it off the stack. Instead the MARK opcode is used - to push a special marker object on the stack, and then - some other opcodes grab all the objects from the top of - the stack down to (but not including) the topmost marker - object. - """) + name="mark", + obtype=StackObject, + doc="""'The mark' is a unique object. + +Opcodes that operate on a variable number of objects +generally don't embed the count of objects in the opcode, +or pull it off the stack. Instead the MARK opcode is used +to push a special marker object on the stack, and then +some other opcodes grab all the objects from the top of +the stack down to (but not including) the topmost marker +object. +""") stackslice = StackObject( - name="stackslice", - obtype=StackObject, - doc="""An object representing a contiguous slice of the stack. + name="stackslice", + obtype=StackObject, + doc="""An object representing a contiguous slice of the stack. - This is used in conjunction with markobject, to represent all - of the stack following the topmost markobject. For example, - the POP_MARK opcode changes the stack from +This is used in conjunction with markobject, to represent all +of the stack following the topmost markobject. For example, +the POP_MARK opcode changes the stack from - [..., markobject, stackslice] - to - [...] + [..., markobject, stackslice] +to + [...] - No matter how many object are on the stack after the topmost - markobject, POP_MARK gets rid of all of them (including the - topmost markobject too). - """) +No matter how many object are on the stack after the topmost +markobject, POP_MARK gets rid of all of them (including the +topmost markobject too). +""") ############################################################################## # Descriptors for pickle opcodes. @@ -1212,7 +1206,7 @@ opcodes = [ code='L', arg=decimalnl_long, stack_before=[], - stack_after=[pylong], + stack_after=[pyint], proto=0, doc="""Push a long integer. @@ -1230,7 +1224,7 @@ opcodes = [ code='\x8a', arg=long1, stack_before=[], - stack_after=[pylong], + stack_after=[pyint], proto=2, doc="""Long integer using one-byte length. @@ -1241,7 +1235,7 @@ opcodes = [ code='\x8b', arg=long4, stack_before=[], - stack_after=[pylong], + stack_after=[pyint], proto=2, doc="""Long integer using found-byte length. @@ -1254,45 +1248,50 @@ opcodes = [ code='S', arg=stringnl, stack_before=[], - stack_after=[pystring], + stack_after=[pybytes_or_str], proto=0, doc="""Push a Python string object. The argument is a repr-style string, with bracketing quote characters, and perhaps embedded escapes. The argument extends until the next - newline character. (Actually, they are decoded into a str instance + newline character. These are usually decoded into a str instance using the encoding given to the Unpickler constructor. or the default, - 'ASCII'.) + 'ASCII'. If the encoding given was 'bytes' however, they will be + decoded as bytes object instead. """), I(name='BINSTRING', code='T', arg=string4, stack_before=[], - stack_after=[pystring], + stack_after=[pybytes_or_str], proto=1, doc="""Push a Python string object. - There are two arguments: the first is a 4-byte little-endian signed int - giving the number of bytes in the string, and the second is that many - bytes, which are taken literally as the string content. (Actually, - they are decoded into a str instance using the encoding given to the - Unpickler constructor. or the default, 'ASCII'.) + There are two arguments: the first is a 4-byte little-endian + signed int giving the number of bytes in the string, and the + second is that many bytes, which are taken literally as the string + content. These are usually decoded into a str instance using the + encoding given to the Unpickler constructor. or the default, + 'ASCII'. If the encoding given was 'bytes' however, they will be + decoded as bytes object instead. """), I(name='SHORT_BINSTRING', code='U', arg=string1, stack_before=[], - stack_after=[pystring], + stack_after=[pybytes_or_str], proto=1, doc="""Push a Python string object. - There are two arguments: the first is a 1-byte unsigned int giving - the number of bytes in the string, and the second is that many bytes, - which are taken literally as the string content. (Actually, they - are decoded into a str instance using the encoding given to the - Unpickler constructor. or the default, 'ASCII'.) + There are two arguments: the first is a 1-byte unsigned int giving + the number of bytes in the string, and the second is that many + bytes, which are taken literally as the string content. These are + usually decoded into a str instance using the encoding given to + the Unpickler constructor. or the default, 'ASCII'. If the + encoding given was 'bytes' however, they will be decoded as bytes + object instead. """), # Bytes (protocol 3 only; older protocols don't support bytes at all) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 040c26f..05befbf 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1305,6 +1305,35 @@ class AbstractPickleTests(unittest.TestCase): dumped = self.dumps(set([3]), 2) self.assertEqual(dumped, DATA6) + def test_load_python2_str_as_bytes(self): + # From Python 2: pickle.dumps('a\x00\xa0', protocol=0) + self.assertEqual(self.loads(b"S'a\\x00\\xa0'\n.", + encoding="bytes"), b'a\x00\xa0') + # From Python 2: pickle.dumps('a\x00\xa0', protocol=1) + self.assertEqual(self.loads(b'U\x03a\x00\xa0.', + encoding="bytes"), b'a\x00\xa0') + # From Python 2: pickle.dumps('a\x00\xa0', protocol=2) + self.assertEqual(self.loads(b'\x80\x02U\x03a\x00\xa0.', + encoding="bytes"), b'a\x00\xa0') + + def test_load_python2_unicode_as_str(self): + # From Python 2: pickle.dumps(u'π', protocol=0) + self.assertEqual(self.loads(b'V\\u03c0\n.', + encoding='bytes'), 'π') + # From Python 2: pickle.dumps(u'π', protocol=1) + self.assertEqual(self.loads(b'X\x02\x00\x00\x00\xcf\x80.', + encoding="bytes"), 'π') + # From Python 2: pickle.dumps(u'π', protocol=2) + self.assertEqual(self.loads(b'\x80\x02X\x02\x00\x00\x00\xcf\x80.', + encoding="bytes"), 'π') + + def test_load_long_python2_str_as_bytes(self): + # From Python 2: pickle.dumps('x' * 300, protocol=1) + self.assertEqual(self.loads(pickle.BINSTRING + + struct.pack("<I", 300) + + b'x' * 300 + pickle.STOP, + encoding='bytes'), b'x' * 300) + def test_large_pickles(self): # Test the correctness of internal buffering routines when handling # large data. @@ -1566,7 +1595,6 @@ class AbstractPickleTests(unittest.TestCase): unpickled = self.loads(self.dumps(method, proto)) self.assertEqual(method(obj), unpickled(obj)) - def test_c_methods(self): global Subclass class Subclass(tuple): diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index fbe96ac..0b2fe1e 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -83,13 +83,17 @@ class PyPicklerUnpicklerObjectTests(AbstractPicklerUnpicklerObjectTests): class PyDispatchTableTests(AbstractDispatchTableTests): + pickler_class = pickle._Pickler + def get_dispatch_table(self): return pickle.dispatch_table.copy() class PyChainDispatchTableTests(AbstractDispatchTableTests): + pickler_class = pickle._Pickler + def get_dispatch_table(self): return collections.ChainMap({}, pickle.dispatch_table) |