From 5b76bdba071e7bbd9fda0b9b100d1506d95c04bd Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 13 Jan 2018 00:28:31 +0200 Subject: bpo-31993: Do not use memoryview when pickle large strings. (#5154) PyMemoryView_FromMemory() created a memoryview referring to the internal data of the string. When the string is destroyed the memoryview become referring to a freed memory. --- Lib/test/pickletester.py | 60 ++++++++++++++++++++++++------------------------ Misc/NEWS.d/3.7.0a4.rst | 6 ++--- Modules/_pickle.c | 5 ++-- 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index f4e3f81..062ec5a 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -2169,67 +2169,67 @@ class AbstractPickleTests(unittest.TestCase): def test_framed_write_sizes_with_delayed_writer(self): class ChunkAccumulator: """Accumulate pickler output in a list of raw chunks.""" - def __init__(self): self.chunks = [] - def write(self, chunk): self.chunks.append(chunk) - def concatenate_chunks(self): - # Some chunks can be memoryview instances, we need to convert - # them to bytes to be able to call join - return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c - for c in self.chunks]) - - small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)}) - for i in range(int(1e4))] + return b"".join(self.chunks) for proto in range(4, pickle.HIGHEST_PROTOCOL + 1): + objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)}) + for i in range(int(1e4))] + # Add a large unique ASCII string + objects.append('0123456789abcdef' * + (self.FRAME_SIZE_TARGET // 16 + 1)) + # Protocol 4 packs groups of small objects into frames and issues # calls to write only once or twice per frame: # The C pickler issues one call to write per-frame (header and # contents) while Python pickler issues two calls to write: one for # the frame header and one for the frame binary contents. writer = ChunkAccumulator() - self.pickler(writer, proto).dump(small_objects) + self.pickler(writer, proto).dump(objects) # Actually read the binary content of the chunks after the end - # of the call to dump: ant memoryview passed to write should not + # of the call to dump: any memoryview passed to write should not # be released otherwise this delayed access would not be possible. pickled = writer.concatenate_chunks() reconstructed = self.loads(pickled) - self.assertEqual(reconstructed, small_objects) + self.assertEqual(reconstructed, objects) self.assertGreater(len(writer.chunks), 1) - n_frames, remainder = divmod(len(pickled), self.FRAME_SIZE_TARGET) - if remainder > 0: - n_frames += 1 + # memoryviews should own the memory. + del objects + support.gc_collect() + self.assertEqual(writer.concatenate_chunks(), pickled) + n_frames = (len(pickled) - 1) // self.FRAME_SIZE_TARGET + 1 # There should be at least one call to write per frame self.assertGreaterEqual(len(writer.chunks), n_frames) # but not too many either: there can be one for the proto, - # one per-frame header and one per frame for the actual contents. - self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks)) + # one per-frame header, one per frame for the actual contents, + # and two for the header. + self.assertLessEqual(len(writer.chunks), 2 * n_frames + 3) - chunk_sizes = [len(c) for c in writer.chunks[:-1]] + chunk_sizes = [len(c) for c in writer.chunks] large_sizes = [s for s in chunk_sizes if s >= self.FRAME_SIZE_TARGET] - small_sizes = [s for s in chunk_sizes - if s < self.FRAME_SIZE_TARGET] + medium_sizes = [s for s in chunk_sizes + if 9 < s < self.FRAME_SIZE_TARGET] + small_sizes = [s for s in chunk_sizes if s <= 9] # Large chunks should not be too large: for chunk_size in large_sizes: - self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size) - - last_chunk_size = len(writer.chunks[-1]) - self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size) - - # Small chunks (if any) should be very small - # (only proto and frame headers) - for chunk_size in small_sizes: - self.assertGreaterEqual(9, chunk_size) + self.assertLess(chunk_size, 2 * self.FRAME_SIZE_TARGET, + chunk_sizes) + # There shouldn't bee too many small chunks: the protocol header, + # the frame headers and the large string headers are written + # in small chunks. + self.assertLessEqual(len(small_sizes), + len(large_sizes) + len(medium_sizes) + 3, + chunk_sizes) def test_nested_names(self): global Nested diff --git a/Misc/NEWS.d/3.7.0a4.rst b/Misc/NEWS.d/3.7.0a4.rst index ca9aa17..a6322ed 100644 --- a/Misc/NEWS.d/3.7.0a4.rst +++ b/Misc/NEWS.d/3.7.0a4.rst @@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt .. nonce: -OMNg8 .. section: Library -The picklers no longer allocate temporary memory when dumping large -``bytes`` and ``str`` objects into a file object. Instead the data is -directly streamed into the underlying file object. +The pickler now uses less memory when serializing large bytes and str +objects into a file. Pickles created with protocol 4 will require less +memory for unpickling large bytes and str objects. .. diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 5b3de4d..f8cbea1 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self, /* Stream write the payload into the file without going through the output buffer. */ if (payload == NULL) { - payload = mem = PyMemoryView_FromMemory((char *) data, data_size, - PyBUF_READ); + /* TODO: It would be better to use a memoryview with a linked + original string if this is possible. */ + payload = mem = PyBytes_FromStringAndSize(data, data_size); if (payload == NULL) { return -1; } -- cgit v0.12