diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2023-12-24 10:01:53 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-12-24 10:01:53 (GMT) |
commit | 336d44593486288742e4b8009282da1af62d87ba (patch) | |
tree | 2eddb86f7d3d29be70222078fe61343dfc44a8f7 | |
parent | b60bddbde22f74a2289f4838a6c404c53c58e57b (diff) | |
download | cpython-336d44593486288742e4b8009282da1af62d87ba.zip cpython-336d44593486288742e4b8009282da1af62d87ba.tar.gz cpython-336d44593486288742e4b8009282da1af62d87ba.tar.bz2 |
[3.11] gh-113028: Correctly memoize str in pickle when escapes added (GH-113436) (GH-113449)
This fixes a divergence between the Python and C implementations of pickle
for protocol 0, such that pickle.py fails to re-use the first pickled
representation of strings involving characters that have to be escaped.
(cherry picked from commit 08398631a0298dcf785ee7bd0e26c7844823ce59)
Co-authored-by: Jeff Allen <ja.py@farowl.co.uk>
-rw-r--r-- | Lib/pickle.py | 14 | ||||
-rw-r--r-- | Lib/test/pickletester.py | 8 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst | 6 |
3 files changed, 21 insertions, 7 deletions
diff --git a/Lib/pickle.py b/Lib/pickle.py index f027e04..1160356 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -861,13 +861,13 @@ class _Pickler: else: self.write(BINUNICODE + pack("<I", n) + encoded) else: - obj = obj.replace("\\", "\\u005c") - obj = obj.replace("\0", "\\u0000") - obj = obj.replace("\n", "\\u000a") - obj = obj.replace("\r", "\\u000d") - obj = obj.replace("\x1a", "\\u001a") # EOF on DOS - self.write(UNICODE + obj.encode('raw-unicode-escape') + - b'\n') + # Escape what raw-unicode-escape doesn't, but memoize the original. + tmp = obj.replace("\\", "\\u005c") + tmp = tmp.replace("\0", "\\u0000") + tmp = tmp.replace("\n", "\\u000a") + tmp = tmp.replace("\r", "\\u000d") + tmp = tmp.replace("\x1a", "\\u001a") # EOF on DOS + self.write(UNICODE + tmp.encode('raw-unicode-escape') + b'\n') self.memoize(obj) dispatch[str] = save_str diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 5d5df16..5b9bcec 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -1825,6 +1825,14 @@ class AbstractPickleTests: t2 = self.loads(p) self.assert_is_copy(t, t2) + def test_unicode_memoization(self): + # Repeated str is re-used (even when escapes added). + for proto in protocols: + for s in '', 'xyz', 'xyz\n', 'x\\yz', 'x\xa1yz\r': + p = self.dumps((s, s), proto) + s1, s2 = self.loads(p) + self.assertIs(s1, s2) + def test_bytes(self): for proto in protocols: for s in b'', b'xyz', b'xyz'*100: diff --git a/Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst b/Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst new file mode 100644 index 0000000..5f66d6a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst @@ -0,0 +1,6 @@ +When a second reference to a string appears in the input to :mod:`pickle`, +and the Python implementation is in use, +we are guaranteed that a single copy gets pickled +and a single object is shared when reloaded. 
+Previously, in protocol 0, when a string contained certain characters +(e.g. newline) it resulted in duplicate objects. |