From 3bfc65a25ba2e010ef12db0cff006c2cbbeb18f9 Mon Sep 17 00:00:00 2001 From: Alexandre Vassalotti Date: Tue, 13 Dec 2011 13:08:09 -0500 Subject: Issue #13505: Make pickling of bytes object compatible with Python 2. Initial patch by sbt. --- Lib/pickle.py | 6 ++++- Lib/pickletools.py | 70 +++++++++++++++++++++--------------------------- Lib/test/pickletester.py | 12 ++++++--- Misc/NEWS | 3 +++ Modules/_pickle.c | 58 ++++++++++++++++++++++++++++++--------- 5 files changed, 93 insertions(+), 56 deletions(-) diff --git a/Lib/pickle.py b/Lib/pickle.py index 1f45f37..d10ac776 100644 --- a/Lib/pickle.py +++ b/Lib/pickle.py @@ -487,7 +487,11 @@ class _Pickler: def save_bytes(self, obj, pack=struct.pack): if self.proto < 3: - self.save_reduce(bytes, (list(obj),), obj=obj) + if len(obj) == 0: + self.save_reduce(bytes, (), obj=obj) + else: + self.save_reduce(codecs.encode, + (str(obj, 'latin1'), 'latin1'), obj=obj) return n = len(obj) if n < 256: diff --git a/Lib/pickletools.py b/Lib/pickletools.py index 3061675..ec6cc53 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -2083,27 +2083,22 @@ _dis_test = r""" 29: ( MARK 30: d DICT (MARK at 29) 31: p PUT 2 - 34: c GLOBAL '__builtin__ bytes' - 53: p PUT 3 - 56: ( MARK - 57: ( MARK - 58: l LIST (MARK at 57) + 34: c GLOBAL '_codecs encode' + 50: p PUT 3 + 53: ( MARK + 54: V UNICODE 'abc' 59: p PUT 4 - 62: L LONG 97 - 67: a APPEND - 68: L LONG 98 - 73: a APPEND - 74: L LONG 99 - 79: a APPEND - 80: t TUPLE (MARK at 56) - 81: p PUT 5 - 84: R REDUCE - 85: p PUT 6 - 88: V UNICODE 'def' - 93: p PUT 7 - 96: s SETITEM - 97: a APPEND - 98: . STOP + 62: V UNICODE 'latin1' + 70: p PUT 5 + 73: t TUPLE (MARK at 53) + 74: p PUT 6 + 77: R REDUCE + 78: p PUT 7 + 81: V UNICODE 'def' + 86: p PUT 8 + 89: s SETITEM + 90: a APPEND + 91: . STOP highest protocol among opcodes = 0 Try again with a "binary" pickle. @@ -2122,25 +2117,22 @@ Try again with a "binary" pickle. 14: q BINPUT 1 16: } EMPTY_DICT 17: q BINPUT 2 - 19: c GLOBAL '__builtin__ bytes' - 38: q BINPUT 3 - 40: ( MARK - 41: ] EMPTY_LIST - 42: q BINPUT 4 - 44: ( MARK - 45: K BININT1 97 - 47: K BININT1 98 - 49: K BININT1 99 - 51: e APPENDS (MARK at 44) - 52: t TUPLE (MARK at 40) - 53: q BINPUT 5 - 55: R REDUCE - 56: q BINPUT 6 - 58: X BINUNICODE 'def' - 66: q BINPUT 7 - 68: s SETITEM - 69: e APPENDS (MARK at 3) - 70: . STOP + 19: c GLOBAL '_codecs encode' + 35: q BINPUT 3 + 37: ( MARK + 38: X BINUNICODE 'abc' + 46: q BINPUT 4 + 48: X BINUNICODE 'latin1' + 59: q BINPUT 5 + 61: t TUPLE (MARK at 37) + 62: q BINPUT 6 + 64: R REDUCE + 65: q BINPUT 7 + 67: X BINUNICODE 'def' + 75: q BINPUT 8 + 77: s SETITEM + 78: e APPENDS (MARK at 3) + 79: . STOP highest protocol among opcodes = 1 Exercise the INST/OBJ/BUILD family. diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 49be720..cab0523 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -636,9 +636,15 @@ class AbstractPickleTests(unittest.TestCase): def test_bytes(self): for proto in protocols: - for u in b'', b'xyz', b'xyz'*100: - p = self.dumps(u) - self.assertEqual(self.loads(p), u) + for s in b'', b'xyz', b'xyz'*100: + p = self.dumps(s) + self.assertEqual(self.loads(p), s) + for s in [bytes([i]) for i in range(256)]: + p = self.dumps(s) + self.assertEqual(self.loads(p), s) + for s in [bytes([i, i]) for i in range(256)]: + p = self.dumps(s) + self.assertEqual(self.loads(p), s) def test_ints(self): import sys diff --git a/Misc/NEWS b/Misc/NEWS index 6d18ddf..f654d8d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.2.3? Core and Builtins ----------------- +- Issue #13505: Pickle bytes objects in a way that is compatible with + Python 2 when using protocols <= 2. + - Issue #11147: Fix an unused argument in _Py_ANNOTATE_MEMORY_ORDER. (Fix given by Campbell Barton). diff --git a/Modules/_pickle.c b/Modules/_pickle.c index cb58349..fc5f871 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -369,7 +369,7 @@ typedef struct UnpicklerObject { char *errors; /* Name of errors handling scheme to used when decoding strings. The default value is "strict". */ - Py_ssize_t *marks; /* Mark stack, used for unpickling container + Py_ssize_t *marks; /* Mark stack, used for unpickling container objects. */ Py_ssize_t num_marks; /* Number of marks in the mark stack. */ Py_ssize_t marks_size; /* Current allocated size of the mark stack. */ @@ -1700,26 +1700,58 @@ save_bytes(PicklerObject *self, PyObject *obj) if (self->proto < 3) { /* Older pickle protocols do not have an opcode for pickling bytes objects. Therefore, we need to fake the copy protocol (i.e., - the __reduce__ method) to permit bytes object unpickling. */ + the __reduce__ method) to permit bytes object unpickling. + + Here we use a hack to be compatible with Python 2. Since in Python + 2 'bytes' is just an alias for 'str' (which has different + parameters than the actual bytes object), we use codecs.encode + to create the appropriate 'str' object when unpickled using + Python 2 *and* the appropriate 'bytes' object when unpickled + using Python 3. Again this is a hack and we don't need to do this + with newer protocols. */ + static PyObject *codecs_encode = NULL; PyObject *reduce_value = NULL; - PyObject *bytelist = NULL; int status; - bytelist = PySequence_List(obj); - if (bytelist == NULL) - return -1; + if (codecs_encode == NULL) { + PyObject *codecs_module = PyImport_ImportModule("codecs"); + if (codecs_module == NULL) { + return -1; + } + codecs_encode = PyObject_GetAttrString(codecs_module, "encode"); + Py_DECREF(codecs_module); + if (codecs_encode == NULL) { + return -1; + } + } - reduce_value = Py_BuildValue("(O(O))", (PyObject *)&PyBytes_Type, - bytelist); - if (reduce_value == NULL) { - Py_DECREF(bytelist); - return -1; + if (PyBytes_GET_SIZE(obj) == 0) { + reduce_value = Py_BuildValue("(O())", (PyObject*)&PyBytes_Type); } + else { + static PyObject *latin1 = NULL; + PyObject *unicode_str = + PyUnicode_DecodeLatin1(PyBytes_AS_STRING(obj), + PyBytes_GET_SIZE(obj), + "strict"); + if (unicode_str == NULL) + return -1; + if (latin1 == NULL) { + latin1 = PyUnicode_InternFromString("latin1"); + if (latin1 == NULL) + return -1; + } + reduce_value = Py_BuildValue("(O(OO))", + codecs_encode, unicode_str, latin1); + Py_DECREF(unicode_str); + } + + if (reduce_value == NULL) + return -1; /* save_reduce() will memoize the object automatically. */ status = save_reduce(self, reduce_value, obj); Py_DECREF(reduce_value); - Py_DECREF(bytelist); return status; } else { @@ -1727,7 +1759,7 @@ save_bytes(PicklerObject *self, PyObject *obj) char header[5]; Py_ssize_t len; - size = PyBytes_Size(obj); + size = PyBytes_GET_SIZE(obj); if (size < 0) return -1; -- cgit v0.12