diff options
author | Victor Stinner <victor.stinner@haypocalc.com> | 2010-04-13 11:07:24 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@haypocalc.com> | 2010-04-13 11:07:24 (GMT) |
commit | 485fb56eb86a1fcd35fd3d0d37efb5ec514dba2b (patch) | |
tree | f6b23e9670e70af104a248010798942acdbc5c7b | |
parent | 36067606707844f7de076cf1846afb767b494d7e (diff) | |
download | cpython-485fb56eb86a1fcd35fd3d0d37efb5ec514dba2b.zip cpython-485fb56eb86a1fcd35fd3d0d37efb5ec514dba2b.tar.gz cpython-485fb56eb86a1fcd35fd3d0d37efb5ec514dba2b.tar.bz2 |
Issue #8383: pickle and pickletools use the surrogatepass error handler when
encoding and decoding unicode as UTF-8, to support lone surrogates and stay
compatible with Python 2.x and 3.0.
-rw-r--r-- | Lib/pickle.py | 4 | ||||
-rw-r--r-- | Lib/pickletools.py | 2 | ||||
-rw-r--r-- | Lib/test/pickletester.py | 4 | ||||
-rw-r--r-- | Misc/NEWS | 4 | ||||
-rw-r--r-- | Modules/_pickle.c | 6 |
5 files changed, 14 insertions, 6 deletions
diff --git a/Lib/pickle.py b/Lib/pickle.py
index 8a2abcc..c4fc2c4 100644
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -499,7 +499,7 @@ class _Pickler:

     def save_str(self, obj, pack=struct.pack):
         if self.bin:
-            encoded = obj.encode('utf-8')
+            encoded = obj.encode('utf-8', 'surrogatepass')
             n = len(encoded)
             self.write(BINUNICODE + pack("<i", n) + encoded)
         else:
@@ -966,7 +966,7 @@ class _Unpickler:

     def load_binunicode(self):
         len = mloads(b'i' + self.read(4))
-        self.append(str(self.read(len), 'utf-8'))
+        self.append(str(self.read(len), 'utf-8', 'surrogatepass'))
     dispatch[BINUNICODE[0]] = load_binunicode

     def load_short_binstring(self):
diff --git a/Lib/pickletools.py b/Lib/pickletools.py
index ca11aa3..6ab75c7 100644
--- a/Lib/pickletools.py
+++ b/Lib/pickletools.py
@@ -469,7 +469,7 @@ def read_unicodestring4(f):
         raise ValueError("unicodestring4 byte count < 0: %d" % n)
     data = f.read(n)
     if len(data) == n:
-        return str(data, 'utf-8')
+        return str(data, 'utf-8', 'surrogatepass')
     raise ValueError("expected %d bytes in a unicodestring4, but only %d "
                      "remain" % (n, len(data)))

diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py
index 79407a6..dd0ed15 100644
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -515,7 +515,9 @@ class AbstractPickleTests(unittest.TestCase):

     def test_unicode(self):
         endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
-                    '<\\>', '<\\\U00012345>']
+                    '<\\>', '<\\\U00012345>',
+                    # surrogates
+                    '<\udc80>']
         for proto in protocols:
             for u in endcases:
                 p = self.dumps(u, proto)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -312,6 +312,10 @@ C-API
 Library
 -------

+- Issue #8383: pickle and pickletools use surrogatepass error handler when
+  encoding unicode as utf8 to support lone surrogates and stay compatible with
+  Python 2.x and 3.0
+
 - Issue #7585: difflib context and unified diffs now place a tab between filename
   and date, conforming to the 'standards' they were originally designed to follow.
   This improves compatibility with patch tools.
diff --git a/Modules/_pickle.c b/Modules/_pickle.c
index 29aed7a..0e1c2cd 100644
--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -1227,7 +1227,9 @@ save_unicode(PicklerObject *self, PyObject *obj)
     if (self->bin) {
         char pdata[5];

-        encoded = PyUnicode_AsUTF8String(obj);
+        encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(obj),
+                                       PyUnicode_GET_SIZE(obj),
+                                       "surrogatepass");
         if (encoded == NULL)
             goto error;

@@ -3352,7 +3354,7 @@ load_binunicode(UnpicklerObject *self)
     if (unpickler_read(self, &s, size) < 0)
         return -1;

-    str = PyUnicode_DecodeUTF8(s, size, NULL);
+    str = PyUnicode_DecodeUTF8(s, size, "surrogatepass");
     if (str == NULL)
         return -1;
