Minimal fix for the complaints about pickling Unicode objects (SF bugs #126161 and #123634).

The solution doesn't use the unicode-escape encoding; that has other problems (it seems not 100% reversible). Rather, it transforms the input Unicode object slightly before encoding it using raw-unicode-escape, so that the decoding will reconstruct the original string: backslash and newline characters are translated into their \uXXXX counterparts.

This is backwards incompatible for strings containing backslashes, but for some of those strings, the pickling was already broken.

Note that SF bug #123634 complains specifically that cPickle fails to unpickle the pickle for u'' (the empty Unicode string) correctly. This was an off-by-one error in load_unicode().

XXX Ugliness: in order to do the modified raw-unicode-escape, I've cut-and-pasted a copy of PyUnicode_EncodeRawUnicodeEscape() into this file that also encodes '\\' and '\n'. It might be nice to migrate this into the Unicode implementation and give this encoding a new name ('half-raw-unicode-escape'? 'pickle-unicode-escape'?); that would help pickle.py too. But right now I can't be bothered with the necessary infrastructural changes.
author: Guido van Rossum <guido@python.org> 2000-12-19 02:08:38 (GMT)
committer: Guido van Rossum <guido@python.org> 2000-12-19 02:08:38 (GMT)
commit: fb10c3f6645cab80b95e52ae76297616dcb6ec64 (patch)
tree: cb84db270d497a2ec516fbef3efc38916d84d52e
parent: 8b74b15b9283bea63a194e71b5c88af5dc6f497d (diff)
download: cpython-fb10c3f6645cab80b95e52ae76297616dcb6ec64.zip
cpython-fb10c3f6645cab80b95e52ae76297616dcb6ec64.tar.gz
cpython-fb10c3f6645cab80b95e52ae76297616dcb6ec64.tar.bz2
1 files changed, 48 insertions, 2 deletions
diff --git a/Modules/cPickle.c b/Modules/cPickle.c
index 5b02c2a..aac2e61 100644
--- a/Modules/cPickle.c
+++ b/Modules/cPickle.c
@@ -1149,6 +1149,51 @@ err:
 }
 
 
+/* Encode SIZE Py_UNICODE characters starting at S the way
+   PyUnicode_EncodeRawUnicodeEscape() does, except that backslash and
+   newline characters are also written as \uXXXX escapes, so the output
+   contains no raw newline and no backslash other than an escape prefix.
+   Returns a new string object, or NULL if allocating or resizing fails. */
+static PyObject *
+modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
+{
+    PyObject *repr;
+    char *p, *q;
+    static const char *hexdigit = "0123456789ABCDEF";
+
+    /* Worst case: every character expands to a six-byte \uXXXX escape. */
+    repr = PyString_FromStringAndSize(NULL, 6 * size);
+    if (repr == NULL)
+        return NULL;
+    if (size == 0)
+        return repr;
+
+    p = q = PyString_AS_STRING(repr);
+    while (size-- > 0) {
+        Py_UNICODE ch = *s++;
+        /* Escape characters >= 256, backslash and newline as '\uxxxx'.
+           NOTE(review): only the low 16 bits of ch are emitted -- confirm. */
+        if (ch >= 256 || ch == '\\' || ch == '\n') {
+            *p++ = '\\';
+            *p++ = 'u';
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 15];
+        }
+        /* Copy everything else as-is */
+        else
+            *p++ = (char) ch;
+    }
+    *p = '\0';
+    /* On failure _PyString_Resize() has already released the string and
+       set repr to NULL, so it must not be DECREF'ed again here. */
+    if (_PyString_Resize(&repr, p - q))
+        return NULL;
+    return repr;
+}
+
+
 static int
 save_unicode(Picklerobject *self, PyObject *args, int doput) {
     int size, len;
@@ -1161,7 +1206,8 @@ save_unicode(Picklerobject *self, PyObject *args, int doput) {
         char *repr_str;
         static char string = UNICODE;
 
-        UNLESS (repr = PyUnicode_AsRawUnicodeEscapeString(args))
+        UNLESS(repr = modified_EncodeRawUnicodeEscape(
+		PyUnicode_AS_UNICODE(args), PyUnicode_GET_SIZE(args)))
             return -1;
 
         if ((len = PyString_Size(repr)) < 0)
@@ -2745,7 +2791,7 @@ load_unicode(Unpicklerobject *self) {
     char *s;
 
     if ((len = (*self->readline_func)(self, &s)) < 0) return -1;
-    if (len < 2) return bad_readline();
+    if (len < 1) return bad_readline();
 
     UNLESS (str = PyUnicode_DecodeRawUnicodeEscape(s, len - 1, NULL))
         goto finally;
author	Guido van Rossum <guido@python.org>	2000-12-19 02:08:38 (GMT)
committer	Guido van Rossum <guido@python.org>	2000-12-19 02:08:38 (GMT)
commit	fb10c3f6645cab80b95e52ae76297616dcb6ec64 (patch)
tree	cb84db270d497a2ec516fbef3efc38916d84d52e
parent	8b74b15b9283bea63a194e71b5c88af5dc6f497d (diff)
download	cpython-fb10c3f6645cab80b95e52ae76297616dcb6ec64.zip cpython-fb10c3f6645cab80b95e52ae76297616dcb6ec64.tar.gz cpython-fb10c3f6645cab80b95e52ae76297616dcb6ec64.tar.bz2