summary | refs | log | tree | commit | diff | stats
path: root/Modules
diff options
context:
space:
mode:
author: Alexandre Vassalotti <alexandre@peadrop.com>, 2008-12-27 07:08:47 (GMT)
committer: Alexandre Vassalotti <alexandre@peadrop.com>, 2008-12-27 07:08:47 (GMT)
commit: f852bf97ef45e10f37938434c84b58d65b1b9a7e (patch)
tree: 439a2c1f154f37c6600e436b7f12e971d295552c /Modules
parent: 034e08ce8daafb61bfc8e3f7c6e3b6194e05dd78 (diff)
download: cpython-f852bf97ef45e10f37938434c84b58d65b1b9a7e.zip
cpython-f852bf97ef45e10f37938434c84b58d65b1b9a7e.tar.gz
cpython-f852bf97ef45e10f37938434c84b58d65b1b9a7e.tar.bz2
Fix issue #4730: cPickle corrupts high-unicode strings.
Update outdated copy of PyUnicode_EncodeRawUnicodeEscape. Add a test case.
Diffstat (limited to 'Modules')
-rw-r--r-- Modules/cPickle.c 109
1 files changed, 79 insertions, 30 deletions
diff --git a/Modules/cPickle.c b/Modules/cPickle.c
index f777286..18baee1 100644
--- a/Modules/cPickle.c
+++ b/Modules/cPickle.c
@@ -1255,41 +1255,90 @@ save_string(Picklerobject *self, PyObject *args, int doput)
/* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
backslash and newline characters to \uXXXX escapes. */
static PyObject *
-modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
+modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size)
{
- PyObject *repr;
- char *p;
- char *q;
+ PyObject *repr;
+ char *p;
+ char *q;
- static const char *hexdigit = "0123456789ABCDEF";
+ static const char *hexdigit = "0123456789abcdef";
+#ifdef Py_UNICODE_WIDE
+ const Py_ssize_t expandsize = 10;
+#else
+ const Py_ssize_t expandsize = 6;
+#endif
- repr = PyString_FromStringAndSize(NULL, 6 * size);
- if (repr == NULL)
- return NULL;
- if (size == 0)
- return repr;
-
- p = q = PyString_AS_STRING(repr);
- while (size-- > 0) {
- Py_UNICODE ch = *s++;
- /* Map 16-bit characters to '\uxxxx' */
- if (ch >= 256 || ch == '\\' || ch == '\n') {
- *p++ = '\\';
- *p++ = 'u';
- *p++ = hexdigit[(ch >> 12) & 0xf];
- *p++ = hexdigit[(ch >> 8) & 0xf];
- *p++ = hexdigit[(ch >> 4) & 0xf];
- *p++ = hexdigit[ch & 15];
- }
- /* Copy everything else as-is */
- else
- *p++ = (char) ch;
- }
- *p = '\0';
- _PyString_Resize(&repr, p - q);
+ if (size > PY_SSIZE_T_MAX / expandsize)
+ return PyErr_NoMemory();
+
+ repr = PyString_FromStringAndSize(NULL, expandsize * size);
+ if (repr == NULL)
+ return NULL;
+ if (size == 0)
return repr;
-}
+ p = q = PyString_AS_STRING(repr);
+ while (size-- > 0) {
+ Py_UNICODE ch = *s++;
+#ifdef Py_UNICODE_WIDE
+ /* Map 32-bit characters to '\Uxxxxxxxx' */
+ if (ch >= 0x10000) {
+ *p++ = '\\';
+ *p++ = 'U';
+ *p++ = hexdigit[(ch >> 28) & 0xf];
+ *p++ = hexdigit[(ch >> 24) & 0xf];
+ *p++ = hexdigit[(ch >> 20) & 0xf];
+ *p++ = hexdigit[(ch >> 16) & 0xf];
+ *p++ = hexdigit[(ch >> 12) & 0xf];
+ *p++ = hexdigit[(ch >> 8) & 0xf];
+ *p++ = hexdigit[(ch >> 4) & 0xf];
+ *p++ = hexdigit[ch & 15];
+ }
+ else
+#else
+ /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+ if (ch >= 0xD800 && ch < 0xDC00) {
+ Py_UNICODE ch2;
+ Py_UCS4 ucs;
+
+ ch2 = *s++;
+ size--;
+ if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+ ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+ *p++ = '\\';
+ *p++ = 'U';
+ *p++ = hexdigit[(ucs >> 28) & 0xf];
+ *p++ = hexdigit[(ucs >> 24) & 0xf];
+ *p++ = hexdigit[(ucs >> 20) & 0xf];
+ *p++ = hexdigit[(ucs >> 16) & 0xf];
+ *p++ = hexdigit[(ucs >> 12) & 0xf];
+ *p++ = hexdigit[(ucs >> 8) & 0xf];
+ *p++ = hexdigit[(ucs >> 4) & 0xf];
+ *p++ = hexdigit[ucs & 0xf];
+ continue;
+ }
+ /* Fall through: isolated surrogates are copied as-is */
+ s--;
+ size++;
+ }
+#endif
+ /* Map 16-bit characters to '\uxxxx' */
+ if (ch >= 256 || ch == '\\' || ch == '\n') {
+ *p++ = '\\';
+ *p++ = 'u';
+ *p++ = hexdigit[(ch >> 12) & 0xf];
+ *p++ = hexdigit[(ch >> 8) & 0xf];
+ *p++ = hexdigit[(ch >> 4) & 0xf];
+ *p++ = hexdigit[ch & 15];
+ }
+ /* Copy everything else as-is */
+ else
+ *p++ = (char) ch;
+ }
+ *p = '\0';
+ _PyString_Resize(&repr, p - q);
+ return repr;
+}
static int
save_unicode(Picklerobject *self, PyObject *args, int doput)