diff options
author | Alexandre Vassalotti <alexandre@peadrop.com> | 2008-12-27 07:08:47 (GMT) |
---|---|---|
committer | Alexandre Vassalotti <alexandre@peadrop.com> | 2008-12-27 07:08:47 (GMT) |
commit | f852bf97ef45e10f37938434c84b58d65b1b9a7e (patch) | |
tree | 439a2c1f154f37c6600e436b7f12e971d295552c /Modules | |
parent | 034e08ce8daafb61bfc8e3f7c6e3b6194e05dd78 (diff) | |
download | cpython-f852bf97ef45e10f37938434c84b58d65b1b9a7e.zip cpython-f852bf97ef45e10f37938434c84b58d65b1b9a7e.tar.gz cpython-f852bf97ef45e10f37938434c84b58d65b1b9a7e.tar.bz2 |
Fix issue #4730: cPickle corrupts high-unicode strings.
Update outdated copy of PyUnicode_EncodeRawUnicodeEscape.
Add a test case.
Diffstat (limited to 'Modules')
-rw-r--r-- | Modules/cPickle.c | 109 |
1 files changed, 79 insertions, 30 deletions
```diff
diff --git a/Modules/cPickle.c b/Modules/cPickle.c
index f777286..18baee1 100644
--- a/Modules/cPickle.c
+++ b/Modules/cPickle.c
@@ -1255,41 +1255,90 @@ save_string(Picklerobject *self, PyObject *args, int doput)
 /* A copy of PyUnicode_EncodeRawUnicodeEscape() that also translates
    backslash and newline characters to \uXXXX escapes. */
 static PyObject *
-modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, int size)
+modified_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size)
 {
-	PyObject *repr;
-	char *p;
-	char *q;
+    PyObject *repr;
+    char *p;
+    char *q;
 
-	static const char *hexdigit = "0123456789ABCDEF";
+    static const char *hexdigit = "0123456789abcdef";
+#ifdef Py_UNICODE_WIDE
+    const Py_ssize_t expandsize = 10;
+#else
+    const Py_ssize_t expandsize = 6;
+#endif
 
-	repr = PyString_FromStringAndSize(NULL, 6 * size);
-	if (repr == NULL)
-		return NULL;
-	if (size == 0)
-		return repr;
-
-	p = q = PyString_AS_STRING(repr);
-	while (size-- > 0) {
-		Py_UNICODE ch = *s++;
-		/* Map 16-bit characters to '\uxxxx' */
-		if (ch >= 256 || ch == '\\' || ch == '\n') {
-			*p++ = '\\';
-			*p++ = 'u';
-			*p++ = hexdigit[(ch >> 12) & 0xf];
-			*p++ = hexdigit[(ch >> 8) & 0xf];
-			*p++ = hexdigit[(ch >> 4) & 0xf];
-			*p++ = hexdigit[ch & 15];
-		}
-		/* Copy everything else as-is */
-		else
-			*p++ = (char) ch;
-	}
-	*p = '\0';
-	_PyString_Resize(&repr, p - q);
+    if (size > PY_SSIZE_T_MAX / expandsize)
+        return PyErr_NoMemory();
+
+    repr = PyString_FromStringAndSize(NULL, expandsize * size);
+    if (repr == NULL)
+        return NULL;
+    if (size == 0)
 	return repr;
-}
 
+    p = q = PyString_AS_STRING(repr);
+    while (size-- > 0) {
+        Py_UNICODE ch = *s++;
+#ifdef Py_UNICODE_WIDE
+        /* Map 32-bit characters to '\Uxxxxxxxx' */
+        if (ch >= 0x10000) {
+            *p++ = '\\';
+            *p++ = 'U';
+            *p++ = hexdigit[(ch >> 28) & 0xf];
+            *p++ = hexdigit[(ch >> 24) & 0xf];
+            *p++ = hexdigit[(ch >> 20) & 0xf];
+            *p++ = hexdigit[(ch >> 16) & 0xf];
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 15];
+        }
+        else
+#else
+        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+        if (ch >= 0xD800 && ch < 0xDC00) {
+            Py_UNICODE ch2;
+            Py_UCS4 ucs;
+
+            ch2 = *s++;
+            size--;
+            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+                *p++ = '\\';
+                *p++ = 'U';
+                *p++ = hexdigit[(ucs >> 28) & 0xf];
+                *p++ = hexdigit[(ucs >> 24) & 0xf];
+                *p++ = hexdigit[(ucs >> 20) & 0xf];
+                *p++ = hexdigit[(ucs >> 16) & 0xf];
+                *p++ = hexdigit[(ucs >> 12) & 0xf];
+                *p++ = hexdigit[(ucs >> 8) & 0xf];
+                *p++ = hexdigit[(ucs >> 4) & 0xf];
+                *p++ = hexdigit[ucs & 0xf];
+                continue;
+            }
+            /* Fall through: isolated surrogates are copied as-is */
+            s--;
+            size++;
+        }
+#endif
+        /* Map 16-bit characters to '\uxxxx' */
+        if (ch >= 256 || ch == '\\' || ch == '\n') {
+            *p++ = '\\';
+            *p++ = 'u';
+            *p++ = hexdigit[(ch >> 12) & 0xf];
+            *p++ = hexdigit[(ch >> 8) & 0xf];
+            *p++ = hexdigit[(ch >> 4) & 0xf];
+            *p++ = hexdigit[ch & 15];
+        }
+        /* Copy everything else as-is */
+        else
+            *p++ = (char) ch;
+    }
+    *p = '\0';
+    _PyString_Resize(&repr, p - q);
+    return repr;
+}
 
 static int
 save_unicode(Picklerobject *self, PyObject *args, int doput)
```