summary refs log tree commit diff stats
path: root/Python/codecs.c
diff options
context:
space:
mode:
author Antoine Pitrou <solipsis@pitrou.net> 2010-09-09 20:30:23 (GMT)
committer Antoine Pitrou <solipsis@pitrou.net> 2010-09-09 20:30:23 (GMT)
commit e4a189274f3d88d64d5238bf340cec96eff4e5e0 (patch)
tree 5ead5f4f2fe3799a34155f2e41a04518adb995b1 /Python/codecs.c
parent ea99c5c94985c21d8a64c9a3d753bde7f801c14a (diff)
download cpython-e4a189274f3d88d64d5238bf340cec96eff4e5e0.zip
cpython-e4a189274f3d88d64d5238bf340cec96eff4e5e0.tar.gz
cpython-e4a189274f3d88d64d5238bf340cec96eff4e5e0.tar.bz2
Issue #9804: ascii() now always represents Unicode surrogate pairs as
a single `\UXXXXXXXX` escape, regardless of whether the character is printable or not. Also, the "backslashreplace" error handler now joins surrogate pairs into a single character on UCS-2 builds.
Diffstat (limited to 'Python/codecs.c')
-rw-r--r-- Python/codecs.c 26
1 files changed, 20 insertions, 6 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index 04487a2..45d9929 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
+#ifndef Py_UNICODE_WIDE
+#define IS_SURROGATE_PAIR(p, end) \
+ (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
+ *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
+#else
+#define IS_SURROGATE_PAIR(p, end) 0
+#endif
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple;
PyObject *object;
@@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
else
#endif
if (*p >= 0x100) {
- ressize += 1+1+4;
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ ressize += 1+1+8;
+ ++p;
+ }
+ else
+ ressize += 1+1+4;
}
else
ressize += 1+1+2;
@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
return NULL;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
p < startp+end; ++p) {
- Py_UNICODE c = *p;
+ Py_UCS4 c = (Py_UCS4) *p;
*outp++ = '\\';
-#ifdef Py_UNICODE_WIDE
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
+ ++p;
+ }
if (c >= 0x00010000) {
*outp++ = 'U';
*outp++ = hexdigits[(c>>28)&0xf];
@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
*outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf];
}
- else
-#endif
- if (c >= 0x100) {
+ else if (c >= 0x100) {
*outp++ = 'u';
*outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf];
@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
wrong_exception_type(exc);
return NULL;
}
+#undef IS_SURROGATE_PAIR
}
/* This handler is declared static until someone demonstrates