summary | refs | log | tree | commit | diff | stats
path: root/Python
diff options
context:
space:
mode:
author: Antoine Pitrou <solipsis@pitrou.net> 2010-09-09 20:33:43 (GMT)
committer: Antoine Pitrou <solipsis@pitrou.net> 2010-09-09 20:33:43 (GMT)
commit: c9a8df24cc8c95efb63b9820d9381ad2f54e45c5 (patch)
tree: 590f0f94fd1907e7849a30f071ee6d27af1a3fbb /Python
parent: 8e0bb6a1e2907797cd6e4b7cc90539904e54db7e (diff)
download: cpython-c9a8df24cc8c95efb63b9820d9381ad2f54e45c5.zip
cpython-c9a8df24cc8c95efb63b9820d9381ad2f54e45c5.tar.gz
cpython-c9a8df24cc8c95efb63b9820d9381ad2f54e45c5.tar.bz2
Merged revisions 84655 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k

........
  r84655 | antoine.pitrou | 2010-09-09 22:30:23 +0200 (jeu., 09 sept. 2010) | 6 lines

  Issue #9804: ascii() now always represents unicode surrogate pairs as
  a single `\UXXXXXXXX`, regardless of whether the character is printable
  or not. Also, the "backslashreplace" error handler now joins surrogate
  pairs into a single character on UCS-2 builds.
........
Diffstat (limited to 'Python')
-rw-r--r-- Python/codecs.c 26
1 file changed, 20 insertions, 6 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index 04487a2..45d9929 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
+#ifndef Py_UNICODE_WIDE
+#define IS_SURROGATE_PAIR(p, end) \
+ (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
+ *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
+#else
+#define IS_SURROGATE_PAIR(p, end) 0
+#endif
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple;
PyObject *object;
@@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
else
#endif
if (*p >= 0x100) {
- ressize += 1+1+4;
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ ressize += 1+1+8;
+ ++p;
+ }
+ else
+ ressize += 1+1+4;
}
else
ressize += 1+1+2;
@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
return NULL;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
p < startp+end; ++p) {
- Py_UNICODE c = *p;
+ Py_UCS4 c = (Py_UCS4) *p;
*outp++ = '\\';
-#ifdef Py_UNICODE_WIDE
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
+ ++p;
+ }
if (c >= 0x00010000) {
*outp++ = 'U';
*outp++ = hexdigits[(c>>28)&0xf];
@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
*outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf];
}
- else
-#endif
- if (c >= 0x100) {
+ else if (c >= 0x100) {
*outp++ = 'u';
*outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf];
@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
wrong_exception_type(exc);
return NULL;
}
+#undef IS_SURROGATE_PAIR
}
/* This handler is declared static until someone demonstrates