diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2010-09-09 20:33:43 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2010-09-09 20:33:43 (GMT) |
commit | c9a8df24cc8c95efb63b9820d9381ad2f54e45c5 (patch) | |
tree | 590f0f94fd1907e7849a30f071ee6d27af1a3fbb /Python | |
parent | 8e0bb6a1e2907797cd6e4b7cc90539904e54db7e (diff) | |
download | cpython-c9a8df24cc8c95efb63b9820d9381ad2f54e45c5.zip cpython-c9a8df24cc8c95efb63b9820d9381ad2f54e45c5.tar.gz cpython-c9a8df24cc8c95efb63b9820d9381ad2f54e45c5.tar.bz2 |
Merged revisions 84655 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k
........
r84655 | antoine.pitrou | 2010-09-09 22:30:23 +0200 (jeu., 09 sept. 2010) | 6 lines
Issue #9804: ascii() now always represents unicode surrogate pairs as
a single `\UXXXXXXXX`, regardless of whether the character is printable
or not. Also, the "backslashreplace" error handler now joins surrogate
pairs into a single character on UCS-2 builds.
........
Diffstat (limited to 'Python')
-rw-r--r-- | Python/codecs.c | 26 |
1 files changed, 20 insertions, 6 deletions
diff --git a/Python/codecs.c b/Python/codecs.c index 04487a2..45d9929 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = { PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { +#ifndef Py_UNICODE_WIDE +#define IS_SURROGATE_PAIR(p, end) \ + (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \ + *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF) +#else +#define IS_SURROGATE_PAIR(p, end) 0 +#endif if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *restuple; PyObject *object; @@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) else #endif if (*p >= 0x100) { - ressize += 1+1+4; + if (IS_SURROGATE_PAIR(p, startp+end)) { + ressize += 1+1+8; + ++p; + } + else + ressize += 1+1+4; } else ressize += 1+1+2; @@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) return NULL; for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < startp+end; ++p) { - Py_UNICODE c = *p; + Py_UCS4 c = (Py_UCS4) *p; *outp++ = '\\'; -#ifdef Py_UNICODE_WIDE + if (IS_SURROGATE_PAIR(p, startp+end)) { + c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000; + ++p; + } if (c >= 0x00010000) { *outp++ = 'U'; *outp++ = hexdigits[(c>>28)&0xf]; @@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) *outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>8)&0xf]; } - else -#endif - if (c >= 0x100) { + else if (c >= 0x100) { *outp++ = 'u'; *outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>8)&0xf]; @@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } +#undef IS_SURROGATE_PAIR } /* This handler is declared static until someone demonstrates |