From e822b034e766e03cd8fbe7ab52fbc2d46fff6d33 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 6 Aug 2013 16:56:26 +0300 Subject: Issue #15866: The xmlcharrefreplace error handler no longer produces two XML entities for a non-BMP character on narrow build. --- Lib/test/test_codeccallbacks.py | 25 +++++++++++-- Lib/test/test_unicode.py | 12 ++++++ Misc/NEWS | 3 ++ Modules/_testcapimodule.c | 2 +- Objects/unicodeobject.c | 82 ++++++++++++++++++++++++++++++----------- Python/codecs.c | 66 ++++++++++++++++++--------------- 6 files changed, 135 insertions(+), 55 deletions(-) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 61c2df2..ecaf997 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -66,15 +66,34 @@ class CodecCallbackTest(unittest.TestCase): # replace unencodable characters which numeric character entities. # For ascii, latin-1 and charmaps this is completely implemented # in C and should be reasonably fast. 
- s = u"\u30b9\u30d1\u30e2 \xe4nd eggs" + s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161" self.assertEqual( s.encode("ascii", "xmlcharrefreplace"), - "&#12473;&#12497;&#12514; &#228;nd eggs" + "&#12473;&#12497;&#12514; &#228;nd egg&#353;" ) self.assertEqual( s.encode("latin-1", "xmlcharrefreplace"), - "&#12473;&#12497;&#12514; \xe4nd eggs" + "&#12473;&#12497;&#12514; \xe4nd egg&#353;" ) + self.assertEqual( + s.encode("iso-8859-15", "xmlcharrefreplace"), + "&#12473;&#12497;&#12514; \xe4nd egg\xa8" + ) + + def test_xmlcharrefreplace_with_surrogates(self): + tests = [(u'\U0001f49d', '&#128157;'), + (u'\ud83d', '&#55357;'), + (u'\udc9d', '&#56477;'), + (u'\ud83d\udc9d', '&#128157;' if len(u'\U0001f49d') > 1 else + '&#55357;&#56477;'), + ] + for encoding in ['ascii', 'latin1', 'iso-8859-15']: + for s, exp in tests: + self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'), + exp, msg='%r.encode(%r)' % (s, encoding)) + self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'), + exp+'X', + msg='%r.encode(%r)' % (s + 'X', encoding)) def test_xmlcharnamereplace(self): # This time use a named character entity for unencodable diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index e44fe03..666cab8 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1658,6 +1658,18 @@ class UnicodeTest( self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"), b'123?0') + def test_encode_decimal_with_surrogates(self): + from _testcapi import unicode_encodedecimal + tests = [(u'\U0001f49d', '&#128157;'), + (u'\ud83d', '&#55357;'), + (u'\udc9d', '&#56477;'), + (u'\ud83d\udc9d', '&#128157;' if len(u'\U0001f49d') > 1 else + '&#55357;&#56477;'), + ] + for s, exp in tests: + self.assertEqual( + unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"), + '123' + exp) def test_main(): test_support.run_unittest(__name__) diff --git a/Misc/NEWS b/Misc/NEWS index 64668dd..af3a94b 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -9,6 +9,9 @@ What's New in Python 2.7.6? Core and Builtins ----------------- +- Issue #15866: The xmlcharrefreplace error handler no longer produces two XML + entities for a non-BMP character on narrow build. 
+ - Issue #18184: PyUnicode_FromFormat() and PyUnicode_FromFormatV() now raise OverflowError when an argument of %c format is out of range. diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index b0386f0..4e7d47d 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -1118,7 +1118,7 @@ unicode_encodedecimal(PyObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors)) return NULL; - decimal_length = length * 7; /* len('&#8364;') */ + decimal_length = length * 10; /* len('&#1114111;') */ decimal = PyBytes_FromStringAndSize(NULL, decimal_length); if (decimal == NULL) return NULL; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 64a5ef5..866eb9b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -547,6 +547,37 @@ PyObject *PyUnicode_FromString(const char *u) return PyUnicode_FromStringAndSize(u, size); } +/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed + * by 'ptr', possibly combining surrogate pairs on narrow builds. + * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character + * that should be returned and 'end' pointing to the end of the buffer. + * ('end' is used on narrow builds to detect a lone surrogate at the + * end of the buffer that should be returned unchanged.) + * The ptr and end arguments should be side-effect free and ptr must be an lvalue. + * The type of the returned char is always Py_UCS4. + * + * Note: the macro advances ptr to next char, so it might have side-effects + * (especially if used with other macros). + */ + +/* helper macros used by _Py_UNICODE_NEXT */ +#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) +#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) +/* Join two surrogate characters and return a single Py_UCS4 value. 
*/ +#define _Py_UNICODE_JOIN_SURROGATES(high, low) \ + (((((Py_UCS4)(high) & 0x03FF) << 10) | \ + ((Py_UCS4)(low) & 0x03FF)) + 0x10000) + +#ifdef Py_UNICODE_WIDE +#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ +#else +#define _Py_UNICODE_NEXT(ptr, end) \ + (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ + _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \ + ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ + (Py_UCS4)*(ptr)++) +#endif + #ifdef HAVE_WCHAR_H #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) @@ -3642,26 +3673,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, case 4: /* xmlcharrefreplace */ respos = str-PyString_AS_STRING(res); /* determine replacement size (temporarily (mis)uses p) */ - for (p = collstart, repsize = 0; p < collend; ++p) { - if (*p<10) + for (p = collstart, repsize = 0; p < collend;) { + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + if (ch < 10) repsize += 2+1+1; - else if (*p<100) + else if (ch < 100) repsize += 2+2+1; - else if (*p<1000) + else if (ch < 1000) repsize += 2+3+1; - else if (*p<10000) + else if (ch < 10000) repsize += 2+4+1; -#ifndef Py_UNICODE_WIDE - else + else if (ch < 100000) repsize += 2+5+1; -#else - else if (*p<100000) - repsize += 2+5+1; - else if (*p<1000000) + else if (ch < 1000000) repsize += 2+6+1; else repsize += 2+7+1; -#endif } requiredsize = respos+repsize+(endp-collend); if (requiredsize > ressize) { @@ -3673,8 +3700,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, ressize = requiredsize; } /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) { - str += sprintf(str, "&#%d;", (int)*p); + for (p = collstart; p < collend;) { + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + str += sprintf(str, "&#%d;", (int)ch); } p = collend; break; @@ -4649,11 +4677,20 @@ int charmap_encoding_error( *inpos = collendpos; break; case 4: /* xmlcharrefreplace */ - /* generate replacement (temporarily (mis)uses 
p) */ - for (collpos = collstartpos; collpos < collendpos; ++collpos) { + /* generate replacement */ + for (collpos = collstartpos; collpos < collendpos;) { char buffer[2+29+1+1]; char *cp; - sprintf(buffer, "&#%d;", (int)p[collpos]); + Py_UCS4 ch = p[collpos++]; +#ifndef Py_UNICODE_WIDE + if ((0xD800 <= ch && ch <= 0xDBFF) && + (collpos < collendpos) && + (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) { + ch = ((((ch & 0x03FF) << 10) | + ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000); + } +#endif + sprintf(buffer, "&#%d;", (int)ch); for (cp = buffer; *cp; ++cp) { x = charmapencode_output(*cp, mapping, res, respos); if (x==enc_EXCEPTION) @@ -5068,10 +5105,11 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, break; case 4: /* xmlcharrefreplace */ /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) { + for (p = collstart; p < collend;) { char buffer[2+29+1+1]; char *cp; - sprintf(buffer, "&#%d;", (int)*p); + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + sprintf(buffer, "&#%d;", (int)ch); if (charmaptranslate_makespace(&res, &str, (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) goto onError; @@ -5222,8 +5260,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, break; case 4: /* xmlcharrefreplace */ /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) - output += sprintf(output, "&#%d;", (int)*p); + for (p = collstart; p < collend;) { + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + output += sprintf(output, "&#%d;", ch); + } p = collend; break; default: diff --git a/Python/codecs.c b/Python/codecs.c index 7334eb3..91147a0 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -556,6 +556,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) PyObject *res; Py_UNICODE *p; Py_UNICODE *startp; + Py_UNICODE *e; Py_UNICODE *outp; int ressize; if (PyUnicodeEncodeError_GetStart(exc, &start)) @@ -565,26 +566,31 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) 
if (!(object = PyUnicodeEncodeError_GetObject(exc))) return NULL; startp = PyUnicode_AS_UNICODE(object); - for (p = startp+start, ressize = 0; p < startp+end; ++p) { - if (*p<10) + e = startp + end; + for (p = startp+start, ressize = 0; p < e;) { + Py_UCS4 ch = *p++; +#ifndef Py_UNICODE_WIDE + if ((0xD800 <= ch && ch <= 0xDBFF) && + (p < e) && + (0xDC00 <= *p && *p <= 0xDFFF)) { + ch = ((((ch & 0x03FF) << 10) | + ((Py_UCS4)*p++ & 0x03FF)) + 0x10000); + } +#endif + if (ch < 10) ressize += 2+1+1; - else if (*p<100) + else if (ch < 100) ressize += 2+2+1; - else if (*p<1000) + else if (ch < 1000) ressize += 2+3+1; - else if (*p<10000) + else if (ch < 10000) ressize += 2+4+1; -#ifndef Py_UNICODE_WIDE - else - ressize += 2+5+1; -#else - else if (*p<100000) + else if (ch < 100000) ressize += 2+5+1; - else if (*p<1000000) + else if (ch < 1000000) ressize += 2+6+1; else ressize += 2+7+1; -#endif } /* allocate replacement */ res = PyUnicode_FromUnicode(NULL, ressize); @@ -593,40 +599,41 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) return NULL; } /* generate replacement */ - for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); - p < startp+end; ++p) { - Py_UNICODE c = *p; + for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) { int digits; int base; + Py_UCS4 ch = *p++; +#ifndef Py_UNICODE_WIDE + if ((0xD800 <= ch && ch <= 0xDBFF) && + (p < startp+end) && + (0xDC00 <= *p && *p <= 0xDFFF)) { + ch = ((((ch & 0x03FF) << 10) | + ((Py_UCS4)*p++ & 0x03FF)) + 0x10000); + } +#endif *outp++ = '&'; *outp++ = '#'; - if (*p<10) { + if (ch < 10) { digits = 1; base = 1; } - else if (*p<100) { + else if (ch < 100) { digits = 2; base = 10; } - else if (*p<1000) { + else if (ch < 1000) { digits = 3; base = 100; } - else if (*p<10000) { + else if (ch < 10000) { digits = 4; base = 1000; } -#ifndef Py_UNICODE_WIDE - else { - digits = 5; - base = 10000; - } -#else - else if (*p<100000) { + else if (ch < 100000) { digits = 5; base = 10000; } - else if (*p<1000000) { 
+ else if (ch < 1000000) { digits = 6; base = 100000; } @@ -634,10 +641,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) digits = 7; base = 1000000; } -#endif while (digits-->0) { - *outp++ = '0' + c/base; - c %= base; + *outp++ = '0' + ch/base; + ch %= base; base /= 10; } *outp++ = ';'; -- cgit v0.12