diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2013-08-06 13:56:26 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2013-08-06 13:56:26 (GMT) |
commit | e822b034e766e03cd8fbe7ab52fbc2d46fff6d33 (patch) | |
tree | a2e79671034ae47c4a0d6d0d030aecda007c8123 /Objects | |
parent | 5ad3514822a80c094f1cfe47ae59450001043482 (diff) | |
download | cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.zip cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.tar.gz cpython-e822b034e766e03cd8fbe7ab52fbc2d46fff6d33.tar.bz2 |
Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
entities for a non-BMP character on narrow builds.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 82 |
1 files changed, 61 insertions, 21 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 64a5ef5..866eb9b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -547,6 +547,37 @@ PyObject *PyUnicode_FromString(const char *u) return PyUnicode_FromStringAndSize(u, size); } +/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed + * by 'ptr', possibly combining surrogate pairs on narrow builds. + * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character + * that should be returned and 'end' pointing to the end of the buffer. + * ('end' is used on narrow builds to detect a lone surrogate at the + * end of the buffer that should be returned unchanged.) + * The ptr and end arguments should be side-effect free and ptr must an lvalue. + * The type of the returned char is always Py_UCS4. + * + * Note: the macro advances ptr to next char, so it might have side-effects + * (especially if used with other macros). + */ + +/* helper macros used by _Py_UNICODE_NEXT */ +#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) +#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) +/* Join two surrogate characters and return a single Py_UCS4 value. */ +#define _Py_UNICODE_JOIN_SURROGATES(high, low) \ + (((((Py_UCS4)(high) & 0x03FF) << 10) | \ + ((Py_UCS4)(low) & 0x03FF)) + 0x10000) + +#ifdef Py_UNICODE_WIDE +#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ +#else +#define _Py_UNICODE_NEXT(ptr, end) \ + (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ + _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? 
\ + ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ + (Py_UCS4)*(ptr)++) +#endif + #ifdef HAVE_WCHAR_H #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) @@ -3642,26 +3673,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, case 4: /* xmlcharrefreplace */ respos = str-PyString_AS_STRING(res); /* determine replacement size (temporarily (mis)uses p) */ - for (p = collstart, repsize = 0; p < collend; ++p) { - if (*p<10) + for (p = collstart, repsize = 0; p < collend;) { + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + if (ch < 10) repsize += 2+1+1; - else if (*p<100) + else if (ch < 100) repsize += 2+2+1; - else if (*p<1000) + else if (ch < 1000) repsize += 2+3+1; - else if (*p<10000) + else if (ch < 10000) repsize += 2+4+1; -#ifndef Py_UNICODE_WIDE - else + else if (ch < 100000) repsize += 2+5+1; -#else - else if (*p<100000) - repsize += 2+5+1; - else if (*p<1000000) + else if (ch < 1000000) repsize += 2+6+1; else repsize += 2+7+1; -#endif } requiredsize = respos+repsize+(endp-collend); if (requiredsize > ressize) { @@ -3673,8 +3700,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, ressize = requiredsize; } /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) { - str += sprintf(str, "&#%d;", (int)*p); + for (p = collstart; p < collend;) { + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + str += sprintf(str, "&#%d;", (int)ch); } p = collend; break; @@ -4649,11 +4677,20 @@ int charmap_encoding_error( *inpos = collendpos; break; case 4: /* xmlcharrefreplace */ - /* generate replacement (temporarily (mis)uses p) */ - for (collpos = collstartpos; collpos < collendpos; ++collpos) { + /* generate replacement */ + for (collpos = collstartpos; collpos < collendpos;) { char buffer[2+29+1+1]; char *cp; - sprintf(buffer, "&#%d;", (int)p[collpos]); + Py_UCS4 ch = p[collpos++]; +#ifndef Py_UNICODE_WIDE + if ((0xD800 <= ch && ch <= 0xDBFF) && + (collpos < collendpos) && + 
(0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) { + ch = ((((ch & 0x03FF) << 10) | + ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000); + } +#endif + sprintf(buffer, "&#%d;", (int)ch); for (cp = buffer; *cp; ++cp) { x = charmapencode_output(*cp, mapping, res, respos); if (x==enc_EXCEPTION) @@ -5068,10 +5105,11 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, break; case 4: /* xmlcharrefreplace */ /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) { + for (p = collstart; p < collend;) { char buffer[2+29+1+1]; char *cp; - sprintf(buffer, "&#%d;", (int)*p); + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + sprintf(buffer, "&#%d;", (int)ch); if (charmaptranslate_makespace(&res, &str, (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) goto onError; @@ -5222,8 +5260,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, break; case 4: /* xmlcharrefreplace */ /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) - output += sprintf(output, "&#%d;", (int)*p); + for (p = collstart; p < collend;) { + Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend); + output += sprintf(output, "&#%d;", ch); + } p = collend; break; default: |