From 76df43de30f40b5cc1de9d36a5a083dd8bd8cb27 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 30 Oct 2012 01:42:39 +0100 Subject: Issue #16330: Use surrogate-related macros Patch written by Serhiy Storchaka. --- Include/unicodeobject.h | 6 +++--- Modules/_json.c | 21 ++++++++++----------- Modules/cjkcodecs/cjkcodecs.h | 11 +++++------ Objects/unicodeobject.c | 7 +++---- Python/codecs.c | 4 ++-- Python/fileutils.c | 4 ++-- 6 files changed, 25 insertions(+), 28 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index fa21c1c..363776b 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -180,9 +180,9 @@ typedef unsigned char Py_UCS1; } while (0) /* macros to work with surrogates */ -#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDFFF) -#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) -#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) +#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF) +#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF) +#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF) /* Join two surrogate characters and return a single Py_UCS4 value. 
*/ #define Py_UNICODE_JOIN_SURROGATES(high, low) \ (((((Py_UCS4)(high) & 0x03FF) << 10) | \ diff --git a/Modules/_json.c b/Modules/_json.c index fb8bd59..2538b05 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -174,14 +174,13 @@ ascii_escape_unichar(Py_UCS4 c, unsigned char *output, Py_ssize_t chars) default: if (c >= 0x10000) { /* UTF-16 surrogate pair */ - Py_UCS4 v = c - 0x10000; - c = 0xd800 | ((v >> 10) & 0x3ff); + Py_UCS4 v = Py_UNICODE_HIGH_SURROGATE(c); output[chars++] = 'u'; - output[chars++] = Py_hexdigits[(c >> 12) & 0xf]; - output[chars++] = Py_hexdigits[(c >> 8) & 0xf]; - output[chars++] = Py_hexdigits[(c >> 4) & 0xf]; - output[chars++] = Py_hexdigits[(c ) & 0xf]; - c = 0xdc00 | (v & 0x3ff); + output[chars++] = Py_hexdigits[(v >> 12) & 0xf]; + output[chars++] = Py_hexdigits[(v >> 8) & 0xf]; + output[chars++] = Py_hexdigits[(v >> 4) & 0xf]; + output[chars++] = Py_hexdigits[(v ) & 0xf]; + c = Py_UNICODE_LOW_SURROGATE(c); output[chars++] = '\\'; } output[chars++] = 'u'; @@ -431,7 +430,7 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next } } /* Surrogate pair */ - if ((c & 0xfc00) == 0xd800) { + if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { Py_UCS4 c2 = 0; if (end + 6 >= len) { raise_errmsg("Unpaired high surrogate", pystr, end - 5); @@ -462,13 +461,13 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next goto bail; } } - if ((c2 & 0xfc00) != 0xdc00) { + if (!Py_UNICODE_IS_LOW_SURROGATE(c2)) { raise_errmsg("Unpaired high surrogate", pystr, end - 5); goto bail; } - c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); + c = Py_UNICODE_JOIN_SURROGATES(c, c2); } - else if ((c & 0xfc00) == 0xdc00) { + else if (Py_UNICODE_IS_LOW_SURROGATE(c)) { raise_errmsg("Unpaired low surrogate", pystr, end - 5); goto bail; } diff --git a/Modules/cjkcodecs/cjkcodecs.h b/Modules/cjkcodecs/cjkcodecs.h index ab0682a..fb41bdd 100644 --- a/Modules/cjkcodecs/cjkcodecs.h +++ b/Modules/cjkcodecs/cjkcodecs.h @@ -148,8 
+148,8 @@ static const struct dbcs_map *mapping_list; #if Py_UNICODE_SIZE == 2 # define WRITEUCS4(c) \ REQUIRE_OUTBUF(2) \ - (*outbuf)[0] = 0xd800 + (((c) - 0x10000) >> 10); \ - (*outbuf)[1] = 0xdc00 + (((c) - 0x10000) & 0x3ff); \ + (*outbuf)[0] = Py_UNICODE_HIGH_SURROGATE(c); \ + (*outbuf)[1] = Py_UNICODE_LOW_SURROGATE(c); \ NEXT_OUT(2) #else # define WRITEUCS4(c) \ @@ -188,11 +188,10 @@ static const struct dbcs_map *mapping_list; #if Py_UNICODE_SIZE == 2 #define DECODE_SURROGATE(c) \ - if (c >> 10 == 0xd800 >> 10) { /* high surrogate */ \ + if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { \ REQUIRE_INBUF(2) \ - if (IN2 >> 10 == 0xdc00 >> 10) { /* low surrogate */ \ - c = 0x10000 + ((ucs4_t)(c - 0xd800) << 10) + \ - ((ucs4_t)(IN2) - 0xdc00); \ + if (Py_UNICODE_IS_LOW_SURROGATE(IN2)) { \ + c = Py_UNICODE_JOIN_SURROGATES(c, IN2); \ } \ } #define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0a3712e..3e2e8e3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4412,7 +4412,7 @@ encode_char: /* code first surrogate */ base64bits += 16; - base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10); + base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch); while (base64bits >= 6) { *out++ = TO_BASE64(base64buffer >> (base64bits-6)); base64bits -= 6; @@ -7052,9 +7052,8 @@ encode_code_page_errors(UINT code_page, PyObject **outbytes, charsize = 1; } else { - ch -= 0x10000; - chars[0] = 0xd800 + (ch >> 10); - chars[1] = 0xdc00 + (ch & 0x3ff); + chars[0] = Py_UNICODE_HIGH_SURROGATE(ch); + chars[1] = Py_UNICODE_LOW_SURROGATE(ch); charsize = 2; } diff --git a/Python/codecs.c b/Python/codecs.c index 5470500..5cfb1c9 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -761,7 +761,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) for (i = start; i < end; i++) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); - if (ch < 0xd800 || ch > 0xdfff) { + if
(!Py_UNICODE_IS_SURROGATE(ch)) { /* Not a surrogate, fail with original exception */ PyErr_SetObject(PyExceptionInstance_Class(exc), exc); Py_DECREF(res); @@ -797,7 +797,7 @@ PyCodec_SurrogatePassErrors(PyObject *exc) (p[2] & 0xc0) == 0x80)) { /* it's a three-byte code */ ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); - if (ch < 0xd800 || ch > 0xdfff) + if (!Py_UNICODE_IS_SURROGATE(ch)) /* it's not a surrogate - fail */ ch = 0; } diff --git a/Python/fileutils.c b/Python/fileutils.c index 501cb8c..526751d 100644 --- a/Python/fileutils.c +++ b/Python/fileutils.c @@ -85,7 +85,7 @@ _Py_char2wchar(const char* arg, size_t *size) /* Only use the result if it contains no surrogate characters. */ for (tmp = res; *tmp != 0 && - (*tmp < 0xd800 || *tmp > 0xdfff); tmp++) + !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) ; if (*tmp == 0) { if (size != NULL) @@ -131,7 +131,7 @@ _Py_char2wchar(const char* arg, size_t *size) memset(&mbs, 0, sizeof mbs); continue; } - if (*out >= 0xd800 && *out <= 0xdfff) { + if (Py_UNICODE_IS_SURROGATE(*out)) { /* Surrogate character. Escape the original byte sequence with surrogateescape. */ argsize -= converted; -- cgit v0.12