From 9a3a9f779142d92655f86eaf9584ce946c61dfea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Sun, 18 May 2003 12:31:09 +0000 Subject: Consider \U-escapes in raw-unicode-escape. Fixes #444514. --- Lib/test/test_unicode.py | 7 +++++++ Objects/unicodeobject.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 9250ef2..8e1f0b1 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -697,6 +697,13 @@ class UnicodeTest( print >>out, u'def\n' print >>out, u'def\n' + def test_ucs4(self): + if sys.maxunicode == 0xFFFF: + return + x = u'\U00100000' + y = x.encode("raw-unicode-escape").decode("raw-unicode-escape") + self.assertEqual(x, y) + def test_main(): test_support.run_unittest(UnicodeTest) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 096dfcb..94c67c8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2030,6 +2030,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, unsigned char c; Py_UCS4 x; int i; + int count; /* Non-escape characters are interpreted as Unicode ordinals */ if (*s != '\\') { @@ -2048,15 +2049,16 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, } if (((s - bs) & 1) == 0 || s >= end || - *s != 'u') { + (*s != 'u' && *s != 'U')) { continue; } p--; + count = *s=='u' ? 4 : 8; s++; - /* \uXXXX with 4 hex digits */ + /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ outpos = p-PyUnicode_AS_UNICODE(v); - for (x = 0, i = 0; i < 4; ++i, ++s) { + for (x = 0, i = 0; i < count; ++i, ++s) { c = (unsigned char)*s; if (!isxdigit(c)) { endinpos = s-starts; @@ -2076,6 +2078,16 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, else x += 10 + c - 'A'; } +#ifndef Py_UNICODE_WIDE + if (x > 0x10000) { + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "rawunicodeescape", "\\Uxxxxxxxx out of range", + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) + goto onError; + } +#endif *p++ = x; nextByte: ; @@ -2102,7 +2114,11 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, static const char *hexdigit = "0123456789abcdef"; +#ifdef Py_UNICODE_WIDE + repr = PyString_FromStringAndSize(NULL, 10 * size); +#else repr = PyString_FromStringAndSize(NULL, 6 * size); +#endif if (repr == NULL) return NULL; if (size == 0) @@ -2111,6 +2127,22 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, p = q = PyString_AS_STRING(repr); while (size-- > 0) { Py_UNICODE ch = *s++; +#ifdef Py_UNICODE_WIDE + /* Map 32-bit characters to '\Uxxxxxxxx' */ + if (ch >= 0x10000) { + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigit[(ch >> 28) & 0xf]; + *p++ = hexdigit[(ch >> 24) & 0xf]; + *p++ = hexdigit[(ch >> 20) & 0xf]; + *p++ = hexdigit[(ch >> 16) & 0xf]; + *p++ = hexdigit[(ch >> 12) & 0xf]; + *p++ = hexdigit[(ch >> 8) & 0xf]; + *p++ = hexdigit[(ch >> 4) & 0xf]; + *p++ = hexdigit[ch & 15]; + } + else +#endif /* Map 16-bit characters to '\uxxxx' */ if (ch >= 256) { *p++ = '\\'; @@ -6769,3 +6801,10 @@ _PyUnicode_Fini(void) unicode_freelist = NULL; unicode_freelist_size = 0; } + +/* +Local variables: +c-basic-offset: 4 +indent-tabs-mode: nil +End: +*/ -- cgit v0.12