From df84675f938238b8193bd2c945967011cce77a21 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Sun, 3 Sep 2000 11:29:49 +0000 Subject: changed \x to consume exactly two hex digits, also for unicode strings. closes PEP-223. also added \U escape (eight hex digits). --- Objects/unicodeobject.c | 121 ++++++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 11146ff..8ed6380 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, PyUnicodeObject *v; Py_UNICODE *p = NULL, *buf = NULL; const char *end; + Py_UCS4 chr; /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the @@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, *p++ = x; break; - /* \xXXXX escape with 1-n hex digits. for compatibility - with 8-bit strings, this code ignores all but the last - two digits */ + /* \xXX with two hex digits */ case 'x': - x = 0; - c = (unsigned char)*s; - if (isxdigit(c)) { - do { - x = (x<<4) & 0xF0; - if ('0' <= c && c <= '9') - x += c - '0'; - else if ('a' <= c && c <= 'f') - x += 10 + c - 'a'; - else - x += 10 + c - 'A'; - c = (unsigned char)*++s; - } while (isxdigit(c)); - *p++ = (unsigned char) x; - } else { - *p++ = '\\'; - *p++ = (unsigned char)s[-1]; + for (x = 0, i = 0; i < 2; i++) { + c = (unsigned char)s[i]; + if (!isxdigit(c)) { + if (unicodeescape_decoding_error(&s, &x, errors, + "truncated \\xXX")) + goto onError; + i++; + break; + } + x = (x<<4) & ~0xF; + if (c >= '0' && c <= '9') + x += c - '0'; + else if (c >= 'a' && c <= 'f') + x += 10 + c - 'a'; + else + x += 10 + c - 'A'; } + s += i; + *p++ = x; break; /* \uXXXX with 4 hex digits */ @@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, *p++ = x; break; + /* \UXXXXXXXX with 8 hex digits */ + case 'U': + for (chr = 0, i = 0; i < 8; i++) { + c = (unsigned char)s[i]; + if (!isxdigit(c)) { + if (unicodeescape_decoding_error(&s, &x, errors, + "truncated \\uXXXX")) + goto onError; + i++; + break; + } + chr = (chr<<4) & ~0xF; + if (c >= '0' && c <= '9') + chr += c - '0'; + else if (c >= 'a' && c <= 'f') + chr += 10 + c - 'a'; + else + chr += 10 + c - 'A'; + } + s += i; + goto store; + case 'N': /* Ok, we need to deal with Unicode Character Names now, * make sure we've imported the hash table data... */ - if (pucnHash == NULL) - { + if (pucnHash == NULL) { PyObject *mod = 0, *v = 0; - mod = PyImport_ImportModule("ucnhash"); if (mod == NULL) goto onError; v = PyObject_GetAttrString(mod,"ucnhashAPI"); Py_DECREF(mod); if (v == NULL) - { goto onError; - } pucnHash = PyCObject_AsVoidPtr(v); Py_DECREF(v); if (pucnHash == NULL) - { goto onError; - } } - if (*s == '{') - { + if (*s == '{') { const char *start = s + 1; const char *endBrace = start; - Py_UCS4 value; unsigned long j; /* look for either the closing brace, or we @@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, { endBrace++; } - if (endBrace != end && *endBrace == '}') - { + if (endBrace != end && *endBrace == '}') { j = pucnHash->hash(start, endBrace - start); if (j > pucnHash->cKeys || mystrnicmp( @@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, } goto ucnFallthrough; } - value = ((_Py_UnicodeCharacterName *) - (pucnHash->getValue(j)))->value; - if (value < 1<<16) - { - /* In UCS-2 range, easy solution.. */ - *p++ = value; - } - else - { - /* Oops, its in UCS-4 space, */ - /* compute and append the two surrogates: */ - /* translate from 10000..10FFFF to 0..FFFFF */ - value -= 0x10000; - - /* high surrogate = top 10 bits added to D800 */ - *p++ = 0xD800 + (value >> 10); - - /* low surrogate = bottom 10 bits added to DC00 */ - *p++ = 0xDC00 + (value & ~0xFC00); - } + chr = ((_Py_UnicodeCharacterName *) + (pucnHash->getValue(j)))->value; s = endBrace + 1; - } - else - { + goto store; + } else { if (unicodeescape_decoding_error( &s, &x, errors, "Unicode name missing closing brace")) @@ -1363,6 +1357,23 @@ ucnFallthrough: *p++ = '\\'; *p++ = (unsigned char)s[-1]; break; +store: + /* when we get here, chr is a 32-bit unicode character */ + if (chr <= 0xffff) + /* UCS-2 character */ + *p++ = (Py_UNICODE) chr; + else if (chr <= 0x10ffff) { + /* UCS-4 character. store as two surrogate characters */ + chr -= 0x10000L; + *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); + *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00); + } else { + if (unicodeescape_decoding_error( + &s, &x, errors, + "Illegal Unicode character") + ) + goto onError; + } } } if (_PyUnicode_Resize(v, (int)(p - buf))) -- cgit v0.12