summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
authorFredrik Lundh <fredrik@pythonware.com>2000-09-03 11:29:49 (GMT)
committerFredrik Lundh <fredrik@pythonware.com>2000-09-03 11:29:49 (GMT)
commitdf84675f938238b8193bd2c945967011cce77a21 (patch)
treeddcf2913aa7d6162e01a23b00b12d98a9885599e /Objects/unicodeobject.c
parent03dd010b4fa7524e90cf1d5469f41618428589c9 (diff)
downloadcpython-df84675f938238b8193bd2c945967011cce77a21.zip
cpython-df84675f938238b8193bd2c945967011cce77a21.tar.gz
cpython-df84675f938238b8193bd2c945967011cce77a21.tar.bz2
changed \x to consume exactly two hex digits, also for unicode
strings. closes PEP-223. also added \U escape (eight hex digits).
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c121
1 files changed, 66 insertions, 55 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 11146ff..8ed6380 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
PyUnicodeObject *v;
Py_UNICODE *p = NULL, *buf = NULL;
const char *end;
+ Py_UCS4 chr;
/* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the
@@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x;
break;
- /* \xXXXX escape with 1-n hex digits. for compatibility
- with 8-bit strings, this code ignores all but the last
- two digits */
+ /* \xXX with two hex digits */
case 'x':
- x = 0;
- c = (unsigned char)*s;
- if (isxdigit(c)) {
- do {
- x = (x<<4) & 0xF0;
- if ('0' <= c && c <= '9')
- x += c - '0';
- else if ('a' <= c && c <= 'f')
- x += 10 + c - 'a';
- else
- x += 10 + c - 'A';
- c = (unsigned char)*++s;
- } while (isxdigit(c));
- *p++ = (unsigned char) x;
- } else {
- *p++ = '\\';
- *p++ = (unsigned char)s[-1];
+ for (x = 0, i = 0; i < 2; i++) {
+ c = (unsigned char)s[i];
+ if (!isxdigit(c)) {
+ if (unicodeescape_decoding_error(&s, &x, errors,
+ "truncated \\xXX"))
+ goto onError;
+ i++;
+ break;
+ }
+ x = (x<<4) & ~0xF;
+ if (c >= '0' && c <= '9')
+ x += c - '0';
+ else if (c >= 'a' && c <= 'f')
+ x += 10 + c - 'a';
+ else
+ x += 10 + c - 'A';
}
+ s += i;
+ *p++ = x;
break;
/* \uXXXX with 4 hex digits */
@@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x;
break;
+ /* \UXXXXXXXX with 8 hex digits */
+ case 'U':
+ for (chr = 0, i = 0; i < 8; i++) {
+ c = (unsigned char)s[i];
+ if (!isxdigit(c)) {
+ if (unicodeescape_decoding_error(&s, &x, errors,
+ "truncated \\uXXXX"))
+ goto onError;
+ i++;
+ break;
+ }
+ chr = (chr<<4) & ~0xF;
+ if (c >= '0' && c <= '9')
+ chr += c - '0';
+ else if (c >= 'a' && c <= 'f')
+ chr += 10 + c - 'a';
+ else
+ chr += 10 + c - 'A';
+ }
+ s += i;
+ goto store;
+
case 'N':
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
- if (pucnHash == NULL)
- {
+ if (pucnHash == NULL) {
PyObject *mod = 0, *v = 0;
-
mod = PyImport_ImportModule("ucnhash");
if (mod == NULL)
goto onError;
v = PyObject_GetAttrString(mod,"ucnhashAPI");
Py_DECREF(mod);
if (v == NULL)
- {
goto onError;
- }
pucnHash = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
if (pucnHash == NULL)
- {
goto onError;
- }
}
- if (*s == '{')
- {
+ if (*s == '{') {
const char *start = s + 1;
const char *endBrace = start;
- Py_UCS4 value;
unsigned long j;
/* look for either the closing brace, or we
@@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
{
endBrace++;
}
- if (endBrace != end && *endBrace == '}')
- {
+ if (endBrace != end && *endBrace == '}') {
j = pucnHash->hash(start, endBrace - start);
if (j > pucnHash->cKeys ||
mystrnicmp(
@@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
}
goto ucnFallthrough;
}
- value = ((_Py_UnicodeCharacterName *)
- (pucnHash->getValue(j)))->value;
- if (value < 1<<16)
- {
- /* In UCS-2 range, easy solution.. */
- *p++ = value;
- }
- else
- {
- /* Oops, its in UCS-4 space, */
- /* compute and append the two surrogates: */
- /* translate from 10000..10FFFF to 0..FFFFF */
- value -= 0x10000;
-
- /* high surrogate = top 10 bits added to D800 */
- *p++ = 0xD800 + (value >> 10);
-
- /* low surrogate = bottom 10 bits added to DC00 */
- *p++ = 0xDC00 + (value & ~0xFC00);
- }
+ chr = ((_Py_UnicodeCharacterName *)
+ (pucnHash->getValue(j)))->value;
s = endBrace + 1;
- }
- else
- {
+ goto store;
+ } else {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Unicode name missing closing brace"))
@@ -1363,6 +1357,23 @@ ucnFallthrough:
*p++ = '\\';
*p++ = (unsigned char)s[-1];
break;
+store:
+ /* when we get here, chr is a 32-bit unicode character */
+ if (chr <= 0xffff)
+ /* UCS-2 character */
+ *p++ = (Py_UNICODE) chr;
+ else if (chr <= 0x10ffff) {
+ /* UCS-4 character. store as two surrogate characters */
+ chr -= 0x10000L;
+ *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
+ *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
+ } else {
+ if (unicodeescape_decoding_error(
+ &s, &x, errors,
+ "Illegal Unicode character")
+ )
+ goto onError;
+ }
}
}
if (_PyUnicode_Resize(v, (int)(p - buf)))