summary | refs | log | tree | commit | diff | stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author: Fredrik Lundh <fredrik@pythonware.com> 2001-02-18 22:13:49 (GMT)
committer: Fredrik Lundh <fredrik@pythonware.com> 2001-02-18 22:13:49 (GMT)
commit: ccc7473fc8593a19b9d8d26118c22eb759350a53 (patch)
tree: b935d0caf459df962d1856c565e967aa373bc8aa /Objects/unicodeobject.c
parent: b95896b2d2592132da97ef93f5583a45d49226a4 (diff)
download: cpython-ccc7473fc8593a19b9d8d26118c22eb759350a53.zip
cpython-ccc7473fc8593a19b9d8d26118c22eb759350a53.tar.gz
cpython-ccc7473fc8593a19b9d8d26118c22eb759350a53.tar.bz2
reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it
less likely that bug #132817 ever appears again)
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 179
1 files changed, 69 insertions, 110 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7b12594..c237789 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1110,10 +1110,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
const char *errors)
{
PyUnicodeObject *v;
- Py_UNICODE *p = NULL, *buf = NULL;
+ Py_UNICODE *p, *buf;
const char *end;
- Py_UCS4 chr;
-
+ char* message;
+ Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
+
/* Escaped strings will always be longer than the resulting
Unicode string, so we start with size here and then reduce the
length after conversion to the true value. */
@@ -1122,16 +1123,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
goto onError;
if (size == 0)
return (PyObject *)v;
+
p = buf = PyUnicode_AS_UNICODE(v);
end = s + size;
+
while (s < end) {
unsigned char c;
Py_UNICODE x;
- int i;
+ int i, digits;
/* Non-escape characters are interpreted as Unicode ordinals */
if (*s != '\\') {
- *p++ = (unsigned char)*s++;
+ *p++ = (unsigned char) *s++;
continue;
}
@@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*p++ = x;
break;
- /* \xXX with two hex digits */
+ /* hex escapes */
+ /* \xXX */
case 'x':
- for (x = 0, i = 0; i < 2; i++) {
- c = (unsigned char)s[i];
- if (!isxdigit(c)) {
- if (unicodeescape_decoding_error(&s, &x, errors,
- "truncated \\xXX"))
- goto onError;
- i++;
- break;
- }
- x = (x<<4) & ~0xF;
- if (c >= '0' && c <= '9')
- x += c - '0';
- else if (c >= 'a' && c <= 'f')
- x += 10 + c - 'a';
- else
- x += 10 + c - 'A';
- }
- s += i;
- *p++ = x;
- break;
+ digits = 2;
+ message = "truncated \\xXX escape";
+ goto hexescape;
- /* \uXXXX with 4 hex digits */
+ /* \uXXXX */
case 'u':
- for (x = 0, i = 0; i < 4; i++) {
- c = (unsigned char)s[i];
- if (!isxdigit(c)) {
- if (unicodeescape_decoding_error(&s, &x, errors,
- "truncated \\uXXXX"))
- goto onError;
- i++;
- break;
- }
- x = (x<<4) & ~0xF;
- if (c >= '0' && c <= '9')
- x += c - '0';
- else if (c >= 'a' && c <= 'f')
- x += 10 + c - 'a';
- else
- x += 10 + c - 'A';
- }
- s += i;
- *p++ = x;
- break;
+ digits = 4;
+ message = "truncated \\uXXXX escape";
+ goto hexescape;
- /* \UXXXXXXXX with 8 hex digits */
+ /* \UXXXXXXXX */
case 'U':
- for (chr = 0, i = 0; i < 8; i++) {
- c = (unsigned char)s[i];
+ digits = 8;
+ message = "truncated \\UXXXXXXXX escape";
+ hexescape:
+ chr = 0;
+ for (i = 0; i < digits; i++) {
+ c = (unsigned char) s[i];
if (!isxdigit(c)) {
- if (unicodeescape_decoding_error(&s, &x, errors,
- "truncated \\uXXXX"))
+ if (unicodeescape_decoding_error(&s, &x, errors, message))
goto onError;
+ chr = x;
i++;
break;
}
@@ -1230,19 +1204,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
chr += 10 + c - 'A';
}
s += i;
- goto store;
+ store:
+ /* when we get here, chr is a 32-bit unicode character */
+ if (chr <= 0xffff)
+ /* UCS-2 character */
+ *p++ = (Py_UNICODE) chr;
+ else if (chr <= 0x10ffff) {
+ /* UCS-4 character. store as two surrogate characters */
+ chr -= 0x10000L;
+ *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
+ *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
+ } else {
+ if (unicodeescape_decoding_error(
+ &s, &x, errors,
+ "illegal Unicode character")
+ )
+ goto onError;
+ *p++ = x; /* store replacement character */
+ }
+ break;
+ /* \N{name} */
case 'N':
- /* Ok, we need to deal with Unicode Character Names now,
- * make sure we've imported the hash table data...
- */
+ message = "malformed \\N character escape";
if (ucnhash_CAPI == NULL) {
- PyObject *mod = 0, *v = 0;
- mod = PyImport_ImportModule("unicodedata");
- if (mod == NULL)
+ /* load the unicode data module */
+ PyObject *m, *v;
+ m = PyImport_ImportModule("unicodedata");
+ if (m == NULL)
goto ucnhashError;
- v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
- Py_DECREF(mod);
+ v = PyObject_GetAttrString(m, "ucnhash_CAPI");
+ Py_DECREF(m);
if (v == NULL)
goto ucnhashError;
ucnhash_CAPI = PyCObject_AsVoidPtr(v);
@@ -1250,75 +1242,42 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
if (ucnhash_CAPI == NULL)
goto ucnhashError;
}
-
if (*s == '{') {
- const char *start = s + 1;
- const char *endBrace = start;
-
+ const char *start = s+1;
/* look for the closing brace */
- while (*endBrace != '}' && endBrace < end)
- endBrace++;
- if (endBrace != end && *endBrace == '}') {
- if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
- if (unicodeescape_decoding_error(
- &s, &x, errors,
- "Invalid Unicode Character Name")
- )
- goto onError;
- goto ucnFallthrough;
- }
- s = endBrace + 1;
- goto store;
- } else {
- if (unicodeescape_decoding_error(
- &s, &x, errors,
- "Unicode name missing closing brace"))
- goto onError;
- goto ucnFallthrough;
+ while (*s != '}' && s < end)
+ s++;
+ if (s > start && s < end && *s == '}') {
+ /* found a name. look it up in the unicode database */
+ message = "unknown Unicode character name";
+ s++;
+ if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
+ goto store;
}
- break;
}
- if (unicodeescape_decoding_error(
- &s, &x, errors,
- "Missing opening brace for Unicode Character Name escape"))
+ if (unicodeescape_decoding_error(&s, &x, errors, message))
goto onError;
-ucnFallthrough:
- /* fall through on purpose */
- default:
+ *p++ = x;
+ break;
+
+ default:
*p++ = '\\';
*p++ = (unsigned char)s[-1];
break;
-store:
- /* when we get here, chr is a 32-bit unicode character */
- if (chr <= 0xffff)
- /* UCS-2 character */
- *p++ = (Py_UNICODE) chr;
- else if (chr <= 0x10ffff) {
- /* UCS-4 character. store as two surrogate characters */
- chr -= 0x10000L;
- *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
- *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
- } else {
- if (unicodeescape_decoding_error(
- &s, &x, errors,
- "Illegal Unicode character")
- )
- goto onError;
- }
}
}
if (_PyUnicode_Resize(v, (int)(p - buf)))
goto onError;
return (PyObject *)v;
- ucnhashError:
+ucnhashError:
PyErr_SetString(
PyExc_UnicodeError,
"\\N escapes not supported (can't load unicodedata module)"
);
return NULL;
- onError:
+onError:
Py_XDECREF(v);
return NULL;
}