diff options
author | Benjamin Peterson <benjamin@python.org> | 2009-10-28 21:59:39 (GMT) |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2009-10-28 21:59:39 (GMT) |
commit | b2e796aa27af76bdf0386a34a23415e3efbcc75c (patch) | |
tree | b0645c75593e8c01ba1aaf730fdc67652c2e3227 /Python/ast.c | |
parent | 7b1b094ff17035d35c3736bcca12229fa360d33a (diff) | |
download | cpython-b2e796aa27af76bdf0386a34a23415e3efbcc75c.zip cpython-b2e796aa27af76bdf0386a34a23415e3efbcc75c.tar.gz cpython-b2e796aa27af76bdf0386a34a23415e3efbcc75c.tar.bz2 |
In wide builds, avoid storing high (non-BMP) Unicode characters from source code as surrogate pairs.
This is accomplished by decoding with UTF-32 instead of UTF-16 on all builds.
The patch is by Adam Olsen.
Diffstat (limited to 'Python/ast.c')
-rw-r--r-- | Python/ast.c | 23 |
1 file changed, 14 insertions, 9 deletions
diff --git a/Python/ast.c b/Python/ast.c index c3edea3..c6a6417 100644 --- a/Python/ast.c +++ b/Python/ast.c @@ -3246,10 +3246,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons u = NULL; } else { /* check for integer overflow */ - if (len > PY_SIZE_MAX / 4) + if (len > PY_SIZE_MAX / 6) return NULL; - /* "\XX" may become "\u005c\uHHLL" (12 bytes) */ - u = PyBytes_FromStringAndSize((char *)NULL, len * 4); + /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 + "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ + u = PyBytes_FromStringAndSize((char *)NULL, len * 6); if (u == NULL) return NULL; p = buf = PyBytes_AsString(u); @@ -3266,20 +3267,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons PyObject *w; char *r; Py_ssize_t rn, i; - w = decode_utf8(c, &s, end, "utf-16-be"); + w = decode_utf8(c, &s, end, "utf-32-be"); if (w == NULL) { Py_DECREF(u); return NULL; } r = PyBytes_AS_STRING(w); rn = Py_SIZE(w); - assert(rn % 2 == 0); - for (i = 0; i < rn; i += 2) { - sprintf(p, "\\u%02x%02x", + assert(rn % 4 == 0); + for (i = 0; i < rn; i += 4) { + sprintf(p, "\\U%02x%02x%02x%02x", r[i + 0] & 0xFF, - r[i + 1] & 0xFF); - p += 6; + r[i + 1] & 0xFF, + r[i + 2] & 0xFF, + r[i + 3] & 0xFF); + p += 10; } + /* Should be impossible to overflow */ + assert(p - buf <= Py_SIZE(u)); Py_DECREF(w); } else { *p++ = *s++; |