diff options
author | Marc-André Lemburg <mal@egenix.com> | 2002-02-07 11:33:49 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2002-02-07 11:33:49 (GMT) |
commit | bd3be8f0ca4fd70d53d9330489ba565f83530b3b (patch) | |
tree | 33762ec816df0a313706c6a919fbe3629751b0b5 /Objects/unicodeobject.c | |
parent | 9273ec726c1ae9f151d12bc21a1ad96e7c225b77 (diff) | |
download | cpython-bd3be8f0ca4fd70d53d9330489ba565f83530b3b.zip cpython-bd3be8f0ca4fd70d53d9330489ba565f83530b3b.tar.gz cpython-bd3be8f0ca4fd70d53d9330489ba565f83530b3b.tar.bz2 |
Fix to the UTF-8 encoder: it failed on 0-length input strings.
Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception, which caused round-trips to
fail).
Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we had better make sure it works for
all Unicode code points, including isolated surrogates).
Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates, which now raise an exception.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 23 |
1 files changed, 17 insertions, 6 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fb9b81f..1d0508c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1065,12 +1065,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); - if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { + if (ch < 0x0800) { + /* Note: UTF-8 encodings of surrogates are considered + legal UTF-8 sequences; + + XXX For wide builds (UCS-4) we should probably try + to recombine the surrogates into a single code + unit. + */ errmsg = "illegal encoding"; goto utf8Error; } else - *p++ = (Py_UNICODE)ch; + *p++ = (Py_UNICODE)ch; break; case 4: @@ -1084,9 +1091,9 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); /* validate and convert to UTF-16 */ if ((ch < 0x10000) /* minimum value allowed for 4 - byte encoding */ + byte encoding */ || (ch > 0x10ffff)) /* maximum value allowed for - UTF-16 */ + UTF-16 */ { errmsg = "illegal encoding"; goto utf8Error; @@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, unsigned int cbWritten = 0; int i = 0; + /* Short-cut for emtpy strings */ + if (size == 0) + return PyString_FromStringAndSize(NULL, 0); + + /* We allocate 4 more bytes to have room for at least one full + UTF-8 sequence; saves a few cycles in the loop below */ v = PyString_FromStringAndSize(NULL, cbAllocated + 4); if (v == NULL) return NULL; - if (size == 0) - return v; p = PyString_AS_STRING(v); while (i < size) { |