summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorMarc-André Lemburg <mal@egenix.com>2002-02-07 11:33:49 (GMT)
committerMarc-André Lemburg <mal@egenix.com>2002-02-07 11:33:49 (GMT)
commitbd3be8f0ca4fd70d53d9330489ba565f83530b3b (patch)
tree33762ec816df0a313706c6a919fbe3629751b0b5 /Objects
parent9273ec726c1ae9f151d12bc21a1ad96e7c225b77 (diff)
downloadcpython-bd3be8f0ca4fd70d53d9330489ba565f83530b3b.zip
cpython-bd3be8f0ca4fd70d53d9330489ba565f83530b3b.tar.gz
cpython-bd3be8f0ca4fd70d53d9330489ba565f83530b3b.tar.bz2
Fix to the UTF-8 encoder: it failed on 0-length input strings.
Fix for the UTF-8 decoder: it will now accept isolated surrogates (previously it raised an exception, which caused round-trips to fail). Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for marshalling Unicode objects, so we had better make sure it works for all Unicode code points, including isolated surrogates). Bumped the PYC magic in a non-standard way -- please review. This was needed because the old PYC format used illegal UTF-8 sequences for isolated high surrogates, which now raise an exception.
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c23
1 files changed, 17 insertions, 6 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index fb9b81f..1d0508c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1065,12 +1065,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
goto utf8Error;
}
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
- if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
+ if (ch < 0x0800) {
+ /* Note: UTF-8 encodings of surrogates are considered
+ legal UTF-8 sequences;
+
+ XXX For wide builds (UCS-4) we should probably try
+ to recombine the surrogates into a single code
+ unit.
+ */
errmsg = "illegal encoding";
goto utf8Error;
}
else
- *p++ = (Py_UNICODE)ch;
+ *p++ = (Py_UNICODE)ch;
break;
case 4:
@@ -1084,9 +1091,9 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
/* validate and convert to UTF-16 */
if ((ch < 0x10000) /* minimum value allowed for 4
- byte encoding */
+ byte encoding */
|| (ch > 0x10ffff)) /* maximum value allowed for
- UTF-16 */
+ UTF-16 */
{
errmsg = "illegal encoding";
goto utf8Error;
@@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
unsigned int cbWritten = 0;
int i = 0;
+ /* Short-cut for empty strings */
+ if (size == 0)
+ return PyString_FromStringAndSize(NULL, 0);
+
+ /* We allocate 4 more bytes to have room for at least one full
+ UTF-8 sequence; saves a few cycles in the loop below */
v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
if (v == NULL)
return NULL;
- if (size == 0)
- return v;
p = PyString_AS_STRING(v);
while (i < size) {