diff options
author | Marc-André Lemburg <mal@egenix.com> | 2001-05-21 20:30:15 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2001-05-21 20:30:15 (GMT) |
commit | 489b56e04480b8ca3f2d1676265e67c65bae788d (patch) | |
tree | a148a1f74890d004f6434a77eb14185b76c73c77 /Objects | |
parent | f52d27e52d289b99837b4555fb3f757f2c89f4ad (diff) | |
download | cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.zip cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.gz cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.bz2 |
This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* BOM. Subsequent
BOM characters are no longer interpreted or removed.
The UTF-16-LE and UTF-16-BE codecs pass all BOM characters through unchanged.
These changes should bring the UTF-16 codec more in line with what
the Unicode FAQ recommends with respect to BOMs.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 42 |
1 files changed, 25 insertions, 17 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 475215c..d55e2a7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, if (byteorder) bo = *byteorder; - while (q < e) { - register Py_UNICODE ch = *q++; - - /* Check for BOM marks (U+FEFF) in the input and adjust - current byte order setting accordingly. Swap input - bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 - !) */ + /* Check for BOM marks (U+FEFF) in the input and adjust current + byte order setting accordingly. In native mode, the leading BOM + mark is skipped, in all other modes, it is copied to the output + stream as-is (giving a ZWNBSP character). */ + if (bo == 0) { #ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (ch == 0xFEFF) { + if (*q == 0xFEFF) { + q++; bo = -1; - continue; - } else if (ch == 0xFFFE) { + } else if (*q == 0xFFFE) { + q++; bo = 1; - continue; } - if (bo == 1) - ch = (ch >> 8) | (ch << 8); #else - if (ch == 0xFEFF) { + if (*q == 0xFEFF) { + q++; bo = 1; - continue; - } else if (ch == 0xFFFE) { + } else if (*q == 0xFFFE) { + q++; bo = -1; - continue; } +#endif + } + + while (q < e) { + register Py_UNICODE ch = *q++; + + /* Swap input bytes if needed. (This assumes + sizeof(Py_UNICODE) == 2 !) */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + if (bo == 1) + ch = (ch >> 8) | (ch << 8); +#else if (bo == -1) ch = (ch >> 8) | (ch << 8); #endif |