diff options
author | Marc-André Lemburg <mal@egenix.com> | 2001-05-21 20:30:15 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2001-05-21 20:30:15 (GMT) |
commit | 489b56e04480b8ca3f2d1676265e67c65bae788d (patch) | |
tree | a148a1f74890d004f6434a77eb14185b76c73c77 | |
parent | f52d27e52d289b99837b4555fb3f757f2c89f4ad (diff) | |
download | cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.zip cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.gz cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.bz2 |
This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* BOM mark.
Subsequent BOM characters are no longer interpreted or removed.
The UTF-16-LE and UTF-16-BE codecs pass all BOM mark characters through
unchanged.
These changes should bring the UTF-16 codec more in line with what
the Unicode FAQ recommends with respect to BOM marks.
-rw-r--r-- | Include/unicodeobject.h | 9 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 42 |
2 files changed, 30 insertions, 21 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 988ea1b..f91a5a0 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8( *byteorder == 0: native order *byteorder == 1: big endian - and then switches according to all BOM marks it finds in the input - data. BOM marks are not copied into the resulting Unicode string. - After completion, *byteorder is set to the current byte order at - the end of input data. + In native mode, the first two bytes of the stream are checked for a + BOM mark. If found, the BOM mark is analysed, the byte order + adjusted and the BOM skipped. In the other modes, no BOM mark + interpretation is done. After completion, *byteorder is set to the + current byte order at the end of input data. If byteorder is NULL, the codec starts in native order mode. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 475215c..d55e2a7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s, if (byteorder) bo = *byteorder; - while (q < e) { - register Py_UNICODE ch = *q++; - - /* Check for BOM marks (U+FEFF) in the input and adjust - current byte order setting accordingly. Swap input - bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 - !) */ + /* Check for BOM marks (U+FEFF) in the input and adjust current + byte order setting accordingly. In native mode, the leading BOM + mark is skipped, in all other modes, it is copied to the output + stream as-is (giving a ZWNBSP character). 
*/ + if (bo == 0) { #ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (ch == 0xFEFF) { + if (*q == 0xFEFF) { + q++; bo = -1; - continue; - } else if (ch == 0xFFFE) { + } else if (*q == 0xFFFE) { + q++; bo = 1; - continue; } - if (bo == 1) - ch = (ch >> 8) | (ch << 8); #else - if (ch == 0xFEFF) { + if (*q == 0xFEFF) { + q++; bo = 1; - continue; - } else if (ch == 0xFFFE) { + } else if (*q == 0xFFFE) { + q++; bo = -1; - continue; } +#endif + } + + while (q < e) { + register Py_UNICODE ch = *q++; + + /* Swap input bytes if needed. (This assumes + sizeof(Py_UNICODE) == 2 !) */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + if (bo == 1) + ch = (ch >> 8) | (ch << 8); +#else if (bo == -1) ch = (ch >> 8) | (ch << 8); #endif |