This patch changes the behaviour of the UTF-16 codec family. Only the UTF-16 codec will now interpret and remove a *leading* BOM mark. Subsequent BOM characters are no longer interpreted and removed. UTF-16-LE and -BE pass through all BOM mark characters. These changes should get the UTF-16 codec more in line with what the Unicode FAQ recommends with respect to BOM marks.
author: Marc-André Lemburg <mal@egenix.com> 2001-05-21 20:30:15 (GMT)
committer: Marc-André Lemburg <mal@egenix.com> 2001-05-21 20:30:15 (GMT)
commit: 489b56e04480b8ca3f2d1676265e67c65bae788d (patch)
tree: a148a1f74890d004f6434a77eb14185b76c73c77
parent: f52d27e52d289b99837b4555fb3f757f2c89f4ad (diff)
download: cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.zip
cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.gz
cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.bz2
2 files changed, 30 insertions, 21 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 988ea1b..f91a5a0 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
 	*byteorder == 0:  native order
 	*byteorder == 1:  big endian
 
-   and then switches according to all BOM marks it finds in the input
-   data. BOM marks are not copied into the resulting Unicode string.
-   After completion, *byteorder is set to the current byte order at
-   the end of input data.
+   In native mode, the first two bytes of the stream are checked for a
+   BOM mark. If found, the BOM mark is analysed, the byte order
+   adjusted and the BOM skipped.  In the other modes, no BOM mark
+   interpretation is done. After completion, *byteorder is set to the
+   current byte order at the end of input data.
 
    If byteorder is NULL, the codec starts in native order mode.
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 475215c..d55e2a7 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
     if (byteorder)
 	bo = *byteorder;
 
-    while (q < e) {
-	register Py_UNICODE ch = *q++;
-
-	/* Check for BOM marks (U+FEFF) in the input and adjust
-	   current byte order setting accordingly. Swap input
-	   bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
-	   !) */
+    /* Check for BOM marks (U+FEFF) in the input and adjust current
+       byte order setting accordingly. In native mode, the leading BOM
+       mark is skipped, in all other modes, it is copied to the output
+       stream as-is (giving a ZWNBSP character). */
+    if (bo == 0) {
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-	if (ch == 0xFEFF) {
+	if (*q == 0xFEFF) {
+	    q++;
 	    bo = -1;
-	    continue;
-	} else if (ch == 0xFFFE) {
+	} else if (*q == 0xFFFE) {
+	    q++;
 	    bo = 1;
-	    continue;
 	}
-	if (bo == 1)
-	    ch = (ch >> 8) | (ch << 8);
 #else    
-	if (ch == 0xFEFF) {
+	if (*q == 0xFEFF) {
+	    q++;
 	    bo = 1;
-	    continue;
-	} else if (ch == 0xFFFE) {
+	} else if (*q == 0xFFFE) {
+	    q++;
 	    bo = -1;
-	    continue;
 	}
+#endif
+    }
+    
+    while (q < e) {
+	register Py_UNICODE ch = *q++;
+
+	/* Swap input bytes if needed. (This assumes
+	   sizeof(Py_UNICODE) == 2 !) */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+	if (bo == 1)
+	    ch = (ch >> 8) | (ch << 8);
+#else    
 	if (bo == -1)
 	    ch = (ch >> 8) | (ch << 8);
 #endif
author	Marc-André Lemburg <mal@egenix.com>	2001-05-21 20:30:15 (GMT)
committer	Marc-André Lemburg <mal@egenix.com>	2001-05-21 20:30:15 (GMT)
commit	489b56e04480b8ca3f2d1676265e67c65bae788d (patch)
tree	a148a1f74890d004f6434a77eb14185b76c73c77
parent	f52d27e52d289b99837b4555fb3f757f2c89f4ad (diff)
download	cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.zip cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.gz cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.bz2