summary refs log tree commit diff stats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
author Marc-André Lemburg <mal@egenix.com> 2001-05-21 20:30:15 (GMT)
committer Marc-André Lemburg <mal@egenix.com> 2001-05-21 20:30:15 (GMT)
commit 489b56e04480b8ca3f2d1676265e67c65bae788d (patch)
tree a148a1f74890d004f6434a77eb14185b76c73c77 /Objects/unicodeobject.c
parent f52d27e52d289b99837b4555fb3f757f2c89f4ad (diff)
download cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.zip
cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.gz
cpython-489b56e04480b8ca3f2d1676265e67c65bae788d.tar.bz2
This patch changes the behaviour of the UTF-16 codec family. Only the
UTF-16 codec will now interpret and remove a *leading* BOM mark. Subsequent BOM characters are no longer interpreted and removed. UTF-16-LE and -BE pass through all BOM mark characters. These changes should get the UTF-16 codec more in line with what the Unicode FAQ recommends with respect to BOM marks.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- Objects/unicodeobject.c 42
1 file changed, 25 insertions, 17 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 475215c..d55e2a7 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
if (byteorder)
bo = *byteorder;
- while (q < e) {
- register Py_UNICODE ch = *q++;
-
- /* Check for BOM marks (U+FEFF) in the input and adjust
- current byte order setting accordingly. Swap input
- bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
- !) */
+ /* Check for BOM marks (U+FEFF) in the input and adjust current
+ byte order setting accordingly. In native mode, the leading BOM
+ mark is skipped, in all other modes, it is copied to the output
+ stream as-is (giving a ZWNBSP character). */
+ if (bo == 0) {
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
- if (ch == 0xFEFF) {
+ if (*q == 0xFEFF) {
+ q++;
bo = -1;
- continue;
- } else if (ch == 0xFFFE) {
+ } else if (*q == 0xFFFE) {
+ q++;
bo = 1;
- continue;
}
- if (bo == 1)
- ch = (ch >> 8) | (ch << 8);
#else
- if (ch == 0xFEFF) {
+ if (*q == 0xFEFF) {
+ q++;
bo = 1;
- continue;
- } else if (ch == 0xFFFE) {
+ } else if (*q == 0xFFFE) {
+ q++;
bo = -1;
- continue;
}
+#endif
+ }
+
+ while (q < e) {
+ register Py_UNICODE ch = *q++;
+
+ /* Swap input bytes if needed. (This assumes
+ sizeof(Py_UNICODE) == 2 !) */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ if (bo == 1)
+ ch = (ch >> 8) | (ch << 8);
+#else
if (bo == -1)
ch = (ch >> 8) | (ch << 8);
#endif