diff options
| -rw-r--r-- | Include/unicodeobject.h | 9 |
| -rw-r--r-- | Objects/unicodeobject.c | 42 |
2 files changed, 30 insertions, 21 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 988ea1b..f91a5a0 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(  	*byteorder == 0:  native order  	*byteorder == 1:  big endian -   and then switches according to all BOM marks it finds in the input -   data. BOM marks are not copied into the resulting Unicode string. -   After completion, *byteorder is set to the current byte order at -   the end of input data. +   In native mode, the first two bytes of the stream are checked for a +   BOM mark. If found, the BOM mark is analysed, the byte order +   adjusted and the BOM skipped.  In the other modes, no BOM mark +   interpretation is done. After completion, *byteorder is set to the +   current byte order at the end of input data.     If byteorder is NULL, the codec starts in native order mode. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 475215c..d55e2a7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,      if (byteorder)  	bo = *byteorder; -    while (q < e) { -	register Py_UNICODE ch = *q++; - -	/* Check for BOM marks (U+FEFF) in the input and adjust -	   current byte order setting accordingly. Swap input -	   bytes if needed. (This assumes sizeof(Py_UNICODE) == 2 -	   !) */ +    /* Check for BOM marks (U+FEFF) in the input and adjust current +       byte order setting accordingly. In native mode, the leading BOM +       mark is skipped, in all other modes, it is copied to the output +       stream as-is (giving a ZWNBSP character). 
*/ +    if (bo == 0) {  #ifdef BYTEORDER_IS_LITTLE_ENDIAN -	if (ch == 0xFEFF) { +	if (*q == 0xFEFF) { +	    q++;  	    bo = -1; -	    continue; -	} else if (ch == 0xFFFE) { +	} else if (*q == 0xFFFE) { +	    q++;  	    bo = 1; -	    continue;  	} -	if (bo == 1) -	    ch = (ch >> 8) | (ch << 8);  #else     -	if (ch == 0xFEFF) { +	if (*q == 0xFEFF) { +	    q++;  	    bo = 1; -	    continue; -	} else if (ch == 0xFFFE) { +	} else if (*q == 0xFFFE) { +	    q++;  	    bo = -1; -	    continue;  	} +#endif +    } +     +    while (q < e) { +	register Py_UNICODE ch = *q++; + +	/* Swap input bytes if needed. (This assumes +	   sizeof(Py_UNICODE) == 2 !) */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN +	if (bo == 1) +	    ch = (ch >> 8) | (ch << 8); +#else      	if (bo == -1)  	    ch = (ch >> 8) | (ch << 8);  #endif  | 
