SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support decoding incomplete input (when the input stream is temporarily exhausted).

codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument, chars, which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument, keepends; trailing "\n"s will be stripped from the lines if keepends is false.

Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
author: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2004-09-07 20:24:22 (GMT)
commit: 69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree: 088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Objects
parent: a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download: cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz
cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2
1 files changed, 57 insertions, 23 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f78788e..05fd11c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1136,6 +1136,14 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 			       int size,
 			       const char *errors)
 {
+    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
+}
+
+PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
+			                int size,
+			                const char *errors,
+			                int *consumed)
+{
     const char *starts = s;
     int n;
     int startinpos;
@@ -1153,8 +1161,11 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
     unicode = _PyUnicode_New(size);
     if (!unicode)
         return NULL;
-    if (size == 0)
+    if (size == 0) {
+        if (consumed)
+            *consumed = 0;
         return (PyObject *)unicode;
+    }
 
     /* Unpack UTF-8 encoded data */
     p = unicode->str;
@@ -1172,10 +1183,14 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
         n = utf8_code_length[ch];
 
         if (s + n > e) {
-	    errmsg = "unexpected end of data";
-	    startinpos = s-starts;
-	    endinpos = size;
-	    goto utf8Error;
+	    if (consumed)
+		break;
+	    else {
+		errmsg = "unexpected end of data";
+		startinpos = s-starts;
+		endinpos = size;
+		goto utf8Error;
+	    }
 	}
 
         switch (n) {
@@ -1293,6 +1308,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
 	     (PyObject **)&unicode, &outpos, &p))
 	goto onError;
     }
+    if (consumed)
+	*consumed = s-starts;
 
     /* Adjust length */
     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
@@ -1428,6 +1445,16 @@ PyUnicode_DecodeUTF16(const char *s,
 		      const char *errors,
 		      int *byteorder)
 {
+    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
+}
+
+PyObject *
+PyUnicode_DecodeUTF16Stateful(const char *s,
+			      int size,
+			      const char *errors,
+			      int *byteorder,
+			      int *consumed)
+{
     const char *starts = s;
     int startinpos;
     int endinpos;
@@ -1467,26 +1494,28 @@ PyUnicode_DecodeUTF16(const char *s,
        mark is skipped, in all other modes, it is copied to the output
        stream as-is (giving a ZWNBSP character). */
     if (bo == 0) {
-        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
+        if (size >= 2) {
+            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-	if (bom == 0xFEFF) {
-	    q += 2;
-	    bo = -1;
-	}
-        else if (bom == 0xFFFE) {
-	    q += 2;
-	    bo = 1;
-	}
+	    if (bom == 0xFEFF) {
+		q += 2;
+		bo = -1;
+	    }
+	    else if (bom == 0xFFFE) {
+		q += 2;
+		bo = 1;
+	    }
 #else
-	if (bom == 0xFEFF) {
-	    q += 2;
-	    bo = 1;
-	}
-        else if (bom == 0xFFFE) {
-	    q += 2;
-	    bo = -1;
-	}
+	    if (bom == 0xFEFF) {
+		q += 2;
+		bo = 1;
+	    }
+	    else if (bom == 0xFFFE) {
+		q += 2;
+		bo = -1;
+	    }
 #endif
+	}
     }
 
     if (bo == -1) {
@@ -1502,8 +1531,10 @@ PyUnicode_DecodeUTF16(const char *s,
 
     while (q < e) {
 	Py_UNICODE ch;
-	/* remaing bytes at the end? (size should be even) */
+	/* remaining bytes at the end? (size should be even) */
 	if (e-q<2) {
+	    if (consumed)
+		break;
 	    errmsg = "truncated data";
 	    startinpos = ((const char *)q)-starts;
 	    endinpos = ((const char *)e)-starts;
@@ -1565,6 +1596,9 @@ PyUnicode_DecodeUTF16(const char *s,
     if (byteorder)
         *byteorder = bo;
 
+    if (consumed)
+	*consumed = (const char *)q-starts;
+
     /* Adjust length */
     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
         goto onError;
author	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2004-09-07 20:24:22 (GMT)
commit	69652035bc2cf22b0326bb00824f4b7e2674cc8b (patch)
tree	088104a47f9c9cfc466a3e1c5f4d2560b2d41450 /Objects
parent	a708d6e3b0aa2d225d4e5ab338862f67994e1c45 (diff)
download	cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.zip cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.gz cpython-69652035bc2cf22b0326bb00824f4b7e2674cc8b.tar.bz2