Issue #20574: Implement incremental decoder for the cp65001 codec

(Windows code page 65001, Microsoft UTF-8).
author: Victor Stinner <victor.stinner@gmail.com> 2014-03-17 22:08:06 (GMT)
committer: Victor Stinner <victor.stinner@gmail.com> 2014-03-17 22:08:06 (GMT)
commit: 7d00cc1a6432b5f2338172ceba388d336e466d2e (patch)
tree: cde3b6ca56e2bd0f6c2c892c99a8129a7f763e9e /Objects
parent: c49926748b36d639c3c6889ffcb90ab069bba8ae (diff)
download: cpython-7d00cc1a6432b5f2338172ceba388d336e466d2e.zip
cpython-7d00cc1a6432b5f2338172ceba388d336e466d2e.tar.gz
cpython-7d00cc1a6432b5f2338172ceba388d336e466d2e.tar.bz2
1 file changed, 9 insertions, 32 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ec22239..0cb023d 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj)
     return PyBytes_AS_STRING(*obj);
 }
 
-static int
-is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
-{
-    const char *curr = s + offset;
-    const char *prev;
-
-    if (!IsDBCSLeadByteEx(code_page, *curr))
-        return 0;
-
-    prev = CharPrevExA(code_page, s, curr, 0);
-    if (prev == curr)
-        return 1;
-    /* FIXME: This code is limited to "true" double-byte encodings,
-       as it assumes an incomplete character consists of a single
-       byte. */
-    if (curr - prev == 2)
-        return 1;
-    if (!IsDBCSLeadByteEx(code_page, *prev))
-        return 1;
-    return 0;
-}
-
 static DWORD
 decode_code_page_flags(UINT code_page)
 {
@@ -6913,7 +6891,7 @@ static int
 decode_code_page_errors(UINT code_page,
                         PyObject **v,
                         const char *in, const int size,
-                        const char *errors)
+                        const char *errors, int final)
 {
     const char *startin = in;
     const char *endin = in + size;
@@ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page,
     if (encoding == NULL)
         return -1;
 
-    if (errors == NULL || strcmp(errors, "strict") == 0) {
+    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
         /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
            UnicodeDecodeError. */
         make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
@@ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page,
         if (outsize <= 0) {
             Py_ssize_t startinpos, endinpos, outpos;
 
+            /* last character in partial decode? */
+            if (in + insize >= endin && !final)
+                break;
+
             startinpos = in - startin;
             endinpos = startinpos + 1;
             outpos = out - PyUnicode_AS_UNICODE(*v);
@@ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page,
     assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
     if (unicode_resize(v, outsize) < 0)
         goto error;
-    ret = size;
+    ret = in - startin;
 
 error:
     Py_XDECREF(encoding_obj);
@@ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page,
             done = 1;
         }
 
-        /* Skip trailing lead-byte unless 'final' is set */
-        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
-            --chunk_size;
-
         if (chunk_size == 0 && done) {
             if (v != NULL)
                 break;
             _Py_RETURN_UNICODE_EMPTY();
         }
 
-
         converted = decode_code_page_strict(code_page, &v,
                                             s, chunk_size);
         if (converted == -2)
             converted = decode_code_page_errors(code_page, &v,
                                                 s, chunk_size,
-                                                errors);
-        assert(converted != 0);
+                                                errors, final);
+        assert(converted != 0 || done);
 
         if (converted < 0) {
             Py_XDECREF(v);
author	Victor Stinner <victor.stinner@gmail.com>	2014-03-17 22:08:06 (GMT)
committer	Victor Stinner <victor.stinner@gmail.com>	2014-03-17 22:08:06 (GMT)
commit	7d00cc1a6432b5f2338172ceba388d336e466d2e (patch)
tree	cde3b6ca56e2bd0f6c2c892c99a8129a7f763e9e /Objects
parent	c49926748b36d639c3c6889ffcb90ab069bba8ae (diff)
download	cpython-7d00cc1a6432b5f2338172ceba388d336e466d2e.zip cpython-7d00cc1a6432b5f2338172ceba388d336e466d2e.tar.gz cpython-7d00cc1a6432b5f2338172ceba388d336e466d2e.tar.bz2