From 7d00cc1a6432b5f2338172ceba388d336e466d2e Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 17 Mar 2014 23:08:06 +0100 Subject: Issue #20574: Implement incremental decoder for cp65001 code (Windows code page 65001, Microsoft UTF-8). --- Lib/encodings/cp65001.py | 9 ++++++--- Lib/test/test_codecs.py | 12 ++++-------- Misc/NEWS | 3 +++ Objects/unicodeobject.c | 41 +++++++++-------------------------------- 4 files changed, 22 insertions(+), 43 deletions(-) diff --git a/Lib/encodings/cp65001.py b/Lib/encodings/cp65001.py index 287eb87..95cb2ae 100644 --- a/Lib/encodings/cp65001.py +++ b/Lib/encodings/cp65001.py @@ -11,20 +11,23 @@ if not hasattr(codecs, 'code_page_encode'): ### Codec APIs encode = functools.partial(codecs.code_page_encode, 65001) -decode = functools.partial(codecs.code_page_decode, 65001) +_decode = functools.partial(codecs.code_page_decode, 65001) + +def decode(input, errors='strict'): + return codecs.code_page_decode(65001, input, errors, True) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): return encode(input, self.errors)[0] class IncrementalDecoder(codecs.BufferedIncrementalDecoder): - _buffer_decode = decode + _buffer_decode = _decode class StreamWriter(codecs.StreamWriter): encode = encode class StreamReader(codecs.StreamReader): - decode = decode + decode = _decode ### encodings module API diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 9b62d5b..6945a99 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -890,10 +890,6 @@ class CP65001Test(ReadTest, unittest.TestCase): "\U00010fff\uD800") self.assertTrue(codecs.lookup_error("surrogatepass")) - def test_readline(self): - self.skipTest("issue #20571: code page 65001 codec does not " - "support partial decoder yet") - class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7" @@ -2750,15 +2746,15 @@ class CodePageTest(unittest.TestCase): self.assertRaisesRegex(UnicodeEncodeError, 'cp932', codecs.code_page_encode, 932, '\xff') self.assertRaisesRegex(UnicodeDecodeError, 'cp932', - codecs.code_page_decode, 932, b'\x81\x00') + codecs.code_page_decode, 932, b'\x81\x00', 'strict', True) self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8', - codecs.code_page_decode, self.CP_UTF8, b'\xff') + codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True) def check_decode(self, cp, tests): for raw, errors, expected in tests: if expected is not None: try: - decoded = codecs.code_page_decode(cp, raw, errors) + decoded = codecs.code_page_decode(cp, raw, errors, True) except UnicodeDecodeError as err: self.fail('Unable to decode %a from "cp%s" with ' 'errors=%r: %s' % (raw, cp, errors, err)) @@ -2770,7 +2766,7 @@ class CodePageTest(unittest.TestCase): self.assertLessEqual(decoded[1], len(raw)) else: self.assertRaises(UnicodeDecodeError, - codecs.code_page_decode, cp, raw, errors) + codecs.code_page_decode, cp, raw, errors, True) def check_encode(self, cp, tests): for text, errors, expected in tests: diff --git a/Misc/NEWS b/Misc/NEWS index f6ae02c..5946bc9 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -13,6 +13,9 @@ Core and Builtins Library ------- +- Issue #20574: Implement incremental decoder for cp65001 code (Windows code + page 65001, Microsoft UTF-8). + - Issue #20879: Delay the initialization of encoding and decoding tables for base32, ascii85 and base85 codecs in the base64 module, and delay the initialization of the unquote_to_bytes() table of the urllib.parse module, to diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index ec22239..0cb023d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6817,28 +6817,6 @@ code_page_name(UINT code_page, PyObject **obj) return PyBytes_AS_STRING(*obj); } -static int -is_dbcs_lead_byte(UINT code_page, const char *s, int offset) -{ - const char *curr = s + offset; - const char *prev; - - if (!IsDBCSLeadByteEx(code_page, *curr)) - return 0; - - prev = CharPrevExA(code_page, s, curr, 0); - if (prev == curr) - return 1; - /* FIXME: This code is limited to "true" double-byte encodings, - as it assumes an incomplete character consists of a single - byte. */ - if (curr - prev == 2) - return 1; - if (!IsDBCSLeadByteEx(code_page, *prev)) - return 1; - return 0; -} - static DWORD decode_code_page_flags(UINT code_page) { @@ -6913,7 +6891,7 @@ static int decode_code_page_errors(UINT code_page, PyObject **v, const char *in, const int size, - const char *errors) + const char *errors, int final) { const char *startin = in; const char *endin = in + size; @@ -6940,7 +6918,7 @@ decode_code_page_errors(UINT code_page, if (encoding == NULL) return -1; - if (errors == NULL || strcmp(errors, "strict") == 0) { + if ((errors == NULL || strcmp(errors, "strict") == 0) && final) { /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a UnicodeDecodeError. */ make_decode_exception(&exc, encoding, in, size, 0, 0, reason); @@ -7003,6 +6981,10 @@ decode_code_page_errors(UINT code_page, if (outsize <= 0) { Py_ssize_t startinpos, endinpos, outpos; + /* last character in partial decode? */ + if (in + insize >= endin && !final) + break; + startinpos = in - startin; endinpos = startinpos + 1; outpos = out - PyUnicode_AS_UNICODE(*v); @@ -7031,7 +7013,7 @@ decode_code_page_errors(UINT code_page, assert(outsize <= PyUnicode_WSTR_LENGTH(*v)); if (unicode_resize(v, outsize) < 0) goto error; - ret = size; + ret = in - startin; error: Py_XDECREF(encoding_obj); @@ -7072,24 +7054,19 @@ decode_code_page_stateful(int code_page, done = 1; } - /* Skip trailing lead-byte unless 'final' is set */ - if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1)) - --chunk_size; - if (chunk_size == 0 && done) { if (v != NULL) break; _Py_RETURN_UNICODE_EMPTY(); } - converted = decode_code_page_strict(code_page, &v, s, chunk_size); if (converted == -2) converted = decode_code_page_errors(code_page, &v, s, chunk_size, - errors); - assert(converted != 0); + errors, final); + assert(converted != 0 || done); if (converted < 0) { Py_XDECREF(v); -- cgit v0.12