From 88d8fb6af603176b6e55766da067f84115e35406 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 15 May 2014 14:37:42 +0300 Subject: Issue #13916: Disallowed the surrogatepass error handler for non UTF-* encodings. --- Lib/test/test_codecs.py | 13 +++++++++++++ Misc/NEWS | 3 +++ Python/codecs.c | 23 +++++++++++++++++++---- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 7459010..e4d7a60 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -2807,6 +2807,9 @@ class CodePageTest(unittest.TestCase): ('[\u20ac]', 'replace', b'[?]'), ('[\xff]', 'backslashreplace', b'[\\xff]'), ('[\xff]', 'xmlcharrefreplace', b'[ÿ]'), + ('\udcff', 'strict', None), + ('[\udcff]', 'surrogateescape', b'[\xff]'), + ('[\udcff]', 'surrogatepass', None), )) self.check_decode(932, ( (b'abc', 'strict', 'abc'), @@ -2816,6 +2819,7 @@ class CodePageTest(unittest.TestCase): (b'[\xff]', 'ignore', '[]'), (b'[\xff]', 'replace', '[\ufffd]'), (b'[\xff]', 'surrogateescape', '[\udcff]'), + (b'[\xff]', 'surrogatepass', None), (b'\x81\x00abc', 'strict', None), (b'\x81\x00abc', 'ignore', '\x00abc'), (b'\x81\x00abc', 'replace', '\ufffd\x00abc'), @@ -2826,14 +2830,23 @@ class CodePageTest(unittest.TestCase): ('abc', 'strict', b'abc'), ('\xe9\u20ac', 'strict', b'\xe9\x80'), ('\xff', 'strict', b'\xff'), + # test error handlers ('\u0141', 'strict', None), ('\u0141', 'ignore', b''), ('\u0141', 'replace', b'L'), + ('\udc98', 'surrogateescape', b'\x98'), + ('\udc98', 'surrogatepass', None), )) self.check_decode(1252, ( (b'abc', 'strict', 'abc'), (b'\xe9\x80', 'strict', '\xe9\u20ac'), (b'\xff', 'strict', '\xff'), + # invalid bytes + (b'[\x98]', 'strict', None), + (b'[\x98]', 'ignore', '[]'), + (b'[\x98]', 'replace', '[\ufffd]'), + (b'[\x98]', 'surrogateescape', '[\udc98]'), + (b'[\x98]', 'surrogatepass', None), )) def test_cp_utf7(self): diff --git a/Misc/NEWS b/Misc/NEWS index 2412c48..7c42136 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -84,6 +84,9 @@ Core and Builtins Library ------- +- Issue #13916: Disallowed the surrogatepass error handler for non UTF-* + encodings. + - Issue #20998: Fixed re.fullmatch() of repeated single character pattern with ignore case. Original patch by Matthew Barnett. diff --git a/Python/codecs.c b/Python/codecs.c index e06d6e0..7003136 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -901,6 +901,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) } } +#define ENC_UNKNOWN -1 #define ENC_UTF8 0 #define ENC_UTF16BE 1 #define ENC_UTF16LE 2 @@ -916,7 +917,11 @@ get_standard_encoding(const char *encoding, int *bytelength) encoding += 3; if (*encoding == '-' || *encoding == '_' ) encoding++; - if (encoding[0] == '1' && encoding[1] == '6') { + if (encoding[0] == '8' && encoding[1] == '\0') { + *bytelength = 3; + return ENC_UTF8; + } + else if (encoding[0] == '1' && encoding[1] == '6') { encoding += 2; *bytelength = 2; if (*encoding == '\0') { @@ -955,9 +960,7 @@ get_standard_encoding(const char *encoding, int *bytelength) } } } - /* utf-8 */ - *bytelength = 3; - return ENC_UTF8; + return ENC_UNKNOWN; } /* This handler is declared static until someone demonstrates @@ -994,6 +997,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); + if (code == ENC_UNKNOWN) { + /* Not supported, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(object); + return NULL; + } res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start)); if (!res) { @@ -1068,6 +1077,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc) } code = get_standard_encoding(encoding, &bytelength); Py_DECREF(encode); + if (code == ENC_UNKNOWN) { + /* Not supported, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + Py_DECREF(object); + return NULL; + } /* Try decoding a single surrogate character. If there are more, let the codec call us again. */ -- cgit v0.12