From a47d1c08d0911f2f49d92b8c6035593a672af436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Tue, 30 Aug 2005 10:23:14 +0000 Subject: SF bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain about illegal code points. The codec now supports PEP 293 style error handlers. (This is a variant of Nik Haldimann's patch that detects truncated data) --- Include/unicodeobject.h | 10 ++++++ Lib/test/test_codeccallbacks.py | 34 +++++++++++++++++-- Lib/test/test_codecs.py | 51 +++++++++++++++++++++++++++- Misc/NEWS | 4 +++ Modules/_codecsmodule.c | 4 +-- Objects/unicodeobject.c | 75 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 173 insertions(+), 5 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 6738cbd..b534187 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -797,6 +797,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape( int length /* Number of Py_UNICODE chars to encode */ ); +/* --- Unicode Internal Codec --------------------------------------------- + + Only for internal use in _codecsmodule.c */ + +PyObject *_PyUnicode_DecodeUnicodeInternal( + const char *string, + int length, + const char *errors + ); + /* --- Latin-1 Codecs ----------------------------------------------------- Note: Latin-1 corresponds to the first 256 Unicode ordinals. diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 8f0d590..f8e59cd 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -111,7 +111,7 @@ class CodecCallbackTest(unittest.TestCase): sout += "\\U%08x" % sys.maxunicode self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) - def test_relaxedutf8(self): + def test_decoderelaxedutf8(self): # This is the test for a decoding callback handler, # that relaxes the UTF-8 minimal encoding restriction. 
# A null byte that is encoded as "\xc0\x80" will be @@ -158,6 +158,35 @@ class CodecCallbackTest(unittest.TestCase): charmap[ord("?")] = u"XYZ" self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) + def test_decodeunicodeinternal(self): + self.assertRaises( + UnicodeDecodeError, + "\x00\x00\x00\x00\x00".decode, + "unicode-internal", + ) + if sys.maxunicode > 0xffff: + def handler_unicodeinternal(exc): + if not isinstance(exc, UnicodeDecodeError): + raise TypeError("don't know how to handle %r" % exc) + return (u"\x01", 1) + + self.assertEqual( + "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), + u"\u0000" + ) + + self.assertEqual( + "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"), + u"\u0000\ufffd" + ) + + codecs.register_error("test.hui", handler_unicodeinternal) + + self.assertEqual( + "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"), + u"\u0000\u0001\u0000" + ) + def test_callbacks(self): def handler1(exc): if not isinstance(exc, UnicodeEncodeError) \ @@ -503,7 +532,8 @@ class CodecCallbackTest(unittest.TestCase): for (enc, bytes) in ( ("ascii", "\xff"), ("utf-8", "\xff"), - ("utf-7", "+x-") + ("utf-7", "+x-"), + ("unicode-internal", "\x00"), ): self.assertRaises( TypeError, diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 5189e80..a4d58c6 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1,7 +1,7 @@ from test import test_support import unittest import codecs -import StringIO +import sys, StringIO class Queue(object): """ @@ -453,6 +453,54 @@ class PunycodeTest(unittest.TestCase): for uni, puny in punycode_testcases: self.assertEquals(uni, puny.decode("punycode")) +class UnicodeInternalTest(unittest.TestCase): + def test_bug1251300(self): + # Decoding with unicode_internal used to not correctly handle "code + # points" above 0x10ffff on UCS-4 builds. 
+ if sys.maxunicode > 0xffff: + ok = [ + ("\x00\x10\xff\xff", u"\U0010ffff"), + ("\x00\x00\x01\x01", u"\U00000101"), + ("", u""), + ] + not_ok = [ + "\x7f\xff\xff\xff", + "\x80\x00\x00\x00", + "\x81\x00\x00\x00", + "\x00", + "\x00\x00\x00\x00\x00", + ] + for internal, uni in ok: + if sys.byteorder == "little": + internal = "".join(reversed(internal)) + self.assertEquals(uni, internal.decode("unicode_internal")) + for internal in not_ok: + if sys.byteorder == "little": + internal = "".join(reversed(internal)) + self.assertRaises(UnicodeDecodeError, internal.decode, + "unicode_internal") + + def test_decode_error_attributes(self): + if sys.maxunicode > 0xffff: + try: + "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal") + except UnicodeDecodeError, ex: + self.assertEquals("unicode_internal", ex.encoding) + self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object) + self.assertEquals(4, ex.start) + self.assertEquals(8, ex.end) + else: + self.fail() + + def test_decode_callback(self): + if sys.maxunicode > 0xffff: + codecs.register_error("UnicodeInternalTest", codecs.ignore_errors) + decoder = codecs.getdecoder("unicode_internal") + ab = u"ab".encode("unicode_internal") + ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]), + "UnicodeInternalTest") + self.assertEquals((u"ab", 12), ignored) + # From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html nameprep_tests = [ # 3.1 Map to nothing. @@ -885,6 +933,7 @@ def test_main(): EscapeDecodeTest, RecodingTest, PunycodeTest, + UnicodeInternalTest, NameprepTest, CodecTest, CodecsModuleTest, diff --git a/Misc/NEWS b/Misc/NEWS index a9abb77..307b359 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -435,6 +435,10 @@ Library line ending. Remove the special handling of a "\r\n" that has been split between two lines. +- Bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain + about illegal code points. The codec now supports PEP 293 style error + handlers. 
+ Build ----- diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index a6c42b1..3441f61 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -254,8 +254,8 @@ unicode_internal_decode(PyObject *self, else { if (PyObject_AsReadBuffer(obj, (const void **)&data, &size)) return NULL; - return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data, - size / sizeof(Py_UNICODE)), + + return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors), size); } } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5e5dac5..5d096ed 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2273,6 +2273,95 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) PyUnicode_GET_SIZE(unicode)); }
+/* --- Unicode Internal Codec ------------------------------------------- */
+
+/* Decode `size` bytes starting at `s`, interpreted as raw Py_UNICODE
+   values in the platform's native layout, into a new unicode object.
+
+   Malformed input is routed through the error handler named by `errors`
+   (via unicode_decode_call_errorhandler): a trailing fragment shorter
+   than Py_UNICODE_SIZE is reported as "truncated input" and, on wide
+   builds, a value outside 0..PyUnicode_GetMax() as an illegal code point.
+
+   Returns the decoded object, or NULL on failure. */
+
+PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
+                                           int size,
+                                           const char *errors)
+{
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    Py_UNICODE unimax;
+    PyUnicodeObject *v;
+    Py_UNICODE *p;
+    const char *end;
+    const char *reason;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    unimax = PyUnicode_GetMax();
+    /* Round up: a trailing partial unit still counts toward the size. */
+    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
+    if (v == NULL)
+        goto onError;
+    if (PyUnicode_GetSize((PyObject *)v) == 0)
+        return (PyObject *)v;
+    p = PyUnicode_AS_UNICODE(v);
+    end = s + size;
+
+    while (s < end) {
+        reason = NULL;
+        if (end-s < Py_UNICODE_SIZE) {
+            /* Detect a truncated tail *before* touching the data, so we
+               never read past the end of the input buffer. */
+            endinpos = end-starts;
+            reason = "truncated input";
+        }
+        else {
+            /* The input carries no alignment guarantee: copy the bytes
+               rather than dereferencing a Py_UNICODE pointer. */
+            memcpy(p, s, sizeof(Py_UNICODE));
+#ifdef Py_UNICODE_WIDE
+            /* We have to sanity check the raw data, otherwise doom looms
+               for some malformed UCS-4 data. */
+            if (*p > unimax || *p < 0) {
+                endinpos = s - starts + Py_UNICODE_SIZE;
+                reason = "illegal code point (> 0x10FFFF)";
+            }
+#endif
+        }
+        if (reason == NULL) {
+            p++;
+            s += Py_UNICODE_SIZE;
+            continue;
+        }
+        startinpos = s - starts;
+        outpos = p - PyUnicode_AS_UNICODE(v);
+        /* The handler is given &s, &v and &p, so it may update them. */
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "unicode_internal", reason,
+                starts, size, &startinpos, &endinpos, &exc, &s,
+                (PyObject **)&v, &outpos, &p)) {
+            goto onError;
+        }
+    }
+
+    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
+        goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)v;
+
+  onError:
+    Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
 /* --- Latin-1 Codec ------------------------------------------------------ */ PyObject *PyUnicode_DecodeLatin1(const char *s, -- cgit v0.12