summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Include/unicodeobject.h10
-rw-r--r--Lib/test/test_codeccallbacks.py34
-rw-r--r--Lib/test/test_codecs.py51
-rw-r--r--Misc/NEWS4
-rw-r--r--Modules/_codecsmodule.c4
-rw-r--r--Objects/unicodeobject.c75
6 files changed, 173 insertions, 5 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 6738cbd..b534187 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -797,6 +797,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
int length /* Number of Py_UNICODE chars to encode */
);
+/* --- Unicode Internal Codec ---------------------------------------------
+
+ Only for internal use in _codecsmodule.c */
+
+PyObject *_PyUnicode_DecodeUnicodeInternal(
+ const char *string, /* Raw bytes holding Py_UNICODE code units */
+ int length, /* Number of bytes in string */
+ const char *errors /* Error handling name; may trigger an error callback */
+ );
+
/* --- Latin-1 Codecs -----------------------------------------------------
Note: Latin-1 corresponds to the first 256 Unicode ordinals.
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 8f0d590..f8e59cd 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -111,7 +111,7 @@ class CodecCallbackTest(unittest.TestCase):
sout += "\\U%08x" % sys.maxunicode
self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
- def test_relaxedutf8(self):
+ def test_decoderelaxedutf8(self):
# This is the test for a decoding callback handler,
# that relaxes the UTF-8 minimal encoding restriction.
# A null byte that is encoded as "\xc0\x80" will be
@@ -158,6 +158,35 @@ class CodecCallbackTest(unittest.TestCase):
charmap[ord("?")] = u"XYZ"
self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
+ def test_decodeunicodeinternal(self):
+ # Five bytes are never a whole number of code units, whether a unit is
+ # two bytes (narrow build) or four bytes (UCS-4 build), so decoding
+ # without an explicit error handler must raise.
+ self.assertRaises(
+ UnicodeDecodeError,
+ "\x00\x00\x00\x00\x00".decode,
+ "unicode-internal",
+ )
+ # The expected values below assume four-byte code units.
+ if sys.maxunicode > 0xffff:
+ # Custom PEP 293 style handler: substitutes u"\x01" for the bad
+ # bytes and asks the decoder to resume at byte offset 1.
+ def handler_unicodeinternal(exc):
+ if not isinstance(exc, UnicodeDecodeError):
+ raise TypeError("don't know how to handle %r" % exc)
+ return (u"\x01", 1)
+
+ # "ignore": the trailing partial unit is dropped.
+ self.assertEqual(
+ "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
+ u"\u0000"
+ )
+
+ # "replace": the trailing partial unit becomes U+FFFD.
+ self.assertEqual(
+ "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
+ u"\u0000\ufffd"
+ )
+
+ codecs.register_error("test.hui", handler_unicodeinternal)
+
+ # Bytes 0-3 decode to U+0000, the truncated tail at offset 4 is
+ # replaced by U+0001, and decoding then restarts at offset 1, where
+ # bytes 1-4 form one more U+0000.
+ self.assertEqual(
+ "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
+ u"\u0000\u0001\u0000"
+ )
+
def test_callbacks(self):
def handler1(exc):
if not isinstance(exc, UnicodeEncodeError) \
@@ -503,7 +532,8 @@ class CodecCallbackTest(unittest.TestCase):
for (enc, bytes) in (
("ascii", "\xff"),
("utf-8", "\xff"),
- ("utf-7", "+x-")
+ ("utf-7", "+x-"),
+ ("unicode-internal", "\x00"),
):
self.assertRaises(
TypeError,
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 5189e80..a4d58c6 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1,7 +1,7 @@
from test import test_support
import unittest
import codecs
-import StringIO
+import sys, StringIO
class Queue(object):
"""
@@ -453,6 +453,54 @@ class PunycodeTest(unittest.TestCase):
for uni, puny in punycode_testcases:
self.assertEquals(uni, puny.decode("punycode"))
+class UnicodeInternalTest(unittest.TestCase):
+ # Tests for decoding with the "unicode_internal" codec. Each test begins
+ # with a sys.maxunicode > 0xffff check, i.e. it targets UCS-4 builds.
+ def test_bug1251300(self):
+ # Decoding with unicode_internal used to not correctly handle "code
+ # points" above 0x10ffff on UCS-4 builds.
+ if sys.maxunicode > 0xffff:
+ # (raw bytes, expected unicode) pairs that must decode cleanly.
+ # The byte strings are spelled most significant byte first.
+ ok = [
+ ("\x00\x10\xff\xff", u"\U0010ffff"),
+ ("\x00\x00\x01\x01", u"\U00000101"),
+ ("", u""),
+ ]
+ # Inputs that must be rejected: the first three are four-byte
+ # values above 0x10ffff, the last two are one and five bytes long
+ # and therefore not a whole number of four-byte units.
+ not_ok = [
+ "\x7f\xff\xff\xff",
+ "\x80\x00\x00\x00",
+ "\x81\x00\x00\x00",
+ "\x00",
+ "\x00\x00\x00\x00\x00",
+ ]
+ for internal, uni in ok:
+ # Reverse the bytes to get native order on little-endian hosts.
+ if sys.byteorder == "little":
+ internal = "".join(reversed(internal))
+ self.assertEquals(uni, internal.decode("unicode_internal"))
+ for internal in not_ok:
+ if sys.byteorder == "little":
+ internal = "".join(reversed(internal))
+ self.assertRaises(UnicodeDecodeError, internal.decode,
+ "unicode_internal")
+
+ def test_decode_error_attributes(self):
+ # Checks the attributes of the exception raised for an illegal unit.
+ if sys.maxunicode > 0xffff:
+ try:
+ # Bytes 0-3 are a valid U+0000; bytes 4-7 read as 0x00111100 in
+ # either byte order, which is above 0x10ffff.
+ "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+ except UnicodeDecodeError, ex:
+ self.assertEquals("unicode_internal", ex.encoding)
+ self.assertEquals("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
+ # The reported error range covers exactly the second unit.
+ self.assertEquals(4, ex.start)
+ self.assertEquals(8, ex.end)
+ # NOTE(review): presumably the else clause of the try (no exception was
+ # raised); indentation is not visible here, confirm.
+ else:
+ self.fail()
+
+ def test_decode_callback(self):
+ # Decodes through an error handler registered under a custom name.
+ if sys.maxunicode > 0xffff:
+ codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
+ decoder = codecs.getdecoder("unicode_internal")
+ ab = u"ab".encode("unicode_internal")
+ # Input is "a", then the illegal unit 0x22222222, then "b".
+ ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
+ "UnicodeInternalTest")
+ # The illegal unit is skipped; all 12 input bytes count as consumed.
+ self.assertEquals((u"ab", 12), ignored)
+
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
nameprep_tests = [
# 3.1 Map to nothing.
@@ -885,6 +933,7 @@ def test_main():
EscapeDecodeTest,
RecodingTest,
PunycodeTest,
+ UnicodeInternalTest,
NameprepTest,
CodecTest,
CodecsModuleTest,
diff --git a/Misc/NEWS b/Misc/NEWS
index a9abb77..307b359 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -435,6 +435,10 @@ Library
line ending. Remove the special handling of a "\r\n" that has been split
between two lines.
+- Bug #1251300: On UCS-4 builds the "unicode-internal" codec now raises
+ UnicodeDecodeError for illegal code points (above 0x10FFFF). The codec now
+ also supports PEP 293 style error handlers.
+
Build
-----
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index a6c42b1..3441f61 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -254,8 +254,8 @@ unicode_internal_decode(PyObject *self,
else {
if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
return NULL;
- return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
- size / sizeof(Py_UNICODE)),
+
+ return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
size);
}
}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5e5dac5..5d096ed 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2273,6 +2273,81 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
PyUnicode_GET_SIZE(unicode));
}
+/* --- Unicode Internal Codec ------------------------------------------- */
+
+/* Decode `size` bytes starting at `s`, laid out as native Py_UNICODE code
+   units, into a new unicode object.
+
+   The input is consumed one Py_UNICODE_SIZE-byte unit at a time.  Two kinds
+   of malformed input are reported through the error handler selected by
+   `errors`, using the codec name "unicode_internal":
+     - fewer than Py_UNICODE_SIZE bytes remain ("truncated input");
+     - on Py_UNICODE_WIDE builds, a unit outside 0..PyUnicode_GetMax().
+
+   Returns a new reference, or NULL if allocation or resizing fails or
+   unicode_decode_call_errorhandler() reports failure. */
+PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
+                                           int size,
+                                           const char *errors)
+{
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    Py_UNICODE unimax;
+    Py_UNICODE ch;
+    PyUnicodeObject *v;
+    Py_UNICODE *p;
+    const char *end;
+    const char *reason;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    unimax = PyUnicode_GetMax();
+    /* One output slot per input unit; a partial trailing unit rounds up. */
+    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
+    if (v == NULL)
+        goto onError;
+    if (PyUnicode_GetSize((PyObject *)v) == 0)
+        return (PyObject *)v;
+    p = PyUnicode_AS_UNICODE(v);
+    end = s + size;
+
+    while (s < end) {
+        /* reason stays NULL for a well-formed unit. */
+        reason = NULL;
+        if (end-s < Py_UNICODE_SIZE) {
+            /* Check the remaining length BEFORE touching the data, so a
+               truncated trailing unit is never read past the buffer end. */
+            endinpos = end-starts;
+            reason = "truncated input";
+        }
+        else {
+            /* memcpy instead of a pointer cast: the input buffer need not
+               be aligned for Py_UNICODE. */
+            memcpy(&ch, s, sizeof(Py_UNICODE));
+#ifdef Py_UNICODE_WIDE
+            /* We have to sanity check the raw data, otherwise doom looms for
+               some malformed UCS-4 data. */
+            if (ch > unimax || ch < 0) {
+                endinpos = s - starts + Py_UNICODE_SIZE;
+                reason = "illegal code point (> 0x10FFFF)";
+            }
+#endif
+        }
+
+        if (reason == NULL) {
+            *p++ = ch;
+            s += Py_UNICODE_SIZE;
+            continue;
+        }
+
+        /* Malformed unit: let the error handler decide.  It may resize v
+           and updates s and p to the positions where decoding resumes. */
+        startinpos = s - starts;
+        outpos = p - PyUnicode_AS_UNICODE(v);
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "unicode_internal", reason,
+                starts, size, &startinpos, &endinpos, &exc, &s,
+                (PyObject **)&v, &outpos, &p)) {
+            goto onError;
+        }
+    }
+
+    /* Trim the result to the number of units actually written. */
+    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
+        goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)v;
+
+ onError:
+    Py_XDECREF(v);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
/* --- Latin-1 Codec ------------------------------------------------------ */
PyObject *PyUnicode_DecodeLatin1(const char *s,