-rw-r--r--  Doc/c-api/concrete.rst           |  68
-rw-r--r--  Doc/library/codecs.rst           |   6
-rw-r--r--  Include/unicodeobject.h          |  82
-rw-r--r--  Lib/encodings/aliases.py         |  10
-rw-r--r--  Lib/encodings/utf_32.py          | 144
-rw-r--r--  Lib/encodings/utf_32_be.py       |  37
-rw-r--r--  Lib/encodings/utf_32_le.py       |  37
-rw-r--r--  Lib/test/test_codeccallbacks.py  |   7
-rw-r--r--  Lib/test/test_codecs.py          | 140
-rw-r--r--  Misc/NEWS                        |   2
-rw-r--r--  Modules/_codecsmodule.c          | 204
-rw-r--r--  Objects/unicodeobject.c          | 266
12 files changed, 1001 insertions(+), 2 deletions(-)
diff --git a/Doc/c-api/concrete.rst b/Doc/c-api/concrete.rst
index bc812c2..eda56a5 100644
--- a/Doc/c-api/concrete.rst
+++ b/Doc/c-api/concrete.rst
@@ -1405,6 +1405,74 @@ These are the UTF-8 codec APIs:
object. Error handling is "strict". Return *NULL* if an exception was raised
by the codec.
+These are the UTF-32 codec APIs:
+
+.. % --- UTF-32 Codecs ------------------------------------------------------ */
+
+
+.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder)
+
+ Decode *size* bytes from a UTF-32 encoded buffer string and return the
+ corresponding Unicode object. *errors* (if non-*NULL*) defines the error
+ handling. It defaults to "strict".
+
+ If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte
+ order::
+
+ *byteorder == -1: little endian
+ *byteorder == 0: native order
+ *byteorder == 1: big endian
+
+ and then switches if the first four bytes of the input data are a byte order mark
+ (BOM) and the specified byte order is native order. This BOM is not copied into
+ the resulting Unicode string. After completion, *\*byteorder* is set to the
+ current byte order at the end of input data.
+
+ In a narrow build codepoints outside the BMP will be decoded as surrogate pairs.
+
+ If *byteorder* is *NULL*, the codec starts in native order mode.
+
+ Return *NULL* if an exception was raised by the codec.
+
+ .. versionadded:: 3.0
+
+
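A quick sketch of this BOM handling at the Python level (illustrative only, not part of the patch): the decoder consumes the mark and switches to the byte order it announces, so both inputs below should yield the same text.

    >>> b"\xff\xfe\x00\x00A\x00\x00\x00".decode("utf-32")   # little-endian BOM + U+0041
    'A'
    >>> b"\x00\x00\xfe\xff\x00\x00\x00A".decode("utf-32")   # big-endian BOM + U+0041
    'A'
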
+.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed)
+
+ If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If
+ *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat
+ trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible
+ by four) as an error. Those bytes will not be decoded and the number of bytes
+ that have been decoded will be stored in *consumed*.
+
+ .. versionadded:: 3.0
+
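At the Python level this stateful behaviour surfaces through the *final* argument of codecs.utf_32_decode; a sketch (expected values assume the little-endian stream shown):

    >>> import codecs
    >>> data = b"\xff\xfe\x00\x00" + b"A\x00\x00\x00" + b"B\x00"   # last code point truncated
    >>> codecs.utf_32_decode(data, "strict", False)   # final=False: the incomplete tail is left unconsumed
    ('A', 8)
    >>> codecs.utf_32_decode(data, "strict", True)    # final=True: raises UnicodeDecodeError ("truncated data")
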
+
+.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder)
+
+ Return a Python bytes object holding the UTF-32 encoded value of the Unicode
+ data in *s*. If *byteorder* is not ``0``, output is written according to the
+ following byte order::
+
+ byteorder == -1: little endian
+ byteorder == 0: native byte order (writes a BOM mark)
+ byteorder == 1: big endian
+
+ If *byteorder* is ``0``, the output string will always start with the Unicode BOM
+ mark (U+FEFF). In the other two modes, no BOM mark is prepended.
+
+ If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output
+ as a single codepoint.
+
+ Return *NULL* if an exception was raised by the codec.
+
+
+.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode)
+
+ Return a Python bytes object using the UTF-32 encoding in native byte order.
+ The string always starts with a BOM mark. Error handling is "strict". Return
+ *NULL* if an exception was raised by the codec.
+
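Roughly, at the Python level (a sketch; the exact BOM bytes depend on the platform's byte order):

    >>> "spam".encode("utf-32-le")
    b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
    >>> "spam".encode("utf-32-be")
    b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
    >>> "spam".encode("utf-32")[:4] in (b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff")   # native order starts with a BOM
    True
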
These are the UTF-16 codec APIs:
.. % --- UTF-16 Codecs ------------------------------------------------------ */
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index 38264df..7a035c2 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1089,6 +1089,12 @@ particular, the following variants typically exist:
| shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese |
| | s_jisx0213 | |
+-----------------+--------------------------------+--------------------------------+
+| utf_32 | U32, utf32 | all languages |
++-----------------+--------------------------------+--------------------------------+
+| utf_32_be | UTF-32BE | all languages |
++-----------------+--------------------------------+--------------------------------+
+| utf_32_le | UTF-32LE | all languages |
++-----------------+--------------------------------+--------------------------------+
| utf_16 | U16, utf16 | all languages |
+-----------------+--------------------------------+--------------------------------+
| utf_16_be | UTF-16BE | all languages (BMP only) |
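The UTF-32 codecs cover non-BMP code points without surrogates; a small sketch based on the tests added below (the second repr assumes a wide Unicode build; narrow builds store the result as a surrogate pair):

    >>> "\U00010203".encode("utf-32-be")
    b'\x00\x01\x02\x03'
    >>> b"\x00\x00\xfe\xff\x00\x01\x02\x03".decode("utf-32")
    '\U00010203'
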
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 5545344..4cde46a 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -138,6 +138,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
+# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
@@ -154,6 +155,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
+# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
+# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
@@ -165,6 +168,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
+# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
@@ -225,6 +229,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
+# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
@@ -241,6 +246,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
+# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
+# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
@@ -252,6 +259,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
+# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
@@ -749,6 +757,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
const char *errors /* error handling */
);
+/* --- UTF-32 Codecs ------------------------------------------------------ */
+
+/* Decodes length bytes from a UTF-32 encoded buffer string and returns
+ the corresponding Unicode object.
+
+ errors (if non-NULL) defines the error handling. It defaults
+ to "strict".
+
+ If byteorder is non-NULL, the decoder starts decoding using the
+ given byte order:
+
+ *byteorder == -1: little endian
+ *byteorder == 0: native order
+ *byteorder == 1: big endian
+
+ In native mode, the first four bytes of the stream are checked for a
+ BOM mark. If found, the BOM mark is analysed, the byte order
+ adjusted and the BOM skipped. In the other modes, no BOM mark
+ interpretation is done. After completion, *byteorder is set to the
+ current byte order at the end of input data.
+
+ If byteorder is NULL, the codec starts in native order mode.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
+ const char *string, /* UTF-32 encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ int *byteorder /* pointer to byteorder to use
+ 0=native;-1=LE,1=BE; updated on
+ exit */
+ );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
+ const char *string, /* UTF-32 encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ int *byteorder, /* pointer to byteorder to use
+ 0=native;-1=LE,1=BE; updated on
+ exit */
+ Py_ssize_t *consumed /* bytes consumed */
+ );
+
+/* Returns a Python string using the UTF-32 encoding in native byte
+ order. The string always starts with a BOM mark. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
+ PyObject *unicode /* Unicode object */
+ );
+
+/* Returns a Python string object holding the UTF-32 encoded value of
+ the Unicode data.
+
+ If byteorder is not 0, output is written according to the following
+ byte order:
+
+ byteorder == -1: little endian
+ byteorder == 0: native byte order (writes a BOM mark)
+ byteorder == 1: big endian
+
+ If byteorder is 0, the output string will always start with the
+ Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
+ prepended.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
+ const Py_UNICODE *data, /* Unicode char buffer */
+ Py_ssize_t length, /* number of Py_UNICODE chars to encode */
+ const char *errors, /* error handling */
+ int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
+ );
+
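As a sketch of the byteorder in/out contract described above, the _codecs-level helper added in this patch exposes it directly; the third element of the result is the byte order in effect at the end of the data:

    >>> import codecs
    >>> codecs.utf_32_ex_decode(b"\x00\x00\x00A", "strict", 1, True)    # caller forces big endian
    ('A', 4, 1)
    >>> codecs.utf_32_ex_decode(b"\xff\xfe\x00\x00A\x00\x00\x00", "strict", 0, True)   # native mode: the BOM decides
    ('A', 8, -1)
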
/* --- UTF-16 Codecs ------------------------------------------------------ */
/* Decodes length bytes from a UTF-16 encoded buffer string and returns
diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py
index cefb2ed..c6f5aeb 100644
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@@ -490,6 +490,16 @@ aliases = {
'unicodelittleunmarked' : 'utf_16_le',
'utf_16le' : 'utf_16_le',
+ # utf_32 codec
+ 'u32' : 'utf_32',
+ 'utf32' : 'utf_32',
+
+ # utf_32_be codec
+ 'utf_32be' : 'utf_32_be',
+
+ # utf_32_le codec
+ 'utf_32le' : 'utf_32_le',
+
# utf_7 codec
'u7' : 'utf_7',
'utf7' : 'utf_7',
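These aliases are consulted after the usual normalisation (lower-casing, punctuation folded to underscores), so the spellings listed in Doc/library/codecs.rst should all resolve; a sketch:

    >>> import codecs
    >>> codecs.lookup("U32").name, codecs.lookup("utf32").name
    ('utf-32', 'utf-32')
    >>> codecs.lookup("UTF-32BE").name, codecs.lookup("UTF-32LE").name
    ('utf-32-be', 'utf-32-le')
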
diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py
new file mode 100644
index 0000000..622f84b
--- /dev/null
+++ b/Lib/encodings/utf_32.py
@@ -0,0 +1,144 @@
+"""
+Python 'utf-32' Codec
+"""
+import codecs, sys
+
+### Codec APIs
+
+encode = codecs.utf_32_encode
+
+def decode(input, errors='strict'):
+ return codecs.utf_32_decode(input, errors, True)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def __init__(self, errors='strict'):
+ codecs.IncrementalEncoder.__init__(self, errors)
+ self.encoder = None
+
+ def encode(self, input, final=False):
+ if self.encoder is None:
+ result = codecs.utf_32_encode(input, self.errors)[0]
+ if sys.byteorder == 'little':
+ self.encoder = codecs.utf_32_le_encode
+ else:
+ self.encoder = codecs.utf_32_be_encode
+ return result
+ return self.encoder(input, self.errors)[0]
+
+ def reset(self):
+ codecs.IncrementalEncoder.reset(self)
+ self.encoder = None
+
+ def getstate(self):
+ # state info we return to the caller:
+ # 0: stream is in natural order for this platform
+ # 2: endianness hasn't been determined yet
+ # (we're never writing in unnatural order)
+ return (2 if self.encoder is None else 0)
+
+ def setstate(self, state):
+ if state:
+ self.encoder = None
+ else:
+ if sys.byteorder == 'little':
+ self.encoder = codecs.utf_32_le_encode
+ else:
+ self.encoder = codecs.utf_32_be_encode
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ def __init__(self, errors='strict'):
+ codecs.BufferedIncrementalDecoder.__init__(self, errors)
+ self.decoder = None
+
+ def _buffer_decode(self, input, errors, final):
+ if self.decoder is None:
+ (output, consumed, byteorder) = \
+ codecs.utf_32_ex_decode(input, errors, 0, final)
+ if byteorder == -1:
+ self.decoder = codecs.utf_32_le_decode
+ elif byteorder == 1:
+ self.decoder = codecs.utf_32_be_decode
+ elif consumed >= 4:
+ raise UnicodeError("UTF-32 stream does not start with BOM")
+ return (output, consumed)
+ return self.decoder(input, self.errors, final)
+
+ def reset(self):
+ codecs.BufferedIncrementalDecoder.reset(self)
+ self.decoder = None
+
+ def getstate(self):
+ # additional state info from the base class must be None here,
+ # as it isn't passed along to the caller
+ state = codecs.BufferedIncrementalDecoder.getstate(self)[0]
+ # additional state info we pass to the caller:
+ # 0: stream is in natural order for this platform
+ # 1: stream is in unnatural order
+ # 2: endianness hasn't been determined yet
+ if self.decoder is None:
+ return (state, 2)
+ addstate = int((sys.byteorder == "big") !=
+ (self.decoder is codecs.utf_32_be_decode))
+ return (state, addstate)
+
+ def setstate(self, state):
+ # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
+ codecs.BufferedIncrementalDecoder.setstate(self, state)
+ state = state[1]
+ if state == 0:
+ self.decoder = (codecs.utf_32_be_decode
+ if sys.byteorder == "big"
+ else codecs.utf_32_le_decode)
+ elif state == 1:
+ self.decoder = (codecs.utf_32_le_decode
+ if sys.byteorder == "big"
+ else codecs.utf_32_be_decode)
+ else:
+ self.decoder = None
+
+class StreamWriter(codecs.StreamWriter):
+ def __init__(self, stream, errors='strict'):
+ self.bom_written = False
+ codecs.StreamWriter.__init__(self, stream, errors)
+
+ def encode(self, input, errors='strict'):
+ self.bom_written = True
+ result = codecs.utf_32_encode(input, errors)
+ if sys.byteorder == 'little':
+ self.encode = codecs.utf_32_le_encode
+ else:
+ self.encode = codecs.utf_32_be_encode
+ return result
+
+class StreamReader(codecs.StreamReader):
+
+ def reset(self):
+ codecs.StreamReader.reset(self)
+ try:
+ del self.decode
+ except AttributeError:
+ pass
+
+ def decode(self, input, errors='strict'):
+ (object, consumed, byteorder) = \
+ codecs.utf_32_ex_decode(input, errors, 0, False)
+ if byteorder == -1:
+ self.decode = codecs.utf_32_le_decode
+ elif byteorder == 1:
+ self.decode = codecs.utf_32_be_decode
+ elif consumed >= 4:
+ raise UnicodeError("UTF-32 stream does not start with BOM")
+ return (object, consumed)
+
+### encodings module API
+
+def getregentry():
+ return codecs.CodecInfo(
+ name='utf-32',
+ encode=encode,
+ decode=decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+ )
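A sketch of how the incremental classes above behave in practice: the encoder emits the BOM only on its first call (getstate() drops from 2 to 0 once the byte order is fixed), and the buffered decoder holds back trailing bytes until a full code unit has arrived:

    >>> import codecs
    >>> enc = codecs.getincrementalencoder("utf-32")()
    >>> enc.getstate()          # 2: byte order not determined yet
    2
    >>> len(enc.encode("sp"))   # first chunk carries the 4-byte BOM
    12
    >>> enc.getstate()          # 0: locked to the platform's natural order
    0
    >>> len(enc.encode("am"))   # later chunks are BOM-free
    8
    >>> dec = codecs.getincrementaldecoder("utf-32")()
    >>> dec.decode(b"\xff\xfe\x00\x00A\x00")   # incomplete trailing bytes are buffered
    ''
    >>> dec.decode(b"\x00\x00")                # ...and completed by the next chunk
    'A'
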
diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py
new file mode 100644
index 0000000..fe272b5
--- /dev/null
+++ b/Lib/encodings/utf_32_be.py
@@ -0,0 +1,37 @@
+"""
+Python 'utf-32-be' Codec
+"""
+import codecs
+
+### Codec APIs
+
+encode = codecs.utf_32_be_encode
+
+def decode(input, errors='strict'):
+ return codecs.utf_32_be_decode(input, errors, True)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def encode(self, input, final=False):
+ return codecs.utf_32_be_encode(input, self.errors)[0]
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ _buffer_decode = codecs.utf_32_be_decode
+
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_32_be_encode
+
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_32_be_decode
+
+### encodings module API
+
+def getregentry():
+ return codecs.CodecInfo(
+ name='utf-32-be',
+ encode=encode,
+ decode=decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+ )
diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py
new file mode 100644
index 0000000..9e48210
--- /dev/null
+++ b/Lib/encodings/utf_32_le.py
@@ -0,0 +1,37 @@
+"""
+Python 'utf-32-le' Codec
+"""
+import codecs
+
+### Codec APIs
+
+encode = codecs.utf_32_le_encode
+
+def decode(input, errors='strict'):
+ return codecs.utf_32_le_decode(input, errors, True)
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def encode(self, input, final=False):
+ return codecs.utf_32_le_encode(input, self.errors)[0]
+
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ _buffer_decode = codecs.utf_32_le_decode
+
+class StreamWriter(codecs.StreamWriter):
+ encode = codecs.utf_32_le_encode
+
+class StreamReader(codecs.StreamReader):
+ decode = codecs.utf_32_le_decode
+
+### encodings module API
+
+def getregentry():
+ return codecs.CodecInfo(
+ name='utf-32-le',
+ encode=encode,
+ decode=decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamreader=StreamReader,
+ streamwriter=StreamWriter,
+ )
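As with the BE variant, this codec never writes a BOM and never interprets one on input; a stray mark simply decodes as U+FEFF (a sketch):

    >>> b"A\x00\x00\x00".decode("utf-32-le")
    'A'
    >>> b"\xff\xfe\x00\x00A\x00\x00\x00".decode("utf-32-le")   # the BOM is not special here
    '\ufeffA'
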
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index f76ec65..9b731d5 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase):
def test_longstrings(self):
# test long strings to check for memory overflow problems
- errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
+ errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
+ "backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
@@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase):
l = 1000
errors += [ "test." + err for err in errors ]
for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
- for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
+ for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
+ "utf-8", "utf-7", "utf-16", "utf-32"):
for err in errors:
try:
uni.encode(enc, err)
@@ -812,6 +814,7 @@ class CodecCallbackTest(unittest.TestCase):
("utf-7", b"++"),
("utf-8", b"\xff"),
("utf-16", b"\xff"),
+ ("utf-32", b"\xff"),
("unicode-escape", b"\\u123g"),
("raw-unicode-escape", b"\\u123g"),
("unicode-internal", b"\xff"),
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 89a3473..f2ee524 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -277,6 +277,143 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling):
self.assertEqual(reader.readline(), s5)
self.assertEqual(reader.readline(), "")
+class UTF32Test(ReadTest):
+ encoding = "utf-32"
+
+ spamle = (b'\xff\xfe\x00\x00'
+ b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
+ b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
+ spambe = (b'\x00\x00\xfe\xff'
+ b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
+ b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')
+
+ def test_only_one_bom(self):
+ _,_,reader,writer = codecs.lookup(self.encoding)
+ # encode some stream
+ s = io.BytesIO()
+ f = writer(s)
+ f.write("spam")
+ f.write("spam")
+ d = s.getvalue()
+ # check whether there is exactly one BOM in it
+ self.assert_(d == self.spamle or d == self.spambe)
+ # try to read it back
+ s = io.BytesIO(d)
+ f = reader(s)
+ self.assertEquals(f.read(), "spamspam")
+
+ def test_badbom(self):
+ s = io.BytesIO(4*b"\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ s = io.BytesIO(8*b"\xff")
+ f = codecs.getreader(self.encoding)(s)
+ self.assertRaises(UnicodeError, f.read)
+
+ def test_partial(self):
+ self.check_partial(
+ "\x00\xff\u0100\uffff",
+ [
+ "", # first byte of BOM read
+ "", # second byte of BOM read
+ "", # third byte of BOM read
+ "", # fourth byte of BOM read => byteorder known
+ "",
+ "",
+ "",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
+ b"\xff", "strict", True)
+
+ def test_decoder_state(self):
+ self.check_state_handling_decode(self.encoding,
+ "spamspam", self.spamle)
+ self.check_state_handling_decode(self.encoding,
+ "spamspam", self.spambe)
+
+class UTF32LETest(ReadTest):
+ encoding = "utf-32-le"
+
+ def test_partial(self):
+ self.check_partial(
+ "\x00\xff\u0100\uffff",
+ [
+ "",
+ "",
+ "",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_simple(self):
+ self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
+ b"\xff", "strict", True)
+
+class UTF32BETest(ReadTest):
+ encoding = "utf-32-be"
+
+ def test_partial(self):
+ self.check_partial(
+ "\x00\xff\u0100\uffff",
+ [
+ "",
+ "",
+ "",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100",
+ "\x00\xff\u0100\uffff",
+ ]
+ )
+
+ def test_simple(self):
+ self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")
+
+ def test_errors(self):
+ self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
+ b"\xff", "strict", True)
+
class UTF16Test(ReadTest):
encoding = "utf-16"
@@ -1284,6 +1421,9 @@ class WithStmtTest(unittest.TestCase):
def test_main():
test_support.run_unittest(
+ UTF32Test,
+ UTF32LETest,
+ UTF32BETest,
UTF16Test,
UTF16LETest,
UTF16BETest,
diff --git a/Misc/NEWS b/Misc/NEWS
index 0745f8d..f00c4c7 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -213,6 +213,8 @@ Library
- Patch #1680961: atexit has been reimplemented in C.
+- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE.
+
Build
-----
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index c500073..e3933e7 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -413,6 +413,126 @@ utf_16_ex_decode(PyObject *self,
}
static PyObject *
+utf_32_decode(PyObject *self,
+ PyObject *args)
+{
+ const char *data;
+ Py_ssize_t size;
+ const char *errors = NULL;
+ int byteorder = 0;
+ int final = 0;
+ Py_ssize_t consumed;
+ PyObject *decoded;
+
+ if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
+ &data, &size, &errors, &final))
+ return NULL;
+ if (size < 0) {
+ PyErr_SetString(PyExc_ValueError, "negative argument");
+ return 0;
+ }
+ consumed = size; /* This is overwritten unless final is true. */
+ decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
+ final ? NULL : &consumed);
+ if (decoded == NULL)
+ return NULL;
+ return codec_tuple(decoded, consumed);
+}
+
+static PyObject *
+utf_32_le_decode(PyObject *self,
+ PyObject *args)
+{
+ const char *data;
+ Py_ssize_t size;
+ const char *errors = NULL;
+ int byteorder = -1;
+ int final = 0;
+ Py_ssize_t consumed;
+ PyObject *decoded = NULL;
+
+ if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
+ &data, &size, &errors, &final))
+ return NULL;
+
+ if (size < 0) {
+ PyErr_SetString(PyExc_ValueError, "negative argument");
+ return 0;
+ }
+ consumed = size; /* This is overwritten unless final is true. */
+ decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
+ &byteorder, final ? NULL : &consumed);
+ if (decoded == NULL)
+ return NULL;
+ return codec_tuple(decoded, consumed);
+
+}
+
+static PyObject *
+utf_32_be_decode(PyObject *self,
+ PyObject *args)
+{
+ const char *data;
+ Py_ssize_t size;
+ const char *errors = NULL;
+ int byteorder = 1;
+ int final = 0;
+ Py_ssize_t consumed;
+ PyObject *decoded = NULL;
+
+ if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
+ &data, &size, &errors, &final))
+ return NULL;
+ if (size < 0) {
+ PyErr_SetString(PyExc_ValueError, "negative argument");
+ return 0;
+ }
+ consumed = size; /* This is overwritten unless final is true. */
+ decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
+ &byteorder, final ? NULL : &consumed);
+ if (decoded == NULL)
+ return NULL;
+ return codec_tuple(decoded, consumed);
+}
+
+/* This non-standard version also provides access to the byteorder
+ parameter of the builtin UTF-32 codec.
+
+ It returns a tuple (unicode, bytesread, byteorder) with byteorder
+ being the value in effect at the end of data.
+
+*/
+
+static PyObject *
+utf_32_ex_decode(PyObject *self,
+ PyObject *args)
+{
+ const char *data;
+ Py_ssize_t size;
+ const char *errors = NULL;
+ int byteorder = 0;
+ PyObject *unicode, *tuple;
+ int final = 0;
+ Py_ssize_t consumed;
+
+ if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
+ &data, &size, &errors, &byteorder, &final))
+ return NULL;
+ if (size < 0) {
+ PyErr_SetString(PyExc_ValueError, "negative argument");
+ return 0;
+ }
+ consumed = size; /* This is overwritten unless final is true. */
+ unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder,
+ final ? NULL : &consumed);
+ if (unicode == NULL)
+ return NULL;
+ tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
+ Py_DECREF(unicode);
+ return tuple;
+}
+
+static PyObject *
unicode_escape_decode(PyObject *self,
PyObject *args)
{
@@ -700,6 +820,83 @@ utf_16_be_encode(PyObject *self,
return v;
}
+/* This version provides access to the byteorder parameter of the
+ builtin UTF-32 codecs as optional third argument. It defaults to 0
+ which means: use the native byte order and prepend the data with a
+ BOM mark.
+
+*/
+
+static PyObject *
+utf_32_encode(PyObject *self,
+ PyObject *args)
+{
+ PyObject *str, *v;
+ const char *errors = NULL;
+ int byteorder = 0;
+
+ if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
+ &str, &errors, &byteorder))
+ return NULL;
+
+ str = PyUnicode_FromObject(str);
+ if (str == NULL)
+ return NULL;
+ v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
+ PyUnicode_GET_SIZE(str),
+ errors,
+ byteorder),
+ PyUnicode_GET_SIZE(str));
+ Py_DECREF(str);
+ return v;
+}
+
+static PyObject *
+utf_32_le_encode(PyObject *self,
+ PyObject *args)
+{
+ PyObject *str, *v;
+ const char *errors = NULL;
+
+ if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
+ &str, &errors))
+ return NULL;
+
+ str = PyUnicode_FromObject(str);
+ if (str == NULL)
+ return NULL;
+ v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
+ PyUnicode_GET_SIZE(str),
+ errors,
+ -1),
+ PyUnicode_GET_SIZE(str));
+ Py_DECREF(str);
+ return v;
+}
+
+static PyObject *
+utf_32_be_encode(PyObject *self,
+ PyObject *args)
+{
+ PyObject *str, *v;
+ const char *errors = NULL;
+
+ if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
+ &str, &errors))
+ return NULL;
+
+ str = PyUnicode_FromObject(str);
+ if (str == NULL)
+ return NULL;
+ v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
+ PyUnicode_GET_SIZE(str),
+ errors,
+ +1),
+ PyUnicode_GET_SIZE(str));
+ Py_DECREF(str);
+ return v;
+}
+
static PyObject *
unicode_escape_encode(PyObject *self,
PyObject *args)
@@ -916,6 +1113,13 @@ static PyMethodDef _codecs_functions[] = {
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
+ {"utf_32_encode", utf_32_encode, METH_VARARGS},
+ {"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
+ {"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
+ {"utf_32_decode", utf_32_decode, METH_VARARGS},
+ {"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
+ {"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
+ {"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index b345986..54fe16c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1992,6 +1992,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
NULL);
}
+/* --- UTF-32 Codec ------------------------------------------------------- */
+
+PyObject *
+PyUnicode_DecodeUTF32(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ int *byteorder)
+{
+ return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
+}
+
+PyObject *
+PyUnicode_DecodeUTF32Stateful(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ int *byteorder,
+ Py_ssize_t *consumed)
+{
+ const char *starts = s;
+ Py_ssize_t startinpos;
+ Py_ssize_t endinpos;
+ Py_ssize_t outpos;
+ PyUnicodeObject *unicode;
+ Py_UNICODE *p;
+#ifndef Py_UNICODE_WIDE
+ int i, pairs;
+#else
+ const int pairs = 0;
+#endif
+ const unsigned char *q, *e;
+ int bo = 0; /* assume native ordering by default */
+ const char *errmsg = "";
+ /* On narrow builds we split characters outside the BMP into two
+ codepoints => count how much extra space we need. */
+#ifndef Py_UNICODE_WIDE
+ for (i = pairs = 0; i < size/4; i++)
+ if (((Py_UCS4 *)s)[i] >= 0x10000)
+ pairs++;
+#endif
+ /* Offsets from q for retrieving bytes in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ int iorder[] = {0, 1, 2, 3};
+#else
+ int iorder[] = {3, 2, 1, 0};
+#endif
+ PyObject *errorHandler = NULL;
+ PyObject *exc = NULL;
+
+ /* This might be one too many, because of a BOM */
+ unicode = _PyUnicode_New((size+3)/4+pairs);
+ if (!unicode)
+ return NULL;
+ if (size == 0)
+ return (PyObject *)unicode;
+
+ /* Unpack UTF-32 encoded data */
+ p = unicode->str;
+ q = (unsigned char *)s;
+ e = q + size;
+
+ if (byteorder)
+ bo = *byteorder;
+
+ /* Check for BOM marks (U+FEFF) in the input and adjust current
+ byte order setting accordingly. In native mode, the leading BOM
+ mark is skipped, in all other modes, it is copied to the output
+ stream as-is (giving a ZWNBSP character). */
+ if (bo == 0) {
+ if (size >= 4) {
+ const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
+ (q[iorder[1]] << 8) | q[iorder[0]];
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ if (bom == 0x0000FEFF) {
+ q += 4;
+ bo = -1;
+ }
+ else if (bom == 0xFFFE0000) {
+ q += 4;
+ bo = 1;
+ }
+#else
+ if (bom == 0x0000FEFF) {
+ q += 4;
+ bo = 1;
+ }
+ else if (bom == 0xFFFE0000) {
+ q += 4;
+ bo = -1;
+ }
+#endif
+ }
+ }
+
+ if (bo == -1) {
+ /* force LE */
+ iorder[0] = 0;
+ iorder[1] = 1;
+ iorder[2] = 2;
+ iorder[3] = 3;
+ }
+ else if (bo == 1) {
+ /* force BE */
+ iorder[0] = 3;
+ iorder[1] = 2;
+ iorder[2] = 1;
+ iorder[3] = 0;
+ }
+
+ while (q < e) {
+ Py_UCS4 ch;
+ /* remaining bytes at the end? (size should be divisible by 4) */
+ if (e-q<4) {
+ if (consumed)
+ break;
+ errmsg = "truncated data";
+ startinpos = ((const char *)q)-starts;
+ endinpos = ((const char *)e)-starts;
+ goto utf32Error;
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
+ }
+ ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
+ (q[iorder[1]] << 8) | q[iorder[0]];
+
+ if (ch >= 0x110000)
+ {
+ errmsg = "codepoint not in range(0x110000)";
+ startinpos = ((const char *)q)-starts;
+ endinpos = startinpos+4;
+ goto utf32Error;
+ }
+#ifndef Py_UNICODE_WIDE
+ if (ch >= 0x10000)
+ {
+ *p++ = 0xD800 | ((ch-0x10000) >> 10);
+ *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
+ }
+ else
+#endif
+ *p++ = ch;
+ q += 4;
+ continue;
+ utf32Error:
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf32", errmsg,
+ &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
+ (PyObject **)&unicode, &outpos, &p))
+ goto onError;
+ }
+
+ if (byteorder)
+ *byteorder = bo;
+
+ if (consumed)
+ *consumed = (const char *)q-starts;
+
+ /* Adjust length */
+ if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+ goto onError;
+
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ return (PyObject *)unicode;
+
+onError:
+ Py_DECREF(unicode);
+ Py_XDECREF(errorHandler);
+ Py_XDECREF(exc);
+ return NULL;
+}
+
+PyObject *
+PyUnicode_EncodeUTF32(const Py_UNICODE *s,
+ Py_ssize_t size,
+ const char *errors,
+ int byteorder)
+{
+ PyObject *v;
+ unsigned char *p;
+#ifndef Py_UNICODE_WIDE
+ int i, pairs;
+#else
+ const int pairs = 0;
+#endif
+ /* Offsets from p for storing byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+ int iorder[] = {0, 1, 2, 3};
+#else
+ int iorder[] = {3, 2, 1, 0};
+#endif
+
+#define STORECHAR(CH) \
+ do { \
+ p[iorder[3]] = ((CH) >> 24) & 0xff; \
+ p[iorder[2]] = ((CH) >> 16) & 0xff; \
+ p[iorder[1]] = ((CH) >> 8) & 0xff; \
+ p[iorder[0]] = (CH) & 0xff; \
+ p += 4; \
+ } while(0)
+
+ /* In narrow builds we can output surrogate pairs as one codepoint,
+ so we need less space. */
+#ifndef Py_UNICODE_WIDE
+ for (i = pairs = 0; i < size-1; i++)
+ if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
+ 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
+ pairs++;
+#endif
+ v = PyBytes_FromStringAndSize(NULL,
+ 4 * (size - pairs + (byteorder == 0)));
+ if (v == NULL)
+ return NULL;
+
+ p = (unsigned char *)PyBytes_AS_STRING(v);
+ if (byteorder == 0)
+ STORECHAR(0xFEFF);
+ if (size == 0)
+ return v;
+
+ if (byteorder == -1) {
+ /* force LE */
+ iorder[0] = 0;
+ iorder[1] = 1;
+ iorder[2] = 2;
+ iorder[3] = 3;
+ }
+ else if (byteorder == 1) {
+ /* force BE */
+ iorder[0] = 3;
+ iorder[1] = 2;
+ iorder[2] = 1;
+ iorder[3] = 0;
+ }
+
+ while (size-- > 0) {
+ Py_UCS4 ch = *s++;
+#ifndef Py_UNICODE_WIDE
+ if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+ Py_UCS4 ch2 = *s;
+ if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ s++;
+ size--;
+ }
+ }
+#endif
+ STORECHAR(ch);
+ }
+ return v;
+#undef STORECHAR
+}
+
+PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
+{
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
+ return NULL;
+ }
+ return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
+ PyUnicode_GET_SIZE(unicode),
+ NULL,
+ 0);
+}
+
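The error paths of the decoder above surface as UnicodeDecodeError at the Python level; a sketch of the two messages it can produce, and of a non-strict handler:

    >>> import codecs
    >>> codecs.utf_32_decode(b"\xff", "strict", True)    # raises UnicodeDecodeError: truncated data
    >>> b"\x00\x00\x11\x00".decode("utf-32-le")          # raises UnicodeDecodeError: codepoint not in range(0x110000)
    >>> b"\x00\x00\x11\x00".decode("utf-32-le", "replace")
    '\ufffd'
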
/* --- UTF-16 Codec ------------------------------------------------------- */
PyObject *