From 41980caf644163f1ff74a793b30f1c424eeede82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Walter=20D=C3=B6rwald?= Date: Thu, 16 Aug 2007 21:55:45 +0000 Subject: Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and ut-32-be). On narrow builds the codecs combine surrogate pairs in the unicode object into one codepoint on encoding and create surrogate pairs for codepoints outside the BMP on decoding. Lone surrogates are passed through unchanged in all cases. Backport to the trunk will follow. --- Doc/c-api/concrete.rst | 68 ++++++++++ Doc/library/codecs.rst | 6 + Include/unicodeobject.h | 82 +++++++++++++ Lib/encodings/aliases.py | 10 ++ Lib/encodings/utf_32.py | 144 ++++++++++++++++++++++ Lib/encodings/utf_32_be.py | 37 ++++++ Lib/encodings/utf_32_le.py | 37 ++++++ Lib/test/test_codeccallbacks.py | 7 +- Lib/test/test_codecs.py | 140 +++++++++++++++++++++ Misc/NEWS | 2 + Modules/_codecsmodule.c | 204 ++++++++++++++++++++++++++++++ Objects/unicodeobject.c | 266 ++++++++++++++++++++++++++++++++++++++++ 12 files changed, 1001 insertions(+), 2 deletions(-) create mode 100644 Lib/encodings/utf_32.py create mode 100644 Lib/encodings/utf_32_be.py create mode 100644 Lib/encodings/utf_32_le.py diff --git a/Doc/c-api/concrete.rst b/Doc/c-api/concrete.rst index bc812c2..eda56a5 100644 --- a/Doc/c-api/concrete.rst +++ b/Doc/c-api/concrete.rst @@ -1405,6 +1405,74 @@ These are the UTF-8 codec APIs: object. Error handling is "strict". Return *NULL* if an exception was raised by the codec. +These are the UTF-32 codec APIs: + +.. % --- UTF-32 Codecs ------------------------------------------------------ */ + + +.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder) + + Decode *length* bytes from a UTF-32 encoded buffer string and return the + corresponding Unicode object. *errors* (if non-*NULL*) defines the error + handling. It defaults to "strict". + + If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte + order:: + + *byteorder == -1: little endian + *byteorder == 0: native order + *byteorder == 1: big endian + + and then switches if the first four bytes of the input data are a byte order mark + (BOM) and the specified byte order is native order. This BOM is not copied into + the resulting Unicode string. After completion, *\*byteorder* is set to the + current byte order at the end of input data. + + In a narrow build codepoints outside the BMP will be decoded as surrogate pairs. + + If *byteorder* is *NULL*, the codec starts in native order mode. + + Return *NULL* if an exception was raised by the codec. + + .. versionadded:: 3.0 + + +.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed) + + If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If + *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat + trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible + by four) as an error. Those bytes will not be decoded and the number of bytes + that have been decoded will be stored in *consumed*. + + .. versionadded:: 3.0 + + +.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder) + + Return a Python bytes object holding the UTF-32 encoded value of the Unicode + data in *s*. If *byteorder* is not ``0``, output is written according to the + following byte order:: + + byteorder == -1: little endian + byteorder == 0: native byte order (writes a BOM mark) + byteorder == 1: big endian + + If byteorder is ``0``, the output string will always start with the Unicode BOM + mark (U+FEFF). In the other two modes, no BOM mark is prepended. + + If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output + as a single codepoint. + + Return *NULL* if an exception was raised by the codec. + + +.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode) + + Return a Python string using the UTF-32 encoding in native byte order. The + string always starts with a BOM mark. Error handling is "strict". Return + *NULL* if an exception was raised by the codec. + These are the UTF-16 codec APIs: .. % --- UTF-16 Codecs ------------------------------------------------------ */ diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 38264df..7a035c2 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1089,6 +1089,12 @@ particular, the following variants typically exist: | shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese | | | s_jisx0213 | | +-----------------+--------------------------------+--------------------------------+ +| utf_32 | U32, utf32 | all languages | ++-----------------+--------------------------------+--------------------------------+ +| utf_32_be | UTF-32BE | all languages | ++-----------------+--------------------------------+--------------------------------+ +| utf_32_le | UTF-32LE | all languages | ++-----------------+--------------------------------+--------------------------------+ | utf_16 | U16, utf16 | all languages | +-----------------+--------------------------------+--------------------------------+ | utf_16_be | UTF-16BE | all languages (BMP only) | diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 5545344..4cde46a 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -138,6 +138,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString +# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode @@ -154,6 +155,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape +# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 +# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 @@ -165,6 +168,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape +# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape @@ -225,6 +229,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString +# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode @@ -241,6 +246,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape +# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32 +# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 @@ -252,6 +259,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape +# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape @@ -749,6 +757,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( const char *errors /* error handling */ ); +/* --- UTF-32 Codecs ------------------------------------------------------ */ + +/* Decodes length bytes from a UTF-32 encoded buffer string and returns + the corresponding Unicode object. + + errors (if non-NULL) defines the error handling. It defaults + to "strict". + + If byteorder is non-NULL, the decoder starts decoding using the + given byte order: + + *byteorder == -1: little endian + *byteorder == 0: native order + *byteorder == 1: big endian + + In native mode, the first four bytes of the stream are checked for a + BOM mark. If found, the BOM mark is analysed, the byte order + adjusted and the BOM skipped. In the other modes, no BOM mark + interpretation is done. After completion, *byteorder is set to the + current byte order at the end of input data. + + If byteorder is NULL, the codec starts in native order mode. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( + const char *string, /* UTF-32 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( + const char *string, /* UTF-32 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder, /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + Py_ssize_t *consumed /* bytes consumed */ + ); + +/* Returns a Python string using the UTF-32 encoding in native byte + order. The string always starts with a BOM mark. */ + +PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( + PyObject *unicode /* Unicode object */ + ); + +/* Returns a Python string object holding the UTF-32 encoded value of + the Unicode data. + + If byteorder is not 0, output is written according to the following + byte order: + + byteorder == -1: little endian + byteorder == 0: native byte order (writes a BOM mark) + byteorder == 1: big endian + + If byteorder is 0, the output string will always start with the + Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is + prepended. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( + const Py_UNICODE *data, /* Unicode char buffer */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ + const char *errors, /* error handling */ + int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ + ); + /* --- UTF-16 Codecs ------------------------------------------------------ */ /* Decodes length bytes from a UTF-16 encoded buffer string and returns diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index cefb2ed..c6f5aeb 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -490,6 +490,16 @@ aliases = { 'unicodelittleunmarked' : 'utf_16_le', 'utf_16le' : 'utf_16_le', + # utf_32 codec + 'u32' : 'utf_32', + 'utf32' : 'utf_32', + + # utf_32_be codec + 'utf_32be' : 'utf_32_be', + + # utf_32_le codec + 'utf_32le' : 'utf_32_le', + # utf_7 codec 'u7' : 'utf_7', 'utf7' : 'utf_7', diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py new file mode 100644 index 0000000..622f84b --- /dev/null +++ b/Lib/encodings/utf_32.py @@ -0,0 +1,144 @@ +""" +Python 'utf-32' Codec +""" +import codecs, sys + +### Codec APIs + +encode = codecs.utf_32_encode + +def decode(input, errors='strict'): + return codecs.utf_32_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def __init__(self, errors='strict'): + codecs.IncrementalEncoder.__init__(self, errors) + self.encoder = None + + def encode(self, input, final=False): + if self.encoder is None: + result = codecs.utf_32_encode(input, self.errors)[0] + if sys.byteorder == 'little': + self.encoder = codecs.utf_32_le_encode + else: + self.encoder = codecs.utf_32_be_encode + return result + return self.encoder(input, self.errors)[0] + + def reset(self): + codecs.IncrementalEncoder.reset(self) + self.encoder = None + + def getstate(self): + # state info we return to the caller: + # 0: stream is in natural order for this platform + # 2: endianness hasn't been determined yet + # (we're never writing in unnatural order) + return (2 if self.encoder is None else 0) + + def setstate(self, state): + if state: + self.encoder = None + else: + if sys.byteorder == 'little': + self.encoder = codecs.utf_32_le_encode + else: + self.encoder = codecs.utf_32_be_encode + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def __init__(self, errors='strict'): + codecs.BufferedIncrementalDecoder.__init__(self, errors) + self.decoder = None + + def _buffer_decode(self, input, errors, final): + if self.decoder is None: + (output, consumed, byteorder) = \ + codecs.utf_32_ex_decode(input, errors, 0, final) + if byteorder == -1: + self.decoder = codecs.utf_32_le_decode + elif byteorder == 1: + self.decoder = codecs.utf_32_be_decode + elif consumed >= 4: + raise UnicodeError("UTF-32 stream does not start with BOM") + return (output, consumed) + return self.decoder(input, self.errors, final) + + def reset(self): + codecs.BufferedIncrementalDecoder.reset(self) + self.decoder = None + + def getstate(self): + # additonal state info from the base class must be None here, + # as it isn't passed along to the caller + state = codecs.BufferedIncrementalDecoder.getstate(self)[0] + # additional state info we pass to the caller: + # 0: stream is in natural order for this platform + # 1: stream is in unnatural order + # 2: endianness hasn't been determined yet + if self.decoder is None: + return (state, 2) + addstate = int((sys.byteorder == "big") != + (self.decoder is codecs.utf_32_be_decode)) + return (state, addstate) + + def setstate(self, state): + # state[1] will be ignored by BufferedIncrementalDecoder.setstate() + codecs.BufferedIncrementalDecoder.setstate(self, state) + state = state[1] + if state == 0: + self.decoder = (codecs.utf_32_be_decode + if sys.byteorder == "big" + else codecs.utf_32_le_decode) + elif state == 1: + self.decoder = (codecs.utf_32_le_decode + if sys.byteorder == "big" + else codecs.utf_32_be_decode) + else: + self.decoder = None + +class StreamWriter(codecs.StreamWriter): + def __init__(self, stream, errors='strict'): + self.bom_written = False + codecs.StreamWriter.__init__(self, stream, errors) + + def encode(self, input, errors='strict'): + self.bom_written = True + result = codecs.utf_32_encode(input, errors) + if sys.byteorder == 'little': + self.encode = codecs.utf_32_le_encode + else: + self.encode = codecs.utf_32_be_encode + return result + +class StreamReader(codecs.StreamReader): + + def reset(self): + codecs.StreamReader.reset(self) + try: + del self.decode + except AttributeError: + pass + + def decode(self, input, errors='strict'): + (object, consumed, byteorder) = \ + codecs.utf_32_ex_decode(input, errors, 0, False) + if byteorder == -1: + self.decode = codecs.utf_32_le_decode + elif byteorder == 1: + self.decode = codecs.utf_32_le_decode + elif consumed>=4: + raise UnicodeError,"UTF-32 stream does not start with BOM" + return (object, consumed) + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-32', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py new file mode 100644 index 0000000..fe272b5 --- /dev/null +++ b/Lib/encodings/utf_32_be.py @@ -0,0 +1,37 @@ +""" +Python 'utf-32-be' Codec +""" +import codecs + +### Codec APIs + +encode = codecs.utf_32_be_encode + +def decode(input, errors='strict'): + return codecs.utf_32_be_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.utf_32_be_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = codecs.utf_32_be_decode + +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_32_be_encode + +class StreamReader(codecs.StreamReader): + decode = codecs.utf_32_be_decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-32-be', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py new file mode 100644 index 0000000..9e48210 --- /dev/null +++ b/Lib/encodings/utf_32_le.py @@ -0,0 +1,37 @@ +""" +Python 'utf-32-le' Codec +""" +import codecs + +### Codec APIs + +encode = codecs.utf_32_le_encode + +def decode(input, errors='strict'): + return codecs.utf_32_le_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.utf_32_le_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = codecs.utf_32_le_decode + +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_32_le_encode + +class StreamReader(codecs.StreamReader): + decode = codecs.utf_32_le_decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-32-le', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index f76ec65..9b731d5 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase): def test_longstrings(self): # test long strings to check for memory overflow problems - errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] + errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", + "backslashreplace"] # register the handlers under different names, # to prevent the codec from recognizing the name for err in errors: @@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase): l = 1000 errors += [ "test." + err for err in errors ] for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]: - for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): + for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", + "utf-8", "utf-7", "utf-16", "utf-32"): for err in errors: try: uni.encode(enc, err) @@ -812,6 +814,7 @@ class CodecCallbackTest(unittest.TestCase): ("utf-7", b"++"), ("utf-8", b"\xff"), ("utf-16", b"\xff"), + ("utf-32", b"\xff"), ("unicode-escape", b"\\u123g"), ("raw-unicode-escape", b"\\u123g"), ("unicode-internal", b"\xff"), diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 89a3473..f2ee524 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -277,6 +277,143 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling): self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), "") +class UTF32Test(ReadTest): + encoding = "utf-32" + + spamle = (b'\xff\xfe\x00\x00' + b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' + b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') + spambe = (b'\x00\x00\xfe\xff' + b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' + b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') + + def test_only_one_bom(self): + _,_,reader,writer = codecs.lookup(self.encoding) + # encode some stream + s = io.BytesIO() + f = writer(s) + f.write("spam") + f.write("spam") + d = s.getvalue() + # check whether there is exactly one BOM in it + self.assert_(d == self.spamle or d == self.spambe) + # try to read it back + s = io.BytesIO(d) + f = reader(s) + self.assertEquals(f.read(), "spamspam") + + def test_badbom(self): + s = io.BytesIO(4*b"\xff") + f = codecs.getreader(self.encoding)(s) + self.assertRaises(UnicodeError, f.read) + + s = io.BytesIO(8*b"\xff") + f = codecs.getreader(self.encoding)(s) + self.assertRaises(UnicodeError, f.read) + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", # first byte of BOM read + "", # second byte of BOM read + "", # third byte of BOM read + "", # fourth byte of BOM read => byteorder known + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, + b"\xff", "strict", True) + + def test_decoder_state(self): + self.check_state_handling_decode(self.encoding, + "spamspam", self.spamle) + self.check_state_handling_decode(self.encoding, + "spamspam", self.spambe) + +class UTF32LETest(ReadTest): + encoding = "utf-32-le" + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_simple(self): + self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, + b"\xff", "strict", True) + +class UTF32BETest(ReadTest): + encoding = "utf-32-be" + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_simple(self): + self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, + b"\xff", "strict", True) + class UTF16Test(ReadTest): encoding = "utf-16" @@ -1284,6 +1421,9 @@ class WithStmtTest(unittest.TestCase): def test_main(): test_support.run_unittest( + UTF32Test, + UTF32LETest, + UTF32BETest, UTF16Test, UTF16LETest, UTF16BETest, diff --git a/Misc/NEWS b/Misc/NEWS index 0745f8d..f00c4c7 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -213,6 +213,8 @@ Library - Patch #1680961: atexit has been reimplemented in C. +- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE. + Build ----- diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index c500073..e3933e7 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -413,6 +413,126 @@ utf_16_ex_decode(PyObject *self, } static PyObject * +utf_32_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 0; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode", + &data, &size, &errors, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, + final ? NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + +static PyObject * +utf_32_le_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = -1; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode", + &data, &size, &errors, &final)) + return NULL; + + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, + &byteorder, final ? NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); + +} + +static PyObject * +utf_32_be_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 1; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode", + &data, &size, &errors, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, + &byteorder, final ? NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + +/* This non-standard version also provides access to the byteorder + parameter of the builtin UTF-32 codec. + + It returns a tuple (unicode, bytesread, byteorder) with byteorder + being the value in effect at the end of data. + +*/ + +static PyObject * +utf_32_ex_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 0; + PyObject *unicode, *tuple; + int final = 0; + Py_ssize_t consumed; + + if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode", + &data, &size, &errors, &byteorder, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, + final ? NULL : &consumed); + if (unicode == NULL) + return NULL; + tuple = Py_BuildValue("Oni", unicode, consumed, byteorder); + Py_DECREF(unicode); + return tuple; +} + +static PyObject * unicode_escape_decode(PyObject *self, PyObject *args) { @@ -700,6 +820,83 @@ utf_16_be_encode(PyObject *self, return v; } +/* This version provides access to the byteorder parameter of the + builtin UTF-32 codecs as optional third argument. It defaults to 0 + which means: use the native byte order and prepend the data with a + BOM mark. + +*/ + +static PyObject * +utf_32_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + int byteorder = 0; + + if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode", + &str, &errors, &byteorder)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + byteorder), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + +static PyObject * +utf_32_le_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + -1), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + +static PyObject * +utf_32_be_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + +1), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + static PyObject * unicode_escape_encode(PyObject *self, PyObject *args) @@ -916,6 +1113,13 @@ static PyMethodDef _codecs_functions[] = { {"utf_16_le_decode", utf_16_le_decode, METH_VARARGS}, {"utf_16_be_decode", utf_16_be_decode, METH_VARARGS}, {"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS}, + {"utf_32_encode", utf_32_encode, METH_VARARGS}, + {"utf_32_le_encode", utf_32_le_encode, METH_VARARGS}, + {"utf_32_be_encode", utf_32_be_encode, METH_VARARGS}, + {"utf_32_decode", utf_32_decode, METH_VARARGS}, + {"utf_32_le_decode", utf_32_le_decode, METH_VARARGS}, + {"utf_32_be_decode", utf_32_be_decode, METH_VARARGS}, + {"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS}, {"unicode_escape_encode", unicode_escape_encode, METH_VARARGS}, {"unicode_escape_decode", unicode_escape_decode, METH_VARARGS}, {"unicode_internal_encode", unicode_internal_encode, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b345986..54fe16c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1992,6 +1992,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) NULL); } +/* --- UTF-32 Codec ------------------------------------------------------- */ + +PyObject * +PyUnicode_DecodeUTF32(const char *s, + Py_ssize_t size, + const char *errors, + int *byteorder) +{ + return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); +} + +PyObject * +PyUnicode_DecodeUTF32Stateful(const char *s, + Py_ssize_t size, + const char *errors, + int *byteorder, + Py_ssize_t *consumed) +{ + const char *starts = s; + Py_ssize_t startinpos; + Py_ssize_t endinpos; + Py_ssize_t outpos; + PyUnicodeObject *unicode; + Py_UNICODE *p; +#ifndef Py_UNICODE_WIDE + int i, pairs; +#else + const int pairs = 0; +#endif + const unsigned char *q, *e; + int bo = 0; /* assume native ordering by default */ + const char *errmsg = ""; + /* On narrow builds we split characters outside the BMP into two + codepoints => count how much extra space we need. */ +#ifndef Py_UNICODE_WIDE + for (i = pairs = 0; i < size/4; i++) + if (((Py_UCS4 *)s)[i] >= 0x10000) + pairs++; +#endif + /* Offsets from q for retrieving bytes in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + int iorder[] = {0, 1, 2, 3}; +#else + int iorder[] = {3, 2, 1, 0}; +#endif + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + + /* This might be one to much, because of a BOM */ + unicode = _PyUnicode_New((size+3)/4+pairs); + if (!unicode) + return NULL; + if (size == 0) + return (PyObject *)unicode; + + /* Unpack UTF-32 encoded data */ + p = unicode->str; + q = (unsigned char *)s; + e = q + size; + + if (byteorder) + bo = *byteorder; + + /* Check for BOM marks (U+FEFF) in the input and adjust current + byte order setting accordingly. In native mode, the leading BOM + mark is skipped, in all other modes, it is copied to the output + stream as-is (giving a ZWNBSP character). */ + if (bo == 0) { + if (size >= 4) { + const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | + (q[iorder[1]] << 8) | q[iorder[0]]; +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + if (bom == 0x0000FEFF) { + q += 4; + bo = -1; + } + else if (bom == 0xFFFE0000) { + q += 4; + bo = 1; + } +#else + if (bom == 0x0000FEFF) { + q += 4; + bo = 1; + } + else if (bom == 0xFFFE0000) { + q += 4; + bo = -1; + } +#endif + } + } + + if (bo == -1) { + /* force LE */ + iorder[0] = 0; + iorder[1] = 1; + iorder[2] = 2; + iorder[3] = 3; + } + else if (bo == 1) { + /* force BE */ + iorder[0] = 3; + iorder[1] = 2; + iorder[2] = 1; + iorder[3] = 0; + } + + while (q < e) { + Py_UCS4 ch; + /* remaining bytes at the end? (size should be divisible by 4) */ + if (e-q<4) { + if (consumed) + break; + errmsg = "truncated data"; + startinpos = ((const char *)q)-starts; + endinpos = ((const char *)e)-starts; + goto utf32Error; + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + } + ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | + (q[iorder[1]] << 8) | q[iorder[0]]; + + if (ch >= 0x110000) + { + errmsg = "codepoint not in range(0x110000)"; + startinpos = ((const char *)q)-starts; + endinpos = startinpos+4; + goto utf32Error; + } +#ifndef Py_UNICODE_WIDE + if (ch >= 0x10000) + { + *p++ = 0xD800 | ((ch-0x10000) >> 10); + *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); + } + else +#endif + *p++ = ch; + q += 4; + continue; + utf32Error: + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf32", errmsg, + &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, + (PyObject **)&unicode, &outpos, &p)) + goto onError; + } + + if (byteorder) + *byteorder = bo; + + if (consumed) + *consumed = (const char *)q-starts; + + /* Adjust length */ + if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) + goto onError; + + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return (PyObject *)unicode; + +onError: + Py_DECREF(unicode); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return NULL; +} + +PyObject * +PyUnicode_EncodeUTF32(const Py_UNICODE *s, + Py_ssize_t size, + const char *errors, + int byteorder) +{ + PyObject *v; + unsigned char *p; +#ifndef Py_UNICODE_WIDE + int i, pairs; +#else + const int pairs = 0; +#endif + /* Offsets from p for storing byte pairs in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + int iorder[] = {0, 1, 2, 3}; +#else + int iorder[] = {3, 2, 1, 0}; +#endif + +#define STORECHAR(CH) \ + do { \ + p[iorder[3]] = ((CH) >> 24) & 0xff; \ + p[iorder[2]] = ((CH) >> 16) & 0xff; \ + p[iorder[1]] = ((CH) >> 8) & 0xff; \ + p[iorder[0]] = (CH) & 0xff; \ + p += 4; \ + } while(0) + + /* In narrow builds we can output surrogate pairs as one codepoint, + so we need less space. */ +#ifndef Py_UNICODE_WIDE + for (i = pairs = 0; i < size-1; i++) + if (0xD800 <= s[i] && s[i] <= 0xDBFF && + 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) + pairs++; +#endif + v = PyBytes_FromStringAndSize(NULL, + 4 * (size - pairs + (byteorder == 0))); + if (v == NULL) + return NULL; + + p = (unsigned char *)PyBytes_AS_STRING(v); + if (byteorder == 0) + STORECHAR(0xFEFF); + if (size == 0) + return v; + + if (byteorder == -1) { + /* force LE */ + iorder[0] = 0; + iorder[1] = 1; + iorder[2] = 2; + iorder[3] = 3; + } + else if (byteorder == 1) { + /* force BE */ + iorder[0] = 3; + iorder[1] = 2; + iorder[2] = 1; + iorder[3] = 0; + } + + while (size-- > 0) { + Py_UCS4 ch = *s++; +#ifndef Py_UNICODE_WIDE + if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { + Py_UCS4 ch2 = *s; + if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; + s++; + size--; + } + } +#endif + STORECHAR(ch); + } + return v; +#undef STORECHAR +} + +PyObject *PyUnicode_AsUTF32String(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + NULL, + 0); +} + /* --- UTF-16 Codec ------------------------------------------------------- */ PyObject * -- cgit v0.12