 Doc/c-api/concrete.rst          |  68
 Doc/library/codecs.rst          |   6
 Include/unicodeobject.h         |  82
 Lib/encodings/aliases.py        |  10
 Lib/encodings/utf_32.py         | 144
 Lib/encodings/utf_32_be.py      |  37
 Lib/encodings/utf_32_le.py      |  37
 Lib/test/test_codeccallbacks.py |   7
 Lib/test/test_codecs.py         | 140
 Misc/NEWS                       |   2
 Modules/_codecsmodule.c         | 204
 Objects/unicodeobject.c         | 266
12 files changed, 1001 insertions, 2 deletions
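The patch adds UTF-32, UTF-32-LE and UTF-32-BE codecs end to end: C-level encode/decode routines in Objects/unicodeobject.c, bindings in Modules/_codecsmodule.c, pure-Python codec modules and aliases under Lib/encodings/, plus documentation and tests. A minimal sketch of the resulting Python-level behaviour (not part of the patch; it assumes an interpreter built with these changes):

    import codecs

    # "utf-32" writes a BOM in native byte order; the LE/BE variants do not.
    data = "spam".encode("utf-32")
    assert data.startswith((b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff"))
    assert data.decode("utf-32") == "spam"      # decoding consumes the BOM

    # Explicit byte order, no BOM (matches the expected bytes in the new tests):
    assert "spam".encode("utf-32-le") == (b"s\x00\x00\x00p\x00\x00\x00"
                                          b"a\x00\x00\x00m\x00\x00\x00")

    # The new aliases resolve to the same codec:
    assert codecs.lookup("U32").name == "utf-32"
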
diff --git a/Doc/c-api/concrete.rst b/Doc/c-api/concrete.rst index bc812c2..eda56a5 100644 --- a/Doc/c-api/concrete.rst +++ b/Doc/c-api/concrete.rst @@ -1405,6 +1405,74 @@ These are the UTF-8 codec APIs: object. Error handling is "strict". Return *NULL* if an exception was raised by the codec. +These are the UTF-32 codec APIs: + +.. % --- UTF-32 Codecs ------------------------------------------------------ */ + + +.. cfunction:: PyObject* PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder) + + Decode *length* bytes from a UTF-32 encoded buffer string and return the + corresponding Unicode object. *errors* (if non-*NULL*) defines the error + handling. It defaults to "strict". + + If *byteorder* is non-*NULL*, the decoder starts decoding using the given byte + order:: + + *byteorder == -1: little endian + *byteorder == 0: native order + *byteorder == 1: big endian + + and then switches if the first four bytes of the input data are a byte order mark + (BOM) and the specified byte order is native order. This BOM is not copied into + the resulting Unicode string. After completion, *\*byteorder* is set to the + current byte order at the end of input data. + + In a narrow build codepoints outside the BMP will be decoded as surrogate pairs. + + If *byteorder* is *NULL*, the codec starts in native order mode. + + Return *NULL* if an exception was raised by the codec. + + .. versionadded:: 3.0 + + +.. cfunction:: PyObject* PyUnicode_DecodeUTF32Stateful(const char *s, Py_ssize_t size, const char *errors, int *byteorder, Py_ssize_t *consumed) + + If *consumed* is *NULL*, behave like :cfunc:`PyUnicode_DecodeUTF32`. If + *consumed* is not *NULL*, :cfunc:`PyUnicode_DecodeUTF32Stateful` will not treat + trailing incomplete UTF-32 byte sequences (such as a number of bytes not divisible + by four) as an error. Those bytes will not be decoded and the number of bytes + that have been decoded will be stored in *consumed*. + + .. versionadded:: 3.0 + + +.. cfunction:: PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE *s, Py_ssize_t size, const char *errors, int byteorder) + + Return a Python bytes object holding the UTF-32 encoded value of the Unicode + data in *s*. If *byteorder* is not ``0``, output is written according to the + following byte order:: + + byteorder == -1: little endian + byteorder == 0: native byte order (writes a BOM mark) + byteorder == 1: big endian + + If byteorder is ``0``, the output string will always start with the Unicode BOM + mark (U+FEFF). In the other two modes, no BOM mark is prepended. + + If *Py_UNICODE_WIDE* is not defined, surrogate pairs will be output + as a single codepoint. + + Return *NULL* if an exception was raised by the codec. + + +.. cfunction:: PyObject* PyUnicode_AsUTF32String(PyObject *unicode) + + Return a Python string using the UTF-32 encoding in native byte order. The + string always starts with a BOM mark. Error handling is "strict". Return + *NULL* if an exception was raised by the codec. + These are the UTF-16 codec APIs: .. 
% --- UTF-16 Codecs ------------------------------------------------------ */ diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index 38264df..7a035c2 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1089,6 +1089,12 @@ particular, the following variants typically exist: | shift_jisx0213 | shiftjisx0213, sjisx0213, | Japanese | | | s_jisx0213 | | +-----------------+--------------------------------+--------------------------------+ +| utf_32 | U32, utf32 | all languages | ++-----------------+--------------------------------+--------------------------------+ +| utf_32_be | UTF-32BE | all languages | ++-----------------+--------------------------------+--------------------------------+ +| utf_32_le | UTF-32LE | all languages | ++-----------------+--------------------------------+--------------------------------+ | utf_16 | U16, utf16 | all languages | +-----------------+--------------------------------+--------------------------------+ | utf_16_be | UTF-16BE | all languages (BMP only) | diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 5545344..4cde46a 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -138,6 +138,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString +# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode @@ -154,6 +155,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape +# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32 +# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8 @@ -165,6 +168,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape +# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape @@ -225,6 +229,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString +# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode @@ -241,6 +246,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape +# define PyUnicode_DecodeUTF32 
PyUnicodeUCS4_DecodeUTF32 +# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8 @@ -252,6 +259,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape +# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape @@ -749,6 +757,80 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8( const char *errors /* error handling */ ); +/* --- UTF-32 Codecs ------------------------------------------------------ */ + +/* Decodes length bytes from a UTF-32 encoded buffer string and returns + the corresponding Unicode object. + + errors (if non-NULL) defines the error handling. It defaults + to "strict". + + If byteorder is non-NULL, the decoder starts decoding using the + given byte order: + + *byteorder == -1: little endian + *byteorder == 0: native order + *byteorder == 1: big endian + + In native mode, the first four bytes of the stream are checked for a + BOM mark. If found, the BOM mark is analysed, the byte order + adjusted and the BOM skipped. In the other modes, no BOM mark + interpretation is done. After completion, *byteorder is set to the + current byte order at the end of input data. + + If byteorder is NULL, the codec starts in native order mode. + +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( + const char *string, /* UTF-32 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( + const char *string, /* UTF-32 encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + int *byteorder, /* pointer to byteorder to use + 0=native;-1=LE,1=BE; updated on + exit */ + Py_ssize_t *consumed /* bytes consumed */ + ); + +/* Returns a Python string using the UTF-32 encoding in native byte + order. The string always starts with a BOM mark. */ + +PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( + PyObject *unicode /* Unicode object */ + ); + +/* Returns a Python string object holding the UTF-32 encoded value of + the Unicode data. + + If byteorder is not 0, output is written according to the following + byte order: + + byteorder == -1: little endian + byteorder == 0: native byte order (writes a BOM mark) + byteorder == 1: big endian + + If byteorder is 0, the output string will always start with the + Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is + prepended. 
+ +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32( + const Py_UNICODE *data, /* Unicode char buffer */ + Py_ssize_t length, /* number of Py_UNICODE chars to encode */ + const char *errors, /* error handling */ + int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */ + ); + /* --- UTF-16 Codecs ------------------------------------------------------ */ /* Decodes length bytes from a UTF-16 encoded buffer string and returns diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index cefb2ed..c6f5aeb 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -490,6 +490,16 @@ aliases = { 'unicodelittleunmarked' : 'utf_16_le', 'utf_16le' : 'utf_16_le', + # utf_32 codec + 'u32' : 'utf_32', + 'utf32' : 'utf_32', + + # utf_32_be codec + 'utf_32be' : 'utf_32_be', + + # utf_32_le codec + 'utf_32le' : 'utf_32_le', + # utf_7 codec 'u7' : 'utf_7', 'utf7' : 'utf_7', diff --git a/Lib/encodings/utf_32.py b/Lib/encodings/utf_32.py new file mode 100644 index 0000000..622f84b --- /dev/null +++ b/Lib/encodings/utf_32.py @@ -0,0 +1,144 @@ +""" +Python 'utf-32' Codec +""" +import codecs, sys + +### Codec APIs + +encode = codecs.utf_32_encode + +def decode(input, errors='strict'): + return codecs.utf_32_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def __init__(self, errors='strict'): + codecs.IncrementalEncoder.__init__(self, errors) + self.encoder = None + + def encode(self, input, final=False): + if self.encoder is None: + result = codecs.utf_32_encode(input, self.errors)[0] + if sys.byteorder == 'little': + self.encoder = codecs.utf_32_le_encode + else: + self.encoder = codecs.utf_32_be_encode + return result + return self.encoder(input, self.errors)[0] + + def reset(self): + codecs.IncrementalEncoder.reset(self) + self.encoder = None + + def getstate(self): + # state info we return to the caller: + # 0: stream is in natural order for this platform + # 2: endianness hasn't been determined yet + # (we're never writing in unnatural order) + return (2 if self.encoder is None else 0) + + def setstate(self, state): + if state: + self.encoder = None + else: + if sys.byteorder == 'little': + self.encoder = codecs.utf_32_le_encode + else: + self.encoder = codecs.utf_32_be_encode + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def __init__(self, errors='strict'): + codecs.BufferedIncrementalDecoder.__init__(self, errors) + self.decoder = None + + def _buffer_decode(self, input, errors, final): + if self.decoder is None: + (output, consumed, byteorder) = \ + codecs.utf_32_ex_decode(input, errors, 0, final) + if byteorder == -1: + self.decoder = codecs.utf_32_le_decode + elif byteorder == 1: + self.decoder = codecs.utf_32_be_decode + elif consumed >= 4: + raise UnicodeError("UTF-32 stream does not start with BOM") + return (output, consumed) + return self.decoder(input, self.errors, final) + + def reset(self): + codecs.BufferedIncrementalDecoder.reset(self) + self.decoder = None + + def getstate(self): + # additonal state info from the base class must be None here, + # as it isn't passed along to the caller + state = codecs.BufferedIncrementalDecoder.getstate(self)[0] + # additional state info we pass to the caller: + # 0: stream is in natural order for this platform + # 1: stream is in unnatural order + # 2: endianness hasn't been determined yet + if self.decoder is None: + return (state, 2) + addstate = int((sys.byteorder == "big") != + (self.decoder is codecs.utf_32_be_decode)) + return (state, addstate) + + def setstate(self, 
state): + # state[1] will be ignored by BufferedIncrementalDecoder.setstate() + codecs.BufferedIncrementalDecoder.setstate(self, state) + state = state[1] + if state == 0: + self.decoder = (codecs.utf_32_be_decode + if sys.byteorder == "big" + else codecs.utf_32_le_decode) + elif state == 1: + self.decoder = (codecs.utf_32_le_decode + if sys.byteorder == "big" + else codecs.utf_32_be_decode) + else: + self.decoder = None + +class StreamWriter(codecs.StreamWriter): + def __init__(self, stream, errors='strict'): + self.bom_written = False + codecs.StreamWriter.__init__(self, stream, errors) + + def encode(self, input, errors='strict'): + self.bom_written = True + result = codecs.utf_32_encode(input, errors) + if sys.byteorder == 'little': + self.encode = codecs.utf_32_le_encode + else: + self.encode = codecs.utf_32_be_encode + return result + +class StreamReader(codecs.StreamReader): + + def reset(self): + codecs.StreamReader.reset(self) + try: + del self.decode + except AttributeError: + pass + + def decode(self, input, errors='strict'): + (object, consumed, byteorder) = \ + codecs.utf_32_ex_decode(input, errors, 0, False) + if byteorder == -1: + self.decode = codecs.utf_32_le_decode + elif byteorder == 1: + self.decode = codecs.utf_32_le_decode + elif consumed>=4: + raise UnicodeError,"UTF-32 stream does not start with BOM" + return (object, consumed) + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-32', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/encodings/utf_32_be.py b/Lib/encodings/utf_32_be.py new file mode 100644 index 0000000..fe272b5 --- /dev/null +++ b/Lib/encodings/utf_32_be.py @@ -0,0 +1,37 @@ +""" +Python 'utf-32-be' Codec +""" +import codecs + +### Codec APIs + +encode = codecs.utf_32_be_encode + +def decode(input, errors='strict'): + return codecs.utf_32_be_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.utf_32_be_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = codecs.utf_32_be_decode + +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_32_be_encode + +class StreamReader(codecs.StreamReader): + decode = codecs.utf_32_be_decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-32-be', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/encodings/utf_32_le.py b/Lib/encodings/utf_32_le.py new file mode 100644 index 0000000..9e48210 --- /dev/null +++ b/Lib/encodings/utf_32_le.py @@ -0,0 +1,37 @@ +""" +Python 'utf-32-le' Codec +""" +import codecs + +### Codec APIs + +encode = codecs.utf_32_le_encode + +def decode(input, errors='strict'): + return codecs.utf_32_le_decode(input, errors, True) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.utf_32_le_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + _buffer_decode = codecs.utf_32_le_decode + +class StreamWriter(codecs.StreamWriter): + encode = codecs.utf_32_le_encode + +class StreamReader(codecs.StreamReader): + decode = codecs.utf_32_le_decode + +### encodings module API + +def getregentry(): 
+ return codecs.CodecInfo( + name='utf-32-le', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index f76ec65..9b731d5 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -285,7 +285,8 @@ class CodecCallbackTest(unittest.TestCase): def test_longstrings(self): # test long strings to check for memory overflow problems - errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"] + errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", + "backslashreplace"] # register the handlers under different names, # to prevent the codec from recognizing the name for err in errors: @@ -293,7 +294,8 @@ class CodecCallbackTest(unittest.TestCase): l = 1000 errors += [ "test." + err for err in errors ] for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]: - for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"): + for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", + "utf-8", "utf-7", "utf-16", "utf-32"): for err in errors: try: uni.encode(enc, err) @@ -812,6 +814,7 @@ class CodecCallbackTest(unittest.TestCase): ("utf-7", b"++"), ("utf-8", b"\xff"), ("utf-16", b"\xff"), + ("utf-32", b"\xff"), ("unicode-escape", b"\\u123g"), ("raw-unicode-escape", b"\\u123g"), ("unicode-internal", b"\xff"), diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 89a3473..f2ee524 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -277,6 +277,143 @@ class ReadTest(unittest.TestCase, MixInCheckStateHandling): self.assertEqual(reader.readline(), s5) self.assertEqual(reader.readline(), "") +class UTF32Test(ReadTest): + encoding = "utf-32" + + spamle = (b'\xff\xfe\x00\x00' + b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00' + b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00') + spambe = (b'\x00\x00\xfe\xff' + b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m' + b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m') + + def test_only_one_bom(self): + _,_,reader,writer = codecs.lookup(self.encoding) + # encode some stream + s = io.BytesIO() + f = writer(s) + f.write("spam") + f.write("spam") + d = s.getvalue() + # check whether there is exactly one BOM in it + self.assert_(d == self.spamle or d == self.spambe) + # try to read it back + s = io.BytesIO(d) + f = reader(s) + self.assertEquals(f.read(), "spamspam") + + def test_badbom(self): + s = io.BytesIO(4*b"\xff") + f = codecs.getreader(self.encoding)(s) + self.assertRaises(UnicodeError, f.read) + + s = io.BytesIO(8*b"\xff") + f = codecs.getreader(self.encoding)(s) + self.assertRaises(UnicodeError, f.read) + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", # first byte of BOM read + "", # second byte of BOM read + "", # third byte of BOM read + "", # fourth byte of BOM read => byteorder known + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode, + b"\xff", "strict", True) + + def test_decoder_state(self): + self.check_state_handling_decode(self.encoding, + "spamspam", self.spamle) + self.check_state_handling_decode(self.encoding, + 
"spamspam", self.spambe) + +class UTF32LETest(ReadTest): + encoding = "utf-32-le" + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_simple(self): + self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00") + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode, + b"\xff", "strict", True) + +class UTF32BETest(ReadTest): + encoding = "utf-32-be" + + def test_partial(self): + self.check_partial( + "\x00\xff\u0100\uffff", + [ + "", + "", + "", + "\x00", + "\x00", + "\x00", + "\x00", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100", + "\x00\xff\u0100\uffff", + ] + ) + + def test_simple(self): + self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03") + + def test_errors(self): + self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode, + b"\xff", "strict", True) + class UTF16Test(ReadTest): encoding = "utf-16" @@ -1284,6 +1421,9 @@ class WithStmtTest(unittest.TestCase): def test_main(): test_support.run_unittest( + UTF32Test, + UTF32LETest, + UTF32BETest, UTF16Test, UTF16LETest, UTF16BETest, @@ -213,6 +213,8 @@ Library - Patch #1680961: atexit has been reimplemented in C. +- Add new codecs for UTF-32, UTF-32-LE and UTF-32-BE. + Build ----- diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index c500073..e3933e7 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -413,6 +413,126 @@ utf_16_ex_decode(PyObject *self, } static PyObject * +utf_32_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 0; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode", + &data, &size, &errors, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, + final ? NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + +static PyObject * +utf_32_le_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = -1; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode", + &data, &size, &errors, &final)) + return NULL; + + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, + &byteorder, final ? 
NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); + +} + +static PyObject * +utf_32_be_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 1; + int final = 0; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode", + &data, &size, &errors, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors, + &byteorder, final ? NULL : &consumed); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + +/* This non-standard version also provides access to the byteorder + parameter of the builtin UTF-32 codec. + + It returns a tuple (unicode, bytesread, byteorder) with byteorder + being the value in effect at the end of data. + +*/ + +static PyObject * +utf_32_ex_decode(PyObject *self, + PyObject *args) +{ + const char *data; + Py_ssize_t size; + const char *errors = NULL; + int byteorder = 0; + PyObject *unicode, *tuple; + int final = 0; + Py_ssize_t consumed; + + if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode", + &data, &size, &errors, &byteorder, &final)) + return NULL; + if (size < 0) { + PyErr_SetString(PyExc_ValueError, "negative argument"); + return 0; + } + consumed = size; /* This is overwritten unless final is true. */ + unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors, &byteorder, + final ? NULL : &consumed); + if (unicode == NULL) + return NULL; + tuple = Py_BuildValue("Oni", unicode, consumed, byteorder); + Py_DECREF(unicode); + return tuple; +} + +static PyObject * unicode_escape_decode(PyObject *self, PyObject *args) { @@ -700,6 +820,83 @@ utf_16_be_encode(PyObject *self, return v; } +/* This version provides access to the byteorder parameter of the + builtin UTF-32 codecs as optional third argument. It defaults to 0 + which means: use the native byte order and prepend the data with a + BOM mark. 
+ +*/ + +static PyObject * +utf_32_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + int byteorder = 0; + + if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode", + &str, &errors, &byteorder)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + byteorder), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + +static PyObject * +utf_32_le_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + -1), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + +static PyObject * +utf_32_be_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + errors, + +1), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + static PyObject * unicode_escape_encode(PyObject *self, PyObject *args) @@ -916,6 +1113,13 @@ static PyMethodDef _codecs_functions[] = { {"utf_16_le_decode", utf_16_le_decode, METH_VARARGS}, {"utf_16_be_decode", utf_16_be_decode, METH_VARARGS}, {"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS}, + {"utf_32_encode", utf_32_encode, METH_VARARGS}, + {"utf_32_le_encode", utf_32_le_encode, METH_VARARGS}, + {"utf_32_be_encode", utf_32_be_encode, METH_VARARGS}, + {"utf_32_decode", utf_32_decode, METH_VARARGS}, + {"utf_32_le_decode", utf_32_le_decode, METH_VARARGS}, + {"utf_32_be_decode", utf_32_be_decode, METH_VARARGS}, + {"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS}, {"unicode_escape_encode", unicode_escape_encode, METH_VARARGS}, {"unicode_escape_decode", unicode_escape_decode, METH_VARARGS}, {"unicode_internal_encode", unicode_internal_encode, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index b345986..54fe16c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1992,6 +1992,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) NULL); } +/* --- UTF-32 Codec ------------------------------------------------------- */ + +PyObject * +PyUnicode_DecodeUTF32(const char *s, + Py_ssize_t size, + const char *errors, + int *byteorder) +{ + return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); +} + +PyObject * +PyUnicode_DecodeUTF32Stateful(const char *s, + Py_ssize_t size, + const char *errors, + int *byteorder, + Py_ssize_t *consumed) +{ + const char *starts = s; + Py_ssize_t startinpos; + Py_ssize_t endinpos; + Py_ssize_t outpos; + PyUnicodeObject *unicode; + Py_UNICODE *p; +#ifndef Py_UNICODE_WIDE + int i, pairs; +#else + const int pairs = 0; +#endif + const unsigned char *q, *e; + int bo = 0; /* assume native ordering by default */ + const char *errmsg = ""; + /* On narrow builds we split characters outside the BMP into two + codepoints => count how much extra space we need. 
*/ +#ifndef Py_UNICODE_WIDE + for (i = pairs = 0; i < size/4; i++) + if (((Py_UCS4 *)s)[i] >= 0x10000) + pairs++; +#endif + /* Offsets from q for retrieving bytes in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + int iorder[] = {0, 1, 2, 3}; +#else + int iorder[] = {3, 2, 1, 0}; +#endif + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + + /* This might be one to much, because of a BOM */ + unicode = _PyUnicode_New((size+3)/4+pairs); + if (!unicode) + return NULL; + if (size == 0) + return (PyObject *)unicode; + + /* Unpack UTF-32 encoded data */ + p = unicode->str; + q = (unsigned char *)s; + e = q + size; + + if (byteorder) + bo = *byteorder; + + /* Check for BOM marks (U+FEFF) in the input and adjust current + byte order setting accordingly. In native mode, the leading BOM + mark is skipped, in all other modes, it is copied to the output + stream as-is (giving a ZWNBSP character). */ + if (bo == 0) { + if (size >= 4) { + const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | + (q[iorder[1]] << 8) | q[iorder[0]]; +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + if (bom == 0x0000FEFF) { + q += 4; + bo = -1; + } + else if (bom == 0xFFFE0000) { + q += 4; + bo = 1; + } +#else + if (bom == 0x0000FEFF) { + q += 4; + bo = 1; + } + else if (bom == 0xFFFE0000) { + q += 4; + bo = -1; + } +#endif + } + } + + if (bo == -1) { + /* force LE */ + iorder[0] = 0; + iorder[1] = 1; + iorder[2] = 2; + iorder[3] = 3; + } + else if (bo == 1) { + /* force BE */ + iorder[0] = 3; + iorder[1] = 2; + iorder[2] = 1; + iorder[3] = 0; + } + + while (q < e) { + Py_UCS4 ch; + /* remaining bytes at the end? (size should be divisible by 4) */ + if (e-q<4) { + if (consumed) + break; + errmsg = "truncated data"; + startinpos = ((const char *)q)-starts; + endinpos = ((const char *)e)-starts; + goto utf32Error; + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + } + ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | + (q[iorder[1]] << 8) | q[iorder[0]]; + + if (ch >= 0x110000) + { + errmsg = "codepoint not in range(0x110000)"; + startinpos = ((const char *)q)-starts; + endinpos = startinpos+4; + goto utf32Error; + } +#ifndef Py_UNICODE_WIDE + if (ch >= 0x10000) + { + *p++ = 0xD800 | ((ch-0x10000) >> 10); + *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); + } + else +#endif + *p++ = ch; + q += 4; + continue; + utf32Error: + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf32", errmsg, + &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, + (PyObject **)&unicode, &outpos, &p)) + goto onError; + } + + if (byteorder) + *byteorder = bo; + + if (consumed) + *consumed = (const char *)q-starts; + + /* Adjust length */ + if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) + goto onError; + + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return (PyObject *)unicode; + +onError: + Py_DECREF(unicode); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return NULL; +} + +PyObject * +PyUnicode_EncodeUTF32(const Py_UNICODE *s, + Py_ssize_t size, + const char *errors, + int byteorder) +{ + PyObject *v; + unsigned char *p; +#ifndef Py_UNICODE_WIDE + int i, pairs; +#else + const int pairs = 0; +#endif + /* Offsets from p for storing byte pairs in the right order. 
*/ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + int iorder[] = {0, 1, 2, 3}; +#else + int iorder[] = {3, 2, 1, 0}; +#endif + +#define STORECHAR(CH) \ + do { \ + p[iorder[3]] = ((CH) >> 24) & 0xff; \ + p[iorder[2]] = ((CH) >> 16) & 0xff; \ + p[iorder[1]] = ((CH) >> 8) & 0xff; \ + p[iorder[0]] = (CH) & 0xff; \ + p += 4; \ + } while(0) + + /* In narrow builds we can output surrogate pairs as one codepoint, + so we need less space. */ +#ifndef Py_UNICODE_WIDE + for (i = pairs = 0; i < size-1; i++) + if (0xD800 <= s[i] && s[i] <= 0xDBFF && + 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) + pairs++; +#endif + v = PyBytes_FromStringAndSize(NULL, + 4 * (size - pairs + (byteorder == 0))); + if (v == NULL) + return NULL; + + p = (unsigned char *)PyBytes_AS_STRING(v); + if (byteorder == 0) + STORECHAR(0xFEFF); + if (size == 0) + return v; + + if (byteorder == -1) { + /* force LE */ + iorder[0] = 0; + iorder[1] = 1; + iorder[2] = 2; + iorder[3] = 3; + } + else if (byteorder == 1) { + /* force BE */ + iorder[0] = 3; + iorder[1] = 2; + iorder[2] = 1; + iorder[3] = 0; + } + + while (size-- > 0) { + Py_UCS4 ch = *s++; +#ifndef Py_UNICODE_WIDE + if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { + Py_UCS4 ch2 = *s; + if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; + s++; + size--; + } + } +#endif + STORECHAR(ch); + } + return v; +#undef STORECHAR +} + +PyObject *PyUnicode_AsUTF32String(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + NULL, + 0); +} + /* --- UTF-16 Codec ------------------------------------------------------- */ PyObject * |
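A sketch, not part of the patch, of the stateful decoding that the new test_partial cases exercise: the incremental decoder buffers input until the 4-byte BOM has been read and thereafter returns text only once whole 4-byte code units are available (this assumes an interpreter built with the patch applied).

    import codecs

    dec = codecs.getincrementaldecoder("utf-32")()
    encoded = "\x00\xff".encode("utf-32")       # 4-byte BOM + two code units
    out = ""
    for i in range(len(encoded)):
        out += dec.decode(encoded[i:i+1])       # returns "" until a unit completes
    out += dec.decode(b"", final=True)
    assert out == "\x00\xff"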
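Similarly, the StreamWriter in Lib/encodings/utf_32.py emits the BOM only for the first chunk and then rebinds its encode method to the plain LE or BE encoder, which is what the new test_only_one_bom checks; roughly:

    import codecs, io

    buf = io.BytesIO()
    writer = codecs.getwriter("utf-32")(buf)
    writer.write("spam")
    writer.write("spam")                        # second write: no second BOM
    data = buf.getvalue()
    assert data.count(b"\xff\xfe\x00\x00") + data.count(b"\x00\x00\xfe\xff") == 1
    assert data.decode("utf-32") == "spamspam"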
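Finally, a sketch of the non-standard _codecs.utf_32_ex_decode binding added above: like utf_16_ex_decode, it also reports the byte order in effect at the end of the data (-1 little endian, +1 big endian, 0 undetermined). The expected values below follow the behaviour documented in the new C comments.

    import codecs

    data = b"\xff\xfe\x00\x00A\x00\x00\x00"     # little-endian BOM + "A"
    text, consumed, order = codecs.utf_32_ex_decode(data, "strict", 0, True)
    assert (text, consumed, order) == ("A", 8, -1)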