From da9ec995f66bdb69dce7292abc4e1ca86e6a626a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 28 Dec 2010 13:26:42 +0000 Subject: Issue #10783: struct.pack() doesn't encode implicitly unicode to UTF-8 * Replace "bytes" by "bytes object" in struct error messages * Document the API change in What's new in Python 3.2 * Fix test_wave * Remove also ugly implicit conversions in test_struct --- Doc/library/struct.rst | 49 +++++++++++------------ Doc/whatsnew/3.2.rst | 4 ++ Lib/test/test_struct.py | 103 +++++++++++++++++++++--------------------------- Lib/wave.py | 4 +- Misc/NEWS | 3 ++ Modules/_struct.c | 27 +++---------- 6 files changed, 83 insertions(+), 107 deletions(-) diff --git a/Doc/library/struct.rst b/Doc/library/struct.rst index aa9921e..42bfc14 100644 --- a/Doc/library/struct.rst +++ b/Doc/library/struct.rst @@ -164,58 +164,53 @@ platform-dependent. +--------+--------------------------+--------------------+----------------+------------+ | ``c`` | :c:type:`char` | bytes of length 1 | 1 | | +--------+--------------------------+--------------------+----------------+------------+ -| ``b`` | :c:type:`signed char` | integer | 1 | \(1),\(4) | +| ``b`` | :c:type:`signed char` | integer | 1 | \(1),\(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``B`` | :c:type:`unsigned char` | integer | 1 | \(4) | +| ``B`` | :c:type:`unsigned char` | integer | 1 | \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``?`` | :c:type:`_Bool` | bool | 1 | \(2) | +| ``?`` | :c:type:`_Bool` | bool | 1 | \(1) | +--------+--------------------------+--------------------+----------------+------------+ -| ``h`` | :c:type:`short` | integer | 2 | \(4) | +| ``h`` | :c:type:`short` | integer | 2 | \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``H`` | :c:type:`unsigned short` | integer | 2 | \(4) | +| ``H`` | :c:type:`unsigned short` | integer | 2 | \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``i`` | :c:type:`int` | integer | 4 | \(4) | +| ``i`` | :c:type:`int` | integer | 4 | \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``I`` | :c:type:`unsigned int` | integer | 4 | \(4) | +| ``I`` | :c:type:`unsigned int` | integer | 4 | \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``l`` | :c:type:`long` | integer | 4 | \(4) | +| ``l`` | :c:type:`long` | integer | 4 | \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``L`` | :c:type:`unsigned long` | integer | 4 | \(4) | +| ``L`` | :c:type:`unsigned long` | integer | 4 | \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``q`` | :c:type:`long long` | integer | 8 | \(3), \(4) | +| ``q`` | :c:type:`long long` | integer | 8 | \(2), \(3) | +--------+--------------------------+--------------------+----------------+------------+ -| ``Q`` | :c:type:`unsigned long | integer | 8 | \(3), \(4) | +| ``Q`` | :c:type:`unsigned long | integer | 8 | \(2), \(3) | | | long` | | | | +--------+--------------------------+--------------------+----------------+------------+ -| ``f`` | :c:type:`float` | float | 4 | \(5) | +| ``f`` | :c:type:`float` | float | 4 | \(4) | +--------+--------------------------+--------------------+----------------+------------+ -| ``d`` | :c:type:`double` | float | 8 | \(5) | +| ``d`` | :c:type:`double` | float | 8 | \(4) | +--------+--------------------------+--------------------+----------------+------------+ -| ``s`` | :c:type:`char[]` | bytes | | \(1) | +| ``s`` | :c:type:`char[]` | bytes | | | +--------+--------------------------+--------------------+----------------+------------+ -| ``p`` | :c:type:`char[]` | bytes | | \(1) | +| ``p`` | :c:type:`char[]` | bytes | | | +--------+--------------------------+--------------------+----------------+------------+ -| ``P`` | :c:type:`void \*` | integer | | \(6) | +| ``P`` | :c:type:`void \*` | integer | | \(5) | +--------+--------------------------+--------------------+----------------+------------+ Notes: (1) - The ``c``, ``s`` and ``p`` conversion codes operate on :class:`bytes` - objects, but packing with such codes also supports :class:`str` objects, - which are encoded using UTF-8. - -(2) The ``'?'`` conversion code corresponds to the :c:type:`_Bool` type defined by C99. If this type is not available, it is simulated using a :c:type:`char`. In standard mode, it is always represented by one byte. -(3) +(2) The ``'q'`` and ``'Q'`` conversion codes are available in native mode only if the platform C compiler supports C :c:type:`long long`, or, on Windows, :c:type:`__int64`. They are always available in standard modes. -(4) +(3) When attempting to pack a non-integer using any of the integer conversion codes, if the non-integer has a :meth:`__index__` method then that method is called to convert the argument to an integer before packing. @@ -223,12 +218,12 @@ Notes: .. versionchanged:: 3.2 Use of the :meth:`__index__` method for non-integers is new in 3.2. -(5) +(4) For the ``'f'`` and ``'d'`` conversion codes, the packed representation uses the IEEE 754 binary32 (for ``'f'``) or binary64 (for ``'d'``) format, regardless of the floating-point format used by the platform. -(6) +(5) The ``'P'`` format character is only available for the native byte ordering (selected as the default or with the ``'@'`` byte order character). The byte order character ``'='`` chooses to use little- or big-endian ordering based @@ -310,9 +305,9 @@ the result in a named tuple:: The ordering of format characters may have an impact on size since the padding needed to satisfy alignment requirements is different:: - >>> pack('ci', '*', 0x12131415) + >>> pack('ci', b'*', 0x12131415) b'*\x00\x00\x00\x12\x13\x14\x15' - >>> pack('ic', 0x12131415, '*') + >>> pack('ic', 0x12131415, b'*') b'\x12\x13\x14\x15*' >>> calcsize('ci') 8 diff --git a/Doc/whatsnew/3.2.rst b/Doc/whatsnew/3.2.rst index b317896..51e1d86 100644 --- a/Doc/whatsnew/3.2.rst +++ b/Doc/whatsnew/3.2.rst @@ -1705,3 +1705,7 @@ require changes to your code: (Contributed by Georg Brandl and Mattias Brändström; `appspot issue 53094 `_.) + +* :func:`struct.pack` doesn't encode implicitly unicode to UTF-8 anymore: use + explicit conversion instead and replace unicode literals by bytes literals. + diff --git a/Lib/test/test_struct.py b/Lib/test/test_struct.py index 1151662..9a9da37 100644 --- a/Lib/test/test_struct.py +++ b/Lib/test/test_struct.py @@ -82,58 +82,52 @@ class StructTest(unittest.TestCase): # Test some of the new features in detail # (format, argument, big-endian result, little-endian result, asymmetric) tests = [ - ('c', 'a', 'a', 'a', 0), - ('xc', 'a', '\0a', '\0a', 0), - ('cx', 'a', 'a\0', 'a\0', 0), - ('s', 'a', 'a', 'a', 0), - ('0s', 'helloworld', '', '', 1), - ('1s', 'helloworld', 'h', 'h', 1), - ('9s', 'helloworld', 'helloworl', 'helloworl', 1), - ('10s', 'helloworld', 'helloworld', 'helloworld', 0), - ('11s', 'helloworld', 'helloworld\0', 'helloworld\0', 1), - ('20s', 'helloworld', 'helloworld'+10*'\0', 'helloworld'+10*'\0', 1), - ('b', 7, '\7', '\7', 0), - ('b', -7, '\371', '\371', 0), - ('B', 7, '\7', '\7', 0), - ('B', 249, '\371', '\371', 0), - ('h', 700, '\002\274', '\274\002', 0), - ('h', -700, '\375D', 'D\375', 0), - ('H', 700, '\002\274', '\274\002', 0), - ('H', 0x10000-700, '\375D', 'D\375', 0), - ('i', 70000000, '\004,\035\200', '\200\035,\004', 0), - ('i', -70000000, '\373\323\342\200', '\200\342\323\373', 0), - ('I', 70000000, '\004,\035\200', '\200\035,\004', 0), - ('I', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0), - ('l', 70000000, '\004,\035\200', '\200\035,\004', 0), - ('l', -70000000, '\373\323\342\200', '\200\342\323\373', 0), - ('L', 70000000, '\004,\035\200', '\200\035,\004', 0), - ('L', 0x100000000-70000000, '\373\323\342\200', '\200\342\323\373', 0), - ('f', 2.0, '@\000\000\000', '\000\000\000@', 0), - ('d', 2.0, '@\000\000\000\000\000\000\000', - '\000\000\000\000\000\000\000@', 0), - ('f', -2.0, '\300\000\000\000', '\000\000\000\300', 0), - ('d', -2.0, '\300\000\000\000\000\000\000\000', - '\000\000\000\000\000\000\000\300', 0), - ('?', 0, '\0', '\0', 0), - ('?', 3, '\1', '\1', 1), - ('?', True, '\1', '\1', 0), - ('?', [], '\0', '\0', 1), - ('?', (1,), '\1', '\1', 1), + ('c', b'a', b'a', b'a', 0), + ('xc', b'a', b'\0a', b'\0a', 0), + ('cx', b'a', b'a\0', b'a\0', 0), + ('s', b'a', b'a', b'a', 0), + ('0s', b'helloworld', b'', b'', 1), + ('1s', b'helloworld', b'h', b'h', 1), + ('9s', b'helloworld', b'helloworl', b'helloworl', 1), + ('10s', b'helloworld', b'helloworld', b'helloworld', 0), + ('11s', b'helloworld', b'helloworld\0', b'helloworld\0', 1), + ('20s', b'helloworld', b'helloworld'+10*b'\0', b'helloworld'+10*b'\0', 1), + ('b', 7, b'\7', b'\7', 0), + ('b', -7, b'\371', b'\371', 0), + ('B', 7, b'\7', b'\7', 0), + ('B', 249, b'\371', b'\371', 0), + ('h', 700, b'\002\274', b'\274\002', 0), + ('h', -700, b'\375D', b'D\375', 0), + ('H', 700, b'\002\274', b'\274\002', 0), + ('H', 0x10000-700, b'\375D', b'D\375', 0), + ('i', 70000000, b'\004,\035\200', b'\200\035,\004', 0), + ('i', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0), + ('I', 70000000, b'\004,\035\200', b'\200\035,\004', 0), + ('I', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0), + ('l', 70000000, b'\004,\035\200', b'\200\035,\004', 0), + ('l', -70000000, b'\373\323\342\200', b'\200\342\323\373', 0), + ('L', 70000000, b'\004,\035\200', b'\200\035,\004', 0), + ('L', 0x100000000-70000000, b'\373\323\342\200', b'\200\342\323\373', 0), + ('f', 2.0, b'@\000\000\000', b'\000\000\000@', 0), + ('d', 2.0, b'@\000\000\000\000\000\000\000', + b'\000\000\000\000\000\000\000@', 0), + ('f', -2.0, b'\300\000\000\000', b'\000\000\000\300', 0), + ('d', -2.0, b'\300\000\000\000\000\000\000\000', + b'\000\000\000\000\000\000\000\300', 0), + ('?', 0, b'\0', b'\0', 0), + ('?', 3, b'\1', b'\1', 1), + ('?', True, b'\1', b'\1', 0), + ('?', [], b'\0', b'\0', 1), + ('?', (1,), b'\1', b'\1', 1), ] for fmt, arg, big, lil, asy in tests: - big = bytes(big, "latin-1") - lil = bytes(lil, "latin-1") for (xfmt, exp) in [('>'+fmt, big), ('!'+fmt, big), ('<'+fmt, lil), ('='+fmt, ISBIGENDIAN and big or lil)]: res = struct.pack(xfmt, arg) self.assertEqual(res, exp) self.assertEqual(struct.calcsize(xfmt), len(res)) rev = struct.unpack(xfmt, res)[0] - if isinstance(arg, str): - # Strings are returned as bytes since you can't know the - # encoding of the string when packed. - arg = bytes(arg, 'latin1') if rev != arg: self.assertTrue(asy) @@ -334,15 +328,14 @@ class StructTest(unittest.TestCase): def test_p_code(self): # Test p ("Pascal string") code. for code, input, expected, expectedback in [ - ('p','abc', '\x00', b''), - ('1p', 'abc', '\x00', b''), - ('2p', 'abc', '\x01a', b'a'), - ('3p', 'abc', '\x02ab', b'ab'), - ('4p', 'abc', '\x03abc', b'abc'), - ('5p', 'abc', '\x03abc\x00', b'abc'), - ('6p', 'abc', '\x03abc\x00\x00', b'abc'), - ('1000p', 'x'*1000, '\xff' + 'x'*999, b'x'*255)]: - expected = bytes(expected, "latin-1") + ('p', b'abc', b'\x00', b''), + ('1p', b'abc', b'\x00', b''), + ('2p', b'abc', b'\x01a', b'a'), + ('3p', b'abc', b'\x02ab', b'ab'), + ('4p', b'abc', b'\x03abc', b'abc'), + ('5p', b'abc', b'\x03abc\x00', b'abc'), + ('6p', b'abc', b'\x03abc\x00\x00', b'abc'), + ('1000p', b'x'*1000, b'\xff' + b'x'*999, b'x'*255)]: got = struct.pack(code, input) self.assertEqual(got, expected) (got,) = struct.unpack(code, got) @@ -401,15 +394,11 @@ class StructTest(unittest.TestCase): s = struct.Struct(fmt) for cls in (bytes, bytearray): data = cls(test_string) - if not isinstance(data, (bytes, bytearray)): - bytes_data = bytes(data, 'latin1') - else: - bytes_data = data self.assertEqual(s.unpack_from(data), (b'abcd',)) self.assertEqual(s.unpack_from(data, 2), (b'cd01',)) self.assertEqual(s.unpack_from(data, 4), (b'0123',)) for i in range(6): - self.assertEqual(s.unpack_from(data, i), (bytes_data[i:i+4],)) + self.assertEqual(s.unpack_from(data, i), (data[i:i+4],)) for i in range(6, len(test_string) + 1): self.assertRaises(struct.error, s.unpack_from, data, i) for cls in (bytes, bytearray): diff --git a/Lib/wave.py b/Lib/wave.py index b5bb105..2877137 100644 --- a/Lib/wave.py +++ b/Lib/wave.py @@ -467,11 +467,11 @@ class Wave_write: self._datalength = self._nframes * self._nchannels * self._sampwidth self._form_length_pos = self._file.tell() self._file.write(struct.pack('tp_name); return -1; } @@ -1423,7 +1418,7 @@ s_unpack(PyObject *self, PyObject *input) return NULL; if (vbuf.len != soself->s_size) { PyErr_Format(StructError, - "unpack requires a bytes argument of length %zd", + "unpack requires a bytes object of length %zd", soself->s_size); PyBuffer_Release(&vbuf); return NULL; @@ -1503,15 +1498,10 @@ s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf) if (e->format == 's') { int isstring; void *p; - if (PyUnicode_Check(v)) { - v = _PyUnicode_AsDefaultEncodedString(v, NULL); - if (v == NULL) - return -1; - } isstring = PyBytes_Check(v); if (!isstring && !PyByteArray_Check(v)) { PyErr_SetString(StructError, - "argument for 's' must be a bytes or string"); + "argument for 's' must be a bytes object"); return -1; } if (isstring) { @@ -1529,15 +1519,10 @@ s_pack_internal(PyStructObject *soself, PyObject *args, int offset, char* buf) } else if (e->format == 'p') { int isstring; void *p; - if (PyUnicode_Check(v)) { - v = _PyUnicode_AsDefaultEncodedString(v, NULL); - if (v == NULL) - return -1; - } isstring = PyBytes_Check(v); if (!isstring && !PyByteArray_Check(v)) { PyErr_SetString(StructError, - "argument for 'p' must be a bytes or string"); + "argument for 'p' must be a bytes object"); return -1; } if (isstring) { @@ -1691,7 +1676,7 @@ static struct PyMethodDef s_methods[] = { {NULL, NULL} /* sentinel */ }; -PyDoc_STRVAR(s__doc__, +PyDoc_STRVAR(s__doc__, "Struct(fmt) --> compiled struct object\n" "\n" "Return a new Struct object which writes and reads binary data according to\n" -- cgit v0.12