From f5f047aa628caeca680745c55e24519f06aa6724 Mon Sep 17 00:00:00 2001 From: Mark Dickinson Date: Sun, 25 Sep 2022 10:55:33 +0100 Subject: gh-96735: Fix undefined behaviour in struct unpacking functions (#96739) This PR fixes undefined behaviour in the struct module unpacking support functions `bu_longlong`, `lu_longlong`, `bu_int` and `lu_int`; thanks to @kumaraditya303 for finding these. The fix is to accumulate the bytes in an unsigned integer type instead of a signed integer type, then to convert to the appropriate signed type. In cases where the width matches, that conversion will typically be compiled away to a no-op. (Evidence from Godbolt: https://godbolt.org/z/5zvxodj64 .) To make the conversions efficient, I've specialised the relevant functions for their output size: for `bu_longlong` and `lu_longlong`, this only entails checking that the output size is indeed `8`. But `bu_int` and `lu_int` were used for format sizes `2` and `4` - I've split those into two separate functions each. No tests, because all of the affected cases are already exercised by the test suite. --- .../2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst | 1 + Modules/_struct.c | 96 ++++++++++++++++------ 2 files changed, 71 insertions(+), 26 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst diff --git a/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst new file mode 100644 index 0000000..a29ada9 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-09-10-16-46-16.gh-issue-96735.0YzJuG.rst @@ -0,0 +1 @@ +Fix undefined behaviour in :func:`struct.unpack`. diff --git a/Modules/_struct.c b/Modules/_struct.c index 9d66568..09f52a9 100644 --- a/Modules/_struct.c +++ b/Modules/_struct.c @@ -806,18 +806,37 @@ static const formatdef native_table[] = { /* Big-endian routines. *****************************************************/ static PyObject * +bu_short(_structmodulestate *state, const char *p, const formatdef *f) +{ + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 2. */ + assert(f->size == 2); + Py_ssize_t i = 2; + const unsigned char *bytes = (const unsigned char *)p; + do { + x = (x<<8) | *bytes++; + } while (--i > 0); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000U) - 0x8000U; + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x); +} + +static PyObject * bu_int(_structmodulestate *state, const char *p, const formatdef *f) { - long x = 0; - Py_ssize_t i = f->size; + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 4. */ + assert(f->size == 4); + Py_ssize_t i = 4; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG > f->size) - x |= -(x & (1L << ((8 * f->size) - 1))); - return PyLong_FromLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x80000000U) - 0x80000000U; + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -835,16 +854,19 @@ bu_uint(_structmodulestate *state, const char *p, const formatdef *f) static PyObject * bu_longlong(_structmodulestate *state, const char *p, const formatdef *f) { - long long x = 0; - Py_ssize_t i = f->size; + unsigned long long x = 0; + + /* This function is only ever used in the case f->size == 8. */ + assert(f->size == 8); + Py_ssize_t i = 8; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | *bytes++; } while (--i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG_LONG > f->size) - x |= -(x & ((long long)1 << ((8 * f->size) - 1))); - return PyLong_FromLongLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; + return PyLong_FromLongLong( + x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x); } static PyObject * @@ -1009,7 +1031,7 @@ static formatdef bigendian_table[] = { {'c', 1, 0, nu_char, np_char}, {'s', 1, 0, NULL}, {'p', 1, 0, NULL}, - {'h', 2, 0, bu_int, bp_int}, + {'h', 2, 0, bu_short, bp_int}, {'H', 2, 0, bu_uint, bp_uint}, {'i', 4, 0, bu_int, bp_int}, {'I', 4, 0, bu_uint, bp_uint}, @@ -1027,18 +1049,37 @@ static formatdef bigendian_table[] = { /* Little-endian routines. *****************************************************/ static PyObject * +lu_short(_structmodulestate *state, const char *p, const formatdef *f) +{ + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 2. */ + assert(f->size == 2); + Py_ssize_t i = 2; + const unsigned char *bytes = (const unsigned char *)p; + do { + x = (x<<8) | bytes[--i]; + } while (i > 0); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000U) - 0x8000U; + return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x); +} + +static PyObject * lu_int(_structmodulestate *state, const char *p, const formatdef *f) { - long x = 0; - Py_ssize_t i = f->size; + unsigned long x = 0; + + /* This function is only ever used in the case f->size == 4. */ + assert(f->size == 4); + Py_ssize_t i = 4; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG > f->size) - x |= -(x & (1L << ((8 * f->size) - 1))); - return PyLong_FromLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x80000000U) - 0x80000000U; + return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x); } static PyObject * @@ -1056,16 +1097,19 @@ lu_uint(_structmodulestate *state, const char *p, const formatdef *f) static PyObject * lu_longlong(_structmodulestate *state, const char *p, const formatdef *f) { - long long x = 0; - Py_ssize_t i = f->size; + unsigned long long x = 0; + + /* This function is only ever used in the case f->size == 8. */ + assert(f->size == 8); + Py_ssize_t i = 8; const unsigned char *bytes = (const unsigned char *)p; do { x = (x<<8) | bytes[--i]; } while (i > 0); - /* Extend the sign bit. */ - if (SIZEOF_LONG_LONG > f->size) - x |= -(x & ((long long)1 << ((8 * f->size) - 1))); - return PyLong_FromLongLong(x); + /* Extend sign, avoiding implementation-defined or undefined behaviour. */ + x = (x ^ 0x8000000000000000U) - 0x8000000000000000U; + return PyLong_FromLongLong( + x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x); } static PyObject * @@ -1213,7 +1257,7 @@ static formatdef lilendian_table[] = { {'c', 1, 0, nu_char, np_char}, {'s', 1, 0, NULL}, {'p', 1, 0, NULL}, - {'h', 2, 0, lu_int, lp_int}, + {'h', 2, 0, lu_short, lp_int}, {'H', 2, 0, lu_uint, lp_uint}, {'i', 4, 0, lu_int, lp_int}, {'I', 4, 0, lu_uint, lp_uint}, -- cgit v0.12