Issue #25626: Change zlib to accept Py_ssize_t and cap to UINT_MAX

The underlying zlib library stores sizes in “unsigned int”. The corresponding Python parameters are all sizes of buffers filled in by zlib, so it is okay to reduce higher values to the UINT_MAX internal cap. OverflowError is still raised for sizes that do not fit in Py_ssize_t. Sizes are now limited to Py_ssize_t rather than unsigned long, because Python byte strings cannot be larger than Py_ssize_t. Previously this could result in a SystemError on 32-bit platforms. This resolves a regression in the gzip module when reading more than UINT_MAX or LONG_MAX bytes in one call, introduced by revision 62723172412c.
author: Martin Panter <vadmium+py@gmail.com> 2015-11-20 08:13:35 (GMT)
committer: Martin Panter <vadmium+py@gmail.com> 2015-11-20 08:13:35 (GMT)
commit: e99e97762cc75ad94056275ddcae9c84d63a3412 (patch)
tree: f16040197bf0122ce6aded03018b4ee6fee03679
parent: d13cade3817cf452b7c98e9df96b15c6da68201a (diff)
download: cpython-e99e97762cc75ad94056275ddcae9c84d63a3412.zip
cpython-e99e97762cc75ad94056275ddcae9c84d63a3412.tar.gz
cpython-e99e97762cc75ad94056275ddcae9c84d63a3412.tar.bz2
7 files changed, 121 insertions, 48 deletions
diff --git a/Doc/howto/clinic.rst b/Doc/howto/clinic.rst
index 7524c4a..b04edea 100644
--- a/Doc/howto/clinic.rst
+++ b/Doc/howto/clinic.rst
@@ -1249,18 +1249,18 @@ Here's the simplest example of a custom converter, from ``Modules/zlibmodule.c``
 
     /*[python input]
 
-    class uint_converter(CConverter):
+    class capped_uint_converter(CConverter):
         type = 'unsigned int'
-        converter = 'uint_converter'
+        converter = 'capped_uint_converter'
 
     [python start generated code]*/
-    /*[python end generated code: checksum=da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
+    /*[python end generated code: output=da39a3ee5e6b4b0d input=35521e4e733823c7]*/
 
-This block adds a converter to Argument Clinic named ``uint``.  Parameters
-declared as ``uint`` will be declared as type ``unsigned int``, and will
-be parsed by the ``'O&'`` format unit, which will call the ``uint_converter``
-converter function.
-``uint`` variables automatically support default values.
+This block adds a converter to Argument Clinic named ``capped_uint``.  Parameters
+declared as ``capped_uint`` will be declared as type ``unsigned int``, and will
+be parsed by the ``'O&'`` format unit, which will call the
+``capped_uint_converter`` converter function.  ``capped_uint`` variables
+automatically support default values.
 
 More sophisticated custom converters can insert custom C code to
 handle initialization and cleanup.
diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py
index 359d6dd..dfb4c25 100644
--- a/Lib/test/support/__init__.py
+++ b/Lib/test/support/__init__.py
@@ -1608,12 +1608,15 @@ class _MemoryWatchdog:
 def bigmemtest(size, memuse, dry_run=True):
     """Decorator for bigmem tests.
 
-    'minsize' is the minimum useful size for the test (in arbitrary,
-    test-interpreted units.) 'memuse' is the number of 'bytes per size' for
-    the test, or a good estimate of it.
-
-    if 'dry_run' is False, it means the test doesn't support dummy runs
-    when -M is not specified.
+    'size' is a requested size for the test (in arbitrary, test-interpreted
+    units.) 'memuse' is the number of bytes per unit for the test, or a good
+    estimate of it. For example, a test that needs two byte buffers, of 4 GiB
+    each, could be decorated with @bigmemtest(size=_4G, memuse=2).
+
+    The 'size' argument is normally passed to the decorated test method as an
+    extra argument. If 'dry_run' is true, the value passed to the test method
+    may be less than the requested value. If 'dry_run' is false, it means the
+    test doesn't support dummy runs when -M is not specified.
     """
     def decorator(f):
         def wrapper(self):
diff --git a/Lib/test/test_gzip.py b/Lib/test/test_gzip.py
index d8408e1..3c51673 100644
--- a/Lib/test/test_gzip.py
+++ b/Lib/test/test_gzip.py
@@ -3,6 +3,7 @@
 
 import unittest
 from test import support
+from test.support import bigmemtest, _4G
 import os
 import io
 import struct
@@ -116,6 +117,14 @@ class TestGzip(BaseTest):
                 self.assertEqual(f.tell(), nread)
         self.assertEqual(b''.join(blocks), data1 * 50)
 
+    @bigmemtest(size=_4G, memuse=1)
+    def test_read_large(self, size):
+        # Read chunk size over UINT_MAX should be supported, despite zlib's
+        # limitation per low-level call
+        compressed = gzip.compress(data1, compresslevel=1)
+        f = gzip.GzipFile(fileobj=io.BytesIO(compressed), mode='rb')
+        self.assertEqual(f.read(size), data1)
+
     def test_io_on_closed_object(self):
         # Test that I/O operations on closed GzipFile objects raise a
         # ValueError, just like the corresponding functions on file objects.
diff --git a/Lib/test/test_zlib.py b/Lib/test/test_zlib.py
index 7154e13..88c415b 100644
--- a/Lib/test/test_zlib.py
+++ b/Lib/test/test_zlib.py
@@ -122,11 +122,17 @@ class ExceptionTestCase(unittest.TestCase):
         self.assertRaises(ValueError, zlib.decompressobj().flush, 0)
         self.assertRaises(ValueError, zlib.decompressobj().flush, -1)
 
+    @support.cpython_only
+    def test_overflow(self):
+        with self.assertRaisesRegex(OverflowError, 'int too large'):
+            zlib.decompress(b'', 15, sys.maxsize + 1)
+        with self.assertRaisesRegex(OverflowError, 'int too large'):
+            zlib.decompressobj().flush(sys.maxsize + 1)
+
 
 class BaseCompressTestCase(object):
     def check_big_compress_buffer(self, size, compress_func):
         _1M = 1024 * 1024
-        fmt = "%%0%dx" % (2 * _1M)
         # Generate 10MB worth of random, and expand it by repeating it.
         # The assumption is that zlib's memory is not big enough to exploit
         # such spread out redundancy.
@@ -196,6 +202,18 @@ class CompressTestCase(BaseCompressTestCase, unittest.TestCase):
         finally:
             data = None
 
+    @bigmemtest(size=_4G, memuse=1)
+    def test_large_bufsize(self, size):
+        # Test decompress(bufsize) parameter greater than the internal limit
+        data = HAMLET_SCENE * 10
+        compressed = zlib.compress(data, 1)
+        self.assertEqual(zlib.decompress(compressed, 15, size), data)
+
+    def test_custom_bufsize(self):
+        data = HAMLET_SCENE * 10
+        compressed = zlib.compress(data, 1)
+        self.assertEqual(zlib.decompress(compressed, 15, CustomInt()), data)
+
 
 class CompressObjectTestCase(BaseCompressTestCase, unittest.TestCase):
     # Test compression object
@@ -364,6 +382,21 @@ class CompressObjectTestCase(BaseCompressTestCase, unittest.TestCase):
         self.assertRaises(ValueError, dco.decompress, b"", -1)
         self.assertEqual(b'', dco.unconsumed_tail)
 
+    def test_maxlen_large(self):
+        # Sizes up to sys.maxsize should be accepted, although zlib is
+        # internally limited to expressing sizes with unsigned int
+        data = HAMLET_SCENE * 10
+        self.assertGreater(len(data), zlib.DEF_BUF_SIZE)
+        compressed = zlib.compress(data, 1)
+        dco = zlib.decompressobj()
+        self.assertEqual(dco.decompress(compressed, sys.maxsize), data)
+
+    def test_maxlen_custom(self):
+        data = HAMLET_SCENE * 10
+        compressed = zlib.compress(data, 1)
+        dco = zlib.decompressobj()
+        self.assertEqual(dco.decompress(compressed, CustomInt()), data[:100])
+
     def test_clear_unconsumed_tail(self):
         # Issue #12050: calling decompress() without providing max_length
         # should clear the unconsumed_tail attribute.
@@ -537,6 +570,22 @@ class CompressObjectTestCase(BaseCompressTestCase, unittest.TestCase):
         data = zlib.compress(input2)
         self.assertEqual(dco.flush(), input1[1:])
 
+    @bigmemtest(size=_4G, memuse=1)
+    def test_flush_large_length(self, size):
+        # Test flush(length) parameter greater than internal limit UINT_MAX
+        input = HAMLET_SCENE * 10
+        data = zlib.compress(input, 1)
+        dco = zlib.decompressobj()
+        dco.decompress(data, 1)
+        self.assertEqual(dco.flush(size), input[1:])
+
+    def test_flush_custom_length(self):
+        input = HAMLET_SCENE * 10
+        data = zlib.compress(input, 1)
+        dco = zlib.decompressobj()
+        dco.decompress(data, 1)
+        self.assertEqual(dco.flush(CustomInt()), input[1:])
+
     @requires_Compress_copy
     def test_compresscopy(self):
         # Test copying a compression object
@@ -725,5 +774,10 @@ LAERTES
 """
 
 
+class CustomInt:
+    def __int__(self):
+        return 100
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 60806d1..9f11892 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -77,6 +77,13 @@ Core and Builtins
 Library
 -------
 
+- Issue #25626: Change three zlib functions to accept sizes that fit in
+  Py_ssize_t, but internally cap those sizes to UINT_MAX.  This resolves a
+  regression in 3.5 where GzipFile.read() failed to read chunks larger than 2
+  or 4 GiB.  The change affects the zlib.Decompress.decompress() max_length
+  parameter, the zlib.decompress() bufsize parameter, and the
+  zlib.Decompress.flush() length parameter.
+
 - Issue #25583: Avoid incorrect errors raised by os.makedirs(exist_ok=True)
   when the OS gives priority to errors such as EACCES over EEXIST.
 
diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h
index 35661a5..c5cdf42 100644
--- a/Modules/clinic/zlibmodule.c.h
+++ b/Modules/clinic/zlibmodule.c.h
@@ -68,7 +68,7 @@ zlib_decompress(PyModuleDef *module, PyObject *args)
     unsigned int bufsize = DEF_BUF_SIZE;
 
     if (!PyArg_ParseTuple(args, "y*|iO&:decompress",
-        &data, &wbits, uint_converter, &bufsize))
+        &data, &wbits, capped_uint_converter, &bufsize))
         goto exit;
     return_value = zlib_decompress_impl(module, &data, wbits, bufsize);
 
@@ -242,7 +242,7 @@ zlib_Decompress_decompress(compobject *self, PyObject *args)
     unsigned int max_length = 0;
 
     if (!PyArg_ParseTuple(args, "y*|O&:decompress",
-        &data, uint_converter, &max_length))
+        &data, capped_uint_converter, &max_length))
         goto exit;
     return_value = zlib_Decompress_decompress_impl(self, &data, max_length);
 
@@ -353,7 +353,7 @@ zlib_Decompress_flush(compobject *self, PyObject *args)
     unsigned int length = DEF_BUF_SIZE;
 
     if (!PyArg_ParseTuple(args, "|O&:flush",
-        uint_converter, &length))
+        capped_uint_converter, &length))
         goto exit;
     return_value = zlib_Decompress_flush_impl(self, length);
 
@@ -438,4 +438,4 @@ exit:
 #ifndef ZLIB_COMPRESS_COPY_METHODDEF
     #define ZLIB_COMPRESS_COPY_METHODDEF
 #endif /* !defined(ZLIB_COMPRESS_COPY_METHODDEF) */
-/*[clinic end generated code: output=56ed1147bbbb4788 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=7734aec079550bc8 input=a9049054013a1b77]*/
diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c
index 1997b40..37307be 100644
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -226,42 +226,42 @@ zlib_compress_impl(PyModuleDef *module, Py_buffer *bytes, int level)
 
 /*[python input]
 
-class uint_converter(CConverter):
+class capped_uint_converter(CConverter):
     type = 'unsigned int'
-    converter = 'uint_converter'
+    converter = 'capped_uint_converter'
     c_ignored_default = "0"
 
 [python start generated code]*/
-/*[python end generated code: output=da39a3ee5e6b4b0d input=22263855f7a3ebfd]*/
+/*[python end generated code: output=da39a3ee5e6b4b0d input=35521e4e733823c7]*/
 
 static int
-uint_converter(PyObject *obj, void *ptr)
+capped_uint_converter(PyObject *obj, void *ptr)
 {
-    long val;
-    unsigned long uval;
+    PyObject *long_obj;
+    Py_ssize_t val;
 
-    val = PyLong_AsLong(obj);
-    if (val == -1 && PyErr_Occurred()) {
-        uval = PyLong_AsUnsignedLong(obj);
-        if (uval == (unsigned long)-1 && PyErr_Occurred())
-            return 0;
+    long_obj = (PyObject *)_PyLong_FromNbInt(obj);
+    if (long_obj == NULL) {
+        return 0;
     }
-    else {
-        if (val < 0) {
-            PyErr_SetString(PyExc_ValueError,
-                            "value must be positive");
-            return 0;
-        }
-        uval = (unsigned long)val;
+    val = PyLong_AsSsize_t(long_obj);
+    Py_DECREF(long_obj);
+    if (val == -1 && PyErr_Occurred()) {
+        return 0;
     }
-
-    if (uval > UINT_MAX) {
-        PyErr_SetString(PyExc_OverflowError,
-                        "Python int too large for C unsigned int");
+    if (val < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "value must be positive");
         return 0;
     }
 
-    *(unsigned int *)ptr = Py_SAFE_DOWNCAST(uval, unsigned long, unsigned int);
+    if ((size_t)val > UINT_MAX) {
+        *(unsigned int *)ptr = UINT_MAX;
+    }
+    else {
+        *(unsigned int *)ptr = Py_SAFE_DOWNCAST(val, Py_ssize_t,
+            unsigned int);
+    }
     return 1;
 }
 
@@ -272,7 +272,7 @@ zlib.decompress
         Compressed data.
     wbits: int(c_default="MAX_WBITS") = MAX_WBITS
         The window buffer size.
-    bufsize: uint(c_default="DEF_BUF_SIZE") = DEF_BUF_SIZE
+    bufsize: capped_uint(c_default="DEF_BUF_SIZE") = DEF_BUF_SIZE
         The initial output buffer size.
     /
 
@@ -282,7 +282,7 @@ Returns a bytes object containing the uncompressed data.
 static PyObject *
 zlib_decompress_impl(PyModuleDef *module, Py_buffer *data, int wbits,
                      unsigned int bufsize)
-/*[clinic end generated code: output=444d0987f3429574 input=0f4b9abb7103f50e]*/
+/*[clinic end generated code: output=444d0987f3429574 input=da095118b3243b27]*/
 {
     PyObject *result_str = NULL;
     Byte *input;
@@ -691,7 +691,7 @@ zlib.Decompress.decompress
 
     data: Py_buffer
         The binary data to decompress.
-    max_length: uint = 0
+    max_length: capped_uint = 0
         The maximum allowable length of the decompressed data.
         Unconsumed input data will be stored in
         the unconsumed_tail attribute.
@@ -707,7 +707,7 @@ Call the flush() method to clear these buffers.
 static PyObject *
 zlib_Decompress_decompress_impl(compobject *self, Py_buffer *data,
                                 unsigned int max_length)
-/*[clinic end generated code: output=b82e2a2c19f5fe7b input=02cfc047377cec86]*/
+/*[clinic end generated code: output=b82e2a2c19f5fe7b input=68b6508ab07c2cf0]*/
 {
     int err;
     unsigned int old_length, length = DEF_BUF_SIZE;
@@ -1048,7 +1048,7 @@ error:
 /*[clinic input]
 zlib.Decompress.flush
 
-    length: uint(c_default="DEF_BUF_SIZE") = zlib.DEF_BUF_SIZE
+    length: capped_uint(c_default="DEF_BUF_SIZE") = zlib.DEF_BUF_SIZE
         the initial size of the output buffer.
     /
 
@@ -1057,7 +1057,7 @@ Return a bytes object containing any remaining decompressed data.
 
 static PyObject *
 zlib_Decompress_flush_impl(compobject *self, unsigned int length)
-/*[clinic end generated code: output=db6fb753ab698e22 input=1580956505978993]*/
+/*[clinic end generated code: output=db6fb753ab698e22 input=1bb961eb21b62aa0]*/
 {
     int err;
     unsigned int new_length;
author	Martin Panter <vadmium+py@gmail.com>	2015-11-20 08:13:35 (GMT)
committer	Martin Panter <vadmium+py@gmail.com>	2015-11-20 08:13:35 (GMT)
commit	e99e97762cc75ad94056275ddcae9c84d63a3412 (patch)
tree	f16040197bf0122ce6aded03018b4ee6fee03679
parent	d13cade3817cf452b7c98e9df96b15c6da68201a (diff)
download	cpython-e99e97762cc75ad94056275ddcae9c84d63a3412.zip cpython-e99e97762cc75ad94056275ddcae9c84d63a3412.tar.gz cpython-e99e97762cc75ad94056275ddcae9c84d63a3412.tar.bz2