author    | Ruben Vorderman <r.h.p.vorderman@lumc.nl> | 2022-10-17 02:10:58 (GMT)
committer | GitHub <noreply@github.com> | 2022-10-17 02:10:58 (GMT)
commit    | eae7dad40255bad42e4abce53ff8143dcbc66af5 (patch)
tree      | 7cea56066a6db7c451712f8375034c2d8b8914f4 /Modules/clinic/zlibmodule.c.h
parent    | bb38b39b339191c5fc001c8fbfbc3037c13bc7bb (diff)
gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:
+ There is now a `gzip.READ_BUFFER_SIZE` constant of 128 KiB. Other programs, such as pigz and cat, also read in 128 KiB chunks, so this appears to be common practice among well-optimized tools. It is also faster than the previous 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. It is the `_bz2.BZ2Decompressor` ported to zlib. Since the existing `zlib.Decompress` object is better suited to in-memory decompression, `_ZlibDecompressor` is kept private: it only makes sense for file decompression, which the gzip library already implements, so there is no need to expose it to users.
+ `ZlibDecompressor` uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file object; this is now handled by `ZlibDecompressor` itself, which keeps an internal buffer (see the sketch after this list). `_add_read_data` has been inlined, as it was just two calls.
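
To make the read path concrete, here is a minimal sketch of the buffered-read loop described above. It uses the public `bz2.BZ2Decompressor`, which exposes the same `needs_input`/`eof`/`unused_data` protocol that the new `ZlibDecompressor` is ported from; it is not the actual `GzipFile.read` code, and the helper name and chunk size are illustrative only.

```python
import bz2

CHUNK_SIZE = 128 * 1024  # same 128 KiB read size the gzip module now uses

def iter_decompressed(fileobj):
    """Yield decompressed blocks from a file object containing one bz2 stream."""
    decomp = bz2.BZ2Decompressor()
    while not decomp.eof:
        if decomp.needs_input:
            # The internal buffer is drained; fetch more compressed bytes.
            chunk = fileobj.read(CHUNK_SIZE)
            if not chunk:
                raise EOFError("stream ended before end-of-stream marker")
        else:
            # Output is still buffered internally; pass b"" to drain it.
            chunk = b""
        yield decomp.decompress(chunk, CHUNK_SIZE)
    # Anything found after the end of the stream (e.g. the next gzip member)
    # stays in decomp.unused_data, so no unconsumed_tail bookkeeping is needed.
```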
EDIT: While I am adding improvements anyway, I figured I could include another one-line optimization in the `python -m gzip` application. It previously read chunks of `io.DEFAULT_BUFFER_SIZE`, and has now been updated to use `READ_BUFFER_SIZE` chunks instead.
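
For illustration, that change amounts to a chunked copy loop like the following (a sketch, assuming the `gzip.READ_BUFFER_SIZE` constant introduced here; the function name and paths are hypothetical):

```python
import gzip

def gzip_compress_file(src_path: str, dst_path: str) -> None:
    """Compress src_path into dst_path, copying 128 KiB at a time."""
    with open(src_path, "rb") as src, gzip.open(dst_path, "wb") as dst:
        # READ_BUFFER_SIZE (128 * 1024) replaces io.DEFAULT_BUFFER_SIZE here.
        while chunk := src.read(gzip.READ_BUFFER_SIZE):
            dst.write(chunk)
```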
Diffstat (limited to 'Modules/clinic/zlibmodule.c.h')
-rw-r--r-- | Modules/clinic/zlibmodule.c.h | 100
1 file changed, 99 insertions(+), 1 deletion(-)
diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h
index a04b954..65412b2 100644
--- a/Modules/clinic/zlibmodule.c.h
+++ b/Modules/clinic/zlibmodule.c.h
@@ -897,6 +897,104 @@ exit:
     return return_value;
 }
 
+PyDoc_STRVAR(zlib_ZlibDecompressor_decompress__doc__,
+"decompress($self, /, data, max_length=-1)\n"
+"--\n"
+"\n"
+"Decompress *data*, returning uncompressed data as bytes.\n"
+"\n"
+"If *max_length* is nonnegative, returns at most *max_length* bytes of\n"
+"decompressed data. If this limit is reached and further output can be\n"
+"produced, *self.needs_input* will be set to ``False``. In this case, the next\n"
+"call to *decompress()* may provide *data* as b\'\' to obtain more of the output.\n"
+"\n"
+"If all of the input data was decompressed and returned (either because this\n"
+"was less than *max_length* bytes, or because *max_length* was negative),\n"
+"*self.needs_input* will be set to True.\n"
+"\n"
+"Attempting to decompress data after the end of stream is reached raises an\n"
+"EOFError. Any data found after the end of the stream is ignored and saved in\n"
+"the unused_data attribute.");
+
+#define ZLIB_ZLIBDECOMPRESSOR_DECOMPRESS_METHODDEF    \
+    {"decompress", _PyCFunction_CAST(zlib_ZlibDecompressor_decompress), METH_FASTCALL|METH_KEYWORDS, zlib_ZlibDecompressor_decompress__doc__},
+
+static PyObject *
+zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self,
+                                      Py_buffer *data, Py_ssize_t max_length);
+
+static PyObject *
+zlib_ZlibDecompressor_decompress(ZlibDecompressor *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 2
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_item = { &_Py_ID(data), &_Py_ID(max_length), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"data", "max_length", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "decompress",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[2];
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
+    Py_buffer data = {NULL, NULL};
+    Py_ssize_t max_length = -1;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 2, 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) {
+        goto exit;
+    }
+    if (!PyBuffer_IsContiguous(&data, 'C')) {
+        _PyArg_BadArgument("decompress", "argument 'data'", "contiguous buffer", args[0]);
+        goto exit;
+    }
+    if (!noptargs) {
+        goto skip_optional_pos;
+    }
+    {
+        Py_ssize_t ival = -1;
+        PyObject *iobj = _PyNumber_Index(args[1]);
+        if (iobj != NULL) {
+            ival = PyLong_AsSsize_t(iobj);
+            Py_DECREF(iobj);
+        }
+        if (ival == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        max_length = ival;
+    }
+skip_optional_pos:
+    return_value = zlib_ZlibDecompressor_decompress_impl(self, &data, max_length);
+
+exit:
+    /* Cleanup for data */
+    if (data.obj) {
+        PyBuffer_Release(&data);
+    }
+
+    return return_value;
+}
+
 PyDoc_STRVAR(zlib_adler32__doc__,
 "adler32($module, data, value=1, /)\n"
 "--\n"
@@ -1031,4 +1129,4 @@ exit:
 #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
 #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
 #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
-/*[clinic end generated code: output=9e5f9911d0c273e1 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=57ff7b511ab23132 input=a9049054013a1b77]*/
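
The docstring in the hunk above spells out the `max_length` protocol. A rough Python-level illustration follows; it uses the internal `zlib._ZlibDecompressor` named in the commit message, which is private and may change, and it assumes the default constructor accepts zlib-wrapped data such as the output of `zlib.compress`:

```python
import zlib

payload = b"x" * 1_000_000
compressed = zlib.compress(payload)

decomp = zlib._ZlibDecompressor()      # internal class; not a public API
chunks = [decomp.decompress(compressed, 64 * 1024)]  # cap each call at 64 KiB

while not decomp.eof and not decomp.needs_input:
    # needs_input is False: more output is buffered; drain it by passing b"".
    chunks.append(decomp.decompress(b"", 64 * 1024))

assert b"".join(chunks) == payload
assert decomp.unused_data == b""       # nothing follows the end of the stream
```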