author    | Ruben Vorderman <r.h.p.vorderman@lumc.nl> | 2022-10-17 02:10:58 (GMT)
committer | GitHub <noreply@github.com> | 2022-10-17 02:10:58 (GMT)
commit    | eae7dad40255bad42e4abce53ff8143dcbc66af5 (patch)
tree      | 7cea56066a6db7c451712f8375034c2d8b8914f4 /Modules/clinic/zlibmodule.c.h
parent    | bb38b39b339191c5fc001c8fbfbc3037c13bc7bb (diff)
gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:
+ There is now a `gzip.READ_BUFFER_SIZE` constant of 128 KiB. Other programs, such as pigz and cat, also read in 128 KiB chunks, so this appears to be common practice among well-optimized tools. It is also faster than the previous 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. It is the `_bz2.BZ2Decompressor` ported to zlib. Since the existing `zlib.Decompress` object is better suited to in-memory decompression, `_ZlibDecompressor` is kept private: it only makes sense for file decompression, which the gzip library already implements, so there is no need to expose it to users.
+ `ZlibDecompressor` uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file object; this is now handled by `ZlibDecompressor` itself, which keeps an internal buffer (see the sketch after this list). `_add_read_data` has been inlined, as it was just two calls.
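
To make the read path concrete, here is a minimal sketch of the buffered-read loop described above. It uses the public `bz2.BZ2Decompressor`, which exposes the same `needs_input`/`eof`/`unused_data` protocol that the new `ZlibDecompressor` is ported from; it is not the actual `GzipFile.read` code, and the helper name and chunk size are illustrative only.

```python
import bz2

CHUNK_SIZE = 128 * 1024  # same 128 KiB read size the gzip module now uses

def iter_decompressed(fileobj):
    """Yield decompressed blocks from a file object containing one bz2 stream."""
    decomp = bz2.BZ2Decompressor()
    while not decomp.eof:
        if decomp.needs_input:
            # The internal buffer is drained; fetch more compressed bytes.
            chunk = fileobj.read(CHUNK_SIZE)
            if not chunk:
                raise EOFError("stream ended before end-of-stream marker")
        else:
            # Output is still buffered internally; pass b"" to drain it.
            chunk = b""
        yield decomp.decompress(chunk, CHUNK_SIZE)
    # Anything found after the end of the stream (e.g. the next gzip member)
    # stays in decomp.unused_data, so no unconsumed_tail bookkeeping is needed.
```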
EDIT: While I am adding improvements anyway, I figured I could include another one-line optimization in the `python -m gzip` application. It previously read chunks of `io.DEFAULT_BUFFER_SIZE`, and has now been updated to use `READ_BUFFER_SIZE` chunks instead.
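
For illustration, that change amounts to a chunked copy loop like the following (a sketch, assuming the `gzip.READ_BUFFER_SIZE` constant introduced here; the function name and paths are hypothetical):

```python
import gzip

def gzip_compress_file(src_path: str, dst_path: str) -> None:
    """Compress src_path into dst_path, copying 128 KiB at a time."""
    with open(src_path, "rb") as src, gzip.open(dst_path, "wb") as dst:
        # READ_BUFFER_SIZE (128 * 1024) replaces io.DEFAULT_BUFFER_SIZE here.
        while chunk := src.read(gzip.READ_BUFFER_SIZE):
            dst.write(chunk)
```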
Diffstat (limited to 'Modules/clinic/zlibmodule.c.h')
-rw-r--r-- | Modules/clinic/zlibmodule.c.h | 100
1 file changed, 99 insertions(+), 1 deletion(-)
diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h
index a04b954..65412b2 100644
--- a/Modules/clinic/zlibmodule.c.h
+++ b/Modules/clinic/zlibmodule.c.h
@@ -897,6 +897,104 @@ exit:
     return return_value;
 }
 
+PyDoc_STRVAR(zlib_ZlibDecompressor_decompress__doc__,
+"decompress($self, /, data, max_length=-1)\n"
+"--\n"
+"\n"
+"Decompress *data*, returning uncompressed data as bytes.\n"
+"\n"
+"If *max_length* is nonnegative, returns at most *max_length* bytes of\n"
+"decompressed data. If this limit is reached and further output can be\n"
+"produced, *self.needs_input* will be set to ``False``. In this case, the next\n"
+"call to *decompress()* may provide *data* as b\'\' to obtain more of the output.\n"
+"\n"
+"If all of the input data was decompressed and returned (either because this\n"
+"was less than *max_length* bytes, or because *max_length* was negative),\n"
+"*self.needs_input* will be set to True.\n"
+"\n"
+"Attempting to decompress data after the end of stream is reached raises an\n"
+"EOFError. Any data found after the end of the stream is ignored and saved in\n"
+"the unused_data attribute.");
+
+#define ZLIB_ZLIBDECOMPRESSOR_DECOMPRESS_METHODDEF    \
+    {"decompress", _PyCFunction_CAST(zlib_ZlibDecompressor_decompress), METH_FASTCALL|METH_KEYWORDS, zlib_ZlibDecompressor_decompress__doc__},
+
+static PyObject *
+zlib_ZlibDecompressor_decompress_impl(ZlibDecompressor *self,
+                                      Py_buffer *data, Py_ssize_t max_length);
+
+static PyObject *
+zlib_ZlibDecompressor_decompress(ZlibDecompressor *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 2
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_item = { &_Py_ID(data), &_Py_ID(max_length), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"data", "max_length", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "decompress",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[2];
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
+    Py_buffer data = {NULL, NULL};
+    Py_ssize_t max_length = -1;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 2, 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) {
+        goto exit;
+    }
+    if (!PyBuffer_IsContiguous(&data, 'C')) {
+        _PyArg_BadArgument("decompress", "argument 'data'", "contiguous buffer", args[0]);
+        goto exit;
+    }
+    if (!noptargs) {
+        goto skip_optional_pos;
+    }
+    {
+        Py_ssize_t ival = -1;
+        PyObject *iobj = _PyNumber_Index(args[1]);
+        if (iobj != NULL) {
+            ival = PyLong_AsSsize_t(iobj);
+            Py_DECREF(iobj);
+        }
+        if (ival == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        max_length = ival;
+    }
+skip_optional_pos:
+    return_value = zlib_ZlibDecompressor_decompress_impl(self, &data, max_length);
+
+exit:
+    /* Cleanup for data */
+    if (data.obj) {
+        PyBuffer_Release(&data);
+    }
+
+    return return_value;
+}
+
 PyDoc_STRVAR(zlib_adler32__doc__,
 "adler32($module, data, value=1, /)\n"
 "--\n"
@@ -1031,4 +1129,4 @@ exit:
 #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
 #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
 #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
-/*[clinic end generated code: output=9e5f9911d0c273e1 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=57ff7b511ab23132 input=a9049054013a1b77]*/
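
The docstring in the hunk above spells out the `max_length` protocol. A rough Python-level illustration follows; it uses the internal `zlib._ZlibDecompressor` named in the commit message, which is private and may change, and it assumes the default constructor accepts zlib-wrapped data such as the output of `zlib.compress`:

```python
import zlib

payload = b"x" * 1_000_000
compressed = zlib.compress(payload)

decomp = zlib._ZlibDecompressor()      # internal class; not a public API
chunks = [decomp.decompress(compressed, 64 * 1024)]  # cap each call at 64 KiB

while not decomp.eof and not decomp.needs_input:
    # needs_input is False: more output is buffered; drain it by passing b"".
    chunks.append(decomp.decompress(b"", 64 * 1024))

assert b"".join(chunks) == payload
assert decomp.unused_data == b""       # nothing follows the end of the stream
```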