summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ruben Vorderman <r.h.p.vorderman@lumc.nl> 2021-09-02 15:02:59 (GMT)
committer GitHub <noreply@github.com> 2021-09-02 15:02:59 (GMT)
commit ea23e7820f02840368569db8082bd0ca4d59b62a (patch)
tree 44dcdd66cf7335a31a837d7e84a857e5c677e2b3
parent a7ef15aae8608560bffeeaba412c10e52cab07dd (diff)
download cpython-ea23e7820f02840368569db8082bd0ca4d59b62a.zip
cpython-ea23e7820f02840368569db8082bd0ca4d59b62a.tar.gz
cpython-ea23e7820f02840368569db8082bd0ca4d59b62a.tar.bz2
bpo-43613: Faster implementation of gzip.compress and gzip.decompress (GH-27941)
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
-rw-r--r-- Doc/library/gzip.rst 17
-rw-r--r-- Doc/library/zlib.rst 46
-rw-r--r-- Lib/gzip.py 161
-rw-r--r-- Lib/test/test_zlib.py 7
-rw-r--r-- Misc/NEWS.d/next/Library/2021-03-24-09-40-02.bpo-43612.vMGZ4y.rst 5
-rw-r--r-- Misc/NEWS.d/next/Library/2021-08-25-10-28-49.bpo-43613.WkYmI0.rst 3
-rw-r--r-- Modules/clinic/zlibmodule.c.h 32
-rw-r--r-- Modules/zlibmodule.c 9
8 files changed, 193 insertions, 87 deletions
diff --git a/Doc/library/gzip.rst b/Doc/library/gzip.rst
index 33c4067..8cea264 100644
--- a/Doc/library/gzip.rst
+++ b/Doc/library/gzip.rst
@@ -174,19 +174,30 @@ The module defines the following items:
Compress the *data*, returning a :class:`bytes` object containing
the compressed data. *compresslevel* and *mtime* have the same meaning as in
- the :class:`GzipFile` constructor above.
+ the :class:`GzipFile` constructor above. When *mtime* is set to ``0``, this
+ function is equivalent to :func:`zlib.compress` with *wbits* set to ``31``.
+ The zlib function is faster.
.. versionadded:: 3.2
.. versionchanged:: 3.8
Added the *mtime* parameter for reproducible output.
+ .. versionchanged:: 3.11
+ Speed is improved by compressing all data at once instead of in a
+ streamed fashion. Calls with *mtime* set to ``0`` are delegated to
+ :func:`zlib.compress` for better speed.
.. function:: decompress(data)
Decompress the *data*, returning a :class:`bytes` object containing the
- uncompressed data.
+ uncompressed data. This function is capable of decompressing multi-member
+ gzip data (multiple gzip blocks concatenated together). When the data is
+ certain to contain only one member, the :func:`zlib.decompress` function with
+ *wbits* set to ``31`` is faster.
.. versionadded:: 3.2
-
+ .. versionchanged:: 3.11
+ Speed is improved by decompressing members at once in memory instead of in
+ a streamed fashion.
.. _gzip-usage-examples:
diff --git a/Doc/library/zlib.rst b/Doc/library/zlib.rst
index ec60ea2..793c90f 100644
--- a/Doc/library/zlib.rst
+++ b/Doc/library/zlib.rst
@@ -47,7 +47,7 @@ The available exception and functions in this module are:
platforms, use ``adler32(data) & 0xffffffff``.
-.. function:: compress(data, /, level=-1)
+.. function:: compress(data, /, level=-1, wbits=MAX_WBITS)
Compresses the bytes in *data*, returning a bytes object containing compressed data.
*level* is an integer from ``0`` to ``9`` or ``-1`` controlling the level of compression;
@@ -55,11 +55,35 @@ The available exception and functions in this module are:
is slowest and produces the most. ``0`` (Z_NO_COMPRESSION) is no compression.
The default value is ``-1`` (Z_DEFAULT_COMPRESSION). Z_DEFAULT_COMPRESSION represents a default
compromise between speed and compression (currently equivalent to level 6).
+
+ .. _compress-wbits:
+
+ The *wbits* argument controls the size of the history buffer (or the
+ "window size") used when compressing data, and whether a header and
+ trailer is included in the output. It can take several ranges of values,
+ defaulting to ``15`` (MAX_WBITS):
+
+ * +9 to +15: The base-two logarithm of the window size, which
+ therefore ranges between 512 and 32768. Larger values produce
+ better compression at the expense of greater memory usage. The
+ resulting output will include a zlib-specific header and trailer.
+
+ * −9 to −15: Uses the absolute value of *wbits* as the
+ window size logarithm, while producing a raw output stream with no
+ header or trailing checksum.
+
+ * +25 to +31 = 16 + (9 to 15): Uses the low 4 bits of the value as the
+ window size logarithm, while including a basic :program:`gzip` header
+ and trailing checksum in the output.
+
Raises the :exc:`error` exception if any error occurs.
.. versionchanged:: 3.6
*level* can now be used as a keyword parameter.
+ .. versionchanged:: 3.11
+ The *wbits* parameter is now available to set window bits and
+ compression type.
.. function:: compressobj(level=-1, method=DEFLATED, wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=Z_DEFAULT_STRATEGY[, zdict])
@@ -76,23 +100,9 @@ The available exception and functions in this module are:
*method* is the compression algorithm. Currently, the only supported value is
:const:`DEFLATED`.
- The *wbits* argument controls the size of the history buffer (or the
- "window size") used when compressing data, and whether a header and
- trailer is included in the output. It can take several ranges of values,
- defaulting to ``15`` (MAX_WBITS):
-
- * +9 to +15: The base-two logarithm of the window size, which
- therefore ranges between 512 and 32768. Larger values produce
- better compression at the expense of greater memory usage. The
- resulting output will include a zlib-specific header and trailer.
-
- * −9 to −15: Uses the absolute value of *wbits* as the
- window size logarithm, while producing a raw output stream with no
- header or trailing checksum.
-
- * +25 to +31 = 16 + (9 to 15): Uses the low 4 bits of the value as the
- window size logarithm, while including a basic :program:`gzip` header
- and trailing checksum in the output.
+ The *wbits* parameter controls the size of the history buffer (or the
+ "window size"), and what header and trailer format will be used. It has
+ the same meaning as `described for compress() <#compress-wbits>`__.
The *memLevel* argument controls the amount of memory used for the
internal compression state. Valid values range from ``1`` to ``9``.
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 3d837b7..0dddb51 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -403,6 +403,59 @@ class GzipFile(_compression.BaseStream):
return self._buffer.__iter__()
+def _read_exact(fp, n):
+ '''Read exactly *n* bytes from `fp`
+
+ This method is required because fp may be unbuffered,
+ i.e. return short reads.
+ '''
+ data = fp.read(n)
+ while len(data) < n:
+ b = fp.read(n - len(data))
+ if not b:
+ raise EOFError("Compressed file ended before the "
+ "end-of-stream marker was reached")
+ data += b
+ return data
+
+
+def _read_gzip_header(fp):
+ '''Read a gzip header from `fp` and progress to the end of the header.
+
+ Returns last mtime if header was present or None otherwise.
+ '''
+ magic = fp.read(2)
+ if magic == b'':
+ return None
+
+ if magic != b'\037\213':
+ raise BadGzipFile('Not a gzipped file (%r)' % magic)
+
+ (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
+ if method != 8:
+ raise BadGzipFile('Unknown compression method')
+
+ if flag & FEXTRA:
+ # Read & discard the extra field, if present
+ extra_len, = struct.unpack("<H", _read_exact(fp, 2))
+ _read_exact(fp, extra_len)
+ if flag & FNAME:
+ # Read and discard a null-terminated string containing the filename
+ while True:
+ s = fp.read(1)
+ if not s or s==b'\000':
+ break
+ if flag & FCOMMENT:
+ # Read and discard a null-terminated string containing a comment
+ while True:
+ s = fp.read(1)
+ if not s or s==b'\000':
+ break
+ if flag & FHCRC:
+ _read_exact(fp, 2) # Read & discard the 16-bit header CRC
+ return last_mtime
+
+
class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib.decompressobj,
@@ -415,53 +468,11 @@ class _GzipReader(_compression.DecompressReader):
self._crc = zlib.crc32(b"")
self._stream_size = 0 # Decompressed size of unconcatenated stream
- def _read_exact(self, n):
- '''Read exactly *n* bytes from `self._fp`
-
- This method is required because self._fp may be unbuffered,
- i.e. return short reads.
- '''
-
- data = self._fp.read(n)
- while len(data) < n:
- b = self._fp.read(n - len(data))
- if not b:
- raise EOFError("Compressed file ended before the "
- "end-of-stream marker was reached")
- data += b
- return data
-
def _read_gzip_header(self):
- magic = self._fp.read(2)
- if magic == b'':
+ last_mtime = _read_gzip_header(self._fp)
+ if last_mtime is None:
return False
-
- if magic != b'\037\213':
- raise BadGzipFile('Not a gzipped file (%r)' % magic)
-
- (method, flag,
- self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
- if method != 8:
- raise BadGzipFile('Unknown compression method')
-
- if flag & FEXTRA:
- # Read & discard the extra field, if present
- extra_len, = struct.unpack("<H", self._read_exact(2))
- self._read_exact(extra_len)
- if flag & FNAME:
- # Read and discard a null-terminated string containing the filename
- while True:
- s = self._fp.read(1)
- if not s or s==b'\000':
- break
- if flag & FCOMMENT:
- # Read and discard a null-terminated string containing a comment
- while True:
- s = self._fp.read(1)
- if not s or s==b'\000':
- break
- if flag & FHCRC:
- self._read_exact(2) # Read & discard the 16-bit header CRC
+ self._last_mtime = last_mtime
return True
def read(self, size=-1):
@@ -524,7 +535,7 @@ class _GzipReader(_compression.DecompressReader):
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
- crc32, isize = struct.unpack("<II", self._read_exact(8))
+ crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
if crc32 != self._crc:
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
hex(self._crc)))
@@ -544,21 +555,65 @@ class _GzipReader(_compression.DecompressReader):
super()._rewind()
self._new_member = True
+
+def _create_simple_gzip_header(compresslevel: int,
+ mtime = None) -> bytes:
+ """
+ Write a simple gzip header with no extra fields.
+ :param compresslevel: Compresslevel used to determine the xfl bytes.
+ :param mtime: The mtime (must support conversion to a 32-bit integer).
+ :return: A bytes object representing the gzip header.
+ """
+ if mtime is None:
+ mtime = time.time()
+ if compresslevel == _COMPRESS_LEVEL_BEST:
+ xfl = 2
+ elif compresslevel == _COMPRESS_LEVEL_FAST:
+ xfl = 4
+ else:
+ xfl = 0
+ # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
+ # fields added to header), mtime, xfl and os (255 for unknown OS).
+ return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
+
+
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
"""Compress data in one shot and return the compressed string.
- Optional argument is the compression level, in range of 0-9.
+
+ compresslevel sets the compression level in range of 0-9.
+ mtime can be used to set the modification time. The modification time is
+ set to the current time by default.
"""
- buf = io.BytesIO()
- with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
- f.write(data)
- return buf.getvalue()
+ if mtime == 0:
+ # Use zlib as it creates the header with 0 mtime by default.
+ # This is faster and with less overhead.
+ return zlib.compress(data, level=compresslevel, wbits=31)
+ header = _create_simple_gzip_header(compresslevel, mtime)
+ trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
+ # Wbits=-15 creates a raw deflate block; honour the requested level.
+ return header + zlib.compress(data, level=compresslevel, wbits=-15) + trailer
+
def decompress(data):
"""Decompress a gzip compressed string in one shot.
Return the decompressed string.
"""
- with GzipFile(fileobj=io.BytesIO(data)) as f:
- return f.read()
+ decompressed_members = []
+ while True:
+ fp = io.BytesIO(data)
+ if _read_gzip_header(fp) is None:
+ return b"".join(decompressed_members)
+ # Raw deflate follows the header; a truncated member raises EOFError.
+ do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
+ decompressed = do.decompress(data[fp.tell():])
+ trailer = _read_exact(io.BytesIO(do.unused_data), 8)
+ crc, length = struct.unpack("<II", trailer)
+ if crc != zlib.crc32(decompressed):
+ raise BadGzipFile("CRC check failed")
+ if length != (len(decompressed) & 0xffffffff):
+ raise BadGzipFile("Incorrect length of data produced")
+ decompressed_members.append(decompressed)
+ data = do.unused_data[8:].lstrip(b"\x00")
def main():
diff --git a/Lib/test/test_zlib.py b/Lib/test/test_zlib.py
index cb06108..04fb4d9 100644
--- a/Lib/test/test_zlib.py
+++ b/Lib/test/test_zlib.py
@@ -831,6 +831,13 @@ class CompressObjectTestCase(BaseCompressTestCase, unittest.TestCase):
dco = zlib.decompressobj(32 + 15)
self.assertEqual(dco.decompress(gzip), HAMLET_SCENE)
+ for wbits in (-15, 15, 31):
+ with self.subTest(wbits=wbits):
+ expected = HAMLET_SCENE
+ actual = zlib.decompress(
+ zlib.compress(HAMLET_SCENE, wbits=wbits), wbits=wbits
+ )
+ self.assertEqual(expected, actual)
def choose_lines(source, number, seed=None, generator=random):
"""Return a list of number lines randomly chosen from the source"""
diff --git a/Misc/NEWS.d/next/Library/2021-03-24-09-40-02.bpo-43612.vMGZ4y.rst b/Misc/NEWS.d/next/Library/2021-03-24-09-40-02.bpo-43612.vMGZ4y.rst
new file mode 100644
index 0000000..e6fc88f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-03-24-09-40-02.bpo-43612.vMGZ4y.rst
@@ -0,0 +1,5 @@
+:func:`zlib.compress` now accepts a wbits parameter which allows users to
+compress data as a raw deflate block without zlib headers and trailers in
+one go. Previously this required instantiating a ``zlib.compressobj``. It
+also provides a faster alternative to ``gzip.compress`` when wbits=31 is
+used.
diff --git a/Misc/NEWS.d/next/Library/2021-08-25-10-28-49.bpo-43613.WkYmI0.rst b/Misc/NEWS.d/next/Library/2021-08-25-10-28-49.bpo-43613.WkYmI0.rst
new file mode 100644
index 0000000..d6af35c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-08-25-10-28-49.bpo-43613.WkYmI0.rst
@@ -0,0 +1,3 @@
+Improve the speed of :func:`gzip.compress` and :func:`gzip.decompress` by
+compressing and decompressing at once in memory instead of in a streamed
+fashion.
diff --git a/Modules/clinic/zlibmodule.c.h b/Modules/clinic/zlibmodule.c.h
index 14e955d..e2a5fcc 100644
--- a/Modules/clinic/zlibmodule.c.h
+++ b/Modules/clinic/zlibmodule.c.h
@@ -3,7 +3,7 @@ preserve
[clinic start generated code]*/
PyDoc_STRVAR(zlib_compress__doc__,
-"compress($module, data, /, level=Z_DEFAULT_COMPRESSION)\n"
+"compress($module, data, /, level=Z_DEFAULT_COMPRESSION, wbits=MAX_WBITS)\n"
"--\n"
"\n"
"Returns a bytes object containing compressed data.\n"
@@ -11,26 +11,29 @@ PyDoc_STRVAR(zlib_compress__doc__,
" data\n"
" Binary data to be compressed.\n"
" level\n"
-" Compression level, in 0-9 or -1.");
+" Compression level, in 0-9 or -1.\n"
+" wbits\n"
+" The window buffer size and container format.");
#define ZLIB_COMPRESS_METHODDEF \
{"compress", (PyCFunction)(void(*)(void))zlib_compress, METH_FASTCALL|METH_KEYWORDS, zlib_compress__doc__},
static PyObject *
-zlib_compress_impl(PyObject *module, Py_buffer *data, int level);
+zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits);
static PyObject *
zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
- static const char * const _keywords[] = {"", "level", NULL};
+ static const char * const _keywords[] = {"", "level", "wbits", NULL};
static _PyArg_Parser _parser = {NULL, _keywords, "compress", 0};
- PyObject *argsbuf[2];
+ PyObject *argsbuf[3];
Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
Py_buffer data = {NULL, NULL};
int level = Z_DEFAULT_COMPRESSION;
+ int wbits = MAX_WBITS;
- args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 2, 0, argsbuf);
+ args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 3, 0, argsbuf);
if (!args) {
goto exit;
}
@@ -44,12 +47,21 @@ zlib_compress(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
if (!noptargs) {
goto skip_optional_pos;
}
- level = _PyLong_AsInt(args[1]);
- if (level == -1 && PyErr_Occurred()) {
+ if (args[1]) {
+ level = _PyLong_AsInt(args[1]);
+ if (level == -1 && PyErr_Occurred()) {
+ goto exit;
+ }
+ if (!--noptargs) {
+ goto skip_optional_pos;
+ }
+ }
+ wbits = _PyLong_AsInt(args[2]);
+ if (wbits == -1 && PyErr_Occurred()) {
goto exit;
}
skip_optional_pos:
- return_value = zlib_compress_impl(module, &data, level);
+ return_value = zlib_compress_impl(module, &data, level, wbits);
exit:
/* Cleanup for data */
@@ -803,4 +815,4 @@ exit:
#ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
-/*[clinic end generated code: output=6736bae59fab268b input=a9049054013a1b77]*/
+/*[clinic end generated code: output=e3e8a6142ea045a7 input=a9049054013a1b77]*/
diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c
index 3efb24a..27a6d9a 100644
--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -310,13 +310,15 @@ zlib.compress
/
level: int(c_default="Z_DEFAULT_COMPRESSION") = Z_DEFAULT_COMPRESSION
Compression level, in 0-9 or -1.
+ wbits: int(c_default="MAX_WBITS") = MAX_WBITS
+ The window buffer size and container format.
Returns a bytes object containing compressed data.
[clinic start generated code]*/
static PyObject *
-zlib_compress_impl(PyObject *module, Py_buffer *data, int level)
-/*[clinic end generated code: output=d80906d73f6294c8 input=638d54b6315dbed3]*/
+zlib_compress_impl(PyObject *module, Py_buffer *data, int level, int wbits)
+/*[clinic end generated code: output=46bd152fadd66df2 input=c4d06ee5782a7e3f]*/
{
PyObject *RetVal;
int flush;
@@ -336,7 +338,8 @@ zlib_compress_impl(PyObject *module, Py_buffer *data, int level)
zst.zalloc = PyZlib_Malloc;
zst.zfree = PyZlib_Free;
zst.next_in = ibuf;
- int err = deflateInit(&zst, level);
+ int err = deflateInit2(&zst, level, DEFLATED, wbits, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
switch (err) {
case Z_OK: