From fd8a838d582e3495004036bc82e3e6fee91fa0ba Mon Sep 17 00:00:00 2001 From: Nadeem Vawda Date: Thu, 21 Jun 2012 02:13:12 +0200 Subject: Issue #14684: Add support for predefined compression dictionaries to the zlib module. Original patch by Sam Rushing. --- Doc/library/zlib.rst | 31 ++++++++++--- Lib/test/test_zlib.py | 30 +++++++++++++ Misc/NEWS | 3 ++ Modules/zlibmodule.c | 118 ++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 153 insertions(+), 29 deletions(-) diff --git a/Doc/library/zlib.rst b/Doc/library/zlib.rst index 1e9a2bc..705f734 100644 --- a/Doc/library/zlib.rst +++ b/Doc/library/zlib.rst @@ -58,12 +58,19 @@ The available exception and functions in this module are: exception if any error occurs. -.. function:: compressobj([level]) +.. function:: compressobj([level[, method[, wbits[, memlevel[, strategy[, zdict]]]]]]) Returns a compression object, to be used for compressing data streams that won't - fit into memory at once. *level* is an integer from ``1`` to ``9`` controlling - the level of compression; ``1`` is fastest and produces the least compression, - ``9`` is slowest and produces the most. The default value is ``6``. + fit into memory at once. + + *level* is an integer from ``1`` to ``9`` controlling the level of + compression; ``1`` is fastest and produces the least compression, ``9`` is + slowest and produces the most. The default value is ``6``. + + *zdict* is a predefined compression dictionary. This is a sequence of bytes + (such as a :class:`bytes` object) containing subsequences that are expected + to occur frequently in the data that is to be compressed. Those subsequences + that are expected to be most common should come at the end of the dictionary. .. function:: crc32(data[, value]) @@ -114,11 +121,21 @@ The available exception and functions in this module are: to :c:func:`malloc`. The default size is 16384. -.. function:: decompressobj([wbits]) +.. function:: decompressobj([wbits[, zdict]]) Returns a decompression object, to be used for decompressing data streams that - won't fit into memory at once. The *wbits* parameter controls the size of the - window buffer. + won't fit into memory at once. + + The *wbits* parameter controls the size of the window buffer. + + The *zdict* parameter specifies a predefined compression dictionary. If + provided, this must be the same dictionary as was used by the compressor that + produced the data that is to be decompressed. + +.. note:: + If *zdict* is a mutable object (such as a :class:`bytearray`), you must not + modify its contents between the call to :func:`decompressobj` and the first + call to the decompressor's ``decompress()`` method. Compression objects support the following methods: diff --git a/Lib/test/test_zlib.py b/Lib/test/test_zlib.py index 3c982c6..904ac4d 100644 --- a/Lib/test/test_zlib.py +++ b/Lib/test/test_zlib.py @@ -425,6 +425,36 @@ class CompressObjectTestCase(BaseCompressTestCase, unittest.TestCase): dco = zlib.decompressobj() self.assertEqual(dco.flush(), b"") # Returns nothing + def test_dictionary(self): + h = HAMLET_SCENE + # build a simulated dictionary out of the words in HAMLET + words = h.split() + random.shuffle(words) + zdict = b''.join(words) + # use it to compress HAMLET + co = zlib.compressobj(zdict=zdict) + cd = co.compress(h) + co.flush() + # verify that it will decompress with the dictionary + dco = zlib.decompressobj(zdict=zdict) + self.assertEqual(dco.decompress(cd) + dco.flush(), h) + # verify that it fails when not given the dictionary + dco = zlib.decompressobj() + self.assertRaises(zlib.error, dco.decompress, cd) + + def test_dictionary_streaming(self): + # this is simulating the needs of SPDY to be able to reuse the same + # stream object (with its compression state) between sets of compressed + # headers. + co = zlib.compressobj(zdict=HAMLET_SCENE) + do = zlib.decompressobj(zdict=HAMLET_SCENE) + piece = HAMLET_SCENE[1000:1500] + d0 = co.compress(piece) + co.flush(zlib.Z_SYNC_FLUSH) + d1 = co.compress(piece[100:]) + co.flush(zlib.Z_SYNC_FLUSH) + d2 = co.compress(piece[:-100]) + co.flush(zlib.Z_SYNC_FLUSH) + self.assertEqual(do.decompress(d0), piece) + self.assertEqual(do.decompress(d1), piece[100:]) + self.assertEqual(do.decompress(d2), piece[:-100]) + def test_decompress_incomplete_stream(self): # This is 'foo', deflated x = b'x\x9cK\xcb\xcf\x07\x00\x02\x82\x01E' diff --git a/Misc/NEWS b/Misc/NEWS index 2f9236f..c919c69 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -34,6 +34,9 @@ Core and Builtins Library ------- +- Issue #14684: zlib.compressobj() and zlib.decompressobj() now support the use + of predefined compression dictionaries. Original patch by Sam Rushing. + - Fix GzipFile's handling of filenames given as bytes objects. - Issue #14772: Return destination values from some shutil functions. diff --git a/Modules/zlibmodule.c b/Modules/zlibmodule.c index a6da056..102740b 100644 --- a/Modules/zlibmodule.c +++ b/Modules/zlibmodule.c @@ -45,6 +45,7 @@ typedef struct PyObject *unconsumed_tail; char eof; int is_initialised; + PyObject *zdict; #ifdef WITH_THREAD PyThread_type_lock lock; #endif @@ -80,14 +81,21 @@ zlib_error(z_stream zst, int err, char *msg) } PyDoc_STRVAR(compressobj__doc__, -"compressobj([level]) -- Return a compressor object.\n" +"compressobj([level[, method[, wbits[, memlevel[, strategy[, zdict]]]]]])\n" +" -- Return a compressor object.\n" "\n" -"Optional arg level is the compression level, in 1-9."); +"Optional arg level is the compression level, in 1-9.\n" +"\n" +"Optional arg zdict is the predefined compression dictionary - a sequence of\n" +"bytes containing subsequences that are likely to occur in the input data."); PyDoc_STRVAR(decompressobj__doc__, -"decompressobj([wbits]) -- Return a decompressor object.\n" +"decompressobj([wbits[, zdict]]) -- Return a decompressor object.\n" +"\n" +"Optional arg wbits is the window buffer size.\n" "\n" -"Optional arg wbits is the window buffer size."); +"Optional arg zdict is the predefined compression dictionary. This must be\n" +"the same dictionary as used by the compressor that produced the input data."); static compobject * newcompobject(PyTypeObject *type) @@ -98,6 +106,7 @@ newcompobject(PyTypeObject *type) return NULL; self->eof = 0; self->is_initialised = 0; + self->zdict = NULL; self->unused_data = PyBytes_FromStringAndSize("", 0); if (self->unused_data == NULL) { Py_DECREF(self); @@ -316,19 +325,24 @@ PyZlib_decompress(PyObject *self, PyObject *args) } static PyObject * -PyZlib_compressobj(PyObject *selfptr, PyObject *args) +PyZlib_compressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs) { compobject *self; int level=Z_DEFAULT_COMPRESSION, method=DEFLATED; int wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=0, err; - - if (!PyArg_ParseTuple(args, "|iiiii:compressobj", &level, &method, &wbits, - &memLevel, &strategy)) + Py_buffer zdict; + static char *kwlist[] = {"level", "method", "wbits", + "memLevel", "strategy", "zdict", NULL}; + + zdict.buf = NULL; /* Sentinel, so we can tell whether zdict was supplied. */ + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iiiiiy*:compressobj", + kwlist, &level, &method, &wbits, + &memLevel, &strategy, &zdict)) return NULL; self = newcompobject(&Comptype); if (self==NULL) - return(NULL); + goto error; self->zst.zalloc = (alloc_func)NULL; self->zst.zfree = (free_func)Z_NULL; self->zst.next_in = NULL; @@ -337,30 +351,58 @@ PyZlib_compressobj(PyObject *selfptr, PyObject *args) switch(err) { case (Z_OK): self->is_initialised = 1; - return (PyObject*)self; + if (zdict.buf == NULL) { + goto success; + } else { + err = deflateSetDictionary(&self->zst, zdict.buf, zdict.len); + switch (err) { + case (Z_OK): + goto success; + case (Z_STREAM_ERROR): + PyErr_SetString(PyExc_ValueError, "Invalid dictionary"); + goto error; + default: + PyErr_SetString(PyExc_ValueError, "deflateSetDictionary()"); + goto error; + } + } case (Z_MEM_ERROR): - Py_DECREF(self); PyErr_SetString(PyExc_MemoryError, "Can't allocate memory for compression object"); - return NULL; + goto error; case(Z_STREAM_ERROR): - Py_DECREF(self); PyErr_SetString(PyExc_ValueError, "Invalid initialization option"); - return NULL; + goto error; default: zlib_error(self->zst, err, "while creating compression object"); - Py_DECREF(self); - return NULL; + goto error; } + + error: + Py_XDECREF(self); + self = NULL; + success: + if (zdict.buf != NULL) + PyBuffer_Release(&zdict); + return (PyObject*)self; } static PyObject * -PyZlib_decompressobj(PyObject *selfptr, PyObject *args) +PyZlib_decompressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs) { + static char *kwlist[] = {"wbits", "zdict", NULL}; int wbits=DEF_WBITS, err; compobject *self; - if (!PyArg_ParseTuple(args, "|i:decompressobj", &wbits)) + PyObject *zdict=NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO:decompressobj", + kwlist, &wbits, &zdict)) + return NULL; + if (zdict != NULL && !PyObject_CheckBuffer(zdict)) { + PyErr_SetString(PyExc_TypeError, + "zdict argument must support the buffer protocol"); return NULL; + } self = newcompobject(&Decomptype); if (self == NULL) @@ -369,6 +411,10 @@ PyZlib_decompressobj(PyObject *selfptr, PyObject *args) self->zst.zfree = (free_func)Z_NULL; self->zst.next_in = NULL; self->zst.avail_in = 0; + if (zdict != NULL) { + Py_INCREF(zdict); + self->zdict = zdict; + } err = inflateInit2(&self->zst, wbits); switch(err) { case (Z_OK): @@ -398,6 +444,7 @@ Dealloc(compobject *self) #endif Py_XDECREF(self->unused_data); Py_XDECREF(self->unconsumed_tail); + Py_XDECREF(self->zdict); PyObject_Del(self); } @@ -557,6 +604,27 @@ PyZlib_objdecompress(compobject *self, PyObject *args) err = inflate(&(self->zst), Z_SYNC_FLUSH); Py_END_ALLOW_THREADS + if (err == Z_NEED_DICT && self->zdict != NULL) { + Py_buffer zdict_buf; + if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) { + Py_DECREF(RetVal); + RetVal = NULL; + goto error; + } + err = inflateSetDictionary(&(self->zst), zdict_buf.buf, zdict_buf.len); + PyBuffer_Release(&zdict_buf); + if (err != Z_OK) { + zlib_error(self->zst, err, "while decompressing data"); + Py_DECREF(RetVal); + RetVal = NULL; + goto error; + } + /* repeat the call to inflate! */ + Py_BEGIN_ALLOW_THREADS + err = inflate(&(self->zst), Z_SYNC_FLUSH); + Py_END_ALLOW_THREADS + } + /* While Z_OK and the output buffer is full, there might be more output. So extend the output buffer and try again. */ @@ -770,10 +838,13 @@ PyZlib_copy(compobject *self) } Py_INCREF(self->unused_data); Py_INCREF(self->unconsumed_tail); + Py_XINCREF(self->zdict); Py_XDECREF(retval->unused_data); Py_XDECREF(retval->unconsumed_tail); + Py_XDECREF(retval->zdict); retval->unused_data = self->unused_data; retval->unconsumed_tail = self->unconsumed_tail; + retval->zdict = self->zdict; retval->eof = self->eof; /* Mark it as being initialized */ @@ -822,10 +893,13 @@ PyZlib_uncopy(compobject *self) Py_INCREF(self->unused_data); Py_INCREF(self->unconsumed_tail); + Py_XINCREF(self->zdict); Py_XDECREF(retval->unused_data); Py_XDECREF(retval->unconsumed_tail); + Py_XDECREF(retval->zdict); retval->unused_data = self->unused_data; retval->unconsumed_tail = self->unconsumed_tail; + retval->zdict = self->zdict; retval->eof = self->eof; /* Mark it as being initialized */ @@ -1032,13 +1106,13 @@ static PyMethodDef zlib_methods[] = adler32__doc__}, {"compress", (PyCFunction)PyZlib_compress, METH_VARARGS, compress__doc__}, - {"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS, + {"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS|METH_KEYWORDS, compressobj__doc__}, {"crc32", (PyCFunction)PyZlib_crc32, METH_VARARGS, crc32__doc__}, {"decompress", (PyCFunction)PyZlib_decompress, METH_VARARGS, decompress__doc__}, - {"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS, + {"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS|METH_KEYWORDS, decompressobj__doc__}, {NULL, NULL} }; @@ -1112,10 +1186,10 @@ PyDoc_STRVAR(zlib_module_documentation, "\n" "adler32(string[, start]) -- Compute an Adler-32 checksum.\n" "compress(string[, level]) -- Compress string, with compression level in 1-9.\n" -"compressobj([level]) -- Return a compressor object.\n" +"compressobj([level[, ...]]) -- Return a compressor object.\n" "crc32(string[, start]) -- Compute a CRC-32 checksum.\n" "decompress(string,[wbits],[bufsize]) -- Decompresses a compressed string.\n" -"decompressobj([wbits]) -- Return a decompressor object.\n" +"decompressobj([wbits[, zdict]]]) -- Return a decompressor object.\n" "\n" "'wbits' is window buffer size.\n" "Compressor objects support compress() and flush() methods; decompressor\n" -- cgit v0.12