diff options
-rw-r--r-- | Doc/library/lzma.rst | 37 | ||||
-rw-r--r-- | Lib/test/test_lzma.py | 91 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Modules/_lzmamodule.c | 218 | ||||
-rw-r--r-- | Modules/clinic/_lzmamodule.c.h | 37 |
5 files changed, 327 insertions, 59 deletions
diff --git a/Doc/library/lzma.rst b/Doc/library/lzma.rst index b71051d..99f07dc 100644 --- a/Doc/library/lzma.rst +++ b/Doc/library/lzma.rst @@ -221,13 +221,32 @@ Compressing and decompressing data in memory decompress a multi-stream input with :class:`LZMADecompressor`, you must create a new decompressor for each stream. - .. method:: decompress(data) + .. method:: decompress(data, max_length=-1) - Decompress *data* (a :class:`bytes` object), returning a :class:`bytes` - object containing the decompressed data for at least part of the input. - Some of *data* may be buffered internally, for use in later calls to - :meth:`decompress`. The returned data should be concatenated with the - output of any previous calls to :meth:`decompress`. + Decompress *data* (a :term:`bytes-like object`), returning + uncompressed data as bytes. Some of *data* may be buffered + internally, for use in later calls to :meth:`decompress`. The + returned data should be concatenated with the output of any + previous calls to :meth:`decompress`. + + If *max_length* is nonnegative, returns at most *max_length* + bytes of decompressed data. If this limit is reached and further + output can be produced, the :attr:`~.needs_input` attribute will + be set to ``False``. In this case, the next call to + :meth:`~.decompress` may provide *data* as ``b''`` to obtain + more of the output. + + If all of the input data was decompressed and returned (either + because this was less than *max_length* bytes, or because + *max_length* was negative), the :attr:`~.needs_input` attribute + will be set to ``True``. + + Attempting to decompress data after the end of stream is reached + raises an `EOFError`. Any data found after the end of the + stream is ignored and saved in the :attr:`~.unused_data` attribute. + + .. versionchanged:: 3.5 + Added the *max_length* parameter. .. attribute:: check @@ -245,6 +264,12 @@ Compressing and decompressing data in memory Before the end of the stream is reached, this will be ``b""``. + .. attribute:: needs_input + + ``False`` if the :meth:`.decompress` method can provide more + decompressed data before requiring new uncompressed input. + + .. versionadded:: 3.5 .. function:: compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None) diff --git a/Lib/test/test_lzma.py b/Lib/test/test_lzma.py index 07fadbd..cded28c 100644 --- a/Lib/test/test_lzma.py +++ b/Lib/test/test_lzma.py @@ -135,6 +135,97 @@ class CompressorDecompressorTestCase(unittest.TestCase): self.assertTrue(lzd.eof) self.assertEqual(lzd.unused_data, b"") + def test_decompressor_chunks_maxsize(self): + lzd = LZMADecompressor() + max_length = 100 + out = [] + + # Feed first half the input + len_ = len(COMPRESSED_XZ) // 2 + out.append(lzd.decompress(COMPRESSED_XZ[:len_], + max_length=max_length)) + self.assertFalse(lzd.needs_input) + self.assertEqual(len(out[-1]), max_length) + + # Retrieve more data without providing more input + out.append(lzd.decompress(b'', max_length=max_length)) + self.assertFalse(lzd.needs_input) + self.assertEqual(len(out[-1]), max_length) + + # Retrieve more data while providing more input + out.append(lzd.decompress(COMPRESSED_XZ[len_:], + max_length=max_length)) + self.assertLessEqual(len(out[-1]), max_length) + + # Retrieve remaining uncompressed data + while not lzd.eof: + out.append(lzd.decompress(b'', max_length=max_length)) + self.assertLessEqual(len(out[-1]), max_length) + + out = b"".join(out) + self.assertEqual(out, INPUT) + self.assertEqual(lzd.check, lzma.CHECK_CRC64) + self.assertEqual(lzd.unused_data, b"") + + def test_decompressor_inputbuf_1(self): + # Test reusing input buffer after moving existing + # contents to beginning + lzd = LZMADecompressor() + out = [] + + # Create input buffer and fill it + self.assertEqual(lzd.decompress(COMPRESSED_XZ[:100], + max_length=0), b'') + + # Retrieve some results, freeing capacity at beginning + # of input buffer + out.append(lzd.decompress(b'', 2)) + + # Add more data that fits into input buffer after + # moving existing data to beginning + out.append(lzd.decompress(COMPRESSED_XZ[100:105], 15)) + + # Decompress rest of data + out.append(lzd.decompress(COMPRESSED_XZ[105:])) + self.assertEqual(b''.join(out), INPUT) + + def test_decompressor_inputbuf_2(self): + # Test reusing input buffer by appending data at the + # end right away + lzd = LZMADecompressor() + out = [] + + # Create input buffer and empty it + self.assertEqual(lzd.decompress(COMPRESSED_XZ[:200], + max_length=0), b'') + out.append(lzd.decompress(b'')) + + # Fill buffer with new data + out.append(lzd.decompress(COMPRESSED_XZ[200:280], 2)) + + # Append some more data, not enough to require resize + out.append(lzd.decompress(COMPRESSED_XZ[280:300], 2)) + + # Decompress rest of data + out.append(lzd.decompress(COMPRESSED_XZ[300:])) + self.assertEqual(b''.join(out), INPUT) + + def test_decompressor_inputbuf_3(self): + # Test reusing input buffer after extending it + + lzd = LZMADecompressor() + out = [] + + # Create almost full input buffer + out.append(lzd.decompress(COMPRESSED_XZ[:200], 5)) + + # Add even more data to it, requiring resize + out.append(lzd.decompress(COMPRESSED_XZ[200:300], 5)) + + # Decompress rest of data + out.append(lzd.decompress(COMPRESSED_XZ[300:])) + self.assertEqual(b''.join(out), INPUT) + def test_decompressor_unused_data(self): lzd = LZMADecompressor() extra = b"fooblibar" @@ -203,6 +203,9 @@ Core and Builtins Library ------- +- Issue #15955: Add an option to limit output size when decompressing LZMA + data. Patch by Nikolaus Rath and Martin Panter. + - Issue #23250: In the http.cookies module, capitalize "HttpOnly" and "Secure" as they are written in the standard. diff --git a/Modules/_lzmamodule.c b/Modules/_lzmamodule.c index dfb95fa..fb15233 100644 --- a/Modules/_lzmamodule.c +++ b/Modules/_lzmamodule.c @@ -66,6 +66,9 @@ typedef struct { int check; char eof; PyObject *unused_data; + char needs_input; + uint8_t *input_buffer; + size_t input_buffer_size; #ifdef WITH_THREAD PyThread_type_lock lock; #endif @@ -142,10 +145,15 @@ PyLzma_Free(void *opaque, void *ptr) #endif static int -grow_buffer(PyObject **buf) +grow_buffer(PyObject **buf, Py_ssize_t max_length) { - size_t size = PyBytes_GET_SIZE(*buf); - return _PyBytes_Resize(buf, size + (size >> 3) + 6); + Py_ssize_t size = PyBytes_GET_SIZE(*buf); + Py_ssize_t newsize = size + (size >> 3) + 6; + + if (max_length > 0 && newsize > max_length) + newsize = max_length; + + return _PyBytes_Resize(buf, newsize); } @@ -504,7 +512,7 @@ class lzma_filter_converter(CConverter): static PyObject * compress(Compressor *c, uint8_t *data, size_t len, lzma_action action) { - size_t data_size = 0; + Py_ssize_t data_size = 0; PyObject *result; result = PyBytes_FromStringAndSize(NULL, INITIAL_BUFFER_SIZE); @@ -527,13 +535,13 @@ compress(Compressor *c, uint8_t *data, size_t len, lzma_action action) (action == LZMA_FINISH && lzret == LZMA_STREAM_END)) { break; } else if (c->lzs.avail_out == 0) { - if (grow_buffer(&result) == -1) + if (grow_buffer(&result, -1) == -1) goto error; c->lzs.next_out = (uint8_t *)PyBytes_AS_STRING(result) + data_size; c->lzs.avail_out = PyBytes_GET_SIZE(result) - data_size; } } - if (data_size != (size_t)PyBytes_GET_SIZE(result)) + if (data_size != PyBytes_GET_SIZE(result)) if (_PyBytes_Resize(&result, data_size) == -1) goto error; return result; @@ -888,25 +896,33 @@ static PyTypeObject Compressor_type = { /* LZMADecompressor class. */ -static PyObject * -decompress(Decompressor *d, uint8_t *data, size_t len) +/* Decompress data of length d->lzs.avail_in in d->lzs.next_in. The output + buffer is allocated dynamically and returned. At most max_length bytes are + returned, so some of the input may not be consumed. d->lzs.next_in and + d->lzs.avail_in are updated to reflect the consumed input. */ +static PyObject* +decompress_buf(Decompressor *d, Py_ssize_t max_length) { - size_t data_size = 0; + Py_ssize_t data_size = 0; PyObject *result; - - result = PyBytes_FromStringAndSize(NULL, INITIAL_BUFFER_SIZE); + lzma_stream *lzs = &d->lzs; + + if (max_length < 0 || max_length >= INITIAL_BUFFER_SIZE) + result = PyBytes_FromStringAndSize(NULL, INITIAL_BUFFER_SIZE); + else + result = PyBytes_FromStringAndSize(NULL, max_length); if (result == NULL) return NULL; - d->lzs.next_in = data; - d->lzs.avail_in = len; - d->lzs.next_out = (uint8_t *)PyBytes_AS_STRING(result); - d->lzs.avail_out = PyBytes_GET_SIZE(result); + + lzs->next_out = (uint8_t *)PyBytes_AS_STRING(result); + lzs->avail_out = PyBytes_GET_SIZE(result); + for (;;) { lzma_ret lzret; Py_BEGIN_ALLOW_THREADS - lzret = lzma_code(&d->lzs, LZMA_RUN); - data_size = (char *)d->lzs.next_out - PyBytes_AS_STRING(result); + lzret = lzma_code(lzs, LZMA_RUN); + data_size = (char *)lzs->next_out - PyBytes_AS_STRING(result); Py_END_ALLOW_THREADS if (catch_lzma_error(lzret)) goto error; @@ -914,26 +930,131 @@ decompress(Decompressor *d, uint8_t *data, size_t len) d->check = lzma_get_check(&d->lzs); if (lzret == LZMA_STREAM_END) { d->eof = 1; - if (d->lzs.avail_in > 0) { - Py_CLEAR(d->unused_data); - d->unused_data = PyBytes_FromStringAndSize( - (char *)d->lzs.next_in, d->lzs.avail_in); - if (d->unused_data == NULL) - goto error; - } break; - } else if (d->lzs.avail_in == 0) { + } else if (lzs->avail_in == 0) { break; - } else if (d->lzs.avail_out == 0) { - if (grow_buffer(&result) == -1) + } else if (lzs->avail_out == 0) { + if (data_size == max_length) + break; + if (grow_buffer(&result, max_length) == -1) goto error; - d->lzs.next_out = (uint8_t *)PyBytes_AS_STRING(result) + data_size; - d->lzs.avail_out = PyBytes_GET_SIZE(result) - data_size; + lzs->next_out = (uint8_t *)PyBytes_AS_STRING(result) + data_size; + lzs->avail_out = PyBytes_GET_SIZE(result) - data_size; } } - if (data_size != (size_t)PyBytes_GET_SIZE(result)) + if (data_size != PyBytes_GET_SIZE(result)) if (_PyBytes_Resize(&result, data_size) == -1) goto error; + + return result; + +error: + Py_XDECREF(result); + return NULL; +} + +static PyObject * +decompress(Decompressor *d, uint8_t *data, size_t len, Py_ssize_t max_length) +{ + char input_buffer_in_use; + PyObject *result; + lzma_stream *lzs = &d->lzs; + + /* Prepend unconsumed input if necessary */ + if (lzs->next_in != NULL) { + size_t avail_now, avail_total; + + /* Number of bytes we can append to input buffer */ + avail_now = (d->input_buffer + d->input_buffer_size) + - (lzs->next_in + lzs->avail_in); + + /* Number of bytes we can append if we move existing + contents to beginning of buffer (overwriting + consumed input) */ + avail_total = d->input_buffer_size - lzs->avail_in; + + if (avail_total < len) { + size_t offset = lzs->next_in - d->input_buffer; + uint8_t *tmp; + size_t new_size = d->input_buffer_size + len - avail_now; + + /* Assign to temporary variable first, so we don't + lose address of allocated buffer if realloc fails */ + tmp = PyMem_Realloc(d->input_buffer, new_size); + if (tmp == NULL) { + PyErr_SetNone(PyExc_MemoryError); + return NULL; + } + d->input_buffer = tmp; + d->input_buffer_size = new_size; + + lzs->next_in = d->input_buffer + offset; + } + else if (avail_now < len) { + memmove(d->input_buffer, lzs->next_in, + lzs->avail_in); + lzs->next_in = d->input_buffer; + } + memcpy((void*)(lzs->next_in + lzs->avail_in), data, len); + lzs->avail_in += len; + input_buffer_in_use = 1; + } + else { + lzs->next_in = data; + lzs->avail_in = len; + input_buffer_in_use = 0; + } + + result = decompress_buf(d, max_length); + if(result == NULL) + return NULL; + + if (d->eof) { + d->needs_input = 0; + if (lzs->avail_in > 0) { + Py_CLEAR(d->unused_data); + d->unused_data = PyBytes_FromStringAndSize( + (char *)lzs->next_in, lzs->avail_in); + if (d->unused_data == NULL) + goto error; + } + } + else if (lzs->avail_in == 0) { + lzs->next_in = NULL; + d->needs_input = 1; + } + else { + d->needs_input = 0; + + /* If we did not use the input buffer, we now have + to copy the tail from the caller's buffer into the + input buffer */ + if (!input_buffer_in_use) { + + /* Discard buffer if it's too small + (resizing it may needlessly copy the current contents) */ + if (d->input_buffer != NULL && + d->input_buffer_size < lzs->avail_in) { + PyMem_Free(d->input_buffer); + d->input_buffer = NULL; + } + + /* Allocate if necessary */ + if (d->input_buffer == NULL) { + d->input_buffer = PyMem_Malloc(lzs->avail_in); + if (d->input_buffer == NULL) { + PyErr_SetNone(PyExc_MemoryError); + goto error; + } + d->input_buffer_size = lzs->avail_in; + } + + /* Copy tail */ + memcpy(d->input_buffer, lzs->next_in, lzs->avail_in); + lzs->next_in = d->input_buffer; + } + } + return result; error: @@ -946,20 +1067,27 @@ _lzma.LZMADecompressor.decompress self: self(type="Decompressor *") data: Py_buffer - / + max_length: Py_ssize_t=-1 -Provide data to the decompressor object. +Decompress *data*, returning uncompressed data as bytes. -Returns a chunk of decompressed data if possible, or b'' otherwise. +If *max_length* is nonnegative, returns at most *max_length* bytes of +decompressed data. If this limit is reached and further output can be +produced, *self.needs_input* will be set to ``False``. In this case, the next +call to *decompress()* may provide *data* as b'' to obtain more of the output. -Attempting to decompress data after the end of stream is reached -raises an EOFError. Any data found after the end of the stream -is ignored and saved in the unused_data attribute. +If all of the input data was decompressed and returned (either because this +was less than *max_length* bytes, or because *max_length* was negative), +*self.needs_input* will be set to True. + +Attempting to decompress data after the end of stream is reached raises an +EOFError. Any data found after the end of the stream is ignored and saved in +the unused_data attribute. [clinic start generated code]*/ static PyObject * -_lzma_LZMADecompressor_decompress_impl(Decompressor *self, Py_buffer *data) -/*[clinic end generated code: output=d86e78da7ff0ff21 input=50c4768b821bf0ef]*/ +_lzma_LZMADecompressor_decompress_impl(Decompressor *self, Py_buffer *data, Py_ssize_t max_length) +/*[clinic end generated code: output=1532a5bb23629001 input=262e4e217f49039b]*/ { PyObject *result = NULL; @@ -967,7 +1095,7 @@ _lzma_LZMADecompressor_decompress_impl(Decompressor *self, Py_buffer *data) if (self->eof) PyErr_SetString(PyExc_EOFError, "Already at end of stream"); else - result = decompress(self, data->buf, data->len); + result = decompress(self, data->buf, data->len, max_length); RELEASE_LOCK(self); return result; } @@ -1055,6 +1183,7 @@ _lzma_LZMADecompressor___init___impl(Decompressor *self, int format, PyObject *m self->alloc.alloc = PyLzma_Malloc; self->alloc.free = PyLzma_Free; self->lzs.allocator = &self->alloc; + self->lzs.next_in = NULL; #ifdef WITH_THREAD self->lock = PyThread_allocate_lock(); @@ -1065,6 +1194,9 @@ _lzma_LZMADecompressor___init___impl(Decompressor *self, int format, PyObject *m #endif self->check = LZMA_CHECK_UNKNOWN; + self->needs_input = 1; + self->input_buffer = NULL; + self->input_buffer_size = 0; self->unused_data = PyBytes_FromStringAndSize(NULL, 0); if (self->unused_data == NULL) goto error; @@ -1113,6 +1245,9 @@ error: static void Decompressor_dealloc(Decompressor *self) { + if(self->input_buffer != NULL) + PyMem_Free(self->input_buffer); + lzma_end(&self->lzs); Py_CLEAR(self->unused_data); #ifdef WITH_THREAD @@ -1134,6 +1269,9 @@ PyDoc_STRVAR(Decompressor_check_doc, PyDoc_STRVAR(Decompressor_eof_doc, "True if the end-of-stream marker has been reached."); +PyDoc_STRVAR(Decompressor_needs_input_doc, +"True if more input is needed before more decompressed data can be produced."); + PyDoc_STRVAR(Decompressor_unused_data_doc, "Data found after the end of the compressed stream."); @@ -1142,6 +1280,8 @@ static PyMemberDef Decompressor_members[] = { Decompressor_check_doc}, {"eof", T_BOOL, offsetof(Decompressor, eof), READONLY, Decompressor_eof_doc}, + {"needs_input", T_BOOL, offsetof(Decompressor, needs_input), READONLY, + Decompressor_needs_input_doc}, {"unused_data", T_OBJECT_EX, offsetof(Decompressor, unused_data), READONLY, Decompressor_unused_data_doc}, {NULL} diff --git a/Modules/clinic/_lzmamodule.c.h b/Modules/clinic/_lzmamodule.c.h index c1ad882..a46a152 100644 --- a/Modules/clinic/_lzmamodule.c.h +++ b/Modules/clinic/_lzmamodule.c.h @@ -62,34 +62,43 @@ _lzma_LZMACompressor_flush(Compressor *self, PyObject *Py_UNUSED(ignored)) } PyDoc_STRVAR(_lzma_LZMADecompressor_decompress__doc__, -"decompress($self, data, /)\n" +"decompress($self, /, data, max_length=-1)\n" "--\n" "\n" -"Provide data to the decompressor object.\n" +"Decompresses *data*, returning uncompressed data as bytes.\n" "\n" -"Returns a chunk of decompressed data if possible, or b\'\' otherwise.\n" +"If *max_length* is nonnegative, returns at most *max_length* bytes of\n" +"decompressed data. If this limit is reached and further output can be\n" +"produced, *self.needs_input* will be set to ``False``. In this case, the next\n" +"call to *decompress()* may provide *data* as b\'\' to obtain more of the output.\n" "\n" -"Attempting to decompress data after the end of stream is reached\n" -"raises an EOFError. Any data found after the end of the stream\n" -"is ignored and saved in the unused_data attribute."); +"If all of the input data was decompressed and returned (either because this\n" +"was less than *max_length* bytes, or because *max_length* was negative),\n" +"*self.needs_input* will be set to True.\n" +"\n" +"Attempting to decompress data after the end of stream is reached raises an\n" +"EOFError. Any data found after the end of the stream is ignored and saved in\n" +"the unused_data attribute."); #define _LZMA_LZMADECOMPRESSOR_DECOMPRESS_METHODDEF \ - {"decompress", (PyCFunction)_lzma_LZMADecompressor_decompress, METH_VARARGS, _lzma_LZMADecompressor_decompress__doc__}, + {"decompress", (PyCFunction)_lzma_LZMADecompressor_decompress, METH_VARARGS|METH_KEYWORDS, _lzma_LZMADecompressor_decompress__doc__}, static PyObject * -_lzma_LZMADecompressor_decompress_impl(Decompressor *self, Py_buffer *data); +_lzma_LZMADecompressor_decompress_impl(Decompressor *self, Py_buffer *data, Py_ssize_t max_length); static PyObject * -_lzma_LZMADecompressor_decompress(Decompressor *self, PyObject *args) +_lzma_LZMADecompressor_decompress(Decompressor *self, PyObject *args, PyObject *kwargs) { PyObject *return_value = NULL; + static char *_keywords[] = {"data", "max_length", NULL}; Py_buffer data = {NULL, NULL}; + Py_ssize_t max_length = -1; - if (!PyArg_ParseTuple(args, - "y*:decompress", - &data)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, + "y*|n:decompress", _keywords, + &data, &max_length)) goto exit; - return_value = _lzma_LZMADecompressor_decompress_impl(self, &data); + return_value = _lzma_LZMADecompressor_decompress_impl(self, &data, max_length); exit: /* Cleanup for data */ @@ -242,4 +251,4 @@ exit: return return_value; } -/*[clinic end generated code: output=808fec8216ac712b input=a9049054013a1b77]*/ +/*[clinic end generated code: output=d17fac38b09626d8 input=a9049054013a1b77]*/ |