diff options
author | Nadeem Vawda <nadeem.vawda@gmail.com> | 2011-05-27 00:03:06 (GMT) |
---|---|---|
committer | Nadeem Vawda <nadeem.vawda@gmail.com> | 2011-05-27 00:03:06 (GMT) |
commit | 4e18ac850f4ba24d5de10137b9e04a900fc3e215 (patch) | |
tree | 8ad2e5db8bf20315d0bbd95351ed2c6a518661f6 | |
parent | 98fe1a0c3bacdc51071d960d8d76b3b9f5b0d8c6 (diff) | |
parent | 200e00a90441f0ee40a71306256774feb0beca7b (diff) | |
download | cpython-4e18ac850f4ba24d5de10137b9e04a900fc3e215.zip cpython-4e18ac850f4ba24d5de10137b9e04a900fc3e215.tar.gz cpython-4e18ac850f4ba24d5de10137b9e04a900fc3e215.tar.bz2 |
Merge heads
-rw-r--r-- | Doc/library/bz2.rst | 39 | ||||
-rw-r--r-- | Lib/bz2.py | 47 | ||||
-rw-r--r-- | Lib/test/test_bz2.py | 126 | ||||
-rw-r--r-- | Misc/NEWS | 3 |
4 files changed, 192 insertions, 23 deletions
diff --git a/Doc/library/bz2.rst b/Doc/library/bz2.rst index 2ccdb51..87f2cf3 100644 --- a/Doc/library/bz2.rst +++ b/Doc/library/bz2.rst @@ -37,14 +37,18 @@ All of the classes in this module may safely be accessed from multiple threads. *fileobj*), or operate directly on a named file (named by *filename*). Exactly one of these two parameters should be provided. - The *mode* argument can be either ``'r'`` for reading (default), or ``'w'`` - for writing. + The *mode* argument can be either ``'r'`` for reading (default), ``'w'`` for + overwriting, or ``'a'`` for appending. If *fileobj* is provided, a mode of + ``'w'`` does not truncate the file, and is instead equivalent to ``'a'``. The *buffering* argument is ignored. Its use is deprecated. - If *mode* is ``'w'``, *compresslevel* can be a number between ``1`` and - ``9`` specifying the level of compression: ``1`` produces the least - compression, and ``9`` (default) produces the most compression. + If *mode* is ``'w'`` or ``'a'``, *compresslevel* can be a number between + ``1`` and ``9`` specifying the level of compression: ``1`` produces the + least compression, and ``9`` (default) produces the most compression. + + If *mode* is ``'r'``, the input file may be the concatenation of multiple + compressed streams. :class:`BZ2File` provides all of the members specified by the :class:`io.BufferedIOBase`, except for :meth:`detach` and :meth:`truncate`. @@ -70,6 +74,10 @@ All of the classes in this module may safely be accessed from multiple threads. .. versionchanged:: 3.3 The *fileobj* argument to the constructor was added. + .. versionchanged:: 3.3 + The ``'a'`` (append) mode was added, along with support for reading + multi-stream files. + Incremental (de)compression --------------------------- @@ -106,14 +114,20 @@ Incremental (de)compression incrementally. For one-shot compression, use the :func:`decompress` function instead. + .. note:: + This class does not transparently handle inputs containing multiple + compressed streams, unlike :func:`decompress` and :class:`BZ2File`. If + you need to decompress a multi-stream input with :class:`BZ2Decompressor`, + you must use a new decompressor for each stream. + .. method:: decompress(data) Provide data to the decompressor object. Returns a chunk of decompressed data if possible, or an empty byte string otherwise. - Attempting to decompress data after the end of stream is reached raises - an :exc:`EOFError`. If any data is found after the end of the stream, it - is ignored and saved in the :attr:`unused_data` attribute. + Attempting to decompress data after the end of the current stream is + reached raises an :exc:`EOFError`. If any data is found after the end of + the stream, it is ignored and saved in the :attr:`unused_data` attribute. .. attribute:: eof @@ -127,6 +141,9 @@ Incremental (de)compression Data found after the end of the compressed stream. + If this attribute is accessed before the end of the stream has been + reached, its value will be ``b''``. + One-shot (de)compression ------------------------ @@ -145,5 +162,11 @@ One-shot (de)compression Decompress *data*. + If *data* is the concatenation of multiple compressed streams, decompress + all of the streams. + For incremental decompression, use a :class:`BZ2Decompressor` instead. + .. versionchanged:: 3.3 + Support for multi-stream inputs was added. + @@ -76,6 +76,10 @@ class BZ2File(io.BufferedIOBase): mode = "wb" mode_code = _MODE_WRITE self._compressor = BZ2Compressor() + elif mode in ("a", "ab"): + mode = "ab" + mode_code = _MODE_WRITE + self._compressor = BZ2Compressor() else: raise ValueError("Invalid mode: {!r}".format(mode)) @@ -161,14 +165,25 @@ class BZ2File(io.BufferedIOBase): def _fill_buffer(self): if self._buffer: return True - if self._decompressor.eof: - self._mode = _MODE_READ_EOF - self._size = self._pos - return False - rawblock = self._fp.read(_BUFFER_SIZE) + + if self._decompressor.unused_data: + rawblock = self._decompressor.unused_data + else: + rawblock = self._fp.read(_BUFFER_SIZE) + if not rawblock: - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") + if self._decompressor.eof: + self._mode = _MODE_READ_EOF + self._size = self._pos + return False + else: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + + # Continue to next stream. + if self._decompressor.eof: + self._decompressor = BZ2Decompressor() + self._buffer = self._decompressor.decompress(rawblock) return True @@ -384,9 +399,15 @@ def decompress(data): """ if len(data) == 0: return b"" - decomp = BZ2Decompressor() - result = decomp.decompress(data) - if not decomp.eof: - raise ValueError("Compressed data ended before the " - "end-of-stream marker was reached") - return result + + result = b"" + while True: + decomp = BZ2Decompressor() + result += decomp.decompress(data) + if not decomp.eof: + raise ValueError("Compressed data ended before the " + "end-of-stream marker was reached") + if not decomp.unused_data: + return result + # There is unused data left over. Proceed to next stream. + data = decomp.unused_data diff --git a/Lib/test/test_bz2.py b/Lib/test/test_bz2.py index 3567b36..4d66840 100644 --- a/Lib/test/test_bz2.py +++ b/Lib/test/test_bz2.py @@ -84,9 +84,9 @@ class BZ2FileTest(BaseTest): else: return self.DATA - def createTempFile(self, crlf=False): + def createTempFile(self, crlf=False, streams=1): with open(self.filename, "wb") as f: - f.write(self.getData(crlf)) + f.write(self.getData(crlf) * streams) def testRead(self): # "Test BZ2File.read()" @@ -95,6 +95,26 @@ class BZ2FileTest(BaseTest): self.assertRaises(TypeError, bz2f.read, None) self.assertEqual(bz2f.read(), self.TEXT) + def testReadMultiStream(self): + # "Test BZ2File.read() with a multi stream archive" + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + self.assertRaises(TypeError, bz2f.read, None) + self.assertEqual(bz2f.read(), self.TEXT * 5) + + def testReadMonkeyMultiStream(self): + # "Test BZ2File.read() with a multi stream archive in which stream" + # "end is alined with internal buffer size" + buffer_size = bz2._BUFFER_SIZE + bz2._BUFFER_SIZE = len(self.DATA) + try: + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + self.assertRaises(TypeError, bz2f.read, None) + self.assertEqual(bz2f.read(), self.TEXT * 5) + finally: + bz2._BUFFER_SIZE = buffer_size + def testRead0(self): # "Test BBZ2File.read(0)" self.createTempFile() @@ -114,6 +134,18 @@ class BZ2FileTest(BaseTest): text += str self.assertEqual(text, self.TEXT) + def testReadChunk10MultiStream(self): + # "Test BZ2File.read() in chunks of 10 bytes with a multi stream archive" + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + text = b'' + while 1: + str = bz2f.read(10) + if not str: + break + text += str + self.assertEqual(text, self.TEXT * 5) + def testRead100(self): # "Test BZ2File.read(100)" self.createTempFile() @@ -151,6 +183,15 @@ class BZ2FileTest(BaseTest): for line in sio.readlines(): self.assertEqual(bz2f.readline(), line) + def testReadLineMultiStream(self): + # "Test BZ2File.readline() with a multi stream archive" + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + self.assertRaises(TypeError, bz2f.readline, None) + sio = BytesIO(self.TEXT * 5) + for line in sio.readlines(): + self.assertEqual(bz2f.readline(), line) + def testReadLines(self): # "Test BZ2File.readlines()" self.createTempFile() @@ -159,6 +200,14 @@ class BZ2FileTest(BaseTest): sio = BytesIO(self.TEXT) self.assertEqual(bz2f.readlines(), sio.readlines()) + def testReadLinesMultiStream(self): + # "Test BZ2File.readlines() with a multi stream archive" + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + self.assertRaises(TypeError, bz2f.readlines, None) + sio = BytesIO(self.TEXT * 5) + self.assertEqual(bz2f.readlines(), sio.readlines()) + def testIterator(self): # "Test iter(BZ2File)" self.createTempFile() @@ -166,6 +215,13 @@ class BZ2FileTest(BaseTest): sio = BytesIO(self.TEXT) self.assertEqual(list(iter(bz2f)), sio.readlines()) + def testIteratorMultiStream(self): + # "Test iter(BZ2File) with a multi stream archive" + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + sio = BytesIO(self.TEXT * 5) + self.assertEqual(list(iter(bz2f)), sio.readlines()) + def testClosedIteratorDeadlock(self): # "Test that iteration on a closed bz2file releases the lock." # http://bugs.python.org/issue3309 @@ -217,6 +273,17 @@ class BZ2FileTest(BaseTest): self.assertRaises(IOError, bz2f.write, b"a") self.assertRaises(IOError, bz2f.writelines, [b"a"]) + def testAppend(self): + # "Test BZ2File.write()" + with BZ2File(self.filename, "w") as bz2f: + self.assertRaises(TypeError, bz2f.write) + bz2f.write(self.TEXT) + with BZ2File(self.filename, "a") as bz2f: + self.assertRaises(TypeError, bz2f.write) + bz2f.write(self.TEXT) + with open(self.filename, 'rb') as f: + self.assertEqual(self.decompress(f.read()), self.TEXT * 2) + def testSeekForward(self): # "Test BZ2File.seek(150, 0)" self.createTempFile() @@ -225,6 +292,14 @@ class BZ2FileTest(BaseTest): bz2f.seek(150) self.assertEqual(bz2f.read(), self.TEXT[150:]) + def testSeekForwardMultiStream(self): + # "Test BZ2File.seek(150, 0) across stream boundaries" + self.createTempFile(streams=2) + with BZ2File(self.filename) as bz2f: + self.assertRaises(TypeError, bz2f.seek) + bz2f.seek(len(self.TEXT) + 150) + self.assertEqual(bz2f.read(), self.TEXT[150:]) + def testSeekBackwards(self): # "Test BZ2File.seek(-150, 1)" self.createTempFile() @@ -233,6 +308,16 @@ class BZ2FileTest(BaseTest): bz2f.seek(-150, 1) self.assertEqual(bz2f.read(), self.TEXT[500-150:]) + def testSeekBackwardsMultiStream(self): + # "Test BZ2File.seek(-150, 1) across stream boundaries" + self.createTempFile(streams=2) + with BZ2File(self.filename) as bz2f: + readto = len(self.TEXT) + 100 + while readto > 0: + readto -= len(bz2f.read(readto)) + bz2f.seek(-150, 1) + self.assertEqual(bz2f.read(), self.TEXT[100-150:] + self.TEXT) + def testSeekBackwardsFromEnd(self): # "Test BZ2File.seek(-150, 2)" self.createTempFile() @@ -240,6 +325,13 @@ class BZ2FileTest(BaseTest): bz2f.seek(-150, 2) self.assertEqual(bz2f.read(), self.TEXT[len(self.TEXT)-150:]) + def testSeekBackwardsFromEndMultiStream(self): + # "Test BZ2File.seek(-1000, 2) across stream boundaries" + self.createTempFile(streams=2) + with BZ2File(self.filename) as bz2f: + bz2f.seek(-1000, 2) + self.assertEqual(bz2f.read(), (self.TEXT * 2)[-1000:]) + def testSeekPostEnd(self): # "Test BZ2File.seek(150000)" self.createTempFile() @@ -248,6 +340,14 @@ class BZ2FileTest(BaseTest): self.assertEqual(bz2f.tell(), len(self.TEXT)) self.assertEqual(bz2f.read(), b"") + def testSeekPostEndMultiStream(self): + # "Test BZ2File.seek(150000)" + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + bz2f.seek(150000) + self.assertEqual(bz2f.tell(), len(self.TEXT) * 5) + self.assertEqual(bz2f.read(), b"") + def testSeekPostEndTwice(self): # "Test BZ2File.seek(150000) twice" self.createTempFile() @@ -257,6 +357,15 @@ class BZ2FileTest(BaseTest): self.assertEqual(bz2f.tell(), len(self.TEXT)) self.assertEqual(bz2f.read(), b"") + def testSeekPostEndTwiceMultiStream(self): + # "Test BZ2File.seek(150000) twice with a multi stream archive" + self.createTempFile(streams=5) + with BZ2File(self.filename) as bz2f: + bz2f.seek(150000) + bz2f.seek(150000) + self.assertEqual(bz2f.tell(), len(self.TEXT) * 5) + self.assertEqual(bz2f.read(), b"") + def testSeekPreStart(self): # "Test BZ2File.seek(-150, 0)" self.createTempFile() @@ -265,6 +374,14 @@ class BZ2FileTest(BaseTest): self.assertEqual(bz2f.tell(), 0) self.assertEqual(bz2f.read(), self.TEXT) + def testSeekPreStartMultiStream(self): + # "Test BZ2File.seek(-150, 0) with a multi stream archive" + self.createTempFile(streams=2) + with BZ2File(self.filename) as bz2f: + bz2f.seek(-150) + self.assertEqual(bz2f.tell(), 0) + self.assertEqual(bz2f.read(), self.TEXT * 2) + def testFileno(self): # "Test BZ2File.fileno()" self.createTempFile() @@ -510,6 +627,11 @@ class FuncTest(BaseTest): # "Test decompress() function with incomplete data" self.assertRaises(ValueError, bz2.decompress, self.DATA[:-10]) + def testDecompressMultiStream(self): + # "Test decompress() function for data with multiple streams" + text = bz2.decompress(self.DATA * 5) + self.assertEqual(text, self.TEXT * 5) + def test_main(): support.run_unittest( BZ2FileTest, @@ -161,6 +161,9 @@ Core and Builtins Library ------- +- Issue #1625: BZ2File and bz2.decompress() now support multi-stream files. + Initial patch by Nir Aides. + - Issue #8796: codecs.open() calls the builtin open() function instead of using StreamReaderWriter. Deprecate StreamReader, StreamWriter, StreamReaderWriter, StreamRecoder and EncodedFile() of the codec module. Use the builtin open() |