diff options
Diffstat (limited to 'Lib/bz2.py')
-rw-r--r-- | Lib/bz2.py | 504 |
1 files changed, 504 insertions, 0 deletions
diff --git a/Lib/bz2.py b/Lib/bz2.py new file mode 100644 index 0000000..c307507 --- /dev/null +++ b/Lib/bz2.py @@ -0,0 +1,504 @@ +"""Interface to the libbzip2 compression library. + +This module provides a file interface, classes for incremental +(de)compression, and functions for one-shot (de)compression. +""" + +__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", + "open", "compress", "decompress"] + +__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>" + +import builtins +import io +import warnings + +try: + from threading import RLock +except ImportError: + from dummy_threading import RLock + +from _bz2 import BZ2Compressor, BZ2Decompressor + + +_MODE_CLOSED = 0 +_MODE_READ = 1 +_MODE_READ_EOF = 2 +_MODE_WRITE = 3 + +_BUFFER_SIZE = 8192 + + +class BZ2File(io.BufferedIOBase): + + """A file object providing transparent bzip2 (de)compression. + + A BZ2File can act as a wrapper for an existing file object, or refer + directly to a named file on disk. + + Note that BZ2File provides a *binary* file interface - data read is + returned as bytes, and data to be written should be given as bytes. + """ + + def __init__(self, filename, mode="r", buffering=None, compresslevel=9): + """Open a bzip2-compressed file. + + If filename is a str or bytes object, is gives the name of the file to + be opened. Otherwise, it should be a file object, which will be used to + read or write the compressed data. + + mode can be 'r' for reading (default), 'w' for (over)writing, or 'a' for + appending. These can equivalently be given as 'rb', 'wb', and 'ab'. + + buffering is ignored. Its use is deprecated. + + If mode is 'w' or 'a', compresslevel can be a number between 1 + and 9 specifying the level of compression: 1 produces the least + compression, and 9 (default) produces the most compression. + + If mode is 'r', the input file may be the concatenation of + multiple compressed streams. + """ + # This lock must be recursive, so that BufferedIOBase's + # readline(), readlines() and writelines() don't deadlock. + self._lock = RLock() + self._fp = None + self._closefp = False + self._mode = _MODE_CLOSED + self._pos = 0 + self._size = -1 + + if buffering is not None: + warnings.warn("Use of 'buffering' argument is deprecated", + DeprecationWarning) + + if not (1 <= compresslevel <= 9): + raise ValueError("compresslevel must be between 1 and 9") + + if mode in ("", "r", "rb"): + mode = "rb" + mode_code = _MODE_READ + self._decompressor = BZ2Decompressor() + self._buffer = b"" + self._buffer_offset = 0 + elif mode in ("w", "wb"): + mode = "wb" + mode_code = _MODE_WRITE + self._compressor = BZ2Compressor(compresslevel) + elif mode in ("a", "ab"): + mode = "ab" + mode_code = _MODE_WRITE + self._compressor = BZ2Compressor(compresslevel) + else: + raise ValueError("Invalid mode: {!r}".format(mode)) + + if isinstance(filename, (str, bytes)): + self._fp = builtins.open(filename, mode) + self._closefp = True + self._mode = mode_code + elif hasattr(filename, "read") or hasattr(filename, "write"): + self._fp = filename + self._mode = mode_code + else: + raise TypeError("filename must be a str or bytes object, or a file") + + def close(self): + """Flush and close the file. + + May be called more than once without error. Once the file is + closed, any other operation on it will raise a ValueError. + """ + with self._lock: + if self._mode == _MODE_CLOSED: + return + try: + if self._mode in (_MODE_READ, _MODE_READ_EOF): + self._decompressor = None + elif self._mode == _MODE_WRITE: + self._fp.write(self._compressor.flush()) + self._compressor = None + finally: + try: + if self._closefp: + self._fp.close() + finally: + self._fp = None + self._closefp = False + self._mode = _MODE_CLOSED + self._buffer = b"" + self._buffer_offset = 0 + + @property + def closed(self): + """True if this file is closed.""" + return self._mode == _MODE_CLOSED + + def fileno(self): + """Return the file descriptor for the underlying file.""" + self._check_not_closed() + return self._fp.fileno() + + def seekable(self): + """Return whether the file supports seeking.""" + return self.readable() and self._fp.seekable() + + def readable(self): + """Return whether the file was opened for reading.""" + self._check_not_closed() + return self._mode in (_MODE_READ, _MODE_READ_EOF) + + def writable(self): + """Return whether the file was opened for writing.""" + self._check_not_closed() + return self._mode == _MODE_WRITE + + # Mode-checking helper functions. + + def _check_not_closed(self): + if self.closed: + raise ValueError("I/O operation on closed file") + + def _check_can_read(self): + if self._mode not in (_MODE_READ, _MODE_READ_EOF): + self._check_not_closed() + raise io.UnsupportedOperation("File not open for reading") + + def _check_can_write(self): + if self._mode != _MODE_WRITE: + self._check_not_closed() + raise io.UnsupportedOperation("File not open for writing") + + def _check_can_seek(self): + if self._mode not in (_MODE_READ, _MODE_READ_EOF): + self._check_not_closed() + raise io.UnsupportedOperation("Seeking is only supported " + "on files open for reading") + if not self._fp.seekable(): + raise io.UnsupportedOperation("The underlying file object " + "does not support seeking") + + # Fill the readahead buffer if it is empty. Returns False on EOF. + def _fill_buffer(self): + if self._mode == _MODE_READ_EOF: + return False + # Depending on the input data, our call to the decompressor may not + # return any data. In this case, try again after reading another block. + while self._buffer_offset == len(self._buffer): + rawblock = (self._decompressor.unused_data or + self._fp.read(_BUFFER_SIZE)) + + if not rawblock: + if self._decompressor.eof: + self._mode = _MODE_READ_EOF + self._size = self._pos + return False + else: + raise EOFError("Compressed file ended before the " + "end-of-stream marker was reached") + + # Continue to next stream. + if self._decompressor.eof: + self._decompressor = BZ2Decompressor() + + self._buffer = self._decompressor.decompress(rawblock) + self._buffer_offset = 0 + return True + + # Read data until EOF. + # If return_data is false, consume the data without returning it. + def _read_all(self, return_data=True): + # The loop assumes that _buffer_offset is 0. Ensure that this is true. + self._buffer = self._buffer[self._buffer_offset:] + self._buffer_offset = 0 + + blocks = [] + while self._fill_buffer(): + if return_data: + blocks.append(self._buffer) + self._pos += len(self._buffer) + self._buffer = b"" + if return_data: + return b"".join(blocks) + + # Read a block of up to n bytes. + # If return_data is false, consume the data without returning it. + def _read_block(self, n, return_data=True): + # If we have enough data buffered, return immediately. + end = self._buffer_offset + n + if end <= len(self._buffer): + data = self._buffer[self._buffer_offset : end] + self._buffer_offset = end + self._pos += len(data) + return data if return_data else None + + # The loop assumes that _buffer_offset is 0. Ensure that this is true. + self._buffer = self._buffer[self._buffer_offset:] + self._buffer_offset = 0 + + blocks = [] + while n > 0 and self._fill_buffer(): + if n < len(self._buffer): + data = self._buffer[:n] + self._buffer_offset = n + else: + data = self._buffer + self._buffer = b"" + if return_data: + blocks.append(data) + self._pos += len(data) + n -= len(data) + if return_data: + return b"".join(blocks) + + def peek(self, n=0): + """Return buffered data without advancing the file position. + + Always returns at least one byte of data, unless at EOF. + The exact number of bytes returned is unspecified. + """ + with self._lock: + self._check_can_read() + if not self._fill_buffer(): + return b"" + return self._buffer[self._buffer_offset:] + + def read(self, size=-1): + """Read up to size uncompressed bytes from the file. + + If size is negative or omitted, read until EOF is reached. + Returns b'' if the file is already at EOF. + """ + with self._lock: + self._check_can_read() + if size == 0: + return b"" + elif size < 0: + return self._read_all() + else: + return self._read_block(size) + + def read1(self, size=-1): + """Read up to size uncompressed bytes, while trying to avoid + making multiple reads from the underlying stream. + + Returns b'' if the file is at EOF. + """ + # Usually, read1() calls _fp.read() at most once. However, sometimes + # this does not give enough data for the decompressor to make progress. + # In this case we make multiple reads, to avoid returning b"". + with self._lock: + self._check_can_read() + if (size == 0 or + # Only call _fill_buffer() if the buffer is actually empty. + # This gives a significant speedup if *size* is small. + (self._buffer_offset == len(self._buffer) and not self._fill_buffer())): + return b"" + if size > 0: + data = self._buffer[self._buffer_offset : + self._buffer_offset + size] + self._buffer_offset += len(data) + else: + data = self._buffer[self._buffer_offset:] + self._buffer = b"" + self._buffer_offset = 0 + self._pos += len(data) + return data + + def readinto(self, b): + """Read up to len(b) bytes into b. + + Returns the number of bytes read (0 for EOF). + """ + with self._lock: + return io.BufferedIOBase.readinto(self, b) + + def readline(self, size=-1): + """Read a line of uncompressed bytes from the file. + + The terminating newline (if present) is retained. If size is + non-negative, no more than size bytes will be read (in which + case the line may be incomplete). Returns b'' if already at EOF. + """ + if not isinstance(size, int): + if not hasattr(size, "__index__"): + raise TypeError("Integer argument expected") + size = size.__index__() + with self._lock: + self._check_can_read() + # Shortcut for the common case - the whole line is in the buffer. + if size < 0: + end = self._buffer.find(b"\n", self._buffer_offset) + 1 + if end > 0: + line = self._buffer[self._buffer_offset : end] + self._buffer_offset = end + self._pos += len(line) + return line + return io.BufferedIOBase.readline(self, size) + + def readlines(self, size=-1): + """Read a list of lines of uncompressed bytes from the file. + + size can be specified to control the number of lines read: no + further lines will be read once the total size of the lines read + so far equals or exceeds size. + """ + if not isinstance(size, int): + if not hasattr(size, "__index__"): + raise TypeError("Integer argument expected") + size = size.__index__() + with self._lock: + return io.BufferedIOBase.readlines(self, size) + + def write(self, data): + """Write a byte string to the file. + + Returns the number of uncompressed bytes written, which is + always len(data). Note that due to buffering, the file on disk + may not reflect the data written until close() is called. + """ + with self._lock: + self._check_can_write() + compressed = self._compressor.compress(data) + self._fp.write(compressed) + self._pos += len(data) + return len(data) + + def writelines(self, seq): + """Write a sequence of byte strings to the file. + + Returns the number of uncompressed bytes written. + seq can be any iterable yielding byte strings. + + Line separators are not added between the written byte strings. + """ + with self._lock: + return io.BufferedIOBase.writelines(self, seq) + + # Rewind the file to the beginning of the data stream. + def _rewind(self): + self._fp.seek(0, 0) + self._mode = _MODE_READ + self._pos = 0 + self._decompressor = BZ2Decompressor() + self._buffer = b"" + self._buffer_offset = 0 + + def seek(self, offset, whence=0): + """Change the file position. + + The new position is specified by offset, relative to the + position indicated by whence. Values for whence are: + + 0: start of stream (default); offset must not be negative + 1: current stream position + 2: end of stream; offset must not be positive + + Returns the new file position. + + Note that seeking is emulated, so depending on the parameters, + this operation may be extremely slow. + """ + with self._lock: + self._check_can_seek() + + # Recalculate offset as an absolute file position. + if whence == 0: + pass + elif whence == 1: + offset = self._pos + offset + elif whence == 2: + # Seeking relative to EOF - we need to know the file's size. + if self._size < 0: + self._read_all(return_data=False) + offset = self._size + offset + else: + raise ValueError("Invalid value for whence: {}".format(whence)) + + # Make it so that offset is the number of bytes to skip forward. + if offset < self._pos: + self._rewind() + else: + offset -= self._pos + + # Read and discard data until we reach the desired position. + self._read_block(offset, return_data=False) + + return self._pos + + def tell(self): + """Return the current file position.""" + with self._lock: + self._check_not_closed() + return self._pos + + +def open(filename, mode="rb", compresslevel=9, + encoding=None, errors=None, newline=None): + """Open a bzip2-compressed file in binary or text mode. + + The filename argument can be an actual filename (a str or bytes object), or + an existing file object to read from or write to. + + The mode argument can be "r", "rb", "w", "wb", "a" or "ab" for binary mode, + or "rt", "wt" or "at" for text mode. The default mode is "rb", and the + default compresslevel is 9. + + For binary mode, this function is equivalent to the BZ2File constructor: + BZ2File(filename, mode, compresslevel). In this case, the encoding, errors + and newline arguments must not be provided. + + For text mode, a BZ2File object is created, and wrapped in an + io.TextIOWrapper instance with the specified encoding, error handling + behavior, and line ending(s). + + """ + if "t" in mode: + if "b" in mode: + raise ValueError("Invalid mode: %r" % (mode,)) + else: + if encoding is not None: + raise ValueError("Argument 'encoding' not supported in binary mode") + if errors is not None: + raise ValueError("Argument 'errors' not supported in binary mode") + if newline is not None: + raise ValueError("Argument 'newline' not supported in binary mode") + + bz_mode = mode.replace("t", "") + binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel) + + if "t" in mode: + return io.TextIOWrapper(binary_file, encoding, errors, newline) + else: + return binary_file + + +def compress(data, compresslevel=9): + """Compress a block of data. + + compresslevel, if given, must be a number between 1 and 9. + + For incremental compression, use a BZ2Compressor object instead. + """ + comp = BZ2Compressor(compresslevel) + return comp.compress(data) + comp.flush() + + +def decompress(data): + """Decompress a block of data. + + For incremental decompression, use a BZ2Decompressor object instead. + """ + if len(data) == 0: + return b"" + + results = [] + while True: + decomp = BZ2Decompressor() + results.append(decomp.decompress(data)) + if not decomp.eof: + raise ValueError("Compressed data ended before the " + "end-of-stream marker was reached") + if not decomp.unused_data: + return b"".join(results) + # There is unused data left over. Proceed to next stream. + data = decomp.unused_data |