diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2016-03-08 16:28:36 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2016-03-08 16:28:36 (GMT) |
commit | cc2dbc5844929da1f89e3f548a8d1312b4f0ba0e (patch) | |
tree | bf1373b8718d55fb2ec54f190d0b390ab54947bf /Lib | |
parent | 55e3218eee49260a68de9c5a85549e1c719038ca (diff) | |
download | cpython-cc2dbc5844929da1f89e3f548a8d1312b4f0ba0e.zip cpython-cc2dbc5844929da1f89e3f548a8d1312b4f0ba0e.tar.gz cpython-cc2dbc5844929da1f89e3f548a8d1312b4f0ba0e.tar.bz2 |
Issue #15068: Got rid of excessive buffering in the fileinput module.
The bufsize parameter is no longer used.
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/fileinput.py | 161 | ||||
-rw-r--r-- | Lib/test/test_fileinput.py | 70 |
2 files changed, 142 insertions, 89 deletions
diff --git a/Lib/fileinput.py b/Lib/fileinput.py index 021e39f..9f14762 100644 --- a/Lib/fileinput.py +++ b/Lib/fileinput.py @@ -64,13 +64,6 @@ deleted when the output file is closed. In-place filtering is disabled when standard input is read. XXX The current implementation does not work for MS-DOS 8+3 filesystems. -Performance: this module is unfortunately one of the slower ways of -processing large numbers of input lines. Nevertheless, a significant -speed-up has been obtained by using readlines(bufsize) instead of -readline(). A new keyword argument, bufsize=N, is present on the -input() function and the FileInput() class to override the default -buffer size. - XXX Possible additions: - optional getopt argument processing @@ -86,6 +79,7 @@ __all__ = ["input", "close", "nextfile", "filename", "lineno", "filelineno", _state = None +# No longer used DEFAULT_BUFSIZE = 8*1024 def input(files=None, inplace=False, backup="", bufsize=0, @@ -207,17 +201,15 @@ class FileInput: self._files = files self._inplace = inplace self._backup = backup - self._bufsize = bufsize or DEFAULT_BUFSIZE self._savestdout = None self._output = None self._filename = None - self._lineno = 0 + self._startlineno = 0 self._filelineno = 0 self._file = None + self._readline = self._start_readline self._isstdin = False self._backupfilename = None - self._buffer = [] - self._bufindex = 0 # restrict mode argument to reading modes if mode not in ('r', 'rU', 'U', 'rb'): raise ValueError("FileInput opening mode must be one of " @@ -253,22 +245,18 @@ class FileInput: return self def __next__(self): - try: - line = self._buffer[self._bufindex] - except IndexError: - pass - else: - self._bufindex += 1 - self._lineno += 1 + line = self._readline() + if line: self._filelineno += 1 return line - line = self.readline() - if not line: + if not self._file: raise StopIteration - return line + self.nextfile() + # Recursive call + return self.__next__() def __getitem__(self, i): - if i != self._lineno: + if i != self.lineno(): raise RuntimeError("accessing lines out of order") try: return self.__next__() @@ -289,6 +277,7 @@ class FileInput: finally: file = self._file self._file = None + self._readline = self._start_readline try: if file and not self._isstdin: file.close() @@ -300,85 +289,81 @@ class FileInput: except OSError: pass self._isstdin = False - self._buffer = [] - self._bufindex = 0 def readline(self): - try: - line = self._buffer[self._bufindex] - except IndexError: - pass + while True: + line = self._readline() + if line: + self._filelineno += 1 + return line + if not self._file: + return line + self.nextfile() + # repeat with next file + + def _start_readline(self): + if not self._files: + if 'b' in self._mode: + return b'' + else: + return '' + self._filename = self._files[0] + self._files = self._files[1:] + self._startlineno = self.lineno() + self._filelineno = 0 + self._file = None + self._isstdin = False + self._backupfilename = 0 + if self._filename == '-': + self._filename = '<stdin>' + if 'b' in self._mode: + self._file = getattr(sys.stdin, 'buffer', sys.stdin) + else: + self._file = sys.stdin + self._isstdin = True else: - self._bufindex += 1 - self._lineno += 1 - self._filelineno += 1 - return line - if not self._file: - if not self._files: - if 'b' in self._mode: - return b'' + if self._inplace: + self._backupfilename = ( + self._filename + (self._backup or ".bak")) + try: + os.unlink(self._backupfilename) + except OSError: + pass + # The next few lines may raise OSError + os.rename(self._filename, self._backupfilename) + self._file = open(self._backupfilename, self._mode) + try: + perm = os.fstat(self._file.fileno()).st_mode + except OSError: + self._output = open(self._filename, "w") else: - return '' - self._filename = self._files[0] - self._files = self._files[1:] - self._filelineno = 0 - self._file = None - self._isstdin = False - self._backupfilename = 0 - if self._filename == '-': - self._filename = '<stdin>' - if 'b' in self._mode: - self._file = getattr(sys.stdin, 'buffer', sys.stdin) - else: - self._file = sys.stdin - self._isstdin = True - else: - if self._inplace: - self._backupfilename = ( - self._filename + (self._backup or ".bak")) + mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC + if hasattr(os, 'O_BINARY'): + mode |= os.O_BINARY + + fd = os.open(self._filename, mode, perm) + self._output = os.fdopen(fd, "w") try: - os.unlink(self._backupfilename) + if hasattr(os, 'chmod'): + os.chmod(self._filename, perm) except OSError: pass - # The next few lines may raise OSError - os.rename(self._filename, self._backupfilename) - self._file = open(self._backupfilename, self._mode) - try: - perm = os.fstat(self._file.fileno()).st_mode - except OSError: - self._output = open(self._filename, "w") - else: - mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC - if hasattr(os, 'O_BINARY'): - mode |= os.O_BINARY - - fd = os.open(self._filename, mode, perm) - self._output = os.fdopen(fd, "w") - try: - if hasattr(os, 'chmod'): - os.chmod(self._filename, perm) - except OSError: - pass - self._savestdout = sys.stdout - sys.stdout = self._output + self._savestdout = sys.stdout + sys.stdout = self._output + else: + # This may raise OSError + if self._openhook: + self._file = self._openhook(self._filename, self._mode) else: - # This may raise OSError - if self._openhook: - self._file = self._openhook(self._filename, self._mode) - else: - self._file = open(self._filename, self._mode) - self._buffer = self._file.readlines(self._bufsize) - self._bufindex = 0 - if not self._buffer: - self.nextfile() - # Recursive call - return self.readline() + self._file = open(self._filename, self._mode) + self._readline = self._file.readline + return self._readline() def filename(self): return self._filename def lineno(self): - return self._lineno + return self._startlineno + self._filelineno def filelineno(self): return self._filelineno diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py index 91c1166..784bc92 100644 --- a/Lib/test/test_fileinput.py +++ b/Lib/test/test_fileinput.py @@ -46,6 +46,42 @@ def remove_tempfiles(*names): if name: safe_unlink(name) +class LineReader: + + def __init__(self): + self._linesread = [] + + @property + def linesread(self): + try: + return self._linesread[:] + finally: + self._linesread = [] + + def openhook(self, filename, mode): + self.it = iter(filename.splitlines(True)) + return self + + def readline(self, size=None): + line = next(self.it, '') + self._linesread.append(line) + return line + + def readlines(self, hint=-1): + lines = [] + size = 0 + while True: + line = self.readline() + if not line: + return lines + lines.append(line) + size += len(line) + if size >= hint: + return lines + + def close(self): + pass + class BufferSizesTests(unittest.TestCase): def test_buffer_sizes(self): # First, run the tests with default and teeny buffer size. @@ -289,7 +325,7 @@ class FileInputTests(unittest.TestCase): self.addCleanup(safe_unlink, TESTFN) with FileInput(files=TESTFN, - openhook=hook_encoded('ascii'), bufsize=8) as fi: + openhook=hook_encoded('ascii')) as fi: try: self.assertEqual(fi.readline(), 'A\n') self.assertEqual(fi.readline(), 'B\n') @@ -457,6 +493,38 @@ class FileInputTests(unittest.TestCase): self.assertEqual(result, -1, "fileno() should return -1") + def test_readline_buffering(self): + src = LineReader() + with FileInput(files=['line1\nline2', 'line3\n'], + openhook=src.openhook) as fi: + self.assertEqual(src.linesread, []) + self.assertEqual(fi.readline(), 'line1\n') + self.assertEqual(src.linesread, ['line1\n']) + self.assertEqual(fi.readline(), 'line2') + self.assertEqual(src.linesread, ['line2']) + self.assertEqual(fi.readline(), 'line3\n') + self.assertEqual(src.linesread, ['', 'line3\n']) + self.assertEqual(fi.readline(), '') + self.assertEqual(src.linesread, ['']) + self.assertEqual(fi.readline(), '') + self.assertEqual(src.linesread, []) + + def test_iteration_buffering(self): + src = LineReader() + with FileInput(files=['line1\nline2', 'line3\n'], + openhook=src.openhook) as fi: + self.assertEqual(src.linesread, []) + self.assertEqual(next(fi), 'line1\n') + self.assertEqual(src.linesread, ['line1\n']) + self.assertEqual(next(fi), 'line2') + self.assertEqual(src.linesread, ['line2']) + self.assertEqual(next(fi), 'line3\n') + self.assertEqual(src.linesread, ['', 'line3\n']) + self.assertRaises(StopIteration, next, fi) + self.assertEqual(src.linesread, ['']) + self.assertRaises(StopIteration, next, fi) + self.assertEqual(src.linesread, []) + class MockFileInput: """A class that mocks out fileinput.FileInput for use during unit tests""" |