diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2005-08-24 07:38:12 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2005-08-24 07:38:12 (GMT) |
commit | 56066d2e554b6b92375d3e276f2f02663526c087 (patch) | |
tree | 877e93e7f59a7f09a713c029d8b2227f61b8521d /Lib/codecs.py | |
parent | 6d2b346140ed0f3bc4c67fc33bf69a60c496e6a4 (diff) | |
download | cpython-56066d2e554b6b92375d3e276f2f02663526c087.zip cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.gz cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.bz2 |
Return complete lines from codec stream readers
even if there is an exception in later lines, resulting in
correct line numbers for decoding errors in source code. Fixes #1178484.
Will backport to 2.4.
Diffstat (limited to 'Lib/codecs.py')
-rw-r--r-- | Lib/codecs.py | 20 |
1 files changed, 17 insertions, 3 deletions
```diff
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 0ffa382..a964f99 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -236,7 +236,7 @@ class StreamReader(Codec):
     def decode(self, input, errors='strict'):
         raise NotImplementedError
 
-    def read(self, size=-1, chars=-1):
+    def read(self, size=-1, chars=-1, firstline=False):
         """ Decodes data from the stream self.stream and returns the
             resulting object.
 
@@ -253,6 +253,11 @@ class StreamReader(Codec):
             is intended to prevent having to decode huge files in one
             step.
 
+            If firstline is true, and a UnicodeDecodeError happens
+            after the first line terminator in the input only the first line
+            will be returned, the rest of the input will be kept until the
+            next call to read().
+
             The method should use a greedy read strategy meaning that
             it should read as much data as is allowed within the
             definition of the encoding and the given size, e.g. if
@@ -275,7 +280,16 @@ class StreamReader(Codec):
                 newdata = self.stream.read(size)
             # decode bytes (those remaining from the last call included)
             data = self.bytebuffer + newdata
-            newchars, decodedbytes = self.decode(data, self.errors)
+            try:
+                newchars, decodedbytes = self.decode(data, self.errors)
+            except UnicodeDecodeError, exc:
+                if firstline:
+                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
+                    lines = newchars.splitlines(True)
+                    if len(lines)<=1:
+                        raise
+                else:
+                    raise
             # keep undecoded bytes until the next call
             self.bytebuffer = data[decodedbytes:]
             # put new characters in the character buffer
@@ -306,7 +320,7 @@ class StreamReader(Codec):
         line = ""
         # If size is given, we call read() only once
         while True:
-            data = self.read(readsize)
+            data = self.read(readsize, firstline=True)
             if data:
                 # If we're at a "\r" read one extra character (which might
                 # be a "\n") to get a proper line ending. If the stream is
```