Return complete lines from codec stream readers

even if there is an exception in later lines, resulting in correct line numbers for decoding errors in source code. Fixes #1178484. Will backport to 2.4.
author: Martin v. Löwis <martin@v.loewis.de> 2005-08-24 07:38:12 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2005-08-24 07:38:12 (GMT)
commit: 56066d2e554b6b92375d3e276f2f02663526c087 (patch)
tree: 877e93e7f59a7f09a713c029d8b2227f61b8521d /Lib/codecs.py
parent: 6d2b346140ed0f3bc4c67fc33bf69a60c496e6a4 (diff)
download: cpython-56066d2e554b6b92375d3e276f2f02663526c087.zip
cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.gz
cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.bz2
1 file changed, 17 insertions, 3 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 0ffa382..a964f99 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -236,7 +236,7 @@ class StreamReader(Codec):
     def decode(self, input, errors='strict'):
         raise NotImplementedError
 
-    def read(self, size=-1, chars=-1):
+    def read(self, size=-1, chars=-1, firstline=False):
 
         """ Decodes data from the stream self.stream and returns the
             resulting object.
@@ -253,6 +253,11 @@ class StreamReader(Codec):
             is intended to prevent having to decode huge files in one
             step.
 
+            If firstline is true, and a UnicodeDecodeError happens
+            after the first line terminator in the input only the first line
+            will be returned, the rest of the input will be kept until the
+            next call to read().
+
             The method should use a greedy read strategy meaning that
             it should read as much data as is allowed within the
             definition of the encoding and the given size, e.g.  if
@@ -275,7 +280,16 @@ class StreamReader(Codec):
                 newdata = self.stream.read(size)
             # decode bytes (those remaining from the last call included)
             data = self.bytebuffer + newdata
-            newchars, decodedbytes = self.decode(data, self.errors)
+            try:
+                newchars, decodedbytes = self.decode(data, self.errors)
+            except UnicodeDecodeError, exc:
+                if firstline:
+                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
+                    lines = newchars.splitlines(True)
+                    if len(lines)<=1:
+                        raise
+                else:
+                    raise
             # keep undecoded bytes until the next call
             self.bytebuffer = data[decodedbytes:]
             # put new characters in the character buffer
@@ -306,7 +320,7 @@ class StreamReader(Codec):
         line = ""
         # If size is given, we call read() only once
         while True:
-            data = self.read(readsize)
+            data = self.read(readsize, firstline=True)
             if data:
                 # If we're at a "\r" read one extra character (which might
                 # be a "\n") to get a proper line ending. If the stream is
author	Martin v. Löwis <martin@v.loewis.de>	2005-08-24 07:38:12 (GMT)
committer	Martin v. Löwis <martin@v.loewis.de>	2005-08-24 07:38:12 (GMT)
commit	56066d2e554b6b92375d3e276f2f02663526c087 (patch)
tree	877e93e7f59a7f09a713c029d8b2227f61b8521d /Lib/codecs.py
parent	6d2b346140ed0f3bc4c67fc33bf69a60c496e6a4 (diff)
download	cpython-56066d2e554b6b92375d3e276f2f02663526c087.zip cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.gz cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.bz2