summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de> 2005-08-24 07:38:12 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2005-08-24 07:38:12 (GMT)
commit: 56066d2e554b6b92375d3e276f2f02663526c087 (patch)
tree: 877e93e7f59a7f09a713c029d8b2227f61b8521d
parent: 6d2b346140ed0f3bc4c67fc33bf69a60c496e6a4 (diff)
downloadcpython-56066d2e554b6b92375d3e276f2f02663526c087.zip
cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.gz
cpython-56066d2e554b6b92375d3e276f2f02663526c087.tar.bz2
Return complete lines from codec stream readers
even if there is an exception in later lines, resulting in correct line numbers for decoding errors in source code. Fixes #1178484. Will backport to 2.4.
-rw-r--r-- Doc/lib/libcodecs.tex 6
-rw-r--r-- Lib/codecs.py 20
-rw-r--r-- Misc/NEWS 4
3 files changed, 26 insertions, 4 deletions
diff --git a/Doc/lib/libcodecs.tex b/Doc/lib/libcodecs.tex
index d98f474..7e22386 100644
--- a/Doc/lib/libcodecs.tex
+++ b/Doc/lib/libcodecs.tex
@@ -394,7 +394,7 @@ order to be compatible to the Python codec registry.
be extended with \function{register_error()}.
\end{classdesc}
-\begin{methoddesc}{read}{\optional{size\optional{, chars}}}
+\begin{methoddesc}{read}{\optional{size\optional{, chars\optional{, firstline}}}}
Decodes data from the stream and returns the resulting object.
\var{chars} indicates the number of characters to read from the
@@ -408,12 +408,16 @@ order to be compatible to the Python codec registry.
decode as much as possible. \var{size} is intended to prevent having
to decode huge files in one step.
+ \var{firstline} indicates that it is sufficient to return only
+ the first line if there are decoding errors on later lines.
+
The method should use a greedy read strategy meaning that it should
read as much data as is allowed within the definition of the encoding
and the given size, e.g. if optional encoding endings or state
markers are available on the stream, these should be read too.
\versionchanged[\var{chars} argument added]{2.4}
+ \versionchanged[\var{firstline} argument added]{2.4.2}
\end{methoddesc}
\begin{methoddesc}{readline}{\optional{size\optional{, keepends}}}
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 0ffa382..a964f99 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -236,7 +236,7 @@ class StreamReader(Codec):
def decode(self, input, errors='strict'):
raise NotImplementedError
- def read(self, size=-1, chars=-1):
+ def read(self, size=-1, chars=-1, firstline=False):
""" Decodes data from the stream self.stream and returns the
resulting object.
@@ -253,6 +253,11 @@ class StreamReader(Codec):
is intended to prevent having to decode huge files in one
step.
+ If firstline is true and a UnicodeDecodeError occurs
+ after the first line terminator in the input, only the first line
+ will be returned; the rest of the input will be kept until the
+ next call to read().
+
The method should use a greedy read strategy meaning that
it should read as much data as is allowed within the
definition of the encoding and the given size, e.g. if
@@ -275,7 +280,16 @@ class StreamReader(Codec):
newdata = self.stream.read(size)
# decode bytes (those remaining from the last call included)
data = self.bytebuffer + newdata
- newchars, decodedbytes = self.decode(data, self.errors)
+ try:
+ newchars, decodedbytes = self.decode(data, self.errors)
+ except UnicodeDecodeError, exc:
+ if firstline:
+ newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
+ lines = newchars.splitlines(True)
+ if len(lines)<=1:
+ raise
+ else:
+ raise
# keep undecoded bytes until the next call
self.bytebuffer = data[decodedbytes:]
# put new characters in the character buffer
@@ -306,7 +320,7 @@ class StreamReader(Codec):
line = ""
# If size is given, we call read() only once
while True:
- data = self.read(readsize)
+ data = self.read(readsize, firstline=True)
if data:
# If we're at a "\r" read one extra character (which might
# be a "\n") to get a proper line ending. If the stream is
diff --git a/Misc/NEWS b/Misc/NEWS
index b88608e..a65db9d 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -191,6 +191,10 @@ Extension Modules
Library
-------
+- Bug #1178484: Return complete lines from codec stream readers
+ even if there is an exception in later lines, resulting in
+ correct line numbers for decoding errors in source code.
+
- Bug #1192315: Disallow negative arguments to clear() in pdb.
- Patch #827386: Support absolute source paths in msvccompiler.py.