Backport checkin (and the appropriate fix to the test):

If the data read from the bytestream in readline() ends in a '\r', read one more byte, even if the user has passed a size parameter. This extra byte shouldn't cause a buffer overflow in the tokenizer. The original plan was to return a line ending in '\r', which might be recognizable as a complete line, and skip any '\n' that was read afterwards. Unfortunately this didn't work, as the tokenizer only recognizes '\n' as a line end, which in turn led to joined lines and SyntaxErrors, so this special treatment of a split '\r\n' has been dropped. (It can only happen with a temporarily exhausted bytestream now anyway.) Fixes parts of SF bugs #1163244 and #1175396.
author: Walter Dörwald <walter@livinglogic.de> 2005-04-21 21:53:43 (GMT)
committer: Walter Dörwald <walter@livinglogic.de> 2005-04-21 21:53:43 (GMT)
commit: 4d3fec604dadbffe5211c6ca21fa05c72cbceefd (patch)
tree: 6e8f2d9451889a0884ac2b6aabfcbd1c549ebce7
parent: 21287ee5bc46d54cf041f75f857c0fc0e5dde754 (diff)
download: cpython-4d3fec604dadbffe5211c6ca21fa05c72cbceefd.zip
cpython-4d3fec604dadbffe5211c6ca21fa05c72cbceefd.tar.gz
cpython-4d3fec604dadbffe5211c6ca21fa05c72cbceefd.tar.bz2
3 files changed, 12 insertions, 12 deletions
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 3db9248..3b7c8bf 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -230,7 +230,6 @@ class StreamReader(Codec):
         self.errors = errors
         self.bytebuffer = ""
         self.charbuffer = u""
-        self.atcr = False
 
     def decode(self, input, errors='strict'):
         raise NotImplementedError
@@ -306,18 +305,12 @@ class StreamReader(Codec):
         # If size is given, we call read() only once
         while True:
             data = self.read(readsize)
-            if self.atcr and data.startswith(u"\n"):
-                data = data[1:]
             if data:
-                self.atcr = data.endswith(u"\r")
-                # If we're at a "\r" (and are allowed to read more), read one
-                # extra character (which might be a "\n") to get a proper
-                # line ending. (If the stream is temporarily exhausted we return
-                # the wrong line ending, but at least we won't generate a bogus
-                # second line.)
-                if self.atcr and size is None:
+                # If we're at a "\r" read one extra character (which might
+                # be a "\n") to get a proper line ending. If the stream is
+                # temporarily exhausted we return the wrong line ending.
+                if data.endswith(u"\r"):
                     data += self.read(size=1, chars=1)
-                    self.atcr = data.endswith(u"\r")
 
             line += data
             lines = line.splitlines(True)
@@ -367,7 +360,6 @@ class StreamReader(Codec):
         """
         self.bytebuffer = ""
         self.charbuffer = u""
-        self.atcr = False
 
     def seek(self, offset, whence=0):
         """ Set the input stream's current position.
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 7a19b32..93c5ff1 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -176,6 +176,7 @@ class ReadTest(unittest.TestCase):
         writer.write(u"foo\r")
         self.assertEqual(reader.readline(keepends=False), u"foo")
         writer.write(u"\nbar\r")
+        self.assertEqual(reader.readline(keepends=False), u"")
         self.assertEqual(reader.readline(keepends=False), u"bar")
         writer.write(u"baz")
         self.assertEqual(reader.readline(keepends=False), u"baz")
@@ -185,6 +186,7 @@ class ReadTest(unittest.TestCase):
         writer.write(u"foo\r")
         self.assertEqual(reader.readline(keepends=True), u"foo\r")
         writer.write(u"\nbar\r")
+        self.assertEqual(reader.readline(keepends=True), u"\n")
         self.assertEqual(reader.readline(keepends=True), u"bar\r")
         writer.write(u"baz")
         self.assertEqual(reader.readline(keepends=True), u"baz")
diff --git a/Misc/NEWS b/Misc/NEWS
index 6e48305..af55fb7 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -32,6 +32,12 @@ Library
 - distutils.commands.register now encodes the data as UTF-8 before posting
   them to PyPI.
 
+- Partial fixes for SF bugs #1163244 and #1175396: If a chunk read by
+  ``codecs.StreamReader.readline()`` has a trailing "\r", read one more
+  character even if the user has passed a size parameter to get a proper
+  line ending. Remove the special handling of a "\r\n" that has been split
+  between two lines.
+
 
 What's New in Python 2.4.1 final?
 =================================
author	Walter Dörwald <walter@livinglogic.de>	2005-04-21 21:53:43 (GMT)
committer	Walter Dörwald <walter@livinglogic.de>	2005-04-21 21:53:43 (GMT)
commit	4d3fec604dadbffe5211c6ca21fa05c72cbceefd (patch)
tree	6e8f2d9451889a0884ac2b6aabfcbd1c549ebce7
parent	21287ee5bc46d54cf041f75f857c0fc0e5dde754 (diff)
download	cpython-4d3fec604dadbffe5211c6ca21fa05c72cbceefd.zip cpython-4d3fec604dadbffe5211c6ca21fa05c72cbceefd.tar.gz cpython-4d3fec604dadbffe5211c6ca21fa05c72cbceefd.tar.bz2