From b925602f169d47270a064cf9eb03e21706ed25c3 Mon Sep 17 00:00:00 2001
From: Georg Brandl <georg@python.org>
Date: Wed, 24 Aug 2005 18:46:39 +0000
Subject: Patch [ 1062060 ] fix for 1016880 urllib.urlretrieve silently
 truncates dwnld

---
 Doc/lib/liburllib.tex | 30 +++++++++++++++++++++++++++++-
 Lib/urllib.py         | 16 +++++++++++++++-
 Misc/NEWS             |  4 ++++
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/Doc/lib/liburllib.tex b/Doc/lib/liburllib.tex
index dafdd91..5e488c4 100644
--- a/Doc/lib/liburllib.tex
+++ b/Doc/lib/liburllib.tex
@@ -142,6 +142,25 @@ If the \var{url} uses the \file{http:} scheme identifier, the optional
 (normally the request type is \code{GET}).  The \var{data} argument
 must in standard \mimetype{application/x-www-form-urlencoded} format;
 see the \function{urlencode()} function below.
+
+\versionchanged[
+\function{urlretrieve()} will raise \exception{ContentTooShortError}
+when it detects that the amount of data available 
+was less than the expected amount (which is the size reported by a 
+\var{Content-Length} header). This can occur, for example, when the 
+download is interrupted.
+
+The \var{Content-Length} is treated as a lower bound: if there's more data 
+to read, urlretrieve reads more data, but if less data is available, 
+it raises the exception.
+
+You can still retrieve the downloaded data in this case; it is stored
+in the \member{content} attribute of the exception instance.
+
+If no \var{Content-Length} header was supplied, urlretrieve cannot
+check the size of the data it has downloaded, and just returns it.
+In this case you just have to assume that the download was successful.]{2.5}
+
 \end{funcdesc}
 
 \begin{datadesc}{_urlopener}
@@ -283,6 +302,15 @@ subclass may override this method to support more appropriate behavior
 if needed.}
 \end{classdesc}
 
+\begin{excclassdesc}{ContentTooShortError}{msg\optional{, content}}
+This exception is raised when the \function{urlretrieve()} function
+detects that the amount of the downloaded data is less than the 
+expected amount (given by the \var{Content-Length} header). The
+\member{content} attribute stores the downloaded (and supposedly
+truncated) data.
+\versionadded{2.5}
+\end{excclassdesc}
+
 Restrictions:
 
 \begin{itemize}
@@ -317,7 +345,7 @@ Web client using these functions without using threads.
 \item
 The data returned by \function{urlopen()} or \function{urlretrieve()}
 is the raw data returned by the server.  This may be binary data
-(e.g. an image), plain text or (for example) HTML\index{HTML}.  The
+(such as an image), plain text or (for example) HTML\index{HTML}.  The
 HTTP\indexii{HTTP}{protocol} protocol provides type information in the
 reply header, which can be inspected by looking at the
 \mailheader{Content-Type} header.  For the
diff --git a/Lib/urllib.py b/Lib/urllib.py
index 74b2aec..4f1ebdd 100644
--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@@ -86,6 +86,11 @@ def urlcleanup():
     if _urlopener:
         _urlopener.cleanup()
 
+# exception raised when downloaded size is less than content-length
+class ContentTooShortError(IOError):
+    def __init__(self, message, content):
+        IOError.__init__(self, message)
+        self.content = content
 
 ftpcache = {}
 class URLopener:
@@ -228,24 +233,33 @@ class URLopener:
             self.tempcache[url] = result
         bs = 1024*8
         size = -1
+        read = 0
         blocknum = 1
         if reporthook:
             if "content-length" in headers:
                 size = int(headers["Content-Length"])
             reporthook(0, bs, size)
         block = fp.read(bs)
+        read += len(block)
         if reporthook:
             reporthook(1, bs, size)
         while block:
             tfp.write(block)
             block = fp.read(bs)
-            blocknum = blocknum + 1
+            read += len(block)
+            blocknum += 1
             if reporthook:
                 reporthook(blocknum, bs, size)
         fp.close()
         tfp.close()
         del fp
         del tfp
+
+        # raise exception if actual size does not match content-length header
+        if size >= 0 and read < size:
+            raise ContentTooShortError("retrieval incomplete: got only %i out "
+                                       "of %i bytes" % (read, size), result)
+
         return result
 
     # Each method named open_<type> knows how to open that type of URL
diff --git a/Misc/NEWS b/Misc/NEWS
index 7e21b7a..fab6163 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -193,6 +193,10 @@ Extension Modules
 Library
 -------
 
+- Patch #1062060: urllib.urlretrieve() now raises a new exception, named
+  ContentTooShortError, when the actually downloaded size is less than
+  the size given by the Content-Length header.
+
 - Bug #1121494: distutils.dir_utils.mkpath now accepts Unicode strings.
 
 - Bug #1178484: Return complete lines from codec stream readers
-- 
cgit v0.12