Patch [ 1062060 ] fix for 1016880 urllib.urlretrieve silently truncates dwnld

author: Georg Brandl <georg@python.org> 2005-08-24 18:46:39 (GMT)
committer: Georg Brandl <georg@python.org> 2005-08-24 18:46:39 (GMT)
commit: b925602f169d47270a064cf9eb03e21706ed25c3 (patch)
tree: eb4e791e8253f66340e95dd7f19f57a0d9112f5c
parent: 568973181aa523bbcf7f827b3a2eb2affd96ea67 (diff)
download: cpython-b925602f169d47270a064cf9eb03e21706ed25c3.zip
cpython-b925602f169d47270a064cf9eb03e21706ed25c3.tar.gz
cpython-b925602f169d47270a064cf9eb03e21706ed25c3.tar.bz2
3 files changed, 48 insertions, 2 deletions
diff --git a/Doc/lib/liburllib.tex b/Doc/lib/liburllib.tex
index dafdd91..5e488c4 100644
--- a/Doc/lib/liburllib.tex
+++ b/Doc/lib/liburllib.tex
@@ -142,6 +142,25 @@ If the \var{url} uses the \file{http:} scheme identifier, the optional
 (normally the request type is \code{GET}).  The \var{data} argument
 must in standard \mimetype{application/x-www-form-urlencoded} format;
 see the \function{urlencode()} function below.
+
+\versionchanged[
+\function{urlretrieve()} will raise \exception{ContentTooShortError}
+when it detects that the amount of data available 
+was less than the expected amount (which is the size reported by a 
+\var{Content-Length} header). This can occur, for example, when the 
+download is interrupted.
+
+The \var{Content-Length} is treated as a lower bound: if there's more data 
+to read, urlretrieve reads more data, but if less data is available, 
+it raises the exception.
+
+You can still retrieve the downloaded data in this case; it is stored
+in the \member{content} attribute of the exception instance.
+
+If no \var{Content-Length} header was supplied, urlretrieve cannot
+check the size of the data it has downloaded, and just returns it.
+In this case you just have to assume that the download was successful.]{2.5}
+
 \end{funcdesc}
 
 \begin{datadesc}{_urlopener}
@@ -283,6 +302,15 @@ subclass may override this method to support more appropriate behavior
 if needed.}
 \end{classdesc}
 
+\begin{excclassdesc}{ContentTooShortError}{msg\optional{, content}}
+This exception is raised when the \function{urlretrieve()} function
+detects that the amount of the downloaded data is less than the 
+expected amount (given by the \var{Content-Length} header). The
+\member{content} attribute stores the downloaded (and supposedly
+truncated) data.
+\versionadded{2.5}
+\end{excclassdesc}
+
 Restrictions:
 
 \begin{itemize}
@@ -317,7 +345,7 @@ Web client using these functions without using threads.
 \item
 The data returned by \function{urlopen()} or \function{urlretrieve()}
 is the raw data returned by the server.  This may be binary data
-(e.g. an image), plain text or (for example) HTML\index{HTML}.  The
+(such as an image), plain text or (for example) HTML\index{HTML}.  The
 HTTP\indexii{HTTP}{protocol} protocol provides type information in the
 reply header, which can be inspected by looking at the
 \mailheader{Content-Type} header.  For the
diff --git a/Lib/urllib.py b/Lib/urllib.py
index 74b2aec..4f1ebdd 100644
--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@@ -86,6 +86,11 @@ def urlcleanup():
     if _urlopener:
         _urlopener.cleanup()
 
+# exception raised when downloaded size is less than content-length
+class ContentTooShortError(IOError):
+    def __init__(self, message, content):
+        IOError.__init__(self, message)
+        self.content = content
 
 ftpcache = {}
 class URLopener:
@@ -228,24 +233,33 @@ class URLopener:
             self.tempcache[url] = result
         bs = 1024*8
         size = -1
+        read = 0
         blocknum = 1
         if reporthook:
             if "content-length" in headers:
                 size = int(headers["Content-Length"])
             reporthook(0, bs, size)
         block = fp.read(bs)
+        read += len(block)
         if reporthook:
             reporthook(1, bs, size)
         while block:
             tfp.write(block)
             block = fp.read(bs)
-            blocknum = blocknum + 1
+            read += len(block)
+            blocknum += 1
             if reporthook:
                 reporthook(blocknum, bs, size)
         fp.close()
         tfp.close()
         del fp
         del tfp
+
+        # raise exception if fewer bytes were read than content-length header
+        if size >= 0 and read < size:
+            raise ContentTooShortError("retrieval incomplete: got only %i out "
+                                       "of %i bytes" % (read, size), result)
+
         return result
 
     # Each method named open_<type> knows how to open that type of URL
diff --git a/Misc/NEWS b/Misc/NEWS
index 7e21b7a..fab6163 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -193,6 +193,10 @@ Extension Modules
 Library
 -------
 
+- Patch #1062060: urllib.urlretrieve() now raises a new exception, named
+  ContentTooShortError, when the actually downloaded size is less than
+  the size given by the Content-Length header.
+
 - Bug #1121494: distutils.dir_utils.mkpath now accepts Unicode strings.
 
 - Bug #1178484: Return complete lines from codec stream readers
author	Georg Brandl <georg@python.org>	2005-08-24 18:46:39 (GMT)
committer	Georg Brandl <georg@python.org>	2005-08-24 18:46:39 (GMT)
commit	b925602f169d47270a064cf9eb03e21706ed25c3 (patch)
tree	eb4e791e8253f66340e95dd7f19f57a0d9112f5c
parent	568973181aa523bbcf7f827b3a2eb2affd96ea67 (diff)
download	cpython-b925602f169d47270a064cf9eb03e21706ed25c3.zip cpython-b925602f169d47270a064cf9eb03e21706ed25c3.tar.gz cpython-b925602f169d47270a064cf9eb03e21706ed25c3.tar.bz2