From b925602f169d47270a064cf9eb03e21706ed25c3 Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Wed, 24 Aug 2005 18:46:39 +0000 Subject: Patch [ 1062060 ] fix for 1016880 urllib.urlretrieve silently truncates dwnld --- Doc/lib/liburllib.tex | 30 +++++++++++++++++++++++++++++- Lib/urllib.py | 16 +++++++++++++++- Misc/NEWS | 4 ++++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/Doc/lib/liburllib.tex b/Doc/lib/liburllib.tex index dafdd91..5e488c4 100644 --- a/Doc/lib/liburllib.tex +++ b/Doc/lib/liburllib.tex @@ -142,6 +142,25 @@ If the \var{url} uses the \file{http:} scheme identifier, the optional (normally the request type is \code{GET}). The \var{data} argument must in standard \mimetype{application/x-www-form-urlencoded} format; see the \function{urlencode()} function below. + +\versionchanged[ +\function{urlretrieve()} will raise \exception{ContentTooShortError} +when it detects that the amount of data available +was less than the expected amount (which is the size reported by a +\var{Content-Length} header). This can occur, for example, when the +download is interrupted. + +The \var{Content-Length} is treated as a lower bound: if there's more data +to read, urlretrieve reads more data, but if less data is available, +it raises the exception. + +You can still retrieve the downloaded data in this case; it is stored +in the \member{content} attribute of the exception instance. + +If no \var{Content-Length} header was supplied, urlretrieve cannot +check the size of the data it has downloaded, and just returns it. 
+In this case you just have to assume that the download was successful]{2.5} + \end{funcdesc} \begin{datadesc}{_urlopener} @@ -283,6 +302,15 @@ subclass may override this method to support more appropriate behavior if needed.} \end{classdesc} +\begin{excclassdesc}{ContentTooShortError}{msg\optional{, content}} +This exception is raised when the \function{urlretrieve()} function +detects that the amount of the downloaded data is less than the +expected amount (given by the \var{Content-Length} header). The +\member{content} attribute stores the downloaded (and supposedly +truncated) data. +\versionadded{2.5} +\end{excclassdesc} + Restrictions: \begin{itemize} @@ -317,7 +345,7 @@ Web client using these functions without using threads. \item The data returned by \function{urlopen()} or \function{urlretrieve()} is the raw data returned by the server. This may be binary data -(e.g. an image), plain text or (for example) HTML\index{HTML}. The +(such as an image), plain text or (for example) HTML\index{HTML}. The HTTP\indexii{HTTP}{protocol} protocol provides type information in the reply header, which can be inspected by looking at the \mailheader{Content-Type} header. 
For the diff --git a/Lib/urllib.py b/Lib/urllib.py index 74b2aec..4f1ebdd 100644 --- a/Lib/urllib.py +++ b/Lib/urllib.py @@ -86,6 +86,11 @@ def urlcleanup(): if _urlopener: _urlopener.cleanup() +# exception raised when downloaded size does not match content-length +class ContentTooShortError(IOError): + def __init__(self, message, content): + IOError.__init__(self, message) + self.content = content ftpcache = {} class URLopener: @@ -228,24 +233,33 @@ class URLopener: self.tempcache[url] = result bs = 1024*8 size = -1 + read = 0 blocknum = 1 if reporthook: if "content-length" in headers: size = int(headers["Content-Length"]) reporthook(0, bs, size) block = fp.read(bs) + read += len(block) if reporthook: reporthook(1, bs, size) while block: tfp.write(block) block = fp.read(bs) - blocknum = blocknum + 1 + read += len(block) + blocknum += 1 if reporthook: reporthook(blocknum, bs, size) fp.close() tfp.close() del fp del tfp + + # raise exception if actual size does not match content-length header + if size >= 0 and read < size: + raise ContentTooShortError("retrieval incomplete: got only %i out " + "of %i bytes" % (read, size), result) + return result # Each method named open_ knows how to open that type of URL diff --git a/Misc/NEWS b/Misc/NEWS index 7e21b7a..fab6163 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -193,6 +193,10 @@ Extension Modules Library ------- +- Patch #1062060: urllib.urlretrieve() now raises a new exception, named + ContentTooShortError, when the actually downloaded size does not + match the Content-Length header. + - Bug #1121494: distutils.dir_utils.mkpath now accepts Unicode strings. - Bug #1178484: Return complete lines from codec stream readers -- cgit v0.12