From e24f96a05973ddbb59d88c03570aef8545c5ef10 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Tue, 13 Mar 2012 19:29:33 -0700 Subject: Issue10050 - urlretrieve uses newer urlopen. reporthook of urlretrieve takes, block number, block read size, file_size --- Doc/howto/urllib2.rst | 7 ++++ Doc/library/urllib.request.rst | 39 +++++++++++--------- Lib/test/test_urllib.py | 34 +++++++++--------- Lib/urllib/request.py | 80 +++++++++++++++++++++++++++++++++++++----- Misc/ACKS | 1 + 5 files changed, 119 insertions(+), 42 deletions(-) diff --git a/Doc/howto/urllib2.rst b/Doc/howto/urllib2.rst index 76286bd..058cf96 100644 --- a/Doc/howto/urllib2.rst +++ b/Doc/howto/urllib2.rst @@ -56,6 +56,13 @@ The simplest way to use urllib.request is as follows:: response = urllib.request.urlopen('http://python.org/') html = response.read() +If you wish to retrieve a resource via URL and store it in a temporary location, +you can do so via the :func:`urlretrieve` function:: + + import urllib.request + local_filename, headers = urllib.request.urlretrieve('http://python.org/') + html = open(local_filename) + Many uses of urllib will be that simple (note that instead of an 'http:' URL we could have used an URL starting with 'ftp:', 'file:', etc.). However, it's the purpose of this tutorial to explain the more complicated cases, concentrating on diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 29a8413..d624f8a 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -1124,16 +1124,14 @@ The following functions and classes are ported from the Python 2 module ``urllib`` (as opposed to ``urllib2``). They might become deprecated at some point in the future. - .. function:: urlretrieve(url, filename=None, reporthook=None, data=None) - Copy a network object denoted by a URL to a local file, if necessary. If the URL - points to a local file, or a valid cached copy of the object exists, the object - is not copied. 
Return a tuple ``(filename, headers)`` where *filename* is the + Copy a network object denoted by a URL to a local file. If the URL + points to a local file, the object will not be copied unless filename is supplied. + Return a tuple ``(filename, headers)`` where *filename* is the local file name under which the object can be found, and *headers* is whatever the :meth:`info` method of the object returned by :func:`urlopen` returned (for - a remote object, possibly cached). Exceptions are the same as for - :func:`urlopen`. + a remote object). Exceptions are the same as for :func:`urlopen`. The second argument, if present, specifies the file location to copy to (if absent, the location will be a tempfile with a generated name). The third @@ -1144,11 +1142,18 @@ some point in the future. third argument may be ``-1`` on older FTP servers which do not return a file size in response to a retrieval request. + The following example illustrates the most common usage scenario:: + + >>> import urllib.request + >>> local_filename, headers = urllib.request.urlretrieve('http://python.org/') + >>> html = open(local_filename) + >>> html.close() + If the *url* uses the :file:`http:` scheme identifier, the optional *data* - argument may be given to specify a ``POST`` request (normally the request type - is ``GET``). The *data* argument must in standard - :mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode` - function below. + argument may be given to specify a ``POST`` request (normally the request + type is ``GET``). The *data* argument must be in standard + :mimetype:`application/x-www-form-urlencoded` format; see the + :func:`urlencode` function below. :func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that the amount of data available was less than the expected amount (which is the @@ -1156,20 +1161,20 @@ some point in the future. the download is interrupted. 
The *Content-Length* is treated as a lower bound: if there's more data to read, - :func:`urlretrieve` reads more data, but if less data is available, it raises - the exception. + urlretrieve reads more data, but if less data is available, it raises the + exception. You can still retrieve the downloaded data in this case, it is stored in the :attr:`content` attribute of the exception instance. - If no *Content-Length* header was supplied, :func:`urlretrieve` can not check - the size of the data it has downloaded, and just returns it. In this case - you just have to assume that the download was successful. + If no *Content-Length* header was supplied, urlretrieve can not check the size + of the data it has downloaded, and just returns it. In this case you just have + to assume that the download was successful. .. function:: urlcleanup() - Clear the cache that may have been built up by previous calls to - :func:`urlretrieve`. + Cleans up temporary files that may have been left behind by previous + calls to :func:`urlretrieve`. .. class:: URLopener(proxies=None, **x509) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index b2680ed..85f8f84 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -384,11 +384,11 @@ class urlretrieve_FileTests(unittest.TestCase): def test_reporthook(self): # Make sure that the reporthook works. 
- def hooktester(count, block_size, total_size, count_holder=[0]): - self.assertIsInstance(count, int) - self.assertIsInstance(block_size, int) - self.assertIsInstance(total_size, int) - self.assertEqual(count, count_holder[0]) + def hooktester(block_count, block_read_size, file_size, count_holder=[0]): + self.assertIsInstance(block_count, int) + self.assertIsInstance(block_read_size, int) + self.assertIsInstance(file_size, int) + self.assertEqual(block_count, count_holder[0]) count_holder[0] = count_holder[0] + 1 second_temp = "%s.2" % support.TESTFN self.registerFileForCleanUp(second_temp) @@ -399,8 +399,8 @@ class urlretrieve_FileTests(unittest.TestCase): def test_reporthook_0_bytes(self): # Test on zero length file. Should call reporthook only 1 time. report = [] - def hooktester(count, block_size, total_size, _report=report): - _report.append((count, block_size, total_size)) + def hooktester(block_count, block_read_size, file_size, _report=report): + _report.append((block_count, block_read_size, file_size)) srcFileName = self.createNewTempFile() urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) @@ -410,31 +410,31 @@ class urlretrieve_FileTests(unittest.TestCase): def test_reporthook_5_bytes(self): # Test on 5 byte file. Should call reporthook only 2 times (once when # the "network connection" is established and once when the block is - # read). Since the block size is 8192 bytes, only one block read is - # required to read the entire file. + # read). 
report = [] - def hooktester(count, block_size, total_size, _report=report): - _report.append((count, block_size, total_size)) + def hooktester(block_count, block_read_size, file_size, _report=report): + _report.append((block_count, block_read_size, file_size)) srcFileName = self.createNewTempFile(b"x" * 5) urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 2) - self.assertEqual(report[0][1], 8192) - self.assertEqual(report[0][2], 5) + self.assertEqual(report[0][1], 0) + self.assertEqual(report[1][1], 5) def test_reporthook_8193_bytes(self): # Test on 8193 byte file. Should call reporthook only 3 times (once # when the "network connection" is established, once for the next 8192 # bytes, and once for the last byte). report = [] - def hooktester(count, block_size, total_size, _report=report): - _report.append((count, block_size, total_size)) + def hooktester(block_count, block_read_size, file_size, _report=report): + _report.append((block_count, block_read_size, file_size)) srcFileName = self.createNewTempFile(b"x" * 8193) urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 3) - self.assertEqual(report[0][1], 8192) - self.assertEqual(report[0][2], 8193) + self.assertEqual(report[0][1], 0) + self.assertEqual(report[1][1], 8192) + self.assertEqual(report[2][1], 1) class urlretrieve_HttpTests(unittest.TestCase, FakeHTTPMixin): diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 90dfcff..c220a7d 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -94,6 +94,9 @@ import socket import sys import time import collections +import tempfile +import contextlib + from urllib.error import URLError, HTTPError, ContentTooShortError from urllib.parse import ( @@ -156,17 +159,78 @@ def install_opener(opener): global _opener _opener = opener -# TODO(jhylton): Make this work with the same global opener. 
-_urlopener = None +_url_tempfiles = [] def urlretrieve(url, filename=None, reporthook=None, data=None): - global _urlopener - if not _urlopener: - _urlopener = FancyURLopener() - return _urlopener.retrieve(url, filename, reporthook, data) + """ + Retrieve a URL into a temporary location on disk. + + Requires a URL argument. If a filename is passed, it is used as + the temporary file location. The reporthook argument should be + a callable that accepts a block number, a read size, and the + total file size of the URL target. The data argument should be + valid URL encoded data. + + If a filename is passed and the URL points to a local resource, + the result is a copy from local file to new file. + + Returns a tuple containing the path to the newly created + data file as well as the resulting HTTPMessage object. + """ + url_type, path = splittype(url) + + with contextlib.closing(urlopen(url, data)) as fp: + headers = fp.info() + + # Just return the local path and the "headers" for file:// + # URLs. No sense in performing a copy unless requested. + if url_type == "file" and not filename: + return os.path.normpath(path), headers + + # Handle temporary file setup. 
+ if filename: + tfp = open(filename, 'wb') + else: + tfp = tempfile.NamedTemporaryFile(delete=False) + filename = tfp.name + _url_tempfiles.append(filename) + + with tfp: + result = filename, headers + bs = 1024*8 + size = -1 + read = 0 + blocknum = 0 + if "content-length" in headers: + size = int(headers["Content-Length"]) + + if reporthook: + reporthook(blocknum, 0, size) + + while True: + block = fp.read(bs) + if not block: + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, len(block), size) + + if size >= 0 and read < size: + raise ContentTooShortError( + "retrieval incomplete: got only %i out of %i bytes" + % (read, size), result) + + return result def urlcleanup(): - if _urlopener: - _urlopener.cleanup() + for temp_file in _url_tempfiles: + try: + os.unlink(temp_file) + except EnvironmentError: + pass + + del _url_tempfiles[:] global _opener if _opener: _opener = None diff --git a/Misc/ACKS b/Misc/ACKS index 48bdde4..a11d4eb 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -288,6 +288,7 @@ Julien Élie Lance Ellinghaus David Ely Jeff Epler +Jeff McNeil Tom Epperly Stoffel Erasmus Jürgen A. Erhard -- cgit v0.12