From fe7c30956c5cc00a90b8cf62e3caf03019efee59 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 15 Sep 2005 05:28:25 +0000 Subject: Sync-up with patches to the head. Includes SF 1016880: urllib.urlretrieve silently truncates downloads and the performance fix-ups. --- Lib/urllib.py | 114 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/Lib/urllib.py b/Lib/urllib.py index 46045c3..bc16be0 100644 --- a/Lib/urllib.py +++ b/Lib/urllib.py @@ -86,6 +86,11 @@ def urlcleanup(): if _urlopener: _urlopener.cleanup() +# exception raised when downloaded size does not match content-length +class ContentTooShortError(IOError): + def __init__(self, message, content): + IOError.__init__(self, message) + self.content = content ftpcache = {} class URLopener: @@ -228,6 +233,7 @@ class URLopener: self.tempcache[url] = result bs = 1024*8 size = -1 + read = 0 blocknum = 0 if reporthook: if "content-length" in headers: @@ -237,6 +243,7 @@ class URLopener: block = fp.read(bs) if block == "": break + read += len(block) tfp.write(block) blocknum += 1 if reporthook: @@ -245,6 +252,12 @@ class URLopener: tfp.close() del fp del tfp + + # raise exception if actual size does not match content-length header + if size >= 0 and read < size: + raise ContentTooShortError("retrieval incomplete: got only %i out " + "of %i bytes" % (read, size), result) + return result # Each method named open_ knows how to open that type of URL @@ -409,7 +422,11 @@ class URLopener: def open_local_file(self, url): """Use local file.""" - import mimetypes, mimetools, email.Utils, StringIO + import mimetypes, mimetools, email.Utils + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO host, file = splithost(url) localname = url2pathname(file) try: @@ -419,7 +436,7 @@ class URLopener: size = stats.st_size modified = email.Utils.formatdate(stats.st_mtime, usegmt=True) mtype = mimetypes.guess_type(url)[0] - headers = mimetools.Message(StringIO.StringIO( + headers = mimetools.Message(StringIO( 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified))) if not host: @@ -440,7 +457,11 @@ class URLopener: def open_ftp(self, url): """Use FTP protocol.""" - import mimetypes, mimetools, StringIO + import mimetypes, mimetools + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO host, path = splithost(url) if not host: raise IOError, ('ftp error', 'no host given') host, port = splitport(host) @@ -489,7 +510,7 @@ class URLopener: headers += "Content-Type: %s\n" % mtype if retrlen is not None and retrlen >= 0: headers += "Content-Length: %d\n" % retrlen - headers = mimetools.Message(StringIO.StringIO(headers)) + headers = mimetools.Message(StringIO(headers)) return addinfourl(fp, headers, "ftp:" + url) except ftperrors(), msg: raise IOError, ('ftp error', msg), sys.exc_info()[2] @@ -503,7 +524,11 @@ class URLopener: # mediatype := [ type "/" subtype ] *( ";" parameter ) # data := *urlchar # parameter := attribute "=" value - import StringIO, mimetools + import mimetools + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO try: [type, data] = url.split(',', 1) except ValueError: @@ -529,7 +554,7 @@ class URLopener: msg.append('') msg.append(data) msg = '\n'.join(msg) - f = StringIO.StringIO(msg) + f = StringIO(msg) headers = mimetools.Message(f, 0) f.fileno = None # needed for addinfourl return addinfourl(f, headers, url) @@ -696,8 +721,11 @@ def noheaders(): global _noheaders if _noheaders is None: import mimetools - import StringIO - _noheaders = mimetools.Message(StringIO.StringIO(), 0) + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + _noheaders = mimetools.Message(StringIO(), 0) _noheaders.fp.close() # Recycle file descriptor return _noheaders @@ -1021,23 +1049,18 @@ def splitgophertype(selector): return selector[1], selector[2:] return None, selector +_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) +_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) + def unquote(s): """unquote('abc%20def') -> 'abc def'.""" - mychr = chr - myatoi = int - list = s.split('%') - res = [list[0]] - myappend = res.append - del list[0] - for item in list: - if item[1:2]: - try: - myappend(mychr(myatoi(item[:2], 16)) - + item[2:]) - except ValueError: - myappend('%' + item) - else: - myappend('%' + item) + res = s.split('%') + for i in xrange(1, len(res)): + item = res[i] + try: + res[i] = _hextochr[item[:2]] + item[2:] + except KeyError: + res[i] = '%' + item return "".join(res) def unquote_plus(s): @@ -1048,22 +1071,7 @@ def unquote_plus(s): always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' '0123456789' '_.-') - -_fast_safe_test = always_safe + '/' -_fast_safe = None - -def _fast_quote(s): - global _fast_safe - if _fast_safe is None: - _fast_safe = {} - for c in _fast_safe_test: - _fast_safe[c] = c - res = list(s) - for i in range(len(res)): - c = res[i] - if not c in _fast_safe: - res[i] = '%%%02X' % ord(c) - return ''.join(res) +_safemaps = {} def quote(s, safe = '/'): """quote('abc def') -> 'abc%20def' @@ -1086,25 +1094,25 @@ def quote(s, safe = '/'): called on a path where the existing slash characters are used as reserved characters. """ - safe = always_safe + safe - if _fast_safe_test == safe: - return _fast_quote(s) - res = list(s) - for i in range(len(res)): - c = res[i] - if c not in safe: - res[i] = '%%%02X' % ord(c) + cachekey = (safe, always_safe) + try: + safe_map = _safemaps[cachekey] + except KeyError: + safe += always_safe + safe_map = {} + for i in range(256): + c = chr(i) + safe_map[c] = (c in safe) and c or ('%%%02X' % i) + _safemaps[cachekey] = safe_map + res = map(safe_map.__getitem__, s) return ''.join(res) def quote_plus(s, safe = ''): """Quote the query fragment of a URL; replacing ' ' with '+'""" if ' ' in s: - l = s.split(' ') - for i in range(len(l)): - l[i] = quote(l[i], safe) - return '+'.join(l) - else: - return quote(s, safe) + s = quote(s, safe + ' ') + return s.replace(' ', '+') + return quote(s, safe) def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. -- cgit v0.12