diff options
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- | Lib/urllib/error.py | 7 | ||||
-rw-r--r-- | Lib/urllib/parse.py | 204 | ||||
-rw-r--r-- | Lib/urllib/request.py | 160 | ||||
-rw-r--r-- | Lib/urllib/robotparser.py | 4 |
4 files changed, 226 insertions, 149 deletions
diff --git a/Lib/urllib/error.py b/Lib/urllib/error.py index 45b7169..c5b675d 100644 --- a/Lib/urllib/error.py +++ b/Lib/urllib/error.py @@ -35,6 +35,7 @@ class URLError(OSError): def __str__(self): return '<urlopen error %s>' % self.reason + class HTTPError(URLError, urllib.response.addinfourl): """Raised when HTTP error occurs, but also acts like non-error return""" __super_init = urllib.response.addinfourl.__init__ @@ -55,6 +56,9 @@ class HTTPError(URLError, urllib.response.addinfourl): def __str__(self): return 'HTTP Error %s: %s' % (self.code, self.msg) + def __repr__(self): + return '<HTTPError %s: %r>' % (self.code, self.msg) + # since URLError specifies a .reason attribute, HTTPError should also # provide this attribute. See issue13211 for discussion. @property @@ -69,8 +73,9 @@ class HTTPError(URLError, urllib.response.addinfourl): def headers(self, headers): self.hdrs = headers -# exception raised when downloaded size does not match content-length + class ContentTooShortError(URLError): + """Exception raised when downloaded size does not match content-length.""" def __init__(self, message, content): URLError.__init__(self, message) self.content = content diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index d368331..4d7fcec 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -34,7 +34,9 @@ import collections __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", "urlsplit", "urlunsplit", "urlencode", "parse_qs", "parse_qsl", "quote", "quote_plus", "quote_from_bytes", - "unquote", "unquote_plus", "unquote_to_bytes"] + "unquote", "unquote_plus", "unquote_to_bytes", + "DefragResult", "ParseResult", "SplitResult", + "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"] # A classification of schemes ('' means apply by default) uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', @@ -409,11 +411,13 @@ def urljoin(base, url, allow_fragments=True): return url if not url: return base + base, url, _coerce_result = _coerce_args(base, url) bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ urlparse(base, '', allow_fragments) scheme, netloc, path, params, query, fragment = \ urlparse(url, bscheme, allow_fragments) + if scheme != bscheme or scheme not in uses_relative: return _coerce_result(url) if scheme in uses_netloc: @@ -421,9 +425,7 @@ def urljoin(base, url, allow_fragments=True): return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) netloc = bnetloc - if path[:1] == '/': - return _coerce_result(urlunparse((scheme, netloc, path, - params, query, fragment))) + if not path and not params: path = bpath params = bparams @@ -431,29 +433,45 @@ def urljoin(base, url, allow_fragments=True): query = bquery return _coerce_result(urlunparse((scheme, netloc, path, params, query, fragment))) - segments = bpath.split('/')[:-1] + path.split('/') - # XXX The stuff below is bogus in various ways... - if segments[-1] == '.': - segments[-1] = '' - while '.' in segments: - segments.remove('.') - while 1: - i = 1 - n = len(segments) - 1 - while i < n: - if (segments[i] == '..' - and segments[i-1] not in ('', '..')): - del segments[i-1:i+1] - break - i = i+1 + + base_parts = bpath.split('/') + if base_parts[-1] != '': + # the last item is not a directory, so will not be taken into account + # in resolving the relative path + del base_parts[-1] + + # for rfc3986, ignore all base path should the first character be root. + if path[:1] == '/': + segments = path.split('/') + else: + segments = base_parts + path.split('/') + # filter out elements that would cause redundant slashes on re-joining + # the resolved_path + segments[1:-1] = filter(None, segments[1:-1]) + + resolved_path = [] + + for seg in segments: + if seg == '..': + try: + resolved_path.pop() + except IndexError: + # ignore any .. segments that would otherwise cause an IndexError + # when popped from resolved_path if resolving for rfc3986 + pass + elif seg == '.': + continue else: - break - if segments == ['', '..']: - segments[-1] = '' - elif len(segments) >= 2 and segments[-1] == '..': - segments[-2:] = [''] - return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments), - params, query, fragment))) + resolved_path.append(seg) + + if segments[-1] in ('.', '..'): + # do some post-processing here. if the last segment was a relative dir, + # then we need to append the trailing '/' + resolved_path.append('') + + return _coerce_result(urlunparse((scheme, netloc, '/'.join( + resolved_path) or '/', params, query, fragment))) + def urldefrag(url): """Removes any existing fragment from URL. @@ -641,7 +659,7 @@ class Quoter(collections.defaultdict): def __repr__(self): # Without this, will just display as a defaultdict - return "<Quoter %r>" % dict(self) + return "<%s %r>" % (self.__class__.__name__, dict(self)) def __missing__(self, b): # Handle a cache miss. Store quoted string in cache and return. @@ -732,7 +750,8 @@ def quote_from_bytes(bs, safe='/'): _safe_quoters[safe] = quoter = Quoter(safe).__getitem__ return ''.join([quoter(char) for char in bs]) -def urlencode(query, doseq=False, safe='', encoding=None, errors=None): +def urlencode(query, doseq=False, safe='', encoding=None, errors=None, + quote_via=quote_plus): """Encode a dict or sequence of two-element tuples into a URL query string. If any values in the query arg are sequences and doseq is true, each @@ -744,8 +763,8 @@ def urlencode(query, doseq=False, safe='', encoding=None, errors=None): The components of a query arg may each be either a string or a bytes type. - The safe, encoding, and errors parameters are passed down to quote_plus() - (encoding and errors only if a component is a str). + The safe, encoding, and errors parameters are passed down to the function + specified by quote_via (encoding and errors only if a component is a str). """ if hasattr(query, "items"): @@ -771,27 +790,27 @@ def urlencode(query, doseq=False, safe='', encoding=None, errors=None): if not doseq: for k, v in query: if isinstance(k, bytes): - k = quote_plus(k, safe) + k = quote_via(k, safe) else: - k = quote_plus(str(k), safe, encoding, errors) + k = quote_via(str(k), safe, encoding, errors) if isinstance(v, bytes): - v = quote_plus(v, safe) + v = quote_via(v, safe) else: - v = quote_plus(str(v), safe, encoding, errors) + v = quote_via(str(v), safe, encoding, errors) l.append(k + '=' + v) else: for k, v in query: if isinstance(k, bytes): - k = quote_plus(k, safe) + k = quote_via(k, safe) else: - k = quote_plus(str(k), safe, encoding, errors) + k = quote_via(str(k), safe, encoding, errors) if isinstance(v, bytes): - v = quote_plus(v, safe) + v = quote_via(v, safe) l.append(k + '=' + v) elif isinstance(v, str): - v = quote_plus(v, safe, encoding, errors) + v = quote_via(v, safe, encoding, errors) l.append(k + '=' + v) else: try: @@ -799,33 +818,18 @@ def urlencode(query, doseq=False, safe='', encoding=None, errors=None): x = len(v) except TypeError: # not a sequence - v = quote_plus(str(v), safe, encoding, errors) + v = quote_via(str(v), safe, encoding, errors) l.append(k + '=' + v) else: # loop over the sequence for elt in v: if isinstance(elt, bytes): - elt = quote_plus(elt, safe) + elt = quote_via(elt, safe) else: - elt = quote_plus(str(elt), safe, encoding, errors) + elt = quote_via(str(elt), safe, encoding, errors) l.append(k + '=' + elt) return '&'.join(l) -# Utilities to parse URLs (most of these return None for missing parts): -# unwrap('<URL:type://host/path>') --> 'type://host/path' -# splittype('type:opaquestring') --> 'type', 'opaquestring' -# splithost('//host[:port]/path') --> 'host[:port]', '/path' -# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' -# splitpasswd('user:passwd') -> 'user', 'passwd' -# splitport('host:port') --> 'host', 'port' -# splitquery('/path?query') --> '/path', 'query' -# splittag('/path#tag') --> '/path', 'tag' -# splitattr('/path;attr1=value1;attr2=value2;...') -> -# '/path', ['attr1=value1', 'attr2=value2', ...] -# splitvalue('attr=value') --> 'attr', 'value' -# urllib.parse.unquote('abc%20def') -> 'abc def' -# quote('abc def') -> 'abc%20def') - def to_bytes(url): """to_bytes(u"URL") --> 'URL'.""" # Most URL schemes require ASCII. If that changes, the conversion @@ -852,12 +856,12 @@ def splittype(url): """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" global _typeprog if _typeprog is None: - _typeprog = re.compile('^([^/:]+):') + _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) match = _typeprog.match(url) if match: - scheme = match.group(1) - return scheme.lower(), url[len(scheme) + 1:] + scheme, data = match.groups() + return scheme.lower(), data return None, url _hostprog = None @@ -865,38 +869,25 @@ def splithost(url): """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" global _hostprog if _hostprog is None: - _hostprog = re.compile('^//([^/?]*)(.*)$') + _hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL) match = _hostprog.match(url) if match: - host_port = match.group(1) - path = match.group(2) - if path and not path.startswith('/'): + host_port, path = match.groups() + if path and path[0] != '/': path = '/' + path return host_port, path return None, url -_userprog = None def splituser(host): """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" - global _userprog - if _userprog is None: - _userprog = re.compile('^(.*)@(.*)$') - - match = _userprog.match(host) - if match: return match.group(1, 2) - return None, host + user, delim, host = host.rpartition('@') + return (user if delim else None), host -_passwdprog = None def splitpasswd(user): """splitpasswd('user:passwd') -> 'user', 'passwd'.""" - global _passwdprog - if _passwdprog is None: - _passwdprog = re.compile('^([^:]*):(.*)$',re.S) - - match = _passwdprog.match(user) - if match: return match.group(1, 2) - return user, None + user, delim, passwd = user.partition(':') + return user, (passwd if delim else None) # splittag('/path#tag') --> '/path', 'tag' _portprog = None @@ -904,7 +895,7 @@ def splitport(host): """splitport('host:port') --> 'host', 'port'.""" global _portprog if _portprog is None: - _portprog = re.compile('^(.*):([0-9]*)$') + _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL) match = _portprog.match(host) if match: @@ -913,47 +904,34 @@ def splitport(host): return host, port return host, None -_nportprog = None def splitnport(host, defport=-1): """Split host and port, returning numeric port. Return given default port if no ':' found; defaults to -1. Return numerical port if a valid number are found after ':'. Return None if ':' but not a valid number.""" - global _nportprog - if _nportprog is None: - _nportprog = re.compile('^(.*):(.*)$') - - match = _nportprog.match(host) - if match: - host, port = match.group(1, 2) - if port: - try: - nport = int(port) - except ValueError: - nport = None - return host, nport + host, delim, port = host.rpartition(':') + if not delim: + host = port + elif port: + try: + nport = int(port) + except ValueError: + nport = None + return host, nport return host, defport -_queryprog = None def splitquery(url): """splitquery('/path?query') --> '/path', 'query'.""" - global _queryprog - if _queryprog is None: - _queryprog = re.compile('^(.*)\?([^?]*)$') - - match = _queryprog.match(url) - if match: return match.group(1, 2) + path, delim, query = url.rpartition('?') + if delim: + return path, query return url, None -_tagprog = None def splittag(url): """splittag('/path#tag') --> '/path', 'tag'.""" - global _tagprog - if _tagprog is None: - _tagprog = re.compile('^(.*)#([^#]*)$') - - match = _tagprog.match(url) - if match: return match.group(1, 2) + path, delim, tag = url.rpartition('#') + if delim: + return path, tag return url, None def splitattr(url): @@ -962,13 +940,7 @@ def splitattr(url): words = url.split(';') return words[0], words[1:] -_valueprog = None def splitvalue(attr): """splitvalue('attr=value') --> 'attr', 'value'.""" - global _valueprog - if _valueprog is None: - _valueprog = re.compile('^([^=]*)=(.*)$') - - match = _valueprog.match(attr) - if match: return match.group(1, 2) - return attr, None + attr, delim, value = attr.partition('=') + return attr, (value if delim else None) diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index f769386..3be327d 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -91,6 +91,7 @@ import os import posixpath import re import socket +import string import sys import time import collections @@ -120,9 +121,10 @@ __all__ = [ 'Request', 'OpenerDirector', 'BaseHandler', 'HTTPDefaultErrorHandler', 'HTTPRedirectHandler', 'HTTPCookieProcessor', 'ProxyHandler', 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', - 'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', - 'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', - 'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler', + 'HTTPPasswordMgrWithPriorAuth', 'AbstractBasicAuthHandler', + 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler', 'AbstractDigestAuthHandler', + 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler', 'HTTPHandler', + 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler', 'UnknownHandler', 'HTTPErrorProcessor', # Functions 'urlopen', 'install_opener', 'build_opener', @@ -615,8 +617,12 @@ class HTTPRedirectHandler(BaseHandler): # from the user (of urllib.request, in this case). In practice, # essentially all clients do redirect in this case, so we do # the same. - # be conciliant with URIs containing a space + + # Be conciliant with URIs containing a space. This is mainly + # redundant with the more complete encoding done in http_error_302(), + # but it is kept for compatibility with other callers. newurl = newurl.replace(' ', '%20') + CONTENT_HEADERS = ("content-length", "content-type") newheaders = dict((k, v) for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS) @@ -651,11 +657,16 @@ class HTTPRedirectHandler(BaseHandler): "%s - Redirection to url '%s' is not allowed" % (msg, newurl), headers, fp) - if not urlparts.path: + if not urlparts.path and urlparts.netloc: urlparts = list(urlparts) urlparts[2] = "/" newurl = urlunparse(urlparts) + # http.client.parse_headers() decodes as ISO-8859-1. Recover the + # original bytes and percent-encode non-ASCII bytes, and any special + # characters such as the space. + newurl = quote( + newurl, encoding="iso-8859-1", safe=string.punctuation) newurl = urljoin(req.full_url, newurl) # XXX Probably want to forget about the state of the current @@ -836,6 +847,37 @@ class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): return HTTPPasswordMgr.find_user_password(self, None, authuri) +class HTTPPasswordMgrWithPriorAuth(HTTPPasswordMgrWithDefaultRealm): + + def __init__(self, *args, **kwargs): + self.authenticated = {} + super().__init__(*args, **kwargs) + + def add_password(self, realm, uri, user, passwd, is_authenticated=False): + self.update_authenticated(uri, is_authenticated) + # Add a default for prior auth requests + if realm is not None: + super().add_password(None, uri, user, passwd) + super().add_password(realm, uri, user, passwd) + + def update_authenticated(self, uri, is_authenticated=False): + # uri could be a single URI or a sequence + if isinstance(uri, str): + uri = [uri] + + for default_port in True, False: + for u in uri: + reduced_uri = self.reduce_uri(u, default_port) + self.authenticated[reduced_uri] = is_authenticated + + def is_authenticated(self, authuri): + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uri in self.authenticated: + if self.is_suburi(uri, reduced_authuri): + return self.authenticated[uri] + + class AbstractBasicAuthHandler: # XXX this allows for multiple auth-schemes, but will stupidly pick @@ -890,6 +932,31 @@ class AbstractBasicAuthHandler: else: return None + def http_request(self, req): + if (not hasattr(self.passwd, 'is_authenticated') or + not self.passwd.is_authenticated(req.full_url)): + return req + + if not req.has_header('Authorization'): + user, passwd = self.passwd.find_user_password(None, req.full_url) + credentials = '{0}:{1}'.format(user, passwd).encode() + auth_str = base64.standard_b64encode(credentials).decode() + req.add_unredirected_header('Authorization', + 'Basic {}'.format(auth_str.strip())) + return req + + def http_response(self, req, response): + if hasattr(self.passwd, 'is_authenticated'): + if 200 <= response.code < 300: + self.passwd.update_authenticated(req.full_url, True) + else: + self.passwd.update_authenticated(req.full_url, False) + return response + + https_request = http_request + https_response = http_response + + class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): @@ -1054,6 +1121,9 @@ class AbstractDigestAuthHandler: elif algorithm == 'SHA': H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() # XXX MD5-sess + else: + raise ValueError("Unsupported digest authentication " + "algorithm %r" % algorithm) KD = lambda s, d: H("%s:%s" % (s, d)) return H, KD @@ -1151,6 +1221,7 @@ class AbstractHTTPHandler(BaseHandler): # will parse host:port h = http_class(host, timeout=req.timeout, **http_conn_args) + h.set_debuglevel(self._debuglevel) headers = dict(req.unredirected_hdrs) headers.update(dict((k, v) for k, v in req.headers.items() @@ -1993,18 +2064,20 @@ class FancyURLopener(URLopener): def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): """Error 302 -- relocated (temporarily).""" self.tries += 1 - if self.maxtries and self.tries >= self.maxtries: - if hasattr(self, "http_error_500"): - meth = self.http_error_500 - else: - meth = self.http_error_default + try: + if self.maxtries and self.tries >= self.maxtries: + if hasattr(self, "http_error_500"): + meth = self.http_error_500 + else: + meth = self.http_error_default + return meth(url, fp, 500, + "Internal Server Error: Redirect Recursion", + headers) + result = self.redirect_internal(url, fp, errcode, errmsg, + headers, data) + return result + finally: self.tries = 0 - return meth(url, fp, 500, - "Internal Server Error: Redirect Recursion", headers) - result = self.redirect_internal(url, fp, errcode, errmsg, headers, - data) - self.tries = 0 - return result def redirect_internal(self, url, fp, errcode, errmsg, headers, data): if 'location' in headers: @@ -2333,26 +2406,41 @@ def getproxies_environment(): """ proxies = {} + # in order to prefer lowercase variables, process environment in + # two passes: first matches any, second pass matches lowercase only for name, value in os.environ.items(): name = name.lower() if value and name[-6:] == '_proxy': proxies[name[:-6]] = value - # CVE-2016-1000110 - If we are running as CGI script, forget HTTP_PROXY # (non-all-lowercase) as it may be set from the web server by a "Proxy:" # header from the client + # If "proxy" is lowercase, it will still be used thanks to the next block if 'REQUEST_METHOD' in os.environ: proxies.pop('http', None) - + for name, value in os.environ.items(): + if name[-6:] == '_proxy': + name = name.lower() + if value: + proxies[name[:-6]] = value + else: + proxies.pop(name[:-6], None) return proxies -def proxy_bypass_environment(host): +def proxy_bypass_environment(host, proxies=None): """Test if proxies should not be used for a particular host. - Checks the environment for a variable named no_proxy, which should - be a list of DNS suffixes separated by commas, or '*' for all hosts. + Checks the proxy dict for the value of no_proxy, which should + be a list of comma separated DNS suffixes, or '*' for all hosts. + """ - no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '') + if proxies is None: + proxies = getproxies_environment() + # don't bypass, if no_proxy isn't specified + try: + no_proxy = proxies['no'] + except KeyError: + return 0 # '*' is special case for always bypass if no_proxy == '*': return 1 @@ -2361,8 +2449,12 @@ def proxy_bypass_environment(host): # check if the host ends with any of the DNS suffixes no_proxy_list = [proxy.strip() for proxy in no_proxy.split(',')] for name in no_proxy_list: - if name and (hostonly.endswith(name) or host.endswith(name)): - return 1 + if name: + name = re.escape(name) + pattern = r'(.+\.)?%s$' % name + if (re.match(pattern, hostonly, re.I) + or re.match(pattern, host, re.I)): + return 1 # otherwise, don't bypass return 0 @@ -2447,8 +2539,15 @@ if sys.platform == 'darwin': def proxy_bypass(host): - if getproxies_environment(): - return proxy_bypass_environment(host) + """Return True, if host should be bypassed. + + Checks proxy settings gathered from the environment, if specified, + or from the MacOSX framework SystemConfiguration. + + """ + proxies = getproxies_environment() + if proxies: + return proxy_bypass_environment(host, proxies) else: return proxy_bypass_macosx_sysconf(host) @@ -2562,14 +2661,15 @@ elif os.name == 'nt': return 0 def proxy_bypass(host): - """Return a dictionary of scheme -> proxy server URL mappings. + """Return True, if host should be bypassed. - Returns settings gathered from the environment, if specified, + Checks proxy settings gathered from the environment, if specified, or the registry. """ - if getproxies_environment(): - return proxy_bypass_environment(host) + proxies = getproxies_environment() + if proxies: + return proxy_bypass_environment(host, proxies) else: return proxy_bypass_registry(host) diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 1d7b751..8b69fd9 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -132,7 +132,7 @@ class RobotFileParser: return True # Until the robots.txt file has been read or found not # to exist, we must assume that no url is allowable. - # This prevents false positives when a user erronenously + # This prevents false positives when a user erroneously # calls can_fetch() before calling read(). if not self.last_checked: return False @@ -172,7 +172,7 @@ class RuleLine: return self.path == "*" or filename.startswith(self.path) def __str__(self): - return (self.allowance and "Allow" or "Disallow") + ": " + self.path + return ("Allow" if self.allowance else "Disallow") + ": " + self.path class Entry: |