diff options
Diffstat (limited to 'Lib/urllib2.py')
-rw-r--r-- | Lib/urllib2.py | 108 |
1 files changed, 89 insertions, 19 deletions
diff --git a/Lib/urllib2.py b/Lib/urllib2.py index 21f916c..91bcc2b 100644 --- a/Lib/urllib2.py +++ b/Lib/urllib2.py @@ -119,7 +119,8 @@ from urllib import (unwrap, unquote, splittype, splithost, quote, # support for FileHandler, proxies via environment variables from urllib import localhost, url2pathname, getproxies -__version__ = "2.5" +# used in User-Agent header sent +__version__ = sys.version[:3] _opener = None def urlopen(url, data=None): @@ -563,6 +564,80 @@ class HTTPRedirectHandler(BaseHandler): "lead to an infinite loop.\n" \ "The last 30x error message was:\n" + +def _parse_proxy(proxy): + """Return (scheme, user, password, host/port) given a URL or an authority. + + If a URL is supplied, it must have an authority (host:port) component. + According to RFC 3986, having an authority component means the URL must + have two slashes after the scheme: + + >>> _parse_proxy('file:/ftp.example.com/') + Traceback (most recent call last): + ValueError: proxy URL with no authority: 'file:/ftp.example.com/' + + The first three items of the returned tuple may be None. + + Examples of authority parsing: + + >>> _parse_proxy('proxy.example.com') + (None, None, None, 'proxy.example.com') + >>> _parse_proxy('proxy.example.com:3128') + (None, None, None, 'proxy.example.com:3128') + + The authority component may optionally include userinfo (assumed to be + username:password): + + >>> _parse_proxy('joe:password@proxy.example.com') + (None, 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('joe:password@proxy.example.com:3128') + (None, 'joe', 'password', 'proxy.example.com:3128') + + Same examples, but with URLs instead: + + >>> _parse_proxy('http://proxy.example.com/') + ('http', None, None, 'proxy.example.com') + >>> _parse_proxy('http://proxy.example.com:3128/') + ('http', None, None, 'proxy.example.com:3128') + >>> _parse_proxy('http://joe:password@proxy.example.com/') + ('http', 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('http://joe:password@proxy.example.com:3128') + ('http', 'joe', 'password', 'proxy.example.com:3128') + + Everything after the authority is ignored: + + >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') + ('ftp', 'joe', 'password', 'proxy.example.com') + + Test for no trailing '/' case: + + >>> _parse_proxy('http://joe:password@proxy.example.com') + ('http', 'joe', 'password', 'proxy.example.com') + + """ + from urlparse import _splitnetloc + scheme, r_scheme = splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. + # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = splituser(authority) + if userinfo is not None: + user, password = splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + class ProxyHandler(BaseHandler): # Proxies must be in front handler_order = 100 @@ -579,30 +654,25 @@ class ProxyHandler(BaseHandler): def proxy_open(self, req, proxy, type): orig_type = req.get_type() - type, r_type = splittype(proxy) - if not type or r_type.isdigit(): - # proxy is specified without protocol - type = orig_type - host = proxy - else: - host, r_host = splithost(r_type) - user_pass, host = splituser(host) - user, password = splitpasswd(user_pass) + proxy_type, user, password, hostport = _parse_proxy(proxy) + if proxy_type is None: + proxy_type = orig_type if user and password: - user, password = user_pass.split(':', 1) - user_pass = base64.encodestring('%s:%s' % (unquote(user), - unquote(password))).strip() - req.add_header('Proxy-authorization', 'Basic ' + user_pass) - host = unquote(host) - req.set_proxy(host, type) - if orig_type == type: + user_pass = '%s:%s' % (unquote(user), unquote(password)) + creds = base64.encodestring(user_pass).strip() + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type: # let other handlers take care of it - # XXX this only makes sense if the proxy is before the - # other handlers return None else: # need to start over, because the other handlers don't # grok the proxy's URL type + # e.g. if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a return self.parent.open(req) # feature suggested by Duncan Booth |