Patch #1462790: fix urllib2 ProxyHandler for host:port proxies

author: Georg Brandl <georg@python.org> 2006-04-02 20:45:34 (GMT)
committer: Georg Brandl <georg@python.org> 2006-04-02 20:45:34 (GMT)
commit: 720096a6bffe00e05aa3811c0f7490249903bd3f (patch)
tree: e8d62adfdf5396a28d3bb2be89417babee92aa09 /Lib/urllib2.py
parent: 4eb521e595216a406ad1d3175056dc8cd8be157b (diff)
download: cpython-720096a6bffe00e05aa3811c0f7490249903bd3f.zip
cpython-720096a6bffe00e05aa3811c0f7490249903bd3f.tar.gz
cpython-720096a6bffe00e05aa3811c0f7490249903bd3f.tar.bz2
1 files changed, 89 insertions, 19 deletions
diff --git a/Lib/urllib2.py b/Lib/urllib2.py
index 21f916c..91bcc2b 100644
--- a/Lib/urllib2.py
+++ b/Lib/urllib2.py
@@ -119,7 +119,8 @@ from urllib import (unwrap, unquote, splittype, splithost, quote,
 # support for FileHandler, proxies via environment variables
 from urllib import localhost, url2pathname, getproxies
 
-__version__ = "2.5"
+# used in the User-Agent header that is sent
+__version__ = sys.version[:3]
 
 _opener = None
 def urlopen(url, data=None):
@@ -563,6 +564,80 @@ class HTTPRedirectHandler(BaseHandler):
               "lead to an infinite loop.\n" \
               "The last 30x error message was:\n"
 
+
+def _parse_proxy(proxy):
+    """Return (scheme, user, password, host/port) given a URL or an authority.
+
+    If a URL is supplied, it must have an authority (host:port) component.
+    According to RFC 3986, having an authority component means the URL must
+    have two slashes after the scheme:
+
+    >>> _parse_proxy('file:/ftp.example.com/')
+    Traceback (most recent call last):
+    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
+
+    The first three items of the returned tuple may be None.
+
+    Examples of authority parsing:
+
+    >>> _parse_proxy('proxy.example.com')
+    (None, None, None, 'proxy.example.com')
+    >>> _parse_proxy('proxy.example.com:3128')
+    (None, None, None, 'proxy.example.com:3128')
+
+    The authority component may optionally include userinfo (assumed to be
+    username:password):
+
+    >>> _parse_proxy('joe:password@proxy.example.com')
+    (None, 'joe', 'password', 'proxy.example.com')
+    >>> _parse_proxy('joe:password@proxy.example.com:3128')
+    (None, 'joe', 'password', 'proxy.example.com:3128')
+
+    Same examples, but with URLs instead:
+
+    >>> _parse_proxy('http://proxy.example.com/')
+    ('http', None, None, 'proxy.example.com')
+    >>> _parse_proxy('http://proxy.example.com:3128/')
+    ('http', None, None, 'proxy.example.com:3128')
+    >>> _parse_proxy('http://joe:password@proxy.example.com/')
+    ('http', 'joe', 'password', 'proxy.example.com')
+    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
+    ('http', 'joe', 'password', 'proxy.example.com:3128')
+
+    Everything after the authority is ignored:
+
+    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
+    ('ftp', 'joe', 'password', 'proxy.example.com')
+
+    Test for no trailing '/' case:
+
+    >>> _parse_proxy('http://joe:password@proxy.example.com')
+    ('http', 'joe', 'password', 'proxy.example.com')
+
+    """
+    from urlparse import _splitnetloc
+    scheme, r_scheme = splittype(proxy)
+    if not r_scheme.startswith("/"):
+        # authority
+        scheme = None
+        authority = proxy
+    else:
+        # URL
+        if not r_scheme.startswith("//"):
+            raise ValueError("proxy URL with no authority: %r" % proxy)
+        # We have an authority, so for RFC 3986-compliant URLs (see sections
+        # 3 and 3.3), the path is either empty or starts with '/'
+        end = r_scheme.find("/", 2)
+        if end == -1:
+            end = None
+        authority = r_scheme[2:end]
+    userinfo, hostport = splituser(authority)
+    if userinfo is not None:
+        user, password = splitpasswd(userinfo)
+    else:
+        user = password = None
+    return scheme, user, password, hostport
+
 class ProxyHandler(BaseHandler):
     # Proxies must be in front
     handler_order = 100
@@ -579,30 +654,25 @@ class ProxyHandler(BaseHandler):
 
     def proxy_open(self, req, proxy, type):
         orig_type = req.get_type()
-        type, r_type = splittype(proxy)
-        if not type or r_type.isdigit():
-            # proxy is specified without protocol
-            type = orig_type
-            host = proxy
-        else:
-            host, r_host = splithost(r_type)
-        user_pass, host = splituser(host)
-        user, password = splitpasswd(user_pass)
+        proxy_type, user, password, hostport = _parse_proxy(proxy)
+        if proxy_type is None:
+            proxy_type = orig_type
         if user and password:
-            user, password = user_pass.split(':', 1)
-            user_pass = base64.encodestring('%s:%s' % (unquote(user),
-                                            unquote(password))).strip()
-            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
-        host = unquote(host)
-        req.set_proxy(host, type)
-        if orig_type == type:
+            user_pass = '%s:%s' % (unquote(user), unquote(password))
+            creds = base64.encodestring(user_pass).strip()
+            req.add_header('Proxy-authorization', 'Basic ' + creds)
+        hostport = unquote(hostport)
+        req.set_proxy(hostport, proxy_type)
+        if orig_type == proxy_type:
             # let other handlers take care of it
-            # XXX this only makes sense if the proxy is before the
-            # other handlers
             return None
         else:
             # need to start over, because the other handlers don't
             # grok the proxy's URL type
+            # e.g. if we have a constructor arg proxies like so:
+            # {'http': 'ftp://proxy.example.com'}, we may end up turning
+            # a request for http://acme.example.com/a into one for
+            # ftp://proxy.example.com/a
             return self.parent.open(req)
 
 # feature suggested by Duncan Booth
author	Georg Brandl <georg@python.org>	2006-04-02 20:45:34 (GMT)
committer	Georg Brandl <georg@python.org>	2006-04-02 20:45:34 (GMT)
commit	720096a6bffe00e05aa3811c0f7490249903bd3f (patch)
tree	e8d62adfdf5396a28d3bb2be89417babee92aa09 /Lib/urllib2.py
parent	4eb521e595216a406ad1d3175056dc8cd8be157b (diff)
download	cpython-720096a6bffe00e05aa3811c0f7490249903bd3f.zip cpython-720096a6bffe00e05aa3811c0f7490249903bd3f.tar.gz cpython-720096a6bffe00e05aa3811c0f7490249903bd3f.tar.bz2