40 files changed, 3189 insertions, 3535 deletions
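The hunks that follow perform one mechanical rename: Python 2's urllib, urllib2, urlparse, and robotparser modules become the Python 3 urllib package (urllib.parse, urllib.request, urllib.error, urllib.robotparser). A minimal sketch of the mapping this diff applies, using a placeholder URL:

import urllib.parse
import urllib.request
import urllib.error
import urllib.robotparser

url = "http://www.example.com/path;params?spam=eggs#frag"  # placeholder

# urlparse.urlparse(...)            -> urllib.parse.urlparse(...)
scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

# urllib.quote/unquote/urlencode    -> urllib.parse.quote/unquote/urlencode
quoted = urllib.parse.quote("a b=c", safe="")

# urllib2.Request / urllib2.urlopen -> urllib.request.Request / urlopen
req = urllib.request.Request(url)
try:
    f = urllib.request.urlopen(req)
except urllib.error.HTTPError as e:  # was urllib2.HTTPError
    print(e.code, e.msg)
except urllib.error.URLError as e:   # was urllib2.URLError
    print(e.reason)

# robotparser.RobotFileParser       -> urllib.robotparser.RobotFileParser
parser = urllib.robotparser.RobotFileParser()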
@@ -35,7 +35,7 @@ from operator import attrgetter from io import StringIO import sys import os -import urllib +import urllib.parse import email.parser __all__ = ["MiniFieldStorage", "FieldStorage", @@ -216,8 +216,8 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0): else: continue if len(nv[1]) or keep_blank_values: - name = urllib.unquote(nv[0].replace('+', ' ')) - value = urllib.unquote(nv[1].replace('+', ' ')) + name = urllib.parse.unquote(nv[0].replace('+', ' ')) + value = urllib.parse.unquote(nv[1].replace('+', ' ')) r.append((name, value)) return r diff --git a/Lib/distutils/command/register.py b/Lib/distutils/command/register.py index 89cb2d4..6d5c459 100644 --- a/Lib/distutils/command/register.py +++ b/Lib/distutils/command/register.py @@ -7,8 +7,9 @@ Implements the Distutils 'register' command (register with the repository). __revision__ = "$Id$" -import os, string, urllib2, getpass, urlparse +import os, string, getpass import io +import urllib.parse, urllib.request from distutils.core import PyPIRCCommand from distutils.errors import * @@ -94,7 +95,8 @@ class register(PyPIRCCommand): def classifiers(self): ''' Fetch the list of classifiers from the server. ''' - response = urllib2.urlopen(self.repository+'?:action=list_classifiers') + url = self.repository+'?:action=list_classifiers' + response = urllib.request.urlopen(url) print(response.read()) def verify_metadata(self): @@ -166,8 +168,8 @@ Your selection [default 1]: ''', end=' ') password = getpass.getpass('Password: ') # set up the authentication - auth = urllib2.HTTPPasswordMgr() - host = urlparse.urlparse(self.repository)[1] + auth = urllib.request.HTTPPasswordMgr() + host = urllib.parse.urlparse(self.repository)[1] auth.add_password(self.realm, host, username, password) # send the info to the server and report the result code, result = self.post_to_server(self.build_post_data('submit'), @@ -276,20 +278,20 @@ Your selection [default 1]: ''', end=' ') 'Content-type': 'multipart/form-data; boundary=%s; charset=utf-8'%boundary, 'Content-length': str(len(body)) } - req = urllib2.Request(self.repository, body, headers) + req = urllib.request.Request(self.repository, body, headers) # handle HTTP and include the Basic Auth handler - opener = urllib2.build_opener( - urllib2.HTTPBasicAuthHandler(password_mgr=auth) + opener = urllib.request.build_opener( + urllib.request.HTTPBasicAuthHandler(password_mgr=auth) ) data = '' try: result = opener.open(req) - except urllib2.HTTPError as e: + except urllib.error.HTTPError as e: if self.show_response: data = e.fp.read() result = e.code, e.msg - except urllib2.URLError as e: + except urllib.error.URLError as e: result = 500, str(e) else: if self.show_response: diff --git a/Lib/distutils/command/upload.py b/Lib/distutils/command/upload.py index 2cad2c7..5049f03 100644 --- a/Lib/distutils/command/upload.py +++ b/Lib/distutils/command/upload.py @@ -13,7 +13,7 @@ import platform import configparser import http.client import base64 -import urlparse +import urllib.parse class upload(PyPIRCCommand): @@ -145,10 +145,11 @@ class upload(PyPIRCCommand): self.announce("Submitting %s to %s" % (filename, self.repository), log.INFO) # build the Request - # We can't use urllib2 since we need to send the Basic + # We can't use urllib since we need to send the Basic # auth right with the first request + # TODO(jhylton): Can we fix urllib? 
schema, netloc, url, params, query, fragments = \ - urlparse.urlparse(self.repository) + urllib.parse.urlparse(self.repository) assert not params and not query and not fragments if schema == 'http': http = http.client.HTTPConnection(netloc) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 0439aff..e1d21f6 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -25,6 +25,7 @@ import time import base64 import random import socket +import urllib.parse import warnings from io import StringIO @@ -218,8 +219,7 @@ def encode_rfc2231(s, charset=None, language=None): charset is given but not language, the string is encoded using the empty string for language. """ - import urllib - s = urllib.quote(s, safe='') + s = urllib.parse.quote(s, safe='') if charset is None and language is None: return s if language is None: @@ -234,7 +234,6 @@ def decode_params(params): params is a sequence of 2-tuples containing (param name, string value). """ - import urllib # Copy params so we don't mess with the original params = params[:] new_params = [] @@ -272,7 +271,7 @@ def decode_params(params): # language specifiers at the beginning of the string. for num, s, encoded in continuations: if encoded: - s = urllib.unquote(s) + s = urllib.parse.unquote(s) extended = True value.append(s) value = quote(EMPTYSTRING.join(value)) diff --git a/Lib/http/client.py b/Lib/http/client.py index 04e75f6..96bcd72 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -70,7 +70,7 @@ import io import socket import email.parser import email.message -from urlparse import urlsplit +from urllib.parse import urlsplit import warnings __all__ = ["HTTPResponse", "HTTPConnection", diff --git a/Lib/http/cookiejar.py b/Lib/http/cookiejar.py index 99be888..e9b83ea 100644 --- a/Lib/http/cookiejar.py +++ b/Lib/http/cookiejar.py @@ -28,7 +28,10 @@ http://wwwsearch.sf.net/): __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] -import re, urlparse, copy, time, urllib +import copy +import re +import time +import urllib.parse, urllib.request try: import threading as _threading except ImportError: @@ -580,7 +583,7 @@ def request_host(request): """ url = request.get_full_url() - host = urlparse.urlparse(url)[1] + host = urllib.parse.urlparse(url)[1] if host == "": host = request.get_header("Host", "") @@ -602,13 +605,11 @@ def eff_request_host(request): def request_path(request): """request-URI, as defined by RFC 2965.""" url = request.get_full_url() - #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url) - #req_path = escape_path("".join(urlparse.urlparse(url)[2:])) - path, parameters, query, frag = urlparse.urlparse(url)[2:] + path, parameters, query, frag = urllib.parse.urlparse(url)[2:] if parameters: path = "%s;%s" % (path, parameters) path = escape_path(path) - req_path = urlparse.urlunparse(("", "", path, "", query, frag)) + req_path = urllib.parse.urlunparse(("", "", path, "", query, frag)) if not req_path.startswith("/"): # fix bad RFC 2396 absoluteURI req_path = "/"+req_path @@ -644,7 +645,7 @@ def escape_path(path): # And here, kind of: draft-fielding-uri-rfc2396bis-03 # (And in draft IRI specification: draft-duerst-iri-05) # (And here, for new URI schemes: RFC 2718) - path = urllib.quote(path, HTTP_PATH_SAFE) + path = urllib.parse.quote(path, HTTP_PATH_SAFE) path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) return path @@ -1197,8 +1198,7 @@ class CookieJar: """Collection of HTTP cookies. 
You may not need to know about this class: try - urllib2.build_opener(HTTPCookieProcessor).open(url). - + urllib.request.build_opener(HTTPCookieProcessor).open(url). """ non_word_re = re.compile(r"\W") diff --git a/Lib/http/server.py b/Lib/http/server.py index 35ade6c..6259a4d 100644 --- a/Lib/http/server.py +++ b/Lib/http/server.py @@ -93,7 +93,7 @@ import cgi import time import socket # For gethostbyaddr() import shutil -import urllib +import urllib.parse import select import mimetypes import posixpath @@ -683,7 +683,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): return None list.sort(key=lambda a: a.lower()) r = [] - displaypath = cgi.escape(urllib.unquote(self.path)) + displaypath = cgi.escape(urllib.parse.unquote(self.path)) r.append('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">') r.append("<html>\n<title>Directory listing for %s</title>\n" % displaypath) r.append("<body>\n<h2>Directory listing for %s</h2>\n" % displaypath) @@ -699,7 +699,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): displayname = name + "@" # Note: a link to a directory displays with @ and links with / r.append('<li><a href="%s">%s</a>\n' - % (urllib.quote(linkname), cgi.escape(displayname))) + % (urllib.parse.quote(linkname), cgi.escape(displayname))) r.append("</ul>\n<hr>\n</body>\n</html>\n") enc = sys.getfilesystemencoding() encoded = ''.join(r).encode(enc) @@ -723,7 +723,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): # abandon query parameters path = path.split('?',1)[0] path = path.split('#',1)[0] - path = posixpath.normpath(urllib.unquote(path)) + path = posixpath.normpath(urllib.parse.unquote(path)) words = path.split('/') words = filter(None, words) path = os.getcwd() @@ -947,7 +947,7 @@ class CGIHTTPRequestHandler(SimpleHTTPRequestHandler): env['SERVER_PROTOCOL'] = self.protocol_version env['SERVER_PORT'] = str(self.server.server_port) env['REQUEST_METHOD'] = self.command - uqrest = urllib.unquote(rest) + uqrest = urllib.parse.unquote(rest) env['PATH_INFO'] = uqrest env['PATH_TRANSLATED'] = self.translate_path(uqrest) env['SCRIPT_NAME'] = scriptname diff --git a/Lib/macurl2path.py b/Lib/macurl2path.py index 0c8b64f..11944cf 100644 --- a/Lib/macurl2path.py +++ b/Lib/macurl2path.py @@ -2,7 +2,7 @@ Do not import directly; use urllib instead.""" -import urllib +import urllib.parse import os __all__ = ["url2pathname","pathname2url"] @@ -13,7 +13,7 @@ def url2pathname(pathname): # # XXXX The .. handling should be fixed... 
# - tp = urllib.splittype(pathname)[0] + tp = urllib.parse.splittype(pathname)[0] if tp and tp != 'file': raise RuntimeError('Cannot convert non-local URL to pathname') # Turn starting /// into /, an empty hostname means current host @@ -47,7 +47,7 @@ def url2pathname(pathname): i = i + 1 rv = ':' + ':'.join(components) # and finally unquote slashes and other funny characters - return urllib.unquote(rv) + return urllib.parse.unquote(rv) def pathname2url(pathname): """OS-specific conversion from a file system path to a relative URL @@ -73,8 +73,8 @@ def pathname2url(pathname): return '/'.join(components) def _pncomp2url(component): - component = urllib.quote(component[:31], safe='') # We want to quote slashes - return component + # We want to quote slashes + return urllib.parse.quote(component[:31], safe='') def test(): for url in ["index.html", diff --git a/Lib/mimetypes.py b/Lib/mimetypes.py index 6c36a9c..5812c0c 100644 --- a/Lib/mimetypes.py +++ b/Lib/mimetypes.py @@ -24,7 +24,7 @@ read_mime_types(file) -- parse one file, return a dictionary or None import os import posixpath -import urllib +import urllib.parse __all__ = [ "guess_type","guess_extension","guess_all_extensions", @@ -104,7 +104,7 @@ class MimeTypes: Optional `strict' argument when False adds a bunch of commonly found, but non-standard types. """ - scheme, url = urllib.splittype(url) + scheme, url = urllib.parse.splittype(url) if scheme == 'data': # syntax of data URLs: # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data diff --git a/Lib/test/regrtest.py b/Lib/test/regrtest.py index 217703b..94c2248 100755 --- a/Lib/test/regrtest.py +++ b/Lib/test/regrtest.py @@ -725,7 +725,7 @@ def dash_R(the_module, test, indirect_test, huntrleaks): def dash_R_cleanup(fs, ps, pic, abcs): import gc, copyreg import _strptime, linecache - import urlparse, urllib, urllib2, mimetypes, doctest + import urllib.parse, urllib.request, mimetypes, doctest import struct, filecmp, _abcoll from distutils.dir_util import _path_created from weakref import WeakSet @@ -758,9 +758,8 @@ def dash_R_cleanup(fs, ps, pic, abcs): _path_created.clear() re.purge() _strptime._regex_cache.clear() - urlparse.clear_cache() - urllib.urlcleanup() - urllib2.install_opener(None) + urllib.parse.clear_cache() + urllib.request.urlcleanup() linecache.clearcache() mimetypes._default_mime_types() filecmp._cache.clear() diff --git a/Lib/test/support.py b/Lib/test/support.py index c011c10..c6ce760 100644 --- a/Lib/test/support.py +++ b/Lib/test/support.py @@ -352,10 +352,10 @@ def check_syntax_error(testcase, statement): testcase.fail('Missing SyntaxError: "%s"' % statement) def open_urlresource(url, *args, **kw): - import urllib, urlparse + import urllib.request, urllib.parse requires('urlfetch') - filename = urlparse.urlparse(url)[2].split('/')[-1] # '/': it's URL! + filename = urllib.parse.urlparse(url)[2].split('/')[-1] # '/': it's URL! for path in [os.path.curdir, os.path.pardir]: fn = os.path.join(path, filename) @@ -363,7 +363,7 @@ def open_urlresource(url, *args, **kw): return open(fn, *args, **kw) print('\tfetching %s ...'
% url, file=get_original_stdout()) - fn, _ = urllib.urlretrieve(url, filename) + fn, _ = urllib.request.urlretrieve(url, filename) return open(fn, *args, **kw) diff --git a/Lib/test/test___all__.py b/Lib/test/test___all__.py index ec1ba16..ba8f75b 100644 --- a/Lib/test/test___all__.py +++ b/Lib/test/test___all__.py @@ -111,7 +111,7 @@ class AllTest(unittest.TestCase): self.check_all("re") self.check_all("reprlib") self.check_all("rlcompleter") - self.check_all("robotparser") + self.check_all("urllib.robotparser") self.check_all("sched") self.check_all("shelve") self.check_all("shlex") @@ -134,8 +134,6 @@ class AllTest(unittest.TestCase): self.check_all("traceback") self.check_all("tty") self.check_all("unittest") - self.check_all("urllib") - self.check_all("urlparse") self.check_all("uu") self.check_all("warnings") self.check_all("wave") diff --git a/Lib/test/test_http_cookiejar.py b/Lib/test/test_http_cookiejar.py index c130190..1627923 100644 --- a/Lib/test/test_http_cookiejar.py +++ b/Lib/test/test_http_cookiejar.py @@ -1,6 +1,6 @@ """Tests for http/cookiejar.py.""" -import re, os, time, urllib2 +import re, os, time, urllib.request from unittest import TestCase from test import support @@ -206,7 +206,7 @@ def interact_netscape(cookiejar, url, *set_cookie_hdrs): def _interact(cookiejar, url, set_cookie_hdrs, hdr_name): """Perform a single request / response cycle, returning Cookie: header.""" - req = urllib2.Request(url) + req = urllib.request.Request(url) cookiejar.add_cookie_header(req) cookie_hdr = req.get_header("Cookie", "") headers = [] @@ -330,7 +330,7 @@ class CookieTests(TestCase): ("http://foo/", "foo.local", True), ("http://foo/", ".local", True), ]: - request = urllib2.Request(url) + request = urllib.request.Request(url) r = pol.domain_return_ok(domain, request) if ok: self.assert_(r) else: self.assert_(not r) @@ -547,46 +547,48 @@ class CookieTests(TestCase): def test_request_path(self): # with parameters - req = urllib2.Request("http://www.example.com/rheum/rhaponicum;" - "foo=bar;sing=song?apples=pears&spam=eggs#ni") + req = urllib.request.Request( + "http://www.example.com/rheum/rhaponicum;" + "foo=bar;sing=song?apples=pears&spam=eggs#ni") self.assertEquals(request_path(req), "/rheum/rhaponicum;" "foo=bar;sing=song?apples=pears&spam=eggs#ni") # without parameters - req = urllib2.Request("http://www.example.com/rheum/rhaponicum?" - "apples=pears&spam=eggs#ni") + req = urllib.request.Request( + "http://www.example.com/rheum/rhaponicum?" + "apples=pears&spam=eggs#ni") self.assertEquals(request_path(req), "/rheum/rhaponicum?" 
"apples=pears&spam=eggs#ni") # missing final slash - req = urllib2.Request("http://www.example.com") + req = urllib.request.Request("http://www.example.com") self.assertEquals(request_path(req), "/") def test_request_port(self): - req = urllib2.Request("http://www.acme.com:1234/", - headers={"Host": "www.acme.com:4321"}) + req = urllib.request.Request("http://www.acme.com:1234/", + headers={"Host": "www.acme.com:4321"}) self.assertEquals(request_port(req), "1234") - req = urllib2.Request("http://www.acme.com/", - headers={"Host": "www.acme.com:4321"}) + req = urllib.request.Request("http://www.acme.com/", + headers={"Host": "www.acme.com:4321"}) self.assertEquals(request_port(req), DEFAULT_HTTP_PORT) def test_request_host(self): # this request is illegal (RFC2616, 14.2.3) - req = urllib2.Request("http://1.1.1.1/", - headers={"Host": "www.acme.com:80"}) + req = urllib.request.Request("http://1.1.1.1/", + headers={"Host": "www.acme.com:80"}) # libwww-perl wants this response, but that seems wrong (RFC 2616, # section 5.2, point 1., and RFC 2965 section 1, paragraph 3) #self.assertEquals(request_host(req), "www.acme.com") self.assertEquals(request_host(req), "1.1.1.1") - req = urllib2.Request("http://www.acme.com/", - headers={"Host": "irrelevant.com"}) + req = urllib.request.Request("http://www.acme.com/", + headers={"Host": "irrelevant.com"}) self.assertEquals(request_host(req), "www.acme.com") # not actually sure this one is valid Request object, so maybe should # remove test for no host in url in request_host function? - req = urllib2.Request("/resource.html", - headers={"Host": "www.acme.com"}) + req = urllib.request.Request("/resource.html", + headers={"Host": "www.acme.com"}) self.assertEquals(request_host(req), "www.acme.com") # port shouldn't be in request-host - req = urllib2.Request("http://www.acme.com:2345/resource.html", - headers={"Host": "www.acme.com:5432"}) + req = urllib.request.Request("http://www.acme.com:2345/resource.html", + headers={"Host": "www.acme.com:5432"}) self.assertEquals(request_host(req), "www.acme.com") def test_is_HDN(self): @@ -766,24 +768,24 @@ class CookieTests(TestCase): blocked_domains=["acme.com"], allowed_domains=["www.acme.com"])) - req = urllib2.Request("http://acme.com/") + req = urllib.request.Request("http://acme.com/") headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] res = FakeResponse(headers, "http://acme.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 0) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 1) - req = urllib2.Request("http://www.coyote.com/") + req = urllib.request.Request("http://www.coyote.com/") res = FakeResponse(headers, "http://www.coyote.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 1) # set a cookie with non-allowed domain... 
- req = urllib2.Request("http://www.coyote.com/") + req = urllib.request.Request("http://www.coyote.com/") res = FakeResponse(headers, "http://www.coyote.com/") cookies = c.make_cookies(res, req) c.set_cookie(cookies[0]) @@ -798,7 +800,7 @@ class CookieTests(TestCase): c = CookieJar(policy=pol) headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 0) @@ -808,11 +810,11 @@ class CookieTests(TestCase): self.assertEquals(len(c), 1) c.clear() - req = urllib2.Request("http://www.roadrunner.net/") + req = urllib.request.Request("http://www.roadrunner.net/") res = FakeResponse(headers, "http://www.roadrunner.net/") c.extract_cookies(res, req) self.assertEquals(len(c), 1) - req = urllib2.Request("http://www.roadrunner.net/") + req = urllib.request.Request("http://www.roadrunner.net/") c.add_cookie_header(req) self.assert_((req.has_header("Cookie") and req.has_header("Cookie2"))) @@ -823,7 +825,7 @@ class CookieTests(TestCase): self.assertEquals(len(c), 1) # set a cookie with blocked domain... - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") res = FakeResponse(headers, "http://www.acme.com/") cookies = c.make_cookies(res, req) c.set_cookie(cookies[0]) @@ -866,7 +868,7 @@ class CookieTests(TestCase): url = "http://www.acme.com" c = CookieJar(DefaultCookiePolicy(rfc2965=True)) interact_2965(c, url, "foo=bar; Version=1") - req = urllib2.Request(url) + req = urllib.request.Request(url) self.assertEquals(len(c), 1) c.add_cookie_header(req) self.assert_(req.has_header("Cookie")) @@ -1009,7 +1011,7 @@ class CookieTests(TestCase): def cookiejar_from_cookie_headers(headers): c = CookieJar() - req = urllib2.Request("http://www.example.com/") + req = urllib.request.Request("http://www.example.com/") r = FakeResponse(headers, "http://www.example.com/") c.extract_cookies(r, req) return c @@ -1080,9 +1082,9 @@ class LWPCookieTests(TestCase): c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) - #req = urllib2.Request("http://1.1.1.1/", + #req = urllib.request.Request("http://1.1.1.1/", # headers={"Host": "www.acme.com:80"}) - req = urllib2.Request("http://www.acme.com:80/", + req = urllib.request.Request("http://www.acme.com:80/", headers={"Host": "www.acme.com:80"}) headers.append( @@ -1091,7 +1093,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") c.add_cookie_header(req) self.assertEqual(req.get_header("Cookie"), "CUSTOMER=WILE_E_COYOTE") @@ -1101,7 +1103,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/foo/bar") + req = urllib.request.Request("http://www.acme.com/foo/bar") c.add_cookie_header(req) h = req.get_header("Cookie") @@ -1112,7 +1114,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") c.add_cookie_header(req) h = req.get_header("Cookie") @@ -1120,7 +1122,7 @@ class LWPCookieTests(TestCase): "CUSTOMER=WILE_E_COYOTE" in h and "SHIPPING=FEDEX" not in h) - req = 
urllib2.Request("http://www.acme.com/foo/") + req = urllib.request.Request("http://www.acme.com/foo/") c.add_cookie_header(req) h = req.get_header("Cookie") @@ -1155,13 +1157,13 @@ class LWPCookieTests(TestCase): c = CookieJar() headers = [] - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") headers.append("Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/") res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") c.add_cookie_header(req) self.assertEquals(req.get_header("Cookie"), @@ -1172,7 +1174,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/ammo") + req = urllib.request.Request("http://www.acme.com/ammo") c.add_cookie_header(req) self.assert_(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*" @@ -1503,7 +1505,7 @@ class LWPCookieTests(TestCase): # Some additional Netscape cookies tests. c = CookieJar() headers = [] - req = urllib2.Request("http://foo.bar.acme.com/foo") + req = urllib.request.Request("http://foo.bar.acme.com/foo") # Netscape allows a host part that contains dots headers.append("Set-Cookie: Customer=WILE_E_COYOTE; domain=.acme.com") @@ -1517,7 +1519,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/foo") c.extract_cookies(res, req) - req = urllib2.Request("http://foo.bar.acme.com/foo") + req = urllib.request.Request("http://foo.bar.acme.com/foo") c.add_cookie_header(req) self.assert_( "PART_NUMBER=3,4" in req.get_header("Cookie") and @@ -1559,12 +1561,12 @@ class LWPCookieTests(TestCase): c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) headers = [] - req = urllib2.Request("http://www.ants.com/") + req = urllib.request.Request("http://www.ants.com/") headers.append("Set-Cookie: JSESSIONID=ABCDERANDOM123; Path=") res = FakeResponse(headers, "http://www.ants.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.ants.com/") + req = urllib.request.Request("http://www.ants.com/") c.add_cookie_header(req) self.assertEquals(req.get_header("Cookie"), @@ -1572,7 +1574,7 @@ class LWPCookieTests(TestCase): self.assertEquals(req.get_header("Cookie2"), '$Version="1"') # missing path in the request URI - req = urllib2.Request("http://www.ants.com:8080") + req = urllib.request.Request("http://www.ants.com:8080") c.add_cookie_header(req) self.assertEquals(req.get_header("Cookie"), @@ -1585,7 +1587,7 @@ class LWPCookieTests(TestCase): # Check session cookies are deleted properly by # CookieJar.clear_session_cookies method - req = urllib2.Request('http://www.perlmeister.com/scripts') + req = urllib.request.Request('http://www.perlmeister.com/scripts') headers = [] headers.append("Set-Cookie: s1=session;Path=/scripts") headers.append("Set-Cookie: p1=perm; Domain=.perlmeister.com;" diff --git a/Lib/test/test_httpservers.py b/Lib/test/test_httpservers.py index 94c6a3d..0305a90 100644 --- a/Lib/test/test_httpservers.py +++ b/Lib/test/test_httpservers.py @@ -11,7 +11,7 @@ import os import sys import base64 import shutil -import urllib +import urllib.parse import http.client import tempfile import threading @@ -322,7 +322,8 @@ class CGIHTTPServerTestCase(BaseTestCase): (res.read(), res.getheader('Content-type'), res.status)) def test_post(self): - params = urllib.urlencode({'spam' : 1, 'eggs' : 'python', 'bacon' : 123456}) + params = 
urllib.parse.urlencode( + {'spam' : 1, 'eggs' : 'python', 'bacon' : 123456}) headers = {'Content-type' : 'application/x-www-form-urlencoded'} res = self.request('/cgi-bin/file2.py', 'POST', params, headers) diff --git a/Lib/test/test_importhooks.py b/Lib/test/test_importhooks.py index eaf213d..acf45fb 100644 --- a/Lib/test/test_importhooks.py +++ b/Lib/test/test_importhooks.py @@ -247,22 +247,22 @@ class ImportHooksTestCase(ImportHooksBaseTestCase): i = ImpWrapper() sys.meta_path.append(i) sys.path_hooks.append(ImpWrapper) - mnames = ("colorsys", "urlparse", "distutils.core") + mnames = ("colorsys", "urllib.parse", "distutils.core") for mname in mnames: parent = mname.split(".")[0] - for n in list(sys.modules.keys()): + for n in list(sys.modules): if n.startswith(parent): del sys.modules[n] for mname in mnames: m = __import__(mname, globals(), locals(), ["__dummy__"]) m.__loader__ # to make sure we actually handled the import - # Delete urllib from modules because urlparse was imported above. - # Without this hack, test_socket_ssl fails if run in this order: - # regrtest.py test_codecmaps_tw test_importhooks test_socket_ssl - try: - del sys.modules['urllib'] - except KeyError: - pass +## # Delete urllib from modules because urlparse was imported above. +## # Without this hack, test_socket_ssl fails if run in this order: +## # regrtest.py test_codecmaps_tw test_importhooks test_socket_ssl +## try: +## del sys.modules['urllib'] +## except KeyError: +## pass def test_main(): support.run_unittest(ImportHooksTestCase) diff --git a/Lib/test/test_pyclbr.py b/Lib/test/test_pyclbr.py index 8287877..9438c7b 100644 --- a/Lib/test/test_pyclbr.py +++ b/Lib/test/test_pyclbr.py @@ -156,16 +156,6 @@ class PyclbrTest(TestCase): # These were once about the 10 longest modules cm('random', ignore=('Random',)) # from _random import Random as CoreGenerator cm('cgi', ignore=('log',)) # set with = in module - cm('urllib', ignore=('_CFNumberToInt32', - '_CStringFromCFString', - '_CFSetup', - 'getproxies_registry', - 'proxy_bypass_registry', - 'proxy_bypass_macosx_sysconf', - 'open_https', - '_https_connection', - 'getproxies_macosx_sysconf', - 'getproxies_internetconfig',)) # not on all platforms cm('pickle') cm('aifc', ignore=('openfp',)) # set with = in module cm('sre_parse', ignore=('dump',)) # from sre_constants import * diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 4e530f0..fbb02bc 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -1,5 +1,6 @@ -import unittest, robotparser import io +import unittest +import urllib.robotparser from test import support class RobotTestCase(unittest.TestCase): @@ -34,7 +35,7 @@ def RobotTest(index, robots_txt, good_urls, bad_urls, agent="test_robotparser"): lines = io.StringIO(robots_txt).readlines() - parser = robotparser.RobotFileParser() + parser = urllib.robotparser.RobotFileParser() parser.parse(lines) for url in good_urls: tests.addTest(RobotTestCase(index, parser, url, 1, agent)) @@ -140,7 +141,7 @@ class TestCase(unittest.TestCase): support.requires('network') # whole site is password-protected. 
url = 'http://mueblesmoraleda.com' - parser = robotparser.RobotFileParser() + parser = urllib.robotparser.RobotFileParser() parser.set_url(url) parser.read() self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False) diff --git a/Lib/test/test_ssl.py b/Lib/test/test_ssl.py index 619161e..9341bf9 100644 --- a/Lib/test/test_ssl.py +++ b/Lib/test/test_ssl.py @@ -10,7 +10,7 @@ import subprocess import time import os import pprint -import urllib, urlparse +import urllib.parse, urllib.request import shutil import traceback import asyncore @@ -440,8 +440,8 @@ else: """ # abandon query parameters - path = urlparse.urlparse(path)[2] - path = os.path.normpath(urllib.unquote(path)) + path = urllib.parse.urlparse(path)[2] + path = os.path.normpath(urllib.parse.unquote(path)) words = path.split('/') words = filter(None, words) path = self.root @@ -943,7 +943,7 @@ else: # now fetch the same data from the HTTPS server url = 'https://%s:%d/%s' % ( HOST, server.port, os.path.split(CERTFILE)[1]) - f = urllib.urlopen(url) + f = urllib.request.urlopen(url) dlen = f.info().get("content-length") if dlen and (int(dlen) > 0): d2 = f.read(int(dlen)) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 2b41cad..f5a9d5d 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1,6 +1,7 @@ """Regresssion tests for urllib""" -import urllib +import urllib.parse +import urllib.request import http.client import email.message import io @@ -16,6 +17,23 @@ def hexescape(char): hex_repr = "0%s" % hex_repr return "%" + hex_repr +# Shortcut for testing FancyURLopener +_urlopener = None +def urlopen(url, data=None, proxies=None): + """urlopen(url [, data]) -> open file-like object""" + global _urlopener + if proxies is not None: + opener = urllib.request.FancyURLopener(proxies=proxies) + elif not _urlopener: + opener = urllib.request.FancyURLopener() + _urlopener = opener + else: + opener = _urlopener + if data is None: + return opener.open(url) + else: + return opener.open(url, data) + class urlopen_FileTests(unittest.TestCase): """Test urlopen() opening a temporary file. @@ -25,15 +43,16 @@ class urlopen_FileTests(unittest.TestCase): """ def setUp(self): - """Setup of a temp file to use for testing""" - self.text = bytes("test_urllib: %s\n" % self.__class__.__name__, "ascii") - FILE = open(support.TESTFN, 'wb') + # Create a temp file to use for testing + self.text = bytes("test_urllib: %s\n" % self.__class__.__name__, + "ascii") + f = open(support.TESTFN, 'wb') try: - FILE.write(self.text) + f.write(self.text) finally: - FILE.close() + f.close() self.pathname = support.TESTFN - self.returned_obj = urllib.urlopen("file:%s" % self.pathname) + self.returned_obj = urlopen("file:%s" % self.pathname) def tearDown(self): """Shut down the open object""" @@ -119,7 +138,7 @@ class urlopen_HttpTests(unittest.TestCase): def test_read(self): self.fakehttp(b"Hello!") try: - fp = urllib.urlopen("http://python.org/") + fp = urlopen("http://python.org/") self.assertEqual(fp.readline(), b"Hello!") self.assertEqual(fp.readline(), b"") self.assertEqual(fp.geturl(), 'http://python.org/') @@ -136,7 +155,7 @@ Connection: close Content-Type: text/html; charset=iso-8859-1 ''') try: - self.assertRaises(IOError, urllib.urlopen, "http://python.org/") + self.assertRaises(IOError, urlopen, "http://python.org/") finally: self.unfakehttp() @@ -145,7 +164,7 @@ Content-Type: text/html; charset=iso-8859-1 # data. 
(#1680230) self.fakehttp(b'') try: - self.assertRaises(IOError, urllib.urlopen, "http://something") + self.assertRaises(IOError, urlopen, "http://something") finally: self.unfakehttp() @@ -180,7 +199,8 @@ class urlretrieve_FileTests(unittest.TestCase): except: pass def constructLocalFileUrl(self, filePath): - return "file://%s" % urllib.pathname2url(os.path.abspath(filePath)) + return "file://%s" % urllib.request.pathname2url( + os.path.abspath(filePath)) def createNewTempFile(self, data=b""): """Creates a new temporary file containing the specified data, @@ -204,7 +224,7 @@ class urlretrieve_FileTests(unittest.TestCase): def test_basic(self): # Make sure that a local file just gets its own location returned and # a headers value is returned. - result = urllib.urlretrieve("file:%s" % support.TESTFN) + result = urllib.request.urlretrieve("file:%s" % support.TESTFN) self.assertEqual(result[0], support.TESTFN) self.assert_(isinstance(result[1], email.message.Message), "did not get a email.message.Message instance as second " @@ -214,7 +234,7 @@ class urlretrieve_FileTests(unittest.TestCase): # Test that setting the filename argument works. second_temp = "%s.2" % support.TESTFN self.registerFileForCleanUp(second_temp) - result = urllib.urlretrieve(self.constructLocalFileUrl( + result = urllib.request.urlretrieve(self.constructLocalFileUrl( support.TESTFN), second_temp) self.assertEqual(second_temp, result[0]) self.assert_(os.path.exists(second_temp), "copy of the file was not " @@ -238,7 +258,8 @@ class urlretrieve_FileTests(unittest.TestCase): count_holder[0] = count_holder[0] + 1 second_temp = "%s.2" % support.TESTFN self.registerFileForCleanUp(second_temp) - urllib.urlretrieve(self.constructLocalFileUrl(support.TESTFN), + urllib.request.urlretrieve( + self.constructLocalFileUrl(support.TESTFN), second_temp, hooktester) def test_reporthook_0_bytes(self): @@ -247,7 +268,7 @@ class urlretrieve_FileTests(unittest.TestCase): def hooktester(count, block_size, total_size, _report=report): _report.append((count, block_size, total_size)) srcFileName = self.createNewTempFile() - urllib.urlretrieve(self.constructLocalFileUrl(srcFileName), + urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 1) self.assertEqual(report[0][2], 0) @@ -261,7 +282,7 @@ class urlretrieve_FileTests(unittest.TestCase): def hooktester(count, block_size, total_size, _report=report): _report.append((count, block_size, total_size)) srcFileName = self.createNewTempFile(b"x" * 5) - urllib.urlretrieve(self.constructLocalFileUrl(srcFileName), + urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 2) self.assertEqual(report[0][1], 8192) @@ -275,7 +296,7 @@ class urlretrieve_FileTests(unittest.TestCase): def hooktester(count, block_size, total_size, _report=report): _report.append((count, block_size, total_size)) srcFileName = self.createNewTempFile(b"x" * 8193) - urllib.urlretrieve(self.constructLocalFileUrl(srcFileName), + urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 3) self.assertEqual(report[0][1], 8192) @@ -284,10 +305,10 @@ class urlretrieve_FileTests(unittest.TestCase): class QuotingTests(unittest.TestCase): """Tests for urllib.quote() and urllib.quote_plus() - According to RFC 2396 ("Uniform Resource Identifiers), to escape a - character you write it as '%' + <2 character US-ASCII hex value>. 
The Python - code of ``'%' + hex(ord(<character>))[2:]`` escapes a character properly. - Case does not matter on the hex letters. + According to RFC 2396 (Uniform Resource Identifiers), to escape a + character you write it as '%' + <2 character US-ASCII hex value>. + The Python code of ``'%' + hex(ord(<character>))[2:]`` escapes a + character properly. Case does not matter on the hex letters. The various character sets specified are: @@ -313,24 +334,24 @@ class QuotingTests(unittest.TestCase): "abcdefghijklmnopqrstuvwxyz", "0123456789", "_.-"]) - result = urllib.quote(do_not_quote) + result = urllib.parse.quote(do_not_quote) self.assertEqual(do_not_quote, result, "using quote(): %s != %s" % (do_not_quote, result)) - result = urllib.quote_plus(do_not_quote) + result = urllib.parse.quote_plus(do_not_quote) self.assertEqual(do_not_quote, result, "using quote_plus(): %s != %s" % (do_not_quote, result)) def test_default_safe(self): # Test '/' is default value for 'safe' parameter - self.assertEqual(urllib.quote.__defaults__[0], '/') + self.assertEqual(urllib.parse.quote.__defaults__[0], '/') def test_safe(self): # Test setting 'safe' parameter does what it should do quote_by_default = "<>" - result = urllib.quote(quote_by_default, safe=quote_by_default) + result = urllib.parse.quote(quote_by_default, safe=quote_by_default) self.assertEqual(quote_by_default, result, "using quote(): %s != %s" % (quote_by_default, result)) - result = urllib.quote_plus(quote_by_default, safe=quote_by_default) + result = urllib.parse.quote_plus(quote_by_default, safe=quote_by_default) self.assertEqual(quote_by_default, result, "using quote_plus(): %s != %s" % (quote_by_default, result)) @@ -343,11 +364,11 @@ class QuotingTests(unittest.TestCase): should_quote.append(chr(127)) # For 0x7F should_quote = ''.join(should_quote) for char in should_quote: - result = urllib.quote(char) + result = urllib.parse.quote(char) self.assertEqual(hexescape(char), result, "using quote(): %s should be escaped to %s, not %s" % (char, hexescape(char), result)) - result = urllib.quote_plus(char) + result = urllib.parse.quote_plus(char) self.assertEqual(hexescape(char), result, "using quote_plus(): " "%s should be escapes to %s, not %s" % @@ -355,7 +376,7 @@ class QuotingTests(unittest.TestCase): del should_quote partial_quote = "ab[]cd" expected = "ab%5B%5Dcd" - result = urllib.quote(partial_quote) + result = urllib.parse.quote(partial_quote) self.assertEqual(expected, result, "using quote(): %s != %s" % (expected, result)) self.assertEqual(expected, result, @@ -364,26 +385,26 @@ class QuotingTests(unittest.TestCase): def test_quoting_space(self): # Make sure quote() and quote_plus() handle spaces as specified in # their unique way - result = urllib.quote(' ') + result = urllib.parse.quote(' ') self.assertEqual(result, hexescape(' '), "using quote(): %s != %s" % (result, hexescape(' '))) - result = urllib.quote_plus(' ') + result = urllib.parse.quote_plus(' ') self.assertEqual(result, '+', "using quote_plus(): %s != +" % result) given = "a b cd e f" expect = given.replace(' ', hexescape(' ')) - result = urllib.quote(given) + result = urllib.parse.quote(given) self.assertEqual(expect, result, "using quote(): %s != %s" % (expect, result)) expect = given.replace(' ', '+') - result = urllib.quote_plus(given) + result = urllib.parse.quote_plus(given) self.assertEqual(expect, result, "using quote_plus(): %s != %s" % (expect, result)) def test_quoting_plus(self): - self.assertEqual(urllib.quote_plus('alpha+beta gamma'), + 
self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma'), 'alpha%2Bbeta+gamma') - self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'), + self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', '+'), 'alpha+beta+gamma') class UnquotingTests(unittest.TestCase): @@ -399,21 +420,21 @@ class UnquotingTests(unittest.TestCase): for num in range(128): given = hexescape(chr(num)) expect = chr(num) - result = urllib.unquote(given) + result = urllib.parse.unquote(given) self.assertEqual(expect, result, "using unquote(): %s != %s" % (expect, result)) - result = urllib.unquote_plus(given) + result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, "using unquote_plus(): %s != %s" % (expect, result)) escape_list.append(given) escape_string = ''.join(escape_list) del escape_list - result = urllib.unquote(escape_string) + result = urllib.parse.unquote(escape_string) self.assertEqual(result.count('%'), 1, "using quote(): not all characters escaped; %s" % result) - result = urllib.unquote(escape_string) + result = urllib.parse.unquote(escape_string) self.assertEqual(result.count('%'), 1, "using unquote(): not all characters escaped: " "%s" % result) @@ -423,10 +444,10 @@ class UnquotingTests(unittest.TestCase): # interspersed given = 'ab%sd' % hexescape('c') expect = "abcd" - result = urllib.unquote(given) + result = urllib.parse.unquote(given) self.assertEqual(expect, result, "using quote(): %s != %s" % (expect, result)) - result = urllib.unquote_plus(given) + result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, "using unquote_plus(): %s != %s" % (expect, result)) @@ -434,16 +455,16 @@ class UnquotingTests(unittest.TestCase): # Test difference between unquote() and unquote_plus() given = "are+there+spaces..." expect = given - result = urllib.unquote(given) + result = urllib.parse.unquote(given) self.assertEqual(expect, result, "using unquote(): %s != %s" % (expect, result)) expect = given.replace('+', ' ') - result = urllib.unquote_plus(given) + result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, "using unquote_plus(): %s != %s" % (expect, result)) def test_unquote_with_unicode(self): - r = urllib.unquote('br%C3%BCckner_sapporo_20050930.doc') + r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc') self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc') class urlencode_Tests(unittest.TestCase): @@ -462,7 +483,7 @@ class urlencode_Tests(unittest.TestCase): """ expect_somewhere = ["1st=1", "2nd=2", "3rd=3"] - result = urllib.urlencode(given) + result = urllib.parse.urlencode(given) for expected in expect_somewhere: self.assert_(expected in result, "testing %s: %s not found in %s" % @@ -495,20 +516,20 @@ class urlencode_Tests(unittest.TestCase): # Make sure keys and values are quoted using quote_plus() given = {"&":"="} expect = "%s=%s" % (hexescape('&'), hexescape('=')) - result = urllib.urlencode(given) + result = urllib.parse.urlencode(given) self.assertEqual(expect, result) given = {"key name":"A bunch of pluses"} expect = "key+name=A+bunch+of+pluses" - result = urllib.urlencode(given) + result = urllib.parse.urlencode(given) self.assertEqual(expect, result) def test_doseq(self): # Test that passing True for 'doseq' parameter works correctly given = {'sequence':['1', '2', '3']} - expect = "sequence=%s" % urllib.quote_plus(str(['1', '2', '3'])) - result = urllib.urlencode(given) + expect = "sequence=%s" % urllib.parse.quote_plus(str(['1', '2', '3'])) + result = urllib.parse.urlencode(given) self.assertEqual(expect, 
result) - result = urllib.urlencode(given, True) + result = urllib.parse.urlencode(given, True) for value in given["sequence"]: expect = "sequence=%s" % value self.assert_(expect in result, @@ -523,11 +544,11 @@ class Pathname_Tests(unittest.TestCase): # Make sure simple tests pass expected_path = os.path.join("parts", "of", "a", "path") expected_url = "parts/of/a/path" - result = urllib.pathname2url(expected_path) + result = urllib.request.pathname2url(expected_path) self.assertEqual(expected_url, result, "pathname2url() failed; %s != %s" % (result, expected_url)) - result = urllib.url2pathname(expected_url) + result = urllib.request.url2pathname(expected_url) self.assertEqual(expected_path, result, "url2pathame() failed; %s != %s" % (result, expected_path)) @@ -536,25 +557,25 @@ class Pathname_Tests(unittest.TestCase): # Test automatic quoting and unquoting works for pathnam2url() and # url2pathname() respectively given = os.path.join("needs", "quot=ing", "here") - expect = "needs/%s/here" % urllib.quote("quot=ing") - result = urllib.pathname2url(given) + expect = "needs/%s/here" % urllib.parse.quote("quot=ing") + result = urllib.request.pathname2url(given) self.assertEqual(expect, result, "pathname2url() failed; %s != %s" % (expect, result)) expect = given - result = urllib.url2pathname(result) + result = urllib.request.url2pathname(result) self.assertEqual(expect, result, "url2pathname() failed; %s != %s" % (expect, result)) given = os.path.join("make sure", "using_quote") - expect = "%s/using_quote" % urllib.quote("make sure") - result = urllib.pathname2url(given) + expect = "%s/using_quote" % urllib.parse.quote("make sure") + result = urllib.request.pathname2url(given) self.assertEqual(expect, result, "pathname2url() failed; %s != %s" % (expect, result)) given = "make+sure/using_unquote" expect = os.path.join("make+sure", "using_unquote") - result = urllib.url2pathname(given) + result = urllib.request.url2pathname(given) self.assertEqual(expect, result, "url2pathname() failed; %s != %s" % (expect, result)) diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index d6b3d57..30aa8f2 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -5,8 +5,8 @@ import os import io import socket -import urllib2 -from urllib2 import Request, OpenerDirector +import urllib.request +from urllib.request import Request, OpenerDirector # XXX # Request @@ -17,10 +17,10 @@ class TrivialTests(unittest.TestCase): def test_trivial(self): # A couple trivial tests - self.assertRaises(ValueError, urllib2.urlopen, 'bogus url') + self.assertRaises(ValueError, urllib.request.urlopen, 'bogus url') # XXX Name hacking to get this to work on Windows. - fname = os.path.abspath(urllib2.__file__).replace('\\', '/') + fname = os.path.abspath(urllib.request.__file__).replace('\\', '/') if fname[1:2] == ":": fname = fname[2:] # And more hacking to get it to work on MacOS. 
This assumes @@ -29,18 +29,21 @@ class TrivialTests(unittest.TestCase): fname = '/' + fname.replace(':', '/') file_url = "file://%s" % fname - f = urllib2.urlopen(file_url) + f = urllib.request.urlopen(file_url) buf = f.read() f.close() def test_parse_http_list(self): - tests = [('a,b,c', ['a', 'b', 'c']), - ('path"o,l"og"i"cal, example', ['path"o,l"og"i"cal', 'example']), - ('a, b, "c", "d", "e,f", g, h', ['a', 'b', '"c"', '"d"', '"e,f"', 'g', 'h']), - ('a="b\\"c", d="e\\,f", g="h\\\\i"', ['a="b"c"', 'd="e,f"', 'g="h\\i"'])] + tests = [ + ('a,b,c', ['a', 'b', 'c']), + ('path"o,l"og"i"cal, example', ['path"o,l"og"i"cal', 'example']), + ('a, b, "c", "d", "e,f", g, h', + ['a', 'b', '"c"', '"d"', '"e,f"', 'g', 'h']), + ('a="b\\"c", d="e\\,f", g="h\\\\i"', + ['a="b"c"', 'd="e,f"', 'g="h\\i"'])] for string, list in tests: - self.assertEquals(urllib2.parse_http_list(string), list) + self.assertEquals(urllib.request.parse_http_list(string), list) def test_request_headers_dict(): @@ -107,7 +110,7 @@ def test_request_headers_methods(): def test_password_manager(self): """ - >>> mgr = urllib2.HTTPPasswordMgr() + >>> mgr = urllib.request.HTTPPasswordMgr() >>> add = mgr.add_password >>> add("Some Realm", "http://example.com/", "joe", "password") >>> add("Some Realm", "http://example.com/ni", "ni", "ni") @@ -172,7 +175,7 @@ def test_password_manager(self): def test_password_manager_default_port(self): """ - >>> mgr = urllib2.HTTPPasswordMgr() + >>> mgr = urllib.request.HTTPPasswordMgr() >>> add = mgr.add_password The point to note here is that we can't guess the default port if there's @@ -288,7 +291,7 @@ class MockHandler: res = MockResponse(200, "OK", {}, "") return self.parent.error("http", args[0], res, code, "", {}) elif action == "raise": - raise urllib2.URLError("blah") + raise urllib.error.URLError("blah") assert False def close(self): pass def add_parent(self, parent): @@ -337,7 +340,7 @@ def build_test_opener(*handler_instances): opener.add_handler(h) return opener -class MockHTTPHandler(urllib2.BaseHandler): +class MockHTTPHandler(urllib.request.BaseHandler): # useful for testing redirections and auth # sends supplied headers and code as first response # sends 200 OK as second response @@ -392,7 +395,7 @@ class OpenerDirectorTests(unittest.TestCase): # TypeError in real code; here, returning self from these mock # methods would either cause no exception, or AttributeError. - from urllib2 import URLError + from urllib.error import URLError o = OpenerDirector() meth_spec = [ @@ -400,7 +403,7 @@ class OpenerDirectorTests(unittest.TestCase): [("redirect_request", "return self")], ] handlers = add_ordered_mock_handlers(o, meth_spec) - o.add_handler(urllib2.UnknownHandler()) + o.add_handler(urllib.request.UnknownHandler()) for scheme in "do", "proxy", "redirect": self.assertRaises(URLError, o.open, scheme+"://example.com/") @@ -458,7 +461,7 @@ class OpenerDirectorTests(unittest.TestCase): handlers = add_ordered_mock_handlers(o, meth_spec) req = Request("http://example.com/") - self.assertRaises(urllib2.URLError, o.open, req) + self.assertRaises(urllib.error.URLError, o.open, req) self.assertEqual(o.calls, [(handlers[0], "http_open", (req,), {})]) ## def test_error(self): @@ -529,8 +532,7 @@ class OpenerDirectorTests(unittest.TestCase): def sanepathname2url(path): - import urllib - urlpath = urllib.pathname2url(path) + urlpath = urllib.request.pathname2url(path) if os.name == "nt" and urlpath.startswith("///"): urlpath = urlpath[2:] # XXX don't ask me about the mac... 
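The HTTPPasswordMgr doctests above change only the module path; the behaviour is unchanged. For reference, a minimal standalone sketch of the renamed classes (realm, URL, and credentials are made-up values):

import urllib.request

# HTTPPasswordMgr moved from urllib2 to urllib.request.
mgr = urllib.request.HTTPPasswordMgr()
mgr.add_password("Some Realm", "http://example.com/", "joe", "password")

# Lookup matches on the realm plus any URI under a registered prefix.
user, password = mgr.find_user_password("Some Realm", "http://example.com/spam")
# -> ('joe', 'password')

# The same manager plugs into an opener via the renamed handler class.
opener = urllib.request.build_opener(
    urllib.request.HTTPBasicAuthHandler(mgr))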
@@ -545,7 +547,7 @@ class HandlerTests(unittest.TestCase): self.filename, self.filetype = filename, filetype return io.StringIO(self.data), len(self.data) - class NullFTPHandler(urllib2.FTPHandler): + class NullFTPHandler(urllib.request.FTPHandler): def __init__(self, data): self.data = data def connect_ftp(self, user, passwd, host, port, dirs, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): @@ -587,7 +589,7 @@ class HandlerTests(unittest.TestCase): def test_file(self): import email.utils, socket - h = urllib2.FileHandler() + h = urllib.request.FileHandler() o = h.parent = MockOpener() TESTFN = support.TESTFN @@ -644,12 +646,12 @@ class HandlerTests(unittest.TestCase): finally: f.close() - self.assertRaises(urllib2.URLError, + self.assertRaises(urllib.error.URLError, h.file_open, Request(url)) finally: os.remove(TESTFN) - h = urllib2.FileHandler() + h = urllib.request.FileHandler() o = h.parent = MockOpener() # XXXX why does // mean ftp (and /// mean not ftp!), and where # is file: scheme specified? I think this is really a bug, and @@ -668,7 +670,7 @@ class HandlerTests(unittest.TestCase): try: h.file_open(req) # XXXX remove OSError when bug fixed - except (urllib2.URLError, OSError): + except (urllib.error.URLError, OSError): self.assert_(not ftp) else: self.assert_(o.req is req) @@ -685,6 +687,7 @@ class HandlerTests(unittest.TestCase): return '' class MockHTTPClass: def __init__(self): + self.level = 0 self.req_headers = [] self.data = None self.raise_on_endheaders = False @@ -707,7 +710,7 @@ class HandlerTests(unittest.TestCase): def getresponse(self): return MockHTTPResponse(MockFile(), {}, 200, "OK") - h = urllib2.AbstractHTTPHandler() + h = urllib.request.AbstractHTTPHandler() o = h.parent = MockOpener() url = "http://example.com/" @@ -737,7 +740,7 @@ class HandlerTests(unittest.TestCase): # check socket.error converted to URLError http.raise_on_endheaders = True - self.assertRaises(urllib2.URLError, h.do_open, http, req) + self.assertRaises(urllib.error.URLError, h.do_open, http, req) # check adding of standard headers o.addheaders = [("Spam", "eggs")] @@ -768,7 +771,7 @@ class HandlerTests(unittest.TestCase): self.assertEqual(req.unredirected_hdrs["Spam"], "foo") def test_errors(self): - h = urllib2.HTTPErrorProcessor() + h = urllib.request.HTTPErrorProcessor() o = h.parent = MockOpener() url = "http://example.com/" @@ -794,7 +797,7 @@ class HandlerTests(unittest.TestCase): def test_cookies(self): cj = MockCookieJar() - h = urllib2.HTTPCookieProcessor(cj) + h = urllib.request.HTTPCookieProcessor(cj) o = h.parent = MockOpener() req = Request("http://example.com/") @@ -810,7 +813,7 @@ class HandlerTests(unittest.TestCase): def test_redirect(self): from_url = "http://example.com/a.html" to_url = "http://example.com/b.html" - h = urllib2.HTTPRedirectHandler() + h = urllib.request.HTTPRedirectHandler() o = h.parent = MockOpener() # ordinary redirect behaviour @@ -825,7 +828,7 @@ class HandlerTests(unittest.TestCase): try: method(req, MockFile(), code, "Blah", MockHeaders({"location": to_url})) - except urllib2.HTTPError: + except urllib.error.HTTPError: # 307 in response to POST requires user OK self.assert_(code == 307 and data is not None) self.assertEqual(o.req.get_full_url(), to_url) @@ -860,9 +863,9 @@ class HandlerTests(unittest.TestCase): while 1: redirect(h, req, "http://example.com/") count = count + 1 - except urllib2.HTTPError: + except urllib.error.HTTPError: # don't stop until max_repeats, because cookies may introduce state - self.assertEqual(count, 
urllib2.HTTPRedirectHandler.max_repeats) + self.assertEqual(count, urllib.request.HTTPRedirectHandler.max_repeats) # detect endless non-repeating chain of redirects req = Request(from_url, origin_req_host="example.com") @@ -871,9 +874,9 @@ class HandlerTests(unittest.TestCase): while 1: redirect(h, req, "http://example.com/%d" % count) count = count + 1 - except urllib2.HTTPError: + except urllib.error.HTTPError: self.assertEqual(count, - urllib2.HTTPRedirectHandler.max_redirections) + urllib.request.HTTPRedirectHandler.max_redirections) def test_cookie_redirect(self): # cookies shouldn't leak into redirected requests @@ -883,16 +886,16 @@ class HandlerTests(unittest.TestCase): cj = CookieJar() interact_netscape(cj, "http://www.example.com/", "spam=eggs") hh = MockHTTPHandler(302, "Location: http://www.cracker.com/\r\n\r\n") - hdeh = urllib2.HTTPDefaultErrorHandler() - hrh = urllib2.HTTPRedirectHandler() - cp = urllib2.HTTPCookieProcessor(cj) + hdeh = urllib.request.HTTPDefaultErrorHandler() + hrh = urllib.request.HTTPRedirectHandler() + cp = urllib.request.HTTPCookieProcessor(cj) o = build_test_opener(hh, hdeh, hrh, cp) o.open("http://www.example.com/") self.assert_(not hh.req.has_header("Cookie")) def test_proxy(self): o = OpenerDirector() - ph = urllib2.ProxyHandler(dict(http="proxy.example.com:3128")) + ph = urllib.request.ProxyHandler(dict(http="proxy.example.com:3128")) o.add_handler(ph) meth_spec = [ [("http_open", "return response")] @@ -910,7 +913,7 @@ class HandlerTests(unittest.TestCase): def test_basic_auth(self, quote_char='"'): opener = OpenerDirector() password_manager = MockPasswordManager() - auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) + auth_handler = urllib.request.HTTPBasicAuthHandler(password_manager) realm = "ACME Widget Store" http_handler = MockHTTPHandler( 401, 'WWW-Authenticate: Basic realm=%s%s%s\r\n\r\n' % @@ -928,10 +931,10 @@ class HandlerTests(unittest.TestCase): def test_proxy_basic_auth(self): opener = OpenerDirector() - ph = urllib2.ProxyHandler(dict(http="proxy.example.com:3128")) + ph = urllib.request.ProxyHandler(dict(http="proxy.example.com:3128")) opener.add_handler(ph) password_manager = MockPasswordManager() - auth_handler = urllib2.ProxyBasicAuthHandler(password_manager) + auth_handler = urllib.request.ProxyBasicAuthHandler(password_manager) realm = "ACME Networks" http_handler = MockHTTPHandler( 407, 'Proxy-Authenticate: Basic realm="%s"\r\n\r\n' % realm) @@ -958,15 +961,15 @@ class HandlerTests(unittest.TestCase): self.recorded = [] def record(self, info): self.recorded.append(info) - class TestDigestAuthHandler(urllib2.HTTPDigestAuthHandler): + class TestDigestAuthHandler(urllib.request.HTTPDigestAuthHandler): def http_error_401(self, *args, **kwds): self.parent.record("digest") - urllib2.HTTPDigestAuthHandler.http_error_401(self, + urllib.request.HTTPDigestAuthHandler.http_error_401(self, *args, **kwds) - class TestBasicAuthHandler(urllib2.HTTPBasicAuthHandler): + class TestBasicAuthHandler(urllib.request.HTTPBasicAuthHandler): def http_error_401(self, *args, **kwds): self.parent.record("basic") - urllib2.HTTPBasicAuthHandler.http_error_401(self, + urllib.request.HTTPBasicAuthHandler.http_error_401(self, *args, **kwds) opener = RecordingOpenerDirector() @@ -1030,13 +1033,13 @@ class HandlerTests(unittest.TestCase): class MiscTests(unittest.TestCase): def test_build_opener(self): - class MyHTTPHandler(urllib2.HTTPHandler): pass - class FooHandler(urllib2.BaseHandler): + class MyHTTPHandler(urllib.request.HTTPHandler): pass + 
class FooHandler(urllib.request.BaseHandler): def foo_open(self): pass - class BarHandler(urllib2.BaseHandler): + class BarHandler(urllib.request.BaseHandler): def bar_open(self): pass - build_opener = urllib2.build_opener + build_opener = urllib.request.build_opener o = build_opener(FooHandler, BarHandler) self.opener_has_handler(o, FooHandler) @@ -1054,14 +1057,14 @@ class MiscTests(unittest.TestCase): # a particular case of overriding: default handlers can be passed # in explicitly o = build_opener() - self.opener_has_handler(o, urllib2.HTTPHandler) - o = build_opener(urllib2.HTTPHandler) - self.opener_has_handler(o, urllib2.HTTPHandler) - o = build_opener(urllib2.HTTPHandler()) - self.opener_has_handler(o, urllib2.HTTPHandler) + self.opener_has_handler(o, urllib.request.HTTPHandler) + o = build_opener(urllib.request.HTTPHandler) + self.opener_has_handler(o, urllib.request.HTTPHandler) + o = build_opener(urllib.request.HTTPHandler()) + self.opener_has_handler(o, urllib.request.HTTPHandler) # Issue2670: multiple handlers sharing the same base class - class MyOtherHTTPHandler(urllib2.HTTPHandler): pass + class MyOtherHTTPHandler(urllib.request.HTTPHandler): pass o = build_opener(MyHTTPHandler, MyOtherHTTPHandler) self.opener_has_handler(o, MyHTTPHandler) self.opener_has_handler(o, MyOtherHTTPHandler) @@ -1077,7 +1080,7 @@ class MiscTests(unittest.TestCase): def test_main(verbose=None): from test import test_urllib2 support.run_doctest(test_urllib2, verbose) - support.run_doctest(urllib2, verbose) + support.run_doctest(urllib.request, verbose) tests = (TrivialTests, OpenerDirectorTests, HandlerTests, diff --git a/Lib/test/test_urllib2_localnet.py b/Lib/test/test_urllib2_localnet.py index d3016c3..2c572f3 100644 --- a/Lib/test/test_urllib2_localnet.py +++ b/Lib/test/test_urllib2_localnet.py @@ -2,8 +2,8 @@ import email import threading -import urlparse -import urllib2 +import urllib.parse +import urllib.request import http.server import unittest import hashlib @@ -45,7 +45,7 @@ class LoopbackHttpServerThread(threading.Thread): self._stop_server = False self.ready = threading.Event() request_handler.protocol_version = "HTTP/1.0" - self.httpd = LoopbackHttpServer(('127.0.0.1', 0), + self.httpd = LoopbackHttpServer(("127.0.0.1", 0), request_handler) #print "Serving HTTP on %s port %s" % (self.httpd.server_name, # self.httpd.server_port) @@ -154,11 +154,11 @@ class DigestAuthHandler: if len(self._users) == 0: return True - if 'Proxy-Authorization' not in request_handler.headers: + if "Proxy-Authorization" not in request_handler.headers: return self._return_auth_challenge(request_handler) else: auth_dict = self._create_auth_dict( - request_handler.headers['Proxy-Authorization'] + request_handler.headers["Proxy-Authorization"] ) if auth_dict["username"] in self._users: password = self._users[ auth_dict["username"] ] @@ -199,12 +199,12 @@ class FakeProxyHandler(http.server.BaseHTTPRequestHandler): def log_message(self, format, *args): # Uncomment the next line for debugging. 
- #sys.stderr.write(format % args) + # sys.stderr.write(format % args) pass def do_GET(self): - (scm, netloc, path, params, query, fragment) = urlparse.urlparse( - self.path, 'http') + (scm, netloc, path, params, query, fragment) = urllib.parse.urlparse( + self.path, "http") self.short_path = path if self.digest_auth_handler.handle_request(self): self.send_response(200, "OK") @@ -234,9 +234,10 @@ class ProxyAuthTests(unittest.TestCase): self.server.start() self.server.ready.wait() proxy_url = "http://127.0.0.1:%d" % self.server.port - handler = urllib2.ProxyHandler({"http" : proxy_url}) - self._digest_auth_handler = urllib2.ProxyDigestAuthHandler() - self.opener = urllib2.build_opener(handler, self._digest_auth_handler) + handler = urllib.request.ProxyHandler({"http" : proxy_url}) + self._digest_auth_handler = urllib.request.ProxyDigestAuthHandler() + self.opener = urllib.request.build_opener( + handler, self._digest_auth_handler) def tearDown(self): self.server.stop() @@ -245,13 +246,13 @@ class ProxyAuthTests(unittest.TestCase): self._digest_auth_handler.add_password(self.REALM, self.URL, self.USER, self.PASSWD+"bad") FakeProxyHandler.digest_auth_handler.set_qop("auth") - self.assertRaises(urllib2.HTTPError, + self.assertRaises(urllib.error.HTTPError, self.opener.open, self.URL) def test_proxy_with_no_password_raises_httperror(self): FakeProxyHandler.digest_auth_handler.set_qop("auth") - self.assertRaises(urllib2.HTTPError, + self.assertRaises(urllib.error.HTTPError, self.opener.open, self.URL) @@ -270,7 +271,7 @@ class ProxyAuthTests(unittest.TestCase): FakeProxyHandler.digest_auth_handler.set_qop("auth-int") try: result = self.opener.open(self.URL) - except urllib2.URLError: + except urllib.error.URLError: # It's okay if we don't support auth-int, but we certainly # shouldn't receive any kind of exception here other than # a URLError. @@ -296,7 +297,7 @@ def GetRequestHandler(responses): self.wfile.write(body) def do_POST(self): - content_length = self.headers['Content-Length'] + content_length = self.headers["Content-Length"] post_data = self.rfile.read(int(content_length)) self.do_GET() self.requests.append(post_data) @@ -311,7 +312,7 @@ def GetRequestHandler(responses): for (header, value) in headers: self.send_header(header, value % self.port) if body: - self.send_header('Content-type', 'text/plain') + self.send_header("Content-type", "text/plain") self.end_headers() return body self.end_headers() @@ -332,7 +333,22 @@ class TestUrlopen(unittest.TestCase): for transparent redirection have been written. """ - def start_server(self, responses): + def setUp(self): + self.server = None + + def tearDown(self): + if self.server is not None: + self.server.stop() + + def urlopen(self, url, data=None): + f = urllib.request.urlopen(url, data) + result = f.read() + f.close() + return result + + def start_server(self, responses=None): + if responses is None: + responses = [(200, [], b"we don't care")] handler = GetRequestHandler(responses) self.server = LoopbackHttpServerThread(handler) @@ -342,106 +358,71 @@ class TestUrlopen(unittest.TestCase): handler.port = port return handler - def test_redirection(self): - expected_response = b'We got here...' + expected_response = b"We got here..." 
responses = [ - (302, [('Location', 'http://localhost:%s/somewhere_else')], ''), + (302, [("Location", "http://localhost:%s/somewhere_else")], ""), (200, [], expected_response) ] handler = self.start_server(responses) - - try: - f = urllib2.urlopen('http://localhost:%s/' % handler.port) - data = f.read() - f.close() - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/', '/somewhere_else']) - finally: - self.server.stop() - + data = self.urlopen("http://localhost:%s/" % handler.port) + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/", "/somewhere_else"]) def test_404(self): - expected_response = b'Bad bad bad...' + expected_response = b"Bad bad bad..." handler = self.start_server([(404, [], expected_response)]) try: - try: - urllib2.urlopen('http://localhost:%s/weeble' % handler.port) - except urllib2.URLError as f: - data = f.read() - f.close() - else: - self.fail('404 should raise URLError') - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/weeble']) - finally: - self.server.stop() + self.urlopen("http://localhost:%s/weeble" % handler.port) + except urllib.error.URLError as f: + data = f.read() + f.close() + else: + self.fail("404 should raise URLError") + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/weeble"]) def test_200(self): - expected_response = b'pycon 2008...' + expected_response = b"pycon 2008..." handler = self.start_server([(200, [], expected_response)]) - - try: - f = urllib2.urlopen('http://localhost:%s/bizarre' % handler.port) - data = f.read() - f.close() - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/bizarre']) - finally: - self.server.stop() + data = self.urlopen("http://localhost:%s/bizarre" % handler.port) + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/bizarre"]) def test_200_with_parameters(self): - expected_response = b'pycon 2008...' + expected_response = b"pycon 2008..." 
handler = self.start_server([(200, [], expected_response)]) - - try: - f = urllib2.urlopen('http://localhost:%s/bizarre' % handler.port, b'get=with_feeling') - data = f.read() - f.close() - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/bizarre', b'get=with_feeling']) - finally: - self.server.stop() - + data = self.urlopen("http://localhost:%s/bizarre" % handler.port, + b"get=with_feeling") + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/bizarre", b"get=with_feeling"]) def test_sending_headers(self): - handler = self.start_server([(200, [], b"we don't care")]) - - try: - req = urllib2.Request("http://localhost:%s/" % handler.port, - headers={'Range': 'bytes=20-39'}) - urllib2.urlopen(req) - self.assertEqual(handler.headers_received['Range'], 'bytes=20-39') - finally: - self.server.stop() + handler = self.start_server() + req = urllib.request.Request("http://localhost:%s/" % handler.port, + headers={"Range": "bytes=20-39"}) + urllib.request.urlopen(req) + self.assertEqual(handler.headers_received["Range"], "bytes=20-39") def test_basic(self): - handler = self.start_server([(200, [], b"we don't care")]) - + handler = self.start_server() + open_url = urllib.request.urlopen("http://localhost:%s" % handler.port) + for attr in ("read", "close", "info", "geturl"): + self.assert_(hasattr(open_url, attr), "object returned from " + "urlopen lacks the %s attribute" % attr) try: - open_url = urllib2.urlopen("http://localhost:%s" % handler.port) - for attr in ("read", "close", "info", "geturl"): - self.assert_(hasattr(open_url, attr), "object returned from " - "urlopen lacks the %s attribute" % attr) - try: - self.assert_(open_url.read(), "calling 'read' failed") - finally: - open_url.close() + self.assert_(open_url.read(), "calling 'read' failed") finally: - self.server.stop() + open_url.close() def test_info(self): - handler = self.start_server([(200, [], b"we don't care")]) - + handler = self.start_server() try: - open_url = urllib2.urlopen("http://localhost:%s" % handler.port) + open_url = urllib.request.urlopen( + "http://localhost:%s" % handler.port) info_obj = open_url.info() self.assert_(isinstance(info_obj, email.message.Message), "object returned by 'info' is not an instance of " @@ -452,15 +433,10 @@ class TestUrlopen(unittest.TestCase): def test_geturl(self): # Make sure same URL as opened is returned by geturl. - handler = self.start_server([(200, [], b"we don't care")]) - - try: - open_url = urllib2.urlopen("http://localhost:%s" % handler.port) - url = open_url.geturl() - self.assertEqual(url, "http://localhost:%s" % handler.port) - finally: - self.server.stop() - + handler = self.start_server() + open_url = urllib.request.urlopen("http://localhost:%s" % handler.port) + url = open_url.geturl() + self.assertEqual(url, "http://localhost:%s" % handler.port) def test_bad_address(self): # Make sure proper exception is raised when connecting to a bogus @@ -472,17 +448,10 @@ class TestUrlopen(unittest.TestCase): # started failing then. One hopes the .invalid # domain will be spared to serve its defined # purpose. - # urllib2.urlopen, "http://www.sadflkjsasadf.com/") - urllib2.urlopen, "http://www.python.invalid./") - + urllib.request.urlopen, + "http://www.python.invalid./") def test_main(): - # We will NOT depend on the network resource flag - # (Lib/test/regrtest.py -u network) since all tests here are only - # localhost. However, if this is a bad rationale, then uncomment - # the next line. 
- #support.requires("network") - support.run_unittest(ProxyAuthTests) support.run_unittest(TestUrlopen) diff --git a/Lib/test/test_urllib2net.py b/Lib/test/test_urllib2net.py index 938ab9f..a18a4bb 100644 --- a/Lib/test/test_urllib2net.py +++ b/Lib/test/test_urllib2net.py @@ -4,10 +4,11 @@ import unittest from test import support from test.test_urllib2 import sanepathname2url +import os import socket -import urllib2 import sys -import os +import urllib.error +import urllib.request def _retry_thrice(func, exc, *args, **kwargs): @@ -28,7 +29,8 @@ def _wrap_with_retry_thrice(func, exc): # Connecting to remote hosts is flaky. Make it more robust by retrying # the connection several times. -_urlopen_with_retry = _wrap_with_retry_thrice(urllib2.urlopen, urllib2.URLError) +_urlopen_with_retry = _wrap_with_retry_thrice(urllib.request.urlopen, + urllib.error.URLError) class AuthTests(unittest.TestCase): @@ -78,16 +80,11 @@ class CloseSocketTest(unittest.TestCase): # calling .close() on urllib2's response objects should close the # underlying socket - # delve deep into response to fetch socket._socketobject response = _urlopen_with_retry("http://www.python.org/") - abused_fileobject = response.fp - httpresponse = abused_fileobject.raw - self.assert_(httpresponse.__class__ is http.client.HTTPResponse) - fileobject = httpresponse.fp - - self.assert_(not fileobject.closed) + sock = response.fp + self.assert_(not sock.closed) response.close() - self.assert_(fileobject.closed) + self.assert_(sock.closed) class OtherNetworkTests(unittest.TestCase): def setUp(self): @@ -116,8 +113,9 @@ class OtherNetworkTests(unittest.TestCase): f.write('hi there\n') f.close() urls = [ - 'file:'+sanepathname2url(os.path.abspath(TESTFN)), - ('file:///nonsensename/etc/passwd', None, urllib2.URLError), + 'file:' + sanepathname2url(os.path.abspath(TESTFN)), + ('file:///nonsensename/etc/passwd', None, + urllib.error.URLError), ] self._test_urls(urls, self._extra_handlers(), retry=True) finally: @@ -157,9 +155,9 @@ class OtherNetworkTests(unittest.TestCase): import logging debug = logging.getLogger("test_urllib2").debug - urlopen = urllib2.build_opener(*handlers).open + urlopen = urllib.request.build_opener(*handlers).open if retry: - urlopen = _wrap_with_retry_thrice(urlopen, urllib2.URLError) + urlopen = _wrap_with_retry_thrice(urlopen, urllib.error.URLError) for url in urls: if isinstance(url, tuple): @@ -186,7 +184,7 @@ class OtherNetworkTests(unittest.TestCase): def _extra_handlers(self): handlers = [] - cfh = urllib2.CacheFTPHandler() + cfh = urllib.request.CacheFTPHandler() cfh.setTimeout(1) handlers.append(cfh) @@ -197,7 +195,7 @@ class TimeoutTest(unittest.TestCase): def test_http_basic(self): self.assertTrue(socket.getdefaulttimeout() is None) u = _urlopen_with_retry("http://www.python.org") - self.assertTrue(u.fp.raw.fp._sock.gettimeout() is None) + self.assertTrue(u.fp._sock.gettimeout() is None) def test_http_default_timeout(self): self.assertTrue(socket.getdefaulttimeout() is None) @@ -206,7 +204,7 @@ class TimeoutTest(unittest.TestCase): u = _urlopen_with_retry("http://www.python.org") finally: socket.setdefaulttimeout(None) - self.assertEqual(u.fp.raw.fp._sock.gettimeout(), 60) + self.assertEqual(u.fp._sock.gettimeout(), 60) def test_http_no_timeout(self): self.assertTrue(socket.getdefaulttimeout() is None) @@ -215,11 +213,11 @@ class TimeoutTest(unittest.TestCase): u = _urlopen_with_retry("http://www.python.org", timeout=None) finally: socket.setdefaulttimeout(None) - 
self.assertTrue(u.fp.raw.fp._sock.gettimeout() is None) + self.assertTrue(u.fp._sock.gettimeout() is None) def test_http_timeout(self): u = _urlopen_with_retry("http://www.python.org", timeout=120) - self.assertEqual(u.fp.raw.fp._sock.gettimeout(), 120) + self.assertEqual(u.fp._sock.gettimeout(), 120) FTP_HOST = "ftp://ftp.mirror.nl/pub/mirror/gnu/" diff --git a/Lib/test/test_urllibnet.py b/Lib/test/test_urllibnet.py index d4831d8..c8166c4 100644 --- a/Lib/test/test_urllibnet.py +++ b/Lib/test/test_urllibnet.py @@ -4,7 +4,7 @@ import unittest from test import support import socket -import urllib +import urllib.request import sys import os import email.message @@ -36,11 +36,11 @@ class URLTimeoutTest(unittest.TestCase): socket.setdefaulttimeout(None) def testURLread(self): - f = _open_with_retry(urllib.urlopen, "http://www.python.org/") + f = _open_with_retry(urllib.request.urlopen, "http://www.python.org/") x = f.read() class urlopenNetworkTests(unittest.TestCase): - """Tests urllib.urlopen using the network. + """Tests urllib.request.urlopen using the network. These tests are not exhaustive. Assuming that testing using files does a good job overall of some of the basic interface features. There are no @@ -55,7 +55,7 @@ class urlopenNetworkTests(unittest.TestCase): """ def urlopen(self, *args): - return _open_with_retry(urllib.urlopen, *args) + return _open_with_retry(urllib.request.urlopen, *args) def test_basic(self): # Simple test expected to pass. @@ -105,7 +105,7 @@ class urlopenNetworkTests(unittest.TestCase): def test_getcode(self): # test getcode() with the fancy opener to get 404 error codes URL = "http://www.python.org/XXXinvalidXXX" - open_url = urllib.FancyURLopener().open(URL) + open_url = urllib.request.FancyURLopener().open(URL) try: code = open_url.getcode() finally: @@ -114,7 +114,7 @@ class urlopenNetworkTests(unittest.TestCase): def test_fileno(self): if (sys.platform in ('win32',) or - not hasattr(os, 'fdopen')): + not hasattr(os, 'fdopen')): # On Windows, socket handles are not file descriptors; this # test can't pass on Windows. return @@ -142,13 +142,14 @@ class urlopenNetworkTests(unittest.TestCase): # domain will be spared to serve its defined # purpose. # urllib.urlopen, "http://www.sadflkjsasadf.com/") - urllib.urlopen, "http://www.python.invalid./") + urllib.request.urlopen, + "http://www.python.invalid./") class urlretrieveNetworkTests(unittest.TestCase): - """Tests urllib.urlretrieve using the network.""" + """Tests urllib.request.urlretrieve using the network.""" def urlretrieve(self, *args): - return _open_with_retry(urllib.urlretrieve, *args) + return _open_with_retry(urllib.request.urlretrieve, *args) def test_basic(self): # Test basic functionality.
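The pattern running through all of these test updates is mechanical: the Python 2.x modules urllib, urllib2, and urlparse are replaced by the consolidated urllib package, split into urllib.request, urllib.parse, and urllib.error. A minimal sketch of how a typical 2.x call site maps onto the new layout (the URLs below are illustrative placeholders, not taken from the patch):

import urllib.error
import urllib.parse
import urllib.request

# urllib.urlopen(...) and urllib2.urlopen(...) both become urllib.request.urlopen(...);
# the returned object still exposes read(), info(), geturl(), and close().
f = urllib.request.urlopen("http://www.python.org/")
try:
    body = f.read()
finally:
    f.close()

# urlparse.urlparse(...) becomes urllib.parse.urlparse(...).
parts = urllib.parse.urlparse("http://example.com/doc/?q=1#frag")
assert parts.scheme == "http" and parts.query == "q=1"

# urllib2.URLError and urllib2.HTTPError move to urllib.error.
try:
    urllib.request.urlopen("http://www.python.invalid./")
except urllib.error.URLError as exc:
    print("open failed:", exc)

The parsing half of this mapping is exactly what the test_urlparse.py changes below exercise.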
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 02b2f6f..c92b5aa 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -2,7 +2,7 @@ from test import support import unittest -import urlparse +import urllib.parse RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -10,19 +10,19 @@ RFC2396_BASE = "http://a/b/c/d;p?q" class UrlParseTestCase(unittest.TestCase): def checkRoundtrips(self, url, parsed, split): - result = urlparse.urlparse(url) + result = urllib.parse.urlparse(url) self.assertEqual(result, parsed) t = (result.scheme, result.netloc, result.path, result.params, result.query, result.fragment) self.assertEqual(t, parsed) # put it back together and it should be the same - result2 = urlparse.urlunparse(result) + result2 = urllib.parse.urlunparse(result) self.assertEqual(result2, url) self.assertEqual(result2, result.geturl()) # the result of geturl() is a fixpoint; we can always parse it # again to get the same result: - result3 = urlparse.urlparse(result.geturl()) + result3 = urllib.parse.urlparse(result.geturl()) self.assertEqual(result3.geturl(), result.geturl()) self.assertEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) @@ -37,17 +37,17 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(result3.port, result.port) # check the roundtrip using urlsplit() as well - result = urlparse.urlsplit(url) + result = urllib.parse.urlsplit(url) self.assertEqual(result, split) t = (result.scheme, result.netloc, result.path, result.query, result.fragment) self.assertEqual(t, split) - result2 = urlparse.urlunsplit(result) + result2 = urllib.parse.urlunsplit(result) self.assertEqual(result2, url) self.assertEqual(result2, result.geturl()) # check the fixpoint property of re-parsing the result of geturl() - result3 = urlparse.urlsplit(result.geturl()) + result3 = urllib.parse.urlsplit(result.geturl()) self.assertEqual(result3.geturl(), result.geturl()) self.assertEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) @@ -83,7 +83,7 @@ class UrlParseTestCase(unittest.TestCase): self.checkRoundtrips(url, parsed, split) def test_http_roundtrips(self): - # urlparse.urlsplit treats 'http:' as an optimized special case, + # urllib.parse.urlsplit treats 'http:' as an optimized special case, # so we test both 'http:' and 'https:' in all the following. # Three cheers for white box knowledge! 
testcases = [ @@ -111,13 +111,13 @@ class UrlParseTestCase(unittest.TestCase): self.checkRoundtrips(url, parsed, split) def checkJoin(self, base, relurl, expected): - self.assertEqual(urlparse.urljoin(base, relurl), expected, + self.assertEqual(urllib.parse.urljoin(base, relurl), expected, (base, relurl, expected)) def test_unparse_parse(self): for u in ['Python', './Python']: - self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u) - self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u) + self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u) + self.assertEqual(urllib.parse.urlunparse(urllib.parse.urlparse(u)), u) def test_RFC1808(self): # "normal" cases from RFC 1808: @@ -223,11 +223,11 @@ class UrlParseTestCase(unittest.TestCase): (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'), (RFC2396_BASE, 'http://a/b/c/d;p?q', ''), ]: - self.assertEqual(urlparse.urldefrag(url), (defrag, frag)) + self.assertEqual(urllib.parse.urldefrag(url), (defrag, frag)) def test_urlsplit_attributes(self): url = "HTTP://WWW.PYTHON.ORG/doc/#frag" - p = urlparse.urlsplit(url) + p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, "http") self.assertEqual(p.netloc, "WWW.PYTHON.ORG") self.assertEqual(p.path, "/doc/") @@ -242,7 +242,7 @@ class UrlParseTestCase(unittest.TestCase): #self.assertEqual(p.geturl(), url) url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" - p = urlparse.urlsplit(url) + p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, "http") self.assertEqual(p.netloc, "User:Pass@www.python.org:080") self.assertEqual(p.path, "/doc/") @@ -259,7 +259,7 @@ class UrlParseTestCase(unittest.TestCase): # and request email addresses as usernames. url = "http://User@example.com:Pass@www.python.org:080/doc/?query=yes#frag" - p = urlparse.urlsplit(url) + p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, "http") self.assertEqual(p.netloc, "User@example.com:Pass@www.python.org:080") self.assertEqual(p.path, "/doc/") @@ -274,11 +274,11 @@ class UrlParseTestCase(unittest.TestCase): def test_attributes_bad_port(self): """Check handling of non-integer ports.""" - p = urlparse.urlsplit("http://www.example.net:foo") + p = urllib.parse.urlsplit("http://www.example.net:foo") self.assertEqual(p.netloc, "www.example.net:foo") self.assertRaises(ValueError, lambda: p.port) - p = urlparse.urlparse("http://www.example.net:foo") + p = urllib.parse.urlparse("http://www.example.net:foo") self.assertEqual(p.netloc, "www.example.net:foo") self.assertRaises(ValueError, lambda: p.port) @@ -289,7 +289,7 @@ class UrlParseTestCase(unittest.TestCase): # scheme://netloc syntax, the netloc and related attributes # should be left empty. 
uri = "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" - p = urlparse.urlsplit(uri) + p = urllib.parse.urlsplit(uri) self.assertEqual(p.netloc, "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) @@ -297,7 +297,7 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) - p = urlparse.urlparse(uri) + p = urllib.parse.urlparse(uri) self.assertEqual(p.netloc, "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) @@ -307,7 +307,7 @@ class UrlParseTestCase(unittest.TestCase): def test_noslash(self): # Issue 1637: http://foo.com?query is legal - self.assertEqual(urlparse.urlparse("http://example.com?blahblah=/foo"), + self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"), ('http', 'example.com', '', '', 'blahblah=/foo', '')) def test_main(): diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py index 9d38470..e285809 100644 --- a/Lib/test/test_xmlrpc.py +++ b/Lib/test/test_xmlrpc.py @@ -111,8 +111,10 @@ class XMLRPCTestCase(unittest.TestCase): (int(2**34),)) xmlrpclib.dumps((xmlrpclib.MAXINT, xmlrpclib.MININT)) - self.assertRaises(OverflowError, xmlrpclib.dumps, (xmlrpclib.MAXINT+1,)) - self.assertRaises(OverflowError, xmlrpclib.dumps, (xmlrpclib.MININT-1,)) + self.assertRaises(OverflowError, xmlrpclib.dumps, + (xmlrpclib.MAXINT+1,)) + self.assertRaises(OverflowError, xmlrpclib.dumps, + (xmlrpclib.MININT-1,)) def dummy_write(s): pass @@ -120,9 +122,10 @@ class XMLRPCTestCase(unittest.TestCase): m = xmlrpclib.Marshaller() m.dump_int(xmlrpclib.MAXINT, dummy_write) m.dump_int(xmlrpclib.MININT, dummy_write) - self.assertRaises(OverflowError, m.dump_int, xmlrpclib.MAXINT+1, dummy_write) - self.assertRaises(OverflowError, m.dump_int, xmlrpclib.MININT-1, dummy_write) - + self.assertRaises(OverflowError, m.dump_int, + xmlrpclib.MAXINT+1, dummy_write) + self.assertRaises(OverflowError, m.dump_int, + xmlrpclib.MININT-1, dummy_write) def test_dump_none(self): value = alist + [None] @@ -132,7 +135,6 @@ class XMLRPCTestCase(unittest.TestCase): xmlrpclib.loads(strg)[0][0]) self.assertRaises(TypeError, xmlrpclib.dumps, (arg1,)) - class HelperTestCase(unittest.TestCase): def test_escape(self): self.assertEqual(xmlrpclib.escape("a&b"), "a&b") @@ -160,7 +162,6 @@ class FaultTestCase(unittest.TestCase): # private methods self.assertRaises(AttributeError, xmlrpc.server.resolve_dotted_attribute, str, '__add') - self.assert_(xmlrpc.server.resolve_dotted_attribute(str, 'title')) class DateTimeTestCase(unittest.TestCase): @@ -170,7 +171,8 @@ class DateTimeTestCase(unittest.TestCase): def test_time(self): d = 1181399930.036952 t = xmlrpclib.DateTime(d) - self.assertEqual(str(t), time.strftime("%Y%m%dT%H:%M:%S", time.localtime(d))) + self.assertEqual(str(t), + time.strftime("%Y%m%dT%H:%M:%S", time.localtime(d))) def test_time_tuple(self): d = (2007,6,9,10,38,50,5,160,0) @@ -180,7 +182,7 @@ class DateTimeTestCase(unittest.TestCase): def test_time_struct(self): d = time.localtime(1181399930.036952) t = xmlrpclib.DateTime(d) - self.assertEqual(str(t), time.strftime("%Y%m%dT%H:%M:%S", d)) + self.assertEqual(str(t), time.strftime("%Y%m%dT%H:%M:%S", d)) def test_datetime_datetime(self): d = datetime.datetime(2007,1,2,3,4,5) @@ -350,12 +352,12 @@ class SimpleServerTestCase(unittest.TestCase): self.assertEqual(response.reason, 'Not Found') def test_introspection1(self): + expected_methods = set(['pow', 'div', 'my_function', 'add', + 'system.listMethods', 'system.methodHelp', + 'system.methodSignature', 
'system.multicall']) try: p = xmlrpclib.ServerProxy('http://localhost:%d' % PORT) meth = p.system.listMethods() - expected_methods = set(['pow', 'div', 'my_function', 'add', - 'system.listMethods', 'system.methodHelp', - 'system.methodSignature', 'system.multicall']) self.assertEqual(set(meth), expected_methods) except (xmlrpclib.ProtocolError, socket.error) as e: # ignore failures due to non-blocking socket 'unavailable' errors @@ -593,7 +595,8 @@ class CGIHandlerTestCase(unittest.TestCase): # will respond exception, if so, our goal is achieved ;) handle = open(support.TESTFN, "r").read() - # start with 44th char so as not to get http header, we just need only xml + # start with 44th char so as not to get http header, we just + # need only xml self.assertRaises(xmlrpclib.Fault, xmlrpclib.loads, handle[44:]) os.remove("xmldata.txt") diff --git a/Lib/urllib.py b/Lib/urllib.py deleted file mode 100644 index d60ac5c..0000000 --- a/Lib/urllib.py +++ /dev/null @@ -1,1714 +0,0 @@ -"""Open an arbitrary URL. - -See the following document for more info on URLs: -"Names and Addresses, URIs, URLs, URNs, URCs", at -http://www.w3.org/pub/WWW/Addressing/Overview.html - -See also the HTTP spec (from which the error codes are derived): -"HTTP - Hypertext Transfer Protocol", at -http://www.w3.org/pub/WWW/Protocols/ - -Related standards and specs: -- RFC1808: the "relative URL" spec. (authoritative status) -- RFC1738 - the "URL standard". (authoritative status) -- RFC1630 - the "URI spec". (informational status) - -The object returned by URLopener().open(file) will differ per -protocol. All you know is that is has methods read(), readline(), -readlines(), fileno(), close() and info(). The read*(), fileno() -and close() methods work like those of open files. -The info() method returns a email.message.Message object which can be -used to query various info about the object, if available. -(email.message.Message objects provide a dict-like interface.) -""" - -import http.client -import email.message -import email -import os -import socket -import sys -import time -from urlparse import urljoin as basejoin - -__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve", - "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus", - "urlencode", "url2pathname", "pathname2url", "splittag", - "localhost", "thishost", "ftperrors", "basejoin", "unwrap", - "splittype", "splithost", "splituser", "splitpasswd", "splitport", - "splitnport", "splitquery", "splitattr", "splitvalue", - "getproxies"] - -__version__ = '1.17' # XXX This version is not always updated :-( - -MAXFTPCACHE = 10 # Trim the ftp cache beyond this size - -# Helper for non-unix systems -if os.name == 'mac': - from macurl2path import url2pathname, pathname2url -elif os.name == 'nt': - from nturl2path import url2pathname, pathname2url -else: - def url2pathname(pathname): - """OS-specific conversion from a relative URL of the 'file' scheme - to a file system path; not recommended for general use.""" - return unquote(pathname) - - def pathname2url(pathname): - """OS-specific conversion from a file system path to a relative URL - of the 'file' scheme; not recommended for general use.""" - return quote(pathname) - -# This really consists of two pieces: -# (1) a class which handles opening of all sorts of URLs -# (plus assorted utilities etc.) -# (2) a set of functions for parsing URLs -# XXX Should these be separated out into different modules? 
- - -# Shortcut for basic usage -_urlopener = None -def urlopen(url, data=None, proxies=None): - """urlopen(url [, data]) -> open file-like object""" - global _urlopener - if proxies is not None: - opener = FancyURLopener(proxies=proxies) - elif not _urlopener: - opener = FancyURLopener() - _urlopener = opener - else: - opener = _urlopener - if data is None: - return opener.open(url) - else: - return opener.open(url, data) - -def urlretrieve(url, filename=None, reporthook=None, data=None): - global _urlopener - if not _urlopener: - _urlopener = FancyURLopener() - return _urlopener.retrieve(url, filename, reporthook, data) - -def urlcleanup(): - if _urlopener: - _urlopener.cleanup() - -# check for SSL -try: - import ssl -except: - _have_ssl = False -else: - _have_ssl = True - -# exception raised when downloaded size does not match content-length -class ContentTooShortError(IOError): - def __init__(self, message, content): - IOError.__init__(self, message) - self.content = content - -ftpcache = {} -class URLopener: - """Class to open URLs. - This is a class rather than just a subroutine because we may need - more than one set of global protocol-specific options. - Note -- this is a base class for those who don't want the - automatic handling of errors type 302 (relocated) and 401 - (authorization needed).""" - - __tempfiles = None - - version = "Python-urllib/%s" % __version__ - - # Constructor - def __init__(self, proxies=None, **x509): - if proxies is None: - proxies = getproxies() - assert hasattr(proxies, 'keys'), "proxies must be a mapping" - self.proxies = proxies - self.key_file = x509.get('key_file') - self.cert_file = x509.get('cert_file') - self.addheaders = [('User-Agent', self.version)] - self.__tempfiles = [] - self.__unlink = os.unlink # See cleanup() - self.tempcache = None - # Undocumented feature: if you assign {} to tempcache, - # it is used to cache files retrieved with - # self.retrieve(). This is not enabled by default - # since it does not work for changing documents (and I - # haven't got the logic to check expiration headers - # yet). - self.ftpcache = ftpcache - # Undocumented feature: you can use a different - # ftp cache by assigning to the .ftpcache member; - # in case you want logically independent URL openers - # XXX This is not threadsafe. Bah. - - def __del__(self): - self.close() - - def close(self): - self.cleanup() - - def cleanup(self): - # This code sometimes runs when the rest of this module - # has already been deleted, so it can't use any globals - # or import anything. - if self.__tempfiles: - for file in self.__tempfiles: - try: - self.__unlink(file) - except OSError: - pass - del self.__tempfiles[:] - if self.tempcache: - self.tempcache.clear() - - def addheader(self, *args): - """Add a header to be used by the HTTP interface only - e.g. 
u.addheader('Accept', 'sound/basic')""" - self.addheaders.append(args) - - # External interface - def open(self, fullurl, data=None): - """Use URLopener().open(file) instead of open(file, 'r').""" - fullurl = unwrap(toBytes(fullurl)) - if self.tempcache and fullurl in self.tempcache: - filename, headers = self.tempcache[fullurl] - fp = open(filename, 'rb') - return addinfourl(fp, headers, fullurl) - urltype, url = splittype(fullurl) - if not urltype: - urltype = 'file' - if urltype in self.proxies: - proxy = self.proxies[urltype] - urltype, proxyhost = splittype(proxy) - host, selector = splithost(proxyhost) - url = (host, fullurl) # Signal special case to open_*() - else: - proxy = None - name = 'open_' + urltype - self.type = urltype - name = name.replace('-', '_') - if not hasattr(self, name): - if proxy: - return self.open_unknown_proxy(proxy, fullurl, data) - else: - return self.open_unknown(fullurl, data) - try: - if data is None: - return getattr(self, name)(url) - else: - return getattr(self, name)(url, data) - except socket.error as msg: - raise IOError('socket error', msg).with_traceback(sys.exc_info()[2]) - - def open_unknown(self, fullurl, data=None): - """Overridable interface to open unknown URL type.""" - type, url = splittype(fullurl) - raise IOError('url error', 'unknown url type', type) - - def open_unknown_proxy(self, proxy, fullurl, data=None): - """Overridable interface to open unknown URL type.""" - type, url = splittype(fullurl) - raise IOError('url error', 'invalid proxy for %s' % type, proxy) - - # External interface - def retrieve(self, url, filename=None, reporthook=None, data=None): - """retrieve(url) returns (filename, headers) for a local object - or (tempfilename, headers) for a remote object.""" - url = unwrap(toBytes(url)) - if self.tempcache and url in self.tempcache: - return self.tempcache[url] - type, url1 = splittype(url) - if filename is None and (not type or type == 'file'): - try: - fp = self.open_local_file(url1) - hdrs = fp.info() - del fp - return url2pathname(splithost(url1)[1]), hdrs - except IOError as msg: - pass - fp = self.open(url, data) - headers = fp.info() - if filename: - tfp = open(filename, 'wb') - else: - import tempfile - garbage, path = splittype(url) - garbage, path = splithost(path or "") - path, garbage = splitquery(path or "") - path, garbage = splitattr(path or "") - suffix = os.path.splitext(path)[1] - (fd, filename) = tempfile.mkstemp(suffix) - self.__tempfiles.append(filename) - tfp = os.fdopen(fd, 'wb') - result = filename, headers - if self.tempcache is not None: - self.tempcache[url] = result - bs = 1024*8 - size = -1 - read = 0 - blocknum = 0 - if reporthook: - if "content-length" in headers: - size = int(headers["Content-Length"]) - reporthook(blocknum, bs, size) - while 1: - block = fp.read(bs) - if not block: - break - read += len(block) - tfp.write(block) - blocknum += 1 - if reporthook: - reporthook(blocknum, bs, size) - fp.close() - tfp.close() - del fp - del tfp - - # raise exception if actual size does not match content-length header - if size >= 0 and read < size: - raise ContentTooShortError("retrieval incomplete: got only %i out " - "of %i bytes" % (read, size), result) - - return result - - # Each method named open_<type> knows how to open that type of URL - - def _open_generic_http(self, connection_factory, url, data): - """Make an HTTP connection using connection_class. - - This is an internal method that should be called from - open_http() or open_https(). 
- - Arguments: - - connection_factory should take a host name and return an - HTTPConnection instance. - - url is the url to retrieval or a host, relative-path pair. - - data is payload for a POST request or None. - """ - - user_passwd = None - proxy_passwd= None - if isinstance(url, str): - host, selector = splithost(url) - if host: - user_passwd, host = splituser(host) - host = unquote(host) - realhost = host - else: - host, selector = url - # check whether the proxy contains authorization information - proxy_passwd, host = splituser(host) - # now we proceed with the url we want to obtain - urltype, rest = splittype(selector) - url = rest - user_passwd = None - if urltype.lower() != 'http': - realhost = None - else: - realhost, rest = splithost(rest) - if realhost: - user_passwd, realhost = splituser(realhost) - if user_passwd: - selector = "%s://%s%s" % (urltype, realhost, rest) - if proxy_bypass(realhost): - host = realhost - - #print "proxy via http:", host, selector - if not host: raise IOError('http error', 'no host given') - - if proxy_passwd: - import base64 - proxy_auth = base64.b64encode(proxy_passwd).strip() - else: - proxy_auth = None - - if user_passwd: - import base64 - auth = base64.b64encode(user_passwd).strip() - else: - auth = None - http_conn = connection_factory(host) - # XXX We should fix urllib so that it works with HTTP/1.1. - http_conn._http_vsn = 10 - http_conn._http_vsn_str = "HTTP/1.0" - - headers = {} - if proxy_auth: - headers["Proxy-Authorization"] = "Basic %s" % proxy_auth - if auth: - headers["Authorization"] = "Basic %s" % auth - if realhost: - headers["Host"] = realhost - for header, value in self.addheaders: - headers[header] = value - - if data is not None: - headers["Content-Type"] = "application/x-www-form-urlencoded" - http_conn.request("POST", selector, data, headers) - else: - http_conn.request("GET", selector, headers=headers) - - try: - response = http_conn.getresponse() - except http.client.BadStatusLine: - # something went wrong with the HTTP status line - raise IOError('http protocol error', 0, - 'got a bad status line', None) - - # According to RFC 2616, "2xx" code indicates that the client's - # request was successfully received, understood, and accepted. - if (200 <= response.status < 300): - return addinfourl(response.fp, response.msg, "http:" + url, - response.status) - else: - return self.http_error( - url, response.fp, - response.status, response.reason, response.msg, data) - - def open_http(self, url, data=None): - """Use HTTP protocol.""" - return self._open_generic_http(http.client.HTTPConnection, url, data) - - def http_error(self, url, fp, errcode, errmsg, headers, data=None): - """Handle http errors. 
- - Derived class can override this, or provide specific handlers - named http_error_DDD where DDD is the 3-digit error code.""" - # First check if there's a specific handler for this error - name = 'http_error_%d' % errcode - if hasattr(self, name): - method = getattr(self, name) - if data is None: - result = method(url, fp, errcode, errmsg, headers) - else: - result = method(url, fp, errcode, errmsg, headers, data) - if result: return result - return self.http_error_default(url, fp, errcode, errmsg, headers) - - def http_error_default(self, url, fp, errcode, errmsg, headers): - """Default error handler: close the connection and raise IOError.""" - void = fp.read() - fp.close() - raise IOError('http error', errcode, errmsg, headers) - - if _have_ssl: - def _https_connection(self, host): - return http.client.HTTPSConnection(host, - key_file=self.key_file, - cert_file=self.cert_file) - - def open_https(self, url, data=None): - """Use HTTPS protocol.""" - return self._open_generic_http(self._https_connection, url, data) - - def open_file(self, url): - """Use local file or FTP depending on form of URL.""" - if not isinstance(url, str): - raise IOError('file error', 'proxy support for file protocol currently not implemented') - if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': - return self.open_ftp(url) - else: - return self.open_local_file(url) - - def open_local_file(self, url): - """Use local file.""" - import mimetypes, email.utils - host, file = splithost(url) - localname = url2pathname(file) - try: - stats = os.stat(localname) - except OSError as e: - raise IOError(e.errno, e.strerror, e.filename) - size = stats.st_size - modified = email.utils.formatdate(stats.st_mtime, usegmt=True) - mtype = mimetypes.guess_type(url)[0] - headers = email.message_from_string( - 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % - (mtype or 'text/plain', size, modified)) - if not host: - urlfile = file - if file[:1] == '/': - urlfile = 'file://' + file - return addinfourl(open(localname, 'rb'), - headers, urlfile) - host, port = splitport(host) - if not port \ - and socket.gethostbyname(host) in (localhost(), thishost()): - urlfile = file - if file[:1] == '/': - urlfile = 'file://' + file - return addinfourl(open(localname, 'rb'), - headers, urlfile) - raise IOError('local file error', 'not on local host') - - def open_ftp(self, url): - """Use FTP protocol.""" - if not isinstance(url, str): - raise IOError('ftp error', 'proxy support for ftp protocol currently not implemented') - import mimetypes - host, path = splithost(url) - if not host: raise IOError('ftp error', 'no host given') - host, port = splitport(host) - user, host = splituser(host) - if user: user, passwd = splitpasswd(user) - else: passwd = None - host = unquote(host) - user = unquote(user or '') - passwd = unquote(passwd or '') - host = socket.gethostbyname(host) - if not port: - import ftplib - port = ftplib.FTP_PORT - else: - port = int(port) - path, attrs = splitattr(path) - path = unquote(path) - dirs = path.split('/') - dirs, file = dirs[:-1], dirs[-1] - if dirs and not dirs[0]: dirs = dirs[1:] - if dirs and not dirs[0]: dirs[0] = '/' - key = user, host, port, '/'.join(dirs) - # XXX thread unsafe! 
- if len(self.ftpcache) > MAXFTPCACHE: - # Prune the cache, rather arbitrarily - for k in self.ftpcache.keys(): - if k != key: - v = self.ftpcache[k] - del self.ftpcache[k] - v.close() - try: - if not key in self.ftpcache: - self.ftpcache[key] = \ - ftpwrapper(user, passwd, host, port, dirs) - if not file: type = 'D' - else: type = 'I' - for attr in attrs: - attr, value = splitvalue(attr) - if attr.lower() == 'type' and \ - value in ('a', 'A', 'i', 'I', 'd', 'D'): - type = value.upper() - (fp, retrlen) = self.ftpcache[key].retrfile(file, type) - mtype = mimetypes.guess_type("ftp:" + url)[0] - headers = "" - if mtype: - headers += "Content-Type: %s\n" % mtype - if retrlen is not None and retrlen >= 0: - headers += "Content-Length: %d\n" % retrlen - headers = email.message_from_string(headers) - return addinfourl(fp, headers, "ftp:" + url) - except ftperrors() as msg: - raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2]) - - def open_data(self, url, data=None): - """Use "data" URL.""" - if not isinstance(url, str): - raise IOError('data error', 'proxy support for data protocol currently not implemented') - # ignore POSTed data - # - # syntax of data URLs: - # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data - # mediatype := [ type "/" subtype ] *( ";" parameter ) - # data := *urlchar - # parameter := attribute "=" value - from io import StringIO - try: - [type, data] = url.split(',', 1) - except ValueError: - raise IOError('data error', 'bad data URL') - if not type: - type = 'text/plain;charset=US-ASCII' - semi = type.rfind(';') - if semi >= 0 and '=' not in type[semi:]: - encoding = type[semi+1:] - type = type[:semi] - else: - encoding = '' - msg = [] - msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', - time.gmtime(time.time()))) - msg.append('Content-type: %s' % type) - if encoding == 'base64': - import base64 - data = base64.decodestring(data) - else: - data = unquote(data) - msg.append('Content-Length: %d' % len(data)) - msg.append('') - msg.append(data) - msg = '\n'.join(msg) - headers = email.message_from_string(msg) - f = StringIO(msg) - #f.fileno = None # needed for addinfourl - return addinfourl(f, headers, url) - - -class FancyURLopener(URLopener): - """Derived class with handlers for errors we can handle (perhaps).""" - - def __init__(self, *args, **kwargs): - URLopener.__init__(self, *args, **kwargs) - self.auth_cache = {} - self.tries = 0 - self.maxtries = 10 - - def http_error_default(self, url, fp, errcode, errmsg, headers): - """Default error handling -- don't raise an exception.""" - return addinfourl(fp, headers, "http:" + url, errcode) - - def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): - """Error 302 -- relocated (temporarily).""" - self.tries += 1 - if self.maxtries and self.tries >= self.maxtries: - if hasattr(self, "http_error_500"): - meth = self.http_error_500 - else: - meth = self.http_error_default - self.tries = 0 - return meth(url, fp, 500, - "Internal Server Error: Redirect Recursion", headers) - result = self.redirect_internal(url, fp, errcode, errmsg, headers, - data) - self.tries = 0 - return result - - def redirect_internal(self, url, fp, errcode, errmsg, headers, data): - if 'location' in headers: - newurl = headers['location'] - elif 'uri' in headers: - newurl = headers['uri'] - else: - return - void = fp.read() - fp.close() - # In case the server sent a relative URL, join with original: - newurl = basejoin(self.type + ":" + url, newurl) - return self.open(newurl) - - def http_error_301(self, url, fp, 
errcode, errmsg, headers, data=None): - """Error 301 -- also relocated (permanently).""" - return self.http_error_302(url, fp, errcode, errmsg, headers, data) - - def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): - """Error 303 -- also relocated (essentially identical to 302).""" - return self.http_error_302(url, fp, errcode, errmsg, headers, data) - - def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): - """Error 307 -- relocated, but turn POST into error.""" - if data is None: - return self.http_error_302(url, fp, errcode, errmsg, headers, data) - else: - return self.http_error_default(url, fp, errcode, errmsg, headers) - - def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): - """Error 401 -- authentication required. - This function supports Basic authentication only.""" - if not 'www-authenticate' in headers: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - stuff = headers['www-authenticate'] - import re - match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) - if not match: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - scheme, realm = match.groups() - if scheme.lower() != 'basic': - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - name = 'retry_' + self.type + '_basic_auth' - if data is None: - return getattr(self,name)(url, realm) - else: - return getattr(self,name)(url, realm, data) - - def http_error_407(self, url, fp, errcode, errmsg, headers, data=None): - """Error 407 -- proxy authentication required. - This function supports Basic authentication only.""" - if not 'proxy-authenticate' in headers: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - stuff = headers['proxy-authenticate'] - import re - match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) - if not match: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - scheme, realm = match.groups() - if scheme.lower() != 'basic': - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - name = 'retry_proxy_' + self.type + '_basic_auth' - if data is None: - return getattr(self,name)(url, realm) - else: - return getattr(self,name)(url, realm, data) - - def retry_proxy_http_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - newurl = 'http://' + host + selector - proxy = self.proxies['http'] - urltype, proxyhost = splittype(proxy) - proxyhost, proxyselector = splithost(proxyhost) - i = proxyhost.find('@') + 1 - proxyhost = proxyhost[i:] - user, passwd = self.get_user_passwd(proxyhost, realm, i) - if not (user or passwd): return None - proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost - self.proxies['http'] = 'http://' + proxyhost + proxyselector - if data is None: - return self.open(newurl) - else: - return self.open(newurl, data) - - def retry_proxy_https_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - newurl = 'https://' + host + selector - proxy = self.proxies['https'] - urltype, proxyhost = splittype(proxy) - proxyhost, proxyselector = splithost(proxyhost) - i = proxyhost.find('@') + 1 - proxyhost = proxyhost[i:] - user, passwd = self.get_user_passwd(proxyhost, realm, i) - if not (user or passwd): return None - proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost - self.proxies['https'] = 'https://' + proxyhost + proxyselector - if data is None: - return self.open(newurl) - else: - return 
self.open(newurl, data) - - def retry_http_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - i = host.find('@') + 1 - host = host[i:] - user, passwd = self.get_user_passwd(host, realm, i) - if not (user or passwd): return None - host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host - newurl = 'http://' + host + selector - if data is None: - return self.open(newurl) - else: - return self.open(newurl, data) - - def retry_https_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - i = host.find('@') + 1 - host = host[i:] - user, passwd = self.get_user_passwd(host, realm, i) - if not (user or passwd): return None - host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host - newurl = 'https://' + host + selector - if data is None: - return self.open(newurl) - else: - return self.open(newurl, data) - - def get_user_passwd(self, host, realm, clear_cache = 0): - key = realm + '@' + host.lower() - if key in self.auth_cache: - if clear_cache: - del self.auth_cache[key] - else: - return self.auth_cache[key] - user, passwd = self.prompt_user_passwd(host, realm) - if user or passwd: self.auth_cache[key] = (user, passwd) - return user, passwd - - def prompt_user_passwd(self, host, realm): - """Override this in a GUI environment!""" - import getpass - try: - user = input("Enter username for %s at %s: " % (realm, host)) - passwd = getpass.getpass("Enter password for %s in %s at %s: " % - (user, realm, host)) - return user, passwd - except KeyboardInterrupt: - print() - return None, None - - -# Utility functions - -_localhost = None -def localhost(): - """Return the IP address of the magic hostname 'localhost'.""" - global _localhost - if _localhost is None: - _localhost = socket.gethostbyname('localhost') - return _localhost - -_thishost = None -def thishost(): - """Return the IP address of the current host.""" - global _thishost - if _thishost is None: - _thishost = socket.gethostbyname(socket.gethostname()) - return _thishost - -_ftperrors = None -def ftperrors(): - """Return the set of errors raised by the FTP class.""" - global _ftperrors - if _ftperrors is None: - import ftplib - _ftperrors = ftplib.all_errors - return _ftperrors - -_noheaders = None -def noheaders(): - """Return an empty email.message.Message object.""" - global _noheaders - if _noheaders is None: - _noheaders = email.message.Message() - return _noheaders - - -# Utility classes - -class ftpwrapper: - """Class used by open_ftp() for cache of open FTP connections.""" - - def __init__(self, user, passwd, host, port, dirs, - timeout=socket._GLOBAL_DEFAULT_TIMEOUT): - self.user = user - self.passwd = passwd - self.host = host - self.port = port - self.dirs = dirs - self.timeout = timeout - self.init() - - def init(self): - import ftplib - self.busy = 0 - self.ftp = ftplib.FTP() - self.ftp.connect(self.host, self.port, self.timeout) - self.ftp.login(self.user, self.passwd) - for dir in self.dirs: - self.ftp.cwd(dir) - - def retrfile(self, file, type): - import ftplib - self.endtransfer() - if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 - else: cmd = 'TYPE ' + type; isdir = 0 - try: - self.ftp.voidcmd(cmd) - except ftplib.all_errors: - self.init() - self.ftp.voidcmd(cmd) - conn = None - if file and not isdir: - # Try to retrieve as a file - try: - cmd = 'RETR ' + file - conn = self.ftp.ntransfercmd(cmd) - except ftplib.error_perm as reason: - if str(reason)[:3] != '550': - raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2]) - if not conn: - 
# Set transfer mode to ASCII! - self.ftp.voidcmd('TYPE A') - # Try a directory listing. Verify that directory exists. - if file: - pwd = self.ftp.pwd() - try: - try: - self.ftp.cwd(file) - except ftplib.error_perm as reason: - raise IOError('ftp error', reason) from reason - finally: - self.ftp.cwd(pwd) - cmd = 'LIST ' + file - else: - cmd = 'LIST' - conn = self.ftp.ntransfercmd(cmd) - self.busy = 1 - # Pass back both a suitably decorated object and a retrieval length - return (addclosehook(conn[0].makefile('rb'), - self.endtransfer), conn[1]) - def endtransfer(self): - if not self.busy: - return - self.busy = 0 - try: - self.ftp.voidresp() - except ftperrors(): - pass - - def close(self): - self.endtransfer() - try: - self.ftp.close() - except ftperrors(): - pass - -class addbase: - """Base class for addinfo and addclosehook.""" - - # XXX Add a method to expose the timeout on the underlying socket? - - def __init__(self, fp): - self.fp = fp - self.read = self.fp.read - self.readline = self.fp.readline - if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines - if hasattr(self.fp, "fileno"): - self.fileno = self.fp.fileno - else: - self.fileno = lambda: None - if hasattr(self.fp, "__iter__"): - self.__iter__ = self.fp.__iter__ - if hasattr(self.fp, "__next__"): - self.__next__ = self.fp.__next__ - - def __repr__(self): - return '<%s at %r whose fp = %r>' % (self.__class__.__name__, - id(self), self.fp) - - def close(self): - self.read = None - self.readline = None - self.readlines = None - self.fileno = None - if self.fp: self.fp.close() - self.fp = None - -class addclosehook(addbase): - """Class to add a close hook to an open file.""" - - def __init__(self, fp, closehook, *hookargs): - addbase.__init__(self, fp) - self.closehook = closehook - self.hookargs = hookargs - - def close(self): - addbase.close(self) - if self.closehook: - self.closehook(*self.hookargs) - self.closehook = None - self.hookargs = None - -class addinfo(addbase): - """class to add an info() method to an open file.""" - - def __init__(self, fp, headers): - addbase.__init__(self, fp) - self.headers = headers - - def info(self): - return self.headers - -class addinfourl(addbase): - """class to add info() and geturl() methods to an open file.""" - - def __init__(self, fp, headers, url, code=None): - addbase.__init__(self, fp) - self.headers = headers - self.url = url - self.code = code - - def info(self): - return self.headers - - def getcode(self): - return self.code - - def geturl(self): - return self.url - - -# Utilities to parse URLs (most of these return None for missing parts): -# unwrap('<URL:type://host/path>') --> 'type://host/path' -# splittype('type:opaquestring') --> 'type', 'opaquestring' -# splithost('//host[:port]/path') --> 'host[:port]', '/path' -# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' -# splitpasswd('user:passwd') -> 'user', 'passwd' -# splitport('host:port') --> 'host', 'port' -# splitquery('/path?query') --> '/path', 'query' -# splittag('/path#tag') --> '/path', 'tag' -# splitattr('/path;attr1=value1;attr2=value2;...') -> -# '/path', ['attr1=value1', 'attr2=value2', ...] -# splitvalue('attr=value') --> 'attr', 'value' -# unquote('abc%20def') -> 'abc def' -# quote('abc def') -> 'abc%20def') - -def toBytes(url): - """toBytes(u"URL") --> 'URL'.""" - # Most URL schemes require ASCII. If that changes, the conversion - # can be relaxed. 
- # XXX get rid of toBytes() - if isinstance(url, str): - try: - url = url.encode("ASCII").decode() - except UnicodeError: - raise UnicodeError("URL " + repr(url) + - " contains non-ASCII characters") - return url - -def unwrap(url): - """unwrap('<URL:type://host/path>') --> 'type://host/path'.""" - url = str(url).strip() - if url[:1] == '<' and url[-1:] == '>': - url = url[1:-1].strip() - if url[:4] == 'URL:': url = url[4:].strip() - return url - -_typeprog = None -def splittype(url): - """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" - global _typeprog - if _typeprog is None: - import re - _typeprog = re.compile('^([^/:]+):') - - match = _typeprog.match(url) - if match: - scheme = match.group(1) - return scheme.lower(), url[len(scheme) + 1:] - return None, url - -_hostprog = None -def splithost(url): - """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" - global _hostprog - if _hostprog is None: - import re - _hostprog = re.compile('^//([^/?]*)(.*)$') - - match = _hostprog.match(url) - if match: return match.group(1, 2) - return None, url - -_userprog = None -def splituser(host): - """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" - global _userprog - if _userprog is None: - import re - _userprog = re.compile('^(.*)@(.*)$') - - match = _userprog.match(host) - if match: return map(unquote, match.group(1, 2)) - return None, host - -_passwdprog = None -def splitpasswd(user): - """splitpasswd('user:passwd') -> 'user', 'passwd'.""" - global _passwdprog - if _passwdprog is None: - import re - _passwdprog = re.compile('^([^:]*):(.*)$') - - match = _passwdprog.match(user) - if match: return match.group(1, 2) - return user, None - -# splittag('/path#tag') --> '/path', 'tag' -_portprog = None -def splitport(host): - """splitport('host:port') --> 'host', 'port'.""" - global _portprog - if _portprog is None: - import re - _portprog = re.compile('^(.*):([0-9]+)$') - - match = _portprog.match(host) - if match: return match.group(1, 2) - return host, None - -_nportprog = None -def splitnport(host, defport=-1): - """Split host and port, returning numeric port. - Return given default port if no ':' found; defaults to -1. - Return numerical port if a valid number are found after ':'. 
- Return None if ':' but not a valid number.""" - global _nportprog - if _nportprog is None: - import re - _nportprog = re.compile('^(.*):(.*)$') - - match = _nportprog.match(host) - if match: - host, port = match.group(1, 2) - try: - if not port: raise ValueError("no digits") - nport = int(port) - except ValueError: - nport = None - return host, nport - return host, defport - -_queryprog = None -def splitquery(url): - """splitquery('/path?query') --> '/path', 'query'.""" - global _queryprog - if _queryprog is None: - import re - _queryprog = re.compile('^(.*)\?([^?]*)$') - - match = _queryprog.match(url) - if match: return match.group(1, 2) - return url, None - -_tagprog = None -def splittag(url): - """splittag('/path#tag') --> '/path', 'tag'.""" - global _tagprog - if _tagprog is None: - import re - _tagprog = re.compile('^(.*)#([^#]*)$') - - match = _tagprog.match(url) - if match: return match.group(1, 2) - return url, None - -def splitattr(url): - """splitattr('/path;attr1=value1;attr2=value2;...') -> - '/path', ['attr1=value1', 'attr2=value2', ...].""" - words = url.split(';') - return words[0], words[1:] - -_valueprog = None -def splitvalue(attr): - """splitvalue('attr=value') --> 'attr', 'value'.""" - global _valueprog - if _valueprog is None: - import re - _valueprog = re.compile('^([^=]*)=(.*)$') - - match = _valueprog.match(attr) - if match: return match.group(1, 2) - return attr, None - -_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) -_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) - -def unquote(s): - """unquote('abc%20def') -> 'abc def'.""" - res = s.split('%') - for i in range(1, len(res)): - item = res[i] - try: - res[i] = _hextochr[item[:2]] + item[2:] - except KeyError: - res[i] = '%' + item - except UnicodeDecodeError: - res[i] = chr(int(item[:2], 16)) + item[2:] - return "".join(res) - -def unquote_plus(s): - """unquote('%7e/abc+def') -> '~/abc def'""" - s = s.replace('+', ' ') - return unquote(s) - -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-') -_safe_quoters= {} - -class Quoter: - def __init__(self, safe): - self.cache = {} - self.safe = safe + always_safe - - def __call__(self, c): - try: - return self.cache[c] - except KeyError: - if ord(c) < 256: - res = (c in self.safe) and c or ('%%%02X' % ord(c)) - self.cache[c] = res - return res - else: - return "".join(['%%%02X' % i for i in c.encode("utf-8")]) - -def quote(s, safe = '/'): - """quote('abc def') -> 'abc%20def' - - Each part of a URL, e.g. the path info, the query, etc., has a - different set of reserved characters that must be quoted. - - RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists - the following reserved characters. - - reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | - "$" | "," - - Each of these characters is reserved in some component of a URL, - but not necessarily in all of them. - - By default, the quote function is intended for quoting the path - section of a URL. Thus, it will not encode '/'. This character - is reserved, but in typical usage the quote function is being - called on a path where the existing slash characters are used as - reserved characters. 
- """ - cachekey = (safe, always_safe) - try: - quoter = _safe_quoters[cachekey] - except KeyError: - quoter = Quoter(safe) - _safe_quoters[cachekey] = quoter - res = map(quoter, s) - return ''.join(res) - -def quote_plus(s, safe = ''): - """Quote the query fragment of a URL; replacing ' ' with '+'""" - if ' ' in s: - s = quote(s, safe + ' ') - return s.replace(' ', '+') - return quote(s, safe) - -def urlencode(query,doseq=0): - """Encode a sequence of two-element tuples or dictionary into a URL query string. - - If any values in the query arg are sequences and doseq is true, each - sequence element is converted to a separate parameter. - - If the query arg is a sequence of two-element tuples, the order of the - parameters in the output will match the order of parameters in the - input. - """ - - if hasattr(query,"items"): - # mapping objects - query = query.items() - else: - # it's a bother at times that strings and string-like objects are - # sequences... - try: - # non-sequence items should not work with len() - # non-empty strings will fail this - if len(query) and not isinstance(query[0], tuple): - raise TypeError - # zero-length sequences of all types will get here and succeed, - # but that's a minor nit - since the original implementation - # allowed empty dicts that type of behavior probably should be - # preserved for consistency - except TypeError: - ty,va,tb = sys.exc_info() - raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb) - - l = [] - if not doseq: - # preserve old behavior - for k, v in query: - k = quote_plus(str(k)) - v = quote_plus(str(v)) - l.append(k + '=' + v) - else: - for k, v in query: - k = quote_plus(str(k)) - if isinstance(v, str): - v = quote_plus(v) - l.append(k + '=' + v) - elif isinstance(v, str): - # is there a reasonable way to convert to ASCII? - # encode generates a string, but "replace" or "ignore" - # lose information and "strict" can raise UnicodeError - v = quote_plus(v.encode("ASCII","replace")) - l.append(k + '=' + v) - else: - try: - # is this a sufficient test for sequence-ness? - x = len(v) - except TypeError: - # not a sequence - v = quote_plus(str(v)) - l.append(k + '=' + v) - else: - # loop over the sequence - for elt in v: - l.append(k + '=' + quote_plus(str(elt))) - return '&'.join(l) - -# Proxy handling -def getproxies_environment(): - """Return a dictionary of scheme -> proxy server URL mappings. - - Scan the environment for variables named <scheme>_proxy; - this seems to be the standard convention. If you need a - different way, you can pass a proxies dictionary to the - [Fancy]URLopener constructor. - - """ - proxies = {} - for name, value in os.environ.items(): - name = name.lower() - if name == 'no_proxy': - # handled in proxy_bypass_environment - continue - if value and name[-6:] == '_proxy': - proxies[name[:-6]] = value - return proxies - -def proxy_bypass_environment(host): - """Test if proxies should not be used for a particular host. - - Checks the environment for a variable named no_proxy, which should - be a list of DNS suffixes separated by commas, or '*' for all hosts. 
- """ - no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '') - # '*' is special case for always bypass - if no_proxy == '*': - return 1 - # strip port off host - hostonly, port = splitport(host) - # check if the host ends with any of the DNS suffixes - for name in no_proxy.split(','): - if name and (hostonly.endswith(name) or host.endswith(name)): - return 1 - # otherwise, don't bypass - return 0 - - -if sys.platform == 'darwin': - - def _CFSetup(sc): - from ctypes import c_int32, c_void_p, c_char_p, c_int - sc.CFStringCreateWithCString.argtypes = [ c_void_p, c_char_p, c_int32 ] - sc.CFStringCreateWithCString.restype = c_void_p - sc.SCDynamicStoreCopyProxies.argtypes = [ c_void_p ] - sc.SCDynamicStoreCopyProxies.restype = c_void_p - sc.CFDictionaryGetValue.argtypes = [ c_void_p, c_void_p ] - sc.CFDictionaryGetValue.restype = c_void_p - sc.CFStringGetLength.argtypes = [ c_void_p ] - sc.CFStringGetLength.restype = c_int32 - sc.CFStringGetCString.argtypes = [ c_void_p, c_char_p, c_int32, c_int32 ] - sc.CFStringGetCString.restype = c_int32 - sc.CFNumberGetValue.argtypes = [ c_void_p, c_int, c_void_p ] - sc.CFNumberGetValue.restype = c_int32 - sc.CFRelease.argtypes = [ c_void_p ] - sc.CFRelease.restype = None - - def _CStringFromCFString(sc, value): - from ctypes import create_string_buffer - length = sc.CFStringGetLength(value) + 1 - buff = create_string_buffer(length) - sc.CFStringGetCString(value, buff, length, 0) - return buff.value - - def _CFNumberToInt32(sc, cfnum): - from ctypes import byref, c_int - val = c_int() - kCFNumberSInt32Type = 3 - sc.CFNumberGetValue(cfnum, kCFNumberSInt32Type, byref(val)) - return val.value - - - def proxy_bypass_macosx_sysconf(host): - """ - Return True iff this host shouldn't be accessed using a proxy - - This function uses the MacOSX framework SystemConfiguration - to fetch the proxy information. - """ - from ctypes import cdll - from ctypes.util import find_library - import re - import socket - from fnmatch import fnmatch - - def ip2num(ipAddr): - parts = ipAddr.split('.') - parts = map(int, parts) - if len(parts) != 4: - parts = (parts + [0, 0, 0, 0])[:4] - return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] - - sc = cdll.LoadLibrary(find_library("SystemConfiguration")) - _CFSetup(sc) - - hostIP = None - - if not sc: - return False - - kSCPropNetProxiesExceptionsList = sc.CFStringCreateWithCString(0, "ExceptionsList", 0) - kSCPropNetProxiesExcludeSimpleHostnames = sc.CFStringCreateWithCString(0, - "ExcludeSimpleHostnames", 0) - - - proxyDict = sc.SCDynamicStoreCopyProxies(None) - if proxyDict is None: - return False - - try: - # Check for simple host names: - if '.' 
not in host: - exclude_simple = sc.CFDictionaryGetValue(proxyDict, - kSCPropNetProxiesExcludeSimpleHostnames) - if exclude_simple and _CFNumberToInt32(sc, exclude_simple): - return True - - - # Check the exceptions list: - exceptions = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesExceptionsList) - if exceptions: - # Items in the list are strings like these: *.local, 169.254/16 - for index in xrange(sc.CFArrayGetCount(exceptions)): - value = sc.CFArrayGetValueAtIndex(exceptions, index) - if not value: continue - value = _CStringFromCFString(sc, value) - - m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) - if m is not None: - if hostIP is None: - hostIP = socket.gethostbyname(host) - hostIP = ip2num(hostIP) - - base = ip2num(m.group(1)) - mask = int(m.group(2)[1:]) - mask = 32 - mask - - if (hostIP >> mask) == (base >> mask): - return True - - elif fnmatch(host, value): - return True - - return False - - finally: - sc.CFRelease(kSCPropNetProxiesExceptionsList) - sc.CFRelease(kSCPropNetProxiesExcludeSimpleHostnames) - - - - def getproxies_macosx_sysconf(): - """Return a dictionary of scheme -> proxy server URL mappings. - - This function uses the MacOSX framework SystemConfiguration - to fetch the proxy information. - """ - from ctypes import cdll - from ctypes.util import find_library - - sc = cdll.LoadLibrary(find_library("SystemConfiguration")) - _CFSetup(sc) - - if not sc: - return {} - - kSCPropNetProxiesHTTPEnable = sc.CFStringCreateWithCString(0, b"HTTPEnable", 0) - kSCPropNetProxiesHTTPProxy = sc.CFStringCreateWithCString(0, b"HTTPProxy", 0) - kSCPropNetProxiesHTTPPort = sc.CFStringCreateWithCString(0, b"HTTPPort", 0) - - kSCPropNetProxiesHTTPSEnable = sc.CFStringCreateWithCString(0, b"HTTPSEnable", 0) - kSCPropNetProxiesHTTPSProxy = sc.CFStringCreateWithCString(0, b"HTTPSProxy", 0) - kSCPropNetProxiesHTTPSPort = sc.CFStringCreateWithCString(0, b"HTTPSPort", 0) - - kSCPropNetProxiesFTPEnable = sc.CFStringCreateWithCString(0, b"FTPEnable", 0) - kSCPropNetProxiesFTPPassive = sc.CFStringCreateWithCString(0, b"FTPPassive", 0) - kSCPropNetProxiesFTPPort = sc.CFStringCreateWithCString(0, b"FTPPort", 0) - kSCPropNetProxiesFTPProxy = sc.CFStringCreateWithCString(0, b"FTPProxy", 0) - - kSCPropNetProxiesGopherEnable = sc.CFStringCreateWithCString(0, b"GopherEnable", 0) - kSCPropNetProxiesGopherPort = sc.CFStringCreateWithCString(0, b"GopherPort", 0) - kSCPropNetProxiesGopherProxy = sc.CFStringCreateWithCString(0, b"GopherProxy", 0) - - proxies = {} - proxyDict = sc.SCDynamicStoreCopyProxies(None) - - try: - # HTTP: - enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["http"] = "http://%s:%i" % (proxy, port) - else: - proxies["http"] = "http://%s" % (proxy, ) - - # HTTPS: - enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["https"] = "http://%s:%i" % (proxy, port) - else: - proxies["https"] = "http://%s" % (proxy, ) - - # FTP: - enabled = 
sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["ftp"] = "http://%s:%i" % (proxy, port) - else: - proxies["ftp"] = "http://%s" % (proxy, ) - - # Gopher: - enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["gopher"] = "http://%s:%i" % (proxy, port) - else: - proxies["gopher"] = "http://%s" % (proxy, ) - finally: - sc.CFRelease(proxyDict) - - sc.CFRelease(kSCPropNetProxiesHTTPEnable) - sc.CFRelease(kSCPropNetProxiesHTTPProxy) - sc.CFRelease(kSCPropNetProxiesHTTPPort) - sc.CFRelease(kSCPropNetProxiesFTPEnable) - sc.CFRelease(kSCPropNetProxiesFTPPassive) - sc.CFRelease(kSCPropNetProxiesFTPPort) - sc.CFRelease(kSCPropNetProxiesFTPProxy) - sc.CFRelease(kSCPropNetProxiesGopherEnable) - sc.CFRelease(kSCPropNetProxiesGopherPort) - sc.CFRelease(kSCPropNetProxiesGopherProxy) - - return proxies - - - - def proxy_bypass(host): - if getproxies_environment(): - return proxy_bypass_environment(host) - else: - return proxy_bypass_macosx_sysconf(host) - - def getproxies(): - return getproxies_environment() or getproxies_macosx_sysconf() - -elif os.name == 'nt': - def getproxies_registry(): - """Return a dictionary of scheme -> proxy server URL mappings. - - Win32 uses the registry to store proxies. - - """ - proxies = {} - try: - import winreg - except ImportError: - # Std module, so should be around - but you never know! - return proxies - try: - internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, - r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') - proxyEnable = winreg.QueryValueEx(internetSettings, - 'ProxyEnable')[0] - if proxyEnable: - # Returned as Unicode but problems if not converted to ASCII - proxyServer = str(winreg.QueryValueEx(internetSettings, - 'ProxyServer')[0]) - if '=' in proxyServer: - # Per-protocol settings - for p in proxyServer.split(';'): - protocol, address = p.split('=', 1) - # See if address has a type:// prefix - import re - if not re.match('^([^/:]+)://', address): - address = '%s://%s' % (protocol, address) - proxies[protocol] = address - else: - # Use one setting for all protocols - if proxyServer[:5] == 'http:': - proxies['http'] = proxyServer - else: - proxies['http'] = 'http://%s' % proxyServer - proxies['ftp'] = 'ftp://%s' % proxyServer - internetSettings.Close() - except (WindowsError, ValueError, TypeError): - # Either registry key not found etc, or the value in an - # unexpected format. - # proxies already set up to be empty so nothing to do - pass - return proxies - - def getproxies(): - """Return a dictionary of scheme -> proxy server URL mappings. - - Returns settings gathered from the environment, if specified, - or the registry. - - """ - return getproxies_environment() or getproxies_registry() - - def proxy_bypass_registry(host): - try: - import winreg - import re - except ImportError: - # Std modules, so should be around - but you never know! 
- return 0 - try: - internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, - r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') - proxyEnable = winreg.QueryValueEx(internetSettings, - 'ProxyEnable')[0] - proxyOverride = str(winreg.QueryValueEx(internetSettings, - 'ProxyOverride')[0]) - # ^^^^ Returned as Unicode but problems if not converted to ASCII - except WindowsError: - return 0 - if not proxyEnable or not proxyOverride: - return 0 - # try to make a host list from name and IP address. - rawHost, port = splitport(host) - host = [rawHost] - try: - addr = socket.gethostbyname(rawHost) - if addr != rawHost: - host.append(addr) - except socket.error: - pass - try: - fqdn = socket.getfqdn(rawHost) - if fqdn != rawHost: - host.append(fqdn) - except socket.error: - pass - # make a check value list from the registry entry: replace the - # '<local>' string by the localhost entry and the corresponding - # canonical entry. - proxyOverride = proxyOverride.split(';') - i = 0 - while i < len(proxyOverride): - if proxyOverride[i] == '<local>': - proxyOverride[i:i+1] = ['localhost', - '127.0.0.1', - socket.gethostname(), - socket.gethostbyname( - socket.gethostname())] - i += 1 - # print proxyOverride - # now check if we match one of the registry values. - for test in proxyOverride: - test = test.replace(".", r"\.") # mask dots - test = test.replace("*", r".*") # change glob sequence - test = test.replace("?", r".") # change glob char - for val in host: - # print "%s <--> %s" %( test, val ) - if re.match(test, val, re.I): - return 1 - return 0 - - def proxy_bypass(host): - """Return a dictionary of scheme -> proxy server URL mappings. - - Returns settings gathered from the environment, if specified, - or the registry. - - """ - if getproxies_environment(): - return proxy_bypass_environment(host) - else: - return proxy_bypass_registry(host) - -else: - # By default use environment variables - getproxies = getproxies_environment - proxy_bypass = proxy_bypass_environment - -# Test and time quote() and unquote() -def test1(): - s = '' - for i in range(256): s = s + chr(i) - s = s*4 - t0 = time.time() - qs = quote(s) - uqs = unquote(qs) - t1 = time.time() - if uqs != s: - print('Wrong!') - print(repr(s)) - print(repr(qs)) - print(repr(uqs)) - print(round(t1 - t0, 3), 'sec') - - -def reporthook(blocknum, blocksize, totalsize): - # Report during remote transfers - print("Block number: %d, Block size: %d, Total size: %d" % ( - blocknum, blocksize, totalsize)) - -# Test program -def test(args=[]): - if not args: - args = [ - '/etc/passwd', - 'file:/etc/passwd', - 'file://localhost/etc/passwd', - 'ftp://ftp.gnu.org/pub/README', - 'http://www.python.org/index.html', - ] - if hasattr(URLopener, "open_https"): - args.append('https://synergy.as.cmu.edu/~geek/') - try: - for url in args: - print('-'*10, url, '-'*10) - fn, h = urlretrieve(url, None, reporthook) - print(fn) - if h: - print('======') - for k in h.keys(): print(k + ':', h[k]) - print('======') - fp = open(fn, 'rb') - data = fp.read() - del fp - data = data.replace("\r", "") - print(data) - fn, h = None, None - print('-'*40) - finally: - urlcleanup() - -def main(): - import getopt, sys - try: - opts, args = getopt.getopt(sys.argv[1:], "th") - except getopt.error as msg: - print(msg) - print("Use -h for help") - return - t = 0 - for o, a in opts: - if o == '-t': - t = t + 1 - if o == '-h': - print("Usage: python urllib.py [-t] [url ...]") - print("-t runs self-test;", end=' ') - print("otherwise, contents of urls are printed") - return - 
if t: - if t > 1: - test1() - test(args) - else: - if not args: - print("Use -h for help") - for url in args: - print(urlopen(url).read(), end=' ') - -# Run test program when run as a script -if __name__ == '__main__': - main() diff --git a/Lib/urllib/__init__.py b/Lib/urllib/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/Lib/urllib/__init__.py diff --git a/Lib/urllib/error.py b/Lib/urllib/error.py new file mode 100644 index 0000000..300c3fe --- /dev/null +++ b/Lib/urllib/error.py @@ -0,0 +1,59 @@ +"""Exception classes raised by urllib. + +The base exception class is URLError, which inherits from IOError. It +doesn't define any behavior of its own, but is the base class for all +exceptions defined in this package. + +HTTPError is an exception class that is also a valid HTTP response +instance. It behaves this way because HTTP protocol errors are valid +responses, with a status code, headers, and a body. In some contexts, +an application may want to handle an exception like a regular +response. +""" + +import urllib.response + +# do these error classes make sense? +# make sure all of the IOError stuff is overridden. we just want to be +# subtypes. + +class URLError(IOError): + # URLError is a sub-type of IOError, but it doesn't share any of + # the implementation. need to override __init__ and __str__. + # It sets self.args for compatibility with other EnvironmentError + # subclasses, but args doesn't have the typical format with errno in + # slot 0 and strerror in slot 1. This may be better than nothing. + def __init__(self, reason, filename=None): + self.args = reason, + self.reason = reason + if filename is not None: + self.filename = filename + + def __str__(self): + return '<urlopen error %s>' % self.reason + +class HTTPError(URLError, urllib.response.addinfourl): + """Raised when HTTP error occurs, but also acts like non-error return""" + __super_init = urllib.response.addinfourl.__init__ + + def __init__(self, url, code, msg, hdrs, fp): + self.code = code + self.msg = msg + self.hdrs = hdrs + self.fp = fp + self.filename = url + # The addinfourl classes depend on fp being a valid file + # object. In some cases, the HTTPError may not have a valid + # file object. If this happens, the simplest workaround is to + # not initialize the base classes. 
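+        # (A sketch of the dual nature in use -- HTTPError is both an
+        # exception and a response; the URL is illustrative:
+        #     try:
+        #         f = urllib.request.urlopen('http://example.com/missing')
+        #     except urllib.error.HTTPError as e:
+        #         print(e.code)                  # e.g. 404
+        #         body = e.read() if e.fp is not None else b''
+        # )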
+ if fp is not None: + self.__super_init(fp, hdrs, url, code) + + def __str__(self): + return 'HTTP Error %s: %s' % (self.code, self.msg) + +# exception raised when downloaded size does not match content-length +class ContentTooShortError(URLError): + def __init__(self, message, content): + URLError.__init__(self, message) + self.content = content diff --git a/Lib/urlparse.py b/Lib/urllib/parse.py index 30de699..71cc369 100644 --- a/Lib/urlparse.py +++ b/Lib/urllib/parse.py @@ -259,6 +259,311 @@ def urldefrag(url): return url, '' +_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) +_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) + +def unquote(s): + """unquote('abc%20def') -> 'abc def'.""" + res = s.split('%') + for i in range(1, len(res)): + item = res[i] + try: + res[i] = _hextochr[item[:2]] + item[2:] + except KeyError: + res[i] = '%' + item + except UnicodeDecodeError: + res[i] = chr(int(item[:2], 16)) + item[2:] + return "".join(res) + +def unquote_plus(s): + """unquote('%7e/abc+def') -> '~/abc def'""" + s = s.replace('+', ' ') + return unquote(s) + +always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-') +_safe_quoters= {} + +class Quoter: + def __init__(self, safe): + self.cache = {} + self.safe = safe + always_safe + + def __call__(self, c): + try: + return self.cache[c] + except KeyError: + if ord(c) < 256: + res = (c in self.safe) and c or ('%%%02X' % ord(c)) + self.cache[c] = res + return res + else: + return "".join(['%%%02X' % i for i in c.encode("utf-8")]) + +def quote(s, safe = '/'): + """quote('abc def') -> 'abc%20def' + + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + By default, the quote function is intended for quoting the path + section of a URL. Thus, it will not encode '/'. This character + is reserved, but in typical usage the quote function is being + called on a path where the existing slash characters are used as + reserved characters. + """ + cachekey = (safe, always_safe) + try: + quoter = _safe_quoters[cachekey] + except KeyError: + quoter = Quoter(safe) + _safe_quoters[cachekey] = quoter + res = map(quoter, s) + return ''.join(res) + +def quote_plus(s, safe = ''): + """Quote the query fragment of a URL; replacing ' ' with '+'""" + if ' ' in s: + s = quote(s, safe + ' ') + return s.replace(' ', '+') + return quote(s, safe) + +def urlencode(query,doseq=0): + """Encode a sequence of two-element tuples or dictionary into a URL query string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. + """ + + if hasattr(query,"items"): + # mapping objects + query = query.items() + else: + # it's a bother at times that strings and string-like objects are + # sequences... 
+        try:
+            # non-sequence items should not work with len()
+            # non-empty strings will fail this
+            if len(query) and not isinstance(query[0], tuple):
+                raise TypeError
+            # zero-length sequences of all types will get here and succeed,
+            # but that's a minor nit - since the original implementation
+            # allowed empty dicts that type of behavior probably should be
+            # preserved for consistency
+        except TypeError:
+            ty, va, tb = sys.exc_info()
+            raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb)
+
+    l = []
+    if not doseq:
+        # preserve old behavior
+        for k, v in query:
+            k = quote_plus(str(k))
+            v = quote_plus(str(v))
+            l.append(k + '=' + v)
+    else:
+        for k, v in query:
+            k = quote_plus(str(k))
+            if isinstance(v, str):
+                v = quote_plus(v)
+                l.append(k + '=' + v)
+            else:
+                try:
+                    # is this a sufficient test for sequence-ness?
+                    x = len(v)
+                except TypeError:
+                    # not a sequence
+                    v = quote_plus(str(v))
+                    l.append(k + '=' + v)
+                else:
+                    # loop over the sequence
+                    for elt in v:
+                        l.append(k + '=' + quote_plus(str(elt)))
+    return '&'.join(l)
+
+# Utilities to parse URLs (most of these return None for missing parts):
+# unwrap('<URL:type://host/path>') --> 'type://host/path'
+# splittype('type:opaquestring') --> 'type', 'opaquestring'
+# splithost('//host[:port]/path') --> 'host[:port]', '/path'
+# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
+# splitpasswd('user:passwd') -> 'user', 'passwd'
+# splitport('host:port') --> 'host', 'port'
+# splitquery('/path?query') --> '/path', 'query'
+# splittag('/path#tag') --> '/path', 'tag'
+# splitattr('/path;attr1=value1;attr2=value2;...') ->
+#   '/path', ['attr1=value1', 'attr2=value2', ...]
+# splitvalue('attr=value') --> 'attr', 'value'
+# unquote('abc%20def') -> 'abc def'
+# quote('abc def') -> 'abc%20def'
+
+def toBytes(url):
+    """toBytes(u"URL") --> 'URL'."""
+    # Most URL schemes require ASCII.  If that changes, the conversion
+    # can be relaxed.
+    # XXX get rid of toBytes()
+    if isinstance(url, str):
+        try:
+            url = url.encode("ASCII").decode()
+        except UnicodeError:
+            raise UnicodeError("URL " + repr(url) +
+                               " contains non-ASCII characters")
+    return url
+
+def unwrap(url):
+    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
+    url = str(url).strip()
+    if url[:1] == '<' and url[-1:] == '>':
+        url = url[1:-1].strip()
+    if url[:4] == 'URL:': url = url[4:].strip()
+    return url
+
+_typeprog = None
+def splittype(url):
+    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
+    global _typeprog
+    if _typeprog is None:
+        import re
+        _typeprog = re.compile('^([^/:]+):')
+
+    match = _typeprog.match(url)
+    if match:
+        scheme = match.group(1)
+        return scheme.lower(), url[len(scheme) + 1:]
+    return None, url
+
+_hostprog = None
+def splithost(url):
+    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
+    global _hostprog
+    if _hostprog is None:
+        import re
+        _hostprog = re.compile('^//([^/?]*)(.*)$')
+
+    match = _hostprog.match(url)
+    if match: return match.group(1, 2)
+    return None, url
+
+_userprog = None
+def splituser(host):
+    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
+    global _userprog
+    if _userprog is None:
+        import re
+        _userprog = re.compile('^(.*)@(.*)$')
+
+    match = _userprog.match(host)
+    if match: return map(unquote, match.group(1, 2))
+    return None, host
+
+_passwdprog = None
+def splitpasswd(user):
+    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
+    global _passwdprog
+    if _passwdprog is None:
+        import re
+        _passwdprog = re.compile('^([^:]*):(.*)$')
+
+    match = _passwdprog.match(user)
+    if match: return match.group(1, 2)
+    return user, None
+
+_portprog = None
+def splitport(host):
+    """splitport('host:port') --> 'host', 'port'."""
+    global _portprog
+    if _portprog is None:
+        import re
+        _portprog = re.compile('^(.*):([0-9]+)$')
+
+    match = _portprog.match(host)
+    if match: return match.group(1, 2)
+    return host, None
+
+_nportprog = None
+def splitnport(host, defport=-1):
+    """Split host and port, returning numeric port.
+    Return given default port if no ':' found; defaults to -1.
+    Return numerical port if a valid number is found after ':'.
+    Return None if ':' but not a valid number."""
+    global _nportprog
+    if _nportprog is None:
+        import re
+        _nportprog = re.compile('^(.*):(.*)$')
+
+    match = _nportprog.match(host)
+    if match:
+        host, port = match.group(1, 2)
+        try:
+            if not port: raise ValueError("no digits")
+            nport = int(port)
+        except ValueError:
+            nport = None
+        return host, nport
+    return host, defport
+
+_queryprog = None
+def splitquery(url):
+    """splitquery('/path?query') --> '/path', 'query'."""
+    global _queryprog
+    if _queryprog is None:
+        import re
+        _queryprog = re.compile(r'^(.*)\?([^?]*)$')
+
+    match = _queryprog.match(url)
+    if match: return match.group(1, 2)
+    return url, None
+
+_tagprog = None
+def splittag(url):
+    """splittag('/path#tag') --> '/path', 'tag'."""
+    global _tagprog
+    if _tagprog is None:
+        import re
+        _tagprog = re.compile('^(.*)#([^#]*)$')
+
+    match = _tagprog.match(url)
+    if match: return match.group(1, 2)
+    return url, None
+
+def splitattr(url):
+    """splitattr('/path;attr1=value1;attr2=value2;...') ->
+        '/path', ['attr1=value1', 'attr2=value2', ...]."""
+    words = url.split(';')
+    return words[0], words[1:]
+
+_valueprog = None
+def splitvalue(attr):
+    """splitvalue('attr=value') --> 'attr', 'value'."""
+    global _valueprog
+    if _valueprog is None:
+        import re
+        _valueprog = re.compile('^([^=]*)=(.*)$')
+
+    match = _valueprog.match(attr)
+    if match: return match.group(1, 2)
+    return attr, None
+
 test_input = """
       http://a/b/c/d
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
new file mode 100644
index 0000000..cd4729a
--- /dev/null
+++ b/Lib/urllib/request.py
@@ -0,0 +1,2295 @@
+# Issues in merging urllib and urllib2:
+# 1. They both define a function named urlopen()
+
+"""An extensible library for opening URLs using a variety of protocols
+
+The simplest way to use this module is to call the urlopen function,
+which accepts a string containing a URL or a Request object (described
+below).  It opens the URL and returns the results as a file-like
+object; the returned object has some extra methods described below.
+
+The OpenerDirector manages a collection of Handler objects that do
+all the actual work.  Each Handler implements a particular protocol or
+option.  The OpenerDirector is a composite object that invokes the
+Handlers needed to open the requested URL.  For example, the
+HTTPHandler performs HTTP GET and POST requests and deals with
+non-error returns.  The HTTPRedirectHandler automatically deals with
+HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
+deals with digest authentication.
+
+urlopen(url, data=None) -- Basic usage is the same as original
+urllib.  Pass the url and optionally data to post to an HTTP URL, and
+get a file-like object back.  One difference is that you can also pass
+a Request instance instead of URL.  Raises a URLError (subclass of
+IOError); for HTTP errors, raises an HTTPError, which can also be
+treated as a valid response.
+
+build_opener -- Function that creates a new OpenerDirector instance.
+Will install the default handlers.  Accepts one or more Handlers as
+arguments, either instances or Handler classes that it will
+instantiate.  If one of the arguments is a subclass of a default
+handler, the argument will be installed instead of the default.
+
+install_opener -- Installs a new opener as the default opener.
+
+objects of interest:
+OpenerDirector --
+
+Request -- An object that encapsulates the state of a request.  The
+state can be as simple as the URL.  It can also include extra HTTP
+headers, e.g. a User-Agent.
+
+BaseHandler --
+
+internals:
+BaseHandler and parent
+_call_chain conventions
+
+Example usage:
+
+import urllib.request
+
+# set up authentication info
+authinfo = urllib.request.HTTPBasicAuthHandler()
+authinfo.add_password(realm='PDQ Application',
+                      uri='https://mahler:8092/site-updates.py',
+                      user='klem',
+                      passwd='geheim$parole')
+
+proxy_support = urllib.request.ProxyHandler({"http" : "http://ahad-haam:3128"})
+
+# build a new opener that adds authentication and caching FTP handlers
+opener = urllib.request.build_opener(proxy_support, authinfo,
+                                     urllib.request.CacheFTPHandler)
+
+# install it
+urllib.request.install_opener(opener)
+
+f = urllib.request.urlopen('http://www.python.org/')
+"""
+
+# XXX issues:
+# If an authentication error handler tries to perform authentication
+# but fails, how should the error be signalled?  The client needs to
+# know the HTTP error code.  But if the handler knows what the problem
+# was, e.g., that it didn't support the hash algorithm requested in
+# the challenge, it would be good to pass that information along to
+# the client, too.
+# ftp errors aren't handled cleanly
+# check digest against correct (i.e. non-apache) implementation
+
+# Possible extensions:
+# complex proxies  XXX not sure what exactly was meant by this
+# abstract factory for opener
+
+import base64
+import email
+import hashlib
+import http.client
+import io
+import os
+import posixpath
+import random
+import re
+import socket
+import sys
+import time
+import urllib.parse, urllib.error, urllib.response
+import bisect
+
+from io import StringIO
+
+# check for SSL
+try:
+    import ssl
+except ImportError:
+    _have_ssl = False
+else:
+    _have_ssl = True
+
+# used in User-Agent header sent
+__version__ = sys.version[:3]
+
+_opener = None
+def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
+    global _opener
+    if _opener is None:
+        _opener = build_opener()
+    return _opener.open(url, data, timeout)
+
+def install_opener(opener):
+    global _opener
+    _opener = opener
+
+# TODO(jhylton): Make this work with the same global opener.
+_urlopener = None
+def urlretrieve(url, filename=None, reporthook=None, data=None):
+    global _urlopener
+    if not _urlopener:
+        _urlopener = FancyURLopener()
+    return _urlopener.retrieve(url, filename, reporthook, data)
+
+def urlcleanup():
+    if _urlopener:
+        _urlopener.cleanup()
+    global _opener
+    if _opener:
+        _opener = None
+
+# copied from cookielib.py
+_cut_port_re = re.compile(r":\d+$")
+def request_host(request):
+    """Return request-host, as defined by RFC 2965.
+
+    Variation from RFC: returned value is lowercased, for convenient
+    comparison.
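+
+    A sketch of the expected result (hostname illustrative):
+
+        request_host(Request("http://WWW.Example.com:80/page"))
+            -> "www.example.com"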
+ + """ + url = request.get_full_url() + host = urllib.parse.urlparse(url)[1] + if host == "": + host = request.get_header("Host", "") + + # remove port, if present + host = _cut_port_re.sub("", host, 1) + return host.lower() + +class Request: + + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False): + # unwrap('<URL:type://host/path>') --> 'type://host/path' + self.__original = urllib.parse.unwrap(url) + self.type = None + # self.__r_type is what's left after doing the splittype + self.host = None + self.port = None + self.data = data + self.headers = {} + for key, value in headers.items(): + self.add_header(key, value) + self.unredirected_hdrs = {} + if origin_req_host is None: + origin_req_host = request_host(self) + self.origin_req_host = origin_req_host + self.unverifiable = unverifiable + + def __getattr__(self, attr): + # XXX this is a fallback mechanism to guard against these + # methods getting called in a non-standard order. this may be + # too complicated and/or unnecessary. + # XXX should the __r_XXX attributes be public? + if attr[:12] == '_Request__r_': + name = attr[12:] + if hasattr(Request, 'get_' + name): + getattr(self, 'get_' + name)() + return getattr(self, attr) + raise AttributeError(attr) + + def get_method(self): + if self.has_data(): + return "POST" + else: + return "GET" + + # XXX these helper methods are lame + + def add_data(self, data): + self.data = data + + def has_data(self): + return self.data is not None + + def get_data(self): + return self.data + + def get_full_url(self): + return self.__original + + def get_type(self): + if self.type is None: + self.type, self.__r_type = urllib.parse.splittype(self.__original) + if self.type is None: + raise ValueError("unknown url type: %s" % self.__original) + return self.type + + def get_host(self): + if self.host is None: + self.host, self.__r_host = urllib.parse.splithost(self.__r_type) + if self.host: + self.host = urllib.parse.unquote(self.host) + return self.host + + def get_selector(self): + return self.__r_host + + def set_proxy(self, host, type): + self.host, self.type = host, type + self.__r_host = self.__original + + def get_origin_req_host(self): + return self.origin_req_host + + def is_unverifiable(self): + return self.unverifiable + + def add_header(self, key, val): + # useful for something like authentication + self.headers[key.capitalize()] = val + + def add_unredirected_header(self, key, val): + # will not be added to a redirected request + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + return (header_name in self.headers or + header_name in self.unredirected_hdrs) + + def get_header(self, header_name, default=None): + return self.headers.get( + header_name, + self.unredirected_hdrs.get(header_name, default)) + + def header_items(self): + hdrs = self.unredirected_hdrs.copy() + hdrs.update(self.headers) + return list(hdrs.items()) + +class OpenerDirector: + def __init__(self): + client_version = "Python-urllib/%s" % __version__ + self.addheaders = [('User-agent', client_version)] + # manage the individual handlers + self.handlers = [] + self.handle_open = {} + self.handle_error = {} + self.process_response = {} + self.process_request = {} + + def add_handler(self, handler): + if not hasattr(handler, "add_parent"): + raise TypeError("expected BaseHandler instance, got %r" % + type(handler)) + + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + 
continue
+
+            i = meth.find("_")
+            protocol = meth[:i]
+            condition = meth[i+1:]
+
+            if condition.startswith("error"):
+                j = condition.find("_") + i + 1
+                kind = meth[j+1:]
+                try:
+                    kind = int(kind)
+                except ValueError:
+                    pass
+                lookup = self.handle_error.get(protocol, {})
+                self.handle_error[protocol] = lookup
+            elif condition == "open":
+                kind = protocol
+                lookup = self.handle_open
+            elif condition == "response":
+                kind = protocol
+                lookup = self.process_response
+            elif condition == "request":
+                kind = protocol
+                lookup = self.process_request
+            else:
+                continue
+
+            handlers = lookup.setdefault(kind, [])
+            if handlers:
+                bisect.insort(handlers, handler)
+            else:
+                handlers.append(handler)
+            added = True
+
+        if added:
+            # the handlers must work in a specific order; the order
+            # is specified by a Handler attribute
+            bisect.insort(self.handlers, handler)
+            handler.add_parent(self)
+
+    def close(self):
+        # Only exists for backwards compatibility.
+        pass
+
+    def _call_chain(self, chain, kind, meth_name, *args):
+        # Handlers raise an exception if no one else should try to handle
+        # the request, or return None if they can't but another handler
+        # could.  Otherwise, they return the response.
+        handlers = chain.get(kind, ())
+        for handler in handlers:
+            func = getattr(handler, meth_name)
+
+            result = func(*args)
+            if result is not None:
+                return result
+
+    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
+        # accept a URL or a Request object
+        if isinstance(fullurl, str):
+            req = Request(fullurl, data)
+        else:
+            req = fullurl
+            if data is not None:
+                req.add_data(data)
+
+        req.timeout = timeout
+        protocol = req.get_type()
+
+        # pre-process request
+        meth_name = protocol+"_request"
+        for processor in self.process_request.get(protocol, []):
+            meth = getattr(processor, meth_name)
+            req = meth(req)
+
+        response = self._open(req, data)
+
+        # post-process response
+        meth_name = protocol+"_response"
+        for processor in self.process_response.get(protocol, []):
+            meth = getattr(processor, meth_name)
+            response = meth(req, response)
+
+        return response
+
+    def _open(self, req, data=None):
+        result = self._call_chain(self.handle_open, 'default',
+                                  'default_open', req)
+        if result:
+            return result
+
+        protocol = req.get_type()
+        result = self._call_chain(self.handle_open, protocol, protocol +
+                                  '_open', req)
+        if result:
+            return result
+
+        return self._call_chain(self.handle_open, 'unknown',
+                                'unknown_open', req)
+
+    def error(self, proto, *args):
+        if proto in ('http', 'https'):
+            # XXX http[s] protocols are special-cased
+            dict = self.handle_error['http'] # https is no different from http
+            proto = args[2]  # YUCK!
+            meth_name = 'http_error_%s' % proto
+            http_err = 1
+            orig_args = args
+        else:
+            dict = self.handle_error
+            meth_name = proto + '_error'
+            http_err = 0
+        args = (dict, proto, meth_name) + args
+        result = self._call_chain(*args)
+        if result:
+            return result
+
+        if http_err:
+            args = (dict, 'default', 'http_error_default') + orig_args
+            return self._call_chain(*args)
+
+# XXX probably also want an abstract factory that knows when it makes
+# sense to skip a superclass in favor of a subclass and when it might
+# make sense to include both
+
+def build_opener(*handlers):
+    """Create an opener object from a list of handlers.
+
+    The opener will use several default handlers, including support
+    for HTTP and FTP.
+
+    If any of the handlers passed as arguments are subclasses of the
+    default handlers, the default handlers will not be used.
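+
+    A minimal sketch of the intended call pattern (proxy and target
+    URLs are illustrative):
+
+        proxy = ProxyHandler({"http": "http://proxy.example.com:3128"})
+        opener = build_opener(proxy, HTTPBasicAuthHandler())
+        f = opener.open("http://www.python.org/")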
+    """
+    def isclass(obj):
+        return isinstance(obj, type) or hasattr(obj, "__bases__")
+
+    opener = OpenerDirector()
+    default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
+                       HTTPDefaultErrorHandler, HTTPRedirectHandler,
+                       FTPHandler, FileHandler, HTTPErrorProcessor]
+    if hasattr(http.client, "HTTPSConnection"):
+        default_classes.append(HTTPSHandler)
+    skip = set()
+    for klass in default_classes:
+        for check in handlers:
+            if isclass(check):
+                if issubclass(check, klass):
+                    skip.add(klass)
+            elif isinstance(check, klass):
+                skip.add(klass)
+    for klass in skip:
+        default_classes.remove(klass)
+
+    for klass in default_classes:
+        opener.add_handler(klass())
+
+    for h in handlers:
+        if isclass(h):
+            h = h()
+        opener.add_handler(h)
+    return opener
+
+class BaseHandler:
+    handler_order = 500
+
+    def add_parent(self, parent):
+        self.parent = parent
+
+    def close(self):
+        # Only exists for backwards compatibility
+        pass
+
+    def __lt__(self, other):
+        if not hasattr(other, "handler_order"):
+            # Try to preserve the old behavior of having custom classes
+            # inserted after default ones (works only for custom user
+            # classes which are not aware of handler_order).
+            return True
+        return self.handler_order < other.handler_order
+
+
+class HTTPErrorProcessor(BaseHandler):
+    """Process HTTP error responses."""
+    handler_order = 1000  # after all other processing
+
+    def http_response(self, request, response):
+        code, msg, hdrs = response.code, response.msg, response.info()
+
+        # According to RFC 2616, "2xx" code indicates that the client's
+        # request was successfully received, understood, and accepted.
+        if not (200 <= code < 300):
+            response = self.parent.error(
+                'http', request, response, code, msg, hdrs)
+
+        return response
+
+    https_response = http_response
+
+class HTTPDefaultErrorHandler(BaseHandler):
+    def http_error_default(self, req, fp, code, msg, hdrs):
+        raise urllib.error.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
+
+class HTTPRedirectHandler(BaseHandler):
+    # maximum number of redirections to any single URL
+    # this is needed because of the state that cookies introduce
+    max_repeats = 4
+    # maximum total number of redirections (regardless of URL) before
+    # assuming we're in a loop
+    max_redirections = 10
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        """Return a Request or None in response to a redirect.
+
+        This is called by the http_error_30x methods when a
+        redirection response is received.  If a redirection should
+        take place, return a new Request to allow http_error_30x to
+        perform the redirect.  Otherwise, raise HTTPError if no-one
+        else should try to handle this url.  Return None if you can't
+        but another Handler might.
+        """
+        m = req.get_method()
+        if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
+            or code in (301, 302, 303) and m == "POST")):
+            raise urllib.error.HTTPError(req.get_full_url(),
+                                         code, msg, headers, fp)
+
+        # Strictly (according to RFC 2616), 301 or 302 in response to
+        # a POST MUST NOT cause a redirection without confirmation
+        # from the user (of urllib.request, in this case).  In practice,
+        # essentially all clients do redirect in this case, so we do
+        # the same.
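+        # (The new Request below carries no body, so a redirected POST
+        # is reissued as a GET -- a sketch of the resulting exchange,
+        # URLs illustrative:
+        #     POST http://example.com/submit   -> 302, Location: /done
+        #     GET  http://example.com/done
+        # )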
+        # be lenient with URIs containing a space
+        newurl = newurl.replace(' ', '%20')
+        CONTENT_HEADERS = ("content-length", "content-type")
+        newheaders = dict((k, v) for k, v in req.headers.items()
+                          if k.lower() not in CONTENT_HEADERS)
+        return Request(newurl,
+                       headers=newheaders,
+                       origin_req_host=req.get_origin_req_host(),
+                       unverifiable=True)
+
+    # Implementation note: To avoid the server sending us into an
+    # infinite loop, the request object needs to track what URLs we
+    # have already seen.  Do this by adding a handler-specific
+    # attribute to the Request object.
+    def http_error_302(self, req, fp, code, msg, headers):
+        # Some servers (incorrectly) return multiple Location headers
+        # (so probably same goes for URI).  Use first header.
+        if "location" in headers:
+            newurl = headers["location"]
+        elif "uri" in headers:
+            newurl = headers["uri"]
+        else:
+            return
+        newurl = urllib.parse.urljoin(req.get_full_url(), newurl)
+
+        # XXX Probably want to forget about the state of the current
+        # request, although that might interact poorly with other
+        # handlers that also use handler-specific request attributes
+        new = self.redirect_request(req, fp, code, msg, headers, newurl)
+        if new is None:
+            return
+
+        # loop detection
+        # .redirect_dict has a key url if url was previously visited.
+        if hasattr(req, 'redirect_dict'):
+            visited = new.redirect_dict = req.redirect_dict
+            if (visited.get(newurl, 0) >= self.max_repeats or
+                len(visited) >= self.max_redirections):
+                raise urllib.error.HTTPError(req.get_full_url(), code,
+                                             self.inf_msg + msg, headers, fp)
+        else:
+            visited = new.redirect_dict = req.redirect_dict = {}
+        visited[newurl] = visited.get(newurl, 0) + 1
+
+        # Don't close the fp until we are sure that we won't use it
+        # with HTTPError.
+        fp.read()
+        fp.close()
+
+        return self.parent.open(new)
+
+    http_error_301 = http_error_303 = http_error_307 = http_error_302
+
+    inf_msg = "The HTTP server returned a redirect error that would " \
+              "lead to an infinite loop.\n" \
+              "The last 30x error message was:\n"
+
+
+def _parse_proxy(proxy):
+    """Return (scheme, user, password, host/port) given a URL or an authority.
+
+    If a URL is supplied, it must have an authority (host:port) component.
+    According to RFC 3986, having an authority component means the URL must
+    have two slashes after the scheme:
+
+    >>> _parse_proxy('file:/ftp.example.com/')
+    Traceback (most recent call last):
+    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
+
+    The first three items of the returned tuple may be None.
+
+    Examples of authority parsing:
+
+    >>> _parse_proxy('proxy.example.com')
+    (None, None, None, 'proxy.example.com')
+    >>> _parse_proxy('proxy.example.com:3128')
+    (None, None, None, 'proxy.example.com:3128')
+
+    The authority component may optionally include userinfo (assumed to be
+    username:password):
+
+    >>> _parse_proxy('joe:password@proxy.example.com')
+    (None, 'joe', 'password', 'proxy.example.com')
+    >>> _parse_proxy('joe:password@proxy.example.com:3128')
+    (None, 'joe', 'password', 'proxy.example.com:3128')
+
+    Same examples, but with URLs instead:
+
+    >>> _parse_proxy('http://proxy.example.com/')
+    ('http', None, None, 'proxy.example.com')
+    >>> _parse_proxy('http://proxy.example.com:3128/')
+    ('http', None, None, 'proxy.example.com:3128')
+    >>> _parse_proxy('http://joe:password@proxy.example.com/')
+    ('http', 'joe', 'password', 'proxy.example.com')
+    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
+    ('http', 'joe', 'password', 'proxy.example.com:3128')
+
+    Everything after the authority is ignored:
+
+    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
+    ('ftp', 'joe', 'password', 'proxy.example.com')
+
+    Test for no trailing '/' case:
+
+    >>> _parse_proxy('http://joe:password@proxy.example.com')
+    ('http', 'joe', 'password', 'proxy.example.com')
+
+    """
+    scheme, r_scheme = urllib.parse.splittype(proxy)
+    if not r_scheme.startswith("/"):
+        # authority
+        scheme = None
+        authority = proxy
+    else:
+        # URL
+        if not r_scheme.startswith("//"):
+            raise ValueError("proxy URL with no authority: %r" % proxy)
+        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
+        # and 3.3.), path is empty or starts with '/'
+        end = r_scheme.find("/", 2)
+        if end == -1:
+            end = None
+        authority = r_scheme[2:end]
+    userinfo, hostport = urllib.parse.splituser(authority)
+    if userinfo is not None:
+        user, password = urllib.parse.splitpasswd(userinfo)
+    else:
+        user = password = None
+    return scheme, user, password, hostport
+
+class ProxyHandler(BaseHandler):
+    # Proxies must be in front
+    handler_order = 100
+
+    def __init__(self, proxies=None):
+        if proxies is None:
+            proxies = getproxies()
+        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
+        self.proxies = proxies
+        for type, url in proxies.items():
+            setattr(self, '%s_open' % type,
+                    lambda r, proxy=url, type=type, meth=self.proxy_open: \
+                        meth(r, proxy, type))
+
+    def proxy_open(self, req, proxy, type):
+        orig_type = req.get_type()
+        proxy_type, user, password, hostport = _parse_proxy(proxy)
+        if proxy_type is None:
+            proxy_type = orig_type
+        if user and password:
+            user_pass = '%s:%s' % (urllib.parse.unquote(user),
+                                   urllib.parse.unquote(password))
+            creds = base64.b64encode(user_pass.encode()).decode("ascii")
+            req.add_header('Proxy-authorization', 'Basic ' + creds)
+        hostport = urllib.parse.unquote(hostport)
+        req.set_proxy(hostport, proxy_type)
+        if orig_type == proxy_type:
+            # let other handlers take care of it
+            return None
+        else:
+            # need to start over, because the other handlers don't
+            # grok the proxy's URL type
+            # e.g.
if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a + return self.parent.open(req) + +class HTTPPasswordMgr: + + def __init__(self): + self.passwd = {} + + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if isinstance(uri, str): + uri = [uri] + if not realm in self.passwd: + self.passwd[realm] = {} + for default_port in True, False: + reduced_uri = tuple( + [self.reduce_uri(u, default_port) for u in uri]) + self.passwd[realm][reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + domains = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uris, authinfo in domains.items(): + for uri in uris: + if self.is_suburi(uri, reduced_authuri): + return authinfo + return None, None + + def reduce_uri(self, uri, default_port=True): + """Accept authority or URI and extract only the authority and path.""" + # note HTTP URLs do not have a userinfo component + parts = urllib.parse.urlsplit(uri) + if parts[1]: + # URI + scheme = parts[0] + authority = parts[1] + path = parts[2] or '/' + else: + # host or host:port + scheme = None + authority = uri + path = '/' + host, port = urllib.parse.splitport(authority) + if default_port and port is None and scheme is not None: + dport = {"http": 80, + "https": 443, + }.get(scheme) + if dport is not None: + authority = "%s:%d" % (host, dport) + return authority, path + + def is_suburi(self, base, test): + """Check if test is below base in a URI tree + + Both args must be URIs in reduced form. + """ + if base == test: + return True + if base[0] != test[0]: + return False + common = posixpath.commonprefix((base[1], test[1])) + if len(common) == len(base[1]): + return True + return False + + +class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): + + def find_user_password(self, realm, authuri): + user, password = HTTPPasswordMgr.find_user_password(self, realm, + authuri) + if user is not None: + return user, password + return HTTPPasswordMgr.find_user_password(self, None, authuri) + + +class AbstractBasicAuthHandler: + + # XXX this allows for multiple auth-schemes, but will stupidly pick + # the last one with a realm specified. + + # allow for double- and single-quoted realm values + # (single quotes are a violation of the RFC, but appear in the wild) + rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' + 'realm=(["\'])(.*?)\\2', re.I) + + # XXX could pre-emptively send auth info already accepted (RFC 2617, + # end of section 2, and section 1.2 immediately after "credentials" + # production). 
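+    # A sketch of the usual wiring (URL and credentials illustrative):
+    #     mgr = HTTPPasswordMgrWithDefaultRealm()
+    #     mgr.add_password(None, "http://example.com/", "joe", "secret")
+    #     opener = build_opener(HTTPBasicAuthHandler(mgr))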
+
+    def __init__(self, password_mgr=None):
+        if password_mgr is None:
+            password_mgr = HTTPPasswordMgr()
+        self.passwd = password_mgr
+        self.add_password = self.passwd.add_password
+
+    def http_error_auth_reqed(self, authreq, host, req, headers):
+        # host may be an authority (without userinfo) or a URL with an
+        # authority
+        # XXX could be multiple headers
+        authreq = headers.get(authreq, None)
+        if authreq:
+            mo = AbstractBasicAuthHandler.rx.search(authreq)
+            if mo:
+                scheme, quote, realm = mo.groups()
+                if scheme.lower() == 'basic':
+                    return self.retry_http_basic_auth(host, req, realm)
+
+    def retry_http_basic_auth(self, host, req, realm):
+        user, pw = self.passwd.find_user_password(realm, host)
+        if pw is not None:
+            raw = "%s:%s" % (user, pw)
+            auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii")
+            if req.headers.get(self.auth_header, None) == auth:
+                return None
+            req.add_header(self.auth_header, auth)
+            return self.parent.open(req)
+        else:
+            return None
+
+
+class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
+
+    auth_header = 'Authorization'
+
+    def http_error_401(self, req, fp, code, msg, headers):
+        url = req.get_full_url()
+        return self.http_error_auth_reqed('www-authenticate',
+                                          url, req, headers)
+
+
+class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
+
+    auth_header = 'Proxy-authorization'
+
+    def http_error_407(self, req, fp, code, msg, headers):
+        # http_error_auth_reqed requires that there is no userinfo component in
+        # authority.  Assume there isn't one, since urllib.request does not
+        # (and should not, RFC 3986 s. 3.2.1) support requests for URLs
+        # containing userinfo.
+        authority = req.get_host()
+        return self.http_error_auth_reqed('proxy-authenticate',
+                                          authority, req, headers)
+
+
+def randombytes(n):
+    """Return n random bytes."""
+    return os.urandom(n)
+
+class AbstractDigestAuthHandler:
+    # Digest authentication is specified in RFC 2617.
+
+    # XXX The client does not inspect the Authentication-Info header
+    # in a successful response.
+
+    # XXX It should be possible to test this implementation against
+    # a mock server that just generates a static set of challenges.
+
+    # XXX qop="auth-int" support is shaky
+
+    def __init__(self, passwd=None):
+        if passwd is None:
+            passwd = HTTPPasswordMgr()
+        self.passwd = passwd
+        self.add_password = self.passwd.add_password
+        self.retried = 0
+        self.nonce_count = 0
+
+    def reset_retry_count(self):
+        self.retried = 0
+
+    def http_error_auth_reqed(self, auth_header, host, req, headers):
+        authreq = headers.get(auth_header, None)
+        if self.retried > 5:
+            # Don't fail endlessly - if we failed once, we'll probably
+            # fail a second time. Hm. Unless the Password Manager is
+            # prompting for the information. Crap.
This isn't great + # but it's better than the current 'repeat until recursion + # depth exceeded' approach <wink> + raise urllib.error.HTTPError(req.get_full_url(), 401, + "digest auth failed", + headers, None) + else: + self.retried += 1 + if authreq: + scheme = authreq.split()[0] + if scheme.lower() == 'digest': + return self.retry_http_digest_auth(req, authreq) + + def retry_http_digest_auth(self, req, auth): + token, challenge = auth.split(' ', 1) + chal = parse_keqv_list(filter(None, parse_http_list(challenge))) + auth = self.get_authorization(req, chal) + if auth: + auth_val = 'Digest %s' % auth + if req.headers.get(self.auth_header, None) == auth_val: + return None + req.add_unredirected_header(self.auth_header, auth_val) + resp = self.parent.open(req) + return resp + + def get_cnonce(self, nonce): + # The cnonce-value is an opaque + # quoted string value provided by the client and used by both client + # and server to avoid chosen plaintext attacks, to provide mutual + # authentication, and to provide some message integrity protection. + # This isn't a fabulous effort, but it's probably Good Enough. + s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) + b = s.encode("ascii") + randombytes(8) + dig = hashlib.sha1(b).hexdigest() + return dig[:16] + + def get_authorization(self, req, chal): + try: + realm = chal['realm'] + nonce = chal['nonce'] + qop = chal.get('qop') + algorithm = chal.get('algorithm', 'MD5') + # mod_digest doesn't send an opaque, even though it isn't + # supposed to be optional + opaque = chal.get('opaque', None) + except KeyError: + return None + + H, KD = self.get_algorithm_impls(algorithm) + if H is None: + return None + + user, pw = self.passwd.find_user_password(realm, req.get_full_url()) + if user is None: + return None + + # XXX not implemented yet + if req.has_data(): + entdig = self.get_entity_digest(req.get_data(), chal) + else: + entdig = None + + A1 = "%s:%s:%s" % (user, realm, pw) + A2 = "%s:%s" % (req.get_method(), + # XXX selector: what about proxies and full urls + req.get_selector()) + if qop == 'auth': + self.nonce_count += 1 + ncvalue = '%08x' % self.nonce_count + cnonce = self.get_cnonce(nonce) + noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) + respdig = KD(H(A1), noncebit) + elif qop is None: + respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) + else: + # XXX handle auth-int. + raise urllib.error.URLError("qop '%s' is not supported." % qop) + + # XXX should the partial digests be encoded too? + + base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ + 'response="%s"' % (user, realm, nonce, req.get_selector(), + respdig) + if opaque: + base += ', opaque="%s"' % opaque + if entdig: + base += ', digest="%s"' % entdig + base += ', algorithm="%s"' % algorithm + if qop: + base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) + return base + + def get_algorithm_impls(self, algorithm): + # lambdas assume digest modules are imported at the top level + if algorithm == 'MD5': + H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() + elif algorithm == 'SHA': + H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() + # XXX MD5-sess + KD = lambda s, d: H("%s:%s" % (s, d)) + return H, KD + + def get_entity_digest(self, data, chal): + # XXX not implemented yet + return None + + +class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + """An authentication protocol defined by RFC 2069 + + Digest authentication improves on basic authentication because it + does not transmit passwords in the clear. 
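+
+    Wiring mirrors the Basic handler (a sketch; realm, URL, and
+    credentials are illustrative):
+
+        mgr = HTTPPasswordMgr()
+        mgr.add_password('realm', 'http://example.com/', 'joe', 'secret')
+        opener = build_opener(HTTPDigestAuthHandler(mgr))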
+
+
+class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
+    """An authentication protocol defined by RFC 2069
+
+    Digest authentication improves on basic authentication because it
+    does not transmit passwords in the clear.
+    """
+
+    auth_header = 'Authorization'
+    handler_order = 490  # before Basic auth
+
+    def http_error_401(self, req, fp, code, msg, headers):
+        host = urllib.parse.urlparse(req.get_full_url())[1]
+        retry = self.http_error_auth_reqed('www-authenticate',
+                                           host, req, headers)
+        self.reset_retry_count()
+        return retry
+
+
+class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
+
+    auth_header = 'Proxy-Authorization'
+    handler_order = 490  # before Basic auth
+
+    def http_error_407(self, req, fp, code, msg, headers):
+        host = req.get_host()
+        retry = self.http_error_auth_reqed('proxy-authenticate',
+                                           host, req, headers)
+        self.reset_retry_count()
+        return retry
+
+class AbstractHTTPHandler(BaseHandler):
+
+    def __init__(self, debuglevel=0):
+        self._debuglevel = debuglevel
+
+    def set_http_debuglevel(self, level):
+        self._debuglevel = level
+
+    def do_request_(self, request):
+        host = request.get_host()
+        if not host:
+            raise urllib.error.URLError('no host given')
+
+        if request.has_data():  # POST
+            data = request.get_data()
+            if not request.has_header('Content-type'):
+                request.add_unredirected_header(
+                    'Content-type',
+                    'application/x-www-form-urlencoded')
+            if not request.has_header('Content-length'):
+                request.add_unredirected_header(
+                    'Content-length', '%d' % len(data))
+
+        scheme, sel = urllib.parse.splittype(request.get_selector())
+        sel_host, sel_path = urllib.parse.splithost(sel)
+        if not request.has_header('Host'):
+            request.add_unredirected_header('Host', sel_host or host)
+        for name, value in self.parent.addheaders:
+            name = name.capitalize()
+            if not request.has_header(name):
+                request.add_unredirected_header(name, value)
+
+        return request
+
+    def do_open(self, http_class, req):
+        """Return an addinfourl object for the request, using http_class.
+
+        http_class must implement the HTTPConnection API from http.client.
+        The addinfourl return value is a file-like object.  It also
+        has methods and attributes including:
+            - info(): return the response headers as an email.message.Message
+            - geturl(): return the original request URL
+            - code: HTTP status code
+        """
+        host = req.get_host()
+        if not host:
+            raise urllib.error.URLError('no host given')
+
+        h = http_class(host, timeout=req.timeout)  # will parse host:port
+        headers = dict(req.headers)
+        headers.update(req.unredirected_hdrs)
+
+        # TODO(jhylton): Should this be redesigned to handle
+        # persistent connections?
+
+        # We want to make an HTTP/1.1 request, but the addinfourl
+        # class isn't prepared to deal with a persistent connection.
+        # It will try to read all remaining data from the socket,
+        # which will block while the server waits for the next request.
+        # So make sure the connection gets closed after the (only)
+        # request.
+        headers["Connection"] = "close"
+        headers = dict(
+            (name.title(), val) for name, val in headers.items())
+        try:
+            h.request(req.get_method(), req.get_selector(), req.data, headers)
+            r = h.getresponse()
+        except socket.error as err:  # XXX what error?
+            raise urllib.error.URLError(err)
+
+        resp = urllib.response.addinfourl(r.fp, r.msg, req.get_full_url())
+        resp.code = r.status
+        resp.msg = r.reason
+        return resp
+
+
+class HTTPHandler(AbstractHTTPHandler):
+
+    def http_open(self, req):
+        return self.do_open(http.client.HTTPConnection, req)
+
+    http_request = AbstractHTTPHandler.do_request_
+
+if hasattr(http.client, 'HTTPSConnection'):
+    class HTTPSHandler(AbstractHTTPHandler):
+
+        def https_open(self, req):
+            return self.do_open(http.client.HTTPSConnection, req)
+
+        https_request = AbstractHTTPHandler.do_request_
+
+class HTTPCookieProcessor(BaseHandler):
+    def __init__(self, cookiejar=None):
+        import http.cookiejar
+        if cookiejar is None:
+            cookiejar = http.cookiejar.CookieJar()
+        self.cookiejar = cookiejar
+
+    def http_request(self, request):
+        self.cookiejar.add_cookie_header(request)
+        return request
+
+    def http_response(self, request, response):
+        self.cookiejar.extract_cookies(response, request)
+        return response
+
+    https_request = http_request
+    https_response = http_response
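HTTPCookieProcessor above is symmetric: cookies go out with each request and are harvested from each response. A short usage sketch (assuming build_opener() is available in this module as it was in urllib2; the URL is illustrative):

    import http.cookiejar
    import urllib.request

    jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(jar))
    response = opener.open('http://www.example.com/')
    # extract_cookies() ran on the response, so the jar is populated:
    for cookie in jar:
        print(cookie.name, cookie.value)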
+
+class UnknownHandler(BaseHandler):
+    def unknown_open(self, req):
+        type = req.get_type()
+        raise urllib.error.URLError('unknown url type: %s' % type)
+
+def parse_keqv_list(l):
+    """Parse list of key=value strings where keys are not duplicated."""
+    parsed = {}
+    for elt in l:
+        k, v = elt.split('=', 1)
+        # guard against empty values before stripping surrounding quotes
+        if len(v) > 1 and v[0] == v[-1] == '"':
+            v = v[1:-1]
+        parsed[k] = v
+    return parsed
+
+def parse_http_list(s):
+    """Parse lists as described by RFC 2068 Section 2.
+
+    In particular, parse comma-separated lists where the elements of
+    the list may include quoted-strings.  A quoted-string could
+    contain a comma.  A non-quoted string could have quotes in the
+    middle.  Neither commas nor quotes count if they are escaped.
+
+    Only double-quotes count, not single-quotes.
+    """
+    res = []
+    part = ''
+
+    escape = quote = False
+    for cur in s:
+        if escape:
+            part += cur
+            escape = False
+            continue
+        if quote:
+            if cur == '\\':
+                escape = True
+                continue
+            elif cur == '"':
+                quote = False
+            part += cur
+            continue
+
+        if cur == ',':
+            res.append(part)
+            part = ''
+            continue
+
+        if cur == '"':
+            quote = True
+
+        part += cur
+
+    # append last part
+    if part:
+        res.append(part)
+
+    return [part.strip() for part in res]
+
+class FileHandler(BaseHandler):
+    # Use local file or FTP depending on form of URL
+    def file_open(self, req):
+        url = req.get_selector()
+        if url[:2] == '//' and url[2:3] != '/':
+            req.type = 'ftp'
+            return self.parent.open(req)
+        else:
+            return self.open_local_file(req)
+
+    # names for the localhost
+    names = None
+    def get_names(self):
+        if FileHandler.names is None:
+            try:
+                FileHandler.names = (socket.gethostbyname('localhost'),
+                                     socket.gethostbyname(socket.gethostname()))
+            except socket.gaierror:
+                FileHandler.names = (socket.gethostbyname('localhost'),)
+        return FileHandler.names
+
+    # not entirely sure what the rules are here
+    def open_local_file(self, req):
+        import email.utils
+        import mimetypes
+        host = req.get_host()
+        file = req.get_selector()
+        localfile = url2pathname(file)
+        try:
+            stats = os.stat(localfile)
+            size = stats.st_size
+            modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
+            mtype = mimetypes.guess_type(file)[0]
+            headers = email.message_from_string(
+                'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
+                (mtype or 'text/plain', size, modified))
+            if host:
+                host, port = urllib.parse.splitport(host)
+            if not host or \
+               (not port and _safe_gethostbyname(host) in self.get_names()):
+                return urllib.response.addinfourl(open(localfile, 'rb'),
+                                                  headers, 'file:'+file)
+        except OSError as msg:
+            # urllib2 users shouldn't expect OSErrors coming from urlopen()
+            raise urllib.error.URLError(msg)
+        raise urllib.error.URLError('file not on local host')
+
+def _safe_gethostbyname(host):
+    try:
+        return socket.gethostbyname(host)
+    except socket.gaierror:
+        return None
+
+class FTPHandler(BaseHandler):
+    def ftp_open(self, req):
+        import ftplib
+        import mimetypes
+        host = req.get_host()
+        if not host:
+            raise urllib.error.URLError('ftp error: no host given')
+        host, port = urllib.parse.splitport(host)
+        if port is None:
+            port = ftplib.FTP_PORT
+        else:
+            port = int(port)
+
+        # username/password handling
+        user, host = urllib.parse.splituser(host)
+        if user:
+            user, passwd = urllib.parse.splitpasswd(user)
+        else:
+            passwd = None
+        host = urllib.parse.unquote(host)
+        user = urllib.parse.unquote(user or '')
+        passwd = urllib.parse.unquote(passwd or '')
+
+        try:
+            host = socket.gethostbyname(host)
+        except socket.error as msg:
+            raise urllib.error.URLError(msg)
+        path, attrs = urllib.parse.splitattr(req.get_selector())
+        dirs = path.split('/')
+        dirs = list(map(urllib.parse.unquote, dirs))
+        dirs, file = dirs[:-1], dirs[-1]
+        if dirs and not dirs[0]:
+            dirs = dirs[1:]
+        try:
+            fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
+            type = file and 'I' or 'D'
+            for attr in attrs:
+                attr, value = urllib.parse.splitvalue(attr)
+                if attr.lower() == 'type' and \
+                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
+                    type = value.upper()
+            fp, retrlen = fw.retrfile(file, type)
+            headers = ""
+            mtype = mimetypes.guess_type(req.get_full_url())[0]
+            if mtype:
+                headers += "Content-type: %s\n" % mtype
+            if retrlen is not None and retrlen >= 0:
+                headers += "Content-length: %d\n" % retrlen
+            headers = email.message_from_string(headers)
+            return urllib.response.addinfourl(fp, headers, req.get_full_url())
+        except ftplib.all_errors as msg:
+            exc = urllib.error.URLError('ftp error: %s' % msg)
+            raise exc.with_traceback(sys.exc_info()[2])
+
+    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
+        fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
+        return fw
+
+class CacheFTPHandler(FTPHandler):
+    # XXX would be nice to have pluggable cache strategies
+    # XXX this stuff is definitely not thread safe
+    def __init__(self):
+        self.cache = {}
+        self.timeout = {}
+        self.soonest = 0
+        self.delay = 60
+        self.max_conns = 16
+
+    def setTimeout(self, t):
+        self.delay = t
+
+    def setMaxConns(self, m):
+        self.max_conns = m
+
+    def connect_ftp(self, user, passwd, host, port, dirs, timeout):
+        key = user, host, port, '/'.join(dirs), timeout
+        if key in self.cache:
+            self.timeout[key] = time.time() + self.delay
+        else:
+            self.cache[key] = ftpwrapper(user, passwd, host, port,
+                                         dirs, timeout)
+            self.timeout[key] = time.time() + self.delay
+        self.check_cache()
+        return self.cache[key]
+
+    def check_cache(self):
+        # first check for old ones
+        t = time.time()
+        if self.soonest <= t:
+            for k, v in list(self.timeout.items()):
+                if v < t:
+                    self.cache[k].close()
+                    del self.cache[k]
+                    del self.timeout[k]
+        self.soonest = min(list(self.timeout.values()))
+
+        # then check the size
+        if len(self.cache) == self.max_conns:
+            for k, v in list(self.timeout.items()):
+                if v == self.soonest:
+                    del self.cache[k]
+                    del self.timeout[k]
+                    break
+            self.soonest = min(list(self.timeout.values()))
+
+# Code moved from the old urllib module
+
+MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
+
+# Helper for non-unix systems
+if os.name == 'mac':
+    from macurl2path import url2pathname, pathname2url
+elif os.name == 'nt':
+    from nturl2path import url2pathname, pathname2url
+else:
+    def url2pathname(pathname):
+        """OS-specific conversion from a relative URL of the 'file' scheme
+        to a file system path; not recommended for general use."""
+        return urllib.parse.unquote(pathname)
+
+    def pathname2url(pathname):
+        """OS-specific conversion from a file system path to a relative URL
+        of the 'file' scheme; not recommended for general use."""
+        return urllib.parse.quote(pathname)
+
+# This really consists of two pieces:
+# (1) a class which handles opening of all sorts of URLs
+#     (plus assorted utilities etc.)
+# (2) a set of functions for parsing URLs
+# XXX Should these be separated out into different modules?
+
+
+ftpcache = {}
+class URLopener:
+    """Class to open URLs.
+    This is a class rather than just a subroutine because we may need
+    more than one set of global protocol-specific options.
+    Note -- this is a base class for those who don't want the
+    automatic handling of errors type 302 (relocated) and 401
+    (authorization needed)."""
+
+    __tempfiles = None
+
+    version = "Python-urllib/%s" % __version__
+
+    # Constructor
+    def __init__(self, proxies=None, **x509):
+        if proxies is None:
+            proxies = getproxies()
+        assert hasattr(proxies, 'keys'), "proxies must be a mapping"
+        self.proxies = proxies
+        self.key_file = x509.get('key_file')
+        self.cert_file = x509.get('cert_file')
+        self.addheaders = [('User-Agent', self.version)]
+        self.__tempfiles = []
+        self.__unlink = os.unlink  # See cleanup()
+        self.tempcache = None
+        # Undocumented feature: if you assign {} to tempcache,
+        # it is used to cache files retrieved with
+        # self.retrieve().  This is not enabled by default
+        # since it does not work for changing documents (and I
+        # haven't got the logic to check expiration headers
+        # yet).
+        self.ftpcache = ftpcache
+        # Undocumented feature: you can use a different
+        # ftp cache by assigning to the .ftpcache member;
+        # in case you want logically independent URL openers
+        # XXX This is not threadsafe.  Bah.
+
+    def __del__(self):
+        self.close()
+
+    def close(self):
+        self.cleanup()
+
+    def cleanup(self):
+        # This code sometimes runs when the rest of this module
+        # has already been deleted, so it can't use any globals
+        # or import anything.
+        if self.__tempfiles:
+            for file in self.__tempfiles:
+                try:
+                    self.__unlink(file)
+                except OSError:
+                    pass
+            del self.__tempfiles[:]
+        if self.tempcache:
+            self.tempcache.clear()
+
+    def addheader(self, *args):
+        """Add a header to be used by the HTTP interface only
+        e.g. u.addheader('Accept', 'sound/basic')"""
+        self.addheaders.append(args)
+
+    # External interface
+    def open(self, fullurl, data=None):
+        """Use URLopener().open(file) instead of open(file, 'r')."""
+        fullurl = urllib.parse.unwrap(urllib.parse.toBytes(fullurl))
+        if self.tempcache and fullurl in self.tempcache:
+            filename, headers = self.tempcache[fullurl]
+            fp = open(filename, 'rb')
+            return urllib.response.addinfourl(fp, headers, fullurl)
+        urltype, url = urllib.parse.splittype(fullurl)
+        if not urltype:
+            urltype = 'file'
+        if urltype in self.proxies:
+            proxy = self.proxies[urltype]
+            urltype, proxyhost = urllib.parse.splittype(proxy)
+            host, selector = urllib.parse.splithost(proxyhost)
+            url = (host, fullurl)  # Signal special case to open_*()
+        else:
+            proxy = None
+        name = 'open_' + urltype
+        self.type = urltype
+        name = name.replace('-', '_')
+        if not hasattr(self, name):
+            if proxy:
+                return self.open_unknown_proxy(proxy, fullurl, data)
+            else:
+                return self.open_unknown(fullurl, data)
+        try:
+            if data is None:
+                return getattr(self, name)(url)
+            else:
+                return getattr(self, name)(url, data)
+        except socket.error as msg:
+            raise IOError('socket error', msg).with_traceback(sys.exc_info()[2])
+
+    def open_unknown(self, fullurl, data=None):
+        """Overridable interface to open unknown URL type."""
+        type, url = urllib.parse.splittype(fullurl)
+        raise IOError('url error', 'unknown url type', type)
+
+    def open_unknown_proxy(self, proxy, fullurl, data=None):
+        """Overridable interface to open unknown URL type."""
+        type, url = urllib.parse.splittype(fullurl)
+        raise IOError('url error', 'invalid proxy for %s' % type, proxy)
+
+    # External interface
+    def retrieve(self, url, filename=None, reporthook=None, data=None):
+        """retrieve(url) returns (filename, headers) for a local object
+        or (tempfilename, headers) for a remote object."""
+        url = urllib.parse.unwrap(urllib.parse.toBytes(url))
+        if self.tempcache and url in self.tempcache:
+            return self.tempcache[url]
+        type, url1 = urllib.parse.splittype(url)
+        if filename is None and (not type or type == 'file'):
+            try:
+                fp = self.open_local_file(url1)
+                hdrs = fp.info()
+                del fp
+                return url2pathname(urllib.parse.splithost(url1)[1]), hdrs
+            except IOError as msg:
+                pass
+        fp = self.open(url, data)
+        headers = fp.info()
+        if filename:
+            tfp = open(filename, 'wb')
+        else:
+            import tempfile
+            garbage, path = urllib.parse.splittype(url)
+            garbage, path = urllib.parse.splithost(path or "")
+            path, garbage = urllib.parse.splitquery(path or "")
+            path, garbage = urllib.parse.splitattr(path or "")
+            suffix = os.path.splitext(path)[1]
+            (fd, filename) = tempfile.mkstemp(suffix)
+            self.__tempfiles.append(filename)
+            tfp = os.fdopen(fd, 'wb')
+        result = filename, headers
+        if self.tempcache is not None:
+            self.tempcache[url] = result
+        bs = 1024*8
+        size = -1
+        read = 0
+        blocknum = 0
+        if reporthook:
+            if "content-length" in headers:
+                size = int(headers["Content-Length"])
+            reporthook(blocknum, bs, size)
+        while 1:
+            block = fp.read(bs)
+            if not block:
+                break
+            read += len(block)
+            tfp.write(block)
+            blocknum += 1
+            if reporthook:
+                reporthook(blocknum, bs, size)
+        fp.close()
+        tfp.close()
+        del fp
+        del tfp
+
+        # raise exception if actual size does not match content-length header
+        if size >= 0 and read < size:
+            raise urllib.error.ContentTooShortError(
+                "retrieval incomplete: got only %i out of %i bytes"
+                % (read, size), result)
+
+        return result
+
+    # Each method named open_<type> knows how to open that type of URL
+
+    def _open_generic_http(self, connection_factory, url, data):
+        """Make an HTTP connection using connection_class.
+
+        This is an internal method that should be called from
+        open_http() or open_https().
+
+        Arguments:
+        - connection_factory should take a host name and return an
+          HTTPConnection instance.
+        - url is the url to retrieval or a host, relative-path pair.
+        - data is payload for a POST request or None.
+        """
+
+        user_passwd = None
+        proxy_passwd = None
+        if isinstance(url, str):
+            host, selector = urllib.parse.splithost(url)
+            if host:
+                user_passwd, host = urllib.parse.splituser(host)
+                host = urllib.parse.unquote(host)
+            realhost = host
+        else:
+            host, selector = url
+            # check whether the proxy contains authorization information
+            proxy_passwd, host = urllib.parse.splituser(host)
+            # now we proceed with the url we want to obtain
+            urltype, rest = urllib.parse.splittype(selector)
+            url = rest
+            user_passwd = None
+            if urltype.lower() != 'http':
+                realhost = None
+            else:
+                realhost, rest = urllib.parse.splithost(rest)
+                if realhost:
+                    user_passwd, realhost = urllib.parse.splituser(realhost)
+                if user_passwd:
+                    selector = "%s://%s%s" % (urltype, realhost, rest)
+                if proxy_bypass(realhost):
+                    host = realhost
+
+        #print "proxy via http:", host, selector
+        if not host: raise IOError('http error', 'no host given')
+
+        if proxy_passwd:
+            import base64
+            # b64encode works on bytes; decode back to a str for the header
+            proxy_auth = base64.b64encode(
+                proxy_passwd.encode()).decode("ascii").strip()
+        else:
+            proxy_auth = None
+
+        if user_passwd:
+            import base64
+            auth = base64.b64encode(user_passwd.encode()).decode("ascii").strip()
+        else:
+            auth = None
+        http_conn = connection_factory(host)
+        # XXX We should fix urllib so that it works with HTTP/1.1.
+        http_conn._http_vsn = 10
+        http_conn._http_vsn_str = "HTTP/1.0"
+
+        headers = {}
+        if proxy_auth:
+            headers["Proxy-Authorization"] = "Basic %s" % proxy_auth
+        if auth:
+            headers["Authorization"] = "Basic %s" % auth
+        if realhost:
+            headers["Host"] = realhost
+        for header, value in self.addheaders:
+            headers[header] = value
+
+        if data is not None:
+            headers["Content-Type"] = "application/x-www-form-urlencoded"
+            http_conn.request("POST", selector, data, headers)
+        else:
+            http_conn.request("GET", selector, headers=headers)
+
+        try:
+            response = http_conn.getresponse()
+        except http.client.BadStatusLine:
+            # something went wrong with the HTTP status line
+            raise urllib.error.URLError("http protocol error: bad status line")
+
+        # According to RFC 2616, "2xx" code indicates that the client's
+        # request was successfully received, understood, and accepted.
+        if 200 <= response.status < 300:
+            return urllib.response.addinfourl(response.fp, response.msg,
+                                              "http:" + url,
+                                              response.status)
+        else:
+            return self.http_error(
+                url, response.fp,
+                response.status, response.reason, response.msg, data)
+
+    def open_http(self, url, data=None):
+        """Use HTTP protocol."""
+        return self._open_generic_http(http.client.HTTPConnection, url, data)
+
+    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
+        """Handle http errors.
+
+        Derived class can override this, or provide specific handlers
+        named http_error_DDD where DDD is the 3-digit error code."""
+        # First check if there's a specific handler for this error
+        name = 'http_error_%d' % errcode
+        if hasattr(self, name):
+            method = getattr(self, name)
+            if data is None:
+                result = method(url, fp, errcode, errmsg, headers)
+            else:
+                result = method(url, fp, errcode, errmsg, headers, data)
+            if result: return result
+        return self.http_error_default(url, fp, errcode, errmsg, headers)
+
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        """Default error handler: close the connection and raise IOError."""
+        void = fp.read()
+        fp.close()
+        raise urllib.error.HTTPError(url, errcode, errmsg, headers, None)
+
+    if _have_ssl:
+        def _https_connection(self, host):
+            return http.client.HTTPSConnection(host,
+                                               key_file=self.key_file,
+                                               cert_file=self.cert_file)
+
+        def open_https(self, url, data=None):
+            """Use HTTPS protocol."""
+            return self._open_generic_http(self._https_connection, url, data)
+
+    def open_file(self, url):
+        """Use local file or FTP depending on form of URL."""
+        if not isinstance(url, str):
+            raise urllib.error.URLError('file error: proxy support for '
+                                        'file protocol currently not implemented')
+        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
+            return self.open_ftp(url)
+        else:
+            return self.open_local_file(url)
+
+    def open_local_file(self, url):
+        """Use local file."""
+        import mimetypes, email.utils
+        from io import StringIO
+        host, file = urllib.parse.splithost(url)
+        localname = url2pathname(file)
+        try:
+            stats = os.stat(localname)
+        except OSError as e:
+            raise urllib.error.URLError(e)
+        size = stats.st_size
+        modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
+        mtype = mimetypes.guess_type(url)[0]
+        headers = email.message_from_string(
+            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
+            (mtype or 'text/plain', size, modified))
+        if not host:
+            urlfile = file
+            if file[:1] == '/':
+                urlfile = 'file://' + file
+            return urllib.response.addinfourl(open(localname, 'rb'),
+                                              headers, urlfile)
+        host, port = urllib.parse.splitport(host)
+        if (not port
+            and socket.gethostbyname(host) in (localhost(), thishost())):
+            urlfile = file
+            if file[:1] == '/':
+                urlfile = 'file://' + file
+            return urllib.response.addinfourl(open(localname, 'rb'),
+                                              headers, urlfile)
+        raise urllib.error.URLError('local file error: not on local host')
+
+    def open_ftp(self, url):
+        """Use FTP protocol."""
+        if not isinstance(url, str):
+            raise urllib.error.URLError('ftp error: proxy support for '
+                                        'ftp protocol currently not implemented')
+        import mimetypes
+        from io import StringIO
+        host, path = urllib.parse.splithost(url)
+        if not host: raise urllib.error.URLError('ftp error: no host given')
+        host, port = urllib.parse.splitport(host)
+        user, host = urllib.parse.splituser(host)
+        if user: user, passwd = urllib.parse.splitpasswd(user)
+        else: passwd = None
+        host = urllib.parse.unquote(host)
+        user = urllib.parse.unquote(user or '')
+        passwd = urllib.parse.unquote(passwd or '')
+        host = socket.gethostbyname(host)
+        if not port:
+            import ftplib
+            port = ftplib.FTP_PORT
+        else:
+            port = int(port)
+        path, attrs = urllib.parse.splitattr(path)
+        path = urllib.parse.unquote(path)
+        dirs = path.split('/')
+        dirs, file = dirs[:-1], dirs[-1]
+        if dirs and not dirs[0]: dirs = dirs[1:]
+        if dirs and not dirs[0]: dirs[0] = '/'
+        key = user, host, port, '/'.join(dirs)
+        # XXX thread unsafe!
+        if len(self.ftpcache) > MAXFTPCACHE:
+            # Prune the cache, rather arbitrarily
+            for k in self.ftpcache.keys():
+                if k != key:
+                    v = self.ftpcache[k]
+                    del self.ftpcache[k]
+                    v.close()
+        try:
+            if not key in self.ftpcache:
+                self.ftpcache[key] = \
+                    ftpwrapper(user, passwd, host, port, dirs)
+            if not file: type = 'D'
+            else: type = 'I'
+            for attr in attrs:
+                attr, value = urllib.parse.splitvalue(attr)
+                if attr.lower() == 'type' and \
+                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
+                    type = value.upper()
+            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
+            mtype = mimetypes.guess_type("ftp:" + url)[0]
+            headers = ""
+            if mtype:
+                headers += "Content-Type: %s\n" % mtype
+            if retrlen is not None and retrlen >= 0:
+                headers += "Content-Length: %d\n" % retrlen
+            headers = email.message_from_string(headers)
+            return urllib.response.addinfourl(fp, headers, "ftp:" + url)
+        except ftperrors() as msg:
+            raise urllib.error.URLError('ftp error: %s' % msg).with_traceback(
+                sys.exc_info()[2])
+
+    def open_data(self, url, data=None):
+        """Use "data" URL."""
+        if not isinstance(url, str):
+            raise urllib.error.URLError('data error: proxy support for '
+                                        'data protocol currently not implemented')
+        from io import StringIO
+        # ignore POSTed data
+        #
+        # syntax of data URLs:
+        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
+        # mediatype := [ type "/" subtype ] *( ";" parameter )
+        # data      := *urlchar
+        # parameter := attribute "=" value
+        try:
+            [type, data] = url.split(',', 1)
+        except ValueError:
+            raise IOError('data error', 'bad data URL')
+        if not type:
+            type = 'text/plain;charset=US-ASCII'
+        semi = type.rfind(';')
+        if semi >= 0 and '=' not in type[semi:]:
+            encoding = type[semi+1:]
+            type = type[:semi]
+        else:
+            encoding = ''
+        msg = []
+        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
+                                            time.gmtime(time.time())))
+        msg.append('Content-type: %s' % type)
+        if encoding == 'base64':
+            import base64
+            data = base64.decodestring(data)
+        else:
+            data = urllib.parse.unquote(data)
+        msg.append('Content-Length: %d' % len(data))
+        msg.append('')
+        msg.append(data)
+        msg = '\n'.join(msg)
+        # email.message_from_string() replaces the old mimetools parser;
+        # expose only the decoded body through the file-like object
+        headers = email.message_from_string(msg)
+        f = StringIO(headers.get_payload())
+        #f.fileno = None     # needed for addinfourl
+        return urllib.response.addinfourl(f, headers, url)
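The legacy interface above is driven through open() and retrieve(). A short sketch (FancyURLopener is the subclass defined just below; the URL is illustrative, and retrieve() writes to a named temporary file when no filename is given):

    opener = FancyURLopener()
    f = opener.open('http://www.example.com/')
    print(f.geturl())
    print(f.info())        # the parsed response headers

    filename, headers = opener.retrieve('http://www.example.com/')
    print('saved to', filename)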
+
+
+class FancyURLopener(URLopener):
+    """Derived class with handlers for errors we can handle (perhaps)."""
+
+    def __init__(self, *args, **kwargs):
+        URLopener.__init__(self, *args, **kwargs)
+        self.auth_cache = {}
+        self.tries = 0
+        self.maxtries = 10
+
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        """Default error handling -- don't raise an exception."""
+        return urllib.response.addinfourl(fp, headers, "http:" + url, errcode)
+
+    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+        """Error 302 -- relocated (temporarily)."""
+        self.tries += 1
+        if self.maxtries and self.tries >= self.maxtries:
+            if hasattr(self, "http_error_500"):
+                meth = self.http_error_500
+            else:
+                meth = self.http_error_default
+            self.tries = 0
+            return meth(url, fp, 500,
+                        "Internal Server Error: Redirect Recursion", headers)
+        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
+                                        data)
+        self.tries = 0
+        return result
+
+    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
+        if 'location' in headers:
+            newurl = headers['location']
+        elif 'uri' in headers:
+            newurl = headers['uri']
+        else:
+            return
+        void = fp.read()
+        fp.close()
+        # In case the server sent a relative URL, join with original:
+        # (urljoin is the urllib.parse name for the old urllib basejoin)
+        newurl = urllib.parse.urljoin(self.type + ":" + url, newurl)
+        return self.open(newurl)
+
+    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
+        """Error 301 -- also relocated (permanently)."""
+        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
+
+    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
+        """Error 303 -- also relocated (essentially identical to 302)."""
+        return self.http_error_302(url, fp, errcode, errmsg, headers, data)
+
+    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
+        """Error 307 -- relocated, but turn POST into error."""
+        if data is None:
+            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
+        else:
+            return self.http_error_default(url, fp, errcode, errmsg, headers)
+
+    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
+        """Error 401 -- authentication required.
+        This function supports Basic authentication only."""
+        if not 'www-authenticate' in headers:
+            URLopener.http_error_default(self, url, fp,
+                                         errcode, errmsg, headers)
+        stuff = headers['www-authenticate']
+        import re
+        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
+        if not match:
+            URLopener.http_error_default(self, url, fp,
+                                         errcode, errmsg, headers)
+        scheme, realm = match.groups()
+        if scheme.lower() != 'basic':
+            URLopener.http_error_default(self, url, fp,
+                                         errcode, errmsg, headers)
+        name = 'retry_' + self.type + '_basic_auth'
+        if data is None:
+            return getattr(self,name)(url, realm)
+        else:
+            return getattr(self,name)(url, realm, data)
+
+    def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
+        """Error 407 -- proxy authentication required.
+        This function supports Basic authentication only."""
+        if not 'proxy-authenticate' in headers:
+            URLopener.http_error_default(self, url, fp,
+                                         errcode, errmsg, headers)
+        stuff = headers['proxy-authenticate']
+        import re
+        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
+        if not match:
+            URLopener.http_error_default(self, url, fp,
+                                         errcode, errmsg, headers)
+        scheme, realm = match.groups()
+        if scheme.lower() != 'basic':
+            URLopener.http_error_default(self, url, fp,
+                                         errcode, errmsg, headers)
+        name = 'retry_proxy_' + self.type + '_basic_auth'
+        if data is None:
+            return getattr(self,name)(url, realm)
+        else:
+            return getattr(self,name)(url, realm, data)
+
+    def retry_proxy_http_basic_auth(self, url, realm, data=None):
+        host, selector = urllib.parse.splithost(url)
+        newurl = 'http://' + host + selector
+        proxy = self.proxies['http']
+        urltype, proxyhost = urllib.parse.splittype(proxy)
+        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
+        i = proxyhost.find('@') + 1
+        proxyhost = proxyhost[i:]
+        user, passwd = self.get_user_passwd(proxyhost, realm, i)
+        if not (user or passwd): return None
+        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
+                                  urllib.parse.quote(passwd, safe=''),
+                                  proxyhost)
+        self.proxies['http'] = 'http://' + proxyhost + proxyselector
+        if data is None:
+            return self.open(newurl)
+        else:
+            return self.open(newurl, data)
+
+    def retry_proxy_https_basic_auth(self, url, realm, data=None):
+        host, selector = urllib.parse.splithost(url)
+        newurl = 'https://' + host + selector
+        proxy = self.proxies['https']
+        urltype, proxyhost = urllib.parse.splittype(proxy)
+        proxyhost, proxyselector = urllib.parse.splithost(proxyhost)
+        i = proxyhost.find('@') + 1
+        proxyhost = proxyhost[i:]
+        user, passwd = self.get_user_passwd(proxyhost, realm, i)
+        if not (user or passwd): return None
+        proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
+                                  urllib.parse.quote(passwd, safe=''),
+                                  proxyhost)
+        self.proxies['https'] = 'https://' + proxyhost + proxyselector
+        if data is None:
+            return self.open(newurl)
+        else:
+            return self.open(newurl, data)
+
+    def retry_http_basic_auth(self, url, realm, data=None):
+        host, selector = urllib.parse.splithost(url)
+        i = host.find('@') + 1
+        host = host[i:]
+        user, passwd = self.get_user_passwd(host, realm, i)
+        if not (user or passwd): return None
+        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
+                             urllib.parse.quote(passwd, safe=''), host)
+        newurl = 'http://' + host + selector
+        if data is None:
+            return self.open(newurl)
+        else:
+            return self.open(newurl, data)
+
+    def retry_https_basic_auth(self, url, realm, data=None):
+        host, selector = urllib.parse.splithost(url)
+        i = host.find('@') + 1
+        host = host[i:]
+        user, passwd = self.get_user_passwd(host, realm, i)
+        if not (user or passwd): return None
+        host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''),
+                             urllib.parse.quote(passwd, safe=''), host)
+        newurl = 'https://' + host + selector
+        if data is None:
+            return self.open(newurl)
+        else:
+            return self.open(newurl, data)
+
+    def get_user_passwd(self, host, realm, clear_cache=0):
+        key = realm + '@' + host.lower()
+        if key in self.auth_cache:
+            if clear_cache:
+                del self.auth_cache[key]
+            else:
+                return self.auth_cache[key]
+        user, passwd = self.prompt_user_passwd(host, realm)
+        if user or passwd: self.auth_cache[key] = (user, passwd)
+        return user, passwd
+
+    def prompt_user_passwd(self, host, realm):
+        """Override this in a GUI environment!"""
+        import getpass
+        try:
+            user = input("Enter username for %s at %s: " % (realm, host))
+            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
+                                     (user, realm, host))
+            return user, passwd
+        except KeyboardInterrupt:
+            print()
+            return None, None
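prompt_user_passwd() above is the documented hook for environments without a tty. A minimal override sketch (the host, realm, and canned credentials are placeholders):

    class NonInteractiveOpener(FancyURLopener):
        """Supply credentials on 401/407 without prompting."""
        def prompt_user_passwd(self, host, realm):
            if (host, realm) == ('www.example.com:8092', 'staging'):
                return 'klem', 'geheim$parole'
            # same "give up" convention as the interactive version
            return None, None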
+
+
+# Utility functions
+
+_localhost = None
+def localhost():
+    """Return the IP address of the magic hostname 'localhost'."""
+    global _localhost
+    if _localhost is None:
+        _localhost = socket.gethostbyname('localhost')
+    return _localhost
+
+_thishost = None
+def thishost():
+    """Return the IP address of the current host."""
+    global _thishost
+    if _thishost is None:
+        _thishost = socket.gethostbyname(socket.gethostname())
+    return _thishost
+
+_ftperrors = None
+def ftperrors():
+    """Return the set of errors raised by the FTP class."""
+    global _ftperrors
+    if _ftperrors is None:
+        import ftplib
+        _ftperrors = ftplib.all_errors
+    return _ftperrors
+
+_noheaders = None
+def noheaders():
+    """Return an empty email.message.Message object."""
+    global _noheaders
+    if _noheaders is None:
+        # mimetools is gone; build the empty header block with email
+        _noheaders = email.message_from_string("")
+    return _noheaders
+
+
+# Utility classes
+
+class ftpwrapper:
+    """Class used by open_ftp() for cache of open FTP connections."""
+
+    def __init__(self, user, passwd, host, port, dirs, timeout=None):
+        self.user = user
+        self.passwd = passwd
+        self.host = host
+        self.port = port
+        self.dirs = dirs
+        self.timeout = timeout
+        self.init()
+
+    def init(self):
+        import ftplib
+        self.busy = 0
+        self.ftp = ftplib.FTP()
+        self.ftp.connect(self.host, self.port, self.timeout)
+        self.ftp.login(self.user, self.passwd)
+        for dir in self.dirs:
+            self.ftp.cwd(dir)
+
+    def retrfile(self, file, type):
+        import ftplib
+        self.endtransfer()
+        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
+        else: cmd = 'TYPE ' + type; isdir = 0
+        try:
+            self.ftp.voidcmd(cmd)
+        except ftplib.all_errors:
+            self.init()
+            self.ftp.voidcmd(cmd)
+        conn = None
+        if file and not isdir:
+            # Try to retrieve as a file
+            try:
+                cmd = 'RETR ' + file
+                conn = self.ftp.ntransfercmd(cmd)
+            except ftplib.error_perm as reason:
+                if str(reason)[:3] != '550':
+                    raise urllib.error.URLError(
+                        'ftp error: %s' % reason).with_traceback(
+                        sys.exc_info()[2])
+        if not conn:
+            # Set transfer mode to ASCII!
+            self.ftp.voidcmd('TYPE A')
+            # Try a directory listing. Verify that directory exists.
+            if file:
+                pwd = self.ftp.pwd()
+                try:
+                    try:
+                        self.ftp.cwd(file)
+                    except ftplib.error_perm as reason:
+                        raise urllib.error.URLError(
+                            'ftp error: %s' % reason) from reason
+                finally:
+                    self.ftp.cwd(pwd)
+                cmd = 'LIST ' + file
+            else:
+                cmd = 'LIST'
+            conn = self.ftp.ntransfercmd(cmd)
+        self.busy = 1
+        # Pass back both a suitably decorated object and a retrieval length
+        return (urllib.response.addclosehook(conn[0].makefile('rb'),
+                                             self.endtransfer), conn[1])
+
+    def endtransfer(self):
+        if not self.busy:
+            return
+        self.busy = 0
+        try:
+            self.ftp.voidresp()
+        except ftperrors():
+            pass
+
+    def close(self):
+        self.endtransfer()
+        try:
+            self.ftp.close()
+        except ftperrors():
+            pass
+
+# Proxy handling
+def getproxies_environment():
+    """Return a dictionary of scheme -> proxy server URL mappings.
+
+    Scan the environment for variables named <scheme>_proxy;
+    this seems to be the standard convention.  If you need a
+    different way, you can pass a proxies dictionary to the
+    [Fancy]URLopener constructor.
+
+    """
+    proxies = {}
+    for name, value in os.environ.items():
+        name = name.lower()
+        if name == 'no_proxy':
+            # handled in proxy_bypass_environment
+            continue
+        if value and name[-6:] == '_proxy':
+            proxies[name[:-6]] = value
+    return proxies
+
+def proxy_bypass_environment(host):
+    """Test if proxies should not be used for a particular host.
+
+    Checks the environment for a variable named no_proxy, which should
+    be a list of DNS suffixes separated by commas, or '*' for all hosts.
+    """
+    no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '')
+    # '*' is special case for always bypass
+    if no_proxy == '*':
+        return 1
+    # strip port off host
+    hostonly, port = urllib.parse.splitport(host)
+    # check if the host ends with any of the DNS suffixes
+    for name in no_proxy.split(','):
+        if name and (hostonly.endswith(name) or host.endswith(name)):
+            return 1
+    # otherwise, don't bypass
+    return 0
+
+
+if sys.platform == 'darwin':
+    def getproxies_internetconfig():
+        """Return a dictionary of scheme -> proxy server URL mappings.
+
+        By convention the mac uses Internet Config to store
+        proxies.  An HTTP proxy, for instance, is stored under
+        the HttpProxy key.
+
+        """
+        try:
+            import ic
+        except ImportError:
+            return {}
+
+        try:
+            config = ic.IC()
+        except ic.error:
+            return {}
+        proxies = {}
+        # HTTP:
+        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
+            try:
+                value = config['HTTPProxyHost']
+            except ic.error:
+                pass
+            else:
+                proxies['http'] = 'http://%s' % value
+        # FTP: XXX To be done.
+        # Gopher: XXX To be done.
+        return proxies
+
+    def proxy_bypass(host):
+        if getproxies_environment():
+            return proxy_bypass_environment(host)
+        else:
+            return 0
+
+    def getproxies():
+        return getproxies_environment() or getproxies_internetconfig()
+
+elif os.name == 'nt':
+    def getproxies_registry():
+        """Return a dictionary of scheme -> proxy server URL mappings.
+
+        Win32 uses the registry to store proxies.
+
+        """
+        proxies = {}
+        try:
+            import _winreg
+        except ImportError:
+            # Std module, so should be around - but you never know!
+            return proxies
+        try:
+            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
+                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
+            proxyEnable = _winreg.QueryValueEx(internetSettings,
+                                               'ProxyEnable')[0]
+            if proxyEnable:
+                # Returned as Unicode but problems if not converted to ASCII
+                proxyServer = str(_winreg.QueryValueEx(internetSettings,
+                                                       'ProxyServer')[0])
+                if '=' in proxyServer:
+                    # Per-protocol settings
+                    for p in proxyServer.split(';'):
+                        protocol, address = p.split('=', 1)
+                        # See if address has a type:// prefix
+                        import re
+                        if not re.match('^([^/:]+)://', address):
+                            address = '%s://%s' % (protocol, address)
+                        proxies[protocol] = address
+                else:
+                    # Use one setting for all protocols
+                    if proxyServer[:5] == 'http:':
+                        proxies['http'] = proxyServer
+                    else:
+                        proxies['http'] = 'http://%s' % proxyServer
+                        proxies['ftp'] = 'ftp://%s' % proxyServer
+            internetSettings.Close()
+        except (WindowsError, ValueError, TypeError):
+            # Either registry key not found etc, or the value in an
+            # unexpected format.
+            # proxies already set up to be empty so nothing to do
+            pass
+        return proxies
+
+    def getproxies():
+        """Return a dictionary of scheme -> proxy server URL mappings.
+
+        Returns settings gathered from the environment, if specified,
+        or the registry.
+
+        """
+        return getproxies_environment() or getproxies_registry()
+
+    def proxy_bypass_registry(host):
+        try:
+            import _winreg
+            import re
+        except ImportError:
+            # Std modules, so should be around - but you never know!
+            return 0
+        try:
+            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
+                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
+            proxyEnable = _winreg.QueryValueEx(internetSettings,
+                                               'ProxyEnable')[0]
+            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
+                                                     'ProxyOverride')[0])
+            # ^^^^ Returned as Unicode but problems if not converted to ASCII
+        except WindowsError:
+            return 0
+        if not proxyEnable or not proxyOverride:
+            return 0
+        # try to make a host list from name and IP address.
+        rawHost, port = urllib.parse.splitport(host)
+        host = [rawHost]
+        try:
+            addr = socket.gethostbyname(rawHost)
+            if addr != rawHost:
+                host.append(addr)
+        except socket.error:
+            pass
+        try:
+            fqdn = socket.getfqdn(rawHost)
+            if fqdn != rawHost:
+                host.append(fqdn)
+        except socket.error:
+            pass
+        # make a check value list from the registry entry: replace the
+        # '<local>' string by the localhost entry and the corresponding
+        # canonical entry.
+        proxyOverride = proxyOverride.split(';')
+        i = 0
+        while i < len(proxyOverride):
+            if proxyOverride[i] == '<local>':
+                proxyOverride[i:i+1] = ['localhost',
+                                        '127.0.0.1',
+                                        socket.gethostname(),
+                                        socket.gethostbyname(
+                                            socket.gethostname())]
+            i += 1
+        # print proxyOverride
+        # now check if we match one of the registry values.
+        for test in proxyOverride:
+            test = test.replace(".", r"\.")     # mask dots
+            test = test.replace("*", r".*")     # change glob sequence
+            test = test.replace("?", r".")      # change glob char
+            for val in host:
+                # print "%s <--> %s" %( test, val )
+                if re.match(test, val, re.I):
+                    return 1
+        return 0
+
+    def proxy_bypass(host):
+        """Test if the host should bypass the proxy.
+
+        Checks the environment's no_proxy setting, if specified,
+        otherwise the registry's ProxyOverride value.
+
+        """
+        if getproxies_environment():
+            return proxy_bypass_environment(host)
+        else:
+            return proxy_bypass_registry(host)
+
+else:
+    # By default use environment variables
+    getproxies = getproxies_environment
+    proxy_bypass = proxy_bypass_environment
diff --git a/Lib/urllib/response.py b/Lib/urllib/response.py
new file mode 100644
index 0000000..1352622
--- /dev/null
+++ b/Lib/urllib/response.py
@@ -0,0 +1,83 @@
+"""Response classes used by urllib.
+
+The base class, addbase, defines a minimal file-like interface,
+including read() and readline().  The typical response object is an
+addinfourl instance, which defines an info() method that returns
+headers and a geturl() method that returns the url.
+"""
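A quick sketch of how the classes below compose (the header block and payload are made up; addinfourl is the wrapper the request machinery hands back from urlopen-style calls):

    import email
    from io import StringIO

    headers = email.message_from_string('Content-type: text/plain\n')
    f = addinfourl(StringIO('hello'), headers, 'http://www.example.com/', 200)
    assert f.read() == 'hello'
    assert f.info() is headers
    assert f.geturl() == 'http://www.example.com/'
    assert f.getcode() == 200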
+
+class addbase(object):
+    """Base class for addinfo and addclosehook."""
+
+    # XXX Add a method to expose the timeout on the underlying socket?
+
+    def __init__(self, fp):
+        # TODO(jhylton): Is there a better way to delegate using io?
+        self.fp = fp
+        self.read = self.fp.read
+        self.readline = self.fp.readline
+        # TODO(jhylton): Make sure an object with readlines() is also iterable
+        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
+        if hasattr(self.fp, "fileno"):
+            self.fileno = self.fp.fileno
+        else:
+            self.fileno = lambda: None
+        if hasattr(self.fp, "__iter__"):
+            self.__iter__ = self.fp.__iter__
+            if hasattr(self.fp, "__next__"):
+                self.__next__ = self.fp.__next__
+
+    def __repr__(self):
+        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
+                                             id(self), self.fp)
+
+    def close(self):
+        self.read = None
+        self.readline = None
+        self.readlines = None
+        self.fileno = None
+        if self.fp: self.fp.close()
+        self.fp = None
+
+class addclosehook(addbase):
+    """Class to add a close hook to an open file."""
+
+    def __init__(self, fp, closehook, *hookargs):
+        addbase.__init__(self, fp)
+        self.closehook = closehook
+        self.hookargs = hookargs
+
+    def close(self):
+        addbase.close(self)
+        if self.closehook:
+            self.closehook(*self.hookargs)
+            self.closehook = None
+            self.hookargs = None
+
+class addinfo(addbase):
+    """class to add an info() method to an open file."""
+
+    def __init__(self, fp, headers):
+        addbase.__init__(self, fp)
+        self.headers = headers
+
+    def info(self):
+        return self.headers
+
+class addinfourl(addbase):
+    """class to add info() and geturl() methods to an open file."""
+
+    def __init__(self, fp, headers, url, code=None):
+        addbase.__init__(self, fp)
+        self.headers = headers
+        self.url = url
+        self.code = code
+
+    def info(self):
+        return self.headers
+
+    def getcode(self):
+        return self.code
+
+    def geturl(self):
+        return self.url
diff --git a/Lib/robotparser.py b/Lib/urllib/robotparser.py
index d1ef460..a91df8d 100644
--- a/Lib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -9,8 +9,8 @@ The robots.txt Exclusion Protocol is implemented as specified in
 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
-import urlparse
-import urllib
+
+import urllib.error, urllib.parse, urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -48,24 +48,19 @@ class RobotFileParser:
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urlparse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlparse(url)[1:3]
 
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
-        opener = URLopener()
-        f = opener.open(self.url)
-        lines = []
-        line = f.readline()
-        while line:
-            lines.append(line.strip())
-            line = f.readline()
-        self.errcode = opener.errcode
-        if self.errcode in (401, 403):
-            self.disallow_all = True
-        elif self.errcode >= 400:
-            self.allow_all = True
-        elif self.errcode == 200 and lines:
-            self.parse(lines)
+        try:
+            f = urllib.request.urlopen(self.url)
+        except urllib.error.HTTPError as err:
+            if err.code in (401, 403):
+                self.disallow_all = True
+            elif err.code >= 400:
+                self.allow_all = True
+        else:
+            self.parse(f.read().splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -75,15 +70,15 @@ class RobotFileParser:
             self.entries.append(entry)
 
     def parse(self, lines):
-        """parse the input lines from a robots.txt file.
-           We allow that a user-agent: line is not preceded by
-           one or more blank lines."""
+        """Parse the input lines from a robots.txt file.
+
+        We allow that a user-agent: line is not preceded by
+        one or more blank lines.
+        """
         state = 0
-        linenumber = 0
         entry = Entry()
 
         for line in lines:
-            linenumber = linenumber + 1
             if not line:
                 if state == 1:
                     entry = Entry()
@@ -102,7 +97,7 @@ class RobotFileParser:
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.unquote(line[1].strip())
+                line[1] = urllib.parse.unquote(line[1].strip())
                 if line[0] == "user-agent":
                     if state == 2:
                         self._add_entry(entry)
@@ -128,7 +123,7 @@ class RobotFileParser:
             return True
         # search for given user agent matches
         # the first match counts
-        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
+        url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
@@ -138,7 +133,6 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
-
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
 
@@ -150,7 +144,7 @@ class RuleLine:
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        self.path = urllib.quote(path)
+        self.path = urllib.parse.quote(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
@@ -195,18 +189,3 @@ class Entry:
             if line.applies_to(filename):
                 return line.allowance
         return True
-
-class URLopener(urllib.FancyURLopener):
-    def __init__(self, *args):
-        urllib.FancyURLopener.__init__(self, *args)
-        self.errcode = 200
-
-    def prompt_user_passwd(self, host, realm):
-        ## If robots.txt file is accessible only with a password,
-        ## we act as if the file wasn't there.
-        return None, None
-
-    def http_error_default(self, url, fp, errcode, errmsg, headers):
-        self.errcode = errcode
-        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
-                                                        errmsg, headers)
diff --git a/Lib/urllib2.py b/Lib/urllib2.py
deleted file mode 100644
index 4bcd397..0000000
--- a/Lib/urllib2.py
+++ /dev/null
@@ -1,1351 +0,0 @@
-"""An extensible library for opening URLs using a variety of protocols
-
-The simplest way to use this module is to call the urlopen function,
-which accepts a string containing a URL or a Request object (described
-below).  It opens the URL and returns the results as file-like
-object; the returned object has some extra methods described below.
-
-The OpenerDirector manages a collection of Handler objects that do
-all the actual work.  Each Handler implements a particular protocol or
-option.  The OpenerDirector is a composite object that invokes the
-Handlers needed to open the requested URL.  For example, the
-HTTPHandler performs HTTP GET and POST requests and deals with
-non-error returns.  The HTTPRedirectHandler automatically deals with
-HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
-deals with digest authentication.
-
-urlopen(url, data=None) -- Basic usage is the same as original
-urllib.  pass the url and optionally data to post to an HTTP URL, and
-get a file-like object back.  One difference is that you can also pass
-a Request instance instead of URL.  Raises a URLError (subclass of
-IOError); for HTTP errors, raises an HTTPError, which can also be
-treated as a valid response.
-
-build_opener -- Function that creates a new OpenerDirector instance.
-Will install the default handlers.  Accepts one or more Handlers as
-arguments, either instances or Handler classes that it will
-instantiate.  If one of the argument is a subclass of the default
-handler, the argument will be installed instead of the default.
-
-install_opener -- Installs a new opener as the default opener.
-
-objects of interest:
-OpenerDirector --
-
-Request -- An object that encapsulates the state of a request.  The
-state can be as simple as the URL.  It can also include extra HTTP
-headers, e.g. a User-Agent.
-
-BaseHandler --
-
-exceptions:
-URLError -- A subclass of IOError, individual protocols have their own
-specific subclass.
-
-HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
-as an exceptional event or valid response.
-
-internals:
-BaseHandler and parent
-_call_chain conventions
-
-Example usage:
-
-import urllib2
-
-# set up authentication info
-authinfo = urllib2.HTTPBasicAuthHandler()
-authinfo.add_password(realm='PDQ Application',
-                      uri='https://mahler:8092/site-updates.py',
-                      user='klem',
-                      passwd='geheim$parole')
-
-proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
-
-# build a new opener that adds authentication and caching FTP handlers
-opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
-
-# install it
-urllib2.install_opener(opener)
-
-f = urllib2.urlopen('http://www.python.org/')
-
-
-"""
-
-# XXX issues:
-# If an authentication error handler that tries to perform
-# authentication for some reason but fails, how should the error be
-# signalled?  The client needs to know the HTTP error code.  But if
-# the handler knows that the problem was, e.g., that it didn't know
-# that hash algo that requested in the challenge, it would be good to
-# pass that information along to the client, too.
-# ftp errors aren't handled cleanly
-# check digest against correct (i.e. non-apache) implementation
-
-# Possible extensions:
-# complex proxies  XXX not sure what exactly was meant by this
-# abstract factory for opener
-
-import base64
-import hashlib
-import http.client
-import io
-import email
-import os
-import posixpath
-import random
-import re
-import socket
-import sys
-import time
-import urlparse
-import bisect
-
-from io import StringIO
-
-from urllib import (unwrap, unquote, splittype, splithost, quote,
-     addinfourl, splitport, splitquery,
-     splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
-
-# support for FileHandler, proxies via environment variables
-from urllib import localhost, url2pathname, getproxies
-
-# used in User-Agent header sent
-__version__ = sys.version[:3]
-
-_opener = None
-def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
-    global _opener
-    if _opener is None:
-        _opener = build_opener()
-    return _opener.open(url, data, timeout)
-
-def install_opener(opener):
-    global _opener
-    _opener = opener
-
-# do these error classes make sense?
-# make sure all of the IOError stuff is overridden.  we just want to be
-# subtypes.
-
-class URLError(IOError):
-    # URLError is a sub-type of IOError, but it doesn't share any of
-    # the implementation.  need to override __init__ and __str__.
-    # It sets self.args for compatibility with other EnvironmentError
-    # subclasses, but args doesn't have the typical format with errno in
-    # slot 0 and strerror in slot 1.  This may be better than nothing.
-    def __init__(self, reason):
-        self.args = reason,
-        self.reason = reason
-
-    def __str__(self):
-        return '<urlopen error %s>' % self.reason
-
-class HTTPError(URLError, addinfourl):
-    """Raised when HTTP error occurs, but also acts like non-error return"""
-    __super_init = addinfourl.__init__
-
-    def __init__(self, url, code, msg, hdrs, fp):
-        self.code = code
-        self.msg = msg
-        self.hdrs = hdrs
-        self.fp = fp
-        self.filename = url
-        # The addinfourl classes depend on fp being a valid file
-        # object.  In some cases, the HTTPError may not have a valid
-        # file object.  If this happens, the simplest workaround is to
-        # not initialize the base classes.
-        if fp is not None:
-            self.__super_init(fp, hdrs, url, code)
-
-    def __str__(self):
-        return 'HTTP Error %s: %s' % (self.code, self.msg)
-
-# copied from cookielib.py
-_cut_port_re = re.compile(r":\d+$")
-def request_host(request):
-    """Return request-host, as defined by RFC 2965.
-
-    Variation from RFC: returned value is lowercased, for convenient
-    comparison.
-
-    """
-    url = request.get_full_url()
-    host = urlparse.urlparse(url)[1]
-    if host == "":
-        host = request.get_header("Host", "")
-
-    # remove port, if present
-    host = _cut_port_re.sub("", host, 1)
-    return host.lower()
-
-class Request:
-
-    def __init__(self, url, data=None, headers={},
-                 origin_req_host=None, unverifiable=False):
-        # unwrap('<URL:type://host/path>') --> 'type://host/path'
-        self.__original = unwrap(url)
-        self.type = None
-        # self.__r_type is what's left after doing the splittype
-        self.host = None
-        self.port = None
-        self.data = data
-        self.headers = {}
-        for key, value in headers.items():
-            self.add_header(key, value)
-        self.unredirected_hdrs = {}
-        if origin_req_host is None:
-            origin_req_host = request_host(self)
-        self.origin_req_host = origin_req_host
-        self.unverifiable = unverifiable
-
-    def __getattr__(self, attr):
-        # XXX this is a fallback mechanism to guard against these
-        # methods getting called in a non-standard order.  this may be
-        # too complicated and/or unnecessary.
-        # XXX should the __r_XXX attributes be public?
-        if attr[:12] == '_Request__r_':
-            name = attr[12:]
-            if hasattr(Request, 'get_' + name):
-                getattr(self, 'get_' + name)()
-                return getattr(self, attr)
-        raise AttributeError(attr)
-
-    def get_method(self):
-        if self.has_data():
-            return "POST"
-        else:
-            return "GET"
-
-    # XXX these helper methods are lame
-
-    def add_data(self, data):
-        self.data = data
-
-    def has_data(self):
-        return self.data is not None
-
-    def get_data(self):
-        return self.data
-
-    def get_full_url(self):
-        return self.__original
-
-    def get_type(self):
-        if self.type is None:
-            self.type, self.__r_type = splittype(self.__original)
-            if self.type is None:
-                raise ValueError("unknown url type: %s" % self.__original)
-        return self.type
-
-    def get_host(self):
-        if self.host is None:
-            self.host, self.__r_host = splithost(self.__r_type)
-            if self.host:
-                self.host = unquote(self.host)
-        return self.host
-
-    def get_selector(self):
-        return self.__r_host
-
-    def set_proxy(self, host, type):
-        self.host, self.type = host, type
-        self.__r_host = self.__original
-
-    def get_origin_req_host(self):
-        return self.origin_req_host
-
-    def is_unverifiable(self):
-        return self.unverifiable
-
-    def add_header(self, key, val):
-        # useful for something like authentication
-        self.headers[key.capitalize()] = val
-
-    def add_unredirected_header(self, key, val):
-        # will not be added to a redirected request
-        self.unredirected_hdrs[key.capitalize()] = val
-
-    def has_header(self, header_name):
-        return (header_name in self.headers or
-                header_name in self.unredirected_hdrs)
-
-    def get_header(self, header_name, default=None):
-        return self.headers.get(
-            header_name,
-            self.unredirected_hdrs.get(header_name, default))
-
-    def header_items(self):
-        hdrs = self.unredirected_hdrs.copy()
-        hdrs.update(self.headers)
-        return list(hdrs.items())
-
-class OpenerDirector:
-    def __init__(self):
-        client_version = "Python-urllib/%s" % __version__
-        self.addheaders = [('User-agent', client_version)]
-        # manage the individual handlers
-        self.handlers = []
-        self.handle_open = {}
-        self.handle_error = {}
-        self.process_response = {}
-        self.process_request = {}
-
-    def add_handler(self, handler):
-        if not hasattr(handler, "add_parent"):
-            raise TypeError("expected BaseHandler instance, got %r" %
-                            type(handler))
-
-        added = False
-        for meth in dir(handler):
-            if meth in ["redirect_request", "do_open", "proxy_open"]:
-                # oops, coincidental match
-                continue
-
-            i = meth.find("_")
-            protocol = meth[:i]
-            condition = meth[i+1:]
-
-            if condition.startswith("error"):
-                j = condition.find("_") + i + 1
-                kind = meth[j+1:]
-                try:
-                    kind = int(kind)
-                except ValueError:
-                    pass
-                lookup = self.handle_error.get(protocol, {})
-                self.handle_error[protocol] = lookup
-            elif condition == "open":
-                kind = protocol
-                lookup = self.handle_open
-            elif condition == "response":
-                kind = protocol
-                lookup = self.process_response
-            elif condition == "request":
-                kind = protocol
-                lookup = self.process_request
-            else:
-                continue
-
-            handlers = lookup.setdefault(kind, [])
-            if handlers:
-                bisect.insort(handlers, handler)
-            else:
-                handlers.append(handler)
-            added = True
-
-        if added:
-            # the handlers must work in an specific order, the order
-            # is specified in a Handler attribute
-            bisect.insort(self.handlers, handler)
-            handler.add_parent(self)
-
-    def close(self):
-        # Only exists for backwards compatibility.
-        pass
-
-    def _call_chain(self, chain, kind, meth_name, *args):
-        # Handlers raise an exception if no one else should try to handle
-        # the request, or return None if they can't but another handler
-        # could.  Otherwise, they return the response.
-        handlers = chain.get(kind, ())
-        for handler in handlers:
-            func = getattr(handler, meth_name)
-
-            result = func(*args)
-            if result is not None:
-                return result
-
-    def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
-        # accept a URL or a Request object
-        if isinstance(fullurl, str):
-            req = Request(fullurl, data)
-        else:
-            req = fullurl
-            if data is not None:
-                req.add_data(data)
-
-        req.timeout = timeout
-        protocol = req.get_type()
-
-        # pre-process request
-        meth_name = protocol+"_request"
-        for processor in self.process_request.get(protocol, []):
-            meth = getattr(processor, meth_name)
-            req = meth(req)
-
-        response = self._open(req, data)
-
-        # post-process response
-        meth_name = protocol+"_response"
-        for processor in self.process_response.get(protocol, []):
-            meth = getattr(processor, meth_name)
-            response = meth(req, response)
-
-        return response
-
-    def _open(self, req, data=None):
-        result = self._call_chain(self.handle_open, 'default',
-                                  'default_open', req)
-        if result:
-            return result
-
-        protocol = req.get_type()
-        result = self._call_chain(self.handle_open, protocol, protocol +
-                                  '_open', req)
-        if result:
-            return result
-
-        return self._call_chain(self.handle_open, 'unknown',
-                                'unknown_open', req)
-
-    def error(self, proto, *args):
-        if proto in ('http', 'https'):
-            # XXX http[s] protocols are special-cased
-            dict = self.handle_error['http'] # https is not different than http
-            proto = args[2]  # YUCK!
-            meth_name = 'http_error_%s' % proto
-            http_err = 1
-            orig_args = args
-        else:
-            dict = self.handle_error
-            meth_name = proto + '_error'
-            http_err = 0
-        args = (dict, proto, meth_name) + args
-        result = self._call_chain(*args)
-        if result:
-            return result
-
-        if http_err:
-            args = (dict, 'default', 'http_error_default') + orig_args
-            return self._call_chain(*args)
-
-# XXX probably also want an abstract factory that knows when it makes
-# sense to skip a superclass in favor of a subclass and when it might
-# make sense to include both
-
-def build_opener(*handlers):
-    """Create an opener object from a list of handlers.
-
-    The opener will use several default handlers, including support
-    for HTTP and FTP.
-
-    If any of the handlers passed as arguments are subclasses of the
-    default handlers, the default handlers will not be used.
- """ - def isclass(obj): - return isinstance(obj, type) or hasattr(obj, "__bases__") - - opener = OpenerDirector() - default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, - HTTPDefaultErrorHandler, HTTPRedirectHandler, - FTPHandler, FileHandler, HTTPErrorProcessor] - if hasattr(http.client, 'HTTPS'): - default_classes.append(HTTPSHandler) - skip = set() - for klass in default_classes: - for check in handlers: - if isclass(check): - if issubclass(check, klass): - skip.add(klass) - elif isinstance(check, klass): - skip.add(klass) - for klass in skip: - default_classes.remove(klass) - - for klass in default_classes: - opener.add_handler(klass()) - - for h in handlers: - if isclass(h): - h = h() - opener.add_handler(h) - return opener - -class BaseHandler: - handler_order = 500 - - def add_parent(self, parent): - self.parent = parent - - def close(self): - # Only exists for backwards compatibility - pass - - def __lt__(self, other): - if not hasattr(other, "handler_order"): - # Try to preserve the old behavior of having custom classes - # inserted after default ones (works only for custom user - # classes which are not aware of handler_order). - return True - return self.handler_order < other.handler_order - - -class HTTPErrorProcessor(BaseHandler): - """Process HTTP error responses.""" - handler_order = 1000 # after all other processing - - def http_response(self, request, response): - code, msg, hdrs = response.code, response.msg, response.info() - - # According to RFC 2616, "2xx" code indicates that the client's - # request was successfully received, understood, and accepted. - if not (200 <= code < 300): - response = self.parent.error( - 'http', request, response, code, msg, hdrs) - - return response - - https_response = http_response - -class HTTPDefaultErrorHandler(BaseHandler): - def http_error_default(self, req, fp, code, msg, hdrs): - raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) - -class HTTPRedirectHandler(BaseHandler): - # maximum number of redirections to any single URL - # this is needed because of the state that cookies introduce - max_repeats = 4 - # maximum total number of redirections (regardless of URL) before - # assuming we're in a loop - max_redirections = 10 - - def redirect_request(self, req, fp, code, msg, headers, newurl): - """Return a Request or None in response to a redirect. - - This is called by the http_error_30x methods when a - redirection response is received. If a redirection should - take place, return a new Request to allow http_error_30x to - perform the redirect. Otherwise, raise HTTPError if no-one - else should try to handle this url. Return None if you can't - but another Handler might. - """ - m = req.get_method() - if (code in (301, 302, 303, 307) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST"): - # Strictly (according to RFC 2616), 301 or 302 in response - # to a POST MUST NOT cause a redirection without confirmation - # from the user (of urllib2, in this case). In practice, - # essentially all clients do redirect in this case, so we - # do the same. 
- # be conciliant with URIs containing a space - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type") - ) - return Request(newurl, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) - else: - raise HTTPError(req.get_full_url(), code, msg, headers, fp) - - # Implementation note: To avoid the server sending us into an - # infinite loop, the request object needs to track what URLs we - # have already seen. Do this by adding a handler-specific - # attribute to the Request object. - def http_error_302(self, req, fp, code, msg, headers): - # Some servers (incorrectly) return multiple Location headers - # (so probably same goes for URI). Use first header. - if 'location' in headers: - newurl = headers['location'] - elif 'uri' in headers: - newurl = headers['uri'] - else: - return - newurl = urlparse.urljoin(req.get_full_url(), newurl) - - # XXX Probably want to forget about the state of the current - # request, although that might interact poorly with other - # handlers that also use handler-specific request attributes - new = self.redirect_request(req, fp, code, msg, headers, newurl) - if new is None: - return - - # loop detection - # .redirect_dict has a key url if url was previously visited. - if hasattr(req, 'redirect_dict'): - visited = new.redirect_dict = req.redirect_dict - if (visited.get(newurl, 0) >= self.max_repeats or - len(visited) >= self.max_redirections): - raise HTTPError(req.get_full_url(), code, - self.inf_msg + msg, headers, fp) - else: - visited = new.redirect_dict = req.redirect_dict = {} - visited[newurl] = visited.get(newurl, 0) + 1 - - # Don't close the fp until we are sure that we won't use it - # with HTTPError. - fp.read() - fp.close() - - return self.parent.open(new) - - http_error_301 = http_error_303 = http_error_307 = http_error_302 - - inf_msg = "The HTTP server returned a redirect error that would " \ - "lead to an infinite loop.\n" \ - "The last 30x error message was:\n" - - -def _parse_proxy(proxy): - """Return (scheme, user, password, host/port) given a URL or an authority. - - If a URL is supplied, it must have an authority (host:port) component. - According to RFC 3986, having an authority component means the URL must - have two slashes after the scheme: - - >>> _parse_proxy('file:/ftp.example.com/') - Traceback (most recent call last): - ValueError: proxy URL with no authority: 'file:/ftp.example.com/' - - The first three items of the returned tuple may be None. 
- - Examples of authority parsing: - - >>> _parse_proxy('proxy.example.com') - (None, None, None, 'proxy.example.com') - >>> _parse_proxy('proxy.example.com:3128') - (None, None, None, 'proxy.example.com:3128') - - The authority component may optionally include userinfo (assumed to be - username:password): - - >>> _parse_proxy('joe:password@proxy.example.com') - (None, 'joe', 'password', 'proxy.example.com') - >>> _parse_proxy('joe:password@proxy.example.com:3128') - (None, 'joe', 'password', 'proxy.example.com:3128') - - Same examples, but with URLs instead: - - >>> _parse_proxy('http://proxy.example.com/') - ('http', None, None, 'proxy.example.com') - >>> _parse_proxy('http://proxy.example.com:3128/') - ('http', None, None, 'proxy.example.com:3128') - >>> _parse_proxy('http://joe:password@proxy.example.com/') - ('http', 'joe', 'password', 'proxy.example.com') - >>> _parse_proxy('http://joe:password@proxy.example.com:3128') - ('http', 'joe', 'password', 'proxy.example.com:3128') - - Everything after the authority is ignored: - - >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') - ('ftp', 'joe', 'password', 'proxy.example.com') - - Test for no trailing '/' case: - - >>> _parse_proxy('http://joe:password@proxy.example.com') - ('http', 'joe', 'password', 'proxy.example.com') - - """ - scheme, r_scheme = splittype(proxy) - if not r_scheme.startswith("/"): - # authority - scheme = None - authority = proxy - else: - # URL - if not r_scheme.startswith("//"): - raise ValueError("proxy URL with no authority: %r" % proxy) - # We have an authority, so for RFC 3986-compliant URLs (by ss 3. - # and 3.3.), path is empty or starts with '/' - end = r_scheme.find("/", 2) - if end == -1: - end = None - authority = r_scheme[2:end] - userinfo, hostport = splituser(authority) - if userinfo is not None: - user, password = splitpasswd(userinfo) - else: - user = password = None - return scheme, user, password, hostport - -class ProxyHandler(BaseHandler): - # Proxies must be in front - handler_order = 100 - - def __init__(self, proxies=None): - if proxies is None: - proxies = getproxies() - assert hasattr(proxies, 'keys'), "proxies must be a mapping" - self.proxies = proxies - for type, url in proxies.items(): - setattr(self, '%s_open' % type, - lambda r, proxy=url, type=type, meth=self.proxy_open: \ - meth(r, proxy, type)) - - def proxy_open(self, req, proxy, type): - orig_type = req.get_type() - proxy_type, user, password, hostport = _parse_proxy(proxy) - if proxy_type is None: - proxy_type = orig_type - if user and password: - user_pass = '%s:%s' % (unquote(user), unquote(password)) - creds = base64.b64encode(user_pass.encode()).decode("ascii") - req.add_header('Proxy-authorization', 'Basic ' + creds) - hostport = unquote(hostport) - req.set_proxy(hostport, proxy_type) - if orig_type == proxy_type: - # let other handlers take care of it - return None - else: - # need to start over, because the other handlers don't - # grok the proxy's URL type - # e.g. 
if we have a constructor arg proxies like so: - # {'http': 'ftp://proxy.example.com'}, we may end up turning - # a request for http://acme.example.com/a into one for - # ftp://proxy.example.com/a - return self.parent.open(req) - -class HTTPPasswordMgr: - - def __init__(self): - self.passwd = {} - - def add_password(self, realm, uri, user, passwd): - # uri could be a single URI or a sequence - if isinstance(uri, str): - uri = [uri] - if not realm in self.passwd: - self.passwd[realm] = {} - for default_port in True, False: - reduced_uri = tuple( - [self.reduce_uri(u, default_port) for u in uri]) - self.passwd[realm][reduced_uri] = (user, passwd) - - def find_user_password(self, realm, authuri): - domains = self.passwd.get(realm, {}) - for default_port in True, False: - reduced_authuri = self.reduce_uri(authuri, default_port) - for uris, authinfo in domains.items(): - for uri in uris: - if self.is_suburi(uri, reduced_authuri): - return authinfo - return None, None - - def reduce_uri(self, uri, default_port=True): - """Accept authority or URI and extract only the authority and path.""" - # note HTTP URLs do not have a userinfo component - parts = urlparse.urlsplit(uri) - if parts[1]: - # URI - scheme = parts[0] - authority = parts[1] - path = parts[2] or '/' - else: - # host or host:port - scheme = None - authority = uri - path = '/' - host, port = splitport(authority) - if default_port and port is None and scheme is not None: - dport = {"http": 80, - "https": 443, - }.get(scheme) - if dport is not None: - authority = "%s:%d" % (host, dport) - return authority, path - - def is_suburi(self, base, test): - """Check if test is below base in a URI tree - - Both args must be URIs in reduced form. - """ - if base == test: - return True - if base[0] != test[0]: - return False - common = posixpath.commonprefix((base[1], test[1])) - if len(common) == len(base[1]): - return True - return False - - -class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): - - def find_user_password(self, realm, authuri): - user, password = HTTPPasswordMgr.find_user_password(self, realm, - authuri) - if user is not None: - return user, password - return HTTPPasswordMgr.find_user_password(self, None, authuri) - - -class AbstractBasicAuthHandler: - - # XXX this allows for multiple auth-schemes, but will stupidly pick - # the last one with a realm specified. - - # allow for double- and single-quoted realm values - # (single quotes are a violation of the RFC, but appear in the wild) - rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' - 'realm=(["\'])(.*?)\\2', re.I) - - # XXX could pre-emptively send auth info already accepted (RFC 2617, - # end of section 2, and section 1.2 immediately after "credentials" - # production). 
- - def __init__(self, password_mgr=None): - if password_mgr is None: - password_mgr = HTTPPasswordMgr() - self.passwd = password_mgr - self.add_password = self.passwd.add_password - - def http_error_auth_reqed(self, authreq, host, req, headers): - # host may be an authority (without userinfo) or a URL with an - # authority - # XXX could be multiple headers - authreq = headers.get(authreq, None) - if authreq: - mo = AbstractBasicAuthHandler.rx.search(authreq) - if mo: - scheme, quote, realm = mo.groups() - if scheme.lower() == 'basic': - return self.retry_http_basic_auth(host, req, realm) - - def retry_http_basic_auth(self, host, req, realm): - user, pw = self.passwd.find_user_password(realm, host) - if pw is not None: - raw = "%s:%s" % (user, pw) - auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii") - if req.headers.get(self.auth_header, None) == auth: - return None - req.add_header(self.auth_header, auth) - return self.parent.open(req) - else: - return None - - -class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): - - auth_header = 'Authorization' - - def http_error_401(self, req, fp, code, msg, headers): - url = req.get_full_url() - return self.http_error_auth_reqed('www-authenticate', - url, req, headers) - - -class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): - - auth_header = 'Proxy-authorization' - - def http_error_407(self, req, fp, code, msg, headers): - # http_error_auth_reqed requires that there is no userinfo component in - # authority. Assume there isn't one, since urllib2 does not (and - # should not, RFC 3986 s. 3.2.1) support requests for URLs containing - # userinfo. - authority = req.get_host() - return self.http_error_auth_reqed('proxy-authenticate', - authority, req, headers) - - -def randombytes(n): - """Return n random bytes.""" - return os.urandom(n) - -class AbstractDigestAuthHandler: - # Digest authentication is specified in RFC 2617. - - # XXX The client does not inspect the Authentication-Info header - # in a successful response. - - # XXX It should be possible to test this implementation against - # a mock server that just generates a static set of challenges. - - # XXX qop="auth-int" supports is shaky - - def __init__(self, passwd=None): - if passwd is None: - passwd = HTTPPasswordMgr() - self.passwd = passwd - self.add_password = self.passwd.add_password - self.retried = 0 - self.nonce_count = 0 - - def reset_retry_count(self): - self.retried = 0 - - def http_error_auth_reqed(self, auth_header, host, req, headers): - authreq = headers.get(auth_header, None) - if self.retried > 5: - # Don't fail endlessly - if we failed once, we'll probably - # fail a second time. Hm. Unless the Password Manager is - # prompting for the information. Crap. 
This isn't great - # but it's better than the current 'repeat until recursion - # depth exceeded' approach <wink> - raise HTTPError(req.get_full_url(), 401, "digest auth failed", - headers, None) - else: - self.retried += 1 - if authreq: - scheme = authreq.split()[0] - if scheme.lower() == 'digest': - return self.retry_http_digest_auth(req, authreq) - - def retry_http_digest_auth(self, req, auth): - token, challenge = auth.split(' ', 1) - chal = parse_keqv_list(parse_http_list(challenge)) - auth = self.get_authorization(req, chal) - if auth: - auth_val = 'Digest %s' % auth - if req.headers.get(self.auth_header, None) == auth_val: - return None - req.add_unredirected_header(self.auth_header, auth_val) - resp = self.parent.open(req) - return resp - - def get_cnonce(self, nonce): - # The cnonce-value is an opaque - # quoted string value provided by the client and used by both client - # and server to avoid chosen plaintext attacks, to provide mutual - # authentication, and to provide some message integrity protection. - # This isn't a fabulous effort, but it's probably Good Enough. - s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) - b = s.encode("ascii") + randombytes(8) - dig = hashlib.sha1(b).hexdigest() - return dig[:16] - - def get_authorization(self, req, chal): - try: - realm = chal['realm'] - nonce = chal['nonce'] - qop = chal.get('qop') - algorithm = chal.get('algorithm', 'MD5') - # mod_digest doesn't send an opaque, even though it isn't - # supposed to be optional - opaque = chal.get('opaque', None) - except KeyError: - return None - - H, KD = self.get_algorithm_impls(algorithm) - if H is None: - return None - - user, pw = self.passwd.find_user_password(realm, req.get_full_url()) - if user is None: - return None - - # XXX not implemented yet - if req.has_data(): - entdig = self.get_entity_digest(req.get_data(), chal) - else: - entdig = None - - A1 = "%s:%s:%s" % (user, realm, pw) - A2 = "%s:%s" % (req.get_method(), - # XXX selector: what about proxies and full urls - req.get_selector()) - if qop == 'auth': - self.nonce_count += 1 - ncvalue = '%08x' % self.nonce_count - cnonce = self.get_cnonce(nonce) - noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) - respdig = KD(H(A1), noncebit) - elif qop is None: - respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) - else: - # XXX handle auth-int. - raise URLError("qop '%s' is not supported." % qop) - - # XXX should the partial digests be encoded too? 
- - base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ - 'response="%s"' % (user, realm, nonce, req.get_selector(), - respdig) - if opaque: - base += ', opaque="%s"' % opaque - if entdig: - base += ', digest="%s"' % entdig - base += ', algorithm="%s"' % algorithm - if qop: - base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) - return base - - def get_algorithm_impls(self, algorithm): - # algorithm should be case-insensitive according to RFC2617 - algorithm = algorithm.upper() - # lambdas assume digest modules are imported at the top level - if algorithm == 'MD5': - H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() - elif algorithm == 'SHA': - H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() - # XXX MD5-sess - KD = lambda s, d: H("%s:%s" % (s, d)) - return H, KD - - def get_entity_digest(self, data, chal): - # XXX not implemented yet - return None - - -class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): - """An authentication protocol defined by RFC 2069 - - Digest authentication improves on basic authentication because it - does not transmit passwords in the clear. - """ - - auth_header = 'Authorization' - handler_order = 490 # before Basic auth - - def http_error_401(self, req, fp, code, msg, headers): - host = urlparse.urlparse(req.get_full_url())[1] - retry = self.http_error_auth_reqed('www-authenticate', - host, req, headers) - self.reset_retry_count() - return retry - - -class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): - - auth_header = 'Proxy-Authorization' - handler_order = 490 # before Basic auth - - def http_error_407(self, req, fp, code, msg, headers): - host = req.get_host() - retry = self.http_error_auth_reqed('proxy-authenticate', - host, req, headers) - self.reset_retry_count() - return retry - -class AbstractHTTPHandler(BaseHandler): - - def __init__(self, debuglevel=0): - self._debuglevel = debuglevel - - def set_http_debuglevel(self, level): - self._debuglevel = level - - def do_request_(self, request): - host = request.get_host() - if not host: - raise URLError('no host given') - - if request.has_data(): # POST - data = request.get_data() - if not request.has_header('Content-type'): - request.add_unredirected_header( - 'Content-type', - 'application/x-www-form-urlencoded') - if not request.has_header('Content-length'): - request.add_unredirected_header( - 'Content-length', '%d' % len(data)) - - scheme, sel = splittype(request.get_selector()) - sel_host, sel_path = splithost(sel) - if not request.has_header('Host'): - request.add_unredirected_header('Host', sel_host or host) - for name, value in self.parent.addheaders: - name = name.capitalize() - if not request.has_header(name): - request.add_unredirected_header(name, value) - - return request - - def do_open(self, http_class, req): - """Return an addinfourl object for the request, using http_class. - - http_class must implement the HTTPConnection API from http.client. - The addinfourl return value is a file-like object. 
It also - has methods and attributes including: - - info(): return a email.message.Message object for the headers - - geturl(): return the original request URL - - code: HTTP status code - """ - host = req.get_host() - if not host: - raise URLError('no host given') - - h = http_class(host, timeout=req.timeout) # will parse host:port - h.set_debuglevel(self._debuglevel) - - headers = dict(req.headers) - headers.update(req.unredirected_hdrs) - # We want to make an HTTP/1.1 request, but the addinfourl - # class isn't prepared to deal with a persistent connection. - # It will try to read all remaining data from the socket, - # which will block while the server waits for the next request. - # So make sure the connection gets closed after the (only) - # request. - headers["Connection"] = "close" - headers = dict( - (name.title(), val) for name, val in headers.items()) - try: - h.request(req.get_method(), req.get_selector(), req.data, headers) - r = h.getresponse() - except socket.error as err: # XXX what error? - raise URLError(err) - - # Pick apart the HTTPResponse object to get the addinfourl - # object initialized properly. - - # XXX Should an HTTPResponse object really be passed to - # BufferedReader? If so, we should change http.client to support - # this use directly. - - # Add some fake methods to the reader to satisfy BufferedReader. - r.readable = lambda: True - r.writable = r.seekable = lambda: False - r._checkReadable = lambda: True - r._checkWritable = lambda: False - fp = io.BufferedReader(r) - - resp = addinfourl(fp, r.msg, req.get_full_url()) - resp.code = r.status - resp.msg = r.reason - return resp - - -class HTTPHandler(AbstractHTTPHandler): - - def http_open(self, req): - return self.do_open(http.client.HTTPConnection, req) - - http_request = AbstractHTTPHandler.do_request_ - -if hasattr(http.client, 'HTTPS'): - class HTTPSHandler(AbstractHTTPHandler): - - def https_open(self, req): - return self.do_open(http.client.HTTPSConnection, req) - - https_request = AbstractHTTPHandler.do_request_ - -class HTTPCookieProcessor(BaseHandler): - def __init__(self, cookiejar=None): - import http.cookiejar - if cookiejar is None: - cookiejar = http.cookiejar.CookieJar() - self.cookiejar = cookiejar - - def http_request(self, request): - self.cookiejar.add_cookie_header(request) - return request - - def http_response(self, request, response): - self.cookiejar.extract_cookies(response, request) - return response - - https_request = http_request - https_response = http_response - -class UnknownHandler(BaseHandler): - def unknown_open(self, req): - type = req.get_type() - raise URLError('unknown url type: %s' % type) - -def parse_keqv_list(l): - """Parse list of key=value strings where keys are not duplicated.""" - parsed = {} - for elt in l: - # Because of a trailing comma in the auth string, elt could be the - # empty string. - if not elt: - continue - k, v = elt.split('=', 1) - if v[0] == '"' and v[-1] == '"': - v = v[1:-1] - parsed[k] = v - return parsed - -def parse_http_list(s): - """Parse lists as described by RFC 2068 Section 2. - - In particular, parse comma-separated lists where the elements of - the list may include quoted-strings. A quoted-string could - contain a comma. A non-quoted string could have quotes in the - middle. Neither commas nor quotes count if they are escaped. - Only double-quotes count, not single-quotes. 
- """ - res = [] - part = '' - - escape = quote = False - for cur in s: - if escape: - part += cur - escape = False - continue - if quote: - if cur == '\\': - escape = True - continue - elif cur == '"': - quote = False - part += cur - continue - - if cur == ',': - res.append(part) - part = '' - continue - - if cur == '"': - quote = True - - part += cur - - # append last part - if part: - res.append(part) - - return [part.strip() for part in res] - -class FileHandler(BaseHandler): - # Use local file or FTP depending on form of URL - def file_open(self, req): - url = req.get_selector() - if url[:2] == '//' and url[2:3] != '/': - req.type = 'ftp' - return self.parent.open(req) - else: - return self.open_local_file(req) - - # names for the localhost - names = None - def get_names(self): - if FileHandler.names is None: - try: - FileHandler.names = (socket.gethostbyname('localhost'), - socket.gethostbyname(socket.gethostname())) - except socket.gaierror: - FileHandler.names = (socket.gethostbyname('localhost'),) - return FileHandler.names - - # not entirely sure what the rules are here - def open_local_file(self, req): - import email.utils - import mimetypes - host = req.get_host() - file = req.get_selector() - localfile = url2pathname(file) - try: - stats = os.stat(localfile) - size = stats.st_size - modified = email.utils.formatdate(stats.st_mtime, usegmt=True) - mtype = mimetypes.guess_type(file)[0] - headers = email.message_from_string( - 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % - (mtype or 'text/plain', size, modified)) - if host: - host, port = splitport(host) - if not host or \ - (not port and _safe_gethostbyname(host) in self.get_names()): - return addinfourl(open(localfile, 'rb'), - headers, 'file:'+file) - except OSError as msg: - # urllib2 users shouldn't expect OSErrors coming from urlopen() - raise URLError(msg) - raise URLError('file not on local host') - -def _safe_gethostbyname(host): - try: - return socket.gethostbyname(host) - except socket.gaierror: - return None - -class FTPHandler(BaseHandler): - def ftp_open(self, req): - import ftplib - import mimetypes - host = req.get_host() - if not host: - raise URLError('ftp error: no host given') - host, port = splitport(host) - if port is None: - port = ftplib.FTP_PORT - else: - port = int(port) - - # username/password handling - user, host = splituser(host) - if user: - user, passwd = splitpasswd(user) - else: - passwd = None - host = unquote(host) - user = unquote(user or '') - passwd = unquote(passwd or '') - - try: - host = socket.gethostbyname(host) - except socket.error as msg: - raise URLError(msg) - path, attrs = splitattr(req.get_selector()) - dirs = path.split('/') - dirs = list(map(unquote, dirs)) - dirs, file = dirs[:-1], dirs[-1] - if dirs and not dirs[0]: - dirs = dirs[1:] - try: - fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) - type = file and 'I' or 'D' - for attr in attrs: - attr, value = splitvalue(attr) - if attr.lower() == 'type' and \ - value in ('a', 'A', 'i', 'I', 'd', 'D'): - type = value.upper() - fp, retrlen = fw.retrfile(file, type) - headers = "" - mtype = mimetypes.guess_type(req.get_full_url())[0] - if mtype: - headers += "Content-type: %s\n" % mtype - if retrlen is not None and retrlen >= 0: - headers += "Content-length: %d\n" % retrlen - headers = email.message_from_string(headers) - sf = StringIO(str(headers)) - return addinfourl(fp, headers, req.get_full_url()) - except ftplib.all_errors as msg: - raise URLError('ftp error: %s' % 
msg).with_traceback(sys.exc_info()[2]) - - def connect_ftp(self, user, passwd, host, port, dirs, timeout): - fw = ftpwrapper(user, passwd, host, port, dirs, timeout) - return fw - -class CacheFTPHandler(FTPHandler): - # XXX would be nice to have pluggable cache strategies - # XXX this stuff is definitely not thread safe - def __init__(self): - self.cache = {} - self.timeout = {} - self.soonest = 0 - self.delay = 60 - self.max_conns = 16 - - def setTimeout(self, t): - self.delay = t - - def setMaxConns(self, m): - self.max_conns = m - - def connect_ftp(self, user, passwd, host, port, dirs, timeout): - key = user, host, port, '/'.join(dirs), timeout - if key in self.cache: - self.timeout[key] = time.time() + self.delay - else: - self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout) - self.timeout[key] = time.time() + self.delay - self.check_cache() - return self.cache[key] - - def check_cache(self): - # first check for old ones - t = time.time() - if self.soonest <= t: - for k, v in list(self.timeout.items()): - if v < t: - self.cache[k].close() - del self.cache[k] - del self.timeout[k] - self.soonest = min(list(self.timeout.values())) - - # then check the size - if len(self.cache) == self.max_conns: - for k, v in list(self.timeout.items()): - if v == self.soonest: - del self.cache[k] - del self.timeout[k] - break - self.soonest = min(list(self.timeout.values())) diff --git a/Lib/wsgiref/simple_server.py b/Lib/wsgiref/simple_server.py index 44a91fa..a82c80a 100644 --- a/Lib/wsgiref/simple_server.py +++ b/Lib/wsgiref/simple_server.py @@ -11,7 +11,8 @@ module. See also the BaseHTTPServer module docs for other API information. """ from http.server import BaseHTTPRequestHandler, HTTPServer -import urllib, sys +import sys +import urllib.parse from wsgiref.handlers import SimpleHandler __version__ = "0.1" @@ -93,7 +94,7 @@ class WSGIRequestHandler(BaseHTTPRequestHandler): else: path,query = self.path,'' - env['PATH_INFO'] = urllib.unquote(path) + env['PATH_INFO'] = urllib.parse.unquote(path) env['QUERY_STRING'] = query host = self.address_string() diff --git a/Lib/wsgiref/util.py b/Lib/wsgiref/util.py index a4ca02f..2686b66 100644 --- a/Lib/wsgiref/util.py +++ b/Lib/wsgiref/util.py @@ -50,7 +50,7 @@ def guess_scheme(environ): def application_uri(environ): """Return the application's base URI (no PATH_INFO or QUERY_STRING)""" url = environ['wsgi.url_scheme']+'://' - from urllib import quote + from urllib.parse import quote if environ.get('HTTP_HOST'): url += environ['HTTP_HOST'] @@ -70,7 +70,7 @@ def application_uri(environ): def request_uri(environ, include_query=1): """Return the full request URI, optionally including the query string""" url = application_uri(environ) - from urllib import quote + from urllib.parse import quote path_info = quote(environ.get('PATH_INFO','')) if not environ.get('SCRIPT_NAME'): url += path_info[1:] diff --git a/Lib/xml/dom/xmlbuilder.py b/Lib/xml/dom/xmlbuilder.py index dc7c5d4..d798624 100644 --- a/Lib/xml/dom/xmlbuilder.py +++ b/Lib/xml/dom/xmlbuilder.py @@ -190,8 +190,8 @@ class DOMBuilder: options.errorHandler = self.errorHandler fp = input.byteStream if fp is None and options.systemId: - import urllib2 - fp = urllib2.urlopen(input.systemId) + import urllib.request + fp = urllib.request.urlopen(input.systemId) return self._parse_bytestream(fp, options) def parseWithContext(self, input, cnode, action): @@ -223,14 +223,14 @@ class DOMEntityResolver(object): source.encoding = self._guess_media_encoding(source) # determine the base URI is we can - 
import posixpath, urlparse - parts = urlparse.urlparse(systemId) + import posixpath, urllib.parse + parts = urllib.parse.urlparse(systemId) scheme, netloc, path, params, query, fragment = parts # XXX should we check the scheme here as well? if path and not path.endswith("/"): path = posixpath.dirname(path) + "/" parts = scheme, netloc, path, params, query, fragment - source.baseURI = urlparse.urlunparse(parts) + source.baseURI = urllib.parse.urlunparse(parts) return source @@ -242,8 +242,8 @@ class DOMEntityResolver(object): return self._opener def _create_opener(self): - import urllib2 - return urllib2.build_opener() + import urllib.request + return urllib.request.build_opener() def _guess_media_encoding(self, source): info = source.byteStream.info() diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py index a569c1d..e5d43a5 100644 --- a/Lib/xml/sax/saxutils.py +++ b/Lib/xml/sax/saxutils.py @@ -3,7 +3,7 @@ A library of useful helper classes to the SAX classes, for the convenience of application and driver writers. """ -import os, urlparse, urllib +import os, urllib.parse, urllib.request from . import handler from . import xmlreader @@ -289,8 +289,8 @@ def prepare_input_source(source, base = ""): source.setSystemId(sysidfilename) f = open(sysidfilename, "rb") else: - source.setSystemId(urlparse.urljoin(base, sysid)) - f = urllib.urlopen(source.getSystemId()) + source.setSystemId(urllib.parse.urljoin(base, sysid)) + f = urllib.request.urlopen(source.getSystemId()) source.setByteStream(f) diff --git a/Lib/xmlrpc/client.py b/Lib/xmlrpc/client.py index 6868d3b..121fedf 100644 --- a/Lib/xmlrpc/client.py +++ b/Lib/xmlrpc/client.py @@ -1160,12 +1160,12 @@ class Transport: if isinstance(host, tuple): host, x509 = host - import urllib - auth, host = urllib.splituser(host) + import urllib.parse + auth, host = urllib.parse.splituser(host) if auth: import base64 - auth = base64.encodestring(urllib.unquote(auth)) + auth = base64.encodestring(urllib.parse.unquote(auth)) auth = "".join(auth.split()) # get rid of whitespace extra_headers = [ ("Authorization", "Basic " + auth) @@ -1321,11 +1321,11 @@ class ServerProxy: # establish a "logical" server connection # get the url - import urllib - type, uri = urllib.splittype(uri) + import urllib.parse + type, uri = urllib.parse.splittype(uri) if type not in ("http", "https"): raise IOError("unsupported XML-RPC protocol") - self.__host, self.__handler = urllib.splithost(uri) + self.__host, self.__handler = urllib.parse.splithost(uri) if not self.__handler: self.__handler = "/RPC2" diff --git a/Makefile.pre.in b/Makefile.pre.in index 0d89d12..3de0dbd 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -809,7 +809,7 @@ LIBSUBDIRS= tkinter site-packages test test/output test/data \ email email/mime email/test email/test/data \ html json json/tests http dbm xmlrpc \ sqlite3 sqlite3/test \ - logging bsddb bsddb/test csv wsgiref \ + logging bsddb bsddb/test csv wsgiref urllib \ lib2to3 lib2to3/fixes lib2to3/pgen2 lib2to3/tests \ ctypes ctypes/test ctypes/macholib idlelib idlelib/Icons \ distutils distutils/command distutils/tests $(XMLLIBSUBDIRS) \ @@ -81,6 +81,15 @@ Extension Modules Library ------- +- a new ``urllib`` package was created. It consists of code from + ``urllib``, ``urllib2``, ``urlparse``, and ``robotparser``. The old + modules have all been removed. The new package has five submodules: + ``urllib.parse``, ``urllib.request``, ``urllib.response``, + ``urllib.error``, and ``urllib.robotparser``. 
The + ``urllib.request.urlopen()`` function uses the URL opener from + ``urllib2``; a short before/after usage sketch appears below. (Note that the unit tests have not been renamed for the + beta, but they will be renamed in the future.) + +- rfc822 has been removed in favor of the email package. + +- mimetools has been removed in favor of the email package.
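To make the renaming described in the NEWS entry concrete, here is a minimal before/after sketch. It is illustrative only: the host name, query string, and handler choice are placeholders that appear nowhere in the patch; the module and class names are the ones this commit introduces (``urllib.parse``, ``urllib.request``, ``urllib.error``)::

    # Python 2.x spelling, against the removed modules (kept as comments,
    # since these imports no longer work after this commit):
    #
    #   import urllib, urllib2, urlparse
    #   query = urllib.urlencode({'q': 'python'})
    #   url = urlparse.urljoin('http://example.com/', 'search?' + query)
    #   page = urllib2.urlopen(url).read()

    # Python 3.0 spelling, against the new urllib package:
    import urllib.error
    import urllib.parse
    import urllib.request

    query = urllib.parse.urlencode({'q': 'python'})    # was urllib.urlencode()
    url = urllib.parse.urljoin('http://example.com/',  # was urlparse.urljoin()
                               'search?' + query)
    page = urllib.request.urlopen(url).read()          # was urllib2.urlopen()

    # The opener machinery moves the same way: OpenerDirector, build_opener(),
    # and the handler classes removed from urllib2 above now live in
    # urllib.request, and the exception classes live in urllib.error.
    opener = urllib.request.build_opener(urllib.request.HTTPBasicAuthHandler())
    try:
        page = opener.open(url).read()
    except urllib.error.URLError:  # was urllib2.URLError; HTTPError subclasses it
        page = None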