From 1afc1696167547a5fa101c53e5a3ab4717f8852c Mon Sep 17 00:00:00 2001 From: Jeremy Hylton Date: Wed, 18 Jun 2008 20:49:58 +0000 Subject: Make a new urllib package . It consists of code from urllib, urllib2, urlparse, and robotparser. The old modules have all been removed. The new package has five submodules: urllib.parse, urllib.request, urllib.response, urllib.error, and urllib.robotparser. The urllib.request.urlopen() function uses the url opener from urllib2. Note that the unittests have not been renamed for the beta, but they will be renamed in the future. Joint work with Senthil Kumaran. --- Lib/cgi.py | 6 +- Lib/distutils/command/register.py | 20 +- Lib/distutils/command/upload.py | 7 +- Lib/email/utils.py | 7 +- Lib/http/client.py | 2 +- Lib/http/cookiejar.py | 18 +- Lib/http/server.py | 10 +- Lib/macurl2path.py | 10 +- Lib/mimetypes.py | 4 +- Lib/robotparser.py | 212 ---- Lib/test/regrtest.py | 7 +- Lib/test/support.py | 6 +- Lib/test/test___all__.py | 4 +- Lib/test/test_http_cookiejar.py | 92 +- Lib/test/test_httpservers.py | 5 +- Lib/test/test_importhooks.py | 18 +- Lib/test/test_pyclbr.py | 10 - Lib/test/test_robotparser.py | 7 +- Lib/test/test_ssl.py | 8 +- Lib/test/test_urllib.py | 137 ++- Lib/test/test_urllib2.py | 115 +- Lib/test/test_urllib2_localnet.py | 185 ++- Lib/test/test_urllib2net.py | 38 +- Lib/test/test_urllibnet.py | 19 +- Lib/test/test_urlparse.py | 40 +- Lib/test/test_xmlrpc.py | 29 +- Lib/urllib.py | 1714 --------------------------- Lib/urllib/__init__.py | 0 Lib/urllib/error.py | 59 + Lib/urllib/parse.py | 630 ++++++++++ Lib/urllib/request.py | 2295 +++++++++++++++++++++++++++++++++++++ Lib/urllib/response.py | 83 ++ Lib/urllib/robotparser.py | 191 +++ Lib/urllib2.py | 1351 ---------------------- Lib/urlparse.py | 325 ------ Lib/wsgiref/simple_server.py | 5 +- Lib/wsgiref/util.py | 4 +- Lib/xml/dom/xmlbuilder.py | 14 +- Lib/xml/sax/saxutils.py | 6 +- Lib/xmlrpc/client.py | 12 +- Makefile.pre.in | 2 +- Misc/NEWS | 9 + 42 files changed, 3685 insertions(+), 4031 deletions(-) delete mode 100644 Lib/robotparser.py delete mode 100644 Lib/urllib.py create mode 100644 Lib/urllib/__init__.py create mode 100644 Lib/urllib/error.py create mode 100644 Lib/urllib/parse.py create mode 100644 Lib/urllib/request.py create mode 100644 Lib/urllib/response.py create mode 100644 Lib/urllib/robotparser.py delete mode 100644 Lib/urllib2.py delete mode 100644 Lib/urlparse.py diff --git a/Lib/cgi.py b/Lib/cgi.py index 5b8e40f..d0a2c61 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -35,7 +35,7 @@ from operator import attrgetter from io import StringIO import sys import os -import urllib +import urllib.parse import email.parser __all__ = ["MiniFieldStorage", "FieldStorage", @@ -216,8 +216,8 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0): else: continue if len(nv[1]) or keep_blank_values: - name = urllib.unquote(nv[0].replace('+', ' ')) - value = urllib.unquote(nv[1].replace('+', ' ')) + name = urllib.parse.unquote(nv[0].replace('+', ' ')) + value = urllib.parse.unquote(nv[1].replace('+', ' ')) r.append((name, value)) return r diff --git a/Lib/distutils/command/register.py b/Lib/distutils/command/register.py index 89cb2d4..6d5c459 100644 --- a/Lib/distutils/command/register.py +++ b/Lib/distutils/command/register.py @@ -7,8 +7,9 @@ Implements the Distutils 'register' command (register with the repository). 
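The cgi.parse_qsl() hunk above is typical of the mechanical renaming that runs through this patch: helpers that used to sit at the top level of the old urllib module are now reached through urllib.parse. A minimal sketch of the renamed quoting helpers (the sample string is illustrative, not taken from the patch):

    import urllib.parse

    # quote()/unquote() were urllib.quote()/urllib.unquote() before this change.
    print(urllib.parse.quote('spam & eggs', safe=''))   # 'spam%20%26%20eggs'
    print(urllib.parse.unquote('spam%20%26%20eggs'))    # 'spam & eggs'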
__revision__ = "$Id$" -import os, string, urllib2, getpass, urlparse +import os, string, getpass import io +import urllib.parse, urllib.request from distutils.core import PyPIRCCommand from distutils.errors import * @@ -94,7 +95,8 @@ class register(PyPIRCCommand): def classifiers(self): ''' Fetch the list of classifiers from the server. ''' - response = urllib2.urlopen(self.repository+'?:action=list_classifiers') + url = self.repository+'?:action=list_classifiers' + response = urllib.request.urlopen(url) print(response.read()) def verify_metadata(self): @@ -166,8 +168,8 @@ Your selection [default 1]: ''', end=' ') password = getpass.getpass('Password: ') # set up the authentication - auth = urllib2.HTTPPasswordMgr() - host = urlparse.urlparse(self.repository)[1] + auth = urllib.request.HTTPPasswordMgr() + host = urllib.parse.urlparse(self.repository)[1] auth.add_password(self.realm, host, username, password) # send the info to the server and report the result code, result = self.post_to_server(self.build_post_data('submit'), @@ -276,20 +278,20 @@ Your selection [default 1]: ''', end=' ') 'Content-type': 'multipart/form-data; boundary=%s; charset=utf-8'%boundary, 'Content-length': str(len(body)) } - req = urllib2.Request(self.repository, body, headers) + req = urllib.request.Request(self.repository, body, headers) # handle HTTP and include the Basic Auth handler - opener = urllib2.build_opener( - urllib2.HTTPBasicAuthHandler(password_mgr=auth) + opener = urllib.request.build_opener( + urllib.request.HTTPBasicAuthHandler(password_mgr=auth) ) data = '' try: result = opener.open(req) - except urllib2.HTTPError as e: + except urllib.error.HTTPError as e: if self.show_response: data = e.fp.read() result = e.code, e.msg - except urllib2.URLError as e: + except urllib.error.URLError as e: result = 500, str(e) else: if self.show_response: diff --git a/Lib/distutils/command/upload.py b/Lib/distutils/command/upload.py index 2cad2c7..5049f03 100644 --- a/Lib/distutils/command/upload.py +++ b/Lib/distutils/command/upload.py @@ -13,7 +13,7 @@ import platform import configparser import http.client import base64 -import urlparse +import urllib.parse class upload(PyPIRCCommand): @@ -145,10 +145,11 @@ class upload(PyPIRCCommand): self.announce("Submitting %s to %s" % (filename, self.repository), log.INFO) # build the Request - # We can't use urllib2 since we need to send the Basic + # We can't use urllib since we need to send the Basic # auth right with the first request + # TODO(jhylton): Can we fix urllib? schema, netloc, url, params, query, fragments = \ - urlparse.urlparse(self.repository) + urllib.parse.urlparse(self.repository) assert not params and not query and not fragments if schema == 'http': http = http.client.HTTPConnection(netloc) diff --git a/Lib/email/utils.py b/Lib/email/utils.py index 0439aff..e1d21f6 100644 --- a/Lib/email/utils.py +++ b/Lib/email/utils.py @@ -25,6 +25,7 @@ import time import base64 import random import socket +import urllib.parse import warnings from io import StringIO @@ -218,8 +219,7 @@ def encode_rfc2231(s, charset=None, language=None): charset is given but not language, the string is encoded using the empty string for language. """ - import urllib - s = urllib.quote(s, safe='') + s = urllib.parse.quote(s, safe='') if charset is None and language is None: return s if language is None: @@ -234,7 +234,6 @@ def decode_params(params): params is a sequence of 2-tuples containing (param name, string value). 
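The register.py hunks above swap urllib2's opener machinery for the urllib.request equivalents and take urlparse() from urllib.parse. Roughly, the new-style authenticated post looks like the sketch below; the repository URL, realm and credentials are made up for illustration:

    import urllib.error
    import urllib.parse
    import urllib.request

    repository = "https://pypi.example.org/pypi"        # illustrative URL
    auth = urllib.request.HTTPPasswordMgr()
    host = urllib.parse.urlparse(repository)[1]
    auth.add_password("pypi", host, "user", "secret")   # made-up realm/credentials
    opener = urllib.request.build_opener(
        urllib.request.HTTPBasicAuthHandler(password_mgr=auth))
    try:
        result = opener.open(urllib.request.Request(repository))
    except urllib.error.HTTPError as e:
        result = e.code, e.msg
    except urllib.error.URLError as e:
        result = 500, str(e)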
""" - import urllib # Copy params so we don't mess with the original params = params[:] new_params = [] @@ -272,7 +271,7 @@ def decode_params(params): # language specifiers at the beginning of the string. for num, s, encoded in continuations: if encoded: - s = urllib.unquote(s) + s = urllib.parse.unquote(s) extended = True value.append(s) value = quote(EMPTYSTRING.join(value)) diff --git a/Lib/http/client.py b/Lib/http/client.py index 04e75f6..96bcd72 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -70,7 +70,7 @@ import io import socket import email.parser import email.message -from urlparse import urlsplit +from urllib.parse import urlsplit import warnings __all__ = ["HTTPResponse", "HTTPConnection", diff --git a/Lib/http/cookiejar.py b/Lib/http/cookiejar.py index 99be888..e9b83ea 100644 --- a/Lib/http/cookiejar.py +++ b/Lib/http/cookiejar.py @@ -28,7 +28,10 @@ http://wwwsearch.sf.net/): __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] -import re, urlparse, copy, time, urllib +import copy +import re +import time +import urllib.parse, urllib.request try: import threading as _threading except ImportError: @@ -580,7 +583,7 @@ def request_host(request): """ url = request.get_full_url() - host = urlparse.urlparse(url)[1] + host = urllib.parse.urlparse(url)[1] if host == "": host = request.get_header("Host", "") @@ -602,13 +605,11 @@ def eff_request_host(request): def request_path(request): """request-URI, as defined by RFC 2965.""" url = request.get_full_url() - #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url) - #req_path = escape_path("".join(urlparse.urlparse(url)[2:])) - path, parameters, query, frag = urlparse.urlparse(url)[2:] + path, parameters, query, frag = urllib.parse.urlparse(url)[2:] if parameters: path = "%s;%s" % (path, parameters) path = escape_path(path) - req_path = urlparse.urlunparse(("", "", path, "", query, frag)) + req_path = urllib.parse.urlunparse(("", "", path, "", query, frag)) if not req_path.startswith("/"): # fix bad RFC 2396 absoluteURI req_path = "/"+req_path @@ -644,7 +645,7 @@ def escape_path(path): # And here, kind of: draft-fielding-uri-rfc2396bis-03 # (And in draft IRI specification: draft-duerst-iri-05) # (And here, for new URI schemes: RFC 2718) - path = urllib.quote(path, HTTP_PATH_SAFE) + path = urllib.parse.quote(path, HTTP_PATH_SAFE) path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) return path @@ -1197,8 +1198,7 @@ class CookieJar: """Collection of HTTP cookies. You may not need to know about this class: try - urllib2.build_opener(HTTPCookieProcessor).open(url). - + urllib.request.build_opener(HTTPCookieProcessor).open(url). """ non_word_re = re.compile(r"\W") diff --git a/Lib/http/server.py b/Lib/http/server.py index 35ade6c..6259a4d 100644 --- a/Lib/http/server.py +++ b/Lib/http/server.py @@ -93,7 +93,7 @@ import cgi import time import socket # For gethostbyaddr() import shutil -import urllib +import urllib.parse import select import mimetypes import posixpath @@ -683,7 +683,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): return None list.sort(key=lambda a: a.lower()) r = [] - displaypath = cgi.escape(urllib.unquote(self.path)) + displaypath = cgi.escape(urllib.parse.unquote(self.path)) r.append('') r.append("\nDirectory listing for %s\n" % displaypath) r.append("\n
<h2>Directory listing for %s</h2>
\n" % displaypath) @@ -699,7 +699,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): displayname = name + "@" # Note: a link to a directory displays with @ and links with / r.append('
  • %s\n' - % (urllib.quote(linkname), cgi.escape(displayname))) + % (urllib.parse.quote(linkname), cgi.escape(displayname))) r.append("\n
    \n\n\n") enc = sys.getfilesystemencoding() encoded = ''.join(r).encode(enc) @@ -723,7 +723,7 @@ class SimpleHTTPRequestHandler(BaseHTTPRequestHandler): # abandon query parameters path = path.split('?',1)[0] path = path.split('#',1)[0] - path = posixpath.normpath(urllib.unquote(path)) + path = posixpath.normpath(urllib.parse.unquote(path)) words = path.split('/') words = filter(None, words) path = os.getcwd() @@ -947,7 +947,7 @@ class CGIHTTPRequestHandler(SimpleHTTPRequestHandler): env['SERVER_PROTOCOL'] = self.protocol_version env['SERVER_PORT'] = str(self.server.server_port) env['REQUEST_METHOD'] = self.command - uqrest = urllib.unquote(rest) + uqrest = urllib.parse.unquote(rest) env['PATH_INFO'] = uqrest env['PATH_TRANSLATED'] = self.translate_path(uqrest) env['SCRIPT_NAME'] = scriptname diff --git a/Lib/macurl2path.py b/Lib/macurl2path.py index 0c8b64f..11944cf 100644 --- a/Lib/macurl2path.py +++ b/Lib/macurl2path.py @@ -2,7 +2,7 @@ Do not import directly; use urllib instead.""" -import urllib +import urllib.parse import os __all__ = ["url2pathname","pathname2url"] @@ -13,7 +13,7 @@ def url2pathname(pathname): # # XXXX The .. handling should be fixed... # - tp = urllib.splittype(pathname)[0] + tp = urllib.parsesplittype(pathname)[0] if tp and tp != 'file': raise RuntimeError('Cannot convert non-local URL to pathname') # Turn starting /// into /, an empty hostname means current host @@ -47,7 +47,7 @@ def url2pathname(pathname): i = i + 1 rv = ':' + ':'.join(components) # and finally unquote slashes and other funny characters - return urllib.unquote(rv) + return urllib.parseunquote(rv) def pathname2url(pathname): """OS-specific conversion from a file system path to a relative URL @@ -73,8 +73,8 @@ def pathname2url(pathname): return '/'.join(components) def _pncomp2url(component): - component = urllib.quote(component[:31], safe='') # We want to quote slashes - return component + # We want to quote slashes + return urllib.parsequote(component[:31], safe='') def test(): for url in ["index.html", diff --git a/Lib/mimetypes.py b/Lib/mimetypes.py index 6c36a9c..5812c0c 100644 --- a/Lib/mimetypes.py +++ b/Lib/mimetypes.py @@ -24,7 +24,7 @@ read_mime_types(file) -- parse one file, return a dictionary or None import os import posixpath -import urllib +import urllib.parse __all__ = [ "guess_type","guess_extension","guess_all_extensions", @@ -104,7 +104,7 @@ class MimeTypes: Optional `strict' argument when False adds a bunch of commonly found, but non-standard types. """ - scheme, url = urllib.splittype(url) + scheme, url = urllib.parse.splittype(url) if scheme == 'data': # syntax of data URLs: # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data diff --git a/Lib/robotparser.py b/Lib/robotparser.py deleted file mode 100644 index d1ef460..0000000 --- a/Lib/robotparser.py +++ /dev/null @@ -1,212 +0,0 @@ -""" robotparser.py - - Copyright (C) 2000 Bastian Kleineidam - - You can choose between two licenses when using this package: - 1) GNU GPLv2 - 2) PSF license for Python 2.2 - - The robots.txt Exclusion Protocol is implemented as specified in - http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html -""" -import urlparse -import urllib - -__all__ = ["RobotFileParser"] - -class RobotFileParser: - """ This class provides a set of methods to read, parse and answer - questions about a single robots.txt file. 
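The robotparser module being removed here reappears with the same behaviour as urllib.robotparser, keeping the RobotFileParser interface. A short usage sketch with made-up rules and URLs:

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.parse(["User-agent: *", "Disallow: /private/"])
    print(rp.can_fetch("*", "http://www.example.com/private/x.html"))  # False
    print(rp.can_fetch("*", "http://www.example.com/index.html"))      # True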
- - """ - - def __init__(self, url=''): - self.entries = [] - self.default_entry = None - self.disallow_all = False - self.allow_all = False - self.set_url(url) - self.last_checked = 0 - - def mtime(self): - """Returns the time the robots.txt file was last fetched. - - This is useful for long-running web spiders that need to - check for new robots.txt files periodically. - - """ - return self.last_checked - - def modified(self): - """Sets the time the robots.txt file was last fetched to the - current time. - - """ - import time - self.last_checked = time.time() - - def set_url(self, url): - """Sets the URL referring to a robots.txt file.""" - self.url = url - self.host, self.path = urlparse.urlparse(url)[1:3] - - def read(self): - """Reads the robots.txt URL and feeds it to the parser.""" - opener = URLopener() - f = opener.open(self.url) - lines = [] - line = f.readline() - while line: - lines.append(line.strip()) - line = f.readline() - self.errcode = opener.errcode - if self.errcode in (401, 403): - self.disallow_all = True - elif self.errcode >= 400: - self.allow_all = True - elif self.errcode == 200 and lines: - self.parse(lines) - - def _add_entry(self, entry): - if "*" in entry.useragents: - # the default entry is considered last - self.default_entry = entry - else: - self.entries.append(entry) - - def parse(self, lines): - """parse the input lines from a robots.txt file. - We allow that a user-agent: line is not preceded by - one or more blank lines.""" - state = 0 - linenumber = 0 - entry = Entry() - - for line in lines: - linenumber = linenumber + 1 - if not line: - if state == 1: - entry = Entry() - state = 0 - elif state == 2: - self._add_entry(entry) - entry = Entry() - state = 0 - # remove optional comment and strip line - i = line.find('#') - if i >= 0: - line = line[:i] - line = line.strip() - if not line: - continue - line = line.split(':', 1) - if len(line) == 2: - line[0] = line[0].strip().lower() - line[1] = urllib.unquote(line[1].strip()) - if line[0] == "user-agent": - if state == 2: - self._add_entry(entry) - entry = Entry() - entry.useragents.append(line[1]) - state = 1 - elif line[0] == "disallow": - if state != 0: - entry.rulelines.append(RuleLine(line[1], False)) - state = 2 - elif line[0] == "allow": - if state != 0: - entry.rulelines.append(RuleLine(line[1], True)) - if state == 2: - self.entries.append(entry) - - - def can_fetch(self, useragent, url): - """using the parsed robots.txt decide if useragent can fetch url""" - if self.disallow_all: - return False - if self.allow_all: - return True - # search for given user agent matches - # the first match counts - url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/" - for entry in self.entries: - if entry.applies_to(useragent): - return entry.allowance(url) - # try the default entry last - if self.default_entry: - return self.default_entry.allowance(url) - # agent not found ==> access granted - return True - - - def __str__(self): - return ''.join([str(entry) + "\n" for entry in self.entries]) - - -class RuleLine: - """A rule line is a single "Allow:" (allowance==True) or "Disallow:" - (allowance==False) followed by a path.""" - def __init__(self, path, allowance): - if path == '' and not allowance: - # an empty value means allow all - allowance = True - self.path = urllib.quote(path) - self.allowance = allowance - - def applies_to(self, filename): - return self.path == "*" or filename.startswith(self.path) - - def __str__(self): - return (self.allowance and "Allow" or "Disallow") + ": " + 
self.path - - -class Entry: - """An entry has one or more user-agents and zero or more rulelines""" - def __init__(self): - self.useragents = [] - self.rulelines = [] - - def __str__(self): - ret = [] - for agent in self.useragents: - ret.extend(["User-agent: ", agent, "\n"]) - for line in self.rulelines: - ret.extend([str(line), "\n"]) - return ''.join(ret) - - def applies_to(self, useragent): - """check if this entry applies to the specified agent""" - # split the name token and make it lower case - useragent = useragent.split("/")[0].lower() - for agent in self.useragents: - if agent == '*': - # we have the catch-all agent - return True - agent = agent.lower() - if agent in useragent: - return True - return False - - def allowance(self, filename): - """Preconditions: - - our agent applies to this entry - - filename is URL decoded""" - for line in self.rulelines: - if line.applies_to(filename): - return line.allowance - return True - -class URLopener(urllib.FancyURLopener): - def __init__(self, *args): - urllib.FancyURLopener.__init__(self, *args) - self.errcode = 200 - - def prompt_user_passwd(self, host, realm): - ## If robots.txt file is accessible only with a password, - ## we act as if the file wasn't there. - return None, None - - def http_error_default(self, url, fp, errcode, errmsg, headers): - self.errcode = errcode - return urllib.FancyURLopener.http_error_default(self, url, fp, errcode, - errmsg, headers) diff --git a/Lib/test/regrtest.py b/Lib/test/regrtest.py index 217703b..94c2248 100755 --- a/Lib/test/regrtest.py +++ b/Lib/test/regrtest.py @@ -725,7 +725,7 @@ def dash_R(the_module, test, indirect_test, huntrleaks): def dash_R_cleanup(fs, ps, pic, abcs): import gc, copyreg import _strptime, linecache - import urlparse, urllib, urllib2, mimetypes, doctest + import urllib.parse, urllib.request, mimetypes, doctest import struct, filecmp, _abcoll from distutils.dir_util import _path_created from weakref import WeakSet @@ -758,9 +758,8 @@ def dash_R_cleanup(fs, ps, pic, abcs): _path_created.clear() re.purge() _strptime._regex_cache.clear() - urlparse.clear_cache() - urllib.urlcleanup() - urllib2.install_opener(None) + urllib.parse.clear_cache() + urllib.request.urlcleanup() linecache.clearcache() mimetypes._default_mime_types() filecmp._cache.clear() diff --git a/Lib/test/support.py b/Lib/test/support.py index c011c10..c6ce760 100644 --- a/Lib/test/support.py +++ b/Lib/test/support.py @@ -352,10 +352,10 @@ def check_syntax_error(testcase, statement): testcase.fail('Missing SyntaxError: "%s"' % statement) def open_urlresource(url, *args, **kw): - import urllib, urlparse + import urllib.request, urllib.parse requires('urlfetch') - filename = urlparse.urlparse(url)[2].split('/')[-1] # '/': it's URL! + filename = urllib.parse.urlparse(url)[2].split('/')[-1] # '/': it's URL! for path in [os.path.curdir, os.path.pardir]: fn = os.path.join(path, filename) @@ -363,7 +363,7 @@ def open_urlresource(url, *args, **kw): return open(fn, *args, **kw) print('\tfetching %s ...' 
% url, file=get_original_stdout()) - fn, _ = urllib.urlretrieve(url, filename) + fn, _ = urllib.request.urlretrieve(url, filename) return open(fn, *args, **kw) diff --git a/Lib/test/test___all__.py b/Lib/test/test___all__.py index ec1ba16..ba8f75b 100644 --- a/Lib/test/test___all__.py +++ b/Lib/test/test___all__.py @@ -111,7 +111,7 @@ class AllTest(unittest.TestCase): self.check_all("re") self.check_all("reprlib") self.check_all("rlcompleter") - self.check_all("robotparser") + self.check_all("urllib.robotparser") self.check_all("sched") self.check_all("shelve") self.check_all("shlex") @@ -134,8 +134,6 @@ class AllTest(unittest.TestCase): self.check_all("traceback") self.check_all("tty") self.check_all("unittest") - self.check_all("urllib") - self.check_all("urlparse") self.check_all("uu") self.check_all("warnings") self.check_all("wave") diff --git a/Lib/test/test_http_cookiejar.py b/Lib/test/test_http_cookiejar.py index c130190..1627923 100644 --- a/Lib/test/test_http_cookiejar.py +++ b/Lib/test/test_http_cookiejar.py @@ -1,6 +1,6 @@ """Tests for http/cookiejar.py.""" -import re, os, time, urllib2 +import re, os, time, urllib.request from unittest import TestCase from test import support @@ -206,7 +206,7 @@ def interact_netscape(cookiejar, url, *set_cookie_hdrs): def _interact(cookiejar, url, set_cookie_hdrs, hdr_name): """Perform a single request / response cycle, returning Cookie: header.""" - req = urllib2.Request(url) + req = urllib.request.Request(url) cookiejar.add_cookie_header(req) cookie_hdr = req.get_header("Cookie", "") headers = [] @@ -330,7 +330,7 @@ class CookieTests(TestCase): ("http://foo/", "foo.local", True), ("http://foo/", ".local", True), ]: - request = urllib2.Request(url) + request = urllib.request.Request(url) r = pol.domain_return_ok(domain, request) if ok: self.assert_(r) else: self.assert_(not r) @@ -547,46 +547,48 @@ class CookieTests(TestCase): def test_request_path(self): # with parameters - req = urllib2.Request("http://www.example.com/rheum/rhaponicum;" - "foo=bar;sing=song?apples=pears&spam=eggs#ni") + req = urllib.request.Request( + "http://www.example.com/rheum/rhaponicum;" + "foo=bar;sing=song?apples=pears&spam=eggs#ni") self.assertEquals(request_path(req), "/rheum/rhaponicum;" "foo=bar;sing=song?apples=pears&spam=eggs#ni") # without parameters - req = urllib2.Request("http://www.example.com/rheum/rhaponicum?" - "apples=pears&spam=eggs#ni") + req = urllib.request.Request( + "http://www.example.com/rheum/rhaponicum?" + "apples=pears&spam=eggs#ni") self.assertEquals(request_path(req), "/rheum/rhaponicum?" 
"apples=pears&spam=eggs#ni") # missing final slash - req = urllib2.Request("http://www.example.com") + req = urllib.request.Request("http://www.example.com") self.assertEquals(request_path(req), "/") def test_request_port(self): - req = urllib2.Request("http://www.acme.com:1234/", - headers={"Host": "www.acme.com:4321"}) + req = urllib.request.Request("http://www.acme.com:1234/", + headers={"Host": "www.acme.com:4321"}) self.assertEquals(request_port(req), "1234") - req = urllib2.Request("http://www.acme.com/", - headers={"Host": "www.acme.com:4321"}) + req = urllib.request.Request("http://www.acme.com/", + headers={"Host": "www.acme.com:4321"}) self.assertEquals(request_port(req), DEFAULT_HTTP_PORT) def test_request_host(self): # this request is illegal (RFC2616, 14.2.3) - req = urllib2.Request("http://1.1.1.1/", - headers={"Host": "www.acme.com:80"}) + req = urllib.request.Request("http://1.1.1.1/", + headers={"Host": "www.acme.com:80"}) # libwww-perl wants this response, but that seems wrong (RFC 2616, # section 5.2, point 1., and RFC 2965 section 1, paragraph 3) #self.assertEquals(request_host(req), "www.acme.com") self.assertEquals(request_host(req), "1.1.1.1") - req = urllib2.Request("http://www.acme.com/", - headers={"Host": "irrelevant.com"}) + req = urllib.request.Request("http://www.acme.com/", + headers={"Host": "irrelevant.com"}) self.assertEquals(request_host(req), "www.acme.com") # not actually sure this one is valid Request object, so maybe should # remove test for no host in url in request_host function? - req = urllib2.Request("/resource.html", - headers={"Host": "www.acme.com"}) + req = urllib.request.Request("/resource.html", + headers={"Host": "www.acme.com"}) self.assertEquals(request_host(req), "www.acme.com") # port shouldn't be in request-host - req = urllib2.Request("http://www.acme.com:2345/resource.html", - headers={"Host": "www.acme.com:5432"}) + req = urllib.request.Request("http://www.acme.com:2345/resource.html", + headers={"Host": "www.acme.com:5432"}) self.assertEquals(request_host(req), "www.acme.com") def test_is_HDN(self): @@ -766,24 +768,24 @@ class CookieTests(TestCase): blocked_domains=["acme.com"], allowed_domains=["www.acme.com"])) - req = urllib2.Request("http://acme.com/") + req = urllib.request.Request("http://acme.com/") headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] res = FakeResponse(headers, "http://acme.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 0) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 1) - req = urllib2.Request("http://www.coyote.com/") + req = urllib.request.Request("http://www.coyote.com/") res = FakeResponse(headers, "http://www.coyote.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 1) # set a cookie with non-allowed domain... 
- req = urllib2.Request("http://www.coyote.com/") + req = urllib.request.Request("http://www.coyote.com/") res = FakeResponse(headers, "http://www.coyote.com/") cookies = c.make_cookies(res, req) c.set_cookie(cookies[0]) @@ -798,7 +800,7 @@ class CookieTests(TestCase): c = CookieJar(policy=pol) headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) self.assertEquals(len(c), 0) @@ -808,11 +810,11 @@ class CookieTests(TestCase): self.assertEquals(len(c), 1) c.clear() - req = urllib2.Request("http://www.roadrunner.net/") + req = urllib.request.Request("http://www.roadrunner.net/") res = FakeResponse(headers, "http://www.roadrunner.net/") c.extract_cookies(res, req) self.assertEquals(len(c), 1) - req = urllib2.Request("http://www.roadrunner.net/") + req = urllib.request.Request("http://www.roadrunner.net/") c.add_cookie_header(req) self.assert_((req.has_header("Cookie") and req.has_header("Cookie2"))) @@ -823,7 +825,7 @@ class CookieTests(TestCase): self.assertEquals(len(c), 1) # set a cookie with blocked domain... - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") res = FakeResponse(headers, "http://www.acme.com/") cookies = c.make_cookies(res, req) c.set_cookie(cookies[0]) @@ -866,7 +868,7 @@ class CookieTests(TestCase): url = "http://www.acme.com" c = CookieJar(DefaultCookiePolicy(rfc2965=True)) interact_2965(c, url, "foo=bar; Version=1") - req = urllib2.Request(url) + req = urllib.request.Request(url) self.assertEquals(len(c), 1) c.add_cookie_header(req) self.assert_(req.has_header("Cookie")) @@ -1009,7 +1011,7 @@ class CookieTests(TestCase): def cookiejar_from_cookie_headers(headers): c = CookieJar() - req = urllib2.Request("http://www.example.com/") + req = urllib.request.Request("http://www.example.com/") r = FakeResponse(headers, "http://www.example.com/") c.extract_cookies(r, req) return c @@ -1080,9 +1082,9 @@ class LWPCookieTests(TestCase): c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) - #req = urllib2.Request("http://1.1.1.1/", + #req = urllib.request.Request("http://1.1.1.1/", # headers={"Host": "www.acme.com:80"}) - req = urllib2.Request("http://www.acme.com:80/", + req = urllib.request.Request("http://www.acme.com:80/", headers={"Host": "www.acme.com:80"}) headers.append( @@ -1091,7 +1093,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") c.add_cookie_header(req) self.assertEqual(req.get_header("Cookie"), "CUSTOMER=WILE_E_COYOTE") @@ -1101,7 +1103,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/foo/bar") + req = urllib.request.Request("http://www.acme.com/foo/bar") c.add_cookie_header(req) h = req.get_header("Cookie") @@ -1112,7 +1114,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") c.add_cookie_header(req) h = req.get_header("Cookie") @@ -1120,7 +1122,7 @@ class LWPCookieTests(TestCase): "CUSTOMER=WILE_E_COYOTE" in h and "SHIPPING=FEDEX" not in h) - req = 
urllib2.Request("http://www.acme.com/foo/") + req = urllib.request.Request("http://www.acme.com/foo/") c.add_cookie_header(req) h = req.get_header("Cookie") @@ -1155,13 +1157,13 @@ class LWPCookieTests(TestCase): c = CookieJar() headers = [] - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") headers.append("Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/") res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/") + req = urllib.request.Request("http://www.acme.com/") c.add_cookie_header(req) self.assertEquals(req.get_header("Cookie"), @@ -1172,7 +1174,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.acme.com/ammo") + req = urllib.request.Request("http://www.acme.com/ammo") c.add_cookie_header(req) self.assert_(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*" @@ -1503,7 +1505,7 @@ class LWPCookieTests(TestCase): # Some additional Netscape cookies tests. c = CookieJar() headers = [] - req = urllib2.Request("http://foo.bar.acme.com/foo") + req = urllib.request.Request("http://foo.bar.acme.com/foo") # Netscape allows a host part that contains dots headers.append("Set-Cookie: Customer=WILE_E_COYOTE; domain=.acme.com") @@ -1517,7 +1519,7 @@ class LWPCookieTests(TestCase): res = FakeResponse(headers, "http://www.acme.com/foo") c.extract_cookies(res, req) - req = urllib2.Request("http://foo.bar.acme.com/foo") + req = urllib.request.Request("http://foo.bar.acme.com/foo") c.add_cookie_header(req) self.assert_( "PART_NUMBER=3,4" in req.get_header("Cookie") and @@ -1559,12 +1561,12 @@ class LWPCookieTests(TestCase): c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) headers = [] - req = urllib2.Request("http://www.ants.com/") + req = urllib.request.Request("http://www.ants.com/") headers.append("Set-Cookie: JSESSIONID=ABCDERANDOM123; Path=") res = FakeResponse(headers, "http://www.ants.com/") c.extract_cookies(res, req) - req = urllib2.Request("http://www.ants.com/") + req = urllib.request.Request("http://www.ants.com/") c.add_cookie_header(req) self.assertEquals(req.get_header("Cookie"), @@ -1572,7 +1574,7 @@ class LWPCookieTests(TestCase): self.assertEquals(req.get_header("Cookie2"), '$Version="1"') # missing path in the request URI - req = urllib2.Request("http://www.ants.com:8080") + req = urllib.request.Request("http://www.ants.com:8080") c.add_cookie_header(req) self.assertEquals(req.get_header("Cookie"), @@ -1585,7 +1587,7 @@ class LWPCookieTests(TestCase): # Check session cookies are deleted properly by # CookieJar.clear_session_cookies method - req = urllib2.Request('http://www.perlmeister.com/scripts') + req = urllib.request.Request('http://www.perlmeister.com/scripts') headers = [] headers.append("Set-Cookie: s1=session;Path=/scripts") headers.append("Set-Cookie: p1=perm; Domain=.perlmeister.com;" diff --git a/Lib/test/test_httpservers.py b/Lib/test/test_httpservers.py index 94c6a3d..0305a90 100644 --- a/Lib/test/test_httpservers.py +++ b/Lib/test/test_httpservers.py @@ -11,7 +11,7 @@ import os import sys import base64 import shutil -import urllib +import urllib.parse import http.client import tempfile import threading @@ -322,7 +322,8 @@ class CGIHTTPServerTestCase(BaseTestCase): (res.read(), res.getheader('Content-type'), res.status)) def test_post(self): - params = urllib.urlencode({'spam' : 1, 'eggs' : 'python', 'bacon' : 123456}) + params = 
urllib.parse.urlencode( + {'spam' : 1, 'eggs' : 'python', 'bacon' : 123456}) headers = {'Content-type' : 'application/x-www-form-urlencoded'} res = self.request('/cgi-bin/file2.py', 'POST', params, headers) diff --git a/Lib/test/test_importhooks.py b/Lib/test/test_importhooks.py index eaf213d..acf45fb 100644 --- a/Lib/test/test_importhooks.py +++ b/Lib/test/test_importhooks.py @@ -247,22 +247,22 @@ class ImportHooksTestCase(ImportHooksBaseTestCase): i = ImpWrapper() sys.meta_path.append(i) sys.path_hooks.append(ImpWrapper) - mnames = ("colorsys", "urlparse", "distutils.core") + mnames = ("colorsys", "urllib.parse", "distutils.core") for mname in mnames: parent = mname.split(".")[0] - for n in list(sys.modules.keys()): + for n in list(sys.modules): if n.startswith(parent): del sys.modules[n] for mname in mnames: m = __import__(mname, globals(), locals(), ["__dummy__"]) m.__loader__ # to make sure we actually handled the import - # Delete urllib from modules because urlparse was imported above. - # Without this hack, test_socket_ssl fails if run in this order: - # regrtest.py test_codecmaps_tw test_importhooks test_socket_ssl - try: - del sys.modules['urllib'] - except KeyError: - pass +## # Delete urllib from modules because urlparse was imported above. +## # Without this hack, test_socket_ssl fails if run in this order: +## # regrtest.py test_codecmaps_tw test_importhooks test_socket_ssl +## try: +## del sys.modules['urllib'] +## except KeyError: +## pass def test_main(): support.run_unittest(ImportHooksTestCase) diff --git a/Lib/test/test_pyclbr.py b/Lib/test/test_pyclbr.py index 8287877..9438c7b 100644 --- a/Lib/test/test_pyclbr.py +++ b/Lib/test/test_pyclbr.py @@ -156,16 +156,6 @@ class PyclbrTest(TestCase): # These were once about the 10 longest modules cm('random', ignore=('Random',)) # from _random import Random as CoreGenerator cm('cgi', ignore=('log',)) # set with = in module - cm('urllib', ignore=('_CFNumberToInt32', - '_CStringFromCFString', - '_CFSetup', - 'getproxies_registry', - 'proxy_bypass_registry', - 'proxy_bypass_macosx_sysconf', - 'open_https', - '_https_connection', - 'getproxies_macosx_sysconf', - 'getproxies_internetconfig',)) # not on all platforms cm('pickle') cm('aifc', ignore=('openfp',)) # set with = in module cm('sre_parse', ignore=('dump',)) # from sre_constants import * diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 4e530f0..fbb02bc 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -1,5 +1,6 @@ -import unittest, robotparser import io +import unittest +import urllib.robotparser from test import support class RobotTestCase(unittest.TestCase): @@ -34,7 +35,7 @@ def RobotTest(index, robots_txt, good_urls, bad_urls, agent="test_robotparser"): lines = io.StringIO(robots_txt).readlines() - parser = robotparser.RobotFileParser() + parser = urllib.robotparser.RobotFileParser() parser.parse(lines) for url in good_urls: tests.addTest(RobotTestCase(index, parser, url, 1, agent)) @@ -140,7 +141,7 @@ class TestCase(unittest.TestCase): support.requires('network') # whole site is password-protected. 
url = 'http://mueblesmoraleda.com' - parser = robotparser.RobotFileParser() + parser = urllib.robotparser.RobotFileParser() parser.set_url(url) parser.read() self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False) diff --git a/Lib/test/test_ssl.py b/Lib/test/test_ssl.py index 619161e..9341bf9 100644 --- a/Lib/test/test_ssl.py +++ b/Lib/test/test_ssl.py @@ -10,7 +10,7 @@ import subprocess import time import os import pprint -import urllib, urlparse +import urllib.parse, urllib.request import shutil import traceback import asyncore @@ -440,8 +440,8 @@ else: """ # abandon query parameters - path = urlparse.urlparse(path)[2] - path = os.path.normpath(urllib.unquote(path)) + path = urllib.parse.urlparse(path)[2] + path = os.path.normpath(urllib.parse.unquote(path)) words = path.split('/') words = filter(None, words) path = self.root @@ -943,7 +943,7 @@ else: # now fetch the same data from the HTTPS server url = 'https://%s:%d/%s' % ( HOST, server.port, os.path.split(CERTFILE)[1]) - f = urllib.urlopen(url) + f = urllib.request.urlopen(url) dlen = f.info().get("content-length") if dlen and (int(dlen) > 0): d2 = f.read(int(dlen)) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 2b41cad..f5a9d5d 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1,6 +1,7 @@ """Regresssion tests for urllib""" -import urllib +import urllib.parse +import urllib.request import http.client import email.message import io @@ -16,6 +17,23 @@ def hexescape(char): hex_repr = "0%s" % hex_repr return "%" + hex_repr +# Shortcut for testing FancyURLopener +_urlopener = None +def urlopen(url, data=None, proxies=None): + """urlopen(url [, data]) -> open file-like object""" + global _urlopener + if proxies is not None: + opener = urllib.request.FancyURLopener(proxies=proxies) + elif not _urlopener: + opener = urllib.request.FancyURLopener() + _urlopener = opener + else: + opener = _urlopener + if data is None: + return opener.open(url) + else: + return opener.open(url, data) + class urlopen_FileTests(unittest.TestCase): """Test urlopen() opening a temporary file. @@ -25,15 +43,16 @@ class urlopen_FileTests(unittest.TestCase): """ def setUp(self): - """Setup of a temp file to use for testing""" - self.text = bytes("test_urllib: %s\n" % self.__class__.__name__, "ascii") - FILE = open(support.TESTFN, 'wb') + # Create a temp file to use for testing + self.text = bytes("test_urllib: %s\n" % self.__class__.__name__, + "ascii") + f = open(support.TESTFN, 'wb') try: - FILE.write(self.text) + f.write(self.text) finally: - FILE.close() + f.close() self.pathname = support.TESTFN - self.returned_obj = urllib.urlopen("file:%s" % self.pathname) + self.returned_obj = urlopen("file:%s" % self.pathname) def tearDown(self): """Shut down the open object""" @@ -119,7 +138,7 @@ class urlopen_HttpTests(unittest.TestCase): def test_read(self): self.fakehttp(b"Hello!") try: - fp = urllib.urlopen("http://python.org/") + fp = urlopen("http://python.org/") self.assertEqual(fp.readline(), b"Hello!") self.assertEqual(fp.readline(), b"") self.assertEqual(fp.geturl(), 'http://python.org/') @@ -136,7 +155,7 @@ Connection: close Content-Type: text/html; charset=iso-8859-1 ''') try: - self.assertRaises(IOError, urllib.urlopen, "http://python.org/") + self.assertRaises(IOError, urlopen, "http://python.org/") finally: self.unfakehttp() @@ -145,7 +164,7 @@ Content-Type: text/html; charset=iso-8859-1 # data. 
(#1680230) self.fakehttp(b'') try: - self.assertRaises(IOError, urllib.urlopen, "http://something") + self.assertRaises(IOError, urlopen, "http://something") finally: self.unfakehttp() @@ -180,7 +199,8 @@ class urlretrieve_FileTests(unittest.TestCase): except: pass def constructLocalFileUrl(self, filePath): - return "file://%s" % urllib.pathname2url(os.path.abspath(filePath)) + return "file://%s" % urllib.request.pathname2url( + os.path.abspath(filePath)) def createNewTempFile(self, data=b""): """Creates a new temporary file containing the specified data, @@ -204,7 +224,7 @@ class urlretrieve_FileTests(unittest.TestCase): def test_basic(self): # Make sure that a local file just gets its own location returned and # a headers value is returned. - result = urllib.urlretrieve("file:%s" % support.TESTFN) + result = urllib.request.urlretrieve("file:%s" % support.TESTFN) self.assertEqual(result[0], support.TESTFN) self.assert_(isinstance(result[1], email.message.Message), "did not get a email.message.Message instance as second " @@ -214,7 +234,7 @@ class urlretrieve_FileTests(unittest.TestCase): # Test that setting the filename argument works. second_temp = "%s.2" % support.TESTFN self.registerFileForCleanUp(second_temp) - result = urllib.urlretrieve(self.constructLocalFileUrl( + result = urllib.request.urlretrieve(self.constructLocalFileUrl( support.TESTFN), second_temp) self.assertEqual(second_temp, result[0]) self.assert_(os.path.exists(second_temp), "copy of the file was not " @@ -238,7 +258,8 @@ class urlretrieve_FileTests(unittest.TestCase): count_holder[0] = count_holder[0] + 1 second_temp = "%s.2" % support.TESTFN self.registerFileForCleanUp(second_temp) - urllib.urlretrieve(self.constructLocalFileUrl(support.TESTFN), + urllib.request.urlretrieve( + self.constructLocalFileUrl(support.TESTFN), second_temp, hooktester) def test_reporthook_0_bytes(self): @@ -247,7 +268,7 @@ class urlretrieve_FileTests(unittest.TestCase): def hooktester(count, block_size, total_size, _report=report): _report.append((count, block_size, total_size)) srcFileName = self.createNewTempFile() - urllib.urlretrieve(self.constructLocalFileUrl(srcFileName), + urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 1) self.assertEqual(report[0][2], 0) @@ -261,7 +282,7 @@ class urlretrieve_FileTests(unittest.TestCase): def hooktester(count, block_size, total_size, _report=report): _report.append((count, block_size, total_size)) srcFileName = self.createNewTempFile(b"x" * 5) - urllib.urlretrieve(self.constructLocalFileUrl(srcFileName), + urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 2) self.assertEqual(report[0][1], 8192) @@ -275,7 +296,7 @@ class urlretrieve_FileTests(unittest.TestCase): def hooktester(count, block_size, total_size, _report=report): _report.append((count, block_size, total_size)) srcFileName = self.createNewTempFile(b"x" * 8193) - urllib.urlretrieve(self.constructLocalFileUrl(srcFileName), + urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName), support.TESTFN, hooktester) self.assertEqual(len(report), 3) self.assertEqual(report[0][1], 8192) @@ -284,10 +305,10 @@ class urlretrieve_FileTests(unittest.TestCase): class QuotingTests(unittest.TestCase): """Tests for urllib.quote() and urllib.quote_plus() - According to RFC 2396 ("Uniform Resource Identifiers), to escape a - character you write it as '%' + <2 character US-ASCII hex value>. 
The Python - code of ``'%' + hex(ord())[2:]`` escapes a character properly. - Case does not matter on the hex letters. + According to RFC 2396 (Uniform Resource Identifiers), to escape a + character you write it as '%' + <2 character US-ASCII hex value>. + The Python code of ``'%' + hex(ord())[2:]`` escapes a + character properly. Case does not matter on the hex letters. The various character sets specified are: @@ -313,24 +334,24 @@ class QuotingTests(unittest.TestCase): "abcdefghijklmnopqrstuvwxyz", "0123456789", "_.-"]) - result = urllib.quote(do_not_quote) + result = urllib.parse.quote(do_not_quote) self.assertEqual(do_not_quote, result, "using quote(): %s != %s" % (do_not_quote, result)) - result = urllib.quote_plus(do_not_quote) + result = urllib.parse.quote_plus(do_not_quote) self.assertEqual(do_not_quote, result, "using quote_plus(): %s != %s" % (do_not_quote, result)) def test_default_safe(self): # Test '/' is default value for 'safe' parameter - self.assertEqual(urllib.quote.__defaults__[0], '/') + self.assertEqual(urllib.parse.quote.__defaults__[0], '/') def test_safe(self): # Test setting 'safe' parameter does what it should do quote_by_default = "<>" - result = urllib.quote(quote_by_default, safe=quote_by_default) + result = urllib.parse.quote(quote_by_default, safe=quote_by_default) self.assertEqual(quote_by_default, result, "using quote(): %s != %s" % (quote_by_default, result)) - result = urllib.quote_plus(quote_by_default, safe=quote_by_default) + result = urllib.parse.quote_plus(quote_by_default, safe=quote_by_default) self.assertEqual(quote_by_default, result, "using quote_plus(): %s != %s" % (quote_by_default, result)) @@ -343,11 +364,11 @@ class QuotingTests(unittest.TestCase): should_quote.append(chr(127)) # For 0x7F should_quote = ''.join(should_quote) for char in should_quote: - result = urllib.quote(char) + result = urllib.parse.quote(char) self.assertEqual(hexescape(char), result, "using quote(): %s should be escaped to %s, not %s" % (char, hexescape(char), result)) - result = urllib.quote_plus(char) + result = urllib.parse.quote_plus(char) self.assertEqual(hexescape(char), result, "using quote_plus(): " "%s should be escapes to %s, not %s" % @@ -355,7 +376,7 @@ class QuotingTests(unittest.TestCase): del should_quote partial_quote = "ab[]cd" expected = "ab%5B%5Dcd" - result = urllib.quote(partial_quote) + result = urllib.parse.quote(partial_quote) self.assertEqual(expected, result, "using quote(): %s != %s" % (expected, result)) self.assertEqual(expected, result, @@ -364,26 +385,26 @@ class QuotingTests(unittest.TestCase): def test_quoting_space(self): # Make sure quote() and quote_plus() handle spaces as specified in # their unique way - result = urllib.quote(' ') + result = urllib.parse.quote(' ') self.assertEqual(result, hexescape(' '), "using quote(): %s != %s" % (result, hexescape(' '))) - result = urllib.quote_plus(' ') + result = urllib.parse.quote_plus(' ') self.assertEqual(result, '+', "using quote_plus(): %s != +" % result) given = "a b cd e f" expect = given.replace(' ', hexescape(' ')) - result = urllib.quote(given) + result = urllib.parse.quote(given) self.assertEqual(expect, result, "using quote(): %s != %s" % (expect, result)) expect = given.replace(' ', '+') - result = urllib.quote_plus(given) + result = urllib.parse.quote_plus(given) self.assertEqual(expect, result, "using quote_plus(): %s != %s" % (expect, result)) def test_quoting_plus(self): - self.assertEqual(urllib.quote_plus('alpha+beta gamma'), + 
self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma'), 'alpha%2Bbeta+gamma') - self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'), + self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', '+'), 'alpha+beta+gamma') class UnquotingTests(unittest.TestCase): @@ -399,21 +420,21 @@ class UnquotingTests(unittest.TestCase): for num in range(128): given = hexescape(chr(num)) expect = chr(num) - result = urllib.unquote(given) + result = urllib.parse.unquote(given) self.assertEqual(expect, result, "using unquote(): %s != %s" % (expect, result)) - result = urllib.unquote_plus(given) + result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, "using unquote_plus(): %s != %s" % (expect, result)) escape_list.append(given) escape_string = ''.join(escape_list) del escape_list - result = urllib.unquote(escape_string) + result = urllib.parse.unquote(escape_string) self.assertEqual(result.count('%'), 1, "using quote(): not all characters escaped; %s" % result) - result = urllib.unquote(escape_string) + result = urllib.parse.unquote(escape_string) self.assertEqual(result.count('%'), 1, "using unquote(): not all characters escaped: " "%s" % result) @@ -423,10 +444,10 @@ class UnquotingTests(unittest.TestCase): # interspersed given = 'ab%sd' % hexescape('c') expect = "abcd" - result = urllib.unquote(given) + result = urllib.parse.unquote(given) self.assertEqual(expect, result, "using quote(): %s != %s" % (expect, result)) - result = urllib.unquote_plus(given) + result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, "using unquote_plus(): %s != %s" % (expect, result)) @@ -434,16 +455,16 @@ class UnquotingTests(unittest.TestCase): # Test difference between unquote() and unquote_plus() given = "are+there+spaces..." expect = given - result = urllib.unquote(given) + result = urllib.parse.unquote(given) self.assertEqual(expect, result, "using unquote(): %s != %s" % (expect, result)) expect = given.replace('+', ' ') - result = urllib.unquote_plus(given) + result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, "using unquote_plus(): %s != %s" % (expect, result)) def test_unquote_with_unicode(self): - r = urllib.unquote('br%C3%BCckner_sapporo_20050930.doc') + r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc') self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc') class urlencode_Tests(unittest.TestCase): @@ -462,7 +483,7 @@ class urlencode_Tests(unittest.TestCase): """ expect_somewhere = ["1st=1", "2nd=2", "3rd=3"] - result = urllib.urlencode(given) + result = urllib.parse.urlencode(given) for expected in expect_somewhere: self.assert_(expected in result, "testing %s: %s not found in %s" % @@ -495,20 +516,20 @@ class urlencode_Tests(unittest.TestCase): # Make sure keys and values are quoted using quote_plus() given = {"&":"="} expect = "%s=%s" % (hexescape('&'), hexescape('=')) - result = urllib.urlencode(given) + result = urllib.parse.urlencode(given) self.assertEqual(expect, result) given = {"key name":"A bunch of pluses"} expect = "key+name=A+bunch+of+pluses" - result = urllib.urlencode(given) + result = urllib.parse.urlencode(given) self.assertEqual(expect, result) def test_doseq(self): # Test that passing True for 'doseq' parameter works correctly given = {'sequence':['1', '2', '3']} - expect = "sequence=%s" % urllib.quote_plus(str(['1', '2', '3'])) - result = urllib.urlencode(given) + expect = "sequence=%s" % urllib.parse.quote_plus(str(['1', '2', '3'])) + result = urllib.parse.urlencode(given) self.assertEqual(expect, 
result) - result = urllib.urlencode(given, True) + result = urllib.parse.urlencode(given, True) for value in given["sequence"]: expect = "sequence=%s" % value self.assert_(expect in result, @@ -523,11 +544,11 @@ class Pathname_Tests(unittest.TestCase): # Make sure simple tests pass expected_path = os.path.join("parts", "of", "a", "path") expected_url = "parts/of/a/path" - result = urllib.pathname2url(expected_path) + result = urllib.request.pathname2url(expected_path) self.assertEqual(expected_url, result, "pathname2url() failed; %s != %s" % (result, expected_url)) - result = urllib.url2pathname(expected_url) + result = urllib.request.url2pathname(expected_url) self.assertEqual(expected_path, result, "url2pathame() failed; %s != %s" % (result, expected_path)) @@ -536,25 +557,25 @@ class Pathname_Tests(unittest.TestCase): # Test automatic quoting and unquoting works for pathnam2url() and # url2pathname() respectively given = os.path.join("needs", "quot=ing", "here") - expect = "needs/%s/here" % urllib.quote("quot=ing") - result = urllib.pathname2url(given) + expect = "needs/%s/here" % urllib.parse.quote("quot=ing") + result = urllib.request.pathname2url(given) self.assertEqual(expect, result, "pathname2url() failed; %s != %s" % (expect, result)) expect = given - result = urllib.url2pathname(result) + result = urllib.request.url2pathname(result) self.assertEqual(expect, result, "url2pathname() failed; %s != %s" % (expect, result)) given = os.path.join("make sure", "using_quote") - expect = "%s/using_quote" % urllib.quote("make sure") - result = urllib.pathname2url(given) + expect = "%s/using_quote" % urllib.parse.quote("make sure") + result = urllib.request.pathname2url(given) self.assertEqual(expect, result, "pathname2url() failed; %s != %s" % (expect, result)) given = "make+sure/using_unquote" expect = os.path.join("make+sure", "using_unquote") - result = urllib.url2pathname(given) + result = urllib.request.url2pathname(given) self.assertEqual(expect, result, "url2pathname() failed; %s != %s" % (expect, result)) diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index d6b3d57..30aa8f2 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -5,8 +5,8 @@ import os import io import socket -import urllib2 -from urllib2 import Request, OpenerDirector +import urllib.request +from urllib.request import Request, OpenerDirector # XXX # Request @@ -17,10 +17,10 @@ class TrivialTests(unittest.TestCase): def test_trivial(self): # A couple trivial tests - self.assertRaises(ValueError, urllib2.urlopen, 'bogus url') + self.assertRaises(ValueError, urllib.request.urlopen, 'bogus url') # XXX Name hacking to get this to work on Windows. - fname = os.path.abspath(urllib2.__file__).replace('\\', '/') + fname = os.path.abspath(urllib.request.__file__).replace('\\', '/') if fname[1:2] == ":": fname = fname[2:] # And more hacking to get it to work on MacOS. 
This assumes @@ -29,18 +29,21 @@ class TrivialTests(unittest.TestCase): fname = '/' + fname.replace(':', '/') file_url = "file://%s" % fname - f = urllib2.urlopen(file_url) + f = urllib.request.urlopen(file_url) buf = f.read() f.close() def test_parse_http_list(self): - tests = [('a,b,c', ['a', 'b', 'c']), - ('path"o,l"og"i"cal, example', ['path"o,l"og"i"cal', 'example']), - ('a, b, "c", "d", "e,f", g, h', ['a', 'b', '"c"', '"d"', '"e,f"', 'g', 'h']), - ('a="b\\"c", d="e\\,f", g="h\\\\i"', ['a="b"c"', 'd="e,f"', 'g="h\\i"'])] + tests = [ + ('a,b,c', ['a', 'b', 'c']), + ('path"o,l"og"i"cal, example', ['path"o,l"og"i"cal', 'example']), + ('a, b, "c", "d", "e,f", g, h', + ['a', 'b', '"c"', '"d"', '"e,f"', 'g', 'h']), + ('a="b\\"c", d="e\\,f", g="h\\\\i"', + ['a="b"c"', 'd="e,f"', 'g="h\\i"'])] for string, list in tests: - self.assertEquals(urllib2.parse_http_list(string), list) + self.assertEquals(urllib.request.parse_http_list(string), list) def test_request_headers_dict(): @@ -107,7 +110,7 @@ def test_request_headers_methods(): def test_password_manager(self): """ - >>> mgr = urllib2.HTTPPasswordMgr() + >>> mgr = urllib.request.HTTPPasswordMgr() >>> add = mgr.add_password >>> add("Some Realm", "http://example.com/", "joe", "password") >>> add("Some Realm", "http://example.com/ni", "ni", "ni") @@ -172,7 +175,7 @@ def test_password_manager(self): def test_password_manager_default_port(self): """ - >>> mgr = urllib2.HTTPPasswordMgr() + >>> mgr = urllib.request.HTTPPasswordMgr() >>> add = mgr.add_password The point to note here is that we can't guess the default port if there's @@ -288,7 +291,7 @@ class MockHandler: res = MockResponse(200, "OK", {}, "") return self.parent.error("http", args[0], res, code, "", {}) elif action == "raise": - raise urllib2.URLError("blah") + raise urllib.error.URLError("blah") assert False def close(self): pass def add_parent(self, parent): @@ -337,7 +340,7 @@ def build_test_opener(*handler_instances): opener.add_handler(h) return opener -class MockHTTPHandler(urllib2.BaseHandler): +class MockHTTPHandler(urllib.request.BaseHandler): # useful for testing redirections and auth # sends supplied headers and code as first response # sends 200 OK as second response @@ -392,7 +395,7 @@ class OpenerDirectorTests(unittest.TestCase): # TypeError in real code; here, returning self from these mock # methods would either cause no exception, or AttributeError. - from urllib2 import URLError + from urllib.error import URLError o = OpenerDirector() meth_spec = [ @@ -400,7 +403,7 @@ class OpenerDirectorTests(unittest.TestCase): [("redirect_request", "return self")], ] handlers = add_ordered_mock_handlers(o, meth_spec) - o.add_handler(urllib2.UnknownHandler()) + o.add_handler(urllib.request.UnknownHandler()) for scheme in "do", "proxy", "redirect": self.assertRaises(URLError, o.open, scheme+"://example.com/") @@ -458,7 +461,7 @@ class OpenerDirectorTests(unittest.TestCase): handlers = add_ordered_mock_handlers(o, meth_spec) req = Request("http://example.com/") - self.assertRaises(urllib2.URLError, o.open, req) + self.assertRaises(urllib.error.URLError, o.open, req) self.assertEqual(o.calls, [(handlers[0], "http_open", (req,), {})]) ## def test_error(self): @@ -529,8 +532,7 @@ class OpenerDirectorTests(unittest.TestCase): def sanepathname2url(path): - import urllib - urlpath = urllib.pathname2url(path) + urlpath = urllib.request.pathname2url(path) if os.name == "nt" and urlpath.startswith("///"): urlpath = urlpath[2:] # XXX don't ask me about the mac... 
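sanepathname2url() above now goes through urllib.request.pathname2url(), which is also how the file: URL tests build their URLs. A minimal sketch of the same pattern outside the test suite; the file name is illustrative and assumed to exist:

    import os
    import urllib.request

    path = os.path.abspath("README")                    # illustrative local file
    url = "file://%s" % urllib.request.pathname2url(path)
    f = urllib.request.urlopen(url)
    data = f.read()
    f.close()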
@@ -545,7 +547,7 @@ class HandlerTests(unittest.TestCase): self.filename, self.filetype = filename, filetype return io.StringIO(self.data), len(self.data) - class NullFTPHandler(urllib2.FTPHandler): + class NullFTPHandler(urllib.request.FTPHandler): def __init__(self, data): self.data = data def connect_ftp(self, user, passwd, host, port, dirs, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): @@ -587,7 +589,7 @@ class HandlerTests(unittest.TestCase): def test_file(self): import email.utils, socket - h = urllib2.FileHandler() + h = urllib.request.FileHandler() o = h.parent = MockOpener() TESTFN = support.TESTFN @@ -644,12 +646,12 @@ class HandlerTests(unittest.TestCase): finally: f.close() - self.assertRaises(urllib2.URLError, + self.assertRaises(urllib.error.URLError, h.file_open, Request(url)) finally: os.remove(TESTFN) - h = urllib2.FileHandler() + h = urllib.request.FileHandler() o = h.parent = MockOpener() # XXXX why does // mean ftp (and /// mean not ftp!), and where # is file: scheme specified? I think this is really a bug, and @@ -668,7 +670,7 @@ class HandlerTests(unittest.TestCase): try: h.file_open(req) # XXXX remove OSError when bug fixed - except (urllib2.URLError, OSError): + except (urllib.error.URLError, OSError): self.assert_(not ftp) else: self.assert_(o.req is req) @@ -685,6 +687,7 @@ class HandlerTests(unittest.TestCase): return '' class MockHTTPClass: def __init__(self): + self.level = 0 self.req_headers = [] self.data = None self.raise_on_endheaders = False @@ -707,7 +710,7 @@ class HandlerTests(unittest.TestCase): def getresponse(self): return MockHTTPResponse(MockFile(), {}, 200, "OK") - h = urllib2.AbstractHTTPHandler() + h = urllib.request.AbstractHTTPHandler() o = h.parent = MockOpener() url = "http://example.com/" @@ -737,7 +740,7 @@ class HandlerTests(unittest.TestCase): # check socket.error converted to URLError http.raise_on_endheaders = True - self.assertRaises(urllib2.URLError, h.do_open, http, req) + self.assertRaises(urllib.error.URLError, h.do_open, http, req) # check adding of standard headers o.addheaders = [("Spam", "eggs")] @@ -768,7 +771,7 @@ class HandlerTests(unittest.TestCase): self.assertEqual(req.unredirected_hdrs["Spam"], "foo") def test_errors(self): - h = urllib2.HTTPErrorProcessor() + h = urllib.request.HTTPErrorProcessor() o = h.parent = MockOpener() url = "http://example.com/" @@ -794,7 +797,7 @@ class HandlerTests(unittest.TestCase): def test_cookies(self): cj = MockCookieJar() - h = urllib2.HTTPCookieProcessor(cj) + h = urllib.request.HTTPCookieProcessor(cj) o = h.parent = MockOpener() req = Request("http://example.com/") @@ -810,7 +813,7 @@ class HandlerTests(unittest.TestCase): def test_redirect(self): from_url = "http://example.com/a.html" to_url = "http://example.com/b.html" - h = urllib2.HTTPRedirectHandler() + h = urllib.request.HTTPRedirectHandler() o = h.parent = MockOpener() # ordinary redirect behaviour @@ -825,7 +828,7 @@ class HandlerTests(unittest.TestCase): try: method(req, MockFile(), code, "Blah", MockHeaders({"location": to_url})) - except urllib2.HTTPError: + except urllib.error.HTTPError: # 307 in response to POST requires user OK self.assert_(code == 307 and data is not None) self.assertEqual(o.req.get_full_url(), to_url) @@ -860,9 +863,9 @@ class HandlerTests(unittest.TestCase): while 1: redirect(h, req, "http://example.com/") count = count + 1 - except urllib2.HTTPError: + except urllib.error.HTTPError: # don't stop until max_repeats, because cookies may introduce state - self.assertEqual(count, 
urllib2.HTTPRedirectHandler.max_repeats) + self.assertEqual(count, urllib.request.HTTPRedirectHandler.max_repeats) # detect endless non-repeating chain of redirects req = Request(from_url, origin_req_host="example.com") @@ -871,9 +874,9 @@ class HandlerTests(unittest.TestCase): while 1: redirect(h, req, "http://example.com/%d" % count) count = count + 1 - except urllib2.HTTPError: + except urllib.error.HTTPError: self.assertEqual(count, - urllib2.HTTPRedirectHandler.max_redirections) + urllib.request.HTTPRedirectHandler.max_redirections) def test_cookie_redirect(self): # cookies shouldn't leak into redirected requests @@ -883,16 +886,16 @@ class HandlerTests(unittest.TestCase): cj = CookieJar() interact_netscape(cj, "http://www.example.com/", "spam=eggs") hh = MockHTTPHandler(302, "Location: http://www.cracker.com/\r\n\r\n") - hdeh = urllib2.HTTPDefaultErrorHandler() - hrh = urllib2.HTTPRedirectHandler() - cp = urllib2.HTTPCookieProcessor(cj) + hdeh = urllib.request.HTTPDefaultErrorHandler() + hrh = urllib.request.HTTPRedirectHandler() + cp = urllib.request.HTTPCookieProcessor(cj) o = build_test_opener(hh, hdeh, hrh, cp) o.open("http://www.example.com/") self.assert_(not hh.req.has_header("Cookie")) def test_proxy(self): o = OpenerDirector() - ph = urllib2.ProxyHandler(dict(http="proxy.example.com:3128")) + ph = urllib.request.ProxyHandler(dict(http="proxy.example.com:3128")) o.add_handler(ph) meth_spec = [ [("http_open", "return response")] @@ -910,7 +913,7 @@ class HandlerTests(unittest.TestCase): def test_basic_auth(self, quote_char='"'): opener = OpenerDirector() password_manager = MockPasswordManager() - auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) + auth_handler = urllib.request.HTTPBasicAuthHandler(password_manager) realm = "ACME Widget Store" http_handler = MockHTTPHandler( 401, 'WWW-Authenticate: Basic realm=%s%s%s\r\n\r\n' % @@ -928,10 +931,10 @@ class HandlerTests(unittest.TestCase): def test_proxy_basic_auth(self): opener = OpenerDirector() - ph = urllib2.ProxyHandler(dict(http="proxy.example.com:3128")) + ph = urllib.request.ProxyHandler(dict(http="proxy.example.com:3128")) opener.add_handler(ph) password_manager = MockPasswordManager() - auth_handler = urllib2.ProxyBasicAuthHandler(password_manager) + auth_handler = urllib.request.ProxyBasicAuthHandler(password_manager) realm = "ACME Networks" http_handler = MockHTTPHandler( 407, 'Proxy-Authenticate: Basic realm="%s"\r\n\r\n' % realm) @@ -958,15 +961,15 @@ class HandlerTests(unittest.TestCase): self.recorded = [] def record(self, info): self.recorded.append(info) - class TestDigestAuthHandler(urllib2.HTTPDigestAuthHandler): + class TestDigestAuthHandler(urllib.request.HTTPDigestAuthHandler): def http_error_401(self, *args, **kwds): self.parent.record("digest") - urllib2.HTTPDigestAuthHandler.http_error_401(self, + urllib.request.HTTPDigestAuthHandler.http_error_401(self, *args, **kwds) - class TestBasicAuthHandler(urllib2.HTTPBasicAuthHandler): + class TestBasicAuthHandler(urllib.request.HTTPBasicAuthHandler): def http_error_401(self, *args, **kwds): self.parent.record("basic") - urllib2.HTTPBasicAuthHandler.http_error_401(self, + urllib.request.HTTPBasicAuthHandler.http_error_401(self, *args, **kwds) opener = RecordingOpenerDirector() @@ -1030,13 +1033,13 @@ class HandlerTests(unittest.TestCase): class MiscTests(unittest.TestCase): def test_build_opener(self): - class MyHTTPHandler(urllib2.HTTPHandler): pass - class FooHandler(urllib2.BaseHandler): + class MyHTTPHandler(urllib.request.HTTPHandler): pass + 
class FooHandler(urllib.request.BaseHandler): def foo_open(self): pass - class BarHandler(urllib2.BaseHandler): + class BarHandler(urllib.request.BaseHandler): def bar_open(self): pass - build_opener = urllib2.build_opener + build_opener = urllib.request.build_opener o = build_opener(FooHandler, BarHandler) self.opener_has_handler(o, FooHandler) @@ -1054,14 +1057,14 @@ class MiscTests(unittest.TestCase): # a particular case of overriding: default handlers can be passed # in explicitly o = build_opener() - self.opener_has_handler(o, urllib2.HTTPHandler) - o = build_opener(urllib2.HTTPHandler) - self.opener_has_handler(o, urllib2.HTTPHandler) - o = build_opener(urllib2.HTTPHandler()) - self.opener_has_handler(o, urllib2.HTTPHandler) + self.opener_has_handler(o, urllib.request.HTTPHandler) + o = build_opener(urllib.request.HTTPHandler) + self.opener_has_handler(o, urllib.request.HTTPHandler) + o = build_opener(urllib.request.HTTPHandler()) + self.opener_has_handler(o, urllib.request.HTTPHandler) # Issue2670: multiple handlers sharing the same base class - class MyOtherHTTPHandler(urllib2.HTTPHandler): pass + class MyOtherHTTPHandler(urllib.request.HTTPHandler): pass o = build_opener(MyHTTPHandler, MyOtherHTTPHandler) self.opener_has_handler(o, MyHTTPHandler) self.opener_has_handler(o, MyOtherHTTPHandler) @@ -1077,7 +1080,7 @@ class MiscTests(unittest.TestCase): def test_main(verbose=None): from test import test_urllib2 support.run_doctest(test_urllib2, verbose) - support.run_doctest(urllib2, verbose) + support.run_doctest(urllib.request, verbose) tests = (TrivialTests, OpenerDirectorTests, HandlerTests, diff --git a/Lib/test/test_urllib2_localnet.py b/Lib/test/test_urllib2_localnet.py index d3016c3..2c572f3 100644 --- a/Lib/test/test_urllib2_localnet.py +++ b/Lib/test/test_urllib2_localnet.py @@ -2,8 +2,8 @@ import email import threading -import urlparse -import urllib2 +import urllib.parse +import urllib.request import http.server import unittest import hashlib @@ -45,7 +45,7 @@ class LoopbackHttpServerThread(threading.Thread): self._stop_server = False self.ready = threading.Event() request_handler.protocol_version = "HTTP/1.0" - self.httpd = LoopbackHttpServer(('127.0.0.1', 0), + self.httpd = LoopbackHttpServer(("127.0.0.1", 0), request_handler) #print "Serving HTTP on %s port %s" % (self.httpd.server_name, # self.httpd.server_port) @@ -154,11 +154,11 @@ class DigestAuthHandler: if len(self._users) == 0: return True - if 'Proxy-Authorization' not in request_handler.headers: + if "Proxy-Authorization" not in request_handler.headers: return self._return_auth_challenge(request_handler) else: auth_dict = self._create_auth_dict( - request_handler.headers['Proxy-Authorization'] + request_handler.headers["Proxy-Authorization"] ) if auth_dict["username"] in self._users: password = self._users[ auth_dict["username"] ] @@ -199,12 +199,12 @@ class FakeProxyHandler(http.server.BaseHTTPRequestHandler): def log_message(self, format, *args): # Uncomment the next line for debugging. 
- #sys.stderr.write(format % args) + # sys.stderr.write(format % args) pass def do_GET(self): - (scm, netloc, path, params, query, fragment) = urlparse.urlparse( - self.path, 'http') + (scm, netloc, path, params, query, fragment) = urllib.parse.urlparse( + self.path, "http") self.short_path = path if self.digest_auth_handler.handle_request(self): self.send_response(200, "OK") @@ -234,9 +234,10 @@ class ProxyAuthTests(unittest.TestCase): self.server.start() self.server.ready.wait() proxy_url = "http://127.0.0.1:%d" % self.server.port - handler = urllib2.ProxyHandler({"http" : proxy_url}) - self._digest_auth_handler = urllib2.ProxyDigestAuthHandler() - self.opener = urllib2.build_opener(handler, self._digest_auth_handler) + handler = urllib.request.ProxyHandler({"http" : proxy_url}) + self._digest_auth_handler = urllib.request.ProxyDigestAuthHandler() + self.opener = urllib.request.build_opener( + handler, self._digest_auth_handler) def tearDown(self): self.server.stop() @@ -245,13 +246,13 @@ class ProxyAuthTests(unittest.TestCase): self._digest_auth_handler.add_password(self.REALM, self.URL, self.USER, self.PASSWD+"bad") FakeProxyHandler.digest_auth_handler.set_qop("auth") - self.assertRaises(urllib2.HTTPError, + self.assertRaises(urllib.error.HTTPError, self.opener.open, self.URL) def test_proxy_with_no_password_raises_httperror(self): FakeProxyHandler.digest_auth_handler.set_qop("auth") - self.assertRaises(urllib2.HTTPError, + self.assertRaises(urllib.error.HTTPError, self.opener.open, self.URL) @@ -270,7 +271,7 @@ class ProxyAuthTests(unittest.TestCase): FakeProxyHandler.digest_auth_handler.set_qop("auth-int") try: result = self.opener.open(self.URL) - except urllib2.URLError: + except urllib.error.URLError: # It's okay if we don't support auth-int, but we certainly # shouldn't receive any kind of exception here other than # a URLError. @@ -296,7 +297,7 @@ def GetRequestHandler(responses): self.wfile.write(body) def do_POST(self): - content_length = self.headers['Content-Length'] + content_length = self.headers["Content-Length"] post_data = self.rfile.read(int(content_length)) self.do_GET() self.requests.append(post_data) @@ -311,7 +312,7 @@ def GetRequestHandler(responses): for (header, value) in headers: self.send_header(header, value % self.port) if body: - self.send_header('Content-type', 'text/plain') + self.send_header("Content-type", "text/plain") self.end_headers() return body self.end_headers() @@ -332,7 +333,22 @@ class TestUrlopen(unittest.TestCase): for transparent redirection have been written. """ - def start_server(self, responses): + def setUp(self): + self.server = None + + def tearDown(self): + if self.server is not None: + self.server.stop() + + def urlopen(self, url, data=None): + f = urllib.request.urlopen(url, data) + result = f.read() + f.close() + return result + + def start_server(self, responses=None): + if responses is None: + responses = [(200, [], b"we don't care")] handler = GetRequestHandler(responses) self.server = LoopbackHttpServerThread(handler) @@ -342,106 +358,71 @@ class TestUrlopen(unittest.TestCase): handler.port = port return handler - def test_redirection(self): - expected_response = b'We got here...' + expected_response = b"We got here..." 
responses = [ - (302, [('Location', 'http://localhost:%s/somewhere_else')], ''), + (302, [("Location", "http://localhost:%s/somewhere_else")], ""), (200, [], expected_response) ] handler = self.start_server(responses) - - try: - f = urllib2.urlopen('http://localhost:%s/' % handler.port) - data = f.read() - f.close() - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/', '/somewhere_else']) - finally: - self.server.stop() - + data = self.urlopen("http://localhost:%s/" % handler.port) + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/", "/somewhere_else"]) def test_404(self): - expected_response = b'Bad bad bad...' + expected_response = b"Bad bad bad..." handler = self.start_server([(404, [], expected_response)]) try: - try: - urllib2.urlopen('http://localhost:%s/weeble' % handler.port) - except urllib2.URLError as f: - data = f.read() - f.close() - else: - self.fail('404 should raise URLError') - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/weeble']) - finally: - self.server.stop() + self.urlopen("http://localhost:%s/weeble" % handler.port) + except urllib.error.URLError as f: + data = f.read() + f.close() + else: + self.fail("404 should raise URLError") + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/weeble"]) def test_200(self): - expected_response = b'pycon 2008...' + expected_response = b"pycon 2008..." handler = self.start_server([(200, [], expected_response)]) - - try: - f = urllib2.urlopen('http://localhost:%s/bizarre' % handler.port) - data = f.read() - f.close() - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/bizarre']) - finally: - self.server.stop() + data = self.urlopen("http://localhost:%s/bizarre" % handler.port) + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/bizarre"]) def test_200_with_parameters(self): - expected_response = b'pycon 2008...' + expected_response = b"pycon 2008..." 
handler = self.start_server([(200, [], expected_response)]) - - try: - f = urllib2.urlopen('http://localhost:%s/bizarre' % handler.port, b'get=with_feeling') - data = f.read() - f.close() - - self.assertEquals(data, expected_response) - self.assertEquals(handler.requests, ['/bizarre', b'get=with_feeling']) - finally: - self.server.stop() - + data = self.urlopen("http://localhost:%s/bizarre" % handler.port, + b"get=with_feeling") + self.assertEquals(data, expected_response) + self.assertEquals(handler.requests, ["/bizarre", b"get=with_feeling"]) def test_sending_headers(self): - handler = self.start_server([(200, [], b"we don't care")]) - - try: - req = urllib2.Request("http://localhost:%s/" % handler.port, - headers={'Range': 'bytes=20-39'}) - urllib2.urlopen(req) - self.assertEqual(handler.headers_received['Range'], 'bytes=20-39') - finally: - self.server.stop() + handler = self.start_server() + req = urllib.request.Request("http://localhost:%s/" % handler.port, + headers={"Range": "bytes=20-39"}) + urllib.request.urlopen(req) + self.assertEqual(handler.headers_received["Range"], "bytes=20-39") def test_basic(self): - handler = self.start_server([(200, [], b"we don't care")]) - + handler = self.start_server() + open_url = urllib.request.urlopen("http://localhost:%s" % handler.port) + for attr in ("read", "close", "info", "geturl"): + self.assert_(hasattr(open_url, attr), "object returned from " + "urlopen lacks the %s attribute" % attr) try: - open_url = urllib2.urlopen("http://localhost:%s" % handler.port) - for attr in ("read", "close", "info", "geturl"): - self.assert_(hasattr(open_url, attr), "object returned from " - "urlopen lacks the %s attribute" % attr) - try: - self.assert_(open_url.read(), "calling 'read' failed") - finally: - open_url.close() + self.assert_(open_url.read(), "calling 'read' failed") finally: - self.server.stop() + open_url.close() def test_info(self): - handler = self.start_server([(200, [], b"we don't care")]) - + handler = self.start_server() try: - open_url = urllib2.urlopen("http://localhost:%s" % handler.port) + open_url = urllib.request.urlopen( + "http://localhost:%s" % handler.port) info_obj = open_url.info() self.assert_(isinstance(info_obj, email.message.Message), "object returned by 'info' is not an instance of " @@ -452,15 +433,10 @@ class TestUrlopen(unittest.TestCase): def test_geturl(self): # Make sure same URL as opened is returned by geturl. - handler = self.start_server([(200, [], b"we don't care")]) - - try: - open_url = urllib2.urlopen("http://localhost:%s" % handler.port) - url = open_url.geturl() - self.assertEqual(url, "http://localhost:%s" % handler.port) - finally: - self.server.stop() - + handler = self.start_server() + open_url = urllib.request.urlopen("http://localhost:%s" % handler.port) + url = open_url.geturl() + self.assertEqual(url, "http://localhost:%s" % handler.port) def test_bad_address(self): # Make sure proper exception is raised when connecting to a bogus @@ -472,17 +448,10 @@ class TestUrlopen(unittest.TestCase): # started failing then. One hopes the .invalid # domain will be spared to serve its defined # purpose. - # urllib2.urlopen, "http://www.sadflkjsasadf.com/") - urllib2.urlopen, "http://www.python.invalid./") - + urllib.request.urlopen, + "http://www.python.invalid./") def test_main(): - # We will NOT depend on the network resource flag - # (Lib/test/regrtest.py -u network) since all tests here are only - # localhost. However, if this is a bad rationale, then uncomment - # the next line. 
- #support.requires("network") - support.run_unittest(ProxyAuthTests) support.run_unittest(TestUrlopen) diff --git a/Lib/test/test_urllib2net.py b/Lib/test/test_urllib2net.py index 938ab9f..a18a4bb 100644 --- a/Lib/test/test_urllib2net.py +++ b/Lib/test/test_urllib2net.py @@ -4,10 +4,11 @@ import unittest from test import support from test.test_urllib2 import sanepathname2url +import os import socket -import urllib2 import sys -import os +import urllib.error +import urllib.request def _retry_thrice(func, exc, *args, **kwargs): @@ -28,7 +29,8 @@ def _wrap_with_retry_thrice(func, exc): # Connecting to remote hosts is flaky. Make it more robust by retrying # the connection several times. -_urlopen_with_retry = _wrap_with_retry_thrice(urllib2.urlopen, urllib2.URLError) +_urlopen_with_retry = _wrap_with_retry_thrice(urllib.request.urlopen, + urllib.error.URLError) class AuthTests(unittest.TestCase): @@ -78,16 +80,11 @@ class CloseSocketTest(unittest.TestCase): # calling .close() on urllib2's response objects should close the # underlying socket - # delve deep into response to fetch socket._socketobject response = _urlopen_with_retry("http://www.python.org/") - abused_fileobject = response.fp - httpresponse = abused_fileobject.raw - self.assert_(httpresponse.__class__ is http.client.HTTPResponse) - fileobject = httpresponse.fp - - self.assert_(not fileobject.closed) + sock = response.fp + self.assert_(not sock.closed) response.close() - self.assert_(fileobject.closed) + self.assert_(sock.closed) class OtherNetworkTests(unittest.TestCase): def setUp(self): @@ -116,8 +113,9 @@ class OtherNetworkTests(unittest.TestCase): f.write('hi there\n') f.close() urls = [ - 'file:'+sanepathname2url(os.path.abspath(TESTFN)), - ('file:///nonsensename/etc/passwd', None, urllib2.URLError), + 'file:' + sanepathname2url(os.path.abspath(TESTFN)), + ('file:///nonsensename/etc/passwd', None, + urllib.error.URLError), ] self._test_urls(urls, self._extra_handlers(), retry=True) finally: @@ -157,9 +155,9 @@ class OtherNetworkTests(unittest.TestCase): import logging debug = logging.getLogger("test_urllib2").debug - urlopen = urllib2.build_opener(*handlers).open + urlopen = urllib.request.build_opener(*handlers).open if retry: - urlopen = _wrap_with_retry_thrice(urlopen, urllib2.URLError) + urlopen = _wrap_with_retry_thrice(urlopen, urllib.error.URLError) for url in urls: if isinstance(url, tuple): @@ -186,7 +184,7 @@ class OtherNetworkTests(unittest.TestCase): def _extra_handlers(self): handlers = [] - cfh = urllib2.CacheFTPHandler() + cfh = urllib.request.CacheFTPHandler() cfh.setTimeout(1) handlers.append(cfh) @@ -197,7 +195,7 @@ class TimeoutTest(unittest.TestCase): def test_http_basic(self): self.assertTrue(socket.getdefaulttimeout() is None) u = _urlopen_with_retry("http://www.python.org") - self.assertTrue(u.fp.raw.fp._sock.gettimeout() is None) + self.assertTrue(u.fp._sock.gettimeout() is None) def test_http_default_timeout(self): self.assertTrue(socket.getdefaulttimeout() is None) @@ -206,7 +204,7 @@ class TimeoutTest(unittest.TestCase): u = _urlopen_with_retry("http://www.python.org") finally: socket.setdefaulttimeout(None) - self.assertEqual(u.fp.raw.fp._sock.gettimeout(), 60) + self.assertEqual(u.fp._sock.gettimeout(), 60) def test_http_no_timeout(self): self.assertTrue(socket.getdefaulttimeout() is None) @@ -215,11 +213,11 @@ class TimeoutTest(unittest.TestCase): u = _urlopen_with_retry("http://www.python.org", timeout=None) finally: socket.setdefaulttimeout(None) - 
self.assertTrue(u.fp.raw.fp._sock.gettimeout() is None) + self.assertTrue(u.fp._sock.gettimeout() is None) def test_http_timeout(self): u = _urlopen_with_retry("http://www.python.org", timeout=120) - self.assertEqual(u.fp.raw.fp._sock.gettimeout(), 120) + self.assertEqual(u.fp._sock.gettimeout(), 120) FTP_HOST = "ftp://ftp.mirror.nl/pub/mirror/gnu/" diff --git a/Lib/test/test_urllibnet.py b/Lib/test/test_urllibnet.py index d4831d8..c8166c4 100644 --- a/Lib/test/test_urllibnet.py +++ b/Lib/test/test_urllibnet.py @@ -4,7 +4,7 @@ import unittest from test import support import socket -import urllib +import urllib.request import sys import os import email.message @@ -36,11 +36,11 @@ class URLTimeoutTest(unittest.TestCase): socket.setdefaulttimeout(None) def testURLread(self): - f = _open_with_retry(urllib.urlopen, "http://www.python.org/") + f = _open_with_retry(urllib.request.urlopen, "http://www.python.org/") x = f.read() class urlopenNetworkTests(unittest.TestCase): - """Tests urllib.urlopen using the network. + """Tests urllib.reqest.urlopen using the network. These tests are not exhaustive. Assuming that testing using files does a good job overall of some of the basic interface features. There are no @@ -55,7 +55,7 @@ class urlopenNetworkTests(unittest.TestCase): """ def urlopen(self, *args): - return _open_with_retry(urllib.urlopen, *args) + return _open_with_retry(urllib.request.urlopen, *args) def test_basic(self): # Simple test expected to pass. @@ -105,7 +105,7 @@ class urlopenNetworkTests(unittest.TestCase): def test_getcode(self): # test getcode() with the fancy opener to get 404 error codes URL = "http://www.python.org/XXXinvalidXXX" - open_url = urllib.FancyURLopener().open(URL) + open_url = urllib.request.FancyURLopener().open(URL) try: code = open_url.getcode() finally: @@ -114,7 +114,7 @@ class urlopenNetworkTests(unittest.TestCase): def test_fileno(self): if (sys.platform in ('win32',) or - not hasattr(os, 'fdopen')): + not hasattr(os, 'fdopen')): # On Windows, socket handles are not file descriptors; this # test can't pass on Windows. return @@ -142,13 +142,14 @@ class urlopenNetworkTests(unittest.TestCase): # domain will be spared to serve its defined # purpose. # urllib.urlopen, "http://www.sadflkjsasadf.com/") - urllib.urlopen, "http://www.python.invalid./") + urllib.request.urlopen, + "http://www.python.invalid./") class urlretrieveNetworkTests(unittest.TestCase): - """Tests urllib.urlretrieve using the network.""" + """Tests urllib.request.urlretrieve using the network.""" def urlretrieve(self, *args): - return _open_with_retry(urllib.urlretrieve, *args) + return _open_with_retry(urllib.request.urlretrieve, *args) def test_basic(self): # Test basic functionality. 
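(A minimal sketch, not part of the patch, of the relocated convenience helper the network tests above now exercise; the URL and output filename are placeholders. urllib.urlretrieve becomes urllib.request.urlretrieve with the same (url, filename, reporthook, data) signature, and the returned headers behave like an email.message.Message, as the removed urllib docstring notes.)

import urllib.request

filename, headers = urllib.request.urlretrieve("http://www.python.org/",
                                               "python-home.html")
print(filename)                       # path the page body was saved to
print(headers.get_content_type())     # e.g. "text/html"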
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 02b2f6f..c92b5aa 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -2,7 +2,7 @@ from test import support import unittest -import urlparse +import urllib.parse RFC1808_BASE = "http://a/b/c/d;p?q#f" RFC2396_BASE = "http://a/b/c/d;p?q" @@ -10,19 +10,19 @@ RFC2396_BASE = "http://a/b/c/d;p?q" class UrlParseTestCase(unittest.TestCase): def checkRoundtrips(self, url, parsed, split): - result = urlparse.urlparse(url) + result = urllib.parse.urlparse(url) self.assertEqual(result, parsed) t = (result.scheme, result.netloc, result.path, result.params, result.query, result.fragment) self.assertEqual(t, parsed) # put it back together and it should be the same - result2 = urlparse.urlunparse(result) + result2 = urllib.parse.urlunparse(result) self.assertEqual(result2, url) self.assertEqual(result2, result.geturl()) # the result of geturl() is a fixpoint; we can always parse it # again to get the same result: - result3 = urlparse.urlparse(result.geturl()) + result3 = urllib.parse.urlparse(result.geturl()) self.assertEqual(result3.geturl(), result.geturl()) self.assertEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) @@ -37,17 +37,17 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(result3.port, result.port) # check the roundtrip using urlsplit() as well - result = urlparse.urlsplit(url) + result = urllib.parse.urlsplit(url) self.assertEqual(result, split) t = (result.scheme, result.netloc, result.path, result.query, result.fragment) self.assertEqual(t, split) - result2 = urlparse.urlunsplit(result) + result2 = urllib.parse.urlunsplit(result) self.assertEqual(result2, url) self.assertEqual(result2, result.geturl()) # check the fixpoint property of re-parsing the result of geturl() - result3 = urlparse.urlsplit(result.geturl()) + result3 = urllib.parse.urlsplit(result.geturl()) self.assertEqual(result3.geturl(), result.geturl()) self.assertEqual(result3, result) self.assertEqual(result3.scheme, result.scheme) @@ -83,7 +83,7 @@ class UrlParseTestCase(unittest.TestCase): self.checkRoundtrips(url, parsed, split) def test_http_roundtrips(self): - # urlparse.urlsplit treats 'http:' as an optimized special case, + # urllib.parse.urlsplit treats 'http:' as an optimized special case, # so we test both 'http:' and 'https:' in all the following. # Three cheers for white box knowledge! 
testcases = [ @@ -111,13 +111,13 @@ class UrlParseTestCase(unittest.TestCase): self.checkRoundtrips(url, parsed, split) def checkJoin(self, base, relurl, expected): - self.assertEqual(urlparse.urljoin(base, relurl), expected, + self.assertEqual(urllib.parse.urljoin(base, relurl), expected, (base, relurl, expected)) def test_unparse_parse(self): for u in ['Python', './Python']: - self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u) - self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u) + self.assertEqual(urllib.parse.urlunsplit(urllib.parse.urlsplit(u)), u) + self.assertEqual(urllib.parse.urlunparse(urllib.parse.urlparse(u)), u) def test_RFC1808(self): # "normal" cases from RFC 1808: @@ -223,11 +223,11 @@ class UrlParseTestCase(unittest.TestCase): (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'), (RFC2396_BASE, 'http://a/b/c/d;p?q', ''), ]: - self.assertEqual(urlparse.urldefrag(url), (defrag, frag)) + self.assertEqual(urllib.parse.urldefrag(url), (defrag, frag)) def test_urlsplit_attributes(self): url = "HTTP://WWW.PYTHON.ORG/doc/#frag" - p = urlparse.urlsplit(url) + p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, "http") self.assertEqual(p.netloc, "WWW.PYTHON.ORG") self.assertEqual(p.path, "/doc/") @@ -242,7 +242,7 @@ class UrlParseTestCase(unittest.TestCase): #self.assertEqual(p.geturl(), url) url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" - p = urlparse.urlsplit(url) + p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, "http") self.assertEqual(p.netloc, "User:Pass@www.python.org:080") self.assertEqual(p.path, "/doc/") @@ -259,7 +259,7 @@ class UrlParseTestCase(unittest.TestCase): # and request email addresses as usernames. url = "http://User@example.com:Pass@www.python.org:080/doc/?query=yes#frag" - p = urlparse.urlsplit(url) + p = urllib.parse.urlsplit(url) self.assertEqual(p.scheme, "http") self.assertEqual(p.netloc, "User@example.com:Pass@www.python.org:080") self.assertEqual(p.path, "/doc/") @@ -274,11 +274,11 @@ class UrlParseTestCase(unittest.TestCase): def test_attributes_bad_port(self): """Check handling of non-integer ports.""" - p = urlparse.urlsplit("http://www.example.net:foo") + p = urllib.parse.urlsplit("http://www.example.net:foo") self.assertEqual(p.netloc, "www.example.net:foo") self.assertRaises(ValueError, lambda: p.port) - p = urlparse.urlparse("http://www.example.net:foo") + p = urllib.parse.urlparse("http://www.example.net:foo") self.assertEqual(p.netloc, "www.example.net:foo") self.assertRaises(ValueError, lambda: p.port) @@ -289,7 +289,7 @@ class UrlParseTestCase(unittest.TestCase): # scheme://netloc syntax, the netloc and related attributes # should be left empty. 
uri = "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" - p = urlparse.urlsplit(uri) + p = urllib.parse.urlsplit(uri) self.assertEqual(p.netloc, "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) @@ -297,7 +297,7 @@ class UrlParseTestCase(unittest.TestCase): self.assertEqual(p.port, None) self.assertEqual(p.geturl(), uri) - p = urlparse.urlparse(uri) + p = urllib.parse.urlparse(uri) self.assertEqual(p.netloc, "") self.assertEqual(p.username, None) self.assertEqual(p.password, None) @@ -307,7 +307,7 @@ class UrlParseTestCase(unittest.TestCase): def test_noslash(self): # Issue 1637: http://foo.com?query is legal - self.assertEqual(urlparse.urlparse("http://example.com?blahblah=/foo"), + self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"), ('http', 'example.com', '', '', 'blahblah=/foo', '')) def test_main(): diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py index 9d38470..e285809 100644 --- a/Lib/test/test_xmlrpc.py +++ b/Lib/test/test_xmlrpc.py @@ -111,8 +111,10 @@ class XMLRPCTestCase(unittest.TestCase): (int(2**34),)) xmlrpclib.dumps((xmlrpclib.MAXINT, xmlrpclib.MININT)) - self.assertRaises(OverflowError, xmlrpclib.dumps, (xmlrpclib.MAXINT+1,)) - self.assertRaises(OverflowError, xmlrpclib.dumps, (xmlrpclib.MININT-1,)) + self.assertRaises(OverflowError, xmlrpclib.dumps, + (xmlrpclib.MAXINT+1,)) + self.assertRaises(OverflowError, xmlrpclib.dumps, + (xmlrpclib.MININT-1,)) def dummy_write(s): pass @@ -120,9 +122,10 @@ class XMLRPCTestCase(unittest.TestCase): m = xmlrpclib.Marshaller() m.dump_int(xmlrpclib.MAXINT, dummy_write) m.dump_int(xmlrpclib.MININT, dummy_write) - self.assertRaises(OverflowError, m.dump_int, xmlrpclib.MAXINT+1, dummy_write) - self.assertRaises(OverflowError, m.dump_int, xmlrpclib.MININT-1, dummy_write) - + self.assertRaises(OverflowError, m.dump_int, + xmlrpclib.MAXINT+1, dummy_write) + self.assertRaises(OverflowError, m.dump_int, + xmlrpclib.MININT-1, dummy_write) def test_dump_none(self): value = alist + [None] @@ -132,7 +135,6 @@ class XMLRPCTestCase(unittest.TestCase): xmlrpclib.loads(strg)[0][0]) self.assertRaises(TypeError, xmlrpclib.dumps, (arg1,)) - class HelperTestCase(unittest.TestCase): def test_escape(self): self.assertEqual(xmlrpclib.escape("a&b"), "a&b") @@ -160,7 +162,6 @@ class FaultTestCase(unittest.TestCase): # private methods self.assertRaises(AttributeError, xmlrpc.server.resolve_dotted_attribute, str, '__add') - self.assert_(xmlrpc.server.resolve_dotted_attribute(str, 'title')) class DateTimeTestCase(unittest.TestCase): @@ -170,7 +171,8 @@ class DateTimeTestCase(unittest.TestCase): def test_time(self): d = 1181399930.036952 t = xmlrpclib.DateTime(d) - self.assertEqual(str(t), time.strftime("%Y%m%dT%H:%M:%S", time.localtime(d))) + self.assertEqual(str(t), + time.strftime("%Y%m%dT%H:%M:%S", time.localtime(d))) def test_time_tuple(self): d = (2007,6,9,10,38,50,5,160,0) @@ -180,7 +182,7 @@ class DateTimeTestCase(unittest.TestCase): def test_time_struct(self): d = time.localtime(1181399930.036952) t = xmlrpclib.DateTime(d) - self.assertEqual(str(t), time.strftime("%Y%m%dT%H:%M:%S", d)) + self.assertEqual(str(t), time.strftime("%Y%m%dT%H:%M:%S", d)) def test_datetime_datetime(self): d = datetime.datetime(2007,1,2,3,4,5) @@ -350,12 +352,12 @@ class SimpleServerTestCase(unittest.TestCase): self.assertEqual(response.reason, 'Not Found') def test_introspection1(self): + expected_methods = set(['pow', 'div', 'my_function', 'add', + 'system.listMethods', 'system.methodHelp', + 'system.methodSignature', 
'system.multicall']) try: p = xmlrpclib.ServerProxy('http://localhost:%d' % PORT) meth = p.system.listMethods() - expected_methods = set(['pow', 'div', 'my_function', 'add', - 'system.listMethods', 'system.methodHelp', - 'system.methodSignature', 'system.multicall']) self.assertEqual(set(meth), expected_methods) except (xmlrpclib.ProtocolError, socket.error) as e: # ignore failures due to non-blocking socket 'unavailable' errors @@ -593,7 +595,8 @@ class CGIHandlerTestCase(unittest.TestCase): # will respond exception, if so, our goal is achieved ;) handle = open(support.TESTFN, "r").read() - # start with 44th char so as not to get http header, we just need only xml + # start with 44th char so as not to get http header, we just + # need only xml self.assertRaises(xmlrpclib.Fault, xmlrpclib.loads, handle[44:]) os.remove("xmldata.txt") diff --git a/Lib/urllib.py b/Lib/urllib.py deleted file mode 100644 index d60ac5c..0000000 --- a/Lib/urllib.py +++ /dev/null @@ -1,1714 +0,0 @@ -"""Open an arbitrary URL. - -See the following document for more info on URLs: -"Names and Addresses, URIs, URLs, URNs, URCs", at -http://www.w3.org/pub/WWW/Addressing/Overview.html - -See also the HTTP spec (from which the error codes are derived): -"HTTP - Hypertext Transfer Protocol", at -http://www.w3.org/pub/WWW/Protocols/ - -Related standards and specs: -- RFC1808: the "relative URL" spec. (authoritative status) -- RFC1738 - the "URL standard". (authoritative status) -- RFC1630 - the "URI spec". (informational status) - -The object returned by URLopener().open(file) will differ per -protocol. All you know is that is has methods read(), readline(), -readlines(), fileno(), close() and info(). The read*(), fileno() -and close() methods work like those of open files. -The info() method returns a email.message.Message object which can be -used to query various info about the object, if available. -(email.message.Message objects provide a dict-like interface.) -""" - -import http.client -import email.message -import email -import os -import socket -import sys -import time -from urlparse import urljoin as basejoin - -__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve", - "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus", - "urlencode", "url2pathname", "pathname2url", "splittag", - "localhost", "thishost", "ftperrors", "basejoin", "unwrap", - "splittype", "splithost", "splituser", "splitpasswd", "splitport", - "splitnport", "splitquery", "splitattr", "splitvalue", - "getproxies"] - -__version__ = '1.17' # XXX This version is not always updated :-( - -MAXFTPCACHE = 10 # Trim the ftp cache beyond this size - -# Helper for non-unix systems -if os.name == 'mac': - from macurl2path import url2pathname, pathname2url -elif os.name == 'nt': - from nturl2path import url2pathname, pathname2url -else: - def url2pathname(pathname): - """OS-specific conversion from a relative URL of the 'file' scheme - to a file system path; not recommended for general use.""" - return unquote(pathname) - - def pathname2url(pathname): - """OS-specific conversion from a file system path to a relative URL - of the 'file' scheme; not recommended for general use.""" - return quote(pathname) - -# This really consists of two pieces: -# (1) a class which handles opening of all sorts of URLs -# (plus assorted utilities etc.) -# (2) a set of functions for parsing URLs -# XXX Should these be separated out into different modules? 
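(The module-level shortcut that follows in the removed code kept a single global FancyURLopener and let callers pass a proxies mapping directly to urlopen(). A minimal sketch, not part of the patch, of the rough equivalent under the new package, using build_opener()/install_opener(); the proxy address is a placeholder.)

import urllib.request

proxy_support = urllib.request.ProxyHandler(
    {"http": "http://proxy.example.com:3128/"})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)   # later urlopen() calls go through it
response = urllib.request.urlopen("http://www.python.org/")
print(response.geturl())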
- - -# Shortcut for basic usage -_urlopener = None -def urlopen(url, data=None, proxies=None): - """urlopen(url [, data]) -> open file-like object""" - global _urlopener - if proxies is not None: - opener = FancyURLopener(proxies=proxies) - elif not _urlopener: - opener = FancyURLopener() - _urlopener = opener - else: - opener = _urlopener - if data is None: - return opener.open(url) - else: - return opener.open(url, data) - -def urlretrieve(url, filename=None, reporthook=None, data=None): - global _urlopener - if not _urlopener: - _urlopener = FancyURLopener() - return _urlopener.retrieve(url, filename, reporthook, data) - -def urlcleanup(): - if _urlopener: - _urlopener.cleanup() - -# check for SSL -try: - import ssl -except: - _have_ssl = False -else: - _have_ssl = True - -# exception raised when downloaded size does not match content-length -class ContentTooShortError(IOError): - def __init__(self, message, content): - IOError.__init__(self, message) - self.content = content - -ftpcache = {} -class URLopener: - """Class to open URLs. - This is a class rather than just a subroutine because we may need - more than one set of global protocol-specific options. - Note -- this is a base class for those who don't want the - automatic handling of errors type 302 (relocated) and 401 - (authorization needed).""" - - __tempfiles = None - - version = "Python-urllib/%s" % __version__ - - # Constructor - def __init__(self, proxies=None, **x509): - if proxies is None: - proxies = getproxies() - assert hasattr(proxies, 'keys'), "proxies must be a mapping" - self.proxies = proxies - self.key_file = x509.get('key_file') - self.cert_file = x509.get('cert_file') - self.addheaders = [('User-Agent', self.version)] - self.__tempfiles = [] - self.__unlink = os.unlink # See cleanup() - self.tempcache = None - # Undocumented feature: if you assign {} to tempcache, - # it is used to cache files retrieved with - # self.retrieve(). This is not enabled by default - # since it does not work for changing documents (and I - # haven't got the logic to check expiration headers - # yet). - self.ftpcache = ftpcache - # Undocumented feature: you can use a different - # ftp cache by assigning to the .ftpcache member; - # in case you want logically independent URL openers - # XXX This is not threadsafe. Bah. - - def __del__(self): - self.close() - - def close(self): - self.cleanup() - - def cleanup(self): - # This code sometimes runs when the rest of this module - # has already been deleted, so it can't use any globals - # or import anything. - if self.__tempfiles: - for file in self.__tempfiles: - try: - self.__unlink(file) - except OSError: - pass - del self.__tempfiles[:] - if self.tempcache: - self.tempcache.clear() - - def addheader(self, *args): - """Add a header to be used by the HTTP interface only - e.g. 
u.addheader('Accept', 'sound/basic')""" - self.addheaders.append(args) - - # External interface - def open(self, fullurl, data=None): - """Use URLopener().open(file) instead of open(file, 'r').""" - fullurl = unwrap(toBytes(fullurl)) - if self.tempcache and fullurl in self.tempcache: - filename, headers = self.tempcache[fullurl] - fp = open(filename, 'rb') - return addinfourl(fp, headers, fullurl) - urltype, url = splittype(fullurl) - if not urltype: - urltype = 'file' - if urltype in self.proxies: - proxy = self.proxies[urltype] - urltype, proxyhost = splittype(proxy) - host, selector = splithost(proxyhost) - url = (host, fullurl) # Signal special case to open_*() - else: - proxy = None - name = 'open_' + urltype - self.type = urltype - name = name.replace('-', '_') - if not hasattr(self, name): - if proxy: - return self.open_unknown_proxy(proxy, fullurl, data) - else: - return self.open_unknown(fullurl, data) - try: - if data is None: - return getattr(self, name)(url) - else: - return getattr(self, name)(url, data) - except socket.error as msg: - raise IOError('socket error', msg).with_traceback(sys.exc_info()[2]) - - def open_unknown(self, fullurl, data=None): - """Overridable interface to open unknown URL type.""" - type, url = splittype(fullurl) - raise IOError('url error', 'unknown url type', type) - - def open_unknown_proxy(self, proxy, fullurl, data=None): - """Overridable interface to open unknown URL type.""" - type, url = splittype(fullurl) - raise IOError('url error', 'invalid proxy for %s' % type, proxy) - - # External interface - def retrieve(self, url, filename=None, reporthook=None, data=None): - """retrieve(url) returns (filename, headers) for a local object - or (tempfilename, headers) for a remote object.""" - url = unwrap(toBytes(url)) - if self.tempcache and url in self.tempcache: - return self.tempcache[url] - type, url1 = splittype(url) - if filename is None and (not type or type == 'file'): - try: - fp = self.open_local_file(url1) - hdrs = fp.info() - del fp - return url2pathname(splithost(url1)[1]), hdrs - except IOError as msg: - pass - fp = self.open(url, data) - headers = fp.info() - if filename: - tfp = open(filename, 'wb') - else: - import tempfile - garbage, path = splittype(url) - garbage, path = splithost(path or "") - path, garbage = splitquery(path or "") - path, garbage = splitattr(path or "") - suffix = os.path.splitext(path)[1] - (fd, filename) = tempfile.mkstemp(suffix) - self.__tempfiles.append(filename) - tfp = os.fdopen(fd, 'wb') - result = filename, headers - if self.tempcache is not None: - self.tempcache[url] = result - bs = 1024*8 - size = -1 - read = 0 - blocknum = 0 - if reporthook: - if "content-length" in headers: - size = int(headers["Content-Length"]) - reporthook(blocknum, bs, size) - while 1: - block = fp.read(bs) - if not block: - break - read += len(block) - tfp.write(block) - blocknum += 1 - if reporthook: - reporthook(blocknum, bs, size) - fp.close() - tfp.close() - del fp - del tfp - - # raise exception if actual size does not match content-length header - if size >= 0 and read < size: - raise ContentTooShortError("retrieval incomplete: got only %i out " - "of %i bytes" % (read, size), result) - - return result - - # Each method named open_ knows how to open that type of URL - - def _open_generic_http(self, connection_factory, url, data): - """Make an HTTP connection using connection_class. - - This is an internal method that should be called from - open_http() or open_https(). 
- - Arguments: - - connection_factory should take a host name and return an - HTTPConnection instance. - - url is the url to retrieval or a host, relative-path pair. - - data is payload for a POST request or None. - """ - - user_passwd = None - proxy_passwd= None - if isinstance(url, str): - host, selector = splithost(url) - if host: - user_passwd, host = splituser(host) - host = unquote(host) - realhost = host - else: - host, selector = url - # check whether the proxy contains authorization information - proxy_passwd, host = splituser(host) - # now we proceed with the url we want to obtain - urltype, rest = splittype(selector) - url = rest - user_passwd = None - if urltype.lower() != 'http': - realhost = None - else: - realhost, rest = splithost(rest) - if realhost: - user_passwd, realhost = splituser(realhost) - if user_passwd: - selector = "%s://%s%s" % (urltype, realhost, rest) - if proxy_bypass(realhost): - host = realhost - - #print "proxy via http:", host, selector - if not host: raise IOError('http error', 'no host given') - - if proxy_passwd: - import base64 - proxy_auth = base64.b64encode(proxy_passwd).strip() - else: - proxy_auth = None - - if user_passwd: - import base64 - auth = base64.b64encode(user_passwd).strip() - else: - auth = None - http_conn = connection_factory(host) - # XXX We should fix urllib so that it works with HTTP/1.1. - http_conn._http_vsn = 10 - http_conn._http_vsn_str = "HTTP/1.0" - - headers = {} - if proxy_auth: - headers["Proxy-Authorization"] = "Basic %s" % proxy_auth - if auth: - headers["Authorization"] = "Basic %s" % auth - if realhost: - headers["Host"] = realhost - for header, value in self.addheaders: - headers[header] = value - - if data is not None: - headers["Content-Type"] = "application/x-www-form-urlencoded" - http_conn.request("POST", selector, data, headers) - else: - http_conn.request("GET", selector, headers=headers) - - try: - response = http_conn.getresponse() - except http.client.BadStatusLine: - # something went wrong with the HTTP status line - raise IOError('http protocol error', 0, - 'got a bad status line', None) - - # According to RFC 2616, "2xx" code indicates that the client's - # request was successfully received, understood, and accepted. - if (200 <= response.status < 300): - return addinfourl(response.fp, response.msg, "http:" + url, - response.status) - else: - return self.http_error( - url, response.fp, - response.status, response.reason, response.msg, data) - - def open_http(self, url, data=None): - """Use HTTP protocol.""" - return self._open_generic_http(http.client.HTTPConnection, url, data) - - def http_error(self, url, fp, errcode, errmsg, headers, data=None): - """Handle http errors. 
- - Derived class can override this, or provide specific handlers - named http_error_DDD where DDD is the 3-digit error code.""" - # First check if there's a specific handler for this error - name = 'http_error_%d' % errcode - if hasattr(self, name): - method = getattr(self, name) - if data is None: - result = method(url, fp, errcode, errmsg, headers) - else: - result = method(url, fp, errcode, errmsg, headers, data) - if result: return result - return self.http_error_default(url, fp, errcode, errmsg, headers) - - def http_error_default(self, url, fp, errcode, errmsg, headers): - """Default error handler: close the connection and raise IOError.""" - void = fp.read() - fp.close() - raise IOError('http error', errcode, errmsg, headers) - - if _have_ssl: - def _https_connection(self, host): - return http.client.HTTPSConnection(host, - key_file=self.key_file, - cert_file=self.cert_file) - - def open_https(self, url, data=None): - """Use HTTPS protocol.""" - return self._open_generic_http(self._https_connection, url, data) - - def open_file(self, url): - """Use local file or FTP depending on form of URL.""" - if not isinstance(url, str): - raise IOError('file error', 'proxy support for file protocol currently not implemented') - if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': - return self.open_ftp(url) - else: - return self.open_local_file(url) - - def open_local_file(self, url): - """Use local file.""" - import mimetypes, email.utils - host, file = splithost(url) - localname = url2pathname(file) - try: - stats = os.stat(localname) - except OSError as e: - raise IOError(e.errno, e.strerror, e.filename) - size = stats.st_size - modified = email.utils.formatdate(stats.st_mtime, usegmt=True) - mtype = mimetypes.guess_type(url)[0] - headers = email.message_from_string( - 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % - (mtype or 'text/plain', size, modified)) - if not host: - urlfile = file - if file[:1] == '/': - urlfile = 'file://' + file - return addinfourl(open(localname, 'rb'), - headers, urlfile) - host, port = splitport(host) - if not port \ - and socket.gethostbyname(host) in (localhost(), thishost()): - urlfile = file - if file[:1] == '/': - urlfile = 'file://' + file - return addinfourl(open(localname, 'rb'), - headers, urlfile) - raise IOError('local file error', 'not on local host') - - def open_ftp(self, url): - """Use FTP protocol.""" - if not isinstance(url, str): - raise IOError('ftp error', 'proxy support for ftp protocol currently not implemented') - import mimetypes - host, path = splithost(url) - if not host: raise IOError('ftp error', 'no host given') - host, port = splitport(host) - user, host = splituser(host) - if user: user, passwd = splitpasswd(user) - else: passwd = None - host = unquote(host) - user = unquote(user or '') - passwd = unquote(passwd or '') - host = socket.gethostbyname(host) - if not port: - import ftplib - port = ftplib.FTP_PORT - else: - port = int(port) - path, attrs = splitattr(path) - path = unquote(path) - dirs = path.split('/') - dirs, file = dirs[:-1], dirs[-1] - if dirs and not dirs[0]: dirs = dirs[1:] - if dirs and not dirs[0]: dirs[0] = '/' - key = user, host, port, '/'.join(dirs) - # XXX thread unsafe! 
- if len(self.ftpcache) > MAXFTPCACHE: - # Prune the cache, rather arbitrarily - for k in self.ftpcache.keys(): - if k != key: - v = self.ftpcache[k] - del self.ftpcache[k] - v.close() - try: - if not key in self.ftpcache: - self.ftpcache[key] = \ - ftpwrapper(user, passwd, host, port, dirs) - if not file: type = 'D' - else: type = 'I' - for attr in attrs: - attr, value = splitvalue(attr) - if attr.lower() == 'type' and \ - value in ('a', 'A', 'i', 'I', 'd', 'D'): - type = value.upper() - (fp, retrlen) = self.ftpcache[key].retrfile(file, type) - mtype = mimetypes.guess_type("ftp:" + url)[0] - headers = "" - if mtype: - headers += "Content-Type: %s\n" % mtype - if retrlen is not None and retrlen >= 0: - headers += "Content-Length: %d\n" % retrlen - headers = email.message_from_string(headers) - return addinfourl(fp, headers, "ftp:" + url) - except ftperrors() as msg: - raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2]) - - def open_data(self, url, data=None): - """Use "data" URL.""" - if not isinstance(url, str): - raise IOError('data error', 'proxy support for data protocol currently not implemented') - # ignore POSTed data - # - # syntax of data URLs: - # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data - # mediatype := [ type "/" subtype ] *( ";" parameter ) - # data := *urlchar - # parameter := attribute "=" value - from io import StringIO - try: - [type, data] = url.split(',', 1) - except ValueError: - raise IOError('data error', 'bad data URL') - if not type: - type = 'text/plain;charset=US-ASCII' - semi = type.rfind(';') - if semi >= 0 and '=' not in type[semi:]: - encoding = type[semi+1:] - type = type[:semi] - else: - encoding = '' - msg = [] - msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', - time.gmtime(time.time()))) - msg.append('Content-type: %s' % type) - if encoding == 'base64': - import base64 - data = base64.decodestring(data) - else: - data = unquote(data) - msg.append('Content-Length: %d' % len(data)) - msg.append('') - msg.append(data) - msg = '\n'.join(msg) - headers = email.message_from_string(msg) - f = StringIO(msg) - #f.fileno = None # needed for addinfourl - return addinfourl(f, headers, url) - - -class FancyURLopener(URLopener): - """Derived class with handlers for errors we can handle (perhaps).""" - - def __init__(self, *args, **kwargs): - URLopener.__init__(self, *args, **kwargs) - self.auth_cache = {} - self.tries = 0 - self.maxtries = 10 - - def http_error_default(self, url, fp, errcode, errmsg, headers): - """Default error handling -- don't raise an exception.""" - return addinfourl(fp, headers, "http:" + url, errcode) - - def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): - """Error 302 -- relocated (temporarily).""" - self.tries += 1 - if self.maxtries and self.tries >= self.maxtries: - if hasattr(self, "http_error_500"): - meth = self.http_error_500 - else: - meth = self.http_error_default - self.tries = 0 - return meth(url, fp, 500, - "Internal Server Error: Redirect Recursion", headers) - result = self.redirect_internal(url, fp, errcode, errmsg, headers, - data) - self.tries = 0 - return result - - def redirect_internal(self, url, fp, errcode, errmsg, headers, data): - if 'location' in headers: - newurl = headers['location'] - elif 'uri' in headers: - newurl = headers['uri'] - else: - return - void = fp.read() - fp.close() - # In case the server sent a relative URL, join with original: - newurl = basejoin(self.type + ":" + url, newurl) - return self.open(newurl) - - def http_error_301(self, url, fp, 
errcode, errmsg, headers, data=None): - """Error 301 -- also relocated (permanently).""" - return self.http_error_302(url, fp, errcode, errmsg, headers, data) - - def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): - """Error 303 -- also relocated (essentially identical to 302).""" - return self.http_error_302(url, fp, errcode, errmsg, headers, data) - - def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): - """Error 307 -- relocated, but turn POST into error.""" - if data is None: - return self.http_error_302(url, fp, errcode, errmsg, headers, data) - else: - return self.http_error_default(url, fp, errcode, errmsg, headers) - - def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): - """Error 401 -- authentication required. - This function supports Basic authentication only.""" - if not 'www-authenticate' in headers: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - stuff = headers['www-authenticate'] - import re - match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) - if not match: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - scheme, realm = match.groups() - if scheme.lower() != 'basic': - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - name = 'retry_' + self.type + '_basic_auth' - if data is None: - return getattr(self,name)(url, realm) - else: - return getattr(self,name)(url, realm, data) - - def http_error_407(self, url, fp, errcode, errmsg, headers, data=None): - """Error 407 -- proxy authentication required. - This function supports Basic authentication only.""" - if not 'proxy-authenticate' in headers: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - stuff = headers['proxy-authenticate'] - import re - match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) - if not match: - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - scheme, realm = match.groups() - if scheme.lower() != 'basic': - URLopener.http_error_default(self, url, fp, - errcode, errmsg, headers) - name = 'retry_proxy_' + self.type + '_basic_auth' - if data is None: - return getattr(self,name)(url, realm) - else: - return getattr(self,name)(url, realm, data) - - def retry_proxy_http_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - newurl = 'http://' + host + selector - proxy = self.proxies['http'] - urltype, proxyhost = splittype(proxy) - proxyhost, proxyselector = splithost(proxyhost) - i = proxyhost.find('@') + 1 - proxyhost = proxyhost[i:] - user, passwd = self.get_user_passwd(proxyhost, realm, i) - if not (user or passwd): return None - proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost - self.proxies['http'] = 'http://' + proxyhost + proxyselector - if data is None: - return self.open(newurl) - else: - return self.open(newurl, data) - - def retry_proxy_https_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - newurl = 'https://' + host + selector - proxy = self.proxies['https'] - urltype, proxyhost = splittype(proxy) - proxyhost, proxyselector = splithost(proxyhost) - i = proxyhost.find('@') + 1 - proxyhost = proxyhost[i:] - user, passwd = self.get_user_passwd(proxyhost, realm, i) - if not (user or passwd): return None - proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost - self.proxies['https'] = 'https://' + proxyhost + proxyselector - if data is None: - return self.open(newurl) - else: - return 
self.open(newurl, data) - - def retry_http_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - i = host.find('@') + 1 - host = host[i:] - user, passwd = self.get_user_passwd(host, realm, i) - if not (user or passwd): return None - host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host - newurl = 'http://' + host + selector - if data is None: - return self.open(newurl) - else: - return self.open(newurl, data) - - def retry_https_basic_auth(self, url, realm, data=None): - host, selector = splithost(url) - i = host.find('@') + 1 - host = host[i:] - user, passwd = self.get_user_passwd(host, realm, i) - if not (user or passwd): return None - host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host - newurl = 'https://' + host + selector - if data is None: - return self.open(newurl) - else: - return self.open(newurl, data) - - def get_user_passwd(self, host, realm, clear_cache = 0): - key = realm + '@' + host.lower() - if key in self.auth_cache: - if clear_cache: - del self.auth_cache[key] - else: - return self.auth_cache[key] - user, passwd = self.prompt_user_passwd(host, realm) - if user or passwd: self.auth_cache[key] = (user, passwd) - return user, passwd - - def prompt_user_passwd(self, host, realm): - """Override this in a GUI environment!""" - import getpass - try: - user = input("Enter username for %s at %s: " % (realm, host)) - passwd = getpass.getpass("Enter password for %s in %s at %s: " % - (user, realm, host)) - return user, passwd - except KeyboardInterrupt: - print() - return None, None - - -# Utility functions - -_localhost = None -def localhost(): - """Return the IP address of the magic hostname 'localhost'.""" - global _localhost - if _localhost is None: - _localhost = socket.gethostbyname('localhost') - return _localhost - -_thishost = None -def thishost(): - """Return the IP address of the current host.""" - global _thishost - if _thishost is None: - _thishost = socket.gethostbyname(socket.gethostname()) - return _thishost - -_ftperrors = None -def ftperrors(): - """Return the set of errors raised by the FTP class.""" - global _ftperrors - if _ftperrors is None: - import ftplib - _ftperrors = ftplib.all_errors - return _ftperrors - -_noheaders = None -def noheaders(): - """Return an empty email.message.Message object.""" - global _noheaders - if _noheaders is None: - _noheaders = email.message.Message() - return _noheaders - - -# Utility classes - -class ftpwrapper: - """Class used by open_ftp() for cache of open FTP connections.""" - - def __init__(self, user, passwd, host, port, dirs, - timeout=socket._GLOBAL_DEFAULT_TIMEOUT): - self.user = user - self.passwd = passwd - self.host = host - self.port = port - self.dirs = dirs - self.timeout = timeout - self.init() - - def init(self): - import ftplib - self.busy = 0 - self.ftp = ftplib.FTP() - self.ftp.connect(self.host, self.port, self.timeout) - self.ftp.login(self.user, self.passwd) - for dir in self.dirs: - self.ftp.cwd(dir) - - def retrfile(self, file, type): - import ftplib - self.endtransfer() - if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 - else: cmd = 'TYPE ' + type; isdir = 0 - try: - self.ftp.voidcmd(cmd) - except ftplib.all_errors: - self.init() - self.ftp.voidcmd(cmd) - conn = None - if file and not isdir: - # Try to retrieve as a file - try: - cmd = 'RETR ' + file - conn = self.ftp.ntransfercmd(cmd) - except ftplib.error_perm as reason: - if str(reason)[:3] != '550': - raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2]) - if not conn: - 
# Set transfer mode to ASCII! - self.ftp.voidcmd('TYPE A') - # Try a directory listing. Verify that directory exists. - if file: - pwd = self.ftp.pwd() - try: - try: - self.ftp.cwd(file) - except ftplib.error_perm as reason: - raise IOError('ftp error', reason) from reason - finally: - self.ftp.cwd(pwd) - cmd = 'LIST ' + file - else: - cmd = 'LIST' - conn = self.ftp.ntransfercmd(cmd) - self.busy = 1 - # Pass back both a suitably decorated object and a retrieval length - return (addclosehook(conn[0].makefile('rb'), - self.endtransfer), conn[1]) - def endtransfer(self): - if not self.busy: - return - self.busy = 0 - try: - self.ftp.voidresp() - except ftperrors(): - pass - - def close(self): - self.endtransfer() - try: - self.ftp.close() - except ftperrors(): - pass - -class addbase: - """Base class for addinfo and addclosehook.""" - - # XXX Add a method to expose the timeout on the underlying socket? - - def __init__(self, fp): - self.fp = fp - self.read = self.fp.read - self.readline = self.fp.readline - if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines - if hasattr(self.fp, "fileno"): - self.fileno = self.fp.fileno - else: - self.fileno = lambda: None - if hasattr(self.fp, "__iter__"): - self.__iter__ = self.fp.__iter__ - if hasattr(self.fp, "__next__"): - self.__next__ = self.fp.__next__ - - def __repr__(self): - return '<%s at %r whose fp = %r>' % (self.__class__.__name__, - id(self), self.fp) - - def close(self): - self.read = None - self.readline = None - self.readlines = None - self.fileno = None - if self.fp: self.fp.close() - self.fp = None - -class addclosehook(addbase): - """Class to add a close hook to an open file.""" - - def __init__(self, fp, closehook, *hookargs): - addbase.__init__(self, fp) - self.closehook = closehook - self.hookargs = hookargs - - def close(self): - addbase.close(self) - if self.closehook: - self.closehook(*self.hookargs) - self.closehook = None - self.hookargs = None - -class addinfo(addbase): - """class to add an info() method to an open file.""" - - def __init__(self, fp, headers): - addbase.__init__(self, fp) - self.headers = headers - - def info(self): - return self.headers - -class addinfourl(addbase): - """class to add info() and geturl() methods to an open file.""" - - def __init__(self, fp, headers, url, code=None): - addbase.__init__(self, fp) - self.headers = headers - self.url = url - self.code = code - - def info(self): - return self.headers - - def getcode(self): - return self.code - - def geturl(self): - return self.url - - -# Utilities to parse URLs (most of these return None for missing parts): -# unwrap('') --> 'type://host/path' -# splittype('type:opaquestring') --> 'type', 'opaquestring' -# splithost('//host[:port]/path') --> 'host[:port]', '/path' -# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' -# splitpasswd('user:passwd') -> 'user', 'passwd' -# splitport('host:port') --> 'host', 'port' -# splitquery('/path?query') --> '/path', 'query' -# splittag('/path#tag') --> '/path', 'tag' -# splitattr('/path;attr1=value1;attr2=value2;...') -> -# '/path', ['attr1=value1', 'attr2=value2', ...] -# splitvalue('attr=value') --> 'attr', 'value' -# unquote('abc%20def') -> 'abc def' -# quote('abc def') -> 'abc%20def') - -def toBytes(url): - """toBytes(u"URL") --> 'URL'.""" - # Most URL schemes require ASCII. If that changes, the conversion - # can be relaxed. 
- # XXX get rid of toBytes() - if isinstance(url, str): - try: - url = url.encode("ASCII").decode() - except UnicodeError: - raise UnicodeError("URL " + repr(url) + - " contains non-ASCII characters") - return url - -def unwrap(url): - """unwrap('') --> 'type://host/path'.""" - url = str(url).strip() - if url[:1] == '<' and url[-1:] == '>': - url = url[1:-1].strip() - if url[:4] == 'URL:': url = url[4:].strip() - return url - -_typeprog = None -def splittype(url): - """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" - global _typeprog - if _typeprog is None: - import re - _typeprog = re.compile('^([^/:]+):') - - match = _typeprog.match(url) - if match: - scheme = match.group(1) - return scheme.lower(), url[len(scheme) + 1:] - return None, url - -_hostprog = None -def splithost(url): - """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" - global _hostprog - if _hostprog is None: - import re - _hostprog = re.compile('^//([^/?]*)(.*)$') - - match = _hostprog.match(url) - if match: return match.group(1, 2) - return None, url - -_userprog = None -def splituser(host): - """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" - global _userprog - if _userprog is None: - import re - _userprog = re.compile('^(.*)@(.*)$') - - match = _userprog.match(host) - if match: return map(unquote, match.group(1, 2)) - return None, host - -_passwdprog = None -def splitpasswd(user): - """splitpasswd('user:passwd') -> 'user', 'passwd'.""" - global _passwdprog - if _passwdprog is None: - import re - _passwdprog = re.compile('^([^:]*):(.*)$') - - match = _passwdprog.match(user) - if match: return match.group(1, 2) - return user, None - -# splittag('/path#tag') --> '/path', 'tag' -_portprog = None -def splitport(host): - """splitport('host:port') --> 'host', 'port'.""" - global _portprog - if _portprog is None: - import re - _portprog = re.compile('^(.*):([0-9]+)$') - - match = _portprog.match(host) - if match: return match.group(1, 2) - return host, None - -_nportprog = None -def splitnport(host, defport=-1): - """Split host and port, returning numeric port. - Return given default port if no ':' found; defaults to -1. - Return numerical port if a valid number are found after ':'. 
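The split* helpers being removed here reappear unchanged in the new urllib.parse later in this patch; each one peels a single component off a URL-like string. A quick sketch of what the regular expressions shown above produce for a typical authority-bearing URL, assuming the helpers stay importable from urllib.parse:

    import urllib.parse

    url = 'http://user:pw@example.com:8080/path?x=1'
    scheme, rest = urllib.parse.splittype(url)          # 'http', '//user:pw@example.com:8080/path?x=1'
    host, path = urllib.parse.splithost(rest)           # 'user:pw@example.com:8080', '/path?x=1'
    userinfo, hostport = urllib.parse.splituser(host)   # 'user:pw', 'example.com:8080'
    user, passwd = urllib.parse.splitpasswd(userinfo)   # 'user', 'pw'
    hostname, port = urllib.parse.splitport(hostport)   # 'example.com', '8080'
    path, query = urllib.parse.splitquery(path)         # '/path', 'x=1'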
- Return None if ':' but not a valid number.""" - global _nportprog - if _nportprog is None: - import re - _nportprog = re.compile('^(.*):(.*)$') - - match = _nportprog.match(host) - if match: - host, port = match.group(1, 2) - try: - if not port: raise ValueError("no digits") - nport = int(port) - except ValueError: - nport = None - return host, nport - return host, defport - -_queryprog = None -def splitquery(url): - """splitquery('/path?query') --> '/path', 'query'.""" - global _queryprog - if _queryprog is None: - import re - _queryprog = re.compile('^(.*)\?([^?]*)$') - - match = _queryprog.match(url) - if match: return match.group(1, 2) - return url, None - -_tagprog = None -def splittag(url): - """splittag('/path#tag') --> '/path', 'tag'.""" - global _tagprog - if _tagprog is None: - import re - _tagprog = re.compile('^(.*)#([^#]*)$') - - match = _tagprog.match(url) - if match: return match.group(1, 2) - return url, None - -def splitattr(url): - """splitattr('/path;attr1=value1;attr2=value2;...') -> - '/path', ['attr1=value1', 'attr2=value2', ...].""" - words = url.split(';') - return words[0], words[1:] - -_valueprog = None -def splitvalue(attr): - """splitvalue('attr=value') --> 'attr', 'value'.""" - global _valueprog - if _valueprog is None: - import re - _valueprog = re.compile('^([^=]*)=(.*)$') - - match = _valueprog.match(attr) - if match: return match.group(1, 2) - return attr, None - -_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) -_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) - -def unquote(s): - """unquote('abc%20def') -> 'abc def'.""" - res = s.split('%') - for i in range(1, len(res)): - item = res[i] - try: - res[i] = _hextochr[item[:2]] + item[2:] - except KeyError: - res[i] = '%' + item - except UnicodeDecodeError: - res[i] = chr(int(item[:2], 16)) + item[2:] - return "".join(res) - -def unquote_plus(s): - """unquote('%7e/abc+def') -> '~/abc def'""" - s = s.replace('+', ' ') - return unquote(s) - -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-') -_safe_quoters= {} - -class Quoter: - def __init__(self, safe): - self.cache = {} - self.safe = safe + always_safe - - def __call__(self, c): - try: - return self.cache[c] - except KeyError: - if ord(c) < 256: - res = (c in self.safe) and c or ('%%%02X' % ord(c)) - self.cache[c] = res - return res - else: - return "".join(['%%%02X' % i for i in c.encode("utf-8")]) - -def quote(s, safe = '/'): - """quote('abc def') -> 'abc%20def' - - Each part of a URL, e.g. the path info, the query, etc., has a - different set of reserved characters that must be quoted. - - RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists - the following reserved characters. - - reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | - "$" | "," - - Each of these characters is reserved in some component of a URL, - but not necessarily in all of them. - - By default, the quote function is intended for quoting the path - section of a URL. Thus, it will not encode '/'. This character - is reserved, but in typical usage the quote function is being - called on a path where the existing slash characters are used as - reserved characters. 
- """ - cachekey = (safe, always_safe) - try: - quoter = _safe_quoters[cachekey] - except KeyError: - quoter = Quoter(safe) - _safe_quoters[cachekey] = quoter - res = map(quoter, s) - return ''.join(res) - -def quote_plus(s, safe = ''): - """Quote the query fragment of a URL; replacing ' ' with '+'""" - if ' ' in s: - s = quote(s, safe + ' ') - return s.replace(' ', '+') - return quote(s, safe) - -def urlencode(query,doseq=0): - """Encode a sequence of two-element tuples or dictionary into a URL query string. - - If any values in the query arg are sequences and doseq is true, each - sequence element is converted to a separate parameter. - - If the query arg is a sequence of two-element tuples, the order of the - parameters in the output will match the order of parameters in the - input. - """ - - if hasattr(query,"items"): - # mapping objects - query = query.items() - else: - # it's a bother at times that strings and string-like objects are - # sequences... - try: - # non-sequence items should not work with len() - # non-empty strings will fail this - if len(query) and not isinstance(query[0], tuple): - raise TypeError - # zero-length sequences of all types will get here and succeed, - # but that's a minor nit - since the original implementation - # allowed empty dicts that type of behavior probably should be - # preserved for consistency - except TypeError: - ty,va,tb = sys.exc_info() - raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb) - - l = [] - if not doseq: - # preserve old behavior - for k, v in query: - k = quote_plus(str(k)) - v = quote_plus(str(v)) - l.append(k + '=' + v) - else: - for k, v in query: - k = quote_plus(str(k)) - if isinstance(v, str): - v = quote_plus(v) - l.append(k + '=' + v) - elif isinstance(v, str): - # is there a reasonable way to convert to ASCII? - # encode generates a string, but "replace" or "ignore" - # lose information and "strict" can raise UnicodeError - v = quote_plus(v.encode("ASCII","replace")) - l.append(k + '=' + v) - else: - try: - # is this a sufficient test for sequence-ness? - x = len(v) - except TypeError: - # not a sequence - v = quote_plus(str(v)) - l.append(k + '=' + v) - else: - # loop over the sequence - for elt in v: - l.append(k + '=' + quote_plus(str(elt))) - return '&'.join(l) - -# Proxy handling -def getproxies_environment(): - """Return a dictionary of scheme -> proxy server URL mappings. - - Scan the environment for variables named _proxy; - this seems to be the standard convention. If you need a - different way, you can pass a proxies dictionary to the - [Fancy]URLopener constructor. - - """ - proxies = {} - for name, value in os.environ.items(): - name = name.lower() - if name == 'no_proxy': - # handled in proxy_bypass_environment - continue - if value and name[-6:] == '_proxy': - proxies[name[:-6]] = value - return proxies - -def proxy_bypass_environment(host): - """Test if proxies should not be used for a particular host. - - Checks the environment for a variable named no_proxy, which should - be a list of DNS suffixes separated by commas, or '*' for all hosts. 
- """ - no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '') - # '*' is special case for always bypass - if no_proxy == '*': - return 1 - # strip port off host - hostonly, port = splitport(host) - # check if the host ends with any of the DNS suffixes - for name in no_proxy.split(','): - if name and (hostonly.endswith(name) or host.endswith(name)): - return 1 - # otherwise, don't bypass - return 0 - - -if sys.platform == 'darwin': - - def _CFSetup(sc): - from ctypes import c_int32, c_void_p, c_char_p, c_int - sc.CFStringCreateWithCString.argtypes = [ c_void_p, c_char_p, c_int32 ] - sc.CFStringCreateWithCString.restype = c_void_p - sc.SCDynamicStoreCopyProxies.argtypes = [ c_void_p ] - sc.SCDynamicStoreCopyProxies.restype = c_void_p - sc.CFDictionaryGetValue.argtypes = [ c_void_p, c_void_p ] - sc.CFDictionaryGetValue.restype = c_void_p - sc.CFStringGetLength.argtypes = [ c_void_p ] - sc.CFStringGetLength.restype = c_int32 - sc.CFStringGetCString.argtypes = [ c_void_p, c_char_p, c_int32, c_int32 ] - sc.CFStringGetCString.restype = c_int32 - sc.CFNumberGetValue.argtypes = [ c_void_p, c_int, c_void_p ] - sc.CFNumberGetValue.restype = c_int32 - sc.CFRelease.argtypes = [ c_void_p ] - sc.CFRelease.restype = None - - def _CStringFromCFString(sc, value): - from ctypes import create_string_buffer - length = sc.CFStringGetLength(value) + 1 - buff = create_string_buffer(length) - sc.CFStringGetCString(value, buff, length, 0) - return buff.value - - def _CFNumberToInt32(sc, cfnum): - from ctypes import byref, c_int - val = c_int() - kCFNumberSInt32Type = 3 - sc.CFNumberGetValue(cfnum, kCFNumberSInt32Type, byref(val)) - return val.value - - - def proxy_bypass_macosx_sysconf(host): - """ - Return True iff this host shouldn't be accessed using a proxy - - This function uses the MacOSX framework SystemConfiguration - to fetch the proxy information. - """ - from ctypes import cdll - from ctypes.util import find_library - import re - import socket - from fnmatch import fnmatch - - def ip2num(ipAddr): - parts = ipAddr.split('.') - parts = map(int, parts) - if len(parts) != 4: - parts = (parts + [0, 0, 0, 0])[:4] - return (parts[0] << 24) | (parts[1] << 16) | (parts[2] << 8) | parts[3] - - sc = cdll.LoadLibrary(find_library("SystemConfiguration")) - _CFSetup(sc) - - hostIP = None - - if not sc: - return False - - kSCPropNetProxiesExceptionsList = sc.CFStringCreateWithCString(0, "ExceptionsList", 0) - kSCPropNetProxiesExcludeSimpleHostnames = sc.CFStringCreateWithCString(0, - "ExcludeSimpleHostnames", 0) - - - proxyDict = sc.SCDynamicStoreCopyProxies(None) - if proxyDict is None: - return False - - try: - # Check for simple host names: - if '.' 
not in host: - exclude_simple = sc.CFDictionaryGetValue(proxyDict, - kSCPropNetProxiesExcludeSimpleHostnames) - if exclude_simple and _CFNumberToInt32(sc, exclude_simple): - return True - - - # Check the exceptions list: - exceptions = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesExceptionsList) - if exceptions: - # Items in the list are strings like these: *.local, 169.254/16 - for index in xrange(sc.CFArrayGetCount(exceptions)): - value = sc.CFArrayGetValueAtIndex(exceptions, index) - if not value: continue - value = _CStringFromCFString(sc, value) - - m = re.match(r"(\d+(?:\.\d+)*)(/\d+)?", value) - if m is not None: - if hostIP is None: - hostIP = socket.gethostbyname(host) - hostIP = ip2num(hostIP) - - base = ip2num(m.group(1)) - mask = int(m.group(2)[1:]) - mask = 32 - mask - - if (hostIP >> mask) == (base >> mask): - return True - - elif fnmatch(host, value): - return True - - return False - - finally: - sc.CFRelease(kSCPropNetProxiesExceptionsList) - sc.CFRelease(kSCPropNetProxiesExcludeSimpleHostnames) - - - - def getproxies_macosx_sysconf(): - """Return a dictionary of scheme -> proxy server URL mappings. - - This function uses the MacOSX framework SystemConfiguration - to fetch the proxy information. - """ - from ctypes import cdll - from ctypes.util import find_library - - sc = cdll.LoadLibrary(find_library("SystemConfiguration")) - _CFSetup(sc) - - if not sc: - return {} - - kSCPropNetProxiesHTTPEnable = sc.CFStringCreateWithCString(0, b"HTTPEnable", 0) - kSCPropNetProxiesHTTPProxy = sc.CFStringCreateWithCString(0, b"HTTPProxy", 0) - kSCPropNetProxiesHTTPPort = sc.CFStringCreateWithCString(0, b"HTTPPort", 0) - - kSCPropNetProxiesHTTPSEnable = sc.CFStringCreateWithCString(0, b"HTTPSEnable", 0) - kSCPropNetProxiesHTTPSProxy = sc.CFStringCreateWithCString(0, b"HTTPSProxy", 0) - kSCPropNetProxiesHTTPSPort = sc.CFStringCreateWithCString(0, b"HTTPSPort", 0) - - kSCPropNetProxiesFTPEnable = sc.CFStringCreateWithCString(0, b"FTPEnable", 0) - kSCPropNetProxiesFTPPassive = sc.CFStringCreateWithCString(0, b"FTPPassive", 0) - kSCPropNetProxiesFTPPort = sc.CFStringCreateWithCString(0, b"FTPPort", 0) - kSCPropNetProxiesFTPProxy = sc.CFStringCreateWithCString(0, b"FTPProxy", 0) - - kSCPropNetProxiesGopherEnable = sc.CFStringCreateWithCString(0, b"GopherEnable", 0) - kSCPropNetProxiesGopherPort = sc.CFStringCreateWithCString(0, b"GopherPort", 0) - kSCPropNetProxiesGopherProxy = sc.CFStringCreateWithCString(0, b"GopherProxy", 0) - - proxies = {} - proxyDict = sc.SCDynamicStoreCopyProxies(None) - - try: - # HTTP: - enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["http"] = "http://%s:%i" % (proxy, port) - else: - proxies["http"] = "http://%s" % (proxy, ) - - # HTTPS: - enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesHTTPSPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["https"] = "http://%s:%i" % (proxy, port) - else: - proxies["https"] = "http://%s" % (proxy, ) - - # FTP: - enabled = 
sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesFTPPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["ftp"] = "http://%s:%i" % (proxy, port) - else: - proxies["ftp"] = "http://%s" % (proxy, ) - - # Gopher: - enabled = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherEnable) - if enabled and _CFNumberToInt32(sc, enabled): - proxy = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherProxy) - port = sc.CFDictionaryGetValue(proxyDict, kSCPropNetProxiesGopherPort) - - if proxy: - proxy = _CStringFromCFString(sc, proxy) - if port: - port = _CFNumberToInt32(sc, port) - proxies["gopher"] = "http://%s:%i" % (proxy, port) - else: - proxies["gopher"] = "http://%s" % (proxy, ) - finally: - sc.CFRelease(proxyDict) - - sc.CFRelease(kSCPropNetProxiesHTTPEnable) - sc.CFRelease(kSCPropNetProxiesHTTPProxy) - sc.CFRelease(kSCPropNetProxiesHTTPPort) - sc.CFRelease(kSCPropNetProxiesFTPEnable) - sc.CFRelease(kSCPropNetProxiesFTPPassive) - sc.CFRelease(kSCPropNetProxiesFTPPort) - sc.CFRelease(kSCPropNetProxiesFTPProxy) - sc.CFRelease(kSCPropNetProxiesGopherEnable) - sc.CFRelease(kSCPropNetProxiesGopherPort) - sc.CFRelease(kSCPropNetProxiesGopherProxy) - - return proxies - - - - def proxy_bypass(host): - if getproxies_environment(): - return proxy_bypass_environment(host) - else: - return proxy_bypass_macosx_sysconf(host) - - def getproxies(): - return getproxies_environment() or getproxies_macosx_sysconf() - -elif os.name == 'nt': - def getproxies_registry(): - """Return a dictionary of scheme -> proxy server URL mappings. - - Win32 uses the registry to store proxies. - - """ - proxies = {} - try: - import winreg - except ImportError: - # Std module, so should be around - but you never know! - return proxies - try: - internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, - r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') - proxyEnable = winreg.QueryValueEx(internetSettings, - 'ProxyEnable')[0] - if proxyEnable: - # Returned as Unicode but problems if not converted to ASCII - proxyServer = str(winreg.QueryValueEx(internetSettings, - 'ProxyServer')[0]) - if '=' in proxyServer: - # Per-protocol settings - for p in proxyServer.split(';'): - protocol, address = p.split('=', 1) - # See if address has a type:// prefix - import re - if not re.match('^([^/:]+)://', address): - address = '%s://%s' % (protocol, address) - proxies[protocol] = address - else: - # Use one setting for all protocols - if proxyServer[:5] == 'http:': - proxies['http'] = proxyServer - else: - proxies['http'] = 'http://%s' % proxyServer - proxies['ftp'] = 'ftp://%s' % proxyServer - internetSettings.Close() - except (WindowsError, ValueError, TypeError): - # Either registry key not found etc, or the value in an - # unexpected format. - # proxies already set up to be empty so nothing to do - pass - return proxies - - def getproxies(): - """Return a dictionary of scheme -> proxy server URL mappings. - - Returns settings gathered from the environment, if specified, - or the registry. - - """ - return getproxies_environment() or getproxies_registry() - - def proxy_bypass_registry(host): - try: - import winreg - import re - except ImportError: - # Std modules, so should be around - but you never know! 
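Both the environment-based lookup and the platform-specific lookups above funnel into the same two entry points, getproxies() and proxy_bypass(). A small sketch of the environment-driven path, assuming those names stay exposed from the new urllib.request (the proxy host and domain suffix are placeholders):

    import os
    import urllib.request

    os.environ['http_proxy'] = 'http://proxy.example.com:3128'
    os.environ['no_proxy'] = 'localhost,.internal.example.com'

    # Environment settings take precedence over the registry / SystemConfiguration lookups.
    print(urllib.request.getproxies())
    # expected: {'http': 'http://proxy.example.com:3128'}
    print(urllib.request.proxy_bypass('db.internal.example.com'))  # 1 -> suffix matches no_proxy
    print(urllib.request.proxy_bypass('www.python.org'))           # 0 -> goes through the proxy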
- return 0 - try: - internetSettings = winreg.OpenKey(winreg.HKEY_CURRENT_USER, - r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') - proxyEnable = winreg.QueryValueEx(internetSettings, - 'ProxyEnable')[0] - proxyOverride = str(winreg.QueryValueEx(internetSettings, - 'ProxyOverride')[0]) - # ^^^^ Returned as Unicode but problems if not converted to ASCII - except WindowsError: - return 0 - if not proxyEnable or not proxyOverride: - return 0 - # try to make a host list from name and IP address. - rawHost, port = splitport(host) - host = [rawHost] - try: - addr = socket.gethostbyname(rawHost) - if addr != rawHost: - host.append(addr) - except socket.error: - pass - try: - fqdn = socket.getfqdn(rawHost) - if fqdn != rawHost: - host.append(fqdn) - except socket.error: - pass - # make a check value list from the registry entry: replace the - # '' string by the localhost entry and the corresponding - # canonical entry. - proxyOverride = proxyOverride.split(';') - i = 0 - while i < len(proxyOverride): - if proxyOverride[i] == '': - proxyOverride[i:i+1] = ['localhost', - '127.0.0.1', - socket.gethostname(), - socket.gethostbyname( - socket.gethostname())] - i += 1 - # print proxyOverride - # now check if we match one of the registry values. - for test in proxyOverride: - test = test.replace(".", r"\.") # mask dots - test = test.replace("*", r".*") # change glob sequence - test = test.replace("?", r".") # change glob char - for val in host: - # print "%s <--> %s" %( test, val ) - if re.match(test, val, re.I): - return 1 - return 0 - - def proxy_bypass(host): - """Return a dictionary of scheme -> proxy server URL mappings. - - Returns settings gathered from the environment, if specified, - or the registry. - - """ - if getproxies_environment(): - return proxy_bypass_environment(host) - else: - return proxy_bypass_registry(host) - -else: - # By default use environment variables - getproxies = getproxies_environment - proxy_bypass = proxy_bypass_environment - -# Test and time quote() and unquote() -def test1(): - s = '' - for i in range(256): s = s + chr(i) - s = s*4 - t0 = time.time() - qs = quote(s) - uqs = unquote(qs) - t1 = time.time() - if uqs != s: - print('Wrong!') - print(repr(s)) - print(repr(qs)) - print(repr(uqs)) - print(round(t1 - t0, 3), 'sec') - - -def reporthook(blocknum, blocksize, totalsize): - # Report during remote transfers - print("Block number: %d, Block size: %d, Total size: %d" % ( - blocknum, blocksize, totalsize)) - -# Test program -def test(args=[]): - if not args: - args = [ - '/etc/passwd', - 'file:/etc/passwd', - 'file://localhost/etc/passwd', - 'ftp://ftp.gnu.org/pub/README', - 'http://www.python.org/index.html', - ] - if hasattr(URLopener, "open_https"): - args.append('https://synergy.as.cmu.edu/~geek/') - try: - for url in args: - print('-'*10, url, '-'*10) - fn, h = urlretrieve(url, None, reporthook) - print(fn) - if h: - print('======') - for k in h.keys(): print(k + ':', h[k]) - print('======') - fp = open(fn, 'rb') - data = fp.read() - del fp - data = data.replace("\r", "") - print(data) - fn, h = None, None - print('-'*40) - finally: - urlcleanup() - -def main(): - import getopt, sys - try: - opts, args = getopt.getopt(sys.argv[1:], "th") - except getopt.error as msg: - print(msg) - print("Use -h for help") - return - t = 0 - for o, a in opts: - if o == '-t': - t = t + 1 - if o == '-h': - print("Usage: python urllib.py [-t] [url ...]") - print("-t runs self-test;", end=' ') - print("otherwise, contents of urls are printed") - return - if t: - if t > 
1: - test1() - test(args) - else: - if not args: - print("Use -h for help") - for url in args: - print(urlopen(url).read(), end=' ') - -# Run test program when run as a script -if __name__ == '__main__': - main() diff --git a/Lib/urllib/__init__.py b/Lib/urllib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Lib/urllib/error.py b/Lib/urllib/error.py new file mode 100644 index 0000000..300c3fe --- /dev/null +++ b/Lib/urllib/error.py @@ -0,0 +1,59 @@ +"""Exception classes raised by urllib. + +The base exception class is URLError, which inherits from IOError. It +doesn't define any behavior of its own, but is the base class for all +exceptions defined in this package. + +HTTPError is an exception class that is also a valid HTTP response +instance. It behaves this way because HTTP protocol errors are valid +responses, with a status code, headers, and a body. In some contexts, +an application may want to handle an exception like a regular +response. +""" + +import urllib.response + +# do these error classes make sense? +# make sure all of the IOError stuff is overridden. we just want to be +# subtypes. + +class URLError(IOError): + # URLError is a sub-type of IOError, but it doesn't share any of + # the implementation. need to override __init__ and __str__. + # It sets self.args for compatibility with other EnvironmentError + # subclasses, but args doesn't have the typical format with errno in + # slot 0 and strerror in slot 1. This may be better than nothing. + def __init__(self, reason, filename=None): + self.args = reason, + self.reason = reason + if filename is not None: + self.filename = filename + + def __str__(self): + return '' % self.reason + +class HTTPError(URLError, urllib.response.addinfourl): + """Raised when HTTP error occurs, but also acts like non-error return""" + __super_init = urllib.response.addinfourl.__init__ + + def __init__(self, url, code, msg, hdrs, fp): + self.code = code + self.msg = msg + self.hdrs = hdrs + self.fp = fp + self.filename = url + # The addinfourl classes depend on fp being a valid file + # object. In some cases, the HTTPError may not have a valid + # file object. If this happens, the simplest workaround is to + # not initialize the base classes. + if fp is not None: + self.__super_init(fp, hdrs, url, code) + + def __str__(self): + return 'HTTP Error %s: %s' % (self.code, self.msg) + +# exception raised when downloaded size does not match content-length +class ContentTooShortError(URLError): + def __init__(self, message, content): + URLError.__init__(self, message) + self.content = content diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py new file mode 100644 index 0000000..71cc369 --- /dev/null +++ b/Lib/urllib/parse.py @@ -0,0 +1,630 @@ +"""Parse (absolute and relative) URLs. + +See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, +UC Irvine, June 1995. 
+""" + +__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", + "urlsplit", "urlunsplit"] + +# A classification of schemes ('' means apply by default) +uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', + 'wais', 'file', 'https', 'shttp', 'mms', + 'prospero', 'rtsp', 'rtspu', '', 'sftp'] +uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', + 'imap', 'wais', 'file', 'mms', 'https', 'shttp', + 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', + 'svn', 'svn+ssh', 'sftp'] +non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', + 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] +uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', + 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', + 'mms', '', 'sftp'] +uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', + 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] +uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', + 'nntp', 'wais', 'https', 'shttp', 'snews', + 'file', 'prospero', ''] + +# Characters valid in scheme names +scheme_chars = ('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789' + '+-.') + +MAX_CACHE_SIZE = 20 +_parse_cache = {} + +def clear_cache(): + """Clear the parse cache.""" + _parse_cache.clear() + + +class ResultMixin(object): + """Shared methods for the parsed result objects.""" + + @property + def username(self): + netloc = self.netloc + if "@" in netloc: + userinfo = netloc.rsplit("@", 1)[0] + if ":" in userinfo: + userinfo = userinfo.split(":", 1)[0] + return userinfo + return None + + @property + def password(self): + netloc = self.netloc + if "@" in netloc: + userinfo = netloc.rsplit("@", 1)[0] + if ":" in userinfo: + return userinfo.split(":", 1)[1] + return None + + @property + def hostname(self): + netloc = self.netloc + if "@" in netloc: + netloc = netloc.rsplit("@", 1)[1] + if ":" in netloc: + netloc = netloc.split(":", 1)[0] + return netloc.lower() or None + + @property + def port(self): + netloc = self.netloc + if "@" in netloc: + netloc = netloc.rsplit("@", 1)[1] + if ":" in netloc: + port = netloc.split(":", 1)[1] + return int(port, 10) + return None + +from collections import namedtuple + +class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin): + + __slots__ = () + + def geturl(self): + return urlunsplit(self) + + +class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin): + + __slots__ = () + + def geturl(self): + return urlunparse(self) + + +def urlparse(url, scheme='', allow_fragments=True): + """Parse a URL into 6 components: + :///;?# + Return a 6-tuple: (scheme, netloc, path, params, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. 
netloc is a single string) and we don't expand % escapes.""" + tuple = urlsplit(url, scheme, allow_fragments) + scheme, netloc, url, query, fragment = tuple + if scheme in uses_params and ';' in url: + url, params = _splitparams(url) + else: + params = '' + return ParseResult(scheme, netloc, url, params, query, fragment) + +def _splitparams(url): + if '/' in url: + i = url.find(';', url.rfind('/')) + if i < 0: + return url, '' + else: + i = url.find(';') + return url[:i], url[i+1:] + +def _splitnetloc(url, start=0): + delim = len(url) # position of end of domain part of url, default is end + for c in '/?#': # look for delimiters; the order is NOT important + wdelim = url.find(c, start) # find first of this delim + if wdelim >= 0: # if found + delim = min(delim, wdelim) # use earliest delim position + return url[start:delim], url[delim:] # return (domain, rest) + +def urlsplit(url, scheme='', allow_fragments=True): + """Parse a URL into 5 components: + :///?# + Return a 5-tuple: (scheme, netloc, path, query, fragment). + Note that we don't break the components up in smaller bits + (e.g. netloc is a single string) and we don't expand % escapes.""" + allow_fragments = bool(allow_fragments) + key = url, scheme, allow_fragments, type(url), type(scheme) + cached = _parse_cache.get(key, None) + if cached: + return cached + if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth + clear_cache() + netloc = query = fragment = '' + i = url.find(':') + if i > 0: + if url[:i] == 'http': # optimize the common case + scheme = url[:i].lower() + url = url[i+1:] + if url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if allow_fragments and '#' in url: + url, fragment = url.split('#', 1) + if '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return v + for c in url[:i]: + if c not in scheme_chars: + break + else: + scheme, url = url[:i].lower(), url[i+1:] + if scheme in uses_netloc and url[:2] == '//': + netloc, url = _splitnetloc(url, 2) + if allow_fragments and scheme in uses_fragment and '#' in url: + url, fragment = url.split('#', 1) + if scheme in uses_query and '?' in url: + url, query = url.split('?', 1) + v = SplitResult(scheme, netloc, url, query, fragment) + _parse_cache[key] = v + return v + +def urlunparse(components): + """Put a parsed URL back together again. This may result in a + slightly different, but equivalent URL, if the URL that was parsed + originally had redundant delimiters, e.g. a ? with an empty query + (the draft states that these are equivalent).""" + scheme, netloc, url, params, query, fragment = components + if params: + url = "%s;%s" % (url, params) + return urlunsplit((scheme, netloc, url, query, fragment)) + +def urlunsplit(components): + scheme, netloc, url, query, fragment = components + if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): + if url and url[:1] != '/': url = '/' + url + url = '//' + (netloc or '') + url + if scheme: + url = scheme + ':' + url + if query: + url = url + '?' 
+ query + if fragment: + url = url + '#' + fragment + return url + +def urljoin(base, url, allow_fragments=True): + """Join a base URL and a possibly relative URL to form an absolute + interpretation of the latter.""" + if not base: + return url + if not url: + return base + bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ + urlparse(base, '', allow_fragments) + scheme, netloc, path, params, query, fragment = \ + urlparse(url, bscheme, allow_fragments) + if scheme != bscheme or scheme not in uses_relative: + return url + if scheme in uses_netloc: + if netloc: + return urlunparse((scheme, netloc, path, + params, query, fragment)) + netloc = bnetloc + if path[:1] == '/': + return urlunparse((scheme, netloc, path, + params, query, fragment)) + if not (path or params or query): + return urlunparse((scheme, netloc, bpath, + bparams, bquery, fragment)) + segments = bpath.split('/')[:-1] + path.split('/') + # XXX The stuff below is bogus in various ways... + if segments[-1] == '.': + segments[-1] = '' + while '.' in segments: + segments.remove('.') + while 1: + i = 1 + n = len(segments) - 1 + while i < n: + if (segments[i] == '..' + and segments[i-1] not in ('', '..')): + del segments[i-1:i+1] + break + i = i+1 + else: + break + if segments == ['', '..']: + segments[-1] = '' + elif len(segments) >= 2 and segments[-1] == '..': + segments[-2:] = [''] + return urlunparse((scheme, netloc, '/'.join(segments), + params, query, fragment)) + +def urldefrag(url): + """Removes any existing fragment from URL. + + Returns a tuple of the defragmented URL and the fragment. If + the URL contained no fragments, the second element is the + empty string. + """ + if '#' in url: + s, n, p, a, q, frag = urlparse(url) + defrag = urlunparse((s, n, p, a, q, '')) + return defrag, frag + else: + return url, '' + + +_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) +_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) + +def unquote(s): + """unquote('abc%20def') -> 'abc def'.""" + res = s.split('%') + for i in range(1, len(res)): + item = res[i] + try: + res[i] = _hextochr[item[:2]] + item[2:] + except KeyError: + res[i] = '%' + item + except UnicodeDecodeError: + res[i] = chr(int(item[:2], 16)) + item[2:] + return "".join(res) + +def unquote_plus(s): + """unquote('%7e/abc+def') -> '~/abc def'""" + s = s.replace('+', ' ') + return unquote(s) + +always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-') +_safe_quoters= {} + +class Quoter: + def __init__(self, safe): + self.cache = {} + self.safe = safe + always_safe + + def __call__(self, c): + try: + return self.cache[c] + except KeyError: + if ord(c) < 256: + res = (c in self.safe) and c or ('%%%02X' % ord(c)) + self.cache[c] = res + return res + else: + return "".join(['%%%02X' % i for i in c.encode("utf-8")]) + +def quote(s, safe = '/'): + """quote('abc def') -> 'abc%20def' + + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + By default, the quote function is intended for quoting the path + section of a URL. Thus, it will not encode '/'. 
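Taken together, urlsplit/urlparse, their urlunsplit/urlunparse inverses, urljoin and urldefrag cover the round trip from URL string to components and back. A short sketch of the values the parsing logic above yields (example.com is a placeholder host):

    from urllib.parse import urlparse, urljoin, urldefrag

    parts = urlparse('http://www.example.com/a/b;style?x=1#frag')
    # ParseResult(scheme='http', netloc='www.example.com', path='/a/b',
    #             params='style', query='x=1', fragment='frag')
    print(parts.hostname, parts.port)   # www.example.com None
    print(parts.geturl())               # round-trips to the original URL

    print(urljoin('http://www.example.com/a/b/c', '../d'))
    # http://www.example.com/a/d
    print(urldefrag('http://www.example.com/a#frag'))
    # ('http://www.example.com/a', 'frag')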
This character + is reserved, but in typical usage the quote function is being + called on a path where the existing slash characters are used as + reserved characters. + """ + cachekey = (safe, always_safe) + try: + quoter = _safe_quoters[cachekey] + except KeyError: + quoter = Quoter(safe) + _safe_quoters[cachekey] = quoter + res = map(quoter, s) + return ''.join(res) + +def quote_plus(s, safe = ''): + """Quote the query fragment of a URL; replacing ' ' with '+'""" + if ' ' in s: + s = quote(s, safe + ' ') + return s.replace(' ', '+') + return quote(s, safe) + +def urlencode(query,doseq=0): + """Encode a sequence of two-element tuples or dictionary into a URL query string. + + If any values in the query arg are sequences and doseq is true, each + sequence element is converted to a separate parameter. + + If the query arg is a sequence of two-element tuples, the order of the + parameters in the output will match the order of parameters in the + input. + """ + + if hasattr(query,"items"): + # mapping objects + query = query.items() + else: + # it's a bother at times that strings and string-like objects are + # sequences... + try: + # non-sequence items should not work with len() + # non-empty strings will fail this + if len(query) and not isinstance(query[0], tuple): + raise TypeError + # zero-length sequences of all types will get here and succeed, + # but that's a minor nit - since the original implementation + # allowed empty dicts that type of behavior probably should be + # preserved for consistency + except TypeError: + ty,va,tb = sys.exc_info() + raise TypeError("not a valid non-string sequence or mapping object").with_traceback(tb) + + l = [] + if not doseq: + # preserve old behavior + for k, v in query: + k = quote_plus(str(k)) + v = quote_plus(str(v)) + l.append(k + '=' + v) + else: + for k, v in query: + k = quote_plus(str(k)) + if isinstance(v, str): + v = quote_plus(v) + l.append(k + '=' + v) + elif isinstance(v, str): + # is there a reasonable way to convert to ASCII? + # encode generates a string, but "replace" or "ignore" + # lose information and "strict" can raise UnicodeError + v = quote_plus(v.encode("ASCII","replace")) + l.append(k + '=' + v) + else: + try: + # is this a sufficient test for sequence-ness? + x = len(v) + except TypeError: + # not a sequence + v = quote_plus(str(v)) + l.append(k + '=' + v) + else: + # loop over the sequence + for elt in v: + l.append(k + '=' + quote_plus(str(elt))) + return '&'.join(l) + +# Utilities to parse URLs (most of these return None for missing parts): +# unwrap('') --> 'type://host/path' +# splittype('type:opaquestring') --> 'type', 'opaquestring' +# splithost('//host[:port]/path') --> 'host[:port]', '/path' +# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]' +# splitpasswd('user:passwd') -> 'user', 'passwd' +# splitport('host:port') --> 'host', 'port' +# splitquery('/path?query') --> '/path', 'query' +# splittag('/path#tag') --> '/path', 'tag' +# splitattr('/path;attr1=value1;attr2=value2;...') -> +# '/path', ['attr1=value1', 'attr2=value2', ...] +# splitvalue('attr=value') --> 'attr', 'value' +# urllib.parse.unquote('abc%20def') -> 'abc def' +# quote('abc def') -> 'abc%20def') + +def toBytes(url): + """toBytes(u"URL") --> 'URL'.""" + # Most URL schemes require ASCII. If that changes, the conversion + # can be relaxed. 
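quote()/quote_plus() and their unquote counterparts, plus urlencode(), are the escaping half of the module. A few illustrative calls with the output this particular implementation produces (note that '~' is not in always_safe here, so it gets escaped):

    from urllib.parse import quote, quote_plus, unquote, unquote_plus, urlencode

    print(quote('/~user/file name.txt'))   # /%7Euser/file%20name.txt  ('/' stays, per safe='/')
    print(quote_plus('a b&c'))             # a+b%26c
    print(unquote('abc%20def'))            # abc def
    print(unquote_plus('%7e/abc+def'))     # ~/abc def

    print(urlencode({'q': 'spam eggs'}))          # q=spam+eggs
    print(urlencode([('k', [1, 2])], doseq=1))    # k=1&k=2  (one parameter per sequence element)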
+ # XXX get rid of toBytes() + if isinstance(url, str): + try: + url = url.encode("ASCII").decode() + except UnicodeError: + raise UnicodeError("URL " + repr(url) + + " contains non-ASCII characters") + return url + +def unwrap(url): + """unwrap('') --> 'type://host/path'.""" + url = str(url).strip() + if url[:1] == '<' and url[-1:] == '>': + url = url[1:-1].strip() + if url[:4] == 'URL:': url = url[4:].strip() + return url + +_typeprog = None +def splittype(url): + """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" + global _typeprog + if _typeprog is None: + import re + _typeprog = re.compile('^([^/:]+):') + + match = _typeprog.match(url) + if match: + scheme = match.group(1) + return scheme.lower(), url[len(scheme) + 1:] + return None, url + +_hostprog = None +def splithost(url): + """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" + global _hostprog + if _hostprog is None: + import re + _hostprog = re.compile('^//([^/?]*)(.*)$') + + match = _hostprog.match(url) + if match: return match.group(1, 2) + return None, url + +_userprog = None +def splituser(host): + """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'.""" + global _userprog + if _userprog is None: + import re + _userprog = re.compile('^(.*)@(.*)$') + + match = _userprog.match(host) + if match: return map(unquote, match.group(1, 2)) + return None, host + +_passwdprog = None +def splitpasswd(user): + """splitpasswd('user:passwd') -> 'user', 'passwd'.""" + global _passwdprog + if _passwdprog is None: + import re + _passwdprog = re.compile('^([^:]*):(.*)$') + + match = _passwdprog.match(user) + if match: return match.group(1, 2) + return user, None + +# splittag('/path#tag') --> '/path', 'tag' +_portprog = None +def splitport(host): + """splitport('host:port') --> 'host', 'port'.""" + global _portprog + if _portprog is None: + import re + _portprog = re.compile('^(.*):([0-9]+)$') + + match = _portprog.match(host) + if match: return match.group(1, 2) + return host, None + +_nportprog = None +def splitnport(host, defport=-1): + """Split host and port, returning numeric port. + Return given default port if no ':' found; defaults to -1. + Return numerical port if a valid number are found after ':'. 
+ Return None if ':' but not a valid number.""" + global _nportprog + if _nportprog is None: + import re + _nportprog = re.compile('^(.*):(.*)$') + + match = _nportprog.match(host) + if match: + host, port = match.group(1, 2) + try: + if not port: raise ValueError("no digits") + nport = int(port) + except ValueError: + nport = None + return host, nport + return host, defport + +_queryprog = None +def splitquery(url): + """splitquery('/path?query') --> '/path', 'query'.""" + global _queryprog + if _queryprog is None: + import re + _queryprog = re.compile('^(.*)\?([^?]*)$') + + match = _queryprog.match(url) + if match: return match.group(1, 2) + return url, None + +_tagprog = None +def splittag(url): + """splittag('/path#tag') --> '/path', 'tag'.""" + global _tagprog + if _tagprog is None: + import re + _tagprog = re.compile('^(.*)#([^#]*)$') + + match = _tagprog.match(url) + if match: return match.group(1, 2) + return url, None + +def splitattr(url): + """splitattr('/path;attr1=value1;attr2=value2;...') -> + '/path', ['attr1=value1', 'attr2=value2', ...].""" + words = url.split(';') + return words[0], words[1:] + +_valueprog = None +def splitvalue(attr): + """splitvalue('attr=value') --> 'attr', 'value'.""" + global _valueprog + if _valueprog is None: + import re + _valueprog = re.compile('^([^=]*)=(.*)$') + + match = _valueprog.match(attr) + if match: return match.group(1, 2) + return attr, None + +test_input = """ + http://a/b/c/d + + g:h = + http:g = + http: = + g = + ./g = + g/ = + /g = + //g = + ?y = + g?y = + g?y/./x = + . = + ./ = + .. = + ../ = + ../g = + ../.. = + ../../g = + ../../../g = + ./../g = + ./g/. = + /./g = + g/./h = + g/../h = + http:g = + http: = + http:?y = + http:g?y = + http:g?y/./x = +""" + +def test(): + import sys + base = '' + if sys.argv[1:]: + fn = sys.argv[1] + if fn == '-': + fp = sys.stdin + else: + fp = open(fn) + else: + from io import StringIO + fp = StringIO(test_input) + for line in fp: + words = line.split() + if not words: + continue + url = words[0] + parts = urlparse(url) + print('%-10s : %s' % (url, parts)) + abs = urljoin(base, url) + if not base: + base = abs + wrapped = '' % abs + print('%-10s = %s' % (url, wrapped)) + if len(words) == 3 and words[1] == '=': + if wrapped != words[2]: + print('EXPECTED', words[2], '!!!!!!!!!!') + +if __name__ == '__main__': + test() diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py new file mode 100644 index 0000000..cd4729a --- /dev/null +++ b/Lib/urllib/request.py @@ -0,0 +1,2295 @@ +# Issues in merging urllib and urllib2: +# 1. They both define a function named urlopen() + +"""An extensible library for opening URLs using a variety of protocols + +The simplest way to use this module is to call the urlopen function, +which accepts a string containing a URL or a Request object (described +below). It opens the URL and returns the results as file-like +object; the returned object has some extra methods described below. + +The OpenerDirector manages a collection of Handler objects that do +all the actual work. Each Handler implements a particular protocol or +option. The OpenerDirector is a composite object that invokes the +Handlers needed to open the requested URL. For example, the +HTTPHandler performs HTTP GET and POST requests and deals with +non-error returns. The HTTPRedirectHandler automatically deals with +HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler +deals with digest authentication. + +urlopen(url, data=None) -- Basic usage is the same as original +urllib. 
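Before the handler machinery is defined, it helps to sketch the basic call this docstring is describing: urlopen() under the new package names, with HTTP failures caught via the urllib.error classes introduced earlier in this patch. The URL is just an example, and the header access via info() assumes the response headers keep their mapping interface:

    import urllib.request
    import urllib.error

    try:
        f = urllib.request.urlopen('http://www.python.org/')
    except urllib.error.HTTPError as e:
        # HTTPError doubles as a response object: it carries a code, headers and a body.
        print('server error:', e.code, e.msg)
    except urllib.error.URLError as e:
        print('failed to reach the server:', e.reason)
    else:
        print(f.geturl())                     # final URL, after any redirects
        print(f.info().get('Content-Type'))   # response headers
        data = f.read()
        f.close()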
pass the url and optionally data to post to an HTTP URL, and +get a file-like object back. One difference is that you can also pass +a Request instance instead of URL. Raises a URLError (subclass of +IOError); for HTTP errors, raises an HTTPError, which can also be +treated as a valid response. + +build_opener -- Function that creates a new OpenerDirector instance. +Will install the default handlers. Accepts one or more Handlers as +arguments, either instances or Handler classes that it will +instantiate. If one of the argument is a subclass of the default +handler, the argument will be installed instead of the default. + +install_opener -- Installs a new opener as the default opener. + +objects of interest: +OpenerDirector -- + +Request -- An object that encapsulates the state of a request. The +state can be as simple as the URL. It can also include extra HTTP +headers, e.g. a User-Agent. + +BaseHandler -- + +internals: +BaseHandler and parent +_call_chain conventions + +Example usage: + +import urllib2 + +# set up authentication info +authinfo = urllib2.HTTPBasicAuthHandler() +authinfo.add_password(realm='PDQ Application', + uri='https://mahler:8092/site-updates.py', + user='klem', + passwd='geheim$parole') + +proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) + +# build a new opener that adds authentication and caching FTP handlers +opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) + +# install it +urllib2.install_opener(opener) + +f = urllib2.urlopen('http://www.python.org/') +""" + +# XXX issues: +# If an authentication error handler that tries to perform +# authentication for some reason but fails, how should the error be +# signalled? The client needs to know the HTTP error code. But if +# the handler knows that the problem was, e.g., that it didn't know +# that hash algo that requested in the challenge, it would be good to +# pass that information along to the client, too. +# ftp errors aren't handled cleanly +# check digest against correct (i.e. non-apache) implementation + +# Possible extensions: +# complex proxies XXX not sure what exactly was meant by this +# abstract factory for opener + +import base64 +import email +import hashlib +import http.client +import io +import os +import posixpath +import random +import re +import socket +import sys +import time +import urllib.parse, urllib.error, urllib.response +import bisect + +from io import StringIO + +# check for SSL +try: + import ssl +except: + _have_ssl = False +else: + _have_ssl = True +assert _have_ssl + +# used in User-Agent header sent +__version__ = sys.version[:3] + +_opener = None +def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): + global _opener + if _opener is None: + _opener = build_opener() + return _opener.open(url, data, timeout) + +def install_opener(opener): + global _opener + _opener = opener + +# TODO(jhylton): Make this work with the same global opener. +_urlopener = None +def urlretrieve(url, filename=None, reporthook=None, data=None): + global _urlopener + if not _urlopener: + _urlopener = FancyURLopener() + return _urlopener.retrieve(url, filename, reporthook, data) + +def urlcleanup(): + if _urlopener: + _urlopener.cleanup() + global _opener + if _opener: + _opener = None + +# copied from cookielib.py +_cut_port_re = re.compile(r":\d+$") +def request_host(request): + """Return request-host, as defined by RFC 2965. + + Variation from RFC: returned value is lowercased, for convenient + comparison. 
+ + """ + url = request.get_full_url() + host = urllib.parse.urlparse(url)[1] + if host == "": + host = request.get_header("Host", "") + + # remove port, if present + host = _cut_port_re.sub("", host, 1) + return host.lower() + +class Request: + + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False): + # unwrap('') --> 'type://host/path' + self.__original = urllib.parse.unwrap(url) + self.type = None + # self.__r_type is what's left after doing the splittype + self.host = None + self.port = None + self.data = data + self.headers = {} + for key, value in headers.items(): + self.add_header(key, value) + self.unredirected_hdrs = {} + if origin_req_host is None: + origin_req_host = request_host(self) + self.origin_req_host = origin_req_host + self.unverifiable = unverifiable + + def __getattr__(self, attr): + # XXX this is a fallback mechanism to guard against these + # methods getting called in a non-standard order. this may be + # too complicated and/or unnecessary. + # XXX should the __r_XXX attributes be public? + if attr[:12] == '_Request__r_': + name = attr[12:] + if hasattr(Request, 'get_' + name): + getattr(self, 'get_' + name)() + return getattr(self, attr) + raise AttributeError(attr) + + def get_method(self): + if self.has_data(): + return "POST" + else: + return "GET" + + # XXX these helper methods are lame + + def add_data(self, data): + self.data = data + + def has_data(self): + return self.data is not None + + def get_data(self): + return self.data + + def get_full_url(self): + return self.__original + + def get_type(self): + if self.type is None: + self.type, self.__r_type = urllib.parse.splittype(self.__original) + if self.type is None: + raise ValueError("unknown url type: %s" % self.__original) + return self.type + + def get_host(self): + if self.host is None: + self.host, self.__r_host = urllib.parse.splithost(self.__r_type) + if self.host: + self.host = urllib.parse.unquote(self.host) + return self.host + + def get_selector(self): + return self.__r_host + + def set_proxy(self, host, type): + self.host, self.type = host, type + self.__r_host = self.__original + + def get_origin_req_host(self): + return self.origin_req_host + + def is_unverifiable(self): + return self.unverifiable + + def add_header(self, key, val): + # useful for something like authentication + self.headers[key.capitalize()] = val + + def add_unredirected_header(self, key, val): + # will not be added to a redirected request + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + return (header_name in self.headers or + header_name in self.unredirected_hdrs) + + def get_header(self, header_name, default=None): + return self.headers.get( + header_name, + self.unredirected_hdrs.get(header_name, default)) + + def header_items(self): + hdrs = self.unredirected_hdrs.copy() + hdrs.update(self.headers) + return list(hdrs.items()) + +class OpenerDirector: + def __init__(self): + client_version = "Python-urllib/%s" % __version__ + self.addheaders = [('User-agent', client_version)] + # manage the individual handlers + self.handlers = [] + self.handle_open = {} + self.handle_error = {} + self.process_response = {} + self.process_request = {} + + def add_handler(self, handler): + if not hasattr(handler, "add_parent"): + raise TypeError("expected BaseHandler instance, got %r" % + type(handler)) + + added = False + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + i = 
meth.find("_") + protocol = meth[:i] + condition = meth[i+1:] + + if condition.startswith("error"): + j = condition.find("_") + i + 1 + kind = meth[j+1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = self.handle_error.get(protocol, {}) + self.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = self.handle_open + elif condition == "response": + kind = protocol + lookup = self.process_response + elif condition == "request": + kind = protocol + lookup = self.process_request + else: + continue + + handlers = lookup.setdefault(kind, []) + if handlers: + bisect.insort(handlers, handler) + else: + handlers.append(handler) + added = True + + if added: + # the handlers must work in an specific order, the order + # is specified in a Handler attribute + bisect.insort(self.handlers, handler) + handler.add_parent(self) + + def close(self): + # Only exists for backwards compatibility. + pass + + def _call_chain(self, chain, kind, meth_name, *args): + # Handlers raise an exception if no one else should try to handle + # the request, or return None if they can't but another handler + # could. Otherwise, they return the response. + handlers = chain.get(kind, ()) + for handler in handlers: + func = getattr(handler, meth_name) + + result = func(*args) + if result is not None: + return result + + def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): + # accept a URL or a Request object + if isinstance(fullurl, str): + req = Request(fullurl, data) + else: + req = fullurl + if data is not None: + req.add_data(data) + + req.timeout = timeout + protocol = req.get_type() + + # pre-process request + meth_name = protocol+"_request" + for processor in self.process_request.get(protocol, []): + meth = getattr(processor, meth_name) + req = meth(req) + + response = self._open(req, data) + + # post-process response + meth_name = protocol+"_response" + for processor in self.process_response.get(protocol, []): + meth = getattr(processor, meth_name) + response = meth(req, response) + + return response + + def _open(self, req, data=None): + result = self._call_chain(self.handle_open, 'default', + 'default_open', req) + if result: + return result + + protocol = req.get_type() + result = self._call_chain(self.handle_open, protocol, protocol + + '_open', req) + if result: + return result + + return self._call_chain(self.handle_open, 'unknown', + 'unknown_open', req) + + def error(self, proto, *args): + if proto in ('http', 'https'): + # XXX http[s] protocols are special-cased + dict = self.handle_error['http'] # https is not different than http + proto = args[2] # YUCK! + meth_name = 'http_error_%s' % proto + http_err = 1 + orig_args = args + else: + dict = self.handle_error + meth_name = proto + '_error' + http_err = 0 + args = (dict, proto, meth_name) + args + result = self._call_chain(*args) + if result: + return result + + if http_err: + args = (dict, 'default', 'http_error_default') + orig_args + return self._call_chain(*args) + +# XXX probably also want an abstract factory that knows when it makes +# sense to skip a superclass in favor of a subclass and when it might +# make sense to include both + +def build_opener(*handlers): + """Create an opener object from a list of handlers. + + The opener will use several default handlers, including support + for HTTP and FTP. + + If any of the handlers passed as arguments are subclasses of the + default handlers, the default handlers will not be used. 
+ """ + def isclass(obj): + return isinstance(obj, type) or hasattr(obj, "__bases__") + + opener = OpenerDirector() + default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, + HTTPDefaultErrorHandler, HTTPRedirectHandler, + FTPHandler, FileHandler, HTTPErrorProcessor] + if hasattr(http.client, "HTTPSConnection"): + default_classes.append(HTTPSHandler) + else: + import pdb; pdb.set_trace() + skip = set() + for klass in default_classes: + for check in handlers: + if isclass(check): + if issubclass(check, klass): + skip.add(klass) + elif isinstance(check, klass): + skip.add(klass) + for klass in skip: + default_classes.remove(klass) + + for klass in default_classes: + opener.add_handler(klass()) + + for h in handlers: + if isclass(h): + h = h() + opener.add_handler(h) + return opener + +class BaseHandler: + handler_order = 500 + + def add_parent(self, parent): + self.parent = parent + + def close(self): + # Only exists for backwards compatibility + pass + + def __lt__(self, other): + if not hasattr(other, "handler_order"): + # Try to preserve the old behavior of having custom classes + # inserted after default ones (works only for custom user + # classes which are not aware of handler_order). + return True + return self.handler_order < other.handler_order + + +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses.""" + handler_order = 1000 # after all other processing + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + # According to RFC 2616, "2xx" code indicates that the client's + # request was successfully received, understood, and accepted. + if not (200 <= code < 300): + response = self.parent.error( + 'http', request, response, code, msg, hdrs) + + return response + + https_response = http_response + +class HTTPDefaultErrorHandler(BaseHandler): + def http_error_default(self, req, fp, code, msg, hdrs): + raise urllib.error.HTTPError(req.get_full_url(), code, msg, hdrs, fp) + +class HTTPRedirectHandler(BaseHandler): + # maximum number of redirections to any single URL + # this is needed because of the state that cookies introduce + max_repeats = 4 + # maximum total number of redirections (regardless of URL) before + # assuming we're in a loop + max_redirections = 10 + + def redirect_request(self, req, fp, code, msg, headers, newurl): + """Return a Request or None in response to a redirect. + + This is called by the http_error_30x methods when a + redirection response is received. If a redirection should + take place, return a new Request to allow http_error_30x to + perform the redirect. Otherwise, raise HTTPError if no-one + else should try to handle this url. Return None if you can't + but another Handler might. + """ + m = req.get_method() + if (not (code in (301, 302, 303, 307) and m in ("GET", "HEAD") + or code in (301, 302, 303) and m == "POST")): + raise urllib.error.HTTPError(req.get_full_url(), + code, msg, headers, fp) + + # Strictly (according to RFC 2616), 301 or 302 in response to + # a POST MUST NOT cause a redirection without confirmation + # from the user (of urllib2, in this case). In practice, + # essentially all clients do redirect in this case, so we do + # the same. 
+ # be conciliant with URIs containing a space + newurl = newurl.replace(' ', '%20') + CONTENT_HEADERS = ("content-length", "content-type") + newheaders = dict((k, v) for k, v in req.headers.items() + if k.lower() not in CONTENT_HEADERS) + return Request(newurl, + headers=newheaders, + origin_req_host=req.get_origin_req_host(), + unverifiable=True) + + # Implementation note: To avoid the server sending us into an + # infinite loop, the request object needs to track what URLs we + # have already seen. Do this by adding a handler-specific + # attribute to the Request object. + def http_error_302(self, req, fp, code, msg, headers): + # Some servers (incorrectly) return multiple Location headers + # (so probably same goes for URI). Use first header. + if "location" in headers: + newurl = headers["location"] + elif "uri" in headers: + newurl = headers["uri"] + else: + return + newurl = urllib.parse.urljoin(req.get_full_url(), newurl) + + # XXX Probably want to forget about the state of the current + # request, although that might interact poorly with other + # handlers that also use handler-specific request attributes + new = self.redirect_request(req, fp, code, msg, headers, newurl) + if new is None: + return + + # loop detection + # .redirect_dict has a key url if url was previously visited. + if hasattr(req, 'redirect_dict'): + visited = new.redirect_dict = req.redirect_dict + if (visited.get(newurl, 0) >= self.max_repeats or + len(visited) >= self.max_redirections): + raise urllib.error.HTTPError(req.get_full_url(), code, + self.inf_msg + msg, headers, fp) + else: + visited = new.redirect_dict = req.redirect_dict = {} + visited[newurl] = visited.get(newurl, 0) + 1 + + # Don't close the fp until we are sure that we won't use it + # with HTTPError. + fp.read() + fp.close() + + return self.parent.open(new) + + http_error_301 = http_error_303 = http_error_307 = http_error_302 + + inf_msg = "The HTTP server returned a redirect error that would " \ + "lead to an infinite loop.\n" \ + "The last 30x error message was:\n" + + +def _parse_proxy(proxy): + """Return (scheme, user, password, host/port) given a URL or an authority. + + If a URL is supplied, it must have an authority (host:port) component. + According to RFC 3986, having an authority component means the URL must + have two slashes after the scheme: + + >>> _parse_proxy('file:/ftp.example.com/') + Traceback (most recent call last): + ValueError: proxy URL with no authority: 'file:/ftp.example.com/' + + The first three items of the returned tuple may be None. 
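The loop detection in HTTPRedirectHandler.http_error_302() above comes down to a little per-request bookkeeping. The stand-alone sketch below (not from the patch; it raises a plain RuntimeError rather than HTTPError to stay self-contained) shows the idea with invented URLs.

max_repeats, max_redirections = 4, 10     # same limits as the handler above
visited = {}                              # plays the role of req.redirect_dict

def follow(newurl):
    if (visited.get(newurl, 0) >= max_repeats
            or len(visited) >= max_redirections):
        raise RuntimeError("redirect loop detected at %s" % newurl)
    visited[newurl] = visited.get(newurl, 0) + 1

for hop in ("http://a.example/", "http://b.example/", "http://a.example/"):
    follow(hop)                           # ok: no URL has repeated 4 times yet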
+ + Examples of authority parsing: + + >>> _parse_proxy('proxy.example.com') + (None, None, None, 'proxy.example.com') + >>> _parse_proxy('proxy.example.com:3128') + (None, None, None, 'proxy.example.com:3128') + + The authority component may optionally include userinfo (assumed to be + username:password): + + >>> _parse_proxy('joe:password@proxy.example.com') + (None, 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('joe:password@proxy.example.com:3128') + (None, 'joe', 'password', 'proxy.example.com:3128') + + Same examples, but with URLs instead: + + >>> _parse_proxy('http://proxy.example.com/') + ('http', None, None, 'proxy.example.com') + >>> _parse_proxy('http://proxy.example.com:3128/') + ('http', None, None, 'proxy.example.com:3128') + >>> _parse_proxy('http://joe:password@proxy.example.com/') + ('http', 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('http://joe:password@proxy.example.com:3128') + ('http', 'joe', 'password', 'proxy.example.com:3128') + + Everything after the authority is ignored: + + >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') + ('ftp', 'joe', 'password', 'proxy.example.com') + + Test for no trailing '/' case: + + >>> _parse_proxy('http://joe:password@proxy.example.com') + ('http', 'joe', 'password', 'proxy.example.com') + + """ + scheme, r_scheme = urllib.parse.splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. + # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = urllib.parse.splituser(authority) + if userinfo is not None: + user, password = urllib.parse.splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + +class ProxyHandler(BaseHandler): + # Proxies must be in front + handler_order = 100 + + def __init__(self, proxies=None): + if proxies is None: + proxies = getproxies() + assert hasattr(proxies, 'keys'), "proxies must be a mapping" + self.proxies = proxies + for type, url in proxies.items(): + setattr(self, '%s_open' % type, + lambda r, proxy=url, type=type, meth=self.proxy_open: \ + meth(r, proxy, type)) + + def proxy_open(self, req, proxy, type): + orig_type = req.get_type() + proxy_type, user, password, hostport = _parse_proxy(proxy) + if proxy_type is None: + proxy_type = orig_type + if user and password: + user_pass = '%s:%s' % (unquote(user), + urllib.parse.unquote(password)) + creds = base64.b64encode(user_pass.encode()).decode("ascii") + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = urllib.parse.unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type: + # let other handlers take care of it + return None + else: + # need to start over, because the other handlers don't + # grok the proxy's URL type + # e.g. 
if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a + return self.parent.open(req) + +class HTTPPasswordMgr: + + def __init__(self): + self.passwd = {} + + def add_password(self, realm, uri, user, passwd): + # uri could be a single URI or a sequence + if isinstance(uri, str): + uri = [uri] + if not realm in self.passwd: + self.passwd[realm] = {} + for default_port in True, False: + reduced_uri = tuple( + [self.reduce_uri(u, default_port) for u in uri]) + self.passwd[realm][reduced_uri] = (user, passwd) + + def find_user_password(self, realm, authuri): + domains = self.passwd.get(realm, {}) + for default_port in True, False: + reduced_authuri = self.reduce_uri(authuri, default_port) + for uris, authinfo in domains.items(): + for uri in uris: + if self.is_suburi(uri, reduced_authuri): + return authinfo + return None, None + + def reduce_uri(self, uri, default_port=True): + """Accept authority or URI and extract only the authority and path.""" + # note HTTP URLs do not have a userinfo component + parts = urllib.parse.urlsplit(uri) + if parts[1]: + # URI + scheme = parts[0] + authority = parts[1] + path = parts[2] or '/' + else: + # host or host:port + scheme = None + authority = uri + path = '/' + host, port = urllib.parse.splitport(authority) + if default_port and port is None and scheme is not None: + dport = {"http": 80, + "https": 443, + }.get(scheme) + if dport is not None: + authority = "%s:%d" % (host, dport) + return authority, path + + def is_suburi(self, base, test): + """Check if test is below base in a URI tree + + Both args must be URIs in reduced form. + """ + if base == test: + return True + if base[0] != test[0]: + return False + common = posixpath.commonprefix((base[1], test[1])) + if len(common) == len(base[1]): + return True + return False + + +class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): + + def find_user_password(self, realm, authuri): + user, password = HTTPPasswordMgr.find_user_password(self, realm, + authuri) + if user is not None: + return user, password + return HTTPPasswordMgr.find_user_password(self, None, authuri) + + +class AbstractBasicAuthHandler: + + # XXX this allows for multiple auth-schemes, but will stupidly pick + # the last one with a realm specified. + + # allow for double- and single-quoted realm values + # (single quotes are a violation of the RFC, but appear in the wild) + rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' + 'realm=(["\'])(.*?)\\2', re.I) + + # XXX could pre-emptively send auth info already accepted (RFC 2617, + # end of section 2, and section 1.2 immediately after "credentials" + # production). 
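As a rough illustration (not part of the patch) of how HTTPPasswordMgr above matches credentials: passwords are stored per realm against reduced (authority, path) pairs, and find_user_password() compares the stored URIs with is_suburi(). The realm, URL and credentials below are invented.

import urllib.request

mgr = urllib.request.HTTPPasswordMgr()
mgr.add_password("PDQ Application", "http://example.com/private/",
                 "klem", "geheim$parole")

# A request URI below the stored prefix matches ...
print(mgr.find_user_password("PDQ Application",
                             "http://example.com/private/report"))
# -> ('klem', 'geheim$parole')

# ... a different realm does not.
print(mgr.find_user_password("Other Realm",
                             "http://example.com/private/report"))
# -> (None, None)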
+ + def __init__(self, password_mgr=None): + if password_mgr is None: + password_mgr = HTTPPasswordMgr() + self.passwd = password_mgr + self.add_password = self.passwd.add_password + + def http_error_auth_reqed(self, authreq, host, req, headers): + # host may be an authority (without userinfo) or a URL with an + # authority + # XXX could be multiple headers + authreq = headers.get(authreq, None) + if authreq: + mo = AbstractBasicAuthHandler.rx.search(authreq) + if mo: + scheme, quote, realm = mo.groups() + if scheme.lower() == 'basic': + return self.retry_http_basic_auth(host, req, realm) + + def retry_http_basic_auth(self, host, req, realm): + user, pw = self.passwd.find_user_password(realm, host) + if pw is not None: + raw = "%s:%s" % (user, pw) + auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii") + if req.headers.get(self.auth_header, None) == auth: + return None + req.add_header(self.auth_header, auth) + return self.parent.open(req) + else: + return None + + +class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Authorization' + + def http_error_401(self, req, fp, code, msg, headers): + url = req.get_full_url() + return self.http_error_auth_reqed('www-authenticate', + url, req, headers) + + +class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): + + auth_header = 'Proxy-authorization' + + def http_error_407(self, req, fp, code, msg, headers): + # http_error_auth_reqed requires that there is no userinfo component in + # authority. Assume there isn't one, since urllib2 does not (and + # should not, RFC 3986 s. 3.2.1) support requests for URLs containing + # userinfo. + authority = req.get_host() + return self.http_error_auth_reqed('proxy-authenticate', + authority, req, headers) + + +def randombytes(n): + """Return n random bytes.""" + return os.urandom(n) + +class AbstractDigestAuthHandler: + # Digest authentication is specified in RFC 2617. + + # XXX The client does not inspect the Authentication-Info header + # in a successful response. + + # XXX It should be possible to test this implementation against + # a mock server that just generates a static set of challenges. + + # XXX qop="auth-int" supports is shaky + + def __init__(self, passwd=None): + if passwd is None: + passwd = HTTPPasswordMgr() + self.passwd = passwd + self.add_password = self.passwd.add_password + self.retried = 0 + self.nonce_count = 0 + + def reset_retry_count(self): + self.retried = 0 + + def http_error_auth_reqed(self, auth_header, host, req, headers): + authreq = headers.get(auth_header, None) + if self.retried > 5: + # Don't fail endlessly - if we failed once, we'll probably + # fail a second time. Hm. Unless the Password Manager is + # prompting for the information. Crap. 
This isn't great + # but it's better than the current 'repeat until recursion + # depth exceeded' approach + raise urllib.error.HTTPError(req.get_full_url(), 401, + "digest auth failed", + headers, None) + else: + self.retried += 1 + if authreq: + scheme = authreq.split()[0] + if scheme.lower() == 'digest': + return self.retry_http_digest_auth(req, authreq) + + def retry_http_digest_auth(self, req, auth): + token, challenge = auth.split(' ', 1) + chal = parse_keqv_list(filter(None, parse_http_list(challenge))) + auth = self.get_authorization(req, chal) + if auth: + auth_val = 'Digest %s' % auth + if req.headers.get(self.auth_header, None) == auth_val: + return None + req.add_unredirected_header(self.auth_header, auth_val) + resp = self.parent.open(req) + return resp + + def get_cnonce(self, nonce): + # The cnonce-value is an opaque + # quoted string value provided by the client and used by both client + # and server to avoid chosen plaintext attacks, to provide mutual + # authentication, and to provide some message integrity protection. + # This isn't a fabulous effort, but it's probably Good Enough. + s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) + b = s.encode("ascii") + randombytes(8) + dig = hashlib.sha1(b).hexdigest() + return dig[:16] + + def get_authorization(self, req, chal): + try: + realm = chal['realm'] + nonce = chal['nonce'] + qop = chal.get('qop') + algorithm = chal.get('algorithm', 'MD5') + # mod_digest doesn't send an opaque, even though it isn't + # supposed to be optional + opaque = chal.get('opaque', None) + except KeyError: + return None + + H, KD = self.get_algorithm_impls(algorithm) + if H is None: + return None + + user, pw = self.passwd.find_user_password(realm, req.get_full_url()) + if user is None: + return None + + # XXX not implemented yet + if req.has_data(): + entdig = self.get_entity_digest(req.get_data(), chal) + else: + entdig = None + + A1 = "%s:%s:%s" % (user, realm, pw) + A2 = "%s:%s" % (req.get_method(), + # XXX selector: what about proxies and full urls + req.get_selector()) + if qop == 'auth': + self.nonce_count += 1 + ncvalue = '%08x' % self.nonce_count + cnonce = self.get_cnonce(nonce) + noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) + respdig = KD(H(A1), noncebit) + elif qop is None: + respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) + else: + # XXX handle auth-int. + raise urllib.error.URLError("qop '%s' is not supported." % qop) + + # XXX should the partial digests be encoded too? + + base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ + 'response="%s"' % (user, realm, nonce, req.get_selector(), + respdig) + if opaque: + base += ', opaque="%s"' % opaque + if entdig: + base += ', digest="%s"' % entdig + base += ', algorithm="%s"' % algorithm + if qop: + base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) + return base + + def get_algorithm_impls(self, algorithm): + # lambdas assume digest modules are imported at the top level + if algorithm == 'MD5': + H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() + elif algorithm == 'SHA': + H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() + # XXX MD5-sess + KD = lambda s, d: H("%s:%s" % (s, d)) + return H, KD + + def get_entity_digest(self, data, chal): + # XXX not implemented yet + return None + + +class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + """An authentication protocol defined by RFC 2069 + + Digest authentication improves on basic authentication because it + does not transmit passwords in the clear. 
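A worked sketch (not from the patch) of the arithmetic get_authorization() above performs for the simple case with no qop directive; all challenge values are invented.

import hashlib

H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest()
KD = lambda s, d: H("%s:%s" % (s, d))

user, realm, password = "klem", "example realm", "geheim$parole"
nonce, method, selector = "dcd98b7102dd2f0e", "GET", "/dir/index.html"

A1 = "%s:%s:%s" % (user, realm, password)
A2 = "%s:%s" % (method, selector)
respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))   # the response="..." value
print(respdig)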
+ """ + + auth_header = 'Authorization' + handler_order = 490 # before Basic auth + + def http_error_401(self, req, fp, code, msg, headers): + host = urllib.parse.urlparse(req.get_full_url())[1] + retry = self.http_error_auth_reqed('www-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + + +class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): + + auth_header = 'Proxy-Authorization' + handler_order = 490 # before Basic auth + + def http_error_407(self, req, fp, code, msg, headers): + host = req.get_host() + retry = self.http_error_auth_reqed('proxy-authenticate', + host, req, headers) + self.reset_retry_count() + return retry + +class AbstractHTTPHandler(BaseHandler): + + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def do_request_(self, request): + host = request.get_host() + if not host: + raise urllib.error.URLError('no host given') + + if request.has_data(): # POST + data = request.get_data() + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + if not request.has_header('Content-length'): + request.add_unredirected_header( + 'Content-length', '%d' % len(data)) + + scheme, sel = urllib.parse.splittype(request.get_selector()) + sel_host, sel_path = urllib.parse.splithost(sel) + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host or host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + + def do_open(self, http_class, req): + """Return an addinfourl object for the request, using http_class. + + http_class must implement the HTTPConnection API from http.client. + The addinfourl return value is a file-like object. It also + has methods and attributes including: + - info(): return a mimetools.Message object for the headers + - geturl(): return the original request URL + - code: HTTP status code + """ + host = req.get_host() + if not host: + raise urllib.error.URLError('no host given') + + h = http_class(host, timeout=req.timeout) # will parse host:port + headers = dict(req.headers) + headers.update(req.unredirected_hdrs) + + # TODO(jhylton): Should this be redesigned to handle + # persistent connections? + + # We want to make an HTTP/1.1 request, but the addinfourl + # class isn't prepared to deal with a persistent connection. + # It will try to read all remaining data from the socket, + # which will block while the server waits for the next request. + # So make sure the connection gets closed after the (only) + # request. + headers["Connection"] = "close" + headers = dict( + (name.title(), val) for name, val in headers.items()) + try: + h.request(req.get_method(), req.get_selector(), req.data, headers) + r = h.getresponse() + except socket.error as err: # XXX what error? 
+ raise urllib.error.URLError(err) + + resp = urllib.response.addinfourl(r.fp, r.msg, req.get_full_url()) + resp.code = r.status + resp.msg = r.reason + return resp + + +class HTTPHandler(AbstractHTTPHandler): + + def http_open(self, req): + return self.do_open(http.client.HTTPConnection, req) + + http_request = AbstractHTTPHandler.do_request_ + +if hasattr(http.client, 'HTTPSConnection'): + class HTTPSHandler(AbstractHTTPHandler): + + def https_open(self, req): + return self.do_open(http.client.HTTPSConnection, req) + + https_request = AbstractHTTPHandler.do_request_ + +class HTTPCookieProcessor(BaseHandler): + def __init__(self, cookiejar=None): + import http.cookiejar + if cookiejar is None: + cookiejar = http.cookiejar.CookieJar() + self.cookiejar = cookiejar + + def http_request(self, request): + self.cookiejar.add_cookie_header(request) + return request + + def http_response(self, request, response): + self.cookiejar.extract_cookies(response, request) + return response + + https_request = http_request + https_response = http_response + +class UnknownHandler(BaseHandler): + def unknown_open(self, req): + type = req.get_type() + raise urllib.error.URLError('unknown url type: %s' % type) + +def parse_keqv_list(l): + """Parse list of key=value strings where keys are not duplicated.""" + parsed = {} + for elt in l: + k, v = elt.split('=', 1) + if v[0] == '"' and v[-1] == '"': + v = v[1:-1] + parsed[k] = v + return parsed + +def parse_http_list(s): + """Parse lists as described by RFC 2068 Section 2. + + In particular, parse comma-separated lists where the elements of + the list may include quoted-strings. A quoted-string could + contain a comma. A non-quoted string could have quotes in the + middle. Neither commas nor quotes count if they are escaped. + Only double-quotes count, not single-quotes. 
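For example (not part of the patch), on an invented Digest challenge the two helpers work together the way retry_http_digest_auth() above uses them:

import urllib.request

challenge = 'realm="example realm", nonce="dcd98b7102dd2f0e", qop="auth", algorithm=MD5'
items = urllib.request.parse_http_list(challenge)
# -> ['realm="example realm"', 'nonce="dcd98b7102dd2f0e"',
#     'qop="auth"', 'algorithm=MD5']
chal = urllib.request.parse_keqv_list(items)
print(chal["realm"], chal["nonce"], chal["qop"])
# -> example realm dcd98b7102dd2f0e auth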
+ """ + res = [] + part = '' + + escape = quote = False + for cur in s: + if escape: + part += cur + escape = False + continue + if quote: + if cur == '\\': + escape = True + continue + elif cur == '"': + quote = False + part += cur + continue + + if cur == ',': + res.append(part) + part = '' + continue + + if cur == '"': + quote = True + + part += cur + + # append last part + if part: + res.append(part) + + return [part.strip() for part in res] + +class FileHandler(BaseHandler): + # Use local file or FTP depending on form of URL + def file_open(self, req): + url = req.get_selector() + if url[:2] == '//' and url[2:3] != '/': + req.type = 'ftp' + return self.parent.open(req) + else: + return self.open_local_file(req) + + # names for the localhost + names = None + def get_names(self): + if FileHandler.names is None: + try: + FileHandler.names = (socket.gethostbyname('localhost'), + socket.gethostbyname(socket.gethostname())) + except socket.gaierror: + FileHandler.names = (socket.gethostbyname('localhost'),) + return FileHandler.names + + # not entirely sure what the rules are here + def open_local_file(self, req): + import email.utils + import mimetypes + host = req.get_host() + file = req.get_selector() + localfile = url2pathname(file) + try: + stats = os.stat(localfile) + size = stats.st_size + modified = email.utils.formatdate(stats.st_mtime, usegmt=True) + mtype = mimetypes.guess_type(file)[0] + headers = email.message_from_string( + 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified)) + if host: + host, port = urllib.parse.splitport(host) + if not host or \ + (not port and _safe_gethostbyname(host) in self.get_names()): + return urllib.response.addinfourl(open(localfile, 'rb'), + headers, 'file:'+file) + except OSError as msg: + # urllib2 users shouldn't expect OSErrors coming from urlopen() + raise urllib.error.URLError(msg) + raise urllib.error.URLError('file not on local host') + +def _safe_gethostbyname(host): + try: + return socket.gethostbyname(host) + except socket.gaierror: + return None + +class FTPHandler(BaseHandler): + def ftp_open(self, req): + import ftplib + import mimetypes + host = req.get_host() + if not host: + raise urllib.error.URLError('ftp error: no host given') + host, port = urllib.parse.splitport(host) + if port is None: + port = ftplib.FTP_PORT + else: + port = int(port) + + # username/password handling + user, host = urllib.parse.splituser(host) + if user: + user, passwd = urllib.parse.splitpasswd(user) + else: + passwd = None + host = urllib.parse.unquote(host) + user = urllib.parse.unquote(user or '') + passwd = urllib.parse.unquote(passwd or '') + + try: + host = socket.gethostbyname(host) + except socket.error as msg: + raise urllib.error.URLError(msg) + path, attrs = urllib.parse.splitattr(req.get_selector()) + dirs = path.split('/') + dirs = list(map(urllib.parse.unquote, dirs)) + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: + dirs = dirs[1:] + try: + fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) + type = file and 'I' or 'D' + for attr in attrs: + attr, value = urllib.parse.splitvalue(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + fp, retrlen = fw.retrfile(file, type) + headers = "" + mtype = mimetypes.guess_type(req.get_full_url())[0] + if mtype: + headers += "Content-type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-length: %d\n" % retrlen + headers = 
email.message_from_string(headers) + return urllib.response.addinfourl(fp, headers, req.get_full_url()) + except ftplib.all_errors as msg: + exc = urllib.error.URLError('ftp error: %s' % msg) + raise exc.with_traceback(sys.exc_info()[2]) + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + fw = ftpwrapper(user, passwd, host, port, dirs, timeout) + return fw + +class CacheFTPHandler(FTPHandler): + # XXX would be nice to have pluggable cache strategies + # XXX this stuff is definitely not thread safe + def __init__(self): + self.cache = {} + self.timeout = {} + self.soonest = 0 + self.delay = 60 + self.max_conns = 16 + + def setTimeout(self, t): + self.delay = t + + def setMaxConns(self, m): + self.max_conns = m + + def connect_ftp(self, user, passwd, host, port, dirs, timeout): + key = user, host, port, '/'.join(dirs), timeout + if key in self.cache: + self.timeout[key] = time.time() + self.delay + else: + self.cache[key] = ftpwrapper(user, passwd, host, port, + dirs, timeout) + self.timeout[key] = time.time() + self.delay + self.check_cache() + return self.cache[key] + + def check_cache(self): + # first check for old ones + t = time.time() + if self.soonest <= t: + for k, v in list(self.timeout.items()): + if v < t: + self.cache[k].close() + del self.cache[k] + del self.timeout[k] + self.soonest = min(list(self.timeout.values())) + + # then check the size + if len(self.cache) == self.max_conns: + for k, v in list(self.timeout.items()): + if v == self.soonest: + del self.cache[k] + del self.timeout[k] + break + self.soonest = min(list(self.timeout.values())) + +# Code move from the old urllib module + +MAXFTPCACHE = 10 # Trim the ftp cache beyond this size + +# Helper for non-unix systems +if os.name == 'mac': + from macurl2path import url2pathname, pathname2url +elif os.name == 'nt': + from nturl2path import url2pathname, pathname2url +else: + def url2pathname(pathname): + """OS-specific conversion from a relative URL of the 'file' scheme + to a file system path; not recommended for general use.""" + return urllib.parse.unquote(pathname) + + def pathname2url(pathname): + """OS-specific conversion from a file system path to a relative URL + of the 'file' scheme; not recommended for general use.""" + return urllib.parse.quote(pathname) + +# This really consists of two pieces: +# (1) a class which handles opening of all sorts of URLs +# (plus assorted utilities etc.) +# (2) a set of functions for parsing URLs +# XXX Should these be separated out into different modules? + + +ftpcache = {} +class URLopener: + """Class to open URLs. + This is a class rather than just a subroutine because we may need + more than one set of global protocol-specific options. + Note -- this is a base class for those who don't want the + automatic handling of errors type 302 (relocated) and 401 + (authorization needed).""" + + __tempfiles = None + + version = "Python-urllib/%s" % __version__ + + # Constructor + def __init__(self, proxies=None, **x509): + if proxies is None: + proxies = getproxies() + assert hasattr(proxies, 'keys'), "proxies must be a mapping" + self.proxies = proxies + self.key_file = x509.get('key_file') + self.cert_file = x509.get('cert_file') + self.addheaders = [('User-Agent', self.version)] + self.__tempfiles = [] + self.__unlink = os.unlink # See cleanup() + self.tempcache = None + # Undocumented feature: if you assign {} to tempcache, + # it is used to cache files retrieved with + # self.retrieve(). 
This is not enabled by default + # since it does not work for changing documents (and I + # haven't got the logic to check expiration headers + # yet). + self.ftpcache = ftpcache + # Undocumented feature: you can use a different + # ftp cache by assigning to the .ftpcache member; + # in case you want logically independent URL openers + # XXX This is not threadsafe. Bah. + + def __del__(self): + self.close() + + def close(self): + self.cleanup() + + def cleanup(self): + # This code sometimes runs when the rest of this module + # has already been deleted, so it can't use any globals + # or import anything. + if self.__tempfiles: + for file in self.__tempfiles: + try: + self.__unlink(file) + except OSError: + pass + del self.__tempfiles[:] + if self.tempcache: + self.tempcache.clear() + + def addheader(self, *args): + """Add a header to be used by the HTTP interface only + e.g. u.addheader('Accept', 'sound/basic')""" + self.addheaders.append(args) + + # External interface + def open(self, fullurl, data=None): + """Use URLopener().open(file) instead of open(file, 'r').""" + fullurl = urllib.parse.unwrap(urllib.parse.toBytes(fullurl)) + if self.tempcache and fullurl in self.tempcache: + filename, headers = self.tempcache[fullurl] + fp = open(filename, 'rb') + return urllib.response.addinfourl(fp, headers, fullurl) + urltype, url = urllib.parse.splittype(fullurl) + if not urltype: + urltype = 'file' + if urltype in self.proxies: + proxy = self.proxies[urltype] + urltype, proxyhost = urllib.parse.splittype(proxy) + host, selector = urllib.parse.splithost(proxyhost) + url = (host, fullurl) # Signal special case to open_*() + else: + proxy = None + name = 'open_' + urltype + self.type = urltype + name = name.replace('-', '_') + if not hasattr(self, name): + if proxy: + return self.open_unknown_proxy(proxy, fullurl, data) + else: + return self.open_unknown(fullurl, data) + try: + if data is None: + return getattr(self, name)(url) + else: + return getattr(self, name)(url, data) + except socket.error as msg: + raise IOError('socket error', msg).with_traceback(sys.exc_info()[2]) + + def open_unknown(self, fullurl, data=None): + """Overridable interface to open unknown URL type.""" + type, url = urllib.parse.splittype(fullurl) + raise IOError('url error', 'unknown url type', type) + + def open_unknown_proxy(self, proxy, fullurl, data=None): + """Overridable interface to open unknown URL type.""" + type, url = urllib.parse.splittype(fullurl) + raise IOError('url error', 'invalid proxy for %s' % type, proxy) + + # External interface + def retrieve(self, url, filename=None, reporthook=None, data=None): + """retrieve(url) returns (filename, headers) for a local object + or (tempfilename, headers) for a remote object.""" + url = urllib.parse.unwrap(urllib.parse.toBytes(url)) + if self.tempcache and url in self.tempcache: + return self.tempcache[url] + type, url1 = urllib.parse.splittype(url) + if filename is None and (not type or type == 'file'): + try: + fp = self.open_local_file(url1) + hdrs = fp.info() + del fp + return url2pathname(urllib.parse.splithost(url1)[1]), hdrs + except IOError as msg: + pass + fp = self.open(url, data) + headers = fp.info() + if filename: + tfp = open(filename, 'wb') + else: + import tempfile + garbage, path = urllib.parse.splittype(url) + garbage, path = urllib.parse.splithost(path or "") + path, garbage = urllib.parse.splitquery(path or "") + path, garbage = urllib.parse.splitattr(path or "") + suffix = os.path.splitext(path)[1] + (fd, filename) = tempfile.mkstemp(suffix) + 
self.__tempfiles.append(filename) + tfp = os.fdopen(fd, 'wb') + result = filename, headers + if self.tempcache is not None: + self.tempcache[url] = result + bs = 1024*8 + size = -1 + read = 0 + blocknum = 0 + if reporthook: + if "content-length" in headers: + size = int(headers["Content-Length"]) + reporthook(blocknum, bs, size) + while 1: + block = fp.read(bs) + if not block: + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, bs, size) + fp.close() + tfp.close() + del fp + del tfp + + # raise exception if actual size does not match content-length header + if size >= 0 and read < size: + raise urllib.error.ContentTooShortError( + "retrieval incomplete: got only %i out of %i bytes" + % (read, size), result) + + return result + + # Each method named open_ knows how to open that type of URL + + def _open_generic_http(self, connection_factory, url, data): + """Make an HTTP connection using connection_class. + + This is an internal method that should be called from + open_http() or open_https(). + + Arguments: + - connection_factory should take a host name and return an + HTTPConnection instance. + - url is the url to retrieval or a host, relative-path pair. + - data is payload for a POST request or None. + """ + + user_passwd = None + proxy_passwd= None + if isinstance(url, str): + host, selector = urllib.parse.splithost(url) + if host: + user_passwd, host = urllib.parse.splituser(host) + host = urllib.parse.unquote(host) + realhost = host + else: + host, selector = url + # check whether the proxy contains authorization information + proxy_passwd, host = urllib.parse.splituser(host) + # now we proceed with the url we want to obtain + urltype, rest = urllib.parse.splittype(selector) + url = rest + user_passwd = None + if urltype.lower() != 'http': + realhost = None + else: + realhost, rest = urllib.parse.splithost(rest) + if realhost: + user_passwd, realhost = urllib.parse.splituser(realhost) + if user_passwd: + selector = "%s://%s%s" % (urltype, realhost, rest) + if proxy_bypass(realhost): + host = realhost + + #print "proxy via http:", host, selector + if not host: raise IOError('http error', 'no host given') + + if proxy_passwd: + import base64 + proxy_auth = base64.b64encode(proxy_passwd).strip() + else: + proxy_auth = None + + if user_passwd: + import base64 + auth = base64.b64encode(user_passwd).strip() + else: + auth = None + http_conn = connection_factory(host) + # XXX We should fix urllib so that it works with HTTP/1.1. + http_conn._http_vsn = 10 + http_conn._http_vsn_str = "HTTP/1.0" + + headers = {} + if proxy_auth: + headers["Proxy-Authorization"] = "Basic %s" % proxy_auth + if auth: + headers["Authorization"] = "Basic %s" % auth + if realhost: + headers["Host"] = realhost + for header, value in self.addheaders: + headers[header] = value + + if data is not None: + headers["Content-Type"] = "application/x-www-form-urlencoded" + http_conn.request("POST", selector, data, headers) + else: + http_conn.request("GET", selector, headers=headers) + + try: + response = http_conn.getresponse() + except http.client.BadStatusLine: + # something went wrong with the HTTP status line + raise urllib.error.URLError("http protocol error: bad status line") + + # According to RFC 2616, "2xx" code indicates that the client's + # request was successfully received, understood, and accepted. 
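A sketch (not from the patch) of driving URLopener.retrieve() above with a progress callback; a file: URL keeps the example off the network, and both file paths are placeholders.

import urllib.request

def reporthook(blocknum, blocksize, totalsize):
    # totalsize is -1 when no Content-Length header was available
    print("block %d, %d bytes each, %d total" % (blocknum, blocksize, totalsize))

opener = urllib.request.FancyURLopener()
filename, headers = opener.retrieve("file:///etc/hostname", "hostname.copy",
                                    reporthook=reporthook)
print(filename, headers["Content-Length"])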
+ if 200 <= response.status < 300: + return urllib.response.addinfourl(response.fp, response.msg, + "http:" + url, + response.status) + else: + return self.http_error( + url, response.fp, + response.status, response.reason, response.msg, data) + + def open_http(self, url, data=None): + """Use HTTP protocol.""" + return self._open_generic_http(http.client.HTTPConnection, url, data) + + def http_error(self, url, fp, errcode, errmsg, headers, data=None): + """Handle http errors. + + Derived class can override this, or provide specific handlers + named http_error_DDD where DDD is the 3-digit error code.""" + # First check if there's a specific handler for this error + name = 'http_error_%d' % errcode + if hasattr(self, name): + method = getattr(self, name) + if data is None: + result = method(url, fp, errcode, errmsg, headers) + else: + result = method(url, fp, errcode, errmsg, headers, data) + if result: return result + return self.http_error_default(url, fp, errcode, errmsg, headers) + + def http_error_default(self, url, fp, errcode, errmsg, headers): + """Default error handler: close the connection and raise IOError.""" + void = fp.read() + fp.close() + raise urllib.error.HTTPError(url, errcode, errmsg, headers, None) + + if _have_ssl: + def _https_connection(self, host): + return http.client.HTTPSConnection(host, + key_file=self.key_file, + cert_file=self.cert_file) + + def open_https(self, url, data=None): + """Use HTTPS protocol.""" + return self._open_generic_http(self._https_connection, url, data) + + def open_file(self, url): + """Use local file or FTP depending on form of URL.""" + if not isinstance(url, str): + raise URLError('file error', 'proxy support for file protocol currently not implemented') + if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/': + return self.open_ftp(url) + else: + return self.open_local_file(url) + + def open_local_file(self, url): + """Use local file.""" + import mimetypes, email.utils + from io import StringIO + host, file = urllib.parse.splithost(url) + localname = url2pathname(file) + try: + stats = os.stat(localname) + except OSError as e: + raise URLError(e.errno, e.strerror, e.filename) + size = stats.st_size + modified = email.utils.formatdate(stats.st_mtime, usegmt=True) + mtype = mimetypes.guess_type(url)[0] + headers = email.message_from_string( + 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % + (mtype or 'text/plain', size, modified)) + if not host: + urlfile = file + if file[:1] == '/': + urlfile = 'file://' + file + return urllib.response.addinfourl(open(localname, 'rb'), + headers, urlfile) + host, port = urllib.parse.splitport(host) + if (not port + and socket.gethostbyname(host) in (localhost(), thishost())): + urlfile = file + if file[:1] == '/': + urlfile = 'file://' + file + return urllib.response.addinfourl(open(localname, 'rb'), + headers, urlfile) + raise URLError('local file error', 'not on local host') + + def open_ftp(self, url): + """Use FTP protocol.""" + if not isinstance(url, str): + raise URLError('ftp error', 'proxy support for ftp protocol currently not implemented') + import mimetypes + from io import StringIO + host, path = urllib.parse.splithost(url) + if not host: raise URLError('ftp error', 'no host given') + host, port = urllib.parse.splitport(host) + user, host = urllib.parse.splituser(host) + if user: user, passwd = urllib.parse.splitpasswd(user) + else: passwd = None + host = urllib.parse.unquote(host) + user = urllib.parse.unquote(user or '') + passwd = 
urllib.parse.unquote(passwd or '') + host = socket.gethostbyname(host) + if not port: + import ftplib + port = ftplib.FTP_PORT + else: + port = int(port) + path, attrs = urllib.parse.splitattr(path) + path = urllib.parse.unquote(path) + dirs = path.split('/') + dirs, file = dirs[:-1], dirs[-1] + if dirs and not dirs[0]: dirs = dirs[1:] + if dirs and not dirs[0]: dirs[0] = '/' + key = user, host, port, '/'.join(dirs) + # XXX thread unsafe! + if len(self.ftpcache) > MAXFTPCACHE: + # Prune the cache, rather arbitrarily + for k in self.ftpcache.keys(): + if k != key: + v = self.ftpcache[k] + del self.ftpcache[k] + v.close() + try: + if not key in self.ftpcache: + self.ftpcache[key] = \ + ftpwrapper(user, passwd, host, port, dirs) + if not file: type = 'D' + else: type = 'I' + for attr in attrs: + attr, value = urllib.parse.splitvalue(attr) + if attr.lower() == 'type' and \ + value in ('a', 'A', 'i', 'I', 'd', 'D'): + type = value.upper() + (fp, retrlen) = self.ftpcache[key].retrfile(file, type) + mtype = mimetypes.guess_type("ftp:" + url)[0] + headers = "" + if mtype: + headers += "Content-Type: %s\n" % mtype + if retrlen is not None and retrlen >= 0: + headers += "Content-Length: %d\n" % retrlen + headers = email.message_from_string(headers) + return urllib.response.addinfourl(fp, headers, "ftp:" + url) + except ftperrors() as msg: + raise URLError('ftp error', msg).with_traceback(sys.exc_info()[2]) + + def open_data(self, url, data=None): + """Use "data" URL.""" + if not isinstance(url, str): + raise URLError('data error', 'proxy support for data protocol currently not implemented') + # ignore POSTed data + # + # syntax of data URLs: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + try: + [type, data] = url.split(',', 1) + except ValueError: + raise IOError('data error', 'bad data URL') + if not type: + type = 'text/plain;charset=US-ASCII' + semi = type.rfind(';') + if semi >= 0 and '=' not in type[semi:]: + encoding = type[semi+1:] + type = type[:semi] + else: + encoding = '' + msg = [] + msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT', + time.gmtime(time.time()))) + msg.append('Content-type: %s' % type) + if encoding == 'base64': + import base64 + data = base64.decodestring(data) + else: + data = urllib.parse.unquote(data) + msg.append('Content-Length: %d' % len(data)) + msg.append('') + msg.append(data) + msg = '\n'.join(msg) + headers = mimetools.message_from_string(msg) + #f.fileno = None # needed for addinfourl + return urllib.response.addinfourl(f, headers, url) + + +class FancyURLopener(URLopener): + """Derived class with handlers for errors we can handle (perhaps).""" + + def __init__(self, *args, **kwargs): + URLopener.__init__(self, *args, **kwargs) + self.auth_cache = {} + self.tries = 0 + self.maxtries = 10 + + def http_error_default(self, url, fp, errcode, errmsg, headers): + """Default error handling -- don't raise an exception.""" + return urllib.response.addinfourl(fp, headers, "http:" + url, errcode) + + def http_error_302(self, url, fp, errcode, errmsg, headers, data=None): + """Error 302 -- relocated (temporarily).""" + self.tries += 1 + if self.maxtries and self.tries >= self.maxtries: + if hasattr(self, "http_error_500"): + meth = self.http_error_500 + else: + meth = self.http_error_default + self.tries = 0 + return meth(url, fp, 500, + "Internal Server Error: Redirect Recursion", headers) + result = self.redirect_internal(url, fp, 
errcode, errmsg, headers, + data) + self.tries = 0 + return result + + def redirect_internal(self, url, fp, errcode, errmsg, headers, data): + if 'location' in headers: + newurl = headers['location'] + elif 'uri' in headers: + newurl = headers['uri'] + else: + return + void = fp.read() + fp.close() + # In case the server sent a relative URL, join with original: + newurl = basejoin(self.type + ":" + url, newurl) + return self.open(newurl) + + def http_error_301(self, url, fp, errcode, errmsg, headers, data=None): + """Error 301 -- also relocated (permanently).""" + return self.http_error_302(url, fp, errcode, errmsg, headers, data) + + def http_error_303(self, url, fp, errcode, errmsg, headers, data=None): + """Error 303 -- also relocated (essentially identical to 302).""" + return self.http_error_302(url, fp, errcode, errmsg, headers, data) + + def http_error_307(self, url, fp, errcode, errmsg, headers, data=None): + """Error 307 -- relocated, but turn POST into error.""" + if data is None: + return self.http_error_302(url, fp, errcode, errmsg, headers, data) + else: + return self.http_error_default(url, fp, errcode, errmsg, headers) + + def http_error_401(self, url, fp, errcode, errmsg, headers, data=None): + """Error 401 -- authentication required. + This function supports Basic authentication only.""" + if not 'www-authenticate' in headers: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + stuff = headers['www-authenticate'] + import re + match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) + if not match: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + scheme, realm = match.groups() + if scheme.lower() != 'basic': + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + name = 'retry_' + self.type + '_basic_auth' + if data is None: + return getattr(self,name)(url, realm) + else: + return getattr(self,name)(url, realm, data) + + def http_error_407(self, url, fp, errcode, errmsg, headers, data=None): + """Error 407 -- proxy authentication required. 
+ This function supports Basic authentication only.""" + if not 'proxy-authenticate' in headers: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + stuff = headers['proxy-authenticate'] + import re + match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff) + if not match: + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + scheme, realm = match.groups() + if scheme.lower() != 'basic': + URLopener.http_error_default(self, url, fp, + errcode, errmsg, headers) + name = 'retry_proxy_' + self.type + '_basic_auth' + if data is None: + return getattr(self,name)(url, realm) + else: + return getattr(self,name)(url, realm, data) + + def retry_proxy_http_basic_auth(self, url, realm, data=None): + host, selector = urllib.parse.splithost(url) + newurl = 'http://' + host + selector + proxy = self.proxies['http'] + urltype, proxyhost = urllib.parse.splittype(proxy) + proxyhost, proxyselector = urllib.parse.splithost(proxyhost) + i = proxyhost.find('@') + 1 + proxyhost = proxyhost[i:] + user, passwd = self.get_user_passwd(proxyhost, realm, i) + if not (user or passwd): return None + proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''), + quote(passwd, safe=''), proxyhost) + self.proxies['http'] = 'http://' + proxyhost + proxyselector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def retry_proxy_https_basic_auth(self, url, realm, data=None): + host, selector = urllib.parse.splithost(url) + newurl = 'https://' + host + selector + proxy = self.proxies['https'] + urltype, proxyhost = urllib.parse.splittype(proxy) + proxyhost, proxyselector = urllib.parse.splithost(proxyhost) + i = proxyhost.find('@') + 1 + proxyhost = proxyhost[i:] + user, passwd = self.get_user_passwd(proxyhost, realm, i) + if not (user or passwd): return None + proxyhost = "%s:%s@%s" % (urllib.parse.quote(user, safe=''), + quote(passwd, safe=''), proxyhost) + self.proxies['https'] = 'https://' + proxyhost + proxyselector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def retry_http_basic_auth(self, url, realm, data=None): + host, selector = urllib.parse.splithost(url) + i = host.find('@') + 1 + host = host[i:] + user, passwd = self.get_user_passwd(host, realm, i) + if not (user or passwd): return None + host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''), + quote(passwd, safe=''), host) + newurl = 'http://' + host + selector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def retry_https_basic_auth(self, url, realm, data=None): + host, selector = urllib.parse.splithost(url) + i = host.find('@') + 1 + host = host[i:] + user, passwd = self.get_user_passwd(host, realm, i) + if not (user or passwd): return None + host = "%s:%s@%s" % (urllib.parse.quote(user, safe=''), + quote(passwd, safe=''), host) + newurl = 'https://' + host + selector + if data is None: + return self.open(newurl) + else: + return self.open(newurl, data) + + def get_user_passwd(self, host, realm, clear_cache = 0): + key = realm + '@' + host.lower() + if key in self.auth_cache: + if clear_cache: + del self.auth_cache[key] + else: + return self.auth_cache[key] + user, passwd = self.prompt_user_passwd(host, realm) + if user or passwd: self.auth_cache[key] = (user, passwd) + return user, passwd + + def prompt_user_passwd(self, host, realm): + """Override this in a GUI environment!""" + import getpass + try: + user = input("Enter username for %s at %s: " % (realm, host)) + passwd = 
getpass.getpass("Enter password for %s in %s at %s: " % + (user, realm, host)) + return user, passwd + except KeyboardInterrupt: + print() + return None, None + + +# Utility functions + +_localhost = None +def localhost(): + """Return the IP address of the magic hostname 'localhost'.""" + global _localhost + if _localhost is None: + _localhost = socket.gethostbyname('localhost') + return _localhost + +_thishost = None +def thishost(): + """Return the IP address of the current host.""" + global _thishost + if _thishost is None: + _thishost = socket.gethostbyname(socket.gethostname()) + return _thishost + +_ftperrors = None +def ftperrors(): + """Return the set of errors raised by the FTP class.""" + global _ftperrors + if _ftperrors is None: + import ftplib + _ftperrors = ftplib.all_errors + return _ftperrors + +_noheaders = None +def noheaders(): + """Return an empty mimetools.Message object.""" + global _noheaders + if _noheaders is None: + _noheaders = mimetools.message_from_string("") + return _noheaders + + +# Utility classes + +class ftpwrapper: + """Class used by open_ftp() for cache of open FTP connections.""" + + def __init__(self, user, passwd, host, port, dirs, timeout=None): + self.user = user + self.passwd = passwd + self.host = host + self.port = port + self.dirs = dirs + self.timeout = timeout + self.init() + + def init(self): + import ftplib + self.busy = 0 + self.ftp = ftplib.FTP() + self.ftp.connect(self.host, self.port, self.timeout) + self.ftp.login(self.user, self.passwd) + for dir in self.dirs: + self.ftp.cwd(dir) + + def retrfile(self, file, type): + import ftplib + self.endtransfer() + if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1 + else: cmd = 'TYPE ' + type; isdir = 0 + try: + self.ftp.voidcmd(cmd) + except ftplib.all_errors: + self.init() + self.ftp.voidcmd(cmd) + conn = None + if file and not isdir: + # Try to retrieve as a file + try: + cmd = 'RETR ' + file + conn = self.ftp.ntransfercmd(cmd) + except ftplib.error_perm as reason: + if str(reason)[:3] != '550': + raise urllib.error.URLError('ftp error', reason).with_traceback(sys.exc_info()[2]) + if not conn: + # Set transfer mode to ASCII! + self.ftp.voidcmd('TYPE A') + # Try a directory listing. Verify that directory exists. + if file: + pwd = self.ftp.pwd() + try: + try: + self.ftp.cwd(file) + except ftplib.error_perm as reason: + raise urllib.error.URLError('ftp error', reason) from reason + finally: + self.ftp.cwd(pwd) + cmd = 'LIST ' + file + else: + cmd = 'LIST' + conn = self.ftp.ntransfercmd(cmd) + self.busy = 1 + # Pass back both a suitably decorated object and a retrieval length + return (urllib.response.addclosehook(conn[0].makefile('rb'), + self.endtransfer), conn[1]) + def endtransfer(self): + if not self.busy: + return + self.busy = 0 + try: + self.ftp.voidresp() + except ftperrors(): + pass + + def close(self): + self.endtransfer() + try: + self.ftp.close() + except ftperrors(): + pass + +# Proxy handling +def getproxies_environment(): + """Return a dictionary of scheme -> proxy server URL mappings. + + Scan the environment for variables named _proxy; + this seems to be the standard convention. If you need a + different way, you can pass a proxies dictionary to the + [Fancy]URLopener constructor. 
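To illustrate the convention described above (not part of the patch): *_proxy environment variables become a scheme -> proxy URL mapping, and a mapping of the same shape can be passed straight to ProxyHandler or to a [Fancy]URLopener. The proxy host is invented.

import os
import urllib.request

os.environ["http_proxy"] = "http://proxy.example.com:3128"
print(urllib.request.getproxies_environment())
# -> {'http': 'http://proxy.example.com:3128'}

# The same mapping, given explicitly instead of via the environment:
opener = urllib.request.build_opener(
    urllib.request.ProxyHandler({"http": "http://proxy.example.com:3128"}))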
+ + """ + proxies = {} + for name, value in os.environ.items(): + name = name.lower() + if name == 'no_proxy': + # handled in proxy_bypass_environment + continue + if value and name[-6:] == '_proxy': + proxies[name[:-6]] = value + return proxies + +def proxy_bypass_environment(host): + """Test if proxies should not be used for a particular host. + + Checks the environment for a variable named no_proxy, which should + be a list of DNS suffixes separated by commas, or '*' for all hosts. + """ + no_proxy = os.environ.get('no_proxy', '') or os.environ.get('NO_PROXY', '') + # '*' is special case for always bypass + if no_proxy == '*': + return 1 + # strip port off host + hostonly, port = urllib.parse.splitport(host) + # check if the host ends with any of the DNS suffixes + for name in no_proxy.split(','): + if name and (hostonly.endswith(name) or host.endswith(name)): + return 1 + # otherwise, don't bypass + return 0 + + +if sys.platform == 'darwin': + def getproxies_internetconfig(): + """Return a dictionary of scheme -> proxy server URL mappings. + + By convention the mac uses Internet Config to store + proxies. An HTTP proxy, for instance, is stored under + the HttpProxy key. + + """ + try: + import ic + except ImportError: + return {} + + try: + config = ic.IC() + except ic.error: + return {} + proxies = {} + # HTTP: + if 'UseHTTPProxy' in config and config['UseHTTPProxy']: + try: + value = config['HTTPProxyHost'] + except ic.error: + pass + else: + proxies['http'] = 'http://%s' % value + # FTP: XXX To be done. + # Gopher: XXX To be done. + return proxies + + def proxy_bypass(host): + if getproxies_environment(): + return proxy_bypass_environment(host) + else: + return 0 + + def getproxies(): + return getproxies_environment() or getproxies_internetconfig() + +elif os.name == 'nt': + def getproxies_registry(): + """Return a dictionary of scheme -> proxy server URL mappings. + + Win32 uses the registry to store proxies. + + """ + proxies = {} + try: + import _winreg + except ImportError: + # Std module, so should be around - but you never know! + return proxies + try: + internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, + r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') + proxyEnable = _winreg.QueryValueEx(internetSettings, + 'ProxyEnable')[0] + if proxyEnable: + # Returned as Unicode but problems if not converted to ASCII + proxyServer = str(_winreg.QueryValueEx(internetSettings, + 'ProxyServer')[0]) + if '=' in proxyServer: + # Per-protocol settings + for p in proxyServer.split(';'): + protocol, address = p.split('=', 1) + # See if address has a type:// prefix + import re + if not re.match('^([^/:]+)://', address): + address = '%s://%s' % (protocol, address) + proxies[protocol] = address + else: + # Use one setting for all protocols + if proxyServer[:5] == 'http:': + proxies['http'] = proxyServer + else: + proxies['http'] = 'http://%s' % proxyServer + proxies['ftp'] = 'ftp://%s' % proxyServer + internetSettings.Close() + except (WindowsError, ValueError, TypeError): + # Either registry key not found etc, or the value in an + # unexpected format. + # proxies already set up to be empty so nothing to do + pass + return proxies + + def getproxies(): + """Return a dictionary of scheme -> proxy server URL mappings. + + Returns settings gathered from the environment, if specified, + or the registry. 
+ + """ + return getproxies_environment() or getproxies_registry() + + def proxy_bypass_registry(host): + try: + import _winreg + import re + except ImportError: + # Std modules, so should be around - but you never know! + return 0 + try: + internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, + r'Software\Microsoft\Windows\CurrentVersion\Internet Settings') + proxyEnable = _winreg.QueryValueEx(internetSettings, + 'ProxyEnable')[0] + proxyOverride = str(_winreg.QueryValueEx(internetSettings, + 'ProxyOverride')[0]) + # ^^^^ Returned as Unicode but problems if not converted to ASCII + except WindowsError: + return 0 + if not proxyEnable or not proxyOverride: + return 0 + # try to make a host list from name and IP address. + rawHost, port = urllib.parse.splitport(host) + host = [rawHost] + try: + addr = socket.gethostbyname(rawHost) + if addr != rawHost: + host.append(addr) + except socket.error: + pass + try: + fqdn = socket.getfqdn(rawHost) + if fqdn != rawHost: + host.append(fqdn) + except socket.error: + pass + # make a check value list from the registry entry: replace the + # '' string by the localhost entry and the corresponding + # canonical entry. + proxyOverride = proxyOverride.split(';') + i = 0 + while i < len(proxyOverride): + if proxyOverride[i] == '': + proxyOverride[i:i+1] = ['localhost', + '127.0.0.1', + socket.gethostname(), + socket.gethostbyname( + socket.gethostname())] + i += 1 + # print proxyOverride + # now check if we match one of the registry values. + for test in proxyOverride: + test = test.replace(".", r"\.") # mask dots + test = test.replace("*", r".*") # change glob sequence + test = test.replace("?", r".") # change glob char + for val in host: + # print "%s <--> %s" %( test, val ) + if re.match(test, val, re.I): + return 1 + return 0 + + def proxy_bypass(host): + """Return a dictionary of scheme -> proxy server URL mappings. + + Returns settings gathered from the environment, if specified, + or the registry. + + """ + if getproxies_environment(): + return proxy_bypass_environment(host) + else: + return proxy_bypass_registry(host) + +else: + # By default use environment variables + getproxies = getproxies_environment + proxy_bypass = proxy_bypass_environment diff --git a/Lib/urllib/response.py b/Lib/urllib/response.py new file mode 100644 index 0000000..1352622 --- /dev/null +++ b/Lib/urllib/response.py @@ -0,0 +1,83 @@ +"""Response classes used by urllib. + +The base class, addbase, defines a minimal file-like interface, +including read() and readline(). The typical response object is an +addinfourl instance, which defines an info() method that returns +headers and a geturl() method that returns the url. +""" + +class addbase(object): + """Base class for addinfo and addclosehook.""" + + # XXX Add a method to expose the timeout on the underlying socket? + + def __init__(self, fp): + # TODO(jhylton): Is there a better way to delegate using io? 
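A quick sketch (not part of the patch) of what the wrappers defined just below add on top of a plain file-like object; the payload, headers and URL are dummies.

import email
import io
import urllib.response

fp = io.BytesIO(b"hello, world\n")
headers = email.message_from_string("Content-Type: text/plain\n")
resp = urllib.response.addinfourl(fp, headers, "http://www.example.com/", code=200)

print(resp.read())                     # delegated to the wrapped BytesIO
print(resp.info()["Content-Type"])     # 'text/plain'
print(resp.geturl(), resp.getcode())   # 'http://www.example.com/' 200
resp.close()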
+ self.fp = fp + self.read = self.fp.read + self.readline = self.fp.readline + # TODO(jhylton): Make sure an object with readlines() is also iterable + if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines + if hasattr(self.fp, "fileno"): + self.fileno = self.fp.fileno + else: + self.fileno = lambda: None + if hasattr(self.fp, "__iter__"): + self.__iter__ = self.fp.__iter__ + if hasattr(self.fp, "__next__"): + self.__next__ = self.fp.__next__ + + def __repr__(self): + return '<%s at %r whose fp = %r>' % (self.__class__.__name__, + id(self), self.fp) + + def close(self): + self.read = None + self.readline = None + self.readlines = None + self.fileno = None + if self.fp: self.fp.close() + self.fp = None + +class addclosehook(addbase): + """Class to add a close hook to an open file.""" + + def __init__(self, fp, closehook, *hookargs): + addbase.__init__(self, fp) + self.closehook = closehook + self.hookargs = hookargs + + def close(self): + addbase.close(self) + if self.closehook: + self.closehook(*self.hookargs) + self.closehook = None + self.hookargs = None + +class addinfo(addbase): + """class to add an info() method to an open file.""" + + def __init__(self, fp, headers): + addbase.__init__(self, fp) + self.headers = headers + + def info(self): + return self.headers + +class addinfourl(addbase): + """class to add info() and geturl() methods to an open file.""" + + def __init__(self, fp, headers, url, code=None): + addbase.__init__(self, fp) + self.headers = headers + self.url = url + self.code = code + + def info(self): + return self.headers + + def getcode(self): + return self.code + + def geturl(self): + return self.url diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py new file mode 100644 index 0000000..a91df8d --- /dev/null +++ b/Lib/urllib/robotparser.py @@ -0,0 +1,191 @@ +""" robotparser.py + + Copyright (C) 2000 Bastian Kleineidam + + You can choose between two licenses when using this package: + 1) GNU GPLv2 + 2) PSF license for Python 2.2 + + The robots.txt Exclusion Protocol is implemented as specified in + http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html +""" + +import urllib.parse, urllib.request + +__all__ = ["RobotFileParser"] + +class RobotFileParser: + """ This class provides a set of methods to read, parse and answer + questions about a single robots.txt file. + + """ + + def __init__(self, url=''): + self.entries = [] + self.default_entry = None + self.disallow_all = False + self.allow_all = False + self.set_url(url) + self.last_checked = 0 + + def mtime(self): + """Returns the time the robots.txt file was last fetched. + + This is useful for long-running web spiders that need to + check for new robots.txt files periodically. + + """ + return self.last_checked + + def modified(self): + """Sets the time the robots.txt file was last fetched to the + current time. 
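An offline sketch (not part of the patch) of this parser: robots.txt lines can be fed straight to the parse() method defined below instead of being fetched over the network with read(); the rules, agent name and URLs are invented.

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.parse([
    "User-agent: *",
    "Allow: /private/public-report",
    "Disallow: /private/",
])
print(rp.can_fetch("ExampleBot/1.0", "http://www.example.com/index.html"))
# -> True
print(rp.can_fetch("ExampleBot/1.0", "http://www.example.com/private/secret"))
# -> False
print(rp.can_fetch("ExampleBot/1.0", "http://www.example.com/private/public-report"))
# -> True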
+ + """ + import time + self.last_checked = time.time() + + def set_url(self, url): + """Sets the URL referring to a robots.txt file.""" + self.url = url + self.host, self.path = urllib.parse.urlparse(url)[1:3] + + def read(self): + """Reads the robots.txt URL and feeds it to the parser.""" + try: + f = urllib.request.urlopen(self.url) + except urllib.error.HTTPError as err: + if err.code in (401, 403): + self.disallow_all = True + elif err.code >= 400: + self.allow_all = True + else: + self.parse(f.read().splitlines()) + + def _add_entry(self, entry): + if "*" in entry.useragents: + # the default entry is considered last + self.default_entry = entry + else: + self.entries.append(entry) + + def parse(self, lines): + """Parse the input lines from a robots.txt file. + + We allow that a user-agent: line is not preceded by + one or more blank lines. + """ + state = 0 + entry = Entry() + + for line in lines: + if not line: + if state == 1: + entry = Entry() + state = 0 + elif state == 2: + self._add_entry(entry) + entry = Entry() + state = 0 + # remove optional comment and strip line + i = line.find('#') + if i >= 0: + line = line[:i] + line = line.strip() + if not line: + continue + line = line.split(':', 1) + if len(line) == 2: + line[0] = line[0].strip().lower() + line[1] = urllib.parse.unquote(line[1].strip()) + if line[0] == "user-agent": + if state == 2: + self._add_entry(entry) + entry = Entry() + entry.useragents.append(line[1]) + state = 1 + elif line[0] == "disallow": + if state != 0: + entry.rulelines.append(RuleLine(line[1], False)) + state = 2 + elif line[0] == "allow": + if state != 0: + entry.rulelines.append(RuleLine(line[1], True)) + if state == 2: + self.entries.append(entry) + + + def can_fetch(self, useragent, url): + """using the parsed robots.txt decide if useragent can fetch url""" + if self.disallow_all: + return False + if self.allow_all: + return True + # search for given user agent matches + # the first match counts + url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/" + for entry in self.entries: + if entry.applies_to(useragent): + return entry.allowance(url) + # try the default entry last + if self.default_entry: + return self.default_entry.allowance(url) + # agent not found ==> access granted + return True + + def __str__(self): + return ''.join([str(entry) + "\n" for entry in self.entries]) + + +class RuleLine: + """A rule line is a single "Allow:" (allowance==True) or "Disallow:" + (allowance==False) followed by a path.""" + def __init__(self, path, allowance): + if path == '' and not allowance: + # an empty value means allow all + allowance = True + self.path = urllib.parse.quote(path) + self.allowance = allowance + + def applies_to(self, filename): + return self.path == "*" or filename.startswith(self.path) + + def __str__(self): + return (self.allowance and "Allow" or "Disallow") + ": " + self.path + + +class Entry: + """An entry has one or more user-agents and zero or more rulelines""" + def __init__(self): + self.useragents = [] + self.rulelines = [] + + def __str__(self): + ret = [] + for agent in self.useragents: + ret.extend(["User-agent: ", agent, "\n"]) + for line in self.rulelines: + ret.extend([str(line), "\n"]) + return ''.join(ret) + + def applies_to(self, useragent): + """check if this entry applies to the specified agent""" + # split the name token and make it lower case + useragent = useragent.split("/")[0].lower() + for agent in self.useragents: + if agent == '*': + # we have the catch-all agent + return True + 
agent = agent.lower() + if agent in useragent: + return True + return False + + def allowance(self, filename): + """Preconditions: + - our agent applies to this entry + - filename is URL decoded""" + for line in self.rulelines: + if line.applies_to(filename): + return line.allowance + return True diff --git a/Lib/urllib2.py b/Lib/urllib2.py deleted file mode 100644 index 4bcd397..0000000 --- a/Lib/urllib2.py +++ /dev/null @@ -1,1351 +0,0 @@ -"""An extensible library for opening URLs using a variety of protocols - -The simplest way to use this module is to call the urlopen function, -which accepts a string containing a URL or a Request object (described -below). It opens the URL and returns the results as file-like -object; the returned object has some extra methods described below. - -The OpenerDirector manages a collection of Handler objects that do -all the actual work. Each Handler implements a particular protocol or -option. The OpenerDirector is a composite object that invokes the -Handlers needed to open the requested URL. For example, the -HTTPHandler performs HTTP GET and POST requests and deals with -non-error returns. The HTTPRedirectHandler automatically deals with -HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler -deals with digest authentication. - -urlopen(url, data=None) -- Basic usage is the same as original -urllib. pass the url and optionally data to post to an HTTP URL, and -get a file-like object back. One difference is that you can also pass -a Request instance instead of URL. Raises a URLError (subclass of -IOError); for HTTP errors, raises an HTTPError, which can also be -treated as a valid response. - -build_opener -- Function that creates a new OpenerDirector instance. -Will install the default handlers. Accepts one or more Handlers as -arguments, either instances or Handler classes that it will -instantiate. If one of the argument is a subclass of the default -handler, the argument will be installed instead of the default. - -install_opener -- Installs a new opener as the default opener. - -objects of interest: -OpenerDirector -- - -Request -- An object that encapsulates the state of a request. The -state can be as simple as the URL. It can also include extra HTTP -headers, e.g. a User-Agent. - -BaseHandler -- - -exceptions: -URLError -- A subclass of IOError, individual protocols have their own -specific subclass. - -HTTPError -- Also a valid HTTP response, so you can treat an HTTP error -as an exceptional event or valid response. - -internals: -BaseHandler and parent -_call_chain conventions - -Example usage: - -import urllib2 - -# set up authentication info -authinfo = urllib2.HTTPBasicAuthHandler() -authinfo.add_password(realm='PDQ Application', - uri='https://mahler:8092/site-updates.py', - user='klem', - passwd='geheim$parole') - -proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) - -# build a new opener that adds authentication and caching FTP handlers -opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) - -# install it -urllib2.install_opener(opener) - -f = urllib2.urlopen('http://www.python.org/') - - -""" - -# XXX issues: -# If an authentication error handler that tries to perform -# authentication for some reason but fails, how should the error be -# signalled? The client needs to know the HTTP error code. 
But if -# the handler knows that the problem was, e.g., that it didn't know -# that hash algo that requested in the challenge, it would be good to -# pass that information along to the client, too. -# ftp errors aren't handled cleanly -# check digest against correct (i.e. non-apache) implementation - -# Possible extensions: -# complex proxies XXX not sure what exactly was meant by this -# abstract factory for opener - -import base64 -import hashlib -import http.client -import io -import email -import os -import posixpath -import random -import re -import socket -import sys -import time -import urlparse -import bisect - -from io import StringIO - -from urllib import (unwrap, unquote, splittype, splithost, quote, - addinfourl, splitport, splitquery, - splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue) - -# support for FileHandler, proxies via environment variables -from urllib import localhost, url2pathname, getproxies - -# used in User-Agent header sent -__version__ = sys.version[:3] - -_opener = None -def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): - global _opener - if _opener is None: - _opener = build_opener() - return _opener.open(url, data, timeout) - -def install_opener(opener): - global _opener - _opener = opener - -# do these error classes make sense? -# make sure all of the IOError stuff is overridden. we just want to be -# subtypes. - -class URLError(IOError): - # URLError is a sub-type of IOError, but it doesn't share any of - # the implementation. need to override __init__ and __str__. - # It sets self.args for compatibility with other EnvironmentError - # subclasses, but args doesn't have the typical format with errno in - # slot 0 and strerror in slot 1. This may be better than nothing. - def __init__(self, reason): - self.args = reason, - self.reason = reason - - def __str__(self): - return '' % self.reason - -class HTTPError(URLError, addinfourl): - """Raised when HTTP error occurs, but also acts like non-error return""" - __super_init = addinfourl.__init__ - - def __init__(self, url, code, msg, hdrs, fp): - self.code = code - self.msg = msg - self.hdrs = hdrs - self.fp = fp - self.filename = url - # The addinfourl classes depend on fp being a valid file - # object. In some cases, the HTTPError may not have a valid - # file object. If this happens, the simplest workaround is to - # not initialize the base classes. - if fp is not None: - self.__super_init(fp, hdrs, url, code) - - def __str__(self): - return 'HTTP Error %s: %s' % (self.code, self.msg) - -# copied from cookielib.py -_cut_port_re = re.compile(r":\d+$") -def request_host(request): - """Return request-host, as defined by RFC 2965. - - Variation from RFC: returned value is lowercased, for convenient - comparison. 
- - """ - url = request.get_full_url() - host = urlparse.urlparse(url)[1] - if host == "": - host = request.get_header("Host", "") - - # remove port, if present - host = _cut_port_re.sub("", host, 1) - return host.lower() - -class Request: - - def __init__(self, url, data=None, headers={}, - origin_req_host=None, unverifiable=False): - # unwrap('') --> 'type://host/path' - self.__original = unwrap(url) - self.type = None - # self.__r_type is what's left after doing the splittype - self.host = None - self.port = None - self.data = data - self.headers = {} - for key, value in headers.items(): - self.add_header(key, value) - self.unredirected_hdrs = {} - if origin_req_host is None: - origin_req_host = request_host(self) - self.origin_req_host = origin_req_host - self.unverifiable = unverifiable - - def __getattr__(self, attr): - # XXX this is a fallback mechanism to guard against these - # methods getting called in a non-standard order. this may be - # too complicated and/or unnecessary. - # XXX should the __r_XXX attributes be public? - if attr[:12] == '_Request__r_': - name = attr[12:] - if hasattr(Request, 'get_' + name): - getattr(self, 'get_' + name)() - return getattr(self, attr) - raise AttributeError(attr) - - def get_method(self): - if self.has_data(): - return "POST" - else: - return "GET" - - # XXX these helper methods are lame - - def add_data(self, data): - self.data = data - - def has_data(self): - return self.data is not None - - def get_data(self): - return self.data - - def get_full_url(self): - return self.__original - - def get_type(self): - if self.type is None: - self.type, self.__r_type = splittype(self.__original) - if self.type is None: - raise ValueError("unknown url type: %s" % self.__original) - return self.type - - def get_host(self): - if self.host is None: - self.host, self.__r_host = splithost(self.__r_type) - if self.host: - self.host = unquote(self.host) - return self.host - - def get_selector(self): - return self.__r_host - - def set_proxy(self, host, type): - self.host, self.type = host, type - self.__r_host = self.__original - - def get_origin_req_host(self): - return self.origin_req_host - - def is_unverifiable(self): - return self.unverifiable - - def add_header(self, key, val): - # useful for something like authentication - self.headers[key.capitalize()] = val - - def add_unredirected_header(self, key, val): - # will not be added to a redirected request - self.unredirected_hdrs[key.capitalize()] = val - - def has_header(self, header_name): - return (header_name in self.headers or - header_name in self.unredirected_hdrs) - - def get_header(self, header_name, default=None): - return self.headers.get( - header_name, - self.unredirected_hdrs.get(header_name, default)) - - def header_items(self): - hdrs = self.unredirected_hdrs.copy() - hdrs.update(self.headers) - return list(hdrs.items()) - -class OpenerDirector: - def __init__(self): - client_version = "Python-urllib/%s" % __version__ - self.addheaders = [('User-agent', client_version)] - # manage the individual handlers - self.handlers = [] - self.handle_open = {} - self.handle_error = {} - self.process_response = {} - self.process_request = {} - - def add_handler(self, handler): - if not hasattr(handler, "add_parent"): - raise TypeError("expected BaseHandler instance, got %r" % - type(handler)) - - added = False - for meth in dir(handler): - if meth in ["redirect_request", "do_open", "proxy_open"]: - # oops, coincidental match - continue - - i = meth.find("_") - protocol = meth[:i] - condition = 
meth[i+1:] - - if condition.startswith("error"): - j = condition.find("_") + i + 1 - kind = meth[j+1:] - try: - kind = int(kind) - except ValueError: - pass - lookup = self.handle_error.get(protocol, {}) - self.handle_error[protocol] = lookup - elif condition == "open": - kind = protocol - lookup = self.handle_open - elif condition == "response": - kind = protocol - lookup = self.process_response - elif condition == "request": - kind = protocol - lookup = self.process_request - else: - continue - - handlers = lookup.setdefault(kind, []) - if handlers: - bisect.insort(handlers, handler) - else: - handlers.append(handler) - added = True - - if added: - # the handlers must work in an specific order, the order - # is specified in a Handler attribute - bisect.insort(self.handlers, handler) - handler.add_parent(self) - - def close(self): - # Only exists for backwards compatibility. - pass - - def _call_chain(self, chain, kind, meth_name, *args): - # Handlers raise an exception if no one else should try to handle - # the request, or return None if they can't but another handler - # could. Otherwise, they return the response. - handlers = chain.get(kind, ()) - for handler in handlers: - func = getattr(handler, meth_name) - - result = func(*args) - if result is not None: - return result - - def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): - # accept a URL or a Request object - if isinstance(fullurl, str): - req = Request(fullurl, data) - else: - req = fullurl - if data is not None: - req.add_data(data) - - req.timeout = timeout - protocol = req.get_type() - - # pre-process request - meth_name = protocol+"_request" - for processor in self.process_request.get(protocol, []): - meth = getattr(processor, meth_name) - req = meth(req) - - response = self._open(req, data) - - # post-process response - meth_name = protocol+"_response" - for processor in self.process_response.get(protocol, []): - meth = getattr(processor, meth_name) - response = meth(req, response) - - return response - - def _open(self, req, data=None): - result = self._call_chain(self.handle_open, 'default', - 'default_open', req) - if result: - return result - - protocol = req.get_type() - result = self._call_chain(self.handle_open, protocol, protocol + - '_open', req) - if result: - return result - - return self._call_chain(self.handle_open, 'unknown', - 'unknown_open', req) - - def error(self, proto, *args): - if proto in ('http', 'https'): - # XXX http[s] protocols are special-cased - dict = self.handle_error['http'] # https is not different than http - proto = args[2] # YUCK! - meth_name = 'http_error_%s' % proto - http_err = 1 - orig_args = args - else: - dict = self.handle_error - meth_name = proto + '_error' - http_err = 0 - args = (dict, proto, meth_name) + args - result = self._call_chain(*args) - if result: - return result - - if http_err: - args = (dict, 'default', 'http_error_default') + orig_args - return self._call_chain(*args) - -# XXX probably also want an abstract factory that knows when it makes -# sense to skip a superclass in favor of a subclass and when it might -# make sense to include both - -def build_opener(*handlers): - """Create an opener object from a list of handlers. - - The opener will use several default handlers, including support - for HTTP and FTP. - - If any of the handlers passed as arguments are subclasses of the - default handlers, the default handlers will not be used. 
- """ - def isclass(obj): - return isinstance(obj, type) or hasattr(obj, "__bases__") - - opener = OpenerDirector() - default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, - HTTPDefaultErrorHandler, HTTPRedirectHandler, - FTPHandler, FileHandler, HTTPErrorProcessor] - if hasattr(http.client, 'HTTPS'): - default_classes.append(HTTPSHandler) - skip = set() - for klass in default_classes: - for check in handlers: - if isclass(check): - if issubclass(check, klass): - skip.add(klass) - elif isinstance(check, klass): - skip.add(klass) - for klass in skip: - default_classes.remove(klass) - - for klass in default_classes: - opener.add_handler(klass()) - - for h in handlers: - if isclass(h): - h = h() - opener.add_handler(h) - return opener - -class BaseHandler: - handler_order = 500 - - def add_parent(self, parent): - self.parent = parent - - def close(self): - # Only exists for backwards compatibility - pass - - def __lt__(self, other): - if not hasattr(other, "handler_order"): - # Try to preserve the old behavior of having custom classes - # inserted after default ones (works only for custom user - # classes which are not aware of handler_order). - return True - return self.handler_order < other.handler_order - - -class HTTPErrorProcessor(BaseHandler): - """Process HTTP error responses.""" - handler_order = 1000 # after all other processing - - def http_response(self, request, response): - code, msg, hdrs = response.code, response.msg, response.info() - - # According to RFC 2616, "2xx" code indicates that the client's - # request was successfully received, understood, and accepted. - if not (200 <= code < 300): - response = self.parent.error( - 'http', request, response, code, msg, hdrs) - - return response - - https_response = http_response - -class HTTPDefaultErrorHandler(BaseHandler): - def http_error_default(self, req, fp, code, msg, hdrs): - raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) - -class HTTPRedirectHandler(BaseHandler): - # maximum number of redirections to any single URL - # this is needed because of the state that cookies introduce - max_repeats = 4 - # maximum total number of redirections (regardless of URL) before - # assuming we're in a loop - max_redirections = 10 - - def redirect_request(self, req, fp, code, msg, headers, newurl): - """Return a Request or None in response to a redirect. - - This is called by the http_error_30x methods when a - redirection response is received. If a redirection should - take place, return a new Request to allow http_error_30x to - perform the redirect. Otherwise, raise HTTPError if no-one - else should try to handle this url. Return None if you can't - but another Handler might. - """ - m = req.get_method() - if (code in (301, 302, 303, 307) and m in ("GET", "HEAD") - or code in (301, 302, 303) and m == "POST"): - # Strictly (according to RFC 2616), 301 or 302 in response - # to a POST MUST NOT cause a redirection without confirmation - # from the user (of urllib2, in this case). In practice, - # essentially all clients do redirect in this case, so we - # do the same. 
- # be conciliant with URIs containing a space - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type") - ) - return Request(newurl, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) - else: - raise HTTPError(req.get_full_url(), code, msg, headers, fp) - - # Implementation note: To avoid the server sending us into an - # infinite loop, the request object needs to track what URLs we - # have already seen. Do this by adding a handler-specific - # attribute to the Request object. - def http_error_302(self, req, fp, code, msg, headers): - # Some servers (incorrectly) return multiple Location headers - # (so probably same goes for URI). Use first header. - if 'location' in headers: - newurl = headers['location'] - elif 'uri' in headers: - newurl = headers['uri'] - else: - return - newurl = urlparse.urljoin(req.get_full_url(), newurl) - - # XXX Probably want to forget about the state of the current - # request, although that might interact poorly with other - # handlers that also use handler-specific request attributes - new = self.redirect_request(req, fp, code, msg, headers, newurl) - if new is None: - return - - # loop detection - # .redirect_dict has a key url if url was previously visited. - if hasattr(req, 'redirect_dict'): - visited = new.redirect_dict = req.redirect_dict - if (visited.get(newurl, 0) >= self.max_repeats or - len(visited) >= self.max_redirections): - raise HTTPError(req.get_full_url(), code, - self.inf_msg + msg, headers, fp) - else: - visited = new.redirect_dict = req.redirect_dict = {} - visited[newurl] = visited.get(newurl, 0) + 1 - - # Don't close the fp until we are sure that we won't use it - # with HTTPError. - fp.read() - fp.close() - - return self.parent.open(new) - - http_error_301 = http_error_303 = http_error_307 = http_error_302 - - inf_msg = "The HTTP server returned a redirect error that would " \ - "lead to an infinite loop.\n" \ - "The last 30x error message was:\n" - - -def _parse_proxy(proxy): - """Return (scheme, user, password, host/port) given a URL or an authority. - - If a URL is supplied, it must have an authority (host:port) component. - According to RFC 3986, having an authority component means the URL must - have two slashes after the scheme: - - >>> _parse_proxy('file:/ftp.example.com/') - Traceback (most recent call last): - ValueError: proxy URL with no authority: 'file:/ftp.example.com/' - - The first three items of the returned tuple may be None. 
- - Examples of authority parsing: - - >>> _parse_proxy('proxy.example.com') - (None, None, None, 'proxy.example.com') - >>> _parse_proxy('proxy.example.com:3128') - (None, None, None, 'proxy.example.com:3128') - - The authority component may optionally include userinfo (assumed to be - username:password): - - >>> _parse_proxy('joe:password@proxy.example.com') - (None, 'joe', 'password', 'proxy.example.com') - >>> _parse_proxy('joe:password@proxy.example.com:3128') - (None, 'joe', 'password', 'proxy.example.com:3128') - - Same examples, but with URLs instead: - - >>> _parse_proxy('http://proxy.example.com/') - ('http', None, None, 'proxy.example.com') - >>> _parse_proxy('http://proxy.example.com:3128/') - ('http', None, None, 'proxy.example.com:3128') - >>> _parse_proxy('http://joe:password@proxy.example.com/') - ('http', 'joe', 'password', 'proxy.example.com') - >>> _parse_proxy('http://joe:password@proxy.example.com:3128') - ('http', 'joe', 'password', 'proxy.example.com:3128') - - Everything after the authority is ignored: - - >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') - ('ftp', 'joe', 'password', 'proxy.example.com') - - Test for no trailing '/' case: - - >>> _parse_proxy('http://joe:password@proxy.example.com') - ('http', 'joe', 'password', 'proxy.example.com') - - """ - scheme, r_scheme = splittype(proxy) - if not r_scheme.startswith("/"): - # authority - scheme = None - authority = proxy - else: - # URL - if not r_scheme.startswith("//"): - raise ValueError("proxy URL with no authority: %r" % proxy) - # We have an authority, so for RFC 3986-compliant URLs (by ss 3. - # and 3.3.), path is empty or starts with '/' - end = r_scheme.find("/", 2) - if end == -1: - end = None - authority = r_scheme[2:end] - userinfo, hostport = splituser(authority) - if userinfo is not None: - user, password = splitpasswd(userinfo) - else: - user = password = None - return scheme, user, password, hostport - -class ProxyHandler(BaseHandler): - # Proxies must be in front - handler_order = 100 - - def __init__(self, proxies=None): - if proxies is None: - proxies = getproxies() - assert hasattr(proxies, 'keys'), "proxies must be a mapping" - self.proxies = proxies - for type, url in proxies.items(): - setattr(self, '%s_open' % type, - lambda r, proxy=url, type=type, meth=self.proxy_open: \ - meth(r, proxy, type)) - - def proxy_open(self, req, proxy, type): - orig_type = req.get_type() - proxy_type, user, password, hostport = _parse_proxy(proxy) - if proxy_type is None: - proxy_type = orig_type - if user and password: - user_pass = '%s:%s' % (unquote(user), unquote(password)) - creds = base64.b64encode(user_pass.encode()).decode("ascii") - req.add_header('Proxy-authorization', 'Basic ' + creds) - hostport = unquote(hostport) - req.set_proxy(hostport, proxy_type) - if orig_type == proxy_type: - # let other handlers take care of it - return None - else: - # need to start over, because the other handlers don't - # grok the proxy's URL type - # e.g. 
if we have a constructor arg proxies like so: - # {'http': 'ftp://proxy.example.com'}, we may end up turning - # a request for http://acme.example.com/a into one for - # ftp://proxy.example.com/a - return self.parent.open(req) - -class HTTPPasswordMgr: - - def __init__(self): - self.passwd = {} - - def add_password(self, realm, uri, user, passwd): - # uri could be a single URI or a sequence - if isinstance(uri, str): - uri = [uri] - if not realm in self.passwd: - self.passwd[realm] = {} - for default_port in True, False: - reduced_uri = tuple( - [self.reduce_uri(u, default_port) for u in uri]) - self.passwd[realm][reduced_uri] = (user, passwd) - - def find_user_password(self, realm, authuri): - domains = self.passwd.get(realm, {}) - for default_port in True, False: - reduced_authuri = self.reduce_uri(authuri, default_port) - for uris, authinfo in domains.items(): - for uri in uris: - if self.is_suburi(uri, reduced_authuri): - return authinfo - return None, None - - def reduce_uri(self, uri, default_port=True): - """Accept authority or URI and extract only the authority and path.""" - # note HTTP URLs do not have a userinfo component - parts = urlparse.urlsplit(uri) - if parts[1]: - # URI - scheme = parts[0] - authority = parts[1] - path = parts[2] or '/' - else: - # host or host:port - scheme = None - authority = uri - path = '/' - host, port = splitport(authority) - if default_port and port is None and scheme is not None: - dport = {"http": 80, - "https": 443, - }.get(scheme) - if dport is not None: - authority = "%s:%d" % (host, dport) - return authority, path - - def is_suburi(self, base, test): - """Check if test is below base in a URI tree - - Both args must be URIs in reduced form. - """ - if base == test: - return True - if base[0] != test[0]: - return False - common = posixpath.commonprefix((base[1], test[1])) - if len(common) == len(base[1]): - return True - return False - - -class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): - - def find_user_password(self, realm, authuri): - user, password = HTTPPasswordMgr.find_user_password(self, realm, - authuri) - if user is not None: - return user, password - return HTTPPasswordMgr.find_user_password(self, None, authuri) - - -class AbstractBasicAuthHandler: - - # XXX this allows for multiple auth-schemes, but will stupidly pick - # the last one with a realm specified. - - # allow for double- and single-quoted realm values - # (single quotes are a violation of the RFC, but appear in the wild) - rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' - 'realm=(["\'])(.*?)\\2', re.I) - - # XXX could pre-emptively send auth info already accepted (RFC 2617, - # end of section 2, and section 1.2 immediately after "credentials" - # production). 
- - def __init__(self, password_mgr=None): - if password_mgr is None: - password_mgr = HTTPPasswordMgr() - self.passwd = password_mgr - self.add_password = self.passwd.add_password - - def http_error_auth_reqed(self, authreq, host, req, headers): - # host may be an authority (without userinfo) or a URL with an - # authority - # XXX could be multiple headers - authreq = headers.get(authreq, None) - if authreq: - mo = AbstractBasicAuthHandler.rx.search(authreq) - if mo: - scheme, quote, realm = mo.groups() - if scheme.lower() == 'basic': - return self.retry_http_basic_auth(host, req, realm) - - def retry_http_basic_auth(self, host, req, realm): - user, pw = self.passwd.find_user_password(realm, host) - if pw is not None: - raw = "%s:%s" % (user, pw) - auth = "Basic " + base64.b64encode(raw.encode()).decode("ascii") - if req.headers.get(self.auth_header, None) == auth: - return None - req.add_header(self.auth_header, auth) - return self.parent.open(req) - else: - return None - - -class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): - - auth_header = 'Authorization' - - def http_error_401(self, req, fp, code, msg, headers): - url = req.get_full_url() - return self.http_error_auth_reqed('www-authenticate', - url, req, headers) - - -class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): - - auth_header = 'Proxy-authorization' - - def http_error_407(self, req, fp, code, msg, headers): - # http_error_auth_reqed requires that there is no userinfo component in - # authority. Assume there isn't one, since urllib2 does not (and - # should not, RFC 3986 s. 3.2.1) support requests for URLs containing - # userinfo. - authority = req.get_host() - return self.http_error_auth_reqed('proxy-authenticate', - authority, req, headers) - - -def randombytes(n): - """Return n random bytes.""" - return os.urandom(n) - -class AbstractDigestAuthHandler: - # Digest authentication is specified in RFC 2617. - - # XXX The client does not inspect the Authentication-Info header - # in a successful response. - - # XXX It should be possible to test this implementation against - # a mock server that just generates a static set of challenges. - - # XXX qop="auth-int" supports is shaky - - def __init__(self, passwd=None): - if passwd is None: - passwd = HTTPPasswordMgr() - self.passwd = passwd - self.add_password = self.passwd.add_password - self.retried = 0 - self.nonce_count = 0 - - def reset_retry_count(self): - self.retried = 0 - - def http_error_auth_reqed(self, auth_header, host, req, headers): - authreq = headers.get(auth_header, None) - if self.retried > 5: - # Don't fail endlessly - if we failed once, we'll probably - # fail a second time. Hm. Unless the Password Manager is - # prompting for the information. Crap. 
This isn't great - # but it's better than the current 'repeat until recursion - # depth exceeded' approach - raise HTTPError(req.get_full_url(), 401, "digest auth failed", - headers, None) - else: - self.retried += 1 - if authreq: - scheme = authreq.split()[0] - if scheme.lower() == 'digest': - return self.retry_http_digest_auth(req, authreq) - - def retry_http_digest_auth(self, req, auth): - token, challenge = auth.split(' ', 1) - chal = parse_keqv_list(parse_http_list(challenge)) - auth = self.get_authorization(req, chal) - if auth: - auth_val = 'Digest %s' % auth - if req.headers.get(self.auth_header, None) == auth_val: - return None - req.add_unredirected_header(self.auth_header, auth_val) - resp = self.parent.open(req) - return resp - - def get_cnonce(self, nonce): - # The cnonce-value is an opaque - # quoted string value provided by the client and used by both client - # and server to avoid chosen plaintext attacks, to provide mutual - # authentication, and to provide some message integrity protection. - # This isn't a fabulous effort, but it's probably Good Enough. - s = "%s:%s:%s:" % (self.nonce_count, nonce, time.ctime()) - b = s.encode("ascii") + randombytes(8) - dig = hashlib.sha1(b).hexdigest() - return dig[:16] - - def get_authorization(self, req, chal): - try: - realm = chal['realm'] - nonce = chal['nonce'] - qop = chal.get('qop') - algorithm = chal.get('algorithm', 'MD5') - # mod_digest doesn't send an opaque, even though it isn't - # supposed to be optional - opaque = chal.get('opaque', None) - except KeyError: - return None - - H, KD = self.get_algorithm_impls(algorithm) - if H is None: - return None - - user, pw = self.passwd.find_user_password(realm, req.get_full_url()) - if user is None: - return None - - # XXX not implemented yet - if req.has_data(): - entdig = self.get_entity_digest(req.get_data(), chal) - else: - entdig = None - - A1 = "%s:%s:%s" % (user, realm, pw) - A2 = "%s:%s" % (req.get_method(), - # XXX selector: what about proxies and full urls - req.get_selector()) - if qop == 'auth': - self.nonce_count += 1 - ncvalue = '%08x' % self.nonce_count - cnonce = self.get_cnonce(nonce) - noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) - respdig = KD(H(A1), noncebit) - elif qop is None: - respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) - else: - # XXX handle auth-int. - raise URLError("qop '%s' is not supported." % qop) - - # XXX should the partial digests be encoded too? 
- - base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ - 'response="%s"' % (user, realm, nonce, req.get_selector(), - respdig) - if opaque: - base += ', opaque="%s"' % opaque - if entdig: - base += ', digest="%s"' % entdig - base += ', algorithm="%s"' % algorithm - if qop: - base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) - return base - - def get_algorithm_impls(self, algorithm): - # algorithm should be case-insensitive according to RFC2617 - algorithm = algorithm.upper() - # lambdas assume digest modules are imported at the top level - if algorithm == 'MD5': - H = lambda x: hashlib.md5(x.encode("ascii")).hexdigest() - elif algorithm == 'SHA': - H = lambda x: hashlib.sha1(x.encode("ascii")).hexdigest() - # XXX MD5-sess - KD = lambda s, d: H("%s:%s" % (s, d)) - return H, KD - - def get_entity_digest(self, data, chal): - # XXX not implemented yet - return None - - -class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): - """An authentication protocol defined by RFC 2069 - - Digest authentication improves on basic authentication because it - does not transmit passwords in the clear. - """ - - auth_header = 'Authorization' - handler_order = 490 # before Basic auth - - def http_error_401(self, req, fp, code, msg, headers): - host = urlparse.urlparse(req.get_full_url())[1] - retry = self.http_error_auth_reqed('www-authenticate', - host, req, headers) - self.reset_retry_count() - return retry - - -class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): - - auth_header = 'Proxy-Authorization' - handler_order = 490 # before Basic auth - - def http_error_407(self, req, fp, code, msg, headers): - host = req.get_host() - retry = self.http_error_auth_reqed('proxy-authenticate', - host, req, headers) - self.reset_retry_count() - return retry - -class AbstractHTTPHandler(BaseHandler): - - def __init__(self, debuglevel=0): - self._debuglevel = debuglevel - - def set_http_debuglevel(self, level): - self._debuglevel = level - - def do_request_(self, request): - host = request.get_host() - if not host: - raise URLError('no host given') - - if request.has_data(): # POST - data = request.get_data() - if not request.has_header('Content-type'): - request.add_unredirected_header( - 'Content-type', - 'application/x-www-form-urlencoded') - if not request.has_header('Content-length'): - request.add_unredirected_header( - 'Content-length', '%d' % len(data)) - - scheme, sel = splittype(request.get_selector()) - sel_host, sel_path = splithost(sel) - if not request.has_header('Host'): - request.add_unredirected_header('Host', sel_host or host) - for name, value in self.parent.addheaders: - name = name.capitalize() - if not request.has_header(name): - request.add_unredirected_header(name, value) - - return request - - def do_open(self, http_class, req): - """Return an addinfourl object for the request, using http_class. - - http_class must implement the HTTPConnection API from http.client. - The addinfourl return value is a file-like object. 
It also - has methods and attributes including: - - info(): return a email.message.Message object for the headers - - geturl(): return the original request URL - - code: HTTP status code - """ - host = req.get_host() - if not host: - raise URLError('no host given') - - h = http_class(host, timeout=req.timeout) # will parse host:port - h.set_debuglevel(self._debuglevel) - - headers = dict(req.headers) - headers.update(req.unredirected_hdrs) - # We want to make an HTTP/1.1 request, but the addinfourl - # class isn't prepared to deal with a persistent connection. - # It will try to read all remaining data from the socket, - # which will block while the server waits for the next request. - # So make sure the connection gets closed after the (only) - # request. - headers["Connection"] = "close" - headers = dict( - (name.title(), val) for name, val in headers.items()) - try: - h.request(req.get_method(), req.get_selector(), req.data, headers) - r = h.getresponse() - except socket.error as err: # XXX what error? - raise URLError(err) - - # Pick apart the HTTPResponse object to get the addinfourl - # object initialized properly. - - # XXX Should an HTTPResponse object really be passed to - # BufferedReader? If so, we should change http.client to support - # this use directly. - - # Add some fake methods to the reader to satisfy BufferedReader. - r.readable = lambda: True - r.writable = r.seekable = lambda: False - r._checkReadable = lambda: True - r._checkWritable = lambda: False - fp = io.BufferedReader(r) - - resp = addinfourl(fp, r.msg, req.get_full_url()) - resp.code = r.status - resp.msg = r.reason - return resp - - -class HTTPHandler(AbstractHTTPHandler): - - def http_open(self, req): - return self.do_open(http.client.HTTPConnection, req) - - http_request = AbstractHTTPHandler.do_request_ - -if hasattr(http.client, 'HTTPS'): - class HTTPSHandler(AbstractHTTPHandler): - - def https_open(self, req): - return self.do_open(http.client.HTTPSConnection, req) - - https_request = AbstractHTTPHandler.do_request_ - -class HTTPCookieProcessor(BaseHandler): - def __init__(self, cookiejar=None): - import http.cookiejar - if cookiejar is None: - cookiejar = http.cookiejar.CookieJar() - self.cookiejar = cookiejar - - def http_request(self, request): - self.cookiejar.add_cookie_header(request) - return request - - def http_response(self, request, response): - self.cookiejar.extract_cookies(response, request) - return response - - https_request = http_request - https_response = http_response - -class UnknownHandler(BaseHandler): - def unknown_open(self, req): - type = req.get_type() - raise URLError('unknown url type: %s' % type) - -def parse_keqv_list(l): - """Parse list of key=value strings where keys are not duplicated.""" - parsed = {} - for elt in l: - # Because of a trailing comma in the auth string, elt could be the - # empty string. - if not elt: - continue - k, v = elt.split('=', 1) - if v[0] == '"' and v[-1] == '"': - v = v[1:-1] - parsed[k] = v - return parsed - -def parse_http_list(s): - """Parse lists as described by RFC 2068 Section 2. - - In particular, parse comma-separated lists where the elements of - the list may include quoted-strings. A quoted-string could - contain a comma. A non-quoted string could have quotes in the - middle. Neither commas nor quotes count if they are escaped. - Only double-quotes count, not single-quotes. 
- """ - res = [] - part = '' - - escape = quote = False - for cur in s: - if escape: - part += cur - escape = False - continue - if quote: - if cur == '\\': - escape = True - continue - elif cur == '"': - quote = False - part += cur - continue - - if cur == ',': - res.append(part) - part = '' - continue - - if cur == '"': - quote = True - - part += cur - - # append last part - if part: - res.append(part) - - return [part.strip() for part in res] - -class FileHandler(BaseHandler): - # Use local file or FTP depending on form of URL - def file_open(self, req): - url = req.get_selector() - if url[:2] == '//' and url[2:3] != '/': - req.type = 'ftp' - return self.parent.open(req) - else: - return self.open_local_file(req) - - # names for the localhost - names = None - def get_names(self): - if FileHandler.names is None: - try: - FileHandler.names = (socket.gethostbyname('localhost'), - socket.gethostbyname(socket.gethostname())) - except socket.gaierror: - FileHandler.names = (socket.gethostbyname('localhost'),) - return FileHandler.names - - # not entirely sure what the rules are here - def open_local_file(self, req): - import email.utils - import mimetypes - host = req.get_host() - file = req.get_selector() - localfile = url2pathname(file) - try: - stats = os.stat(localfile) - size = stats.st_size - modified = email.utils.formatdate(stats.st_mtime, usegmt=True) - mtype = mimetypes.guess_type(file)[0] - headers = email.message_from_string( - 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % - (mtype or 'text/plain', size, modified)) - if host: - host, port = splitport(host) - if not host or \ - (not port and _safe_gethostbyname(host) in self.get_names()): - return addinfourl(open(localfile, 'rb'), - headers, 'file:'+file) - except OSError as msg: - # urllib2 users shouldn't expect OSErrors coming from urlopen() - raise URLError(msg) - raise URLError('file not on local host') - -def _safe_gethostbyname(host): - try: - return socket.gethostbyname(host) - except socket.gaierror: - return None - -class FTPHandler(BaseHandler): - def ftp_open(self, req): - import ftplib - import mimetypes - host = req.get_host() - if not host: - raise URLError('ftp error: no host given') - host, port = splitport(host) - if port is None: - port = ftplib.FTP_PORT - else: - port = int(port) - - # username/password handling - user, host = splituser(host) - if user: - user, passwd = splitpasswd(user) - else: - passwd = None - host = unquote(host) - user = unquote(user or '') - passwd = unquote(passwd or '') - - try: - host = socket.gethostbyname(host) - except socket.error as msg: - raise URLError(msg) - path, attrs = splitattr(req.get_selector()) - dirs = path.split('/') - dirs = list(map(unquote, dirs)) - dirs, file = dirs[:-1], dirs[-1] - if dirs and not dirs[0]: - dirs = dirs[1:] - try: - fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) - type = file and 'I' or 'D' - for attr in attrs: - attr, value = splitvalue(attr) - if attr.lower() == 'type' and \ - value in ('a', 'A', 'i', 'I', 'd', 'D'): - type = value.upper() - fp, retrlen = fw.retrfile(file, type) - headers = "" - mtype = mimetypes.guess_type(req.get_full_url())[0] - if mtype: - headers += "Content-type: %s\n" % mtype - if retrlen is not None and retrlen >= 0: - headers += "Content-length: %d\n" % retrlen - headers = email.message_from_string(headers) - sf = StringIO(str(headers)) - return addinfourl(fp, headers, req.get_full_url()) - except ftplib.all_errors as msg: - raise URLError('ftp error: %s' % 
msg).with_traceback(sys.exc_info()[2]) - - def connect_ftp(self, user, passwd, host, port, dirs, timeout): - fw = ftpwrapper(user, passwd, host, port, dirs, timeout) - return fw - -class CacheFTPHandler(FTPHandler): - # XXX would be nice to have pluggable cache strategies - # XXX this stuff is definitely not thread safe - def __init__(self): - self.cache = {} - self.timeout = {} - self.soonest = 0 - self.delay = 60 - self.max_conns = 16 - - def setTimeout(self, t): - self.delay = t - - def setMaxConns(self, m): - self.max_conns = m - - def connect_ftp(self, user, passwd, host, port, dirs, timeout): - key = user, host, port, '/'.join(dirs), timeout - if key in self.cache: - self.timeout[key] = time.time() + self.delay - else: - self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout) - self.timeout[key] = time.time() + self.delay - self.check_cache() - return self.cache[key] - - def check_cache(self): - # first check for old ones - t = time.time() - if self.soonest <= t: - for k, v in list(self.timeout.items()): - if v < t: - self.cache[k].close() - del self.cache[k] - del self.timeout[k] - self.soonest = min(list(self.timeout.values())) - - # then check the size - if len(self.cache) == self.max_conns: - for k, v in list(self.timeout.items()): - if v == self.soonest: - del self.cache[k] - del self.timeout[k] - break - self.soonest = min(list(self.timeout.values())) diff --git a/Lib/urlparse.py b/Lib/urlparse.py deleted file mode 100644 index 30de699..0000000 --- a/Lib/urlparse.py +++ /dev/null @@ -1,325 +0,0 @@ -"""Parse (absolute and relative) URLs. - -See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, -UC Irvine, June 1995. -""" - -__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", - "urlsplit", "urlunsplit"] - -# A classification of schemes ('' means apply by default) -uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', - 'wais', 'file', 'https', 'shttp', 'mms', - 'prospero', 'rtsp', 'rtspu', '', 'sftp'] -uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', - 'imap', 'wais', 'file', 'mms', 'https', 'shttp', - 'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '', - 'svn', 'svn+ssh', 'sftp'] -non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', - 'telnet', 'wais', 'imap', 'snews', 'sip', 'sips'] -uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', - 'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips', - 'mms', '', 'sftp'] -uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', - 'gopher', 'rtsp', 'rtspu', 'sip', 'sips', ''] -uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', - 'nntp', 'wais', 'https', 'shttp', 'snews', - 'file', 'prospero', ''] - -# Characters valid in scheme names -scheme_chars = ('abcdefghijklmnopqrstuvwxyz' - 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - '0123456789' - '+-.') - -MAX_CACHE_SIZE = 20 -_parse_cache = {} - -def clear_cache(): - """Clear the parse cache.""" - _parse_cache.clear() - - -class ResultMixin(object): - """Shared methods for the parsed result objects.""" - - @property - def username(self): - netloc = self.netloc - if "@" in netloc: - userinfo = netloc.rsplit("@", 1)[0] - if ":" in userinfo: - userinfo = userinfo.split(":", 1)[0] - return userinfo - return None - - @property - def password(self): - netloc = self.netloc - if "@" in netloc: - userinfo = netloc.rsplit("@", 1)[0] - if ":" in userinfo: - return userinfo.split(":", 1)[1] - return None - - @property - def hostname(self): - netloc = self.netloc - if "@" in netloc: - netloc = netloc.rsplit("@", 1)[1] - if ":" in netloc: - netloc = 
netloc.split(":", 1)[0] - return netloc.lower() or None - - @property - def port(self): - netloc = self.netloc - if "@" in netloc: - netloc = netloc.rsplit("@", 1)[1] - if ":" in netloc: - port = netloc.split(":", 1)[1] - return int(port, 10) - return None - -from collections import namedtuple - -class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin): - - __slots__ = () - - def geturl(self): - return urlunsplit(self) - - -class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin): - - __slots__ = () - - def geturl(self): - return urlunparse(self) - - -def urlparse(url, scheme='', allow_fragments=True): - """Parse a URL into 6 components: - :///;?# - Return a 6-tuple: (scheme, netloc, path, params, query, fragment). - Note that we don't break the components up in smaller bits - (e.g. netloc is a single string) and we don't expand % escapes.""" - tuple = urlsplit(url, scheme, allow_fragments) - scheme, netloc, url, query, fragment = tuple - if scheme in uses_params and ';' in url: - url, params = _splitparams(url) - else: - params = '' - return ParseResult(scheme, netloc, url, params, query, fragment) - -def _splitparams(url): - if '/' in url: - i = url.find(';', url.rfind('/')) - if i < 0: - return url, '' - else: - i = url.find(';') - return url[:i], url[i+1:] - -def _splitnetloc(url, start=0): - delim = len(url) # position of end of domain part of url, default is end - for c in '/?#': # look for delimiters; the order is NOT important - wdelim = url.find(c, start) # find first of this delim - if wdelim >= 0: # if found - delim = min(delim, wdelim) # use earliest delim position - return url[start:delim], url[delim:] # return (domain, rest) - -def urlsplit(url, scheme='', allow_fragments=True): - """Parse a URL into 5 components: - :///?# - Return a 5-tuple: (scheme, netloc, path, query, fragment). - Note that we don't break the components up in smaller bits - (e.g. netloc is a single string) and we don't expand % escapes.""" - allow_fragments = bool(allow_fragments) - key = url, scheme, allow_fragments, type(url), type(scheme) - cached = _parse_cache.get(key, None) - if cached: - return cached - if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth - clear_cache() - netloc = query = fragment = '' - i = url.find(':') - if i > 0: - if url[:i] == 'http': # optimize the common case - scheme = url[:i].lower() - url = url[i+1:] - if url[:2] == '//': - netloc, url = _splitnetloc(url, 2) - if allow_fragments and '#' in url: - url, fragment = url.split('#', 1) - if '?' in url: - url, query = url.split('?', 1) - v = SplitResult(scheme, netloc, url, query, fragment) - _parse_cache[key] = v - return v - for c in url[:i]: - if c not in scheme_chars: - break - else: - scheme, url = url[:i].lower(), url[i+1:] - if scheme in uses_netloc and url[:2] == '//': - netloc, url = _splitnetloc(url, 2) - if allow_fragments and scheme in uses_fragment and '#' in url: - url, fragment = url.split('#', 1) - if scheme in uses_query and '?' in url: - url, query = url.split('?', 1) - v = SplitResult(scheme, netloc, url, query, fragment) - _parse_cache[key] = v - return v - -def urlunparse(components): - """Put a parsed URL back together again. This may result in a - slightly different, but equivalent URL, if the URL that was parsed - originally had redundant delimiters, e.g. a ? 
with an empty query - (the draft states that these are equivalent).""" - scheme, netloc, url, params, query, fragment = components - if params: - url = "%s;%s" % (url, params) - return urlunsplit((scheme, netloc, url, query, fragment)) - -def urlunsplit(components): - scheme, netloc, url, query, fragment = components - if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'): - if url and url[:1] != '/': url = '/' + url - url = '//' + (netloc or '') + url - if scheme: - url = scheme + ':' + url - if query: - url = url + '?' + query - if fragment: - url = url + '#' + fragment - return url - -def urljoin(base, url, allow_fragments=True): - """Join a base URL and a possibly relative URL to form an absolute - interpretation of the latter.""" - if not base: - return url - if not url: - return base - bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ - urlparse(base, '', allow_fragments) - scheme, netloc, path, params, query, fragment = \ - urlparse(url, bscheme, allow_fragments) - if scheme != bscheme or scheme not in uses_relative: - return url - if scheme in uses_netloc: - if netloc: - return urlunparse((scheme, netloc, path, - params, query, fragment)) - netloc = bnetloc - if path[:1] == '/': - return urlunparse((scheme, netloc, path, - params, query, fragment)) - if not (path or params or query): - return urlunparse((scheme, netloc, bpath, - bparams, bquery, fragment)) - segments = bpath.split('/')[:-1] + path.split('/') - # XXX The stuff below is bogus in various ways... - if segments[-1] == '.': - segments[-1] = '' - while '.' in segments: - segments.remove('.') - while 1: - i = 1 - n = len(segments) - 1 - while i < n: - if (segments[i] == '..' - and segments[i-1] not in ('', '..')): - del segments[i-1:i+1] - break - i = i+1 - else: - break - if segments == ['', '..']: - segments[-1] = '' - elif len(segments) >= 2 and segments[-1] == '..': - segments[-2:] = [''] - return urlunparse((scheme, netloc, '/'.join(segments), - params, query, fragment)) - -def urldefrag(url): - """Removes any existing fragment from URL. - - Returns a tuple of the defragmented URL and the fragment. If - the URL contained no fragments, the second element is the - empty string. - """ - if '#' in url: - s, n, p, a, q, frag = urlparse(url) - defrag = urlunparse((s, n, p, a, q, '')) - return defrag, frag - else: - return url, '' - - -test_input = """ - http://a/b/c/d - - g:h = - http:g = - http: = - g = - ./g = - g/ = - /g = - //g = - ?y = - g?y = - g?y/./x = - . = - ./ = - .. = - ../ = - ../g = - ../.. = - ../../g = - ../../../g = - ./../g = - ./g/. = - /./g = - g/./h = - g/../h = - http:g = - http: = - http:?y = - http:g?y = - http:g?y/./x = -""" - -def test(): - import sys - base = '' - if sys.argv[1:]: - fn = sys.argv[1] - if fn == '-': - fp = sys.stdin - else: - fp = open(fn) - else: - from io import StringIO - fp = StringIO(test_input) - for line in fp: - words = line.split() - if not words: - continue - url = words[0] - parts = urlparse(url) - print('%-10s : %s' % (url, parts)) - abs = urljoin(base, url) - if not base: - base = abs - wrapped = '' % abs - print('%-10s = %s' % (url, wrapped)) - if len(words) == 3 and words[1] == '=': - if wrapped != words[2]: - print('EXPECTED', words[2], '!!!!!!!!!!') - -if __name__ == '__main__': - test() diff --git a/Lib/wsgiref/simple_server.py b/Lib/wsgiref/simple_server.py index 44a91fa..a82c80a 100644 --- a/Lib/wsgiref/simple_server.py +++ b/Lib/wsgiref/simple_server.py @@ -11,7 +11,8 @@ module. 
See also the BaseHTTPServer module docs for other API information. """ from http.server import BaseHTTPRequestHandler, HTTPServer -import urllib, sys +import sys +import urllib.parse from wsgiref.handlers import SimpleHandler __version__ = "0.1" @@ -93,7 +94,7 @@ class WSGIRequestHandler(BaseHTTPRequestHandler): else: path,query = self.path,'' - env['PATH_INFO'] = urllib.unquote(path) + env['PATH_INFO'] = urllib.parse.unquote(path) env['QUERY_STRING'] = query host = self.address_string() diff --git a/Lib/wsgiref/util.py b/Lib/wsgiref/util.py index a4ca02f..2686b66 100644 --- a/Lib/wsgiref/util.py +++ b/Lib/wsgiref/util.py @@ -50,7 +50,7 @@ def guess_scheme(environ): def application_uri(environ): """Return the application's base URI (no PATH_INFO or QUERY_STRING)""" url = environ['wsgi.url_scheme']+'://' - from urllib import quote + from urllib.parse import quote if environ.get('HTTP_HOST'): url += environ['HTTP_HOST'] @@ -70,7 +70,7 @@ def application_uri(environ): def request_uri(environ, include_query=1): """Return the full request URI, optionally including the query string""" url = application_uri(environ) - from urllib import quote + from urllib.parse import quote path_info = quote(environ.get('PATH_INFO','')) if not environ.get('SCRIPT_NAME'): url += path_info[1:] diff --git a/Lib/xml/dom/xmlbuilder.py b/Lib/xml/dom/xmlbuilder.py index dc7c5d4..d798624 100644 --- a/Lib/xml/dom/xmlbuilder.py +++ b/Lib/xml/dom/xmlbuilder.py @@ -190,8 +190,8 @@ class DOMBuilder: options.errorHandler = self.errorHandler fp = input.byteStream if fp is None and options.systemId: - import urllib2 - fp = urllib2.urlopen(input.systemId) + import urllib.request + fp = urllib.request.urlopen(input.systemId) return self._parse_bytestream(fp, options) def parseWithContext(self, input, cnode, action): @@ -223,14 +223,14 @@ class DOMEntityResolver(object): source.encoding = self._guess_media_encoding(source) # determine the base URI is we can - import posixpath, urlparse - parts = urlparse.urlparse(systemId) + import posixpath, urllib.parse + parts = urllib.parse.urlparse(systemId) scheme, netloc, path, params, query, fragment = parts # XXX should we check the scheme here as well? if path and not path.endswith("/"): path = posixpath.dirname(path) + "/" parts = scheme, netloc, path, params, query, fragment - source.baseURI = urlparse.urlunparse(parts) + source.baseURI = urllib.parse.urlunparse(parts) return source @@ -242,8 +242,8 @@ class DOMEntityResolver(object): return self._opener def _create_opener(self): - import urllib2 - return urllib2.build_opener() + import urllib.request + return urllib.request.build_opener() def _guess_media_encoding(self, source): info = source.byteStream.info() diff --git a/Lib/xml/sax/saxutils.py b/Lib/xml/sax/saxutils.py index a569c1d..e5d43a5 100644 --- a/Lib/xml/sax/saxutils.py +++ b/Lib/xml/sax/saxutils.py @@ -3,7 +3,7 @@ A library of useful helper classes to the SAX classes, for the convenience of application and driver writers. """ -import os, urlparse, urllib +import os, urllib.parse, urllib.request from . import handler from . 
import xmlreader @@ -289,8 +289,8 @@ def prepare_input_source(source, base = ""): source.setSystemId(sysidfilename) f = open(sysidfilename, "rb") else: - source.setSystemId(urlparse.urljoin(base, sysid)) - f = urllib.urlopen(source.getSystemId()) + source.setSystemId(urllib.parse.urljoin(base, sysid)) + f = urllib.request.urlopen(source.getSystemId()) source.setByteStream(f) diff --git a/Lib/xmlrpc/client.py b/Lib/xmlrpc/client.py index 6868d3b..121fedf 100644 --- a/Lib/xmlrpc/client.py +++ b/Lib/xmlrpc/client.py @@ -1160,12 +1160,12 @@ class Transport: if isinstance(host, tuple): host, x509 = host - import urllib - auth, host = urllib.splituser(host) + import urllib.parse + auth, host = urllib.parse.splituser(host) if auth: import base64 - auth = base64.encodestring(urllib.unquote(auth)) + auth = base64.encodestring(urllib.parse.unquote(auth)) auth = "".join(auth.split()) # get rid of whitespace extra_headers = [ ("Authorization", "Basic " + auth) @@ -1321,11 +1321,11 @@ class ServerProxy: # establish a "logical" server connection # get the url - import urllib - type, uri = urllib.splittype(uri) + import urllib.parse + type, uri = urllib.parse.splittype(uri) if type not in ("http", "https"): raise IOError("unsupported XML-RPC protocol") - self.__host, self.__handler = urllib.splithost(uri) + self.__host, self.__handler = urllib.parse.splithost(uri) if not self.__handler: self.__handler = "/RPC2" diff --git a/Makefile.pre.in b/Makefile.pre.in index 0d89d12..3de0dbd 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -809,7 +809,7 @@ LIBSUBDIRS= tkinter site-packages test test/output test/data \ email email/mime email/test email/test/data \ html json json/tests http dbm xmlrpc \ sqlite3 sqlite3/test \ - logging bsddb bsddb/test csv wsgiref \ + logging bsddb bsddb/test csv wsgiref urllib \ lib2to3 lib2to3/fixes lib2to3/pgen2 lib2to3/tests \ ctypes ctypes/test ctypes/macholib idlelib idlelib/Icons \ distutils distutils/command distutils/tests $(XMLLIBSUBDIRS) \ diff --git a/Misc/NEWS b/Misc/NEWS index 07c5edd..fe4c07c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -81,6 +81,15 @@ Extension Modules Library ------- +- a new ``urllib`` package was created. It consists of code from + ``urllib``, ``urllib2``, ``urlparse``, and ``robotparser``. The old + modules have all been removed. The new package has five submodules: + ``urllib.parse``, ``urllib.request``, ``urllib.response``, + ``urllib.error``, and ``urllib.robotparser``. The + ``urllib.request.urlopen()`` function uses the url opener from + ``urllib2``. (Note that the unittests have not been renamed for the + beta, but they will be renamed in the future.) + - rfc822 has been removed in favor of the email package. - mimetools has been removed in favor of the email package. -- cgit v0.12
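
The following is a minimal sketch of how the relocated parsing helpers are used once this patch is applied, assuming only that urllib.parse exposes the same urlparse/urljoin/quote/unquote callables that the deleted urlparse.py and the old urllib module defined; the URL is purely illustrative.

    import urllib.parse

    # urlparse() still returns a 6-field named tuple, as in the old urlparse module
    parts = urllib.parse.urlparse('http://www.example.com/a/b;p?q=1#frag')
    print(parts.scheme, parts.netloc, parts.path, parts.query)

    # urljoin() resolves a relative reference against a base URL
    print(urllib.parse.urljoin('http://www.example.com/a/b', '../c'))

    # quote()/unquote() replace the old urllib.quote()/urllib.unquote()
    print(urllib.parse.quote('hello world', safe=''))
    print(urllib.parse.unquote('hello%20world'))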
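As a rough guide to the renaming, here is the usage example from the removed urllib2 docstring rewritten against the new package; only the module names change, and the realm, URI, credentials, and proxy address are the illustrative values from that docstring.

    import urllib.request

    # set up authentication info
    authinfo = urllib.request.HTTPBasicAuthHandler()
    authinfo.add_password(realm='PDQ Application',
                          uri='https://mahler:8092/site-updates.py',
                          user='klem',
                          passwd='geheim$parole')

    proxy_support = urllib.request.ProxyHandler({"http": "http://ahad-haam:3128"})

    # build a new opener that adds authentication and caching FTP handlers
    opener = urllib.request.build_opener(proxy_support, authinfo,
                                         urllib.request.CacheFTPHandler)

    # install it so urllib.request.urlopen() uses it
    urllib.request.install_opener(opener)

    f = urllib.request.urlopen('http://www.python.org/')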
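A hedged sketch of calling the new urlopen() and handling failures, assuming urllib.error carries the URLError and HTTPError classes that previously lived in urllib2; the URL is illustrative. On success the returned object is an addinfourl from urllib.response, so info(), geturl(), and getcode() are available.

    import urllib.request
    import urllib.error

    try:
        response = urllib.request.urlopen('http://www.example.com/')
    except urllib.error.HTTPError as err:
        # an HTTPError can also be treated as a (non-2xx) response
        print('HTTP error:', err.code, err.msg)
    except urllib.error.URLError as err:
        print('failed to reach the server:', err.reason)
    else:
        print(response.getcode(), response.geturl())
        print(response.info())        # message object holding the response headers
        body = response.read()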
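Finally, a small sketch of the relocated robots.txt parser, using only the methods defined in the new Lib/urllib/robotparser.py above; the host and user agent are made up.

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')
    rp.read()                          # fetches the file via urllib.request.urlopen()

    if rp.can_fetch('ExampleBot/1.0', 'http://www.example.com/private/index.html'):
        print('fetch allowed')
    else:
        print('fetch disallowed by robots.txt')
    print('robots.txt last fetched at', rp.mtime())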