diff options
-rw-r--r-- | Doc/lib/liburllib2.tex | 32 | ||||
-rw-r--r-- | Lib/test/test_urllib2.py | 643 | ||||
-rw-r--r-- | Lib/urllib2.py | 194 |
3 files changed, 783 insertions, 86 deletions
diff --git a/Doc/lib/liburllib2.tex b/Doc/lib/liburllib2.tex index 3f8ff3a..0020b6c 100644 --- a/Doc/lib/liburllib2.tex +++ b/Doc/lib/liburllib2.tex @@ -52,7 +52,7 @@ front of the \var{handler}s, unless the \var{handler}s contain them, instances of them or subclasses of them: \class{ProxyHandler}, \class{UnknownHandler}, \class{HTTPHandler}, \class{HTTPDefaultErrorHandler}, \class{HTTPRedirectHandler}, -\class{FTPHandler}, \class{FileHandler} +\class{FTPHandler}, \class{FileHandler}, \class{HTTPErrorProcessor}. If the Python installation has SSL support (\function{socket.ssl()} exists), \class{HTTPSHandler} will also be added. @@ -248,6 +248,15 @@ when used more than once have a (header-specific) way of gaining the same functionality using only one header. \end{methoddesc} +\begin{methoddesc}[Request]{add_unredirected_header}{key, header} +Add a header that will not be added to a redirected request. +\end{methoddesc} + +\begin{methoddesc}[Request]{has_header}{header} +Return whether the instance has the named header (checks both regular +and unredirected). +\end{methoddesc} + \begin{methoddesc}[Request]{get_full_url}{} Return the URL given in the constructor. \end{methoddesc} @@ -286,6 +295,12 @@ following methods are searched, and added to the possible chains. \item \method{\var{protocol}_error_\var{type}()} --- signal that the handler knows how to handle \var{type} errors from \var{protocol}. + \item \method{\var{protocol}_request()} --- + signal that the handler knows how to pre-process \var{protocol} + requests. + \item \method{\var{protocol}_response()} --- + signal that the handler knows how to post-process \var{protocol} + responses. \end{itemize} \end{methoddesc} @@ -620,6 +635,21 @@ Raise a \exception{URLError} exception. \end{methoddesc} +\subsection{HTTPErrorProcessor Objects \label{http-error-processor-objects}} + +\begin{methoddesc}[HTTPErrorProcessor]{http_response}{request, response} +Process HTTP error responses. 
+ +For 200 response codes, the response object is returned immediately. + +For non-200 response codes, this simply passes the job on to the +\method{\var{protocol}_error_\var{code}()} handler methods, via +\method{OpenerDirector.error()}. Eventually, +\class{urllib2.HTTPDefaultErrorHandler} will raise an +\exception{HTTPError} if no other handler handles the error. +\end{methoddesc} + + \subsection{Examples \label{urllib2-examples}} This example gets the python.org main page and displays the first 100 diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index 8238bca..d8bf83e 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -1,31 +1,616 @@ -from test.test_support import verify -import urllib2 +import unittest +from test import test_support + import os +import StringIO + +import urllib2 +from urllib2 import Request, OpenerDirector + +# XXX +# Request +# CacheFTPHandler (hard to write) +# parse_keqv_list, parse_http_list (I'm leaving this for Anthony Baxter +# and Greg Stein, since they're doing Digest Authentication) +# Authentication stuff (ditto) +# ProxyHandler, CustomProxy, CustomProxyHandler (I don't use a proxy) +# GopherHandler (haven't used gopher for a decade or so...) + +class TrivialTests(unittest.TestCase): + def test_trivial(self): + # A couple trivial tests + + self.assertRaises(ValueError, urllib2.urlopen, 'bogus url') + + # XXX Name hacking to get this to work on Windows. + fname = os.path.abspath(urllib2.__file__).replace('\\', '/') + if fname[1:2] == ":": + fname = fname[2:] + # And more hacking to get it to work on MacOS. This assumes + # urllib.pathname2url works, unfortunately... 
+ if os.name == 'mac': + fname = '/' + fname.replace(':', '/') + elif os.name == 'riscos': + import string + fname = os.expand(fname) + fname = fname.translate(string.maketrans("/.", "./")) + + file_url = "file://%s" % fname + f = urllib2.urlopen(file_url) + + buf = f.read() + f.close() + + +class MockOpener: + addheaders = [] + def open(self, req, data=None): + self.req, self.data = req, data + def error(self, proto, *args): + self.proto, self.args = proto, args + +class MockFile: + def read(self, count=None): pass + def readline(self, count=None): pass + def close(self): pass + +class MockResponse(StringIO.StringIO): + def __init__(self, code, msg, headers, data, url=None): + StringIO.StringIO.__init__(self, data) + self.code, self.msg, self.headers, self.url = code, msg, headers, url + def info(self): + return self.headers + def geturl(self): + return self.url + +class FakeMethod: + def __init__(self, meth_name, action, handle): + self.meth_name = meth_name + self.handle = handle + self.action = action + def __call__(self, *args): + return self.handle(self.meth_name, self.action, *args) + +class MockHandler: + def __init__(self, methods): + self._define_methods(methods) + def _define_methods(self, methods): + for spec in methods: + if len(spec) == 2: name, action = spec + else: name, action = spec, None + meth = FakeMethod(name, action, self.handle) + setattr(self.__class__, name, meth) + def handle(self, fn_name, action, *args, **kwds): + self.parent.calls.append((self, fn_name, args, kwds)) + if action is None: + return None + elif action == "return self": + return self + elif action == "return response": + res = MockResponse(200, "OK", {}, "") + return res + elif action == "return request": + return Request("http://blah/") + elif action.startswith("error"): + code = action[action.rfind(" ")+1:] + try: + code = int(code) + except ValueError: + pass + res = MockResponse(200, "OK", {}, "") + return self.parent.error("http", args[0], res, code, "", {}) + elif 
action == "raise": + raise urllib2.URLError("blah") + assert False + def close(self): pass + def add_parent(self, parent): + self.parent = parent + self.parent.calls = [] + def __lt__(self, other): + if not hasattr(other, "handler_order"): + # No handler_order, leave in original order. Yuck. + return True + return self.handler_order < other.handler_order + +def add_ordered_mock_handlers(opener, meth_spec): + """Create MockHandlers and add them to an OpenerDirector. + + meth_spec: list of lists of tuples and strings defining methods to define + on handlers. eg: + + [["http_error", "ftp_open"], ["http_open"]] + + defines methods .http_error() and .ftp_open() on one handler, and + .http_open() on another. These methods just record their arguments and + return None. Using a tuple instead of a string causes the method to + perform some action (see MockHandler.handle()), eg: + + [["http_error"], [("http_open", "return request")]] + + defines .http_error() on one handler (which simply returns None), and + .http_open() on another handler, which returns a Request object. + + """ + handlers = [] + count = 0 + for meths in meth_spec: + class MockHandlerSubclass(MockHandler): pass + h = MockHandlerSubclass(meths) + h.handler_order = count + h.add_parent(opener) + count = count + 1 + handlers.append(h) + opener.add_handler(h) + return handlers + +class OpenerDirectorTests(unittest.TestCase): + + def test_handled(self): + # handler returning non-None means no more handlers will be called + o = OpenerDirector() + meth_spec = [ + ["http_open", "ftp_open", "http_error_302"], + ["ftp_open"], + [("http_open", "return self")], + [("http_open", "return self")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://example.com/") + r = o.open(req) + # Second .http_open() gets called, third doesn't, since second returned + # non-None. Handlers without .http_open() never get any methods called + # on them. 
+ # In fact, second mock handler defining .http_open() returns self + # (instead of response), which becomes the OpenerDirector's return + # value. + self.assert_(r == handlers[2]) + calls = [(handlers[0], "http_open"), (handlers[2], "http_open")] + for expected, got in zip(calls, o.calls): + handler, name, args, kwds = got + self.assert_((handler, name) == expected) + self.assert_(args == (req,)) + + def test_handler_order(self): + o = OpenerDirector() + handlers = [] + for meths, handler_order in [ + ([("http_open", "return self")], 500), + (["http_open"], 0), + ]: + class MockHandlerSubclass(MockHandler): pass + h = MockHandlerSubclass(meths) + h.handler_order = handler_order + handlers.append(h) + o.add_handler(h) + + r = o.open("http://example.com/") + # handlers called in reverse order, thanks to their sort order + self.assert_(o.calls[0][0] == handlers[1]) + self.assert_(o.calls[1][0] == handlers[0]) + + def test_raise(self): + # raising URLError stops processing of request + o = OpenerDirector() + meth_spec = [ + [("http_open", "raise")], + [("http_open", "return self")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://example.com/") + self.assertRaises(urllib2.URLError, o.open, req) + self.assert_(o.calls == [(handlers[0], "http_open", (req,), {})]) + +## def test_error(self): +## # XXX this doesn't actually seem to be used in standard library, +## # but should really be tested anyway... 
+ + def test_http_error(self): + # XXX http_error_default + # http errors are a special case + o = OpenerDirector() + meth_spec = [ + [("http_open", "error 302")], + [("http_error_400", "raise"), "http_open"], + [("http_error_302", "return response"), "http_error_303", + "http_error"], + [("http_error_302")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + class Unknown: + def __eq__(self, other): return True + + req = Request("http://example.com/") + r = o.open(req) + assert len(o.calls) == 2 + calls = [(handlers[0], "http_open", (req,)), + (handlers[2], "http_error_302", (req, Unknown(), 302, "", {}))] + for expected, got in zip(calls, o.calls): + handler, method_name, args = expected + self.assert_((handler, method_name) == got[:2]) + assert args == got[2] + + def test_processors(self): + # *_request / *_response methods get called appropriately + o = OpenerDirector() + meth_spec = [ + [("http_request", "return request"), + ("http_response", "return response")], + [("http_request", "return request"), + ("http_response", "return response")], + ] + handlers = add_ordered_mock_handlers(o, meth_spec) + + req = Request("http://example.com/") + r = o.open(req) + # processor methods are called on *all* handlers that define them, + # not just the first handler that handles the request + calls = [(handlers[0], "http_request"), (handlers[1], "http_request"), + (handlers[0], "http_response"), (handlers[1], "http_response")] + + for i, (handler, name, args, kwds) in enumerate(o.calls): + if i < 2: + # *_request + self.assert_((handler, name) == calls[i]) + self.assert_(len(args) == 1) + self.assert_(isinstance(args[0], Request)) + else: + # *_response + self.assert_((handler, name) == calls[i]) + self.assert_(len(args) == 2) + self.assert_(isinstance(args[0], Request)) + # response from opener.open is None, because there's no + # handler that defines http_open to handle it + self.assert_(args[1] is None or + isinstance(args[1], MockResponse)) + + +class 
HandlerTests(unittest.TestCase): + + def test_ftp(self): + class MockFTPWrapper: + def __init__(self, data): self.data = data + def retrfile(self, filename, filetype): + self.filename, self.filetype = filename, filetype + return StringIO.StringIO(self.data), len(self.data) + + class NullFTPHandler(urllib2.FTPHandler): + def __init__(self, data): self.data = data + def connect_ftp(self, user, passwd, host, port, dirs): + self.user, self.passwd = user, passwd + self.host, self.port = host, port + self.dirs = dirs + self.ftpwrapper = MockFTPWrapper(self.data) + return self.ftpwrapper + + import ftplib, socket + data = "rheum rhaponicum" + h = NullFTPHandler(data) + o = h.parent = MockOpener() + + for url, host, port, type_, dirs, filename, mimetype in [ + ("ftp://localhost/foo/bar/baz.html", + "localhost", ftplib.FTP_PORT, "I", + ["foo", "bar"], "baz.html", "text/html"), +# XXXX Bug: FTPHandler tries to gethostbyname "localhost:80", with the +# port still there. +## ("ftp://localhost:80/foo/bar/", +## "localhost", 80, "D", +## ["foo", "bar"], "", None), +# XXXX bug: second use of splitattr() in FTPHandler should be splitvalue() +## ("ftp://localhost/baz.gif;type=a", +## "localhost", ftplib.FTP_PORT, "A", +## [], "baz.gif", "image/gif"), + ]: + r = h.ftp_open(Request(url)) + # ftp authentication not yet implemented by FTPHandler + self.assert_(h.user == h.passwd == "") + self.assert_(h.host == socket.gethostbyname(host)) + self.assert_(h.port == port) + self.assert_(h.dirs == dirs) + self.assert_(h.ftpwrapper.filename == filename) + self.assert_(h.ftpwrapper.filetype == type_) + headers = r.info() + self.assert_(headers["Content-type"] == mimetype) + self.assert_(int(headers["Content-length"]) == len(data)) + + def test_file(self): + import time, rfc822, socket + h = urllib2.FileHandler() + o = h.parent = MockOpener() + + #from test_support import TESTFN + TESTFN = "test.txt" + towrite = "hello, world\n" + for url in [ + "file://localhost%s/%s" % (os.getcwd(), TESTFN), 
+ "file://%s/%s" % (os.getcwd(), TESTFN), + "file://%s%s/%s" % (socket.gethostbyname('localhost'), + os.getcwd(), TESTFN), + "file://%s%s/%s" % (socket.gethostbyname(socket.gethostname()), + os.getcwd(), TESTFN), + # XXX Windows / Mac format(s), ... ? + ]: + f = open(TESTFN, "w") + try: + try: + f.write(towrite) + finally: + f.close() + + r = h.file_open(Request(url)) + try: + data = r.read() + read_time = time.time() + headers = r.info() + newurl = r.geturl() + finally: + r.close() + finally: + os.remove(TESTFN) + self.assert_(data == towrite) + self.assert_(headers["Content-type"] == "text/plain") + self.assert_(headers["Content-length"] == "13") + # Fudge Last-modified string comparison by one second to + # prevent spurious failure on crossing a second boundary while + # executing this test. + unfudged = rfc822.formatdate(read_time) + fudged = rfc822.formatdate(read_time-1) + self.assert_(headers["Last-modified"] in [unfudged, fudged]) + + for url in [ + "file://localhost:80%s/%s" % (os.getcwd(), TESTFN), +# XXXX bug: these fail with socket.gaierror, should be URLError +## "file://%s:80%s/%s" % (socket.gethostbyname('localhost'), +## os.getcwd(), TESTFN), +## "file://somerandomhost.ontheinternet.com%s/%s" % +## (os.getcwd(), TESTFN), + ]: + try: + f = open(TESTFN, "w") + try: + f.write(towrite) + finally: + f.close() + + self.assertRaises(urllib2.URLError, + h.file_open, Request(url)) + finally: + os.remove(TESTFN) + + h = urllib2.FileHandler() + o = h.parent = MockOpener() + # XXXX why does // mean ftp (and /// mean not ftp!), and where + # is file: scheme specified? 
I think this is really a bug, and + # what was intended was to distinguish between URLs like: + # file:/blah.txt (a file) + # file://localhost/blah.txt (a file) + # file:///blah.txt (a file) + # file://ftp.example.com/blah.txt (an ftp URL) + for url, ftp in [ + ("file://ftp.example.com//foo.txt", True), + ("file://ftp.example.com///foo.txt", False), +# XXXX bug: fails with OSError, should be URLError + ("file://ftp.example.com/foo.txt", False), + ]: + req = Request(url) + try: + h.file_open(req) + # XXXX remove OSError when bug fixed + except (urllib2.URLError, OSError): + self.assert_(not ftp) + else: + self.assert_(o.req is req) + self.assert_(req.type == "ftp") + + def test_http(self): + class MockHTTPClass: + def __init__(self): + self.req_headers = [] + self.data = None + self.raise_on_endheaders = False + def __call__(self, host): + self.host = host + return self + def set_debuglevel(self, level): self.level = level + def putrequest(self, method, selector): + self.method, self.selector = method, selector + def putheader(self, key, value): + self.req_headers.append((key, value)) + def endheaders(self): + if self.raise_on_endheaders: + import socket + raise socket.error() + def send(self, data): self.data = data + def getreply(self): return 200, "OK", {} + def getfile(self): return MockFile() + + h = urllib2.AbstractHTTPHandler() + o = h.parent = MockOpener() + + url = "http://example.com/" + for method, data in [("GET", None), ("POST", "blah")]: + req = Request(url, data, {"Foo": "bar"}) + req.add_unredirected_header("Spam", "eggs") + http = MockHTTPClass() + r = h.do_open(http, req) + + # result attributes + r.read; r.readline # wrapped MockFile methods + r.info; r.geturl # addinfourl methods + r.code, r.msg == 200, "OK" # added from MockHTTPClass.getreply() + hdrs = r.info() + hdrs.get; hdrs.has_key # r.info() gives dict from .getreply() + self.assert_(r.geturl() == url) + + self.assert_(http.host == "example.com") + self.assert_(http.level == 0) + 
self.assert_(http.method == method) + self.assert_(http.selector == "/") + self.assert_(http.req_headers == [("Foo", "bar"), ("Spam", "eggs")]) + self.assert_(http.data == data) + + # check socket.error converted to URLError + http.raise_on_endheaders = True + self.assertRaises(urllib2.URLError, h.do_open, http, req) + + # check adding of standard headers + o.addheaders = [("Spam", "eggs")] + for data in "", None: # POST, GET + req = Request("http://example.com/", data) + r = MockResponse(200, "OK", {}, "") + newreq = h.do_request(req) + if data is None: # GET + self.assert_("Content-length" not in req.unredirected_hdrs) + self.assert_("Content-type" not in req.unredirected_hdrs) + else: # POST + self.assert_(req.unredirected_hdrs["Content-length"] == "0") + self.assert_(req.unredirected_hdrs["Content-type"] == + "application/x-www-form-urlencoded") + # XXX the details of Host could be better tested + self.assert_(req.unredirected_hdrs["Host"] == "example.com") + self.assert_(req.unredirected_hdrs["Spam"] == "eggs") + + # don't clobber existing headers + req.add_unredirected_header("Content-length", "foo") + req.add_unredirected_header("Content-type", "bar") + req.add_unredirected_header("Host", "baz") + req.add_unredirected_header("Spam", "foo") + newreq = h.do_request(req) + self.assert_(req.unredirected_hdrs["Content-length"] == "foo") + self.assert_(req.unredirected_hdrs["Content-type"] == "bar") + self.assert_(req.unredirected_hdrs["Host"] == "baz") + self.assert_(req.unredirected_hdrs["Spam"] == "foo") + + def test_errors(self): + h = urllib2.HTTPErrorProcessor() + o = h.parent = MockOpener() + + url = "http://example.com/" + req = Request(url) + # 200 OK is passed through + r = MockResponse(200, "OK", {}, "", url) + newr = h.http_response(req, r) + self.assert_(r is newr) + self.assert_(not hasattr(o, "proto")) # o.error not called + # anything else calls o.error (and MockOpener returns None, here) + r = MockResponse(201, "Created", {}, "", url) + 
self.assert_(h.http_response(req, r) is None) + self.assert_(o.proto == "http") # o.error called + self.assert_(o.args == (req, r, 201, "Created", {})) + + def test_redirect(self): + from_url = "http://example.com/a.html" + to_url = "http://example.com/b.html" + h = urllib2.HTTPRedirectHandler() + o = h.parent = MockOpener() + + # ordinary redirect behaviour + for code in 301, 302, 303, 307: + for data in None, "blah\nblah\n": + method = getattr(h, "http_error_%s" % code) + req = Request(from_url, data) + req.add_header("Nonsense", "viking=withhold") + req.add_unredirected_header("Spam", "spam") + try: + method(req, MockFile(), code, "Blah", {"location": to_url}) + except urllib2.HTTPError: + # 307 in response to POST requires user OK + self.assert_(code == 307 and data is not None) + self.assert_(o.req.get_full_url() == to_url) + try: + self.assert_(o.req.get_method() == "GET") + except AttributeError: + self.assert_(not o.req.has_data()) + self.assert_(o.req.headers["Nonsense"] == "viking=withhold") + self.assert_("Spam" not in o.req.headers) + self.assert_("Spam" not in o.req.unredirected_hdrs) + + # loop detection + req = Request(from_url) + req.origin_req_host = "example.com" + def redirect(h, req, code, url=to_url): + method = getattr(h, "http_error_%s" % code) + method(req, MockFile(), code, "Blah", {"location": url}) + # Note that the *original* request shares the same record of + # redirections with the sub-requests caused by the redirections. + # once + redirect(h, req, 302) + # twice: loop detected + self.assertRaises(urllib2.HTTPError, redirect, h, req, 302) + # and again + self.assertRaises(urllib2.HTTPError, redirect, h, req, 302) + # but this is a different redirect code, so OK... 
+ redirect(h, req, 301) + self.assertRaises(urllib2.HTTPError, redirect, h, req, 301) + # order doesn't matter + redirect(h, req, 303) + redirect(h, req, 307) + self.assertRaises(urllib2.HTTPError, redirect, h, req, 303) + + # detect endless non-repeating chain of redirects + req = Request(from_url) + req.origin_req_host = "example.com" + count = 0 + try: + while 1: + redirect(h, req, 302, "http://example.com/%d" % count) + count = count + 1 + except urllib2.HTTPError: + self.assert_(count == urllib2.HTTPRedirectHandler.max_redirections) + + +class MiscTests(unittest.TestCase): + + def test_build_opener(self): + class MyHTTPHandler(urllib2.HTTPHandler): pass + class FooHandler(urllib2.BaseHandler): + def foo_open(self): pass + class BarHandler(urllib2.BaseHandler): + def bar_open(self): pass + + build_opener = urllib2.build_opener + + o = build_opener(FooHandler, BarHandler) + self.opener_has_handler(o, FooHandler) + self.opener_has_handler(o, BarHandler) + + # can take a mix of classes and instances + o = build_opener(FooHandler, BarHandler()) + self.opener_has_handler(o, FooHandler) + self.opener_has_handler(o, BarHandler) + + # subclasses of default handlers override default handlers + o = build_opener(MyHTTPHandler) + self.opener_has_handler(o, MyHTTPHandler) + + # a particular case of overriding: default handlers can be passed + # in explicitly + o = build_opener() + self.opener_has_handler(o, urllib2.HTTPHandler) + o = build_opener(urllib2.HTTPHandler) + self.opener_has_handler(o, urllib2.HTTPHandler) + o = build_opener(urllib2.HTTPHandler()) + self.opener_has_handler(o, urllib2.HTTPHandler) + + def opener_has_handler(self, opener, handler_class): + for h in opener.handlers: + if h.__class__ == handler_class: + break + else: + self.assert_(False) + + +def test_main(verbose=None): + from test import test_sets + test_support.run_unittest( + TrivialTests, + OpenerDirectorTests, + HandlerTests, + MiscTests, + ) -# A couple trivial tests - -try: - 
urllib2.urlopen('bogus url') -except ValueError: - pass -else: - verify(0) - -# XXX Name hacking to get this to work on Windows. -fname = os.path.abspath(urllib2.__file__).replace('\\', '/') -if fname[1:2] == ":": - fname = fname[2:] -# And more hacking to get it to work on MacOS. This assumes -# urllib.pathname2url works, unfortunately... -if os.name == 'mac': - fname = '/' + fname.replace(':', '/') -elif os.name == 'riscos': - import string - fname = os.expand(fname) - fname = fname.translate(string.maketrans("/.", "./")) - -file_url = "file://%s" % fname -f = urllib2.urlopen(file_url) - -buf = f.read() -f.close() +if __name__ == "__main__": + test_main(verbose=True) diff --git a/Lib/urllib2.py b/Lib/urllib2.py index 5c90aea..3fbb5e3 100644 --- a/Lib/urllib2.py +++ b/Lib/urllib2.py @@ -105,6 +105,7 @@ import socket import sys import time import urlparse +import bisect try: from cStringIO import StringIO @@ -192,6 +193,7 @@ class Request: self.headers = {} for key, value in headers.items(): self.add_header(key, value) + self.unredirected_hdrs = {} def __getattr__(self, attr): # XXX this is a fallback mechanism to guard against these @@ -248,6 +250,15 @@ class Request: # useful for something like authentication self.headers[key.capitalize()] = val + def add_unredirected_header(self, key, val): + # will not be added to a redirected request + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + return bool(header_name in self.headers or + header_name in self.unredirected_hdrs) + + class OpenerDirector: def __init__(self): server_version = "Python-urllib/%s" % __version__ @@ -256,40 +267,44 @@ class OpenerDirector: self.handlers = [] self.handle_open = {} self.handle_error = {} + self.process_response = {} + self.process_request = {} def add_handler(self, handler): - added = 0 + added = False for meth in dir(handler): - if meth[-5:] == '_open': - protocol = meth[:-5] - if protocol in self.handle_open: - 
self.handle_open[protocol].append(handler) - self.handle_open[protocol].sort() - else: - self.handle_open[protocol] = [handler] - added = 1 - continue - i = meth.find('_') - j = meth[i+1:].find('_') + i + 1 - if j != -1 and meth[i+1:j] == 'error': - proto = meth[:i] + i = meth.find("_") + protocol = meth[:i] + condition = meth[i+1:] + + if condition.startswith("error"): + j = meth[i+1:].find("_") + i + 1 kind = meth[j+1:] try: kind = int(kind) except ValueError: pass - dict = self.handle_error.get(proto, {}) - if kind in dict: - dict[kind].append(handler) - dict[kind].sort() - else: - dict[kind] = [handler] - self.handle_error[proto] = dict - added = 1 + lookup = self.handle_error.get(protocol, {}) + self.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = getattr(self, "handle_"+condition) + elif condition in ["response", "request"]: + kind = protocol + lookup = getattr(self, "process_"+condition) + else: continue + + handlers = lookup.setdefault(kind, []) + if handlers: + bisect.insort(handlers, handler) + else: + handlers.append(handler) + added = True + if added: - self.handlers.append(handler) - self.handlers.sort() + # XXX why does self.handlers need to be sorted? 
+ bisect.insort(self.handlers, handler) handler.add_parent(self) def __del__(self): @@ -320,13 +335,32 @@ class OpenerDirector: if data is not None: req.add_data(data) + protocol = req.get_type() + + # pre-process request + meth_name = protocol+"_request" + for processor in self.process_request.get(protocol, []): + meth = getattr(processor, meth_name) + req = meth(req) + + response = self._open(req, data) + + # post-process response + meth_name = protocol+"_response" + for processor in self.process_response.get(protocol, []): + meth = getattr(processor, meth_name) + response = meth(req, response) + + return response + + def _open(self, req, data=None): result = self._call_chain(self.handle_open, 'default', 'default_open', req) if result: return result - type_ = req.get_type() - result = self._call_chain(self.handle_open, type_, type_ + \ + protocol = req.get_type() + result = self._call_chain(self.handle_open, protocol, protocol + '_open', req) if result: return result @@ -339,7 +373,7 @@ class OpenerDirector: # XXX http[s] protocols are special-cased dict = self.handle_error['http'] # https is not different than http proto = args[2] # YUCK! 
- meth_name = 'http_error_%d' % proto + meth_name = 'http_error_%s' % proto http_err = 1 orig_args = args else: @@ -372,7 +406,7 @@ def build_opener(*handlers): opener = OpenerDirector() default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, HTTPDefaultErrorHandler, HTTPRedirectHandler, - FTPHandler, FileHandler] + FTPHandler, FileHandler, HTTPErrorProcessor] if hasattr(httplib, 'HTTPS'): default_classes.append(HTTPSHandler) skip = [] @@ -400,8 +434,10 @@ class BaseHandler: def add_parent(self, parent): self.parent = parent + def close(self): self.parent = None + def __lt__(self, other): if not hasattr(other, "handler_order"): # Try to preserve the old behavior of having custom classes @@ -411,11 +447,29 @@ class BaseHandler: return self.handler_order < other.handler_order +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses.""" + handler_order = 1000 # after all other processing + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code != 200: + response = self.parent.error( + 'http', request, response, code, msg, hdrs) + + return response + + https_response = http_response + class HTTPDefaultErrorHandler(BaseHandler): def http_error_default(self, req, fp, code, msg, hdrs): raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) class HTTPRedirectHandler(BaseHandler): + # maximum number of redirections before assuming we're in a loop + max_redirections = 10 + def redirect_request(self, req, fp, code, msg, headers, newurl): """Return a Request or None in response to a redirect. @@ -459,14 +513,20 @@ class HTTPRedirectHandler(BaseHandler): return # loop detection - new.error_302_dict = {} - if hasattr(req, 'error_302_dict'): - if len(req.error_302_dict)>10 or \ - newurl in req.error_302_dict: + # .redirect_dict has a key (url, code) if url was previously + # visited as a result of a redirection with that code. 
The + # code is needed in addition to the URL because visiting a URL + # twice isn't necessarily a loop: there is more than one way + # to redirect (301, 302, 303, 307, refresh). + key = (newurl, code) + if hasattr(req, 'redirect_dict'): + visited = new.redirect_dict = req.redirect_dict + if key in visited or len(visited) >= self.max_redirections: raise HTTPError(req.get_full_url(), code, self.inf_msg + msg, headers, fp) - new.error_302_dict.update(req.error_302_dict) - new.error_302_dict[newurl] = newurl + else: + visited = new.redirect_dict = req.redirect_dict = {} + visited[key] = None # Don't close the fp until we are sure that we won't use it # with HTTPError. @@ -853,6 +913,38 @@ class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): class AbstractHTTPHandler(BaseHandler): + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def do_request(self, request): + host = request.get_host() + if not host: + raise URLError('no host given') + + if request.has_data(): # POST + data = request.get_data() + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + if not request.has_header('Content-length'): + request.add_unredirected_header( + 'Content-length', '%d' % len(data)) + + scheme, sel = splittype(request.get_selector()) + sel_host, sel_path = splithost(sel) + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host or host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + # XXX Should rewrite do_open() to use the new httplib interface, # would be a little simpler. 
@@ -862,26 +954,13 @@ class AbstractHTTPHandler(BaseHandler): raise URLError('no host given') h = http_class(host) # will parse host:port - if req.has_data(): - data = req.get_data() - h.putrequest('POST', req.get_selector()) - if not 'Content-type' in req.headers: - h.putheader('Content-type', - 'application/x-www-form-urlencoded') - if not 'Content-length' in req.headers: - h.putheader('Content-length', '%d' % len(data)) - else: - h.putrequest('GET', req.get_selector()) + h.set_debuglevel(self._debuglevel) - scheme, sel = splittype(req.get_selector()) - sel_host, sel_path = splithost(sel) - h.putheader('Host', sel_host or host) - for name, value in self.parent.addheaders: - name = name.capitalize() - if name not in req.headers: - h.putheader(name, value) + h.putrequest(req.get_method(), req.get_selector()) for k, v in req.headers.items(): h.putheader(k, v) + for k, v in req.unredirected_hdrs.items(): + h.putheader(k, v) # httplib will attempt to connect() here. be prepared # to convert a socket error to a URLError. try: @@ -889,14 +968,15 @@ class AbstractHTTPHandler(BaseHandler): except socket.error, err: raise URLError(err) if req.has_data(): - h.send(data) + h.send(req.get_data()) code, msg, hdrs = h.getreply() fp = h.getfile() - if code == 200: - return addinfourl(fp, hdrs, req.get_full_url()) - else: - return self.parent.error('http', req, fp, code, msg, hdrs) + response = addinfourl(fp, hdrs, req.get_full_url()) + # XXXX should these be methods, for uniformity with rest of interface? 
+ response.code = code + response.msg = msg + return response class HTTPHandler(AbstractHTTPHandler): @@ -904,6 +984,7 @@ class HTTPHandler(AbstractHTTPHandler): def http_open(self, req): return self.do_open(httplib.HTTP, req) + http_request = AbstractHTTPHandler.do_request if hasattr(httplib, 'HTTPS'): class HTTPSHandler(AbstractHTTPHandler): @@ -911,6 +992,7 @@ if hasattr(httplib, 'HTTPS'): def https_open(self, req): return self.do_open(httplib.HTTPS, req) + https_request = AbstractHTTPHandler.do_request class UnknownHandler(BaseHandler): def unknown_open(self, req): |