diff options
author | Jeremy Hylton <jeremy@alum.mit.edu> | 2003-12-14 05:27:34 (GMT) |
---|---|---|
committer | Jeremy Hylton <jeremy@alum.mit.edu> | 2003-12-14 05:27:34 (GMT) |
commit | c1be59f40ae7d1268961b95c5734297b5ca3d872 (patch) | |
tree | cceb19a2807171fe77387742c2f6a42a4fbfdbe9 /Lib/urllib2.py | |
parent | 328f338196f79e68f867a35cda171f01abef4f8e (diff) | |
download | cpython-c1be59f40ae7d1268961b95c5734297b5ca3d872.zip cpython-c1be59f40ae7d1268961b95c5734297b5ca3d872.tar.gz cpython-c1be59f40ae7d1268961b95c5734297b5ca3d872.tar.bz2 |
SF patch 852995: add processors feature to urllib2
John J. Lee writes: "the patch makes it possible to implement
functionality like HTTP cookie handling, Refresh handling,
etc. using handler objects. At the moment urllib2's handler
objects aren't quite up to the job, which results in a lot of
cut-and-paste and subclassing. I believe the changes are
backwards-compatible, with the exception of people who've
reimplemented build_opener()'s functionality -- those people would
need to call opener.add_handler(HTTPErrorProcessor()).
The main change is allowing handlers to implement
methods like:
    http_request(request)
    http_response(request, response)
in addition to the usual:
    http_open(request)
    http_error{_*}(...)
"
Note that the change isn't well documented, at least in part because
handlers aren't well documented at all; this needs to be fixed.
Add a bunch of new tests. It appears that none of these tests
actually use the network, so they don't need to be guarded by a
resource flag.
Diffstat (limited to 'Lib/urllib2.py')
-rw-r--r-- | Lib/urllib2.py | 194 |
1 file changed, 138 insertions, 56 deletions
diff --git a/Lib/urllib2.py b/Lib/urllib2.py index 5c90aea..3fbb5e3 100644 --- a/Lib/urllib2.py +++ b/Lib/urllib2.py @@ -105,6 +105,7 @@ import socket import sys import time import urlparse +import bisect try: from cStringIO import StringIO @@ -192,6 +193,7 @@ class Request: self.headers = {} for key, value in headers.items(): self.add_header(key, value) + self.unredirected_hdrs = {} def __getattr__(self, attr): # XXX this is a fallback mechanism to guard against these @@ -248,6 +250,15 @@ class Request: # useful for something like authentication self.headers[key.capitalize()] = val + def add_unredirected_header(self, key, val): + # will not be added to a redirected request + self.unredirected_hdrs[key.capitalize()] = val + + def has_header(self, header_name): + return bool(header_name in self.headers or + header_name in self.unredirected_hdrs) + + class OpenerDirector: def __init__(self): server_version = "Python-urllib/%s" % __version__ @@ -256,40 +267,44 @@ class OpenerDirector: self.handlers = [] self.handle_open = {} self.handle_error = {} + self.process_response = {} + self.process_request = {} def add_handler(self, handler): - added = 0 + added = False for meth in dir(handler): - if meth[-5:] == '_open': - protocol = meth[:-5] - if protocol in self.handle_open: - self.handle_open[protocol].append(handler) - self.handle_open[protocol].sort() - else: - self.handle_open[protocol] = [handler] - added = 1 - continue - i = meth.find('_') - j = meth[i+1:].find('_') + i + 1 - if j != -1 and meth[i+1:j] == 'error': - proto = meth[:i] + i = meth.find("_") + protocol = meth[:i] + condition = meth[i+1:] + + if condition.startswith("error"): + j = meth[i+1:].find("_") + i + 1 kind = meth[j+1:] try: kind = int(kind) except ValueError: pass - dict = self.handle_error.get(proto, {}) - if kind in dict: - dict[kind].append(handler) - dict[kind].sort() - else: - dict[kind] = [handler] - self.handle_error[proto] = dict - added = 1 + lookup = self.handle_error.get(protocol, {}) 
+ self.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = getattr(self, "handle_"+condition) + elif condition in ["response", "request"]: + kind = protocol + lookup = getattr(self, "process_"+condition) + else: continue + + handlers = lookup.setdefault(kind, []) + if handlers: + bisect.insort(handlers, handler) + else: + handlers.append(handler) + added = True + if added: - self.handlers.append(handler) - self.handlers.sort() + # XXX why does self.handlers need to be sorted? + bisect.insort(self.handlers, handler) handler.add_parent(self) def __del__(self): @@ -320,13 +335,32 @@ class OpenerDirector: if data is not None: req.add_data(data) + protocol = req.get_type() + + # pre-process request + meth_name = protocol+"_request" + for processor in self.process_request.get(protocol, []): + meth = getattr(processor, meth_name) + req = meth(req) + + response = self._open(req, data) + + # post-process response + meth_name = protocol+"_response" + for processor in self.process_response.get(protocol, []): + meth = getattr(processor, meth_name) + response = meth(req, response) + + return response + + def _open(self, req, data=None): result = self._call_chain(self.handle_open, 'default', 'default_open', req) if result: return result - type_ = req.get_type() - result = self._call_chain(self.handle_open, type_, type_ + \ + protocol = req.get_type() + result = self._call_chain(self.handle_open, protocol, protocol + '_open', req) if result: return result @@ -339,7 +373,7 @@ class OpenerDirector: # XXX http[s] protocols are special-cased dict = self.handle_error['http'] # https is not different than http proto = args[2] # YUCK! 
- meth_name = 'http_error_%d' % proto + meth_name = 'http_error_%s' % proto http_err = 1 orig_args = args else: @@ -372,7 +406,7 @@ def build_opener(*handlers): opener = OpenerDirector() default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, HTTPDefaultErrorHandler, HTTPRedirectHandler, - FTPHandler, FileHandler] + FTPHandler, FileHandler, HTTPErrorProcessor] if hasattr(httplib, 'HTTPS'): default_classes.append(HTTPSHandler) skip = [] @@ -400,8 +434,10 @@ class BaseHandler: def add_parent(self, parent): self.parent = parent + def close(self): self.parent = None + def __lt__(self, other): if not hasattr(other, "handler_order"): # Try to preserve the old behavior of having custom classes @@ -411,11 +447,29 @@ class BaseHandler: return self.handler_order < other.handler_order +class HTTPErrorProcessor(BaseHandler): + """Process HTTP error responses.""" + handler_order = 1000 # after all other processing + + def http_response(self, request, response): + code, msg, hdrs = response.code, response.msg, response.info() + + if code != 200: + response = self.parent.error( + 'http', request, response, code, msg, hdrs) + + return response + + https_response = http_response + class HTTPDefaultErrorHandler(BaseHandler): def http_error_default(self, req, fp, code, msg, hdrs): raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) class HTTPRedirectHandler(BaseHandler): + # maximum number of redirections before assuming we're in a loop + max_redirections = 10 + def redirect_request(self, req, fp, code, msg, headers, newurl): """Return a Request or None in response to a redirect. @@ -459,14 +513,20 @@ class HTTPRedirectHandler(BaseHandler): return # loop detection - new.error_302_dict = {} - if hasattr(req, 'error_302_dict'): - if len(req.error_302_dict)>10 or \ - newurl in req.error_302_dict: + # .redirect_dict has a key (url, code) if url was previously + # visited as a result of a redirection with that code. 
The + # code is needed in addition to the URL because visiting a URL + # twice isn't necessarily a loop: there is more than one way + # to redirect (301, 302, 303, 307, refresh). + key = (newurl, code) + if hasattr(req, 'redirect_dict'): + visited = new.redirect_dict = req.redirect_dict + if key in visited or len(visited) >= self.max_redirections: raise HTTPError(req.get_full_url(), code, self.inf_msg + msg, headers, fp) - new.error_302_dict.update(req.error_302_dict) - new.error_302_dict[newurl] = newurl + else: + visited = new.redirect_dict = req.redirect_dict = {} + visited[key] = None # Don't close the fp until we are sure that we won't use it # with HTTPError. @@ -853,6 +913,38 @@ class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): class AbstractHTTPHandler(BaseHandler): + def __init__(self, debuglevel=0): + self._debuglevel = debuglevel + + def set_http_debuglevel(self, level): + self._debuglevel = level + + def do_request(self, request): + host = request.get_host() + if not host: + raise URLError('no host given') + + if request.has_data(): # POST + data = request.get_data() + if not request.has_header('Content-type'): + request.add_unredirected_header( + 'Content-type', + 'application/x-www-form-urlencoded') + if not request.has_header('Content-length'): + request.add_unredirected_header( + 'Content-length', '%d' % len(data)) + + scheme, sel = splittype(request.get_selector()) + sel_host, sel_path = splithost(sel) + if not request.has_header('Host'): + request.add_unredirected_header('Host', sel_host or host) + for name, value in self.parent.addheaders: + name = name.capitalize() + if not request.has_header(name): + request.add_unredirected_header(name, value) + + return request + # XXX Should rewrite do_open() to use the new httplib interface, # would be a little simpler. 
@@ -862,26 +954,13 @@ class AbstractHTTPHandler(BaseHandler): raise URLError('no host given') h = http_class(host) # will parse host:port - if req.has_data(): - data = req.get_data() - h.putrequest('POST', req.get_selector()) - if not 'Content-type' in req.headers: - h.putheader('Content-type', - 'application/x-www-form-urlencoded') - if not 'Content-length' in req.headers: - h.putheader('Content-length', '%d' % len(data)) - else: - h.putrequest('GET', req.get_selector()) + h.set_debuglevel(self._debuglevel) - scheme, sel = splittype(req.get_selector()) - sel_host, sel_path = splithost(sel) - h.putheader('Host', sel_host or host) - for name, value in self.parent.addheaders: - name = name.capitalize() - if name not in req.headers: - h.putheader(name, value) + h.putrequest(req.get_method(), req.get_selector()) for k, v in req.headers.items(): h.putheader(k, v) + for k, v in req.unredirected_hdrs.items(): + h.putheader(k, v) # httplib will attempt to connect() here. be prepared # to convert a socket error to a URLError. try: @@ -889,14 +968,15 @@ class AbstractHTTPHandler(BaseHandler): except socket.error, err: raise URLError(err) if req.has_data(): - h.send(data) + h.send(req.get_data()) code, msg, hdrs = h.getreply() fp = h.getfile() - if code == 200: - return addinfourl(fp, hdrs, req.get_full_url()) - else: - return self.parent.error('http', req, fp, code, msg, hdrs) + response = addinfourl(fp, hdrs, req.get_full_url()) + # XXXX should these be methods, for uniformity with rest of interface? 
+ response.code = code + response.msg = msg + return response class HTTPHandler(AbstractHTTPHandler): @@ -904,6 +984,7 @@ class HTTPHandler(AbstractHTTPHandler): def http_open(self, req): return self.do_open(httplib.HTTP, req) + http_request = AbstractHTTPHandler.do_request if hasattr(httplib, 'HTTPS'): class HTTPSHandler(AbstractHTTPHandler): @@ -911,6 +992,7 @@ if hasattr(httplib, 'HTTPS'): def https_open(self, req): return self.do_open(httplib.HTTPS, req) + https_request = AbstractHTTPHandler.do_request class UnknownHandler(BaseHandler): def unknown_open(self, req): |