diff options
author | Moshe Zadka <moshez@math.huji.ac.il> | 2001-03-01 08:40:42 (GMT) |
---|---|---|
committer | Moshe Zadka <moshez@math.huji.ac.il> | 2001-03-01 08:40:42 (GMT) |
commit | 8a18e99008c28156a7ba701ca8d6824a50fb0a9e (patch) | |
tree | 301cbec622d1abeaa992063babaaee5a6cfb540a /Doc | |
parent | 251083142fe4c114753cef12c37589bd55761912 (diff) | |
download | cpython-8a18e99008c28156a7ba701ca8d6824a50fb0a9e.zip cpython-8a18e99008c28156a7ba701ca8d6824a50fb0a9e.tar.gz cpython-8a18e99008c28156a7ba701ca8d6824a50fb0a9e.tar.bz2 |
Checking in patch 404826 -- urllib2 enhancements and documentations.
(please note that the library reference does *not* include the
urllib2 documentation -- that will wait for Fred)
Diffstat (limited to 'Doc')
-rw-r--r-- | Doc/lib/liburllib2.tex | 501 |
1 files changed, 501 insertions, 0 deletions
diff --git a/Doc/lib/liburllib2.tex b/Doc/lib/liburllib2.tex new file mode 100644 index 0000000..90243a5 --- /dev/null +++ b/Doc/lib/liburllib2.tex @@ -0,0 +1,501 @@ +\section{\module{urllib2} --- + extensible library for opening URLs} + +\declaremodule{standard}{urllib2} + +\moduleauthor{Jeremy Hylton}{jhylton@users.sourceforge.net} +\sectionauthor{Moshe Zadka}{moshez@users.sourceforge.net} + +\modulesynopsis{An extensible library for opening URLs using a variety of + protocols} + +The \module{urllib2} module defines functions and classes which help +in opening URLs (mostly HTTP) in a complex world --- basic and digest +authentication, redirections and more. + +The \module{urllib2} module defines the following functions: + +\begin{funcdesc}{urlopen}{url\optional{, data}} +Open the URL \var{url}, which can be either a string or a \class{Request} +object (currently the code checks that it really is a \class{Request} +instance, or an instance of a subclass of \class{Request}). + +\var{data} should be a string, which specifies additional data to +send to the server. In HTTP requests, which are the only ones that +support \var{data}, it should be a buffer in the format of +\code{application/x-www-form-urlencoded}, for example one returned +from \function{urllib.urlencode}. + +This function returns a file-like object with two additional methods: + +\begin{itemize} + + \item \code{geturl()} --- return the URL of the resource retrieved + \item \code{info()} --- return the meta-information of the page, as + a dictionary-like object +\end{itemize} + +Raises \exception{URLError} on errors. +\end{funcdesc} + +\begin{funcdesc}{install_opener}{opener} +Install an \class{OpenerDirector} instance as the default opener. +The code does not check for a real \class{OpenerDirector}, and any +class with the appropriate interface will work. 
+\end{funcdesc} + +\begin{funcdesc}{build_opener}{\optional{handler\optional{, + handler\optional{, ...}}}} +Return an \class{OpenerDirector} instance, which chains the +handlers in the order given. \var{handler}s can be either instances +of \class{BaseHandler}, or subclasses of \class{BaseHandler} (in +which case it must be possible to call the constructor without +any parameters). Instances of the following classes will be in +the front of the \var{handler}s, unless the \var{handler}s contain +them, instances of them or subclasses of them: + +\code{ProxyHandler, UnknownHandler, HTTPHandler, HTTPDefaultErrorHandler, + HTTPRedirectHandler, FTPHandler, FileHandler} + +If the Python installation has SSL support (\code{socket.ssl} exists), +\class{HTTPSHandler} will also be added. +\end{funcdesc} + +\begin{excdesc}{URLError} +The handlers raise this exception when they run into a problem. It is a subclass +of \exception{IOError}. +\end{excdesc} + +\begin{excdesc}{HTTPError} +A subclass of \exception{URLError}, it can also function as a +non-exceptional file-like return value (the same thing that \function{urlopen} +returns). This is useful when handling exotic HTTP errors, such as +requests for authentication. +\end{excdesc} + +\begin{excdesc}{GopherError} +A subclass of \exception{URLError}, this is the error raised by the +Gopher handler. +\end{excdesc} + +\begin{classdesc}{Request}{url\optional{, data\optional{, headers}}} +This class is an abstraction of a URL request. + +\var{url} should be a string which is a valid URL. For a description +of \var{data} see the \method{add_data} description. +\var{headers} should be a dictionary, and will be treated as if +\method{add_header} was called with each key and value as arguments. +\end{classdesc} + +The following methods describe all of \class{Request}'s public interface, +and so all must be overridden in subclasses. 
+ +\begin{methoddesc}[Request]{add_data}{data} +Set the \class{Request} data to \var{data}. This is ignored +by all handlers except HTTP handlers --- and there it should be an +\code{application/x-www-form-urlencoded} buffer, and will change the +request to be \code{POST} rather than \code{GET}. +\end{methoddesc} + +\begin{methoddesc}[Request]{has_data}{} +Return whether the instance has non-\code{None} data. +\end{methoddesc} + +\begin{methoddesc}[Request]{get_data}{} +Return the instance's data. +\end{methoddesc} + +\begin{methoddesc}[Request]{add_header}{key, val} +Add another header to the request. Headers +are currently ignored by all handlers except HTTP handlers, where they +are added to the list of headers sent to the server. Note that there +cannot be more than one header with the same name, and later calls +will overwrite previous calls in case the \var{key} collides. Currently, +this is no loss of HTTP functionality, since all headers which have meaning +when used more than once have a (header-specific) way of gaining the +same functionality using only one header. +\end{methoddesc} + +\begin{methoddesc}[Request]{get_full_url}{} +Return the URL given in the constructor. +\end{methoddesc} + +\begin{methoddesc}[Request]{get_type}{} +Return the type of the URL --- also known as the scheme. +\end{methoddesc} + +\begin{methoddesc}[Request]{get_host}{} +Return the host to which a connection will be made. +\end{methoddesc} + +\begin{methoddesc}[Request]{get_selector}{} +Return the selector --- the part of the URL that is sent to +the server. +\end{methoddesc} + +\begin{methoddesc}[Request]{set_proxy}{host, type} +Make the request by connecting to a proxy server. The \var{host} and \var{type} +will replace those of the instance, and the instance's selector will be +the original URL given in the constructor. +\end{methoddesc} + +\begin{classdesc}{OpenerDirector}{} +The \class{OpenerDirector} class opens URLs via \class{BaseHandler}s chained +together. 
It manages the chaining of handlers, and recovery from errors. +\end{classdesc} + +\begin{methoddesc}[OpenerDirector]{add_handler}{handler} +\var{handler} should be an instance of \class{BaseHandler}. The following +methods are searched, and added to the possible chains. + +\begin{itemize} + \item \code{{\em protocol}_open} --- signal that the handler knows how + to open {\em protocol} URLs. + \item \code{{\em protocol}_error_{\em type}} --- signal that the handler + knows how to handle {\em type} + errors from {\em protocol}. +\end{itemize} + +\end{methoddesc} + +\begin{methoddesc}[OpenerDirector]{close}{} +Explicitly break cycles, and delete all the handlers. +Because the \class{OpenerDirector} needs to know the registered handlers, +and a handler needs to know which \class{OpenerDirector} called +it, there is a reference cycle. Even though recent versions of Python +have cycle-collection, it is sometimes preferable to explicitly break +the cycles. +\end{methoddesc} + +\begin{methoddesc}[OpenerDirector]{open}{url\optional{, data}} +Open the given \var{url} (which can be a request object or a string), +optionally passing the given \var{data}. +Arguments, return values and exceptions raised are the same as those +of \function{urlopen} (which simply calls the \method{open()} method +on the default installed \class{OpenerDirector}). +\end{methoddesc} + +\begin{methoddesc}[OpenerDirector]{error}{proto\optional{, arg\optional{, ...}}} +Handle an error in a given protocol. The HTTP protocol is special cased to +use the code as the error. This will call the registered error handlers +for the given protocol with the given arguments (which are protocol specific). + +Return values and exceptions raised are the same as those +of \function{urlopen}. +\end{methoddesc} + +\begin{classdesc}{BaseHandler}{} +This is the base class for all registered handlers --- and handles only +the simple mechanics of registration. 
+\end{classdesc} + +\begin{methoddesc}[BaseHandler]{add_parent}{director} +Add a director as parent. +\end{methoddesc} + +\begin{methoddesc}[BaseHandler]{close}{} +Remove any parents. +\end{methoddesc} + +The following members and methods should be used only by classes derived +from \class{BaseHandler}: + +\begin{memberdesc}[BaseHandler]{parent} +A valid \class{OpenerDirector}, which can be used to open using a different +protocol, or handle errors. +\end{memberdesc} + +\begin{methoddesc}[BaseHandler]{default_open}{req} +This method is {\em not} defined in \class{BaseHandler}, but subclasses +should define it if they want to catch all URLs. + +This method, if it exists, will be called by the \member{parent} +\class{OpenerDirector}. It should return a file-like object as described +in the return value of the \method{open} of \class{OpenerDirector}, or +\code{None}. It should raise \exception{URLError}, unless a truly exceptional +thing happens (for example, \exception{MemoryError} should not be mapped +to \exception{URLError}). + +This method will be called before any protocol-specific open method. +\end{methoddesc} + +\begin{methoddesc}[BaseHandler]{{\em protocol}_open}{req} +This method is {\em not} defined in \class{BaseHandler}, but subclasses +should define it if they want to handle URLs with the given protocol. + +This method, if it exists, will be called by the \member{parent} +\class{OpenerDirector}. Return values should be the same as for +\method{default_open}. +\end{methoddesc} + +\begin{methoddesc}[BaseHandler]{unknown_open}{req} +This method is {\em not} defined in \class{BaseHandler}, but subclasses +should define it if they want to catch all URLs with no specific +registered handler to open it. + +This method, if it exists, will be called by the \member{parent} +\class{OpenerDirector}. Return values should be the same as for +\method{default_open}. 
+\end{methoddesc} + +\begin{methoddesc}[BaseHandler]{http_error_default}{req, fp, code, msg, hdrs} +This method is {\em not} defined in \class{BaseHandler}, but subclasses +should override it if they intend to provide a catch-all for otherwise +unhandled HTTP errors. It will be called automatically by the +\class{OpenerDirector} getting the error, and should not normally be called +in other circumstances. + +\var{req} will be a \class{Request} object, \var{fp} will be a file-like +object with the HTTP error body, \var{code} will be the three-digit code +of the error, \var{msg} will be the user-visible explanation of the +code and \var{hdrs} will be a dictionary-like object with the headers of +the error. + +Return values and exceptions raised should be the same as those +of \function{urlopen}. +\end{methoddesc} + +\begin{methoddesc}[BaseHandler]{http_error_{\em nnn}}{req, fp, code, msg, hdrs} +\code{nnn} should be a three-digit HTTP error code. This method is also +not defined in \class{BaseHandler}, but will be called, if it exists, on +an instance of a subclass, when an HTTP error with code \code{nnn} occurs. + +Subclasses should override this method to handle specific HTTP errors. + +Arguments, return values and exceptions raised should be the same as for +\method{http_error_default}. +\end{methoddesc} + + +\begin{classdesc}{HTTPDefaultErrorHandler}{} +A class which catches all HTTP errors. +\end{classdesc} + +\begin{methoddesc}[HTTPDefaultErrorHandler]{http_error_default}{req, fp, code, + msg, hdrs} +Raise an \exception{HTTPError}. +\end{methoddesc} + +\begin{classdesc}{HTTPRedirectHandler}{} +A class to handle redirections. +\end{classdesc} + +\begin{methoddesc}[HTTPRedirectHandler]{http_error_301}{req, fp, code, + msg, hdrs} +Redirect to the \code{Location:} URL. This method gets called by +the parent \class{OpenerDirector} when getting an HTTP permanent-redirect +error. 
+\end{methoddesc} + +\begin{methoddesc}[HTTPRedirectHandler]{http_error_302}{req, fp, code, + msg, hdrs} +The same as \method{http_error_301}. +\end{methoddesc} + +\strong{Note:} 303 redirection is not supported by this version of +\module{urllib2}. + +\begin{classdesc}{ProxyHandler}{\optional{proxies}} +Cause requests to go through a proxy. +If \var{proxies} is given, it must be a dictionary mapping +protocol names to URLs of proxies. +The default is to read the list of proxies from the environment +variables \code{{\em protocol}_proxy}. +\end{classdesc} + +\begin{methoddesc}[ProxyHandler]{{\em protocol}_open}{request} +The \class{ProxyHandler} will have a method \code{{\em protocol}_open} for +every {\em protocol} which has a proxy in the \var{proxies} dictionary +given in the constructor. The method will modify requests to go +through the proxy, by calling \code{request.set_proxy()}, and call the next +handler in the chain to actually execute the protocol. +\end{methoddesc} + +\begin{classdesc}{HTTPPasswordMgr}{} +Keep a database of +\code{(\var{realm}, \var{uri}) -> (\var{user}, \var{password})} mappings. +\end{classdesc} + +\begin{methoddesc}[HTTPPasswordMgr]{add_password}{realm, uri, user, passwd} +\var{uri} can be either a single URI, or a sequence of URIs. \var{realm}, +\var{user} and \var{passwd} must be strings. This causes + \code{(\var{user}, \var{passwd})} to be used as authentication tokens +when authentication for \var{realm} and a super-URI of any of the +given URIs is given. +\end{methoddesc} + +\begin{methoddesc}[HTTPPasswordMgr]{find_user_password}{realm, authuri} +Get user/password for given realm and URI, if any. This method will +return \code{(None, None)} if no user/password is known. +\end{methoddesc} + +\begin{classdesc}{HTTPPasswordMgrWithDefaultRealm}{} +Keep a database of +\code{(\var{realm}, \var{uri}) -> (\var{user}, \var{password})} mappings. 
+A realm of \code{None} is considered a catch-all realm, which is searched +if no other realm fits. +\end{classdesc} + +\begin{methoddesc}[HTTPPasswordMgrWithDefaultRealm]{add_password} + {realm, uri, user, passwd} +\var{uri} can be either a single URI, or a sequence of URIs. \var{realm}, +\var{user} and \var{passwd} must be strings. This causes + \code{(\var{user}, \var{passwd})} to be used as authentication tokens +when authentication for \var{realm} and a super-URI of any of the +given URIs is given. +\end{methoddesc} + +\begin{methoddesc}[HTTPPasswordMgrWithDefaultRealm]{find_user_password}{realm, authuri} +Get user/password for given realm and URI, if any. This method will +return \code{(None, None)} if no user/password is known. +If the given \var{realm} has no user/password, the realm \code{None} +will be searched. +\end{methoddesc} + +\begin{classdesc}{AbstractBasicAuthHandler}{\optional{password_mgr}} +This is a mixin class that helps with HTTP authentication, both +to the remote host and to a proxy. + +\var{password_mgr} should be something that is compatible with +\class{HTTPPasswordMgr} --- supplies the documented interface above. +\end{classdesc} + +\begin{methoddesc}[AbstractBasicAuthHandler]{handle_authentication_request} + {authreq, host, req, headers} +Handle an authentication request by getting a user/password pair, and retrying. +\var{authreq} should be the name of the header where the information about +the realm is found, \var{host} is the host to authenticate to, \var{req} should be the +(failed) \class{Request} object, and \var{headers} should be the error headers. +\end{methoddesc} + +\begin{classdesc}{HTTPBasicAuthHandler}{\optional{password_mgr}} +Handle authentication with the remote host. +Valid \var{password_mgr}, if given, are the same as for +\class{AbstractBasicAuthHandler}. +\end{classdesc} + +\begin{methoddesc}[HTTPBasicAuthHandler]{http_error_401}{req, fp, code, + msg, hdrs} +Retry the request with authentication info, if available. 
+\end{methoddesc} + +\begin{classdesc}{ProxyBasicAuthHandler}{\optional{password_mgr}} +Handle authentication with the proxy. +Valid \var{password_mgr}, if given, are the same as for +\class{AbstractBasicAuthHandler}. +\end{classdesc} + +\begin{methoddesc}[ProxyBasicAuthHandler]{http_error_407}{req, fp, code, + msg, hdrs} +Retry the request with authentication info, if available. +\end{methoddesc} + +\begin{classdesc}{AbstractDigestAuthHandler}{\optional{password_mgr}} +This is a mixin class that helps with HTTP authentication, both +to the remote host and to a proxy. + +\var{password_mgr} should be something that is compatible with +\class{HTTPPasswordMgr} --- supplies the documented interface above. +\end{classdesc} + +\begin{methoddesc}[AbstractDigestAuthHandler]{handle_authentication_request} + {authreq, host, req, headers} +\var{authreq} should be the name of the header where the information about +the realm is found, \var{host} should be the host to authenticate to, \var{req} +should be the (failed) \class{Request} object, and \var{headers} should be the +error headers. +\end{methoddesc} + +\begin{classdesc}{HTTPDigestAuthHandler}{\optional{password_mgr}} +Handle authentication with the remote host. +Valid \var{password_mgr}, if given, are the same as for +\class{AbstractDigestAuthHandler}. +\end{classdesc} + +\begin{methoddesc}[HTTPDigestAuthHandler]{http_error_401}{req, fp, code, + msg, hdrs} +Retry the request with authentication info, if available. +\end{methoddesc} + +\begin{classdesc}{ProxyDigestAuthHandler}{\optional{password_mgr}} +Handle authentication with the proxy. +\var{password_mgr}, if given, should be the same as for +the constructor of \class{AbstractDigestAuthHandler}. +\end{classdesc} + +\begin{methoddesc}[ProxyDigestAuthHandler]{http_error_407}{req, fp, code, + msg, hdrs} +Retry the request with authentication info, if available. 
+\end{methoddesc} + +\begin{classdesc}{HTTPHandler}{} +A class to handle opening of HTTP URLs. +\end{classdesc} + +\begin{methoddesc}[HTTPHandler]{http_open}{req} +Send an HTTP request (either GET or POST, depending on whether +\code{req.has_data()} is true). +\end{methoddesc} + +\begin{classdesc}{HTTPSHandler}{} +A class to handle opening of HTTPS URLs. +\end{classdesc} + +\begin{methoddesc}[HTTPSHandler]{https_open}{req} +Send an HTTPS request (either GET or POST, depending on whether +\code{req.has_data()} is true). +\end{methoddesc} + +\begin{classdesc}{UnknownHandler}{} +A catch-all class to handle unknown URLs. +\end{classdesc} + +\begin{methoddesc}[UnknownHandler]{unknown_open}{req} +Raise a \exception{URLError} exception. +\end{methoddesc} + +\begin{classdesc}{FileHandler}{} +Open local files. +\end{classdesc} + +\begin{methoddesc}[FileHandler]{file_open}{req} +Open the file locally, if there is no host name, or +the host name is \code{"localhost"}. Change the +protocol to \code{ftp} otherwise, and retry opening +it using \member{parent}. +\end{methoddesc} + +\begin{classdesc}{FTPHandler}{} +Open FTP URLs. +\end{classdesc} + +\begin{methoddesc}[FTPHandler]{ftp_open}{req} +Open the FTP file indicated by \var{req}. +The login is always done with empty username and password. +\end{methoddesc} + +\begin{classdesc}{CacheFTPHandler}{} +Open FTP URLs, keeping a cache of open FTP connections to minimize +delays. +\end{classdesc} + +\begin{methoddesc}[CacheFTPHandler]{ftp_open}{req} +Open the FTP file indicated by \var{req}. +The login is always done with empty username and password. +\end{methoddesc} + +\begin{methoddesc}[CacheFTPHandler]{setTimeout}{t} +Set timeout of connections to \var{t} seconds. +\end{methoddesc} + +\begin{methoddesc}[CacheFTPHandler]{setMaxConns}{m} +Set maximum number of cached connections to \var{m}. +\end{methoddesc} + +\begin{classdesc}{GopherHandler}{} +Open gopher URLs. 
+\end{classdesc} + +\begin{methoddesc}[GopherHandler]{gopher_open}{req} +Open the gopher resource indicated by \var{req}. +\end{methoddesc} |