diff options
author | Martin v. Löwis <martin@v.loewis.de> | 2004-05-31 18:22:40 (GMT) |
---|---|---|
committer | Martin v. Löwis <martin@v.loewis.de> | 2004-05-31 18:22:40 (GMT) |
commit | 2a6ba9097ee3942ae328befaf074ce9722b93ca0 (patch) | |
tree | 475a4e8bfd2d8aedd0c38f5e813ba8ad4d23e991 | |
parent | 0a6d0ff8d9ad27d66eb8195b9366e5b49dbb69b6 (diff) | |
download | cpython-2a6ba9097ee3942ae328befaf074ce9722b93ca0.zip cpython-2a6ba9097ee3942ae328befaf074ce9722b93ca0.tar.gz cpython-2a6ba9097ee3942ae328befaf074ce9722b93ca0.tar.bz2 |
Patch #963318: Add support for client-side cookie management.
-rw-r--r-- | Doc/lib/lib.tex | 1 | ||||
-rw-r--r-- | Doc/lib/libcookie.tex | 4 | ||||
-rw-r--r-- | Doc/lib/libcookielib.tex | 679 | ||||
-rw-r--r-- | Doc/lib/liburllib2.tex | 161 | ||||
-rw-r--r-- | Doc/whatsnew/whatsnew24.tex | 19 | ||||
-rw-r--r-- | Lib/_LWPCookieJar.py | 171 | ||||
-rw-r--r-- | Lib/_MozillaCookieJar.py | 145 | ||||
-rw-r--r-- | Lib/cookielib.py | 1756 | ||||
-rw-r--r-- | Lib/test/test_cookielib.py | 1620 | ||||
-rw-r--r-- | Lib/test/test_urllib2.py | 99 | ||||
-rw-r--r-- | Lib/urllib2.py | 75 | ||||
-rw-r--r-- | Misc/NEWS | 4 |
12 files changed, 4666 insertions, 68 deletions
diff --git a/Doc/lib/lib.tex b/Doc/lib/lib.tex index 56ae04f..e6879fd 100644 --- a/Doc/lib/lib.tex +++ b/Doc/lib/lib.tex @@ -233,6 +233,7 @@ and how to embed it in other applications. \input{libbasehttp} \input{libsimplehttp} \input{libcgihttp} +\input{libcookielib} \input{libcookie} \input{libxmlrpclib} \input{libsimplexmlrpc} diff --git a/Doc/lib/libcookie.tex b/Doc/lib/libcookie.tex index f872ed2..bba9c79 100644 --- a/Doc/lib/libcookie.tex +++ b/Doc/lib/libcookie.tex @@ -68,6 +68,10 @@ you should not use the \class{SerialCookie} class. \begin{seealso} + \seemodule{cookielib}{HTTP cookie handling for web + \emph{clients}. The \module{cookielib} and \module{Cookie} + modules do not depend on each other.} + \seerfc{2109}{HTTP State Management Mechanism}{This is the state management specification implemented by this module.} \end{seealso} diff --git a/Doc/lib/libcookielib.tex b/Doc/lib/libcookielib.tex new file mode 100644 index 0000000..ee42594 --- /dev/null +++ b/Doc/lib/libcookielib.tex @@ -0,0 +1,679 @@ +\section{\module{cookielib} --- + Cookie handling for HTTP clients} + +\declaremodule{standard}{cookielib} +\moduleauthor{John J. Lee}{jjl@pobox.com} +\sectionauthor{John J. Lee}{jjl@pobox.com} + +\modulesynopsis{Cookie handling for HTTP clients} + +The \module{cookielib} module defines classes for automatic handling +of HTTP cookies. It is useful for accessing web sites that require +small pieces of data -- \dfn{cookies} -- to be set on the client +machine by an HTTP response from a web server, and then returned to +the server in later HTTP requests. + +Both the regular Netscape cookie protocol and the protocol defined by +\rfc{2965} are handled. RFC 2965 handling is switched off by default. +\rfc{2109} cookies are parsed as Netscape cookies and subsequently +treated as RFC 2965 cookies. Note that the great majority of cookies +on the Internet are Netscape cookies. 
\module{cookielib} attempts to +follow the de-facto Netscape cookie protocol (which differs +substantially from that set out in the original Netscape +specification), including taking note of the \code{max-age} and +\code{port} cookie-attributes introduced with RFC 2109. \note{The +various named parameters found in \mailheader{Set-Cookie} and +\mailheader{Set-Cookie2} headers (eg. \code{domain} and +\code{expires}) are conventionally referred to as \dfn{attributes}. +To distinguish them from Python attributes, the documentation for this +module uses the term \dfn{cookie-attribute} instead}. + + +The module defines the following exception: + +\begin{excdesc}{LoadError} +Instances of \class{FileCookieJar} raise this exception on failure to +load cookies from a file. +\end{excdesc} + + +The following classes are provided: + +\begin{classdesc}{CookieJar}{policy=\constant{None}} +\var{policy} is an object implementing the \class{CookiePolicy} +interface. + +The \class{CookieJar} class stores HTTP cookies. It extracts cookies +from HTTP responses, and returns them in HTTP requests. +\class{CookieJar} instances automatically expire contained cookies +when necessary. Subclasses are also responsible for storing and +retrieving cookies from a file or database. +\end{classdesc} + +\begin{classdesc}{FileCookieJar}{filename, delayload=\constant{None}, + policy=\constant{None}} +\var{policy} is an object implementing the \class{CookiePolicy} +interface. For the other arguments, see the documentation for the +corresponding attributes. + +A \class{CookieJar} which can load cookies from, and perhaps save +cookies to, a file on disk. Cookies are \strong{NOT} loaded from the +named file until either the \method{load()} or \method{revert()} +method is called. Subclasses of this class are documented in section +\ref{file-cookie-jar-classes}. 
+\end{classdesc} + +\begin{classdesc}{CookiePolicy}{} +This class is responsible for deciding whether each cookie should be +accepted from / returned to the server. +\end{classdesc} + +\begin{classdesc}{DefaultCookiePolicy}{ + blocked_domains=\constant{None}, + allowed_domains=\constant{None}, + netscape=\constant{True}, rfc2965=\constant{False}, + hide_cookie2=\constant{False}, + strict_domain=\constant{False}, + strict_rfc2965_unverifiable=\constant{True}, + strict_ns_unverifiable=\constant{False}, + strict_ns_domain=\constant{DefaultCookiePolicy.DomainLiberal}, + strict_ns_set_initial_dollar=\constant{False}, + strict_ns_set_path=\constant{False} + } + +Constructor arguments should be passed as keyword arguments only. +\var{blocked_domains} is a sequence of domain names that we never +accept cookies from, nor return cookies to. \var{allowed_domains} if +not \constant{None}, this is a sequence of the only domains for which +we accept and return cookies. For all other arguments, see the +documentation for \class{CookiePolicy} and \class{DefaultCookiePolicy} +objects. + +\class{DefaultCookiePolicy} implements the standard accept / reject +rules for Netscape and RFC 2965 cookies. RFC 2109 cookies +(ie. cookies received in a \mailheader{Set-Cookie} header with a +version cookie-attribute of 1) are treated according to the RFC 2965 +rules. \class{DefaultCookiePolicy} also provides some parameters to +allow some fine-tuning of policy. +\end{classdesc} + +\begin{classdesc}{Cookie}{} +This class represents Netscape, RFC 2109 and RFC 2965 cookies. It is +not expected that users of \module{cookielib} construct their own +\class{Cookie} instances. Instead, if necessary, call +\method{make_cookies()} on a \class{CookieJar} instance. +\end{classdesc} + +\begin{seealso} + +\seemodule{urllib2}{URL opening with automatic cookie handling.} + +\seemodule{Cookie}{HTTP cookie classes, principally useful for +server-side code. 
The \module{cookielib} and \module{Cookie} modules +do not depend on each other.} + +\seeurl{http://wwwsearch.sf.net/ClientCookie/}{Extensions to this +module, including a class for reading Microsoft Internet Explorer +cookies on Windows.} + +\seeurl{http://www.netscape.com/newsref/std/cookie_spec.html}{The +specification of the original Netscape cookie protocol. Though this +is still the dominant protocol, the 'Netscape cookie protocol' +implemented by all the major browsers (and \module{cookielib}) only +bears a passing resemblance to the one sketched out in +\code{cookie_spec.html}.} + +\seerfc{2109}{HTTP State Management Mechanism}{Obsoleted by RFC 2965. +Uses \mailheader{Set-Cookie} with version=1.} + +\seerfc{2965}{HTTP State Management Mechanism}{The Netscape protocol +with the bugs fixed. Uses \mailheader{Set-Cookie2} in place of +\mailheader{Set-Cookie}. Not widely used.} + +\seeurl{http://kristol.org/cookie/errata.html}{Unfinished errata to +RFC 2965.} + +\seerfc{2964}{Use of HTTP State Management}{} + +\end{seealso} + + +\subsection{CookieJar and FileCookieJar Objects \label{cookie-jar-objects}} + +\class{CookieJar} objects support the iterator protocol. + +\class{CookieJar} has the following methods: + +\begin{methoddesc}[CookieJar]{add_cookie_header}{request} +Add correct \mailheader{Cookie} header to \var{request}. + +If the CookiePolicy allows (ie. the \class{CookiePolicy} instance's +\member{rfc2965} and \member{hide_cookie2} attributes are true and +false respectively), the \mailheader{Cookie2} header is also added +when appropriate. + +The \var{request} object (usually a \class{urllib2.Request} instance) +must support the methods \method{get_full_url()}, \method{get_host()}, +\method{get_type()}, \method{unverifiable()}, +\method{get_origin_req_host()}, \method{has_header()}, +\method{get_header()}, \method{header_items()}, and +\method{add_unredirected_header()}, as documented by \module{urllib2}. 
+\end{methoddesc} + +\begin{methoddesc}[CookieJar]{extract_cookies}{response, request} +Extract cookies from HTTP \var{response} and store them in the +\class{CookieJar}, where allowed by policy. + +The \class{CookieJar} will look for allowable \mailheader{Set-Cookie} +and \mailheader{Set-Cookie2} headers in the \var{response} argument, +and store cookies as appropriate (subject to the +\method{CookiePolicy.set_ok()} method's approval). + +The \var{response} object (usually the result of a call to +\method{urllib2.urlopen()}, or similar) should support an +\method{info()} method, which returns an object with a +\method{getallmatchingheaders()} method (usually a +\class{mimetools.Message} instance). + +The \var{request} object (usually a \class{urllib2.Request} instance) +must support the methods \method{get_full_url()}, \method{get_host()}, +\method{unverifiable()}, and \method{get_origin_req_host()}, as +documented by \module{urllib2}. The request is used to set default +values for cookie-attributes as well as for checking that the cookie +is allowed to be set. +\end{methoddesc} + +\begin{methoddesc}[CookieJar]{set_policy}{policy} +Set the \class{CookiePolicy} instance to be used. +\end{methoddesc} + +\begin{methoddesc}[CookieJar]{make_cookies}{response, request} +Return sequence of \class{Cookie} objects extracted from +\var{response} object. + +See the documentation for \method{extract_cookies} for the interfaces +required of the \var{response} and \var{request} arguments. +\end{methoddesc} + +\begin{methoddesc}[CookieJar]{set_cookie_if_ok}{cookie, request} +Set a \class{Cookie} if policy says it's OK to do so. +\end{methoddesc} + +\begin{methoddesc}[CookieJar]{set_cookie}{cookie} +Set a \class{Cookie}, without checking with policy to see whether or +not it should be set. +\end{methoddesc} + +\begin{methoddesc}[CookieJar]{clear}{\optional{domain\optional{, + path\optional{, name}}}} +Clear some cookies. + +If invoked without arguments, clear all cookies. 
If given a single +argument, only cookies belonging to that \var{domain} will be removed. +If given two arguments, cookies belonging to the specified +\var{domain} and URL \var{path} are removed. If given three +arguments, then the cookie with the specified \var{domain}, \var{path} +and \var{name} is removed. + +Raises \exception{KeyError} if no matching cookie exists. +\end{methoddesc} + +\begin{methoddesc}[CookieJar]{clear_session_cookies}{} +Discard all session cookies. + +Discards all contained cookies that have a true \member{discard} +attribute (usually because they had either no \code{max-age} or +\code{expires} cookie-attribute, or an explicit \code{discard} +cookie-attribute). For interactive browsers, the end of a session +usually corresponds to closing the browser window. + +Note that the \method{save()} method won't save session cookies +anyway, unless you ask otherwise by passing a true +\var{ignore_discard} argument. +\end{methoddesc} + +\class{FileCookieJar} implements the following additional methods: + +\begin{methoddesc}[FileCookieJar]{save}{filename=\constant{None}, + ignore_discard=\constant{False}, ignore_expires=\constant{False}} +Save cookies to a file. + +This base class raises \class{NotImplementedError}. Subclasses may +leave this method unimplemented. + +\var{filename} is the name of file in which to save cookies. If +\var{filename} is not specified, \member{self.filename} is used (whose +default is the value passed to the constructor, if any); if +\member{self.filename} is \constant{None}, \exception{ValueError} is +raised. + +\var{ignore_discard}: save even cookies set to be discarded. +\var{ignore_expires}: save even cookies that have expired + +The file is overwritten if it already exists, thus wiping all the +cookies it contains. Saved cookies can be restored later using the +\method{load()} or \method{revert()} methods. 
+\end{methoddesc} + +\begin{methoddesc}[FileCookieJar]{load}{filename=\constant{None}, + ignore_discard=\constant{False}, ignore_expires=\constant{False}} +Load cookies from a file. + +Old cookies are kept unless overwritten by newly loaded ones. + +Arguments are as for \method{save()}. + +The named file must be in the format understood by the class, or +\exception{LoadError} will be raised. +\end{methoddesc} + +\begin{methoddesc}[FileCookieJar]{revert}{filename=\constant{None}, + ignore_discard=\constant{False}, ignore_expires=\constant{False}} +Clear all cookies and reload cookies from a saved file. + +Raises \exception{cookielib.LoadError} or \exception{IOError} if +reversion is not successful; the object's state will not be altered if +this happens. +\end{methoddesc} + +\class{FileCookieJar} instances have the following public attributes: + +\begin{memberdesc}{filename} +Filename of default file in which to keep cookies. +\end{memberdesc} + +\begin{memberdesc}{delayload} +If true, load cookies lazily from disk. This is only a hint, since +this only affects performance, not behaviour (unless the cookies on +disk are changing). A \class{CookieJar} object may ignore it. None +of the \class{FileCookieJar} classes included in the standard library +lazily loads cookies. +\end{memberdesc} + + +\subsection{FileCookieJar subclasses and co-operation with web browsers + \label{file-cookie-jar-classes}} + +The following \class{CookieJar} subclasses are provided for reading +and writing cookie files. Further \class{CookieJar} subclasses, including one +that reads Microsoft Internet Explorer cookies, are available at +\url{http://wwwsearch.sf.net/ClientCookie/}. + +\begin{classdesc}{MozillaCookieJar}{filename, delayload=\constant{None}, + policy=\constant{None}} +A \class{FileCookieJar} that can load from and save cookies to disk in +the Mozilla \code{cookies.txt} file format (which is also used by the +lynx and Netscape browsers). 
\note{This loses information about RFC +2965 cookies, and also about newer or non-standard cookie-attributes +such as \code{port}.} + +\warning{Back up your cookies before saving if you have cookies whose +loss / corruption would be inconvenient (there are some subtleties +which may lead to slight changes in the file over a load / save +round-trip).} + +Also note that cookies saved while Mozilla is running will get +clobbered by Mozilla. +\end{classdesc} + +\begin{classdesc}{LWPCookieJar}{filename, delayload=\constant{None}, + policy=\constant{None}} +A \class{FileCookieJar} that can load from and save cookies to disk in +format compatible with the libwww-perl library's \code{Set-Cookie3} +file format. This is convenient if you want to store cookies in a +human-readable file. +\end{classdesc} + + +\subsection{CookiePolicy Objects \label{cookie-policy-objects}} + +Objects implementing the \class{CookiePolicy} interface have the +following methods: + +\begin{methoddesc}[CookiePolicy]{set_ok}{cookie, request} +Return boolean value indicating whether cookie should be accepted from server. + +\var{cookie} is a \class{cookielib.Cookie} instance. \var{request} is +an object implementing the interface defined by the documentation for +\method{CookieJar.extract_cookies()}. +\end{methoddesc} + +\begin{methoddesc}[CookiePolicy]{return_ok}{cookie, request} +Return boolean value indicating whether cookie should be returned to server. + +\var{cookie} is a \class{cookielib.Cookie} instance. \var{request} is +an object implementing the interface defined by the documentation for +\method{CookieJar.add_cookie_header()}. +\end{methoddesc} + +\begin{methoddesc}[CookiePolicy]{domain_return_ok}{domain, request} +Return false if cookies should not be returned, given cookie domain. + +This method is an optimization. It removes the need for checking +every cookie with a particular domain (which might involve reading +many files). 
The default implementations of +\method{domain_return_ok()} and \method{path_return_ok()} +(\samp{return True}) leave all the work to \method{return_ok()}. + +If \method{domain_return_ok()} returns true for the cookie domain, +\method{path_return_ok()} is called for the cookie path. Otherwise, +\method{path_return_ok()} and \method{return_ok()} are never called +for that cookie domain. If \method{path_return_ok()} returns true, +\method{return_ok()} is called with the \class{Cookie} object itself +for a full check. Otherwise, \method{return_ok()} is never called for +that cookie path. + +Note that \method{domain_return_ok()} is called for every +\emph{cookie} domain, not just for the \emph{request} domain. For +example, the function might be called with both \code{".example.com"} +and \code{"www.example.com"} if the request domain is +\code{"www.example.com"}. The same goes for +\method{path_return_ok()}. + +The \var{request} argument is as documented for \method{return_ok()}. +\end{methoddesc} + +\begin{methoddesc}[CookiePolicy]{path_return_ok}{path, request} +Return false if cookies should not be returned, given cookie path. + +See the documentation for \method{domain_return_ok()}. +\end{methoddesc} + + +In addition to implementing the methods above, implementations of the +\class{CookiePolicy} interface must also supply the following +attributes, indicating which protocols should be used, and how. All +of these attributes may be assigned to. + +\begin{memberdesc}{netscape} +Implement netscape protocol. +\end{memberdesc} +\begin{memberdesc}{rfc2965} +Implement RFC 2965 protocol. +\end{memberdesc} +\begin{memberdesc}{hide_cookie2} +Don't add Cookie2 header to requests (the presence of this header +indicates to the server that we understand RFC 2965 cookies). +\end{memberdesc} + +The most useful way to define a \class{CookiePolicy} class is by +subclassing from \class{DefaultCookiePolicy} and overriding some or +all of the methods above. 
\class{CookiePolicy} itself may be used as +a 'null policy' to allow setting and receiving any and all cookies. + + +\subsection{DefaultCookiePolicy Objects \label{default-cookie-policy-objects}} + +Implements the standard rules for accepting and returning cookies. + +Both RFC 2965 and Netscape cookies are covered. RFC 2965 handling is +switched off by default. + +The easiest way to provide your own policy is to override this class +and call its methods in your overridden implementations before adding +your own additional checks: + +\begin{verbatim} +import cookielib +class MyCookiePolicy(cookielib.DefaultCookiePolicy): + def set_ok(self, cookie, request): + if not cookielib.DefaultCookiePolicy.set_ok(self, cookie, request): + return False + if i_dont_want_to_store_this_cookie(cookie): + return False + return True +\end{verbatim} + +In addition to the features required to implement the +\class{CookiePolicy} interface, this class allows you to block and +allow domains from setting and receiving cookies. There are also some +strictness switches that allow you to tighten up the rather loose +Netscape protocol rules a little bit (at the cost of blocking some +benign cookies). + +A domain blacklist and whitelist is provided (both off by default). +Only domains not in the blacklist and present in the whitelist (if the +whitelist is active) participate in cookie setting and returning. Use +the \var{blocked_domains} constructor argument, and +\method{blocked_domains()} and \method{set_blocked_domains()} methods +(and the corresponding argument and methods for +\var{allowed_domains}). If you set a whitelist, you can turn it off +again by setting it to \constant{None}. + +Domains in block or allow lists that do not start with a dot must be +equal. For example, \code{"example.com"} matches a blacklist entry of +\code{"example.com"}, but \code{"www.example.com"} does not. Domains +that do start with a dot are matched by more specific domains too. 
+For example, both \code{"www.example.com"} and +\code{"www.coyote.example.com"} match \code{".example.com"} (but +\code{"example.com"} itself does not). IP addresses are an exception, +and must match exactly. For example, if blocked_domains contains +\code{"192.168.1.2"} and \code{".168.1.2"}, 192.168.1.2 is blocked, +but 193.168.1.2 is not. + +\class{DefaultCookiePolicy} implements the following additional +methods: + +\begin{methoddesc}[DefaultCookiePolicy]{blocked_domains}{} +Return the sequence of blocked domains (as a tuple). +\end{methoddesc} + +\begin{methoddesc}[DefaultCookiePolicy]{set_blocked_domains} + {blocked_domains} +Set the sequence of blocked domains. +\end{methoddesc} + +\begin{methoddesc}[DefaultCookiePolicy]{is_blocked}{domain} +Return whether \var{domain} is on the blacklist for setting or +receiving cookies. +\end{methoddesc} + +\begin{methoddesc}[DefaultCookiePolicy]{allowed_domains}{} +Return \constant{None}, or the sequence of allowed domains (as a tuple). +\end{methoddesc} + +\begin{methoddesc}[DefaultCookiePolicy]{set_allowed_domains} + {allowed_domains} +Set the sequence of allowed domains, or \constant{None}. +\end{methoddesc} + +\begin{methoddesc}[DefaultCookiePolicy]{is_not_allowed}{domain} +Return whether \var{domain} is not on the whitelist for setting or +receiving cookies. +\end{methoddesc} + +\class{DefaultCookiePolicy} instances have the following attributes, +which are all initialised from the constructor arguments of the same +name, and which may all be assigned to. + +General strictness switches: + +\begin{memberdesc}{strict_domain} +Don't allow sites to set two-component domains with country-code +top-level domains like \code{.co.uk}, \code{.gov.uk}, +\code{.co.nz}, etc. This is far from perfect and isn't guaranteed to +work! 
+\end{memberdesc} + +RFC 2965 protocol strictness switches: + +\begin{memberdesc}{strict_rfc2965_unverifiable} +Follow RFC 2965 rules on unverifiable transactions (usually, an +unverifiable transaction is one resulting from a redirect or a request +for an image hosted on another site). If this is false, cookies are +\emph{never} blocked on the basis of verifiability. +\end{memberdesc} + +Netscape protocol strictness switches: + +\begin{memberdesc}{strict_ns_unverifiable} +Apply RFC 2965 rules on unverifiable transactions even to Netscape +cookies. +\end{memberdesc} +\begin{memberdesc}{strict_ns_domain} +Flags indicating how strict to be with domain-matching rules for +Netscape cookies. See below for acceptable values. +\end{memberdesc} +\begin{memberdesc}{strict_ns_set_initial_dollar} +Ignore cookies in Set-Cookie: headers that have names starting with +\code{'\$'}. +\end{memberdesc} +\begin{memberdesc}{strict_ns_set_path} +Don't allow setting cookies whose path doesn't path-match request URI. +\end{memberdesc} + +\member{strict_ns_domain} is a collection of flags. Its value is +constructed by or-ing together (for example, +\code{DomainStrictNoDots|DomainStrictNonDomain} means both flags are +set). + +\begin{memberdesc}{DomainStrictNoDots} +When setting cookies, the 'host prefix' must not contain a dot +(eg. \code{www.foo.bar.com} can't set a cookie for \code{.bar.com}, +because \code{www.foo} contains a dot). +\end{memberdesc} +\begin{memberdesc}{DomainStrictNonDomain} +Cookies that did not explicitly specify a \code{domain} +cookie-attribute can only be returned to a domain that string-compares +equal to the domain that set the cookie (eg. \code{spam.example.com} +won't be returned cookies from \code{example.com} that had no +\code{domain} cookie-attribute). +\end{memberdesc} +\begin{memberdesc}{DomainRFC2965Match} +When setting cookies, require a full RFC 2965 domain-match. 
+\end{memberdesc} + +The following attributes are provided for convenience, and are the +most useful combinations of the above flags: + +\begin{memberdesc}{DomainLiberal} +Equivalent to 0 (ie. all of the above Netscape domain strictness flags +switched off). +\end{memberdesc} +\begin{memberdesc}{DomainStrict} +Equivalent to \code{DomainStrictNoDots|DomainStrictNonDomain}. +\end{memberdesc} + + +\subsection{Cookie Objects \label{cookie-objects}} + +\class{Cookie} instances have Python attributes roughly corresponding +to the standard cookie-attributes specified in the various cookie +standards. The correspondence is not one-to-one, because there are +complicated rules for assigning default values, and because the +\code{max-age} and \code{expires} cookie-attributes contain equivalent +information. + +Assignment to these attributes should not be necessary other than in +rare circumstances in a \class{CookiePolicy} method. The class does +not enforce internal consistency, so you should know what you're +doing if you do that. + +\begin{memberdesc}[Cookie]{version} +Integer or \constant{None}. Netscape cookies have version 0. RFC +2965 and RFC 2109 cookies have version 1. +\end{memberdesc} +\begin{memberdesc}[Cookie]{name} +Cookie name (a string), or \constant{None}. +\end{memberdesc} +\begin{memberdesc}[Cookie]{value} +Cookie value (a string). +\end{memberdesc} +\begin{memberdesc}[Cookie]{port} +String representing a port or a set of ports (eg. '80', or '80,8080'), +or \constant{None}. +\end{memberdesc} +\begin{memberdesc}[Cookie]{path} +Cookie path (a string, eg. '/acme/rocket_launchers'). +\end{memberdesc} +\begin{memberdesc}[Cookie]{secure} +True if cookie should only be returned over a secure connection. +\end{memberdesc} +\begin{memberdesc}[Cookie]{expires} +Integer expiry date in seconds since epoch, or \constant{None}. See +also the \method{is_expired()} method. +\end{memberdesc} +\begin{memberdesc}[Cookie]{discard} +True if this is a session cookie. 
+\end{memberdesc} +\begin{memberdesc}[Cookie]{comment} +String comment from the server explaining the function of this cookie, +or \constant{None}. +\end{memberdesc} +\begin{memberdesc}[Cookie]{comment_url} +URL linking to a comment from the server explaining the function of +this cookie, or \constant{None}. +\end{memberdesc} + +\begin{memberdesc}[Cookie]{port_specified} +True if a port or set of ports was explicitly specified by the server +(in the \mailheader{Set-Cookie} / \mailheader{Set-Cookie2} header). +\end{memberdesc} +\begin{memberdesc}[Cookie]{domain_specified} +True if a domain was explicitly specified by the server. +\end{memberdesc} +\begin{memberdesc}[Cookie]{domain_initial_dot} +True if the domain explicitly specified by the server began with a +dot ('.'). +\end{memberdesc} + +Cookies may have additional non-standard cookie-attributes. These may +be accessed using the following methods: + +\begin{methoddesc}[Cookie]{has_nonstandard_attr}{name} +Return true if cookie has the named cookie-attribute. +\end{methoddesc} +\begin{methoddesc}[Cookie]{get_nonstandard_attr}{name, default=\constant{None}} +If cookie has the named cookie-attribute, return its value. +Otherwise, return \var{default}. +\end{methoddesc} +\begin{methoddesc}[Cookie]{set_nonstandard_attr}{name, value} +Set the value of the named cookie-attribute. +\end{methoddesc} + +The \class{Cookie} class also defines the following method: + +\begin{methoddesc}[Cookie]{is_expired}{\optional{now=\constant{None}}} +True if cookie has passed the time at which the server requested it +should expire. If \var{now} is given (in seconds since the epoch), +return whether the cookie has expired at the specified time. 
+\end{methoddesc} + + +\subsection{Examples \label{cookielib-examples}} + +The first example shows the most common usage of \module{cookielib}: + +\begin{verbatim} +import cookielib, urllib2 +cj = cookielib.CookieJar() +opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) +r = opener.open("http://example.com/") +\end{verbatim} + +This example illustrates how to open a URL using your Netscape, +Mozilla, or lynx cookies (assumes \UNIX{} convention for location of +the cookies file): + +\begin{verbatim} +import os, cookielib, urllib2 +cj = cookielib.MozillaCookieJar() +cj.load(os.path.join(os.environ["HOME"], ".netscape/cookies.txt")) +opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) +r = opener.open("http://example.com/") +\end{verbatim} + +The next example illustrates the use of \class{DefaultCookiePolicy}. +Turn on RFC 2965 cookies, be more strict about domains when setting +and returning Netscape cookies, and block some domains from setting +cookies or having them returned: + +\begin{verbatim} +import urllib2 +from cookielib import CookieJar, DefaultCookiePolicy as Policy +policy = Policy(rfc2965=True, strict_ns_domain=Policy.DomainStrict, + blocked_domains=["ads.net", ".ads.net"]) +cj = CookieJar(policy) +opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) +r = opener.open("http://example.com/") +\end{verbatim} diff --git a/Doc/lib/liburllib2.tex b/Doc/lib/liburllib2.tex index 2399645..3ce95fb 100644 --- a/Doc/lib/liburllib2.tex +++ b/Doc/lib/liburllib2.tex @@ -10,14 +10,13 @@ The \module{urllib2} module defines functions and classes which help in opening URLs (mostly HTTP) in a complex world --- basic and digest -authentication, redirections and more. +authentication, redirections, cookies and more. 
The \module{urllib2} module defines the following functions: \begin{funcdesc}{urlopen}{url\optional{, data}} Open the URL \var{url}, which can be either a string or a \class{Request} -object (currently the code checks that it really is a \class{Request} -instance, or an instance of a subclass of \class{Request}). +object. \var{data} should be a string, which specifies additional data to send to the server. In HTTP requests, which are the only ones that @@ -87,13 +86,32 @@ Gopher handler. The following classes are provided: -\begin{classdesc}{Request}{url\optional{, data\optional{, headers}}} +\begin{classdesc}{Request}{url\optional{, data}\optional{, headers} + \optional{, origin_req_host}\optional{, unverifiable}} This class is an abstraction of a URL request. \var{url} should be a string which is a valid URL. For a description of \var{data} see the \method{add_data()} description. \var{headers} should be a dictionary, and will be treated as if \method{add_header()} was called with each key and value as arguments. + +The final two arguments are only of interest for correct handling of +third-party HTTP cookies: + +\var{origin_req_host} should be the request-host of the origin +transaction, as defined by \rfc{2965}. It defaults to +\code{cookielib.request_host(self)}. This is the host name or IP +address of the original request that was initiated by the user. For +example, if the request is for an image in an HTML document, this +should be the request-host of the request for the page containing the +image. + +\var{unverifiable} should indicate whether the request is +unverifiable, as defined by RFC 2965. It defaults to False. An +unverifiable request is one whose URL the user did not have the option +to approve. For example, if the request is for an image in an HTML +document, and the user had no option to approve the automatic fetching +of the image, this should be true. 
\end{classdesc} \begin{classdesc}{OpenerDirector}{} @@ -116,6 +134,10 @@ responses are turned into \exception{HTTPError} exceptions. A class to handle redirections. \end{classdesc} +\begin{classdesc}{HTTPCookieProcessor}{\optional{cookiejar}} +A class to handle HTTP Cookies. +\end{classdesc} + \begin{classdesc}{ProxyHandler}{\optional{proxies}} Cause requests to go through a proxy. If \var{proxies} is given, it must be a dictionary mapping @@ -217,10 +239,10 @@ The following methods describe all of \class{Request}'s public interface, and so all must be overridden in subclasses. \begin{methoddesc}[Request]{add_data}{data} -Set the \class{Request} data to \var{data}. This is ignored -by all handlers except HTTP handlers --- and there it should be an -\mimetype{application/x-www-form-encoded} buffer, and will change the -request to be \code{POST} rather than \code{GET}. +Set the \class{Request} data to \var{data}. This is ignored by all +handlers except HTTP handlers --- and there it should be a byte +string, and will change the request to be \code{POST} rather than +\code{GET}. \end{methoddesc} \begin{methoddesc}[Request]{get_method}{} @@ -282,6 +304,17 @@ and \var{type} will replace those of the instance, and the instance's selector will be the original URL given in the constructor. \end{methoddesc} +\begin{methoddesc}[Request]{get_origin_req_host}{} +Return the request-host of the origin transaction, as defined by +\rfc{2965}. See the documentation for the \class{Request} +constructor. +\end{methoddesc} + +\begin{methoddesc}[Request]{is_unverifiable}{} +Return whether the request is unverifiable, as defined by RFC 2965. +See the documentation for the \class{Request} constructor. +\end{methoddesc} + \subsection{OpenerDirector Objects \label{opener-director-objects}} @@ -289,14 +322,18 @@ selector will be the original URL given in the constructor. \begin{methoddesc}[OpenerDirector]{add_handler}{handler} \var{handler} should be an instance of \class{BaseHandler}. 
The -following methods are searched, and added to the possible chains. +following methods are searched, and added to the possible chains (note +that HTTP errors are a special case). \begin{itemize} \item \method{\var{protocol}_open()} --- signal that the handler knows how to open \var{protocol} URLs. - \item \method{\var{protocol}_error_\var{type}()} --- - signal that the handler knows how to handle \var{type} errors from - \var{protocol}. + \item \method{http_error_\var{type}()} --- + signal that the handler knows how to handle HTTP errors with HTTP + error code \var{type}. + \item \method{\var{protocol}_error()} --- + signal that the handler knows how to handle errors from + (non-\code{http}) \var{protocol}. \item \method{\var{protocol}_request()} --- signal that the handler knows how to pre-process \var{protocol} requests. @@ -306,26 +343,17 @@ following methods are searched, and added to the possible chains. \end{itemize} \end{methoddesc} -\begin{methoddesc}[OpenerDirector]{close}{} -Explicitly break cycles, and delete all the handlers. -Because the \class{OpenerDirector} needs to know the registered handlers, -and a handler needs to know who the \class{OpenerDirector} who called -it is, there is a reference cycle. Even though recent versions of Python -have cycle-collection, it is sometimes preferable to explicitly break -the cycles. -\end{methoddesc} - \begin{methoddesc}[OpenerDirector]{open}{url\optional{, data}} Open the given \var{url} (which can be a request object or a string), optionally passing the given \var{data}. Arguments, return values and exceptions raised are the same as those of \function{urlopen()} (which simply calls the \method{open()} method -on the default installed \class{OpenerDirector}). +on the currently installed global \class{OpenerDirector}). \end{methoddesc} \begin{methoddesc}[OpenerDirector]{error}{proto\optional{, arg\optional{, \moreargs}}} -Handle an error in a given protocol. 
This will call the registered +Handle an error of the given protocol. This will call the registered error handlers for the given protocol with the given arguments (which are protocol specific). The HTTP protocol is a special case which uses the HTTP response code to determine the specific error handler; @@ -335,6 +363,45 @@ Return values and exceptions raised are the same as those of \function{urlopen()}. \end{methoddesc} +OpenerDirector objects open URLs in three stages: + +\begin{enumerate} + \item Every handler with a method named like + \method{\var{protocol}_request()} has that method called to + pre-process the request. + + The order in which these methods are called is determined by + sorting the handler instances by the \member{.processor_order} + attribute. + + \item Handlers with a method named like + \method{\var{protocol}_open()} are called to handle the request. + This stage ends when a handler either returns a + non-\constant{None} value (ie. a response), or raises an exception + (usually URLError). Exceptions are allowed to propagate. + + In fact, the above algorithm is first tried for methods named + \method{default_open}. If all such methods return + \constant{None}, the algorithm is repeated for methods named like + \method{\var{protocol}_open()}. If all such methods return + \constant{None}, the algorithm is repeated for methods named + \method{unknown_open()}. + + Note that the implementation of these methods may involve calls of + the parent \class{OpenerDirector} instance's \method{.open()} and + \method{.error()} methods. + + The order in which these methods are called is determined by + sorting the handler instances. + + \item Every handler with a method named like + \method{\var{protocol}_response()} has that method called to + post-process the response. + + The order in which these methods are called is determined by + sorting the handler instances by the \member{.processor_order} + attribute. 
+\end{enumerate} \subsection{BaseHandler Objects \label{base-handler-objects}} @@ -351,7 +418,11 @@ Remove any parents. \end{methoddesc} The following members and methods should only be used by classes -derived from \class{BaseHandler}: +derived from \class{BaseHandler}. \note{The convention has been +adopted that subclasses defining \method{\var{protocol}_request()} or +\method{\var{protocol}_response()} methods are named +\class{*Processor}; all others are named \class{*Handler}.} + \begin{memberdesc}[BaseHandler]{parent} A valid \class{OpenerDirector}, which can be used to open using a @@ -423,6 +494,29 @@ Arguments, return values and exceptions raised should be the same as for \method{http_error_default()}. \end{methoddesc} +\begin{methoddescni}[BaseHandler]{\var{protocol}_request}{req} +This method is \emph{not} defined in \class{BaseHandler}, but +subclasses should define it if they want to pre-process requests of +the given protocol. + +This method, if defined, will be called by the parent +\class{OpenerDirector}. \var{req} will be a \class{Request} object. +The return value should be a \class{Request} object. +\end{methoddescni} + +\begin{methoddescni}[BaseHandler]{\var{protocol}_response}{req, response} +This method is \emph{not} defined in \class{BaseHandler}, but +subclasses should define it if they want to post-process responses of +the given protocol. + +This method, if defined, will be called by the parent +\class{OpenerDirector}. \var{req} will be a \class{Request} object. +\var{response} will be an object implementing the same interface as +the return value of \function{urlopen()}. The return value should +implement the same interface as the return value of +\function{urlopen()}. +\end{methoddescni} + \subsection{HTTPRedirectHandler Objects \label{http-redirect-handler}} \note{Some HTTP redirections require action from this module's client @@ -434,12 +528,12 @@ for \method{http_error_default()}. 
fp, code, msg, hdrs} Return a \class{Request} or \code{None} in response to a redirect. This is called by the default implementations of the -\method{http_error_30*()} methods when a redirection is received -from the server. If a redirection should take place, return a new +\method{http_error_30*()} methods when a redirection is received from +the server. If a redirection should take place, return a new \class{Request} to allow \method{http_error_30*()} to perform the -redirect. Otherwise, raise \exception{HTTPError} if no other -\class{Handler} should try to handle this URL, or return \code{None} -if you can't but another \class{Handler} might. +redirect. Otherwise, raise \exception{HTTPError} if no other handler +should try to handle this URL, or return \code{None} if you can't but +another handler might. \begin{notice} The default implementation of this method does not strictly @@ -478,6 +572,15 @@ The same as \method{http_error_301()}, but called for the \end{methoddesc} +\subsection{HTTPCookieProcessor Objects \label{http-cookie-processor}} + +\class{HTTPCookieProcessor} instances have one attribute: + +\begin{memberdesc}{cookiejar} +The \class{cookielib.CookieJar} in which cookies are stored. +\end{memberdesc} + + \subsection{ProxyHandler Objects \label{proxy-handler}} \begin{methoddescni}[ProxyHandler]{\var{protocol}_open}{request} diff --git a/Doc/whatsnew/whatsnew24.tex b/Doc/whatsnew/whatsnew24.tex index bcb9134..a394971 100644 --- a/Doc/whatsnew/whatsnew24.tex +++ b/Doc/whatsnew/whatsnew24.tex @@ -569,6 +569,25 @@ For example: %====================================================================== % whole new modules get described in \subsections here +\subsection{cookielib} + +The \module{cookielib} library supports client-side handling for HTTP +cookies, just as the \module{Cookie} provides server-side cookie +support in CGI scripts. This library manages cookies in a way similar +to web browsers. 
Cookies are stored in cookie jars; the library +transparently stores cookies offered by the web server in the cookie +jar, and fetches the cookie from the jar when connecting to the +server. Similar to web browsers, policy objects control whether +cookies are accepted or not. + +In order to store cookies across sessions, two implementations of +cookie jars are provided: one that stores cookies in the Netscape +format, so applications can use the Mozilla or Lynx cookie jars, and +one that stores cookies in the same format as the Perl libwww library. + +\module{urllib2} has been changed to interact with \module{cookielib}: +\class{HTTPCookieProcessor} manages a cookie jar that is used when +accessing URLs. % ====================================================================== \section{Build and C API Changes} diff --git a/Lib/_LWPCookieJar.py b/Lib/_LWPCookieJar.py new file mode 100644 index 0000000..2eb83a6 --- /dev/null +++ b/Lib/_LWPCookieJar.py @@ -0,0 +1,171 @@ +"""Load / save to libwww-perl (LWP) format files. + +Actually, the format is slightly extended from that used by LWP's +(libwww-perl's) HTTP::Cookies, to avoid losing some RFC 2965 information +not recorded by LWP. + +It uses the version string "2.0", though really there isn't an LWP Cookies +2.0 format. This indicates that there is extra information in here +(domain_dot and # port_spec) while still being compatible with +libwww-perl, I hope. + +""" + +import time, re, logging +from cookielib import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT, join_header_words, split_header_words, \ + iso2time, time2isoz + +def lwp_cookie_str(cookie): + """Return string representation of Cookie in the LWP cookie file format. + + Actually, the format is extended a bit -- see module docstring. 
+ + """ + h = [(cookie.name, cookie.value), + ("path", cookie.path), + ("domain", cookie.domain)] + if cookie.port is not None: h.append(("port", cookie.port)) + if cookie.path_specified: h.append(("path_spec", None)) + if cookie.port_specified: h.append(("port_spec", None)) + if cookie.domain_initial_dot: h.append(("domain_dot", None)) + if cookie.secure: h.append(("secure", None)) + if cookie.expires: h.append(("expires", + time2isoz(float(cookie.expires)))) + if cookie.discard: h.append(("discard", None)) + if cookie.comment: h.append(("comment", cookie.comment)) + if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) + + keys = cookie._rest.keys() + keys.sort() + for k in keys: + h.append((k, str(cookie._rest[k]))) + + h.append(("version", str(cookie.version))) + + return join_header_words([h]) + +class LWPCookieJar(FileCookieJar): + """ + The LWPCookieJar saves a sequence of "Set-Cookie3" lines. + "Set-Cookie3" is the format used by the libwww-perl library, not known + to be compatible with any browser, but which is easy to read and + doesn't lose information about RFC 2965 cookies. + + Additional methods + + as_lwp_str(ignore_discard=True, ignore_expires=True) + + """ + + def as_lwp_str(self, ignore_discard=True, ignore_expires=True): + """Return cookies as a string of "\n"-separated "Set-Cookie3" headers. 
+ + ignore_discard and ignore_expires: see docstring for FileCookieJar.save + + """ + now = time.time() + r = [] + for cookie in self: + if not ignore_discard and cookie.discard: + continue + if not ignore_expires and cookie.is_expired(now): + continue + r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) + return "\n".join(r+[""]) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + # There really isn't an LWP Cookies 2.0 format, but this indicates + # that there is extra information in here (domain_dot and + # port_spec) while still being compatible with libwww-perl, I hope. + f.write("#LWP-Cookies-2.0\n") + f.write(self.as_lwp_str(ignore_discard, ignore_expires)) + finally: + f.close() + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + magic = f.readline() + if not re.search(self.magic_re, magic): + msg = "%s does not seem to contain cookies" % filename + raise IOError(msg) + + now = time.time() + + header = "Set-Cookie3:" + boolean_attrs = ("port_spec", "path_spec", "domain_dot", + "secure", "discard") + value_attrs = ("version", + "port", "path", "domain", + "expires", + "comment", "commenturl") + + try: + while 1: + line = f.readline() + if line == "": break + if not line.startswith(header): + continue + line = line[len(header):].strip() + + for data in split_header_words([line]): + name, value = data[0] + # name and value are an exception here, since a plain "foo" + # (with no "=", unlike "bar=foo") means a cookie with no + # name and value "foo". With all other cookie-attributes, + # the situation is reversed: "foo" means an attribute named + # "foo" with no value! 
+ if value is None: + name, value = value, name + standard = {} + rest = {} + for k in boolean_attrs: + standard[k] = False + for k, v in data[1:]: + if k is not None: + lc = k.lower() + else: + lc = None + # don't lose case distinction for unknown fields + if (lc in value_attrs) or (lc in boolean_attrs): + k = lc + if k in boolean_attrs: + if v is None: v = True + standard[k] = v + elif k in value_attrs: + standard[k] = v + else: + rest[k] = v + + h = standard.get + expires = h("expires") + discard = h("discard") + if expires is not None: + expires = iso2time(expires) + if expires is None: + discard = True + domain = h("domain") + domain_specified = domain.startswith(".") + c = Cookie(h("version"), name, value, + h("port"), h("port_spec"), + domain, domain_specified, h("domain_dot"), + h("path"), h("path_spec"), + h("secure"), + expires, + discard, + h("comment"), + h("commenturl"), + rest) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + except: + reraise_unmasked_exceptions((IOError,)) + raise IOError("invalid Set-Cookie3 format file %s" % filename) diff --git a/Lib/_MozillaCookieJar.py b/Lib/_MozillaCookieJar.py new file mode 100644 index 0000000..761a879 --- /dev/null +++ b/Lib/_MozillaCookieJar.py @@ -0,0 +1,145 @@ +"""Mozilla / Netscape cookie loading / saving.""" + +import re, time, logging + +from cookielib import reraise_unmasked_exceptions, FileCookieJar, Cookie, \ + MISSING_FILENAME_TEXT + +class MozillaCookieJar(FileCookieJar): + """ + + WARNING: you may want to backup your browser's cookies file if you use + this class to save cookies. I *think* it works, but there have been + bugs in the past! + + This class differs from CookieJar only in the format it uses to save and + load cookies to and from a file. This class uses the Mozilla/Netscape + `cookies.txt' format. lynx uses this file format, too. 
+ + Don't expect cookies saved while the browser is running to be noticed by + the browser (in fact, Mozilla on unix will overwrite your saved cookies if + you change them on disk while it's running; on Windows, you probably can't + save at all while the browser is running). + + Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to + Netscape cookies on saving. + + In particular, the cookie version and port number information is lost, + together with information about whether or not Path, Port and Discard were + specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the + domain as set in the HTTP header started with a dot (yes, I'm aware some + domains in Netscape files start with a dot and some don't -- trust me, you + really don't want to know any more about this). + + Note that though Mozilla and Netscape use the same format, they use + slightly different headers. The class saves cookies using the Netscape + header by default (Mozilla can cope with that). + + """ + magic_re = "#( Netscape)? HTTP Cookie File" + header = """\ + # Netscape HTTP Cookie File + # http://www.netscape.com/newsref/std/cookie_spec.html + # This is a generated file! Do not edit. + +""" + + def _really_load(self, f, filename, ignore_discard, ignore_expires): + now = time.time() + + magic = f.readline() + if not re.search(self.magic_re, magic): + f.close() + raise IOError( + "%s does not look like a Netscape format cookies file" % + filename) + + try: + while 1: + line = f.readline() + if line == "": break + + # last field may be absent, so keep any trailing tab + if line.endswith("\n"): line = line[:-1] + + # skip comments and blank lines XXX what is $ for? 
+ if (line.strip().startswith("#") or + line.strip().startswith("$") or + line.strip() == ""): + continue + + domain, domain_specified, path, secure, expires, name, value = \ + line.split("\t") + secure = (secure == "TRUE") + domain_specified = (domain_specified == "TRUE") + if name == "": + name = value + value = None + + initial_dot = domain.startswith(".") + assert domain_specified == initial_dot + + discard = False + if expires == "": + expires = None + discard = True + + # assume path_specified is false + c = Cookie(0, name, value, + None, False, + domain, domain_specified, initial_dot, + path, False, + secure, + expires, + discard, + None, + None, + {}) + if not ignore_discard and c.discard: + continue + if not ignore_expires and c.is_expired(now): + continue + self.set_cookie(c) + + except: + reraise_unmasked_exceptions((IOError,)) + raise IOError("invalid Netscape format file %s: %s" % + (filename, line)) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename, "w") + try: + f.write(self.header) + now = time.time() + for cookie in self: + if not ignore_discard and cookie.discard: + continue + if not ignore_expires and cookie.is_expired(now): + continue + if cookie.secure: secure = "TRUE" + else: secure = "FALSE" + if cookie.domain.startswith("."): initial_dot = "TRUE" + else: initial_dot = "FALSE" + if cookie.expires is not None: + expires = str(cookie.expires) + else: + expires = "" + if cookie.value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas cookielib regards it as a + # cookie with no value. 
+ name = "" + value = cookie.name + else: + name = cookie.name + value = cookie.value + f.write( + "\t".join([cookie.domain, initial_dot, cookie.path, + secure, expires, name, value])+ + "\n") + finally: + f.close() diff --git a/Lib/cookielib.py b/Lib/cookielib.py new file mode 100644 index 0000000..b82886a --- /dev/null +++ b/Lib/cookielib.py @@ -0,0 +1,1756 @@ +"""HTTP cookie handling for web clients. + +This module has (now fairly distant) origins in Gisle Aas' Perl module +HTTP::Cookies, from the libwww-perl library. + +Docstrings, comments and debug strings in this code refer to the +attributes of the HTTP cookie system as cookie-attributes, to distinguish +them clearly from Python attributes. + +Class diagram (note that the classes which do not derive from +FileCookieJar are not distributed with the Python standard library, but +are available from http://wwwsearch.sf.net/): + + CookieJar____ + / \ \ + FileCookieJar \ \ + / | \ \ \ + MozillaCookieJar | LWPCookieJar \ \ + | | \ + | ---MSIEBase | \ + | / | | \ + | / MSIEDBCookieJar BSDDBCookieJar + |/ + MSIECookieJar + +""" + +import sys, re, urlparse, copy, time, struct, urllib, types, logging +from types import StringTypes +try: + import threading as _threading +except ImportError: + import dummy_threading as _threading +import httplib # only for the default HTTP port +from calendar import timegm + +logging.getLogger("cookielib").addHandler(logging.StreamHandler()) +debug = logging.getLogger("cookielib").debug + +DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT) +MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " + "instance initialised with one)") + +def reraise_unmasked_exceptions(unmasked=()): + # There are a few catch-all except: statements in this module, for + # catching input that's bad in unexpected ways. + # This function re-raises some exceptions we don't want to trap. 
+ unmasked = unmasked + (KeyboardInterrupt, SystemExit, MemoryError) + etype = sys.exc_info()[0] + if issubclass(etype, unmasked): + raise + # swallowed an exception + import warnings + warnings.warn("cookielib bug!", stacklevel=2) + import traceback + traceback.print_exc() + + +# Date/time conversion +# ----------------------------------------------------------------------------- + +EPOCH_YEAR = 1970 +def _timegm(tt): + year, month, mday, hour, min, sec = tt[:6] + if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and + (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): + return timegm(tt) + else: + return None + +DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] +MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] +MONTHS_LOWER = [] +for month in MONTHS: MONTHS_LOWER.append(month.lower()) + +def time2isoz(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. + + The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", + representing Universal Time (UTC, aka GMT). An example of this format is: + + 1994-11-24 08:49:37Z + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec = time.gmtime(t)[:6] + return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( + year, mon, mday, hour, min, sec) + +def time2netscape(t=None): + """Return a string representing time in seconds since epoch, t. + + If the function is called without an argument, it will use the current + time. 
+ + The format of the returned string is like this: + + Wed, DD-Mon-YYYY HH:MM:SS GMT + + """ + if t is None: t = time.time() + year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7] + return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % ( + DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec) + + +UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} + +TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$") +def offset_from_tz_string(tz): + offset = None + if tz in UTC_ZONES: + offset = 0 + else: + m = TIMEZONE_RE.search(tz) + if m: + offset = 3600 * int(m.group(2)) + if m.group(3): + offset = offset + 60 * int(m.group(3)) + if m.group(1) == '-': + offset = -offset + return offset + +def _str2time(day, mon, yr, hr, min, sec, tz): + # translate month name to number + # month numbers start with 1 (January) + try: + mon = MONTHS_LOWER.index(mon.lower())+1 + except ValueError: + # maybe it's already a number + try: + imon = int(mon) + except ValueError: + return None + if 1 <= imon <= 12: + mon = imon + else: + return None + + # make sure clock elements are defined + if hr is None: hr = 0 + if min is None: min = 0 + if sec is None: sec = 0 + + yr = int(yr) + day = int(day) + hr = int(hr) + min = int(min) + sec = int(sec) + + if yr < 1000: + # find "obvious" year + cur_yr = time.localtime(time.time())[0] + m = cur_yr % 100 + tmp = yr + yr = yr + cur_yr - m + m = m - tmp + if abs(m) > 50: + if m > 0: yr = yr + 100 + else: yr = yr - 100 + + # convert UTC time tuple to seconds since epoch (not timezone-adjusted) + t = _timegm((yr, mon, day, hr, min, sec, tz)) + + if t is not None: + # adjust time using timezone string, to get absolute time since epoch + if tz is None: + tz = "UTC" + tz = tz.upper() + offset = offset_from_tz_string(tz) + if offset is None: + return None + t = t - offset + + return t + +STRICT_DATE_RE = re.compile( + r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " + "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$") +WEEKDAY_RE = re.compile( + 
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I) +LOOSE_HTTP_DATE_RE = re.compile( + r"""^ + (\d\d?) # day + (?:\s+|[-\/]) + (\w+) # month + (?:\s+|[-\/]) + (\d+) # year + (?: + (?:\s+|:) # separator before clock + (\d\d?):(\d\d) # hour:min + (?::(\d\d))? # optional seconds + )? # optional clock + \s* + ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone + \s* + (?:\(\w+\))? # ASCII representation of timezone in parens. + \s*$""", re.X) +def http2time(text): + """Returns time in seconds since epoch of time represented by a string. + + Return value is an integer. + + None is returned if the format of str is unrecognized, the time is outside + the representable range, or the timezone string is not recognized. If the + string contains no timezone, UTC is assumed. + + The timezone in the string may be numerical (like "-0800" or "+0100") or a + string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the + timezone strings equivalent to UTC (zero offset) are known to the function. + + The function loosely parses the following formats: + + Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format + Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format + Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format + 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) + 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) + 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) + + The parser ignores leading and trailing whitespace. The time may be + absent. + + If the year is given with only 2 digits, the function will select the + century that makes the year closest to the current date. + + """ + # fast exit for strictly conforming string + m = STRICT_DATE_RE.search(text) + if m: + g = m.groups() + mon = MONTHS_LOWER.index(g[1].lower()) + 1 + tt = (int(g[2]), mon, int(g[0]), + int(g[3]), int(g[4]), float(g[5])) + return _timegm(tt) + + # No, we need some messy parsing... 
+ + # clean up + text = text.lstrip() + text = WEEKDAY_RE.sub("", text, 1) # Useless weekday + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = LOOSE_HTTP_DATE_RE.search(text) + if m is not None: + day, mon, yr, hr, min, sec, tz = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + +ISO_DATE_RE = re.compile( + """^ + (\d{4}) # year + [-\/]? + (\d\d?) # numerical month + [-\/]? + (\d\d?) # day + (?: + (?:\s+|[-:Tt]) # separator before clock + (\d\d?):?(\d\d) # hour:min + (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) + )? # optional clock + \s* + ([-+]?\d\d?:?(:?\d\d)? + |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) + \s*$""", re.X) +def iso2time(text): + """ + As for http2time, but parses the ISO 8601 formats: + + 1994-02-03 14:15:29 -0100 -- ISO 8601 format + 1994-02-03 14:15:29 -- zone is optional + 1994-02-03 -- only date + 1994-02-03T14:15:29 -- Use T as separator + 19940203T141529Z -- ISO 8601 compact format + 19940203 -- only date + + """ + # clean up + text = text.lstrip() + + # tz is time zone specifier string + day, mon, yr, hr, min, sec, tz = [None]*7 + + # loose regexp parse + m = ISO_DATE_RE.search(text) + if m is not None: + # XXX there's an extra bit of the timezone I'm ignoring here: is + # this the right thing to do? 
+ yr, mon, day, hr, min, sec, tz, _ = m.groups() + else: + return None # bad format + + return _str2time(day, mon, yr, hr, min, sec, tz) + + +# Header parsing +# ----------------------------------------------------------------------------- + +def unmatched(match): + """Return unmatched part of re.Match object.""" + start, end = match.span(0) + return match.string[:start]+match.string[end:] + +HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") +HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") +HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") +HEADER_ESCAPE_RE = re.compile(r"\\(.)") +def split_header_words(header_values): + r"""Parse header values into a list of lists containing key,value pairs. + + The function knows how to deal with ",", ";" and "=" as well as quoted + values after "=". A list of space separated tokens are parsed as if they + were separated by ";". + + If the header_values passed as argument contains multiple values, then they + are treated as if they were a single value separated by comma ",". + + This means that this function is useful for parsing header fields that + follow this syntax (BNF as from the HTTP/1.1 specification, but we relax + the requirement for tokens). + + headers = #header + header = (token | parameter) *( [";"] (token | parameter)) + + token = 1*<any CHAR except CTLs or separators> + separators = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + + quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) + qdtext = <any TEXT except <">> + quoted-pair = "\" CHAR + + parameter = attribute "=" value + attribute = token + value = token | quoted-string + + Each header is represented by a list of key/value pairs. The value for a + simple token (not part of a parameter) is None. Syntactically incorrect + headers will not necessarily be parsed as you would want. 
+ + This is easier to describe with some examples: + + >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) + [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] + >>> split_header_words(['text/html; charset="iso-8859-1"']) + [[('text/html', None), ('charset', 'iso-8859-1')]] + >>> split_header_words([r'Basic realm="\"foo\bar\""']) + [[('Basic', None), ('realm', '"foobar"')]] + + """ + assert type(header_values) not in StringTypes + result = [] + for text in header_values: + orig_text = text + pairs = [] + while text: + m = HEADER_TOKEN_RE.search(text) + if m: + text = unmatched(m) + name = m.group(1) + m = HEADER_QUOTED_VALUE_RE.search(text) + if m: # quoted value + text = unmatched(m) + value = m.group(1) + value = HEADER_ESCAPE_RE.sub(r"\1", value) + else: + m = HEADER_VALUE_RE.search(text) + if m: # unquoted value + text = unmatched(m) + value = m.group(1) + value = value.rstrip() + else: + # no value, a lone token + value = None + pairs.append((name, value)) + elif text.lstrip().startswith(","): + # concatenated headers, as per RFC 2616 section 4.2 + text = text.lstrip()[1:] + if pairs: result.append(pairs) + pairs = [] + else: + # skip junk + non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text) + assert nr_junk_chars > 0, ( + "split_header_words bug: '%s', '%s', %s" % + (orig_text, text, pairs)) + text = non_junk + if pairs: result.append(pairs) + return result + +HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") +def join_header_words(lists): + """Do the inverse (almost) of the conversion done by split_header_words. + + Takes a list of lists of (key, value) pairs and produces a single header + value. Attribute values are quoted if needed. 
+ + >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]]) + 'text/plain; charset="iso-8859/1"' + >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]]) + 'text/plain, charset="iso-8859/1"' + + """ + headers = [] + for pairs in lists: + attr = [] + for k, v in pairs: + if v is not None: + if not re.search(r"^\w+$", v): + v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ + v = '"%s"' % v + k = "%s=%s" % (k, v) + attr.append(k) + if attr: headers.append("; ".join(attr)) + return ", ".join(headers) + +def parse_ns_headers(ns_headers): + """Ad-hoc parser for Netscape protocol cookie-attributes. + + The old Netscape cookie format for Set-Cookie can for instance contain + an unquoted "," in the expires field, so we have to use this ad-hoc + parser instead of split_header_words. + + XXX This may not make the best possible effort to parse all the crap + that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient + parser is probably better, so could do worse than following that if + this ever gives any trouble. + + Currently, this is also used for parsing RFC 2109 cookies. + + """ + known_attrs = ("expires", "domain", "path", "secure", + # RFC 2109 attrs (may turn up in Netscape cookies, too) + "port", "max-age") + + result = [] + for ns_header in ns_headers: + pairs = [] + version_set = False + for param in re.split(r";\s*", ns_header): + param = param.rstrip() + if param == "": continue + if "=" not in param: + if param.lower() in known_attrs: + k, v = param, None + else: + # cookie with missing value + k, v = param, None + else: + k, v = re.split(r"\s*=\s*", param, 1) + k = k.lstrip() + if k is not None: + lc = k.lower() + if lc in known_attrs: + k = lc + if k == "version": + # This is an RFC 2109 cookie. Will be treated as RFC 2965 + # cookie in rest of code. + # Probably it should be parsed with split_header_words, but + # that's too much hassle. 
IPV4_RE = re.compile(r"\.\d+$")


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    # the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    # at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        # ends in ".<digits>": treated as an IP address, not a name
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True


def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    if not B.startswith("."):
        return False
    # "A has the form NB" means B must be a proper suffix of A.  A and B
    # are already known to differ, so a suffix match guarantees that N is
    # non-empty.  Merely finding B somewhere inside A is not enough:
    # www.acme.com.evil.org must not domain-match .acme.com.
    if not A.endswith(B):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
def request_port(request):
    """Return the port of the request as a string.

    The port is whatever follows the first ':' in request.get_host().  When
    the host contains no ':' the module-level DEFAULT_HTTP_PORT is returned
    instead.  When the text after the ':' is not accepted by int(), debug()
    is called and None is returned.

    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        # no explicit port in the host string
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        # validation only: the port is handed back as text, not as an int
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # H does not have the form A.B
        return h
    # A would be h[:first_dot]; only B is needed below
    b = h[first_dot+1:]
    if not is_HDN(h):
        return h
    if "." in b or b == "local":
        return "." + b
    return h
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest):
        """Store the given cookie attributes.

        version and expires are converted with int() unless they are None.
        domain is lowercased.  rest (the non-standard attributes) is
        shallow-copied.  Raises ValueError if port is None while
        port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port_specified is True and port is None:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url

        # private copy, so later set_nonstandard_attr() calls do not touch
        # the mapping the caller passed in
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return true if a non-standard attribute called name is stored."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard attribute name, or default if absent."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Store value as the non-standard attribute name."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to time.time().  A cookie whose expires is None never
        counts as expired.
        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        return self.expires <= now

    def __str__(self):
        port_part = ""
        if self.port is not None:
            port_part = ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
            )
        parts = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                 for attr_name in attr_names]
        parts.append("rest=%s" % repr(self._rest))
        return "Cookie(%s)" % ", ".join(parts)
+ + """ + raise NotImplementedError() + + def return_ok(self, cookie, request): + """Return true if (and only if) cookie should be returned to server.""" + raise NotImplementedError() + + def domain_return_ok(self, domain, request): + """Return false if cookies should not be returned, given cookie domain. + """ + return True + + def path_return_ok(self, path, request): + """Return false if cookies should not be returned, given cookie path. + """ + return True + + +class DefaultCookiePolicy(CookiePolicy): + """Implements the standard rules for accepting and returning cookies.""" + + DomainStrictNoDots = 1 + DomainStrictNonDomain = 2 + DomainRFC2965Match = 4 + + DomainLiberal = 0 + DomainStrict = DomainStrictNoDots|DomainStrictNonDomain + + def __init__(self, + blocked_domains=None, allowed_domains=None, + netscape=True, rfc2965=False, + hide_cookie2=False, + strict_domain=False, + strict_rfc2965_unverifiable=True, + strict_ns_unverifiable=False, + strict_ns_domain=DomainLiberal, + strict_ns_set_initial_dollar=False, + strict_ns_set_path=False, + ): + """Constructor arguments should be passed as keyword arguments only.""" + self.netscape = netscape + self.rfc2965 = rfc2965 + self.hide_cookie2 = hide_cookie2 + self.strict_domain = strict_domain + self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable + self.strict_ns_unverifiable = strict_ns_unverifiable + self.strict_ns_domain = strict_ns_domain + self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar + self.strict_ns_set_path = strict_ns_set_path + + if blocked_domains is not None: + self._blocked_domains = tuple(blocked_domains) + else: + self._blocked_domains = () + + if allowed_domains is not None: + allowed_domains = tuple(allowed_domains) + self._allowed_domains = allowed_domains + + def blocked_domains(self): + """Return the sequence of blocked domains (as a tuple).""" + return self._blocked_domains + def set_blocked_domains(self, blocked_domains): + """Set the sequence of blocked domains.""" 
def is_not_allowed(self, domain):
    """Return True if an allow-list is set and domain is not on it.

    When no allow-list is configured (self._allowed_domains is None) every
    domain is acceptable and False is returned.  Otherwise domain is compared
    against each entry with user_domain_match(); the result is False as soon
    as one entry matches and True if none does.
    """
    allow_list = self._allowed_domains
    if allow_list is None:
        return False
    permitted = False
    for candidate in allow_list:
        if user_domain_match(domain, candidate):
            permitted = True
            break
    return not permitted
+ debug(" Set-Cookie2 without version attribute (%s=%s)", + cookie.name, cookie.value) + return False + if cookie.version > 0 and not self.rfc2965: + debug(" RFC 2965 cookies are switched off") + return False + elif cookie.version == 0 and not self.netscape: + debug(" Netscape cookies are switched off") + return False + return True + + def set_ok_verifiability(self, cookie, request): + if request.is_unverifiable() and is_third_party(request): + if cookie.version > 0 and self.strict_rfc2965_unverifiable: + debug(" third-party RFC 2965 cookie during " + "unverifiable transaction") + return False + elif cookie.version == 0 and self.strict_ns_unverifiable: + debug(" third-party Netscape cookie during " + "unverifiable transaction") + return False + return True + + def set_ok_name(self, cookie, request): + # Try and stop servers setting V0 cookies designed to hack other + # servers that know both V0 and V1 protocols. + if (cookie.version == 0 and self.strict_ns_set_initial_dollar and + cookie.name.startswith("$")): + debug(" illegal name (starts with '$'): '%s'", cookie.name) + return False + return True + + def set_ok_path(self, cookie, request): + if cookie.path_specified: + req_path = request_path(request) + if ((cookie.version > 0 or + (cookie.version == 0 and self.strict_ns_set_path)) and + not req_path.startswith(cookie.path)): + debug(" path attribute %s is not a prefix of request " + "path %s", cookie.path, req_path) + return False + return True + + def set_ok_domain(self, cookie, request): + if self.is_blocked(cookie.domain): + debug(" domain %s is in user block-list", cookie.domain) + return False + if self.is_not_allowed(cookie.domain): + debug(" domain %s is not in user allow-list", cookie.domain) + return False + if cookie.domain_specified: + req_host, erhn = eff_request_host(request) + domain = cookie.domain + if self.strict_domain and (domain.count(".") >= 2): + i = domain.rfind(".") + j = domain.rfind(".", 0, i) + if j == 0: # domain like .foo.bar + tld 
def set_ok_port(self, cookie, request):
    """Return True if the cookie's port list permits setting it.

    A cookie without an explicit port list (cookie.port_specified false) is
    always accepted.  Otherwise cookie.port is split on "," and the request
    port (request_port(request), with None replaced by "80") must equal one
    of the entries.  Entries are checked left to right; a non-numeric entry
    met before a match rejects the cookie.
    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    else:
        req_port = str(req_port)

    for listed_port in cookie.port.split(","):
        try:
            int(listed_port)
        except ValueError:
            debug(" bad port %s (not numeric)", listed_port)
            return False
        if listed_port == req_port:
            return True

    debug(" request port (%s) not found in %s",
          req_port, cookie.port)
    return False
def return_ok_secure(self, cookie, request):
    """Return False for a secure cookie on a request that is not https.

    The request type is only consulted (via request.get_type()) when
    cookie.secure is true.
    """
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
def domain_return_ok(self, domain, request):
    """Return false if cookies stored under domain should not be returned.

    domain is a cookie domain; request is the request about to be sent.
    The result is False when neither the request-host nor the effective
    request-host name (each given a leading dot if it lacks one) ends with
    domain, or when domain is on the block-list, or when an allow-list is
    set and domain is not on it.  Otherwise the result is True.
    """
    # Liberal check of the domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    # Compare in dotted form, so that host "example.com" is accepted for
    # cookie domain ".example.com".  A host that already starts with a dot
    # is used as it is.
    if not req_host.startswith("."):
        req_host = "." + req_host
    if not erhn.startswith("."):
        erhn = "." + erhn
    if not (req_host.endswith(domain) or erhn.endswith(domain)):
        return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
as second parameter to dict.get() method, to distinguish absent +# dict key from one with a None value. +class Absent: pass + +class CookieJar: + """Collection of HTTP cookies. + + You may not need to know about this class: try + urllib2.build_opener(HTTPCookieProcessor).open(url). + + """ + + non_word_re = re.compile(r"\W") + quote_re = re.compile(r"([\"\\])") + strict_domain_re = re.compile(r"\.?[^.]*") + domain_re = re.compile(r"[^.]*") + dots_re = re.compile(r"^\.+") + + magic_re = r"^\#LWP-Cookies-(\d+\.\d+)" + + def __init__(self, policy=None): + if policy is None: + policy = DefaultCookiePolicy() + self._policy = policy + + self._cookies_lock = _threading.RLock() + self._cookies = {} + + def set_policy(self, policy): + self._policy = policy + + def _cookies_for_domain(self, domain, request): + cookies = [] + if not self._policy.domain_return_ok(domain, request): + return [] + debug("Checking %s for cookies to return", domain) + cookies_by_path = self._cookies[domain] + for path in cookies_by_path.keys(): + if not self._policy.path_return_ok(path, request): + continue + cookies_by_name = cookies_by_path[path] + for cookie in cookies_by_name.values(): + if not self._policy.return_ok(cookie, request): + debug(" not returning cookie") + continue + debug(" it's a match") + cookies.append(cookie) + return cookies + + def _cookies_for_request(self, request): + """Return a list of cookies to be returned to server.""" + cookies = [] + for domain in self._cookies.keys(): + cookies.extend(self._cookies_for_domain(domain, request)) + return cookies + + def _cookie_attrs(self, cookies): + """Return a list of cookie-attributes to be returned to server. + + like ['foo="bar"; $Path="/"', ...] + + The $Version attribute is also added when appropriate (currently only + once per request). + + """ + # add cookies in order of most specific (ie. 
def add_cookie_header(self, request):
    """Add correct Cookie: header to request (urllib2.Request object).

    The Cookie header is only added if there are cookies to return and the
    request does not already carry a Cookie header.

    The Cookie2 header is also added unless policy.hide_cookie2 is true.
    It is added only when policy.rfc2965 is true, the request has no
    Cookie2 header yet, and at least one of the returned cookies has a
    version other than 1.

    Expired cookies are cleared from the jar afterwards.

    """
    debug("add_cookie_header")
    self._cookies_lock.acquire()
    try:
        # the policy reads _now in its expiry check, so keep both in step
        self._policy._now = self._now = int(time.time())

        cookies = self._cookies_for_request(request)

        attrs = self._cookie_attrs(cookies)
        if attrs:
            if not request.has_header("Cookie"):
                request.add_unredirected_header(
                    "Cookie", "; ".join(attrs))

        # if necessary, advertise that we know RFC 2965
        if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
            not request.has_header("Cookie2")):
            for cookie in cookies:
                if cookie.version != 1:
                    request.add_unredirected_header("Cookie2", '$Version="1"')
                    break
    finally:
        # release even if the policy or the request object raised, so that
        # one failure cannot leave the lock held
        self._cookies_lock.release()

    self.clear_expired_cookies()
V0 + # cookies should have the Expires cookie-attribute, and V1 cookies + # should have Max-Age, but since V1 includes RFC 2109 cookies (and + # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we + # accept either (but prefer Max-Age). + max_age_set = False + + bad_cookie = False + + standard = {} + rest = {} + for k, v in cookie_attrs[1:]: + lc = k.lower() + # don't lose case distinction for unknown fields + if lc in value_attrs or lc in boolean_attrs: + k = lc + if k in boolean_attrs and v is None: + # boolean cookie-attribute is present, but has no value + # (like "discard", rather than "port=80") + v = True + if k in standard: + # only first value is significant + continue + if k == "domain": + if v is None: + debug(" missing value for domain attribute") + bad_cookie = True + break + # RFC 2965 section 3.3.3 + v = v.lower() + if k == "expires": + if max_age_set: + # Prefer max-age to expires (like Mozilla) + continue + if v is None: + debug(" missing or invalid value for expires " + "attribute: treating as session cookie") + continue + if k == "max-age": + max_age_set = True + try: + v = int(v) + except ValueError: + debug(" missing or invalid (non-numeric) value for " + "max-age attribute") + bad_cookie = True + break + # convert RFC 2965 Max-Age to seconds since epoch + # XXX Strictly you're supposed to follow RFC 2616 + # age-calculation rules. Remember that zero Max-Age is a + # is a request to discard (old and new) cookie, though. 
+ k = "expires" + v = self._now + v + if (k in value_attrs) or (k in boolean_attrs): + if (v is None and + k not in ["port", "comment", "commenturl"]): + debug(" missing value for %s attribute" % k) + bad_cookie = True + break + standard[k] = v + else: + rest[k] = v + + if bad_cookie: + continue + + cookie_tuples.append((name, value, standard, rest)) + + return cookie_tuples + + def _cookie_from_cookie_tuple(self, tup, request): + # standard is dict of standard cookie-attributes, rest is dict of the + # rest of them + name, value, standard, rest = tup + + domain = standard.get("domain", Absent) + path = standard.get("path", Absent) + port = standard.get("port", Absent) + expires = standard.get("expires", Absent) + + # set the easy defaults + version = standard.get("version", None) + if version is not None: version = int(version) + secure = standard.get("secure", False) + # (discard is also set if expires is Absent) + discard = standard.get("discard", False) + comment = standard.get("comment", None) + comment_url = standard.get("commenturl", None) + + # set default path + if path is not Absent and path != "": + path_specified = True + path = escape_path(path) + else: + path_specified = False + path = request_path(request) + i = path.rfind("/") + if i != -1: + if version == 0: + # Netscape spec parts company from reality here + path = path[:i] + else: + path = path[:i+1] + if len(path) == 0: path = "/" + + # set default domain + domain_specified = domain is not Absent + # but first we have to remember whether it starts with a dot + domain_initial_dot = False + if domain_specified: + domain_initial_dot = bool(domain.startswith(".")) + if domain is Absent: + req_host, erhn = eff_request_host(request) + domain = erhn + elif not domain.startswith("."): + domain = "."+domain + + # set default port + port_specified = False + if port is not Absent: + if port is None: + # Port attr present, but has no value: default to request port. 
+ # Cookie should then only be sent back on that port. + port = request_port(request) + else: + port_specified = True + port = re.sub(r"\s+", "", port) + else: + # No port attr present. Cookie can be sent back on any port. + port = None + + # set default expires and discard + if expires is Absent: + expires = None + discard = True + elif expires <= self._now: + # Expiry date in past is request to delete cookie. This can't be + # in DefaultCookiePolicy, because can't delete cookies there. + try: + self.clear(domain, path, name) + except KeyError: + pass + debug("Expiring cookie, domain='%s', path='%s', name='%s'", + domain, path, name) + return None + + return Cookie(version, + name, value, + port, port_specified, + domain, domain_specified, domain_initial_dot, + path, path_specified, + secure, + expires, + discard, + comment, + comment_url, + rest) + + def _cookies_from_attrs_set(self, attrs_set, request): + cookie_tuples = self._normalized_cookie_tuples(attrs_set) + + cookies = [] + for tup in cookie_tuples: + cookie = self._cookie_from_cookie_tuple(tup, request) + if cookie: cookies.append(cookie) + return cookies + + def make_cookies(self, response, request): + """Return sequence of Cookie objects extracted from response object.""" + # get cookie-attributes for RFC 2965 and Netscape protocols + headers = response.info() + rfc2965_hdrs = headers.getheaders("Set-Cookie2") + ns_hdrs = headers.getheaders("Set-Cookie") + + rfc2965 = self._policy.rfc2965 + netscape = self._policy.netscape + + if ((not rfc2965_hdrs and not ns_hdrs) or + (not ns_hdrs and not rfc2965) or + (not rfc2965_hdrs and not netscape) or + (not netscape and not rfc2965)): + return [] # no relevant cookie headers: quick exit + + try: + cookies = self._cookies_from_attrs_set( + split_header_words(rfc2965_hdrs), request) + except: + reraise_unmasked_exceptions() + cookies = [] + + if ns_hdrs and netscape: + try: + ns_cookies = self._cookies_from_attrs_set( + parse_ns_headers(ns_hdrs), request) + 
except: + reraise_unmasked_exceptions() + ns_cookies = [] + + # Look for Netscape cookies (from Set-Cookie headers) that match + # corresponding RFC 2965 cookies (from Set-Cookie2 headers). + # For each match, keep the RFC 2965 cookie and ignore the Netscape + # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are + # bundled in with the Netscape cookies for this purpose, which is + # reasonable behaviour. + if rfc2965: + lookup = {} + for cookie in cookies: + lookup[(cookie.domain, cookie.path, cookie.name)] = None + + def no_matching_rfc2965(ns_cookie, lookup=lookup): + key = ns_cookie.domain, ns_cookie.path, ns_cookie.name + return key not in lookup + ns_cookies = filter(no_matching_rfc2965, ns_cookies) + + if ns_cookies: + cookies.extend(ns_cookies) + + return cookies + + def set_cookie_if_ok(self, cookie, request): + """Set a cookie if policy says it's OK to do so.""" + self._cookies_lock.acquire() + self._policy._now = self._now = int(time.time()) + + if self._policy.set_ok(cookie, request): + self.set_cookie(cookie) + + self._cookies_lock.release() + + def set_cookie(self, cookie): + """Set a cookie, without checking whether or not it should be set.""" + c = self._cookies + self._cookies_lock.acquire() + try: + if cookie.domain not in c: c[cookie.domain] = {} + c2 = c[cookie.domain] + if cookie.path not in c2: c2[cookie.path] = {} + c3 = c2[cookie.path] + c3[cookie.name] = cookie + finally: + self._cookies_lock.release() + + def extract_cookies(self, response, request): + """Extract cookies from response, where allowable given the request.""" + debug("extract_cookies: %s", response.info()) + self._cookies_lock.acquire() + self._policy._now = self._now = int(time.time()) + + for cookie in self.make_cookies(response, request): + if self._policy.set_ok(cookie, request): + debug(" setting cookie: %s", cookie) + self.set_cookie(cookie) + self._cookies_lock.release() + + def clear(self, domain=None, path=None, name=None): + """Clear some cookies. 
+ + Invoking this method without arguments will clear all cookies. If + given a single argument, only cookies belonging to that domain will be + removed. If given two arguments, cookies belonging to the specified + path within that domain are removed. If given three arguments, then + the cookie with the specified name, path and domain is removed. + + Raises KeyError if no matching cookie exists. + + """ + if name is not None: + if (domain is None) or (path is None): + raise ValueError( + "domain and path must be given to remove a cookie by name") + del self._cookies[domain][path][name] + elif path is not None: + if domain is None: + raise ValueError( + "domain must be given to remove cookies by path") + del self._cookies[domain][path] + elif domain is not None: + del self._cookies[domain] + else: + self._cookies = {} + + def clear_session_cookies(self): + """Discard all session cookies. + + Note that the .save() method won't save session cookies anyway, unless + you ask otherwise by passing a true ignore_discard argument. + + """ + self._cookies_lock.acquire() + for cookie in self: + if cookie.discard: + self.clear(cookie.domain, cookie.path, cookie.name) + self._cookies_lock.release() + + def clear_expired_cookies(self): + """Discard all expired cookies. + + You probably don't need to call this method: expired cookies are never + sent back to the server (provided you're using DefaultCookiePolicy), + this method is called by CookieJar itself every so often, and the + .save() method won't save expired cookies anyway (unless you ask + otherwise by passing a true ignore_expires argument). 
+ + """ + self._cookies_lock.acquire() + now = time.time() + for cookie in self: + if cookie.is_expired(now): + self.clear(cookie.domain, cookie.path, cookie.name) + self._cookies_lock.release() + + def __iter__(self): + return deepvalues(self._cookies) + + def __len__(self): + """Return number of contained cookies.""" + i = 0 + for cookie in self: i = i + 1 + return i + + def __repr__(self): + r = [] + for cookie in self: r.append(repr(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + def __str__(self): + r = [] + for cookie in self: r.append(str(cookie)) + return "<%s[%s]>" % (self.__class__, ", ".join(r)) + + +class LoadError(Exception): pass + +class FileCookieJar(CookieJar): + """CookieJar that can be loaded from and saved to a file.""" + + def __init__(self, filename=None, delayload=False, policy=None): + """ + Cookies are NOT loaded from the named file until either the .load() or + .revert() method is called. + + """ + CookieJar.__init__(self, policy) + if filename is not None: + try: + filename+"" + except: + raise ValueError("filename must be string-like") + self.filename = filename + self.delayload = bool(delayload) + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + """Save cookies to a file.""" + raise NotImplementedError() + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + f = open(filename) + try: + self._really_load(f, filename, ignore_discard, ignore_expires) + finally: + f.close() + + def revert(self, filename=None, + ignore_discard=False, ignore_expires=False): + """Clear all cookies and reload cookies from a saved file. + + Raises LoadError (or IOError) if reversion is not successful; the + object's state will not be altered if this happens. 
+ + """ + if filename is None: + if self.filename is not None: filename = self.filename + else: raise ValueError(MISSING_FILENAME_TEXT) + + self._cookies_lock.acquire() + + old_state = copy.deepcopy(self._cookies) + self._cookies = {} + try: + self.load(filename, ignore_discard, ignore_expires) + except (LoadError, IOError): + self._cookies = old_state + raise + + self._cookies_lock.release() + +from _LWPCookieJar import LWPCookieJar, lwp_cookie_str +from _MozillaCookieJar import MozillaCookieJar diff --git a/Lib/test/test_cookielib.py b/Lib/test/test_cookielib.py new file mode 100644 index 0000000..72c9fc8 --- /dev/null +++ b/Lib/test/test_cookielib.py @@ -0,0 +1,1620 @@ +# -*- coding: utf-8 -*- +"""Tests for cookielib.py.""" + +import re, os, time +from unittest import TestCase + +from test import test_support + +class DateTimeTests(TestCase): + + def test_time2isoz(self): + from cookielib import time2isoz + + base = 1019227000 + day = 24*3600 + self.assertEquals(time2isoz(base), "2002-04-19 14:36:40Z") + self.assertEquals(time2isoz(base+day), "2002-04-20 14:36:40Z") + self.assertEquals(time2isoz(base+2*day), "2002-04-21 14:36:40Z") + self.assertEquals(time2isoz(base+3*day), "2002-04-22 14:36:40Z") + + az = time2isoz() + bz = time2isoz(500000) + for text in (az, bz): + self.assert_(re.search(r"^\d{4}-\d\d-\d\d \d\d:\d\d:\d\dZ$", text), + "bad time2isoz format: %s %s" % (az, bz)) + + def test_http2time(self): + from cookielib import http2time + + def parse_date(text): + return time.gmtime(http2time(text))[:6] + + self.assertEquals(parse_date("01 Jan 2001"), (2001, 1, 1, 0, 0, 0.0)) + + # this test will break around year 2070 + self.assertEquals(parse_date("03-Feb-20"), (2020, 2, 3, 0, 0, 0.0)) + + # this test will break around year 2048 + self.assertEquals(parse_date("03-Feb-98"), (1998, 2, 3, 0, 0, 0.0)) + + def test_http2time_formats(self): + from cookielib import http2time, time2isoz + + # test http2time for supported dates. 
Test cases with 2 digit year + # will probably break in year 2044. + tests = [ + 'Thu, 03 Feb 1994 00:00:00 GMT', # proposed new HTTP format + 'Thursday, 03-Feb-94 00:00:00 GMT', # old rfc850 HTTP format + 'Thursday, 03-Feb-1994 00:00:00 GMT', # broken rfc850 HTTP format + + '03 Feb 1994 00:00:00 GMT', # HTTP format (no weekday) + '03-Feb-94 00:00:00 GMT', # old rfc850 (no weekday) + '03-Feb-1994 00:00:00 GMT', # broken rfc850 (no weekday) + '03-Feb-1994 00:00 GMT', # broken rfc850 (no weekday, no seconds) + '03-Feb-1994 00:00', # broken rfc850 (no weekday, no seconds, no tz) + + '03-Feb-94', # old rfc850 HTTP format (no weekday, no time) + '03-Feb-1994', # broken rfc850 HTTP format (no weekday, no time) + '03 Feb 1994', # proposed new HTTP format (no weekday, no time) + + # A few tests with extra space at various places + ' 03 Feb 1994 0:00 ', + ' 03-Feb-1994 ', + ] + + test_t = 760233600 # assume broken POSIX counting of seconds + result = time2isoz(test_t) + expected = "1994-02-03 00:00:00Z" + self.assertEquals(result, expected, + "%s => '%s' (%s)" % (test_t, result, expected)) + + for s in tests: + t = http2time(s) + t2 = http2time(s.lower()) + t3 = http2time(s.upper()) + + self.assert_(t == t2 == t3 == test_t, + "'%s' => %s, %s, %s (%s)" % (s, t, t2, t3, test_t)) + + def test_http2time_garbage(self): + from cookielib import http2time + + for test in [ + '', + 'Garbage', + 'Mandag 16. 
September 1996', + '01-00-1980', + '01-13-1980', + '00-01-1980', + '32-01-1980', + '01-01-1980 25:00:00', + '01-01-1980 00:61:00', + '01-01-1980 00:00:62', + ]: + self.assert_(http2time(test) is None, + "http2time(%s) is not None\n" + "http2time(test) %s" % (test, http2time(test)) + ) + + +class HeaderTests(TestCase): + def test_parse_ns_headers(self): + from cookielib import parse_ns_headers + + # quotes should be stripped + expected = [[('expires', 2209069412L), ('version', '0')]] + for hdr in [ + 'expires=01 Jan 2040 22:23:32 GMT', + 'expires="01 Jan 2040 22:23:32 GMT"', + ]: + self.assertEquals(parse_ns_headers([hdr]), expected) + + def test_join_header_words(self): + from cookielib import join_header_words + + joined = join_header_words([[("foo", None), ("bar", "baz")]]) + self.assertEquals(joined, "foo; bar=baz") + + self.assertEquals(join_header_words([[]]), "") + + def test_split_header_words(self): + from cookielib import split_header_words + + tests = [ + ("foo", [[("foo", None)]]), + ("foo=bar", [[("foo", "bar")]]), + (" foo ", [[("foo", None)]]), + (" foo= ", [[("foo", "")]]), + (" foo=", [[("foo", "")]]), + (" foo= ; ", [[("foo", "")]]), + (" foo= ; bar= baz ", [[("foo", ""), ("bar", "baz")]]), + ("foo=bar bar=baz", [[("foo", "bar"), ("bar", "baz")]]), + # doesn't really matter if this next fails, but it works ATM + ("foo= bar=baz", [[("foo", "bar=baz")]]), + ("foo=bar;bar=baz", [[("foo", "bar"), ("bar", "baz")]]), + ('foo bar baz', [[("foo", None), ("bar", None), ("baz", None)]]), + ("a, b, c", [[("a", None)], [("b", None)], [("c", None)]]), + (r'foo; bar=baz, spam=, foo="\,\;\"", bar= ', + [[("foo", None), ("bar", "baz")], + [("spam", "")], [("foo", ',;"')], [("bar", "")]]), + ] + + for arg, expect in tests: + try: + result = split_header_words([arg]) + except: + import traceback, StringIO + f = StringIO.StringIO() + traceback.print_exc(None, f) + result = "(error -- traceback follows)\n\n%s" % f.getvalue() + self.assertEquals(result, expect, """ 
+When parsing: '%s' +Expected: '%s' +Got: '%s' +""" % (arg, expect, result)) + + def test_roundtrip(self): + from cookielib import split_header_words, join_header_words + + tests = [ + ("foo", "foo"), + ("foo=bar", "foo=bar"), + (" foo ", "foo"), + ("foo=", 'foo=""'), + ("foo=bar bar=baz", "foo=bar; bar=baz"), + ("foo=bar;bar=baz", "foo=bar; bar=baz"), + ('foo bar baz', "foo; bar; baz"), + (r'foo="\"" bar="\\"', r'foo="\""; bar="\\"'), + ('foo,,,bar', 'foo, bar'), + ('foo=bar,bar=baz', 'foo=bar, bar=baz'), + + ('text/html; charset=iso-8859-1', + 'text/html; charset="iso-8859-1"'), + + ('foo="bar"; port="80,81"; discard, bar=baz', + 'foo=bar; port="80,81"; discard, bar=baz'), + + (r'Basic realm="\"foo\\\\bar\""', + r'Basic; realm="\"foo\\\\bar\""') + ] + + for arg, expect in tests: + input = split_header_words([arg]) + res = join_header_words(input) + self.assertEquals(res, expect, """ +When parsing: '%s' +Expected: '%s' +Got: '%s' +Input was: '%s' +""" % (arg, expect, res, input)) + + +class FakeResponse: + def __init__(self, headers=[], url=None): + """ + headers: list of RFC822-style 'Key: value' strings + """ + import mimetools, StringIO + f = StringIO.StringIO("\n".join(headers)) + self._headers = mimetools.Message(f) + self._url = url + def info(self): return self._headers + +def interact_2965(cookiejar, url, *set_cookie_hdrs): + return _interact(cookiejar, url, set_cookie_hdrs, "Set-Cookie2") + +def interact_netscape(cookiejar, url, *set_cookie_hdrs): + return _interact(cookiejar, url, set_cookie_hdrs, "Set-Cookie") + +def _interact(cookiejar, url, set_cookie_hdrs, hdr_name): + """Perform a single request / response cycle, returning Cookie: header.""" + from urllib2 import Request + req = Request(url) + cookiejar.add_cookie_header(req) + cookie_hdr = req.get_header("Cookie", "") + headers = [] + for hdr in set_cookie_hdrs: + headers.append("%s: %s" % (hdr_name, hdr)) + res = FakeResponse(headers, url) + cookiejar.extract_cookies(res, req) + return cookie_hdr 
+ + +class CookieTests(TestCase): + # XXX + # Get rid of string comparisons where not actually testing str / repr. + # .clear() etc. + # IP addresses like 50 (single number, no dot) and domain-matching + # functions (and is_HDN)? See draft RFC 2965 errata. + # Strictness switches + # is_third_party() + # unverifiability / third-party blocking + # Netscape cookies work the same as RFC 2965 with regard to port. + # Set-Cookie with negative max age. + # If turn RFC 2965 handling off, Set-Cookie2 cookies should not clobber + # Set-Cookie cookies. + # Cookie2 should be sent if *any* cookies are not V1 (ie. V0 OR V2 etc.). + # Cookies (V1 and V0) with no expiry date should be set to be discarded. + # RFC 2965 Quoting: + # Should accept unquoted cookie-attribute values? check errata draft. + # Which are required on the way in and out? + # Should always return quoted cookie-attribute values? + # Proper testing of when RFC 2965 clobbers Netscape (waiting for errata). + # Path-match on return (same for V0 and V1). + # RFC 2965 acceptance and returning rules + # Set-Cookie2 without version attribute is rejected. + + # Netscape peculiarities list from Ronald Tschalar. + # The first two still need tests, the rest are covered. +## - Quoting: only quotes around the expires value are recognized as such +## (and yes, some folks quote the expires value); quotes around any other +## value are treated as part of the value. +## - White space: white space around names and values is ignored +## - Default path: if no path parameter is given, the path defaults to the +## path in the request-uri up to, but not including, the last '/'. Note +## that this is entirely different from what the spec says. +## - Commas and other delimiters: Netscape just parses until the next ';'. +## This means it will allow commas etc inside values (and yes, both +## commas and equals commonly appear in the cookie value).
This also +## means that if you fold multiple Set-Cookie header fields into one, +## comma-separated list, it'll be a headache to parse (at least my head +## starts hurting every time I think of that code). +## - Expires: You'll get all sorts of date formats in the expires, +## including empty expires attributes ("expires="). Be as flexible as you +## can, and certainly don't expect the weekday to be there; if you can't +## parse it, just ignore it and pretend it's a session cookie. +## - Domain-matching: Netscape uses the 2-dot rule for _all_ domains, not +## just the 7 special TLDs listed in their spec. And folks rely on +## that... + + def test_domain_return_ok(self): + # test optimization: .domain_return_ok() should filter out most + # domains in the CookieJar before we try to access them (because that + # may require disk access -- in particular, with MSIECookieJar) + # This is only a rough check for performance reasons, so it's not too + # critical as long as it's sufficiently liberal.
+ import cookielib, urllib2 + pol = cookielib.DefaultCookiePolicy() + for url, domain, ok in [ + ("http://foo.bar.com/", "blah.com", False), + ("http://foo.bar.com/", "rhubarb.blah.com", False), + ("http://foo.bar.com/", "rhubarb.foo.bar.com", False), + ("http://foo.bar.com/", ".foo.bar.com", True), + ("http://foo.bar.com/", "foo.bar.com", True), + ("http://foo.bar.com/", ".bar.com", True), + ("http://foo.bar.com/", "com", True), + ("http://foo.com/", "rhubarb.foo.com", False), + ("http://foo.com/", ".foo.com", True), + ("http://foo.com/", "foo.com", True), + ("http://foo.com/", "com", True), + ("http://foo/", "rhubarb.foo", False), + ("http://foo/", ".foo", True), + ("http://foo/", "foo", True), + ("http://foo/", "foo.local", True), + ("http://foo/", ".local", True), + ]: + request = urllib2.Request(url) + r = pol.domain_return_ok(domain, request) + if ok: self.assert_(r) + else: self.assert_(not r) + + def test_missing_value(self): + from cookielib import MozillaCookieJar, lwp_cookie_str + + # missing = sign in Cookie: header is regarded by Mozilla as a missing + # name, and by cookielib as a missing value + filename = test_support.TESTFN + c = MozillaCookieJar(filename) + interact_netscape(c, "http://www.acme.com/", 'eggs') + interact_netscape(c, "http://www.acme.com/", '"spam"; path=/foo/') + cookie = c._cookies["www.acme.com"]["/"]["eggs"] + self.assert_(cookie.value is None) + self.assertEquals(cookie.name, "eggs") + cookie = c._cookies["www.acme.com"]['/foo/']['"spam"'] + self.assert_(cookie.value is None) + self.assertEquals(cookie.name, '"spam"') + self.assertEquals(lwp_cookie_str(cookie), ( + r'"spam"; path="/foo/"; domain="www.acme.com"; ' + 'path_spec; discard; version=0')) + old_str = repr(c) + c.save(ignore_expires=True, ignore_discard=True) + try: + c = MozillaCookieJar(filename) + c.revert(ignore_expires=True, ignore_discard=True) + finally: + os.unlink(c.filename) + # cookies unchanged apart from lost info re. 
whether path was specified + self.assertEquals( + repr(c), + re.sub("path_specified=%s" % True, "path_specified=%s" % False, + old_str) + ) + self.assertEquals(interact_netscape(c, "http://www.acme.com/foo/"), + '"spam"; eggs') + + def test_ns_parser(self): + from cookielib import CookieJar, DEFAULT_HTTP_PORT + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/", + 'spam=eggs; DoMain=.acme.com; port; blArgh="feep"') + interact_netscape(c, "http://www.acme.com/", 'ni=ni; port=80,8080') + interact_netscape(c, "http://www.acme.com:80/", 'nini=ni') + interact_netscape(c, "http://www.acme.com:80/", 'foo=bar; expires=') + interact_netscape(c, "http://www.acme.com:80/", 'spam=eggs; ' + 'expires="Foo Bar 25 33:22:11 3022"') + + cookie = c._cookies[".acme.com"]["/"]["spam"] + self.assertEquals(cookie.domain, ".acme.com") + self.assert_(cookie.domain_specified) + self.assertEquals(cookie.port, DEFAULT_HTTP_PORT) + self.assert_(not cookie.port_specified) + # case is preserved + self.assert_(cookie.has_nonstandard_attr("blArgh") and + not cookie.has_nonstandard_attr("blargh")) + + cookie = c._cookies["www.acme.com"]["/"]["ni"] + self.assertEquals(cookie.domain, "www.acme.com") + self.assert_(not cookie.domain_specified) + self.assertEquals(cookie.port, "80,8080") + self.assert_(cookie.port_specified) + + cookie = c._cookies["www.acme.com"]["/"]["nini"] + self.assert_(cookie.port is None) + self.assert_(not cookie.port_specified) + + # invalid expires should not cause cookie to be dropped + foo = c._cookies["www.acme.com"]["/"]["foo"] + spam = c._cookies["www.acme.com"]["/"]["foo"] + self.assert_(foo.expires is None) + self.assert_(spam.expires is None) + + def test_expires(self): + from cookielib import time2netscape, CookieJar + + # if expires is in future, keep cookie... 
+ c = CookieJar() + future = time2netscape(time.time()+3600) + interact_netscape(c, "http://www.acme.com/", 'spam="bar"; expires=%s' % + future) + self.assertEquals(len(c), 1) + now = time2netscape(time.time()-1) + # ... and if in past or present, discard it + interact_netscape(c, "http://www.acme.com/", 'foo="eggs"; expires=%s' % + now) + h = interact_netscape(c, "http://www.acme.com/") + self.assertEquals(len(c), 1) + self.assert_('spam="bar"' in h and "foo" not in h) + + # max-age takes precedence over expires, and zero max-age is request to + # delete both new cookie and any old matching cookie + interact_netscape(c, "http://www.acme.com/", 'eggs="bar"; expires=%s' % + future) + interact_netscape(c, "http://www.acme.com/", 'bar="bar"; expires=%s' % + future) + self.assertEquals(len(c), 3) + interact_netscape(c, "http://www.acme.com/", 'eggs="bar"; ' + 'expires=%s; max-age=0' % future) + interact_netscape(c, "http://www.acme.com/", 'bar="bar"; ' + 'max-age=0; expires=%s' % future) + h = interact_netscape(c, "http://www.acme.com/") + self.assertEquals(len(c), 1) + + # test expiry at end of session for cookies with no expires attribute + interact_netscape(c, "http://www.rhubarb.net/", 'whum="fizz"') + self.assertEquals(len(c), 2) + c.clear_session_cookies() + self.assertEquals(len(c), 1) + self.assert_('spam="bar"' in h) + + # XXX RFC 2965 expiry rules (some apply to V0 too) + + def test_default_path(self): + from cookielib import CookieJar, DefaultCookiePolicy + + # RFC 2965 + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/", 'spam="bar"; Version="1"') + self.assert_("/" in c._cookies["www.acme.com"]) + + c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/blah", 'eggs="bar"; Version="1"') + self.assert_("/" in c._cookies["www.acme.com"]) + + c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/blah/rhubarb", + 'eggs="bar"; Version="1"') + self.assert_("/blah/" in c._cookies["www.acme.com"]) + + 
c = CookieJar(pol) + interact_2965(c, "http://www.acme.com/blah/rhubarb/", + 'eggs="bar"; Version="1"') + self.assert_("/blah/rhubarb/" in c._cookies["www.acme.com"]) + + # Netscape + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/", 'spam="bar"') + self.assert_("/" in c._cookies["www.acme.com"]) + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/blah", 'eggs="bar"') + self.assert_("/" in c._cookies["www.acme.com"]) + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/blah/rhubarb", 'eggs="bar"') + self.assert_("/blah" in c._cookies["www.acme.com"]) + + c = CookieJar() + interact_netscape(c, "http://www.acme.com/blah/rhubarb/", 'eggs="bar"') + self.assert_("/blah/rhubarb" in c._cookies["www.acme.com"]) + + def test_escape_path(self): + from cookielib import escape_path + cases = [ + # quoted safe + ("/foo%2f/bar", "/foo%2F/bar"), + ("/foo%2F/bar", "/foo%2F/bar"), + # quoted % + ("/foo%%/bar", "/foo%%/bar"), + # quoted unsafe + ("/fo%19o/bar", "/fo%19o/bar"), + ("/fo%7do/bar", "/fo%7Do/bar"), + # unquoted safe + ("/foo/bar&", "/foo/bar&"), + ("/foo//bar", "/foo//bar"), + ("\176/foo/bar", "\176/foo/bar"), + # unquoted unsafe + ("/foo\031/bar", "/foo%19/bar"), + ("/\175foo/bar", "/%7Dfoo/bar"), + # unicode + (u"/foo/bar\uabcd", "/foo/bar%EA%AF%8D"), # UTF-8 encoded + ] + for arg, result in cases: + self.assertEquals(escape_path(arg), result) + + def test_request_path(self): + from urllib2 import Request + from cookielib import request_path + # with parameters + req = Request("http://www.example.com/rheum/rhaponicum;" + "foo=bar;sing=song?apples=pears&spam=eggs#ni") + self.assertEquals(request_path(req), "/rheum/rhaponicum;" + "foo=bar;sing=song?apples=pears&spam=eggs#ni") + # without parameters + req = Request("http://www.example.com/rheum/rhaponicum?" + "apples=pears&spam=eggs#ni") + self.assertEquals(request_path(req), "/rheum/rhaponicum?" 
+ "apples=pears&spam=eggs#ni") + # missing final slash + req = Request("http://www.example.com") + self.assertEquals(request_path(req), "/") + + def test_request_port(self): + from urllib2 import Request + from cookielib import request_port, DEFAULT_HTTP_PORT + req = Request("http://www.acme.com:1234/", + headers={"Host": "www.acme.com:4321"}) + self.assertEquals(request_port(req), "1234") + req = Request("http://www.acme.com/", + headers={"Host": "www.acme.com:4321"}) + self.assertEquals(request_port(req), DEFAULT_HTTP_PORT) + + def test_request_host(self): + from urllib2 import Request + from cookielib import request_host + # this request is illegal (RFC2616, 14.2.3) + req = Request("http://1.1.1.1/", + headers={"Host": "www.acme.com:80"}) + # libwww-perl wants this response, but that seems wrong (RFC 2616, + # section 5.2, point 1., and RFC 2965 section 1, paragraph 3) + #self.assertEquals(request_host(req), "www.acme.com") + self.assertEquals(request_host(req), "1.1.1.1") + req = Request("http://www.acme.com/", + headers={"Host": "irrelevant.com"}) + self.assertEquals(request_host(req), "www.acme.com") + # not actually sure this one is valid Request object, so maybe should + # remove test for no host in url in request_host function? 
+ req = Request("/resource.html", + headers={"Host": "www.acme.com"}) + self.assertEquals(request_host(req), "www.acme.com") + # port shouldn't be in request-host + req = Request("http://www.acme.com:2345/resource.html", + headers={"Host": "www.acme.com:5432"}) + self.assertEquals(request_host(req), "www.acme.com") + + def test_is_HDN(self): + from cookielib import is_HDN + self.assert_(is_HDN("foo.bar.com")) + self.assert_(is_HDN("1foo2.3bar4.5com")) + self.assert_(not is_HDN("192.168.1.1")) + self.assert_(not is_HDN("")) + self.assert_(not is_HDN(".")) + self.assert_(not is_HDN(".foo.bar.com")) + self.assert_(not is_HDN("..foo")) + self.assert_(not is_HDN("foo.")) + + def test_reach(self): + from cookielib import reach + self.assertEquals(reach("www.acme.com"), ".acme.com") + self.assertEquals(reach("acme.com"), "acme.com") + self.assertEquals(reach("acme.local"), ".local") + self.assertEquals(reach(".local"), ".local") + self.assertEquals(reach(".com"), ".com") + self.assertEquals(reach("."), ".") + self.assertEquals(reach(""), "") + self.assertEquals(reach("192.168.0.1"), "192.168.0.1") + + def test_domain_match(self): + from cookielib import domain_match, user_domain_match + self.assert_(domain_match("192.168.1.1", "192.168.1.1")) + self.assert_(not domain_match("192.168.1.1", ".168.1.1")) + self.assert_(domain_match("x.y.com", "x.Y.com")) + self.assert_(domain_match("x.y.com", ".Y.com")) + self.assert_(not domain_match("x.y.com", "Y.com")) + self.assert_(domain_match("a.b.c.com", ".c.com")) + self.assert_(not domain_match(".c.com", "a.b.c.com")) + self.assert_(domain_match("example.local", ".local")) + self.assert_(not domain_match("blah.blah", "")) + self.assert_(not domain_match("", ".rhubarb.rhubarb")) + self.assert_(domain_match("", "")) + + self.assert_(user_domain_match("acme.com", "acme.com")) + self.assert_(not user_domain_match("acme.com", ".acme.com")) + self.assert_(user_domain_match("rhubarb.acme.com", ".acme.com")) + 
self.assert_(user_domain_match("www.rhubarb.acme.com", ".acme.com")) + self.assert_(user_domain_match("x.y.com", "x.Y.com")) + self.assert_(user_domain_match("x.y.com", ".Y.com")) + self.assert_(not user_domain_match("x.y.com", "Y.com")) + self.assert_(user_domain_match("y.com", "Y.com")) + self.assert_(not user_domain_match(".y.com", "Y.com")) + self.assert_(user_domain_match(".y.com", ".Y.com")) + self.assert_(user_domain_match("x.y.com", ".com")) + self.assert_(not user_domain_match("x.y.com", "com")) + self.assert_(not user_domain_match("x.y.com", "m")) + self.assert_(not user_domain_match("x.y.com", ".m")) + self.assert_(not user_domain_match("x.y.com", "")) + self.assert_(not user_domain_match("x.y.com", ".")) + self.assert_(user_domain_match("192.168.1.1", "192.168.1.1")) + # not both HDNs, so must string-compare equal to match + self.assert_(not user_domain_match("192.168.1.1", ".168.1.1")) + self.assert_(not user_domain_match("192.168.1.1", ".")) + # empty string is a special case + self.assert_(not user_domain_match("192.168.1.1", "")) + + def test_wrong_domain(self): + # Cookies whose effective request-host name does not domain-match the + # domain are rejected. + + # XXX far from complete + from cookielib import CookieJar + c = CookieJar() + interact_2965(c, "http://www.nasty.com/", + 'foo=bar; domain=friendly.org; Version="1"') + self.assertEquals(len(c), 0) + + def test_two_component_domain_ns(self): + # Netscape: .www.bar.com, www.bar.com, .bar.com, bar.com, no domain + # should all get accepted, as should .acme.com, acme.com and no domain + # for 2-component domains like acme.com. 
+ from cookielib import CookieJar, DefaultCookiePolicy + + c = CookieJar() + + # two-component V0 domain is OK + interact_netscape(c, "http://foo.net/", 'ns=bar') + self.assertEquals(len(c), 1) + self.assertEquals(c._cookies["foo.net"]["/"]["ns"].value, "bar") + self.assertEquals(interact_netscape(c, "http://foo.net/"), "ns=bar") + # *will* be returned to any other domain (unlike RFC 2965)... + self.assertEquals(interact_netscape(c, "http://www.foo.net/"), + "ns=bar") + # ...unless requested otherwise + pol = DefaultCookiePolicy( + strict_ns_domain=DefaultCookiePolicy.DomainStrictNonDomain) + c.set_policy(pol) + self.assertEquals(interact_netscape(c, "http://www.foo.net/"), "") + + # unlike RFC 2965, even explicit two-component domain is OK, + # because .foo.net matches foo.net + interact_netscape(c, "http://foo.net/foo/", + 'spam1=eggs; domain=foo.net') + # even if starts with a dot -- in NS rules, .foo.net matches foo.net! + interact_netscape(c, "http://foo.net/foo/bar/", + 'spam2=eggs; domain=.foo.net') + self.assertEquals(len(c), 3) + self.assertEquals(c._cookies[".foo.net"]["/foo"]["spam1"].value, + "eggs") + self.assertEquals(c._cookies[".foo.net"]["/foo/bar"]["spam2"].value, + "eggs") + self.assertEquals(interact_netscape(c, "http://foo.net/foo/bar/"), + "spam2=eggs; spam1=eggs; ns=bar") + + # top-level domain is too general + interact_netscape(c, "http://foo.net/", 'nini="ni"; domain=.net') + self.assertEquals(len(c), 3) + +## # Netscape protocol doesn't allow non-special top level domains (such +## # as co.uk) in the domain attribute unless there are at least three +## # dots in it. + # Oh yes it does! Real implementations don't check this, and real + # cookies (of course) rely on that behaviour. 
+ interact_netscape(c, "http://foo.co.uk", 'nasty=trick; domain=.co.uk') +## self.assertEquals(len(c), 2) + self.assertEquals(len(c), 4) + + def test_two_component_domain_rfc2965(self): + from cookielib import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + c = CookieJar(pol) + + # two-component V1 domain is OK + interact_2965(c, "http://foo.net/", 'foo=bar; Version="1"') + self.assertEquals(len(c), 1) + self.assertEquals(c._cookies["foo.net"]["/"]["foo"].value, "bar") + self.assertEquals(interact_2965(c, "http://foo.net/"), + "$Version=1; foo=bar") + # won't be returned to any other domain (because domain was implied) + self.assertEquals(interact_2965(c, "http://www.foo.net/"), "") + + # unless domain is given explicitly, because then it must be + # rewritten to start with a dot: foo.net --> .foo.net, which does + # not domain-match foo.net + interact_2965(c, "http://foo.net/foo", + 'spam=eggs; domain=foo.net; path=/foo; Version="1"') + self.assertEquals(len(c), 1) + self.assertEquals(interact_2965(c, "http://foo.net/foo"), + "$Version=1; foo=bar") + + # explicit foo.net from three-component domain www.foo.net *does* get + # set, because .foo.net domain-matches .foo.net + interact_2965(c, "http://www.foo.net/foo/", + 'spam=eggs; domain=foo.net; Version="1"') + self.assertEquals(c._cookies[".foo.net"]["/foo/"]["spam"].value, + "eggs") + self.assertEquals(len(c), 2) + self.assertEquals(interact_2965(c, "http://foo.net/foo/"), + "$Version=1; foo=bar") + self.assertEquals(interact_2965(c, "http://www.foo.net/foo/"), + '$Version=1; spam=eggs; $Domain="foo.net"') + + # top-level domain is too general + interact_2965(c, "http://foo.net/", + 'ni="ni"; domain=".net"; Version="1"') + self.assertEquals(len(c), 2) + + # RFC 2965 doesn't require blocking this + interact_2965(c, "http://foo.co.uk/", + 'nasty=trick; domain=.co.uk; Version="1"') + self.assertEquals(len(c), 3) + + def test_domain_allow(self): + from cookielib import CookieJar, 
DefaultCookiePolicy + from urllib2 import Request + + c = CookieJar(policy=DefaultCookiePolicy( + blocked_domains=["acme.com"], + allowed_domains=["www.acme.com"])) + + req = Request("http://acme.com/") + headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] + res = FakeResponse(headers, "http://acme.com/") + c.extract_cookies(res, req) + self.assertEquals(len(c), 0) + + req = Request("http://www.acme.com/") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + self.assertEquals(len(c), 1) + + req = Request("http://www.coyote.com/") + res = FakeResponse(headers, "http://www.coyote.com/") + c.extract_cookies(res, req) + self.assertEquals(len(c), 1) + + # set a cookie with non-allowed domain... + req = Request("http://www.coyote.com/") + res = FakeResponse(headers, "http://www.coyote.com/") + cookies = c.make_cookies(res, req) + c.set_cookie(cookies[0]) + self.assertEquals(len(c), 2) + # ... and check is doesn't get returned + c.add_cookie_header(req) + self.assert_(not req.has_header("Cookie")) + + def test_domain_block(self): + from cookielib import CookieJar, DefaultCookiePolicy + from urllib2 import Request + + pol = DefaultCookiePolicy( + rfc2965=True, blocked_domains=[".acme.com"]) + c = CookieJar(policy=pol) + headers = ["Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/"] + + req = Request("http://www.acme.com/") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + self.assertEquals(len(c), 0) + + p = pol.set_blocked_domains(["acme.com"]) + c.extract_cookies(res, req) + self.assertEquals(len(c), 1) + + c.clear() + req = Request("http://www.roadrunner.net/") + res = FakeResponse(headers, "http://www.roadrunner.net/") + c.extract_cookies(res, req) + self.assertEquals(len(c), 1) + req = Request("http://www.roadrunner.net/") + c.add_cookie_header(req) + self.assert_((req.has_header("Cookie") and + req.has_header("Cookie2"))) + + c.clear() + pol.set_blocked_domains([".acme.com"]) + c.extract_cookies(res, 
req) + self.assertEquals(len(c), 1) + + # set a cookie with blocked domain... + req = Request("http://www.acme.com/") + res = FakeResponse(headers, "http://www.acme.com/") + cookies = c.make_cookies(res, req) + c.set_cookie(cookies[0]) + self.assertEquals(len(c), 2) + # ... and check is doesn't get returned + c.add_cookie_header(req) + self.assert_(not req.has_header("Cookie")) + + def test_secure(self): + from cookielib import CookieJar, DefaultCookiePolicy + + for ns in True, False: + for whitespace in " ", "": + c = CookieJar() + if ns: + pol = DefaultCookiePolicy(rfc2965=False) + int = interact_netscape + vs = "" + else: + pol = DefaultCookiePolicy(rfc2965=True) + int = interact_2965 + vs = "; Version=1" + c.set_policy(pol) + url = "http://www.acme.com/" + int(c, url, "foo1=bar%s%s" % (vs, whitespace)) + int(c, url, "foo2=bar%s; secure%s" % (vs, whitespace)) + self.assert_( + not c._cookies["www.acme.com"]["/"]["foo1"].secure, + "non-secure cookie registered secure") + self.assert_( + c._cookies["www.acme.com"]["/"]["foo2"].secure, + "secure cookie registered non-secure") + + def test_quote_cookie_value(self): + from cookielib import CookieJar, DefaultCookiePolicy + c = CookieJar(policy=DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, "http://www.acme.com/", r'foo=\b"a"r; Version=1') + h = interact_2965(c, "http://www.acme.com/") + self.assertEquals(h, r'$Version=1; foo=\\b\"a\"r') + + def test_missing_final_slash(self): + # Missing slash from request URL's abs_path should be assumed present. 
+ from cookielib import CookieJar, DefaultCookiePolicy + from urllib2 import Request + url = "http://www.acme.com" + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, url, "foo=bar; Version=1") + req = Request(url) + self.assertEquals(len(c), 1) + c.add_cookie_header(req) + self.assert_(req.has_header("Cookie")) + + def test_domain_mirror(self): + from cookielib import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1") + h = interact_2965(c, url) + self.assert_("Domain" not in h, + "absent domain returned with domain present") + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Domain=.bar.com') + h = interact_2965(c, url) + self.assert_('$Domain=".bar.com"' in h, "domain not returned") + + c = CookieJar(pol) + url = "http://foo.bar.com/" + # note missing initial dot in Domain + interact_2965(c, url, 'spam=eggs; Version=1; Domain=bar.com') + h = interact_2965(c, url) + self.assert_('$Domain="bar.com"' in h, "domain not returned") + + def test_path_mirror(self): + from cookielib import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1") + h = interact_2965(c, url) + self.assert_("Path" not in h, + "absent path returned with path present") + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Path=/') + h = interact_2965(c, url) + self.assert_('$Path="/"' in h, "path not returned") + + def test_port_mirror(self): + from cookielib import CookieJar, DefaultCookiePolicy + + pol = DefaultCookiePolicy(rfc2965=True) + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1") + h = interact_2965(c, url) + self.assert_("Port" not in h, + "absent port returned with port present") + 
+ c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, "spam=eggs; Version=1; Port") + h = interact_2965(c, url) + self.assert_(re.search("\$Port([^=]|$)", h), + "port with no value not returned with no value") + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Port="80"') + h = interact_2965(c, url) + self.assert_('$Port="80"' in h, + "port with single value not returned with single value") + + c = CookieJar(pol) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; Port="80,8080"') + h = interact_2965(c, url) + self.assert_('$Port="80,8080"' in h, + "port with multiple values not returned with multiple " + "values") + + def test_no_return_comment(self): + from cookielib import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + url = "http://foo.bar.com/" + interact_2965(c, url, 'spam=eggs; Version=1; ' + 'Comment="does anybody read these?"; ' + 'CommentURL="http://foo.bar.net/comment.html"') + h = interact_2965(c, url) + self.assert_( + "Comment" not in h, + "Comment or CommentURL cookie-attributes returned to server") + + def test_Cookie_iterator(self): + from cookielib import CookieJar, Cookie, DefaultCookiePolicy + + cs = CookieJar(DefaultCookiePolicy(rfc2965=True)) + # add some random cookies + interact_2965(cs, "http://blah.spam.org/", 'foo=eggs; Version=1; ' + 'Comment="does anybody read these?"; ' + 'CommentURL="http://foo.bar.net/comment.html"') + interact_netscape(cs, "http://www.acme.com/blah/", "spam=bar; secure") + interact_2965(cs, "http://www.acme.com/blah/", + "foo=bar; secure; Version=1") + interact_2965(cs, "http://www.acme.com/blah/", + "foo=bar; path=/; Version=1") + interact_2965(cs, "http://www.sol.no", + r'bang=wallop; version=1; domain=".sol.no"; ' + r'port="90,100, 80,8080"; ' + r'max-age=100; Comment = "Just kidding! 
(\"|\\\\) "') + + versions = [1, 1, 1, 0, 1] + names = ["bang", "foo", "foo", "spam", "foo"] + domains = [".sol.no", "blah.spam.org", "www.acme.com", + "www.acme.com", "www.acme.com"] + paths = ["/", "/", "/", "/blah", "/blah/"] + + for i in range(4): + i = 0 + for c in cs: + self.assert_(isinstance(c, Cookie)) + self.assertEquals(c.version, versions[i]) + self.assertEquals(c.name, names[i]) + self.assertEquals(c.domain, domains[i]) + self.assertEquals(c.path, paths[i]) + i = i + 1 + + def test_parse_ns_headers(self): + from cookielib import parse_ns_headers + + # missing domain value (invalid cookie) + self.assertEquals( + parse_ns_headers(["foo=bar; path=/; domain"]), + [[("foo", "bar"), + ("path", "/"), ("domain", None), ("version", "0")]] + ) + # invalid expires value + self.assertEquals( + parse_ns_headers(["foo=bar; expires=Foo Bar 12 33:22:11 2000"]), + [[("foo", "bar"), ("expires", None), ("version", "0")]] + ) + # missing cookie value (valid cookie) + self.assertEquals( + parse_ns_headers(["foo"]), + [[("foo", None), ("version", "0")]] + ) + # shouldn't add version if header is empty + self.assertEquals(parse_ns_headers([""]), []) + + def test_bad_cookie_header(self): + + def cookiejar_from_cookie_headers(headers): + from cookielib import CookieJar + from urllib2 import Request + c = CookieJar() + req = Request("http://www.example.com/") + r = FakeResponse(headers, "http://www.example.com/") + c.extract_cookies(r, req) + return c + + # none of these bad headers should cause an exception to be raised + for headers in [ + ["Set-Cookie: "], # actually, nothing wrong with this + ["Set-Cookie2: "], # ditto + # missing domain value + ["Set-Cookie2: a=foo; path=/; Version=1; domain"], + # bad max-age + ["Set-Cookie: b=foo; max-age=oops"], + ]: + c = cookiejar_from_cookie_headers(headers) + # these bad cookies shouldn't be set + self.assertEquals(len(c), 0) + + # cookie with invalid expires is treated as session cookie + headers = ["Set-Cookie: c=foo; expires=Foo 
Bar 12 33:22:11 2000"] + c = cookiejar_from_cookie_headers(headers) + cookie = c._cookies["www.example.com"]["/"]["c"] + self.assert_(cookie.expires is None) + + +class LWPCookieTests(TestCase): + # Tests taken from libwww-perl, with a few modifications and additions. + + def test_netscape_example_1(self): + from cookielib import CookieJar, DefaultCookiePolicy + from urllib2 import Request + + #------------------------------------------------------------------- + # First we check that it works for the original example at + # http://www.netscape.com/newsref/std/cookie_spec.html + + # Client requests a document, and receives in the response: + # + # Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/; expires=Wednesday, 09-Nov-99 23:12:40 GMT + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE + # + # Client requests a document, and receives in the response: + # + # Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/ + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # Client receives: + # + # Set-Cookie: SHIPPING=FEDEX; path=/fo + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # When client requests a URL in path "/foo" on this server, it sends: + # + # Cookie: CUSTOMER=WILE_E_COYOTE; PART_NUMBER=ROCKET_LAUNCHER_0001; SHIPPING=FEDEX + # + # The last Cookie is buggy, because both specifications say that the + # most specific cookie must be sent first. SHIPPING=FEDEX is the + # most specific and should thus be first. 
+ + year_plus_one = time.localtime()[0] + 1 + + headers = [] + + c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) + + #req = Request("http://1.1.1.1/", + # headers={"Host": "www.acme.com:80"}) + req = Request("http://www.acme.com:80/", + headers={"Host": "www.acme.com:80"}) + + headers.append( + "Set-Cookie: CUSTOMER=WILE_E_COYOTE; path=/ ; " + "expires=Wednesday, 09-Nov-%d 23:12:40 GMT" % year_plus_one) + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/") + c.add_cookie_header(req) + + self.assertEqual(req.get_header("Cookie"), "CUSTOMER=WILE_E_COYOTE") + self.assertEqual(req.get_header("Cookie2"), '$Version="1"') + + headers.append("Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/foo/bar") + c.add_cookie_header(req) + + h = req.get_header("Cookie") + self.assert_("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and + "CUSTOMER=WILE_E_COYOTE" in h) + + headers.append('Set-Cookie: SHIPPING=FEDEX; path=/foo') + res = FakeResponse(headers, "http://www.acme.com") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/") + c.add_cookie_header(req) + + h = req.get_header("Cookie") + self.assert_("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and + "CUSTOMER=WILE_E_COYOTE" in h and + "SHIPPING=FEDEX" not in h) + + req = Request("http://www.acme.com/foo/") + c.add_cookie_header(req) + + h = req.get_header("Cookie") + self.assert_(("PART_NUMBER=ROCKET_LAUNCHER_0001" in h and + "CUSTOMER=WILE_E_COYOTE" in h and + h.startswith("SHIPPING=FEDEX;"))) + + def test_netscape_example_2(self): + from cookielib import CookieJar + from urllib2 import Request + + # Second Example transaction sequence: + # + # Assume all mappings from above have been cleared. 
+ # + # Client receives: + # + # Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/ + # + # When client requests a URL in path "/" on this server, it sends: + # + # Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # Client receives: + # + # Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo + # + # When client requests a URL in path "/ammo" on this server, it sends: + # + # Cookie: PART_NUMBER=RIDING_ROCKET_0023; PART_NUMBER=ROCKET_LAUNCHER_0001 + # + # NOTE: There are two name/value pairs named "PART_NUMBER" due to + # the inheritance of the "/" mapping in addition to the "/ammo" mapping. + + c = CookieJar() + headers = [] + + req = Request("http://www.acme.com/") + headers.append("Set-Cookie: PART_NUMBER=ROCKET_LAUNCHER_0001; path=/") + res = FakeResponse(headers, "http://www.acme.com/") + + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/") + c.add_cookie_header(req) + + self.assertEquals(req.get_header("Cookie"), + "PART_NUMBER=ROCKET_LAUNCHER_0001") + + headers.append( + "Set-Cookie: PART_NUMBER=RIDING_ROCKET_0023; path=/ammo") + res = FakeResponse(headers, "http://www.acme.com/") + c.extract_cookies(res, req) + + req = Request("http://www.acme.com/ammo") + c.add_cookie_header(req) + + self.assert_(re.search(r"PART_NUMBER=RIDING_ROCKET_0023;\s*" + "PART_NUMBER=ROCKET_LAUNCHER_0001", + req.get_header("Cookie"))) + + def test_ietf_example_1(self): + from cookielib import CookieJar, DefaultCookiePolicy + #------------------------------------------------------------------- + # Then we test with the examples from draft-ietf-http-state-man-mec-03.txt + # + # 5. EXAMPLES + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + + # + # 5.1 Example 1 + # + # Most detail of request and response headers has been omitted. Assume + # the user agent has no stored cookies. + # + # 1. User Agent -> Server + # + # POST /acme/login HTTP/1.1 + # [form data] + # + # User identifies self via a form. + # + # 2. 
Server -> User Agent + # + # HTTP/1.1 200 OK + # Set-Cookie2: Customer="WILE_E_COYOTE"; Version="1"; Path="/acme" + # + # Cookie reflects user's identity. + + cookie = interact_2965( + c, 'http://www.acme.com/acme/login', + 'Customer="WILE_E_COYOTE"; Version="1"; Path="/acme"') + self.assert_(not cookie) + + # + # 3. User Agent -> Server + # + # POST /acme/pickitem HTTP/1.1 + # Cookie: $Version="1"; Customer="WILE_E_COYOTE"; $Path="/acme" + # [form data] + # + # User selects an item for ``shopping basket.'' + # + # 4. Server -> User Agent + # + # HTTP/1.1 200 OK + # Set-Cookie2: Part_Number="Rocket_Launcher_0001"; Version="1"; + # Path="/acme" + # + # Shopping basket contains an item. + + cookie = interact_2965(c, 'http://www.acme.com/acme/pickitem', + 'Part_Number="Rocket_Launcher_0001"; ' + 'Version="1"; Path="/acme"'); + self.assert_(re.search( + r'^\$Version="?1"?; Customer="?WILE_E_COYOTE"?; \$Path="/acme"$', + cookie)) + + # + # 5. User Agent -> Server + # + # POST /acme/shipping HTTP/1.1 + # Cookie: $Version="1"; + # Customer="WILE_E_COYOTE"; $Path="/acme"; + # Part_Number="Rocket_Launcher_0001"; $Path="/acme" + # [form data] + # + # User selects shipping method from form. + # + # 6. Server -> User Agent + # + # HTTP/1.1 200 OK + # Set-Cookie2: Shipping="FedEx"; Version="1"; Path="/acme" + # + # New cookie reflects shipping method. + + cookie = interact_2965(c, "http://www.acme.com/acme/shipping", + 'Shipping="FedEx"; Version="1"; Path="/acme"') + + self.assert_(re.search(r'^\$Version="?1"?;', cookie)) + self.assert_(re.search(r'Part_Number="?Rocket_Launcher_0001"?;' + '\s*\$Path="\/acme"', cookie)) + self.assert_(re.search(r'Customer="?WILE_E_COYOTE"?;\s*\$Path="\/acme"', + cookie)) + + # + # 7. 
User Agent -> Server + # + # POST /acme/process HTTP/1.1 + # Cookie: $Version="1"; + # Customer="WILE_E_COYOTE"; $Path="/acme"; + # Part_Number="Rocket_Launcher_0001"; $Path="/acme"; + # Shipping="FedEx"; $Path="/acme" + # [form data] + # + # User chooses to process order. + # + # 8. Server -> User Agent + # + # HTTP/1.1 200 OK + # + # Transaction is complete. + + cookie = interact_2965(c, "http://www.acme.com/acme/process") + self.assert_( + re.search(r'Shipping="?FedEx"?;\s*\$Path="\/acme"', cookie) and + "WILE_E_COYOTE" in cookie) + + # + # The user agent makes a series of requests on the origin server, after + # each of which it receives a new cookie. All the cookies have the same + # Path attribute and (default) domain. Because the request URLs all have + # /acme as a prefix, and that matches the Path attribute, each request + # contains all the cookies received so far. + + def test_ietf_example_2(self): + from cookielib import CookieJar, DefaultCookiePolicy + + # 5.2 Example 2 + # + # This example illustrates the effect of the Path attribute. All detail + # of request and response headers has been omitted. Assume the user agent + # has no stored cookies. + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + + # Imagine the user agent has received, in response to earlier requests, + # the response headers + # + # Set-Cookie2: Part_Number="Rocket_Launcher_0001"; Version="1"; + # Path="/acme" + # + # and + # + # Set-Cookie2: Part_Number="Riding_Rocket_0023"; Version="1"; + # Path="/acme/ammo" + + interact_2965( + c, "http://www.acme.com/acme/ammo/specific", + 'Part_Number="Rocket_Launcher_0001"; Version="1"; Path="/acme"', + 'Part_Number="Riding_Rocket_0023"; Version="1"; Path="/acme/ammo"') + + # A subsequent request by the user agent to the (same) server for URLs of + # the form /acme/ammo/... 
would include the following request header: + # + # Cookie: $Version="1"; + # Part_Number="Riding_Rocket_0023"; $Path="/acme/ammo"; + # Part_Number="Rocket_Launcher_0001"; $Path="/acme" + # + # Note that the NAME=VALUE pair for the cookie with the more specific Path + # attribute, /acme/ammo, comes before the one with the less specific Path + # attribute, /acme. Further note that the same cookie name appears more + # than once. + + cookie = interact_2965(c, "http://www.acme.com/acme/ammo/...") + self.assert_( + re.search(r"Riding_Rocket_0023.*Rocket_Launcher_0001", cookie)) + + # A subsequent request by the user agent to the (same) server for a URL of + # the form /acme/parts/ would include the following request header: + # + # Cookie: $Version="1"; Part_Number="Rocket_Launcher_0001"; $Path="/acme" + # + # Here, the second cookie's Path attribute /acme/ammo is not a prefix of + # the request URL, /acme/parts/, so the cookie does not get forwarded to + # the server. + + cookie = interact_2965(c, "http://www.acme.com/acme/parts/") + self.assert_("Rocket_Launcher_0001" in cookie and + "Riding_Rocket_0023" not in cookie) + + def test_rejection(self): + # Test rejection of Set-Cookie2 responses based on domain, path, port. 
+ from cookielib import DefaultCookiePolicy, LWPCookieJar + + pol = DefaultCookiePolicy(rfc2965=True) + + c = LWPCookieJar(policy=pol) + + max_age = "max-age=3600" + + # illegal domain (no embedded dots) + cookie = interact_2965(c, "http://www.acme.com", + 'foo=bar; domain=".com"; version=1') + self.assert_(not c) + + # legal domain + cookie = interact_2965(c, "http://www.acme.com", + 'ping=pong; domain="acme.com"; version=1') + self.assertEquals(len(c), 1) + + # illegal domain (host prefix "www.a" contains a dot) + cookie = interact_2965(c, "http://www.a.acme.com", + 'whiz=bang; domain="acme.com"; version=1') + self.assertEquals(len(c), 1) + + # legal domain + cookie = interact_2965(c, "http://www.a.acme.com", + 'wow=flutter; domain=".a.acme.com"; version=1') + self.assertEquals(len(c), 2) + + # can't partially match an IP-address + cookie = interact_2965(c, "http://125.125.125.125", + 'zzzz=ping; domain="125.125.125"; version=1') + self.assertEquals(len(c), 2) + + # illegal path (must be prefix of request path) + cookie = interact_2965(c, "http://www.sol.no", + 'blah=rhubarb; domain=".sol.no"; path="/foo"; ' + 'version=1') + self.assertEquals(len(c), 2) + + # legal path + cookie = interact_2965(c, "http://www.sol.no/foo/bar", + 'bing=bong; domain=".sol.no"; path="/foo"; ' + 'version=1') + self.assertEquals(len(c), 3) + + # illegal port (request-port not in list) + cookie = interact_2965(c, "http://www.sol.no", + 'whiz=ffft; domain=".sol.no"; port="90,100"; ' + 'version=1') + self.assertEquals(len(c), 3) + + # legal port + cookie = interact_2965( + c, "http://www.sol.no", + r'bang=wallop; version=1; domain=".sol.no"; ' + r'port="90,100, 80,8080"; ' + r'max-age=100; Comment = "Just kidding! 
(\"|\\\\) "') + self.assertEquals(len(c), 4) + + # port attribute without any value (current port) + cookie = interact_2965(c, "http://www.sol.no", + 'foo9=bar; version=1; domain=".sol.no"; port; ' + 'max-age=100;') + self.assertEquals(len(c), 5) + + # encoded path + # LWP has this test, but unescaping allowed path characters seems + # like a bad idea, so I think this should fail: +## cookie = interact_2965(c, "http://www.sol.no/foo/", +## r'foo8=bar; version=1; path="/%66oo"') + # but this is OK, because '<' is not an allowed HTTP URL path + # character: + cookie = interact_2965(c, "http://www.sol.no/<oo/", + r'foo8=bar; version=1; path="/%3coo"') + self.assertEquals(len(c), 6) + + # save and restore + filename = test_support.TESTFN + + try: + c.save(filename, ignore_discard=True) + old = repr(c) + + c = LWPCookieJar(policy=pol) + c.load(filename, ignore_discard=True) + finally: + try: os.unlink(filename) + except OSError: pass + + self.assertEquals(old, repr(c)) + + def test_url_encoding(self): + # Try some URL encodings of the PATHs. + # (the behaviour here has changed from libwww-perl) + from cookielib import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, "http://www.acme.com/foo%2f%25/%3c%3c%0Anew%E5/%E5", + "foo = bar; version = 1") + + cookie = interact_2965( + c, "http://www.acme.com/foo%2f%25/<<%0anewå/æøå", + 'bar=baz; path="/foo/"; version=1'); + version_re = re.compile(r'^\$version=\"?1\"?', re.I) + self.assert_("foo=bar" in cookie and version_re.search(cookie)) + + cookie = interact_2965( + c, "http://www.acme.com/foo/%25/<<%0anewå/æøå") + self.assert_(not cookie) + + # unicode URL doesn't raise exception + cookie = interact_2965(c, u"http://www.acme.com/\xfc") + + def test_mozilla(self): + # Save / load Mozilla/Netscape cookie file format. 
+ from cookielib import MozillaCookieJar, DefaultCookiePolicy + + year_plus_one = time.localtime()[0] + 1 + + filename = test_support.TESTFN + + c = MozillaCookieJar(filename, + policy=DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, "http://www.acme.com/", + "foo1=bar; max-age=100; Version=1") + interact_2965(c, "http://www.acme.com/", + 'foo2=bar; port="80"; max-age=100; Discard; Version=1') + interact_2965(c, "http://www.acme.com/", "foo3=bar; secure; Version=1") + + expires = "expires=09-Nov-%d 23:12:40 GMT" % (year_plus_one,) + interact_netscape(c, "http://www.foo.com/", + "fooa=bar; %s" % expires) + interact_netscape(c, "http://www.foo.com/", + "foob=bar; Domain=.foo.com; %s" % expires) + interact_netscape(c, "http://www.foo.com/", + "fooc=bar; Domain=www.foo.com; %s" % expires) + + def save_and_restore(cj, ignore_discard): + try: + cj.save(ignore_discard=ignore_discard) + new_c = MozillaCookieJar(filename, + DefaultCookiePolicy(rfc2965=True)) + new_c.load(ignore_discard=ignore_discard) + finally: + try: os.unlink(filename) + except OSError: pass + return new_c + + new_c = save_and_restore(c, True) + self.assertEquals(len(new_c), 6) # none discarded + self.assert_("name='foo1', value='bar'" in repr(new_c)) + + new_c = save_and_restore(c, False) + self.assertEquals(len(new_c), 4) # 2 of them discarded on save + self.assert_("name='foo1', value='bar'" in repr(new_c)) + + def test_netscape_misc(self): + # Some additional Netscape cookies tests. + from cookielib import CookieJar + from urllib2 import Request + + c = CookieJar() + headers = [] + req = Request("http://foo.bar.acme.com/foo") + + # Netscape allows a host part that contains dots + headers.append("Set-Cookie: Customer=WILE_E_COYOTE; domain=.acme.com") + res = FakeResponse(headers, "http://www.acme.com/foo") + c.extract_cookies(res, req) + + # and that the domain is the same as the host without adding a leading + # dot to the domain. 
Should not quote even if strange chars are used + # in the cookie value. + headers.append("Set-Cookie: PART_NUMBER=3,4; domain=foo.bar.acme.com") + res = FakeResponse(headers, "http://www.acme.com/foo") + c.extract_cookies(res, req) + + req = Request("http://foo.bar.acme.com/foo") + c.add_cookie_header(req) + self.assert_( + "PART_NUMBER=3,4" in req.get_header("Cookie") and + "Customer=WILE_E_COYOTE" in req.get_header("Cookie")) + + def test_intranet_domains_2965(self): + # Test handling of local intranet hostnames without a dot. + from cookielib import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965=True)) + interact_2965(c, "http://example/", + "foo1=bar; PORT; Discard; Version=1;") + cookie = interact_2965(c, "http://example/", + 'foo2=bar; domain=".local"; Version=1') + self.assert_("foo1=bar" in cookie) + + interact_2965(c, "http://example/", 'foo3=bar; Version=1') + cookie = interact_2965(c, "http://example/") + self.assert_("foo2=bar" in cookie and len(c) == 3) + + def test_intranet_domains_ns(self): + from cookielib import CookieJar, DefaultCookiePolicy + + c = CookieJar(DefaultCookiePolicy(rfc2965 = False)) + interact_netscape(c, "http://example/", "foo1=bar") + cookie = interact_netscape(c, "http://example/", + 'foo2=bar; domain=.local') + self.assertEquals(len(c), 2) + self.assert_("foo1=bar" in cookie) + + cookie = interact_netscape(c, "http://example/") + self.assert_("foo2=bar" in cookie) + self.assertEquals(len(c), 2) + + def test_empty_path(self): + from cookielib import CookieJar, DefaultCookiePolicy + from urllib2 import Request + + # Test for empty path + # Broken web-server ORION/1.3.38 returns to the client response like + # + # Set-Cookie: JSESSIONID=ABCDERANDOM123; Path= + # + # ie. with Path set to nothing. 
+ # In this case, extract_cookies() must set cookie to / (root) + c = CookieJar(DefaultCookiePolicy(rfc2965 = True)) + headers = [] + + req = Request("http://www.ants.com/") + headers.append("Set-Cookie: JSESSIONID=ABCDERANDOM123; Path=") + res = FakeResponse(headers, "http://www.ants.com/") + c.extract_cookies(res, req) + + req = Request("http://www.ants.com/") + c.add_cookie_header(req) + + self.assertEquals(req.get_header("Cookie"), + "JSESSIONID=ABCDERANDOM123") + self.assertEquals(req.get_header("Cookie2"), '$Version="1"') + + # missing path in the request URI + req = Request("http://www.ants.com:8080") + c.add_cookie_header(req) + + self.assertEquals(req.get_header("Cookie"), + "JSESSIONID=ABCDERANDOM123") + self.assertEquals(req.get_header("Cookie2"), '$Version="1"') + + def test_session_cookies(self): + from cookielib import CookieJar + from urllib2 import Request + + year_plus_one = time.localtime()[0] + 1 + + # Check session cookies are deleted properly by + # CookieJar.clear_session_cookies method + + req = Request('http://www.perlmeister.com/scripts') + headers = [] + headers.append("Set-Cookie: s1=session;Path=/scripts") + headers.append("Set-Cookie: p1=perm; Domain=.perlmeister.com;" + "Path=/;expires=Fri, 02-Feb-%d 23:24:20 GMT" % + year_plus_one) + headers.append("Set-Cookie: p2=perm;Path=/;expires=Fri, " + "02-Feb-%d 23:24:20 GMT" % year_plus_one) + headers.append("Set-Cookie: s2=session;Path=/scripts;" + "Domain=.perlmeister.com") + headers.append('Set-Cookie2: s3=session;Version=1;Discard;Path="/"') + res = FakeResponse(headers, 'http://www.perlmeister.com/scripts') + + c = CookieJar() + c.extract_cookies(res, req) + # How many session/permanent cookies do we have? + counter = {"session_after": 0, + "perm_after": 0, + "session_before": 0, + "perm_before": 0} + for cookie in c: + key = "%s_before" % cookie.value + counter[key] = counter[key] + 1 + c.clear_session_cookies() + # How many now? 
+ for cookie in c: + key = "%s_after" % cookie.value + counter[key] = counter[key] + 1 + + self.assert_(not ( + # a permanent cookie got lost accidently + counter["perm_after"] != counter["perm_before"] or + # a session cookie hasn't been cleared + counter["session_after"] != 0 or + # we didn't have session cookies in the first place + counter["session_before"] == 0)) + + +def test_main(verbose=None): + from test import test_sets + test_support.run_unittest( + DateTimeTests, + HeaderTests, + CookieTests, + LWPCookieTests, + ) + +if __name__ == "__main__": + test_main(verbose=True) diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index f38a4a3..8a7cf65 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -54,6 +54,10 @@ class MockFile: def readline(self, count=None): pass def close(self): pass +class MockHeaders(dict): + def getheaders(self, name): + return self.values() + class MockResponse(StringIO.StringIO): def __init__(self, code, msg, headers, data, url=None): StringIO.StringIO.__init__(self, data) @@ -63,6 +67,12 @@ class MockResponse(StringIO.StringIO): def geturl(self): return self.url +class MockCookieJar: + def add_cookie_header(self, request): + self.ach_req = request + def extract_cookies(self, response, request): + self.ec_req, self.ec_r = request, response + class FakeMethod: def __init__(self, meth_name, action, handle): self.meth_name = meth_name @@ -474,7 +484,7 @@ class HandlerTests(unittest.TestCase): for data in "", None: # POST, GET req = Request("http://example.com/", data) r = MockResponse(200, "OK", {}, "") - newreq = h.do_request(req) + newreq = h.do_request_(req) if data is None: # GET self.assert_("Content-length" not in req.unredirected_hdrs) self.assert_("Content-type" not in req.unredirected_hdrs) @@ -491,7 +501,7 @@ class HandlerTests(unittest.TestCase): req.add_unredirected_header("Content-type", "bar") req.add_unredirected_header("Host", "baz") req.add_unredirected_header("Spam", "foo") - newreq = 
h.do_request(req) + newreq = h.do_request_(req) self.assertEqual(req.unredirected_hdrs["Content-length"], "foo") self.assertEqual(req.unredirected_hdrs["Content-type"], "bar") self.assertEqual(req.unredirected_hdrs["Host"], "baz") @@ -514,6 +524,21 @@ class HandlerTests(unittest.TestCase): self.assertEqual(o.proto, "http") # o.error called self.assertEqual(o.args, (req, r, 201, "Created", {})) + def test_cookies(self): + cj = MockCookieJar() + h = urllib2.HTTPCookieProcessor(cj) + o = h.parent = MockOpener() + + req = Request("http://example.com/") + r = MockResponse(200, "OK", {}, "") + newreq = h.http_request(req) + self.assert_(cj.ach_req is req is newreq) + self.assertEquals(req.get_origin_req_host(), "example.com") + self.assert_(not req.is_unverifiable()) + newr = h.http_response(req, r) + self.assert_(cj.ec_req is req) + self.assert_(cj.ec_r is r is newr) + def test_redirect(self): from_url = "http://example.com/a.html" to_url = "http://example.com/b.html" @@ -528,7 +553,8 @@ class HandlerTests(unittest.TestCase): req.add_header("Nonsense", "viking=withhold") req.add_unredirected_header("Spam", "spam") try: - method(req, MockFile(), code, "Blah", {"location": to_url}) + method(req, MockFile(), code, "Blah", + MockHeaders({"location": to_url})) except urllib2.HTTPError: # 307 in response to POST requires user OK self.assert_(code == 307 and data is not None) @@ -544,38 +570,65 @@ class HandlerTests(unittest.TestCase): # loop detection req = Request(from_url) - req.origin_req_host = "example.com" - def redirect(h, req, code, url=to_url): - method = getattr(h, "http_error_%s" % code) - method(req, MockFile(), code, "Blah", {"location": url}) + def redirect(h, req, url=to_url): + h.http_error_302(req, MockFile(), 302, "Blah", + MockHeaders({"location": url})) # Note that the *original* request shares the same record of # redirections with the sub-requests caused by the redirections. 
- # once - redirect(h, req, 302) - # twice: loop detected - self.assertRaises(urllib2.HTTPError, redirect, h, req, 302) - # and again - self.assertRaises(urllib2.HTTPError, redirect, h, req, 302) - # but this is a different redirect code, so OK... - redirect(h, req, 301) - self.assertRaises(urllib2.HTTPError, redirect, h, req, 301) - # order doesn't matter - redirect(h, req, 303) - redirect(h, req, 307) - self.assertRaises(urllib2.HTTPError, redirect, h, req, 303) + + # detect infinite loop redirect of a URL to itself + req = Request(from_url, origin_req_host="example.com") + count = 0 + try: + while 1: + redirect(h, req, "http://example.com/") + count = count + 1 + except urllib2.HTTPError: + # don't stop until max_repeats, because cookies may introduce state + self.assertEqual(count, urllib2.HTTPRedirectHandler.max_repeats) # detect endless non-repeating chain of redirects - req = Request(from_url) - req.origin_req_host = "example.com" + req = Request(from_url, origin_req_host="example.com") count = 0 try: while 1: - redirect(h, req, 302, "http://example.com/%d" % count) + redirect(h, req, "http://example.com/%d" % count) count = count + 1 except urllib2.HTTPError: self.assertEqual(count, urllib2.HTTPRedirectHandler.max_redirections) + def test_cookie_redirect(self): + class MockHTTPHandler(urllib2.HTTPHandler): + def __init__(self): self._count = 0 + def http_open(self, req): + import mimetools + from StringIO import StringIO + if self._count == 0: + self._count = self._count + 1 + msg = mimetools.Message( + StringIO("Location: http://www.cracker.com/\r\n\r\n")) + return self.parent.error( + "http", req, MockFile(), 302, "Found", msg) + else: + self.req = req + msg = mimetools.Message(StringIO("\r\n\r\n")) + return MockResponse(200, "OK", msg, "", req.get_full_url()) + # cookies shouldn't leak into redirected requests + from cookielib import CookieJar + from urllib2 import build_opener, HTTPHandler, HTTPError, \ + HTTPCookieProcessor + + from test_cookielib 
import interact_netscape + + cj = CookieJar() + interact_netscape(cj, "http://www.example.com/", "spam=eggs") + hh = MockHTTPHandler() + cp = HTTPCookieProcessor(cj) + o = build_opener(hh, cp) + o.open("http://www.example.com/") + self.assert_(not hh.req.has_header("Cookie")) + class MiscTests(unittest.TestCase): diff --git a/Lib/urllib2.py b/Lib/urllib2.py index 773cd7d..448e948 100644 --- a/Lib/urllib2.py +++ b/Lib/urllib2.py @@ -106,6 +106,7 @@ import sys import time import urlparse import bisect +import cookielib try: from cStringIO import StringIO @@ -176,7 +177,8 @@ class GopherError(URLError): class Request: - def __init__(self, url, data=None, headers={}): + def __init__(self, url, data=None, headers={}, + origin_req_host=None, unverifiable=False): # unwrap('<URL:type://host/path>') --> 'type://host/path' self.__original = unwrap(url) self.type = None @@ -188,6 +190,10 @@ class Request: for key, value in headers.items(): self.add_header(key, value) self.unredirected_hdrs = {} + if origin_req_host is None: + origin_req_host = cookielib.request_host(self) + self.origin_req_host = origin_req_host + self.unverifiable = unverifiable def __getattr__(self, attr): # XXX this is a fallback mechanism to guard against these @@ -242,6 +248,12 @@ class Request: self.host, self.type = host, type self.__r_host = self.__original + def get_origin_req_host(self): + return self.origin_req_host + + def is_unverifiable(self): + return self.unverifiable + def add_header(self, key, val): # useful for something like authentication self.headers[key.capitalize()] = val @@ -254,6 +266,15 @@ class Request: return bool(header_name in self.headers or header_name in self.unredirected_hdrs) + def get_header(self, header_name, default=None): + return self.headers.get( + header_name, + self.unredirected_hdrs.get(header_name, default)) + + def header_items(self): + hdrs = self.unredirected_hdrs.copy() + hdrs.update(self.headers) + return hdrs.items() class OpenerDirector: def __init__(self): 
@@ -460,7 +481,11 @@ class HTTPDefaultErrorHandler(BaseHandler): raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) class HTTPRedirectHandler(BaseHandler): - # maximum number of redirections before assuming we're in a loop + # maximum number of redirections to any single URL + # this is needed because of the state that cookies introduce + max_repeats = 4 + # maximum total number of redirections (regardless of URL) before + # assuming we're in a loop max_redirections = 10 def redirect_request(self, req, fp, code, msg, headers, newurl): @@ -481,7 +506,10 @@ class HTTPRedirectHandler(BaseHandler): # from the user (of urllib2, in this case). In practice, # essentially all clients do redirect in this case, so we # do the same. - return Request(newurl, headers=req.headers) + return Request(newurl, + headers=req.headers, + origin_req_host=req.get_origin_req_host(), + unverifiable=True) else: raise HTTPError(req.get_full_url(), code, msg, headers, fp) @@ -490,10 +518,12 @@ class HTTPRedirectHandler(BaseHandler): # have already seen. Do this by adding a handler-specific # attribute to the Request object. def http_error_302(self, req, fp, code, msg, headers): + # Some servers (incorrectly) return multiple Location headers + # (so probably same goes for URI). Use first header. if 'location' in headers: - newurl = headers['location'] + newurl = headers.getheaders('location')[0] elif 'uri' in headers: - newurl = headers['uri'] + newurl = headers.getheaders('uri')[0] else: return newurl = urlparse.urljoin(req.get_full_url(), newurl) @@ -506,20 +536,16 @@ class HTTPRedirectHandler(BaseHandler): return # loop detection - # .redirect_dict has a key (url, code) if url was previously - # visited as a result of a redirection with that code. The - # code is needed in addition to the URL because visiting a URL - # twice isn't necessarily a loop: there is more than one way - # to redirect (301, 302, 303, 307, refresh). 
- key = (newurl, code) + # .redirect_dict has a key url if url was previously visited. if hasattr(req, 'redirect_dict'): visited = new.redirect_dict = req.redirect_dict - if key in visited or len(visited) >= self.max_redirections: + if (visited.get(newurl, 0) >= self.max_repeats or + len(visited) >= self.max_redirections): raise HTTPError(req.get_full_url(), code, self.inf_msg + msg, headers, fp) else: visited = new.redirect_dict = req.redirect_dict = {} - visited[key] = None + visited[newurl] = visited.get(newurl, 0) + 1 # Don't close the fp until we are sure that we won't use it # with HTTPError. @@ -912,7 +938,7 @@ class AbstractHTTPHandler(BaseHandler): def set_http_debuglevel(self, level): self._debuglevel = level - def do_request(self, request): + def do_request_(self, request): host = request.get_host() if not host: raise URLError('no host given') @@ -987,7 +1013,7 @@ class HTTPHandler(AbstractHTTPHandler): def http_open(self, req): return self.do_open(httplib.HTTPConnection, req) - http_request = AbstractHTTPHandler.do_request + http_request = AbstractHTTPHandler.do_request_ if hasattr(httplib, 'HTTPS'): class HTTPSHandler(AbstractHTTPHandler): @@ -995,7 +1021,24 @@ if hasattr(httplib, 'HTTPS'): def https_open(self, req): return self.do_open(httplib.HTTPSConnection, req) - https_request = AbstractHTTPHandler.do_request + https_request = AbstractHTTPHandler.do_request_ + +class HTTPCookieProcessor(BaseHandler): + def __init__(self, cookiejar=None): + if cookiejar is None: + cookiejar = CookieJar() + self.cookiejar = cookiejar + + def http_request(self, request): + self.cookiejar.add_cookie_header(request) + return request + + def http_response(self, request, response): + self.cookiejar.extract_cookies(response, request) + return response + + https_request = http_request + https_response = http_response class UnknownHandler(BaseHandler): def unknown_open(self, req): @@ -311,6 +311,10 @@ Extension modules Library ------- +- Added a new module: cookielib. 
Automatic cookie handling for HTTP + clients. Also, support for cookielib has been added to urllib2, so + urllib2.urlopen() can transparently handle cookies. + - stringprep.py now uses built-in set() instead of sets.Set(). - Bug #876278: Unbounded recursion in modulefinder |