author    | Fred Drake <fdrake@acm.org> | 2000-03-31 17:51:10 (GMT)
committer | Fred Drake <fdrake@acm.org> | 2000-03-31 17:51:10 (GMT)
commit    | 3c9f936eee8bd826b631ee886ec3bb063da35fbd (patch)
tree      | 54b7c58530933c9da34a6e63a372229202fe2826 /Doc/lib/librobotparser.tex
parent    | 32abe6f7d0a82358efd0494992f3c388d7b24036 (diff)
Two new sections. Preliminary.
Diffstat (limited to 'Doc/lib/librobotparser.tex')
-rw-r--r-- | Doc/lib/librobotparser.tex | 68
1 files changed, 68 insertions, 0 deletions
diff --git a/Doc/lib/librobotparser.tex b/Doc/lib/librobotparser.tex
new file mode 100644
index 0000000..bf35fac
--- /dev/null
+++ b/Doc/lib/librobotparser.tex
@@ -0,0 +1,68 @@
+\section{\module{robotparser} ---
+         Parser for \filenq{robots.txt}}
+
+\declaremodule{standard}{robotparser}
+\modulesynopsis{Accepts as input a list of lines or URL that refers to a
+                robots.txt file, parses the file, then builds a
+                set of rules from that list and answers questions
+                about fetchability of other URLs.}
+\sectionauthor{Skip Montanaro}{skip@mojam.com}
+
+\index{WWW}
+\index{World-Wide Web}
+\index{URL}
+\index{robots.txt}
+
+This module provides a single class, \class{RobotFileParser}, which answers
+questions about whether or not a particular user agent can fetch a URL on
+the web site that published the \file{robots.txt} file. For more details on
+the structure of \file{robots.txt} files, see
+\url{http://info.webcrawler.com/mak/projects/robots/norobots.html}.
+
+\begin{classdesc}{RobotFileParser}{}
+
+This class provides a set of methods to read, parse and answer questions
+about a single \file{robots.txt} file.
+
+\begin{methoddesc}{set_url}{url}
+Sets the URL referring to a \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{read}{}
+Reads the \file{robots.txt} URL and feeds it to the parser.
+\end{methoddesc}
+
+\begin{methoddesc}{parse}{lines}
+Parses the lines argument.
+\end{methoddesc}
+
+\begin{methoddesc}{can_fetch}{useragent, url}
+Returns true if the \var{useragent} is allowed to fetch the \var{url}
+according to the rules contained in the parsed \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{mtime}{}
+Returns the time the \code{robots.txt} file was last fetched. This is
+useful for long-running web spiders that need to check for new
+\code{robots.txt} files periodically.
+\end{methoddesc}
+
+\begin{methoddesc}{modified}{}
+Sets the time the \code{robots.txt} file was last fetched to the current
+time.
+\end{methoddesc}
+
+\end{classdesc}
+
+The following example demonstrates basic use of the RobotFileParser class.
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.set_url("http://www.musi-cal.com/robots.txt")
+>>> rp.read()
+>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
+0
+>>> rp.can_fetch("*", "http://www.musi-cal.com/")
+1
+\end{verbatim}
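Editor's note: the interactive example in the patch only exercises set_url(), read(), and can_fetch(). The sketch below is a minimal illustration of the periodic re-fetch pattern that the mtime() and modified() descriptions suggest for long-running spiders. It assumes the same Python 2-era robotparser module documented in the patch; the target site, the one-hour limit, and the allowed() helper are invented for illustration and are not part of this commit.

# Sketch only -- illustrates mtime()/modified() as documented above;
# the URL, interval, and helper name are assumptions, not from the patch.
import time
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("http://www.example.com/robots.txt")   # hypothetical site
rp.read()
rp.modified()          # record when the rules were fetched

def allowed(url, agent="*", max_age=3600):
    # Re-read the rules if the cached copy is older than max_age seconds.
    if time.time() - rp.mtime() > max_age:
        rp.read()
        rp.modified()
    return rp.can_fetch(agent, url)

print allowed("http://www.example.com/some/page.html")

Calling modified() explicitly after each read() keeps mtime() meaningful under the documented semantics, regardless of whether the parser records the fetch time on its own.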