author	Fred Drake <fdrake@acm.org>	2000-03-31 17:51:10 (GMT)
committer	Fred Drake <fdrake@acm.org>	2000-03-31 17:51:10 (GMT)
commit	3c9f936eee8bd826b631ee886ec3bb063da35fbd (patch)
tree	54b7c58530933c9da34a6e63a372229202fe2826 /Doc/lib/librobotparser.tex
parent	32abe6f7d0a82358efd0494992f3c388d7b24036 (diff)
Two new sections. Preliminary.
Diffstat (limited to 'Doc/lib/librobotparser.tex')
-rw-r--r--  Doc/lib/librobotparser.tex  |  68
1 files changed, 68 insertions, 0 deletions
diff --git a/Doc/lib/librobotparser.tex b/Doc/lib/librobotparser.tex
new file mode 100644
index 0000000..bf35fac
--- /dev/null
+++ b/Doc/lib/librobotparser.tex
@@ -0,0 +1,68 @@
+\section{\module{robotparser} ---
+ Parser for \filenq{robots.txt}}
+
+\declaremodule{standard}{robotparser}
+\modulesynopsis{Accepts as input a list of lines or a URL that refers to a
+                robots.txt file, parses it into a set of rules, and
+                answers questions about the fetchability of other URLs.}
+\sectionauthor{Skip Montanaro}{skip@mojam.com}
+
+\index{WWW}
+\index{World-Wide Web}
+\index{URL}
+\index{robots.txt}
+
+This module provides a single class, \class{RobotFileParser}, which answers
+questions about whether or not a particular user agent can fetch a URL on
+the web site that published the \file{robots.txt} file. For more details on
+the structure of \file{robots.txt} files, see
+\url{http://info.webcrawler.com/mak/projects/robots/norobots.html}.
+
+\begin{classdesc}{RobotFileParser}{}
+
+This class provides a set of methods to read, parse and answer questions
+about a single \file{robots.txt} file.
+
+\begin{methoddesc}{set_url}{url}
+Sets the URL referring to a \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{read}{}
+Reads the \file{robots.txt} URL and feeds it to the parser.
+\end{methoddesc}
+
+\begin{methoddesc}{parse}{lines}
+Parses the \var{lines} argument, which should be a list of lines from a
+\file{robots.txt} file, as in the example below.
+\end{methoddesc}
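+
+Rules obtained from some other source can be handed to the parser
+directly and then queried with \method{can_fetch()} (described below).
+The rules, URLs, and results in this sketch are only illustrative; the
+lines are newline-terminated, as a file object's \method{readlines()}
+method would return them:
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.parse(["User-agent: *\n", "Disallow: /cgi-bin/\n"])
+>>> rp.can_fetch("*", "http://www.example.com/cgi-bin/search")
+0
+>>> rp.can_fetch("*", "http://www.example.com/")
+1
+\end{verbatim}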
+
+\begin{methoddesc}{can_fetch}{useragent, url}
+Returns true if the \var{useragent} is allowed to fetch the \var{url}
+according to the rules contained in the parsed \file{robots.txt} file.
+\end{methoddesc}
+
+\begin{methoddesc}{mtime}{}
+Returns the time the \file{robots.txt} file was last fetched.  This is
+useful for long-running web spiders that need to check for new
+\file{robots.txt} files periodically; an example appears at the end of
+this section.
+\end{methoddesc}
+
+\begin{methoddesc}{modified}{}
+Sets the time the \file{robots.txt} file was last fetched to the current
+time.
+\end{methoddesc}
+
+\end{classdesc}
+
+The following example demonstrates basic use of the
+\class{RobotFileParser} class.
+
+\begin{verbatim}
+>>> import robotparser
+>>> rp = robotparser.RobotFileParser()
+>>> rp.set_url("http://www.musi-cal.com/robots.txt")
+>>> rp.read()
+>>> rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
+0
+>>> rp.can_fetch("*", "http://www.musi-cal.com/")
+1
+\end{verbatim}
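+
+For a long-running spider, \method{mtime()} and \method{modified()} can
+be combined to decide when the cached rules have grown stale and should
+be fetched again.  The URL and the one-hour limit in this sketch are
+only illustrative:
+
+\begin{verbatim}
+import robotparser, time
+
+rp = robotparser.RobotFileParser()
+rp.set_url("http://www.musi-cal.com/robots.txt")
+rp.read()
+rp.modified()        # record when the rules were fetched
+
+# ... later, before requesting more pages ...
+if time.time() - rp.mtime() > 3600:   # rules older than one hour?
+    rp.read()        # re-fetch the rules
+    rp.modified()    # and record the new fetch time
+\end{verbatim}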