author    Skip Montanaro <skip@pobox.com>  2008-04-28 03:27:53 (GMT)
committer Skip Montanaro <skip@pobox.com>  2008-04-28 03:27:53 (GMT)
commit    b8bdbc04e702409e5aaaaff74c6e5cd93226af07 (patch)
tree      90e02f7cfb9fcb6f980a9f8ccad46bf7e6ab27e1 /Lib/robotparser.py
parent    dfd982715bc81103dfcb3eecdccff32675a772a3 (diff)
Get rid of _test(), _main(), _debug() and _check(). Tests are no longer needed
(better set available in Lib/test/test_robotparser.py). Clean up a few PEP 8
nits (compound statements on a single line, whitespace around operators).
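A rough sketch of the unittest style that supersedes the ad hoc _check()/_test()
helpers removed below (the robots.txt rules, URLs and test names here are invented
for illustration; the real cases live in Lib/test/test_robotparser.py):

    import unittest
    import robotparser

    class RobotParserSketch(unittest.TestCase):
        def test_parse_and_can_fetch(self):
            # invented robots.txt content, fed straight to parse()
            rp = robotparser.RobotFileParser()
            rp.parse([
                "User-agent: CherryPickerSE",
                "Disallow: /cgi-bin/event-search",
                "",
                "User-agent: *",
                "Disallow: /search",
            ])
            # the named agent matches by substring; the version suffix is ignored
            self.assertFalse(rp.can_fetch(
                'CherryPickerSE/1.0',
                'http://example.com/cgi-bin/event-search?city=San+Francisco'))
            # the catch-all entry blocks /search; everything else is allowed
            self.assertFalse(rp.can_fetch('spam', 'http://example.com/search'))
            self.assertTrue(rp.can_fetch('spam', 'http://example.com/'))

    if __name__ == '__main__':
        unittest.main()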
Diffstat (limited to 'Lib/robotparser.py')
-rw-r--r--  Lib/robotparser.py | 105
1 file changed, 12 insertions(+), 93 deletions(-)
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index 52ab348..5b1d797 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -9,15 +9,11 @@
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
-import urlparse,urllib
+import urlparse
+import urllib
__all__ = ["RobotFileParser"]
-debug = 0
-
-def _debug(msg):
- if debug: print msg
-
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
@@ -67,12 +63,9 @@ class RobotFileParser:
self.errcode = opener.errcode
if self.errcode in (401, 403):
self.disallow_all = True
- _debug("disallow all")
elif self.errcode >= 400:
self.allow_all = True
- _debug("allow all")
elif self.errcode == 200 and lines:
- _debug("parse lines")
self.parse(lines)
def _add_entry(self, entry):
@@ -93,19 +86,16 @@ class RobotFileParser:
for line in lines:
linenumber = linenumber + 1
if not line:
- if state==1:
- _debug("line %d: warning: you should insert"
- " allow: or disallow: directives below any"
- " user-agent: line" % linenumber)
+ if state == 1:
entry = Entry()
state = 0
- elif state==2:
+ elif state == 2:
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
i = line.find('#')
- if i>=0:
+ if i >= 0:
line = line[:i]
line = line.strip()
if not line:
@@ -115,41 +105,24 @@ class RobotFileParser:
line[0] = line[0].strip().lower()
line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent":
- if state==2:
- _debug("line %d: warning: you should insert a blank"
- " line before any user-agent"
- " directive" % linenumber)
+ if state == 2:
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
elif line[0] == "disallow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], False))
state = 2
elif line[0] == "allow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
- else:
- _debug("line %d: warning: unknown key %s" % (linenumber,
- line[0]))
- else:
- _debug("line %d: error: malformed line %s"%(linenumber, line))
- if state==2:
+ if state == 2:
self.entries.append(entry)
- _debug("Parsed rules:\n%s" % str(self))
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
- _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
- (useragent, url))
if self.disallow_all:
return False
if self.allow_all:
@@ -182,10 +155,10 @@ class RuleLine:
self.allowance = allowance
def applies_to(self, filename):
- return self.path=="*" or filename.startswith(self.path)
+ return self.path == "*" or filename.startswith(self.path)
def __str__(self):
- return (self.allowance and "Allow" or "Disallow")+": "+self.path
+ return (self.allowance and "Allow" or "Disallow") + ": " + self.path
class Entry:
@@ -207,7 +180,7 @@ class Entry:
# split the name token and make it lower case
useragent = useragent.split("/")[0].lower()
for agent in self.useragents:
- if agent=='*':
+ if agent == '*':
# we have the catch-all agent
return True
agent = agent.lower()
@@ -220,7 +193,6 @@ class Entry:
- our agent applies to this entry
- filename is URL decoded"""
for line in self.rulelines:
- _debug((filename, str(line), line.allowance))
if line.applies_to(filename):
return line.allowance
return True
@@ -239,56 +211,3 @@ class URLopener(urllib.FancyURLopener):
self.errcode = errcode
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
errmsg, headers)
-
-def _check(a,b):
- if not b:
- ac = "access denied"
- else:
- ac = "access allowed"
- if a!=b:
- print "failed"
- else:
- print "ok (%s)" % ac
- print
-
-def _test():
- global debug
- rp = RobotFileParser()
- debug = 1
-
- # robots.txt that exists, gotten to by redirection
- rp.set_url('http://www.musi-cal.com/robots.txt')
- rp.read()
-
- # test for re.escape
- _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
- # this should match the first rule, which is a disallow
- _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
- # various cherry pickers
- _check(rp.can_fetch('CherryPickerSE',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.0',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.5',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- # case sensitivity
- _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
- _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
- # substring test
- _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
- # tests for catch-all * agent
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
- # robots.txt that does not exist
- rp.set_url('http://www.lycos.com/robots.txt')
- rp.read()
- _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
- _test()
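For reference, the public surface that survives this cleanup is the usual
set_url()/read()/can_fetch() sequence. A minimal sketch follows; example.com,
the rules and MyCrawler are placeholders, and parse() is fed directly so the
sketch runs without the network fetch that read() would normally perform
(where, per the read() hunk above, a 401/403 response sets disallow_all and
any other >= 400 response sets allow_all):

    import robotparser

    rp = robotparser.RobotFileParser()
    # networked pattern (behavior shown in the read() hunk above):
    #   rp.set_url('http://example.com/robots.txt')
    #   rp.read()
    # offline: hand parse() the robots.txt lines directly
    rp.parse([
        "User-agent: *",
        "Disallow: /cgi-bin/",
    ])
    print rp.can_fetch("MyCrawler/1.0", "http://example.com/cgi-bin/search")  # False
    print rp.can_fetch("MyCrawler/1.0", "http://example.com/index.html")      # True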