| author    | Skip Montanaro <skip@pobox.com> | 2008-04-28 03:27:53 (GMT) |
| committer | Skip Montanaro <skip@pobox.com> | 2008-04-28 03:27:53 (GMT) |
| commit    | b8bdbc04e702409e5aaaaff74c6e5cd93226af07 (patch) | |
| tree      | 90e02f7cfb9fcb6f980a9f8ccad46bf7e6ab27e1 /Lib/robotparser.py | |
| parent    | dfd982715bc81103dfcb3eecdccff32675a772a3 (diff) | |
| download  | cpython-b8bdbc04e702409e5aaaaff74c6e5cd93226af07.zip, cpython-b8bdbc04e702409e5aaaaff74c6e5cd93226af07.tar.gz, cpython-b8bdbc04e702409e5aaaaff74c6e5cd93226af07.tar.bz2 | |
Get rid of _test(), _main(), _debug() and _check(). Tests are no longer
needed (better set available in Lib/test/test_robotparser.py). Clean up a
few PEP 8 nits (compound statements on a single line, whitespace around
operators).
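The replacement tests live in Lib/test/test_robotparser.py. As a rough, hypothetical sketch of the unittest style that supersedes the removed _test()/_check() helpers (the robots.txt rules, agent names, and URLs below are invented for illustration and are not taken from the real test file), driving parse() with in-memory lines looks like this:

```python
# Hypothetical sketch only -- not the contents of Lib/test/test_robotparser.py.
# It feeds robots.txt lines straight to parse() and asserts on can_fetch().
import unittest
import robotparser

ROBOTS_TXT = """\
User-agent: CherryPickerSE
Disallow: /cgi-bin/

User-agent: *
Disallow: /private/
"""

class RobotFileParserSketchTest(unittest.TestCase):
    def setUp(self):
        self.rp = robotparser.RobotFileParser()
        self.rp.parse(ROBOTS_TXT.splitlines())

    def test_named_agent_is_blocked(self):
        # The first record matches the CherryPickerSE token case-insensitively.
        self.assertFalse(self.rp.can_fetch('CherryPickerSE/1.0',
                                           'http://example.com/cgi-bin/search'))

    def test_catch_all_agent(self):
        # The second record is the '*' catch-all entry.
        self.assertFalse(self.rp.can_fetch('spam',
                                           'http://example.com/private/page'))
        self.assertTrue(self.rp.can_fetch('spam',
                                          'http://example.com/index.html'))

if __name__ == '__main__':
    unittest.main()
```

Because the rules are supplied in memory, checks like these do not depend on a live web server the way the removed _test() driver did.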
Diffstat (limited to 'Lib/robotparser.py')
-rw-r--r-- | Lib/robotparser.py | 105 |
1 file changed, 12 insertions, 93 deletions
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index 52ab348..5b1d797 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -9,15 +9,11 @@
     The robots.txt Exclusion Protocol is implemented as specified in
     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
-import urlparse,urllib
+import urlparse
+import urllib
 
 __all__ = ["RobotFileParser"]
 
-debug = 0
-
-def _debug(msg):
-    if debug: print msg
-
 
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
@@ -67,12 +63,9 @@ class RobotFileParser:
         self.errcode = opener.errcode
         if self.errcode in (401, 403):
             self.disallow_all = True
-            _debug("disallow all")
         elif self.errcode >= 400:
             self.allow_all = True
-            _debug("allow all")
         elif self.errcode == 200 and lines:
-            _debug("parse lines")
             self.parse(lines)
 
     def _add_entry(self, entry):
@@ -93,19 +86,16 @@
         for line in lines:
             linenumber = linenumber + 1
             if not line:
-                if state==1:
-                    _debug("line %d: warning: you should insert"
-                           " allow: or disallow: directives below any"
-                           " user-agent: line" % linenumber)
+                if state == 1:
                     entry = Entry()
                     state = 0
-                elif state==2:
+                elif state == 2:
                     self._add_entry(entry)
                     entry = Entry()
                     state = 0
             # remove optional comment and strip line
             i = line.find('#')
-            if i>=0:
+            if i >= 0:
                 line = line[:i]
             line = line.strip()
             if not line:
@@ -115,41 +105,24 @@
                 line[0] = line[0].strip().lower()
                 line[1] = urllib.unquote(line[1].strip())
                 if line[0] == "user-agent":
-                    if state==2:
-                        _debug("line %d: warning: you should insert a blank"
-                               " line before any user-agent"
-                               " directive" % linenumber)
+                    if state == 2:
                         self._add_entry(entry)
                         entry = Entry()
                     entry.useragents.append(line[1])
                     state = 1
                 elif line[0] == "disallow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], False))
                         state = 2
                 elif line[0] == "allow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
-                else:
-                    _debug("line %d: warning: unknown key %s" % (linenumber,
-                               line[0]))
-            else:
-                _debug("line %d: error: malformed line %s"%(linenumber, line))
-        if state==2:
+        if state == 2:
             self.entries.append(entry)
-        _debug("Parsed rules:\n%s" % str(self))
 
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
-               (useragent, url))
         if self.disallow_all:
             return False
         if self.allow_all:
@@ -182,10 +155,10 @@ class RuleLine:
         self.allowance = allowance
 
     def applies_to(self, filename):
-        return self.path=="*" or filename.startswith(self.path)
+        return self.path == "*" or filename.startswith(self.path)
 
     def __str__(self):
-        return (self.allowance and "Allow" or "Disallow")+": "+self.path
+        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
 
 
 class Entry:
@@ -207,7 +180,7 @@
         # split the name token and make it lower case
         useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=='*':
+            if agent == '*':
                 # we have the catch-all agent
                 return True
             agent = agent.lower()
@@ -220,7 +193,6 @@
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
-            _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return True
@@ -239,56 +211,3 @@ class URLopener(urllib.FancyURLopener):
         self.errcode = errcode
         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)
-
-def _check(a,b):
-    if not b:
-        ac = "access denied"
-    else:
-        ac = "access allowed"
-    if a!=b:
-        print "failed"
-    else:
-        print "ok (%s)" % ac
-    print
-
-def _test():
-    global debug
-    rp = RobotFileParser()
-    debug = 1
-
-    # robots.txt that exists, gotten to by redirection
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
-
-    # test for re.escape
-    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
-    # this should match the first rule, which is a disallow
-    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
-    # various cherry pickers
-    _check(rp.can_fetch('CherryPickerSE',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.0',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.5',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    # case sensitivity
-    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
-    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
-    # substring test
-    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
-    # tests for catch-all * agent
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
-    # robots.txt that does not exist
-    rp.set_url('http://www.lycos.com/robots.txt')
-    rp.read()
-    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
-    _test()
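For reference, a minimal usage sketch of the cleaned-up Python 2 module as a caller would exercise it after this commit; the host name and user-agent string below are placeholders, not anything taken from the patch:

```python
# Minimal usage sketch (placeholder URL and agent string); the removed _test()
# driver did essentially this against live sites, with extra _debug() output.
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://www.example.com/robots.txt')
rp.read()   # 401/403 -> disallow everything, other errors -> allow everything

if rp.can_fetch('ExampleBot/0.1', 'http://www.example.com/some/page.html'):
    print "fetch allowed"
else:
    print "fetch disallowed"
```

Nothing is printed on this path any more; results surface only through can_fetch()'s return value, and diagnostics belong to the test suite.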