-rw-r--r--  Lib/robotparser.py | 123
1 file changed, 89 insertions(+), 34 deletions(-)
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index d627c9a..ff25dfe 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -39,28 +39,19 @@ class RobotFileParser:
         self.host, self.path = urlparse.urlparse(url)[1:3]
 
     def read(self):
-        import httplib
-        tries = 0
-        while tries<5:
-            connection = httplib.HTTP(self.host)
-            connection.putrequest("GET", self.path)
-            connection.putheader("Host", self.host)
-            connection.endheaders()
-            status, text, mime = connection.getreply()
-            if status in [301,302] and mime:
-                tries = tries + 1
-                newurl = mime.get("Location", mime.get("Uri", ""))
-                newurl = urlparse.urljoin(self.url, newurl)
-                self.set_url(newurl)
-            else:
-                break
-        if status==401 or status==403:
+        opener = URLopener()
+        f = opener.open(self.url)
+        lines = f.readlines()
+        self.errcode = opener.errcode
+        if self.errcode == 401 or self.errcode == 403:
             self.disallow_all = 1
-        elif status>=400:
+            _debug("disallow all")
+        elif self.errcode >= 400:
             self.allow_all = 1
-        else:
-            # status < 400
-            self.parse(connection.getfile().readlines())
+            _debug("allow all")
+        elif self.errcode == 200 and lines:
+            _debug("parse lines")
+            self.parse(lines)
 
     def parse(self, lines):
         """parse the input lines from a robot.txt file.
@@ -129,15 +120,15 @@ class RobotFileParser:
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
+        _debug("Checking robot.txt allowance for:\n user agent: %s\n url: %s" %
+               (useragent, url))
         if self.disallow_all:
             return 0
         if self.allow_all:
             return 1
         # search for given user agent matches
         # the first match counts
-        useragent = useragent.lower()
-        url = urllib.quote(urlparse.urlparse(url)[2])
+        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
@@ -181,11 +172,16 @@ class Entry:
         return ret
 
     def applies_to(self, useragent):
-        "check if this entry applies to the specified agent"
+        """check if this entry applies to the specified agent"""
+        # split the name token and make it lower case
+        useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=="*":
+            if agent=='*':
+                # we have the catch-all agent
                 return 1
-            if re.match(agent, useragent):
+            agent = agent.lower()
+            # don't forget to re.escape
+            if re.search(re.escape(useragent), agent):
                 return 1
         return 0
 
@@ -194,25 +190,84 @@ class Entry:
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
+            _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return 1
 
+class URLopener(urllib.FancyURLopener):
+    def __init__(self, *args):
+        apply(urllib.FancyURLopener.__init__, (self,) + args)
+        self.errcode = 200
+        self.tries = 0
+        self.maxtries = 10
+
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        self.errcode = errcode
+        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
+                                                        errmsg, headers)
+
+    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+        self.tries += 1
+        if self.tries >= self.maxtries:
+            return self.http_error_default(url, fp, 500,
+                                           "Internal Server Error: Redirect Recursion",
+                                           headers)
+        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
+                                                      errmsg, headers, data)
+        self.tries = 0
+        return result
+
+def _check(a,b):
+    if not b:
+        ac = "access denied"
+    else:
+        ac = "access allowed"
+    if a!=b:
+        print "failed"
+    else:
+        print "ok (%s)" % ac
+    print
 
 def _test():
     global debug
     import sys
     rp = RobotFileParser()
     debug = 1
-    if len(sys.argv) <= 1:
-        rp.set_url('http://www.musi-cal.com/robots.txt')
-        rp.read()
-    else:
-        rp.parse(open(sys.argv[1]).readlines())
-    print rp.can_fetch('*', 'http://www.musi-cal.com/')
-    print rp.can_fetch('Musi-Cal-Robot/1.0',
+
+    # robots.txt that exists, gotten to by redirection
+    rp.set_url('http://www.musi-cal.com/robots.txt')
+    rp.read()
+
+    # test for re.escape
+    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
+    # this should match the first rule, which is a disallow
+    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
+    # various cherry pickers
+    _check(rp.can_fetch('CherryPickerSE',
+                       'http://www.musi-cal.com/cgi-bin/event-search'
+                       '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco')
+                       '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.5',
+                       'http://www.musi-cal.com/cgi-bin/event-search'
+                       '?city=San+Francisco'), 0)
+    # case sensitivity
+    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
+    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
+    # substring test
+    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
+    # tests for catch-all * agent
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+
+    # robots.txt that does not exist
+    rp.set_url('http://www.lycos.com/robots.txt')
+    rp.read()
+    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
 
 if __name__ == '__main__':
     _test()
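
For reference, a minimal usage sketch (not part of the patch) of the module as it looks after this change. The robots.txt URL and user-agent string below are placeholders chosen for illustration, not values taken from the diff:

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')   # placeholder URL
    rp.read()   # fetched through the new URLopener; a 401/403 sets
                # disallow_all, any other error >= 400 sets allow_all

    # applies_to() now lowercases the agent and drops the "/version" suffix,
    # so 'ExampleBot/1.0' is compared case-insensitively against the
    # User-agent names parsed from robots.txt.
    if rp.can_fetch('ExampleBot/1.0', 'http://www.example.com/some/path'):
        print('fetch allowed')
    else:
        print('fetch disallowed')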