Diffstat (limited to 'Lib')
-rw-r--r--   Lib/robotparser.py   123
1 file changed, 89 insertions(+), 34 deletions(-)
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index d627c9a..ff25dfe 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -39,28 +39,19 @@ class RobotFileParser:
self.host, self.path = urlparse.urlparse(url)[1:3]
def read(self):
- import httplib
- tries = 0
- while tries<5:
- connection = httplib.HTTP(self.host)
- connection.putrequest("GET", self.path)
- connection.putheader("Host", self.host)
- connection.endheaders()
- status, text, mime = connection.getreply()
- if status in [301,302] and mime:
- tries = tries + 1
- newurl = mime.get("Location", mime.get("Uri", ""))
- newurl = urlparse.urljoin(self.url, newurl)
- self.set_url(newurl)
- else:
- break
- if status==401 or status==403:
+ opener = URLopener()
+ f = opener.open(self.url)
+ lines = f.readlines()
+ self.errcode = opener.errcode
+ if self.errcode == 401 or self.errcode == 403:
self.disallow_all = 1
- elif status>=400:
+ _debug("disallow all")
+ elif self.errcode >= 400:
self.allow_all = 1
- else:
- # status < 400
- self.parse(connection.getfile().readlines())
+ _debug("allow all")
+ elif self.errcode == 200 and lines:
+ _debug("parse lines")
+ self.parse(lines)
def parse(self, lines):
"""parse the input lines from a robot.txt file.
@@ -129,15 +120,15 @@ class RobotFileParser:
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
- _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
+ _debug("Checking robot.txt allowance for:\n user agent: %s\n url: %s" %
+ (useragent, url))
if self.disallow_all:
return 0
if self.allow_all:
return 1
# search for given user agent matches
# the first match counts
- useragent = useragent.lower()
- url = urllib.quote(urlparse.urlparse(url)[2])
+ url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
@@ -181,11 +172,16 @@ class Entry:
return ret
def applies_to(self, useragent):
- "check if this entry applies to the specified agent"
+ """check if this entry applies to the specified agent"""
+ # split the name token and make it lower case
+ useragent = useragent.split("/")[0].lower()
for agent in self.useragents:
- if agent=="*":
+ if agent=='*':
+ # we have the catch-all agent
return 1
- if re.match(agent, useragent):
+ agent = agent.lower()
+ # don't forget to re.escape
+ if re.search(re.escape(useragent), agent):
return 1
return 0
@@ -194,25 +190,84 @@ class Entry:
- our agent applies to this entry
- filename is URL decoded"""
for line in self.rulelines:
+ _debug((filename, str(line), line.allowance))
if line.applies_to(filename):
return line.allowance
return 1
+class URLopener(urllib.FancyURLopener):
+ def __init__(self, *args):
+ apply(urllib.FancyURLopener.__init__, (self,) + args)
+ self.errcode = 200
+ self.tries = 0
+ self.maxtries = 10
+
+ def http_error_default(self, url, fp, errcode, errmsg, headers):
+ self.errcode = errcode
+ return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
+ errmsg, headers)
+
+ def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+ self.tries += 1
+ if self.tries >= self.maxtries:
+ return self.http_error_default(url, fp, 500,
+ "Internal Server Error: Redirect Recursion",
+ headers)
+ result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
+ errmsg, headers, data)
+ self.tries = 0
+ return result
+
+def _check(a,b):
+ if not b:
+ ac = "access denied"
+ else:
+ ac = "access allowed"
+ if a!=b:
+ print "failed"
+ else:
+ print "ok (%s)" % ac
+ print
def _test():
global debug
import sys
rp = RobotFileParser()
debug = 1
- if len(sys.argv) <= 1:
- rp.set_url('http://www.musi-cal.com/robots.txt')
- rp.read()
- else:
- rp.parse(open(sys.argv[1]).readlines())
- print rp.can_fetch('*', 'http://www.musi-cal.com/')
- print rp.can_fetch('Musi-Cal-Robot/1.0',
+
+ # robots.txt that exists, gotten to by redirection
+ rp.set_url('http://www.musi-cal.com/robots.txt')
+ rp.read()
+
+ # test for re.escape
+ _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
+ # this should match the first rule, which is a disallow
+ _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
+ # various cherry pickers
+ _check(rp.can_fetch('CherryPickerSE',
+ 'http://www.musi-cal.com/cgi-bin/event-search'
+ '?city=San+Francisco'), 0)
+ _check(rp.can_fetch('CherryPickerSE/1.0',
'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco')
+ '?city=San+Francisco'), 0)
+ _check(rp.can_fetch('CherryPickerSE/1.5',
+ 'http://www.musi-cal.com/cgi-bin/event-search'
+ '?city=San+Francisco'), 0)
+ # case sensitivity
+ _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
+ _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
+ # substring test
+ _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
+ # tests for catch-all * agent
+ _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
+ _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
+ _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+ _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+
+ # robots.txt that does not exist
+ rp.set_url('http://www.lycos.com/robots.txt')
+ rp.read()
+ _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
if __name__ == '__main__':
_test()
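
For reference, a minimal usage sketch of the patched module. The host, path, and agent name below are illustrative and not taken from the patch; this assumes the file above is importable as robotparser under Python 2.

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')   # illustrative URL
    rp.read()   # fetches via URLopener and records opener.errcode:
                # 401/403 sets disallow_all, any other status >= 400
                # sets allow_all, 200 with content is parsed
    if rp.can_fetch('ExampleBot/1.0', 'http://www.example.com/private/page'):
        print "fetch allowed"
    else:
        print "fetch disallowed"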