From 1c63f6e48913f71968509ccf5708034f4ff09cf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?=
Date: Thu, 28 Feb 2002 15:24:47 +0000
Subject: Correct various errors: - Use substring search, not re search for user-agent and paths. - Consider * entry last. Unquote, then requote URLs. - Treat empty Disallow as "allow everything". Add test cases. Fixes #523041

---
NOTE(review): the line breaks and indentation of this patch were reconstructed
from the hunk headers and the diffstat after the original layout was lost.
Whitespace-only details (e.g. trailing spaces on blank lines) are unverified.

 Lib/robotparser.py           |  22 +++++--
 Lib/test/test_robotparser.py | 141 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 6 deletions(-)
 create mode 100644 Lib/test/test_robotparser.py

diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index 724ec28..bfc0739 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -22,6 +22,7 @@ def _debug(msg):
 class RobotFileParser:
     def __init__(self, url=''):
         self.entries = []
+        self.default_entry = None
         self.disallow_all = 0
         self.allow_all = 0
         self.set_url(url)
@@ -72,7 +73,11 @@ class RobotFileParser:
                     entry = Entry()
                     state = 0
                 elif state==2:
-                    self.entries.append(entry)
+                    if "*" in entry.useragents:
+                        # the default entry is considered last
+                        self.default_entry = entry
+                    else:
+                        self.entries.append(entry)
                     entry = Entry()
                     state = 0
             # remove optional comment and strip line
@@ -85,7 +90,7 @@ class RobotFileParser:
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = line[1].strip()
+                line[1] = urllib.unquote(line[1].strip())
                 if line[0] == "user-agent":
                     if state==2:
                         _debug("line %d: warning: you should insert a blank"
@@ -128,10 +133,13 @@ class RobotFileParser:
             return 1
         # search for given user agent matches
         # the first match counts
-        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
+        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
+        # try the default entry last
+        if self.default_entry:
+            return self.default_entry.allowance(url)
         # agent not found ==> access granted
         return 1
 
@@ -147,11 +155,14 @@ class RuleLine:
     """A rule line is a single "Allow:" (allowance==1) or "Disallow:"
        (allowance==0) followed by a path."""
     def __init__(self, path, allowance):
+        if path == '' and not allowance:
+            # an empty value means allow all
+            allowance = 1
         self.path = urllib.quote(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
-        return self.path=="*" or re.match(self.path, filename)
+        return self.path=="*" or filename.startswith(self.path)
 
     def __str__(self):
         return (self.allowance and "Allow" or "Disallow")+": "+self.path
@@ -180,8 +191,7 @@ class Entry:
                 # we have the catch-all agent
                 return 1
             agent = agent.lower()
-            # don't forget to re.escape
-            if re.search(re.escape(useragent), agent):
+            if useragent.find(agent) != -1:
                 return 1
         return 0
 
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
new file mode 100644
index 0000000..a010725
--- /dev/null
+++ b/Lib/test/test_robotparser.py
@@ -0,0 +1,141 @@
+import unittest, StringIO, robotparser, test_support
+
+class RobotTestCase(unittest.TestCase):
+    def __init__(self, index, parser, url, good, agent):
+        unittest.TestCase.__init__(self)
+        if good:
+            self.str = "RobotTest(%d, good, %s)" % (index, url)
+        else:
+            self.str = "RobotTest(%d, bad, %s)" % (index, url)
+        self.parser = parser
+        self.url = url
+        self.good = good
+        self.agent = agent
+
+    def runTest(self):
+        if isinstance(self.url, tuple):
+            agent, url = self.url
+        else:
+            url = self.url
+            agent = self.agent
+        if self.good:
+            self.failUnless(self.parser.can_fetch(agent, url))
+        else:
+            self.failIf(self.parser.can_fetch(agent, url))
+
+    def __str__(self):
+        return self.str
+
+tests = unittest.TestSuite()
+
+def RobotTest(index, robots_txt, good_urls, bad_urls,
+              agent="test_robotparser"):
+
+    lines = StringIO.StringIO(robots_txt).readlines()
+    parser = robotparser.RobotFileParser()
+    parser.parse(lines)
+    for url in good_urls:
+        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
+    for url in bad_urls:
+        tests.addTest(RobotTestCase(index, parser, url, 0, agent))
+
+# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)
+
+# 1.
+doc = """
+User-agent: *
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+Disallow: /tmp/ # these will soon disappear
+Disallow: /foo.html
+"""
+
+good = ['/','/test.html']
+bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']
+
+RobotTest(1, doc, good, bad)
+
+# 2.
+doc = """
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+# Cybermapper knows where to go.
+User-agent: cybermapper
+Disallow:
+
+"""
+
+good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
+bad = ['/cyberworld/map/index.html']
+
+RobotTest(2, doc, good, bad)
+
+# 3.
+doc = """
+# go away
+User-agent: *
+Disallow: /
+"""
+
+good = []
+bad = ['/cyberworld/map/index.html','/','/tmp/']
+
+RobotTest(3, doc, good, bad)
+
+# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)
+
+# 4.
+doc = """
+User-agent: figtree
+Disallow: /tmp
+Disallow: /a%3cd.html
+Disallow: /a%2fb.html
+Disallow: /%7ejoe/index.html
+"""
+
+good = [] # XFAIL '/a/b.html'
+bad = ['/tmp','/tmp.html','/tmp/a.html',
+       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
+       '/~joe/index.html'
+       ]
+
+RobotTest(4, doc, good, bad, 'figtree')
+RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')
+
+# 6.
+doc = """
+User-agent: *
+Disallow: /tmp/
+Disallow: /a%3Cd.html
+Disallow: /a/b.html
+Disallow: /%7ejoe/index.html
+"""
+
+good = ['/tmp',] # XFAIL: '/a%2fb.html'
+bad = ['/tmp/','/tmp/a.html',
+       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
+       '/%7Ejoe/index.html']
+
+RobotTest(6, doc, good, bad)
+
+# From bug report #523041
+
+# 7.
+doc = """
+User-Agent: *
+Disallow: /.
+"""
+
+good = ['/foo.html']
+bad = [] # Bug report says "/" should be denied, but that is not in the RFC
+
+RobotTest(7, doc, good, bad)
+
+def test_main():
+    test_support.run_suite(tests)
+
+if __name__=='__main__':
+    test_support.Verbose = 1
+    test_support.run_suite(tests)
-- 
cgit v0.12