Diffstat (limited to 'Lib/test/test_robotparser.py')
-rw-r--r-- | Lib/test/test_robotparser.py | 54
1 file changed, 36 insertions, 18 deletions
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 51b48ce..0f64ba8 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -79,32 +79,17 @@ Disallow: /
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
-    robots_txt = """\
-User-agent: figtree
-Crawl-delay: 3
-Request-rate: 9/30
-Disallow: /tmp
-Disallow: /a%3cd.html
-Disallow: /a%2fb.html
-Disallow: /%7ejoe/index.html
-    """
-    agent = 'figtree'
-    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
-    crawl_delay = 3
-    good = [('figtree', '/foo.html')]
-    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
-           '/a%2fb.html', '/~joe/index.html']
+class BaseRequestRateTest(BaseRobotTest):
 
     def test_request_rate(self):
-        for url in self.good:
+        for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
                 if self.crawl_delay:
                     self.assertEqual(
                         self.parser.crawl_delay(agent), self.crawl_delay
                     )
-                if self.request_rate and self.parser.request_rate(agent):
+                if self.request_rate:
                     self.assertEqual(
                         self.parser.request_rate(agent).requests,
                         self.request_rate.requests
@@ -115,6 +100,24 @@ Disallow: /%7ejoe/index.html
                     )
 
 
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: figtree
+Crawl-delay: 3
+Request-rate: 9/30
+Disallow: /tmp
+Disallow: /a%3cd.html
+Disallow: /a%2fb.html
+Disallow: /%7ejoe/index.html
+    """
+    agent = 'figtree'
+    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
+    crawl_delay = 3
+    good = [('figtree', '/foo.html')]
+    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
+           '/a%2fb.html', '/~joe/index.html']
+
+
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
     # these are not actually tested, but we still need to parse it
@@ -230,6 +233,19 @@ Disallow: /another/path?
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+    """
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@ class NetworkTestCase(unittest.TestCase):
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 if __name__=='__main__':
     unittest.main()
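
For context, the behavior these tests pin down can be demonstrated directly with urllib.robotparser (crawl_delay() and request_rate() exist as of Python 3.6). The following is a minimal sketch, not part of the commit, reusing the robots.txt rules from DefaultEntryTest above; the None fallbacks assume the accompanying robotparser change is in place (the diffstat here is filtered to the test file only).

    from urllib.robotparser import RobotFileParser

    # Same rules as DefaultEntryTest: the default '*' entry carries
    # the Crawl-delay and Request-rate values.
    robots_txt = """\
    User-agent: *
    Crawl-delay: 1
    Request-rate: 3/15
    Disallow: /cyberworld/map/
    """

    parser = RobotFileParser()
    parser.parse(robots_txt.splitlines())

    print(parser.crawl_delay('*'))                  # 1
    rate = parser.request_rate('*')                 # named tuple (requests, seconds)
    print(rate.requests, rate.seconds)              # 3 15
    print(parser.can_fetch('*', '/test.html'))      # True
    print(parser.can_fetch('*', '/cyberworld/map/index.html'))  # False

    # With no parsed rules there is no matching entry, so both
    # accessors fall back to None instead of raising -- the case the
    # new NetworkTestCase assertions cover.
    empty = RobotFileParser()
    empty.parse([])
    print(empty.crawl_delay('*'))   # None
    print(empty.request_rate('*'))  # None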