author     Berker Peksag <berker.peksag@gmail.com>   2016-09-18 17:17:58 (GMT)
committer  Berker Peksag <berker.peksag@gmail.com>   2016-09-18 17:17:58 (GMT)
commit     9a7bbb2e3f12faaf4110ecd15fb739e94f4bc8f6 (patch)
tree       0b8430da11ccc8beb8fc9d4c70780543de1e781e
parent     85c98bf9682a46f7b15e9c79c68d38af8a9109b0 (diff)
download   cpython-9a7bbb2e3f12faaf4110ecd15fb739e94f4bc8f6.zip
           cpython-9a7bbb2e3f12faaf4110ecd15fb739e94f4bc8f6.tar.gz
           cpython-9a7bbb2e3f12faaf4110ecd15fb739e94f4bc8f6.tar.bz2
Issue #25400: RobotFileParser now correctly returns default values for crawl_delay and request_rate
Initial patch by Peter Wirtz.
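
For illustration only (not part of the patch), a minimal sketch of the fixed behaviour using the public urllib.robotparser API. The robots.txt content mirrors the new DefaultEntryTest fixture in the diff below; 'ExampleBot' is a hypothetical user agent with no group of its own:

```python
from urllib.robotparser import RobotFileParser

# robots.txt with only a wildcard group, mirroring the new DefaultEntryTest.
robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
"""

parser = RobotFileParser()
parser.parse(robots_txt.splitlines())

# 'ExampleBot' (hypothetical) matches no agent-specific group, so with this
# fix crawl_delay() and request_rate() fall back to the default ('*') entry
# instead of returning None.
print(parser.crawl_delay('ExampleBot'))   # 1
rate = parser.request_rate('ExampleBot')
print(rate.requests, rate.seconds)        # 3 15
```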
-rw-r--r-- | Lib/test/test_robotparser.py | 54
-rw-r--r-- | Lib/urllib/robotparser.py    |  8
-rw-r--r-- | Misc/NEWS                    |  3
3 files changed, 45 insertions, 20 deletions
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 51b48ce..0f64ba8 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -79,32 +79,17 @@ Disallow: /
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
-    robots_txt = """\
-User-agent: figtree
-Crawl-delay: 3
-Request-rate: 9/30
-Disallow: /tmp
-Disallow: /a%3cd.html
-Disallow: /a%2fb.html
-Disallow: /%7ejoe/index.html
-    """
-    agent = 'figtree'
-    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
-    crawl_delay = 3
-    good = [('figtree', '/foo.html')]
-    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
-           '/a%2fb.html', '/~joe/index.html']
+class BaseRequestRateTest(BaseRobotTest):
 
     def test_request_rate(self):
-        for url in self.good:
+        for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
                 if self.crawl_delay:
                     self.assertEqual(
                         self.parser.crawl_delay(agent), self.crawl_delay
                     )
-                if self.request_rate and self.parser.request_rate(agent):
+                if self.request_rate:
                     self.assertEqual(
                         self.parser.request_rate(agent).requests,
                         self.request_rate.requests
@@ -115,6 +100,24 @@ Disallow: /%7ejoe/index.html
                     )
 
 
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: figtree
+Crawl-delay: 3
+Request-rate: 9/30
+Disallow: /tmp
+Disallow: /a%3cd.html
+Disallow: /a%2fb.html
+Disallow: /%7ejoe/index.html
+    """
+    agent = 'figtree'
+    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
+    crawl_delay = 3
+    good = [('figtree', '/foo.html')]
+    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
+           '/a%2fb.html', '/~joe/index.html']
+
+
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
     # these are not actually tested, but we still need to parse it
@@ -230,6 +233,19 @@ Disallow: /another/path?
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+    """
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@ class NetworkTestCase(unittest.TestCase):
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 if __name__=='__main__':
     unittest.main()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 85add16..9dab4c1 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -175,16 +175,20 @@ class RobotFileParser:
         return True
 
     def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return None
+        return self.default_entry.delay
 
     def request_rate(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return None
+        return self.default_entry.req_rate
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -29,6 +29,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #25400: RobotFileParser now correctly returns default values for
+  crawl_delay and request_rate. Initial patch by Peter Wirtz.
+
 - Issue #27932: Prevent memory leak in win32_ver().
 
 - Fix UnboundLocalError in socket._sendfile_use_sendfile.
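
The mtime() guard added above also keeps both accessors safe when no robots.txt has been read yet; without it, the new default_entry fallback would fail on an unread parser. A minimal sketch matching the new NetworkTestCase assertions (the URL is only illustrative and is never fetched):

```python
from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
parser.set_url('http://example.com/robots.txt')  # illustrative; read() is not called

# Nothing has been parsed, so mtime() is 0 and both accessors return None
# rather than dereferencing self.default_entry.
print(parser.mtime())             # 0
print(parser.crawl_delay('*'))    # None
print(parser.request_rate('*'))   # None
```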