author     Berker Peksag <berker.peksag@gmail.com>  2016-09-18 17:17:58 (GMT)
committer  Berker Peksag <berker.peksag@gmail.com>  2016-09-18 17:17:58 (GMT)
commit     9a7bbb2e3f12faaf4110ecd15fb739e94f4bc8f6 (patch)
tree       0b8430da11ccc8beb8fc9d4c70780543de1e781e
parent     85c98bf9682a46f7b15e9c79c68d38af8a9109b0 (diff)
Issue #25400: RobotFileParser now correctly returns default values for crawl_delay and request_rate
Initial patch by Peter Wirtz.
-rw-r--r--  Lib/test/test_robotparser.py  54
-rw-r--r--  Lib/urllib/robotparser.py      8
-rw-r--r--  Misc/NEWS                      3
3 files changed, 45 insertions(+), 20 deletions(-)
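
Before this change, crawl_delay() and request_rate() returned None whenever no entry matched the requested user agent, even when robots.txt carried those directives in its default (User-agent: *) group; both methods now fall back to the default entry. A minimal sketch of the fixed behavior, reusing the robots.txt body from the new DefaultEntryTest (the agent name 'FooBot' is illustrative):

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
""".splitlines())

# 'FooBot' matches no named entry, so the '*' group applies.
print(parser.crawl_delay('FooBot'))  # 1 (was None before this fix)
rate = parser.request_rate('FooBot')
print(rate.requests, rate.seconds)   # 3 15 (was None before this fix)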
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 51b48ce..0f64ba8 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -79,32 +79,17 @@ Disallow: /
bad = ['/cyberworld/map/index.html', '/', '/tmp/']
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
- robots_txt = """\
-User-agent: figtree
-Crawl-delay: 3
-Request-rate: 9/30
-Disallow: /tmp
-Disallow: /a%3cd.html
-Disallow: /a%2fb.html
-Disallow: /%7ejoe/index.html
- """
- agent = 'figtree'
- request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
- crawl_delay = 3
- good = [('figtree', '/foo.html')]
- bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
- '/a%2fb.html', '/~joe/index.html']
+class BaseRequestRateTest(BaseRobotTest):
def test_request_rate(self):
- for url in self.good:
+ for url in self.good + self.bad:
agent, url = self.get_agent_and_url(url)
with self.subTest(url=url, agent=agent):
if self.crawl_delay:
self.assertEqual(
self.parser.crawl_delay(agent), self.crawl_delay
)
- if self.request_rate and self.parser.request_rate(agent):
+ if self.request_rate:
self.assertEqual(
self.parser.request_rate(agent).requests,
self.request_rate.requests
@@ -115,6 +100,24 @@ Disallow: /%7ejoe/index.html
)
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: figtree
+Crawl-delay: 3
+Request-rate: 9/30
+Disallow: /tmp
+Disallow: /a%3cd.html
+Disallow: /a%2fb.html
+Disallow: /%7ejoe/index.html
+ """
+ agent = 'figtree'
+ request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
+ crawl_delay = 3
+ good = [('figtree', '/foo.html')]
+ bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
+ '/a%2fb.html', '/~joe/index.html']
+
+
class DifferentAgentTest(CrawlDelayAndRequestRateTest):
agent = 'FigTree Robot libwww-perl/5.04'
# these are not actually tested, but we still need to parse it
@@ -230,6 +233,19 @@ Disallow: /another/path?
bad = ['/another/path?']
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+ """
+ request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+ crawl_delay = 1
+ good = ['/', '/test.html']
+ bad = ['/cyberworld/map/index.html']
+
+
class RobotHandler(BaseHTTPRequestHandler):
def do_GET(self):
@@ -309,6 +325,8 @@ class NetworkTestCase(unittest.TestCase):
self.assertTrue(parser.allow_all)
self.assertFalse(parser.disallow_all)
self.assertEqual(parser.mtime(), 0)
+ self.assertIsNone(parser.crawl_delay('*'))
+ self.assertIsNone(parser.request_rate('*'))
if __name__=='__main__':
unittest.main()
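
The two assertions added to NetworkTestCase pin down the other half of the fix: when no robots.txt has been parsed yet, mtime() is still 0, and the new guard makes both methods return None instead of consulting entries that do not exist. A minimal sketch with a freshly constructed parser:

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
# Nothing has been fetched or parsed yet, so mtime() is 0 and the
# new guard in crawl_delay()/request_rate() short-circuits to None.
assert parser.mtime() == 0
assert parser.crawl_delay('*') is None
assert parser.request_rate('*') is None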
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 85add16..9dab4c1 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -175,16 +175,20 @@ class RobotFileParser:
return True
def crawl_delay(self, useragent):
+ if not self.mtime():
+ return None
for entry in self.entries:
if entry.applies_to(useragent):
return entry.delay
- return None
+ return self.default_entry.delay
def request_rate(self, useragent):
+ if not self.mtime():
+ return None
for entry in self.entries:
if entry.applies_to(useragent):
return entry.req_rate
- return None
+ return self.default_entry.req_rate
def __str__(self):
return ''.join([str(entry) + "\n" for entry in self.entries])
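
The lookup order in the fixed methods matches can_fetch(): a named entry that applies to the user agent wins, and the default (User-agent: *) entry is consulted only when none matches. A short sketch of that precedence (the robots.txt body and the agent 'OtherBot' are illustrative):

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: figtree
Crawl-delay: 3

User-agent: *
Crawl-delay: 1
""".splitlines())

print(parser.crawl_delay('figtree'))   # 3 -- the named entry wins
print(parser.crawl_delay('OtherBot'))  # 1 -- falls back to the '*' entry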
diff --git a/Misc/NEWS b/Misc/NEWS
index 671a9b4..e26a5c0 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -29,6 +29,9 @@ Core and Builtins
Library
-------
+- Issue #25400: RobotFileParser now correctly returns default values for
+ crawl_delay and request_rate. Initial patch by Peter Wirtz.
+
- Issue #27932: Prevent memory leak in win32_ver().
- Fix UnboundLocalError in socket._sendfile_use_sendfile.