summary | refs | log | tree | commit | diff | stats
path: root/Lib/test/test_robotparser.py
diff options
context:
space:
mode:
author: Christopher Beacham <mcscope@gmail.com>, 2018-05-16 14:52:07 (GMT)
committer: Ned Deily <nad@python.org>, 2018-05-16 14:52:07 (GMT)
commit: 5db5c0669e624767375593cc1a01f32092c91c58 (patch)
tree: 0172f5c0c9ae6879ca21c82c175be1c2b829b4c9 /Lib/test/test_robotparser.py
parent: 7a1c02750171d9895754da5d560700aaba93da56 (diff)
download: cpython-5db5c0669e624767375593cc1a01f32092c91c58.zip
cpython-5db5c0669e624767375593cc1a01f32092c91c58.tar.gz
cpython-5db5c0669e624767375593cc1a01f32092c91c58.tar.bz2
bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
Diffstat (limited to 'Lib/test/test_robotparser.py')
-rw-r--r-- Lib/test/test_robotparser.py | 21
1 files changed, 21 insertions, 0 deletions
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index bee8d23..84a267a 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -12,6 +12,7 @@ class BaseRobotTest:
agent = 'test_robotparser'
good = []
bad = []
+ site_maps = None
def setUp(self):
lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ class BaseRobotTest:
with self.subTest(url=url, agent=agent):
self.assertFalse(self.parser.can_fetch(agent, url))
+ def test_site_maps(self):
+ self.assertEqual(self.parser.site_maps(), self.site_maps)
+
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
@@ -65,6 +69,23 @@ Disallow:
bad = ['/cyberworld/map/index.html']
+class SitemapTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+ """
+ good = ['/', '/test.html']
+ bad = ['/cyberworld/map/index.html']
+ site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
+ 'http://www.google.com/hostednews/sitemap_index.xml']
+
+
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
# go away