summary | refs | log | tree | commit | diff | stats
path: root/Lib/urllib
diff options
context:
space:
mode:
author: Christopher Beacham <mcscope@gmail.com> 2018-05-16 14:52:07 (GMT)
committer: Ned Deily <nad@python.org> 2018-05-16 14:52:07 (GMT)
commit: 5db5c0669e624767375593cc1a01f32092c91c58 (patch)
tree: 0172f5c0c9ae6879ca21c82c175be1c2b829b4c9 /Lib/urllib
parent: 7a1c02750171d9895754da5d560700aaba93da56 (diff)
download: cpython-5db5c0669e624767375593cc1a01f32092c91c58.zip
cpython-5db5c0669e624767375593cc1a01f32092c91c58.tar.gz
cpython-5db5c0669e624767375593cc1a01f32092c91c58.tar.bz2
bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- Lib/urllib/robotparser.py 12
1 files changed, 12 insertions, 0 deletions
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 92e4efe..7089916 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -27,6 +27,7 @@ class RobotFileParser:
def __init__(self, url=''):
self.entries = []
+ self.sitemaps = []
self.default_entry = None
self.disallow_all = False
self.allow_all = False
@@ -141,6 +142,12 @@ class RobotFileParser:
and numbers[1].strip().isdigit()):
entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
state = 2
+ elif line[0] == "sitemap":
+ # According to http://www.sitemaps.org/protocol.html
+ # "This directive is independent of the user-agent line,
+ # so it doesn't matter where you place it in your file."
+ # Therefore we do not change the state of the parser.
+ self.sitemaps.append(line[1])
if state == 2:
self._add_entry(entry)
@@ -189,6 +196,11 @@ class RobotFileParser:
return entry.req_rate
return self.default_entry.req_rate
+ def site_maps(self):
+ if not self.sitemaps:
+ return None
+ return self.sitemaps
+
def __str__(self):
entries = self.entries
if self.default_entry is not None: