diff options
author | Christopher Beacham <mcscope@gmail.com> | 2018-05-16 14:52:07 (GMT) |
---|---|---|
committer | Ned Deily <nad@python.org> | 2018-05-16 14:52:07 (GMT) |
commit | 5db5c0669e624767375593cc1a01f32092c91c58 (patch) | |
tree | 0172f5c0c9ae6879ca21c82c175be1c2b829b4c9 /Lib/urllib | |
parent | 7a1c02750171d9895754da5d560700aaba93da56 (diff) | |
download | cpython-5db5c0669e624767375593cc1a01f32092c91c58.zip cpython-5db5c0669e624767375593cc1a01f32092c91c58.tar.gz cpython-5db5c0669e624767375593cc1a01f32092c91c58.tar.bz2 |
bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- | Lib/urllib/robotparser.py | 12 |
1 files changed, 12 insertions, 0 deletions
```diff
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 92e4efe..7089916 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -27,6 +27,7 @@ class RobotFileParser:
 
     def __init__(self, url=''):
         self.entries = []
+        self.sitemaps = []
         self.default_entry = None
         self.disallow_all = False
         self.allow_all = False
@@ -141,6 +142,12 @@ class RobotFileParser:
                             and numbers[1].strip().isdigit()):
                             entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                         state = 2
+                elif line[0] == "sitemap":
+                    # According to http://www.sitemaps.org/protocol.html
+                    # "This directive is independent of the user-agent line,
+                    # so it doesn't matter where you place it in your file."
+                    # Therefore we do not change the state of the parser.
+                    self.sitemaps.append(line[1])
         if state == 2:
             self._add_entry(entry)
 
@@ -189,6 +196,11 @@ class RobotFileParser:
                 return entry.req_rate
         return self.default_entry.req_rate
 
+    def site_maps(self):
+        if not self.sitemaps:
+            return None
+        return self.sitemaps
+
     def __str__(self):
         entries = self.entries
         if self.default_entry is not None:
```