summaryrefslogtreecommitdiffstats
path: root/Lib/urllib/robotparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/urllib/robotparser.py')
-rw-r--r--Lib/urllib/robotparser.py39
1 files changed, 37 insertions, 2 deletions
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 4fbb0cb..4ac553a 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
http://www.robotstxt.org/norobots-rfc.txt
"""
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
__all__ = ["RobotFileParser"]
@@ -120,10 +122,29 @@ class RobotFileParser:
if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
state = 2
+ elif line[0] == "crawl-delay":
+ if state != 0:
+ # before trying to convert to int we need to make
+ # sure that robots.txt has valid syntax otherwise
+ # it will crash
+ if line[1].strip().isdigit():
+ entry.delay = int(line[1])
+ state = 2
+ elif line[0] == "request-rate":
+ if state != 0:
+ numbers = line[1].split('/')
+ # check if all values are sane
+ if (len(numbers) == 2 and numbers[0].strip().isdigit()
+ and numbers[1].strip().isdigit()):
+ req_rate = collections.namedtuple('req_rate',
+ 'requests seconds')
+ entry.req_rate = req_rate
+ entry.req_rate.requests = int(numbers[0])
+ entry.req_rate.seconds = int(numbers[1])
+ state = 2
if state == 2:
self._add_entry(entry)
-
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
if self.disallow_all:
@@ -153,6 +174,18 @@ class RobotFileParser:
# agent not found ==> access granted
return True
+ def crawl_delay(self, useragent):
+ for entry in self.entries:
+ if entry.applies_to(useragent):
+ return entry.delay
+ return None
+
+ def request_rate(self, useragent):
+ for entry in self.entries:
+ if entry.applies_to(useragent):
+ return entry.req_rate
+ return None
+
def __str__(self):
return ''.join([str(entry) + "\n" for entry in self.entries])
@@ -180,6 +213,8 @@ class Entry:
def __init__(self):
self.useragents = []
self.rulelines = []
+ self.delay = None
+ self.req_rate = None
def __str__(self):
ret = []