path: root/Lib/dos-8x3/robotpar.py
author     Guido van Rossum <guido@python.org>   2000-05-08 17:31:04 (GMT)
committer  Guido van Rossum <guido@python.org>   2000-05-08 17:31:04 (GMT)
commit     aad6761ccea28e0a0da6761570b18adc72e01c37 (patch)
tree       731b55d5648f08e1bc755bcace1f836413cd8aae /Lib/dos-8x3/robotpar.py
parent     0b095bc0929fb43157019c50e3e680a29ec94a65 (diff)
The usual...
Diffstat (limited to 'Lib/dos-8x3/robotpar.py')
-rw-r--r--  Lib/dos-8x3/robotpar.py  97
1 file changed, 97 insertions, 0 deletions
diff --git a/Lib/dos-8x3/robotpar.py b/Lib/dos-8x3/robotpar.py
new file mode 100644
index 0000000..3f4396b
--- /dev/null
+++ b/Lib/dos-8x3/robotpar.py
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+"""
+
+class RobotFileParser:
+
+ def __init__(self):
+ self.rules = {}
+ self.debug = 0
+ self.url = ''
+ self.last_checked = 0
+
+ def mtime(self):
+ return self.last_checked
+
+ def modified(self):
+ import time
+ self.last_checked = time.time()
+
+ def set_url(self, url):
+ self.url = url
+
+ def read(self):
+ import urllib
+ self.parse(urllib.urlopen(self.url).readlines())
+
+ def parse(self, lines):
+ """parse the input lines from a robot.txt file"""
+ import string, re
+ active = []
+ for line in lines:
+ if self.debug: print '>', line,
+ # blank line terminates current record
+ if not line[:-1]:
+ active = []
+ continue
+ # remove optional comment and strip line
+ line = string.strip(line[:string.find(line, '#')])
+ if not line:
+ continue
+ line = re.split(' *: *', line)
+ if len(line) == 2:
+ line[0] = string.lower(line[0])
+ if line[0] == 'user-agent':
+ # this record applies to this user agent
+ if self.debug: print '>> user-agent:', line[1]
+ active.append(line[1])
+ if not self.rules.has_key(line[1]):
+ self.rules[line[1]] = []
+ elif line[0] == 'disallow':
+ if line[1]:
+ if self.debug: print '>> disallow:', line[1]
+ for agent in active:
+ self.rules[agent].append(re.compile(line[1]))
+ else:
+ pass
+ for agent in active:
+ if self.debug: print '>> allow', agent
+ self.rules[agent] = []
+ else:
+ if self.debug: print '>> unknown:', line
+
+ self.modified()
+
+ # returns true if agent is allowed to fetch url
+ def can_fetch(self, useragent, url):
+ """using the parsed robots.txt decide if useragent can fetch url"""
+ import urlparse
+ ag = useragent
+ if not self.rules.has_key(ag): ag = '*'
+ if not self.rules.has_key(ag):
+ if self.debug: print '>> allowing', url, 'fetch by', useragent
+ return 1
+ path = urlparse.urlparse(url)[2]
+ for rule in self.rules[ag]:
+ if rule.match(path) is not None:
+ if self.debug: print '>> disallowing', url, 'fetch by', useragent
+ return 0
+ if self.debug: print '>> allowing', url, 'fetch by', useragent
+ return 1
+
+def _test():
+ rp = RobotFileParser()
+ rp.debug = 1
+ rp.set_url('http://www.musi-cal.com/robots.txt')
+ rp.read()
+ print rp.rules
+ print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
+ print rp.can_fetch('Musi-Cal-Robot',
+ 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
+
+if __name__ == "__main__":
+ _test()
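
A note for readers of this historical diff: this dos-8x3 file is the DOS 8.3-filename copy of robotparser.py, whose descendant now ships as urllib.robotparser in Python 3. The sketch below is a minimal, illustrative translation of the checks in _test() to that modern API; it is not part of this commit. The site and user-agent strings mirror the test above, so results against the live robots.txt depend on whatever that site serves today, and the in-memory rules at the end are a hypothetical example.

    # Minimal sketch (not part of this commit) using the modern descendant
    # of this module: urllib.robotparser from Python 3.
    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()                      # fetch and parse the live robots.txt
    print(rp.can_fetch('*', 'http://www.musi-cal.com/'))
    print(rp.can_fetch('Musi-Cal-Robot',
                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'))

    # parse() also accepts an iterable of lines, so rules can be supplied
    # without any network access (hypothetical example rules):
    rp2 = RobotFileParser()
    rp2.parse([
        'User-agent: *',
        'Disallow: /cgi-bin/',
    ])
    print(rp2.can_fetch('*', 'http://example.com/index.html'))   # allowed
    print(rp2.can_fetch('*', 'http://example.com/cgi-bin/foo'))  # disallowed

Unlike the version added in this diff, which compiles each Disallow value as a regular expression, the modern parser treats Disallow values as URL path prefixes, which is closer to the usual robots.txt convention.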