| author | Guido van Rossum <guido@python.org> | 2000-05-08 17:31:04 (GMT) |
|---|---|---|
| committer | Guido van Rossum <guido@python.org> | 2000-05-08 17:31:04 (GMT) |
| commit | aad6761ccea28e0a0da6761570b18adc72e01c37 (patch) | |
| tree | 731b55d5648f08e1bc755bcace1f836413cd8aae /Lib/dos-8x3/robotpar.py | |
| parent | 0b095bc0929fb43157019c50e3e680a29ec94a65 (diff) | |
The usual...
Diffstat (limited to 'Lib/dos-8x3/robotpar.py')
-rw-r--r-- | Lib/dos-8x3/robotpar.py | 97 |
1 file changed, 97 insertions(+), 0 deletions(-)
```diff
diff --git a/Lib/dos-8x3/robotpar.py b/Lib/dos-8x3/robotpar.py
new file mode 100644
index 0000000..3f4396b
--- /dev/null
+++ b/Lib/dos-8x3/robotpar.py
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+"""
+
+class RobotFileParser:
+
+    def __init__(self):
+        self.rules = {}
+        self.debug = 0
+        self.url = ''
+        self.last_checked = 0
+
+    def mtime(self):
+        return self.last_checked
+
+    def modified(self):
+        import time
+        self.last_checked = time.time()
+
+    def set_url(self, url):
+        self.url = url
+
+    def read(self):
+        import urllib
+        self.parse(urllib.urlopen(self.url).readlines())
+
+    def parse(self, lines):
+        """parse the input lines from a robot.txt file"""
+        import string, re
+        active = []
+        for line in lines:
+            if self.debug: print '>', line,
+            # blank line terminates current record
+            if not line[:-1]:
+                active = []
+                continue
+            # remove optional comment and strip line
+            line = string.strip(line[:string.find(line, '#')])
+            if not line:
+                continue
+            line = re.split(' *: *', line)
+            if len(line) == 2:
+                line[0] = string.lower(line[0])
+                if line[0] == 'user-agent':
+                    # this record applies to this user agent
+                    if self.debug: print '>> user-agent:', line[1]
+                    active.append(line[1])
+                    if not self.rules.has_key(line[1]):
+                        self.rules[line[1]] = []
+                elif line[0] == 'disallow':
+                    if line[1]:
+                        if self.debug: print '>> disallow:', line[1]
+                        for agent in active:
+                            self.rules[agent].append(re.compile(line[1]))
+                    else:
+                        pass
+                        for agent in active:
+                            if self.debug: print '>> allow', agent
+                            self.rules[agent] = []
+                else:
+                    if self.debug: print '>> unknown:', line
+
+        self.modified()
+
+    # returns true if agent is allowed to fetch url
+    def can_fetch(self, useragent, url):
+        """using the parsed robots.txt decide if useragent can fetch url"""
+        import urlparse
+        ag = useragent
+        if not self.rules.has_key(ag): ag = '*'
+        if not self.rules.has_key(ag):
+            if self.debug: print '>> allowing', url, 'fetch by', useragent
+            return 1
+        path = urlparse.urlparse(url)[2]
+        for rule in self.rules[ag]:
+            if rule.match(path) is not None:
+                if self.debug: print '>> disallowing', url, 'fetch by', useragent
+                return 0
+        if self.debug: print '>> allowing', url, 'fetch by', useragent
+        return 1
+
+def _test():
+    rp = RobotFileParser()
+    rp.debug = 1
+    rp.set_url('http://www.musi-cal.com/robots.txt')
+    rp.read()
+    print rp.rules
+    print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
+    print rp.can_fetch('Musi-Cal-Robot',
+                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
+
+if __name__ == "__main__":
+    _test()
```
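Since `parse()` accepts any list of newline-terminated lines (exactly what `readlines()` returns), the class can be exercised without a network fetch. Below is a minimal usage sketch in the module's own Python 1.5-era style; the inline rules and the `example.com` URLs are placeholders for illustration, not part of the commit:

```python
from robotpar import RobotFileParser  # assumes the module is importable under its 8.3 name

# A toy robots.txt supplied inline, one newline-terminated string per line,
# just as urllib.urlopen(...).readlines() would produce.
lines = [
    "User-agent: *\n",
    "Disallow: /cgi-bin/\n",
]

rp = RobotFileParser()
rp.parse(lines)

print rp.can_fetch('*', 'http://example.com/cgi-bin/search')  # 0: path matches a rule
print rp.can_fetch('*', 'http://example.com/index.html')      # 1: no rule matches
```

Note that `can_fetch()` defaults to permissive: an unknown user agent falls back to the `'*'` record, and if no applicable record exists at all, the fetch is allowed.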
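One detail of `parse()` worth calling out: each `Disallow` value is handed directly to `re.compile()` and later tested with `rule.match(path)`, so rules behave as regular expressions anchored at the start of the URL path rather than as literal path prefixes. A small demonstration, with paths chosen purely for illustration:

```python
import re

# How can_fetch() tests a stored "Disallow: /private" rule against a path:
rule = re.compile('/private')
print rule.match('/private/data')  # match object: disallowed
print rule.match('/privateer')     # also matches -- regex prefix, not a path segment
print rule.match('/public')        # None: allowed
```

A consequence of this design is that regex metacharacters appearing in a robots.txt rule (`?`, `+`, parentheses) are interpreted as regex syntax rather than literally.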