"""Robots.txt file parser class.

Accepts a list of lines or a robots.txt URL as input, builds a set of
rules from that list, then answers questions about the fetchability of
other URLs.
"""

import re
import sys
import time

try:  # Python 3 module layout
    from urllib.parse import urlparse
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from urllib import urlopen
    from urlparse import urlparse


class RobotFileParser:
    """Parse a robots.txt file and answer "may this agent fetch this URL?".

    Attributes:
        rules: dict mapping a user-agent name (exactly as written in the
            file) to a list of compiled patterns.  Each pattern matches a
            literal disallowed path prefix.  An empty list means nothing
            is disallowed for that agent.
        debug: when true, parsing and lookup decisions are echoed to
            standard output.
        url: location of the robots.txt file used by read().
        last_checked: time.time() value recorded by the last modified()
            call (0 if never called).
    """

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _debug(self, *parts):
        """Write the space-joined *parts* to stdout when debugging is on."""
        if self.debug:
            sys.stdout.write(' '.join([str(part) for part in parts]) + '\n')

    def mtime(self):
        """Return the time recorded by the most recent modified() call."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were refreshed."""
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL that read() will fetch the robots.txt file from."""
        self.url = url

    def read(self):
        """Fetch the robots.txt file at self.url and parse it.

        Any exception raised by urlopen() propagates to the caller.
        """
        handle = urlopen(self.url)
        try:
            lines = handle.readlines()
        finally:
            # Release the connection even when reading fails part-way.
            handle.close()
        self.parse(lines)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        lines is an iterable of text lines, with or without line
        terminators.  Byte strings (as returned by urlopen on Python 3)
        are decoded as UTF-8.  Rules are added to self.rules; a blank
        line ends the current record, and an empty "Disallow:" clears
        every rule collected so far for the record's agents.
        """
        active = []  # user agents named by the record being read
        for line in lines:
            if isinstance(line, bytes) and not isinstance(line, str):
                line = line.decode('utf-8', 'replace')
            self._debug('>', line.rstrip('\r\n'))

            # A blank line terminates the current record.  Testing the
            # stripped text handles "\n", "\r\n" and unterminated lines.
            if not line.strip():
                active = []
                continue

            # Remove the optional comment.  Only slice when a '#' is
            # present: find() returns -1 otherwise, and slicing with it
            # would chop off the last character of the line.
            hash_pos = line.find('#')
            if hash_pos >= 0:
                line = line[:hash_pos]
            line = line.strip()
            if not line:
                continue

            # Split on the first colon only, so that values which contain
            # a colon themselves are kept intact.
            fields = line.split(':', 1)
            if len(fields) != 2:
                self._debug('>> unknown:', fields)
                continue

            key = fields[0].strip().lower()
            value = fields[1].strip()
            if key == 'user-agent':
                # This record applies to this user agent.
                self._debug('>> user-agent:', value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif key == 'disallow':
                if value:
                    self._debug('>> disallow:', value)
                    # Paths are literal prefixes, not regular expressions;
                    # escape them so '+', '?', '.' etc. match themselves.
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    # An empty Disallow means everything is allowed.
                    for agent in active:
                        self._debug('>> allow', agent)
                        self.rules[agent] = []
        self.modified()

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url.

        The agent name is looked up exactly as given; if it has no rules
        of its own, the '*' record is used.  Returns 1 when the fetch is
        allowed and 0 when a disallow rule matches the start of the URL's
        path.
        """
        agent = useragent
        if agent not in self.rules:
            agent = '*'
        if agent not in self.rules:
            self._debug('>> allowing', url, 'fetch by', useragent)
            return 1
        # A URL without a path ("http://host") refers to the root.
        path = urlparse(url)[2] or '/'
        for rule in self.rules[agent]:
            if rule.match(path) is not None:
                self._debug('>> disallowing', url, 'fetch by', useragent)
                return 0
        self._debug('>> allowing', url, 'fetch by', useragent)
        return 1


def _test():
    """Fetch a live robots.txt and print a few decisions (needs network)."""
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()
    print(rp.rules)
    print(rp.can_fetch('*', 'http://www.musi-cal.com.com/'))
    print(rp.can_fetch('Musi-Cal-Robot',
                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'))


if __name__ == "__main__":
    _test()