diff options
Diffstat (limited to 'Tools/webchecker')
-rw-r--r-- | Tools/webchecker/robotparser.py | 97 |
1 files changed, 0 insertions, 97 deletions
# Reconstructed from the deleted-file diff of Tools/webchecker/robotparser.py
# (blob 3f4396b, mode 100644); the diff markers and flattened layout of the
# scraped page have been removed so the module is readable and importable.
"""Robots.txt file parser class.

Accepts a list of lines or a robots.txt URL as input, builds a set of
rules from that list, then answers questions about fetchability of
other URLs.
"""

import re
import time

try:
    from urllib.request import urlopen
    from urllib.parse import urlparse
except ImportError:  # Python 2 fallback; keeps the module usable there
    from urllib import urlopen
    from urlparse import urlparse


class RobotFileParser:
    """Parse a robots.txt file and answer "may this agent fetch this URL?".

    Attributes:
        rules: dict mapping a user-agent name (exactly as written in the
            file) to a list of compiled patterns, one per disallowed path
            prefix.  An empty list means nothing is disallowed.
        debug: when true, trace parsing and decisions on standard output.
        url: location of the robots.txt file used by read().
        last_checked: time.time() of the last completed parse(), or 0.
    """

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _trace(self, *parts):
        """Print a space-separated debug line when self.debug is set."""
        if self.debug:
            print(' '.join([str(part) for part in parts]))

    def mtime(self):
        """Return the time the rules were last parsed (0 if never)."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were parsed."""
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL that read() will fetch."""
        self.url = url

    def read(self):
        """Fetch self.url and feed its lines to parse().

        Raises whatever urlopen() raises when the URL cannot be opened.
        """
        stream = urlopen(self.url)
        try:
            raw_lines = stream.readlines()
        finally:
            # The original never closed the handle; release it even if
            # reading fails part-way.
            stream.close()
        lines = []
        for raw in raw_lines:
            # On Python 3 the response yields bytes; parse() works on text.
            if not isinstance(raw, str):
                raw = raw.decode('utf-8', 'replace')
            lines.append(raw)
        self.parse(lines)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        Args:
            lines: iterable of text lines, with or without line
                terminators ("\\n" or "\\r\\n").

        Each "User-agent" line adds an agent to the current record, each
        non-empty "Disallow" line adds a literal path prefix to every agent
        of the record, and an empty "Disallow" clears the rules of those
        agents.  A blank line ends the current record.  Other fields are
        ignored.
        """
        active = []
        for raw in lines:
            text = raw.rstrip('\r\n')
            self._trace('>', text)
            # A blank line terminates the current record.  Only the line
            # terminator is removed first, so "\r\n" counts as blank and a
            # final line without a newline is not mistaken for one.
            if not text:
                active = []
                continue
            # Remove the optional comment, then surrounding whitespace.
            # Splitting (rather than slicing at find()) is safe when the
            # line has no '#' at all.
            text = text.split('#', 1)[0].strip()
            if not text:
                continue
            # Split on the first colon only: values may contain colons.
            pieces = text.split(':', 1)
            if len(pieces) != 2:
                self._trace('>> unknown:', pieces)
                continue
            field = pieces[0].strip().lower()
            value = pieces[1].strip()
            if field == 'user-agent':
                # This record applies to this user agent.
                self._trace('>> user-agent:', value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif field == 'disallow':
                if value:
                    self._trace('>> disallow:', value)
                    # Paths are literal prefixes, not regular expressions;
                    # escaping keeps '.', '?', '+' etc. from being treated
                    # as metacharacters while rule.match() still performs
                    # the prefix test.
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    # An empty Disallow means everything is allowed.
                    for agent in active:
                        self._trace('>> allow', agent)
                        self.rules[agent] = []

        self.modified()

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url.

        Rules for the exact useragent name are used when present, otherwise
        the rules for '*'.  Returns 1 when the fetch is allowed and 0 when
        a disallowed prefix matches the URL's path (including any
        parameters and query string).
        """
        agent = useragent
        if agent not in self.rules:
            agent = '*'
        if agent not in self.rules:
            self._trace('>> allowing', url, 'fetch by', useragent)
            return 1
        parts = urlparse(url)
        # An empty path is the site root; without this "Disallow: /" would
        # not cover "http://host".
        target = parts[2] or '/'
        if parts[3]:
            target = target + ';' + parts[3]
        if parts[4]:
            # Include the query so rules that mention '?' can match.
            target = target + '?' + parts[4]
        for rule in self.rules[agent]:
            if rule.match(target) is not None:
                self._trace('>> disallowing', url, 'fetch by', useragent)
                return 0
        self._trace('>> allowing', url, 'fetch by', useragent)
        return 1


def _test():
    """Fetch a live robots.txt and print the verdict for two sample URLs."""
    rp = RobotFileParser()
    rp.debug = 1
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()
    print(rp.rules)
    print(rp.can_fetch('*', 'http://www.musi-cal.com.com/'))
    print(rp.can_fetch('Musi-Cal-Robot',
                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'))


if __name__ == "__main__":
    _test()