diff options
Diffstat (limited to 'Tools/webchecker/robotparser.py')
-rw-r--r-- | Tools/webchecker/robotparser.py | 120 |
1 files changed, 60 insertions, 60 deletions
diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py index 634c3fe..6f85afa 100644 --- a/Tools/webchecker/robotparser.py +++ b/Tools/webchecker/robotparser.py @@ -9,79 +9,79 @@ fetchability of other URLs. class RobotFileParser: def __init__(self): - self.rules = {} - self.debug = 0 - self.url = '' - self.last_checked = 0 + self.rules = {} + self.debug = 0 + self.url = '' + self.last_checked = 0 def mtime(self): - return self.last_checked + return self.last_checked def modified(self): - import time - self.last_checked = time.time() + import time + self.last_checked = time.time() def set_url(self, url): - self.url = url -## import urlmisc -## self.url = urlmisc.canonical_url(url) + self.url = url +## import urlmisc +## self.url = urlmisc.canonical_url(url) def read(self): - import urllib - self.parse(urllib.urlopen(self.url).readlines()) + import urllib + self.parse(urllib.urlopen(self.url).readlines()) def parse(self, lines): - import regsub, string, regex - active = [] - for line in lines: - if self.debug: print '>', line, - # blank line terminates current record - if not line[:-1]: - active = [] - continue - # remove optional comment and strip line - line = string.strip(line[:string.find(line, '#')]) - if not line: - continue - line = regsub.split(line, ' *: *') - if len(line) == 2: - line[0] = string.lower(line[0]) - if line[0] == 'user-agent': - # this record applies to this user agent - if self.debug: print '>> user-agent:', line[1] - active.append(line[1]) - if not self.rules.has_key(line[1]): - self.rules[line[1]] = [] - elif line[0] == 'disallow': - if line[1]: - if self.debug: print '>> disallow:', line[1] - for agent in active: - self.rules[agent].append(regex.compile(line[1])) - else: - pass - for agent in active: - if self.debug: print '>> allow', agent - self.rules[agent] = [] - else: - if self.debug: print '>> unknown:', line + import regsub, string, regex + active = [] + for line in lines: + if self.debug: print '>', line, + # blank line terminates current record + if not line[:-1]: + active = [] + continue + # remove optional comment and strip line + line = string.strip(line[:string.find(line, '#')]) + if not line: + continue + line = regsub.split(line, ' *: *') + if len(line) == 2: + line[0] = string.lower(line[0]) + if line[0] == 'user-agent': + # this record applies to this user agent + if self.debug: print '>> user-agent:', line[1] + active.append(line[1]) + if not self.rules.has_key(line[1]): + self.rules[line[1]] = [] + elif line[0] == 'disallow': + if line[1]: + if self.debug: print '>> disallow:', line[1] + for agent in active: + self.rules[agent].append(regex.compile(line[1])) + else: + pass + for agent in active: + if self.debug: print '>> allow', agent + self.rules[agent] = [] + else: + if self.debug: print '>> unknown:', line - self.modified() + self.modified() # returns true if agent is allowed to fetch url def can_fetch(self, agent, url): - import urlparse - ag = agent - if not self.rules.has_key(ag): ag = '*' - if not self.rules.has_key(ag): - if self.debug: print '>> allowing', url, 'fetch by', agent - return 1 - path = urlparse.urlparse(url)[2] - for rule in self.rules[ag]: - if rule.match(path) != -1: - if self.debug: print '>> disallowing', url, 'fetch by', agent - return 0 - if self.debug: print '>> allowing', url, 'fetch by', agent - return 1 + import urlparse + ag = agent + if not self.rules.has_key(ag): ag = '*' + if not self.rules.has_key(ag): + if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 + path = urlparse.urlparse(url)[2] + for rule in self.rules[ag]: + if rule.match(path) != -1: + if self.debug: print '>> disallowing', url, 'fetch by', agent + return 0 + if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 def test(): rp = RobotFileParser() @@ -91,7 +91,7 @@ def test(): print rp.rules print rp.can_fetch('*', 'http://www.calendar.com/concerts/') print rp.can_fetch('Musi-Cal-Robot', - 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones') + 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones') print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/') print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001') |