diff options
author | Guido van Rossum <guido@python.org> | 1997-01-30 03:18:23 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1997-01-30 03:18:23 (GMT) |
commit | bbf8c2fafd59a5ecac49b4487639df1d9190e7cb (patch) | |
tree | d283d9fecb17d7e5013af6419184a2e0152387c7 /Tools/webchecker | |
parent | 272b37d6867723c56bea4dd10d7dc3b35d00665a (diff) | |
download | cpython-bbf8c2fafd59a5ecac49b4487639df1d9190e7cb.zip cpython-bbf8c2fafd59a5ecac49b4487639df1d9190e7cb.tar.gz cpython-bbf8c2fafd59a5ecac49b4487639df1d9190e7cb.tar.bz2 |
Skip Montanaro's robots.txt parser.
Diffstat (limited to 'Tools/webchecker')
-rw-r--r-- | Tools/webchecker/robotparser.py | 97 |
1 file changed, 97 insertions, 0 deletions
diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py new file mode 100644 index 0000000..634c3fe --- /dev/null +++ b/Tools/webchecker/robotparser.py @@ -0,0 +1,97 @@ +""" + +Robots.txt file parser class. Accepts a list of lines or robots.txt URL as +input, builds a set of rules from that list, then answers questions about +fetchability of other URLs. + +""" + +class RobotFileParser: + + def __init__(self): + self.rules = {} + self.debug = 0 + self.url = '' + self.last_checked = 0 + + def mtime(self): + return self.last_checked + + def modified(self): + import time + self.last_checked = time.time() + + def set_url(self, url): + self.url = url +## import urlmisc +## self.url = urlmisc.canonical_url(url) + + def read(self): + import urllib + self.parse(urllib.urlopen(self.url).readlines()) + + def parse(self, lines): + import regsub, string, regex + active = [] + for line in lines: + if self.debug: print '>', line, + # blank line terminates current record + if not line[:-1]: + active = [] + continue + # remove optional comment and strip line + line = string.strip(line[:string.find(line, '#')]) + if not line: + continue + line = regsub.split(line, ' *: *') + if len(line) == 2: + line[0] = string.lower(line[0]) + if line[0] == 'user-agent': + # this record applies to this user agent + if self.debug: print '>> user-agent:', line[1] + active.append(line[1]) + if not self.rules.has_key(line[1]): + self.rules[line[1]] = [] + elif line[0] == 'disallow': + if line[1]: + if self.debug: print '>> disallow:', line[1] + for agent in active: + self.rules[agent].append(regex.compile(line[1])) + else: + pass + for agent in active: + if self.debug: print '>> allow', agent + self.rules[agent] = [] + else: + if self.debug: print '>> unknown:', line + + self.modified() + + # returns true if agent is allowed to fetch url + def can_fetch(self, agent, url): + import urlparse + ag = agent + if not self.rules.has_key(ag): ag = '*' + if not self.rules.has_key(ag): + 
if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 + path = urlparse.urlparse(url)[2] + for rule in self.rules[ag]: + if rule.match(path) != -1: + if self.debug: print '>> disallowing', url, 'fetch by', agent + return 0 + if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 + +def test(): + rp = RobotFileParser() + rp.debug = 1 + rp.set_url('http://www.automatrix.com/robots.txt') + rp.read() + print rp.rules + print rp.can_fetch('*', 'http://www.calendar.com/concerts/') + print rp.can_fetch('Musi-Cal-Robot', + 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones') + + print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/') + print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001') |