From bbf8c2fafd59a5ecac49b4487639df1d9190e7cb Mon Sep 17 00:00:00 2001
From: Guido van Rossum
Date: Thu, 30 Jan 1997 03:18:23 +0000
Subject: Skip Montanaro's robots.txt parser.

---
 Lib/robotparser.py              | 97 +++++++++++++++++++++++++++++++++++++++++
 Tools/webchecker/robotparser.py | 97 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 Lib/robotparser.py
 create mode 100644 Tools/webchecker/robotparser.py

diff --git a/Lib/robotparser.py b/Lib/robotparser.py
new file mode 100644
index 0000000..634c3fe
--- /dev/null
+++ b/Lib/robotparser.py
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+"""
+
+class RobotFileParser:
+
+    def __init__(self):
+        self.rules = {}
+        self.debug = 0
+        self.url = ''
+        self.last_checked = 0
+
+    def mtime(self):
+        return self.last_checked
+
+    def modified(self):
+        import time
+        self.last_checked = time.time()
+
+    def set_url(self, url):
+        self.url = url
+##      import urlmisc
+##      self.url = urlmisc.canonical_url(url)
+
+    def read(self):
+        import urllib
+        self.parse(urllib.urlopen(self.url).readlines())
+
+    def parse(self, lines):
+        import regsub, string, regex
+        active = []
+        for line in lines:
+            if self.debug: print '>', line,
+            # blank line terminates current record
+            if not line[:-1]:
+                active = []
+                continue
+            # remove optional comment and strip line
+            line = string.strip(line[:string.find(line, '#')])
+            if not line:
+                continue
+            line = regsub.split(line, ' *: *')
+            if len(line) == 2:
+                line[0] = string.lower(line[0])
+                if line[0] == 'user-agent':
+                    # this record applies to this user agent
+                    if self.debug: print '>> user-agent:', line[1]
+                    active.append(line[1])
+                    if not self.rules.has_key(line[1]):
+                        self.rules[line[1]] = []
+                elif line[0] == 'disallow':
+                    if line[1]:
+                        if self.debug: print '>> disallow:', line[1]
+                        for agent in active:
+                            self.rules[agent].append(regex.compile(line[1]))
+                    else:
+                        pass
+                        for agent in active:
+                            if self.debug: print '>> allow', agent
+                            self.rules[agent] = []
+                else:
+                    if self.debug: print '>> unknown:', line
+
+        self.modified()
+
+    # returns true if agent is allowed to fetch url
+    def can_fetch(self, agent, url):
+        import urlparse
+        ag = agent
+        if not self.rules.has_key(ag): ag = '*'
+        if not self.rules.has_key(ag):
+            if self.debug: print '>> allowing', url, 'fetch by', agent
+            return 1
+        path = urlparse.urlparse(url)[2]
+        for rule in self.rules[ag]:
+            if rule.match(path) != -1:
+                if self.debug: print '>> disallowing', url, 'fetch by', agent
+                return 0
+        if self.debug: print '>> allowing', url, 'fetch by', agent
+        return 1
+
+def test():
+    rp = RobotFileParser()
+    rp.debug = 1
+    rp.set_url('http://www.automatrix.com/robots.txt')
+    rp.read()
+    print rp.rules
+    print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
+    print rp.can_fetch('Musi-Cal-Robot',
+                       'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
+
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
+    print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py
new file mode 100644
index 0000000..634c3fe
--- /dev/null
+++ b/Tools/webchecker/robotparser.py
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+ +""" + +class RobotFileParser: + + def __init__(self): + self.rules = {} + self.debug = 0 + self.url = '' + self.last_checked = 0 + + def mtime(self): + return self.last_checked + + def modified(self): + import time + self.last_checked = time.time() + + def set_url(self, url): + self.url = url +## import urlmisc +## self.url = urlmisc.canonical_url(url) + + def read(self): + import urllib + self.parse(urllib.urlopen(self.url).readlines()) + + def parse(self, lines): + import regsub, string, regex + active = [] + for line in lines: + if self.debug: print '>', line, + # blank line terminates current record + if not line[:-1]: + active = [] + continue + # remove optional comment and strip line + line = string.strip(line[:string.find(line, '#')]) + if not line: + continue + line = regsub.split(line, ' *: *') + if len(line) == 2: + line[0] = string.lower(line[0]) + if line[0] == 'user-agent': + # this record applies to this user agent + if self.debug: print '>> user-agent:', line[1] + active.append(line[1]) + if not self.rules.has_key(line[1]): + self.rules[line[1]] = [] + elif line[0] == 'disallow': + if line[1]: + if self.debug: print '>> disallow:', line[1] + for agent in active: + self.rules[agent].append(regex.compile(line[1])) + else: + pass + for agent in active: + if self.debug: print '>> allow', agent + self.rules[agent] = [] + else: + if self.debug: print '>> unknown:', line + + self.modified() + + # returns true if agent is allowed to fetch url + def can_fetch(self, agent, url): + import urlparse + ag = agent + if not self.rules.has_key(ag): ag = '*' + if not self.rules.has_key(ag): + if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 + path = urlparse.urlparse(url)[2] + for rule in self.rules[ag]: + if rule.match(path) != -1: + if self.debug: print '>> disallowing', url, 'fetch by', agent + return 0 + if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 + +def test(): + rp = RobotFileParser() + rp.debug = 1 + rp.set_url('http://www.automatrix.com/robots.txt') + rp.read() + print rp.rules + print rp.can_fetch('*', 'http://www.calendar.com/concerts/') + print rp.can_fetch('Musi-Cal-Robot', + 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones') + + print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/') + print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001') -- cgit v0.12