author      Guido van Rossum <guido@python.org>    1997-01-30 03:18:23 (GMT)
committer   Guido van Rossum <guido@python.org>    1997-01-30 03:18:23 (GMT)
commit      bbf8c2fafd59a5ecac49b4487639df1d9190e7cb (patch)
tree        d283d9fecb17d7e5013af6419184a2e0152387c7 /Tools/webchecker
parent      272b37d6867723c56bea4dd10d7dc3b35d00665a (diff)
download    cpython-bbf8c2fafd59a5ecac49b4487639df1d9190e7cb.zip
            cpython-bbf8c2fafd59a5ecac49b4487639df1d9190e7cb.tar.gz
            cpython-bbf8c2fafd59a5ecac49b4487639df1d9190e7cb.tar.bz2
Skip Montanaro's robots.txt parser.
Diffstat (limited to 'Tools/webchecker')
-rw-r--r--  Tools/webchecker/robotparser.py |  97
1 file changed, 97 insertions, 0 deletions
diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py
new file mode 100644
index 0000000..634c3fe
--- /dev/null
+++ b/Tools/webchecker/robotparser.py
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+"""
+
+class RobotFileParser:
+
+ def __init__(self):
+ self.rules = {}
+ self.debug = 0
+ self.url = ''
+ self.last_checked = 0
+
+ def mtime(self):
+ return self.last_checked
+
+ def modified(self):
+ import time
+ self.last_checked = time.time()
+
+ def set_url(self, url):
+ self.url = url
+## import urlmisc
+## self.url = urlmisc.canonical_url(url)
+
+ def read(self):
+ import urllib
+ self.parse(urllib.urlopen(self.url).readlines())
+
+ def parse(self, lines):
+ import regsub, string, regex
+ active = []
+ for line in lines:
+ if self.debug: print '>', line,
+ # blank line terminates current record
+ if not line[:-1]:
+ active = []
+ continue
+ # remove optional comment and strip line
+ line = string.strip(line[:string.find(line, '#')])
+ if not line:
+ continue
+ line = regsub.split(line, ' *: *')
+ if len(line) == 2:
+ line[0] = string.lower(line[0])
+ if line[0] == 'user-agent':
+ # this record applies to this user agent
+ if self.debug: print '>> user-agent:', line[1]
+ active.append(line[1])
+ if not self.rules.has_key(line[1]):
+ self.rules[line[1]] = []
+ elif line[0] == 'disallow':
+ if line[1]:
+ if self.debug: print '>> disallow:', line[1]
+ for agent in active:
+ self.rules[agent].append(regex.compile(line[1]))
+ else:
+ pass
+ for agent in active:
+ if self.debug: print '>> allow', agent
+ self.rules[agent] = []
+ else:
+ if self.debug: print '>> unknown:', line
+
+ self.modified()
+
+ # returns true if agent is allowed to fetch url
+ def can_fetch(self, agent, url):
+ import urlparse
+ ag = agent
+ if not self.rules.has_key(ag): ag = '*'
+ if not self.rules.has_key(ag):
+ if self.debug: print '>> allowing', url, 'fetch by', agent
+ return 1
+ path = urlparse.urlparse(url)[2]
+ for rule in self.rules[ag]:
+ if rule.match(path) != -1:
+ if self.debug: print '>> disallowing', url, 'fetch by', agent
+ return 0
+ if self.debug: print '>> allowing', url, 'fetch by', agent
+ return 1
+
+def test():
+ rp = RobotFileParser()
+ rp.debug = 1
+ rp.set_url('http://www.automatrix.com/robots.txt')
+ rp.read()
+ print rp.rules
+ print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
+ print rp.can_fetch('Musi-Cal-Robot',
+ 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
+
+ print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
+ print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
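
The parser added here consumes robots.txt records of the form "User-agent: <name>" followed by
"Disallow: <path prefix>" lines, with a blank line ending each record; can_fetch() then checks the
path of a URL against the rules collected for the given agent, falling back to '*'. This file is
the ancestor of the robotparser module that later entered the standard library, so as a point of
comparison, here is a minimal sketch of the equivalent usage against urllib.robotparser in modern
Python 3 (the robots.txt URL and agent name below are illustrative, not taken from the commit):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url("https://www.python.org/robots.txt")  # illustrative URL
    rp.read()                                        # fetch and parse the rules
    # Ask whether a given user agent may fetch a given URL.
    print(rp.can_fetch("*", "https://www.python.org/"))
    print(rp.can_fetch("ExampleBot", "https://www.python.org/some/path"))

The method names (set_url, read, parse, can_fetch, mtime, modified) carried over essentially
unchanged from the class introduced in this commit.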