| author | Guido van Rossum <guido@python.org> | 2000-05-08 17:31:04 (GMT) |
|---|---|---|
| committer | Guido van Rossum <guido@python.org> | 2000-05-08 17:31:04 (GMT) |
| commit | aad6761ccea28e0a0da6761570b18adc72e01c37 (patch) | |
| tree | 731b55d5648f08e1bc755bcace1f836413cd8aae /Lib/dos-8x3/robotpar.py | |
| parent | 0b095bc0929fb43157019c50e3e680a29ec94a65 (diff) | |
The usual...
Diffstat (limited to 'Lib/dos-8x3/robotpar.py')
-rw-r--r-- | Lib/dos-8x3/robotpar.py | 97 |
1 file changed, 97 insertions(+), 0 deletions(-)
```diff
diff --git a/Lib/dos-8x3/robotpar.py b/Lib/dos-8x3/robotpar.py
new file mode 100644
index 0000000..3f4396b
--- /dev/null
+++ b/Lib/dos-8x3/robotpar.py
@@ -0,0 +1,97 @@
+"""
+
+Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
+input, builds a set of rules from that list, then answers questions about
+fetchability of other URLs.
+
+"""
+
+class RobotFileParser:
+
+    def __init__(self):
+        self.rules = {}
+        self.debug = 0
+        self.url = ''
+        self.last_checked = 0
+
+    def mtime(self):
+        return self.last_checked
+
+    def modified(self):
+        import time
+        self.last_checked = time.time()
+
+    def set_url(self, url):
+        self.url = url
+
+    def read(self):
+        import urllib
+        self.parse(urllib.urlopen(self.url).readlines())
+
+    def parse(self, lines):
+        """parse the input lines from a robot.txt file"""
+        import string, re
+        active = []
+        for line in lines:
+            if self.debug: print '>', line,
+            # blank line terminates current record
+            if not line[:-1]:
+                active = []
+                continue
+            # remove optional comment and strip line
+            line = string.strip(line[:string.find(line, '#')])
+            if not line:
+                continue
+            line = re.split(' *: *', line)
+            if len(line) == 2:
+                line[0] = string.lower(line[0])
+                if line[0] == 'user-agent':
+                    # this record applies to this user agent
+                    if self.debug: print '>> user-agent:', line[1]
+                    active.append(line[1])
+                    if not self.rules.has_key(line[1]):
+                        self.rules[line[1]] = []
+                elif line[0] == 'disallow':
+                    if line[1]:
+                        if self.debug: print '>> disallow:', line[1]
+                        for agent in active:
+                            self.rules[agent].append(re.compile(line[1]))
+                    else:
+                        pass
+                        for agent in active:
+                            if self.debug: print '>> allow', agent
+                            self.rules[agent] = []
+                else:
+                    if self.debug: print '>> unknown:', line
+
+        self.modified()
+
+    # returns true if agent is allowed to fetch url
+    def can_fetch(self, useragent, url):
+        """using the parsed robots.txt decide if useragent can fetch url"""
+        import urlparse
+        ag = useragent
+        if not self.rules.has_key(ag): ag = '*'
+        if not self.rules.has_key(ag):
+            if self.debug: print '>> allowing', url, 'fetch by', useragent
+            return 1
+        path = urlparse.urlparse(url)[2]
+        for rule in self.rules[ag]:
+            if rule.match(path) is not None:
+                if self.debug: print '>> disallowing', url, 'fetch by', useragent
+                return 0
+        if self.debug: print '>> allowing', url, 'fetch by', useragent
+        return 1
+
+def _test():
+    rp = RobotFileParser()
+    rp.debug = 1
+    rp.set_url('http://www.musi-cal.com/robots.txt')
+    rp.read()
+    print rp.rules
+    print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
+    print rp.can_fetch('Musi-Cal-Robot',
+                       'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
+
+if __name__ == "__main__":
+    _test()
```
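Since `parse()` accepts any list of newline-terminated lines (exactly what `readlines()` returns), the class can be exercised without a network fetch. Below is a minimal usage sketch in the module's own Python 1.5-era style; the inline rules and the `example.com` URLs are placeholders for illustration, not part of the commit:

```python
from robotpar import RobotFileParser  # assumes the module is importable under its 8.3 name

# A toy robots.txt supplied inline, one newline-terminated string per line,
# just as urllib.urlopen(...).readlines() would produce.
lines = [
    "User-agent: *\n",
    "Disallow: /cgi-bin/\n",
]

rp = RobotFileParser()
rp.parse(lines)

print rp.can_fetch('*', 'http://example.com/cgi-bin/search')  # 0: path matches a rule
print rp.can_fetch('*', 'http://example.com/index.html')      # 1: no rule matches
```

Note that `can_fetch()` defaults to permissive: an unknown user agent falls back to the `'*'` record, and if no applicable record exists at all, the fetch is allowed.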
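One detail of `parse()` worth calling out: each `Disallow` value is handed directly to `re.compile()` and later tested with `rule.match(path)`, so rules behave as regular expressions anchored at the start of the URL path rather than as literal path prefixes. A small demonstration, with paths chosen purely for illustration:

```python
import re

# How can_fetch() tests a stored "Disallow: /private" rule against a path:
rule = re.compile('/private')
print rule.match('/private/data')  # match object: disallowed
print rule.match('/privateer')     # also matches -- regex prefix, not a path segment
print rule.match('/public')        # None: allowed
```

A consequence of this design is that regex metacharacters appearing in a robots.txt rule (`?`, `+`, parentheses) are interpreted as regex syntax rather than literally.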