author	Guido van Rossum <guido@python.org>	2000-03-29 16:02:45 (GMT)
committer	Guido van Rossum <guido@python.org>	2000-03-29 16:02:45 (GMT)
commit	918429b3b286645884d222b48eea8d7cf8fa7556 (patch)
tree	d0f6bbb74e1ad7f7ac983dacb6cecf0ff8bd7783 /Tools/webchecker/robotparser.py
parent	1d16a9f040aadbfff9a0628d185c44c4159ce0b3 (diff)
Moved robotparser.py to the Lib directory.
If you do a "cvs update" in the Lib directory, it will pop up there.
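
After this change the parser is importable as the standard robotparser module instead of from Tools/webchecker. A minimal sketch of the URL-driven usage, in the Python 2 style of the era (the host below is illustrative, not taken from the commit):

import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('http://www.example.com/robots.txt')  # illustrative URL
rp.read()  # fetch and parse the live robots.txt
print(rp.can_fetch('*', 'http://www.example.com/some/page.html'))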
Diffstat (limited to 'Tools/webchecker/robotparser.py')
-rw-r--r--	Tools/webchecker/robotparser.py	97
1 file changed, 0 insertions, 97 deletions
diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py
deleted file mode 100644
index 3f4396b..0000000
--- a/Tools/webchecker/robotparser.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-
-Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
-input, builds a set of rules from that list, then answers questions about
-fetchability of other URLs.
-
-"""
-
-class RobotFileParser:
-
- def __init__(self):
- self.rules = {}
- self.debug = 0
- self.url = ''
- self.last_checked = 0
-
- def mtime(self):
- return self.last_checked
-
- def modified(self):
- import time
- self.last_checked = time.time()
-
- def set_url(self, url):
- self.url = url
-
- def read(self):
- import urllib
- self.parse(urllib.urlopen(self.url).readlines())
-
- def parse(self, lines):
- """parse the input lines from a robot.txt file"""
- import string, re
- active = []
- for line in lines:
- if self.debug: print '>', line,
- # blank line terminates current record
- if not line[:-1]:
- active = []
- continue
- # remove optional comment and strip line
- line = string.strip(line[:string.find(line, '#')])
- if not line:
- continue
- line = re.split(' *: *', line)
- if len(line) == 2:
- line[0] = string.lower(line[0])
- if line[0] == 'user-agent':
- # this record applies to this user agent
- if self.debug: print '>> user-agent:', line[1]
- active.append(line[1])
- if not self.rules.has_key(line[1]):
- self.rules[line[1]] = []
- elif line[0] == 'disallow':
- if line[1]:
- if self.debug: print '>> disallow:', line[1]
- for agent in active:
- self.rules[agent].append(re.compile(line[1]))
- else:
- pass
- for agent in active:
- if self.debug: print '>> allow', agent
- self.rules[agent] = []
- else:
- if self.debug: print '>> unknown:', line
-
- self.modified()
-
- # returns true if agent is allowed to fetch url
- def can_fetch(self, useragent, url):
- """using the parsed robots.txt decide if useragent can fetch url"""
- import urlparse
- ag = useragent
- if not self.rules.has_key(ag): ag = '*'
- if not self.rules.has_key(ag):
- if self.debug: print '>> allowing', url, 'fetch by', useragent
- return 1
- path = urlparse.urlparse(url)[2]
- for rule in self.rules[ag]:
- if rule.match(path) is not None:
- if self.debug: print '>> disallowing', url, 'fetch by', useragent
- return 0
- if self.debug: print '>> allowing', url, 'fetch by', useragent
- return 1
-
-def _test():
- rp = RobotFileParser()
- rp.debug = 1
- rp.set_url('http://www.musi-cal.com/robots.txt')
- rp.read()
- print rp.rules
- print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
- print rp.can_fetch('Musi-Cal-Robot',
- 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
-
-if __name__ == "__main__":
- _test()
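
For reference, a short driver for the class removed above, feeding parse() a robots.txt body as a list of lines instead of fetching a URL; the rules and URLs are made up for illustration, and the module is assumed importable as robotparser after this commit:

from robotparser import RobotFileParser

rp = RobotFileParser()
# Each line keeps its trailing newline: parse() relies on line[:-1] to
# detect blank lines, so a missing newline would drop the last character.
rp.parse([
    'User-agent: *\n',
    'Disallow: /cgi-bin/\n',
])
print(rp.can_fetch('*', 'http://www.example.com/cgi-bin/event-search'))  # disallowed
print(rp.can_fetch('*', 'http://www.example.com/about.html'))            # allowed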