# Lib/robotparser.py

"""

Robots.txt file parser class.  Accepts a list of lines or robots.txt URL as
input, builds a set of rules from that list, then answers questions about
fetchability of other URLs.

"""

class RobotFileParser:
    """Parse a robots.txt file and answer "may this agent fetch this URL?".

    Instance attributes:
      rules         dict mapping a user-agent name (exactly as written in
                    the file) to a list of compiled patterns, one per
                    disallowed path prefix.  An empty list means nothing is
                    disallowed for that agent.
      debug         when true, progress messages are written to stdout.
      url           location of the robots.txt file used by read().
      last_checked  time.time() value of the last parse, or 0 if never.
    """

    def __init__(self):
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _trace(self, *args):
        """Write the space-joined arguments to stdout if debugging is on."""
        if self.debug:
            import sys
            sys.stdout.write(' '.join([str(a) for a in args]) + '\n')

    def mtime(self):
        """Return the time of the last parse (0 if nothing was parsed yet)."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were refreshed."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL of the robots.txt file that read() will fetch."""
        self.url = url

    def read(self):
        """Fetch the robots.txt file at self.url and feed it to parse().

        Errors raised while opening or reading the URL propagate to the
        caller.
        """
        try:
            from urllib import urlopen              # Python 2
        except ImportError:
            from urllib.request import urlopen      # Python 3
        stream = urlopen(self.url)
        try:
            fetched = stream.readlines()
        finally:
            # always release the connection, even if reading fails
            stream.close()
        # Python 3 hands back bytes; parse() works on text.
        lines = []
        for raw in fetched:
            if not isinstance(raw, str):
                raw = raw.decode('utf-8', 'replace')
            lines.append(raw)
        self.parse(lines)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        lines is an iterable of strings, with or without line terminators.
        Rules are added to self.rules (existing entries are kept) and the
        modification time is updated.

        A blank or whitespace-only line ends the current record.  Text
        after '#' is a comment.  Only the 'User-agent' and 'Disallow'
        fields are understood; field names are case-insensitive.  A
        Disallow value is a literal path prefix; an empty value clears the
        rules collected so far for the record's agents.
        """
        import re
        active = []
        for raw in lines:
            self._trace('>', raw.rstrip('\r\n'))
            # blank line terminates current record (also CRLF / spaces)
            if not raw.strip():
                active = []
                continue
            # remove optional comment; only cut when a '#' is really
            # present, otherwise the last character would be lost
            cut = raw.find('#')
            if cut >= 0:
                raw = raw[:cut]
            text = raw.strip()
            if not text:
                # comment-only line: does not end the record
                continue
            # split at the first colon only, so values may contain ':'
            pieces = text.split(':', 1)
            if len(pieces) != 2:
                continue
            field = pieces[0].strip().lower()
            value = pieces[1].strip()
            if field == 'user-agent':
                # this record applies to this user agent
                self._trace('>> user-agent:', value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif field == 'disallow':
                if value:
                    self._trace('>> disallow:', value)
                    # the value is a literal prefix, not a regular
                    # expression, so escape it before compiling
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    # empty Disallow: everything is allowed
                    for agent in active:
                        self._trace('>> allow', agent)
                        self.rules[agent] = []
            else:
                self._trace('>> unknown:', field, value)

        self.modified()

    # returns true if agent is allowed to fetch url
    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url.

        useragent is looked up verbatim in self.rules, falling back to the
        '*' record.  Only the path component of url is compared against
        the disallowed prefixes; a URL without a path counts as '/'.

        Returns 1 if fetching is allowed, 0 otherwise.
        """
        try:
            from urlparse import urlparse           # Python 2
        except ImportError:
            from urllib.parse import urlparse       # Python 3
        agent = useragent
        if agent not in self.rules:
            agent = '*'
        if agent not in self.rules:
            self._trace('>> allowing', url, 'fetch by', useragent)
            return 1
        # 'http://host' has an empty path; treat it as the site root so
        # that 'Disallow: /' applies to it as well
        path = urlparse(url)[2] or '/'
        for rule in self.rules[agent]:
            if rule.match(path) is not None:
                self._trace('>> disallowing', url, 'fetch by', useragent)
                return 0
        self._trace('>> allowing', url, 'fetch by', useragent)
        return 1

def _test():
    """Fetch a live robots.txt with debugging on and print a few verdicts.

    Needs network access: the rules are downloaded from www.musi-cal.com.
    """
    parser = RobotFileParser()
    parser.debug = 1
    parser.set_url('http://www.musi-cal.com/robots.txt')
    parser.read()
    print(parser.rules)
    # NOTE(review): the host 'www.musi-cal.com.com' looks like a typo; it
    # is harmless because can_fetch only looks at the path of the URL.
    root_verdict = parser.can_fetch('*', 'http://www.musi-cal.com.com/')
    print(root_verdict)
    search_url = 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'
    search_verdict = parser.can_fetch('Musi-Cal-Robot', search_url)
    print(search_verdict)

# Run the network-backed self test only when executed as a script,
# never on import.
if __name__ == "__main__":
    _test()