| field | value | date |
|---|---|---|
| author | Skip Montanaro <skip@pobox.com> | 2008-04-28 03:27:53 (GMT) |
| committer | Skip Montanaro <skip@pobox.com> | 2008-04-28 03:27:53 (GMT) |
| commit | b8bdbc04e702409e5aaaaff74c6e5cd93226af07 (patch) | |
| tree | 90e02f7cfb9fcb6f980a9f8ccad46bf7e6ab27e1 /Lib/robotparser.py | |
| parent | dfd982715bc81103dfcb3eecdccff32675a772a3 (diff) | |
Get rid of _test(), _main(), _debug() and _check(). The inline tests are no
longer needed (a better set is available in Lib/test/test_robotparser.py).
Clean up a few PEP 8 nits (compound statements on a single line, whitespace
around operators).
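
For reference, the removed `_check()` helper compared live `can_fetch()` results against expected values over the network. Below is a minimal sketch of how one of those checks translates into the offline `unittest` style of the replacement suite; the class name, rules, and URLs are illustrative assumptions, not the actual contents of Lib/test/test_robotparser.py:

```python
# Hypothetical unittest-style equivalent of a removed _check() call
# (Python 2). The robots.txt rules and URLs below are made up.
import unittest
import robotparser

class CatchAllAgentTest(unittest.TestCase):
    def setUp(self):
        # parse() accepts a list of robots.txt lines, so no network is needed
        self.rp = robotparser.RobotFileParser()
        self.rp.parse(['User-agent: *',
                       'Disallow: /cgi-bin/'])

    def test_disallowed(self):
        # roughly the old _check(rp.can_fetch(...), 0)
        self.assertFalse(self.rp.can_fetch(
            'spam', 'http://example.com/cgi-bin/event-search'))

    def test_allowed(self):
        # roughly the old _check(rp.can_fetch(...), 1)
        self.assertTrue(self.rp.can_fetch(
            'spam', 'http://example.com/concerts.html'))

if __name__ == '__main__':
    unittest.main()
```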
Diffstat (limited to 'Lib/robotparser.py')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Lib/robotparser.py | 105 |

1 file changed, 12 insertions(+), 93 deletions(-)
```diff
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index 52ab348..5b1d797 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -9,15 +9,11 @@
     The robots.txt Exclusion Protocol is implemented as specified in
     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
-import urlparse,urllib
+import urlparse
+import urllib
 
 __all__ = ["RobotFileParser"]
 
-debug = 0
-
-def _debug(msg):
-    if debug: print msg
-
 
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
@@ -67,12 +63,9 @@ class RobotFileParser:
         self.errcode = opener.errcode
         if self.errcode in (401, 403):
             self.disallow_all = True
-            _debug("disallow all")
         elif self.errcode >= 400:
             self.allow_all = True
-            _debug("allow all")
         elif self.errcode == 200 and lines:
-            _debug("parse lines")
             self.parse(lines)
 
     def _add_entry(self, entry):
@@ -93,19 +86,16 @@ class RobotFileParser:
         for line in lines:
             linenumber = linenumber + 1
             if not line:
-                if state==1:
-                    _debug("line %d: warning: you should insert"
-                           " allow: or disallow: directives below any"
-                           " user-agent: line" % linenumber)
+                if state == 1:
                     entry = Entry()
                     state = 0
-                elif state==2:
+                elif state == 2:
                     self._add_entry(entry)
                     entry = Entry()
                     state = 0
             # remove optional comment and strip line
             i = line.find('#')
-            if i>=0:
+            if i >= 0:
                 line = line[:i]
             line = line.strip()
             if not line:
@@ -115,41 +105,24 @@ class RobotFileParser:
                 line[0] = line[0].strip().lower()
                 line[1] = urllib.unquote(line[1].strip())
                 if line[0] == "user-agent":
-                    if state==2:
-                        _debug("line %d: warning: you should insert a blank"
-                               " line before any user-agent"
-                               " directive" % linenumber)
+                    if state == 2:
                         self._add_entry(entry)
                         entry = Entry()
                     entry.useragents.append(line[1])
                     state = 1
                 elif line[0] == "disallow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], False))
                         state = 2
                 elif line[0] == "allow":
-                    if state==0:
-                        _debug("line %d: error: you must insert a user-agent:"
-                               " directive before this line" % linenumber)
-                    else:
+                    if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
-                else:
-                    _debug("line %d: warning: unknown key %s" % (linenumber,
-                               line[0]))
-            else:
-                _debug("line %d: error: malformed line %s"%(linenumber, line))
-        if state==2:
+        if state == 2:
             self.entries.append(entry)
-        _debug("Parsed rules:\n%s" % str(self))
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
-               (useragent, url))
         if self.disallow_all:
             return False
         if self.allow_all:
             return True
@@ -182,10 +155,10 @@ class RuleLine:
         self.allowance = allowance
 
     def applies_to(self, filename):
-        return self.path=="*" or filename.startswith(self.path)
+        return self.path == "*" or filename.startswith(self.path)
 
     def __str__(self):
-        return (self.allowance and "Allow" or "Disallow")+": "+self.path
+        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
 
 
 class Entry:
@@ -207,7 +180,7 @@ class Entry:
         # split the name token and make it lower case
         useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=='*':
+            if agent == '*':
                 # we have the catch-all agent
                 return True
             agent = agent.lower()
@@ -220,7 +193,6 @@ class Entry:
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
-            _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return True
@@ -239,56 +211,3 @@ class URLopener(urllib.FancyURLopener):
         self.errcode = errcode
         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)
-
-def _check(a,b):
-    if not b:
-        ac = "access denied"
-    else:
-        ac = "access allowed"
-    if a!=b:
-        print "failed"
-    else:
-        print "ok (%s)" % ac
-    print
-
-def _test():
-    global debug
-    rp = RobotFileParser()
-    debug = 1
-
-    # robots.txt that exists, gotten to by redirection
-    rp.set_url('http://www.musi-cal.com/robots.txt')
-    rp.read()
-
-    # test for re.escape
-    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
-    # this should match the first rule, which is a disallow
-    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
-    # various cherry pickers
-    _check(rp.can_fetch('CherryPickerSE',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.0',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    _check(rp.can_fetch('CherryPickerSE/1.5',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco'), 0)
-    # case sensitivity
-    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
-    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
-    # substring test
-    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
-    # tests for catch-all * agent
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
-    # robots.txt that does not exist
-    rp.set_url('http://www.lycos.com/robots.txt')
-    rp.read()
-    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
-    _test()
```
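
Note how `Entry.allowance()` in the diff returns the allowance of the first `RuleLine` whose path prefix matches, so rule order in robots.txt decides ties between overlapping prefixes. A small offline sketch of that first-match behavior (Python 2; the rules and example.com URLs are made-up illustrations, not data from this commit):

```python
# First matching rule wins, so a more specific Allow must precede
# the broader Disallow to take effect. All data here is illustrative.
import robotparser

rp = robotparser.RobotFileParser()
rp.parse(['User-agent: *',
          'Allow: /cgi-bin/public/',
          'Disallow: /cgi-bin/'])

print rp.can_fetch('anybot', 'http://example.com/cgi-bin/public/x')   # True
print rp.can_fetch('anybot', 'http://example.com/cgi-bin/private/x')  # False
```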
