author    Skip Montanaro <skip@pobox.com>  2008-04-28 03:27:53 (GMT)
committer Skip Montanaro <skip@pobox.com>  2008-04-28 03:27:53 (GMT)
commit    b8bdbc04e702409e5aaaaff74c6e5cd93226af07 (patch)
tree      90e02f7cfb9fcb6f980a9f8ccad46bf7e6ab27e1 /Lib/robotparser.py
parent    dfd982715bc81103dfcb3eecdccff32675a772a3 (diff)
Get rid of _test(), _main(), _debug() and _check(). Tests are no longer needed
(better set available in Lib/test/test_robotparser.py). Clean up a few PEP 8
nits (compound statements on a single line, whitespace around operators).
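A rough sketch of the unittest style that supersedes the ad hoc _check()/_test()
helpers removed below (the robots.txt rules, URLs and test names here are invented
for illustration; the real cases live in Lib/test/test_robotparser.py):

    import unittest
    import robotparser

    class RobotParserSketch(unittest.TestCase):
        def test_parse_and_can_fetch(self):
            # invented robots.txt content, fed straight to parse()
            rp = robotparser.RobotFileParser()
            rp.parse([
                "User-agent: CherryPickerSE",
                "Disallow: /cgi-bin/event-search",
                "",
                "User-agent: *",
                "Disallow: /search",
            ])
            # the named agent matches by substring; the version suffix is ignored
            self.assertFalse(rp.can_fetch(
                'CherryPickerSE/1.0',
                'http://example.com/cgi-bin/event-search?city=San+Francisco'))
            # the catch-all entry blocks /search; everything else is allowed
            self.assertFalse(rp.can_fetch('spam', 'http://example.com/search'))
            self.assertTrue(rp.can_fetch('spam', 'http://example.com/'))

    if __name__ == '__main__':
        unittest.main()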
Diffstat (limited to 'Lib/robotparser.py')
-rw-r--r--  Lib/robotparser.py | 105
1 file changed, 12 insertions(+), 93 deletions(-)
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index 52ab348..5b1d797 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -9,15 +9,11 @@
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
-import urlparse,urllib
+import urlparse
+import urllib
__all__ = ["RobotFileParser"]
-debug = 0
-
-def _debug(msg):
- if debug: print msg
-
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
@@ -67,12 +63,9 @@ class RobotFileParser:
self.errcode = opener.errcode
if self.errcode in (401, 403):
self.disallow_all = True
- _debug("disallow all")
elif self.errcode >= 400:
self.allow_all = True
- _debug("allow all")
elif self.errcode == 200 and lines:
- _debug("parse lines")
self.parse(lines)
def _add_entry(self, entry):
@@ -93,19 +86,16 @@ class RobotFileParser:
for line in lines:
linenumber = linenumber + 1
if not line:
- if state==1:
- _debug("line %d: warning: you should insert"
- " allow: or disallow: directives below any"
- " user-agent: line" % linenumber)
+ if state == 1:
entry = Entry()
state = 0
- elif state==2:
+ elif state == 2:
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
i = line.find('#')
- if i>=0:
+ if i >= 0:
line = line[:i]
line = line.strip()
if not line:
@@ -115,41 +105,24 @@ class RobotFileParser:
line[0] = line[0].strip().lower()
line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent":
- if state==2:
- _debug("line %d: warning: you should insert a blank"
- " line before any user-agent"
- " directive" % linenumber)
+ if state == 2:
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
elif line[0] == "disallow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], False))
state = 2
elif line[0] == "allow":
- if state==0:
- _debug("line %d: error: you must insert a user-agent:"
- " directive before this line" % linenumber)
- else:
+ if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
- else:
- _debug("line %d: warning: unknown key %s" % (linenumber,
- line[0]))
- else:
- _debug("line %d: error: malformed line %s"%(linenumber, line))
- if state==2:
+ if state == 2:
self.entries.append(entry)
- _debug("Parsed rules:\n%s" % str(self))
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
- _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
- (useragent, url))
if self.disallow_all:
return False
if self.allow_all:
@@ -182,10 +155,10 @@ class RuleLine:
self.allowance = allowance
def applies_to(self, filename):
- return self.path=="*" or filename.startswith(self.path)
+ return self.path == "*" or filename.startswith(self.path)
def __str__(self):
- return (self.allowance and "Allow" or "Disallow")+": "+self.path
+ return (self.allowance and "Allow" or "Disallow") + ": " + self.path
class Entry:
@@ -207,7 +180,7 @@ class Entry:
# split the name token and make it lower case
useragent = useragent.split("/")[0].lower()
for agent in self.useragents:
- if agent=='*':
+ if agent == '*':
# we have the catch-all agent
return True
agent = agent.lower()
@@ -220,7 +193,6 @@ class Entry:
- our agent applies to this entry
- filename is URL decoded"""
for line in self.rulelines:
- _debug((filename, str(line), line.allowance))
if line.applies_to(filename):
return line.allowance
return True
@@ -239,56 +211,3 @@ class URLopener(urllib.FancyURLopener):
self.errcode = errcode
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
errmsg, headers)
-
-def _check(a,b):
- if not b:
- ac = "access denied"
- else:
- ac = "access allowed"
- if a!=b:
- print "failed"
- else:
- print "ok (%s)" % ac
- print
-
-def _test():
- global debug
- rp = RobotFileParser()
- debug = 1
-
- # robots.txt that exists, gotten to by redirection
- rp.set_url('http://www.musi-cal.com/robots.txt')
- rp.read()
-
- # test for re.escape
- _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
- # this should match the first rule, which is a disallow
- _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
- # various cherry pickers
- _check(rp.can_fetch('CherryPickerSE',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.0',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- _check(rp.can_fetch('CherryPickerSE/1.5',
- 'http://www.musi-cal.com/cgi-bin/event-search'
- '?city=San+Francisco'), 0)
- # case sensitivity
- _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
- _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
- # substring test
- _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
- # tests for catch-all * agent
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
- _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
-
- # robots.txt that does not exist
- rp.set_url('http://www.lycos.com/robots.txt')
- rp.read()
- _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
-
-if __name__ == '__main__':
- _test()
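For reference, the public surface that survives this cleanup is the usual
set_url()/read()/can_fetch() sequence. A minimal sketch follows; example.com,
the rules and MyCrawler are placeholders, and parse() is fed directly so the
sketch runs without the network fetch that read() would normally perform
(where, per the read() hunk above, a 401/403 response sets disallow_all and
any other >= 400 response sets allow_all):

    import robotparser

    rp = robotparser.RobotFileParser()
    # networked pattern (behavior shown in the read() hunk above):
    #   rp.set_url('http://example.com/robots.txt')
    #   rp.read()
    # offline: hand parse() the robots.txt lines directly
    rp.parse([
        "User-agent: *",
        "Disallow: /cgi-bin/",
    ])
    print rp.can_fetch("MyCrawler/1.0", "http://example.com/cgi-bin/search")  # False
    print rp.can_fetch("MyCrawler/1.0", "http://example.com/index.html")      # True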