diff options
author | Skip Montanaro <skip@pobox.com> | 2008-07-27 00:49:02 (GMT) |
---|---|---|
committer | Skip Montanaro <skip@pobox.com> | 2008-07-27 00:49:02 (GMT) |
commit | 1ef19f0de12da730b5ba3096ab4a1d78af5185b4 (patch) | |
tree | 474c5c3011d3fafb38929aa0e263cabe1f855186 | |
parent | 4b99e9b4790af9951b81925e28bd07850cb5c630 (diff) | |
download | cpython-1ef19f0de12da730b5ba3096ab4a1d78af5185b4.zip cpython-1ef19f0de12da730b5ba3096ab4a1d78af5185b4.tar.gz cpython-1ef19f0de12da730b5ba3096ab4a1d78af5185b4.tar.bz2 |
Close issue 3437 - missing state change when Allow lines are processed.
Adds test cases which use Allow: as well.
-rw-r--r-- | Lib/robotparser.py | 5 | ||||
-rw-r--r-- | Lib/test/test_robotparser.py | 69 |
2 files changed, 74 insertions, 0 deletions
```diff
diff --git a/Lib/robotparser.py b/Lib/robotparser.py
index f249187..447563f 100644
--- a/Lib/robotparser.py
+++ b/Lib/robotparser.py
@@ -76,6 +76,10 @@ class RobotFileParser:
         """parse the input lines from a robots.txt file.
            We allow that a user-agent: line is not preceded by
            one or more blank lines."""
+        # states:
+        #   0: start state
+        #   1: saw user-agent line
+        #   2: saw an allow or disallow line
         state = 0
         linenumber = 0
         entry = Entry()
@@ -114,6 +118,7 @@ class RobotFileParser:
                 elif line[0] == "allow":
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
+                        state = 2
         if state == 2:
             self.entries.append(entry)
 
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index b7911fd..431b8ff 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -134,6 +134,75 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
 
 RobotTest(7, doc, good, bad)
 
+# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364
+
+# 8.
+doc = """
+User-agent: Googlebot
+Allow: /folder1/myfile.html
+Disallow: /folder1/
+"""
+
+good = ['/folder1/myfile.html']
+bad = ['/folder1/anotherfile.html']
+
+RobotTest(8, doc, good, bad, agent="Googlebot")
+
+# 9. This file is incorrect because "Googlebot" is a substring of
+# "Googlebot-Mobile", so test 10 works just like test 9.
+doc = """
+User-agent: Googlebot
+Disallow: /
+
+User-agent: Googlebot-Mobile
+Allow: /
+"""
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(9, doc, good, bad, agent="Googlebot")
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
+
+# 11. Get the order correct.
+doc = """
+User-agent: Googlebot-Mobile
+Allow: /
+
+User-agent: Googlebot
+Disallow: /
+"""
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(11, doc, good, bad, agent="Googlebot")
+
+good = ['/something.jpg']
+bad = []
+
+RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")
+
+
+# 13. Google also got the order wrong in #8. You need to specify the
+# URLs from more specific to more general.
+doc = """
+User-agent: Googlebot
+Allow: /folder1/myfile.html
+Disallow: /folder1/
+"""
+
+good = ['/folder1/myfile.html']
+bad = ['/folder1/anotherfile.html']
+
+RobotTest(13, doc, good, bad, agent="googlebot")
+
+
+
 class TestCase(unittest.TestCase):
     def runTest(self):
         test_support.requires('network')
```