summaryrefslogtreecommitdiffstats
path: root/Tools/webchecker
diff options
context:
space:
mode:
author: Guido van Rossum <guido@python.org> 1997-10-06 18:54:01 (GMT)
committer: Guido van Rossum <guido@python.org> 1997-10-06 18:54:01 (GMT)
commit: 2237b73baff12f2032604764ac6a82dedea05140 (patch)
tree: 962e2471d2701aade222085205df628ba8458793 /Tools/webchecker
parent: dc0f00ad0342a81052dc2e96ed493391ea7b95ec (diff)
downloadcpython-2237b73baff12f2032604764ac6a82dedea05140.zip
cpython-2237b73baff12f2032604764ac6a82dedea05140.tar.gz
cpython-2237b73baff12f2032604764ac6a82dedea05140.tar.bz2
Several changes:
- Change the code that looks for robots.txt to always look in /, even if the "root" path is somewhere deep down below.
- Add link processing in <AREA> tags.
- Change safeclose() to avoid crashing when the file has no geturl() method.
Diffstat (limited to 'Tools/webchecker')
-rwxr-xr-x Tools/webchecker/webchecker.py 30
1 files changed, 24 insertions, 6 deletions
diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py
index dba641c..f412011 100755
--- a/Tools/webchecker/webchecker.py
+++ b/Tools/webchecker/webchecker.py
@@ -251,11 +251,21 @@ class Checker:
def addroot(self, root):
if root not in self.roots:
- self.roots.append(root)
+ troot = root
+ scheme, netloc, path, params, query, fragment = \
+ urlparse.urlparse(root)
+ i = string.rfind(path, "/") + 1
+ if 0 < i < len(path):
+ path = path[:i]
+ troot = urlparse.urlunparse((scheme, netloc, path,
+ params, query, fragment))
+ self.roots.append(troot)
self.addrobot(root)
self.newlink(root, ("<root>", root))
def addrobot(self, root):
+ root = urlparse.urljoin(root, "/")
+ if self.robots.has_key(root): return
url = urlparse.urljoin(root, "/robots.txt")
self.robots[root] = rp = robotparser.RobotFileParser()
if verbose > 2:
@@ -357,6 +367,7 @@ class Checker:
def inroots(self, url):
for root in self.roots:
if url[:len(root)] == root:
+ root = urlparse.urljoin(root, "/")
return self.robots[root].can_fetch(AGENTNAME, url)
return 0
@@ -528,6 +539,9 @@ class MyHTMLParser(sgmllib.SGMLParser):
def end_a(self): pass
+ def do_area(self, attributes):
+ self.link_attr(attributes, 'href')
+
def do_img(self, attributes):
self.link_attr(attributes, 'src', 'lowsrc')
@@ -580,11 +594,15 @@ def sanitize(msg):
def safeclose(f):
- url = f.geturl()
- if url[:4] == 'ftp:' or url[:7] == 'file://':
- # Apparently ftp connections don't like to be closed
- # prematurely...
- text = f.read()
+ try:
+ url = f.geturl()
+ except AttributeError:
+ pass
+ else:
+ if url[:4] == 'ftp:' or url[:7] == 'file://':
+ # Apparently ftp connections don't like to be closed
+ # prematurely...
+ text = f.read()
f.close()