diff options
author | Guido van Rossum <guido@python.org> | 1997-01-31 18:57:23 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1997-01-31 18:57:23 (GMT) |
commit | 0b0b5f0279683b6b48762cf11df05a0f1ffc1bbc (patch) | |
tree | 296e85c2e864806162aa8380eaaa57b9e1c10279 | |
parent | 42218ce33cd11fb2dc4f7a4e2eef092c4d0f2bbf (diff) | |
download | cpython-0b0b5f0279683b6b48762cf11df05a0f1ffc1bbc.zip cpython-0b0b5f0279683b6b48762cf11df05a0f1ffc1bbc.tar.gz cpython-0b0b5f0279683b6b48762cf11df05a0f1ffc1bbc.tar.bz2 |
Spin off checking of external page in a subroutine.
Increase MAXPAGE to 150K.
Add back printing of __doc__ for usage message.
-rwxr-xr-x | Tools/webchecker/webchecker.py | 37 |
1 files changed, 20 insertions, 17 deletions
```diff
diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py
index c454861..9e676ca 100755
--- a/Tools/webchecker/webchecker.py
+++ b/Tools/webchecker/webchecker.py
@@ -121,7 +121,7 @@ import robotparser
 
 # Tunable parameters
 DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
-MAXPAGE = 50000 # Ignore files bigger than this
+MAXPAGE = 150000 # Ignore files bigger than this
 ROUNDSIZE = 50 # Number of links processed per round
 DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
 AGENTNAME = "webchecker" # Agent name for robots.txt parser
@@ -145,6 +145,7 @@ def main():
     except getopt.error, msg:
         sys.stdout = sys.stderr
         print msg
+        print __doc__%globals()
         sys.exit(2)
     for o, a in opts:
         if o == '-R':
@@ -314,22 +315,24 @@ class Checker:
         for url in urls:
             if verbose > 0:
                 show("HREF ", url, " from", self.ext[url])
-            if not checkext:
-                continue
-            if url[:7] == 'mailto:':
-                if verbose > 2: print "Not checking", url
-                continue
-            if verbose > 2: print "Checking", url, "..."
-            try:
-                f = self.urlopener.open(url)
-                safeclose(f)
-                if verbose > 3: print "OK"
-                if self.bad.has_key(url):
-                    self.setgood(url)
-            except IOError, msg:
-                msg = sanitize(msg)
-                if verbose > 0: print "Error", msg
-                self.setbad(url, msg)
+            if checkext:
+                self.checkextpage(url)
+
+    def checkextpage(self, url):
+        if url[:7] == 'mailto:' or url[:5] == 'news:':
+            if verbose > 2: print "Not checking", url
+            return
+        if verbose > 2: print "Checking", url, "..."
+        try:
+            f = self.urlopener.open(url)
+            safeclose(f)
+            if verbose > 3: print "OK"
+            if self.bad.has_key(url):
+                self.setgood(url)
+        except IOError, msg:
+            msg = sanitize(msg)
+            if verbose > 0: print "Error", msg
+            self.setbad(url, msg)
 
     def report_errors(self):
         if not self.bad:
```