diff options
author | Guido van Rossum <guido@python.org> | 1998-02-21 20:08:39 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1998-02-21 20:08:39 (GMT) |
commit | 1a7eae919a83618c9f5036416611fc9c8cf3025d (patch) | |
tree | d4e511111de9deab42ac5b9c2b6c0725b100b447 | |
parent | 00756bd4a61c7502ce741a14c860b5b06d92ec04 (diff) | |
download | cpython-1a7eae919a83618c9f5036416611fc9c8cf3025d.zip cpython-1a7eae919a83618c9f5036416611fc9c8cf3025d.tar.gz cpython-1a7eae919a83618c9f5036416611fc9c8cf3025d.tar.bz2 |
Adapt to new webchecker structure. Due to better structure of
getpage(), much less duplicate code is needed -- we only need to
override readhtml().
-rwxr-xr-x | Tools/webchecker/websucker.py | 92 |
1 files changed, 33 insertions, 59 deletions
```diff
diff --git a/Tools/webchecker/websucker.py b/Tools/webchecker/websucker.py
index 31cefb2..6169446 100755
--- a/Tools/webchecker/websucker.py
+++ b/Tools/webchecker/websucker.py
@@ -2,7 +2,7 @@
 
 """A variant on webchecker that creates a mirror copy of a remote site."""
 
-__version__ = "0.1"
+__version__ = "$Revision$"
 
 import os
 import sys
@@ -11,22 +11,28 @@ import urllib
 import getopt
 
 import webchecker
-verbose = webchecker.verbose
+
+# Extract real version number if necessary
+if __version__[0] == '$':
+    _v = string.split(__version__)
+    if len(_v) == 3:
+        __version__ = _v[1]
 
 def main():
-    global verbose
+    verbose = webchecker.VERBOSE
     try:
         opts, args = getopt.getopt(sys.argv[1:], "qv")
     except getopt.error, msg:
         print msg
-        print "usage:", sys.argv[0], "[-v] ... [rooturl] ..."
+        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
         return 2
     for o, a in opts:
         if o == "-q":
-            webchecker.verbose = verbose = 0
+            verbose = 0
         if o == "-v":
-            webchecker.verbose = verbose = verbose + 1
-    c = Sucker(0)
+            verbose = verbose + 1
+    c = Sucker()
+    c.setflags(verbose=verbose)
     c.urlopener.addheaders = [
         ('User-agent', 'websucker/%s' % __version__),
     ]
@@ -38,63 +44,31 @@ def main():
 
 class Sucker(webchecker.Checker):
 
-    # Alas, had to copy this to make one change...
-    def getpage(self, url):
-        if url[:7] == 'mailto:' or url[:5] == 'news:':
-            if verbose > 1: print " Not checking mailto/news URL"
-            return None
-        isint = self.inroots(url)
-        if not isint and not self.checkext:
-            if verbose > 1: print " Not checking ext link"
-            return None
+    checkext = 0
+
+    def readhtml(self, url):
+        text = None
         path = self.savefilename(url)
-        saved = 0
         try:
             f = open(path, "rb")
         except IOError:
-            try:
-                f = self.urlopener.open(url)
-            except IOError, msg:
-                msg = webchecker.sanitize(msg)
-                if verbose > 0:
-                    print "Error ", msg
-                if verbose > 0:
-                    webchecker.show(" HREF ", url, " from", self.todo[url])
-                self.setbad(url, msg)
-                return None
-            if not isint:
-                if verbose > 1: print " Not gathering links from ext URL"
-                safeclose(f)
-                return None
-            nurl = f.geturl()
-            if nurl != url:
-                path = self.savefilename(nurl)
-            info = f.info()
+            f = self.openpage(url)
+            if f:
+                info = f.info()
+                nurl = f.geturl()
+                if nurl != url:
+                    url = nurl
+                    path = self.savefilename(url)
+                text = f.read()
+                f.close()
+                self.savefile(text, path)
+                if not self.checkforhtml(info, url):
+                    text = None
         else:
-            if verbose: print "Loading cached URL", url
-            saved = 1
-            nurl = url
-            info = {}
-            if url[-1:] == "/":
-                info["content-type"] = "text/html"
-        text = f.read()
-        if not saved: self.savefile(text, path)
-        if info.has_key('content-type'):
-            ctype = string.lower(info['content-type'])
-        else:
-            ctype = None
-        if nurl != url:
-            if verbose > 1:
-                print " Redirected to", nurl
-        if not ctype:
-            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
-        if ctype != 'text/html':
-            webchecker.safeclose(f)
-            if verbose > 1:
-                print " Not HTML, mime type", ctype
-            return None
-        f.close()
-        return webchecker.Page(text, nurl)
+            if self.checkforhtml({}, url):
+                text = f.read()
+            f.close()
+        return text, url
 
     def savefile(self, text, path):
         dir, base = os.path.split(path)
```