From 1a7eae919a83618c9f5036416611fc9c8cf3025d Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Sat, 21 Feb 1998 20:08:39 +0000 Subject: Adapt to new webchecker structure. Due to better structure of getpage(), much less duplicate code is needed -- we only need to override readhtml(). --- Tools/webchecker/websucker.py | 92 ++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 59 deletions(-) diff --git a/Tools/webchecker/websucker.py b/Tools/webchecker/websucker.py index 31cefb2..6169446 100755 --- a/Tools/webchecker/websucker.py +++ b/Tools/webchecker/websucker.py @@ -2,7 +2,7 @@ """A variant on webchecker that creates a mirror copy of a remote site.""" -__version__ = "0.1" +__version__ = "$Revision$" import os import sys @@ -11,22 +11,28 @@ import urllib import getopt import webchecker -verbose = webchecker.verbose + +# Extract real version number if necessary +if __version__[0] == '$': + _v = string.split(__version__) + if len(_v) == 3: + __version__ = _v[1] def main(): - global verbose + verbose = webchecker.VERBOSE try: opts, args = getopt.getopt(sys.argv[1:], "qv") except getopt.error, msg: print msg - print "usage:", sys.argv[0], "[-v] ... [rooturl] ..." + print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..." return 2 for o, a in opts: if o == "-q": - webchecker.verbose = verbose = 0 + verbose = 0 if o == "-v": - webchecker.verbose = verbose = verbose + 1 - c = Sucker(0) + verbose = verbose + 1 + c = Sucker() + c.setflags(verbose=verbose) c.urlopener.addheaders = [ ('User-agent', 'websucker/%s' % __version__), ] @@ -38,63 +44,31 @@ def main(): class Sucker(webchecker.Checker): - # Alas, had to copy this to make one change... - def getpage(self, url): - if url[:7] == 'mailto:' or url[:5] == 'news:': - if verbose > 1: print " Not checking mailto/news URL" - return None - isint = self.inroots(url) - if not isint and not self.checkext: - if verbose > 1: print " Not checking ext link" - return None + checkext = 0 + + def readhtml(self, url): + text = None path = self.savefilename(url) - saved = 0 try: f = open(path, "rb") except IOError: - try: - f = self.urlopener.open(url) - except IOError, msg: - msg = webchecker.sanitize(msg) - if verbose > 0: - print "Error ", msg - if verbose > 0: - webchecker.show(" HREF ", url, " from", self.todo[url]) - self.setbad(url, msg) - return None - if not isint: - if verbose > 1: print " Not gathering links from ext URL" - safeclose(f) - return None - nurl = f.geturl() - if nurl != url: - path = self.savefilename(nurl) - info = f.info() + f = self.openpage(url) + if f: + info = f.info() + nurl = f.geturl() + if nurl != url: + url = nurl + path = self.savefilename(url) + text = f.read() + f.close() + self.savefile(text, path) + if not self.checkforhtml(info, url): + text = None else: - if verbose: print "Loading cached URL", url - saved = 1 - nurl = url - info = {} - if url[-1:] == "/": - info["content-type"] = "text/html" - text = f.read() - if not saved: self.savefile(text, path) - if info.has_key('content-type'): - ctype = string.lower(info['content-type']) - else: - ctype = None - if nurl != url: - if verbose > 1: - print " Redirected to", nurl - if not ctype: - ctype, encoding = webchecker.mimetypes.guess_type(nurl) - if ctype != 'text/html': - webchecker.safeclose(f) - if verbose > 1: - print " Not HTML, mime type", ctype - return None - f.close() - return webchecker.Page(text, nurl) + if self.checkforhtml({}, url): + text = f.read() + f.close() + return text, url def savefile(self, text, path): dir, base = os.path.split(path) -- cgit v0.12