From e5605ba3c28f778457f1e930aeae2c0ab7534026 Mon Sep 17 00:00:00 2001
From: Guido van Rossum
Date: Fri, 31 Jan 1997 14:43:15 +0000
Subject: Many misc changes.

- Faster HTML parser derived from SGMLparser (Fred Gansevles).

- All manipulations of todo, done, ext, bad are done via methods, so a
derived class can override. Also moved the 'done' marking to dopage(),
so run() is much simpler.

- Added a method status() which returns a string containing the
summary counts; added a "total" count.

- Drop the guessing of the file type before opening the document -- we
still need to check those links for validity!

- Added a subroutine to close a connection which first slurps up the
remaining data when it's an ftp URL -- apparently closing an ftp
connection without reading till the end makes it hang.

- Added -n option to skip running (only useful with -R).

- The Checker object now has an instance variable which is set to 1
when it is changed. This is not pickled.
---
 Tools/webchecker/webchecker.py | 237 ++++++++++++++++++++++++-----------------
 1 file changed, 142 insertions(+), 95 deletions(-)

diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py
index 12daa1f..c454861 100755
--- a/Tools/webchecker/webchecker.py
+++ b/Tools/webchecker/webchecker.py
@@ -59,12 +59,13 @@ by the robots.txt file are reported as external URLs.
 skipped. The size limit can be set with the -m option.
 
 - Before fetching a page, it guesses its type based on its extension.
-If it is a known extension and the type is not text/http, the page is
+If it is a known extension and the type is not text/html, the page is
 not fetched. This is a huge optimization but occasionally it means
-links can be missed. The mimetypes.py module (also in this directory)
-has a built-in table mapping most currently known suffixes, and in
-addition attempts to read the mime.types configuration files in the
-default locations of Netscape and the NCSA HTTP daemon.
+links can be missed, and such links aren't checked for validity
+(XXX!). The mimetypes.py module (also in this directory) has a
+built-in table mapping most currently known suffixes, and in addition
+attempts to read the mime.types configuration files in the default
+locations of Netscape and the NCSA HTTP daemon.
 
 - It only follows links indicated by <A> tags. It doesn't follow links in
 <FORM> or <IMG> or whatever other tags might contain
@@ -83,6 +84,7 @@ Options:
 -R        -- restart from checkpoint file
 -d file   -- checkpoint filename (default %(DUMPFILE)s)
 -m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
+-n        -- reports only, no checking (use with -R)
 -q        -- quiet operation (also suppresses external links report)
 -r number -- number of links processed per round (default %(ROUNDSIZE)d)
 -v        -- verbose operation; repeating -v will increase verbosity
@@ -95,7 +97,10 @@ rooturl -- URL to start checking
 
 """
 
-__version__ = "0.2"
+# ' Emacs bait
+
+
+__version__ = "0.3"
 
 
 import sys
@@ -108,8 +113,7 @@ import pickle
 
 import urllib
 import urlparse
-import htmllib
-import formatter
+import sgmllib
 
 import mimetypes
 import robotparser
@@ -134,9 +138,10 @@ def main():
     dumpfile = DUMPFILE
     restart = 0
     checkext = 0
+    norun = 0
 
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:vx')
+        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx')
     except getopt.error, msg:
         sys.stdout = sys.stderr
         print msg
@@ -148,6 +153,8 @@ def main():
             dumpfile = a
         if o == '-m':
             maxpage = string.atoi(a)
+        if o == '-n':
+            norun = 1
         if o == '-q':
             verbose = 0
         if o == '-r':
@@ -157,7 +164,7 @@ def main():
         if o == '-x':
             checkext = 1
 
-    if verbose:
+    if verbose > 0:
         print AGENTNAME, "version", __version__
 
     if restart:
@@ -177,32 +184,33 @@ def main():
     for arg in args:
         c.addroot(arg)
 
-    if not c.todo:
-        needsave = 0
-    else:
-        needsave = 1
-        try:
-            c.run()
-        except KeyboardInterrupt:
-            if verbose > 0:
-                print "[run interrupted]"
+    if not norun:
+        try:
+            c.run()
+        except KeyboardInterrupt:
+            if verbose > 0:
+                print "[run interrupted]"
+
     try:
         c.report(checkext)
     except KeyboardInterrupt:
         if verbose > 0:
             print "[report interrupted]"
-    if not needsave:
+
+    if not c.changed:
         if verbose > 0:
             print
             print "No need to save checkpoint"
-    elif dumpfile:
+    elif not dumpfile:
+        if verbose > 0:
+            print "No dumpfile, won't save checkpoint"
+    else:
        if verbose > 0:
            print
            print "Saving checkpoint to %s ..." % dumpfile
        newfile = dumpfile + ".new"
        f = open(newfile, "wb")
        pickle.dump(c, f)
-        f.flush()
        f.close()
        try:
            os.unlink(dumpfile)
@@ -226,9 +234,11 @@ class Checker:
         self.done = {}
         self.ext = {}
         self.bad = {}
-        self.urlopener = MyURLopener()
         self.round = 0
+        # The following are not pickled:
         self.robots = {}
+        self.urlopener = MyURLopener()
+        self.changed = 0
 
     def __getstate__(self):
         return (self.roots, self.todo, self.done,
@@ -243,15 +253,15 @@ class Checker:
     def addroot(self, root):
         if root not in self.roots:
             self.roots.append(root)
-            self.todo[root] = []
             self.addrobot(root)
+            self.newintlink(root, ("", root))
 
     def addrobot(self, root):
         url = urlparse.urljoin(root, "/robots.txt")
         self.robots[root] = rp = robotparser.RobotFileParser()
         if verbose > 2:
             print "Parsing", url
-            rp.debug = 1
+            rp.debug = verbose > 3
         rp.set_url(url)
         try:
             rp.read()
@@ -264,24 +274,23 @@ class Checker:
             self.round = self.round + 1
             if verbose > 0:
                 print
-                print "Round", self.round,
-                print "(%d to do, %d done, %d external, %d bad)" % (
-                    len(self.todo), len(self.done),
-                    len(self.ext), len(self.bad))
-                print
+                print "Round", self.round, self.status()
+                print
             urls = self.todo.keys()[:roundsize]
             for url in urls:
                 self.dopage(url)
-                self.done[url] = self.todo[url]
-                del self.todo[url]
+
+    def status(self):
+        return "(%d total, %d to do, %d done, %d external, %d bad)" % (
+            len(self.todo)+len(self.done),
+            len(self.todo), len(self.done),
+            len(self.ext), len(self.bad))
 
     def report(self, checkext=0):
         print
         if not self.todo: print "Final",
         else: print "Interim",
-        print "Report (%d to do, %d done, %d external, %d bad)" % (
-            len(self.todo), len(self.done),
-            len(self.ext), len(self.bad))
+        print "Report", self.status()
         if verbose > 0 or checkext:
             self.report_extrefs(checkext)
         # Report errors last because the output may get truncated
@@ -313,12 +322,14 @@ class Checker:
             if verbose > 2: print "Checking", url, "..."
             try:
                 f = self.urlopener.open(url)
-                f.close()
+                safeclose(f)
                 if verbose > 3: print "OK"
+                if self.bad.has_key(url):
+                    self.setgood(url)
             except IOError, msg:
                 msg = sanitize(msg)
                 if verbose > 0: print "Error", msg
-                self.bad[url] = msg
+                self.setbad(url, msg)
 
     def report_errors(self):
         if not self.bad:
@@ -366,36 +377,51 @@ class Checker:
             else:
                 print "Page ", url
         page = self.getpage(url)
-        if not page:
-            return
-        for info in page.getlinkinfos():
-            link, rawlink = info
-            origin = url, rawlink
-            if not self.inroots(link):
-                try:
-                    self.ext[link].append(origin)
-                    if verbose > 3:
-                        print " New ext link", link,
-                        if link != rawlink: print "(%s)" % rawlink,
-                        print
-                except KeyError:
-                    if verbose > 3:
-                        print " Seen ext link", link,
-                        if link != rawlink: print "(%s)" % rawlink,
-                        print
-                    self.ext[link] = [origin]
-            elif self.done.has_key(link):
-                if verbose > 3:
-                    print " Done link", link
-                self.done[link].append(origin)
-            elif self.todo.has_key(link):
-                if verbose > 3:
-                    print " Seen todo link", link
-                self.todo[link].append(origin)
-            else:
-                if verbose > 3:
-                    print " New todo link", link
-                self.todo[link] = [origin]
+        if page:
+            for info in page.getlinkinfos():
+                link, rawlink = info
+                origin = url, rawlink
+                if not self.inroots(link):
+                    self.newextlink(link, origin)
+                else:
+                    self.newintlink(link, origin)
+        self.markdone(url)
+
+    def newextlink(self, url, origin):
+        try:
+            self.ext[url].append(origin)
+            if verbose > 3:
+                print " New ext link", url
+        except KeyError:
+            self.ext[url] = [origin]
+            if verbose > 3:
+                print " Seen ext link", url
+
+    def newintlink(self, url, origin):
+        if self.done.has_key(url):
+            self.newdonelink(url, origin)
+        else:
+            self.newtodolink(url, origin)
+
+    def newdonelink(self, url, origin):
+        self.done[url].append(origin)
+        if verbose > 3:
+            print " Done link", url
+
+    def newtodolink(self, url, origin):
+        if self.todo.has_key(url):
+            self.todo[url].append(origin)
+            if verbose > 3:
+                print " Seen todo link", url
+        else:
+            self.todo[url] = [origin]
+            if verbose > 3:
+                print " New todo link", url
+
+    def markdone(self, url):
+        self.done[url] = self.todo[url]
+        del self.todo[url]
+        self.changed = 1
 
     def inroots(self, url):
         for root in self.roots:
@@ -404,15 +430,6 @@ class Checker:
         return 0
 
     def getpage(self, url):
-        ctype, encoding = mimetypes.guess_type(url)
-        if encoding:
-            if verbose > 2:
-                print " Won't bother, URL suggests encoding %s" % `encoding`
-            return None
-        if ctype and ctype != 'text/html':
-            if verbose > 2:
-                print " Won't bother, URL suggests mime type %s" % `ctype`
-            return None
         try:
             f = self.urlopener.open(url)
         except IOError, msg:
@@ -421,26 +438,43 @@ class Checker:
                 print "Error ", msg
                 if verbose > 0:
                     show(" HREF ", url, " from", self.todo[url])
-            self.bad[url] = msg
+            self.setbad(url, msg)
             return None
         nurl = f.geturl()
         info = f.info()
         if info.has_key('content-type'):
             ctype = string.lower(info['content-type'])
+        else:
+            ctype = None
         if nurl != url:
             if verbose > 1:
                 print " Redirected to", nurl
-            if not ctype:
-                ctype, encoding = mimetypes.guess_type(nurl)
+        if not ctype:
+            ctype, encoding = mimetypes.guess_type(nurl)
         if ctype != 'text/html':
-            f.close()
-            if verbose > 2:
-                print " Not HTML, mime type", ctype
+            safeclose(f)
+            if verbose > 1:
+                print " Not HTML, mime type", ctype
             return None
         text = f.read()
         f.close()
         return Page(text, nurl)
 
+    def setgood(self, url):
+        if self.bad.has_key(url):
+            del self.bad[url]
+            self.changed = 1
+            if verbose > 0:
+                print "(Clear previously seen error)"
+
+    def setbad(self, url, msg):
+        if self.bad.has_key(url) and self.bad[url] == msg:
+            if verbose > 0:
+                print "(Seen this error before)"
+            return
+        self.bad[url] = msg
+        self.changed = 1
+
 
 class Page:
 
@@ -457,7 +491,7 @@ class Page:
             return []
         if verbose > 2:
             print " Parsing", self.url, "(%d bytes)" % size
-        parser = MyHTMLParser(formatter.NullFormatter())
+        parser = MyHTMLParser()
         parser.feed(self.text)
         parser.close()
         rawlinks = parser.getlinks()
@@ -519,28 +553,32 @@ class MyURLopener(urllib.FancyURLopener):
         return urllib.FancyURLopener.open_file(self, path)
 
 
-class MyHTMLParser(htmllib.HTMLParser):
+class MyHTMLParser(sgmllib.SGMLParser):
 
-    def __init__(*args):
-        self = args[0]
+    def __init__(self):
         self.base = None
-        self.links = []
-        apply(htmllib.HTMLParser.__init__, args)
+        self.links = {}
+        sgmllib.SGMLParser.__init__ (self)
 
     def start_a(self, attributes):
         for name, value in attributes:
-            if name == 'href' and value and value not in self.links:
-                self.links.append(string.strip(value))
+            if name == 'href':
+                if value: value = string.strip(value)
+                if value: self.links[value] = None
+                return # match only first href
 
     def do_base(self, attributes):
         for name, value in attributes:
-            if name == 'href' and value:
-                if verbose > 1:
-                    print " Base", value
-                self.base = value
+            if name == 'href':
+                if value: value = string.strip(value)
+                if value:
+                    if verbose > 1:
+                        print " Base", value
+                    self.base = value
+                return # match only first href
 
     def getlinks(self):
-        return self.links
+        return self.links.keys()
 
     def getbase(self):
         return self.base
@@ -569,5 +607,14 @@ def sanitize(msg):
     return msg
 
 
+def safeclose(f):
+    url = f.geturl()
+    if url[:4] == 'ftp:' or url[:7] == 'file://':
+        # Apparently ftp connections don't like to be closed
+        # prematurely...
+        text = f.read()
+    f.close()
+
+
 if __name__ == '__main__':
     main()
--
cgit v0.12