summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xTools/webchecker/webchecker.py488
1 files changed, 488 insertions, 0 deletions
diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py
new file mode 100755
index 0000000..255c490
--- /dev/null
+++ b/Tools/webchecker/webchecker.py
@@ -0,0 +1,488 @@
+#! /usr/bin/env python
+
+"""Web tree checker.
+
+This utility is handy to check a subweb of the world-wide web for
+errors. A subweb is specified by giving one or more ``root URLs''; a
+page belongs to the subweb if one of the root URLs is an initial
+prefix of it.
+
+File URL extension:
+
+In order to easy the checking of subwebs via the local file system,
+the interpretation of ``file:'' URLs is extended to mimic the behavior
+of your average HTTP daemon: if a directory pathname is given, the
+file index.html in that directory is returned if it exists, otherwise
+a directory listing is returned. Now, you can point webchecker to the
+document tree in the local file system of your HTTP daemon, and have
+most of it checked. In fact the default works this way if your local
+web tree is located at /usr/local/etc/httpd/htdpcs (the default for
+the NCSA HTTP daemon and probably others).
+
+Reports printed:
+
+When done, it reports links to pages outside the web (unless -q is
+specified), and pages with bad links within the subweb. When
+interrupted, it print those same reports for the pages that it has
+checked already.
+
+In verbose mode, additional messages are printed during the
+information gathering phase. By default, it prints a summary of its
+work status every 50 URLs (adjustable with the -r option), and it
+reports errors as they are encountered. Use the -q option to disable
+this output.
+
+Checkpoint feature:
+
+Whether interrupted or not, it dumps its state (a Python pickle) to a
+checkpoint file and the -R option allows it to restart from the
+checkpoint (assuming that the pages on the subweb that were already
+processed haven't changed). Even when it has run till completion, -R
+can still be useful -- it will print the reports again, and -Rq prints
+the errors only. In this case, the checkpoint file is not written
+again. The checkpoint file can be set with the -d option.
+
+The checkpoint file is written as a Python pickle. Remember that
+Python's pickle module is currently quite slow. Give it the time it
+needs to load and save the checkpoint file. When interrupted while
+writing the checkpoint file, the old checkpoint file is not
+overwritten, but all work done in the current run is lost.
+
+Miscellaneous:
+
+- Because the HTML parser is a bit slow, very large HTML files are
+ skipped. The size limit can be set with the -m option.
+
+- Before fetching a page, it guesses its type based on its extension.
+If it is a known extension and the type is not text/http, the page is
+not fetched. This is a huge optimization but occasionally it means
+links can be missed. The mimetypes.py module (also in this directory)
+has a built-in table mapping most currently known suffixes, and in
+addition attempts to read the mime.types configuration files in the
+default locations of Netscape and the NCSA HTTP daemon.
+
+- It only follows links indicated by <A> tags. It doesn't follow
+links in <FORM> or <IMG> or whatever other tags might contain
+hyperlinks. It does honor the <BASE> tag.
+
+- It could be argued that it should also check external links for
+validity. This is true, but is is more error-prone. I think I will
+make this an option in the future.
+
+
+Usage: webchecker.py [option] ... [rooturl] ...
+
+Options:
+
+-R -- restart from checkpoint file
+-d file -- checkpoint filename (default %(DUMPFILE)s)
+-m bytes -- skip HTML pages larger than this size (default %(MAXPAGE)d)
+-q -- quiet operation (also suppresses external links report)
+-r number -- number of links processed per round (default %(ROUNDSIZE)d)
+-v -- verbose operation; repeating -v will increase verbosity
+
+Arguments:
+
+rooturl -- URL to start checking
+ (default %(DEFROOT)s)
+
+"""
+
+
+import sys
+import os
+from types import *
+import string
+import StringIO
+import getopt
+import pickle
+
+import urllib
+import urlparse
+import htmllib
+import formatter
+
+import mimetypes
+
+
+# Tunable parameters
+DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
+MAXPAGE = 50000 # Ignore files bigger than this
+ROUNDSIZE = 50 # Number of links processed per round
+DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
+
+
+# Global variables
+verbose = 1
+maxpage = MAXPAGE
+roundsize = ROUNDSIZE
+
+
+def main():
+ global verbose, maxpage, roundsize
+ dumpfile = DUMPFILE
+ restart = 0
+
+ try:
+ opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v')
+ except getopt.error, msg:
+ sys.stdout = sys.stderr
+ print msg
+ print __doc__ % globals()
+ sys.exit(2)
+ for o, a in opts:
+ if o == '-R':
+ restart = 1
+ if o == '-d':
+ dumpfile = a
+ if o == '-m':
+ maxpage = string.atoi(a)
+ if o == '-q':
+ verbose = 0
+ if o == '-r':
+ roundsize = string.atoi(a)
+ if o == '-v':
+ verbose = verbose + 1
+
+ if restart:
+ if verbose > 0:
+ print "Loading checkpoint from %s ..." % dumpfile
+ f = open(dumpfile, "rb")
+ c = pickle.load(f)
+ f.close()
+ if verbose > 0:
+ print "Done."
+ print "Root:", string.join(c.roots, "\n ")
+ else:
+ c = Checker()
+ if not args:
+ args.append(DEFROOT)
+
+ for arg in args:
+ c.addroot(arg)
+
+ if not c.todo:
+ needsave = 0
+ else:
+ needsave = 1
+ try:
+ c.run()
+ except KeyboardInterrupt:
+ if verbose > 0:
+ print "[interrupted]"
+ c.report()
+ if not needsave:
+ if verbose > 0:
+ print
+ print "No need to save checkpoint"
+ elif dumpfile:
+ if verbose > 0:
+ print
+ print "Saving checkpoint to %s ..." % dumpfile
+ newfile = dumpfile + ".new"
+ f = open(newfile, "wb")
+ pickle.dump(c, f)
+ f.flush()
+ f.close()
+ try:
+ os.unlink(dumpfile)
+ except os.error:
+ pass
+ os.rename(newfile, dumpfile)
+ if verbose > 0:
+ print "Done."
+ if dumpfile == DUMPFILE:
+ print "Use ``%s -R'' to restart." % sys.argv[0]
+ else:
+ print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
+ dumpfile)
+
+
+class Checker:
+
+ def __init__(self):
+ self.roots = []
+ self.todo = {}
+ self.done = {}
+ self.ext = {}
+ self.bad = {}
+ self.urlopener = MyURLopener()
+ self.round = 0
+
+ def addroot(self, root):
+ if root not in self.roots:
+ self.roots.append(root)
+ self.todo[root] = []
+
+ def run(self):
+ while self.todo:
+ self.round = self.round + 1
+ if verbose > 0:
+ print
+ print "Round", self.round,
+ print "(%d to do, %d done, %d external, %d bad)" % (
+ len(self.todo), len(self.done),
+ len(self.ext), len(self.bad))
+ print
+ urls = self.todo.keys()[:roundsize]
+ for url in urls:
+ self.dopage(url)
+ self.done[url] = self.todo[url]
+ del self.todo[url]
+
+ def report(self):
+ print
+ if not self.todo: print "Final",
+ else: print "Interim",
+ print "Report (%d to do, %d done, %d external, %d bad)" % (
+ len(self.todo), len(self.done),
+ len(self.ext), len(self.bad))
+ if verbose > 0:
+ self.report_extrefs()
+ # Report errors last because the output may get truncated
+ self.report_errors()
+
+ def report_extrefs(self):
+ if not self.ext:
+ print
+ print "No external URLs"
+ return
+ print
+ print "External URLs:"
+ print
+ urls = self.ext.keys()
+ urls.sort()
+ for url in urls:
+ show("HREF ", url, " from", self.ext[url])
+
+ def report_errors(self):
+ if not self.bad:
+ print
+ print "No errors"
+ return
+ print
+ print "Error Report:"
+ urls = self.bad.keys()
+ urls.sort()
+ bysource = {}
+ for url in urls:
+ try:
+ origins = self.done[url]
+ except KeyError:
+ origins = self.todo[url]
+ for source, rawlink in origins:
+ triple = url, rawlink, self.bad[url]
+ try:
+ bysource[source].append(triple)
+ except KeyError:
+ bysource[source] = [triple]
+ sources = bysource.keys()
+ sources.sort()
+ for source in sources:
+ triples = bysource[source]
+ print
+ if len(triples) > 1:
+ print len(triples), "Errors in", source
+ else:
+ print "Error in", source
+ for url, rawlink, msg in triples:
+ print " HREF", url,
+ if rawlink != url: print "(%s)" % rawlink,
+ print
+ print " msg", msg
+
+ def dopage(self, url):
+ if verbose > 1:
+ if verbose > 2:
+ show("Page ", url, " from", self.todo[url])
+ else:
+ print "Page ", url
+ page = self.getpage(url)
+ if not page:
+ return
+ for info in page.getlinkinfos():
+ link, rawlink = info
+ origin = url, rawlink
+ if not self.inroots(link):
+ try:
+ self.ext[link].append(origin)
+ if verbose > 3:
+ print " New ext link", link,
+ if link != rawlink: print "(%s)" % rawlink,
+ print
+ except KeyError:
+ if verbose > 3:
+ print " Seen ext link", link,
+ if link != rawlink: print "(%s)" % rawlink,
+ print
+ self.ext[link] = [origin]
+ elif self.done.has_key(link):
+ if verbose > 3:
+ print " Done link", link
+ self.done[link].append(origin)
+ elif self.todo.has_key(link):
+ if verbose > 3:
+ print " Seen todo link", link
+ self.todo[link].append(origin)
+ else:
+ if verbose > 3:
+ print " New todo link", link
+ self.todo[link] = [origin]
+
+ def inroots(self, url):
+ for root in self.roots:
+ if url[:len(root)] == root:
+ return 1
+ return 0
+
+ def getpage(self, url):
+ ctype, encoding = mimetypes.guess_type(url)
+ if encoding:
+ if verbose > 2:
+ print " Won't bother, URL suggests encoding %s" % `encoding`
+ return None
+ if ctype and ctype != 'text/html':
+ if verbose > 2:
+ print " Won't bother, URL suggests mime type %s" % `ctype`
+ return None
+ try:
+ f = self.urlopener.open(url)
+ except IOError, msg:
+ if verbose > 0:
+ print "Error ", msg
+ if verbose > 0:
+ show(" HREF ", url, " from", self.todo[url])
+ self.bad[url] = msg
+ return None
+ nurl = f.geturl()
+ info = f.info()
+ if info.has_key('content-type'):
+ ctype = string.lower(info['content-type'])
+ if nurl != url:
+ if verbose > 1:
+ print "Redirected to", nurl
+ if not ctype:
+ ctype, encoding = mimetypes.guess_type(nurl)
+ if ctype != 'text/html':
+ f.close()
+ if verbose > 2:
+ print " Not HTML, mime type", ctype
+ return None
+ text = f.read()
+ f.close()
+ return Page(text, nurl)
+
+
+class Page:
+
+ def __init__(self, text, url):
+ self.text = text
+ self.url = url
+
+ def getlinkinfos(self):
+ size = len(self.text)
+ if size > maxpage:
+ if verbose > 0:
+ print "Skip huge file", self.url
+ print " (%.0f Kbytes)" % (size*0.001)
+ return []
+ if verbose > 2:
+ print " Parsing", self.url, "(%d bytes)" % size
+ parser = MyHTMLParser(formatter.NullFormatter())
+ parser.feed(self.text)
+ parser.close()
+ rawlinks = parser.getlinks()
+ base = urlparse.urljoin(self.url, parser.getbase() or "")
+ infos = []
+ for rawlink in rawlinks:
+ t = urlparse.urlparse(rawlink)
+ t = t[:-1] + ('',)
+ rawlink = urlparse.urlunparse(t)
+ link = urlparse.urljoin(base, rawlink)
+ infos.append((link, rawlink))
+ return infos
+
+
+class MyStringIO(StringIO.StringIO):
+
+ def __init__(self, url, info):
+ self.__url = url
+ self.__info = info
+ StringIO.StringIO.__init__(self)
+
+ def info(self):
+ return self.__info
+
+ def geturl(self):
+ return self.__url
+
+
+class MyURLopener(urllib.FancyURLopener):
+
+ http_error_default = urllib.URLopener.http_error_default
+
+ def open_file(self, url):
+ path = urllib.url2pathname(urllib.unquote(url))
+ if path[-1] != os.sep:
+ url = url + '/'
+ if os.path.isdir(path):
+ indexpath = os.path.join(path, "index.html")
+ if os.path.exists(indexpath):
+ return self.open_file(url + "index.html")
+ try:
+ names = os.listdir(path)
+ except os.error, msg:
+ raise IOError, msg, sys.exc_traceback
+ names.sort()
+ s = MyStringIO("file:"+url, {'content-type': 'text/html'})
+ s.write('<BASE HREF="file:%s">\n' %
+ urllib.quote(os.path.join(path, "")))
+ for name in names:
+ q = urllib.quote(name)
+ s.write('<A HREF="%s">%s</A>\n' % (q, q))
+ s.seek(0)
+ return s
+ return urllib.FancyURLopener.open_file(self, path)
+
+
+class MyHTMLParser(htmllib.HTMLParser):
+
+ def __init__(*args):
+ self = args[0]
+ self.base = None
+ self.links = []
+ apply(htmllib.HTMLParser.__init__, args)
+
+ def start_a(self, attributes):
+ for name, value in attributes:
+ if name == 'href' and value and value not in self.links:
+ self.links.append(string.strip(value))
+
+ def do_base(self, attributes):
+ for name, value in attributes:
+ if name == 'href' and value:
+ if verbose > 1:
+ print " Base", value
+ self.base = value
+
+ def getlinks(self):
+ return self.links
+
+ def getbase(self):
+ return self.base
+
+
+def show(p1, link, p2, origins):
+ print p1, link
+ i = 0
+ for source, rawlink in origins:
+ i = i+1
+ if i == 2:
+ p2 = ' '*len(p2)
+ print p2, source,
+ if rawlink != link: print "(%s)" % rawlink,
+ print
+
+
+if __name__ == '__main__':
+ main()