Diffstat (limited to 'Tools/webchecker')
-rwxr-xr-x   Tools/webchecker/webchecker.py   321
1 file changed, 191 insertions, 130 deletions
diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py
index f412011..23dcf80 100755
--- a/Tools/webchecker/webchecker.py
+++ b/Tools/webchecker/webchecker.py
@@ -94,7 +94,7 @@ rooturl -- URL to start checking
 
 """
 
-__version__ = "0.5"
+__version__ = "$Revision$"
 
 
 import sys
@@ -112,9 +112,17 @@ import sgmllib
 import mimetypes
 import robotparser
 
+# Extract real version number if necessary
+if __version__[0] == '$':
+    _v = string.split(__version__)
+    if len(_v) == 3:
+        __version__ = _v[1]
+
 
 # Tunable parameters
 DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
+CHECKEXT = 1                            # Check external references (1 deep)
+VERBOSE = 1                             # Verbosity level (0-3)
 MAXPAGE = 150000                        # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
@@ -122,16 +130,15 @@ AGENTNAME = "webchecker"                # Agent name for robots.txt parser
 
 
 # Global variables
-verbose = 1
-maxpage = MAXPAGE
-roundsize = ROUNDSIZE
 
 
 def main():
-    global verbose, maxpage, roundsize
+    checkext = CHECKEXT
+    verbose = VERBOSE
+    maxpage = MAXPAGE
+    roundsize = ROUNDSIZE
     dumpfile = DUMPFILE
     restart = 0
-    checkext = 1
    norun = 0
 
     try:
@@ -163,18 +170,15 @@ def main():
     print AGENTNAME, "version", __version__
 
     if restart:
-        if verbose > 0:
-            print "Loading checkpoint from %s ..." % dumpfile
-        f = open(dumpfile, "rb")
-        c = pickle.load(f)
-        f.close()
-        if verbose > 0:
-            print "Done."
-            print "Root:", string.join(c.roots, "\n      ")
+        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
     else:
-        c = Checker(checkext)
-        if not args:
-            args.append(DEFROOT)
+        c = Checker()
+
+    c.setflags(checkext=checkext, verbose=verbose,
+               maxpage=maxpage, roundsize=roundsize)
+
+    if not restart and not args:
+        args.append(DEFROOT)
 
     for arg in args:
         c.addroot(arg)
@@ -192,40 +196,43 @@ def main():
         if verbose > 0:
             print "[report interrupted]"
 
-    if not c.changed:
-        if verbose > 0:
-            print
-            print "No need to save checkpoint"
-    elif not dumpfile:
-        if verbose > 0:
-            print "No dumpfile, won't save checkpoint"
-    else:
-        if verbose > 0:
-            print
-            print "Saving checkpoint to %s ..." % dumpfile
-        newfile = dumpfile + ".new"
-        f = open(newfile, "wb")
-        pickle.dump(c, f)
-        f.close()
-        try:
-            os.unlink(dumpfile)
-        except os.error:
-            pass
-        os.rename(newfile, dumpfile)
-        if verbose > 0:
-            print "Done."
-        if dumpfile == DUMPFILE:
-            print "Use ``%s -R'' to restart." % sys.argv[0]
-        else:
-            print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
-                                                       dumpfile)
+    if c.save_pickle(dumpfile):
+        if dumpfile == DUMPFILE:
+            print "Use ``%s -R'' to restart." % sys.argv[0]
+        else:
+            print "Use ``%s -R -d %s'' to restart." % (sys.argv[0], dumpfile)
+
+
+def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
+    if verbose > 0:
+        print "Loading checkpoint from %s ..." % dumpfile
+    f = open(dumpfile, "rb")
+    c = pickle.load(f)
+    f.close()
+    if verbose > 0:
+        print "Done."
+        print "Root:", string.join(c.roots, "\n      ")
+    return c
 
 
 class Checker:
 
-    def __init__(self, checkext=1):
+    checkext = CHECKEXT
+    verbose = VERBOSE
+    maxpage = MAXPAGE
+    roundsize = ROUNDSIZE
+
+    validflags = tuple(dir())
+
+    def __init__(self):
         self.reset()
-        self.checkext = checkext
+
+    def setflags(self, **kw):
+        for key in kw.keys():
+            if key not in self.validflags:
+                raise NameError, "invalid keyword argument: %s" % str(key)
+        for key, value in kw.items():
+            setattr(self, key, value)
 
     def reset(self):
         self.roots = []
@@ -243,6 +250,7 @@ class Checker:
         return (self.roots, self.todo, self.done, self.bad, self.round)
 
     def __setstate__(self, state):
+        self.reset()
         (self.roots, self.todo, self.done, self.bad, self.round) = state
         for root in self.roots:
             self.addrobot(root)
@@ -268,24 +276,24 @@ class Checker:
         if self.robots.has_key(root): return
         url = urlparse.urljoin(root, "/robots.txt")
         self.robots[root] = rp = robotparser.RobotFileParser()
-        if verbose > 2:
+        if self.verbose > 2:
             print "Parsing", url
-        rp.debug = verbose > 3
+        rp.debug = self.verbose > 3
         rp.set_url(url)
         try:
            rp.read()
        except IOError, msg:
-            if verbose > 1:
+            if self.verbose > 1:
                print "I/O error parsing", url, ":", msg
 
     def run(self):
         while self.todo:
             self.round = self.round + 1
-            if verbose > 0:
+            if self.verbose > 0:
                 print
                 print "Round %d (%s)" % (self.round, self.status())
                 print
-            urls = self.todo.keys()[:roundsize]
+            urls = self.todo.keys()[:self.roundsize]
             for url in urls:
                 self.dopage(url)
 
@@ -325,9 +333,9 @@ class Checker:
             print "  msg", msg
 
     def dopage(self, url):
-        if verbose > 1:
-            if verbose > 2:
-                show("Check ", url, "  from", self.todo[url])
+        if self.verbose > 1:
+            if self.verbose > 2:
+                self.show("Check ", url, "  from", self.todo[url])
             else:
                 print "Check ", url
         page = self.getpage(url)
@@ -346,17 +354,17 @@ class Checker:
 
     def newdonelink(self, url, origin):
         self.done[url].append(origin)
-        if verbose > 3:
+        if self.verbose > 3:
             print "  Done link", url
 
     def newtodolink(self, url, origin):
         if self.todo.has_key(url):
             self.todo[url].append(origin)
-            if verbose > 3:
+            if self.verbose > 3:
                 print "  Seen todo link", url
         else:
             self.todo[url] = [origin]
-            if verbose > 3:
+            if self.verbose > 3:
                 print "  New todo link", url
 
     def markdone(self, url):
@@ -373,56 +381,79 @@ class Checker:
 
     def getpage(self, url):
         if url[:7] == 'mailto:' or url[:5] == 'news:':
-            if verbose > 1: print " Not checking mailto/news URL"
+            if self.verbose > 1: print " Not checking mailto/news URL"
             return None
         isint = self.inroots(url)
-        if not isint and not self.checkext:
-            if verbose > 1: print " Not checking ext link"
+        if not isint:
+            if not self.checkext:
+                if self.verbose > 1: print " Not checking ext link"
+                return None
+            f = self.openpage(url)
+            if f:
+                self.safeclose(f)
             return None
+        text, nurl = self.readhtml(url)
+        if nurl != url:
+            if self.verbose > 1:
+                print " Redirected to", nurl
+            url = nurl
+        if text:
+            return Page(text, url, verbose=self.verbose, maxpage=self.maxpage)
+
+    def readhtml(self, url):
+        text = None
+        f, url = self.openhtml(url)
+        if f:
+            text = f.read()
+            f.close()
+        return text, url
+
+    def openhtml(self, url):
+        f = self.openpage(url)
+        if f:
+            url = f.geturl()
+            info = f.info()
+            if not self.checkforhtml(info, url):
+                self.safeclose(f)
+                f = None
+        return f, url
+
+    def openpage(self, url):
         try:
-            f = self.urlopener.open(url)
+            return self.urlopener.open(url)
         except IOError, msg:
-            msg = sanitize(msg)
-            if verbose > 0:
+            msg = self.sanitize(msg)
+            if self.verbose > 0:
                 print "Error ", msg
-            if verbose > 0:
-                show(" HREF  ", url, "  from", self.todo[url])
+            if self.verbose > 0:
+                self.show(" HREF  ", url, "  from", self.todo[url])
             self.setbad(url, msg)
             return None
-        if not isint:
-            if verbose > 1: print " Not gathering links from ext URL"
-            safeclose(f)
-            return None
-        nurl = f.geturl()
-        info = f.info()
+
+    def checkforhtml(self, info, url):
         if info.has_key('content-type'):
             ctype = string.lower(info['content-type'])
         else:
-            ctype = None
-        if nurl != url:
-            if verbose > 1:
-                print " Redirected to", nurl
-        if not ctype:
-            ctype, encoding = mimetypes.guess_type(nurl)
-        if ctype != 'text/html':
-            safeclose(f)
-            if verbose > 1:
+            if url[-1:] == "/":
+                return 1
+            ctype, encoding = mimetypes.guess_type(url)
+        if ctype == 'text/html':
+            return 1
+        else:
+            if self.verbose > 1:
                 print " Not HTML, mime type", ctype
-            return None
-        text = f.read()
-        f.close()
-        return Page(text, nurl)
+            return 0
 
     def setgood(self, url):
         if self.bad.has_key(url):
             del self.bad[url]
             self.changed = 1
-            if verbose > 0:
+            if self.verbose > 0:
                 print "(Clear previously seen error)"
 
     def setbad(self, url, msg):
         if self.bad.has_key(url) and self.bad[url] == msg:
-            if verbose > 0:
+            if self.verbose > 0:
                 print "(Seen this error before)"
             return
         self.bad[url] = msg
@@ -444,23 +475,88 @@ class Checker:
         except KeyError:
             self.errors[url] = [triple]
 
+    # The following used to be toplevel functions; they have been
+    # changed into methods so they can be overridden in subclasses.
+
+    def show(self, p1, link, p2, origins):
+        print p1, link
+        i = 0
+        for source, rawlink in origins:
+            i = i+1
+            if i == 2:
+                p2 = ' '*len(p2)
+            print p2, source,
+            if rawlink != link: print "(%s)" % rawlink,
+            print
+
+    def sanitize(self, msg):
+        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
+            # Do the other branch recursively
+            msg.args = self.sanitize(msg.args)
+        elif isinstance(msg, TupleType):
+            if len(msg) >= 4 and msg[0] == 'http error' and \
+               isinstance(msg[3], InstanceType):
+                # Remove the Message instance -- it may contain
+                # a file object which prevents pickling.
+                msg = msg[:3] + msg[4:]
+        return msg
+
+    def safeclose(self, f):
+        try:
+            url = f.geturl()
+        except AttributeError:
+            pass
+        else:
+            if url[:4] == 'ftp:' or url[:7] == 'file://':
+                # Apparently ftp connections don't like to be closed
+                # prematurely...
+                text = f.read()
+        f.close()
+
+    def save_pickle(self, dumpfile=DUMPFILE):
+        if not self.changed:
+            if self.verbose > 0:
+                print
+                print "No need to save checkpoint"
+        elif not dumpfile:
+            if self.verbose > 0:
+                print "No dumpfile, won't save checkpoint"
+        else:
+            if self.verbose > 0:
+                print
+                print "Saving checkpoint to %s ..." % dumpfile
+            newfile = dumpfile + ".new"
+            f = open(newfile, "wb")
+            pickle.dump(self, f)
+            f.close()
+            try:
+                os.unlink(dumpfile)
+            except os.error:
+                pass
+            os.rename(newfile, dumpfile)
+            if self.verbose > 0:
+                print "Done."
+            return 1
+
 
 class Page:
 
-    def __init__(self, text, url):
+    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE):
         self.text = text
         self.url = url
+        self.verbose = verbose
+        self.maxpage = maxpage
 
     def getlinkinfos(self):
         size = len(self.text)
-        if size > maxpage:
-            if verbose > 0:
+        if size > self.maxpage:
+            if self.verbose > 0:
                 print "Skip huge file", self.url
                 print "  (%.0f Kbytes)" % (size*0.001)
             return []
-        if verbose > 2:
+        if self.verbose > 2:
             print "  Parsing", self.url, "(%d bytes)" % size
-        parser = MyHTMLParser()
+        parser = MyHTMLParser(verbose=self.verbose)
         parser.feed(self.text)
         parser.close()
         rawlinks = parser.getlinks()
@@ -529,10 +625,11 @@ class MyURLopener(urllib.FancyURLopener):
 
 class MyHTMLParser(sgmllib.SGMLParser):
 
-    def __init__(self):
+    def __init__(self, verbose=VERBOSE):
         self.base = None
         self.links = {}
-        sgmllib.SGMLParser.__init__ (self)
+        self.myverbose = verbose
+        sgmllib.SGMLParser.__init__(self)
 
     def start_a(self, attributes):
         self.link_attr(attributes, 'href')
@@ -559,7 +656,7 @@ class MyHTMLParser(sgmllib.SGMLParser):
         if name == 'href':
             if value: value = string.strip(value)
             if value:
-                if verbose > 1:
+                if self.myverbose > 1:
                     print "  Base", value
                 self.base = value
 
@@ -570,41 +667,5 @@ class MyHTMLParser(sgmllib.SGMLParser):
         return self.base
 
-
-def show(p1, link, p2, origins):
-    print p1, link
-    i = 0
-    for source, rawlink in origins:
-        i = i+1
-        if i == 2:
-            p2 = ' '*len(p2)
-        print p2, source,
-        if rawlink != link: print "(%s)" % rawlink,
-        print
-
-
-def sanitize(msg):
-    if (type(msg) == TupleType and
-        len(msg) >= 4 and
-        msg[0] == 'http error' and
-        type(msg[3]) == InstanceType):
-        # Remove the Message instance -- it may contain
-        # a file object which prevents pickling.
-        msg = msg[:3] + msg[4:]
-    return msg
-
-
-def safeclose(f):
-    try:
-        url = f.geturl()
-    except AttributeError:
-        pass
-    else:
-        if url[:4] == 'ftp:' or url[:7] == 'file://':
-            # Apparently ftp connections don't like to be closed
-            # prematurely...
-            text = f.read()
-    f.close()
-
 
 if __name__ == '__main__':
     main()
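
The heart of this change is that the old module globals (verbose, maxpage, roundsize) become Checker class attributes, and setflags() validates keyword names against validflags, a tuple of the names bound in the class body up to that point (captured with dir()). That makes the checker usable programmatically rather than only via main(). A minimal sketch of the new calling sequence, assuming webchecker.py is importable as a module and using an illustrative root URL (Python 2, like the file itself):

    import webchecker

    c = webchecker.Checker()
    # setflags() rejects unknown names: a keyword not listed in
    # Checker.validflags raises NameError instead of silently
    # creating a stray attribute.
    c.setflags(checkext=0, verbose=2)
    c.addroot("http://www.example.com/")   # illustrative URL
    c.run()                                # fetches pages over the network
    c.report()
    # save_pickle() returns 1 only when a checkpoint was actually
    # written; load_pickle() restores it in a later session.
    if c.save_pickle("@webchecker.pickle"):
        print "checkpoint saved"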
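The patch's own comment says show(), sanitize(), and safeclose() were turned into methods "so they can be overridden in subclasses." A hypothetical subclass (the name and behavior are mine, not part of the patch) illustrating that hook:

    import webchecker

    class OneLineChecker(webchecker.Checker):
        # Override the reporting hook that replaced the toplevel show():
        # print each link on one line and suppress the origin listing.
        def show(self, p1, link, p2, origins):
            print p1, link, "(%d origins)" % len(origins)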