#! /usr/bin/env python """A variant on webchecker that creates a mirror copy of a remote site.""" __version__ = "0.1" import os import sys import string import urllib import getopt import webchecker verbose = webchecker.verbose def main(): global verbose try: opts, args = getopt.getopt(sys.argv[1:], "qv") except getopt.error, msg: print msg print "usage:", sys.argv[0], "[-v] ... [rooturl] ..." return 2 for o, a in opts: if o == "-q": webchecker.verbose = verbose = 0 if o == "-v": webchecker.verbose = verbose = verbose + 1 c = Sucker(0) c.urlopener.addheaders = [ ('User-agent', 'websucker/%s' % __version__), ] for arg in args: print "Adding root", arg c.addroot(arg) print "Run..." c.run() class Sucker(webchecker.Checker): # Alas, had to copy this to make one change... def getpage(self, url): if url[:7] == 'mailto:' or url[:5] == 'news:': if verbose > 1: print " Not checking mailto/news URL" return None isint = self.inroots(url) if not isint and not self.checkext: if verbose > 1: print " Not checking ext link" return None path = self.savefilename(url) saved = 0 try: f = open(path, "rb") except IOError: try: f = self.urlopener.open(url) except IOError, msg: msg = webchecker.sanitize(msg) if verbose > 0: print "Error ", msg if verbose > 0: webchecker.show(" HREF ", url, " from", self.todo[url]) self.setbad(url, msg) return None if not isint: if verbose > 1: print " Not gathering links from ext URL" safeclose(f) return None nurl = f.geturl() if nurl != url: path = self.savefilename(nurl) info = f.info() else: if verbose: print "Loading cached URL", url saved = 1 nurl = url info = {} if url[-1:] == "/": info["content-type"] = "text/html" text = f.read() if not saved: self.savefile(text, path) if info.has_key('content-type'): ctype = string.lower(info['content-type']) else: ctype = None if nurl != url: if verbose > 1: print " Redirected to", nurl if not ctype: ctype, encoding = webchecker.mimetypes.guess_type(nurl) if ctype != 'text/html': webchecker.safeclose(f) if verbose > 1: print " Not HTML, mime type", ctype return None f.close() return webchecker.Page(text, nurl) def savefile(self, text, path): dir, base = os.path.split(path) makedirs(dir) f = open(path, "wb") f.write(text) f.close() print "saved", path def savefilename(self, url): type, rest = urllib.splittype(url) host, path = urllib.splithost(rest) while path[:1] == "/": path = path[1:] user, host = urllib.splituser(host) host, port = urllib.splitnport(host) host = string.lower(host) path = os.path.join(host, path) if path[-1] == "/": path = path + "index.html" if os.sep != "/": path = string.join(string.split(path, "/"), os.sep) return path def makedirs(dir): if not dir or os.path.exists(dir): return head, tail = os.path.split(dir) if not tail: print "Huh? Don't know how to make dir", dir return makedirs(head) os.mkdir(dir, 0777) if __name__ == '__main__': sys.exit(main() or 0)