summary | refs | log | tree | commit | diff | stats
path: root/Tools/webchecker
diff options
context:
space:
mode:
Diffstat (limited to 'Tools/webchecker')
-rwxr-xr-x Tools/webchecker/websucker.py 131
1 files changed, 131 insertions, 0 deletions
diff --git a/Tools/webchecker/websucker.py b/Tools/webchecker/websucker.py
new file mode 100755
index 0000000..31cefb2
--- /dev/null
+++ b/Tools/webchecker/websucker.py
@@ -0,0 +1,131 @@
+#! /usr/bin/env python
+
+"""A variant on webchecker that creates a mirror copy of a remote site."""
+
+__version__ = "0.1"
+
+import os
+import sys
+import string
+import urllib
+import getopt
+
+import webchecker
+verbose = webchecker.verbose
+
def main():
    """Parse the command line and mirror every root URL given on it.

    Options:
        -q  set verbosity to 0
        -v  raise verbosity by one (may be repeated)

    Returns 2 after printing a usage message when option parsing fails;
    otherwise falls off the end (None) once the run has completed.
    """
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        # The usage line now lists -q as well; getopt accepts both flags.
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    for opt, _value in opts:
        if opt == "-q":
            verbose = 0
        elif opt == "-v":
            verbose = verbose + 1
        # webchecker keeps its own verbosity flag; keep the two in step.
        webchecker.verbose = verbose
    c = Sucker(0)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % (arg,))
        c.addroot(arg)
    print("Run...")
    c.run()
+
class Sucker(webchecker.Checker):
    """Checker variant that saves every page it downloads to a local mirror.

    Pages are written to ``<host>/<path>`` (see savefilename), relative to
    the current directory.  A copy that is already on disk is read back
    instead of being fetched again.
    """

    # Alas, had to copy this to make one change...
    def getpage(self, url):
        """Return a webchecker.Page for *url*, saving it locally if needed.

        Returns None when the URL is a mailto:/news: link, when it is
        outside the roots and self.checkext is false, when it cannot be
        opened (the failure is recorded with self.setbad), when it was
        fetched from outside the roots, or when its content is not
        text/html.
        """
        if url[:7] == 'mailto:' or url[:5] == 'news:':
            if verbose > 1:
                print(" Not checking mailto/news URL")
            return None
        isint = self.inroots(url)
        if not isint and not self.checkext:
            if verbose > 1:
                print(" Not checking ext link")
            return None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No local copy yet: fetch the page over the network.
            cached = False
            try:
                f = self.urlopener.open(url)
            except IOError as err:
                msg = webchecker.sanitize(err)
                if verbose > 0:
                    print("Error  %s" % (msg,))
                    webchecker.show(" HREF ", url, " from", self.todo[url])
                self.setbad(url, msg)
                return None
            if not isint:
                if verbose > 1:
                    print(" Not gathering links from ext URL")
                # This used to be a bare safeclose(f), a name this module
                # never defines, so this path raised NameError.
                webchecker.safeclose(f)
                return None
            nurl = f.geturl()
            if nurl != url:
                # Redirected: store the page under its final URL's name.
                path = self.savefilename(nurl)
            info = f.info()
        else:
            if verbose:
                print("Loading cached URL %s" % (url,))
            cached = True
            nurl = url
            info = {}
            if url[-1:] == "/":
                # Directory URLs are saved as index.html, hence HTML.
                info["content-type"] = "text/html"
        text = f.read()
        if not cached:
            self.savefile(text, path)
        if 'content-type' in info:
            ctype = info['content-type'].lower()
        else:
            ctype = None
        if nurl != url and verbose > 1:
            print(" Redirected to %s" % (nurl,))
        if not ctype:
            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            webchecker.safeclose(f)
            if verbose > 1:
                print(" Not HTML, mime type %s" % (ctype,))
            return None
        f.close()
        return webchecker.Page(text, nurl)

    def savefile(self, text, path):
        """Write *text* to *path*, creating missing parent directories."""
        dirname, _basename = os.path.split(path)
        makedirs(dirname)
        # The with-block closes the file even if the write fails.
        with open(path, "wb") as f:
            f.write(text)
        print("saved %s" % (path,))

    def savefilename(self, url):
        """Return the local file name under which *url* is mirrored.

        The name is the lower-cased host followed by the URL path, using
        the platform's separator.  A path that is empty or ends in "/"
        gets "index.html" appended.
        """
        _scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        _user, host = urllib.splituser(host)
        host, _port = urllib.splitnport(host)
        host = host.lower()
        # Decide on index.html while the path still uses URL slashes.
        # Testing after os.path.join() missed the bare-host case on
        # platforms whose separator is not "/".
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
        return os.path.join(host, path)
+
def makedirs(dir):
    """Create directory *dir* together with any missing parent directories.

    Does nothing when *dir* is empty or already exists.  New directories
    get os.makedirs' default mode, 0777 before the umask is applied, which
    is the mode the previous hand-written recursion passed explicitly.
    A trailing path separator is accepted.
    """
    if not dir or os.path.exists(dir):
        return
    try:
        os.makedirs(dir)
    except OSError:
        # Something else may have created it between the check and the
        # call.  Only re-raise if the directory is still missing.
        if not os.path.isdir(dir):
            raise
+
if __name__ == '__main__':
    # main() yields 2 on a usage error and None otherwise; None becomes 0.
    exit_status = main()
    sys.exit(exit_status or 0)