diff options
Diffstat (limited to 'Tools/webchecker/websucker.py')
-rwxr-xr-x | Tools/webchecker/websucker.py | 123 |
1 files changed, 0 insertions, 123 deletions
diff --git a/Tools/webchecker/websucker.py b/Tools/webchecker/websucker.py deleted file mode 100755 index 4657b52..0000000 --- a/Tools/webchecker/websucker.py +++ /dev/null @@ -1,123 +0,0 @@ -#! /usr/bin/env python3 - -"""A variant on webchecker that creates a mirror copy of a remote site.""" - -__version__ = "$Revision$" - -import os -import sys -import getopt -import urllib.parse - -import webchecker - -# Extract real version number if necessary -if __version__[0] == '$': - _v = __version__.split() - if len(_v) == 3: - __version__ = _v[1] - -def main(): - verbose = webchecker.VERBOSE - try: - opts, args = getopt.getopt(sys.argv[1:], "qv") - except getopt.error as msg: - print(msg) - print("usage:", sys.argv[0], "[-qv] ... [rooturl] ...") - return 2 - for o, a in opts: - if o == "-q": - verbose = 0 - if o == "-v": - verbose = verbose + 1 - c = Sucker() - c.setflags(verbose=verbose) - c.urlopener.addheaders = [ - ('User-agent', 'websucker/%s' % __version__), - ] - for arg in args: - print("Adding root", arg) - c.addroot(arg) - print("Run...") - c.run() - -class Sucker(webchecker.Checker): - - checkext = 0 - nonames = 1 - - # SAM 11/13/99: in general, URLs are now URL pairs. - # Since we've suppressed name anchor checking, - # we can ignore the second dimension. - - def readhtml(self, url_pair): - url = url_pair[0] - text = None - path = self.savefilename(url) - try: - f = open(path, "rb") - except IOError: - f = self.openpage(url_pair) - if f: - info = f.info() - nurl = f.geturl() - if nurl != url: - url = nurl - path = self.savefilename(url) - text = f.read() - f.close() - self.savefile(text, path) - if not self.checkforhtml(info, url): - text = None - else: - if self.checkforhtml({}, url): - text = f.read() - f.close() - return text, url - - def savefile(self, text, path): - dir, base = os.path.split(path) - makedirs(dir) - try: - f = open(path, "wb") - f.write(text) - f.close() - self.message("saved %s", path) - except IOError as msg: - self.message("didn't save %s: %s", path, str(msg)) - - def savefilename(self, url): - type, rest = urllib.parse.splittype(url) - host, path = urllib.parse.splithost(rest) - path = path.lstrip("/") - user, host = urllib.parse.splituser(host) - host, port = urllib.parse.splitnport(host) - host = host.lower() - if not path or path[-1] == "/": - path = path + "index.html" - if os.sep != "/": - path = os.sep.join(path.split("/")) - path = os.path.join(host, path) - return path - -def makedirs(dir): - if not dir: - return - if os.path.exists(dir): - if not os.path.isdir(dir): - try: - os.rename(dir, dir + ".bak") - os.mkdir(dir) - os.rename(dir + ".bak", os.path.join(dir, "index.html")) - except os.error: - pass - return - head, tail = os.path.split(dir) - if not tail: - print("Huh? Don't know how to make dir", dir) - return - makedirs(head) - os.mkdir(dir, 0o777) - -if __name__ == '__main__': - sys.exit(main() or 0) |