#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "0.1"

import os
import sys
import string
import urllib
import getopt

import webchecker

verbose = webchecker.verbose
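
# main() keeps the module-level verbose in sync with webchecker.verbose,
# so diagnostics from both modules honor the same -q/-v settings.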
def main():
    global verbose
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-q] [-v] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            webchecker.verbose = verbose = 0
        if o == "-v":
            webchecker.verbose = verbose = verbose + 1
    c = Sucker(0)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()
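
# Sucker reuses webchecker.Checker's crawl loop unchanged; only getpage()
# is overridden, so that every internal page fetched is also written to a
# local file whose path mirrors its URL (see savefilename() below).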
class Sucker(webchecker.Checker):

    # Alas, had to copy this to make one change...
    def getpage(self, url):
        if url[:7] == 'mailto:' or url[:5] == 'news:':
            if verbose > 1: print " Not checking mailto/news URL"
            return None
        isint = self.inroots(url)
        if not isint and not self.checkext:
            if verbose > 1: print " Not checking ext link"
            return None
        path = self.savefilename(url)
        saved = 0
        # Try the local mirror copy first; only on IOError go to the network.
        try:
            f = open(path, "rb")
        except IOError:
            try:
                f = self.urlopener.open(url)
            except IOError, msg:
                msg = webchecker.sanitize(msg)
                if verbose > 0:
                    print "Error", msg
                    webchecker.show(" HREF ", url, " from", self.todo[url])
                self.setbad(url, msg)
                return None
            if not isint:
                if verbose > 1: print " Not gathering links from ext URL"
                webchecker.safeclose(f)
                return None
            nurl = f.geturl()
            if nurl != url:
                path = self.savefilename(nurl)
            info = f.info()
        else:
            # open() succeeded: the page was saved by an earlier run.
            if verbose: print "Loading cached URL", url
            saved = 1
            nurl = url
            info = {}
            if url[-1:] == "/":
                info["content-type"] = "text/html"
        text = f.read()
        if not saved: self.savefile(text, path)
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            ctype = None
        if nurl != url:
            if verbose > 1:
                print " Redirected to", nurl
        if not ctype:
            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
        if ctype != 'text/html':
            webchecker.safeclose(f)
            if verbose > 1:
                print " Not HTML, mime type", ctype
            return None
        f.close()
        return webchecker.Page(text, nurl)
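
    # Because getpage() tries the saved copy before the network, rerunning
    # the script resumes an interrupted mirror without refetching pages
    # that are already on disk.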
    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        f = open(path, "wb")
        f.write(text)
        f.close()
        print "saved", path

    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        while path[:1] == "/": path = path[1:]
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = string.lower(host)
        path = os.path.join(host, path)
        if path[-1:] == "/": path = path + "index.html"
        if os.sep != "/":
            path = string.join(string.split(path, "/"), os.sep)
        return path
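
    # For example (hypothetical URL), savefilename() maps
    #     http://user@www.example.com:80/a/b/
    # to the relative path
    #     www.example.com/a/b/index.html
    # with "/" replaced by os.sep on non-Unix platforms.
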
def makedirs(dir):
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)
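
# Note: unlike os.makedirs(), this helper returns silently when the
# directory already exists, which is the common case while mirroring.
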
if __name__ == '__main__':
    sys.exit(main() or 0)
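
# Example invocation (hypothetical URL; webchecker.py must be importable,
# e.g. located alongside this script):
#
#     python websucker.py -v http://www.example.com/docs/
#
# The mirror is written beneath the current working directory, e.g. as
# www.example.com/docs/index.html.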