Diffstat (limited to 'Tools/webchecker/webchecker.py')
-rwxr-xr-x  Tools/webchecker/webchecker.py  758
1 file changed, 379 insertions, 379 deletions
diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py
index 23dcf80..5459e97 100755
--- a/Tools/webchecker/webchecker.py
+++ b/Tools/webchecker/webchecker.py
@@ -116,17 +116,17 @@ import robotparser
if __version__[0] == '$':
_v = string.split(__version__)
if len(_v) == 3:
- __version__ = _v[1]
+ __version__ = _v[1]
# Tunable parameters
-DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
-CHECKEXT = 1 # Check external references (1 deep)
-VERBOSE = 1 # Verbosity level (0-3)
-MAXPAGE = 150000 # Ignore files bigger than this
-ROUNDSIZE = 50 # Number of links processed per round
-DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
-AGENTNAME = "webchecker" # Agent name for robots.txt parser
+DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
+CHECKEXT = 1 # Check external references (1 deep)
+VERBOSE = 1 # Verbosity level (0-3)
+MAXPAGE = 150000 # Ignore files bigger than this
+ROUNDSIZE = 50 # Number of links processed per round
+DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
+AGENTNAME = "webchecker" # Agent name for robots.txt parser
# Global variables
@@ -142,76 +142,76 @@ def main():
norun = 0
try:
- opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx')
+ opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx')
except getopt.error, msg:
- sys.stdout = sys.stderr
- print msg
- print __doc__%globals()
- sys.exit(2)
+ sys.stdout = sys.stderr
+ print msg
+ print __doc__%globals()
+ sys.exit(2)
for o, a in opts:
- if o == '-R':
- restart = 1
- if o == '-d':
- dumpfile = a
- if o == '-m':
- maxpage = string.atoi(a)
- if o == '-n':
- norun = 1
- if o == '-q':
- verbose = 0
- if o == '-r':
- roundsize = string.atoi(a)
- if o == '-v':
- verbose = verbose + 1
- if o == '-x':
- checkext = not checkext
+ if o == '-R':
+ restart = 1
+ if o == '-d':
+ dumpfile = a
+ if o == '-m':
+ maxpage = string.atoi(a)
+ if o == '-n':
+ norun = 1
+ if o == '-q':
+ verbose = 0
+ if o == '-r':
+ roundsize = string.atoi(a)
+ if o == '-v':
+ verbose = verbose + 1
+ if o == '-x':
+ checkext = not checkext
if verbose > 0:
- print AGENTNAME, "version", __version__
+ print AGENTNAME, "version", __version__
if restart:
- c = load_pickle(dumpfile=dumpfile, verbose=verbose)
+ c = load_pickle(dumpfile=dumpfile, verbose=verbose)
else:
- c = Checker()
+ c = Checker()
c.setflags(checkext=checkext, verbose=verbose,
- maxpage=maxpage, roundsize=roundsize)
+ maxpage=maxpage, roundsize=roundsize)
if not restart and not args:
- args.append(DEFROOT)
+ args.append(DEFROOT)
for arg in args:
- c.addroot(arg)
+ c.addroot(arg)
if not norun:
- try:
- c.run()
- except KeyboardInterrupt:
- if verbose > 0:
- print "[run interrupted]"
+ try:
+ c.run()
+ except KeyboardInterrupt:
+ if verbose > 0:
+ print "[run interrupted]"
try:
- c.report()
+ c.report()
except KeyboardInterrupt:
- if verbose > 0:
- print "[report interrupted]"
+ if verbose > 0:
+ print "[report interrupted]"
if c.save_pickle(dumpfile):
- if dumpfile == DUMPFILE:
- print "Use ``%s -R'' to restart." % sys.argv[0]
- else:
- print "Use ``%s -R -d %s'' to restart." % (sys.argv[0], dumpfile)
+ if dumpfile == DUMPFILE:
+ print "Use ``%s -R'' to restart." % sys.argv[0]
+ else:
+ print "Use ``%s -R -d %s'' to restart." % (sys.argv[0], dumpfile)
def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
if verbose > 0:
- print "Loading checkpoint from %s ..." % dumpfile
+ print "Loading checkpoint from %s ..." % dumpfile
f = open(dumpfile, "rb")
c = pickle.load(f)
f.close()
if verbose > 0:
- print "Done."
- print "Root:", string.join(c.roots, "\n ")
+ print "Done."
+ print "Root:", string.join(c.roots, "\n ")
return c
@@ -225,364 +225,364 @@ class Checker:
validflags = tuple(dir())
def __init__(self):
- self.reset()
+ self.reset()
def setflags(self, **kw):
- for key in kw.keys():
- if key not in self.validflags:
- raise NameError, "invalid keyword argument: %s" % str(key)
- for key, value in kw.items():
- setattr(self, key, value)
+ for key in kw.keys():
+ if key not in self.validflags:
+ raise NameError, "invalid keyword argument: %s" % str(key)
+ for key, value in kw.items():
+ setattr(self, key, value)
def reset(self):
- self.roots = []
- self.todo = {}
- self.done = {}
- self.bad = {}
- self.round = 0
- # The following are not pickled:
- self.robots = {}
- self.errors = {}
- self.urlopener = MyURLopener()
- self.changed = 0
+ self.roots = []
+ self.todo = {}
+ self.done = {}
+ self.bad = {}
+ self.round = 0
+ # The following are not pickled:
+ self.robots = {}
+ self.errors = {}
+ self.urlopener = MyURLopener()
+ self.changed = 0
def __getstate__(self):
- return (self.roots, self.todo, self.done, self.bad, self.round)
+ return (self.roots, self.todo, self.done, self.bad, self.round)
def __setstate__(self, state):
- self.reset()
- (self.roots, self.todo, self.done, self.bad, self.round) = state
- for root in self.roots:
- self.addrobot(root)
- for url in self.bad.keys():
- self.markerror(url)
+ self.reset()
+ (self.roots, self.todo, self.done, self.bad, self.round) = state
+ for root in self.roots:
+ self.addrobot(root)
+ for url in self.bad.keys():
+ self.markerror(url)
def addroot(self, root):
- if root not in self.roots:
- troot = root
- scheme, netloc, path, params, query, fragment = \
- urlparse.urlparse(root)
- i = string.rfind(path, "/") + 1
- if 0 < i < len(path):
- path = path[:i]
- troot = urlparse.urlunparse((scheme, netloc, path,
- params, query, fragment))
- self.roots.append(troot)
- self.addrobot(root)
- self.newlink(root, ("<root>", root))
+ if root not in self.roots:
+ troot = root
+ scheme, netloc, path, params, query, fragment = \
+ urlparse.urlparse(root)
+ i = string.rfind(path, "/") + 1
+ if 0 < i < len(path):
+ path = path[:i]
+ troot = urlparse.urlunparse((scheme, netloc, path,
+ params, query, fragment))
+ self.roots.append(troot)
+ self.addrobot(root)
+ self.newlink(root, ("<root>", root))
def addrobot(self, root):
- root = urlparse.urljoin(root, "/")
- if self.robots.has_key(root): return
- url = urlparse.urljoin(root, "/robots.txt")
- self.robots[root] = rp = robotparser.RobotFileParser()
- if self.verbose > 2:
- print "Parsing", url
- rp.debug = self.verbose > 3
- rp.set_url(url)
- try:
- rp.read()
- except IOError, msg:
- if self.verbose > 1:
- print "I/O error parsing", url, ":", msg
+ root = urlparse.urljoin(root, "/")
+ if self.robots.has_key(root): return
+ url = urlparse.urljoin(root, "/robots.txt")
+ self.robots[root] = rp = robotparser.RobotFileParser()
+ if self.verbose > 2:
+ print "Parsing", url
+ rp.debug = self.verbose > 3
+ rp.set_url(url)
+ try:
+ rp.read()
+ except IOError, msg:
+ if self.verbose > 1:
+ print "I/O error parsing", url, ":", msg
def run(self):
- while self.todo:
- self.round = self.round + 1
- if self.verbose > 0:
- print
- print "Round %d (%s)" % (self.round, self.status())
- print
- urls = self.todo.keys()[:self.roundsize]
- for url in urls:
- self.dopage(url)
+ while self.todo:
+ self.round = self.round + 1
+ if self.verbose > 0:
+ print
+ print "Round %d (%s)" % (self.round, self.status())
+ print
+ urls = self.todo.keys()[:self.roundsize]
+ for url in urls:
+ self.dopage(url)
def status(self):
- return "%d total, %d to do, %d done, %d bad" % (
- len(self.todo)+len(self.done),
- len(self.todo), len(self.done),
- len(self.bad))
+ return "%d total, %d to do, %d done, %d bad" % (
+ len(self.todo)+len(self.done),
+ len(self.todo), len(self.done),
+ len(self.bad))
def report(self):
- print
- if not self.todo: print "Final",
- else: print "Interim",
- print "Report (%s)" % self.status()
- self.report_errors()
+ print
+ if not self.todo: print "Final",
+ else: print "Interim",
+ print "Report (%s)" % self.status()
+ self.report_errors()
def report_errors(self):
- if not self.bad:
- print
- print "No errors"
- return
- print
- print "Error Report:"
- sources = self.errors.keys()
- sources.sort()
- for source in sources:
- triples = self.errors[source]
- print
- if len(triples) > 1:
- print len(triples), "Errors in", source
- else:
- print "Error in", source
- for url, rawlink, msg in triples:
- print " HREF", url,
- if rawlink != url: print "(%s)" % rawlink,
- print
- print " msg", msg
+ if not self.bad:
+ print
+ print "No errors"
+ return
+ print
+ print "Error Report:"
+ sources = self.errors.keys()
+ sources.sort()
+ for source in sources:
+ triples = self.errors[source]
+ print
+ if len(triples) > 1:
+ print len(triples), "Errors in", source
+ else:
+ print "Error in", source
+ for url, rawlink, msg in triples:
+ print " HREF", url,
+ if rawlink != url: print "(%s)" % rawlink,
+ print
+ print " msg", msg
def dopage(self, url):
- if self.verbose > 1:
- if self.verbose > 2:
- self.show("Check ", url, " from", self.todo[url])
- else:
- print "Check ", url
- page = self.getpage(url)
- if page:
- for info in page.getlinkinfos():
- link, rawlink = info
- origin = url, rawlink
- self.newlink(link, origin)
- self.markdone(url)
+ if self.verbose > 1:
+ if self.verbose > 2:
+ self.show("Check ", url, " from", self.todo[url])
+ else:
+ print "Check ", url
+ page = self.getpage(url)
+ if page:
+ for info in page.getlinkinfos():
+ link, rawlink = info
+ origin = url, rawlink
+ self.newlink(link, origin)
+ self.markdone(url)
def newlink(self, url, origin):
- if self.done.has_key(url):
- self.newdonelink(url, origin)
- else:
- self.newtodolink(url, origin)
+ if self.done.has_key(url):
+ self.newdonelink(url, origin)
+ else:
+ self.newtodolink(url, origin)
def newdonelink(self, url, origin):
- self.done[url].append(origin)
- if self.verbose > 3:
- print " Done link", url
+ self.done[url].append(origin)
+ if self.verbose > 3:
+ print " Done link", url
def newtodolink(self, url, origin):
- if self.todo.has_key(url):
- self.todo[url].append(origin)
- if self.verbose > 3:
- print " Seen todo link", url
- else:
- self.todo[url] = [origin]
- if self.verbose > 3:
- print " New todo link", url
+ if self.todo.has_key(url):
+ self.todo[url].append(origin)
+ if self.verbose > 3:
+ print " Seen todo link", url
+ else:
+ self.todo[url] = [origin]
+ if self.verbose > 3:
+ print " New todo link", url
def markdone(self, url):
- self.done[url] = self.todo[url]
- del self.todo[url]
- self.changed = 1
+ self.done[url] = self.todo[url]
+ del self.todo[url]
+ self.changed = 1
def inroots(self, url):
- for root in self.roots:
- if url[:len(root)] == root:
- root = urlparse.urljoin(root, "/")
- return self.robots[root].can_fetch(AGENTNAME, url)
- return 0
+ for root in self.roots:
+ if url[:len(root)] == root:
+ root = urlparse.urljoin(root, "/")
+ return self.robots[root].can_fetch(AGENTNAME, url)
+ return 0
def getpage(self, url):
- if url[:7] == 'mailto:' or url[:5] == 'news:':
- if self.verbose > 1: print " Not checking mailto/news URL"
- return None
- isint = self.inroots(url)
- if not isint:
- if not self.checkext:
- if self.verbose > 1: print " Not checking ext link"
- return None
- f = self.openpage(url)
- if f:
- self.safeclose(f)
- return None
- text, nurl = self.readhtml(url)
- if nurl != url:
- if self.verbose > 1:
- print " Redirected to", nurl
- url = nurl
- if text:
- return Page(text, url, verbose=self.verbose, maxpage=self.maxpage)
+ if url[:7] == 'mailto:' or url[:5] == 'news:':
+ if self.verbose > 1: print " Not checking mailto/news URL"
+ return None
+ isint = self.inroots(url)
+ if not isint:
+ if not self.checkext:
+ if self.verbose > 1: print " Not checking ext link"
+ return None
+ f = self.openpage(url)
+ if f:
+ self.safeclose(f)
+ return None
+ text, nurl = self.readhtml(url)
+ if nurl != url:
+ if self.verbose > 1:
+ print " Redirected to", nurl
+ url = nurl
+ if text:
+ return Page(text, url, verbose=self.verbose, maxpage=self.maxpage)
def readhtml(self, url):
- text = None
- f, url = self.openhtml(url)
- if f:
- text = f.read()
- f.close()
- return text, url
+ text = None
+ f, url = self.openhtml(url)
+ if f:
+ text = f.read()
+ f.close()
+ return text, url
def openhtml(self, url):
- f = self.openpage(url)
- if f:
- url = f.geturl()
- info = f.info()
- if not self.checkforhtml(info, url):
- self.safeclose(f)
- f = None
- return f, url
+ f = self.openpage(url)
+ if f:
+ url = f.geturl()
+ info = f.info()
+ if not self.checkforhtml(info, url):
+ self.safeclose(f)
+ f = None
+ return f, url
def openpage(self, url):
- try:
- return self.urlopener.open(url)
- except IOError, msg:
- msg = self.sanitize(msg)
- if self.verbose > 0:
- print "Error ", msg
- if self.verbose > 0:
- self.show(" HREF ", url, " from", self.todo[url])
- self.setbad(url, msg)
- return None
+ try:
+ return self.urlopener.open(url)
+ except IOError, msg:
+ msg = self.sanitize(msg)
+ if self.verbose > 0:
+ print "Error ", msg
+ if self.verbose > 0:
+ self.show(" HREF ", url, " from", self.todo[url])
+ self.setbad(url, msg)
+ return None
def checkforhtml(self, info, url):
- if info.has_key('content-type'):
- ctype = string.lower(info['content-type'])
- else:
- if url[-1:] == "/":
- return 1
- ctype, encoding = mimetypes.guess_type(url)
- if ctype == 'text/html':
- return 1
- else:
- if self.verbose > 1:
- print " Not HTML, mime type", ctype
- return 0
+ if info.has_key('content-type'):
+ ctype = string.lower(info['content-type'])
+ else:
+ if url[-1:] == "/":
+ return 1
+ ctype, encoding = mimetypes.guess_type(url)
+ if ctype == 'text/html':
+ return 1
+ else:
+ if self.verbose > 1:
+ print " Not HTML, mime type", ctype
+ return 0
def setgood(self, url):
- if self.bad.has_key(url):
- del self.bad[url]
- self.changed = 1
- if self.verbose > 0:
- print "(Clear previously seen error)"
+ if self.bad.has_key(url):
+ del self.bad[url]
+ self.changed = 1
+ if self.verbose > 0:
+ print "(Clear previously seen error)"
def setbad(self, url, msg):
- if self.bad.has_key(url) and self.bad[url] == msg:
- if self.verbose > 0:
- print "(Seen this error before)"
- return
- self.bad[url] = msg
- self.changed = 1
- self.markerror(url)
-
+ if self.bad.has_key(url) and self.bad[url] == msg:
+ if self.verbose > 0:
+ print "(Seen this error before)"
+ return
+ self.bad[url] = msg
+ self.changed = 1
+ self.markerror(url)
+
def markerror(self, url):
- try:
- origins = self.todo[url]
- except KeyError:
- origins = self.done[url]
- for source, rawlink in origins:
- triple = url, rawlink, self.bad[url]
- self.seterror(source, triple)
+ try:
+ origins = self.todo[url]
+ except KeyError:
+ origins = self.done[url]
+ for source, rawlink in origins:
+ triple = url, rawlink, self.bad[url]
+ self.seterror(source, triple)
def seterror(self, url, triple):
- try:
- self.errors[url].append(triple)
- except KeyError:
- self.errors[url] = [triple]
+ try:
+ self.errors[url].append(triple)
+ except KeyError:
+ self.errors[url] = [triple]
# The following used to be toplevel functions; they have been
# changed into methods so they can be overridden in subclasses.
def show(self, p1, link, p2, origins):
- print p1, link
- i = 0
- for source, rawlink in origins:
- i = i+1
- if i == 2:
- p2 = ' '*len(p2)
- print p2, source,
- if rawlink != link: print "(%s)" % rawlink,
- print
+ print p1, link
+ i = 0
+ for source, rawlink in origins:
+ i = i+1
+ if i == 2:
+ p2 = ' '*len(p2)
+ print p2, source,
+ if rawlink != link: print "(%s)" % rawlink,
+ print
def sanitize(self, msg):
- if isinstance(IOError, ClassType) and isinstance(msg, IOError):
- # Do the other branch recursively
- msg.args = self.sanitize(msg.args)
- elif isinstance(msg, TupleType):
- if len(msg) >= 4 and msg[0] == 'http error' and \
- isinstance(msg[3], InstanceType):
- # Remove the Message instance -- it may contain
- # a file object which prevents pickling.
- msg = msg[:3] + msg[4:]
- return msg
+ if isinstance(IOError, ClassType) and isinstance(msg, IOError):
+ # Do the other branch recursively
+ msg.args = self.sanitize(msg.args)
+ elif isinstance(msg, TupleType):
+ if len(msg) >= 4 and msg[0] == 'http error' and \
+ isinstance(msg[3], InstanceType):
+ # Remove the Message instance -- it may contain
+ # a file object which prevents pickling.
+ msg = msg[:3] + msg[4:]
+ return msg
def safeclose(self, f):
- try:
- url = f.geturl()
- except AttributeError:
- pass
- else:
- if url[:4] == 'ftp:' or url[:7] == 'file://':
- # Apparently ftp connections don't like to be closed
- # prematurely...
- text = f.read()
- f.close()
+ try:
+ url = f.geturl()
+ except AttributeError:
+ pass
+ else:
+ if url[:4] == 'ftp:' or url[:7] == 'file://':
+ # Apparently ftp connections don't like to be closed
+ # prematurely...
+ text = f.read()
+ f.close()
def save_pickle(self, dumpfile=DUMPFILE):
- if not self.changed:
- if self.verbose > 0:
- print
- print "No need to save checkpoint"
- elif not dumpfile:
- if self.verbose > 0:
- print "No dumpfile, won't save checkpoint"
- else:
- if self.verbose > 0:
- print
- print "Saving checkpoint to %s ..." % dumpfile
- newfile = dumpfile + ".new"
- f = open(newfile, "wb")
- pickle.dump(self, f)
- f.close()
- try:
- os.unlink(dumpfile)
- except os.error:
- pass
- os.rename(newfile, dumpfile)
- if self.verbose > 0:
- print "Done."
- return 1
+ if not self.changed:
+ if self.verbose > 0:
+ print
+ print "No need to save checkpoint"
+ elif not dumpfile:
+ if self.verbose > 0:
+ print "No dumpfile, won't save checkpoint"
+ else:
+ if self.verbose > 0:
+ print
+ print "Saving checkpoint to %s ..." % dumpfile
+ newfile = dumpfile + ".new"
+ f = open(newfile, "wb")
+ pickle.dump(self, f)
+ f.close()
+ try:
+ os.unlink(dumpfile)
+ except os.error:
+ pass
+ os.rename(newfile, dumpfile)
+ if self.verbose > 0:
+ print "Done."
+ return 1
class Page:
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE):
- self.text = text
- self.url = url
- self.verbose = verbose
- self.maxpage = maxpage
+ self.text = text
+ self.url = url
+ self.verbose = verbose
+ self.maxpage = maxpage
def getlinkinfos(self):
- size = len(self.text)
- if size > self.maxpage:
- if self.verbose > 0:
- print "Skip huge file", self.url
- print " (%.0f Kbytes)" % (size*0.001)
- return []
- if self.verbose > 2:
- print " Parsing", self.url, "(%d bytes)" % size
- parser = MyHTMLParser(verbose=self.verbose)
- parser.feed(self.text)
- parser.close()
- rawlinks = parser.getlinks()
- base = urlparse.urljoin(self.url, parser.getbase() or "")
- infos = []
- for rawlink in rawlinks:
- t = urlparse.urlparse(rawlink)
- t = t[:-1] + ('',)
- rawlink = urlparse.urlunparse(t)
- link = urlparse.urljoin(base, rawlink)
- infos.append((link, rawlink))
- return infos
+ size = len(self.text)
+ if size > self.maxpage:
+ if self.verbose > 0:
+ print "Skip huge file", self.url
+ print " (%.0f Kbytes)" % (size*0.001)
+ return []
+ if self.verbose > 2:
+ print " Parsing", self.url, "(%d bytes)" % size
+ parser = MyHTMLParser(verbose=self.verbose)
+ parser.feed(self.text)
+ parser.close()
+ rawlinks = parser.getlinks()
+ base = urlparse.urljoin(self.url, parser.getbase() or "")
+ infos = []
+ for rawlink in rawlinks:
+ t = urlparse.urlparse(rawlink)
+ t = t[:-1] + ('',)
+ rawlink = urlparse.urlunparse(t)
+ link = urlparse.urljoin(base, rawlink)
+ infos.append((link, rawlink))
+ return infos
class MyStringIO(StringIO.StringIO):
def __init__(self, url, info):
- self.__url = url
- self.__info = info
- StringIO.StringIO.__init__(self)
+ self.__url = url
+ self.__info = info
+ StringIO.StringIO.__init__(self)
def info(self):
- return self.__info
+ return self.__info
def geturl(self):
- return self.__url
+ return self.__url
class MyURLopener(urllib.FancyURLopener):
@@ -590,81 +590,81 @@ class MyURLopener(urllib.FancyURLopener):
http_error_default = urllib.URLopener.http_error_default
def __init__(*args):
- self = args[0]
- apply(urllib.FancyURLopener.__init__, args)
- self.addheaders = [
- ('User-agent', 'Python-webchecker/%s' % __version__),
- ]
+ self = args[0]
+ apply(urllib.FancyURLopener.__init__, args)
+ self.addheaders = [
+ ('User-agent', 'Python-webchecker/%s' % __version__),
+ ]
def http_error_401(self, url, fp, errcode, errmsg, headers):
return None
def open_file(self, url):
- path = urllib.url2pathname(urllib.unquote(url))
- if path[-1] != os.sep:
- url = url + '/'
- if os.path.isdir(path):
- indexpath = os.path.join(path, "index.html")
- if os.path.exists(indexpath):
- return self.open_file(url + "index.html")
- try:
- names = os.listdir(path)
- except os.error, msg:
- raise IOError, msg, sys.exc_traceback
- names.sort()
- s = MyStringIO("file:"+url, {'content-type': 'text/html'})
- s.write('<BASE HREF="file:%s">\n' %
- urllib.quote(os.path.join(path, "")))
- for name in names:
- q = urllib.quote(name)
- s.write('<A HREF="%s">%s</A>\n' % (q, q))
- s.seek(0)
- return s
- return urllib.FancyURLopener.open_file(self, path)
+ path = urllib.url2pathname(urllib.unquote(url))
+ if path[-1] != os.sep:
+ url = url + '/'
+ if os.path.isdir(path):
+ indexpath = os.path.join(path, "index.html")
+ if os.path.exists(indexpath):
+ return self.open_file(url + "index.html")
+ try:
+ names = os.listdir(path)
+ except os.error, msg:
+ raise IOError, msg, sys.exc_traceback
+ names.sort()
+ s = MyStringIO("file:"+url, {'content-type': 'text/html'})
+ s.write('<BASE HREF="file:%s">\n' %
+ urllib.quote(os.path.join(path, "")))
+ for name in names:
+ q = urllib.quote(name)
+ s.write('<A HREF="%s">%s</A>\n' % (q, q))
+ s.seek(0)
+ return s
+ return urllib.FancyURLopener.open_file(self, path)
class MyHTMLParser(sgmllib.SGMLParser):
def __init__(self, verbose=VERBOSE):
- self.base = None
- self.links = {}
- self.myverbose = verbose
- sgmllib.SGMLParser.__init__(self)
+ self.base = None
+ self.links = {}
+ self.myverbose = verbose
+ sgmllib.SGMLParser.__init__(self)
def start_a(self, attributes):
- self.link_attr(attributes, 'href')
+ self.link_attr(attributes, 'href')
def end_a(self): pass
def do_area(self, attributes):
- self.link_attr(attributes, 'href')
+ self.link_attr(attributes, 'href')
def do_img(self, attributes):
- self.link_attr(attributes, 'src', 'lowsrc')
+ self.link_attr(attributes, 'src', 'lowsrc')
def do_frame(self, attributes):
- self.link_attr(attributes, 'src')
+ self.link_attr(attributes, 'src')
def link_attr(self, attributes, *args):
- for name, value in attributes:
- if name in args:
- if value: value = string.strip(value)
- if value: self.links[value] = None
+ for name, value in attributes:
+ if name in args:
+ if value: value = string.strip(value)
+ if value: self.links[value] = None
def do_base(self, attributes):
- for name, value in attributes:
- if name == 'href':
- if value: value = string.strip(value)
- if value:
- if self.myverbose > 1:
- print " Base", value
- self.base = value
+ for name, value in attributes:
+ if name == 'href':
+ if value: value = string.strip(value)
+ if value:
+ if self.myverbose > 1:
+ print " Base", value
+ self.base = value
def getlinks(self):
- return self.links.keys()
+ return self.links.keys()
def getbase(self):
- return self.base
+ return self.base
if __name__ == '__main__':
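
For illustration, a minimal sketch (Python 2, as used throughout the module) of driving the Checker class directly rather than through the command line, mirroring the flag handling shown in main() above. The module name "webchecker" and the root URL are assumptions for the example, not taken from the diff.

# Hypothetical programmatic use of the Checker class defined above.
import webchecker

c = webchecker.Checker()
c.setflags(checkext=0,        # -x: toggle checking of external links
           verbose=2,         # -v/-q: verbosity level (0-3)
           maxpage=150000,    # -m: skip files bigger than this
           roundsize=50)      # -r: links processed per round
c.addroot("http://example.com/")   # placeholder root URL
c.run()                            # crawl until the todo list is empty
c.report()                         # print the error report
c.save_pickle("@webchecker.pickle")   # checkpoint usable with -R/-d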