author     Guido van Rossum <guido@python.org>  1999-11-17 15:40:08 (GMT)
committer  Guido van Rossum <guido@python.org>  1999-11-17 15:40:08 (GMT)
commit     e284b21457c51d4906fa6185862bf77268ad115e (patch)
tree       82dc456201edce9040d7bf54319d76f1b78725f7
parent     61b95db389965c0746c27c6be2e1a3c54d4309bc (diff)
Integrated Sam Bayer's wcnew.py code. It seems silly to keep two
files. Removed Sam's "SLB" change comments; otherwise this is the
same as wcnew.py.
-rwxr-xr-x  Tools/webchecker/webchecker.py | 231
1 file changed, 185 insertions(+), 46 deletions(-)
diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py
index cbb9f08..5d127cb 100755
--- a/Tools/webchecker/webchecker.py
+++ b/Tools/webchecker/webchecker.py
@@ -1,5 +1,8 @@
 #! /usr/bin/env python

+# Original code by Guido van Rossum; extensive changes by Sam Bayer,
+# including code to check URL fragments.
+
 """Web tree checker.

 This utility is handy to check a subweb of the world-wide web for
@@ -64,14 +67,18 @@ directory) has a built-in table mapping most currently known suffixes,
 and in addition attempts to read the mime.types configuration files in
 the default locations of Netscape and the NCSA HTTP daemon.

-- We follows links indicated by <A>, <FRAME> and <IMG> tags.  We also
+- We follow links indicated by <A>, <FRAME> and <IMG> tags.  We also
 honor the <BASE> tag.

+- We now check internal NAME anchor links, as well as toplevel links.
+
 - Checking external links is now done by default; use -x to *disable*
 this feature.  External links are now checked during normal
 processing.  (XXX The status of a checked link could be categorized
 better.  Later...)

+- If external links are not checked, you can use the -t flag to
+provide specific overrides to -x.

 Usage: webchecker.py [option] ... [rooturl] ...

@@ -83,8 +90,10 @@ Options:
 -n        -- reports only, no checking (use with -R)
 -q        -- quiet operation (also suppresses external links report)
 -r number -- number of links processed per round (default %(ROUNDSIZE)d)
+-t root   -- specify root dir which should be treated as internal (can repeat)
 -v        -- verbose operation; repeating -v will increase verbosity
 -x        -- don't check external links (these are often slow to check)
+-a        -- don't check name anchors

 Arguments:

@@ -127,6 +136,7 @@ MAXPAGE = 150000                        # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
 AGENTNAME = "webchecker"                # Agent name for robots.txt parser
+NONAMES = 0                             # Force name anchor checking


 # Global variables
@@ -142,12 +152,17 @@ def main():
     norun = 0

     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx')
+        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
     except getopt.error, msg:
         sys.stdout = sys.stderr
         print msg
         print __doc__%globals()
         sys.exit(2)
+
+    # The extra_roots variable collects extra roots.
+    extra_roots = []
+    nonames = NONAMES
+
     for o, a in opts:
         if o == '-R':
             restart = 1
@@ -161,6 +176,10 @@ def main():
             verbose = 0
         if o == '-r':
             roundsize = string.atoi(a)
+        if o == '-t':
+            extra_roots.append(a)
+        if o == '-a':
+            nonames = not nonames
         if o == '-v':
             verbose = verbose + 1
         if o == '-x':
@@ -175,7 +194,9 @@ def main():
         c = Checker()

     c.setflags(checkext=checkext, verbose=verbose,
-               maxpage=maxpage, roundsize=roundsize)
+               maxpage=maxpage, roundsize=roundsize,
+               nonames=nonames
+               )

     if not restart and not args:
         args.append(DEFROOT)
@@ -183,6 +204,17 @@ def main():
     for arg in args:
         c.addroot(arg)

+    # The -t flag is only needed if external links are not to be
+    # checked. So -t values are ignored unless -x was specified.
+    if not checkext:
+        for root in extra_roots:
+            # Make sure it's terminated by a slash,
+            # so that addroot doesn't discard the last
+            # directory component.
+            if root[-1] != "/":
+                root = root + "/"
+            c.addroot(root, add_to_do = 0)
+
     try:
         if not norun:
             c.run()
@@ -225,6 +257,7 @@ class Checker:
     verbose = VERBOSE
     maxpage = MAXPAGE
     roundsize = ROUNDSIZE
+    nonames = NONAMES

     validflags = tuple(dir())

@@ -243,19 +276,24 @@ class Checker:
         self.todo = {}
         self.done = {}
         self.bad = {}
+
+        # Add a name table, so that the name URLs can be checked. Also
+        # serves as an implicit cache for which URLs are done.
+        self.name_table = {}
+
         self.round = 0
         # The following are not pickled:
         self.robots = {}
         self.errors = {}
         self.urlopener = MyURLopener()
         self.changed = 0
-    
+
     def note(self, level, format, *args):
         if self.verbose > level:
             if args:
                 format = format%args
             self.message(format)
-    
+
     def message(self, format, *args):
         if args:
             format = format%args
@@ -272,7 +310,7 @@ class Checker:
         for url in self.bad.keys():
             self.markerror(url)

-    def addroot(self, root):
+    def addroot(self, root, add_to_do = 1):
         if root not in self.roots:
             troot = root
             scheme, netloc, path, params, query, fragment = \
@@ -284,7 +322,8 @@ class Checker:
                                          params, query, fragment))
             self.roots.append(troot)
             self.addrobot(root)
-            self.newlink(root, ("<root>", root))
+            if add_to_do:
+                self.newlink((root, ""), ("<root>", root))

     def addrobot(self, root):
         root = urlparse.urljoin(root, "/")
@@ -336,24 +375,53 @@ class Checker:
                 self.message("%d Errors in %s", len(triples), source)
             else:
                 self.message("Error in %s", source)
-            for url, rawlink, msg in triples:
-                if rawlink != url: s = " (%s)" % rawlink
+            # Call self.format_url() instead of referring
+            # to the URL directly, since the URLs in these
+            # triples is now a (URL, fragment) pair. The value
+            # of the "source" variable comes from the list of
+            # origins, and is a URL, not a pair.
+            for url, rawlink, msg in triples:
+                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                 else: s = ""
-                self.message(" HREF %s%s\n msg %s", url, s, msg)
+                self.message(" HREF %s%s\n msg %s",
+                             self.format_url(url), s, msg)
+
+    def dopage(self, url_pair):

-    def dopage(self, url):
+        # All printing of URLs uses format_url(); argument changed to
+        # url_pair for clarity.
         if self.verbose > 1:
             if self.verbose > 2:
-                self.show("Check ", url, " from", self.todo[url])
+                self.show("Check ", self.format_url(url_pair),
+                          " from", self.todo[url_pair])
             else:
-                self.message("Check %s", url)
-        page = self.getpage(url)
+                self.message("Check %s", self.format_url(url_pair))
+        url, local_fragment = url_pair
+        if local_fragment and self.nonames:
+            self.markdone(url_pair)
+            return
+        page = self.getpage(url_pair)
         if page:
+            # Store the page which corresponds to this URL.
+            self.name_table[url] = page
+            # If there is a fragment in this url_pair, and it's not
+            # in the list of names for the page, call setbad(), since
+            # it's a missing anchor.
+            if local_fragment and local_fragment not in page.getnames():
+                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
             for info in page.getlinkinfos():
-                link, rawlink = info
+                # getlinkinfos() now returns the fragment as well,
+                # and we store that fragment here in the "todo" dictionary.
+                link, rawlink, fragment = info
+                # However, we don't want the fragment as the origin, since
+                # the origin is logically a page.
                 origin = url, rawlink
-                self.newlink(link, origin)
-        self.markdone(url)
+                self.newlink((link, fragment), origin)
+        else:
+            # If no page has been created yet, we want to
+            # record that fact.
+            self.name_table[url_pair[0]] = None
+        self.markdone(url_pair)

     def newlink(self, url, origin):
         if self.done.has_key(url):
@@ -362,21 +430,34 @@ class Checker:
             self.newtodolink(url, origin)

     def newdonelink(self, url, origin):
-        self.done[url].append(origin)
-        self.note(3, " Done link %s", url)
+        if origin not in self.done[url]:
+            self.done[url].append(origin)
+
+        # Call self.format_url(), since the URL here
+        # is now a (URL, fragment) pair.
+        self.note(3, " Done link %s", self.format_url(url))
+
+        # Make sure that if it's bad, that the origin gets added.
         if self.bad.has_key(url):
             source, rawlink = origin
             triple = url, rawlink, self.bad[url]
             self.seterror(source, triple)

     def newtodolink(self, url, origin):
+        # Call self.format_url(), since the URL here
+        # is now a (URL, fragment) pair.
         if self.todo.has_key(url):
             if origin not in self.todo[url]:
                 self.todo[url].append(origin)
-            self.note(3, " Seen todo link %s", url)
+            self.note(3, " Seen todo link %s", self.format_url(url))
         else:
             self.todo[url] = [origin]
-            self.note(3, " New todo link %s", url)
+            self.note(3, " New todo link %s", self.format_url(url))
+
+    def format_url(self, url):
+        link, fragment = url
+        if fragment: return link + "#" + fragment
+        else: return link

     def markdone(self, url):
         self.done[url] = self.todo[url]
@@ -388,41 +469,57 @@ class Checker:
             if url[:len(root)] == root:
                 return self.isallowed(root, url)
         return 0
-    
+
     def isallowed(self, root, url):
         root = urlparse.urljoin(root, "/")
         return self.robots[root].can_fetch(AGENTNAME, url)

-    def getpage(self, url):
+    def getpage(self, url_pair):
+        # Incoming argument name is a (URL, fragment) pair.
+        # The page may have been cached in the name_table variable.
+        url, fragment = url_pair
+        if self.name_table.has_key(url):
+            return self.name_table[url]
+
         if url[:7] == 'mailto:' or url[:5] == 'news:':
             self.note(1, " Not checking mailto/news URL")
             return None
         isint = self.inroots(url)
+
+        # Ensure that openpage gets the URL pair to
+        # print out its error message and record the error pair
+        # correctly.
         if not isint:
             if not self.checkext:
                 self.note(1, " Not checking ext link")
                 return None
-            f = self.openpage(url)
+            f = self.openpage(url_pair)
             if f: self.safeclose(f)
             return None
-        text, nurl = self.readhtml(url)
+        text, nurl = self.readhtml(url_pair)
+
         if nurl != url:
             self.note(1, " Redirected to %s", nurl)
             url = nurl
         if text:
             return Page(text, url, maxpage=self.maxpage, checker=self)

-    def readhtml(self, url):
+    # These next three functions take (URL, fragment) pairs as
+    # arguments, so that openpage() receives the appropriate tuple to
+    # record error messages.
+    def readhtml(self, url_pair):
+        url, fragment = url_pair
         text = None
-        f, url = self.openhtml(url)
+        f, url = self.openhtml(url_pair)
         if f:
             text = f.read()
             f.close()
         return text, url

-    def openhtml(self, url):
-        f = self.openpage(url)
+    def openhtml(self, url_pair):
+        url, fragment = url_pair
+        f = self.openpage(url_pair)
         if f:
             url = f.geturl()
             info = f.info()
@@ -431,15 +528,16 @@ class Checker:
             f = None
         return f, url

-    def openpage(self, url):
+    def openpage(self, url_pair):
+        url, fragment = url_pair
         try:
             return self.urlopener.open(url)
         except IOError, msg:
             msg = self.sanitize(msg)
             self.note(0, "Error %s", msg)
             if self.verbose > 0:
-                self.show(" HREF ", url, " from", self.todo[url])
-            self.setbad(url, msg)
+                self.show(" HREF ", url, " from", self.todo[url_pair])
+            self.setbad(url_pair, msg)
             return None

     def checkforhtml(self, info, url):
@@ -468,7 +566,7 @@ class Checker:
         self.bad[url] = msg
         self.changed = 1
         self.markerror(url)
-    
+
     def markerror(self, url):
         try:
             origins = self.todo[url]
@@ -480,7 +578,13 @@ class Checker:

     def seterror(self, url, triple):
         try:
-            self.errors[url].append(triple)
+            # Because of the way the URLs are now processed, I need to
+            # check to make sure the URL hasn't been entered in the
+            # error list. The first element of the triple here is a
+            # (URL, fragment) pair, but the URL key is not, since it's
+            # from the list of origins.
+            if triple not in self.errors[url]:
+                self.errors[url].append(triple)
         except KeyError:
             self.errors[url] = [triple]

@@ -551,6 +655,21 @@ class Page:
         self.maxpage = maxpage
         self.checker = checker

+        # The parsing of the page is done in the __init__() routine in
+        # order to initialize the list of names the file
+        # contains. Stored the parser in an instance variable. Passed
+        # the URL to MyHTMLParser().
+        size = len(self.text)
+        if size > self.maxpage:
+            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
+            self.parser = None
+            return
+        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
+        self.parser = MyHTMLParser(url, verbose=self.verbose,
+                                   checker=self.checker)
+        self.parser.feed(self.text)
+        self.parser.close()
+
     def note(self, level, msg, *args):
         if self.checker:
             apply(self.checker.note, (level, msg) + args)
@@ -560,24 +679,30 @@ class Page:
             msg = msg%args
         print msg

+    # Method to retrieve names.
+    def getnames(self):
+        return self.parser.names
+
     def getlinkinfos(self):
-        size = len(self.text)
-        if size > self.maxpage:
-            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
-            return []
-        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
-        parser = MyHTMLParser(verbose=self.verbose, checker=self.checker)
-        parser.feed(self.text)
-        parser.close()
-        rawlinks = parser.getlinks()
-        base = urlparse.urljoin(self.url, parser.getbase() or "")
+        # File reading is done in __init__() routine. Store parser in
+        # local variable to indicate success of parsing.
+
+        # If no parser was stored, fail.
+        if not self.parser: return []
+
+        rawlinks = self.parser.getlinks()
+        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
         infos = []
         for rawlink in rawlinks:
             t = urlparse.urlparse(rawlink)
+            # DON'T DISCARD THE FRAGMENT! Instead, include
+            # it in the tuples which are returned. See Checker.dopage().
+            fragment = t[-1]
             t = t[:-1] + ('',)
             rawlink = urlparse.urlunparse(t)
             link = urlparse.urljoin(base, rawlink)
-            infos.append((link, rawlink))
+            infos.append((link, rawlink, fragment))
+
         return infos

@@ -635,16 +760,30 @@ class MyURLopener(urllib.FancyURLopener):

 class MyHTMLParser(sgmllib.SGMLParser):

-    def __init__(self, verbose=VERBOSE, checker=None):
+    def __init__(self, url, verbose=VERBOSE, checker=None):
         self.myverbose = verbose # now unused
         self.checker = checker
         self.base = None
         self.links = {}
+        self.names = []
+        self.url = url
         sgmllib.SGMLParser.__init__(self)

     def start_a(self, attributes):
         self.link_attr(attributes, 'href')

+        # We must rescue the NAME
+        # attributes from the anchor, in order to
+        # cache the internal anchors which are made
+        # available in the page.
+        for name, value in attributes:
+            if name == "name":
+                if value in self.names:
+                    self.checker.message("WARNING: duplicate name %s in %s",
+                                         value, self.url)
+                else: self.names.append(value)
+                break
+
     def end_a(self): pass

     def do_area(self, attributes):
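For readers skimming the patch, the core idea is that every link is now carried around as a (URL, fragment) pair: the fragment is split off before fetching, and once the page is parsed its NAME anchors are compared against that fragment. The sketch below is a rough, self-contained illustration of that flow, not code from this commit: it assumes a modern Python standard library (urllib.parse and html.parser in place of the urlparse and sgmllib modules used here), and the helper names (NameCollector, check_fragment, the example.com URLs) are made up for the example.

# Illustrative sketch only -- not part of this commit.  It mirrors the idea of
# Checker.dopage()/Page.getnames() using the modern stdlib (urllib.parse,
# html.parser) instead of the urlparse and sgmllib modules used in 1999.
from html.parser import HTMLParser
from urllib.parse import urldefrag, urljoin


class NameCollector(HTMLParser):
    """Collect the NAME anchors (<a name="...">) defined by a page."""

    def __init__(self):
        super().__init__()
        self.names = []

    def handle_starttag(self, tag, attrs):
        # Only <a name="..."> anchors count, matching the original checker.
        if tag == "a":
            for key, value in attrs:
                if key == "name":
                    self.names.append(value)


def check_fragment(base_url, rawlink, page_text):
    """Split the fragment off a raw link and verify it names an anchor."""
    fragment = urldefrag(urljoin(base_url, rawlink)).fragment
    if not fragment:
        return True          # plain page link, nothing extra to check
    collector = NameCollector()
    collector.feed(page_text)
    return fragment in collector.names


if __name__ == "__main__":
    html = '<html><body><a name="intro">Intro</a></body></html>'
    print(check_fragment("http://example.com/", "page.html#intro", html))    # True
    print(check_fragment("http://example.com/", "page.html#missing", html))  # False

In the patch itself the same split happens in Page.getlinkinfos(), which now keeps the fragment instead of discarding it, and the membership test lives in Checker.dopage(), which marks the pair bad with a "Missing name anchor" error when the fragment is absent.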