diff options
Diffstat (limited to 'Tools/webchecker')
-rw-r--r-- | Tools/webchecker/mimetypes.py | 191 | ||||
-rw-r--r-- | Tools/webchecker/robotparser.py | 120 | ||||
-rw-r--r-- | Tools/webchecker/tktools.py | 164 | ||||
-rwxr-xr-x | Tools/webchecker/wcgui.py | 548 | ||||
-rwxr-xr-x | Tools/webchecker/webchecker.py | 758 | ||||
-rwxr-xr-x | Tools/webchecker/websucker.py | 110 |
6 files changed, 850 insertions, 1041 deletions
diff --git a/Tools/webchecker/mimetypes.py b/Tools/webchecker/mimetypes.py deleted file mode 100644 index 0b1748e..0000000 --- a/Tools/webchecker/mimetypes.py +++ /dev/null @@ -1,191 +0,0 @@ -"""Guess the MIME type of a file. - -This module defines one useful function: - -guess_type(url) -- guess the MIME type and encoding of a URL. - -It also contains the following, for tuning the behavior: - -Data: - -knownfiles -- list of files to parse -inited -- flag set when init() has been called -suffixes_map -- dictionary mapping suffixes to suffixes -encodings_map -- dictionary mapping suffixes to encodings -types_map -- dictionary mapping suffixes to types - -Functions: - -init([files]) -- parse a list of files, default knownfiles -read_mime_types(file) -- parse one file, return a dictionary or None - -""" - -import string -import posixpath - -knownfiles = [ - "/usr/local/etc/httpd/conf/mime.types", - "/usr/local/lib/netscape/mime.types", - ] - -inited = 0 - -def guess_type(url): - """Guess the type of a file based on its URL. - - Return value is a tuple (type, encoding) where type is None if the - type can't be guessed (no or unknown suffix) or a string of the - form type/subtype, usable for a MIME Content-type header; and - encoding is None for no encoding or the name of the program used - to encode (e.g. compress or gzip). The mappings are table - driven. Encoding suffixes are case sensitive; type suffixes are - first tried case sensitive, then case insensitive. - - The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped - to ".tar.gz". (This is table-driven too, using the dictionary - suffixes_map). - - """ - if not inited: - init() - base, ext = posixpath.splitext(url) - while suffix_map.has_key(ext): - base, ext = posixpath.splitext(base + suffix_map[ext]) - if encodings_map.has_key(ext): - encoding = encodings_map[ext] - base, ext = posixpath.splitext(base) - else: - encoding = None - if types_map.has_key(ext): - return types_map[ext], encoding - elif types_map.has_key(string.lower(ext)): - return types_map[string.lower(ext)], encoding - else: - return None, encoding - -def init(files=None): - global inited - for file in files or knownfiles: - s = read_mime_types(file) - if s: - for key, value in s.items(): - types_map[key] = value - inited = 1 - -def read_mime_types(file): - try: - f = open(file) - except IOError: - return None - map = {} - while 1: - line = f.readline() - if not line: break - words = string.split(line) - for i in range(len(words)): - if words[i][0] == '#': - del words[i:] - break - if not words: continue - type, suffixes = words[0], words[1:] - for suff in suffixes: - map['.'+suff] = type - f.close() - return map - -suffix_map = { - '.tgz': '.tar.gz', - '.taz': '.tar.gz', - '.tz': '.tar.gz', -} - -encodings_map = { - '.gz': 'gzip', - '.Z': 'compress', - } - -types_map = { - '.a': 'application/octet-stream', - '.ai': 'application/postscript', - '.aif': 'audio/x-aiff', - '.aifc': 'audio/x-aiff', - '.aiff': 'audio/x-aiff', - '.au': 'audio/basic', - '.avi': 'video/x-msvideo', - '.bcpio': 'application/x-bcpio', - '.bin': 'application/octet-stream', - '.cdf': 'application/x-netcdf', - '.cpio': 'application/x-cpio', - '.csh': 'application/x-csh', - '.dll': 'application/octet-stream', - '.dvi': 'application/x-dvi', - '.exe': 'application/octet-stream', - '.eps': 'application/postscript', - '.etx': 'text/x-setext', - '.gif': 'image/gif', - '.gtar': 'application/x-gtar', - '.hdf': 'application/x-hdf', - '.htm': 'text/html', - '.html': 'text/html', - '.shtml': 'text/html', - '.ief': 'image/ief', - '.jpe': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.jpg': 'image/jpeg', - '.latex': 'application/x-latex', - '.man': 'application/x-troff-man', - '.me': 'application/x-troff-me', - '.mif': 'application/x-mif', - '.mov': 'video/quicktime', - '.movie': 'video/x-sgi-movie', - '.mpe': 'video/mpeg', - '.mpeg': 'video/mpeg', - '.mpg': 'video/mpeg', - '.ms': 'application/x-troff-ms', - '.nc': 'application/x-netcdf', - '.o': 'application/octet-stream', - '.obj': 'application/octet-stream', - '.oda': 'application/oda', - '.pbm': 'image/x-portable-bitmap', - '.pdf': 'application/pdf', - '.pgm': 'image/x-portable-graymap', - '.pnm': 'image/x-portable-anymap', - '.png': 'image/png', - '.ppm': 'image/x-portable-pixmap', - '.py': 'text/x-python', - '.pyc': 'application/x-python-code', - '.ps': 'application/postscript', - '.qt': 'video/quicktime', - '.ras': 'image/x-cmu-raster', - '.rgb': 'image/x-rgb', - '.roff': 'application/x-troff', - '.rtf': 'application/rtf', - '.rtx': 'text/richtext', - '.sgm': 'text/x-sgml', - '.sgml': 'text/x-sgml', - '.sh': 'application/x-sh', - '.shar': 'application/x-shar', - '.snd': 'audio/basic', - '.so': 'application/octet-stream', - '.src': 'application/x-wais-source', - '.sv4cpio': 'application/x-sv4cpio', - '.sv4crc': 'application/x-sv4crc', - '.t': 'application/x-troff', - '.tar': 'application/x-tar', - '.tcl': 'application/x-tcl', - '.tex': 'application/x-tex', - '.texi': 'application/x-texinfo', - '.texinfo': 'application/x-texinfo', - '.tif': 'image/tiff', - '.tiff': 'image/tiff', - '.tr': 'application/x-troff', - '.tsv': 'text/tab-separated-values', - '.txt': 'text/plain', - '.ustar': 'application/x-ustar', - '.wav': 'audio/x-wav', - '.xbm': 'image/x-xbitmap', - '.xpm': 'image/x-xpixmap', - '.xwd': 'image/x-xwindowdump', - '.zip': 'application/zip', - } diff --git a/Tools/webchecker/robotparser.py b/Tools/webchecker/robotparser.py index 634c3fe..6f85afa 100644 --- a/Tools/webchecker/robotparser.py +++ b/Tools/webchecker/robotparser.py @@ -9,79 +9,79 @@ fetchability of other URLs. class RobotFileParser: def __init__(self): - self.rules = {} - self.debug = 0 - self.url = '' - self.last_checked = 0 + self.rules = {} + self.debug = 0 + self.url = '' + self.last_checked = 0 def mtime(self): - return self.last_checked + return self.last_checked def modified(self): - import time - self.last_checked = time.time() + import time + self.last_checked = time.time() def set_url(self, url): - self.url = url -## import urlmisc -## self.url = urlmisc.canonical_url(url) + self.url = url +## import urlmisc +## self.url = urlmisc.canonical_url(url) def read(self): - import urllib - self.parse(urllib.urlopen(self.url).readlines()) + import urllib + self.parse(urllib.urlopen(self.url).readlines()) def parse(self, lines): - import regsub, string, regex - active = [] - for line in lines: - if self.debug: print '>', line, - # blank line terminates current record - if not line[:-1]: - active = [] - continue - # remove optional comment and strip line - line = string.strip(line[:string.find(line, '#')]) - if not line: - continue - line = regsub.split(line, ' *: *') - if len(line) == 2: - line[0] = string.lower(line[0]) - if line[0] == 'user-agent': - # this record applies to this user agent - if self.debug: print '>> user-agent:', line[1] - active.append(line[1]) - if not self.rules.has_key(line[1]): - self.rules[line[1]] = [] - elif line[0] == 'disallow': - if line[1]: - if self.debug: print '>> disallow:', line[1] - for agent in active: - self.rules[agent].append(regex.compile(line[1])) - else: - pass - for agent in active: - if self.debug: print '>> allow', agent - self.rules[agent] = [] - else: - if self.debug: print '>> unknown:', line + import regsub, string, regex + active = [] + for line in lines: + if self.debug: print '>', line, + # blank line terminates current record + if not line[:-1]: + active = [] + continue + # remove optional comment and strip line + line = string.strip(line[:string.find(line, '#')]) + if not line: + continue + line = regsub.split(line, ' *: *') + if len(line) == 2: + line[0] = string.lower(line[0]) + if line[0] == 'user-agent': + # this record applies to this user agent + if self.debug: print '>> user-agent:', line[1] + active.append(line[1]) + if not self.rules.has_key(line[1]): + self.rules[line[1]] = [] + elif line[0] == 'disallow': + if line[1]: + if self.debug: print '>> disallow:', line[1] + for agent in active: + self.rules[agent].append(regex.compile(line[1])) + else: + pass + for agent in active: + if self.debug: print '>> allow', agent + self.rules[agent] = [] + else: + if self.debug: print '>> unknown:', line - self.modified() + self.modified() # returns true if agent is allowed to fetch url def can_fetch(self, agent, url): - import urlparse - ag = agent - if not self.rules.has_key(ag): ag = '*' - if not self.rules.has_key(ag): - if self.debug: print '>> allowing', url, 'fetch by', agent - return 1 - path = urlparse.urlparse(url)[2] - for rule in self.rules[ag]: - if rule.match(path) != -1: - if self.debug: print '>> disallowing', url, 'fetch by', agent - return 0 - if self.debug: print '>> allowing', url, 'fetch by', agent - return 1 + import urlparse + ag = agent + if not self.rules.has_key(ag): ag = '*' + if not self.rules.has_key(ag): + if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 + path = urlparse.urlparse(url)[2] + for rule in self.rules[ag]: + if rule.match(path) != -1: + if self.debug: print '>> disallowing', url, 'fetch by', agent + return 0 + if self.debug: print '>> allowing', url, 'fetch by', agent + return 1 def test(): rp = RobotFileParser() @@ -91,7 +91,7 @@ def test(): print rp.rules print rp.can_fetch('*', 'http://www.calendar.com/concerts/') print rp.can_fetch('Musi-Cal-Robot', - 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones') + 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones') print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/') print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001') diff --git a/Tools/webchecker/tktools.py b/Tools/webchecker/tktools.py index 6734530..0db4d49 100644 --- a/Tools/webchecker/tktools.py +++ b/Tools/webchecker/tktools.py @@ -7,8 +7,8 @@ from Tkinter import * def _clear_entry_widget(event): try: - widget = event.widget - widget.delete(0, INSERT) + widget = event.widget + widget.delete(0, INSERT) except: pass def install_keybindings(root): root.bind_class('Entry', '<Control-u>', _clear_entry_widget) @@ -23,12 +23,12 @@ def make_toplevel(master, title=None, class_=None): """ if class_: - widget = Toplevel(master, class_=class_) + widget = Toplevel(master, class_=class_) else: - widget = Toplevel(master) + widget = Toplevel(master) if title: - widget.title(title) - widget.iconname(title) + widget.title(title) + widget.iconname(title) return widget def set_transient(widget, master, relx=0.5, rely=0.3, expose=1): @@ -43,26 +43,26 @@ def set_transient(widget, master, relx=0.5, rely=0.3, expose=1): widget.transient(master) widget.update_idletasks() # Actualize geometry information if master.winfo_ismapped(): - m_width = master.winfo_width() - m_height = master.winfo_height() - m_x = master.winfo_rootx() - m_y = master.winfo_rooty() + m_width = master.winfo_width() + m_height = master.winfo_height() + m_x = master.winfo_rootx() + m_y = master.winfo_rooty() else: - m_width = master.winfo_screenwidth() - m_height = master.winfo_screenheight() - m_x = m_y = 0 + m_width = master.winfo_screenwidth() + m_height = master.winfo_screenheight() + m_x = m_y = 0 w_width = widget.winfo_reqwidth() w_height = widget.winfo_reqheight() x = m_x + (m_width - w_width) * relx y = m_y + (m_height - w_height) * rely widget.geometry("+%d+%d" % (x, y)) if expose: - widget.deiconify() # Become visible at the desired location + widget.deiconify() # Become visible at the desired location return widget def make_scrollbars(parent, hbar, vbar, pack=1, class_=None, name=None, - takefocus=0): + takefocus=0): """Subroutine to create a frame with scrollbars. @@ -76,38 +76,38 @@ def make_scrollbars(parent, hbar, vbar, pack=1, class_=None, name=None, """ if class_: - if name: frame = Frame(parent, class_=class_, name=name) - else: frame = Frame(parent, class_=class_) + if name: frame = Frame(parent, class_=class_, name=name) + else: frame = Frame(parent, class_=class_) else: - if name: frame = Frame(parent, name=name) - else: frame = Frame(parent) + if name: frame = Frame(parent, name=name) + else: frame = Frame(parent) if pack: - frame.pack(fill=BOTH, expand=1) + frame.pack(fill=BOTH, expand=1) corner = None if vbar: - if not hbar: - vbar = Scrollbar(frame, takefocus=takefocus) - vbar.pack(fill=Y, side=RIGHT) - else: - vbarframe = Frame(frame, borderwidth=0) - vbarframe.pack(fill=Y, side=RIGHT) - vbar = Scrollbar(frame, name="vbar", takefocus=takefocus) - vbar.pack(in_=vbarframe, expand=1, fill=Y, side=TOP) - sbwidth = vbar.winfo_reqwidth() - corner = Frame(vbarframe, width=sbwidth, height=sbwidth) - corner.propagate(0) - corner.pack(side=BOTTOM) + if not hbar: + vbar = Scrollbar(frame, takefocus=takefocus) + vbar.pack(fill=Y, side=RIGHT) + else: + vbarframe = Frame(frame, borderwidth=0) + vbarframe.pack(fill=Y, side=RIGHT) + vbar = Scrollbar(frame, name="vbar", takefocus=takefocus) + vbar.pack(in_=vbarframe, expand=1, fill=Y, side=TOP) + sbwidth = vbar.winfo_reqwidth() + corner = Frame(vbarframe, width=sbwidth, height=sbwidth) + corner.propagate(0) + corner.pack(side=BOTTOM) else: - vbar = None + vbar = None if hbar: - hbar = Scrollbar(frame, orient=HORIZONTAL, name="hbar", - takefocus=takefocus) - hbar.pack(fill=X, side=BOTTOM) + hbar = Scrollbar(frame, orient=HORIZONTAL, name="hbar", + takefocus=takefocus) + hbar.pack(fill=X, side=BOTTOM) else: - hbar = None + hbar = None return hbar, vbar, frame @@ -121,20 +121,20 @@ def set_scroll_commands(widget, hbar, vbar): """ if vbar: - widget['yscrollcommand'] = (vbar, 'set') - vbar['command'] = (widget, 'yview') + widget['yscrollcommand'] = (vbar, 'set') + vbar['command'] = (widget, 'yview') if hbar: - widget['xscrollcommand'] = (hbar, 'set') - hbar['command'] = (widget, 'xview') + widget['xscrollcommand'] = (hbar, 'set') + hbar['command'] = (widget, 'xview') widget.vbar = vbar widget.hbar = hbar def make_text_box(parent, width=0, height=0, hbar=0, vbar=1, - fill=BOTH, expand=1, wrap=WORD, pack=1, - class_=None, name=None, takefocus=None): + fill=BOTH, expand=1, wrap=WORD, pack=1, + class_=None, name=None, takefocus=None): """Subroutine to create a text box. @@ -148,8 +148,8 @@ def make_text_box(parent, width=0, height=0, hbar=0, vbar=1, """ hbar, vbar, frame = make_scrollbars(parent, hbar, vbar, pack, - class_=class_, name=name, - takefocus=takefocus) + class_=class_, name=name, + takefocus=takefocus) widget = Text(frame, wrap=wrap, name="text") if width: widget.config(width=width) @@ -162,16 +162,16 @@ def make_text_box(parent, width=0, height=0, hbar=0, vbar=1, def make_list_box(parent, width=0, height=0, hbar=0, vbar=1, - fill=BOTH, expand=1, pack=1, class_=None, name=None, - takefocus=None): + fill=BOTH, expand=1, pack=1, class_=None, name=None, + takefocus=None): """Subroutine to create a list box. Like make_text_box(). """ hbar, vbar, frame = make_scrollbars(parent, hbar, vbar, pack, - class_=class_, name=name, - takefocus=takefocus) + class_=class_, name=name, + takefocus=takefocus) widget = Listbox(frame, name="listbox") if width: widget.config(width=width) @@ -184,8 +184,8 @@ def make_list_box(parent, width=0, height=0, hbar=0, vbar=1, def make_canvas(parent, width=0, height=0, hbar=1, vbar=1, - fill=BOTH, expand=1, pack=1, class_=None, name=None, - takefocus=None): + fill=BOTH, expand=1, pack=1, class_=None, name=None, + takefocus=None): """Subroutine to create a canvas. @@ -194,8 +194,8 @@ def make_canvas(parent, width=0, height=0, hbar=1, vbar=1, """ hbar, vbar, frame = make_scrollbars(parent, hbar, vbar, pack, - class_=class_, name=name, - takefocus=takefocus) + class_=class_, name=name, + takefocus=takefocus) widget = Canvas(frame, scrollregion=(0, 0, width, height), name="canvas") if width: widget.config(width=width) @@ -228,9 +228,9 @@ def make_form_entry(parent, label, borderwidth=None): label.pack(side=LEFT) if borderwidth is None: - entry = Entry(frame, relief=SUNKEN) + entry = Entry(frame, relief=SUNKEN) else: - entry = Entry(frame, relief=SUNKEN, borderwidth=borderwidth) + entry = Entry(frame, relief=SUNKEN, borderwidth=borderwidth) entry.pack(side=LEFT, fill=X, expand=1) return entry, frame @@ -243,8 +243,8 @@ def make_form_entry(parent, label, borderwidth=None): # expandable while still aligning the colons. This doesn't work yet. # def make_labeled_form_entry(parent, label, entrywidth=20, entryheight=1, - labelwidth=0, borderwidth=None, - takefocus=None): + labelwidth=0, borderwidth=None, + takefocus=None): """Subroutine to create a form entry. Create: @@ -261,32 +261,32 @@ def make_labeled_form_entry(parent, label, entrywidth=20, entryheight=1, label = Label(frame, text=label, width=labelwidth, anchor=E) label.pack(side=LEFT) if entryheight == 1: - if borderwidth is None: - entry = Entry(frame, relief=SUNKEN, width=entrywidth) - else: - entry = Entry(frame, relief=SUNKEN, width=entrywidth, - borderwidth=borderwidth) - entry.pack(side=RIGHT, expand=1, fill=X) - frame.pack(fill=X) + if borderwidth is None: + entry = Entry(frame, relief=SUNKEN, width=entrywidth) + else: + entry = Entry(frame, relief=SUNKEN, width=entrywidth, + borderwidth=borderwidth) + entry.pack(side=RIGHT, expand=1, fill=X) + frame.pack(fill=X) else: - entry = make_text_box(frame, entrywidth, entryheight, 1, 1, - takefocus=takefocus) - frame.pack(fill=BOTH, expand=1) + entry = make_text_box(frame, entrywidth, entryheight, 1, 1, + takefocus=takefocus) + frame.pack(fill=BOTH, expand=1) return entry, frame, label def make_double_frame(master=None, class_=None, name=None, relief=RAISED, - borderwidth=1): + borderwidth=1): """Create a pair of frames suitable for 'hosting' a dialog.""" if name: - if class_: frame = Frame(master, class_=class_, name=name) - else: frame = Frame(master, name=name) + if class_: frame = Frame(master, class_=class_, name=name) + else: frame = Frame(master, name=name) else: - if class_: frame = Frame(master, class_=class_) - else: frame = Frame(master) + if class_: frame = Frame(master, class_=class_) + else: frame = Frame(master) top = Frame(frame, name="topframe", relief=relief, - borderwidth=borderwidth) + borderwidth=borderwidth) bottom = Frame(frame, name="bottomframe") bottom.pack(fill=X, padx='1m', pady='1m', side=BOTTOM) top.pack(expand=1, fill=BOTH, padx='1m', pady='1m') @@ -298,7 +298,7 @@ def make_double_frame(master=None, class_=None, name=None, relief=RAISED, def make_group_frame(master, name=None, label=None, fill=Y, - side=None, expand=None, font=None): + side=None, expand=None, font=None): """Create nested frames with a border and optional label. The outer frame is only used to provide the decorative border, to @@ -311,7 +311,7 @@ def make_group_frame(master, name=None, label=None, fill=Y, outer = Frame(master, borderwidth=2, relief=GROOVE) outer.pack(expand=expand, fill=fill, side=side) if label: - Label(outer, text=label, font=font, anchor=W).pack(fill=X) + Label(outer, text=label, font=font, anchor=W).pack(fill=X) inner = Frame(master, borderwidth='1m', name=name) inner.pack(expand=1, fill=BOTH, in_=outer) inner.forget = outer.forget @@ -326,20 +326,20 @@ def unify_button_widths(*buttons): """ wid = 0 for btn in buttons: - wid = max(wid, len(btn["text"])) + wid = max(wid, len(btn["text"])) for btn in buttons: - btn["width"] = wid + btn["width"] = wid def flatten(msg): """Turn a list or tuple into a single string -- recursively.""" t = type(msg) if t in (ListType, TupleType): - msg = string.join(map(flatten, msg)) + msg = string.join(map(flatten, msg)) elif t is ClassType: - msg = msg.__name__ + msg = msg.__name__ else: - msg = str(msg) + msg = str(msg) return msg @@ -356,8 +356,8 @@ def test(): entry, eframe = make_form_entry(root, 'Boolean:') text, tframe = make_text_box(root) def enter(event, entry=entry, text=text): - s = boolean(entry.get()) and '\nyes' or '\nno' - text.insert('end', s) + s = boolean(entry.get()) and '\nyes' or '\nno' + text.insert('end', s) entry.bind('<Return>', enter) entry.insert(END, flatten(sys.argv)) root.mainloop() diff --git a/Tools/webchecker/wcgui.py b/Tools/webchecker/wcgui.py index 027718f..6000829 100755 --- a/Tools/webchecker/wcgui.py +++ b/Tools/webchecker/wcgui.py @@ -72,365 +72,365 @@ if sys.platform == 'mac': def main(): try: - opts, args = getopt.getopt(sys.argv[1:], 'm:qv') + opts, args = getopt.getopt(sys.argv[1:], 'm:qv') except getopt.error, msg: - sys.stdout = sys.stderr - print msg - print __doc__%vars(webchecker) - sys.exit(2) + sys.stdout = sys.stderr + print msg + print __doc__%vars(webchecker) + sys.exit(2) for o, a in opts: - if o == '-m': - webchecker.maxpage = string.atoi(a) - if o == '-q': - webchecker.verbose = 0 - if o == '-v': - webchecker.verbose = webchecker.verbose + 1 + if o == '-m': + webchecker.maxpage = string.atoi(a) + if o == '-q': + webchecker.verbose = 0 + if o == '-v': + webchecker.verbose = webchecker.verbose + 1 root = Tk(className='Webchecker') root.protocol("WM_DELETE_WINDOW", root.quit) c = CheckerWindow(root) if args: - for arg in args[:-1]: - c.addroot(arg) - c.suggestroot(args[-1]) + for arg in args[:-1]: + c.addroot(arg) + c.suggestroot(args[-1]) root.mainloop() class CheckerWindow(webchecker.Checker): def __init__(self, parent, root=webchecker.DEFROOT): - self.__parent = parent - - self.__topcontrols = Frame(parent) - self.__topcontrols.pack(side=TOP, fill=X) - self.__label = Label(self.__topcontrols, text="Root URL:") - self.__label.pack(side=LEFT) - self.__rootentry = Entry(self.__topcontrols, width=60) - self.__rootentry.pack(side=LEFT) - self.__rootentry.bind('<Return>', self.enterroot) - self.__rootentry.focus_set() - - self.__controls = Frame(parent) - self.__controls.pack(side=TOP, fill=X) - self.__running = 0 - self.__start = Button(self.__controls, text="Run", command=self.start) - self.__start.pack(side=LEFT) - self.__stop = Button(self.__controls, text="Stop", command=self.stop, - state=DISABLED) - self.__stop.pack(side=LEFT) - self.__step = Button(self.__controls, text="Check one", - command=self.step) - self.__step.pack(side=LEFT) - self.__cv = BooleanVar(parent) - self.__cv.set(self.checkext) - self.__checkext = Checkbutton(self.__controls, variable=self.__cv, - command=self.update_checkext, - text="Check nonlocal links",) - self.__checkext.pack(side=LEFT) - self.__reset = Button(self.__controls, text="Start over", command=self.reset) - self.__reset.pack(side=LEFT) - if __name__ == '__main__': # No Quit button under Grail! - self.__quit = Button(self.__controls, text="Quit", - command=self.__parent.quit) - self.__quit.pack(side=RIGHT) - - self.__status = Label(parent, text="Status: initial", anchor=W) - self.__status.pack(side=TOP, fill=X) - self.__checking = Label(parent, text="Idle", anchor=W) - self.__checking.pack(side=TOP, fill=X) - self.__mp = mp = MultiPanel(parent) - sys.stdout = self.__log = LogPanel(mp, "Log") - self.__todo = ListPanel(mp, "To check", self.showinfo) - self.__done = ListPanel(mp, "Checked", self.showinfo) - self.__bad = ListPanel(mp, "Bad links", self.showinfo) - self.__errors = ListPanel(mp, "Pages w/ bad links", self.showinfo) - self.__details = LogPanel(mp, "Details") - webchecker.Checker.__init__(self) - if root: - root = string.strip(str(root)) - if root: - self.suggestroot(root) - self.newstatus() + self.__parent = parent + + self.__topcontrols = Frame(parent) + self.__topcontrols.pack(side=TOP, fill=X) + self.__label = Label(self.__topcontrols, text="Root URL:") + self.__label.pack(side=LEFT) + self.__rootentry = Entry(self.__topcontrols, width=60) + self.__rootentry.pack(side=LEFT) + self.__rootentry.bind('<Return>', self.enterroot) + self.__rootentry.focus_set() + + self.__controls = Frame(parent) + self.__controls.pack(side=TOP, fill=X) + self.__running = 0 + self.__start = Button(self.__controls, text="Run", command=self.start) + self.__start.pack(side=LEFT) + self.__stop = Button(self.__controls, text="Stop", command=self.stop, + state=DISABLED) + self.__stop.pack(side=LEFT) + self.__step = Button(self.__controls, text="Check one", + command=self.step) + self.__step.pack(side=LEFT) + self.__cv = BooleanVar(parent) + self.__cv.set(self.checkext) + self.__checkext = Checkbutton(self.__controls, variable=self.__cv, + command=self.update_checkext, + text="Check nonlocal links",) + self.__checkext.pack(side=LEFT) + self.__reset = Button(self.__controls, text="Start over", command=self.reset) + self.__reset.pack(side=LEFT) + if __name__ == '__main__': # No Quit button under Grail! + self.__quit = Button(self.__controls, text="Quit", + command=self.__parent.quit) + self.__quit.pack(side=RIGHT) + + self.__status = Label(parent, text="Status: initial", anchor=W) + self.__status.pack(side=TOP, fill=X) + self.__checking = Label(parent, text="Idle", anchor=W) + self.__checking.pack(side=TOP, fill=X) + self.__mp = mp = MultiPanel(parent) + sys.stdout = self.__log = LogPanel(mp, "Log") + self.__todo = ListPanel(mp, "To check", self.showinfo) + self.__done = ListPanel(mp, "Checked", self.showinfo) + self.__bad = ListPanel(mp, "Bad links", self.showinfo) + self.__errors = ListPanel(mp, "Pages w/ bad links", self.showinfo) + self.__details = LogPanel(mp, "Details") + webchecker.Checker.__init__(self) + if root: + root = string.strip(str(root)) + if root: + self.suggestroot(root) + self.newstatus() def reset(self): - webchecker.Checker.reset(self) - for p in self.__todo, self.__done, self.__bad, self.__errors: - p.clear() + webchecker.Checker.reset(self) + for p in self.__todo, self.__done, self.__bad, self.__errors: + p.clear() def suggestroot(self, root): - self.__rootentry.delete(0, END) - self.__rootentry.insert(END, root) - self.__rootentry.select_range(0, END) + self.__rootentry.delete(0, END) + self.__rootentry.insert(END, root) + self.__rootentry.select_range(0, END) def enterroot(self, event=None): - root = self.__rootentry.get() - root = string.strip(root) - if root: - self.__checking.config(text="Adding root "+root) - self.__checking.update_idletasks() - self.addroot(root) - self.__checking.config(text="Idle") - try: - i = self.__todo.items.index(root) - except (ValueError, IndexError): - pass - else: - self.__todo.list.select_clear(0, END) - self.__todo.list.select_set(i) - self.__todo.list.yview(i) - self.__rootentry.delete(0, END) + root = self.__rootentry.get() + root = string.strip(root) + if root: + self.__checking.config(text="Adding root "+root) + self.__checking.update_idletasks() + self.addroot(root) + self.__checking.config(text="Idle") + try: + i = self.__todo.items.index(root) + except (ValueError, IndexError): + pass + else: + self.__todo.list.select_clear(0, END) + self.__todo.list.select_set(i) + self.__todo.list.yview(i) + self.__rootentry.delete(0, END) def start(self): - self.__start.config(state=DISABLED, relief=SUNKEN) - self.__stop.config(state=NORMAL) - self.__step.config(state=DISABLED) - self.enterroot() - self.__running = 1 - self.go() + self.__start.config(state=DISABLED, relief=SUNKEN) + self.__stop.config(state=NORMAL) + self.__step.config(state=DISABLED) + self.enterroot() + self.__running = 1 + self.go() def stop(self): - self.__stop.config(state=DISABLED, relief=SUNKEN) - self.__running = 0 + self.__stop.config(state=DISABLED, relief=SUNKEN) + self.__running = 0 def step(self): - self.__start.config(state=DISABLED) - self.__step.config(state=DISABLED, relief=SUNKEN) - self.enterroot() - self.__running = 0 - self.dosomething() + self.__start.config(state=DISABLED) + self.__step.config(state=DISABLED, relief=SUNKEN) + self.enterroot() + self.__running = 0 + self.dosomething() def go(self): - if self.__running: - self.__parent.after_idle(self.dosomething) - else: - self.__checking.config(text="Idle") - self.__start.config(state=NORMAL, relief=RAISED) - self.__stop.config(state=DISABLED, relief=RAISED) - self.__step.config(state=NORMAL, relief=RAISED) + if self.__running: + self.__parent.after_idle(self.dosomething) + else: + self.__checking.config(text="Idle") + self.__start.config(state=NORMAL, relief=RAISED) + self.__stop.config(state=DISABLED, relief=RAISED) + self.__step.config(state=NORMAL, relief=RAISED) __busy = 0 def dosomething(self): - if self.__busy: return - self.__busy = 1 - if self.todo: - l = self.__todo.selectedindices() - if l: - i = l[0] - else: - i = 0 - self.__todo.list.select_set(i) - self.__todo.list.yview(i) - url = self.__todo.items[i] - self.__checking.config(text="Checking "+url) - self.__parent.update() - self.dopage(url) - else: - self.stop() - self.__busy = 0 - self.go() + if self.__busy: return + self.__busy = 1 + if self.todo: + l = self.__todo.selectedindices() + if l: + i = l[0] + else: + i = 0 + self.__todo.list.select_set(i) + self.__todo.list.yview(i) + url = self.__todo.items[i] + self.__checking.config(text="Checking "+url) + self.__parent.update() + self.dopage(url) + else: + self.stop() + self.__busy = 0 + self.go() def showinfo(self, url): - d = self.__details - d.clear() - d.put("URL: %s\n" % url) - if self.bad.has_key(url): - d.put("Error: %s\n" % str(self.bad[url])) - if url in self.roots: - d.put("Note: This is a root URL\n") - if self.done.has_key(url): - d.put("Status: checked\n") - o = self.done[url] - elif self.todo.has_key(url): - d.put("Status: to check\n") - o = self.todo[url] - else: - d.put("Status: unknown (!)\n") - o = [] - if self.errors.has_key(url): - d.put("Bad links from this page:\n") - for triple in self.errors[url]: - link, rawlink, msg = triple - d.put(" HREF %s" % link) - if link != rawlink: d.put(" (%s)" %rawlink) - d.put("\n") - d.put(" error %s\n" % str(msg)) - self.__mp.showpanel("Details") - for source, rawlink in o: - d.put("Origin: %s" % source) - if rawlink != url: - d.put(" (%s)" % rawlink) - d.put("\n") - d.text.yview("1.0") + d = self.__details + d.clear() + d.put("URL: %s\n" % url) + if self.bad.has_key(url): + d.put("Error: %s\n" % str(self.bad[url])) + if url in self.roots: + d.put("Note: This is a root URL\n") + if self.done.has_key(url): + d.put("Status: checked\n") + o = self.done[url] + elif self.todo.has_key(url): + d.put("Status: to check\n") + o = self.todo[url] + else: + d.put("Status: unknown (!)\n") + o = [] + if self.errors.has_key(url): + d.put("Bad links from this page:\n") + for triple in self.errors[url]: + link, rawlink, msg = triple + d.put(" HREF %s" % link) + if link != rawlink: d.put(" (%s)" %rawlink) + d.put("\n") + d.put(" error %s\n" % str(msg)) + self.__mp.showpanel("Details") + for source, rawlink in o: + d.put("Origin: %s" % source) + if rawlink != url: + d.put(" (%s)" % rawlink) + d.put("\n") + d.text.yview("1.0") def setbad(self, url, msg): - webchecker.Checker.setbad(self, url, msg) - self.__bad.insert(url) - self.newstatus() + webchecker.Checker.setbad(self, url, msg) + self.__bad.insert(url) + self.newstatus() def setgood(self, url): - webchecker.Checker.setgood(self, url) - self.__bad.remove(url) - self.newstatus() + webchecker.Checker.setgood(self, url) + self.__bad.remove(url) + self.newstatus() def newlink(self, url, origin): - webchecker.Checker.newlink(self, url, origin) - if self.done.has_key(url): - self.__done.insert(url) - elif self.todo.has_key(url): - self.__todo.insert(url) - self.newstatus() + webchecker.Checker.newlink(self, url, origin) + if self.done.has_key(url): + self.__done.insert(url) + elif self.todo.has_key(url): + self.__todo.insert(url) + self.newstatus() def markdone(self, url): - webchecker.Checker.markdone(self, url) - self.__done.insert(url) - self.__todo.remove(url) - self.newstatus() + webchecker.Checker.markdone(self, url) + self.__done.insert(url) + self.__todo.remove(url) + self.newstatus() def seterror(self, url, triple): - webchecker.Checker.seterror(self, url, triple) - self.__errors.insert(url) - self.newstatus() + webchecker.Checker.seterror(self, url, triple) + self.__errors.insert(url) + self.newstatus() def newstatus(self): - self.__status.config(text="Status: "+self.status()) - self.__parent.update() + self.__status.config(text="Status: "+self.status()) + self.__parent.update() def update_checkext(self): - self.checkext = self.__cv.get() + self.checkext = self.__cv.get() class ListPanel: def __init__(self, mp, name, showinfo=None): - self.mp = mp - self.name = name - self.showinfo = showinfo - self.panel = mp.addpanel(name) - self.list, self.frame = tktools.make_list_box( - self.panel, width=60, height=5) - self.list.config(exportselection=0) - if showinfo: - self.list.bind('<Double-Button-1>', self.doubleclick) - self.items = [] + self.mp = mp + self.name = name + self.showinfo = showinfo + self.panel = mp.addpanel(name) + self.list, self.frame = tktools.make_list_box( + self.panel, width=60, height=5) + self.list.config(exportselection=0) + if showinfo: + self.list.bind('<Double-Button-1>', self.doubleclick) + self.items = [] def clear(self): - self.items = [] - self.list.delete(0, END) - self.mp.hidepanel(self.name) + self.items = [] + self.list.delete(0, END) + self.mp.hidepanel(self.name) def doubleclick(self, event): - l = self.selectedindices() - if l: - self.showinfo(self.list.get(l[0])) + l = self.selectedindices() + if l: + self.showinfo(self.list.get(l[0])) def selectedindices(self): - l = self.list.curselection() - if not l: return [] - return map(string.atoi, l) + l = self.list.curselection() + if not l: return [] + return map(string.atoi, l) def insert(self, url): - if url not in self.items: - if not self.items: - self.mp.showpanel(self.name) - # (I tried sorting alphabetically, but the display is too jumpy) - i = len(self.items) - self.list.insert(i, url) - self.list.yview(i) - self.items.insert(i, url) + if url not in self.items: + if not self.items: + self.mp.showpanel(self.name) + # (I tried sorting alphabetically, but the display is too jumpy) + i = len(self.items) + self.list.insert(i, url) + self.list.yview(i) + self.items.insert(i, url) def remove(self, url): - try: - i = self.items.index(url) - except (ValueError, IndexError): - pass - else: - was_selected = i in self.selectedindices() - self.list.delete(i) - del self.items[i] - if not self.items: - self.mp.hidepanel(self.name) - elif was_selected: - if i >= len(self.items): - i = len(self.items) - 1 - self.list.select_set(i) + try: + i = self.items.index(url) + except (ValueError, IndexError): + pass + else: + was_selected = i in self.selectedindices() + self.list.delete(i) + del self.items[i] + if not self.items: + self.mp.hidepanel(self.name) + elif was_selected: + if i >= len(self.items): + i = len(self.items) - 1 + self.list.select_set(i) class LogPanel: def __init__(self, mp, name): - self.mp = mp - self.name = name - self.panel = mp.addpanel(name) - self.text, self.frame = tktools.make_text_box(self.panel, height=10) - self.text.config(wrap=NONE) + self.mp = mp + self.name = name + self.panel = mp.addpanel(name) + self.text, self.frame = tktools.make_text_box(self.panel, height=10) + self.text.config(wrap=NONE) def clear(self): - self.text.delete("1.0", END) - self.text.yview("1.0") + self.text.delete("1.0", END) + self.text.yview("1.0") def put(self, s): - self.text.insert(END, s) - if '\n' in s: - self.text.yview(END) + self.text.insert(END, s) + if '\n' in s: + self.text.yview(END) def write(self, s): - self.text.insert(END, s) - if '\n' in s: - self.text.yview(END) - self.panel.update() + self.text.insert(END, s) + if '\n' in s: + self.text.yview(END) + self.panel.update() class MultiPanel: def __init__(self, parent): - self.parent = parent - self.frame = Frame(self.parent) - self.frame.pack(expand=1, fill=BOTH) - self.topframe = Frame(self.frame, borderwidth=2, relief=RAISED) - self.topframe.pack(fill=X) - self.botframe = Frame(self.frame) - self.botframe.pack(expand=1, fill=BOTH) - self.panelnames = [] - self.panels = {} + self.parent = parent + self.frame = Frame(self.parent) + self.frame.pack(expand=1, fill=BOTH) + self.topframe = Frame(self.frame, borderwidth=2, relief=RAISED) + self.topframe.pack(fill=X) + self.botframe = Frame(self.frame) + self.botframe.pack(expand=1, fill=BOTH) + self.panelnames = [] + self.panels = {} def addpanel(self, name, on=0): - v = StringVar(self.parent) - if on: - v.set(name) - else: - v.set("") - check = Checkbutton(self.topframe, text=name, - offvalue="", onvalue=name, variable=v, - command=self.checkpanel) - check.pack(side=LEFT) - panel = Frame(self.botframe) - label = Label(panel, text=name, borderwidth=2, relief=RAISED, anchor=W) - label.pack(side=TOP, fill=X) - t = v, check, panel - self.panelnames.append(name) - self.panels[name] = t - if on: - panel.pack(expand=1, fill=BOTH) - return panel + v = StringVar(self.parent) + if on: + v.set(name) + else: + v.set("") + check = Checkbutton(self.topframe, text=name, + offvalue="", onvalue=name, variable=v, + command=self.checkpanel) + check.pack(side=LEFT) + panel = Frame(self.botframe) + label = Label(panel, text=name, borderwidth=2, relief=RAISED, anchor=W) + label.pack(side=TOP, fill=X) + t = v, check, panel + self.panelnames.append(name) + self.panels[name] = t + if on: + panel.pack(expand=1, fill=BOTH) + return panel def showpanel(self, name): - v, check, panel = self.panels[name] - v.set(name) - panel.pack(expand=1, fill=BOTH) + v, check, panel = self.panels[name] + v.set(name) + panel.pack(expand=1, fill=BOTH) def hidepanel(self, name): - v, check, panel = self.panels[name] - v.set("") - panel.pack_forget() + v, check, panel = self.panels[name] + v.set("") + panel.pack_forget() def checkpanel(self): - for name in self.panelnames: - v, check, panel = self.panels[name] - panel.pack_forget() - for name in self.panelnames: - v, check, panel = self.panels[name] - if v.get(): - panel.pack(expand=1, fill=BOTH) + for name in self.panelnames: + v, check, panel = self.panels[name] + panel.pack_forget() + for name in self.panelnames: + v, check, panel = self.panels[name] + if v.get(): + panel.pack(expand=1, fill=BOTH) if __name__ == '__main__': diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py index 23dcf80..5459e97 100755 --- a/Tools/webchecker/webchecker.py +++ b/Tools/webchecker/webchecker.py @@ -116,17 +116,17 @@ import robotparser if __version__[0] == '$': _v = string.split(__version__) if len(_v) == 3: - __version__ = _v[1] + __version__ = _v[1] # Tunable parameters -DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL -CHECKEXT = 1 # Check external references (1 deep) -VERBOSE = 1 # Verbosity level (0-3) -MAXPAGE = 150000 # Ignore files bigger than this -ROUNDSIZE = 50 # Number of links processed per round -DUMPFILE = "@webchecker.pickle" # Pickled checkpoint -AGENTNAME = "webchecker" # Agent name for robots.txt parser +DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL +CHECKEXT = 1 # Check external references (1 deep) +VERBOSE = 1 # Verbosity level (0-3) +MAXPAGE = 150000 # Ignore files bigger than this +ROUNDSIZE = 50 # Number of links processed per round +DUMPFILE = "@webchecker.pickle" # Pickled checkpoint +AGENTNAME = "webchecker" # Agent name for robots.txt parser # Global variables @@ -142,76 +142,76 @@ def main(): norun = 0 try: - opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx') + opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:vx') except getopt.error, msg: - sys.stdout = sys.stderr - print msg - print __doc__%globals() - sys.exit(2) + sys.stdout = sys.stderr + print msg + print __doc__%globals() + sys.exit(2) for o, a in opts: - if o == '-R': - restart = 1 - if o == '-d': - dumpfile = a - if o == '-m': - maxpage = string.atoi(a) - if o == '-n': - norun = 1 - if o == '-q': - verbose = 0 - if o == '-r': - roundsize = string.atoi(a) - if o == '-v': - verbose = verbose + 1 - if o == '-x': - checkext = not checkext + if o == '-R': + restart = 1 + if o == '-d': + dumpfile = a + if o == '-m': + maxpage = string.atoi(a) + if o == '-n': + norun = 1 + if o == '-q': + verbose = 0 + if o == '-r': + roundsize = string.atoi(a) + if o == '-v': + verbose = verbose + 1 + if o == '-x': + checkext = not checkext if verbose > 0: - print AGENTNAME, "version", __version__ + print AGENTNAME, "version", __version__ if restart: - c = load_pickle(dumpfile=dumpfile, verbose=verbose) + c = load_pickle(dumpfile=dumpfile, verbose=verbose) else: - c = Checker() + c = Checker() c.setflags(checkext=checkext, verbose=verbose, - maxpage=maxpage, roundsize=roundsize) + maxpage=maxpage, roundsize=roundsize) if not restart and not args: - args.append(DEFROOT) + args.append(DEFROOT) for arg in args: - c.addroot(arg) + c.addroot(arg) if not norun: - try: - c.run() - except KeyboardInterrupt: - if verbose > 0: - print "[run interrupted]" + try: + c.run() + except KeyboardInterrupt: + if verbose > 0: + print "[run interrupted]" try: - c.report() + c.report() except KeyboardInterrupt: - if verbose > 0: - print "[report interrupted]" + if verbose > 0: + print "[report interrupted]" if c.save_pickle(dumpfile): - if dumpfile == DUMPFILE: - print "Use ``%s -R'' to restart." % sys.argv[0] - else: - print "Use ``%s -R -d %s'' to restart." % (sys.argv[0], dumpfile) + if dumpfile == DUMPFILE: + print "Use ``%s -R'' to restart." % sys.argv[0] + else: + print "Use ``%s -R -d %s'' to restart." % (sys.argv[0], dumpfile) def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE): if verbose > 0: - print "Loading checkpoint from %s ..." % dumpfile + print "Loading checkpoint from %s ..." % dumpfile f = open(dumpfile, "rb") c = pickle.load(f) f.close() if verbose > 0: - print "Done." - print "Root:", string.join(c.roots, "\n ") + print "Done." + print "Root:", string.join(c.roots, "\n ") return c @@ -225,364 +225,364 @@ class Checker: validflags = tuple(dir()) def __init__(self): - self.reset() + self.reset() def setflags(self, **kw): - for key in kw.keys(): - if key not in self.validflags: - raise NameError, "invalid keyword argument: %s" % str(key) - for key, value in kw.items(): - setattr(self, key, value) + for key in kw.keys(): + if key not in self.validflags: + raise NameError, "invalid keyword argument: %s" % str(key) + for key, value in kw.items(): + setattr(self, key, value) def reset(self): - self.roots = [] - self.todo = {} - self.done = {} - self.bad = {} - self.round = 0 - # The following are not pickled: - self.robots = {} - self.errors = {} - self.urlopener = MyURLopener() - self.changed = 0 + self.roots = [] + self.todo = {} + self.done = {} + self.bad = {} + self.round = 0 + # The following are not pickled: + self.robots = {} + self.errors = {} + self.urlopener = MyURLopener() + self.changed = 0 def __getstate__(self): - return (self.roots, self.todo, self.done, self.bad, self.round) + return (self.roots, self.todo, self.done, self.bad, self.round) def __setstate__(self, state): - self.reset() - (self.roots, self.todo, self.done, self.bad, self.round) = state - for root in self.roots: - self.addrobot(root) - for url in self.bad.keys(): - self.markerror(url) + self.reset() + (self.roots, self.todo, self.done, self.bad, self.round) = state + for root in self.roots: + self.addrobot(root) + for url in self.bad.keys(): + self.markerror(url) def addroot(self, root): - if root not in self.roots: - troot = root - scheme, netloc, path, params, query, fragment = \ - urlparse.urlparse(root) - i = string.rfind(path, "/") + 1 - if 0 < i < len(path): - path = path[:i] - troot = urlparse.urlunparse((scheme, netloc, path, - params, query, fragment)) - self.roots.append(troot) - self.addrobot(root) - self.newlink(root, ("<root>", root)) + if root not in self.roots: + troot = root + scheme, netloc, path, params, query, fragment = \ + urlparse.urlparse(root) + i = string.rfind(path, "/") + 1 + if 0 < i < len(path): + path = path[:i] + troot = urlparse.urlunparse((scheme, netloc, path, + params, query, fragment)) + self.roots.append(troot) + self.addrobot(root) + self.newlink(root, ("<root>", root)) def addrobot(self, root): - root = urlparse.urljoin(root, "/") - if self.robots.has_key(root): return - url = urlparse.urljoin(root, "/robots.txt") - self.robots[root] = rp = robotparser.RobotFileParser() - if self.verbose > 2: - print "Parsing", url - rp.debug = self.verbose > 3 - rp.set_url(url) - try: - rp.read() - except IOError, msg: - if self.verbose > 1: - print "I/O error parsing", url, ":", msg + root = urlparse.urljoin(root, "/") + if self.robots.has_key(root): return + url = urlparse.urljoin(root, "/robots.txt") + self.robots[root] = rp = robotparser.RobotFileParser() + if self.verbose > 2: + print "Parsing", url + rp.debug = self.verbose > 3 + rp.set_url(url) + try: + rp.read() + except IOError, msg: + if self.verbose > 1: + print "I/O error parsing", url, ":", msg def run(self): - while self.todo: - self.round = self.round + 1 - if self.verbose > 0: - print - print "Round %d (%s)" % (self.round, self.status()) - print - urls = self.todo.keys()[:self.roundsize] - for url in urls: - self.dopage(url) + while self.todo: + self.round = self.round + 1 + if self.verbose > 0: + print + print "Round %d (%s)" % (self.round, self.status()) + print + urls = self.todo.keys()[:self.roundsize] + for url in urls: + self.dopage(url) def status(self): - return "%d total, %d to do, %d done, %d bad" % ( - len(self.todo)+len(self.done), - len(self.todo), len(self.done), - len(self.bad)) + return "%d total, %d to do, %d done, %d bad" % ( + len(self.todo)+len(self.done), + len(self.todo), len(self.done), + len(self.bad)) def report(self): - print - if not self.todo: print "Final", - else: print "Interim", - print "Report (%s)" % self.status() - self.report_errors() + print + if not self.todo: print "Final", + else: print "Interim", + print "Report (%s)" % self.status() + self.report_errors() def report_errors(self): - if not self.bad: - print - print "No errors" - return - print - print "Error Report:" - sources = self.errors.keys() - sources.sort() - for source in sources: - triples = self.errors[source] - print - if len(triples) > 1: - print len(triples), "Errors in", source - else: - print "Error in", source - for url, rawlink, msg in triples: - print " HREF", url, - if rawlink != url: print "(%s)" % rawlink, - print - print " msg", msg + if not self.bad: + print + print "No errors" + return + print + print "Error Report:" + sources = self.errors.keys() + sources.sort() + for source in sources: + triples = self.errors[source] + print + if len(triples) > 1: + print len(triples), "Errors in", source + else: + print "Error in", source + for url, rawlink, msg in triples: + print " HREF", url, + if rawlink != url: print "(%s)" % rawlink, + print + print " msg", msg def dopage(self, url): - if self.verbose > 1: - if self.verbose > 2: - self.show("Check ", url, " from", self.todo[url]) - else: - print "Check ", url - page = self.getpage(url) - if page: - for info in page.getlinkinfos(): - link, rawlink = info - origin = url, rawlink - self.newlink(link, origin) - self.markdone(url) + if self.verbose > 1: + if self.verbose > 2: + self.show("Check ", url, " from", self.todo[url]) + else: + print "Check ", url + page = self.getpage(url) + if page: + for info in page.getlinkinfos(): + link, rawlink = info + origin = url, rawlink + self.newlink(link, origin) + self.markdone(url) def newlink(self, url, origin): - if self.done.has_key(url): - self.newdonelink(url, origin) - else: - self.newtodolink(url, origin) + if self.done.has_key(url): + self.newdonelink(url, origin) + else: + self.newtodolink(url, origin) def newdonelink(self, url, origin): - self.done[url].append(origin) - if self.verbose > 3: - print " Done link", url + self.done[url].append(origin) + if self.verbose > 3: + print " Done link", url def newtodolink(self, url, origin): - if self.todo.has_key(url): - self.todo[url].append(origin) - if self.verbose > 3: - print " Seen todo link", url - else: - self.todo[url] = [origin] - if self.verbose > 3: - print " New todo link", url + if self.todo.has_key(url): + self.todo[url].append(origin) + if self.verbose > 3: + print " Seen todo link", url + else: + self.todo[url] = [origin] + if self.verbose > 3: + print " New todo link", url def markdone(self, url): - self.done[url] = self.todo[url] - del self.todo[url] - self.changed = 1 + self.done[url] = self.todo[url] + del self.todo[url] + self.changed = 1 def inroots(self, url): - for root in self.roots: - if url[:len(root)] == root: - root = urlparse.urljoin(root, "/") - return self.robots[root].can_fetch(AGENTNAME, url) - return 0 + for root in self.roots: + if url[:len(root)] == root: + root = urlparse.urljoin(root, "/") + return self.robots[root].can_fetch(AGENTNAME, url) + return 0 def getpage(self, url): - if url[:7] == 'mailto:' or url[:5] == 'news:': - if self.verbose > 1: print " Not checking mailto/news URL" - return None - isint = self.inroots(url) - if not isint: - if not self.checkext: - if self.verbose > 1: print " Not checking ext link" - return None - f = self.openpage(url) - if f: - self.safeclose(f) - return None - text, nurl = self.readhtml(url) - if nurl != url: - if self.verbose > 1: - print " Redirected to", nurl - url = nurl - if text: - return Page(text, url, verbose=self.verbose, maxpage=self.maxpage) + if url[:7] == 'mailto:' or url[:5] == 'news:': + if self.verbose > 1: print " Not checking mailto/news URL" + return None + isint = self.inroots(url) + if not isint: + if not self.checkext: + if self.verbose > 1: print " Not checking ext link" + return None + f = self.openpage(url) + if f: + self.safeclose(f) + return None + text, nurl = self.readhtml(url) + if nurl != url: + if self.verbose > 1: + print " Redirected to", nurl + url = nurl + if text: + return Page(text, url, verbose=self.verbose, maxpage=self.maxpage) def readhtml(self, url): - text = None - f, url = self.openhtml(url) - if f: - text = f.read() - f.close() - return text, url + text = None + f, url = self.openhtml(url) + if f: + text = f.read() + f.close() + return text, url def openhtml(self, url): - f = self.openpage(url) - if f: - url = f.geturl() - info = f.info() - if not self.checkforhtml(info, url): - self.safeclose(f) - f = None - return f, url + f = self.openpage(url) + if f: + url = f.geturl() + info = f.info() + if not self.checkforhtml(info, url): + self.safeclose(f) + f = None + return f, url def openpage(self, url): - try: - return self.urlopener.open(url) - except IOError, msg: - msg = self.sanitize(msg) - if self.verbose > 0: - print "Error ", msg - if self.verbose > 0: - self.show(" HREF ", url, " from", self.todo[url]) - self.setbad(url, msg) - return None + try: + return self.urlopener.open(url) + except IOError, msg: + msg = self.sanitize(msg) + if self.verbose > 0: + print "Error ", msg + if self.verbose > 0: + self.show(" HREF ", url, " from", self.todo[url]) + self.setbad(url, msg) + return None def checkforhtml(self, info, url): - if info.has_key('content-type'): - ctype = string.lower(info['content-type']) - else: - if url[-1:] == "/": - return 1 - ctype, encoding = mimetypes.guess_type(url) - if ctype == 'text/html': - return 1 - else: - if self.verbose > 1: - print " Not HTML, mime type", ctype - return 0 + if info.has_key('content-type'): + ctype = string.lower(info['content-type']) + else: + if url[-1:] == "/": + return 1 + ctype, encoding = mimetypes.guess_type(url) + if ctype == 'text/html': + return 1 + else: + if self.verbose > 1: + print " Not HTML, mime type", ctype + return 0 def setgood(self, url): - if self.bad.has_key(url): - del self.bad[url] - self.changed = 1 - if self.verbose > 0: - print "(Clear previously seen error)" + if self.bad.has_key(url): + del self.bad[url] + self.changed = 1 + if self.verbose > 0: + print "(Clear previously seen error)" def setbad(self, url, msg): - if self.bad.has_key(url) and self.bad[url] == msg: - if self.verbose > 0: - print "(Seen this error before)" - return - self.bad[url] = msg - self.changed = 1 - self.markerror(url) - + if self.bad.has_key(url) and self.bad[url] == msg: + if self.verbose > 0: + print "(Seen this error before)" + return + self.bad[url] = msg + self.changed = 1 + self.markerror(url) + def markerror(self, url): - try: - origins = self.todo[url] - except KeyError: - origins = self.done[url] - for source, rawlink in origins: - triple = url, rawlink, self.bad[url] - self.seterror(source, triple) + try: + origins = self.todo[url] + except KeyError: + origins = self.done[url] + for source, rawlink in origins: + triple = url, rawlink, self.bad[url] + self.seterror(source, triple) def seterror(self, url, triple): - try: - self.errors[url].append(triple) - except KeyError: - self.errors[url] = [triple] + try: + self.errors[url].append(triple) + except KeyError: + self.errors[url] = [triple] # The following used to be toplevel functions; they have been # changed into methods so they can be overridden in subclasses. def show(self, p1, link, p2, origins): - print p1, link - i = 0 - for source, rawlink in origins: - i = i+1 - if i == 2: - p2 = ' '*len(p2) - print p2, source, - if rawlink != link: print "(%s)" % rawlink, - print + print p1, link + i = 0 + for source, rawlink in origins: + i = i+1 + if i == 2: + p2 = ' '*len(p2) + print p2, source, + if rawlink != link: print "(%s)" % rawlink, + print def sanitize(self, msg): - if isinstance(IOError, ClassType) and isinstance(msg, IOError): - # Do the other branch recursively - msg.args = self.sanitize(msg.args) - elif isinstance(msg, TupleType): - if len(msg) >= 4 and msg[0] == 'http error' and \ - isinstance(msg[3], InstanceType): - # Remove the Message instance -- it may contain - # a file object which prevents pickling. - msg = msg[:3] + msg[4:] - return msg + if isinstance(IOError, ClassType) and isinstance(msg, IOError): + # Do the other branch recursively + msg.args = self.sanitize(msg.args) + elif isinstance(msg, TupleType): + if len(msg) >= 4 and msg[0] == 'http error' and \ + isinstance(msg[3], InstanceType): + # Remove the Message instance -- it may contain + # a file object which prevents pickling. + msg = msg[:3] + msg[4:] + return msg def safeclose(self, f): - try: - url = f.geturl() - except AttributeError: - pass - else: - if url[:4] == 'ftp:' or url[:7] == 'file://': - # Apparently ftp connections don't like to be closed - # prematurely... - text = f.read() - f.close() + try: + url = f.geturl() + except AttributeError: + pass + else: + if url[:4] == 'ftp:' or url[:7] == 'file://': + # Apparently ftp connections don't like to be closed + # prematurely... + text = f.read() + f.close() def save_pickle(self, dumpfile=DUMPFILE): - if not self.changed: - if self.verbose > 0: - print - print "No need to save checkpoint" - elif not dumpfile: - if self.verbose > 0: - print "No dumpfile, won't save checkpoint" - else: - if self.verbose > 0: - print - print "Saving checkpoint to %s ..." % dumpfile - newfile = dumpfile + ".new" - f = open(newfile, "wb") - pickle.dump(self, f) - f.close() - try: - os.unlink(dumpfile) - except os.error: - pass - os.rename(newfile, dumpfile) - if self.verbose > 0: - print "Done." - return 1 + if not self.changed: + if self.verbose > 0: + print + print "No need to save checkpoint" + elif not dumpfile: + if self.verbose > 0: + print "No dumpfile, won't save checkpoint" + else: + if self.verbose > 0: + print + print "Saving checkpoint to %s ..." % dumpfile + newfile = dumpfile + ".new" + f = open(newfile, "wb") + pickle.dump(self, f) + f.close() + try: + os.unlink(dumpfile) + except os.error: + pass + os.rename(newfile, dumpfile) + if self.verbose > 0: + print "Done." + return 1 class Page: def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE): - self.text = text - self.url = url - self.verbose = verbose - self.maxpage = maxpage + self.text = text + self.url = url + self.verbose = verbose + self.maxpage = maxpage def getlinkinfos(self): - size = len(self.text) - if size > self.maxpage: - if self.verbose > 0: - print "Skip huge file", self.url - print " (%.0f Kbytes)" % (size*0.001) - return [] - if self.verbose > 2: - print " Parsing", self.url, "(%d bytes)" % size - parser = MyHTMLParser(verbose=self.verbose) - parser.feed(self.text) - parser.close() - rawlinks = parser.getlinks() - base = urlparse.urljoin(self.url, parser.getbase() or "") - infos = [] - for rawlink in rawlinks: - t = urlparse.urlparse(rawlink) - t = t[:-1] + ('',) - rawlink = urlparse.urlunparse(t) - link = urlparse.urljoin(base, rawlink) - infos.append((link, rawlink)) - return infos + size = len(self.text) + if size > self.maxpage: + if self.verbose > 0: + print "Skip huge file", self.url + print " (%.0f Kbytes)" % (size*0.001) + return [] + if self.verbose > 2: + print " Parsing", self.url, "(%d bytes)" % size + parser = MyHTMLParser(verbose=self.verbose) + parser.feed(self.text) + parser.close() + rawlinks = parser.getlinks() + base = urlparse.urljoin(self.url, parser.getbase() or "") + infos = [] + for rawlink in rawlinks: + t = urlparse.urlparse(rawlink) + t = t[:-1] + ('',) + rawlink = urlparse.urlunparse(t) + link = urlparse.urljoin(base, rawlink) + infos.append((link, rawlink)) + return infos class MyStringIO(StringIO.StringIO): def __init__(self, url, info): - self.__url = url - self.__info = info - StringIO.StringIO.__init__(self) + self.__url = url + self.__info = info + StringIO.StringIO.__init__(self) def info(self): - return self.__info + return self.__info def geturl(self): - return self.__url + return self.__url class MyURLopener(urllib.FancyURLopener): @@ -590,81 +590,81 @@ class MyURLopener(urllib.FancyURLopener): http_error_default = urllib.URLopener.http_error_default def __init__(*args): - self = args[0] - apply(urllib.FancyURLopener.__init__, args) - self.addheaders = [ - ('User-agent', 'Python-webchecker/%s' % __version__), - ] + self = args[0] + apply(urllib.FancyURLopener.__init__, args) + self.addheaders = [ + ('User-agent', 'Python-webchecker/%s' % __version__), + ] def http_error_401(self, url, fp, errcode, errmsg, headers): return None def open_file(self, url): - path = urllib.url2pathname(urllib.unquote(url)) - if path[-1] != os.sep: - url = url + '/' - if os.path.isdir(path): - indexpath = os.path.join(path, "index.html") - if os.path.exists(indexpath): - return self.open_file(url + "index.html") - try: - names = os.listdir(path) - except os.error, msg: - raise IOError, msg, sys.exc_traceback - names.sort() - s = MyStringIO("file:"+url, {'content-type': 'text/html'}) - s.write('<BASE HREF="file:%s">\n' % - urllib.quote(os.path.join(path, ""))) - for name in names: - q = urllib.quote(name) - s.write('<A HREF="%s">%s</A>\n' % (q, q)) - s.seek(0) - return s - return urllib.FancyURLopener.open_file(self, path) + path = urllib.url2pathname(urllib.unquote(url)) + if path[-1] != os.sep: + url = url + '/' + if os.path.isdir(path): + indexpath = os.path.join(path, "index.html") + if os.path.exists(indexpath): + return self.open_file(url + "index.html") + try: + names = os.listdir(path) + except os.error, msg: + raise IOError, msg, sys.exc_traceback + names.sort() + s = MyStringIO("file:"+url, {'content-type': 'text/html'}) + s.write('<BASE HREF="file:%s">\n' % + urllib.quote(os.path.join(path, ""))) + for name in names: + q = urllib.quote(name) + s.write('<A HREF="%s">%s</A>\n' % (q, q)) + s.seek(0) + return s + return urllib.FancyURLopener.open_file(self, path) class MyHTMLParser(sgmllib.SGMLParser): def __init__(self, verbose=VERBOSE): - self.base = None - self.links = {} - self.myverbose = verbose - sgmllib.SGMLParser.__init__(self) + self.base = None + self.links = {} + self.myverbose = verbose + sgmllib.SGMLParser.__init__(self) def start_a(self, attributes): - self.link_attr(attributes, 'href') + self.link_attr(attributes, 'href') def end_a(self): pass def do_area(self, attributes): - self.link_attr(attributes, 'href') + self.link_attr(attributes, 'href') def do_img(self, attributes): - self.link_attr(attributes, 'src', 'lowsrc') + self.link_attr(attributes, 'src', 'lowsrc') def do_frame(self, attributes): - self.link_attr(attributes, 'src') + self.link_attr(attributes, 'src') def link_attr(self, attributes, *args): - for name, value in attributes: - if name in args: - if value: value = string.strip(value) - if value: self.links[value] = None + for name, value in attributes: + if name in args: + if value: value = string.strip(value) + if value: self.links[value] = None def do_base(self, attributes): - for name, value in attributes: - if name == 'href': - if value: value = string.strip(value) - if value: - if self.myverbose > 1: - print " Base", value - self.base = value + for name, value in attributes: + if name == 'href': + if value: value = string.strip(value) + if value: + if self.myverbose > 1: + print " Base", value + self.base = value def getlinks(self): - return self.links.keys() + return self.links.keys() def getbase(self): - return self.base + return self.base if __name__ == '__main__': diff --git a/Tools/webchecker/websucker.py b/Tools/webchecker/websucker.py index 6169446..852df07 100755 --- a/Tools/webchecker/websucker.py +++ b/Tools/webchecker/websucker.py @@ -16,29 +16,29 @@ import webchecker if __version__[0] == '$': _v = string.split(__version__) if len(_v) == 3: - __version__ = _v[1] + __version__ = _v[1] def main(): verbose = webchecker.VERBOSE try: - opts, args = getopt.getopt(sys.argv[1:], "qv") + opts, args = getopt.getopt(sys.argv[1:], "qv") except getopt.error, msg: - print msg - print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..." - return 2 + print msg + print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..." + return 2 for o, a in opts: - if o == "-q": - verbose = 0 - if o == "-v": - verbose = verbose + 1 + if o == "-q": + verbose = 0 + if o == "-v": + verbose = verbose + 1 c = Sucker() c.setflags(verbose=verbose) c.urlopener.addheaders = [ - ('User-agent', 'websucker/%s' % __version__), - ] + ('User-agent', 'websucker/%s' % __version__), + ] for arg in args: - print "Adding root", arg - c.addroot(arg) + print "Adding root", arg + c.addroot(arg) print "Run..." c.run() @@ -47,57 +47,57 @@ class Sucker(webchecker.Checker): checkext = 0 def readhtml(self, url): - text = None - path = self.savefilename(url) - try: - f = open(path, "rb") - except IOError: - f = self.openpage(url) - if f: - info = f.info() - nurl = f.geturl() - if nurl != url: - url = nurl - path = self.savefilename(url) - text = f.read() - f.close() - self.savefile(text, path) - if not self.checkforhtml(info, url): - text = None - else: - if self.checkforhtml({}, url): - text = f.read() - f.close() - return text, url + text = None + path = self.savefilename(url) + try: + f = open(path, "rb") + except IOError: + f = self.openpage(url) + if f: + info = f.info() + nurl = f.geturl() + if nurl != url: + url = nurl + path = self.savefilename(url) + text = f.read() + f.close() + self.savefile(text, path) + if not self.checkforhtml(info, url): + text = None + else: + if self.checkforhtml({}, url): + text = f.read() + f.close() + return text, url def savefile(self, text, path): - dir, base = os.path.split(path) - makedirs(dir) - f = open(path, "wb") - f.write(text) - f.close() - print "saved", path + dir, base = os.path.split(path) + makedirs(dir) + f = open(path, "wb") + f.write(text) + f.close() + print "saved", path def savefilename(self, url): - type, rest = urllib.splittype(url) - host, path = urllib.splithost(rest) - while path[:1] == "/": path = path[1:] - user, host = urllib.splituser(host) - host, port = urllib.splitnport(host) - host = string.lower(host) - path = os.path.join(host, path) - if path[-1] == "/": path = path + "index.html" - if os.sep != "/": - path = string.join(string.split(path, "/"), os.sep) - return path + type, rest = urllib.splittype(url) + host, path = urllib.splithost(rest) + while path[:1] == "/": path = path[1:] + user, host = urllib.splituser(host) + host, port = urllib.splitnport(host) + host = string.lower(host) + path = os.path.join(host, path) + if path[-1] == "/": path = path + "index.html" + if os.sep != "/": + path = string.join(string.split(path, "/"), os.sep) + return path def makedirs(dir): if not dir or os.path.exists(dir): - return + return head, tail = os.path.split(dir) if not tail: - print "Huh? Don't know how to make dir", dir - return + print "Huh? Don't know how to make dir", dir + return makedirs(head) os.mkdir(dir, 0777) |