diff options
Diffstat (limited to 'Tools')
-rw-r--r-- | Tools/webchecker/README | 23 | ||||
-rw-r--r-- | Tools/webchecker/tktools.py | 366 | ||||
-rwxr-xr-x | Tools/webchecker/wcgui.py | 456 | ||||
-rw-r--r-- | Tools/webchecker/wcmac.py | 9 | ||||
-rwxr-xr-x | Tools/webchecker/webchecker.py | 890 | ||||
-rwxr-xr-x | Tools/webchecker/websucker.py | 123 | ||||
-rwxr-xr-x | Tools/webchecker/wsgui.py | 240 |
7 files changed, 0 insertions, 2107 deletions
diff --git a/Tools/webchecker/README b/Tools/webchecker/README deleted file mode 100644 index a51bb3d..0000000 --- a/Tools/webchecker/README +++ /dev/null @@ -1,23 +0,0 @@ -Webchecker ----------- - -This is a simple web tree checker, useful to find bad links in a web -tree. It currently checks links pointing within the same subweb for -validity. The main program is "webchecker.py". See its doc string -(or invoke it with the option "-?") for more defails. - -History: - -- Jan 1997. First release. The module robotparser.py was written by -Skip Montanaro; the rest is original work by Guido van Rossum. - -- May 1999. Sam Bayer contributed a new version, wcnew.py, which -supports checking internal links (#spam fragments in URLs) and some -other options. - -- Nov 1999. Sam Bayer contributed patches to reintegrate wcnew.py -into webchecker.py, and corresponding mods to wcgui.py and -websucker.py. - -- Mar 2004. Chris Herborth contributed a patch to let webchecker.py -handle XHTML's 'id' attribute. diff --git a/Tools/webchecker/tktools.py b/Tools/webchecker/tktools.py deleted file mode 100644 index 3a68f9a..0000000 --- a/Tools/webchecker/tktools.py +++ /dev/null @@ -1,366 +0,0 @@ -"""Assorted Tk-related subroutines used in Grail.""" - - -from types import * -from Tkinter import * - -def _clear_entry_widget(event): - try: - widget = event.widget - widget.delete(0, INSERT) - except: pass -def install_keybindings(root): - root.bind_class('Entry', '<Control-u>', _clear_entry_widget) - - -def make_toplevel(master, title=None, class_=None): - """Create a Toplevel widget. - - This is a shortcut for a Toplevel() instantiation plus calls to - set the title and icon name of the widget. - - """ - - if class_: - widget = Toplevel(master, class_=class_) - else: - widget = Toplevel(master) - if title: - widget.title(title) - widget.iconname(title) - return widget - -def set_transient(widget, master, relx=0.5, rely=0.3, expose=1): - """Make an existing toplevel widget transient for a master. - - The widget must exist but should not yet have been placed; in - other words, this should be called after creating all the - subwidget but before letting the user interact. - """ - - widget.withdraw() # Remain invisible while we figure out the geometry - widget.transient(master) - widget.update_idletasks() # Actualize geometry information - if master.winfo_ismapped(): - m_width = master.winfo_width() - m_height = master.winfo_height() - m_x = master.winfo_rootx() - m_y = master.winfo_rooty() - else: - m_width = master.winfo_screenwidth() - m_height = master.winfo_screenheight() - m_x = m_y = 0 - w_width = widget.winfo_reqwidth() - w_height = widget.winfo_reqheight() - x = m_x + (m_width - w_width) * relx - y = m_y + (m_height - w_height) * rely - widget.geometry("+%d+%d" % (x, y)) - if expose: - widget.deiconify() # Become visible at the desired location - return widget - - -def make_scrollbars(parent, hbar, vbar, pack=1, class_=None, name=None, - takefocus=0): - - """Subroutine to create a frame with scrollbars. - - This is used by make_text_box and similar routines. - - Note: the caller is responsible for setting the x/y scroll command - properties (e.g. by calling set_scroll_commands()). - - Return a tuple containing the hbar, the vbar, and the frame, where - hbar and vbar are None if not requested. - - """ - if class_: - if name: frame = Frame(parent, class_=class_, name=name) - else: frame = Frame(parent, class_=class_) - else: - if name: frame = Frame(parent, name=name) - else: frame = Frame(parent) - - if pack: - frame.pack(fill=BOTH, expand=1) - - corner = None - if vbar: - if not hbar: - vbar = Scrollbar(frame, takefocus=takefocus) - vbar.pack(fill=Y, side=RIGHT) - else: - vbarframe = Frame(frame, borderwidth=0) - vbarframe.pack(fill=Y, side=RIGHT) - vbar = Scrollbar(frame, name="vbar", takefocus=takefocus) - vbar.pack(in_=vbarframe, expand=1, fill=Y, side=TOP) - sbwidth = vbar.winfo_reqwidth() - corner = Frame(vbarframe, width=sbwidth, height=sbwidth) - corner.propagate(0) - corner.pack(side=BOTTOM) - else: - vbar = None - - if hbar: - hbar = Scrollbar(frame, orient=HORIZONTAL, name="hbar", - takefocus=takefocus) - hbar.pack(fill=X, side=BOTTOM) - else: - hbar = None - - return hbar, vbar, frame - - -def set_scroll_commands(widget, hbar, vbar): - - """Link a scrollable widget to its scroll bars. - - The scroll bars may be empty. - - """ - - if vbar: - widget['yscrollcommand'] = (vbar, 'set') - vbar['command'] = (widget, 'yview') - - if hbar: - widget['xscrollcommand'] = (hbar, 'set') - hbar['command'] = (widget, 'xview') - - widget.vbar = vbar - widget.hbar = hbar - - -def make_text_box(parent, width=0, height=0, hbar=0, vbar=1, - fill=BOTH, expand=1, wrap=WORD, pack=1, - class_=None, name=None, takefocus=None): - - """Subroutine to create a text box. - - Create: - - a both-ways filling and expanding frame, containing: - - a text widget on the left, and - - possibly a vertical scroll bar on the right. - - possibly a horizonta; scroll bar at the bottom. - - Return the text widget and the frame widget. - - """ - hbar, vbar, frame = make_scrollbars(parent, hbar, vbar, pack, - class_=class_, name=name, - takefocus=takefocus) - - widget = Text(frame, wrap=wrap, name="text") - if width: widget.config(width=width) - if height: widget.config(height=height) - widget.pack(expand=expand, fill=fill, side=LEFT) - - set_scroll_commands(widget, hbar, vbar) - - return widget, frame - - -def make_list_box(parent, width=0, height=0, hbar=0, vbar=1, - fill=BOTH, expand=1, pack=1, class_=None, name=None, - takefocus=None): - - """Subroutine to create a list box. - - Like make_text_box(). - """ - hbar, vbar, frame = make_scrollbars(parent, hbar, vbar, pack, - class_=class_, name=name, - takefocus=takefocus) - - widget = Listbox(frame, name="listbox") - if width: widget.config(width=width) - if height: widget.config(height=height) - widget.pack(expand=expand, fill=fill, side=LEFT) - - set_scroll_commands(widget, hbar, vbar) - - return widget, frame - - -def make_canvas(parent, width=0, height=0, hbar=1, vbar=1, - fill=BOTH, expand=1, pack=1, class_=None, name=None, - takefocus=None): - - """Subroutine to create a canvas. - - Like make_text_box(). - - """ - - hbar, vbar, frame = make_scrollbars(parent, hbar, vbar, pack, - class_=class_, name=name, - takefocus=takefocus) - - widget = Canvas(frame, scrollregion=(0, 0, width, height), name="canvas") - if width: widget.config(width=width) - if height: widget.config(height=height) - widget.pack(expand=expand, fill=fill, side=LEFT) - - set_scroll_commands(widget, hbar, vbar) - - return widget, frame - - - -def make_form_entry(parent, label, borderwidth=None): - - """Subroutine to create a form entry. - - Create: - - a horizontally filling and expanding frame, containing: - - a label on the left, and - - a text entry on the right. - - Return the entry widget and the frame widget. - - """ - - frame = Frame(parent) - frame.pack(fill=X) - - label = Label(frame, text=label) - label.pack(side=LEFT) - - if borderwidth is None: - entry = Entry(frame, relief=SUNKEN) - else: - entry = Entry(frame, relief=SUNKEN, borderwidth=borderwidth) - entry.pack(side=LEFT, fill=X, expand=1) - - return entry, frame - -# This is a slightly modified version of the function above. This -# version does the proper alighnment of labels with their fields. It -# should probably eventually replace make_form_entry altogether. -# -# The one annoying bug is that the text entry field should be -# expandable while still aligning the colons. This doesn't work yet. -# -def make_labeled_form_entry(parent, label, entrywidth=20, entryheight=1, - labelwidth=0, borderwidth=None, - takefocus=None): - """Subroutine to create a form entry. - - Create: - - a horizontally filling and expanding frame, containing: - - a label on the left, and - - a text entry on the right. - - Return the entry widget and the frame widget. - """ - if label and label[-1] != ':': label = label + ':' - - frame = Frame(parent) - - label = Label(frame, text=label, width=labelwidth, anchor=E) - label.pack(side=LEFT) - if entryheight == 1: - if borderwidth is None: - entry = Entry(frame, relief=SUNKEN, width=entrywidth) - else: - entry = Entry(frame, relief=SUNKEN, width=entrywidth, - borderwidth=borderwidth) - entry.pack(side=RIGHT, expand=1, fill=X) - frame.pack(fill=X) - else: - entry = make_text_box(frame, entrywidth, entryheight, 1, 1, - takefocus=takefocus) - frame.pack(fill=BOTH, expand=1) - - return entry, frame, label - - -def make_double_frame(master=None, class_=None, name=None, relief=RAISED, - borderwidth=1): - """Create a pair of frames suitable for 'hosting' a dialog.""" - if name: - if class_: frame = Frame(master, class_=class_, name=name) - else: frame = Frame(master, name=name) - else: - if class_: frame = Frame(master, class_=class_) - else: frame = Frame(master) - top = Frame(frame, name="topframe", relief=relief, - borderwidth=borderwidth) - bottom = Frame(frame, name="bottomframe") - bottom.pack(fill=X, padx='1m', pady='1m', side=BOTTOM) - top.pack(expand=1, fill=BOTH, padx='1m', pady='1m') - frame.pack(expand=1, fill=BOTH) - top = Frame(top) - top.pack(expand=1, fill=BOTH, padx='2m', pady='2m') - - return frame, top, bottom - - -def make_group_frame(master, name=None, label=None, fill=Y, - side=None, expand=None, font=None): - """Create nested frames with a border and optional label. - - The outer frame is only used to provide the decorative border, to - control packing, and to host the label. The inner frame is packed - to fill the outer frame and should be used as the parent of all - sub-widgets. Only the inner frame is returned. - - """ - font = font or "-*-helvetica-medium-r-normal-*-*-100-*-*-*-*-*-*" - outer = Frame(master, borderwidth=2, relief=GROOVE) - outer.pack(expand=expand, fill=fill, side=side) - if label: - Label(outer, text=label, font=font, anchor=W).pack(fill=X) - inner = Frame(master, borderwidth='1m', name=name) - inner.pack(expand=1, fill=BOTH, in_=outer) - inner.forget = outer.forget - return inner - - -def unify_button_widths(*buttons): - """Make buttons passed in all have the same width. - - Works for labels and other widgets with the 'text' option. - - """ - wid = 0 - for btn in buttons: - wid = max(wid, len(btn["text"])) - for btn in buttons: - btn["width"] = wid - - -def flatten(msg): - """Turn a list or tuple into a single string -- recursively.""" - t = type(msg) - if t in (ListType, TupleType): - msg = ' '.join(map(flatten, msg)) - elif t is ClassType: - msg = msg.__name__ - else: - msg = str(msg) - return msg - - -def boolean(s): - """Test whether a string is a Tk boolean, without error checking.""" - if s.lower() in ('', '0', 'no', 'off', 'false'): return 0 - else: return 1 - - -def test(): - """Test make_text_box(), make_form_entry(), flatten(), boolean().""" - import sys - root = Tk() - entry, eframe = make_form_entry(root, 'Boolean:') - text, tframe = make_text_box(root) - def enter(event, entry=entry, text=text): - s = boolean(entry.get()) and '\nyes' or '\nno' - text.insert('end', s) - entry.bind('<Return>', enter) - entry.insert(END, flatten(sys.argv)) - root.mainloop() - - -if __name__ == '__main__': - test() diff --git a/Tools/webchecker/wcgui.py b/Tools/webchecker/wcgui.py deleted file mode 100755 index b0c69d3..0000000 --- a/Tools/webchecker/wcgui.py +++ /dev/null @@ -1,456 +0,0 @@ -#! /usr/bin/env python3 - -"""GUI interface to webchecker. - -This works as a Grail applet too! E.g. - - <APPLET CODE=wcgui.py NAME=CheckerWindow></APPLET> - -Checkpoints are not (yet??? ever???) supported. - -User interface: - -Enter a root to check in the text entry box. To enter more than one root, -enter them one at a time and press <Return> for each one. - -Command buttons Start, Stop and "Check one" govern the checking process in -the obvious way. Start and "Check one" also enter the root from the text -entry box if one is present. There's also a check box (enabled by default) -to decide whether actually to follow external links (since this can slow -the checking down considerably). Finally there's a Quit button. - -A series of checkbuttons determines whether the corresponding output panel -is shown. List panels are also automatically shown or hidden when their -status changes between empty to non-empty. There are six panels: - -Log -- raw output from the checker (-v, -q affect this) -To check -- links discovered but not yet checked -Checked -- links that have been checked -Bad links -- links that failed upon checking -Errors -- pages containing at least one bad link -Details -- details about one URL; double click on a URL in any of - the above list panels (not in Log) will show details - for that URL - -Use your window manager's Close command to quit. - -Command line options: - --m bytes -- skip HTML pages larger than this size (default %(MAXPAGE)d) --q -- quiet operation (also suppresses external links report) --v -- verbose operation; repeating -v will increase verbosity --t root -- specify root dir which should be treated as internal (can repeat) --a -- don't check name anchors - -Command line arguments: - -rooturl -- URL to start checking - (default %(DEFROOT)s) - -XXX The command line options (-m, -q, -v) should be GUI accessible. - -XXX The roots should be visible as a list (?). - -XXX The multipanel user interface is clumsy. - -""" - -# ' Emacs bait - - -import sys -import getopt -from Tkinter import * -import tktools -import webchecker - -def main(): - try: - opts, args = getopt.getopt(sys.argv[1:], 't:m:qva') - except getopt.error as msg: - sys.stdout = sys.stderr - print(msg) - print(__doc__%vars(webchecker)) - sys.exit(2) - webchecker.verbose = webchecker.VERBOSE - webchecker.nonames = webchecker.NONAMES - webchecker.maxpage = webchecker.MAXPAGE - extra_roots = [] - for o, a in opts: - if o == '-m': - webchecker.maxpage = int(a) - if o == '-q': - webchecker.verbose = 0 - if o == '-v': - webchecker.verbose = webchecker.verbose + 1 - if o == '-t': - extra_roots.append(a) - if o == '-a': - webchecker.nonames = not webchecker.nonames - root = Tk(className='Webchecker') - root.protocol("WM_DELETE_WINDOW", root.quit) - c = CheckerWindow(root) - c.setflags(verbose=webchecker.verbose, maxpage=webchecker.maxpage, - nonames=webchecker.nonames) - if args: - for arg in args[:-1]: - c.addroot(arg) - c.suggestroot(args[-1]) - # Usually conditioned on whether external links - # will be checked, but since that's not a command - # line option, just toss them in. - for url_root in extra_roots: - # Make sure it's terminated by a slash, - # so that addroot doesn't discard the last - # directory component. - if url_root[-1] != "/": - url_root = url_root + "/" - c.addroot(url_root, add_to_do = 0) - root.mainloop() - - -class CheckerWindow(webchecker.Checker): - - def __init__(self, parent, root=webchecker.DEFROOT): - self.__parent = parent - - self.__topcontrols = Frame(parent) - self.__topcontrols.pack(side=TOP, fill=X) - self.__label = Label(self.__topcontrols, text="Root URL:") - self.__label.pack(side=LEFT) - self.__rootentry = Entry(self.__topcontrols, width=60) - self.__rootentry.pack(side=LEFT) - self.__rootentry.bind('<Return>', self.enterroot) - self.__rootentry.focus_set() - - self.__controls = Frame(parent) - self.__controls.pack(side=TOP, fill=X) - self.__running = 0 - self.__start = Button(self.__controls, text="Run", command=self.start) - self.__start.pack(side=LEFT) - self.__stop = Button(self.__controls, text="Stop", command=self.stop, - state=DISABLED) - self.__stop.pack(side=LEFT) - self.__step = Button(self.__controls, text="Check one", - command=self.step) - self.__step.pack(side=LEFT) - self.__cv = BooleanVar(parent) - self.__cv.set(self.checkext) - self.__checkext = Checkbutton(self.__controls, variable=self.__cv, - command=self.update_checkext, - text="Check nonlocal links",) - self.__checkext.pack(side=LEFT) - self.__reset = Button(self.__controls, text="Start over", command=self.reset) - self.__reset.pack(side=LEFT) - if __name__ == '__main__': # No Quit button under Grail! - self.__quit = Button(self.__controls, text="Quit", - command=self.__parent.quit) - self.__quit.pack(side=RIGHT) - - self.__status = Label(parent, text="Status: initial", anchor=W) - self.__status.pack(side=TOP, fill=X) - self.__checking = Label(parent, text="Idle", anchor=W) - self.__checking.pack(side=TOP, fill=X) - self.__mp = mp = MultiPanel(parent) - sys.stdout = self.__log = LogPanel(mp, "Log") - self.__todo = ListPanel(mp, "To check", self, self.showinfo) - self.__done = ListPanel(mp, "Checked", self, self.showinfo) - self.__bad = ListPanel(mp, "Bad links", self, self.showinfo) - self.__errors = ListPanel(mp, "Pages w/ bad links", self, self.showinfo) - self.__details = LogPanel(mp, "Details") - self.root_seed = None - webchecker.Checker.__init__(self) - if root: - root = str(root).strip() - if root: - self.suggestroot(root) - self.newstatus() - - def reset(self): - webchecker.Checker.reset(self) - for p in self.__todo, self.__done, self.__bad, self.__errors: - p.clear() - if self.root_seed: - self.suggestroot(self.root_seed) - - def suggestroot(self, root): - self.__rootentry.delete(0, END) - self.__rootentry.insert(END, root) - self.__rootentry.select_range(0, END) - self.root_seed = root - - def enterroot(self, event=None): - root = self.__rootentry.get() - root = root.strip() - if root: - self.__checking.config(text="Adding root "+root) - self.__checking.update_idletasks() - self.addroot(root) - self.__checking.config(text="Idle") - try: - i = self.__todo.items.index(root) - except (ValueError, IndexError): - pass - else: - self.__todo.list.select_clear(0, END) - self.__todo.list.select_set(i) - self.__todo.list.yview(i) - self.__rootentry.delete(0, END) - - def start(self): - self.__start.config(state=DISABLED, relief=SUNKEN) - self.__stop.config(state=NORMAL) - self.__step.config(state=DISABLED) - self.enterroot() - self.__running = 1 - self.go() - - def stop(self): - self.__stop.config(state=DISABLED, relief=SUNKEN) - self.__running = 0 - - def step(self): - self.__start.config(state=DISABLED) - self.__step.config(state=DISABLED, relief=SUNKEN) - self.enterroot() - self.__running = 0 - self.dosomething() - - def go(self): - if self.__running: - self.__parent.after_idle(self.dosomething) - else: - self.__checking.config(text="Idle") - self.__start.config(state=NORMAL, relief=RAISED) - self.__stop.config(state=DISABLED, relief=RAISED) - self.__step.config(state=NORMAL, relief=RAISED) - - __busy = 0 - - def dosomething(self): - if self.__busy: return - self.__busy = 1 - if self.todo: - l = self.__todo.selectedindices() - if l: - i = l[0] - else: - i = 0 - self.__todo.list.select_set(i) - self.__todo.list.yview(i) - url = self.__todo.items[i] - self.__checking.config(text="Checking "+self.format_url(url)) - self.__parent.update() - self.dopage(url) - else: - self.stop() - self.__busy = 0 - self.go() - - def showinfo(self, url): - d = self.__details - d.clear() - d.put("URL: %s\n" % self.format_url(url)) - if url in self.bad: - d.put("Error: %s\n" % str(self.bad[url])) - if url in self.roots: - d.put("Note: This is a root URL\n") - if url in self.done: - d.put("Status: checked\n") - o = self.done[url] - elif url in self.todo: - d.put("Status: to check\n") - o = self.todo[url] - else: - d.put("Status: unknown (!)\n") - o = [] - if (not url[1]) and url[0] in self.errors: - d.put("Bad links from this page:\n") - for triple in self.errors[url[0]]: - link, rawlink, msg = triple - d.put(" HREF %s" % self.format_url(link)) - if self.format_url(link) != rawlink: d.put(" (%s)" %rawlink) - d.put("\n") - d.put(" error %s\n" % str(msg)) - self.__mp.showpanel("Details") - for source, rawlink in o: - d.put("Origin: %s" % source) - if rawlink != self.format_url(url): - d.put(" (%s)" % rawlink) - d.put("\n") - d.text.yview("1.0") - - def setbad(self, url, msg): - webchecker.Checker.setbad(self, url, msg) - self.__bad.insert(url) - self.newstatus() - - def setgood(self, url): - webchecker.Checker.setgood(self, url) - self.__bad.remove(url) - self.newstatus() - - def newlink(self, url, origin): - webchecker.Checker.newlink(self, url, origin) - if url in self.done: - self.__done.insert(url) - elif url in self.todo: - self.__todo.insert(url) - self.newstatus() - - def markdone(self, url): - webchecker.Checker.markdone(self, url) - self.__done.insert(url) - self.__todo.remove(url) - self.newstatus() - - def seterror(self, url, triple): - webchecker.Checker.seterror(self, url, triple) - self.__errors.insert((url, '')) - self.newstatus() - - def newstatus(self): - self.__status.config(text="Status: "+self.status()) - self.__parent.update() - - def update_checkext(self): - self.checkext = self.__cv.get() - - -class ListPanel: - - def __init__(self, mp, name, checker, showinfo=None): - self.mp = mp - self.name = name - self.showinfo = showinfo - self.checker = checker - self.panel = mp.addpanel(name) - self.list, self.frame = tktools.make_list_box( - self.panel, width=60, height=5) - self.list.config(exportselection=0) - if showinfo: - self.list.bind('<Double-Button-1>', self.doubleclick) - self.items = [] - - def clear(self): - self.items = [] - self.list.delete(0, END) - self.mp.hidepanel(self.name) - - def doubleclick(self, event): - l = self.selectedindices() - if l: - self.showinfo(self.items[l[0]]) - - def selectedindices(self): - l = self.list.curselection() - if not l: return [] - return list(map(int, l)) - - def insert(self, url): - if url not in self.items: - if not self.items: - self.mp.showpanel(self.name) - # (I tried sorting alphabetically, but the display is too jumpy) - i = len(self.items) - self.list.insert(i, self.checker.format_url(url)) - self.list.yview(i) - self.items.insert(i, url) - - def remove(self, url): - try: - i = self.items.index(url) - except (ValueError, IndexError): - pass - else: - was_selected = i in self.selectedindices() - self.list.delete(i) - del self.items[i] - if not self.items: - self.mp.hidepanel(self.name) - elif was_selected: - if i >= len(self.items): - i = len(self.items) - 1 - self.list.select_set(i) - - -class LogPanel: - - def __init__(self, mp, name): - self.mp = mp - self.name = name - self.panel = mp.addpanel(name) - self.text, self.frame = tktools.make_text_box(self.panel, height=10) - self.text.config(wrap=NONE) - - def clear(self): - self.text.delete("1.0", END) - self.text.yview("1.0") - - def put(self, s): - self.text.insert(END, s) - if '\n' in s: - self.text.yview(END) - - def write(self, s): - self.text.insert(END, s) - if '\n' in s: - self.text.yview(END) - self.panel.update() - - -class MultiPanel: - - def __init__(self, parent): - self.parent = parent - self.frame = Frame(self.parent) - self.frame.pack(expand=1, fill=BOTH) - self.topframe = Frame(self.frame, borderwidth=2, relief=RAISED) - self.topframe.pack(fill=X) - self.botframe = Frame(self.frame) - self.botframe.pack(expand=1, fill=BOTH) - self.panelnames = [] - self.panels = {} - - def addpanel(self, name, on=0): - v = StringVar(self.parent) - if on: - v.set(name) - else: - v.set("") - check = Checkbutton(self.topframe, text=name, - offvalue="", onvalue=name, variable=v, - command=self.checkpanel) - check.pack(side=LEFT) - panel = Frame(self.botframe) - label = Label(panel, text=name, borderwidth=2, relief=RAISED, anchor=W) - label.pack(side=TOP, fill=X) - t = v, check, panel - self.panelnames.append(name) - self.panels[name] = t - if on: - panel.pack(expand=1, fill=BOTH) - return panel - - def showpanel(self, name): - v, check, panel = self.panels[name] - v.set(name) - panel.pack(expand=1, fill=BOTH) - - def hidepanel(self, name): - v, check, panel = self.panels[name] - v.set("") - panel.pack_forget() - - def checkpanel(self): - for name in self.panelnames: - v, check, panel = self.panels[name] - panel.pack_forget() - for name in self.panelnames: - v, check, panel = self.panels[name] - if v.get(): - panel.pack(expand=1, fill=BOTH) - - -if __name__ == '__main__': - main() diff --git a/Tools/webchecker/wcmac.py b/Tools/webchecker/wcmac.py deleted file mode 100644 index 9edcd5d..0000000 --- a/Tools/webchecker/wcmac.py +++ /dev/null @@ -1,9 +0,0 @@ -import webchecker, sys -webchecker.DEFROOT = "http://www.python.org/python/" -webchecker.MAXPAGE = 50000 -webchecker.verbose = 2 -sys.argv.append('-x') -webchecker.main() -sys.stdout.write("\nCR to exit: ") -sys.stdout.flush() -sys.stdin.readline() diff --git a/Tools/webchecker/webchecker.py b/Tools/webchecker/webchecker.py deleted file mode 100755 index 7c3124c..0000000 --- a/Tools/webchecker/webchecker.py +++ /dev/null @@ -1,890 +0,0 @@ -#! /usr/bin/env python3 - -# Original code by Guido van Rossum; extensive changes by Sam Bayer, -# including code to check URL fragments. - -"""Web tree checker. - -This utility is handy to check a subweb of the world-wide web for -errors. A subweb is specified by giving one or more ``root URLs''; a -page belongs to the subweb if one of the root URLs is an initial -prefix of it. - -File URL extension: - -In order to easy the checking of subwebs via the local file system, -the interpretation of ``file:'' URLs is extended to mimic the behavior -of your average HTTP daemon: if a directory pathname is given, the -file index.html in that directory is returned if it exists, otherwise -a directory listing is returned. Now, you can point webchecker to the -document tree in the local file system of your HTTP daemon, and have -most of it checked. In fact the default works this way if your local -web tree is located at /usr/local/etc/httpd/htdpcs (the default for -the NCSA HTTP daemon and probably others). - -Report printed: - -When done, it reports pages with bad links within the subweb. When -interrupted, it reports for the pages that it has checked already. - -In verbose mode, additional messages are printed during the -information gathering phase. By default, it prints a summary of its -work status every 50 URLs (adjustable with the -r option), and it -reports errors as they are encountered. Use the -q option to disable -this output. - -Checkpoint feature: - -Whether interrupted or not, it dumps its state (a Python pickle) to a -checkpoint file and the -R option allows it to restart from the -checkpoint (assuming that the pages on the subweb that were already -processed haven't changed). Even when it has run till completion, -R -can still be useful -- it will print the reports again, and -Rq prints -the errors only. In this case, the checkpoint file is not written -again. The checkpoint file can be set with the -d option. - -The checkpoint file is written as a Python pickle. Remember that -Python's pickle module is currently quite slow. Give it the time it -needs to load and save the checkpoint file. When interrupted while -writing the checkpoint file, the old checkpoint file is not -overwritten, but all work done in the current run is lost. - -Miscellaneous: - -- You may find the (Tk-based) GUI version easier to use. See wcgui.py. - -- Webchecker honors the "robots.txt" convention. Thanks to Skip -Montanaro for his robotparser.py module (included in this directory)! -The agent name is hardwired to "webchecker". URLs that are disallowed -by the robots.txt file are reported as external URLs. - -- Because the SGML parser is a bit slow, very large SGML files are -skipped. The size limit can be set with the -m option. - -- When the server or protocol does not tell us a file's type, we guess -it based on the URL's suffix. The mimetypes.py module (also in this -directory) has a built-in table mapping most currently known suffixes, -and in addition attempts to read the mime.types configuration files in -the default locations of Netscape and the NCSA HTTP daemon. - -- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also -honor the <BASE> tag. - -- We now check internal NAME anchor links, as well as toplevel links. - -- Checking external links is now done by default; use -x to *disable* -this feature. External links are now checked during normal -processing. (XXX The status of a checked link could be categorized -better. Later...) - -- If external links are not checked, you can use the -t flag to -provide specific overrides to -x. - -Usage: webchecker.py [option] ... [rooturl] ... - -Options: - --R -- restart from checkpoint file --d file -- checkpoint filename (default %(DUMPFILE)s) --m bytes -- skip HTML pages larger than this size (default %(MAXPAGE)d) --n -- reports only, no checking (use with -R) --q -- quiet operation (also suppresses external links report) --r number -- number of links processed per round (default %(ROUNDSIZE)d) --t root -- specify root dir which should be treated as internal (can repeat) --v -- verbose operation; repeating -v will increase verbosity --x -- don't check external links (these are often slow to check) --a -- don't check name anchors - -Arguments: - -rooturl -- URL to start checking - (default %(DEFROOT)s) - -""" - - -__version__ = "$Revision$" - - -import sys -import os -from types import * -import io -import getopt -import pickle - -import urllib.request -import urllib.parse as urlparse -import sgmllib -import cgi - -import mimetypes -from urllib import robotparser - -# Extract real version number if necessary -if __version__[0] == '$': - _v = __version__.split() - if len(_v) == 3: - __version__ = _v[1] - - -# Tunable parameters -DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL -CHECKEXT = 1 # Check external references (1 deep) -VERBOSE = 1 # Verbosity level (0-3) -MAXPAGE = 150000 # Ignore files bigger than this -ROUNDSIZE = 50 # Number of links processed per round -DUMPFILE = "@webchecker.pickle" # Pickled checkpoint -AGENTNAME = "webchecker" # Agent name for robots.txt parser -NONAMES = 0 # Force name anchor checking - - -# Global variables - - -def main(): - checkext = CHECKEXT - verbose = VERBOSE - maxpage = MAXPAGE - roundsize = ROUNDSIZE - dumpfile = DUMPFILE - restart = 0 - norun = 0 - - try: - opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa') - except getopt.error as msg: - sys.stdout = sys.stderr - print(msg) - print(__doc__%globals()) - sys.exit(2) - - # The extra_roots variable collects extra roots. - extra_roots = [] - nonames = NONAMES - - for o, a in opts: - if o == '-R': - restart = 1 - if o == '-d': - dumpfile = a - if o == '-m': - maxpage = int(a) - if o == '-n': - norun = 1 - if o == '-q': - verbose = 0 - if o == '-r': - roundsize = int(a) - if o == '-t': - extra_roots.append(a) - if o == '-a': - nonames = not nonames - if o == '-v': - verbose = verbose + 1 - if o == '-x': - checkext = not checkext - - if verbose > 0: - print(AGENTNAME, "version", __version__) - - if restart: - c = load_pickle(dumpfile=dumpfile, verbose=verbose) - else: - c = Checker() - - c.setflags(checkext=checkext, verbose=verbose, - maxpage=maxpage, roundsize=roundsize, - nonames=nonames - ) - - if not restart and not args: - args.append(DEFROOT) - - for arg in args: - c.addroot(arg) - - # The -t flag is only needed if external links are not to be - # checked. So -t values are ignored unless -x was specified. - if not checkext: - for root in extra_roots: - # Make sure it's terminated by a slash, - # so that addroot doesn't discard the last - # directory component. - if root[-1] != "/": - root = root + "/" - c.addroot(root, add_to_do = 0) - - try: - - if not norun: - try: - c.run() - except KeyboardInterrupt: - if verbose > 0: - print("[run interrupted]") - - try: - c.report() - except KeyboardInterrupt: - if verbose > 0: - print("[report interrupted]") - - finally: - if c.save_pickle(dumpfile): - if dumpfile == DUMPFILE: - print("Use ``%s -R'' to restart." % sys.argv[0]) - else: - print("Use ``%s -R -d %s'' to restart." % (sys.argv[0], - dumpfile)) - - -def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE): - if verbose > 0: - print("Loading checkpoint from %s ..." % dumpfile) - f = open(dumpfile, "rb") - c = pickle.load(f) - f.close() - if verbose > 0: - print("Done.") - print("Root:", "\n ".join(c.roots)) - return c - - -class Checker: - - checkext = CHECKEXT - verbose = VERBOSE - maxpage = MAXPAGE - roundsize = ROUNDSIZE - nonames = NONAMES - - validflags = tuple(dir()) - - def __init__(self): - self.reset() - - def setflags(self, **kw): - for key in kw: - if key not in self.validflags: - raise NameError("invalid keyword argument: %s" % str(key)) - for key, value in kw.items(): - setattr(self, key, value) - - def reset(self): - self.roots = [] - self.todo = {} - self.done = {} - self.bad = {} - - # Add a name table, so that the name URLs can be checked. Also - # serves as an implicit cache for which URLs are done. - self.name_table = {} - - self.round = 0 - # The following are not pickled: - self.robots = {} - self.errors = {} - self.urlopener = MyURLopener() - self.changed = 0 - - def note(self, level, format, *args): - if self.verbose > level: - if args: - format = format%args - self.message(format) - - def message(self, format, *args): - if args: - format = format%args - print(format) - - def __getstate__(self): - return (self.roots, self.todo, self.done, self.bad, self.round) - - def __setstate__(self, state): - self.reset() - (self.roots, self.todo, self.done, self.bad, self.round) = state - for root in self.roots: - self.addrobot(root) - for url in self.bad: - self.markerror(url) - - def addroot(self, root, add_to_do = 1): - if root not in self.roots: - troot = root - scheme, netloc, path, params, query, fragment = \ - urlparse.urlparse(root) - i = path.rfind("/") + 1 - if 0 < i < len(path): - path = path[:i] - troot = urlparse.urlunparse((scheme, netloc, path, - params, query, fragment)) - self.roots.append(troot) - self.addrobot(root) - if add_to_do: - self.newlink((root, ""), ("<root>", root)) - - def addrobot(self, root): - root = urlparse.urljoin(root, "/") - if root in self.robots: return - url = urlparse.urljoin(root, "/robots.txt") - self.robots[root] = rp = robotparser.RobotFileParser() - self.note(2, "Parsing %s", url) - rp.debug = self.verbose > 3 - rp.set_url(url) - try: - rp.read() - except (OSError, IOError) as msg: - self.note(1, "I/O error parsing %s: %s", url, msg) - - def run(self): - while self.todo: - self.round = self.round + 1 - self.note(0, "\nRound %d (%s)\n", self.round, self.status()) - urls = sorted(self.todo.keys()) - del urls[self.roundsize:] - for url in urls: - self.dopage(url) - - def status(self): - return "%d total, %d to do, %d done, %d bad" % ( - len(self.todo)+len(self.done), - len(self.todo), len(self.done), - len(self.bad)) - - def report(self): - self.message("") - if not self.todo: s = "Final" - else: s = "Interim" - self.message("%s Report (%s)", s, self.status()) - self.report_errors() - - def report_errors(self): - if not self.bad: - self.message("\nNo errors") - return - self.message("\nError Report:") - sources = sorted(self.errors.keys()) - for source in sources: - triples = self.errors[source] - self.message("") - if len(triples) > 1: - self.message("%d Errors in %s", len(triples), source) - else: - self.message("Error in %s", source) - # Call self.format_url() instead of referring - # to the URL directly, since the URLs in these - # triples is now a (URL, fragment) pair. The value - # of the "source" variable comes from the list of - # origins, and is a URL, not a pair. - for url, rawlink, msg in triples: - if rawlink != self.format_url(url): s = " (%s)" % rawlink - else: s = "" - self.message(" HREF %s%s\n msg %s", - self.format_url(url), s, msg) - - def dopage(self, url_pair): - - # All printing of URLs uses format_url(); argument changed to - # url_pair for clarity. - if self.verbose > 1: - if self.verbose > 2: - self.show("Check ", self.format_url(url_pair), - " from", self.todo[url_pair]) - else: - self.message("Check %s", self.format_url(url_pair)) - url, local_fragment = url_pair - if local_fragment and self.nonames: - self.markdone(url_pair) - return - try: - page = self.getpage(url_pair) - except sgmllib.SGMLParseError as msg: - msg = self.sanitize(msg) - self.note(0, "Error parsing %s: %s", - self.format_url(url_pair), msg) - # Dont actually mark the URL as bad - it exists, just - # we can't parse it! - page = None - if page: - # Store the page which corresponds to this URL. - self.name_table[url] = page - # If there is a fragment in this url_pair, and it's not - # in the list of names for the page, call setbad(), since - # it's a missing anchor. - if local_fragment and local_fragment not in page.getnames(): - self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment)) - for info in page.getlinkinfos(): - # getlinkinfos() now returns the fragment as well, - # and we store that fragment here in the "todo" dictionary. - link, rawlink, fragment = info - # However, we don't want the fragment as the origin, since - # the origin is logically a page. - origin = url, rawlink - self.newlink((link, fragment), origin) - else: - # If no page has been created yet, we want to - # record that fact. - self.name_table[url_pair[0]] = None - self.markdone(url_pair) - - def newlink(self, url, origin): - if url in self.done: - self.newdonelink(url, origin) - else: - self.newtodolink(url, origin) - - def newdonelink(self, url, origin): - if origin not in self.done[url]: - self.done[url].append(origin) - - # Call self.format_url(), since the URL here - # is now a (URL, fragment) pair. - self.note(3, " Done link %s", self.format_url(url)) - - # Make sure that if it's bad, that the origin gets added. - if url in self.bad: - source, rawlink = origin - triple = url, rawlink, self.bad[url] - self.seterror(source, triple) - - def newtodolink(self, url, origin): - # Call self.format_url(), since the URL here - # is now a (URL, fragment) pair. - if url in self.todo: - if origin not in self.todo[url]: - self.todo[url].append(origin) - self.note(3, " Seen todo link %s", self.format_url(url)) - else: - self.todo[url] = [origin] - self.note(3, " New todo link %s", self.format_url(url)) - - def format_url(self, url): - link, fragment = url - if fragment: return link + "#" + fragment - else: return link - - def markdone(self, url): - self.done[url] = self.todo[url] - del self.todo[url] - self.changed = 1 - - def inroots(self, url): - for root in self.roots: - if url[:len(root)] == root: - return self.isallowed(root, url) - return 0 - - def isallowed(self, root, url): - root = urlparse.urljoin(root, "/") - return self.robots[root].can_fetch(AGENTNAME, url) - - def getpage(self, url_pair): - # Incoming argument name is a (URL, fragment) pair. - # The page may have been cached in the name_table variable. - url, fragment = url_pair - if url in self.name_table: - return self.name_table[url] - - scheme, path = urllib.request.splittype(url) - if scheme in ('mailto', 'news', 'javascript', 'telnet'): - self.note(1, " Not checking %s URL" % scheme) - return None - isint = self.inroots(url) - - # Ensure that openpage gets the URL pair to - # print out its error message and record the error pair - # correctly. - if not isint: - if not self.checkext: - self.note(1, " Not checking ext link") - return None - f = self.openpage(url_pair) - if f: - self.safeclose(f) - return None - text, nurl = self.readhtml(url_pair) - - if nurl != url: - self.note(1, " Redirected to %s", nurl) - url = nurl - if text: - return Page(text, url, maxpage=self.maxpage, checker=self) - - # These next three functions take (URL, fragment) pairs as - # arguments, so that openpage() receives the appropriate tuple to - # record error messages. - def readhtml(self, url_pair): - url, fragment = url_pair - text = None - f, url = self.openhtml(url_pair) - if f: - text = f.read() - f.close() - return text, url - - def openhtml(self, url_pair): - url, fragment = url_pair - f = self.openpage(url_pair) - if f: - url = f.geturl() - info = f.info() - if not self.checkforhtml(info, url): - self.safeclose(f) - f = None - return f, url - - def openpage(self, url_pair): - url, fragment = url_pair - try: - return self.urlopener.open(url) - except (OSError, IOError) as msg: - msg = self.sanitize(msg) - self.note(0, "Error %s", msg) - if self.verbose > 0: - self.show(" HREF ", url, " from", self.todo[url_pair]) - self.setbad(url_pair, msg) - return None - - def checkforhtml(self, info, url): - if 'content-type' in info: - ctype = cgi.parse_header(info['content-type'])[0].lower() - if ';' in ctype: - # handle content-type: text/html; charset=iso8859-1 : - ctype = ctype.split(';', 1)[0].strip() - else: - if url[-1:] == "/": - return 1 - ctype, encoding = mimetypes.guess_type(url) - if ctype == 'text/html': - return 1 - else: - self.note(1, " Not HTML, mime type %s", ctype) - return 0 - - def setgood(self, url): - if url in self.bad: - del self.bad[url] - self.changed = 1 - self.note(0, "(Clear previously seen error)") - - def setbad(self, url, msg): - if url in self.bad and self.bad[url] == msg: - self.note(0, "(Seen this error before)") - return - self.bad[url] = msg - self.changed = 1 - self.markerror(url) - - def markerror(self, url): - try: - origins = self.todo[url] - except KeyError: - origins = self.done[url] - for source, rawlink in origins: - triple = url, rawlink, self.bad[url] - self.seterror(source, triple) - - def seterror(self, url, triple): - try: - # Because of the way the URLs are now processed, I need to - # check to make sure the URL hasn't been entered in the - # error list. The first element of the triple here is a - # (URL, fragment) pair, but the URL key is not, since it's - # from the list of origins. - if triple not in self.errors[url]: - self.errors[url].append(triple) - except KeyError: - self.errors[url] = [triple] - - # The following used to be toplevel functions; they have been - # changed into methods so they can be overridden in subclasses. - - def show(self, p1, link, p2, origins): - self.message("%s %s", p1, link) - i = 0 - for source, rawlink in origins: - i = i+1 - if i == 2: - p2 = ' '*len(p2) - if rawlink != link: s = " (%s)" % rawlink - else: s = "" - self.message("%s %s%s", p2, source, s) - - def sanitize(self, msg): - if isinstance(IOError, ClassType) and isinstance(msg, IOError): - # Do the other branch recursively - msg.args = self.sanitize(msg.args) - elif isinstance(msg, TupleType): - if len(msg) >= 4 and msg[0] == 'http error' and \ - isinstance(msg[3], InstanceType): - # Remove the Message instance -- it may contain - # a file object which prevents pickling. - msg = msg[:3] + msg[4:] - return msg - - def safeclose(self, f): - try: - url = f.geturl() - except AttributeError: - pass - else: - if url[:4] == 'ftp:' or url[:7] == 'file://': - # Apparently ftp connections don't like to be closed - # prematurely... - text = f.read() - f.close() - - def save_pickle(self, dumpfile=DUMPFILE): - if not self.changed: - self.note(0, "\nNo need to save checkpoint") - elif not dumpfile: - self.note(0, "No dumpfile, won't save checkpoint") - else: - self.note(0, "\nSaving checkpoint to %s ...", dumpfile) - newfile = dumpfile + ".new" - f = open(newfile, "wb") - pickle.dump(self, f) - f.close() - try: - os.unlink(dumpfile) - except os.error: - pass - os.rename(newfile, dumpfile) - self.note(0, "Done.") - return 1 - - -class Page: - - def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None): - self.text = text - self.url = url - self.verbose = verbose - self.maxpage = maxpage - self.checker = checker - - # The parsing of the page is done in the __init__() routine in - # order to initialize the list of names the file - # contains. Stored the parser in an instance variable. Passed - # the URL to MyHTMLParser(). - size = len(self.text) - if size > self.maxpage: - self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001)) - self.parser = None - return - self.checker.note(2, " Parsing %s (%d bytes)", self.url, size) - self.parser = MyHTMLParser(url, verbose=self.verbose, - checker=self.checker) - self.parser.feed(self.text) - self.parser.close() - - def note(self, level, msg, *args): - if self.checker: - self.checker.note(level, msg, *args) - else: - if self.verbose >= level: - if args: - msg = msg%args - print(msg) - - # Method to retrieve names. - def getnames(self): - if self.parser: - return self.parser.names - else: - return [] - - def getlinkinfos(self): - # File reading is done in __init__() routine. Store parser in - # local variable to indicate success of parsing. - - # If no parser was stored, fail. - if not self.parser: return [] - - rawlinks = self.parser.getlinks() - base = urlparse.urljoin(self.url, self.parser.getbase() or "") - infos = [] - for rawlink in rawlinks: - t = urlparse.urlparse(rawlink) - # DON'T DISCARD THE FRAGMENT! Instead, include - # it in the tuples which are returned. See Checker.dopage(). - fragment = t[-1] - t = t[:-1] + ('',) - rawlink = urlparse.urlunparse(t) - link = urlparse.urljoin(base, rawlink) - infos.append((link, rawlink, fragment)) - - return infos - - -class MyStringIO(io.StringIO): - - def __init__(self, url, info): - self.__url = url - self.__info = info - super(MyStringIO, self).__init__(self) - - def info(self): - return self.__info - - def geturl(self): - return self.__url - - -class MyURLopener(urllib.request.FancyURLopener): - - http_error_default = urllib.request.URLopener.http_error_default - - def __init__(*args): - self = args[0] - urllib.request.FancyURLopener.__init__(*args) - self.addheaders = [ - ('User-agent', 'Python-webchecker/%s' % __version__), - ] - - def http_error_401(self, url, fp, errcode, errmsg, headers): - return None - - def open_file(self, url): - path = urllib.url2pathname(urllib.unquote(url)) - if os.path.isdir(path): - if path[-1] != os.sep: - url = url + '/' - indexpath = os.path.join(path, "index.html") - if os.path.exists(indexpath): - return self.open_file(url + "index.html") - try: - names = os.listdir(path) - except os.error as msg: - exc_type, exc_value, exc_tb = sys.exc_info() - raise IOError(msg).with_traceback(exc_tb) - names.sort() - s = MyStringIO("file:"+url, {'content-type': 'text/html'}) - s.write('<BASE HREF="file:%s">\n' % - urllib.quote(os.path.join(path, ""))) - for name in names: - q = urllib.quote(name) - s.write('<A HREF="%s">%s</A>\n' % (q, q)) - s.seek(0) - return s - return urllib.request.FancyURLopener.open_file(self, url) - - -class MyHTMLParser(sgmllib.SGMLParser): - - def __init__(self, url, verbose=VERBOSE, checker=None): - self.myverbose = verbose # now unused - self.checker = checker - self.base = None - self.links = {} - self.names = [] - self.url = url - sgmllib.SGMLParser.__init__(self) - - def check_name_id(self, attributes): - """ Check the name or id attributes on an element. - """ - # We must rescue the NAME or id (name is deprecated in XHTML) - # attributes from the anchor, in order to - # cache the internal anchors which are made - # available in the page. - for name, value in attributes: - if name == "name" or name == "id": - if value in self.names: - self.checker.message("WARNING: duplicate ID name %s in %s", - value, self.url) - else: self.names.append(value) - break - - def unknown_starttag(self, tag, attributes): - """ In XHTML, you can have id attributes on any element. - """ - self.check_name_id(attributes) - - def start_a(self, attributes): - self.link_attr(attributes, 'href') - self.check_name_id(attributes) - - def end_a(self): pass - - def do_area(self, attributes): - self.link_attr(attributes, 'href') - self.check_name_id(attributes) - - def do_body(self, attributes): - self.link_attr(attributes, 'background', 'bgsound') - self.check_name_id(attributes) - - def do_img(self, attributes): - self.link_attr(attributes, 'src', 'lowsrc') - self.check_name_id(attributes) - - def do_frame(self, attributes): - self.link_attr(attributes, 'src', 'longdesc') - self.check_name_id(attributes) - - def do_iframe(self, attributes): - self.link_attr(attributes, 'src', 'longdesc') - self.check_name_id(attributes) - - def do_link(self, attributes): - for name, value in attributes: - if name == "rel": - parts = value.lower().split() - if ( parts == ["stylesheet"] - or parts == ["alternate", "stylesheet"]): - self.link_attr(attributes, "href") - break - self.check_name_id(attributes) - - def do_object(self, attributes): - self.link_attr(attributes, 'data', 'usemap') - self.check_name_id(attributes) - - def do_script(self, attributes): - self.link_attr(attributes, 'src') - self.check_name_id(attributes) - - def do_table(self, attributes): - self.link_attr(attributes, 'background') - self.check_name_id(attributes) - - def do_td(self, attributes): - self.link_attr(attributes, 'background') - self.check_name_id(attributes) - - def do_th(self, attributes): - self.link_attr(attributes, 'background') - self.check_name_id(attributes) - - def do_tr(self, attributes): - self.link_attr(attributes, 'background') - self.check_name_id(attributes) - - def link_attr(self, attributes, *args): - for name, value in attributes: - if name in args: - if value: value = value.strip() - if value: self.links[value] = None - - def do_base(self, attributes): - for name, value in attributes: - if name == 'href': - if value: value = value.strip() - if value: - if self.checker: - self.checker.note(1, " Base %s", value) - self.base = value - self.check_name_id(attributes) - - def getlinks(self): - return list(self.links.keys()) - - def getbase(self): - return self.base - - -if __name__ == '__main__': - main() diff --git a/Tools/webchecker/websucker.py b/Tools/webchecker/websucker.py deleted file mode 100755 index 4657b52..0000000 --- a/Tools/webchecker/websucker.py +++ /dev/null @@ -1,123 +0,0 @@ -#! /usr/bin/env python3 - -"""A variant on webchecker that creates a mirror copy of a remote site.""" - -__version__ = "$Revision$" - -import os -import sys -import getopt -import urllib.parse - -import webchecker - -# Extract real version number if necessary -if __version__[0] == '$': - _v = __version__.split() - if len(_v) == 3: - __version__ = _v[1] - -def main(): - verbose = webchecker.VERBOSE - try: - opts, args = getopt.getopt(sys.argv[1:], "qv") - except getopt.error as msg: - print(msg) - print("usage:", sys.argv[0], "[-qv] ... [rooturl] ...") - return 2 - for o, a in opts: - if o == "-q": - verbose = 0 - if o == "-v": - verbose = verbose + 1 - c = Sucker() - c.setflags(verbose=verbose) - c.urlopener.addheaders = [ - ('User-agent', 'websucker/%s' % __version__), - ] - for arg in args: - print("Adding root", arg) - c.addroot(arg) - print("Run...") - c.run() - -class Sucker(webchecker.Checker): - - checkext = 0 - nonames = 1 - - # SAM 11/13/99: in general, URLs are now URL pairs. - # Since we've suppressed name anchor checking, - # we can ignore the second dimension. - - def readhtml(self, url_pair): - url = url_pair[0] - text = None - path = self.savefilename(url) - try: - f = open(path, "rb") - except IOError: - f = self.openpage(url_pair) - if f: - info = f.info() - nurl = f.geturl() - if nurl != url: - url = nurl - path = self.savefilename(url) - text = f.read() - f.close() - self.savefile(text, path) - if not self.checkforhtml(info, url): - text = None - else: - if self.checkforhtml({}, url): - text = f.read() - f.close() - return text, url - - def savefile(self, text, path): - dir, base = os.path.split(path) - makedirs(dir) - try: - f = open(path, "wb") - f.write(text) - f.close() - self.message("saved %s", path) - except IOError as msg: - self.message("didn't save %s: %s", path, str(msg)) - - def savefilename(self, url): - type, rest = urllib.parse.splittype(url) - host, path = urllib.parse.splithost(rest) - path = path.lstrip("/") - user, host = urllib.parse.splituser(host) - host, port = urllib.parse.splitnport(host) - host = host.lower() - if not path or path[-1] == "/": - path = path + "index.html" - if os.sep != "/": - path = os.sep.join(path.split("/")) - path = os.path.join(host, path) - return path - -def makedirs(dir): - if not dir: - return - if os.path.exists(dir): - if not os.path.isdir(dir): - try: - os.rename(dir, dir + ".bak") - os.mkdir(dir) - os.rename(dir + ".bak", os.path.join(dir, "index.html")) - except os.error: - pass - return - head, tail = os.path.split(dir) - if not tail: - print("Huh? Don't know how to make dir", dir) - return - makedirs(head) - os.mkdir(dir, 0o777) - -if __name__ == '__main__': - sys.exit(main() or 0) diff --git a/Tools/webchecker/wsgui.py b/Tools/webchecker/wsgui.py deleted file mode 100755 index 032523b..0000000 --- a/Tools/webchecker/wsgui.py +++ /dev/null @@ -1,240 +0,0 @@ -#! /usr/bin/env python3 - -"""Tkinter-based GUI for websucker. - -Easy use: type or paste source URL and destination directory in -their respective text boxes, click GO or hit return, and presto. -""" - -from Tkinter import * -import websucker -import os -import threading -import queue -import time - -VERBOSE = 2 - - -try: - class Canceled(Exception): - "Exception used to cancel run()." -except (NameError, TypeError): - Canceled = __name__ + ".Canceled" - - -class SuckerThread(websucker.Sucker): - - stopit = 0 - savedir = None - rootdir = None - - def __init__(self, msgq): - self.msgq = msgq - websucker.Sucker.__init__(self) - self.setflags(verbose=VERBOSE) - self.urlopener.addheaders = [ - ('User-agent', 'websucker/%s' % websucker.__version__), - ] - - def message(self, format, *args): - if args: - format = format%args - ##print format - self.msgq.put(format) - - def run1(self, url): - try: - try: - self.reset() - self.addroot(url) - self.run() - except Canceled: - self.message("[canceled]") - else: - self.message("[done]") - finally: - self.msgq.put(None) - - def savefile(self, text, path): - if self.stopit: - raise Canceled - websucker.Sucker.savefile(self, text, path) - - def getpage(self, url): - if self.stopit: - raise Canceled - return websucker.Sucker.getpage(self, url) - - def savefilename(self, url): - path = websucker.Sucker.savefilename(self, url) - if self.savedir: - n = len(self.rootdir) - if path[:n] == self.rootdir: - path = path[n:] - while path[:1] == os.sep: - path = path[1:] - path = os.path.join(self.savedir, path) - return path - - def XXXaddrobot(self, *args): - pass - - def XXXisallowed(self, *args): - return 1 - - -class App: - - sucker = None - msgq = None - - def __init__(self, top): - self.top = top - top.columnconfigure(99, weight=1) - self.url_label = Label(top, text="URL:") - self.url_label.grid(row=0, column=0, sticky='e') - self.url_entry = Entry(top, width=60, exportselection=0) - self.url_entry.grid(row=0, column=1, sticky='we', - columnspan=99) - self.url_entry.focus_set() - self.url_entry.bind("<Key-Return>", self.go) - self.dir_label = Label(top, text="Directory:") - self.dir_label.grid(row=1, column=0, sticky='e') - self.dir_entry = Entry(top) - self.dir_entry.grid(row=1, column=1, sticky='we', - columnspan=99) - self.go_button = Button(top, text="Go", command=self.go) - self.go_button.grid(row=2, column=1, sticky='w') - self.cancel_button = Button(top, text="Cancel", - command=self.cancel, - state=DISABLED) - self.cancel_button.grid(row=2, column=2, sticky='w') - self.auto_button = Button(top, text="Paste+Go", - command=self.auto) - self.auto_button.grid(row=2, column=3, sticky='w') - self.status_label = Label(top, text="[idle]") - self.status_label.grid(row=2, column=4, sticky='w') - self.top.update_idletasks() - self.top.grid_propagate(0) - - def message(self, text, *args): - if args: - text = text % args - self.status_label.config(text=text) - - def check_msgq(self): - while not self.msgq.empty(): - msg = self.msgq.get() - if msg is None: - self.go_button.configure(state=NORMAL) - self.auto_button.configure(state=NORMAL) - self.cancel_button.configure(state=DISABLED) - if self.sucker: - self.sucker.stopit = 0 - self.top.bell() - else: - self.message(msg) - self.top.after(100, self.check_msgq) - - def go(self, event=None): - if not self.msgq: - self.msgq = queue.Queue(0) - self.check_msgq() - if not self.sucker: - self.sucker = SuckerThread(self.msgq) - if self.sucker.stopit: - return - self.url_entry.selection_range(0, END) - url = self.url_entry.get() - url = url.strip() - if not url: - self.top.bell() - self.message("[Error: No URL entered]") - return - self.rooturl = url - dir = self.dir_entry.get().strip() - if not dir: - self.sucker.savedir = None - else: - self.sucker.savedir = dir - self.sucker.rootdir = os.path.dirname( - websucker.Sucker.savefilename(self.sucker, url)) - self.go_button.configure(state=DISABLED) - self.auto_button.configure(state=DISABLED) - self.cancel_button.configure(state=NORMAL) - self.message( '[running...]') - self.sucker.stopit = 0 - t = threading.Thread(target=self.sucker.run1, args=(url,)) - t.start() - - def cancel(self): - if self.sucker: - self.sucker.stopit = 1 - self.message("[canceling...]") - - def auto(self): - tries = ['PRIMARY', 'CLIPBOARD'] - text = "" - for t in tries: - try: - text = self.top.selection_get(selection=t) - except TclError: - continue - text = text.strip() - if text: - break - if not text: - self.top.bell() - self.message("[Error: clipboard is empty]") - return - self.url_entry.delete(0, END) - self.url_entry.insert(0, text) - self.go() - - -class AppArray: - - def __init__(self, top=None): - if not top: - top = Tk() - top.title("websucker GUI") - top.iconname("wsgui") - top.wm_protocol('WM_DELETE_WINDOW', self.exit) - self.top = top - self.appframe = Frame(self.top) - self.appframe.pack(fill='both') - self.applist = [] - self.exit_button = Button(top, text="Exit", command=self.exit) - self.exit_button.pack(side=RIGHT) - self.new_button = Button(top, text="New", command=self.addsucker) - self.new_button.pack(side=LEFT) - self.addsucker() - ##self.applist[0].url_entry.insert(END, "http://www.python.org/doc/essays/") - - def addsucker(self): - self.top.geometry("") - frame = Frame(self.appframe, borderwidth=2, relief=GROOVE) - frame.pack(fill='x') - app = App(frame) - self.applist.append(app) - - done = 0 - - def mainloop(self): - while not self.done: - time.sleep(0.1) - self.top.update() - - def exit(self): - for app in self.applist: - app.cancel() - app.message("[exiting...]") - self.done = 1 - - -def main(): - AppArray().mainloop() - -if __name__ == '__main__': - main() |