"""HTML 2.0 parser. See the HTML 2.0 specification: http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html """ import sys import regsub import string from sgmllib import SGMLParser from formatter import AS_IS class HTMLParser(SGMLParser): def __init__(self, formatter): SGMLParser.__init__(self) self.formatter = formatter self.savedata = None self.isindex = 0 self.title = None self.base = None self.anchor = None self.anchorlist = [] self.nofill = 0 self.list_stack = [] # ------ Methods used internally; some may be overridden # --- Formatter interface, taking care of 'savedata' mode; # shouldn't need to be overridden def handle_data(self, data): if self.savedata is not None: self.savedata = self.savedata + data else: if self.nofill: self.formatter.add_literal_data(data) else: self.formatter.add_flowing_data(data) # --- Hooks to save data; shouldn't need to be overridden def save_bgn(self): self.savedata = '' def save_end(self): data = self.savedata self.savedata = None return string.join(string.split(data)) # --- Hooks for anchors; should probably be overridden def anchor_bgn(self, href, name, type): self.anchor = href if self.anchor: self.anchorlist.append(href) def anchor_end(self): if self.anchor: self.handle_data("[%d]" % len(self.anchorlist)) self.anchor = None # --- Hook for images; should probably be overridden def handle_image(self, src, alt): self.handle_data(alt) # --- Hooks for forms; should probably be overridden def form_bgn(self, action, method, enctype): self.do_p([]) self.handle_data("
") def form_end(self): self.handle_data("
") self.do_p([]) def handle_input(self, type, options): self.handle_data("") def select_bgn(self, name, size, multiple): self.handle_data("") def handle_option(self, value, selected): self.handle_data("