From f54d967fec5d3287a33b965316513a7250fa8de6 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 7 Aug 1995 20:07:44 +0000 Subject: new formatter module; redid htmllib module to use it --- Lib/htmllib.py | 605 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 356 insertions(+), 249 deletions(-) diff --git a/Lib/htmllib.py b/Lib/htmllib.py index 4af446a..38312c6 100644 --- a/Lib/htmllib.py +++ b/Lib/htmllib.py @@ -1,139 +1,103 @@ -# New HTML class +"""HTML 2.0 parser. -# XXX Check against HTML 2.0 spec - -# XXX reorder methods according to hierarchy -# - html structure: head, body, title, isindex -# - headers -# - lists, items -# - paragraph styles -# - forms -# - character styles -# - images -# - bookkeeping -# - output generation +See the HTML 2.0 specification: +http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html +""" import sys import regsub import string from sgmllib import SGMLParser - - -ROMAN = 0 -ITALIC = 1 -BOLD = 2 -FIXED = 3 +from formatter import AS_IS class HTMLParser(SGMLParser): - def __init__(self): - SGMLParser.__init__(self) - self.savedata = None - self.isindex = 0 - self.title = '' - self.para = None - self.lists = [] - self.styles = [] - self.nofill = 0 - self.nospace = 1 - self.softspace = 0 + def __init__(self, formatter): + SGMLParser.__init__(self) + self.formatter = formatter + self.savedata = None + self.isindex = 0 + self.title = None + self.base = None + self.anchor = None + self.anchorlist = [] + self.nofill = 0 + self.list_stack = [] - # --- Data + # ------ Methods used internally; some may be overridden - def handle_image(self, src, alt): - self.handle_data(alt) + # --- Formatter interface, taking care of 'savedata' mode; + # shouldn't need to be overridden def handle_data(self, data): - if self.nofill: - self.handle_literal(data) - return - data = regsub.gsub('[ \t\n\r]+', ' ', data) - if self.nospace and data[:1] == ' ': data = data[1:] - if not data: return - self.nospace = 0 - if self.softspace and data[:1] != ' ': data = ' ' + data - if data[-1:] == ' ': - data = data[:-1] - self.softspace = 1 - self.output_data(data) - - def handle_literal(self, data): - self.nospace = 0 - self.softspace = 0 - self.output_data(data) - - def output_data(self, data): - if self.savedata is not None: + if self.savedata is not None: self.savedata = self.savedata + data - else: - self.write_data(data) + else: + if self.nofill: + self.formatter.add_literal_data(data) + else: + self.formatter.add_flowing_data(data) - def write_data(self, data): - sys.stdout.write(data) + # --- Hooks to save data; shouldn't need to be overridden def save_bgn(self): - self.savedata = '' - self.nospace = 1 - self.softspace = 0 + self.savedata = '' def save_end(self): - saved = self.savedata - self.savedata = None - self.nospace = 1 - self.softspace = 0 - return saved + data = self.savedata + self.savedata = None + return string.join(string.split(data)) + + # --- Hooks for anchors; should probably be overridden - def new_para(self): - pass + def anchor_bgn(self, href, name, type): + self.anchor = href + if self.anchor: + self.anchorlist.append(href) - def new_style(self): - pass + def anchor_end(self): + if self.anchor: + self.handle_data("[%d]" % len(self.anchorlist)) + self.anchor = None - # --- Generic style changes + # --- Hook for images; should probably be overridden - def para_bgn(self, tag): - if not self.nospace: - self.handle_literal('\n') - self.nospace = 1 - self.softspace = 0 - if tag is not None: - self.para = tag - self.new_para() + def handle_image(self, src, alt): + self.handle_data(alt) - def para_end(self): - self.para_bgn('') + # --- Hooks for forms; should probably be overridden - def push_list(self, tag): - self.lists.append(tag) - self.para_bgn(None) + def form_bgn(self, action, method, enctype): + self.do_p([]) + self.handle_data("
") - def pop_list(self): - del self.lists[-1] - self.para_end() + def form_end(self): + self.handle_data("
") + self.do_p([]) - def literal_bgn(self, tag, attrs): - self.para_bgn(tag) + def handle_input(self, type, options): + self.handle_data("") - def literal_end(self, tag): - self.para_end() + def select_bgn(self, name, size, multiple): + self.handle_data("") - def pop_style(self): - del self.styles[-1] - self.new_style() + def handle_option(self, value, selected): + self.handle_data("