diff options
-rw-r--r-- | Lib/htmllib.py | 916 |
1 files changed, 330 insertions, 586 deletions
diff --git a/Lib/htmllib.py b/Lib/htmllib.py index 10ca810..4af446a 100644 --- a/Lib/htmllib.py +++ b/Lib/htmllib.py @@ -1,633 +1,377 @@ -# A parser for HTML documents +# New HTML class +# XXX Check against HTML 2.0 spec -# HTML: HyperText Markup Language; an SGML-like syntax used by WWW to -# describe hypertext documents -# -# SGML: Standard Generalized Markup Language -# -# WWW: World-Wide Web; a distributed hypertext system develped at CERN -# -# CERN: European Particle Physics Laboratory in Geneva, Switzerland +# XXX reorder methods according to hierarchy +# - html structure: head, body, title, isindex +# - headers +# - lists, items +# - paragraph styles +# - forms +# - character styles +# - images +# - bookkeeping +# - output generation -# This file is only concerned with parsing and formatting HTML -# documents, not with the other (hypertext and networking) aspects of -# the WWW project. (It does support highlighting of anchors.) - - -import os import sys -import regex +import regsub import string -import sgmllib - - -class HTMLParser(sgmllib.SGMLParser): - - # Copy base class entities and add some - entitydefs = {} - for key in sgmllib.SGMLParser.entitydefs.keys(): - entitydefs[key] = sgmllib.SGMLParser.entitydefs[key] - entitydefs['bullet'] = '*' - - # Provided -- handlers for tags introducing literal text - - def start_listing(self, attrs): - self.setliteral('listing') - self.literal_bgn('listing', attrs) - - def end_listing(self): - self.literal_end('listing') - - def start_xmp(self, attrs): - self.setliteral('xmp') - self.literal_bgn('xmp', attrs) - - def end_xmp(self): - self.literal_end('xmp') - - def do_plaintext(self, attrs): - self.setnomoretags() - self.literal_bgn('plaintext', attrs) - - # To be overridden -- begin/end literal mode - def literal_bgn(self, tag, attrs): pass - def literal_end(self, tag): pass - - -# Next level of sophistication -- collect anchors, title, nextid and isindex -class CollectingParser(HTMLParser): - # - def __init__(self): - HTMLParser.__init__(self) - self.savetext = None - self.nextid = [] - self.isindex = 0 - self.title = '' - self.inanchor = 0 - self.anchors = [] - self.anchornames = [] - self.anchortypes = [] - # - def start_a(self, attrs): - self.inanchor = 0 - href = '' - name = '' - type = '' - for attrname, value in attrs: - if attrname == 'href': - href = value - if attrname == 'name=': - name = value - if attrname == 'type=': - type = string.lower(value) - if not (href or name): - return - self.anchors.append(href) - self.anchornames.append(name) - self.anchortypes.append(type) - self.inanchor = len(self.anchors) - if not href: - self.inanchor = -self.inanchor - # - def end_a(self): - if self.inanchor > 0: - # Don't show anchors pointing into the current document - if self.anchors[self.inanchor-1][:1] <> '#': - self.handle_data('[' + `self.inanchor` + ']') - self.inanchor = 0 - # - def start_html(self, attrs): pass - def end_html(self): pass - # - def start_head(self, attrs): pass - def end_head(self): pass - # - def start_body(self, attrs): pass - def end_body(self): pass - # - def do_nextid(self, attrs): - self.nextid = attrs - # - def do_isindex(self, attrs): - self.isindex = 1 - # - def start_title(self, attrs): - self.savetext = '' - # - def end_title(self): - if self.savetext <> None: - self.title = self.savetext - self.savetext = None - # - def handle_data(self, text): - if self.savetext is not None: - self.savetext = self.savetext + text - - -# Formatting parser -- takes a formatter and a style sheet as arguments - -# XXX The use of style sheets should change: for each tag and end tag -# there should be a style definition, and a style definition should -# encompass many more parameters: font, justification, indentation, -# vspace before, vspace after, hanging tag... - -wordprog = regex.compile('[^ \t\n]*') -spaceprog = regex.compile('[ \t\n]*') - -class FormattingParser(CollectingParser): - - def __init__(self, formatter, stylesheet): - CollectingParser.__init__(self) - self.fmt = formatter - self.stl = stylesheet - self.savetext = None - self.compact = 0 - self.nofill = 0 - self.resetfont() - self.setindent(self.stl.stdindent) - - def resetfont(self): - self.fontstack = [] - self.stylestack = [] - self.fontset = self.stl.stdfontset - self.style = ROMAN - self.passfont() - - def passfont(self): - font = self.fontset[self.style] - self.fmt.setfont(font) - - def pushstyle(self, style): - self.stylestack.append(self.style) - self.style = min(style, len(self.fontset)-1) - self.passfont() - - def popstyle(self): - self.style = self.stylestack[-1] - del self.stylestack[-1] - self.passfont() - - def pushfontset(self, fontset, style): - self.fontstack.append(self.fontset) - self.fontset = fontset - self.pushstyle(style) - - def popfontset(self): - self.fontset = self.fontstack[-1] - del self.fontstack[-1] - self.popstyle() - - def flush(self): - self.fmt.flush() - - def setindent(self, n): - self.fmt.setleftindent(n) - - def needvspace(self, n): - self.fmt.needvspace(n) - - def close(self): - HTMLParser.close(self) - self.fmt.flush() - - def handle_literal(self, text): - lines = string.splitfields(text, '\n') - for i in range(1, len(lines)): - lines[i] = string.expandtabs(lines[i], 8) - for line in lines[:-1]: - self.fmt.addword(line, 0) - self.fmt.flush() - self.fmt.nospace = 0 - for line in lines[-1:]: - self.fmt.addword(line, 0) - - def handle_data(self, text): - if self.savetext is not None: - self.savetext = self.savetext + text - return - if self.literal: - self.handle_literal(text) - return - i = 0 - n = len(text) - while i < n: - j = i + wordprog.match(text, i) - word = text[i:j] - i = j + spaceprog.match(text, j) - self.fmt.addword(word, i-j) - if self.nofill and '\n' in text[j:i]: - self.fmt.flush() - self.fmt.nospace = 0 - i = j+1 - while text[i-1] <> '\n': i = i+1 - - def literal_bgn(self, tag, attrs): - if tag == 'plaintext': - self.flush() - else: - self.needvspace(1) - self.pushfontset(self.stl.stdfontset, FIXED) - self.setindent(self.stl.literalindent) - - def literal_end(self, tag): - self.needvspace(1) - self.popfontset() - self.setindent(self.stl.stdindent) - - def start_title(self, attrs): - self.flush() - self.savetext = '' - # NB end_title is unchanged - - def do_p(self, attrs): - if self.compact: - self.flush() - else: - self.needvspace(1) - - def start_h1(self, attrs): - self.needvspace(2) - self.setindent(self.stl.h1indent) - self.pushfontset(self.stl.h1fontset, BOLD) - self.fmt.setjust('c') - - def end_h1(self): - self.popfontset() - self.needvspace(2) - self.setindent(self.stl.stdindent) - self.fmt.setjust('l') - - def start_h2(self, attrs): - self.needvspace(1) - self.setindent(self.stl.h2indent) - self.pushfontset(self.stl.h2fontset, BOLD) - - def end_h2(self): - self.popfontset() - self.needvspace(1) - self.setindent(self.stl.stdindent) - - def start_h3(self, attrs): - self.needvspace(1) - self.setindent(self.stl.stdindent) - self.pushfontset(self.stl.h3fontset, BOLD) - - def end_h3(self): - self.popfontset() - self.needvspace(1) - self.setindent(self.stl.stdindent) - - def start_h4(self, attrs): - self.needvspace(1) - self.setindent(self.stl.stdindent) - self.pushfontset(self.stl.stdfontset, BOLD) - - def end_h4(self): - self.popfontset() - self.needvspace(1) - self.setindent(self.stl.stdindent) - - start_h5 = start_h4 - end_h5 = end_h4 - - start_h6 = start_h5 - end_h6 = end_h5 - - start_h7 = start_h6 - end_h7 = end_h6 - - def start_ul(self, attrs): - self.needvspace(1) - for attrname, value in attrs: - if attrname == 'compact': - self.compact = 1 - self.setindent(0) - break - else: - self.setindent(self.stl.ulindent) - - start_dir = start_menu = start_ol = start_ul - - do_li = do_p - - def end_ul(self): - self.compact = 0 - self.needvspace(1) - self.setindent(self.stl.stdindent) - - end_dir = end_menu = end_ol = end_ul - - def start_dl(self, attrs): - for attrname, value in attrs: - if attrname == 'compact': - self.compact = 1 - self.needvspace(1) - - def end_dl(self): - self.compact = 0 - self.needvspace(1) - self.setindent(self.stl.stdindent) +from sgmllib import SGMLParser - def do_dt(self, attrs): - if self.compact: - self.flush() - else: - self.needvspace(1) - self.setindent(self.stl.stdindent) - def do_dd(self, attrs): - self.fmt.addword('', 1) - self.setindent(self.stl.ddindent) +ROMAN = 0 +ITALIC = 1 +BOLD = 2 +FIXED = 3 - def start_address(self, attrs): - self.compact = 1 - self.needvspace(1) - self.fmt.setjust('r') - def end_address(self): - self.compact = 0 - self.needvspace(1) - self.setindent(self.stl.stdindent) - self.fmt.setjust('l') +class HTMLParser(SGMLParser): + + def __init__(self): + SGMLParser.__init__(self) + self.savedata = None + self.isindex = 0 + self.title = '' + self.para = None + self.lists = [] + self.styles = [] + self.nofill = 0 + self.nospace = 1 + self.softspace = 0 + + # --- Data + + def handle_image(self, src, alt): + self.handle_data(alt) + + def handle_data(self, data): + if self.nofill: + self.handle_literal(data) + return + data = regsub.gsub('[ \t\n\r]+', ' ', data) + if self.nospace and data[:1] == ' ': data = data[1:] + if not data: return + self.nospace = 0 + if self.softspace and data[:1] != ' ': data = ' ' + data + if data[-1:] == ' ': + data = data[:-1] + self.softspace = 1 + self.output_data(data) - def start_pre(self, attrs): - self.needvspace(1) - self.nofill = self.nofill + 1 - self.pushstyle(FIXED) + def handle_literal(self, data): + self.nospace = 0 + self.softspace = 0 + self.output_data(data) - def end_pre(self): - self.popstyle() - self.nofill = self.nofill - 1 - self.needvspace(1) + def output_data(self, data): + if self.savedata is not None: + self.savedata = self.savedata + data + else: + self.write_data(data) - start_typewriter = start_pre - end_typewriter = end_pre + def write_data(self, data): + sys.stdout.write(data) - def do_img(self, attrs): - self.fmt.addword('(image)', 0) + def save_bgn(self): + self.savedata = '' + self.nospace = 1 + self.softspace = 0 - # Physical styles + def save_end(self): + saved = self.savedata + self.savedata = None + self.nospace = 1 + self.softspace = 0 + return saved + + def new_para(self): + pass + + def new_style(self): + pass + + # --- Generic style changes + + def para_bgn(self, tag): + if not self.nospace: + self.handle_literal('\n') + self.nospace = 1 + self.softspace = 0 + if tag is not None: + self.para = tag + self.new_para() + + def para_end(self): + self.para_bgn('') + + def push_list(self, tag): + self.lists.append(tag) + self.para_bgn(None) + + def pop_list(self): + del self.lists[-1] + self.para_end() + + def literal_bgn(self, tag, attrs): + self.para_bgn(tag) + + def literal_end(self, tag): + self.para_end() - def start_tt(self, attrs): self.pushstyle(FIXED) - def end_tt(self): self.popstyle() + def push_style(self, tag): + self.styles.append(tag) + self.new_style() - def start_b(self, attrs): self.pushstyle(BOLD) - def end_b(self): self.popstyle() + def pop_style(self): + del self.styles[-1] + self.new_style() + + def anchor_bgn(self, href, name, type): + self.push_style(href and 'a' or None) + + def anchor_end(self): + self.pop_style() + + # --- Top level tags - def start_i(self, attrs): self.pushstyle(ITALIC) - def end_i(self): self.popstyle() + def start_html(self, attrs): pass + def end_html(self): pass - def start_u(self, attrs): self.pushstyle(ITALIC) # Underline??? - def end_u(self): self.popstyle() + def start_head(self, attrs): pass + def end_head(self): pass - def start_r(self, attrs): self.pushstyle(ROMAN) # Not official - def end_r(self): self.popstyle() + def start_body(self, attrs): pass + def end_body(self): pass - # Logical styles + def do_isindex(self, attrs): + self.isindex = 1 - start_em = start_i - end_em = end_i + def start_title(self, attrs): + self.save_bgn() - start_strong = start_b - end_strong = end_b + def end_title(self): + self.title = self.save_end() - start_code = start_tt - end_code = end_tt + # --- Old HTML 'literal text' tags - start_samp = start_tt - end_samp = end_tt + def start_listing(self, attrs): + self.setliteral('listing') + self.literal_bgn('listing', attrs) - start_kbd = start_tt - end_kbd = end_tt + def end_listing(self): + self.literal_end('listing') - start_file = start_tt # unofficial - end_file = end_tt + def start_xmp(self, attrs): + self.setliteral('xmp') + self.literal_bgn('xmp', attrs) - start_var = start_i - end_var = end_i + def end_xmp(self): + self.literal_end('xmp') - start_dfn = start_i - end_dfn = end_i + def do_plaintext(self, attrs): + self.setnomoretags() + self.literal_bgn('plaintext', attrs) - start_cite = start_i - end_cite = end_i + # --- Anchors - start_hp1 = start_i - end_hp1 = start_i + def start_a(self, attrs): + href = '' + name = '' + type = '' + for attrname, value in attrs: + if attrname == 'href': + href = value + if attrname == 'name': + name = value + if attrname == 'type': + type = string.lower(value) + if not (href or name): + return + self.anchor_bgn(href, name, type) - start_hp2 = start_b - end_hp2 = end_b + def end_a(self): + self.anchor_end() - def unknown_starttag(self, tag, attrs): - print '*** unknown <' + tag + '>' + # --- Paragraph tags - def unknown_endtag(self, tag): - print '*** unknown </' + tag + '>' + def do_p(self, attrs): + self.para_bgn(None) + def do_br(self, attrs): + self.handle_literal('\n') + self.nospace = 1 + self.softspace = 0 -# An extension of the formatting parser which formats anchors differently. -class AnchoringParser(FormattingParser): + def do_hr(self, attrs): + self.para_bgn(None) + self.handle_literal('-'*40) + self.para_end() - def start_a(self, attrs): - FormattingParser.start_a(self, attrs) - if self.inanchor: - self.fmt.bgn_anchor(self.inanchor) + def start_h1(self, attrs): + self.para_bgn('h1') - def end_a(self): - if self.inanchor: - self.fmt.end_anchor(self.inanchor) - self.inanchor = 0 + def start_h2(self, attrs): + self.para_bgn('h2') + def start_h3(self, attrs): + self.para_bgn('h3') -# Style sheet -- this is never instantiated, but the attributes -# of the class object itself are used to specify fonts to be used -# for various paragraph styles. -# A font set is a non-empty list of fonts, in the order: -# [roman, italic, bold, fixed]. -# When a style is not available the nearest lower style is used + def start_h4(self, attrs): + self.para_bgn('h4') -ROMAN = 0 -ITALIC = 1 -BOLD = 2 -FIXED = 3 + def start_h5(self, attrs): + self.para_bgn('h5') + + def start_h6(self, attrs): + self.para_bgn('h6') + + def end_h1(self): + self.para_end() + + end_h2 = end_h1 + end_h3 = end_h2 + end_h4 = end_h3 + end_h5 = end_h4 + end_h6 = end_h5 + + def start_ul(self, attrs): + self.para_bgn(None) + self.push_list('ul') + + def start_ol(self, attrs): + self.para_bgn(None) + self.push_list('ol') + + def end_ul(self): + self.pop_list() + self.para_end() + + def do_li(self, attrs): + self.para_bgn('li%d' % len(self.lists)) + + start_dir = start_menu = start_ul + end_dir = end_menu = end_ol = end_ul + + def start_dl(self, attrs): + self.para_bgn(None) + self.push_list('dl') + + def end_dl(self): + self.pop_list() + self.para_end() + + def do_dt(self, attrs): + self.para_bgn('dt%d' % len(self.lists)) + + def do_dd(self, attrs): + self.para_bgn('dd%d' % len(self.lists)) + + def start_address(self, attrs): + self.para_bgn('address') + + def end_address(self): + self.para_end() + + def start_pre(self, attrs): + self.para_bgn('pre') + self.nofill = self.nofill + 1 + + def end_pre(self): + self.nofill = self.nofill - 1 + self.para_end() + + start_typewriter = start_pre + end_typewriter = end_pre + + def do_img(self, attrs): + src = '' + alt = ' (image) ' + for attrname, value in attrs: + if attrname == 'alt': + alt = value + if attrname == 'src': + src = value + self.handle_image(src, alt) + + # --- Character tags -- physical styles + + def start_tt(self, attrs): self.push_style(FIXED) + def end_tt(self): self.pop_style() + + def start_b(self, attrs): self.push_style(BOLD) + def end_b(self): self.pop_style() + + def start_i(self, attrs): self.push_style(ITALIC) + def end_i(self): self.pop_style() + + def start_u(self, attrs): self.push_style(ITALIC) # Underline??? + def end_u(self): self.pop_style() + + def start_r(self, attrs): self.push_style(ROMAN) # Not official + def end_r(self): self.pop_style() + + # --- Charaacter tags -- logical styles + + start_em = start_i + end_em = end_i + + start_strong = start_b + end_strong = end_b + + start_code = start_tt + end_code = end_tt + + start_samp = start_tt + end_samp = end_tt + + start_kbd = start_tt + end_kbd = end_tt + + start_file = start_tt # unofficial + end_file = end_tt + + start_var = start_i + end_var = end_i + + start_dfn = start_i + end_dfn = end_i + + start_cite = start_i + end_cite = end_i + + start_hp1 = start_i + end_hp1 = start_i + + start_hp2 = start_b + end_hp2 = end_b + + # --- Form tags + + def start_form(self, attrs): + self.para_bgn(None) + + def end_form(self): + self.para_end() + + # --- Unhandled tags + + def unknown_starttag(self, tag, attrs): + pass + + def unknown_endtag(self, tag): + pass -class NullStylesheet: - # Fonts -- none - stdfontset = [None] - h1fontset = [None] - h2fontset = [None] - h3fontset = [None] - # Indents - stdindent = 2 - ddindent = 25 - ulindent = 4 - h1indent = 0 - h2indent = 0 - literalindent = 0 - - -class X11Stylesheet(NullStylesheet): - stdfontset = [ - '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*', - '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*', - '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*', - '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*', - ] - h1fontset = [ - '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*', - '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*', - '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*', - ] - h2fontset = [ - '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*', - '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*', - '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*', - ] - h3fontset = [ - '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*', - '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*', - '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*', - ] - ddindent = 40 - - -class MacStylesheet(NullStylesheet): - stdfontset = [ - ('Geneva', 'p', 10), - ('Geneva', 'i', 10), - ('Geneva', 'b', 10), - ('Monaco', 'p', 10), - ] - h1fontset = [ - ('Geneva', 'p', 18), - ('Geneva', 'i', 18), - ('Geneva', 'b', 18), - ('Monaco', 'p', 18), - ] - h3fontset = [ - ('Geneva', 'p', 14), - ('Geneva', 'i', 14), - ('Geneva', 'b', 14), - ('Monaco', 'p', 14), - ] - h3fontset = [ - ('Geneva', 'p', 12), - ('Geneva', 'i', 12), - ('Geneva', 'b', 12), - ('Monaco', 'p', 12), - ] - - -if os.name == 'mac': - StdwinStylesheet = MacStylesheet -else: - StdwinStylesheet = X11Stylesheet - - -class GLStylesheet(NullStylesheet): - stdfontset = [ - 'Helvetica 10', - 'Helvetica-Italic 10', - 'Helvetica-Bold 10', - 'Courier 10', - ] - h1fontset = [ - 'Helvetica 18', - 'Helvetica-Italic 18', - 'Helvetica-Bold 18', - 'Courier 18', - ] - h2fontset = [ - 'Helvetica 14', - 'Helvetica-Italic 14', - 'Helvetica-Bold 14', - 'Courier 14', - ] - h3fontset = [ - 'Helvetica 12', - 'Helvetica-Italic 12', - 'Helvetica-Bold 12', - 'Courier 12', - ] - - -# Test program -- produces no output but times how long it takes -# to send a document to a null formatter, exclusive of I/O def test(): - import fmt - import time - if sys.argv[1:]: file = sys.argv[1] - else: file = 'test.html' - data = open(file, 'r').read() - t0 = time.time() - fmtr = fmt.WritingFormatter(sys.stdout, 79) - p = FormattingParser(fmtr, NullStylesheet) - p.feed(data) - p.close() - t1 = time.time() - print - print '*** Formatting time:', round(t1-t0, 3), 'seconds.' - - -# Test program using stdwin - -def testStdwin(): - import stdwin, fmt - from stdwinevents import * - if sys.argv[1:]: file = sys.argv[1] - else: file = 'test.html' - data = open(file, 'r').read() - window = stdwin.open('testStdwin') - b = None - while 1: - etype, ewin, edetail = stdwin.getevent() - if etype == WE_CLOSE: - break - if etype == WE_SIZE: - window.setdocsize(0, 0) - window.setorigin(0, 0) - window.change((0, 0), (10000, 30000)) # XXX - if etype == WE_DRAW: - if not b: - b = fmt.StdwinBackEnd(window, 1) - f = fmt.BaseFormatter(b.d, b) - p = FormattingParser(f, MacStylesheet) - p.feed(data) - p.close() - b.finish() - else: - b.redraw(edetail) - window.close() - - -# Test program using GL - -def testGL(): - import gl, GL, fmt - if sys.argv[1:]: file = sys.argv[1] - else: file = 'test.html' - data = open(file, 'r').read() - W, H = 600, 600 - gl.foreground() - gl.prefsize(W, H) - wid = gl.winopen('testGL') - gl.ortho2(0, W, H, 0) - gl.color(GL.WHITE) - gl.clear() - gl.color(GL.BLACK) - b = fmt.GLBackEnd(wid) - f = fmt.BaseFormatter(b.d, b) - p = FormattingParser(f, GLStylesheet) - p.feed(data) - p.close() - b.finish() - # - import time - time.sleep(5) + file = 'test.html' + f = open(file, 'r') + data = f.read() + f.close() + p = HTMLParser() + p.feed(data) + p.close() if __name__ == '__main__': - test() + test() |