# A parser for HTML documents # HTML: HyperText Markup Language; an SGML-like syntax used by WWW to # describe hypertext documents # # SGML: Standard Generalized Markup Language # # WWW: World-Wide Web; a distributed hypertext system develped at CERN # # CERN: European Particle Physics Laboratory in Geneva, Switzerland # This file is only concerned with parsing and formatting HTML # documents, not with the other (hypertext and networking) aspects of # the WWW project. (It does support highlighting of anchors.) import os import sys import regex import string import sgmllib class HTMLParser(sgmllib.SGMLParser): # Copy base class entities and add some entitydefs = {} for key in sgmllib.SGMLParser.entitydefs.keys(): entitydefs[key] = sgmllib.SGMLParser.entitydefs[key] entitydefs['bullet'] = '*' # Provided -- handlers for tags introducing literal text def start_listing(self, attrs): self.setliteral('listing') self.literal_bgn('listing', attrs) def end_listing(self): self.literal_end('listing') def start_xmp(self, attrs): self.setliteral('xmp') self.literal_bgn('xmp', attrs) def end_xmp(self): self.literal_end('xmp') def do_plaintext(self, attrs): self.setnomoretags() self.literal_bgn('plaintext', attrs) # To be overridden -- begin/end literal mode def literal_bgn(self, tag, attrs): pass def literal_end(self, tag): pass # Next level of sophistication -- collect anchors, title, nextid and isindex class CollectingParser(HTMLParser): # def __init__(self): HTMLParser.__init__(self) self.savetext = None self.nextid = '' self.isindex = 0 self.title = '' self.inanchor = 0 self.anchors = [] self.anchornames = [] self.anchortypes = [] # def start_a(self, attrs): self.inanchor = 0 href = '' name = '' type = '' for attrname, value in attrs: if attrname == 'href': href = value if attrname == 'name=': name = value if attrname == 'type=': type = string.lower(value) if not (href or name): return self.anchors.append(href) self.anchornames.append(name) self.anchortypes.append(type) 
self.inanchor = len(self.anchors) if not href: self.inanchor = -self.inanchor # def end_a(self): if self.inanchor > 0: # Don't show anchors pointing into the current document if self.anchors[self.inanchor-1][:1] <> '#': self.handle_data('[' + `self.inanchor` + ']') self.inanchor = 0 # def start_header(self, attrs): pass def end_header(self): pass # # (head is the same as header) def start_head(self, attrs): pass def end_head(self): pass # def start_body(self, attrs): pass def end_body(self): pass # def do_nextid(self, attrs): self.nextid = attrs # def do_isindex(self, attrs): self.isindex = 1 # def start_title(self, attrs): self.savetext = '' # def end_title(self): if self.savetext <> None: self.title = self.savetext self.savetext = None # def handle_data(self, text): if self.savetext is not None: self.savetext = self.savetext + text # Formatting parser -- takes a formatter and a style sheet as arguments # XXX The use of style sheets should change: for each tag and end tag # there should be a style definition, and a style definition should # encompass many more parameters: font, justification, indentation, # vspace before, vspace after, hanging tag... 
# Patterns used by FormattingParser.handle_data() to split text into
# words and the white space between them.  Both can match the empty
# string, so match() never fails on them.
wordprog = regex.compile('[^ \t\n]*')
spaceprog = regex.compile('[ \t\n]*')


class FormattingParser(CollectingParser):
    """HTML parser that drives a formatter according to a style sheet.

    The formatter must provide setfont(), flush(), setleftindent(),
    needvspace(), addword(), setjust() and a writable 'nospace'
    attribute.  The style sheet must provide the font sets
    (stdfontset, h1fontset, h2fontset, h3fontset) and the indents
    (stdindent, h1indent, h2indent, ulindent, ddindent, literalindent).
    """

    def __init__(self, formatter, stylesheet):
        CollectingParser.__init__(self)
        self.fmt = formatter
        self.stl = stylesheet
        self.savetext = None
        self.compact = 0        # non-zero inside compact lists/address
        self.nofill = 0         # nesting depth of <pre>-like elements
        self.resetfont()
        self.setindent(self.stl.stdindent)

    # Font and style stacks

    def resetfont(self):
        """Clear both stacks and select the standard roman font."""
        self.fontstack = []
        self.stylestack = []
        self.fontset = self.stl.stdfontset
        self.style = ROMAN
        self.passfont()

    def passfont(self):
        """Tell the formatter which font is current."""
        self.fmt.setfont(self.fontset[self.style])

    def pushstyle(self, style):
        """Save the current style and switch to a new one.

        The style is clamped to the last index of the current font set,
        so a font set with fewer entries falls back to a lower style.
        """
        self.stylestack.append(self.style)
        self.style = min(style, len(self.fontset) - 1)
        self.passfont()

    def popstyle(self):
        """Restore the style saved by the matching pushstyle()."""
        self.style = self.stylestack[-1]
        del self.stylestack[-1]
        self.passfont()

    def pushfontset(self, fontset, style):
        """Save the current font set and style, then switch to new ones."""
        self.fontstack.append(self.fontset)
        self.fontset = fontset
        self.pushstyle(style)

    def popfontset(self):
        """Restore the font set and style saved by pushfontset()."""
        self.fontset = self.fontstack[-1]
        del self.fontstack[-1]
        self.popstyle()

    # Thin wrappers around the formatter

    def flush(self):
        self.fmt.flush()

    def setindent(self, n):
        self.fmt.setleftindent(n)

    def needvspace(self, n):
        self.fmt.needvspace(n)

    def close(self):
        """Finish parsing, then flush the formatter."""
        CollectingParser.close(self)
        self.fmt.flush()

    # Text handling

    def handle_literal(self, text):
        """Pass literal text to the formatter one line at a time.

        Every complete line is added as a single word and flushed.  The
        text after the last newline is added but not flushed.  Tabs are
        expanded on every line except the first.
        """
        lines = string.splitfields(text, '\n')
        for i in range(1, len(lines)):
            lines[i] = string.expandtabs(lines[i], 8)
        for line in lines[:-1]:
            self.fmt.addword(line, 0)
            self.fmt.flush()
            self.fmt.nospace = 0
        self.fmt.addword(lines[-1], 0)

    def handle_data(self, text):
        """Dispatch text to the title buffer, literal handler or formatter.

        Ordinary text is split into words; each word goes to the
        formatter together with the amount of white space following it.
        """
        if self.savetext is not None:
            self.savetext = self.savetext + text
            return
        if self.literal:
            self.handle_literal(text)
            return
        i = 0
        n = len(text)
        while i < n:
            # match() returns the length of the match at the given offset
            j = i + wordprog.match(text, i)
            word = text[i:j]
            i = j + spaceprog.match(text, j)
            self.fmt.addword(word, i - j)
            if self.nofill and '\n' in text[j:i]:
                # In no-fill mode a newline ends the line: flush, then
                # resume just past the first newline so that any white
                # space after it is seen again on the next iteration.
                self.fmt.flush()
                self.fmt.nospace = 0
                i = j + 1
                while text[i - 1] != '\n':
                    i = i + 1

    def literal_bgn(self, tag, attrs):
        """Start literal text: fixed font at the literal indent."""
        if tag == 'plaintext':
            self.flush()
        else:
            self.needvspace(1)
        self.pushfontset(self.stl.stdfontset, FIXED)
        self.setindent(self.stl.literalindent)

    def literal_end(self, tag):
        """End literal text: restore the font and the standard indent."""
        self.needvspace(1)
        self.popfontset()
        self.setindent(self.stl.stdindent)

    def start_title(self, attrs):
        """Handle <title>: flush pending output, then collect the title."""
        self.flush()
        self.savetext = ''

    # NB end_title is unchanged

    def do_p(self, attrs):
        """Handle <p>: start a new line (compact) or leave a blank line."""
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)

    # Headings

    def start_h1(self, attrs):
        self.needvspace(2)
        self.setindent(self.stl.h1indent)
        self.pushfontset(self.stl.h1fontset, BOLD)
        self.fmt.setjust('c')

    def end_h1(self):
        self.popfontset()
        self.needvspace(2)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_h2(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.h2indent)
        self.pushfontset(self.stl.h2fontset, BOLD)

    def end_h2(self):
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def start_h3(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.h3fontset, BOLD)

    def end_h3(self):
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def start_h4(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.stdfontset, BOLD)

    def end_h4(self):
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    # h5..h7 are formatted like h4
    start_h5 = start_h4
    end_h5 = end_h4

    start_h6 = start_h5
    end_h6 = end_h5

    start_h7 = start_h6
    end_h7 = end_h6

    # Lists

    def start_ul(self, attrs):
        """Handle <ul>: a 'compact' attribute selects indent 0."""
        self.needvspace(1)
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
                self.setindent(0)
                break
        else:
            # No 'compact' attribute was found
            self.setindent(self.stl.ulindent)

    start_dir = start_menu = start_ol = start_ul

    do_li = do_p

    def end_ul(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    end_dir = end_menu = end_ol = end_ul

    def start_dl(self, attrs):
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
        self.needvspace(1)

    def end_dl(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dt(self, attrs):
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dd(self, attrs):
        self.fmt.addword('', 1)
        self.setindent(self.stl.ddindent)

    # Other block-level elements

    def start_address(self, attrs):
        self.compact = 1
        self.needvspace(1)
        self.fmt.setjust('r')

    def end_address(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_pre(self, attrs):
        self.needvspace(1)
        self.nofill = self.nofill + 1
        self.pushstyle(FIXED)

    def end_pre(self):
        self.popstyle()
        self.nofill = self.nofill - 1
        self.needvspace(1)

    start_typewriter = start_pre
    end_typewriter = end_pre

    def do_img(self, attrs):
        """Handle <img>: images are shown as the word '(image)'."""
        self.fmt.addword('(image)', 0)

    # Physical styles

    def start_tt(self, attrs):
        self.pushstyle(FIXED)

    def end_tt(self):
        self.popstyle()

    def start_b(self, attrs):
        self.pushstyle(BOLD)

    def end_b(self):
        self.popstyle()

    def start_i(self, attrs):
        self.pushstyle(ITALIC)

    def end_i(self):
        self.popstyle()

    def start_u(self, attrs):
        self.pushstyle(ITALIC)  # Underline???

    def end_u(self):
        self.popstyle()

    def start_r(self, attrs):
        self.pushstyle(ROMAN)  # Not official

    def end_r(self):
        self.popstyle()

    # Logical styles

    start_em = start_i
    end_em = end_i

    start_strong = start_b
    end_strong = end_b

    start_code = start_tt
    end_code = end_tt

    start_samp = start_tt
    end_samp = end_tt

    start_kbd = start_tt
    end_kbd = end_tt

    start_file = start_tt  # unofficial
    end_file = end_tt

    start_var = start_i
    end_var = end_i

    start_dfn = start_i
    end_dfn = end_i

    start_cite = start_i
    end_cite = end_i

    start_hp1 = start_i
    # Must be the end handler: binding start_i here would push another
    # style on </hp1> instead of popping the one pushed by <hp1>.
    end_hp1 = end_i

    start_hp2 = start_b
    end_hp2 = end_b

    # Diagnostics for tags without a handler

    def unknown_starttag(self, tag, attrs):
        print('*** unknown <' + tag + '>')

    def unknown_endtag(self, tag):
        print('*** unknown </' + tag + '>')


# An extension of the formatting parser which formats anchors differently.

class AnchoringParser(FormattingParser):
    """Formatting parser that brackets anchors with formatter calls.

    The formatter is told where each recorded anchor begins and ends
    through bgn_anchor() / end_anchor(); no '[n]' marker is inserted
    into the text.
    """

    def start_a(self, attrs):
        FormattingParser.start_a(self, attrs)
        if self.inanchor:
            self.fmt.bgn_anchor(self.inanchor)

    def end_a(self):
        if self.inanchor:
            self.fmt.end_anchor(self.inanchor)
            self.inanchor = 0


# Style sheet -- this is never instantiated, but the attributes
# of the class object itself are used to specify fonts to be used
# for various paragraph styles.
# A font set is a non-empty list of fonts, in the order:
# [roman, italic, bold, fixed].
# When a style is not available the nearest lower style is used

# Indices into a font set
ROMAN = 0
ITALIC = 1
BOLD = 2
FIXED = 3


class NullStylesheet:
    """Style sheet without fonts; supplies the default indents."""

    # Fonts -- none
    stdfontset = [None]
    h1fontset = [None]
    h2fontset = [None]
    h3fontset = [None]

    # Indents
    stdindent = 2
    ddindent = 25
    ulindent = 4
    h1indent = 0
    h2indent = 0
    literalindent = 0


class X11Stylesheet(NullStylesheet):
    """Style sheet using X11 font names.

    The heading font sets have no fixed-width entry.
    """

    stdfontset = [
        '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
    ]
    h1fontset = [
        '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
    ]
    h2fontset = [
        '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
    ]
    h3fontset = [
        '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
    ]
    ddindent = 40


class MacStylesheet(NullStylesheet):
    """Style sheet whose fonts are (name, style, size) tuples."""

    stdfontset = [
        ('Geneva', 'p', 10),
        ('Geneva', 'i', 10),
        ('Geneva', 'b', 10),
        ('Monaco', 'p', 10),
    ]
    h1fontset = [
        ('Geneva', 'p', 18),
        ('Geneva', 'i', 18),
        ('Geneva', 'b', 18),
        ('Monaco', 'p', 18),
    ]
    # This 14-point set was previously also named h3fontset, so it was
    # overwritten by the 12-point set below and h2fontset stayed [None].
    h2fontset = [
        ('Geneva', 'p', 14),
        ('Geneva', 'i', 14),
        ('Geneva', 'b', 14),
        ('Monaco', 'p', 14),
    ]
    h3fontset = [
        ('Geneva', 'p', 12),
        ('Geneva', 'i', 12),
        ('Geneva', 'b', 12),
        ('Monaco', 'p', 12),
    ]


# The style sheet to use with stdwin depends on the platform
if os.name == 'mac':
    StdwinStylesheet = MacStylesheet
else:
    StdwinStylesheet = X11Stylesheet


class GLStylesheet(NullStylesheet):
    """Style sheet whose fonts are 'Name size' strings."""

    stdfontset = [
        'Helvetica 10',
        'Helvetica-Italic 10',
        'Helvetica-Bold 10',
        'Courier 10',
    ]
    h1fontset = [
        'Helvetica 18',
        'Helvetica-Italic 18',
        'Helvetica-Bold 18',
        'Courier 18',
    ]
    h2fontset = [
        'Helvetica 14',
        'Helvetica-Italic 14',
        'Helvetica-Bold 14',
        'Courier 14',
    ]
    h3fontset = [
        'Helvetica 12',
        'Helvetica-Italic 12',
        'Helvetica-Bold 12',
        'Courier 12',
    ]


# Test program -- formats a document to sys.stdout with the null style
# sheet and reports how long parsing and formatting took, exclusive of
# the time needed to read the file

def test():
    """Format sys.argv[1] (default 'test.html') and print the time taken."""
    import fmt
    import time
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    # Close the file explicitly instead of leaking the handle
    f = open(filename, 'r')
    try:
        data = f.read()
    finally:
        f.close()
    t0 = time.time()
    fmtr = fmt.WritingFormatter(sys.stdout, 79)
    p = FormattingParser(fmtr, NullStylesheet)
    p.feed(data)
    p.close()
    t1 = time.time()
    print('')
    print('*** Formatting time: ' + str(round(t1 - t0, 3)) + ' seconds.')

# Helper shared by the interactive test programs

def _read_document():
    """Return the contents of sys.argv[1], or of 'test.html' if absent."""
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    # Close the file explicitly instead of leaking the handle
    f = open(filename, 'r')
    try:
        return f.read()
    finally:
        f.close()


# Test program using stdwin

def testStdwin():
    """Show the formatted document in a stdwin window until it is closed."""
    import stdwin, fmt
    # Import only the event codes used here rather than 'import *'
    from stdwinevents import WE_CLOSE, WE_SIZE, WE_DRAW
    data = _read_document()
    window = stdwin.open('testStdwin')
    b = None
    while 1:
        etype, ewin, edetail = stdwin.getevent()
        if etype == WE_CLOSE:
            break
        if etype == WE_SIZE:
            window.setdocsize(0, 0)
            window.setorigin(0, 0)
            window.change((0, 0), (10000, 30000))  # XXX
        if etype == WE_DRAW:
            if not b:
                # First draw event: parse and format the document
                b = fmt.StdwinBackEnd(window, 1)
                f = fmt.BaseFormatter(b.d, b)
                # Use the style sheet selected for this platform
                # NOTE(review): this was hard-coded to MacStylesheet;
                # confirm that was not intentional.
                p = FormattingParser(f, StdwinStylesheet)
                p.feed(data)
                p.close()
                b.finish()
            else:
                b.redraw(edetail)
    window.close()


# Test program using GL

def testGL():
    """Show the formatted document in a GL window for five seconds."""
    import gl, GL, fmt
    import time
    data = _read_document()
    W, H = 600, 600
    gl.foreground()
    gl.prefsize(W, H)
    wid = gl.winopen('testGL')
    gl.ortho2(0, W, H, 0)
    gl.color(GL.WHITE)
    gl.clear()
    gl.color(GL.BLACK)
    b = fmt.GLBackEnd(wid)
    f = fmt.BaseFormatter(b.d, b)
    p = FormattingParser(f, GLStylesheet)
    p.feed(data)
    p.close()
    b.finish()
    # Keep the window on screen for a while before returning
    time.sleep(5)


if __name__ == '__main__':
    test()