summary refs log tree commit diff stats
path: root/Lib/htmllib.py
diff options
context:
space:
mode:
author Guido van Rossum <guido@python.org> 1995-08-07 20:07:44 (GMT)
committer Guido van Rossum <guido@python.org> 1995-08-07 20:07:44 (GMT)
commit f54d967fec5d3287a33b965316513a7250fa8de6 (patch)
tree 4be39677c8295d5eb48d9356a6ea9e4ceeb28f07 /Lib/htmllib.py
parent a0eab1d3670897e8bc4407e8706fdec315a7daf9 (diff)
download cpython-f54d967fec5d3287a33b965316513a7250fa8de6.zip
cpython-f54d967fec5d3287a33b965316513a7250fa8de6.tar.gz
cpython-f54d967fec5d3287a33b965316513a7250fa8de6.tar.bz2
new formatter module; redid htmllib module to use it
Diffstat (limited to 'Lib/htmllib.py')
-rw-r--r-- Lib/htmllib.py 605
1 files changed, 356 insertions, 249 deletions
diff --git a/Lib/htmllib.py b/Lib/htmllib.py
index 4af446a..38312c6 100644
--- a/Lib/htmllib.py
+++ b/Lib/htmllib.py
@@ -1,139 +1,103 @@
-# New HTML class
+"""HTML 2.0 parser.
-# XXX Check against HTML 2.0 spec
-
-# XXX reorder methods according to hierarchy
-# - html structure: head, body, title, isindex
-# - headers
-# - lists, items
-# - paragraph styles
-# - forms
-# - character styles
-# - images
-# - bookkeeping
-# - output generation
+See the HTML 2.0 specification:
+http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
+"""
import sys
import regsub
import string
from sgmllib import SGMLParser
-
-
-ROMAN = 0
-ITALIC = 1
-BOLD = 2
-FIXED = 3
+from formatter import AS_IS
class HTMLParser(SGMLParser):
- def __init__(self):
- SGMLParser.__init__(self)
- self.savedata = None
- self.isindex = 0
- self.title = ''
- self.para = None
- self.lists = []
- self.styles = []
- self.nofill = 0
- self.nospace = 1
- self.softspace = 0
+ def __init__(self, formatter):
+ SGMLParser.__init__(self)
+ self.formatter = formatter
+ self.savedata = None
+ self.isindex = 0
+ self.title = None
+ self.base = None
+ self.anchor = None
+ self.anchorlist = []
+ self.nofill = 0
+ self.list_stack = []
- # --- Data
+ # ------ Methods used internally; some may be overridden
- def handle_image(self, src, alt):
- self.handle_data(alt)
+ # --- Formatter interface, taking care of 'savedata' mode;
+ # shouldn't need to be overridden
def handle_data(self, data):
- if self.nofill:
- self.handle_literal(data)
- return
- data = regsub.gsub('[ \t\n\r]+', ' ', data)
- if self.nospace and data[:1] == ' ': data = data[1:]
- if not data: return
- self.nospace = 0
- if self.softspace and data[:1] != ' ': data = ' ' + data
- if data[-1:] == ' ':
- data = data[:-1]
- self.softspace = 1
- self.output_data(data)
-
- def handle_literal(self, data):
- self.nospace = 0
- self.softspace = 0
- self.output_data(data)
-
- def output_data(self, data):
- if self.savedata is not None:
+ if self.savedata is not None:
self.savedata = self.savedata + data
- else:
- self.write_data(data)
+ else:
+ if self.nofill:
+ self.formatter.add_literal_data(data)
+ else:
+ self.formatter.add_flowing_data(data)
- def write_data(self, data):
- sys.stdout.write(data)
+ # --- Hooks to save data; shouldn't need to be overridden
def save_bgn(self):
- self.savedata = ''
- self.nospace = 1
- self.softspace = 0
+ self.savedata = ''
def save_end(self):
- saved = self.savedata
- self.savedata = None
- self.nospace = 1
- self.softspace = 0
- return saved
+ data = self.savedata
+ self.savedata = None
+ return string.join(string.split(data))
+
+ # --- Hooks for anchors; should probably be overridden
- def new_para(self):
- pass
+ def anchor_bgn(self, href, name, type):
+ self.anchor = href
+ if self.anchor:
+ self.anchorlist.append(href)
- def new_style(self):
- pass
+ def anchor_end(self):
+ if self.anchor:
+ self.handle_data("[%d]" % len(self.anchorlist))
+ self.anchor = None
- # --- Generic style changes
+ # --- Hook for images; should probably be overridden
- def para_bgn(self, tag):
- if not self.nospace:
- self.handle_literal('\n')
- self.nospace = 1
- self.softspace = 0
- if tag is not None:
- self.para = tag
- self.new_para()
+ def handle_image(self, src, alt):
+ self.handle_data(alt)
- def para_end(self):
- self.para_bgn('')
+ # --- Hooks for forms; should probably be overridden
- def push_list(self, tag):
- self.lists.append(tag)
- self.para_bgn(None)
+ def form_bgn(self, action, method, enctype):
+ self.do_p([])
+ self.handle_data("<FORM>")
- def pop_list(self):
- del self.lists[-1]
- self.para_end()
+ def form_end(self):
+ self.handle_data("</FORM>")
+ self.do_p([])
- def literal_bgn(self, tag, attrs):
- self.para_bgn(tag)
+ def handle_input(self, type, options):
+ self.handle_data("<INPUT>")
- def literal_end(self, tag):
- self.para_end()
+ def select_bgn(self, name, size, multiple):
+ self.handle_data("<SELECT>")
- def push_style(self, tag):
- self.styles.append(tag)
- self.new_style()
+ def select_end(self):
+ self.handle_data("</SELECT>")
- def pop_style(self):
- del self.styles[-1]
- self.new_style()
+ def handle_option(self, value, selected):
+ self.handle_data("<OPTION>")
- def anchor_bgn(self, href, name, type):
- self.push_style(href and 'a' or None)
+ def textarea_bgn(self, name, rows, cols):
+ self.handle_data("<TEXTAREA>")
+ self.start_pre([])
- def anchor_end(self):
- self.pop_style()
+ def textarea_end(self):
+ self.end_pre()
+ self.handle_data("</TEXTAREA>")
- # --- Top level tags
+ # --------- Top level elements
def start_html(self, attrs): pass
def end_html(self): pass
@@ -144,231 +108,374 @@ class HTMLParser(SGMLParser):
def start_body(self, attrs): pass
def end_body(self): pass
- def do_isindex(self, attrs):
- self.isindex = 1
+ # ------ Head elements
def start_title(self, attrs):
- self.save_bgn()
+ self.save_bgn()
def end_title(self):
- self.title = self.save_end()
+ self.title = self.save_end()
- # --- Old HTML 'literal text' tags
+ def do_base(self, attrs):
+ for a, v in attrs:
+ if a == 'href':
+ self.base = v
- def start_listing(self, attrs):
- self.setliteral('listing')
- self.literal_bgn('listing', attrs)
+ def do_isindex(self, attrs):
+ self.isindex = 1
- def end_listing(self):
- self.literal_end('listing')
+ def do_link(self, attrs):
+ pass
- def start_xmp(self, attrs):
- self.setliteral('xmp')
- self.literal_bgn('xmp', attrs)
+ def do_meta(self, attrs):
+ pass
- def end_xmp(self):
- self.literal_end('xmp')
+ def do_nextid(self, attrs): # Deprecated
+ pass
- def do_plaintext(self, attrs):
- self.setnomoretags()
- self.literal_bgn('plaintext', attrs)
+ # ------ Body elements
- # --- Anchors
+ # --- Headings
- def start_a(self, attrs):
- href = ''
- name = ''
- type = ''
- for attrname, value in attrs:
- if attrname == 'href':
- href = value
- if attrname == 'name':
- name = value
- if attrname == 'type':
- type = string.lower(value)
- if not (href or name):
- return
- self.anchor_bgn(href, name, type)
+ def start_h1(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_font(('h1', 0, 1, 0))
- def end_a(self):
- self.anchor_end()
+ def end_h1(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_font()
+
+ def start_h2(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_font(('h2', 0, 1, 0))
+
+ def end_h2(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_font()
+
+ def start_h3(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_font(('h3', 0, 1, 0))
+
+ def end_h3(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_font()
- # --- Paragraph tags
+ def start_h4(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_font(('h4', 0, 1, 0))
+
+ def end_h4(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_font()
+
+ def start_h5(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_font(('h5', 0, 1, 0))
+
+ def end_h5(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_font()
+
+ def start_h6(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_font(('h6', 0, 1, 0))
+
+ def end_h6(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_font()
+
+ # --- Block Structuring Elements
def do_p(self, attrs):
- self.para_bgn(None)
+ self.formatter.end_paragraph(1)
- def do_br(self, attrs):
- self.handle_literal('\n')
- self.nospace = 1
- self.softspace = 0
+ def start_pre(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
+ self.nofill = self.nofill + 1
- def do_hr(self, attrs):
- self.para_bgn(None)
- self.handle_literal('-'*40)
- self.para_end()
+ def end_pre(self):
+ self.formatter.end_paragraph(1)
+ self.formatter.pop_font()
+ self.nofill = max(0, self.nofill - 1)
- def start_h1(self, attrs):
- self.para_bgn('h1')
+ def start_xmp(self, attrs):
+ self.start_pre(attrs)
+ self.setliteral('xmp') # Tell SGML parser
- def start_h2(self, attrs):
- self.para_bgn('h2')
+ def end_xmp(self):
+ self.end_pre()
- def start_h3(self, attrs):
- self.para_bgn('h3')
+ def start_listing(self, attrs):
+ self.start_pre(attrs)
+ self.setliteral('listing') # Tell SGML parser
- def start_h4(self, attrs):
- self.para_bgn('h4')
+ def end_listing(self):
+ self.end_pre()
- def start_h5(self, attrs):
- self.para_bgn('h5')
+ def start_address(self, attrs):
+ self.formatter.end_paragraph(0)
+ self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
- def start_h6(self, attrs):
- self.para_bgn('h6')
+ def end_address(self):
+ self.formatter.end_paragraph(0)
+ self.formatter.pop_font()
- def end_h1(self):
- self.para_end()
+ def start_blockquote(self, attrs):
+ self.formatter.end_paragraph(1)
+ self.formatter.push_margin('blockquote')
- end_h2 = end_h1
- end_h3 = end_h2
- end_h4 = end_h3
- end_h5 = end_h4
- end_h6 = end_h5
+ def end_blockquote(self):
+ self.formatter.end_paragraph(0)
+ self.formatter.pop_margin()
- def start_ul(self, attrs):
- self.para_bgn(None)
- self.push_list('ul')
+ # --- List Elements
- def start_ol(self, attrs):
- self.para_bgn(None)
- self.push_list('ol')
+ def start_ul(self, attrs):
+ self.formatter.end_paragraph(not self.list_stack)
+ self.formatter.push_margin('ul')
+ self.list_stack.append(['ul', '*', 0])
def end_ul(self):
- self.pop_list()
- self.para_end()
+ if self.list_stack: del self.list_stack[-1]
+ self.formatter.end_paragraph(not self.list_stack)
+ self.formatter.pop_margin()
def do_li(self, attrs):
- self.para_bgn('li%d' % len(self.lists))
+ self.formatter.end_paragraph(0)
+ if self.list_stack:
+ [dummy, label, counter] = top = self.list_stack[-1]
+ top[2] = counter = counter+1
+ else:
+ label, counter = '*', 0
+ self.formatter.add_label_data(label, counter)
+
+ def start_ol(self, attrs):
+ self.formatter.end_paragraph(not self.list_stack)
+ self.formatter.push_margin('ol')
+ label = '1.'
+ for a, v in attrs:
+ if a == 'type':
+ if len(v) == 1: v = v + '.'
+ label = v
+ self.list_stack.append(['ol', label, 0])
- start_dir = start_menu = start_ul
- end_dir = end_menu = end_ol = end_ul
+ def end_ol(self):
+ if self.list_stack: del self.list_stack[-1]
+ self.formatter.end_paragraph(not self.list_stack)
+ self.formatter.pop_margin()
+
+ def start_menu(self, attrs):
+ self.start_ul(attrs)
+
+ def end_menu(self):
+ self.end_ul()
+
+ def start_dir(self, attrs):
+ self.start_ul(attrs)
+
+ def end_dir(self):
+ self.end_ul()
def start_dl(self, attrs):
- self.para_bgn(None)
- self.push_list('dl')
+ self.formatter.end_paragraph(0)
+ self.list_stack.append(['dl', '', 0])
def end_dl(self):
- self.pop_list()
- self.para_end()
+ self.ddpop()
+ if self.list_stack: del self.list_stack[-1]
def do_dt(self, attrs):
- self.para_bgn('dt%d' % len(self.lists))
+ self.ddpop()
def do_dd(self, attrs):
- self.para_bgn('dd%d' % len(self.lists))
-
- def start_address(self, attrs):
- self.para_bgn('address')
+ self.ddpop()
+ self.formatter.push_margin('dd')
+ self.list_stack.append(['dd', '', 0])
- def end_address(self):
- self.para_end()
+ def ddpop(self):
+ self.formatter.end_paragraph(0)
+ if self.list_stack:
+ if self.list_stack[-1][0] == 'dd':
+ del self.list_stack[-1]
+ self.formatter.pop_margin()
- def start_pre(self, attrs):
- self.para_bgn('pre')
- self.nofill = self.nofill + 1
+ # --- Phrase Markup
- def end_pre(self):
- self.nofill = self.nofill - 1
- self.para_end()
+ # Idiomatic Elements
- start_typewriter = start_pre
- end_typewriter = end_pre
+ def start_cite(self, attrs): self.start_i(attrs)
+ def end_cite(self): self.end_i()
- def do_img(self, attrs):
- src = ''
- alt = ' (image) '
- for attrname, value in attrs:
- if attrname == 'alt':
- alt = value
- if attrname == 'src':
- src = value
- self.handle_image(src, alt)
+ def start_code(self, attrs): self.start_tt(attrs)
+ def end_code(self): self.end_tt()
- # --- Character tags -- physical styles
+ def start_em(self, attrs): self.start_i(attrs)
+ def end_em(self): self.end_i()
- def start_tt(self, attrs): self.push_style(FIXED)
- def end_tt(self): self.pop_style()
+ def start_kbd(self, attrs): self.start_tt(attrs)
+ def end_kbd(self): self.end_tt()
- def start_b(self, attrs): self.push_style(BOLD)
- def end_b(self): self.pop_style()
+ def start_samp(self, attrs): self.start_tt(attrs)
+ def end_samp(self): self.end_tt()
- def start_i(self, attrs): self.push_style(ITALIC)
- def end_i(self): self.pop_style()
+ def start_strong(self, attrs): self.start_b(attrs)  # <STRONG> is rendered bold, like <B>
+ def end_strong(self): self.end_b()  # pops the font pushed by start_strong
- def start_u(self, attrs): self.push_style(ITALIC) # Underline???
- def end_u(self): self.pop_style()
+ def start_var(self, attrs): self.start_i(attrs)
+ def end_var(self): self.end_i()  # start_var delegates to start_i, so close with end_i (was self-recursive)
- def start_r(self, attrs): self.push_style(ROMAN) # Not official
- def end_r(self): self.pop_style()
+ # Typographic Elements
- # --- Charaacter tags -- logical styles
+ def start_i(self, attrs):
+ self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
+ def end_i(self):
+ self.formatter.pop_font()
- start_em = start_i
- end_em = end_i
+ def start_b(self, attrs):
+ self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
+ def end_b(self):
+ self.formatter.pop_font()
- start_strong = start_b
- end_strong = end_b
+ def start_tt(self, attrs):
+ self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
+ def end_tt(self):
+ self.formatter.pop_font()
- start_code = start_tt
- end_code = end_tt
-
- start_samp = start_tt
- end_samp = end_tt
-
- start_kbd = start_tt
- end_kbd = end_tt
+ def start_a(self, attrs):
+ href = ''
+ name = ''
+ type = ''
+ for attrname, value in attrs:
+ if attrname == 'href':
+ href = value
+ if attrname == 'name':
+ name = value
+ if attrname == 'type':
+ type = string.lower(value)
+ self.anchor_bgn(href, name, type)
- start_file = start_tt # unofficial
- end_file = end_tt
+ def end_a(self):
+ self.anchor_end()
- start_var = start_i
- end_var = end_i
+ # --- Line Break
- start_dfn = start_i
- end_dfn = end_i
+ def do_br(self, attrs):
+ self.formatter.add_line_break()
- start_cite = start_i
- end_cite = end_i
+ # --- Horizontal Rule
- start_hp1 = start_i
- end_hp1 = start_i
+ def do_hr(self, attrs):
+ self.formatter.add_hor_rule()
- start_hp2 = start_b
- end_hp2 = end_b
+ # --- Image
- # --- Form tags
+ def do_img(self, attrs):
+ align = ''
+ alt = '(image)'
+ ismap = ''
+ src = ''
+ for attrname, value in attrs:
+ if attrname == 'align':
+ align = value
+ if attrname == 'alt':
+ alt = value
+ if attrname == 'ismap':
+ ismap = value
+ if attrname == 'src':
+ src = value
+ self.handle_image(src, alt)
+
+ # ------ Forms
def start_form(self, attrs):
- self.para_bgn(None)
+ action = ''
+ method = ''
+ enctype = ''
+ for a, v in attrs:
+ if a == 'action': action = v
+ if a == 'method': method = v
+ if a == 'enctype': enctype = v
+ self.form_bgn(action, method, enctype)
def end_form(self):
- self.para_end()
+ self.form_end()
+
+ def do_input(self, attrs):
+ type = ''
+ options = {}
+ for a, v in attrs:
+ if a == 'type': type = string.lower(v)
+ else: options[a] = v
+ self.handle_input(type, options)
+
+ def start_select(self, attrs):  # <SELECT>: gather NAME, SIZE, MULTIPLE and hand them to select_bgn()
+     name = ''
+     size = 0
+     multiple = 0
+     for a, v in attrs:
+         if a == 'multiple': multiple = 1  # presence of the attribute is what counts
+         if a == 'name': name = v
+         if a == 'size':
+             try: size = string.atoi(v)  # convert the attribute value, not the local default
+             except: pass  # non-numeric SIZE: keep the default of 0
+     self.select_bgn(name, size, multiple)
+
+ def end_select(self):
+ self.select_end()
+
+ def do_option(self, attrs):  # <OPTION>: pass VALUE and the SELECTED flag to handle_option()
+     value = ''
+     selected = 0  # an option is unselected unless the SELECTED attribute is present
+     for a, v in attrs:
+         if a == 'value': value = v
+         if a == 'selected': selected = 1
+     self.handle_option(value, selected)
+
+ def start_textarea(self, attrs):
+ name = ''
+ rows = 0
+ cols = 0
+ for a, v in attrs:
+ if a == 'name': name = v
+ if a == 'rows':
+ try: rows = string.atoi(v)
+ except: pass
+ if a == 'cols':
+ try: cols = string.atoi(v)
+ except: pass
+ self.textarea_bgn(name, rows, cols)
+
+ def end_textarea(self):
+ self.textarea_end()
+
+ # --- Really Old Unofficial Deprecated Stuff
+
+ def do_plaintext(self, attrs):
+ self.start_pre(attrs)
+ self.setnomoretags() # Tell SGML parser
# --- Unhandled tags
def unknown_starttag(self, tag, attrs):
- pass
+ pass
def unknown_endtag(self, tag):
- pass
+ pass
def test():
+ import sys
file = 'test.html'
- f = open(file, 'r')
- data = f.read()
- f.close()
- p = HTMLParser()
+ if sys.argv[1:]: file = sys.argv[1]
+ fp = open(file, 'r')
+ data = fp.read()
+ fp.close()
+ from formatter import DumbWriter, AbstractFormatter
+ w = DumbWriter()
+ f = AbstractFormatter(w)
+ p = HTMLParser(f)
p.feed(data)
p.close()