summaryrefslogtreecommitdiffstats
path: root/Lib/htmllib.py
diff options
context:
space:
mode:
authorGeorg Brandl <georg@python.org>2008-06-01 21:25:55 (GMT)
committerGeorg Brandl <georg@python.org>2008-06-01 21:25:55 (GMT)
commit877b10add4a676c3f868b86bd31e92a181b5a5b1 (patch)
tree9a7455ce3aa5f07e8f433a8aa9b59540b2cf2b40 /Lib/htmllib.py
parent6b38daa80dc0b63a089ac4557e25abe1f76b95af (diff)
downloadcpython-877b10add4a676c3f868b86bd31e92a181b5a5b1.zip
cpython-877b10add4a676c3f868b86bd31e92a181b5a5b1.tar.gz
cpython-877b10add4a676c3f868b86bd31e92a181b5a5b1.tar.bz2
Remove the htmllib and sgmllib modules as per PEP 3108.
Diffstat (limited to 'Lib/htmllib.py')
-rw-r--r--Lib/htmllib.py486
1 files changed, 0 insertions, 486 deletions
diff --git a/Lib/htmllib.py b/Lib/htmllib.py
deleted file mode 100644
index a580006..0000000
--- a/Lib/htmllib.py
+++ /dev/null
@@ -1,486 +0,0 @@
-"""HTML 2.0 parser.
-
-See the HTML 2.0 specification:
-http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
-"""
-
-import sgmllib
-
-from formatter import AS_IS
-
-__all__ = ["HTMLParser", "HTMLParseError"]
-
-
-class HTMLParseError(sgmllib.SGMLParseError):
- """Error raised when an HTML document can't be parsed."""
-
-
-class HTMLParser(sgmllib.SGMLParser):
- """This is the basic HTML parser class.
-
- It supports all entity names required by the XHTML 1.0 Recommendation.
- It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
- elements.
-
- """
-
- from html.entities import entitydefs
-
- def __init__(self, formatter, verbose=0):
- """Creates an instance of the HTMLParser class.
-
- The formatter parameter is the formatter instance associated with
- the parser.
-
- """
- sgmllib.SGMLParser.__init__(self, verbose)
- self.formatter = formatter
-
- def error(self, message):
- raise HTMLParseError(message)
-
- def reset(self):
- sgmllib.SGMLParser.reset(self)
- self.savedata = None
- self.isindex = 0
- self.title = None
- self.base = None
- self.anchor = None
- self.anchorlist = []
- self.nofill = 0
- self.list_stack = []
-
- # ------ Methods used internally; some may be overridden
-
- # --- Formatter interface, taking care of 'savedata' mode;
- # shouldn't need to be overridden
-
- def handle_data(self, data):
- if self.savedata is not None:
- self.savedata = self.savedata + data
- else:
- if self.nofill:
- self.formatter.add_literal_data(data)
- else:
- self.formatter.add_flowing_data(data)
-
- # --- Hooks to save data; shouldn't need to be overridden
-
- def save_bgn(self):
- """Begins saving character data in a buffer instead of sending it
- to the formatter object.
-
- Retrieve the stored data via the save_end() method. Use of the
- save_bgn() / save_end() pair may not be nested.
-
- """
- self.savedata = ''
-
- def save_end(self):
- """Ends buffering character data and returns all data saved since
- the preceding call to the save_bgn() method.
-
- If the nofill flag is false, whitespace is collapsed to single
- spaces. A call to this method without a preceding call to the
- save_bgn() method will raise a TypeError exception.
-
- """
- data = self.savedata
- self.savedata = None
- if not self.nofill:
- data = ' '.join(data.split())
- return data
-
- # --- Hooks for anchors; should probably be overridden
-
- def anchor_bgn(self, href, name, type):
- """This method is called at the start of an anchor region.
-
- The arguments correspond to the attributes of the <A> tag with
- the same names. The default implementation maintains a list of
- hyperlinks (defined by the HREF attribute for <A> tags) within
- the document. The list of hyperlinks is available as the data
- attribute anchorlist.
-
- """
- self.anchor = href
- if self.anchor:
- self.anchorlist.append(href)
-
- def anchor_end(self):
- """This method is called at the end of an anchor region.
-
- The default implementation adds a textual footnote marker using an
- index into the list of hyperlinks created by the anchor_bgn()method.
-
- """
- if self.anchor:
- self.handle_data("[%d]" % len(self.anchorlist))
- self.anchor = None
-
- # --- Hook for images; should probably be overridden
-
- def handle_image(self, src, alt, *args):
- """This method is called to handle images.
-
- The default implementation simply passes the alt value to the
- handle_data() method.
-
- """
- self.handle_data(alt)
-
- # --------- Top level elememts
-
- def start_html(self, attrs): pass
- def end_html(self): pass
-
- def start_head(self, attrs): pass
- def end_head(self): pass
-
- def start_body(self, attrs): pass
- def end_body(self): pass
-
- # ------ Head elements
-
- def start_title(self, attrs):
- self.save_bgn()
-
- def end_title(self):
- self.title = self.save_end()
-
- def do_base(self, attrs):
- for a, v in attrs:
- if a == 'href':
- self.base = v
-
- def do_isindex(self, attrs):
- self.isindex = 1
-
- def do_link(self, attrs):
- pass
-
- def do_meta(self, attrs):
- pass
-
- def do_nextid(self, attrs): # Deprecated
- pass
-
- # ------ Body elements
-
- # --- Headings
-
- def start_h1(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_font(('h1', 0, 1, 0))
-
- def end_h1(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_font()
-
- def start_h2(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_font(('h2', 0, 1, 0))
-
- def end_h2(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_font()
-
- def start_h3(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_font(('h3', 0, 1, 0))
-
- def end_h3(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_font()
-
- def start_h4(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_font(('h4', 0, 1, 0))
-
- def end_h4(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_font()
-
- def start_h5(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_font(('h5', 0, 1, 0))
-
- def end_h5(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_font()
-
- def start_h6(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_font(('h6', 0, 1, 0))
-
- def end_h6(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_font()
-
- # --- Block Structuring Elements
-
- def do_p(self, attrs):
- self.formatter.end_paragraph(1)
-
- def start_pre(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
- self.nofill = self.nofill + 1
-
- def end_pre(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_font()
- self.nofill = max(0, self.nofill - 1)
-
- def start_xmp(self, attrs):
- self.start_pre(attrs)
- self.setliteral('xmp') # Tell SGML parser
-
- def end_xmp(self):
- self.end_pre()
-
- def start_listing(self, attrs):
- self.start_pre(attrs)
- self.setliteral('listing') # Tell SGML parser
-
- def end_listing(self):
- self.end_pre()
-
- def start_address(self, attrs):
- self.formatter.end_paragraph(0)
- self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
-
- def end_address(self):
- self.formatter.end_paragraph(0)
- self.formatter.pop_font()
-
- def start_blockquote(self, attrs):
- self.formatter.end_paragraph(1)
- self.formatter.push_margin('blockquote')
-
- def end_blockquote(self):
- self.formatter.end_paragraph(1)
- self.formatter.pop_margin()
-
- # --- List Elements
-
- def start_ul(self, attrs):
- self.formatter.end_paragraph(not self.list_stack)
- self.formatter.push_margin('ul')
- self.list_stack.append(['ul', '*', 0])
-
- def end_ul(self):
- if self.list_stack: del self.list_stack[-1]
- self.formatter.end_paragraph(not self.list_stack)
- self.formatter.pop_margin()
-
- def do_li(self, attrs):
- self.formatter.end_paragraph(0)
- if self.list_stack:
- [dummy, label, counter] = top = self.list_stack[-1]
- top[2] = counter = counter+1
- else:
- label, counter = '*', 0
- self.formatter.add_label_data(label, counter)
-
- def start_ol(self, attrs):
- self.formatter.end_paragraph(not self.list_stack)
- self.formatter.push_margin('ol')
- label = '1.'
- for a, v in attrs:
- if a == 'type':
- if len(v) == 1: v = v + '.'
- label = v
- self.list_stack.append(['ol', label, 0])
-
- def end_ol(self):
- if self.list_stack: del self.list_stack[-1]
- self.formatter.end_paragraph(not self.list_stack)
- self.formatter.pop_margin()
-
- def start_menu(self, attrs):
- self.start_ul(attrs)
-
- def end_menu(self):
- self.end_ul()
-
- def start_dir(self, attrs):
- self.start_ul(attrs)
-
- def end_dir(self):
- self.end_ul()
-
- def start_dl(self, attrs):
- self.formatter.end_paragraph(1)
- self.list_stack.append(['dl', '', 0])
-
- def end_dl(self):
- self.ddpop(1)
- if self.list_stack: del self.list_stack[-1]
-
- def do_dt(self, attrs):
- self.ddpop()
-
- def do_dd(self, attrs):
- self.ddpop()
- self.formatter.push_margin('dd')
- self.list_stack.append(['dd', '', 0])
-
- def ddpop(self, bl=0):
- self.formatter.end_paragraph(bl)
- if self.list_stack:
- if self.list_stack[-1][0] == 'dd':
- del self.list_stack[-1]
- self.formatter.pop_margin()
-
- # --- Phrase Markup
-
- # Idiomatic Elements
-
- def start_cite(self, attrs): self.start_i(attrs)
- def end_cite(self): self.end_i()
-
- def start_code(self, attrs): self.start_tt(attrs)
- def end_code(self): self.end_tt()
-
- def start_em(self, attrs): self.start_i(attrs)
- def end_em(self): self.end_i()
-
- def start_kbd(self, attrs): self.start_tt(attrs)
- def end_kbd(self): self.end_tt()
-
- def start_samp(self, attrs): self.start_tt(attrs)
- def end_samp(self): self.end_tt()
-
- def start_strong(self, attrs): self.start_b(attrs)
- def end_strong(self): self.end_b()
-
- def start_var(self, attrs): self.start_i(attrs)
- def end_var(self): self.end_i()
-
- # Typographic Elements
-
- def start_i(self, attrs):
- self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
- def end_i(self):
- self.formatter.pop_font()
-
- def start_b(self, attrs):
- self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
- def end_b(self):
- self.formatter.pop_font()
-
- def start_tt(self, attrs):
- self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
- def end_tt(self):
- self.formatter.pop_font()
-
- def start_a(self, attrs):
- href = ''
- name = ''
- type = ''
- for attrname, value in attrs:
- value = value.strip()
- if attrname == 'href':
- href = value
- if attrname == 'name':
- name = value
- if attrname == 'type':
- type = value.lower()
- self.anchor_bgn(href, name, type)
-
- def end_a(self):
- self.anchor_end()
-
- # --- Line Break
-
- def do_br(self, attrs):
- self.formatter.add_line_break()
-
- # --- Horizontal Rule
-
- def do_hr(self, attrs):
- self.formatter.add_hor_rule()
-
- # --- Image
-
- def do_img(self, attrs):
- align = ''
- alt = '(image)'
- ismap = ''
- src = ''
- width = 0
- height = 0
- for attrname, value in attrs:
- if attrname == 'align':
- align = value
- if attrname == 'alt':
- alt = value
- if attrname == 'ismap':
- ismap = value
- if attrname == 'src':
- src = value
- if attrname == 'width':
- try: width = int(value)
- except ValueError: pass
- if attrname == 'height':
- try: height = int(value)
- except ValueError: pass
- self.handle_image(src, alt, ismap, align, width, height)
-
- # --- Really Old Unofficial Deprecated Stuff
-
- def do_plaintext(self, attrs):
- self.start_pre(attrs)
- self.setnomoretags() # Tell SGML parser
-
- # --- Unhandled tags
-
- def unknown_starttag(self, tag, attrs):
- pass
-
- def unknown_endtag(self, tag):
- pass
-
-
-def test(args = None):
- import sys, formatter
-
- if not args:
- args = sys.argv[1:]
-
- silent = args and args[0] == '-s'
- if silent:
- del args[0]
-
- if args:
- file = args[0]
- else:
- file = 'test.html'
-
- if file == '-':
- f = sys.stdin
- else:
- try:
- f = open(file, 'r')
- except IOError as msg:
- print(file, ":", msg)
- sys.exit(1)
-
- data = f.read()
-
- if f is not sys.stdin:
- f.close()
-
- if silent:
- f = formatter.NullFormatter()
- else:
- f = formatter.AbstractFormatter(formatter.DumbWriter())
-
- p = HTMLParser(f)
- p.feed(data)
- p.close()
-
-
-if __name__ == '__main__':
- test()