diff options
author | Georg Brandl <georg@python.org> | 2008-06-01 21:25:55 (GMT) |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2008-06-01 21:25:55 (GMT) |
commit | 877b10add4a676c3f868b86bd31e92a181b5a5b1 (patch) | |
tree | 9a7455ce3aa5f07e8f433a8aa9b59540b2cf2b40 /Lib/htmllib.py | |
parent | 6b38daa80dc0b63a089ac4557e25abe1f76b95af (diff) | |
download | cpython-877b10add4a676c3f868b86bd31e92a181b5a5b1.zip cpython-877b10add4a676c3f868b86bd31e92a181b5a5b1.tar.gz cpython-877b10add4a676c3f868b86bd31e92a181b5a5b1.tar.bz2 |
Remove the htmllib and sgmllib modules as per PEP 3108.
Diffstat (limited to 'Lib/htmllib.py')
-rw-r--r-- | Lib/htmllib.py | 486 |
1 files changed, 0 insertions, 486 deletions
diff --git a/Lib/htmllib.py b/Lib/htmllib.py deleted file mode 100644 index a580006..0000000 --- a/Lib/htmllib.py +++ /dev/null @@ -1,486 +0,0 @@ -"""HTML 2.0 parser. - -See the HTML 2.0 specification: -http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html -""" - -import sgmllib - -from formatter import AS_IS - -__all__ = ["HTMLParser", "HTMLParseError"] - - -class HTMLParseError(sgmllib.SGMLParseError): - """Error raised when an HTML document can't be parsed.""" - - -class HTMLParser(sgmllib.SGMLParser): - """This is the basic HTML parser class. - - It supports all entity names required by the XHTML 1.0 Recommendation. - It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2 - elements. - - """ - - from html.entities import entitydefs - - def __init__(self, formatter, verbose=0): - """Creates an instance of the HTMLParser class. - - The formatter parameter is the formatter instance associated with - the parser. - - """ - sgmllib.SGMLParser.__init__(self, verbose) - self.formatter = formatter - - def error(self, message): - raise HTMLParseError(message) - - def reset(self): - sgmllib.SGMLParser.reset(self) - self.savedata = None - self.isindex = 0 - self.title = None - self.base = None - self.anchor = None - self.anchorlist = [] - self.nofill = 0 - self.list_stack = [] - - # ------ Methods used internally; some may be overridden - - # --- Formatter interface, taking care of 'savedata' mode; - # shouldn't need to be overridden - - def handle_data(self, data): - if self.savedata is not None: - self.savedata = self.savedata + data - else: - if self.nofill: - self.formatter.add_literal_data(data) - else: - self.formatter.add_flowing_data(data) - - # --- Hooks to save data; shouldn't need to be overridden - - def save_bgn(self): - """Begins saving character data in a buffer instead of sending it - to the formatter object. - - Retrieve the stored data via the save_end() method. Use of the - save_bgn() / save_end() pair may not be nested. - - """ - self.savedata = '' - - def save_end(self): - """Ends buffering character data and returns all data saved since - the preceding call to the save_bgn() method. - - If the nofill flag is false, whitespace is collapsed to single - spaces. A call to this method without a preceding call to the - save_bgn() method will raise a TypeError exception. - - """ - data = self.savedata - self.savedata = None - if not self.nofill: - data = ' '.join(data.split()) - return data - - # --- Hooks for anchors; should probably be overridden - - def anchor_bgn(self, href, name, type): - """This method is called at the start of an anchor region. - - The arguments correspond to the attributes of the <A> tag with - the same names. The default implementation maintains a list of - hyperlinks (defined by the HREF attribute for <A> tags) within - the document. The list of hyperlinks is available as the data - attribute anchorlist. - - """ - self.anchor = href - if self.anchor: - self.anchorlist.append(href) - - def anchor_end(self): - """This method is called at the end of an anchor region. - - The default implementation adds a textual footnote marker using an - index into the list of hyperlinks created by the anchor_bgn()method. - - """ - if self.anchor: - self.handle_data("[%d]" % len(self.anchorlist)) - self.anchor = None - - # --- Hook for images; should probably be overridden - - def handle_image(self, src, alt, *args): - """This method is called to handle images. - - The default implementation simply passes the alt value to the - handle_data() method. - - """ - self.handle_data(alt) - - # --------- Top level elememts - - def start_html(self, attrs): pass - def end_html(self): pass - - def start_head(self, attrs): pass - def end_head(self): pass - - def start_body(self, attrs): pass - def end_body(self): pass - - # ------ Head elements - - def start_title(self, attrs): - self.save_bgn() - - def end_title(self): - self.title = self.save_end() - - def do_base(self, attrs): - for a, v in attrs: - if a == 'href': - self.base = v - - def do_isindex(self, attrs): - self.isindex = 1 - - def do_link(self, attrs): - pass - - def do_meta(self, attrs): - pass - - def do_nextid(self, attrs): # Deprecated - pass - - # ------ Body elements - - # --- Headings - - def start_h1(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_font(('h1', 0, 1, 0)) - - def end_h1(self): - self.formatter.end_paragraph(1) - self.formatter.pop_font() - - def start_h2(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_font(('h2', 0, 1, 0)) - - def end_h2(self): - self.formatter.end_paragraph(1) - self.formatter.pop_font() - - def start_h3(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_font(('h3', 0, 1, 0)) - - def end_h3(self): - self.formatter.end_paragraph(1) - self.formatter.pop_font() - - def start_h4(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_font(('h4', 0, 1, 0)) - - def end_h4(self): - self.formatter.end_paragraph(1) - self.formatter.pop_font() - - def start_h5(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_font(('h5', 0, 1, 0)) - - def end_h5(self): - self.formatter.end_paragraph(1) - self.formatter.pop_font() - - def start_h6(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_font(('h6', 0, 1, 0)) - - def end_h6(self): - self.formatter.end_paragraph(1) - self.formatter.pop_font() - - # --- Block Structuring Elements - - def do_p(self, attrs): - self.formatter.end_paragraph(1) - - def start_pre(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) - self.nofill = self.nofill + 1 - - def end_pre(self): - self.formatter.end_paragraph(1) - self.formatter.pop_font() - self.nofill = max(0, self.nofill - 1) - - def start_xmp(self, attrs): - self.start_pre(attrs) - self.setliteral('xmp') # Tell SGML parser - - def end_xmp(self): - self.end_pre() - - def start_listing(self, attrs): - self.start_pre(attrs) - self.setliteral('listing') # Tell SGML parser - - def end_listing(self): - self.end_pre() - - def start_address(self, attrs): - self.formatter.end_paragraph(0) - self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) - - def end_address(self): - self.formatter.end_paragraph(0) - self.formatter.pop_font() - - def start_blockquote(self, attrs): - self.formatter.end_paragraph(1) - self.formatter.push_margin('blockquote') - - def end_blockquote(self): - self.formatter.end_paragraph(1) - self.formatter.pop_margin() - - # --- List Elements - - def start_ul(self, attrs): - self.formatter.end_paragraph(not self.list_stack) - self.formatter.push_margin('ul') - self.list_stack.append(['ul', '*', 0]) - - def end_ul(self): - if self.list_stack: del self.list_stack[-1] - self.formatter.end_paragraph(not self.list_stack) - self.formatter.pop_margin() - - def do_li(self, attrs): - self.formatter.end_paragraph(0) - if self.list_stack: - [dummy, label, counter] = top = self.list_stack[-1] - top[2] = counter = counter+1 - else: - label, counter = '*', 0 - self.formatter.add_label_data(label, counter) - - def start_ol(self, attrs): - self.formatter.end_paragraph(not self.list_stack) - self.formatter.push_margin('ol') - label = '1.' - for a, v in attrs: - if a == 'type': - if len(v) == 1: v = v + '.' - label = v - self.list_stack.append(['ol', label, 0]) - - def end_ol(self): - if self.list_stack: del self.list_stack[-1] - self.formatter.end_paragraph(not self.list_stack) - self.formatter.pop_margin() - - def start_menu(self, attrs): - self.start_ul(attrs) - - def end_menu(self): - self.end_ul() - - def start_dir(self, attrs): - self.start_ul(attrs) - - def end_dir(self): - self.end_ul() - - def start_dl(self, attrs): - self.formatter.end_paragraph(1) - self.list_stack.append(['dl', '', 0]) - - def end_dl(self): - self.ddpop(1) - if self.list_stack: del self.list_stack[-1] - - def do_dt(self, attrs): - self.ddpop() - - def do_dd(self, attrs): - self.ddpop() - self.formatter.push_margin('dd') - self.list_stack.append(['dd', '', 0]) - - def ddpop(self, bl=0): - self.formatter.end_paragraph(bl) - if self.list_stack: - if self.list_stack[-1][0] == 'dd': - del self.list_stack[-1] - self.formatter.pop_margin() - - # --- Phrase Markup - - # Idiomatic Elements - - def start_cite(self, attrs): self.start_i(attrs) - def end_cite(self): self.end_i() - - def start_code(self, attrs): self.start_tt(attrs) - def end_code(self): self.end_tt() - - def start_em(self, attrs): self.start_i(attrs) - def end_em(self): self.end_i() - - def start_kbd(self, attrs): self.start_tt(attrs) - def end_kbd(self): self.end_tt() - - def start_samp(self, attrs): self.start_tt(attrs) - def end_samp(self): self.end_tt() - - def start_strong(self, attrs): self.start_b(attrs) - def end_strong(self): self.end_b() - - def start_var(self, attrs): self.start_i(attrs) - def end_var(self): self.end_i() - - # Typographic Elements - - def start_i(self, attrs): - self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS)) - def end_i(self): - self.formatter.pop_font() - - def start_b(self, attrs): - self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS)) - def end_b(self): - self.formatter.pop_font() - - def start_tt(self, attrs): - self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1)) - def end_tt(self): - self.formatter.pop_font() - - def start_a(self, attrs): - href = '' - name = '' - type = '' - for attrname, value in attrs: - value = value.strip() - if attrname == 'href': - href = value - if attrname == 'name': - name = value - if attrname == 'type': - type = value.lower() - self.anchor_bgn(href, name, type) - - def end_a(self): - self.anchor_end() - - # --- Line Break - - def do_br(self, attrs): - self.formatter.add_line_break() - - # --- Horizontal Rule - - def do_hr(self, attrs): - self.formatter.add_hor_rule() - - # --- Image - - def do_img(self, attrs): - align = '' - alt = '(image)' - ismap = '' - src = '' - width = 0 - height = 0 - for attrname, value in attrs: - if attrname == 'align': - align = value - if attrname == 'alt': - alt = value - if attrname == 'ismap': - ismap = value - if attrname == 'src': - src = value - if attrname == 'width': - try: width = int(value) - except ValueError: pass - if attrname == 'height': - try: height = int(value) - except ValueError: pass - self.handle_image(src, alt, ismap, align, width, height) - - # --- Really Old Unofficial Deprecated Stuff - - def do_plaintext(self, attrs): - self.start_pre(attrs) - self.setnomoretags() # Tell SGML parser - - # --- Unhandled tags - - def unknown_starttag(self, tag, attrs): - pass - - def unknown_endtag(self, tag): - pass - - -def test(args = None): - import sys, formatter - - if not args: - args = sys.argv[1:] - - silent = args and args[0] == '-s' - if silent: - del args[0] - - if args: - file = args[0] - else: - file = 'test.html' - - if file == '-': - f = sys.stdin - else: - try: - f = open(file, 'r') - except IOError as msg: - print(file, ":", msg) - sys.exit(1) - - data = f.read() - - if f is not sys.stdin: - f.close() - - if silent: - f = formatter.NullFormatter() - else: - f = formatter.AbstractFormatter(formatter.DumbWriter()) - - p = HTMLParser(f) - p.feed(data) - p.close() - - -if __name__ == '__main__': - test() |