diff options
author | Guido van Rossum <guido@python.org> | 1995-02-27 13:16:55 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1995-02-27 13:16:55 (GMT) |
commit | 7c750e1e099128157430d26ffa7e2a44d87daf3c (patch) | |
tree | 7c74472b5402733b5d52519799fbc9415fc4cb6c /Lib | |
parent | eb9e9d2b2a61629e7562587a679367c3bb52c92b (diff) | |
download | cpython-7c750e1e099128157430d26ffa7e2a44d87daf3c.zip cpython-7c750e1e099128157430d26ffa7e2a44d87daf3c.tar.gz cpython-7c750e1e099128157430d26ffa7e2a44d87daf3c.tar.bz2 |
added html parser and supporting cast
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/Para.py | 408 | ||||
-rw-r--r-- | Lib/fmt.py | 621 | ||||
-rw-r--r-- | Lib/htmllib.py | 635 | ||||
-rw-r--r-- | Lib/lib-old/Para.py | 408 | ||||
-rw-r--r-- | Lib/lib-old/fmt.py | 621 | ||||
-rw-r--r-- | Lib/sgmllib.py | 321 |
6 files changed, 3014 insertions, 0 deletions
diff --git a/Lib/Para.py b/Lib/Para.py new file mode 100644 index 0000000..6a7057d --- /dev/null +++ b/Lib/Para.py @@ -0,0 +1,408 @@ +# Text formatting abstractions + + +# Oft-used type object +Int = type(0) + + +# Represent a paragraph. This is a list of words with associated +# font and size information, plus indents and justification for the +# entire paragraph. +# Once the words have been added to a paragraph, it can be laid out +# for different line widths. Once laid out, it can be rendered at +# different screen locations. Once rendered, it can be queried +# for mouse hits, and parts of the text can be highlighted +class Para: + # + def __init__(self): + self.words = [] # The words + self.just = 'l' # Justification: 'l', 'r', 'lr' or 'c' + self.indent_left = self.indent_right = self.indent_hang = 0 + # Final lay-out parameters, may change + self.left = self.top = self.right = self.bottom = \ + self.width = self.height = self.lines = None + # + # Add a word, computing size information for it. + # Words may also be added manually by appending to self.words + # Each word should be a 7-tuple: + # (font, text, width, space, stretch, ascent, descent) + def addword(self, d, font, text, space, stretch): + if font <> None: + d.setfont(font) + width = d.textwidth(text) + ascent = d.baseline() + descent = d.lineheight() - ascent + spw = d.textwidth(' ') + space = space * spw + stretch = stretch * spw + tuple = (font, text, width, space, stretch, ascent, descent) + self.words.append(tuple) + # + # Hooks to begin and end anchors -- insert numbers in the word list! 
+ def bgn_anchor(self, id): + self.words.append(id) + # + def end_anchor(self, id): + self.words.append(0) + # + # Return the total length (width) of the text added so far, in pixels + def getlength(self): + total = 0 + for word in self.words: + if type(word) <> Int: + total = total + word[2] + word[3] + return total + # + # Tab to a given position (relative to the current left indent): + # remove all stretch, add fixed space up to the new indent. + # If the current position is already beyond the tab stop, + # don't add any new space (but still remove the stretch) + def tabto(self, tab): + total = 0 + as, de = 1, 0 + for i in range(len(self.words)): + word = self.words[i] + if type(word) == Int: continue + fo, te, wi, sp, st, as, de = word + self.words[i] = fo, te, wi, sp, 0, as, de + total = total + wi + sp + if total < tab: + self.words.append(None, '', 0, tab-total, 0, as, de) + # + # Make a hanging tag: tab to hang, increment indent_left by hang, + # and reset indent_hang to -hang + def makehangingtag(self, hang): + self.tabto(hang) + self.indent_left = self.indent_left + hang + self.indent_hang = -hang + # + # Decide where the line breaks will be given some screen width + def layout(self, linewidth): + self.width = linewidth + height = 0 + self.lines = lines = [] + avail1 = self.width - self.indent_left - self.indent_right + avail = avail1 - self.indent_hang + words = self.words + i = 0 + n = len(words) + lastfont = None + while i < n: + firstfont = lastfont + charcount = 0 + width = 0 + stretch = 0 + ascent = 0 + descent = 0 + lsp = 0 + j = i + while i < n: + word = words[i] + if type(word) == Int: + if word > 0 and width >= avail: + break + i = i+1 + continue + fo, te, wi, sp, st, as, de = word + if width + wi > avail and width > 0 and wi > 0: + break + if fo <> None: + lastfont = fo + if width == 0: + firstfont = fo + charcount = charcount + len(te) + (sp > 0) + width = width + wi + sp + lsp = sp + stretch = stretch + st + lst = st + ascent = max(ascent, 
as) + descent = max(descent, de) + i = i+1 + while i > j and type(words[i-1]) == Int and \ + words[i-1] > 0: i = i-1 + width = width - lsp + if i < n: + stretch = stretch - lst + else: + stretch = 0 + tuple = i-j, firstfont, charcount, width, stretch, \ + ascent, descent + lines.append(tuple) + height = height + ascent + descent + avail = avail1 + self.height = height + # + # Call a function for all words in a line + def visit(self, wordfunc, anchorfunc): + avail1 = self.width - self.indent_left - self.indent_right + avail = avail1 - self.indent_hang + v = self.top + i = 0 + for tuple in self.lines: + wordcount, firstfont, charcount, width, stretch, \ + ascent, descent = tuple + h = self.left + self.indent_left + if i == 0: h = h + self.indent_hang + extra = 0 + if self.just == 'r': h = h + avail - width + elif self.just == 'c': h = h + (avail - width) / 2 + elif self.just == 'lr' and stretch > 0: + extra = avail - width + v2 = v + ascent + descent + for j in range(i, i+wordcount): + word = self.words[j] + if type(word) == Int: + ok = anchorfunc(self, tuple, word, \ + h, v) + if ok <> None: return ok + continue + fo, te, wi, sp, st, as, de = word + if extra > 0 and stretch > 0: + ex = extra * st / stretch + extra = extra - ex + stretch = stretch - st + else: + ex = 0 + h2 = h + wi + sp + ex + ok = wordfunc(self, tuple, word, h, v, \ + h2, v2, (j==i), (j==i+wordcount-1)) + if ok <> None: return ok + h = h2 + v = v2 + i = i + wordcount + avail = avail1 + # + # Render a paragraph in "drawing object" d, using the rectangle + # given by (left, top, right) with an unspecified bottom. + # Return the computed bottom of the text. 
+ def render(self, d, left, top, right): + if self.width <> right-left: + self.layout(right-left) + self.left = left + self.top = top + self.right = right + self.bottom = self.top + self.height + self.anchorid = 0 + try: + self.d = d + self.visit(self.__class__._renderword, \ + self.__class__._renderanchor) + finally: + self.d = None + return self.bottom + # + def _renderword(self, tuple, word, h, v, h2, v2, isfirst, islast): + if word[0] <> None: self.d.setfont(word[0]) + baseline = v + tuple[5] + self.d.text((h, baseline - word[5]), word[1]) + if self.anchorid > 0: + self.d.line((h, baseline+2), (h2, baseline+2)) + # + def _renderanchor(self, tuple, word, h, v): + self.anchorid = word + # + # Return which anchor(s) was hit by the mouse + def hitcheck(self, mouseh, mousev): + self.mouseh = mouseh + self.mousev = mousev + self.anchorid = 0 + self.hits = [] + self.visit(self.__class__._hitcheckword, \ + self.__class__._hitcheckanchor) + return self.hits + # + def _hitcheckword(self, tuple, word, h, v, h2, v2, isfirst, islast): + if self.anchorid > 0 and h <= self.mouseh <= h2 and \ + v <= self.mousev <= v2: + self.hits.append(self.anchorid) + # + def _hitcheckanchor(self, tuple, word, h, v): + self.anchorid = word + # + # Return whether the given anchor id is present + def hasanchor(self, id): + return id in self.words or -id in self.words + # + # Extract the raw text from the word list, substituting one space + # for non-empty inter-word space, and terminating with '\n' + def extract(self): + text = '' + for w in self.words: + if type(w) <> Int: + word = w[1] + if w[3]: word = word + ' ' + text = text + word + return text + '\n' + # + # Return which character position was hit by the mouse, as + # an offset in the entire text as returned by extract(). 
+ # Return None if the mouse was not in this paragraph + def whereis(self, d, mouseh, mousev): + if mousev < self.top or mousev > self.bottom: + return None + self.mouseh = mouseh + self.mousev = mousev + self.lastfont = None + self.charcount = 0 + try: + self.d = d + return self.visit(self.__class__._whereisword, \ + self.__class__._whereisanchor) + finally: + self.d = None + # + def _whereisword(self, tuple, word, h1, v1, h2, v2, isfirst, islast): + fo, te, wi, sp, st, as, de = word + if fo <> None: self.lastfont = fo + h = h1 + if isfirst: h1 = 0 + if islast: h2 = 999999 + if not (v1 <= self.mousev <= v2 and h1 <= self.mouseh <= h2): + self.charcount = self.charcount + len(te) + (sp > 0) + return + if self.lastfont <> None: + self.d.setfont(self.lastfont) + cc = 0 + for c in te: + cw = self.d.textwidth(c) + if self.mouseh <= h + cw/2: + return self.charcount + cc + cc = cc+1 + h = h+cw + self.charcount = self.charcount + cc + if self.mouseh <= (h+h2) / 2: + return self.charcount + else: + return self.charcount + 1 + # + def _whereisanchor(self, tuple, word, h, v): + pass + # + # Return screen position corresponding to position in paragraph. + # Return tuple (h, vtop, vbaseline, vbottom). 
+ # This is more or less the inverse of whereis() + def screenpos(self, d, pos): + if pos < 0: + ascent, descent = self.lines[0][5:7] + return self.left, self.top, self.top + ascent, \ + self.top + ascent + descent + self.pos = pos + self.lastfont = None + try: + self.d = d + ok = self.visit(self.__class__._screenposword, \ + self.__class__._screenposanchor) + finally: + self.d = None + if ok == None: + ascent, descent = self.lines[-1][5:7] + ok = self.right, self.bottom - ascent - descent, \ + self.bottom - descent, self.bottom + return ok + # + def _screenposword(self, tuple, word, h1, v1, h2, v2, isfirst, islast): + fo, te, wi, sp, st, as, de = word + if fo <> None: self.lastfont = fo + cc = len(te) + (sp > 0) + if self.pos > cc: + self.pos = self.pos - cc + return + if self.pos < cc: + self.d.setfont(self.lastfont) + h = h1 + self.d.textwidth(te[:self.pos]) + else: + h = h2 + ascent, descent = tuple[5:7] + return h, v1, v1+ascent, v2 + # + def _screenposanchor(self, tuple, word, h, v): + pass + # + # Invert the stretch of text between pos1 and pos2. + # If pos1 is None, the beginning is implied; + # if pos2 is None, the end is implied. + # Undoes its own effect when called again with the same arguments + def invert(self, d, pos1, pos2): + if pos1 == None: + pos1 = self.left, self.top, self.top, self.top + else: + pos1 = self.screenpos(d, pos1) + if pos2 == None: + pos2 = self.right, self.bottom,self.bottom,self.bottom + else: + pos2 = self.screenpos(d, pos2) + h1, top1, baseline1, bottom1 = pos1 + h2, top2, baseline2, bottom2 = pos2 + if bottom1 <= top2: + d.invert((h1, top1), (self.right, bottom1)) + h1 = self.left + if bottom1 < top2: + d.invert((h1, bottom1), (self.right, top2)) + top1, bottom1 = top2, bottom2 + d.invert((h1, top1), (h2, bottom2)) + + +# Test class Para +# XXX This was last used on the Mac, hence the weird fonts... 
+def test(): + import stdwin + from stdwinevents import * + words = 'The', 'quick', 'brown', 'fox', 'jumps', 'over', \ + 'the', 'lazy', 'dog.' + paralist = [] + for just in 'l', 'r', 'lr', 'c': + p = Para() + p.just = just + p.addword(stdwin, ('New York', 'p', 12), words[0], 1, 1) + for word in words[1:-1]: + p.addword(stdwin, None, word, 1, 1) + p.addword(stdwin, None, words[-1], 2, 4) + p.addword(stdwin, ('New York', 'b', 18), 'Bye!', 0, 0) + p.addword(stdwin, ('New York', 'p', 10), 'Bye!', 0, 0) + paralist.append(p) + window = stdwin.open('Para.test()') + start = stop = selpara = None + while 1: + etype, win, detail = stdwin.getevent() + if etype == WE_CLOSE: + break + if etype == WE_SIZE: + window.change((0, 0), (1000, 1000)) + if etype == WE_DRAW: + width, height = window.getwinsize() + d = None + try: + d = window.begindrawing() + d.cliprect(detail) + d.erase(detail) + v = 0 + for p in paralist: + v = p.render(d, 0, v, width) + if p == selpara and \ + start <> None and stop <> None: + p.invert(d, start, stop) + finally: + if d: d.close() + if etype == WE_MOUSE_DOWN: + if selpara and start <> None and stop <> None: + d = window.begindrawing() + selpara.invert(d, start, stop) + d.close() + start = stop = selpara = None + mouseh, mousev = detail[0] + for p in paralist: + start = p.whereis(stdwin, mouseh, mousev) + if start <> None: + selpara = p + break + if etype == WE_MOUSE_UP and start <> None and selpara: + mouseh, mousev = detail[0] + stop = selpara.whereis(stdwin, mouseh, mousev) + if stop == None: start = selpara = None + else: + if start > stop: + start, stop = stop, start + d = window.begindrawing() + selpara.invert(d, start, stop) + d.close() + window.close() diff --git a/Lib/fmt.py b/Lib/fmt.py new file mode 100644 index 0000000..c096306 --- /dev/null +++ b/Lib/fmt.py @@ -0,0 +1,621 @@ +# Text formatting abstractions + + +import string +import Para + + +# A formatter back-end object has one method that is called by the formatter: +# addpara(p), where 
p is a paragraph object. For example: + + +# Formatter back-end to do nothing at all with the paragraphs +class NullBackEnd: + # + def __init__(self): + pass + # + def addpara(self, p): + pass + # + def bgn_anchor(self, id): + pass + # + def end_anchor(self, id): + pass + + +# Formatter back-end to collect the paragraphs in a list +class SavingBackEnd(NullBackEnd): + # + def __init__(self): + self.paralist = [] + # + def addpara(self, p): + self.paralist.append(p) + # + def hitcheck(self, h, v): + hits = [] + for p in self.paralist: + if p.top <= v <= p.bottom: + for id in p.hitcheck(h, v): + if id not in hits: + hits.append(id) + return hits + # + def extract(self): + text = '' + for p in self.paralist: + text = text + (p.extract()) + return text + # + def extractpart(self, long1, long2): + if long1 > long2: long1, long2 = long2, long1 + para1, pos1 = long1 + para2, pos2 = long2 + text = '' + while para1 < para2: + ptext = self.paralist[para1].extract() + text = text + ptext[pos1:] + pos1 = 0 + para1 = para1 + 1 + ptext = self.paralist[para2].extract() + return text + ptext[pos1:pos2] + # + def whereis(self, d, h, v): + total = 0 + for i in range(len(self.paralist)): + p = self.paralist[i] + result = p.whereis(d, h, v) + if result <> None: + return i, result + return None + # + def roundtowords(self, long1, long2): + i, offset = long1 + text = self.paralist[i].extract() + while offset > 0 and text[offset-1] <> ' ': offset = offset-1 + long1 = i, offset + # + i, offset = long2 + text = self.paralist[i].extract() + n = len(text) + while offset < n-1 and text[offset] <> ' ': offset = offset+1 + long2 = i, offset + # + return long1, long2 + # + def roundtoparagraphs(self, long1, long2): + long1 = long1[0], 0 + long2 = long2[0], len(self.paralist[long2[0]].extract()) + return long1, long2 + + +# Formatter back-end to send the text directly to the drawing object +class WritingBackEnd(NullBackEnd): + # + def __init__(self, d, width): + self.d = d + self.width = width + 
self.lineno = 0 + # + def addpara(self, p): + self.lineno = p.render(self.d, 0, self.lineno, self.width) + + +# A formatter receives a stream of formatting instructions and assembles +# these into a stream of paragraphs on to a back-end. The assembly is +# parametrized by a text measurement object, which must match the output +# operations of the back-end. The back-end is responsible for splitting +# paragraphs up in lines of a given maximum width. (This is done because +# in a windowing environment, when the window size changes, there is no +# need to redo the assembly into paragraphs, but the splitting into lines +# must be done taking the new window size into account.) + + +# Formatter base class. Initialize it with a text measurement object, +# which is used for text measurements, and a back-end object, +# which receives the completed paragraphs. The formatting methods are: +# setfont(font) +# setleftindent(nspaces) +# setjust(type) where type is 'l', 'c', 'r', or 'lr' +# flush() +# vspace(nlines) +# needvspace(nlines) +# addword(word, nspaces) +class BaseFormatter: + # + def __init__(self, d, b): + # Drawing object used for text measurements + self.d = d + # + # BackEnd object receiving completed paragraphs + self.b = b + # + # Parameters of the formatting model + self.leftindent = 0 + self.just = 'l' + self.font = None + self.blanklines = 0 + # + # Parameters derived from the current font + self.space = d.textwidth(' ') + self.line = d.lineheight() + self.ascent = d.baseline() + self.descent = self.line - self.ascent + # + # Parameter derived from the default font + self.n_space = self.space + # + # Current paragraph being built + self.para = None + self.nospace = 1 + # + # Font to set on the next word + self.nextfont = None + # + def newpara(self): + return Para.Para() + # + def setfont(self, font): + if font == None: return + self.font = self.nextfont = font + d = self.d + d.setfont(font) + self.space = d.textwidth(' ') + self.line = d.lineheight() + 
self.ascent = d.baseline() + self.descent = self.line - self.ascent + # + def setleftindent(self, nspaces): + self.leftindent = int(self.n_space * nspaces) + if self.para: + hang = self.leftindent - self.para.indent_left + if hang > 0 and self.para.getlength() <= hang: + self.para.makehangingtag(hang) + self.nospace = 1 + else: + self.flush() + # + def setrightindent(self, nspaces): + self.rightindent = int(self.n_space * nspaces) + if self.para: + self.para.indent_right = self.rightindent + self.flush() + # + def setjust(self, just): + self.just = just + if self.para: + self.para.just = self.just + # + def flush(self): + if self.para: + self.b.addpara(self.para) + self.para = None + if self.font <> None: + self.d.setfont(self.font) + self.nospace = 1 + # + def vspace(self, nlines): + self.flush() + if nlines > 0: + self.para = self.newpara() + tuple = None, '', 0, 0, 0, int(nlines*self.line), 0 + self.para.words.append(tuple) + self.flush() + self.blanklines = self.blanklines + nlines + # + def needvspace(self, nlines): + self.flush() # Just to be sure + if nlines > self.blanklines: + self.vspace(nlines - self.blanklines) + # + def addword(self, text, space): + if self.nospace and not text: + return + self.nospace = 0 + self.blanklines = 0 + if not self.para: + self.para = self.newpara() + self.para.indent_left = self.leftindent + self.para.just = self.just + self.nextfont = self.font + space = int(space * self.space) + self.para.words.append(self.nextfont, text, \ + self.d.textwidth(text), space, space, \ + self.ascent, self.descent) + self.nextfont = None + # + def bgn_anchor(self, id): + if not self.para: + self.nospace = 0 + self.addword('', 0) + self.para.bgn_anchor(id) + # + def end_anchor(self, id): + if not self.para: + self.nospace = 0 + self.addword('', 0) + self.para.end_anchor(id) + + +# Measuring object for measuring text as viewed on a tty +class NullMeasurer: + # + def __init__(self): + pass + # + def setfont(self, font): + pass + # + def 
textwidth(self, text): + return len(text) + # + def lineheight(self): + return 1 + # + def baseline(self): + return 0 + + +# Drawing object for writing plain ASCII text to a file +class FileWriter: + # + def __init__(self, fp): + self.fp = fp + self.lineno, self.colno = 0, 0 + # + def setfont(self, font): + pass + # + def text(self, (h, v), str): + if not str: return + if '\n' in str: + raise ValueError, 'can\'t write \\n' + while self.lineno < v: + self.fp.write('\n') + self.colno, self.lineno = 0, self.lineno + 1 + while self.lineno > v: + # XXX This should never happen... + self.fp.write('\033[A') # ANSI up arrow + self.lineno = self.lineno - 1 + if self.colno < h: + self.fp.write(' ' * (h - self.colno)) + elif self.colno > h: + self.fp.write('\b' * (self.colno - h)) + self.colno = h + self.fp.write(str) + self.colno = h + len(str) + + +# Formatting class to do nothing at all with the data +class NullFormatter(BaseFormatter): + # + def __init__(self): + d = NullMeasurer() + b = NullBackEnd() + BaseFormatter.__init__(self, d, b) + + +# Formatting class to write directly to a file +class WritingFormatter(BaseFormatter): + # + def __init__(self, fp, width): + dm = NullMeasurer() + dw = FileWriter(fp) + b = WritingBackEnd(dw, width) + BaseFormatter.__init__(self, dm, b) + self.blanklines = 1 + # + # Suppress multiple blank lines + def needvspace(self, nlines): + BaseFormatter.needvspace(self, min(1, nlines)) + + +# A "FunnyFormatter" writes ASCII text with a twist: *bold words*, +# _italic text_ and _underlined words_, and `quoted text'. +# It assumes that the fonts are 'r', 'i', 'b', 'u', 'q': (roman, +# italic, bold, underline, quote). +# Moreover, if the font is in upper case, the text is converted to +# UPPER CASE. 
+class FunnyFormatter(WritingFormatter): + # + def flush(self): + if self.para: finalize(self.para) + WritingFormatter.flush(self) + + +# Surrounds *bold words* and _italic text_ in a paragraph with +# appropriate markers, fixing the size (assuming these characters' +# width is 1). +openchar = \ + {'b':'*', 'i':'_', 'u':'_', 'q':'`', 'B':'*', 'I':'_', 'U':'_', 'Q':'`'} +closechar = \ + {'b':'*', 'i':'_', 'u':'_', 'q':'\'', 'B':'*', 'I':'_', 'U':'_', 'Q':'\''} +def finalize(para): + oldfont = curfont = 'r' + para.words.append('r', '', 0, 0, 0, 0) # temporary, deleted at end + for i in range(len(para.words)): + fo, te, wi = para.words[i][:3] + if fo <> None: curfont = fo + if curfont <> oldfont: + if closechar.has_key(oldfont): + c = closechar[oldfont] + j = i-1 + while j > 0 and para.words[j][1] == '': j = j-1 + fo1, te1, wi1 = para.words[j][:3] + te1 = te1 + c + wi1 = wi1 + len(c) + para.words[j] = (fo1, te1, wi1) + \ + para.words[j][3:] + if openchar.has_key(curfont) and te: + c = openchar[curfont] + te = c + te + wi = len(c) + wi + para.words[i] = (fo, te, wi) + \ + para.words[i][3:] + if te: oldfont = curfont + else: oldfont = 'r' + if curfont in string.uppercase: + te = string.upper(te) + para.words[i] = (fo, te, wi) + para.words[i][3:] + del para.words[-1] + + +# Formatter back-end to draw the text in a window. +# This has an option to draw while the paragraphs are being added, +# to minimize the delay before the user sees anything. +# This manages the entire "document" of the window. 
+class StdwinBackEnd(SavingBackEnd): + # + def __init__(self, window, drawnow): + self.window = window + self.drawnow = drawnow + self.width = window.getwinsize()[0] + self.selection = None + self.height = 0 + window.setorigin(0, 0) + window.setdocsize(0, 0) + self.d = window.begindrawing() + SavingBackEnd.__init__(self) + # + def finish(self): + self.d.close() + self.d = None + self.window.setdocsize(0, self.height) + # + def addpara(self, p): + self.paralist.append(p) + if self.drawnow: + self.height = \ + p.render(self.d, 0, self.height, self.width) + else: + p.layout(self.width) + p.left = 0 + p.top = self.height + p.right = self.width + p.bottom = self.height + p.height + self.height = p.bottom + # + def resize(self): + self.window.change((0, 0), (self.width, self.height)) + self.width = self.window.getwinsize()[0] + self.height = 0 + for p in self.paralist: + p.layout(self.width) + p.left = 0 + p.top = self.height + p.right = self.width + p.bottom = self.height + p.height + self.height = p.bottom + self.window.change((0, 0), (self.width, self.height)) + self.window.setdocsize(0, self.height) + # + def redraw(self, area): + d = self.window.begindrawing() + (left, top), (right, bottom) = area + d.erase(area) + d.cliprect(area) + for p in self.paralist: + if top < p.bottom and p.top < bottom: + v = p.render(d, p.left, p.top, p.right) + if self.selection: + self.invert(d, self.selection) + d.close() + # + def setselection(self, new): + if new: + long1, long2 = new + pos1 = long1[:3] + pos2 = long2[:3] + new = pos1, pos2 + if new <> self.selection: + d = self.window.begindrawing() + if self.selection: + self.invert(d, self.selection) + if new: + self.invert(d, new) + d.close() + self.selection = new + # + def getselection(self): + return self.selection + # + def extractselection(self): + if self.selection: + a, b = self.selection + return self.extractpart(a, b) + else: + return None + # + def invert(self, d, region): + long1, long2 = region + if long1 > long2: 
long1, long2 = long2, long1 + para1, pos1 = long1 + para2, pos2 = long2 + while para1 < para2: + self.paralist[para1].invert(d, pos1, None) + pos1 = None + para1 = para1 + 1 + self.paralist[para2].invert(d, pos1, pos2) + # + def search(self, prog): + import regex, string + if type(prog) == type(''): + prog = regex.compile(string.lower(prog)) + if self.selection: + iold = self.selection[0][0] + else: + iold = -1 + hit = None + for i in range(len(self.paralist)): + if i == iold or i < iold and hit: + continue + p = self.paralist[i] + text = string.lower(p.extract()) + if prog.search(text) >= 0: + a, b = prog.regs[0] + long1 = i, a + long2 = i, b + hit = long1, long2 + if i > iold: + break + if hit: + self.setselection(hit) + i = hit[0][0] + p = self.paralist[i] + self.window.show((p.left, p.top), (p.right, p.bottom)) + return 1 + else: + return 0 + # + def showanchor(self, id): + for i in range(len(self.paralist)): + p = self.paralist[i] + if p.hasanchor(id): + long1 = i, 0 + long2 = i, len(p.extract()) + hit = long1, long2 + self.setselection(hit) + self.window.show( \ + (p.left, p.top), (p.right, p.bottom)) + break + + +# GL extensions + +class GLFontCache: + # + def __init__(self): + self.reset() + self.setfont('') + # + def reset(self): + self.fontkey = None + self.fonthandle = None + self.fontinfo = None + self.fontcache = {} + # + def close(self): + self.reset() + # + def setfont(self, fontkey): + if fontkey == '': + fontkey = 'Times-Roman 12' + elif ' ' not in fontkey: + fontkey = fontkey + ' 12' + if fontkey == self.fontkey: + return + if self.fontcache.has_key(fontkey): + handle = self.fontcache[fontkey] + else: + import string + i = string.index(fontkey, ' ') + name, sizestr = fontkey[:i], fontkey[i:] + size = eval(sizestr) + key1 = name + ' 1' + key = name + ' ' + `size` + # NB key may differ from fontkey! 
+ if self.fontcache.has_key(key): + handle = self.fontcache[key] + else: + if self.fontcache.has_key(key1): + handle = self.fontcache[key1] + else: + import fm + handle = fm.findfont(name) + self.fontcache[key1] = handle + handle = handle.scalefont(size) + self.fontcache[fontkey] = \ + self.fontcache[key] = handle + self.fontkey = fontkey + if self.fonthandle <> handle: + self.fonthandle = handle + self.fontinfo = handle.getfontinfo() + handle.setfont() + + +class GLMeasurer(GLFontCache): + # + def textwidth(self, text): + return self.fonthandle.getstrwidth(text) + # + def baseline(self): + return self.fontinfo[6] - self.fontinfo[3] + # + def lineheight(self): + return self.fontinfo[6] + + +class GLWriter(GLFontCache): + # + # NOTES: + # (1) Use gl.ortho2 to use X pixel coordinates! + # + def text(self, (h, v), text): + import gl, fm + gl.cmov2i(h, v + self.fontinfo[6] - self.fontinfo[3]) + fm.prstr(text) + # + def setfont(self, fontkey): + oldhandle = self.fonthandle + GLFontCache.setfont(fontkey) + if self.fonthandle <> oldhandle: + handle.setfont() + + +class GLMeasurerWriter(GLMeasurer, GLWriter): + pass + + +class GLBackEnd(SavingBackEnd): + # + def __init__(self, wid): + import gl + gl.winset(wid) + self.wid = wid + self.width = gl.getsize()[1] + self.height = 0 + self.d = GLMeasurerWriter() + SavingBackEnd.__init__(self) + # + def finish(self): + pass + # + def addpara(self, p): + self.paralist.append(p) + self.height = p.render(self.d, 0, self.height, self.width) + # + def redraw(self): + import gl + gl.winset(self.wid) + width = gl.getsize()[1] + if width <> self.width: + setdocsize = 1 + self.width = width + for p in self.paralist: + p.top = p.bottom = None + d = self.d + v = 0 + for p in self.paralist: + v = p.render(d, 0, v, width) diff --git a/Lib/htmllib.py b/Lib/htmllib.py new file mode 100644 index 0000000..8b3e62b --- /dev/null +++ b/Lib/htmllib.py @@ -0,0 +1,635 @@ +# A parser for HTML documents + + +# HTML: HyperText Markup Language; an SGML-like 
syntax used by WWW to +# describe hypertext documents +# +# SGML: Standard Generalized Markup Language +# +# WWW: World-Wide Web; a distributed hypertext system developed at CERN +# +# CERN: European Particle Physics Laboratory in Geneva, Switzerland + + +# This file is only concerned with parsing and formatting HTML +# documents, not with the other (hypertext and networking) aspects of +# the WWW project. (It does support highlighting of anchors.) + + +import os +import sys +import regex +import string +import sgmllib + + +class HTMLParser(sgmllib.SGMLParser): + + # Copy base class entities and add some + entitydefs = {} + for key in sgmllib.SGMLParser.entitydefs.keys(): + entitydefs[key] = sgmllib.SGMLParser.entitydefs[key] + entitydefs['bullet'] = '*' + + # Provided -- handlers for tags introducing literal text + + def start_listing(self, attrs): + self.setliteral('listing') + self.literal_bgn('listing', attrs) + + def end_listing(self): + self.literal_end('listing') + + def start_xmp(self, attrs): + self.setliteral('xmp') + self.literal_bgn('xmp', attrs) + + def end_xmp(self): + self.literal_end('xmp') + + def do_plaintext(self, attrs): + self.setnomoretags() + self.literal_bgn('plaintext', attrs) + + # To be overridden -- begin/end literal mode + def literal_bgn(self, tag, attrs): pass + def literal_end(self, tag): pass + + +# Next level of sophistication -- collect anchors, title, nextid and isindex +class CollectingParser(HTMLParser): + # + def __init__(self): + HTMLParser.__init__(self) + self.savetext = None + self.nextid = '' + self.isindex = 0 + self.title = '' + self.inanchor = 0 + self.anchors = [] + self.anchornames = [] + self.anchortypes = [] + # + def start_a(self, attrs): + self.inanchor = 0 + href = '' + name = '' + type = '' + for attrname, value in attrs: + if attrname == 'href': + href = value + if attrname == 'name=': + name = value + if attrname == 'type=': + type = string.lower(value) + if not (href or name): + return + 
self.anchors.append(href) + self.anchornames.append(name) + self.anchortypes.append(type) + self.inanchor = len(self.anchors) + if not href: + self.inanchor = -self.inanchor + # + def end_a(self): + if self.inanchor > 0: + # Don't show anchors pointing into the current document + if self.anchors[self.inanchor-1][:1] <> '#': + self.handle_data('[' + `self.inanchor` + ']') + self.inanchor = 0 + # + def start_header(self, attrs): pass + def end_header(self): pass + # + # (head is the same as header) + def start_head(self, attrs): pass + def end_head(self): pass + # + def start_body(self, attrs): pass + def end_body(self): pass + # + def do_nextid(self, attrs): + self.nextid = attrs + # + def do_isindex(self, attrs): + self.isindex = 1 + # + def start_title(self, attrs): + self.savetext = '' + # + def end_title(self): + if self.savetext <> None: + self.title = self.savetext + self.savetext = None + # + def handle_data(self, text): + if self.savetext is not None: + self.savetext = self.savetext + text + + +# Formatting parser -- takes a formatter and a style sheet as arguments + +# XXX The use of style sheets should change: for each tag and end tag +# there should be a style definition, and a style definition should +# encompass many more parameters: font, justification, indentation, +# vspace before, vspace after, hanging tag... 
+ +wordprog = regex.compile('[^ \t\n]*') +spaceprog = regex.compile('[ \t\n]*') + +class FormattingParser(CollectingParser): + + def __init__(self, formatter, stylesheet): + CollectingParser.__init__(self) + self.fmt = formatter + self.stl = stylesheet + self.savetext = None + self.compact = 0 + self.nofill = 0 + self.resetfont() + self.setindent(self.stl.stdindent) + + def resetfont(self): + self.fontstack = [] + self.stylestack = [] + self.fontset = self.stl.stdfontset + self.style = ROMAN + self.passfont() + + def passfont(self): + font = self.fontset[self.style] + self.fmt.setfont(font) + + def pushstyle(self, style): + self.stylestack.append(self.style) + self.style = min(style, len(self.fontset)-1) + self.passfont() + + def popstyle(self): + self.style = self.stylestack[-1] + del self.stylestack[-1] + self.passfont() + + def pushfontset(self, fontset, style): + self.fontstack.append(self.fontset) + self.fontset = fontset + self.pushstyle(style) + + def popfontset(self): + self.fontset = self.fontstack[-1] + del self.fontstack[-1] + self.popstyle() + + def flush(self): + self.fmt.flush() + + def setindent(self, n): + self.fmt.setleftindent(n) + + def needvspace(self, n): + self.fmt.needvspace(n) + + def close(self): + HTMLParser.close(self) + self.fmt.flush() + + def handle_literal(self, text): + lines = string.splitfields(text, '\n') + for i in range(1, len(lines)): + lines[i] = string.expandtabs(lines[i], 8) + for line in lines[:-1]: + self.fmt.addword(line, 0) + self.fmt.flush() + self.fmt.nospace = 0 + for line in lines[-1:]: + self.fmt.addword(line, 0) + + def handle_data(self, text): + if self.savetext is not None: + self.savetext = self.savetext + text + return + if self.literal: + self.handle_literal(text) + return + i = 0 + n = len(text) + while i < n: + j = i + wordprog.match(text, i) + word = text[i:j] + i = j + spaceprog.match(text, j) + self.fmt.addword(word, i-j) + if self.nofill and '\n' in text[j:i]: + self.fmt.flush() + self.fmt.nospace = 0 + 
i = j+1 + while text[i-1] <> '\n': i = i+1 + + def literal_bgn(self, tag, attrs): + if tag == 'plaintext': + self.flush() + else: + self.needvspace(1) + self.pushfontset(self.stl.stdfontset, FIXED) + self.setindent(self.stl.literalindent) + + def literal_end(self, tag): + self.needvspace(1) + self.popfontset() + self.setindent(self.stl.stdindent) + + def start_title(self, attrs): + self.flush() + self.savetext = '' + # NB end_title is unchanged + + def do_p(self, attrs): + if self.compact: + self.flush() + else: + self.needvspace(1) + + def start_h1(self, attrs): + self.needvspace(2) + self.setindent(self.stl.h1indent) + self.pushfontset(self.stl.h1fontset, BOLD) + self.fmt.setjust('c') + + def end_h1(self): + self.popfontset() + self.needvspace(2) + self.setindent(self.stl.stdindent) + self.fmt.setjust('l') + + def start_h2(self, attrs): + self.needvspace(1) + self.setindent(self.stl.h2indent) + self.pushfontset(self.stl.h2fontset, BOLD) + + def end_h2(self): + self.popfontset() + self.needvspace(1) + self.setindent(self.stl.stdindent) + + def start_h3(self, attrs): + self.needvspace(1) + self.setindent(self.stl.stdindent) + self.pushfontset(self.stl.h3fontset, BOLD) + + def end_h3(self): + self.popfontset() + self.needvspace(1) + self.setindent(self.stl.stdindent) + + def start_h4(self, attrs): + self.needvspace(1) + self.setindent(self.stl.stdindent) + self.pushfontset(self.stl.stdfontset, BOLD) + + def end_h4(self): + self.popfontset() + self.needvspace(1) + self.setindent(self.stl.stdindent) + + start_h5 = start_h4 + end_h5 = end_h4 + + start_h6 = start_h5 + end_h6 = end_h5 + + start_h7 = start_h6 + end_h7 = end_h6 + + def start_ul(self, attrs): + self.needvspace(1) + for attrname, value in attrs: + if attrname == 'compact': + self.compact = 1 + self.setindent(0) + break + else: + self.setindent(self.stl.ulindent) + + start_dir = start_menu = start_ol = start_ul + + do_li = do_p + + def end_ul(self): + self.compact = 0 + self.needvspace(1) + 
self.setindent(self.stl.stdindent) + + end_dir = end_menu = end_ol = end_ul + + def start_dl(self, attrs): + for attrname, value in attrs: + if attrname == 'compact': + self.compact = 1 + self.needvspace(1) + + def end_dl(self): + self.compact = 0 + self.needvspace(1) + self.setindent(self.stl.stdindent) + + def do_dt(self, attrs): + if self.compact: + self.flush() + else: + self.needvspace(1) + self.setindent(self.stl.stdindent) + + def do_dd(self, attrs): + self.fmt.addword('', 1) + self.setindent(self.stl.ddindent) + + def start_address(self, attrs): + self.compact = 1 + self.needvspace(1) + self.fmt.setjust('r') + + def end_address(self): + self.compact = 0 + self.needvspace(1) + self.setindent(self.stl.stdindent) + self.fmt.setjust('l') + + def start_pre(self, attrs): + self.needvspace(1) + self.nofill = self.nofill + 1 + self.pushstyle(FIXED) + + def end_pre(self): + self.popstyle() + self.nofill = self.nofill - 1 + self.needvspace(1) + + start_typewriter = start_pre + end_typewriter = end_pre + + def do_img(self, attrs): + self.fmt.addword('(image)', 0) + + # Physical styles + + def start_tt(self, attrs): self.pushstyle(FIXED) + def end_tt(self): self.popstyle() + + def start_b(self, attrs): self.pushstyle(BOLD) + def end_b(self): self.popstyle() + + def start_i(self, attrs): self.pushstyle(ITALIC) + def end_i(self): self.popstyle() + + def start_u(self, attrs): self.pushstyle(ITALIC) # Underline??? 
+ def end_u(self): self.popstyle() + + def start_r(self, attrs): self.pushstyle(ROMAN) # Not official + def end_r(self): self.popstyle() + + # Logical styles + + start_em = start_i + end_em = end_i + + start_strong = start_b + end_strong = end_b + + start_code = start_tt + end_code = end_tt + + start_samp = start_tt + end_samp = end_tt + + start_kbd = start_tt + end_kbd = end_tt + + start_file = start_tt # unofficial + end_file = end_tt + + start_var = start_i + end_var = end_i + + start_dfn = start_i + end_dfn = end_i + + start_cite = start_i + end_cite = end_i + + start_hp1 = start_i + end_hp1 = start_i + + start_hp2 = start_b + end_hp2 = end_b + + def unknown_starttag(self, tag, attrs): + print '*** unknown <' + tag + '>' + + def unknown_endtag(self, tag): + print '*** unknown </' + tag + '>' + + +# An extension of the formatting parser which formats anchors differently. +class AnchoringParser(FormattingParser): + + def start_a(self, attrs): + FormattingParser.start_a(self, attrs) + if self.inanchor: + self.fmt.bgn_anchor(self.inanchor) + + def end_a(self): + if self.inanchor: + self.fmt.end_anchor(self.inanchor) + self.inanchor = 0 + + +# Style sheet -- this is never instantiated, but the attributes +# of the class object itself are used to specify fonts to be used +# for various paragraph styles. +# A font set is a non-empty list of fonts, in the order: +# [roman, italic, bold, fixed]. 
+# When a style is not available the nearest lower style is used + +ROMAN = 0 +ITALIC = 1 +BOLD = 2 +FIXED = 3 + +class NullStylesheet: + # Fonts -- none + stdfontset = [None] + h1fontset = [None] + h2fontset = [None] + h3fontset = [None] + # Indents + stdindent = 2 + ddindent = 25 + ulindent = 4 + h1indent = 0 + h2indent = 0 + literalindent = 0 + + +class X11Stylesheet(NullStylesheet): + stdfontset = [ \ + '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*', \ + '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*', \ + '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*', \ + '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*', \ + ] + h1fontset = [ \ + '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*', \ + '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*', \ + '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*', \ + ] + h2fontset = [ \ + '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*', \ + '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*', \ + '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*', \ + ] + h3fontset = [ \ + '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*', \ + '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*', \ + '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*', \ + ] + ddindent = 40 + + +class MacStylesheet(NullStylesheet): + stdfontset = [ \ + ('Geneva', 'p', 10), \ + ('Geneva', 'i', 10), \ + ('Geneva', 'b', 10), \ + ('Monaco', 'p', 10), \ + ] + h1fontset = [ \ + ('Geneva', 'p', 18), \ + ('Geneva', 'i', 18), \ + ('Geneva', 'b', 18), \ + ('Monaco', 'p', 18), \ + ] + h3fontset = [ \ + ('Geneva', 'p', 14), \ + ('Geneva', 'i', 14), \ + ('Geneva', 'b', 14), \ + ('Monaco', 'p', 14), \ + ] + h3fontset = [ \ + ('Geneva', 'p', 12), \ + ('Geneva', 'i', 12), \ + ('Geneva', 'b', 12), \ + ('Monaco', 'p', 12), \ + ] + + +if os.name == 'mac': + StdwinStylesheet = MacStylesheet +else: + StdwinStylesheet = X11Stylesheet + + +class GLStylesheet(NullStylesheet): + stdfontset = [ \ + 'Helvetica 10', \ + 'Helvetica-Italic 10', \ + 'Helvetica-Bold 
10', \ + 'Courier 10', \ + ] + h1fontset = [ \ + 'Helvetica 18', \ + 'Helvetica-Italic 18', \ + 'Helvetica-Bold 18', \ + 'Courier 18', \ + ] + h2fontset = [ \ + 'Helvetica 14', \ + 'Helvetica-Italic 14', \ + 'Helvetica-Bold 14', \ + 'Courier 14', \ + ] + h3fontset = [ \ + 'Helvetica 12', \ + 'Helvetica-Italic 12', \ + 'Helvetica-Bold 12', \ + 'Courier 12', \ + ] + + +# Test program -- produces no output but times how long it takes +# to send a document to a null formatter, exclusive of I/O + +def test(): + import fmt + import time + if sys.argv[1:]: file = sys.argv[1] + else: file = 'test.html' + data = open(file, 'r').read() + t0 = time.time() + fmtr = fmt.WritingFormatter(sys.stdout, 79) + p = FormattingParser(fmtr, NullStylesheet) + p.feed(data) + p.close() + t1 = time.time() + print + print '*** Formatting time:', round(t1-t0, 3), 'seconds.' + + +# Test program using stdwin + +def testStdwin(): + import stdwin, fmt + from stdwinevents import * + if sys.argv[1:]: file = sys.argv[1] + else: file = 'test.html' + data = open(file, 'r').read() + window = stdwin.open('testStdwin') + b = None + while 1: + etype, ewin, edetail = stdwin.getevent() + if etype == WE_CLOSE: + break + if etype == WE_SIZE: + window.setdocsize(0, 0) + window.setorigin(0, 0) + window.change((0, 0), (10000, 30000)) # XXX + if etype == WE_DRAW: + if not b: + b = fmt.StdwinBackEnd(window, 1) + f = fmt.BaseFormatter(b.d, b) + p = FormattingParser(f, \ + MacStylesheet) + p.feed(data) + p.close() + b.finish() + else: + b.redraw(edetail) + window.close() + + +# Test program using GL + +def testGL(): + import gl, GL, fmt + if sys.argv[1:]: file = sys.argv[1] + else: file = 'test.html' + data = open(file, 'r').read() + W, H = 600, 600 + gl.foreground() + gl.prefsize(W, H) + wid = gl.winopen('testGL') + gl.ortho2(0, W, H, 0) + gl.color(GL.WHITE) + gl.clear() + gl.color(GL.BLACK) + b = fmt.GLBackEnd(wid) + f = fmt.BaseFormatter(b.d, b) + p = FormattingParser(f, GLStylesheet) + p.feed(data) + p.close() + 
b.finish() + # + import time + time.sleep(5) + + +if __name__ == '__main__': + test() diff --git a/Lib/lib-old/Para.py b/Lib/lib-old/Para.py new file mode 100644 index 0000000..6a7057d --- /dev/null +++ b/Lib/lib-old/Para.py @@ -0,0 +1,408 @@ +# Text formatting abstractions + + +# Oft-used type object +Int = type(0) + + +# Represent a paragraph. This is a list of words with associated +# font and size information, plus indents and justification for the +# entire paragraph. +# Once the words have been added to a paragraph, it can be laid out +# for different line widths. Once laid out, it can be rendered at +# different screen locations. Once rendered, it can be queried +# for mouse hits, and parts of the text can be highlighted +class Para: + # + def __init__(self): + self.words = [] # The words + self.just = 'l' # Justification: 'l', 'r', 'lr' or 'c' + self.indent_left = self.indent_right = self.indent_hang = 0 + # Final lay-out parameters, may change + self.left = self.top = self.right = self.bottom = \ + self.width = self.height = self.lines = None + # + # Add a word, computing size information for it. + # Words may also be added manually by appending to self.words + # Each word should be a 7-tuple: + # (font, text, width, space, stretch, ascent, descent) + def addword(self, d, font, text, space, stretch): + if font <> None: + d.setfont(font) + width = d.textwidth(text) + ascent = d.baseline() + descent = d.lineheight() - ascent + spw = d.textwidth(' ') + space = space * spw + stretch = stretch * spw + tuple = (font, text, width, space, stretch, ascent, descent) + self.words.append(tuple) + # + # Hooks to begin and end anchors -- insert numbers in the word list! 
+ def bgn_anchor(self, id): + self.words.append(id) + # + def end_anchor(self, id): + self.words.append(0) + # + # Return the total length (width) of the text added so far, in pixels + def getlength(self): + total = 0 + for word in self.words: + if type(word) <> Int: + total = total + word[2] + word[3] + return total + # + # Tab to a given position (relative to the current left indent): + # remove all stretch, add fixed space up to the new indent. + # If the current position is already beying the tab stop, + # don't add any new space (but still remove the stretch) + def tabto(self, tab): + total = 0 + as, de = 1, 0 + for i in range(len(self.words)): + word = self.words[i] + if type(word) == Int: continue + fo, te, wi, sp, st, as, de = word + self.words[i] = fo, te, wi, sp, 0, as, de + total = total + wi + sp + if total < tab: + self.words.append(None, '', 0, tab-total, 0, as, de) + # + # Make a hanging tag: tab to hang, increment indent_left by hang, + # and reset indent_hang to -hang + def makehangingtag(self, hang): + self.tabto(hang) + self.indent_left = self.indent_left + hang + self.indent_hang = -hang + # + # Decide where the line breaks will be given some screen width + def layout(self, linewidth): + self.width = linewidth + height = 0 + self.lines = lines = [] + avail1 = self.width - self.indent_left - self.indent_right + avail = avail1 - self.indent_hang + words = self.words + i = 0 + n = len(words) + lastfont = None + while i < n: + firstfont = lastfont + charcount = 0 + width = 0 + stretch = 0 + ascent = 0 + descent = 0 + lsp = 0 + j = i + while i < n: + word = words[i] + if type(word) == Int: + if word > 0 and width >= avail: + break + i = i+1 + continue + fo, te, wi, sp, st, as, de = word + if width + wi > avail and width > 0 and wi > 0: + break + if fo <> None: + lastfont = fo + if width == 0: + firstfont = fo + charcount = charcount + len(te) + (sp > 0) + width = width + wi + sp + lsp = sp + stretch = stretch + st + lst = st + ascent = max(ascent, 
as) + descent = max(descent, de) + i = i+1 + while i > j and type(words[i-1]) == Int and \ + words[i-1] > 0: i = i-1 + width = width - lsp + if i < n: + stretch = stretch - lst + else: + stretch = 0 + tuple = i-j, firstfont, charcount, width, stretch, \ + ascent, descent + lines.append(tuple) + height = height + ascent + descent + avail = avail1 + self.height = height + # + # Call a function for all words in a line + def visit(self, wordfunc, anchorfunc): + avail1 = self.width - self.indent_left - self.indent_right + avail = avail1 - self.indent_hang + v = self.top + i = 0 + for tuple in self.lines: + wordcount, firstfont, charcount, width, stretch, \ + ascent, descent = tuple + h = self.left + self.indent_left + if i == 0: h = h + self.indent_hang + extra = 0 + if self.just == 'r': h = h + avail - width + elif self.just == 'c': h = h + (avail - width) / 2 + elif self.just == 'lr' and stretch > 0: + extra = avail - width + v2 = v + ascent + descent + for j in range(i, i+wordcount): + word = self.words[j] + if type(word) == Int: + ok = anchorfunc(self, tuple, word, \ + h, v) + if ok <> None: return ok + continue + fo, te, wi, sp, st, as, de = word + if extra > 0 and stretch > 0: + ex = extra * st / stretch + extra = extra - ex + stretch = stretch - st + else: + ex = 0 + h2 = h + wi + sp + ex + ok = wordfunc(self, tuple, word, h, v, \ + h2, v2, (j==i), (j==i+wordcount-1)) + if ok <> None: return ok + h = h2 + v = v2 + i = i + wordcount + avail = avail1 + # + # Render a paragraph in "drawing object" d, using the rectangle + # given by (left, top, right) with an unspecified bottom. + # Return the computed bottom of the text. 
+ def render(self, d, left, top, right): + if self.width <> right-left: + self.layout(right-left) + self.left = left + self.top = top + self.right = right + self.bottom = self.top + self.height + self.anchorid = 0 + try: + self.d = d + self.visit(self.__class__._renderword, \ + self.__class__._renderanchor) + finally: + self.d = None + return self.bottom + # + def _renderword(self, tuple, word, h, v, h2, v2, isfirst, islast): + if word[0] <> None: self.d.setfont(word[0]) + baseline = v + tuple[5] + self.d.text((h, baseline - word[5]), word[1]) + if self.anchorid > 0: + self.d.line((h, baseline+2), (h2, baseline+2)) + # + def _renderanchor(self, tuple, word, h, v): + self.anchorid = word + # + # Return which anchor(s) was hit by the mouse + def hitcheck(self, mouseh, mousev): + self.mouseh = mouseh + self.mousev = mousev + self.anchorid = 0 + self.hits = [] + self.visit(self.__class__._hitcheckword, \ + self.__class__._hitcheckanchor) + return self.hits + # + def _hitcheckword(self, tuple, word, h, v, h2, v2, isfirst, islast): + if self.anchorid > 0 and h <= self.mouseh <= h2 and \ + v <= self.mousev <= v2: + self.hits.append(self.anchorid) + # + def _hitcheckanchor(self, tuple, word, h, v): + self.anchorid = word + # + # Return whether the given anchor id is present + def hasanchor(self, id): + return id in self.words or -id in self.words + # + # Extract the raw text from the word list, substituting one space + # for non-empty inter-word space, and terminating with '\n' + def extract(self): + text = '' + for w in self.words: + if type(w) <> Int: + word = w[1] + if w[3]: word = word + ' ' + text = text + word + return text + '\n' + # + # Return which character position was hit by the mouse, as + # an offset in the entire text as returned by extract(). 
+ # Return None if the mouse was not in this paragraph + def whereis(self, d, mouseh, mousev): + if mousev < self.top or mousev > self.bottom: + return None + self.mouseh = mouseh + self.mousev = mousev + self.lastfont = None + self.charcount = 0 + try: + self.d = d + return self.visit(self.__class__._whereisword, \ + self.__class__._whereisanchor) + finally: + self.d = None + # + def _whereisword(self, tuple, word, h1, v1, h2, v2, isfirst, islast): + fo, te, wi, sp, st, as, de = word + if fo <> None: self.lastfont = fo + h = h1 + if isfirst: h1 = 0 + if islast: h2 = 999999 + if not (v1 <= self.mousev <= v2 and h1 <= self.mouseh <= h2): + self.charcount = self.charcount + len(te) + (sp > 0) + return + if self.lastfont <> None: + self.d.setfont(self.lastfont) + cc = 0 + for c in te: + cw = self.d.textwidth(c) + if self.mouseh <= h + cw/2: + return self.charcount + cc + cc = cc+1 + h = h+cw + self.charcount = self.charcount + cc + if self.mouseh <= (h+h2) / 2: + return self.charcount + else: + return self.charcount + 1 + # + def _whereisanchor(self, tuple, word, h, v): + pass + # + # Return screen position corresponding to position in paragraph. + # Return tuple (h, vtop, vbaseline, vbottom). 
+ # This is more or less the inverse of whereis() + def screenpos(self, d, pos): + if pos < 0: + ascent, descent = self.lines[0][5:7] + return self.left, self.top, self.top + ascent, \ + self.top + ascent + descent + self.pos = pos + self.lastfont = None + try: + self.d = d + ok = self.visit(self.__class__._screenposword, \ + self.__class__._screenposanchor) + finally: + self.d = None + if ok == None: + ascent, descent = self.lines[-1][5:7] + ok = self.right, self.bottom - ascent - descent, \ + self.bottom - descent, self.bottom + return ok + # + def _screenposword(self, tuple, word, h1, v1, h2, v2, isfirst, islast): + fo, te, wi, sp, st, as, de = word + if fo <> None: self.lastfont = fo + cc = len(te) + (sp > 0) + if self.pos > cc: + self.pos = self.pos - cc + return + if self.pos < cc: + self.d.setfont(self.lastfont) + h = h1 + self.d.textwidth(te[:self.pos]) + else: + h = h2 + ascent, descent = tuple[5:7] + return h, v1, v1+ascent, v2 + # + def _screenposanchor(self, tuple, word, h, v): + pass + # + # Invert the stretch of text between pos1 and pos2. + # If pos1 is None, the beginning is implied; + # if pos2 is None, the end is implied. + # Undoes its own effect when called again with the same arguments + def invert(self, d, pos1, pos2): + if pos1 == None: + pos1 = self.left, self.top, self.top, self.top + else: + pos1 = self.screenpos(d, pos1) + if pos2 == None: + pos2 = self.right, self.bottom,self.bottom,self.bottom + else: + pos2 = self.screenpos(d, pos2) + h1, top1, baseline1, bottom1 = pos1 + h2, top2, baseline2, bottom2 = pos2 + if bottom1 <= top2: + d.invert((h1, top1), (self.right, bottom1)) + h1 = self.left + if bottom1 < top2: + d.invert((h1, bottom1), (self.right, top2)) + top1, bottom1 = top2, bottom2 + d.invert((h1, top1), (h2, bottom2)) + + +# Test class Para +# XXX This was last used on the Mac, hence the weird fonts... 
+def test(): + import stdwin + from stdwinevents import * + words = 'The', 'quick', 'brown', 'fox', 'jumps', 'over', \ + 'the', 'lazy', 'dog.' + paralist = [] + for just in 'l', 'r', 'lr', 'c': + p = Para() + p.just = just + p.addword(stdwin, ('New York', 'p', 12), words[0], 1, 1) + for word in words[1:-1]: + p.addword(stdwin, None, word, 1, 1) + p.addword(stdwin, None, words[-1], 2, 4) + p.addword(stdwin, ('New York', 'b', 18), 'Bye!', 0, 0) + p.addword(stdwin, ('New York', 'p', 10), 'Bye!', 0, 0) + paralist.append(p) + window = stdwin.open('Para.test()') + start = stop = selpara = None + while 1: + etype, win, detail = stdwin.getevent() + if etype == WE_CLOSE: + break + if etype == WE_SIZE: + window.change((0, 0), (1000, 1000)) + if etype == WE_DRAW: + width, height = window.getwinsize() + d = None + try: + d = window.begindrawing() + d.cliprect(detail) + d.erase(detail) + v = 0 + for p in paralist: + v = p.render(d, 0, v, width) + if p == selpara and \ + start <> None and stop <> None: + p.invert(d, start, stop) + finally: + if d: d.close() + if etype == WE_MOUSE_DOWN: + if selpara and start <> None and stop <> None: + d = window.begindrawing() + selpara.invert(d, start, stop) + d.close() + start = stop = selpara = None + mouseh, mousev = detail[0] + for p in paralist: + start = p.whereis(stdwin, mouseh, mousev) + if start <> None: + selpara = p + break + if etype == WE_MOUSE_UP and start <> None and selpara: + mouseh, mousev = detail[0] + stop = selpara.whereis(stdwin, mouseh, mousev) + if stop == None: start = selpara = None + else: + if start > stop: + start, stop = stop, start + d = window.begindrawing() + selpara.invert(d, start, stop) + d.close() + window.close() diff --git a/Lib/lib-old/fmt.py b/Lib/lib-old/fmt.py new file mode 100644 index 0000000..c096306 --- /dev/null +++ b/Lib/lib-old/fmt.py @@ -0,0 +1,621 @@ +# Text formatting abstractions + + +import string +import Para + + +# A formatter back-end object has one method that is called by the 
formatter: +# addpara(p), where p is a paragraph object. For example: + + +# Formatter back-end to do nothing at all with the paragraphs +class NullBackEnd: + # + def __init__(self): + pass + # + def addpara(self, p): + pass + # + def bgn_anchor(self, id): + pass + # + def end_anchor(self, id): + pass + + +# Formatter back-end to collect the paragraphs in a list +class SavingBackEnd(NullBackEnd): + # + def __init__(self): + self.paralist = [] + # + def addpara(self, p): + self.paralist.append(p) + # + def hitcheck(self, h, v): + hits = [] + for p in self.paralist: + if p.top <= v <= p.bottom: + for id in p.hitcheck(h, v): + if id not in hits: + hits.append(id) + return hits + # + def extract(self): + text = '' + for p in self.paralist: + text = text + (p.extract()) + return text + # + def extractpart(self, long1, long2): + if long1 > long2: long1, long2 = long2, long1 + para1, pos1 = long1 + para2, pos2 = long2 + text = '' + while para1 < para2: + ptext = self.paralist[para1].extract() + text = text + ptext[pos1:] + pos1 = 0 + para1 = para1 + 1 + ptext = self.paralist[para2].extract() + return text + ptext[pos1:pos2] + # + def whereis(self, d, h, v): + total = 0 + for i in range(len(self.paralist)): + p = self.paralist[i] + result = p.whereis(d, h, v) + if result <> None: + return i, result + return None + # + def roundtowords(self, long1, long2): + i, offset = long1 + text = self.paralist[i].extract() + while offset > 0 and text[offset-1] <> ' ': offset = offset-1 + long1 = i, offset + # + i, offset = long2 + text = self.paralist[i].extract() + n = len(text) + while offset < n-1 and text[offset] <> ' ': offset = offset+1 + long2 = i, offset + # + return long1, long2 + # + def roundtoparagraphs(self, long1, long2): + long1 = long1[0], 0 + long2 = long2[0], len(self.paralist[long2[0]].extract()) + return long1, long2 + + +# Formatter back-end to send the text directly to the drawing object +class WritingBackEnd(NullBackEnd): + # + def __init__(self, d, width): + 
self.d = d + self.width = width + self.lineno = 0 + # + def addpara(self, p): + self.lineno = p.render(self.d, 0, self.lineno, self.width) + + +# A formatter receives a stream of formatting instructions and assembles +# these into a stream of paragraphs on to a back-end. The assembly is +# parametrized by a text measurement object, which must match the output +# operations of the back-end. The back-end is responsible for splitting +# paragraphs up in lines of a given maximum width. (This is done because +# in a windowing environment, when the window size changes, there is no +# need to redo the assembly into paragraphs, but the splitting into lines +# must be done taking the new window size into account.) + + +# Formatter base class. Initialize it with a text measurement object, +# which is used for text measurements, and a back-end object, +# which receives the completed paragraphs. The formatting methods are: +# setfont(font) +# setleftindent(nspaces) +# setjust(type) where type is 'l', 'c', 'r', or 'lr' +# flush() +# vspace(nlines) +# needvspace(nlines) +# addword(word, nspaces) +class BaseFormatter: + # + def __init__(self, d, b): + # Drawing object used for text measurements + self.d = d + # + # BackEnd object receiving completed paragraphs + self.b = b + # + # Parameters of the formatting model + self.leftindent = 0 + self.just = 'l' + self.font = None + self.blanklines = 0 + # + # Parameters derived from the current font + self.space = d.textwidth(' ') + self.line = d.lineheight() + self.ascent = d.baseline() + self.descent = self.line - self.ascent + # + # Parameter derived from the default font + self.n_space = self.space + # + # Current paragraph being built + self.para = None + self.nospace = 1 + # + # Font to set on the next word + self.nextfont = None + # + def newpara(self): + return Para.Para() + # + def setfont(self, font): + if font == None: return + self.font = self.nextfont = font + d = self.d + d.setfont(font) + self.space = d.textwidth(' ') + 
self.line = d.lineheight() + self.ascent = d.baseline() + self.descent = self.line - self.ascent + # + def setleftindent(self, nspaces): + self.leftindent = int(self.n_space * nspaces) + if self.para: + hang = self.leftindent - self.para.indent_left + if hang > 0 and self.para.getlength() <= hang: + self.para.makehangingtag(hang) + self.nospace = 1 + else: + self.flush() + # + def setrightindent(self, nspaces): + self.rightindent = int(self.n_space * nspaces) + if self.para: + self.para.indent_right = self.rightindent + self.flush() + # + def setjust(self, just): + self.just = just + if self.para: + self.para.just = self.just + # + def flush(self): + if self.para: + self.b.addpara(self.para) + self.para = None + if self.font <> None: + self.d.setfont(self.font) + self.nospace = 1 + # + def vspace(self, nlines): + self.flush() + if nlines > 0: + self.para = self.newpara() + tuple = None, '', 0, 0, 0, int(nlines*self.line), 0 + self.para.words.append(tuple) + self.flush() + self.blanklines = self.blanklines + nlines + # + def needvspace(self, nlines): + self.flush() # Just to be sure + if nlines > self.blanklines: + self.vspace(nlines - self.blanklines) + # + def addword(self, text, space): + if self.nospace and not text: + return + self.nospace = 0 + self.blanklines = 0 + if not self.para: + self.para = self.newpara() + self.para.indent_left = self.leftindent + self.para.just = self.just + self.nextfont = self.font + space = int(space * self.space) + self.para.words.append(self.nextfont, text, \ + self.d.textwidth(text), space, space, \ + self.ascent, self.descent) + self.nextfont = None + # + def bgn_anchor(self, id): + if not self.para: + self.nospace = 0 + self.addword('', 0) + self.para.bgn_anchor(id) + # + def end_anchor(self, id): + if not self.para: + self.nospace = 0 + self.addword('', 0) + self.para.end_anchor(id) + + +# Measuring object for measuring text as viewed on a tty +class NullMeasurer: + # + def __init__(self): + pass + # + def setfont(self, 
font): + pass + # + def textwidth(self, text): + return len(text) + # + def lineheight(self): + return 1 + # + def baseline(self): + return 0 + + +# Drawing object for writing plain ASCII text to a file +class FileWriter: + # + def __init__(self, fp): + self.fp = fp + self.lineno, self.colno = 0, 0 + # + def setfont(self, font): + pass + # + def text(self, (h, v), str): + if not str: return + if '\n' in str: + raise ValueError, 'can\'t write \\n' + while self.lineno < v: + self.fp.write('\n') + self.colno, self.lineno = 0, self.lineno + 1 + while self.lineno > v: + # XXX This should never happen... + self.fp.write('\033[A') # ANSI up arrow + self.lineno = self.lineno - 1 + if self.colno < h: + self.fp.write(' ' * (h - self.colno)) + elif self.colno > h: + self.fp.write('\b' * (self.colno - h)) + self.colno = h + self.fp.write(str) + self.colno = h + len(str) + + +# Formatting class to do nothing at all with the data +class NullFormatter(BaseFormatter): + # + def __init__(self): + d = NullMeasurer() + b = NullBackEnd() + BaseFormatter.__init__(self, d, b) + + +# Formatting class to write directly to a file +class WritingFormatter(BaseFormatter): + # + def __init__(self, fp, width): + dm = NullMeasurer() + dw = FileWriter(fp) + b = WritingBackEnd(dw, width) + BaseFormatter.__init__(self, dm, b) + self.blanklines = 1 + # + # Suppress multiple blank lines + def needvspace(self, nlines): + BaseFormatter.needvspace(self, min(1, nlines)) + + +# A "FunnyFormatter" writes ASCII text with a twist: *bold words*, +# _italic text_ and _underlined words_, and `quoted text'. +# It assumes that the fonts are 'r', 'i', 'b', 'u', 'q': (roman, +# italic, bold, underline, quote). +# Moreover, if the font is in upper case, the text is converted to +# UPPER CASE. 
+class FunnyFormatter(WritingFormatter): + # + def flush(self): + if self.para: finalize(self.para) + WritingFormatter.flush(self) + + +# Surrounds *bold words* and _italic text_ in a paragraph with +# appropriate markers, fixing the size (assuming these characters' +# width is 1). +openchar = \ + {'b':'*', 'i':'_', 'u':'_', 'q':'`', 'B':'*', 'I':'_', 'U':'_', 'Q':'`'} +closechar = \ + {'b':'*', 'i':'_', 'u':'_', 'q':'\'', 'B':'*', 'I':'_', 'U':'_', 'Q':'\''} +def finalize(para): + oldfont = curfont = 'r' + para.words.append('r', '', 0, 0, 0, 0) # temporary, deleted at end + for i in range(len(para.words)): + fo, te, wi = para.words[i][:3] + if fo <> None: curfont = fo + if curfont <> oldfont: + if closechar.has_key(oldfont): + c = closechar[oldfont] + j = i-1 + while j > 0 and para.words[j][1] == '': j = j-1 + fo1, te1, wi1 = para.words[j][:3] + te1 = te1 + c + wi1 = wi1 + len(c) + para.words[j] = (fo1, te1, wi1) + \ + para.words[j][3:] + if openchar.has_key(curfont) and te: + c = openchar[curfont] + te = c + te + wi = len(c) + wi + para.words[i] = (fo, te, wi) + \ + para.words[i][3:] + if te: oldfont = curfont + else: oldfont = 'r' + if curfont in string.uppercase: + te = string.upper(te) + para.words[i] = (fo, te, wi) + para.words[i][3:] + del para.words[-1] + + +# Formatter back-end to draw the text in a window. +# This has an option to draw while the paragraphs are being added, +# to minimize the delay before the user sees anything. +# This manages the entire "document" of the window. 
+class StdwinBackEnd(SavingBackEnd): + # + def __init__(self, window, drawnow): + self.window = window + self.drawnow = drawnow + self.width = window.getwinsize()[0] + self.selection = None + self.height = 0 + window.setorigin(0, 0) + window.setdocsize(0, 0) + self.d = window.begindrawing() + SavingBackEnd.__init__(self) + # + def finish(self): + self.d.close() + self.d = None + self.window.setdocsize(0, self.height) + # + def addpara(self, p): + self.paralist.append(p) + if self.drawnow: + self.height = \ + p.render(self.d, 0, self.height, self.width) + else: + p.layout(self.width) + p.left = 0 + p.top = self.height + p.right = self.width + p.bottom = self.height + p.height + self.height = p.bottom + # + def resize(self): + self.window.change((0, 0), (self.width, self.height)) + self.width = self.window.getwinsize()[0] + self.height = 0 + for p in self.paralist: + p.layout(self.width) + p.left = 0 + p.top = self.height + p.right = self.width + p.bottom = self.height + p.height + self.height = p.bottom + self.window.change((0, 0), (self.width, self.height)) + self.window.setdocsize(0, self.height) + # + def redraw(self, area): + d = self.window.begindrawing() + (left, top), (right, bottom) = area + d.erase(area) + d.cliprect(area) + for p in self.paralist: + if top < p.bottom and p.top < bottom: + v = p.render(d, p.left, p.top, p.right) + if self.selection: + self.invert(d, self.selection) + d.close() + # + def setselection(self, new): + if new: + long1, long2 = new + pos1 = long1[:3] + pos2 = long2[:3] + new = pos1, pos2 + if new <> self.selection: + d = self.window.begindrawing() + if self.selection: + self.invert(d, self.selection) + if new: + self.invert(d, new) + d.close() + self.selection = new + # + def getselection(self): + return self.selection + # + def extractselection(self): + if self.selection: + a, b = self.selection + return self.extractpart(a, b) + else: + return None + # + def invert(self, d, region): + long1, long2 = region + if long1 > long2: 
long1, long2 = long2, long1 + para1, pos1 = long1 + para2, pos2 = long2 + while para1 < para2: + self.paralist[para1].invert(d, pos1, None) + pos1 = None + para1 = para1 + 1 + self.paralist[para2].invert(d, pos1, pos2) + # + def search(self, prog): + import regex, string + if type(prog) == type(''): + prog = regex.compile(string.lower(prog)) + if self.selection: + iold = self.selection[0][0] + else: + iold = -1 + hit = None + for i in range(len(self.paralist)): + if i == iold or i < iold and hit: + continue + p = self.paralist[i] + text = string.lower(p.extract()) + if prog.search(text) >= 0: + a, b = prog.regs[0] + long1 = i, a + long2 = i, b + hit = long1, long2 + if i > iold: + break + if hit: + self.setselection(hit) + i = hit[0][0] + p = self.paralist[i] + self.window.show((p.left, p.top), (p.right, p.bottom)) + return 1 + else: + return 0 + # + def showanchor(self, id): + for i in range(len(self.paralist)): + p = self.paralist[i] + if p.hasanchor(id): + long1 = i, 0 + long2 = i, len(p.extract()) + hit = long1, long2 + self.setselection(hit) + self.window.show( \ + (p.left, p.top), (p.right, p.bottom)) + break + + +# GL extensions + +class GLFontCache: + # + def __init__(self): + self.reset() + self.setfont('') + # + def reset(self): + self.fontkey = None + self.fonthandle = None + self.fontinfo = None + self.fontcache = {} + # + def close(self): + self.reset() + # + def setfont(self, fontkey): + if fontkey == '': + fontkey = 'Times-Roman 12' + elif ' ' not in fontkey: + fontkey = fontkey + ' 12' + if fontkey == self.fontkey: + return + if self.fontcache.has_key(fontkey): + handle = self.fontcache[fontkey] + else: + import string + i = string.index(fontkey, ' ') + name, sizestr = fontkey[:i], fontkey[i:] + size = eval(sizestr) + key1 = name + ' 1' + key = name + ' ' + `size` + # NB key may differ from fontkey! 
+ if self.fontcache.has_key(key): + handle = self.fontcache[key] + else: + if self.fontcache.has_key(key1): + handle = self.fontcache[key1] + else: + import fm + handle = fm.findfont(name) + self.fontcache[key1] = handle + handle = handle.scalefont(size) + self.fontcache[fontkey] = \ + self.fontcache[key] = handle + self.fontkey = fontkey + if self.fonthandle <> handle: + self.fonthandle = handle + self.fontinfo = handle.getfontinfo() + handle.setfont() + + +class GLMeasurer(GLFontCache): + # + def textwidth(self, text): + return self.fonthandle.getstrwidth(text) + # + def baseline(self): + return self.fontinfo[6] - self.fontinfo[3] + # + def lineheight(self): + return self.fontinfo[6] + + +class GLWriter(GLFontCache): + # + # NOTES: + # (1) Use gl.ortho2 to use X pixel coordinates! + # + def text(self, (h, v), text): + import gl, fm + gl.cmov2i(h, v + self.fontinfo[6] - self.fontinfo[3]) + fm.prstr(text) + # + def setfont(self, fontkey): + oldhandle = self.fonthandle + GLFontCache.setfont(fontkey) + if self.fonthandle <> oldhandle: + handle.setfont() + + +class GLMeasurerWriter(GLMeasurer, GLWriter): + pass + + +class GLBackEnd(SavingBackEnd): + # + def __init__(self, wid): + import gl + gl.winset(wid) + self.wid = wid + self.width = gl.getsize()[1] + self.height = 0 + self.d = GLMeasurerWriter() + SavingBackEnd.__init__(self) + # + def finish(self): + pass + # + def addpara(self, p): + self.paralist.append(p) + self.height = p.render(self.d, 0, self.height, self.width) + # + def redraw(self): + import gl + gl.winset(self.wid) + width = gl.getsize()[1] + if width <> self.width: + setdocsize = 1 + self.width = width + for p in self.paralist: + p.top = p.bottom = None + d = self.d + v = 0 + for p in self.paralist: + v = p.render(d, 0, v, width) diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py new file mode 100644 index 0000000..af75e0d --- /dev/null +++ b/Lib/sgmllib.py @@ -0,0 +1,321 @@ +# A parser for SGML, using the derived class as static DTD. 
+ +# XXX This only supports those SGML features used by HTML. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). + + +import regex +import string + + +# Regular expressions used for parsing + +incomplete = regex.compile( \ + '<!-?\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*\|</?\|' + \ + '&#[a-zA-Z0-9]*\|&[a-zA-Z][a-zA-Z0-9]*\|&') +entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]') +charref = regex.compile('&#[a-zA-Z0-9]+;') +starttagopen = regex.compile('<[a-zA-Z]') +endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>') +commentopen = regex.compile('<!--') + + +# SGML parser base class -- find tags and call handler functions. +# Usage: p = SGMLParser(); p.feed(data); ...; p.close(). +# The dtd is defined by deriving a class which defines methods +# with special names to handle tags: start_foo and end_foo to handle +# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself. +# (Tags are converted to lower case for this purpose.) The data +# between tags is passed to the parser by calling self.handle_data() +# with some data as argument (the data may be split up in arbutrary +# chunks). Entity references are passed by calling +# self.handle_entityref() with the entity reference as argument. + +class SGMLParser: + + # Interface -- initialize and reset this instance + def __init__(self): + self.reset() + + # Interface -- reset this instance. 
Loses all unprocessed data + def reset(self): + self.rawdata = '' + self.stack = [] + self.nomoretags = 0 + self.literal = 0 + + # For derived classes only -- enter literal mode (CDATA) till EOF + def setnomoretags(self): + self.nomoretags = self.literal = 1 + + # For derived classes only -- enter literal mode (CDATA) + def setliteral(self, *args): + self.literal = 1 + + # Interface -- feed some data to the parser. Call this as + # often as you want, with as little or as much text as you + # want (may include '\n'). (This just saves the text, all the + # processing is done by process() or close().) + def feed(self, data): + self.rawdata = self.rawdata + data + self.goahead(0) + + # Interface -- handle the remaining data + def close(self): + self.goahead(1) + + # Internal -- handle data as far as reasonable. May leave state + # and data to be processed by a subsequent call. If 'end' is + # true, force handling all data as if followed by EOF marker. + def goahead(self, end): + rawdata = self.rawdata + i = 0 + n = len(rawdata) + while i < n: + if self.nomoretags: + self.handle_data(rawdata[i:n]) + i = n + break + j = incomplete.search(rawdata, i) + if j < 0: j = n + if i < j: self.handle_data(rawdata[i:j]) + i = j + if i == n: break + if rawdata[i] == '<': + if starttagopen.match(rawdata, i) >= 0: + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + k = self.parse_starttag(i) + if k < 0: break + i = i + k + continue + k = endtag.match(rawdata, i) + if k >= 0: + j = i+k + self.parse_endtag(rawdata[i:j]) + i = j + self.literal = 0 + continue + if commentopen.match(rawdata, i) >= 0: + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + k = self.parse_comment(i) + if k < 0: break + i = i+k + continue + elif rawdata[i] == '&': + k = charref.match(rawdata, i) + if k >= 0: + j = i+k + self.handle_charref(rawdata[i+2:j-1]) + i = j + continue + k = entityref.match(rawdata, i) + if k >= 0: + j = i+k + 
self.handle_entityref(rawdata[i+1:j-1]) + i = j + continue + else: + raise RuntimeError, 'neither < nor & ??' + # We get here only if incomplete matches but + # nothing else + k = incomplete.match(rawdata, i) + if k < 0: raise RuntimeError, 'no incomplete match ??' + j = i+k + if j == n: break # Really incomplete + self.handle_data(rawdata[i:j]) + i = j + # end while + if end and i < n: + self.handle_data(rawdata[i:n]) + i = n + self.rawdata = rawdata[i:] + # XXX if end: check for empty stack + + # Internal -- parse comment, return length or -1 if not ternimated + def parse_comment(self, i): + rawdata = self.rawdata + if rawdata[i:i+4] <> '<!--': + raise RuntimeError, 'unexpected call to handle_comment' + try: + j = string.index(rawdata, '--', i+4) + except string.index_error: + return -1 + self.handle_comment(rawdata[i+4: j]) + j = j+2 + n = len(rawdata) + while j < n and rawdata[j] in ' \t\n': j = j+1 + if j == n: return -1 # Wait for final '>' + if rawdata[j] == '>': + j = j+1 + else: + print '*** comment not terminated with >' + print repr(rawdata[j-5:j]), '*!*', repr(rawdata[j:j+5]) + return j-i + + # Internal -- handle starttag, return length or -1 if not terminated + def parse_starttag(self, i): + rawdata = self.rawdata + try: + j = string.index(rawdata, '>', i) + except string.index_error: + return -1 + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*') + attrfind = regex.compile( \ + '[ \t\n]+\([a-zA-Z][a-zA-Z0-9]*\)' + \ + '\([ \t\n]*=[ \t\n]*' + \ + '\(\'[^\']*\';\|"[^"]*"\|[-a-zA-Z0-9./:+*%?!()_#]+\)\)?') + k = tagfind.match(rawdata, i+1) + if k < 0: + raise RuntimeError, 'unexpected call to parse_starttag' + k = i+1+k + tag = string.lower(rawdata[i+1:k]) + while k < j: + l = attrfind.match(rawdata, k) + if l < 0: break + regs = attrfind.regs + a1, b1 = regs[1] + a2, b2 = regs[2] + a3, b3 = regs[3] + attrname = rawdata[a1:b1] + if '=' in rawdata[k:k+l]: + attrvalue = rawdata[a3:b3] 
+ if attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + else: + attrvalue = '' + attrs.append(string.lower(attrname), attrvalue) + k = k + l + j = j+1 + try: + method = getattr(self, 'start_' + tag) + except AttributeError: + try: + method = getattr(self, 'do_' + tag) + except AttributeError: + self.unknown_starttag(tag, attrs) + return j-i + method(attrs) + return j-i + self.stack.append(tag) + method(attrs) + return j-i + + # Internal -- parse endtag + def parse_endtag(self, data): + if data[:2] <> '</' or data[-1:] <> '>': + raise RuntimeError, 'unexpected call to parse_endtag' + tag = string.lower(string.strip(data[2:-1])) + try: + method = getattr(self, 'end_' + tag) + except AttributeError: + self.unknown_endtag(tag) + return + if self.stack and self.stack[-1] == tag: + del self.stack[-1] + else: + print '*** Unbalanced </' + tag + '>' + print '*** Stack:', self.stack + found = None + for i in range(len(self.stack)): + if self.stack[i] == tag: found = i + if found <> None: + del self.stack[found:] + method() + + # Example -- handle character reference, no need to override + def handle_charref(self, name): + try: + n = string.atoi(name) + except string.atoi_error: + self.unknown_charref(name) + return + if not 0 <= n <= 255: + self.unknown_charref(name) + return + self.handle_data(chr(n)) + + # Definition of entities -- derived classes may override + entitydefs = \ + {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} + + # Example -- handle entity reference, no need to override + def handle_entityref(self, name): + table = self.__class__.entitydefs + name = string.lower(name) + if table.has_key(name): + self.handle_data(table[name]) + else: + self.unknown_entityref(name) + return + + # Example -- handle data, should be overridden + def handle_data(self, data): + pass + + # Example -- handle comment, could be overridden + def handle_comment(self, data): + pass + + # To be overridden 
-- handlers for unknown objects + def unknown_starttag(self, tag, attrs): pass + def unknown_endtag(self, tag): pass + def unknown_charref(self, ref): pass + def unknown_entityref(self, ref): pass + + +class TestSGML(SGMLParser): + + def handle_data(self, data): + r = repr(data) + if len(r) > 72: + r = r[:35] + '...' + r[-35:] + print 'data:', r + + def handle_comment(self, data): + r = repr(data) + if len(r) > 68: + r = r[:32] + '...' + r[-32:] + print 'comment:', r + + def unknown_starttag(self, tag, attrs): + print 'start tag: <' + tag, + for name, value in attrs: + print name + '=' + '"' + value + '"', + print '>' + + def unknown_endtag(self, tag): + print 'end tag: </' + tag + '>' + + def unknown_entityref(self, ref): + print '*** unknown entity ref: &' + ref + ';' + + def unknown_charref(self, ref): + print '*** unknown char ref: &#' + ref + ';' + + +def test(): + file = 'test.html' + f = open(file, 'r') + x = TestSGML() + while 1: + line = f.readline() + if not line: + x.close() + break + x.feed(line) + + +#test() |