diff options
-rw-r--r-- | Doc/tools/sgmlconv/esistools.py | 319 |
1 files changed, 273 insertions, 46 deletions
Replace the DOM-based ExtendedEsisBuilder with a SAX-style ESISReader
(plus the Attributes and Locator helpers and a pulldom-based parse()
function), and teach decode() the "\%<number>;" character escape.

Review notes on the added ("+") side:
  * ESISReader.parse() starts with an unconditional "raise RuntimeError",
    so only the feed()/close() interface is usable as written.
  * Some "+" lines were corrected during review, so the post-image blob id
    on the "index" line below is informational only.

diff --git a/Doc/tools/sgmlconv/esistools.py b/Doc/tools/sgmlconv/esistools.py
index b89476c..40c9121 100644
--- a/Doc/tools/sgmlconv/esistools.py
+++ b/Doc/tools/sgmlconv/esistools.py
@@ -3,26 +3,33 @@ __version__ = '$Revision$'

 import re
 import string
-import sys
-import xml.dom.core
-import xml.dom.esis_builder
+import xml.dom.pulldom


-_data_rx = re.compile(r"[^\\][^\\]*")
+import xml.sax
+import xml.sax.handler
+import xml.sax.xmlreader
+
+
+_data_match = re.compile(r"[^\\][^\\]*").match

 def decode(s):
     r = ''
     while s:
-        m = _data_rx.match(s)
+        m = _data_match(s)
         if m:
             r = r + m.group()
-            s = s[len(m.group()):]
+            s = s[m.end():]
         elif s[1] == "\\":
             r = r + "\\"
             s = s[2:]
         elif s[1] == "n":
             r = r + "\n"
             s = s[2:]
+        elif s[1] == "%":
+            s = s[2:]
+            n, s = s.split(";", 1)
+            r = r + unichr(int(n))
         else:
             raise ValueError, "can't handle " + `s`
     return r
@@ -35,49 +42,269 @@ _charmap["\n"] = r"\n"
 _charmap["\\"] = r"\\"
 del c

+_null_join = ''.join
 def encode(s):
-    return string.join(map(_charmap.get, s), '')
+    return _null_join(map(_charmap.get, s))


-class ExtendedEsisBuilder(xml.dom.esis_builder.EsisBuilder):
-    def __init__(self, *args, **kw):
-        self.__empties = {}
-        self.__is_empty = 0
-        apply(xml.dom.esis_builder.EsisBuilder.__init__, (self,) + args, kw)
-        self.buildFragment()
+class ESISReader(xml.sax.xmlreader.XMLReader):
+    """SAX Reader which reads from an ESIS stream.

-    def feed(self, data):
-        for line in string.split(data, '\n'):
-            if not line:
-                break
-            event = line[0]
-            text = line[1:]
-            if event == '(':
-                element = self.document.createElement(text, self.attr_store)
-                self.attr_store = {}
-                self.push(element)
-                if self.__is_empty:
-                    self.__empties[text] = text
-                    self.__is_empty = 0
-            elif event == ')':
-                self.pop()
-            elif event == 'A':
-                l = re.split(' ', text, 2)
-                name = l[0]
-                value = decode(l[2])
-                self.attr_store[name] = value
-            elif event == '-':
-                text = self.document.createText(decode(text))
-                self.push(text)
-            elif event == 'C':
-                return
-            elif event == 'e':
-                self.__is_empty = 1
-            elif event == '&':
-                eref = self.document.createEntityReference(text)
-                self.push(eref)
-            else:
-                sys.stderr.write('Unknown event: %s\n' % line)
+    No verification of the document structure is performed by the
+    reader; a general verifier could be used as the target
+    ContentHandler instance.
+
+    """
+    _decl_handler = None
+    _lexical_handler = None
+
+    _public_id = None
+    _system_id = None
+
+    _buffer = ""
+    _is_empty = 0
+    _lineno = 0
+    _started = 0
+
+    def __init__(self, contentHandler=None, errorHandler=None):
+        xml.sax.xmlreader.XMLReader.__init__(self)
+        self._attrs = {}
+        self._attributes = Attributes(self._attrs)
+        self._locator = Locator()
+        self._empties = {}
+        if contentHandler:
+            self.setContentHandler(contentHandler)
+        if errorHandler:
+            self.setErrorHandler(errorHandler)

     def get_empties(self):
-        return self.__empties.keys()
+        return self._empties.keys()
+
+    #
+    #  XMLReader interface
+    #
+
+    def parse(self, source):
+        raise RuntimeError  # NOTE(review): stub -- code below never runs
+        self._locator._public_id = source.getPublicId()
+        self._locator._system_id = source.getSystemId()
+        fp = source.getByteStream()
+        handler = self.getContentHandler()
+        if handler:
+            handler.startDocument()
+        lineno = 0
+        while 1:
+            token, data = self._get_token(fp)
+            if token is None:
+                break
+            lineno = lineno + 1
+            self._locator._lineno = lineno
+            self._handle_token(token, data)
+        handler = self.getContentHandler()
+        if handler:
+            handler.endDocument()
+
+    def feed(self, data):
+        if not self._started:
+            handler = self.getContentHandler()
+            if handler:
+                handler.startDocument()
+            self._started = 1
+        data = self._buffer + data
+        self._buffer = None
+        lines = data.split("\n")
+        if lines:
+            for line in lines[:-1]:
+                self._lineno = self._lineno + 1
+                self._locator._lineno = self._lineno
+                if not line:
+                    e = xml.sax.SAXParseException(
+                        "ESIS input line contains no token type mark",
+                        None, self._locator)
+                    self.getErrorHandler().error(e)
+                else:
+                    self._handle_token(line[0], line[1:])
+            self._buffer = lines[-1]
+        else:
+            self._buffer = ""
+
+    def close(self):
+        handler = self.getContentHandler()
+        if handler:
+            handler.endDocument()
+        self._buffer = ""
+
+    def _get_token(self, fp):
+        try:
+            line = fp.readline()
+        except IOError, e:
+            e = xml.sax.SAXException("I/O error reading input stream", e)
+            self.getErrorHandler().fatalError(e)
+            return None, None
+        if not line:
+            return None, None
+        if line[-1] == "\n":
+            line = line[:-1]
+        if not line:
+            e = xml.sax.SAXParseException(
+                "ESIS input line contains no token type mark",
+                None, self._locator)
+            self.getErrorHandler().error(e)
+            return None, None
+        return line[0], line[1:]
+
+    def _handle_token(self, token, data):
+        handler = self.getContentHandler()
+        if token == '-':
+            if data and handler:
+                handler.characters(decode(data))
+        elif token == ')':
+            if handler:
+                handler.endElement(decode(data))
+        elif token == '(':
+            if self._is_empty:
+                self._empties[data] = 1
+            if handler:
+                handler.startElement(data, self._attributes)
+            self._attrs.clear()
+            self._is_empty = 0
+        elif token == 'A':
+            name, value = data.split(' ', 1)
+            if value != "IMPLIED":
+                type, value = value.split(' ', 1)
+                self._attrs[name] = (decode(value), type)
+        elif token == '&':
+            # entity reference in SAX?
+            pass
+        elif token == '?':
+            if handler:
+                if ' ' in data:
+                    target, data = data.split(None, 1)
+                else:
+                    target, data = data, ""
+                handler.processingInstruction(target, decode(data))
+        elif token == 'N':
+            handler = self.getDTDHandler()
+            if handler:
+                handler.notationDecl(data, self._public_id, self._system_id)
+            self._public_id = None
+            self._system_id = None
+        elif token == 'p':
+            self._public_id = decode(data)
+        elif token == 's':
+            self._system_id = decode(data)
+        elif token == 'e':
+            self._is_empty = 1
+        elif token == 'C':
+            pass
+        else:
+            e = xml.sax.SAXParseException(
+                "unknown ESIS token in event stream", None, self._locator)
+            self.getErrorHandler().error(e)
+
+    def setContentHandler(self, handler):
+        old = self.getContentHandler()
+        if old:
+            old.setDocumentLocator(None)
+        if handler:
+            handler.setDocumentLocator(self._locator)
+        xml.sax.xmlreader.XMLReader.setContentHandler(self, handler)
+
+    def getProperty(self, property):
+        if property == xml.sax.handler.property_lexical_handler:
+            return self._lexical_handler
+
+        elif property == xml.sax.handler.property_declaration_handler:
+            return self._decl_handler
+
+        else:
+            raise xml.sax.SAXNotRecognizedException("unknown property %s"
+                                                    % `property`)
+
+    def setProperty(self, property, value):
+        if property == xml.sax.handler.property_lexical_handler:
+            if self._lexical_handler:
+                self._lexical_handler.setDocumentLocator(None)
+            if value:
+                value.setDocumentLocator(self._locator)
+            self._lexical_handler = value
+
+        elif property == xml.sax.handler.property_declaration_handler:
+            if self._decl_handler:
+                self._decl_handler.setDocumentLocator(None)
+            if value:
+                value.setDocumentLocator(self._locator)
+            self._decl_handler = value
+
+        else:
+            raise xml.sax.SAXNotRecognizedException("unknown property %s" % `property`)
+
+    def getFeature(self, feature):
+        if feature == xml.sax.handler.feature_namespaces:
+            return 1
+        else:
+            return xml.sax.xmlreader.XMLReader.getFeature(self, feature)
+
+    def setFeature(self, feature, enabled):
+        if feature == xml.sax.handler.feature_namespaces:
+            pass
+        else:
+            xml.sax.xmlreader.XMLReader.setFeature(self, feature, enabled)
+
+
+class Attributes(xml.sax.xmlreader.AttributesImpl):
+    # self._attrs has the form {name: (value, type)}
+
+    def getType(self, name):
+        return self._attrs[name][1]
+
+    def getValue(self, name):
+        return self._attrs[name][0]
+
+    def getValueByQName(self, name):
+        return self._attrs[name][0]
+
+    def __getitem__(self, name):
+        return self._attrs[name][0]
+
+    def get(self, name, default=None):
+        if self._attrs.has_key(name):
+            return self._attrs[name][0]
+        return default
+
+    def items(self):
+        L = []
+        for name, (value, type) in self._attrs.items():
+            L.append((name, value))
+        return L
+
+    def values(self):
+        L = []
+        for value, type in self._attrs.values():
+            L.append(value)
+        return L
+
+
+class Locator(xml.sax.xmlreader.Locator):
+    _lineno = -1
+    _public_id = None
+    _system_id = None
+
+    def getLineNumber(self):
+        return self._lineno
+
+    def getPublicId(self):
+        return self._public_id
+
+    def getSystemId(self):
+        return self._system_id
+
+
+def parse(stream_or_string, parser=None):
+    if type(stream_or_string) in [type(""), type(u"")]:
+        stream = open(stream_or_string)
+    else:
+        stream = stream_or_string
+    if not parser:
+        parser = ESISReader()
+    return xml.dom.pulldom.DOMEventStream(stream, parser, (2 ** 14) - 20)