diff options
Diffstat (limited to 'libxml2/python/drv_libxml2.py')
-rw-r--r-- | libxml2/python/drv_libxml2.py | 379 |
1 files changed, 0 insertions, 379 deletions
# -*- coding: iso-8859-1 -*-
"""A SAX2 driver for libxml2, on top of its XmlReader API.

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""

# The accented character is spelled as an escape so that the value does not
# depend on the encoding the source file happens to be saved in.
__author__ = "St\xe9phane Bidoul <sbi@skynet.be>"
__version__ = "0.3"

import sys
import codecs

if sys.version_info[0] < 3:
    # turn the byte string into a unicode object
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)  # noqa: F821 (Python 2 only branch)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode a UTF-8 byte string coming from libxml2 (None passes through)."""
        if s is None:
            return s
        return _decoder(s)[0]
else:
    StringTypes = str

    def _d(s):
        """Return s unchanged: it is a Unicode `str` already."""
        return s

# The star import is kept on purpose: it is what brings the SAX exception
# classes used below into this module's namespace.
from xml.sax._exceptions import *
from xml.sax import xmlreader, saxutils
from xml.sax.handler import \
     feature_namespaces, \
     feature_namespace_prefixes, \
     feature_string_interning, \
     feature_validation, \
     feature_external_ges, \
     feature_external_pes, \
     property_lexical_handler, \
     property_declaration_handler, \
     property_dom_node, \
     property_xml_string

try:
    import libxml2
except ImportError:
    # SAXReaderNotAvailable is the exception this driver raises when its
    # backing library cannot be imported.
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])


class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        # the wrapped libxml2 locator object
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        # always -1: no column is obtained from the wrapped locator
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()


class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that drives the handlers from a libxml2 text reader.

    The document is pulled node by node in parse() and every node is
    translated into the matching ContentHandler / LexicalHandler callback.
    Diagnostics delivered by libxml2 are accumulated by _errorHandler and
    forwarded to the ErrorHandler after each read step.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the reader: queue one diagnostic.

        The message is wrapped in a SAXParseException and stored together
        with its severity until _reportErrors is called.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the queued diagnostics to the ErrorHandler.

        fatal -- true when the parse is about to stop; the last queued
                 diagnostic is then reported through fatalError() unless it
                 is a warning.
        """
        # Detach the queue before dispatching: an error handler is allowed
        # to raise, and the pending entries must not survive that and be
        # replayed during a later parse.
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        last = len(errors) - 1
        for index, (severity, exception) in enumerate(errors):
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and index == last:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse source and fire the SAX events on the registered handlers.

        source -- a file name / URL string, or anything accepted by
                  xml.sax.saxutils.prepare_input_source().

        endDocument() is only sent when the end of the document is reached,
        not after a fatal read error.  The reader is closed and the parsing
        flag is reset even when a handler raises.
        """
        self.__parsing = 1
        # drop diagnostics left behind by a previous parse that was aborted
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # the local name keeps the buffer referenced while the
                # reader is in use
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            try:
                reader.SetErrorHandler(self._errorHandler, None)
                self._configureReader(reader)
                # start loop
                self._cont_handler.startDocument()
                if self._readEvents(reader):
                    self._cont_handler.endDocument()
            finally:
                reader.Close()
        finally:
            self.__parsing = 0

    def _configureReader(self, reader):
        """Apply the feature flags of this parser to a fresh reader."""
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def _readEvents(self, reader):
        """Read every node from reader and dispatch the matching SAX event.

        Returns True when the end of the document was reached and False
        when reading stopped on a fatal error.
        """
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributes = xmlreader.AttributesNSImpl({}, {})
        else:
            attributes = xmlreader.AttributesImpl({})
        # prefixes to pop (for endPrefixMapping), one list per open element
        prefixes = []
        while True:
            r = reader.Read()
            # check for errors
            if r == 1:
                if self.__errors is not None:
                    self._reportErrors(0)
            elif r == 0:
                if self.__errors is not None:
                    self._reportErrors(0)
                return True  # end of parse
            else:
                if self.__errors is not None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(
                        SAXException("Read failed (no details available)"))
                return False  # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            # Element
            if nodeType == 1:
                if self.__ns:
                    self._startElementNS(reader, attributes, prefixes)
                else:
                    self._startElement(reader, attributes)
            # EndElement
            elif nodeType == 15:
                if self.__ns:
                    self._cont_handler.endElementNS(
                        (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                        _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            # Text
            elif nodeType == 3:
                self._cont_handler.characters(_d(reader.Value()))
            # Whitespace
            elif nodeType == 13:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            # SignificantWhitespace
            elif nodeType == 14:
                self._cont_handler.characters(_d(reader.Value()))
            # CDATA
            elif nodeType == 4:
                if self.__lex_handler is not None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if self.__lex_handler is not None:
                    self.__lex_handler.endCDATA()
            # EntityReference
            elif nodeType == 5:
                if self.__lex_handler is not None:
                    # entity boundaries are lexical events: they belong to
                    # the lexical handler, this reader has no such method
                    self.__lex_handler.startEntity(_d(reader.Name()))
                # NOTE(review): the module docstring says entity support is
                # "waiting for XmlReader.ResolveEntity", so this method may
                # be missing from the binding -- confirm before relying on it.
                reader.ResolveEntity()
            # EndEntity
            elif nodeType == 16:
                if self.__lex_handler is not None:
                    self.__lex_handler.endEntity(_d(reader.Name()))
            # ProcessingInstruction
            elif nodeType == 7:
                self._cont_handler.processingInstruction(
                    _d(reader.Name()), _d(reader.Value()))
            # Comment
            elif nodeType == 8:
                if self.__lex_handler is not None:
                    self.__lex_handler.comment(_d(reader.Value()))
            # DocumentType
            elif nodeType == 10:
                pass  # TODO startDTD (how to detect endDTD? on first non-dtd event?)
            # XmlDeclaration
            elif nodeType == 17:
                pass  # TODO
            # Entity
            elif nodeType == 6:
                pass  # TODO (entity decl)
            # Notation (decl)
            elif nodeType == 12:
                pass  # TODO
            # Not handled on purpose:
            #   2  Attribute (never in this loop)
            #   9  Document (not exposed)
            #   11 DocumentFragment (never returned by XmlReader)
            #   0  None
            else:
                raise SAXException("Unexpected node type %d" % nodeType)

    def _startElementNS(self, reader, attributes, prefixes):
        """Emit the namespace-aware events for the current element node.

        attributes -- the reused AttributesNSImpl, refilled for this element
        prefixes   -- stack receiving the prefixes declared by this element
                      so that the matching EndElement can end their mappings
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            # Only "xmlns" and "xmlns:<prefix>" declare a namespace; a plain
            # attribute whose name merely begins with "xmlns" does not.
            if qname == "xmlns" or qname.startswith("xmlns:"):
                if qname == "xmlns":
                    newPrefix = None  # default namespace
                else:
                    newPrefix = qname[6:]
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue  # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()),
                       _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node follows an empty element: close it here
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _startElement(self, reader, attributes):
        """Emit the non-namespace events for the current element node.

        attributes -- the reused AttributesImpl, refilled for this element
        """
        eltName = _d(reader.Name())
        attributes._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributes)
        if reader.IsEmptyElement():
            # no EndElement node follows an empty element: close it here
            self._cont_handler.endElement(eltName)

    def setDTDHandler(self, handler):
        "Always raises SAXNotSupportedException."
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        "Always raises SAXNotSupportedException."
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the feature called name.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set the feature called name to state.

        Raises SAXNotSupportedException while a parse is running or when
        feature_external_ges is set to 0, and SAXNotRecognizedException for
        an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of the property called name.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set the property called name to value.

        Only the lexical handler can be set.  Setting the declaration
        handler raises SAXNotSupportedException and any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, store value in
            # self.__decl_handler instead of raising
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)


def create_parser():
    "Return a new LibXml2Reader (entry point used by xml.sax.make_parser)."
    return LibXml2Reader()