# A parser for XML, using the derived class as static DTD. # Author: Sjoerd Mullender. import re import string version = '0.2' # Regular expressions used for parsing _S = '[ \t\r\n]+' # white space _opS = '[ \t\r\n]*' # optional white space _Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*' # valid XML name _QStr = "(?:'[^']*'|\"[^\"]*\")" # quoted XML string illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content interesting = re.compile('[]&<]') amp = re.compile('&') ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]') entityref = re.compile('&(?P' + _Name + ')[^-a-zA-Z0-9._:]') charref = re.compile('&#(?P[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])') space = re.compile(_S + '$') newline = re.compile('\n') attrfind = re.compile( _S + '(?P' + _Name + ')' '(' + _opS + '=' + _opS + '(?P'+_QStr+'|[-a-zA-Z0-9.:+*%?!()_#=~]+))?') starttagopen = re.compile('<' + _Name) starttagend = re.compile(_opS + '(?P/?)>') starttagmatch = re.compile('<(?P'+_Name+')' '(?P(?:'+attrfind.pattern+')*)'+ starttagend.pattern) endtagopen = re.compile('') endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>') tagfind = re.compile(_Name) cdataopen = re.compile(r'') # this matches one of the following: # SYSTEM SystemLiteral # PUBLIC PubidLiteral SystemLiteral _SystemLiteral = '(?P<%s>'+_QStr+')' _PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \ "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')" _ExternalId = '(?:SYSTEM|' \ 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \ ')'+_S+_SystemLiteral%'syslit' doctype = re.compile(''+_Name+')' '(?:'+_S+_ExternalId+')?'+_opS) xmldecl = re.compile('<\?xml'+_S+ 'version'+_opS+'='+_opS+'(?P'+_QStr+')'+ '(?:'+_S+'encoding'+_opS+'='+_opS+ "(?P'[A-Za-z][-A-Za-z0-9._]*'|" '"[A-Za-z][-A-Za-z0-9._]*"))?' '(?:'+_S+'standalone'+_opS+'='+_opS+ '(?P\'(?:yes|no)\'|"(?:yes|no)"))?'+ _opS+'\?>') procopen = re.compile(r'<\?(?P' + _Name + ')' + _opS) procclose = re.compile(_opS + r'\?>') commentopen = re.compile('') doubledash = re.compile('--') attrtrans = string.maketrans(' \r\n\t', ' ') # definitions for XML namespaces _NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":" ncname = re.compile(_NCName + '$') qname = re.compile('(?:(?P' + _NCName + '):)?' # optional prefix '(?P' + _NCName + ')$') xmlns = re.compile('xmlns(?::(?P'+_NCName+'))?$') # XML parser base class -- find tags and call handler functions. # Usage: p = XMLParser(); p.feed(data); ...; p.close(). # The dtd is defined by deriving a class which defines methods with # special names to handle tags: start_foo and end_foo to handle # and , respectively. The data between tags is passed to the # parser by calling self.handle_data() with some data as argument (the # data may be split up in arbutrary chunks). Entity references are # passed by calling self.handle_entityref() with the entity reference # as argument. class XMLParser: attributes = {} # default, to be overridden elements = {} # default, to be overridden # Interface -- initialize and reset this instance def __init__(self): self.reset() if self.elements is XMLParser.elements: self.__fixelements() def __fixelements(self): self.elements = {} self.__fixdict(self.__dict__) self.__fixclass(self.__class__) def __fixclass(self, kl): self.__fixdict(kl.__dict__) for k in kl.__bases__: self.__fixclass(k) def __fixdict(self, dict): for key, val in dict.items(): if key[:6] == 'start_': key = key[6:] start, end = self.elements.get(key, (None, None)) if start is None: self.elements[key] = val, end elif key[:4] == 'end_': key = key[4:] start, end = self.elements.get(key, (None, None)) if end is None: self.elements[key] = start, val # Interface -- reset this instance. Loses all unprocessed data def reset(self): self.rawdata = '' self.stack = [] self.nomoretags = 0 self.literal = 0 self.lineno = 1 self.__at_start = 1 self.__seen_doctype = None self.__seen_starttag = 0 self.__use_namespaces = 0 self.__namespaces = {'xml':None} # xml is implicitly declared # For derived classes only -- enter literal mode (CDATA) till EOF def setnomoretags(self): self.nomoretags = self.literal = 1 # For derived classes only -- enter literal mode (CDATA) def setliteral(self, *args): self.literal = 1 # Interface -- feed some data to the parser. Call this as # often as you want, with as little or as much text as you # want (may include '\n'). (This just saves the text, all the # processing is done by goahead().) def feed(self, data): self.rawdata = self.rawdata + data self.goahead(0) # Interface -- handle the remaining data def close(self): self.goahead(1) # Interface -- translate references def translate_references(self, data, all = 1): i = 0 while 1: res = amp.search(data, i) if res is None: return data res = ref.match(data, res.start(0)) if res is None: self.syntax_error("bogus `&'") i =i+1 continue i = res.end(0) if data[i - 1] != ';': self.syntax_error("`;' missing after entity/char reference") i = i-1 str = res.group(1) pre = data[:res.start(0)] post = data[i:] if str[0] == '#': if str[1] == 'x': str = chr(string.atoi(str[2:], 16)) else: str = chr(string.atoi(str[1:])) data = pre + str + post i = res.start(0)+len(str) elif all: if self.entitydefs.has_key(str): data = pre + self.entitydefs[str] + post i = res.start(0) # rescan substituted text else: self.syntax_error('reference to unknown entity') # can't do it, so keep the entity ref in data = pre + '&' + str + ';' + post i = res.start(0) + len(str) + 2 else: # just translating character references pass # i is already postioned correctly # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if i > 0: self.__at_start = 0 if self.nomoretags: data = rawdata[i:n] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = n break res = interesting.search(rawdata, i) if res: j = res.start(0) else: j = n if i < j: data = rawdata[i:j] if self.__at_start and space.match(data) is None: self.syntax_error('illegal data at start of file') self.__at_start = 0 if not self.stack and space.match(data) is None: self.syntax_error('data not in content') if illegal.search(data): self.syntax_error('illegal character in content') self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = j if i == n: break if rawdata[i] == '<': if starttagopen.match(rawdata, i): if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = i+1 continue k = self.parse_starttag(i) if k < 0: break self.__seen_starttag = 1 self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue if endtagopen.match(rawdata, i): k = self.parse_endtag(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue if commentopen.match(rawdata, i): if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = i+1 continue k = self.parse_comment(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue if cdataopen.match(rawdata, i): k = self.parse_cdata(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:i], '\n') i = k continue res = xmldecl.match(rawdata, i) if res: if not self.__at_start: self.syntax_error(" declaration not at start of document") version, encoding, standalone = res.group('version', 'encoding', 'standalone') if version[1:-1] != '1.0': raise RuntimeError, 'only XML version 1.0 supported' if encoding: encoding = encoding[1:-1] if standalone: standalone = standalone[1:-1] self.handle_xml(encoding, standalone) i = res.end(0) continue res = procopen.match(rawdata, i) if res: k = self.parse_proc(i) if k < 0: break self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue res = doctype.match(rawdata, i) if res: if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = i+1 continue if self.__seen_doctype: self.syntax_error('multiple DOCTYPE elements') if self.__seen_starttag: self.syntax_error('DOCTYPE not at beginning of document') k = self.parse_doctype(res) if k < 0: break self.__seen_doctype = res.group('name') self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue elif rawdata[i] == '&': if self.literal: data = rawdata[i] self.handle_data(data) i = i+1 continue res = charref.match(rawdata, i) if res is not None: i = res.end(0) if rawdata[i-1] != ';': self.syntax_error("`;' missing in charref") i = i-1 if not self.stack: self.syntax_error('data not in content') self.handle_charref(res.group('char')[:-1]) self.lineno = self.lineno + string.count(res.group(0), '\n') continue res = entityref.match(rawdata, i) if res is not None: i = res.end(0) if rawdata[i-1] != ';': self.syntax_error("`;' missing in entityref") i = i-1 name = res.group('name') if self.entitydefs.has_key(name): self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:] n = len(rawdata) i = res.start(0) else: self.syntax_error('reference to unknown entity') self.unknown_entityref(name) self.lineno = self.lineno + string.count(res.group(0), '\n') continue elif rawdata[i] == ']': if self.literal: data = rawdata[i] self.handle_data(data) i = i+1 continue if n-i < 3: break if cdataclose.match(rawdata, i): self.syntax_error("bogus `]]>'") self.handle_data(rawdata[i]) i = i+1 continue else: raise RuntimeError, 'neither < nor & ??' # We get here only if incomplete matches but # nothing else break # end while if i > 0: self.__at_start = 0 if end and i < n: data = rawdata[i] self.syntax_error("bogus `%s'" % data) if illegal.search(data): self.syntax_error('illegal character in content') self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') self.rawdata = rawdata[i+1:] return self.goahead(end) self.rawdata = rawdata[i:] if end: if not self.__seen_starttag: self.syntax_error('no elements in file') if self.stack: self.syntax_error('missing end tags') while self.stack: self.finish_endtag(self.stack[-1][0]) # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i): rawdata = self.rawdata if rawdata[i:i+4] <> '