diff options
author | Guido van Rossum <guido@python.org> | 1998-04-03 16:02:39 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 1998-04-03 16:02:39 (GMT) |
commit | 7e07b3845b27f1f9c30733431ebd4dccbae8b9d4 (patch) | |
tree | c502f613fcbd3d0d495399ab3e5c70c1bd2b4e99 /Lib/xmllib.py | |
parent | 0454b51282efd92564b5e338e8a364be3054bf10 (diff) | |
download | cpython-7e07b3845b27f1f9c30733431ebd4dccbae8b9d4.zip cpython-7e07b3845b27f1f9c30733431ebd4dccbae8b9d4.tar.gz cpython-7e07b3845b27f1f9c30733431ebd4dccbae8b9d4.tar.bz2 |
Sjoerd's latest.
Diffstat (limited to 'Lib/xmllib.py')
-rw-r--r-- | Lib/xmllib.py | 312 |
1 files changed, 195 insertions, 117 deletions
diff --git a/Lib/xmllib.py b/Lib/xmllib.py index 6d7f1d1..355714f 100644 --- a/Lib/xmllib.py +++ b/Lib/xmllib.py @@ -5,34 +5,50 @@ import re import string +version = '0.1' + # Regular expressions used for parsing _S = '[ \t\r\n]+' _opS = '[ \t\r\n]*' _Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*' -interesting = re.compile('[&<]') -incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|' - '<([a-zA-Z_:][^<>]*|' - '/([a-zA-Z_:][^<>]*)?|' - '![^<>]*|' - r'\?[^<>]*)?') - -ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?') +illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content +interesting = re.compile('[]&<]') + +amp = re.compile('&') +ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]') entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]') charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])') -space = re.compile(_S) +space = re.compile(_S + '$') newline = re.compile('\n') starttagopen = re.compile('<' + _Name) endtagopen = re.compile('</') starttagend = re.compile(_opS + '(?P<slash>/?)>') -endbracket = re.compile('>') +endbracket = re.compile(_opS + '>') tagfind = re.compile(_Name) cdataopen = re.compile(r'<!\[CDATA\[') cdataclose = re.compile(r'\]\]>') -doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' + _S) -special = re.compile('<!(?P<special>[^<>]*)>') -procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S) +# this matches one of the following: +# SYSTEM SystemLiteral +# PUBLIC PubidLiteral SystemLiteral +_SystemLiteral = '(?P<%s>\'[^\']*\'|"[^"]*")' +_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \ + "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')" +_ExternalId = '(?:SYSTEM|' \ + 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \ + ')'+_S+_SystemLiteral%'syslit' +doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')' + '(?:'+_S+_ExternalId+')?'+_opS) +xmldecl = re.compile('<\?xml'+_S+ + 'version'+_opS+'='+_opS+'(?P<version>\'[^\']*\'|"[^"]*")'+ + 
'(?:'+_S+'encoding'+_opS+'='+_opS+ + "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|" + '"[A-Za-z][-A-Za-z0-9._]*"))?' + '(?:'+_S+'standalone'+_opS+'='+_opS+ + '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+ + _opS+'\?>') +procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS) procclose = re.compile(_opS + r'\?>') commentopen = re.compile('<!--') commentclose = re.compile('-->') @@ -41,6 +57,7 @@ attrfind = re.compile( _S + '(?P<name>' + _Name + ')' '(' + _opS + '=' + _opS + '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))') +attrtrans = string.maketrans(' \r\n\t', ' ') # XML parser base class -- find tags and call handler functions. @@ -92,30 +109,43 @@ class XMLParser: self.goahead(1) # Interface -- translate references - def translate_references(self, data): - newdata = [] + def translate_references(self, data, all = 1): i = 0 while 1: - res = ref.search(data, i) + res = amp.search(data, i) if res is None: - newdata.append(data[i:]) - return string.join(newdata, '') - if data[res.end(0) - 1] != ';': + return data + res = ref.match(data, res.start(0)) + if res is None: + self.syntax_error("bogus `&'") + i =i+1 + continue + i = res.end(0) + if data[i - 1] != ';': self.syntax_error("`;' missing after entity/char reference") - newdata.append(data[i:res.start(0)]) + i = i-1 str = res.group(1) + pre = data[:res.start(0)] + post = data[i:] if str[0] == '#': if str[1] == 'x': - newdata.append(chr(string.atoi(str[2:], 16))) + str = chr(string.atoi(str[2:], 16)) else: - newdata.append(chr(string.atoi(str[1:]))) - else: - try: - newdata.append(self.entitydefs[str]) - except KeyError: + str = chr(string.atoi(str[1:])) + data = pre + str + post + i = res.start(0)+len(str) + elif all: + if self.entitydefs.has_key(str): + data = pre + self.entitydefs[str] + post + i = res.start(0) # rescan substituted text + else: + self.syntax_error('reference to unknown entity') # can't do it, so keep the entity ref in - newdata.append('&' + str + ';') - i = res.end(0) + data = pre 
+ '&' + str + ';' + post + i = res.start(0) + len(str) + 2 + else: + # just translating character references + pass # i is already postioned correctly # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is @@ -139,8 +169,14 @@ class XMLParser: else: j = n if i < j: + if self.__at_start: + self.syntax_error('illegal data at start of file') self.__at_start = 0 data = rawdata[i:j] + if not self.stack and not space.match(data): + self.syntax_error('data not in content') + if illegal.search(data): + self.syntax_error('illegal character in content') self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') i = j @@ -184,6 +220,20 @@ class XMLParser: self.lineno = self.lineno + string.count(rawdata[i:i], '\n') i = k continue + res = xmldecl.match(rawdata, i) + if res: + if not self.__at_start: + self.syntax_error("<?xml?> declaration not at start of document") + version, encoding, standalone = res.group('version', + 'encoding', + 'standalone') + if version[1:-1] != '1.0': + raise RuntimeError, 'only XML version 1.0 supported' + if encoding: encoding = encoding[1:-1] + if standalone: standalone = standalone[1:-1] + self.handle_xml(encoding, standalone) + i = res.end(0) + continue res = procopen.match(rawdata, i) if res: k = self.parse_proc(i) @@ -209,18 +259,6 @@ class XMLParser: self.lineno = self.lineno + string.count(rawdata[i:k], '\n') i = k continue - res = special.match(rawdata, i) - if res: - if self.literal: - data = rawdata[i] - self.handle_data(data) - self.lineno = self.lineno + string.count(data, '\n') - i = i+1 - continue - self.handle_special(res.group('special')) - self.lineno = self.lineno + string.count(res.group(0), '\n') - i = res.end(0) - continue elif rawdata[i] == '&': res = charref.match(rawdata, i) if res is not None: @@ -228,6 +266,8 @@ class XMLParser: if rawdata[i-1] != ';': self.syntax_error("`;' missing in charref") i = i-1 + if not self.stack: + 
self.syntax_error('data not in content') self.handle_charref(res.group('char')[:-1]) self.lineno = self.lineno + string.count(res.group(0), '\n') continue @@ -237,36 +277,45 @@ class XMLParser: if rawdata[i-1] != ';': self.syntax_error("`;' missing in entityref") i = i-1 - self.handle_entityref(res.group('name')) + name = res.group('name') + if self.entitydefs.has_key(name): + self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:] + n = len(rawdata) + i = res.start(0) + else: + self.syntax_error('reference to unknown entity') + self.unknown_entityref(name) self.lineno = self.lineno + string.count(res.group(0), '\n') continue + elif rawdata[i] == ']': + if n-i < 3: + break + if cdataclose.match(rawdata, i): + self.syntax_error("bogus `]]>'") + self.handle_data(rawdata[i]) + i = i+1 + continue else: raise RuntimeError, 'neither < nor & ??' # We get here only if incomplete matches but # nothing else - res = incomplete.match(rawdata, i) - if not res: - data = rawdata[i] - self.handle_data(data) - self.lineno = self.lineno + string.count(data, '\n') - i = i+1 - continue - j = res.end(0) - if j == n: - break # Really incomplete - self.syntax_error("bogus `<' or `&'") - data = res.group(0) - self.handle_data(data) - self.lineno = self.lineno + string.count(data, '\n') - i = j + break # end while + if i > 0: + self.__at_start = 0 if end and i < n: - data = rawdata[i:n] + data = rawdata[i] + self.syntax_error("bogus `%s'" % data) + if illegal.search(data): + self.syntax_error('illegal character in content') self.handle_data(data) self.lineno = self.lineno + string.count(data, '\n') - i = n + self.rawdata = rawdata[i+1:] + return self.goahead(end) self.rawdata = rawdata[i:] if end: + if not self.__seen_starttag: + self.syntax_error('no elements in file') if self.stack: self.syntax_error('missing end tags') while self.stack: @@ -280,9 +329,12 @@ class XMLParser: res = commentclose.search(rawdata, i+4) if not res: return -1 - # doubledash search 
will succeed because it's a subset of commentclose - if doubledash.search(rawdata, i+4).start(0) < res.start(0): + if doubledash.search(rawdata, i+4, res.start(0)): self.syntax_error("`--' inside comment") + if rawdata[res.start(0)-1] == '-': + self.syntax_error('comment cannot end in three dashes') + if illegal.search(rawdata, i+4, res.start(0)): + self.syntax_error('illegal character in comment') self.handle_comment(rawdata[i+4: res.start(0)]) return res.end(0) @@ -291,28 +343,59 @@ class XMLParser: rawdata = self.rawdata n = len(rawdata) name = res.group('name') + pubid, syslit = res.group('pubid', 'syslit') + if pubid is not None: + pubid = pubid[1:-1] # remove quotes + pubid = string.join(string.split(pubid)) # normalize + if syslit is not None: syslit = syslit[1:-1] # remove quotes j = k = res.end(0) - level = 0 - while k < n: - c = rawdata[k] - if c == '<': - level = level + 1 - elif c == '>': - if level == 0: - self.handle_doctype(name, rawdata[j:k]) - return k+1 - level = level - 1 + if k >= n: + return -1 + if rawdata[k] == '[': + level = 0 k = k+1 - return -1 + dq = sq = 0 + while k < n: + c = rawdata[k] + if not sq and c == '"': + dq = not dq + elif not dq and c == "'": + sq = not sq + elif sq or dq: + pass + elif level <= 0 and c == ']': + res = endbracket.match(rawdata, k+1) + if not res: + return -1 + self.handle_doctype(name, pubid, syslit, rawdata[j+1:k]) + return res.end(0) + elif c == '<': + level = level + 1 + elif c == '>': + level = level - 1 + if level < 0: + self.syntax_error("bogus `>' in DOCTYPE") + k = k+1 + res = endbracket.search(rawdata, k) + if not res: + return -1 + if res.start(0) != k: + self.syntax_error('garbage in DOCTYPE') + self.handle_doctype(name, pubid, syslit, None) + return res.end(0) # Internal -- handle CDATA tag, return length or -1 if not terminated def parse_cdata(self, i): rawdata = self.rawdata if rawdata[i:i+9] <> '<![CDATA[': - raise RuntimeError, 'unexpected call to handle_cdata' + raise RuntimeError, 
'unexpected call to parse_cdata' res = cdataclose.search(rawdata, i+9) if not res: return -1 + if illegal.search(rawdata, i+9, res.start(0)): + self.syntax_error('illegal character in CDATA') + if not self.stack: + self.syntax_error('CDATA not in content') self.handle_cdata(rawdata[i+9:res.start(0)]) return res.end(0) @@ -324,24 +407,15 @@ class XMLParser: if not end: return -1 j = end.start(0) + if illegal.search(rawdata, i+2, j): + self.syntax_error('illegal character in processing instruction') res = tagfind.match(rawdata, i+2) if not res: raise RuntimeError, 'unexpected call to parse_proc' k = res.end(0) name = res.group(0) - if name == 'xml': - if self.__at_start: - attrdict, k = self.parse_attributes('xml', k, j, - self.__xml_attributes) - if k != j: - self.syntax_error('garbage at end of <?xml?>') - if attrdict['version'] != '1.0': - self.syntax_error('only XML version 1.0 supported') - self.handle_xml(attrdict.get('encoding', None), - attrdict['standalone']) - return end.end(0) - else: - self.syntax_error("<?xml?> tag not at start of document") + if string.find(string.lower(name), 'xml') >= 0: + self.syntax_error('illegal processing instruction target name') self.handle_proc(name, rawdata[k:j]) return end.end(0) @@ -375,6 +449,7 @@ class XMLParser: (attrname, tag)) if attrdict.has_key(attrname): self.syntax_error('attribute specified twice') + attrvalue = string.translate(attrvalue, attrtrans) attrdict[attrname] = self.translate_references(attrvalue) k = res.end(0) if attributes is not None: @@ -400,6 +475,8 @@ class XMLParser: if not self.__seen_starttag and self.__seen_doctype: if tag != self.__seen_doctype: self.syntax_error('starttag does not match DOCTYPE') + if self.__seen_starttag and not self.stack: + self.syntax_error('multiple elements on top level') if hasattr(self, tag + '_attributes'): attributes = getattr(self, tag + '_attributes') else: @@ -428,10 +505,7 @@ class XMLParser: tag = res.group(0) k = res.end(0) if k != end.start(0): - # check 
that there is only white space at end of tag - res = space.match(rawdata, k) - if res is None or res.end(0) != end.start(0): - self.syntax_error('garbage in end tag') + self.syntax_error('garbage in end tag') self.finish_endtag(tag) return end.end(0) @@ -439,17 +513,18 @@ class XMLParser: # Return -1 for unknown tag, 1 for balanced tag def finish_starttag(self, tag, attrs): self.stack.append(tag) - try: - method = getattr(self, 'start_' + tag) - except AttributeError: - self.unknown_starttag(tag, attrs) - return -1 - else: + methodname = 'start_' + tag + if hasattr(self, methodname): + method = getattr(self, methodname) self.handle_starttag(tag, method, attrs) return 1 + else: + self.unknown_starttag(tag, attrs) + return -1 # Internal -- finish processing of end tag def finish_endtag(self, tag): + methodname = 'end_' + tag if not tag: self.syntax_error('name-less end tag') found = len(self.stack) - 1 @@ -459,9 +534,10 @@ class XMLParser: else: if tag not in self.stack: self.syntax_error('unopened end tag') - try: - method = getattr(self, 'end_' + tag) - except AttributeError: + if hasattr(self, methodname): + method = getattr(self, methodname) + self.handle_endtag(tag, method) + else: self.unknown_endtag(tag) return found = len(self.stack) @@ -472,11 +548,8 @@ class XMLParser: if found < len(self.stack) - 1: self.syntax_error('missing close tag for %s' % self.stack[-1]) tag = self.stack[-1] - try: - method = getattr(self, 'end_' + tag) - except AttributeError: - method = None - if method: + if hasattr(self, methodname): + method = getattr(self, methodname) self.handle_endtag(tag, method) else: self.unknown_endtag(tag) @@ -487,7 +560,7 @@ class XMLParser: pass # Overridable -- handle DOCTYPE - def handle_doctype(self, tag, data): + def handle_doctype(self, tag, pubid, syslit, data): pass # Overridable -- handle start tag @@ -514,7 +587,12 @@ class XMLParser: self.handle_data(chr(n)) # Definition of entities -- derived classes may override - entitydefs = {'lt': '<', 
'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"} + entitydefs = {'lt': '&#60;', # must use charref + 'gt': '&#62;', + 'amp': '&#38;', # must use charref + 'quot': '&#34;', + 'apos': '&#39;', + } # Example -- handle entity reference, no need to override def handle_entityref(self, name): @@ -541,10 +619,6 @@ class XMLParser: def handle_proc(self, name, data): pass - # Example -- handle special instructions, could be overridden - def handle_special(self, data): - pass - # Example -- handle relatively harmless syntax errors, could be overridden def syntax_error(self, message): raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message) @@ -566,10 +640,14 @@ class TestXMLParser(XMLParser): self.flush() print 'xml: encoding =',encoding,'standalone =',standalone - def handle_doctype(self, tag, data): + def handle_doctype(self, tag, pubid, syslit, data): self.flush() print 'DOCTYPE:',tag, `data` + def handle_entity(self, name, strval, pubid, syslit, ndata): + self.flush() + print 'ENTITY:',`data` + def handle_data(self, data): self.testdata = self.testdata + data if len(`self.testdata`) >= 70: @@ -589,10 +667,6 @@ class TestXMLParser(XMLParser): self.flush() print 'processing:',name,`data` - def handle_special(self, data): - self.flush() - print 'special:',`data` - def handle_comment(self, data): self.flush() r = `data` @@ -660,9 +734,13 @@ def test(args = None): f.close() x = klass() - for c in data: - x.feed(c) - x.close() + try: + for c in data: + x.feed(c) + x.close() + except RuntimeError, msg: + print msg + sys.exit(1) if __name__ == '__main__': |