summary refs log tree commit diff stats
path: root/Lib/xmllib.py
diff options
context:
space:
mode:
author Guido van Rossum <guido@python.org> 1998-04-03 16:02:39 (GMT)
committer Guido van Rossum <guido@python.org> 1998-04-03 16:02:39 (GMT)
commit 7e07b3845b27f1f9c30733431ebd4dccbae8b9d4 (patch)
tree c502f613fcbd3d0d495399ab3e5c70c1bd2b4e99 /Lib/xmllib.py
parent 0454b51282efd92564b5e338e8a364be3054bf10 (diff)
download cpython-7e07b3845b27f1f9c30733431ebd4dccbae8b9d4.zip
cpython-7e07b3845b27f1f9c30733431ebd4dccbae8b9d4.tar.gz
cpython-7e07b3845b27f1f9c30733431ebd4dccbae8b9d4.tar.bz2
Sjoerd's latest.
Diffstat (limited to 'Lib/xmllib.py')
-rw-r--r-- Lib/xmllib.py 312
1 files changed, 195 insertions, 117 deletions
diff --git a/Lib/xmllib.py b/Lib/xmllib.py
index 6d7f1d1..355714f 100644
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@@ -5,34 +5,50 @@ import re
import string
+version = '0.1'
+
# Regular expressions used for parsing
_S = '[ \t\r\n]+'
_opS = '[ \t\r\n]*'
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
-interesting = re.compile('[&<]')
-incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
- '<([a-zA-Z_:][^<>]*|'
- '/([a-zA-Z_:][^<>]*)?|'
- '![^<>]*|'
- r'\?[^<>]*)?')
-
-ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
+illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
+interesting = re.compile('[]&<]')
+
+amp = re.compile('&')
+ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
-space = re.compile(_S)
+space = re.compile(_S + '$')
newline = re.compile('\n')
starttagopen = re.compile('<' + _Name)
endtagopen = re.compile('</')
starttagend = re.compile(_opS + '(?P<slash>/?)>')
-endbracket = re.compile('>')
+endbracket = re.compile(_opS + '>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
-doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' + _S)
-special = re.compile('<!(?P<special>[^<>]*)>')
-procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S)
+# this matches one of the following:
+# SYSTEM SystemLiteral
+# PUBLIC PubidLiteral SystemLiteral
+_SystemLiteral = '(?P<%s>\'[^\']*\'|"[^"]*")'
+_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
+ "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
+_ExternalId = '(?:SYSTEM|' \
+ 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
+ ')'+_S+_SystemLiteral%'syslit'
+doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
+ '(?:'+_S+_ExternalId+')?'+_opS)
+xmldecl = re.compile('<\?xml'+_S+
+ 'version'+_opS+'='+_opS+'(?P<version>\'[^\']*\'|"[^"]*")'+
+ '(?:'+_S+'encoding'+_opS+'='+_opS+
+ "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
+ '"[A-Za-z][-A-Za-z0-9._]*"))?'
+ '(?:'+_S+'standalone'+_opS+'='+_opS+
+ '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
+ _opS+'\?>')
+procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
@@ -41,6 +57,7 @@ attrfind = re.compile(
_S + '(?P<name>' + _Name + ')'
'(' + _opS + '=' + _opS +
'(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
+attrtrans = string.maketrans(' \r\n\t', ' ')
# XML parser base class -- find tags and call handler functions.
@@ -92,30 +109,43 @@ class XMLParser:
self.goahead(1)
# Interface -- translate references
- def translate_references(self, data):
- newdata = []
+ def translate_references(self, data, all = 1):
i = 0
while 1:
- res = ref.search(data, i)
+ res = amp.search(data, i)
if res is None:
- newdata.append(data[i:])
- return string.join(newdata, '')
- if data[res.end(0) - 1] != ';':
+ return data
+ res = ref.match(data, res.start(0))
+ if res is None:
+ self.syntax_error("bogus `&'")
+ i =i+1
+ continue
+ i = res.end(0)
+ if data[i - 1] != ';':
self.syntax_error("`;' missing after entity/char reference")
- newdata.append(data[i:res.start(0)])
+ i = i-1
str = res.group(1)
+ pre = data[:res.start(0)]
+ post = data[i:]
if str[0] == '#':
if str[1] == 'x':
- newdata.append(chr(string.atoi(str[2:], 16)))
+ str = chr(string.atoi(str[2:], 16))
else:
- newdata.append(chr(string.atoi(str[1:])))
- else:
- try:
- newdata.append(self.entitydefs[str])
- except KeyError:
+ str = chr(string.atoi(str[1:]))
+ data = pre + str + post
+ i = res.start(0)+len(str)
+ elif all:
+ if self.entitydefs.has_key(str):
+ data = pre + self.entitydefs[str] + post
+ i = res.start(0) # rescan substituted text
+ else:
+ self.syntax_error('reference to unknown entity')
# can't do it, so keep the entity ref in
- newdata.append('&' + str + ';')
- i = res.end(0)
+ data = pre + '&' + str + ';' + post
+ i = res.start(0) + len(str) + 2
+ else:
+ # just translating character references
+ pass # i is already postioned correctly
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
@@ -139,8 +169,14 @@ class XMLParser:
else:
j = n
if i < j:
+ if self.__at_start:
+ self.syntax_error('illegal data at start of file')
self.__at_start = 0
data = rawdata[i:j]
+ if not self.stack and not space.match(data):
+ self.syntax_error('data not in content')
+ if illegal.search(data):
+ self.syntax_error('illegal character in content')
self.handle_data(data)
self.lineno = self.lineno + string.count(data, '\n')
i = j
@@ -184,6 +220,20 @@ class XMLParser:
self.lineno = self.lineno + string.count(rawdata[i:i], '\n')
i = k
continue
+ res = xmldecl.match(rawdata, i)
+ if res:
+ if not self.__at_start:
+ self.syntax_error("<?xml?> declaration not at start of document")
+ version, encoding, standalone = res.group('version',
+ 'encoding',
+ 'standalone')
+ if version[1:-1] != '1.0':
+ raise RuntimeError, 'only XML version 1.0 supported'
+ if encoding: encoding = encoding[1:-1]
+ if standalone: standalone = standalone[1:-1]
+ self.handle_xml(encoding, standalone)
+ i = res.end(0)
+ continue
res = procopen.match(rawdata, i)
if res:
k = self.parse_proc(i)
@@ -209,18 +259,6 @@ class XMLParser:
self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
i = k
continue
- res = special.match(rawdata, i)
- if res:
- if self.literal:
- data = rawdata[i]
- self.handle_data(data)
- self.lineno = self.lineno + string.count(data, '\n')
- i = i+1
- continue
- self.handle_special(res.group('special'))
- self.lineno = self.lineno + string.count(res.group(0), '\n')
- i = res.end(0)
- continue
elif rawdata[i] == '&':
res = charref.match(rawdata, i)
if res is not None:
@@ -228,6 +266,8 @@ class XMLParser:
if rawdata[i-1] != ';':
self.syntax_error("`;' missing in charref")
i = i-1
+ if not self.stack:
+ self.syntax_error('data not in content')
self.handle_charref(res.group('char')[:-1])
self.lineno = self.lineno + string.count(res.group(0), '\n')
continue
@@ -237,36 +277,45 @@ class XMLParser:
if rawdata[i-1] != ';':
self.syntax_error("`;' missing in entityref")
i = i-1
- self.handle_entityref(res.group('name'))
+ name = res.group('name')
+ if self.entitydefs.has_key(name):
+ self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
+ n = len(rawdata)
+ i = res.start(0)
+ else:
+ self.syntax_error('reference to unknown entity')
+ self.unknown_entityref(name)
self.lineno = self.lineno + string.count(res.group(0), '\n')
continue
+ elif rawdata[i] == ']':
+ if n-i < 3:
+ break
+ if cdataclose.match(rawdata, i):
+ self.syntax_error("bogus `]]>'")
+ self.handle_data(rawdata[i])
+ i = i+1
+ continue
else:
raise RuntimeError, 'neither < nor & ??'
# We get here only if incomplete matches but
# nothing else
- res = incomplete.match(rawdata, i)
- if not res:
- data = rawdata[i]
- self.handle_data(data)
- self.lineno = self.lineno + string.count(data, '\n')
- i = i+1
- continue
- j = res.end(0)
- if j == n:
- break # Really incomplete
- self.syntax_error("bogus `<' or `&'")
- data = res.group(0)
- self.handle_data(data)
- self.lineno = self.lineno + string.count(data, '\n')
- i = j
+ break
# end while
+ if i > 0:
+ self.__at_start = 0
if end and i < n:
- data = rawdata[i:n]
+ data = rawdata[i]
+ self.syntax_error("bogus `%s'" % data)
+ if illegal.search(data):
+ self.syntax_error('illegal character in content')
self.handle_data(data)
self.lineno = self.lineno + string.count(data, '\n')
- i = n
+ self.rawdata = rawdata[i+1:]
+ return self.goahead(end)
self.rawdata = rawdata[i:]
if end:
+ if not self.__seen_starttag:
+ self.syntax_error('no elements in file')
if self.stack:
self.syntax_error('missing end tags')
while self.stack:
@@ -280,9 +329,12 @@ class XMLParser:
res = commentclose.search(rawdata, i+4)
if not res:
return -1
- # doubledash search will succeed because it's a subset of commentclose
- if doubledash.search(rawdata, i+4).start(0) < res.start(0):
+ if doubledash.search(rawdata, i+4, res.start(0)):
self.syntax_error("`--' inside comment")
+ if rawdata[res.start(0)-1] == '-':
+ self.syntax_error('comment cannot end in three dashes')
+ if illegal.search(rawdata, i+4, res.start(0)):
+ self.syntax_error('illegal character in comment')
self.handle_comment(rawdata[i+4: res.start(0)])
return res.end(0)
@@ -291,28 +343,59 @@ class XMLParser:
rawdata = self.rawdata
n = len(rawdata)
name = res.group('name')
+ pubid, syslit = res.group('pubid', 'syslit')
+ if pubid is not None:
+ pubid = pubid[1:-1] # remove quotes
+ pubid = string.join(string.split(pubid)) # normalize
+ if syslit is not None: syslit = syslit[1:-1] # remove quotes
j = k = res.end(0)
- level = 0
- while k < n:
- c = rawdata[k]
- if c == '<':
- level = level + 1
- elif c == '>':
- if level == 0:
- self.handle_doctype(name, rawdata[j:k])
- return k+1
- level = level - 1
+ if k >= n:
+ return -1
+ if rawdata[k] == '[':
+ level = 0
k = k+1
- return -1
+ dq = sq = 0
+ while k < n:
+ c = rawdata[k]
+ if not sq and c == '"':
+ dq = not dq
+ elif not dq and c == "'":
+ sq = not sq
+ elif sq or dq:
+ pass
+ elif level <= 0 and c == ']':
+ res = endbracket.match(rawdata, k+1)
+ if not res:
+ return -1
+ self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
+ return res.end(0)
+ elif c == '<':
+ level = level + 1
+ elif c == '>':
+ level = level - 1
+ if level < 0:
+ self.syntax_error("bogus `>' in DOCTYPE")
+ k = k+1
+ res = endbracket.search(rawdata, k)
+ if not res:
+ return -1
+ if res.start(0) != k:
+ self.syntax_error('garbage in DOCTYPE')
+ self.handle_doctype(name, pubid, syslit, None)
+ return res.end(0)
# Internal -- handle CDATA tag, return length or -1 if not terminated
def parse_cdata(self, i):
rawdata = self.rawdata
if rawdata[i:i+9] <> '<![CDATA[':
- raise RuntimeError, 'unexpected call to handle_cdata'
+ raise RuntimeError, 'unexpected call to parse_cdata'
res = cdataclose.search(rawdata, i+9)
if not res:
return -1
+ if illegal.search(rawdata, i+9, res.start(0)):
+ self.syntax_error('illegal character in CDATA')
+ if not self.stack:
+ self.syntax_error('CDATA not in content')
self.handle_cdata(rawdata[i+9:res.start(0)])
return res.end(0)
@@ -324,24 +407,15 @@ class XMLParser:
if not end:
return -1
j = end.start(0)
+ if illegal.search(rawdata, i+2, j):
+ self.syntax_error('illegal character in processing instruction')
res = tagfind.match(rawdata, i+2)
if not res:
raise RuntimeError, 'unexpected call to parse_proc'
k = res.end(0)
name = res.group(0)
- if name == 'xml':
- if self.__at_start:
- attrdict, k = self.parse_attributes('xml', k, j,
- self.__xml_attributes)
- if k != j:
- self.syntax_error('garbage at end of <?xml?>')
- if attrdict['version'] != '1.0':
- self.syntax_error('only XML version 1.0 supported')
- self.handle_xml(attrdict.get('encoding', None),
- attrdict['standalone'])
- return end.end(0)
- else:
- self.syntax_error("<?xml?> tag not at start of document")
+ if string.find(string.lower(name), 'xml') >= 0:
+ self.syntax_error('illegal processing instruction target name')
self.handle_proc(name, rawdata[k:j])
return end.end(0)
@@ -375,6 +449,7 @@ class XMLParser:
(attrname, tag))
if attrdict.has_key(attrname):
self.syntax_error('attribute specified twice')
+ attrvalue = string.translate(attrvalue, attrtrans)
attrdict[attrname] = self.translate_references(attrvalue)
k = res.end(0)
if attributes is not None:
@@ -400,6 +475,8 @@ class XMLParser:
if not self.__seen_starttag and self.__seen_doctype:
if tag != self.__seen_doctype:
self.syntax_error('starttag does not match DOCTYPE')
+ if self.__seen_starttag and not self.stack:
+ self.syntax_error('multiple elements on top level')
if hasattr(self, tag + '_attributes'):
attributes = getattr(self, tag + '_attributes')
else:
@@ -428,10 +505,7 @@ class XMLParser:
tag = res.group(0)
k = res.end(0)
if k != end.start(0):
- # check that there is only white space at end of tag
- res = space.match(rawdata, k)
- if res is None or res.end(0) != end.start(0):
- self.syntax_error('garbage in end tag')
+ self.syntax_error('garbage in end tag')
self.finish_endtag(tag)
return end.end(0)
@@ -439,17 +513,18 @@ class XMLParser:
# Return -1 for unknown tag, 1 for balanced tag
def finish_starttag(self, tag, attrs):
self.stack.append(tag)
- try:
- method = getattr(self, 'start_' + tag)
- except AttributeError:
- self.unknown_starttag(tag, attrs)
- return -1
- else:
+ methodname = 'start_' + tag
+ if hasattr(self, methodname):
+ method = getattr(self, methodname)
self.handle_starttag(tag, method, attrs)
return 1
+ else:
+ self.unknown_starttag(tag, attrs)
+ return -1
# Internal -- finish processing of end tag
def finish_endtag(self, tag):
+ methodname = 'end_' + tag
if not tag:
self.syntax_error('name-less end tag')
found = len(self.stack) - 1
@@ -459,9 +534,10 @@ class XMLParser:
else:
if tag not in self.stack:
self.syntax_error('unopened end tag')
- try:
- method = getattr(self, 'end_' + tag)
- except AttributeError:
+ if hasattr(self, methodname):
+ method = getattr(self, methodname)
+ self.handle_endtag(tag, method)
+ else:
self.unknown_endtag(tag)
return
found = len(self.stack)
@@ -472,11 +548,8 @@ class XMLParser:
if found < len(self.stack) - 1:
self.syntax_error('missing close tag for %s' % self.stack[-1])
tag = self.stack[-1]
- try:
- method = getattr(self, 'end_' + tag)
- except AttributeError:
- method = None
- if method:
+ if hasattr(self, methodname):
+ method = getattr(self, methodname)
self.handle_endtag(tag, method)
else:
self.unknown_endtag(tag)
@@ -487,7 +560,7 @@ class XMLParser:
pass
# Overridable -- handle DOCTYPE
- def handle_doctype(self, tag, data):
+ def handle_doctype(self, tag, pubid, syslit, data):
pass
# Overridable -- handle start tag
@@ -514,7 +587,12 @@ class XMLParser:
self.handle_data(chr(n))
# Definition of entities -- derived classes may override
- entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}
+ entitydefs = {'lt': '&#60;', # must use charref
+ 'gt': '&#62;',
+ 'amp': '&#38;', # must use charref
+ 'quot': '&#34;',
+ 'apos': '&#39;',
+ }
# Example -- handle entity reference, no need to override
def handle_entityref(self, name):
@@ -541,10 +619,6 @@ class XMLParser:
def handle_proc(self, name, data):
pass
- # Example -- handle special instructions, could be overridden
- def handle_special(self, data):
- pass
-
# Example -- handle relatively harmless syntax errors, could be overridden
def syntax_error(self, message):
raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
@@ -566,10 +640,14 @@ class TestXMLParser(XMLParser):
self.flush()
print 'xml: encoding =',encoding,'standalone =',standalone
- def handle_doctype(self, tag, data):
+ def handle_doctype(self, tag, pubid, syslit, data):
self.flush()
print 'DOCTYPE:',tag, `data`
+ def handle_entity(self, name, strval, pubid, syslit, ndata):
+ self.flush()
+ print 'ENTITY:',`data`
+
def handle_data(self, data):
self.testdata = self.testdata + data
if len(`self.testdata`) >= 70:
@@ -589,10 +667,6 @@ class TestXMLParser(XMLParser):
self.flush()
print 'processing:',name,`data`
- def handle_special(self, data):
- self.flush()
- print 'special:',`data`
-
def handle_comment(self, data):
self.flush()
r = `data`
@@ -660,9 +734,13 @@ def test(args = None):
f.close()
x = klass()
- for c in data:
- x.feed(c)
- x.close()
+ try:
+ for c in data:
+ x.feed(c)
+ x.close()
+ except RuntimeError, msg:
+ print msg
+ sys.exit(1)
if __name__ == '__main__':