diff options
author | Fred Drake <fdrake@acm.org> | 2001-07-16 18:30:35 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 2001-07-16 18:30:35 (GMT) |
commit | fb38c76e0f15e15d08e4635a24719cc120809191 (patch) | |
tree | 84f02d0e0bf37352e792425f82d6aed4b2c614ca | |
parent | e16c7aee4bc2a8851b9a9bae60a00c2544722f67 (diff) | |
download | cpython-fb38c76e0f15e15d08e4635a24719cc120809191.zip cpython-fb38c76e0f15e15d08e4635a24719cc120809191.tar.gz cpython-fb38c76e0f15e15d08e4635a24719cc120809191.tar.bz2 |
In CDATA mode, make sure entity-reference syntax is not interpreted;
entity references are not allowed in that mode.
Do a better job of scanning <!DOCTYPE ...> declarations; the new scanning
code is based on the code in HTMLParser.py.
-rw-r--r-- | Lib/sgmllib.py | 34 |
1 file changed, 26 insertions, 8 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 5ff9f70..3422980 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -5,7 +5,8 @@ # XXX There should be a way to distinguish between PCDATA (parsed # character data -- the normal case), RCDATA (replaceable character # data -- only char and entity references and end tags are special) -# and CDATA (character data -- only end tags are special). +# and CDATA (character data -- only end tags are special). RCDATA is +# not supported at all. import re @@ -34,6 +35,9 @@ endbracket = re.compile('[<>]') special = re.compile('<![^<>]*>') commentopen = re.compile('<!--') commentclose = re.compile(r'--\s*>') +declopen = re.compile('<!') +declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*') +declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' @@ -160,6 +164,10 @@ class SGMLParser: i = k continue elif rawdata[i] == '&': + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue match = charref.match(rawdata, i) if match: name = match.group(1) @@ -210,11 +218,20 @@ class SGMLParser: # Internal -- parse declaration. def parse_declaration(self, i): + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). rawdata = self.rawdata j = i + 2 + assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" + if rawdata[j:j+1] in ("-", ""): + # Start of comment followed by buffer boundary, + # or just a buffer boundary. + return -1 + # in practice, this should look like: ((name|stringlit) S*)+ '>' n = len(rawdata) while j < n: - c = rawdata[j:j+1] + c = rawdata[j] if c == ">": # end of declaration syntax self.handle_decl(rawdata[i+2:j]) @@ -222,15 +239,16 @@ class SGMLParser: if c in "\"'": m = declstringlit.match(rawdata, j) if not m: - # incomplete or an error? 
- return -1 + return -1 # incomplete j = m.end() - else: - m = decldata.match(rawdata, j) + elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": + m = declname.match(rawdata, j) if not m: - # incomplete or an error? - return -1 + return -1 # incomplete j = m.end() + else: + raise SGMLParseError( + "unexpected char in declaration: %s" % `rawdata[j]`) # end of buffer between tokens return -1 |