summary | refs | log | tree | commit | diff | stats
path: root/Lib
diff options
context:
space:
mode:
author: Fred Drake <fdrake@acm.org> 2001-07-16 18:30:35 (GMT)
committer: Fred Drake <fdrake@acm.org> 2001-07-16 18:30:35 (GMT)
commit: fb38c76e0f15e15d08e4635a24719cc120809191 (patch)
tree: 84f02d0e0bf37352e792425f82d6aed4b2c614ca /Lib
parent: e16c7aee4bc2a8851b9a9bae60a00c2544722f67 (diff)
download: cpython-fb38c76e0f15e15d08e4635a24719cc120809191.zip
cpython-fb38c76e0f15e15d08e4635a24719cc120809191.tar.gz
cpython-fb38c76e0f15e15d08e4635a24719cc120809191.tar.bz2
In CDATA mode, make sure entity-reference syntax is not interpreted;
entity references are not allowed in that mode.

Do a better job of scanning <!DOCTYPE ...> declarations; based on the code in HTMLParser.py.
Diffstat (limited to 'Lib')
-rw-r--r-- Lib/sgmllib.py 34
1 files changed, 26 insertions, 8 deletions
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 5ff9f70..3422980 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -5,7 +5,8 @@
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
-# and CDATA (character data -- only end tags are special).
+# and CDATA (character data -- only end tags are special). RCDATA is
+# not supported at all.
import re
@@ -34,6 +35,9 @@ endbracket = re.compile('[<>]')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
+declopen = re.compile('<!')
+declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
+declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -160,6 +164,10 @@ class SGMLParser:
i = k
continue
elif rawdata[i] == '&':
+ if self.literal:
+ self.handle_data(rawdata[i])
+ i = i+1
+ continue
match = charref.match(rawdata, i)
if match:
name = match.group(1)
@@ -210,11 +218,20 @@ class SGMLParser:
# Internal -- parse declaration.
def parse_declaration(self, i):
+ # This is some sort of declaration; in "HTML as
+ # deployed," this should only be the document type
+ # declaration ("<!DOCTYPE html...>").
rawdata = self.rawdata
j = i + 2
+ assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
+ if rawdata[j:j+1] in ("-", ""):
+ # Start of comment followed by buffer boundary,
+ # or just a buffer boundary.
+ return -1
+ # in practice, this should look like: ((name|stringlit) S*)+ '>'
n = len(rawdata)
while j < n:
- c = rawdata[j:j+1]
+ c = rawdata[j]
if c == ">":
# end of declaration syntax
self.handle_decl(rawdata[i+2:j])
@@ -222,15 +239,16 @@ class SGMLParser:
if c in "\"'":
m = declstringlit.match(rawdata, j)
if not m:
- # incomplete or an error?
- return -1
+ return -1 # incomplete
j = m.end()
- else:
- m = decldata.match(rawdata, j)
+ elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
+ m = declname.match(rawdata, j)
if not m:
- # incomplete or an error?
- return -1
+ return -1 # incomplete
j = m.end()
+ else:
+ raise SGMLParseError(
+ "unexpected char in declaration: %s" % `rawdata[j]`)
# end of buffer between tokens
return -1