summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/markupbase.py 71
-rw-r--r-- Lib/sgmllib.py 22
-rw-r--r-- Lib/test/test_htmllib.py 27
-rw-r--r-- Misc/NEWS 3
4 files changed, 107 insertions, 16 deletions
diff --git a/Lib/markupbase.py b/Lib/markupbase.py
index acd0726..f97cf10 100644
--- a/Lib/markupbase.py
+++ b/Lib/markupbase.py
@@ -4,6 +4,13 @@ import re
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
+_commentclose = re.compile(r'--\s*>')
+_markedsectionclose = re.compile(r']\s*]\s*>')
+
+# An analysis of the MS-Word extensions is available at
+# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
+
+_msmarkedsectionclose = re.compile(r']\s*>')
del re
@@ -53,6 +60,13 @@ class ParserBase:
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
+ # ISO 8879:1986, however, has more complex
+ # declaration syntax for elements in <!...>, including:
+ # --comment--
+ # [marked section]
+ # name in the following list: ENTITY, DOCTYPE, ELEMENT,
+ # ATTLIST, NOTATION, SHORTREF, USEMAP,
+ # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
rawdata = self.rawdata
j = i + 2
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
@@ -60,9 +74,19 @@ class ParserBase:
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
return -1
- # in practice, this should look like: ((name|stringlit) S*)+ '>'
+ # A simple, practical version could look like: ((name|stringlit) S*) + '>'
n = len(rawdata)
- decltype, j = self._scan_name(j, i)
+ if rawdata[j:j+1] == '--': #comment
+ # Locate --.*-- as the body of the comment
+ return self.parse_comment(i)
+ elif rawdata[j] == '[': #marked section
+ # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
+ # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
+ # Note that this is extended by Microsoft Office "Save as Web" function
+ # to include [if...] and [endif].
+ return self.parse_marked_section(i)
+ else: #all other declaration elements
+ decltype, j = self._scan_name(j, i)
if j < 0:
return j
if decltype == "doctype":
@@ -87,8 +111,15 @@ class ParserBase:
elif c in self._decl_otherchars:
j = j + 1
elif c == "[":
+ # this could be handled in a separate doctype parser
if decltype == "doctype":
j = self._parse_doctype_subset(j + 1, i)
+ elif decltype in ("attlist", "linktype", "link", "element"):
+ # must tolerate []'d groups in a content model in an element declaration
+ # also in data attribute specifications of attlist declaration
+ # also link type declaration subsets in linktype declarations
+ # also link attribute specification lists in link declarations
+ self.error("unsupported '[' char in %s declaration" % decltype)
else:
self.error("unexpected '[' char in declaration")
else:
@@ -98,6 +129,42 @@ class ParserBase:
return j
return -1 # incomplete
+    # Internal -- parse a marked section
+    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
+    #
+    # i is the index of the opening "<![" in self.rawdata.  Returns the
+    # index just past the closing delimiter on success.  Returns the
+    # negative value reported by _scan_name() when the status keyword
+    # cannot be scanned, and -1 when no closing delimiter is found.
+    # When report is true, the text between "<![" and the closing
+    # delimiter is passed to self.unknown_decl().
+    def parse_marked_section(self, i, report=1):
+        rawdata = self.rawdata
+        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
+        sectName, j = self._scan_name(i+3, i)
+        if j < 0:
+            return j
+        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
+            # look for standard ]]> ending
+            match = _markedsectionclose.search(rawdata, i+3)
+        elif sectName in ("if", "else", "endif"):
+            # look for MS Office ]> ending
+            match = _msmarkedsectionclose.search(rawdata, i+3)
+        else:
+            self.error('unknown status keyword %s in marked section' % repr(rawdata[i+3:j]))
+            # NOTE(review): error() is defined outside this view and
+            # presumably raises.  If an override returns instead, treat the
+            # section as unterminated rather than reading an unbound local.
+            match = None
+        if not match:
+            return -1
+        if report:
+            j = match.start(0)
+            self.unknown_decl(rawdata[i+3: j])
+        return match.end(0)
+
+    # Internal -- parse the comment whose "<!--" opener starts at index i.
+    # Returns the index just past the closing "-->" (whitespace is allowed
+    # between "--" and ">"), or -1 if the comment is not terminated in the
+    # current buffer.  When report is true, the comment text is passed to
+    # self.handle_comment().
+    def parse_comment(self, i, report=1):
+        rawdata = self.rawdata
+        body_start = i + 4
+        if rawdata[i:body_start] != '<!--':
+            self.error('unexpected call to parse_comment()')
+        closing = _commentclose.search(rawdata, body_start)
+        if not closing:
+            return -1
+        if report:
+            self.handle_comment(rawdata[body_start:closing.start(0)])
+        return closing.end(0)
+
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
# returning the index just past any whitespace following the trailing ']'.
def _parse_doctype_subset(self, i, declstartpos):
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index a4f0a8b..b259328 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -30,7 +30,6 @@ shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
-commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -145,6 +144,10 @@ class SGMLParser(markupbase.ParserBase):
break
continue
if rawdata.startswith("<!--", i):
+ # Strictly speaking, a comment is --.*--
+ # within a declaration tag <!...>.
+ # This should be removed,
+ # and comments handled only in parse_declaration.
k = self.parse_comment(i)
if k < 0: break
i = k
@@ -202,19 +205,6 @@ class SGMLParser(markupbase.ParserBase):
self.rawdata = rawdata[i:]
# XXX if end: check for empty stack
- # Internal -- parse comment, return length or -1 if not terminated
- def parse_comment(self, i, report=1):
- rawdata = self.rawdata
- if rawdata[i:i+4] != '<!--':
- self.error('unexpected call to parse_comment()')
- match = commentclose.search(rawdata, i+4)
- if not match:
- return -1
- if report:
- j = match.start(0)
- self.handle_comment(rawdata[i+4: j])
- return match.end(0)
-
# Extensions for the DOCTYPE scanner:
_decl_otherchars = '='
@@ -471,6 +461,10 @@ class TestSGMLParser(SGMLParser):
self.flush()
print '*** unknown char ref: &#' + ref + ';'
+ # Test hook: call self.flush(), then print the text of a declaration
+ # reported through unknown_decl(), wrapped in square brackets.
+ def unknown_decl(self, data):
+ self.flush()
+ print '*** unknown decl: [' + data + ']'
+
def close(self):
SGMLParser.close(self)
self.flush()
diff --git a/Lib/test/test_htmllib.py b/Lib/test/test_htmllib.py
index e283d11..a20f43b 100644
--- a/Lib/test/test_htmllib.py
+++ b/Lib/test/test_htmllib.py
@@ -16,6 +16,17 @@ class AnchorCollector(htmllib.HTMLParser):
def anchor_bgn(self, *args):
self.__anchors.append(args)
+# HTMLParser subclass that records every value passed to unknown_decl(),
+# so a test can inspect which declarations the parser reported.
+class DeclCollector(htmllib.HTMLParser):
+ def __init__(self, *args, **kw):
+ # Create the collection list before delegating to the base initializer.
+ self.__decls = []
+ htmllib.HTMLParser.__init__(self, *args, **kw)
+
+ # Return the list of collected declarations, in the order received.
+ def get_decl_info(self):
+ return self.__decls
+
+ # Append one reported declaration to the collection.
+ def unknown_decl(self, data):
+ self.__decls.append(data)
+
class HTMLParserTestCase(unittest.TestCase):
def test_anchor_collection(self):
@@ -33,6 +44,22 @@ class HTMLParserTestCase(unittest.TestCase):
('', 'frob', ''),
])
+ def test_decl_collection(self):
+ # See SF patch #545300
+ # Feed markup containing MS Office style <![if ...]> and <![endif]>
+ # sections, then check that unknown_decl() received the text of each
+ # section without the "<![" opener and the closing "]>".
+ parser = DeclCollector(formatter.NullFormatter(), verbose=1)
+ parser.feed(
+ """<html>
+ <body>
+ hallo
+ <![if !supportEmptyParas]>&nbsp;<![endif]>
+ </body>
+ </html>
+ """)
+ parser.close()
+ self.assertEquals(parser.get_decl_info(),
+ ["if !supportEmptyParas",
+ "endif"
+ ])
def test_main():
test_support.run_unittest(HTMLParserTestCase)
diff --git a/Misc/NEWS b/Misc/NEWS
index 3a1c875..52c638f 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -67,6 +67,9 @@ Extension modules
Library
-------
+- sgmllib now supports SGML marked sections, in particular the
+ MS Office extensions.
+
- The urllib module now offers support for the iterator protocol.
SF patch 698520 contributed by Brett Cannon.