diff options
-rw-r--r-- | Lib/markupbase.py | 71
-rw-r--r-- | Lib/sgmllib.py | 22
-rw-r--r-- | Lib/test/test_htmllib.py | 27
-rw-r--r-- | Misc/NEWS | 3
4 files changed, 107 insertions, 16 deletions
diff --git a/Lib/markupbase.py b/Lib/markupbase.py index acd0726..f97cf10 100644 --- a/Lib/markupbase.py +++ b/Lib/markupbase.py @@ -4,6 +4,13 @@ import re _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match +_commentclose = re.compile(r'--\s*>') +_markedsectionclose = re.compile(r']\s*]\s*>') + +# An analysis of the MS-Word extensions is available at +# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf + +_msmarkedsectionclose = re.compile(r']\s*>') del re @@ -53,6 +60,13 @@ class ParserBase: # This is some sort of declaration; in "HTML as # deployed," this should only be the document type # declaration ("<!DOCTYPE html...>"). + # ISO 8879:1986, however, has more complex + # declaration syntax for elements in <!...>, including: + # --comment-- + # [marked section] + # name in the following list: ENTITY, DOCTYPE, ELEMENT, + # ATTLIST, NOTATION, SHORTREF, USEMAP, + # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM rawdata = self.rawdata j = i + 2 assert rawdata[i:j] == "<!", "unexpected call to parse_declaration" @@ -60,9 +74,19 @@ class ParserBase: # Start of comment followed by buffer boundary, # or just a buffer boundary. return -1 - # in practice, this should look like: ((name|stringlit) S*)+ '>' + # A simple, practical version could look like: ((name|stringlit) S*) + '>' n = len(rawdata) - decltype, j = self._scan_name(j, i) + if rawdata[j:j+1] == '--': #comment + # Locate --.*-- as the body of the comment + return self.parse_comment(i) + elif rawdata[j] == '[': #marked section + # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section + # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA + # Note that this is extended by Microsoft Office "Save as Web" function + # to include [if...] and [endif]. 
+ return self.parse_marked_section(i) + else: #all other declaration elements + decltype, j = self._scan_name(j, i) if j < 0: return j if decltype == "doctype": @@ -87,8 +111,15 @@ class ParserBase: elif c in self._decl_otherchars: j = j + 1 elif c == "[": + # this could be handled in a separate doctype parser if decltype == "doctype": j = self._parse_doctype_subset(j + 1, i) + elif decltype in ("attlist", "linktype", "link", "element"): + # must tolerate []'d groups in a content model in an element declaration + # also in data attribute specifications of attlist declaration + # also link type declaration subsets in linktype declarations + # also link attribute specification lists in link declarations + self.error("unsupported '[' char in %s declaration" % decltype) else: self.error("unexpected '[' char in declaration") else: @@ -98,6 +129,42 @@ class ParserBase: return j return -1 # incomplete + # Internal -- parse a marked section + # Override this to handle MS-word extension syntax <![if word]>content<![endif]> + def parse_marked_section( self, i, report=1 ): + rawdata= self.rawdata + assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()" + sectName, j = self._scan_name( i+3, i ) + if j < 0: + return j + if sectName in ("temp", "cdata", "ignore", "include", "rcdata"): + # look for standard ]]> ending + match= _markedsectionclose.search(rawdata, i+3) + elif sectName in ("if", "else", "endif"): + # look for MS Office ]> ending + match= _msmarkedsectionclose.search(rawdata, i+3) + else: + self.error('unknown status keyword %s in marked section' % `rawdata[i+3:j]`) + if not match: + return -1 + if report: + j = match.start(0) + self.unknown_decl(rawdata[i+3: j]) + return match.end(0) + + # Internal -- parse comment, return length or -1 if not terminated + def parse_comment(self, i, report=1): + rawdata = self.rawdata + if rawdata[i:i+4] != '<!--': + self.error('unexpected call to parse_comment()') + match = _commentclose.search(rawdata, i+4) + 
if not match: + return -1 + if report: + j = match.start(0) + self.handle_comment(rawdata[i+4: j]) + return match.end(0) + # Internal -- scan past the internal subset in a <!DOCTYPE declaration, # returning the index just past any whitespace following the trailing ']'. def _parse_doctype_subset(self, i, declstartpos): diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index a4f0a8b..b259328 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -30,7 +30,6 @@ shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piclose = re.compile('>') endbracket = re.compile('[<>]') -commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' @@ -145,6 +144,10 @@ class SGMLParser(markupbase.ParserBase): break continue if rawdata.startswith("<!--", i): + # Strictly speaking, a comment is --.*-- + # within a declaration tag <!...>. + # This should be removed, + # and comments handled only in parse_declaration. 
k = self.parse_comment(i) if k < 0: break i = k @@ -202,19 +205,6 @@ class SGMLParser(markupbase.ParserBase): self.rawdata = rawdata[i:] # XXX if end: check for empty stack - # Internal -- parse comment, return length or -1 if not terminated - def parse_comment(self, i, report=1): - rawdata = self.rawdata - if rawdata[i:i+4] != '<!--': - self.error('unexpected call to parse_comment()') - match = commentclose.search(rawdata, i+4) - if not match: - return -1 - if report: - j = match.start(0) - self.handle_comment(rawdata[i+4: j]) - return match.end(0) - # Extensions for the DOCTYPE scanner: _decl_otherchars = '=' @@ -471,6 +461,10 @@ class TestSGMLParser(SGMLParser): self.flush() print '*** unknown char ref: &#' + ref + ';' + def unknown_decl(self, data): + self.flush() + print '*** unknown decl: [' + data + ']' + def close(self): SGMLParser.close(self) self.flush() diff --git a/Lib/test/test_htmllib.py b/Lib/test/test_htmllib.py index e283d11..a20f43b 100644 --- a/Lib/test/test_htmllib.py +++ b/Lib/test/test_htmllib.py @@ -16,6 +16,17 @@ class AnchorCollector(htmllib.HTMLParser): def anchor_bgn(self, *args): self.__anchors.append(args) +class DeclCollector(htmllib.HTMLParser): + def __init__(self, *args, **kw): + self.__decls = [] + htmllib.HTMLParser.__init__(self, *args, **kw) + + def get_decl_info(self): + return self.__decls + + def unknown_decl(self, data): + self.__decls.append(data) + class HTMLParserTestCase(unittest.TestCase): def test_anchor_collection(self): @@ -33,6 +44,22 @@ class HTMLParserTestCase(unittest.TestCase): ('', 'frob', ''), ]) + def test_decl_collection(self): + # See SF patch #545300 + parser = DeclCollector(formatter.NullFormatter(), verbose=1) + parser.feed( + """<html> + <body> + hallo + <![if !supportEmptyParas]> <![endif]> + </body> + </html> + """) + parser.close() + self.assertEquals(parser.get_decl_info(), + ["if !supportEmptyParas", + "endif" + ]) def test_main(): test_support.run_unittest(HTMLParserTestCase) @@ -67,6 +67,9 @@ 
Extension modules Library ------- +- sgmllib now supports SGML marked sections, in particular the + MS Office extensions. + - The urllib module now offers support for the iterator protocol. SF patch 698520 contributed by Brett Cannon. |