Patch #545300: Support marked sections.

author: Martin v. Löwis <martin@v.loewis.de> 2003-03-30 14:25:40 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2003-03-30 14:25:40 (GMT)
commit: 3163a3b4b2a820ef38e6c7282033fe2db9d43ebe (patch)
tree: e83e06cbe049f06610d6c5e025e813e644fc9b01
parent: a965649386dbd385a92b2b93934abaff80c94198 (diff)
download: cpython-3163a3b4b2a820ef38e6c7282033fe2db9d43ebe.zip
cpython-3163a3b4b2a820ef38e6c7282033fe2db9d43ebe.tar.gz
cpython-3163a3b4b2a820ef38e6c7282033fe2db9d43ebe.tar.bz2
4 files changed, 107 insertions, 16 deletions
diff --git a/Lib/markupbase.py b/Lib/markupbase.py
index acd0726..f97cf10 100644
--- a/Lib/markupbase.py
+++ b/Lib/markupbase.py
@@ -4,6 +4,13 @@ import re
 
 _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
 _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
+_commentclose = re.compile(r'--\s*>')
+_markedsectionclose = re.compile(r']\s*]\s*>')
+
+# An analysis of the MS-Word extensions is available at
+# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
+
+_msmarkedsectionclose = re.compile(r']\s*>')
 
 del re
 
@@ -53,6 +60,13 @@ class ParserBase:
         # This is some sort of declaration; in "HTML as
         # deployed," this should only be the document type
         # declaration ("<!DOCTYPE html...>").
+        # ISO 8879:1986, however, has more complex 
+        # declaration syntax for elements in <!...>, including:
+        # --comment--
+        # [marked section]
+        # name in the following list: ENTITY, DOCTYPE, ELEMENT, 
+        # ATTLIST, NOTATION, SHORTREF, USEMAP, 
+        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
         rawdata = self.rawdata
         j = i + 2
         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
@@ -60,9 +74,19 @@ class ParserBase:
             # Start of comment followed by buffer boundary,
             # or just a buffer boundary.
             return -1
-        # in practice, this should look like: ((name|stringlit) S*)+ '>'
+        # A simple, practical version could look like: ((name|stringlit) S*)+ '>'
         n = len(rawdata)
-        decltype, j = self._scan_name(j, i)
+        if rawdata[j:j+1] == '--': #comment
+            # Locate --.*-- as the body of the comment (NOTE(review): the 1-char slice tested above can never equal '--'; likely meant rawdata[j:j+2])
+            return self.parse_comment(i)
+        elif rawdata[j] == '[': #marked section
+            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
+            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
+            # Note that this is extended by Microsoft Office "Save as Web" function
+            # to include [if...] and [endif].
+            return self.parse_marked_section(i)
+        else: #all other declaration elements
+            decltype, j = self._scan_name(j, i)
         if j < 0:
             return j
         if decltype == "doctype":
@@ -87,8 +111,15 @@ class ParserBase:
             elif c in self._decl_otherchars:
                 j = j + 1
             elif c == "[":
+                # this could be handled in a separate doctype parser
                 if decltype == "doctype":
                     j = self._parse_doctype_subset(j + 1, i)
+                elif decltype in ("attlist", "linktype", "link", "element"):
+                    # must tolerate []'d groups in a content model in an element declaration
+                    # also in data attribute specifications of attlist declaration
+                    # also link type declaration subsets in linktype declarations
+                    # also link attribute specification lists in link declarations
+                    self.error("unsupported '[' char in %s declaration" % decltype)
                 else:
                     self.error("unexpected '[' char in declaration")
             else:
@@ -98,6 +129,42 @@ class ParserBase:
                 return j
         return -1 # incomplete
 
+    # Internal -- parse a marked section
+    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
+    def parse_marked_section( self, i, report=1 ):
+        rawdata= self.rawdata
+        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
+        sectName, j = self._scan_name( i+3, i )
+        if j < 0:
+            return j
+        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
+            # look for standard ]]> ending
+            match= _markedsectionclose.search(rawdata, i+3)
+        elif sectName in ("if", "else", "endif"):
+            # look for MS Office ]> ending
+            match= _msmarkedsectionclose.search(rawdata, i+3)
+        else:
+            self.error('unknown status keyword %s in marked section' % `rawdata[i+3:j]`)
+        if not match:
+            return -1
+        if report:
+            j = match.start(0)
+            self.unknown_decl(rawdata[i+3: j])
+        return match.end(0)
+            
+    # Internal -- parse comment, return index just past the comment or -1 if not terminated
+    def parse_comment(self, i, report=1):
+        rawdata = self.rawdata
+        if rawdata[i:i+4] != '<!--':
+            self.error('unexpected call to parse_comment()')
+        match = _commentclose.search(rawdata, i+4)
+        if not match:
+            return -1
+        if report:
+            j = match.start(0)
+            self.handle_comment(rawdata[i+4: j])
+        return match.end(0)
+
     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
     # returning the index just past any whitespace following the trailing ']'.
     def _parse_doctype_subset(self, i, declstartpos):
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index a4f0a8b..b259328 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -30,7 +30,6 @@ shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 piclose = re.compile('>')
 endbracket = re.compile('[<>]')
-commentclose = re.compile(r'--\s*>')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -145,6 +144,10 @@ class SGMLParser(markupbase.ParserBase):
                         break
                     continue
                 if rawdata.startswith("<!--", i):
+                    # Strictly speaking, a comment is --.*--
+                    # within a declaration tag <!...>.
+                    # This should be removed,
+                    # and comments handled only in parse_declaration.
                     k = self.parse_comment(i)
                     if k < 0: break
                     i = k
@@ -202,19 +205,6 @@ class SGMLParser(markupbase.ParserBase):
         self.rawdata = rawdata[i:]
         # XXX if end: check for empty stack
 
-    # Internal -- parse comment, return length or -1 if not terminated
-    def parse_comment(self, i, report=1):
-        rawdata = self.rawdata
-        if rawdata[i:i+4] != '<!--':
-            self.error('unexpected call to parse_comment()')
-        match = commentclose.search(rawdata, i+4)
-        if not match:
-            return -1
-        if report:
-            j = match.start(0)
-            self.handle_comment(rawdata[i+4: j])
-        return match.end(0)
-
     # Extensions for the DOCTYPE scanner:
     _decl_otherchars = '='
 
@@ -471,6 +461,10 @@ class TestSGMLParser(SGMLParser):
         self.flush()
         print '*** unknown char ref: &#' + ref + ';'
 
+    def unknown_decl(self, data):
+        self.flush()
+        print '*** unknown decl: [' + data + ']'
+
     def close(self):
         SGMLParser.close(self)
         self.flush()
diff --git a/Lib/test/test_htmllib.py b/Lib/test/test_htmllib.py
index e283d11..a20f43b 100644
--- a/Lib/test/test_htmllib.py
+++ b/Lib/test/test_htmllib.py
@@ -16,6 +16,17 @@ class AnchorCollector(htmllib.HTMLParser):
     def anchor_bgn(self, *args):
         self.__anchors.append(args)
 
+class DeclCollector(htmllib.HTMLParser):
+    def __init__(self, *args, **kw):
+        self.__decls = []
+        htmllib.HTMLParser.__init__(self, *args, **kw)
+
+    def get_decl_info(self):
+        return self.__decls
+
+    def unknown_decl(self, data):
+        self.__decls.append(data)
+
 
 class HTMLParserTestCase(unittest.TestCase):
     def test_anchor_collection(self):
@@ -33,6 +44,22 @@ class HTMLParserTestCase(unittest.TestCase):
                            ('', 'frob', ''),
                            ])
 
+    def test_decl_collection(self):
+        # See SF patch #545300
+        parser = DeclCollector(formatter.NullFormatter(), verbose=1)
+        parser.feed(
+            """<html>
+            <body>
+            hallo
+            <![if !supportEmptyParas]>&nbsp;<![endif]>
+            </body>
+            </html>
+            """)
+        parser.close()
+        self.assertEquals(parser.get_decl_info(),
+                          ["if !supportEmptyParas",
+                           "endif"
+                           ])
 
 def test_main():
     test_support.run_unittest(HTMLParserTestCase)
diff --git a/Misc/NEWS b/Misc/NEWS
index 3a1c875..52c638f 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -67,6 +67,9 @@ Extension modules
 Library
 -------
 
+- sgmllib now supports SGML marked sections, in particular the 
+  MS Office extensions.
+
 - The urllib module now offers support for the iterator protocol.
   SF patch 698520 contributed by Brett Cannon.
author	Martin v. Löwis <martin@v.loewis.de>	2003-03-30 14:25:40 (GMT)
committer	Martin v. Löwis <martin@v.loewis.de>	2003-03-30 14:25:40 (GMT)
commit	3163a3b4b2a820ef38e6c7282033fe2db9d43ebe (patch)
tree	e83e06cbe049f06610d6c5e025e813e644fc9b01
parent	a965649386dbd385a92b2b93934abaff80c94198 (diff)
download	cpython-3163a3b4b2a820ef38e6c7282033fe2db9d43ebe.zip cpython-3163a3b4b2a820ef38e6c7282033fe2db9d43ebe.tar.gz cpython-3163a3b4b2a820ef38e6c7282033fe2db9d43ebe.tar.bz2