Script to squirrel around with the DOM tree of document fragments from the

LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary.
author: Fred Drake <fdrake@acm.org> 1998-11-23 17:02:03 (GMT)
committer: Fred Drake <fdrake@acm.org> 1998-11-23 17:02:03 (GMT)
commit: 0320473a29fef5d35a5da1056e5c30e72cf25ac0 (patch)
tree: 2dcffb074c9d95057b4b70be3fa1efb7bcff515c
parent: 30a68c7a2bb12b8f10d239206033492e4a9e27a6 (diff)
download: cpython-0320473a29fef5d35a5da1056e5c30e72cf25ac0.zip
cpython-0320473a29fef5d35a5da1056e5c30e72cf25ac0.tar.gz
cpython-0320473a29fef5d35a5da1056e5c30e72cf25ac0.tar.bz2
1 files changed, 193 insertions, 0 deletions
diff --git a/Doc/tools/sgmlconv/docfixer.py b/Doc/tools/sgmlconv/docfixer.py
new file mode 100755
index 0000000..00d4727
--- /dev/null
+++ b/Doc/tools/sgmlconv/docfixer.py
@@ -0,0 +1,193 @@
+#! /usr/bin/env python
+
+"""Promote the IDs from <label/> elements to the enclosing section / chapter /
+whatever, then remove the <label/> elements.  This allows *ML style internal
+linking rather than the bogus LaTeX model.
+
+Note that <label/>s in <title> elements are promoted two steps, since the
+<title> elements are artificially created from the section parameter, and the
+label really refers to the sectioning construct.
+"""
+__version__ = '$Revision$'
+
+
+import errno
+import string
+import sys
+import xml.dom.core
+import xml.dom.esis_builder
+
+
+# Workaround to deal with invalid documents (multiple root elements).  This
+# does not indicate a bug in the DOM implementation.
+#
+def get_documentElement(self):
+    docelem = None
+    for n in self._node.children:
+        if n.type == xml.dom.core.ELEMENT:
+            docelem = xml.dom.core.Element(n, self, self)
+    return docelem
+
+xml.dom.core.Document.get_documentElement = get_documentElement
+
+
+# Replace get_childNodes for the Document class; without this, children
+# accessed from the Document object via .childNodes (no matter how many
+# levels of access are used) will be given an ownerDocument of None.
+#
+def get_childNodes(self):
+    return xml.dom.core.NodeList(self._node.children, self, self)
+
+xml.dom.core.Document.get_childNodes = get_childNodes
+
+
+def get_first_element(doc, gi):
+    for n in doc.childNodes:
+        if n.nodeType == xml.dom.core.ELEMENT and n.tagName == gi:
+            return n
+
+def extract_first_element(doc, gi):
+    node = get_first_element(doc, gi)
+    if node is not None:
+        doc.removeChild(node)
+    return node
+
+
+def simplify(doc):
+    # Try to rationalize the document a bit, since these things are simply
+    # not valid SGML/XML documents as they stand, and need a little work.
+    documentclass = "document"
+    inputs = []
+    node = extract_first_element(doc, "documentclass")
+    if node is not None:
+        documentclass = node.getAttribute("classname")
+    node = extract_first_element(doc, "title")
+    if node is not None:
+        inputs.append(node)
+    # update the name of the root element
+    node = get_first_element(doc, "document")
+    if node is not None:
+        node._node.name = documentclass
+    while 1:
+        node = extract_first_element(doc, "input")
+        if node is None:
+            break
+        inputs.append(node)
+    if inputs:
+        docelem = doc.documentElement
+        inputs.reverse()
+        for node in inputs:
+            text = doc.createTextNode("\n")
+            docelem.insertBefore(text, docelem.firstChild)
+            docelem.insertBefore(node, text)
+        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
+    while doc.firstChild.nodeType == xml.dom.core.TEXT:
+        doc.removeChild(doc.firstChild)
+
+
+def cleanup_root_text(doc):
+    discards = []
+    skip = 0
+    for n in doc.childNodes:
+        prevskip = skip
+        skip = 0
+        if n.nodeType == xml.dom.core.TEXT and not prevskip:
+            discards.append(n)
+        elif n.nodeType == xml.dom.core.COMMENT:
+            skip = 1
+    for node in discards:
+        doc.removeChild(node)
+
+
+def rewrite_desc_entries(doc, argname_gi):
+    argnodes = doc.getElementsByTagName(argname_gi)
+    for node in argnodes:
+        parent = node.parentNode
+        nodes = []
+        for n in parent.childNodes:
+            if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:
+                nodes.append(n)
+        desc = doc.createElement("description")
+        for n in nodes:
+            parent.removeChild(n)
+            desc.appendChild(n)
+        if node.childNodes:
+            # keep the <args>...</args>, newline & indent
+            parent.insertBefore(doc.createText("\n  "), node)
+        else:
+            # no arguments, remove the <args/> node
+            parent.removeChild(node)
+        parent.appendChild(doc.createText("\n  "))
+        parent.appendChild(desc)
+        parent.appendChild(doc.createText("\n"))
+
+def handle_args(doc):
+    rewrite_desc_entries(doc, "args")
+    rewrite_desc_entries(doc, "constructor-args")
+
+
+def handle_comments(doc, node=None):
+    if node is None:
+        node = doc
+    for n in node.childNodes:
+        if n.nodeType == xml.dom.core.ELEMENT:
+            if n.tagName == "COMMENT":
+                comment = doc.createComment(n.childNodes[0].data)
+                node.replaceChild(comment, n)
+            else:
+                handle_comments(doc, n)
+
+
+def handle_labels(doc):
+    labels = doc.getElementsByTagName("label")
+    for label in labels:
+        id = label.getAttribute("id")
+        if not id:
+            continue
+        parent = label.parentNode
+        if parent.tagName == "title":
+            parent.parentNode.setAttribute("id", id)
+        else:
+            parent.setAttribute("id", id)
+        # now, remove <label id="..."/> from parent:
+        parent.removeChild(label)
+
+
+def convert(ifp, ofp):
+    p = xml.dom.esis_builder.EsisBuilder()
+    p.feed(ifp.read())
+    doc = p.document
+    handle_args(doc)
+    handle_comments(doc)
+    simplify(doc)
+    handle_labels(doc)
+    cleanup_root_text(doc)
+    try:
+        ofp.write(doc.toxml())
+        ofp.write("\n")
+    except IOError, (err, msg):
+        # Ignore EPIPE; it just means that whoever we're writing to stopped
+        # reading.  The rest of the output would be ignored.  All other errors
+        # should still be reported,
+        if err != errno.EPIPE:
+            raise
+
+
+def main():
+    if len(sys.argv) == 1:
+        ifp = sys.stdin
+        ofp = sys.stdout
+    elif len(sys.argv) == 2:
+        ifp = open(sys.argv[1])
+        ofp = sys.stdout
+    elif len(sys.argv) == 3:
+        ifp = open(sys.argv[1])
+        ofp = open(sys.argv[2], "w")
+    else:
+        usage()
+        sys.exit(2)
+    convert(ifp, ofp)
+
+
+if __name__ == "__main__":
+    main()
author	Fred Drake <fdrake@acm.org>	1998-11-23 17:02:03 (GMT)
committer	Fred Drake <fdrake@acm.org>	1998-11-23 17:02:03 (GMT)
commit	0320473a29fef5d35a5da1056e5c30e72cf25ac0 (patch)
tree	2dcffb074c9d95057b4b70be3fa1efb7bcff515c
parent	30a68c7a2bb12b8f10d239206033492e4a9e27a6 (diff)
download	cpython-0320473a29fef5d35a5da1056e5c30e72cf25ac0.zip cpython-0320473a29fef5d35a5da1056e5c30e72cf25ac0.tar.gz cpython-0320473a29fef5d35a5da1056e5c30e72cf25ac0.tar.bz2