diff options
-rwxr-xr-x | Doc/tools/sgmlconv/docfixer.py | 260 |
1 files changed, 159 insertions, 101 deletions
diff --git a/Doc/tools/sgmlconv/docfixer.py b/Doc/tools/sgmlconv/docfixer.py index 9628e30..b180d78 100755 --- a/Doc/tools/sgmlconv/docfixer.py +++ b/Doc/tools/sgmlconv/docfixer.py @@ -1,12 +1,7 @@ #! /usr/bin/env python -"""Promote the IDs from <label/> elements to the enclosing section / chapter / -whatever, then remove the <label/> elements. This allows *ML style internal -linking rather than the bogus LaTeX model. - -Note that <label/>s in <title> elements are promoted two steps, since the -<title> elements are artificially created from the section parameter, and the -label really refers to the sectioning construct. +"""Perform massive transformations on a document tree created from the LaTeX +of the Python documentation, and dump the ESIS data for the transformed tree. """ __version__ = '$Revision$' @@ -26,6 +21,13 @@ class ConversionError(Exception): DEBUG_PARA_FIXER = 0 +if DEBUG_PARA_FIXER: + def para_msg(s): + sys.stderr.write("*** %s\n" % s) +else: + def para_msg(s): + pass + # Workaround to deal with invalid documents (multiple root elements). This # does not indicate a bug in the DOM implementation. @@ -62,6 +64,19 @@ def extract_first_element(doc, gi): return node +def find_all_elements(doc, gi): + nodes = [] + if doc.nodeType == xml.dom.core.ELEMENT and doc.tagName == gi: + nodes.append(doc) + for child in doc.childNodes: + if child.nodeType == xml.dom.core.ELEMENT: + if child.tagName == gi: + nodes.append(child) + for node in child.getElementsByTagName(gi): + nodes.append(node) + return nodes + + def simplify(doc): # Try to rationalize the document a bit, since these things are simply # not valid SGML/XML documents as they stand, and need a little work. @@ -108,31 +123,50 @@ def cleanup_root_text(doc): doc.removeChild(node) -def rewrite_desc_entries(doc, argname_gi): - argnodes = doc.getElementsByTagName(argname_gi) - for node in argnodes: +def handle_args(doc): + for node in find_all_elements(doc, "args"): parent = node.parentNode nodes = [] for n in parent.childNodes: - if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi: + if n.nodeType != xml.dom.core.ELEMENT or n.tagName != "args": nodes.append(n) + signature = doc.createElement("signature") + signature.appendChild(doc.createTextNode("\n ")) + name = doc.createElement("name") + name.appendChild(doc.createTextNode(parent.getAttribute("name"))) + parent.removeAttribute("name") + signature.appendChild(name) desc = doc.createElement("description") for n in nodes: parent.removeChild(n) desc.appendChild(n) + desc.appendChild(doc.createTextNode("\n ")) + parent.replaceChild(signature, node) + parent.insertBefore(doc.createTextNode("\n "), signature) if node.childNodes: # keep the <args>...</args>, newline & indent - parent.insertBefore(doc.createText("\n "), node) - else: - # no arguments, remove the <args/> node - parent.removeChild(node) + signature.appendChild(doc.createTextNode("\n ")) + signature.appendChild(node) parent.appendChild(doc.createText("\n ")) parent.appendChild(desc) parent.appendChild(doc.createText("\n")) + signature.appendChild(doc.createTextNode("\n ")) -def handle_args(doc): - rewrite_desc_entries(doc, "args") - rewrite_desc_entries(doc, "constructor-args") + +def methodline_to_signature(doc, methodline): + signature = doc.createElement("signature") + signature.appendChild(doc.createTextNode("\n ")) + name = doc.createElement("name") + name.appendChild(doc.createTextNode(methodline.getAttribute("name"))) + signature.appendChild(name) + methodline.parentNode.removeChild(methodline) + if len(methodline.childNodes): + methodline._node.name = "args" + methodline.removeAttribute("name") + signature.appendChild(doc.createTextNode("\n ")) + signature.appendChild(methodline) + signature.appendChild(doc.createTextNode("\n ")) + return signature def handle_appendix(doc): @@ -165,20 +199,17 @@ def handle_appendix(doc): def handle_labels(doc): - for node in doc.childNodes: - if node.nodeType == xml.dom.core.ELEMENT: - labels = node.getElementsByTagName("label") - for label in labels: - id = label.getAttribute("id") - if not id: - continue - parent = label.parentNode - if parent.tagName == "title": - parent.parentNode.setAttribute("id", id) - else: - parent.setAttribute("id", id) - # now, remove <label id="..."/> from parent: - parent.removeChild(label) + for label in find_all_elements(doc, "label"): + id = label.getAttribute("id") + if not id: + continue + parent = label.parentNode + if parent.tagName == "title": + parent.parentNode.setAttribute("id", id) + else: + parent.setAttribute("id", id) + # now, remove <label id="..."/> from parent: + parent.removeChild(label) def fixup_trailing_whitespace(doc, wsmap): @@ -306,7 +337,8 @@ def create_module_info(doc, section): if first_data.data[:4] == " ---": first_data.data = string.lstrip(first_data.data[4:]) title._node.name = "short-synopsis" - if children[-1].data[-1:] == ".": + if children[-1].nodeType == xml.dom.core.TEXT \ + and children[-1].data[-1:] == ".": children[-1].data = children[-1].data[:-1] section.removeChild(title) section.removeChild(section.childNodes[0]) @@ -335,10 +367,8 @@ def create_module_info(doc, section): def cleanup_synopses(doc): - for node in doc.childNodes: - if node.nodeType == xml.dom.core.ELEMENT \ - and node.tagName == "section": - create_module_info(doc, node) + for node in find_all_elements(doc, "section"): + create_module_info(doc, node) def remap_element_names(root, name_map): @@ -361,11 +391,9 @@ def remap_element_names(root, name_map): def fixup_table_structures(doc): # must be done after remap_element_names(), or the tables won't be found - for child in doc.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: - tables = child.getElementsByTagName("table") - for table in tables: - fixup_table(doc, table) + for table in find_all_elements(doc, "table"): + fixup_table(doc, table) + def fixup_table(doc, table): # create the table head @@ -443,66 +471,81 @@ def move_elements_by_name(doc, source, dest, name, sep=None): FIXUP_PARA_ELEMENTS = ( "chapter", "section", "subsection", "subsubsection", - "paragraph", "subparagraph", "description", - "opcodedesc", "classdesc", - "funcdesc", "methoddesc", "excdesc", "datadesc", - "funcdescni", "methoddescni", "excdescni", "datadescni", + "paragraph", "subparagraph", + "excdesc", "datadesc", + "excdescni", "datadescni", + ) + +RECURSE_INTO_PARA_CONTAINERS = ( + "chapter", + "section", "subsection", "subsubsection", + "paragraph", "subparagraph", + "abstract", + "memberdesc", "memberdescni", "datadesc", "datadescni", ) PARA_LEVEL_ELEMENTS = ( "moduleinfo", "title", "verbatim", "opcodedesc", "classdesc", - "funcdesc", "methoddesc", "excdesc", "datadesc", - "funcdescni", "methoddescni", "excdescni", "datadescni", + "funcdesc", "methoddesc", "excdesc", + "funcdescni", "methoddescni", "excdescni", "tableii", "tableiii", "tableiv", "localmoduletable", - "sectionauthor", + "sectionauthor", "seealso", # include <para>, so we can just do it again to get subsequent paras: "para", ) PARA_LEVEL_PRECEEDERS = ( "index", "indexii", "indexiii", "indexiv", - "stindex", "obindex", "COMMENT", "label", + "stindex", "obindex", "COMMENT", "label", "input", + "memberline", "memberlineni", + "methodline", "methodlineni", ) + def fixup_paras(doc): for child in doc.childNodes: if child.nodeType == xml.dom.core.ELEMENT \ - and child.tagName in FIXUP_PARA_ELEMENTS: + and child.tagName in RECURSE_INTO_PARA_CONTAINERS: + # fixup_paras_helper(doc, child) descriptions = child.getElementsByTagName("description") for description in descriptions: - if DEBUG_PARA_FIXER: - sys.stderr.write("-- Fixing up <description> element...\n") fixup_paras_helper(doc, description) -def fixup_paras_helper(doc, container): +def fixup_paras_helper(doc, container, depth=0): # document is already normalized children = container.childNodes start = 0 - start_fixed = 0 - i = len(children) - SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS - if DEBUG_PARA_FIXER: - sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n" - % (container.tagName, start, i)) - if i > start: - # the first [start:i] children shoudl be rewritten as <para> elements - # start by breaking text nodes that contain \n\n+ into multiple nodes - nstart, i = skip_leading_nodes(container.childNodes, start, i) - if i > nstart: - build_para(doc, container, nstart, i) - fixup_paras_helper(doc, container) + while len(children) > start: + start = skip_leading_nodes(children, start) + if start >= len(children): + break + # + # Either paragraph material or something to recurse into: + # + if (children[start].nodeType == xml.dom.core.ELEMENT) \ + and (children[start].tagName in RECURSE_INTO_PARA_CONTAINERS): + fixup_paras_helper(doc, children[start]) + start = skip_leading_nodes(children, start + 1) + continue + # + # paragraph material: + # + build_para(doc, container, start, len(children)) + if DEBUG_PARA_FIXER and depth == 10: + sys.exit(1) + start = start + 1 def build_para(doc, parent, start, i): children = parent.childNodes - # collect all children until \n\n+ is found in a text node or a - # PARA_LEVEL_ELEMENT is found. after = start + 1 have_last = 0 BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS + # Collect all children until \n\n+ is found in a text node or a + # member of BREAK_ELEMENTS is found. for j in range(start, i): after = j + 1 child = children[j] @@ -521,6 +564,9 @@ def build_para(doc, parent, start, i): break else: have_last = 1 + if (start + 1) > after: + raise ConversionError( + "build_para() could not identify content to turn into a paragraph") if children[after - 1].nodeType == xml.dom.core.TEXT: # we may need to split off trailing white space: child = children[after - 1] @@ -528,66 +574,60 @@ def build_para(doc, parent, start, i): if string.rstrip(data) != data: have_last = 0 child.splitText(len(string.rstrip(data))) - children = parent.childNodes para = doc.createElement("para") prev = None indexes = range(start, after) indexes.reverse() for j in indexes: - node = children[j] + node = parent.childNodes[j] parent.removeChild(node) para.insertBefore(node, prev) prev = node if have_last: parent.appendChild(para) + return len(parent.childNodes) else: parent.insertBefore(para, parent.childNodes[start]) + return start + 1 + +def skip_leading_nodes(children, start): + """Return index into children of a node at which paragraph building should + begin or a recursive call to fixup_paras_helper() should be made (for + subsections, etc.). -def skip_leading_nodes(children, start, i): - i = min(i, len(children)) + When the return value >= len(children), we've built all the paras we can + from this list of children. + """ + i = len(children) while i > start: # skip over leading comments and whitespace: - try: - child = children[start] - except IndexError: - sys.stderr.write( - "skip_leading_nodes() failed at index %d\n" % start) - raise + child = children[start] nodeType = child.nodeType - if nodeType == xml.dom.core.COMMENT: - start = start + 1 - elif nodeType == xml.dom.core.TEXT: + if nodeType == xml.dom.core.TEXT: data = child.data shortened = string.lstrip(data) if shortened: if data != shortened: # break into two nodes: whitespace and non-whitespace child.splitText(len(data) - len(shortened)) - return start + 1, i + 1 - break + return start + 1 + return start # all whitespace, just skip - start = start + 1 elif nodeType == xml.dom.core.ELEMENT: - if child.tagName in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS: - start = start + 1 - else: - break - else: - break - return start, i + tagName = child.tagName + if tagName in RECURSE_INTO_PARA_CONTAINERS: + return start + if tagName not in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS: + return start + start = start + 1 + return start def fixup_rfc_references(doc): - rfc_nodes = [] - for child in doc.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: - kids = child.getElementsByTagName("rfc") - for k in kids: - rfc_nodes.append(k) - for rfc_node in rfc_nodes: - rfc_node.appendChild(doc.createTextNode( - "RFC " + rfc_node.getAttribute("num"))) + for rfcnode in find_all_elements(doc, "rfc"): + rfcnode.appendChild(doc.createTextNode( + "RFC " + rfcnode.getAttribute("num"))) def fixup_signatures(doc): @@ -596,6 +636,7 @@ def fixup_signatures(doc): args = child.getElementsByTagName("args") for arg in args: fixup_args(doc, arg) + arg.normalize() args = child.getElementsByTagName("constructor-args") for arg in args: fixup_args(doc, arg) @@ -618,6 +659,22 @@ def fixup_args(doc, arglist): return fixup_args(doc, arglist) +def fixup_sectionauthors(doc): + for sectauth in find_all_elements(doc, "sectionauthor"): + section = sectauth.parentNode + section.removeChild(sectauth) + sectauth._node.name = "author" + sectauth.appendChild(doc.createTextNode( + sectauth.getAttribute("name"))) + sectauth.removeAttribute("name") + after = section.childNodes[2] + title = section.childNodes[1] + if title.nodeType == xml.dom.core.ELEMENT and title.tagName != "title": + after = section.childNodes[0] + section.insertBefore(doc.createTextNode("\n "), after) + section.insertBefore(sectauth, after) + + _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$") def write_esis(doc, ofp, knownempty): @@ -669,6 +726,7 @@ def convert(ifp, ofp): cleanup_synopses(doc) normalize(doc) fixup_paras(doc) + fixup_sectionauthors(doc) remap_element_names(doc, { "tableii": ("table", {"cols": "2"}), "tableiii": ("table", {"cols": "3"}), |