diff options
-rwxr-xr-x | Doc/tools/sgmlconv/docfixer.py | 185 |
1 files changed, 94 insertions, 91 deletions
diff --git a/Doc/tools/sgmlconv/docfixer.py b/Doc/tools/sgmlconv/docfixer.py index 11c487d..f700134 100755 --- a/Doc/tools/sgmlconv/docfixer.py +++ b/Doc/tools/sgmlconv/docfixer.py @@ -12,7 +12,10 @@ import re import string import sys import xml.dom.core -import xml.dom.esis_builder + +from xml.dom.core import \ + ELEMENT, \ + TEXT class ConversionError(Exception): @@ -32,11 +35,11 @@ else: # Workaround to deal with invalid documents (multiple root elements). This # does not indicate a bug in the DOM implementation. # -def get_documentElement(self): +def get_documentElement(doc): docelem = None - for n in self._node.children: - if n.type == xml.dom.core.ELEMENT: - docelem = xml.dom.core.Element(n, self, self) + for n in doc.childNodes: + if n.nodeType == ELEMENT: + docelem = n return docelem xml.dom.core.Document.get_documentElement = get_documentElement @@ -46,15 +49,15 @@ xml.dom.core.Document.get_documentElement = get_documentElement # accessed from the Document object via .childNodes (no matter how many # levels of access are used) will be given an ownerDocument of None. # -def get_childNodes(self): - return xml.dom.core.NodeList(self._node.children, self, self) +def get_childNodes(doc): + return xml.dom.core.NodeList(doc._node.children, doc._node) xml.dom.core.Document.get_childNodes = get_childNodes def get_first_element(doc, gi): for n in doc.childNodes: - if n.nodeType == xml.dom.core.ELEMENT and n.tagName == gi: + if n.nodeType == ELEMENT and n.tagName == gi: return n def extract_first_element(doc, gi): @@ -66,10 +69,10 @@ def extract_first_element(doc, gi): def find_all_elements(doc, gi): nodes = [] - if doc.nodeType == xml.dom.core.ELEMENT and doc.tagName == gi: + if doc.nodeType == ELEMENT and doc.tagName == gi: nodes.append(doc) for child in doc.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: + if child.nodeType == ELEMENT: if child.tagName == gi: nodes.append(child) for node in child.getElementsByTagName(gi): @@ -77,36 +80,36 @@ def find_all_elements(doc, gi): return nodes -def simplify(doc): +def simplify(doc, fragment): # Try to rationalize the document a bit, since these things are simply # not valid SGML/XML documents as they stand, and need a little work. documentclass = "document" inputs = [] - node = extract_first_element(doc, "documentclass") + node = extract_first_element(fragment, "documentclass") if node is not None: documentclass = node.getAttribute("classname") - node = extract_first_element(doc, "title") + node = extract_first_element(fragment, "title") if node is not None: inputs.append(node) # update the name of the root element - node = get_first_element(doc, "document") + node = get_first_element(fragment, "document") if node is not None: node._node.name = documentclass while 1: - node = extract_first_element(doc, "input") + node = extract_first_element(fragment, "input") if node is None: break inputs.append(node) if inputs: - docelem = doc.documentElement + docelem = get_documentElement(fragment) inputs.reverse() for node in inputs: text = doc.createTextNode("\n") docelem.insertBefore(text, docelem.firstChild) docelem.insertBefore(node, text) docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild) - while doc.firstChild.nodeType == xml.dom.core.TEXT: - doc.removeChild(doc.firstChild) + while fragment.firstChild.nodeType == TEXT: + fragment.removeChild(fragment.firstChild) def cleanup_root_text(doc): @@ -115,9 +118,9 @@ def cleanup_root_text(doc): for n in doc.childNodes: prevskip = skip skip = 0 - if n.nodeType == xml.dom.core.TEXT and not prevskip: + if n.nodeType == TEXT and not prevskip: discards.append(n) - elif n.nodeType == xml.dom.core.ELEMENT and n.tagName == "COMMENT": + elif n.nodeType == ELEMENT and n.tagName == "COMMENT": skip = 1 for node in discards: doc.removeChild(node) @@ -130,8 +133,8 @@ DESCRIPTOR_ELEMENTS = ( "datadesc", "datadescni", ) -def fixup_descriptors(doc): - sections = find_all_elements(doc, "section") +def fixup_descriptors(doc, fragment): + sections = find_all_elements(fragment, "section") for section in sections: find_and_fix_descriptors(doc, section) @@ -139,7 +142,7 @@ def fixup_descriptors(doc): def find_and_fix_descriptors(doc, container): children = container.childNodes for child in children: - if child.nodeType == xml.dom.core.ELEMENT: + if child.nodeType == ELEMENT: tagName = child.tagName if tagName in DESCRIPTOR_ELEMENTS: rewrite_descriptor(doc, child) @@ -191,7 +194,7 @@ def rewrite_descriptor(doc, descriptor): pos = skip_leading_nodes(children, 0) if pos < len(children): child = children[pos] - if child.nodeType == xml.dom.core.ELEMENT and child.tagName == "args": + if child.nodeType == ELEMENT and child.tagName == "args": # create an <args> in <signature>: args = doc.createElement("args") argchildren = [] @@ -205,7 +208,7 @@ def rewrite_descriptor(doc, descriptor): # 3, 4. pos = skip_leading_nodes(children, pos + 1) while pos < len(children) \ - and children[pos].nodeType == xml.dom.core.ELEMENT \ + and children[pos].nodeType == ELEMENT \ and children[pos].tagName in (linename, "versionadded"): if children[pos].tagName == linename: # this is really a supplemental signature, create <signature> @@ -222,7 +225,7 @@ def rewrite_descriptor(doc, descriptor): newchildren.append(description) move_children(descriptor, description, pos) last = description.childNodes[-1] - if last.nodeType == xml.dom.core.TEXT: + if last.nodeType == TEXT: last.data = string.rstrip(last.data) + "\n " # 6. # should have nothing but whitespace and signature lines in <descriptor>; @@ -259,16 +262,16 @@ def move_children(origin, dest, start=0): dest.appendChild(node) -def handle_appendix(doc): +def handle_appendix(doc, fragment): # must be called after simplfy() if document is multi-rooted to begin with - docelem = doc.documentElement + docelem = get_documentElement(fragment) toplevel = docelem.tagName == "manual" and "chapter" or "section" appendices = 0 nodes = [] for node in docelem.childNodes: if appendices: nodes.append(node) - elif node.nodeType == xml.dom.core.ELEMENT: + elif node.nodeType == ELEMENT: appnodes = node.getElementsByTagName("appendix") if appnodes: appendices = 1 @@ -281,7 +284,7 @@ def handle_appendix(doc): back = doc.createElement("back-matter") docelem.appendChild(back) back.appendChild(doc.createTextNode("\n")) - while nodes and nodes[0].nodeType == xml.dom.core.TEXT \ + while nodes and nodes[0].nodeType == TEXT \ and not string.strip(nodes[0].data): del nodes[0] map(back.appendChild, nodes) @@ -307,28 +310,28 @@ def fixup_trailing_whitespace(doc, wsmap): while queue: node = queue[0] del queue[0] - if node.nodeType == xml.dom.core.ELEMENT \ + if node.nodeType == ELEMENT \ and wsmap.has_key(node.tagName): ws = wsmap[node.tagName] children = node.childNodes children.reverse() - if children[0].nodeType == xml.dom.core.TEXT: + if children[0].nodeType == TEXT: data = string.rstrip(children[0].data) + ws children[0].data = data children.reverse() # hack to get the title in place: if node.tagName == "title" \ - and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT: + and node.parentNode.firstChild.nodeType == ELEMENT: node.parentNode.insertBefore(doc.createText("\n "), node.parentNode.firstChild) for child in node.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: + if child.nodeType == ELEMENT: queue.append(child) def normalize(doc): for node in doc.childNodes: - if node.nodeType == xml.dom.core.ELEMENT: + if node.nodeType == ELEMENT: node.normalize() @@ -339,7 +342,7 @@ def cleanup_trailing_parens(doc, element_names): rewrite_element = d.has_key queue = [] for node in doc.childNodes: - if node.nodeType == xml.dom.core.ELEMENT: + if node.nodeType == ELEMENT: queue.append(node) while queue: node = queue[0] @@ -347,13 +350,13 @@ def cleanup_trailing_parens(doc, element_names): if rewrite_element(node.tagName): children = node.childNodes if len(children) == 1 \ - and children[0].nodeType == xml.dom.core.TEXT: + and children[0].nodeType == TEXT: data = children[0].data if data[-2:] == "()": children[0].data = data[:-2] else: for child in node.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: + if child.nodeType == ELEMENT: queue.append(child) @@ -366,13 +369,13 @@ def contents_match(left, right): nodeType = l.nodeType if nodeType != r.nodeType: return 0 - if nodeType == xml.dom.core.ELEMENT: + if nodeType == ELEMENT: if l.tagName != r.tagName: return 0 # should check attributes, but that's not a problem here if not contents_match(l, r): return 0 - elif nodeType == xml.dom.core.TEXT: + elif nodeType == TEXT: if l.data != r.data: return 0 else: @@ -388,7 +391,7 @@ def create_module_info(doc, section): return node._node.name = "synopsis" lastchild = node.childNodes[-1] - if lastchild.nodeType == xml.dom.core.TEXT \ + if lastchild.nodeType == TEXT \ and lastchild.data[-1:] == ".": lastchild.data = lastchild.data[:-1] modauthor = extract_first_element(section, "moduleauthor") @@ -423,7 +426,7 @@ def create_module_info(doc, section): if title: children = title.childNodes if len(children) >= 2 \ - and children[0].nodeType == xml.dom.core.ELEMENT \ + and children[0].nodeType == ELEMENT \ and children[0].tagName == "module" \ and children[0].childNodes[0].data == name: # this is it; morph the <title> into <short-synopsis> @@ -431,7 +434,7 @@ def create_module_info(doc, section): if first_data.data[:4] == " ---": first_data.data = string.lstrip(first_data.data[4:]) title._node.name = "short-synopsis" - if children[-1].nodeType == xml.dom.core.TEXT \ + if children[-1].nodeType == TEXT \ and children[-1].data[-1:] == ".": children[-1].data = children[-1].data[:-1] section.removeChild(title) @@ -470,10 +473,10 @@ def create_module_info(doc, section): children = section.childNodes for i in range(len(children)): node = children[i] - if node.nodeType == xml.dom.core.ELEMENT \ + if node.nodeType == ELEMENT \ and node.tagName == "moduleinfo": nextnode = children[i+1] - if nextnode.nodeType == xml.dom.core.TEXT: + if nextnode.nodeType == TEXT: data = nextnode.data if len(string.lstrip(data)) < (len(data) - 4): nextnode.data = "\n\n\n" + string.lstrip(data) @@ -487,7 +490,7 @@ def cleanup_synopses(doc): def remap_element_names(root, name_map): queue = [] for child in root.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: + if child.nodeType == ELEMENT: queue.append(child) while queue: node = queue.pop() @@ -498,13 +501,13 @@ def remap_element_names(root, name_map): for attr, value in attrs.items(): node.setAttribute(attr, value) for child in node.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: + if child.nodeType == ELEMENT: queue.append(child) -def fixup_table_structures(doc): +def fixup_table_structures(doc, fragment): # must be done after remap_element_names(), or the tables won't be found - for table in find_all_elements(doc, "table"): + for table in find_all_elements(fragment, "table"): fixup_table(doc, table) @@ -522,7 +525,7 @@ def fixup_table(doc, table): last_was_hline = 0 children = table.childNodes for child in children: - if child.nodeType == xml.dom.core.ELEMENT: + if child.nodeType == ELEMENT: tagName = child.tagName if tagName == "hline" and prev_row is not None: prev_row.setAttribute("rowsep", "1") @@ -535,12 +538,12 @@ def fixup_table(doc, table): while children: child = children[0] nodeType = child.nodeType - if nodeType == xml.dom.core.TEXT: + if nodeType == TEXT: if string.strip(child.data): raise ConversionError("unexpected free data in table") table.removeChild(child) continue - if nodeType == xml.dom.core.ELEMENT: + if nodeType == ELEMENT: if child.tagName != "hline": raise ConversionError( "unexpected <%s> in table" % child.tagName) @@ -572,7 +575,7 @@ def fixup_row(doc, row): def move_elements_by_name(doc, source, dest, name, sep=None): nodes = [] for child in source.childNodes: - if child.nodeType == xml.dom.core.ELEMENT and child.tagName == name: + if child.nodeType == ELEMENT and child.tagName == name: nodes.append(child) for node in nodes: source.removeChild(node) @@ -606,13 +609,13 @@ PARA_LEVEL_PRECEEDERS = ( ) -def fixup_paras(doc): - for child in doc.childNodes: - if child.nodeType == xml.dom.core.ELEMENT \ +def fixup_paras(doc, fragment): + for child in fragment.childNodes: + if child.nodeType == ELEMENT \ and child.tagName in RECURSE_INTO_PARA_CONTAINERS: # fixup_paras_helper(doc, child) - descriptions = find_all_elements(doc, "description") + descriptions = find_all_elements(fragment, "description") for description in descriptions: fixup_paras_helper(doc, description) @@ -628,7 +631,7 @@ def fixup_paras_helper(doc, container, depth=0): # # Either paragraph material or something to recurse into: # - if (children[start].nodeType == xml.dom.core.ELEMENT) \ + if (children[start].nodeType == ELEMENT) \ and (children[start].tagName in RECURSE_INTO_PARA_CONTAINERS): fixup_paras_helper(doc, children[start]) start = skip_leading_nodes(children, start + 1) @@ -653,11 +656,11 @@ def build_para(doc, parent, start, i): after = j + 1 child = children[j] nodeType = child.nodeType - if nodeType == xml.dom.core.ELEMENT: + if nodeType == ELEMENT: if child.tagName in BREAK_ELEMENTS: after = j break - elif nodeType == xml.dom.core.TEXT: + elif nodeType == TEXT: pos = string.find(child.data, "\n\n") if pos == 0: after = j @@ -670,7 +673,7 @@ def build_para(doc, parent, start, i): if (start + 1) > after: raise ConversionError( "build_para() could not identify content to turn into a paragraph") - if children[after - 1].nodeType == xml.dom.core.TEXT: + if children[after - 1].nodeType == TEXT: # we may need to split off trailing white space: child = children[after - 1] data = child.data @@ -707,7 +710,7 @@ def skip_leading_nodes(children, start): # skip over leading comments and whitespace: child = children[start] nodeType = child.nodeType - if nodeType == xml.dom.core.TEXT: + if nodeType == TEXT: data = child.data shortened = string.lstrip(data) if shortened: @@ -717,7 +720,7 @@ def skip_leading_nodes(children, start): return start + 1 return start # all whitespace, just skip - elif nodeType == xml.dom.core.ELEMENT: + elif nodeType == ELEMENT: tagName = child.tagName if tagName in RECURSE_INTO_PARA_CONTAINERS: return start @@ -727,15 +730,15 @@ def skip_leading_nodes(children, start): return start -def fixup_rfc_references(doc): - for rfcnode in find_all_elements(doc, "rfc"): +def fixup_rfc_references(doc, fragment): + for rfcnode in find_all_elements(fragment, "rfc"): rfcnode.appendChild(doc.createTextNode( "RFC " + rfcnode.getAttribute("num"))) -def fixup_signatures(doc): - for child in doc.childNodes: - if child.nodeType == xml.dom.core.ELEMENT: +def fixup_signatures(doc, fragment): + for child in fragment.childNodes: + if child.nodeType == ELEMENT: args = child.getElementsByTagName("args") for arg in args: fixup_args(doc, arg) @@ -748,7 +751,7 @@ def fixup_signatures(doc): def fixup_args(doc, arglist): for child in arglist.childNodes: - if child.nodeType == xml.dom.core.ELEMENT \ + if child.nodeType == ELEMENT \ and child.tagName == "optional": # found it; fix and return arglist.insertBefore(doc.createTextNode("["), child) @@ -762,8 +765,8 @@ def fixup_args(doc, arglist): return fixup_args(doc, arglist) -def fixup_sectionauthors(doc): - for sectauth in find_all_elements(doc, "sectionauthor"): +def fixup_sectionauthors(doc, fragment): + for sectauth in find_all_elements(fragment, "sectionauthor"): section = sectauth.parentNode section.removeChild(sectauth) sectauth._node.name = "author" @@ -772,7 +775,7 @@ def fixup_sectionauthors(doc): sectauth.removeAttribute("name") after = section.childNodes[2] title = section.childNodes[1] - if title.nodeType == xml.dom.core.ELEMENT and title.tagName != "title": + if title.nodeType == ELEMENT and title.tagName != "title": after = section.childNodes[0] section.insertBefore(doc.createTextNode("\n "), after) section.insertBefore(sectauth, after) @@ -781,10 +784,9 @@ def fixup_sectionauthors(doc): def fixup_verbatims(doc): for verbatim in find_all_elements(doc, "verbatim"): child = verbatim.childNodes[0] - if child.nodeType == xml.dom.core.TEXT \ + if child.nodeType == TEXT \ and string.lstrip(child.data)[:3] == ">>>": - verbatim._node.name = "interpreter-session" - #verbatim.setAttribute("interactive", "interactive") + verbatim._node.name = "interactive-session" _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$") @@ -792,7 +794,7 @@ _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$") def write_esis(doc, ofp, knownempty): for node in doc.childNodes: nodeType = node.nodeType - if nodeType == xml.dom.core.ELEMENT: + if nodeType == ELEMENT: gi = node.tagName if knownempty(gi): if node.hasChildNodes(): @@ -808,7 +810,7 @@ def write_esis(doc, ofp, knownempty): ofp.write("(%s\n" % gi) write_esis(node, ofp, knownempty) ofp.write(")%s\n" % gi) - elif nodeType == xml.dom.core.TEXT: + elif nodeType == TEXT: ofp.write("-%s\n" % esistools.encode(node.data)) else: raise RuntimeError, "unsupported node type: %s" % nodeType @@ -818,10 +820,11 @@ def convert(ifp, ofp): p = esistools.ExtendedEsisBuilder() p.feed(ifp.read()) doc = p.document - normalize(doc) - simplify(doc) - handle_labels(doc) - handle_appendix(doc) + fragment = p.fragment + normalize(fragment) + simplify(doc, fragment) + handle_labels(fragment) + handle_appendix(doc, fragment) fixup_trailing_whitespace(doc, { "abstract": "\n", "title": "", @@ -835,12 +838,12 @@ def convert(ifp, ofp): cleanup_root_text(doc) cleanup_trailing_parens(doc, ["function", "method", "cfunction"]) cleanup_synopses(doc) - fixup_descriptors(doc) - fixup_verbatims(doc) - normalize(doc) - fixup_paras(doc) - fixup_sectionauthors(doc) - remap_element_names(doc, { + fixup_descriptors(doc, fragment) + fixup_verbatims(fragment) + normalize(fragment) + fixup_paras(doc, fragment) + fixup_sectionauthors(doc, fragment) + remap_element_names(fragment, { "tableii": ("table", {"cols": "2"}), "tableiii": ("table", {"cols": "3"}), "tableiv": ("table", {"cols": "4"}), @@ -849,9 +852,9 @@ def convert(ifp, ofp): "lineiv": ("row", {}), "refmodule": ("module", {"link": "link"}), }) - fixup_table_structures(doc) - fixup_rfc_references(doc) - fixup_signatures(doc) + fixup_table_structures(doc, fragment) + fixup_rfc_references(doc, fragment) + fixup_signatures(doc, fragment) # d = {} for gi in p.get_empties(): @@ -861,7 +864,7 @@ def convert(ifp, ofp): knownempty = d.has_key # try: - write_esis(doc, ofp, knownempty) + write_esis(fragment, ofp, knownempty) except IOError, (err, msg): # Ignore EPIPE; it just means that whoever we're writing to stopped # reading. The rest of the output would be ignored. All other errors |