diff options
Diffstat (limited to 'Doc/tools')
-rwxr-xr-x | Doc/tools/sgmlconv/docfixer.py | 186 |
1 files changed, 95 insertions, 91 deletions
diff --git a/Doc/tools/sgmlconv/docfixer.py b/Doc/tools/sgmlconv/docfixer.py index 7fa856f..1e9b5c3 100755 --- a/Doc/tools/sgmlconv/docfixer.py +++ b/Doc/tools/sgmlconv/docfixer.py @@ -11,12 +11,12 @@ import esistools import re import string import sys -import xml.dom.core +import xml.dom +import xml.dom.minidom -from xml.dom.core import \ - ELEMENT, \ - ENTITY_REFERENCE, \ - TEXT +ELEMENT = xml.dom.Node.ELEMENT_NODE +ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE +TEXT = xml.dom.Node.TEXT_NODE class ConversionError(Exception): @@ -49,32 +49,9 @@ else: pass -# Workaround to deal with invalid documents (multiple root elements). This -# does not indicate a bug in the DOM implementation. -# -def get_documentElement(doc): - docelem = None - for n in doc.childNodes: - if n.nodeType == ELEMENT: - docelem = n - return docelem - -xml.dom.core.Document.get_documentElement = get_documentElement - - -# Replace get_childNodes for the Document class; without this, children -# accessed from the Document object via .childNodes (no matter how many -# levels of access are used) will be given an ownerDocument of None. -# -def get_childNodes(doc): - return xml.dom.core.NodeList(doc._node.children, doc._node) - -xml.dom.core.Document.get_childNodes = get_childNodes - - def get_first_element(doc, gi): for n in doc.childNodes: - if n.get_nodeName() == gi: + if n.nodeName == gi: return n def extract_first_element(doc, gi): @@ -84,13 +61,25 @@ def extract_first_element(doc, gi): return node +def get_documentElement(node): + result = None + for child in node.childNodes: + if child.nodeType == ELEMENT: + result = child + return result + + +def set_tagName(elem, gi): + elem.nodeName = elem.tagName = gi + + def find_all_elements(doc, gi): nodes = [] - if doc.get_nodeName() == gi: + if doc.nodeName == gi: nodes.append(doc) for child in doc.childNodes: if child.nodeType == ELEMENT: - if child.get_tagName() == gi: + if child.tagName == gi: nodes.append(child) for node in child.getElementsByTagName(gi): nodes.append(node) @@ -99,18 +88,19 @@ def find_all_elements(doc, gi): def find_all_child_elements(doc, gi): nodes = [] for child in doc.childNodes: - if child.get_nodeName() == gi: + if child.nodeName == gi: nodes.append(child) return nodes + def find_all_elements_from_set(doc, gi_set): return __find_all_elements_from_set(doc, gi_set, []) def __find_all_elements_from_set(doc, gi_set, nodes): - if doc.get_nodeName() in gi_set: + if doc.nodeName in gi_set: nodes.append(doc) for child in doc.childNodes: - if child.get_nodeType() == ELEMENT: + if child.nodeType == ELEMENT: __find_all_elements_from_set(child, gi_set, nodes) return nodes @@ -129,7 +119,7 @@ def simplify(doc, fragment): # update the name of the root element node = get_first_element(fragment, "document") if node is not None: - node._node.name = documentclass + set_tagName(node, documentclass) while 1: node = extract_first_element(fragment, "input") if node is None: @@ -143,7 +133,7 @@ def simplify(doc, fragment): docelem.insertBefore(text, docelem.firstChild) docelem.insertBefore(node, text) docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild) - while fragment.firstChild and fragment.firstChild.get_nodeType() == TEXT: + while fragment.firstChild and fragment.firstChild.nodeType == TEXT: fragment.removeChild(fragment.firstChild) @@ -153,9 +143,9 @@ def cleanup_root_text(doc): for n in doc.childNodes: prevskip = skip skip = 0 - if n.get_nodeType() == TEXT and not prevskip: + if n.nodeType == TEXT and not prevskip: discards.append(n) - elif n.get_nodeName() == "COMMENT": + elif n.nodeName == "COMMENT": skip = 1 for node in discards: doc.removeChild(node) @@ -177,8 +167,8 @@ def fixup_descriptors(doc, fragment): def find_and_fix_descriptors(doc, container): children = container.childNodes for child in children: - if child.get_nodeType() == ELEMENT: - tagName = child.get_tagName() + if child.nodeType == ELEMENT: + tagName = child.tagName if tagName in DESCRIPTOR_ELEMENTS: rewrite_descriptor(doc, child) elif tagName == "subsection": @@ -200,12 +190,12 @@ def rewrite_descriptor(doc, descriptor): # 6. Put it back together. # # 1. - descname = descriptor.get_tagName() + descname = descriptor.tagName index = 1 if descname[-2:] == "ni": descname = descname[:-2] descriptor.setAttribute("index", "no") - descriptor._node.name = descname + set_tagName(descriptor, descname) index = 0 desctype = descname[:-4] # remove 'desc' linename = desctype + "line" @@ -219,7 +209,7 @@ def rewrite_descriptor(doc, descriptor): name.appendChild(doc.createTextNode(descriptor.getAttribute("name"))) descriptor.removeAttribute("name") # 2a. - if descriptor.attributes.has_key("var"): + if descriptor.hasAttribute("var"): if descname != "opcodedesc": raise RuntimeError, \ "got 'var' attribute on descriptor other than opcodedesc" @@ -245,10 +235,15 @@ def rewrite_descriptor(doc, descriptor): # 3, 4. pos = skip_leading_nodes(children, pos) while pos < len(children) \ - and children[pos].get_nodeName() in (linename, "versionadded"): - if children[pos].get_tagName() == linename: + and children[pos].nodeName in (linename, "versionadded"): + if children[pos].tagName == linename: # this is really a supplemental signature, create <signature> - sig = methodline_to_signature(doc, children[pos]) + oldchild = children[pos].cloneNode(1) + try: + sig = methodline_to_signature(doc, children[pos]) + except KeyError: + print oldchild.toxml() + raise newchildren.append(sig) else: # <versionadded added=...> @@ -301,7 +296,7 @@ def move_children(origin, dest, start=0): def handle_appendix(doc, fragment): # must be called after simplfy() if document is multi-rooted to begin with docelem = get_documentElement(fragment) - toplevel = docelem.get_tagName() == "manual" and "chapter" or "section" + toplevel = docelem.tagName == "manual" and "chapter" or "section" appendices = 0 nodes = [] for node in docelem.childNodes: @@ -333,7 +328,7 @@ def handle_labels(doc, fragment): if not id: continue parent = label.parentNode - parentTagName = parent.get_tagName() + parentTagName = parent.tagName if parentTagName == "title": parent.parentNode.setAttribute("id", id) else: @@ -352,8 +347,8 @@ def fixup_trailing_whitespace(doc, wsmap): while queue: node = queue[0] del queue[0] - if wsmap.has_key(node.get_nodeName()): - ws = wsmap[node.get_tagName()] + if wsmap.has_key(node.nodeName): + ws = wsmap[node.tagName] children = node.childNodes children.reverse() if children[0].nodeType == TEXT: @@ -361,8 +356,8 @@ def fixup_trailing_whitespace(doc, wsmap): children[0].data = data children.reverse() # hack to get the title in place: - if node.get_tagName() == "title" \ - and node.parentNode.firstChild.get_nodeType() == ELEMENT: + if node.tagName == "title" \ + and node.parentNode.firstChild.nodeType == ELEMENT: node.parentNode.insertBefore(doc.createText("\n "), node.parentNode.firstChild) for child in node.childNodes: @@ -388,7 +383,7 @@ def cleanup_trailing_parens(doc, element_names): while queue: node = queue[0] del queue[0] - if rewrite_element(node.get_tagName()): + if rewrite_element(node.tagName): children = node.childNodes if len(children) == 1 \ and children[0].nodeType == TEXT: @@ -411,7 +406,7 @@ def contents_match(left, right): if nodeType != r.nodeType: return 0 if nodeType == ELEMENT: - if l.get_tagName() != r.get_tagName(): + if l.tagName != r.tagName: return 0 # should check attributes, but that's not a problem here if not contents_match(l, r): @@ -430,19 +425,19 @@ def create_module_info(doc, section): node = extract_first_element(section, "modulesynopsis") if node is None: return - node._node.name = "synopsis" + set_tagName(node, "synopsis") lastchild = node.childNodes[-1] if lastchild.nodeType == TEXT \ and lastchild.data[-1:] == ".": lastchild.data = lastchild.data[:-1] modauthor = extract_first_element(section, "moduleauthor") if modauthor: - modauthor._node.name = "author" + set_tagName(modauthor, "author") modauthor.appendChild(doc.createTextNode( modauthor.getAttribute("name"))) modauthor.removeAttribute("name") platform = extract_first_element(section, "platform") - if section.get_tagName() == "section": + if section.tagName == "section": modinfo_pos = 2 modinfo = doc.createElement("moduleinfo") moddecl = extract_first_element(section, "declaremodule") @@ -467,13 +462,13 @@ def create_module_info(doc, section): if title: children = title.childNodes if len(children) >= 2 \ - and children[0].get_nodeName() == "module" \ + and children[0].nodeName == "module" \ and children[0].childNodes[0].data == name: # this is it; morph the <title> into <short-synopsis> first_data = children[1] if first_data.data[:4] == " ---": first_data.data = string.lstrip(first_data.data[4:]) - title._node.name = "short-synopsis" + set_tagName(title, "short-synopsis") if children[-1].nodeType == TEXT \ and children[-1].data[-1:] == ".": children[-1].data = children[-1].data[:-1] @@ -511,7 +506,7 @@ def create_module_info(doc, section): children = section.childNodes for i in range(len(children)): node = children[i] - if node.get_nodeName() == "moduleinfo": + if node.nodeName == "moduleinfo": nextnode = children[i+1] if nextnode.nodeType == TEXT: data = nextnode.data @@ -544,7 +539,7 @@ def fixup_table(doc, table): children = table.childNodes for child in children: if child.nodeType == ELEMENT: - tagName = child.get_tagName() + tagName = child.tagName if tagName == "hline" and prev_row is not None: prev_row.setAttribute("rowsep", "1") elif tagName == "row": @@ -558,13 +553,14 @@ def fixup_table(doc, table): nodeType = child.nodeType if nodeType == TEXT: if string.strip(child.data): - raise ConversionError("unexpected free data in table") + raise ConversionError("unexpected free data in <%s>: %r" + % (table.tagName, child.data)) table.removeChild(child) continue if nodeType == ELEMENT: - if child.get_tagName() != "hline": + if child.tagName != "hline": raise ConversionError( - "unexpected <%s> in table" % child.get_tagName()) + "unexpected <%s> in table" % child.tagName) table.removeChild(child) continue raise ConversionError( @@ -593,7 +589,7 @@ def fixup_row(doc, row): def move_elements_by_name(doc, source, dest, name, sep=None): nodes = [] for child in source.childNodes: - if child.get_nodeName() == name: + if child.nodeName == name: nodes.append(child) for node in nodes: source.removeChild(node) @@ -633,7 +629,7 @@ PARA_LEVEL_PRECEEDERS = ( def fixup_paras(doc, fragment): for child in fragment.childNodes: - if child.get_nodeName() in RECURSE_INTO_PARA_CONTAINERS: + if child.nodeName in RECURSE_INTO_PARA_CONTAINERS: fixup_paras_helper(doc, child) descriptions = find_all_elements(fragment, "description") for description in descriptions: @@ -645,7 +641,7 @@ def fixup_paras_helper(doc, container, depth=0): children = container.childNodes start = skip_leading_nodes(children) while len(children) > start: - if children[start].get_nodeName() in RECURSE_INTO_PARA_CONTAINERS: + if children[start].nodeName in RECURSE_INTO_PARA_CONTAINERS: # Something to recurse into: fixup_paras_helper(doc, children[start]) else: @@ -668,7 +664,7 @@ def build_para(doc, parent, start, i): child = children[j] nodeType = child.nodeType if nodeType == ELEMENT: - if child.get_tagName() in BREAK_ELEMENTS: + if child.tagName in BREAK_ELEMENTS: after = j break elif nodeType == TEXT: @@ -742,7 +738,7 @@ def skip_leading_nodes(children, start=0): return start # all whitespace, just skip elif nodeType == ELEMENT: - tagName = child.get_tagName() + tagName = child.tagName if tagName in RECURSE_INTO_PARA_CONTAINERS: return start if tagName not in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS: @@ -772,7 +768,7 @@ def fixup_signatures(doc, fragment): def fixup_args(doc, arglist): for child in arglist.childNodes: - if child.get_nodeName() == "optional": + if child.nodeName == "optional": # found it; fix and return arglist.insertBefore(doc.createTextNode("["), child) optkids = child.childNodes @@ -789,13 +785,13 @@ def fixup_sectionauthors(doc, fragment): for sectauth in find_all_elements(fragment, "sectionauthor"): section = sectauth.parentNode section.removeChild(sectauth) - sectauth._node.name = "author" + set_tagName(sectauth, "author") sectauth.appendChild(doc.createTextNode( sectauth.getAttribute("name"))) sectauth.removeAttribute("name") after = section.childNodes[2] title = section.childNodes[1] - if title.get_nodeName() != "title": + if title.nodeName != "title": after = section.childNodes[0] section.insertBefore(doc.createTextNode("\n "), after) section.insertBefore(sectauth, after) @@ -806,17 +802,17 @@ def fixup_verbatims(doc): child = verbatim.childNodes[0] if child.nodeType == TEXT \ and string.lstrip(child.data)[:3] == ">>>": - verbatim._node.name = "interactive-session" + set_tagName(verbatim, "interactive-session") def add_node_ids(fragment, counter=0): - fragment._node.node_id = counter + fragment.node_id = counter for node in fragment.childNodes: counter = counter + 1 if node.nodeType == ELEMENT: counter = add_node_ids(node, counter) else: - node._node.node_id = counter + node.node_id = counter return counter + 1 @@ -831,14 +827,14 @@ def fixup_refmodindexes(fragment): d = {} for node in nodes: parent = node.parentNode - d[parent._node.node_id] = parent + d[parent.node_id] = parent del nodes map(fixup_refmodindexes_chunk, d.values()) def fixup_refmodindexes_chunk(container): # node is probably a <para>; let's see how often it isn't: - if container.get_tagName() != PARA_ELEMENT: + if container.tagName != PARA_ELEMENT: bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container) module_entries = find_all_elements(container, "module") if not module_entries: @@ -849,7 +845,7 @@ def fixup_refmodindexes_chunk(container): children = entry.childNodes if len(children) != 0: bwrite("--- unexpected number of children for %s node:\n" - % entry.get_tagName()) + % entry.tagName) ewrite(entry.toxml() + "\n") continue found = 0 @@ -873,7 +869,7 @@ def fixup_bifuncindexes(fragment): # make sure that each parent is only processed once: for node in nodes: parent = node.parentNode - d[parent._node.node_id] = parent + d[parent.node_id] = parent del nodes map(fixup_bifuncindexes_chunk, d.values()) @@ -905,7 +901,7 @@ def join_adjacent_elements(container, gi): while queue: parent = queue.pop() i = 0 - children = parent.get_childNodes() + children = parent.childNodes nchildren = len(children) while i < (nchildren - 1): child = children[i] @@ -914,7 +910,7 @@ def join_adjacent_elements(container, gi): ewrite("--- merging two <%s/> elements\n" % gi) child = children[i] nextchild = children[i+1] - nextchildren = nextchild.get_childNodes() + nextchildren = nextchild.childNodes while len(nextchildren): node = nextchildren[0] nextchild.removeChild(node) @@ -932,14 +928,13 @@ def write_esis(doc, ofp, knownempty): for node in doc.childNodes: nodeType = node.nodeType if nodeType == ELEMENT: - gi = node.get_tagName() + gi = node.tagName if knownempty(gi): if node.hasChildNodes(): raise ValueError, \ "declared-empty node <%s> has children" % gi ofp.write("e\n") - for k, v in node.attributes.items(): - value = v.value + for k, value in node.attributes.items(): if _token_rx.match(value): dtype = "TOKEN" else: @@ -951,16 +946,17 @@ def write_esis(doc, ofp, knownempty): elif nodeType == TEXT: ofp.write("-%s\n" % esistools.encode(node.data)) elif nodeType == ENTITY_REFERENCE: - ofp.write("&%s\n" % node.get_nodeName()) + ofp.write("&%s\n" % node.nodeName) else: raise RuntimeError, "unsupported node type: %s" % nodeType def convert(ifp, ofp): - p = esistools.ExtendedEsisBuilder() - p.feed(ifp.read()) - doc = p.document - fragment = p.fragment + events = esistools.parse(ifp) + toktype, doc = events.getEvent() + fragment = doc.createDocumentFragment() + events.expandNode(fragment) + normalize(fragment) simplify(doc, fragment) handle_labels(doc, fragment) @@ -994,8 +990,10 @@ def convert(ifp, ofp): join_adjacent_elements(fragment, "option") # d = {} - for gi in p.get_empties(): + for gi in events.parser.get_empties(): d[gi] = gi + if d.has_key("author"): + del d["author"] if d.has_key("rfc"): del d["rfc"] knownempty = d.has_key @@ -1019,11 +1017,17 @@ def main(): ofp = sys.stdout elif len(sys.argv) == 3: ifp = open(sys.argv[1]) - ofp = open(sys.argv[2], "w") + import StringIO + ofp = StringIO.StringIO() else: usage() sys.exit(2) convert(ifp, ofp) + if len(sys.argv) == 3: + fp = open(sys.argv[2], "w") + fp.write(ofp.getvalue()) + fp.close() + ofp.close() if __name__ == "__main__": |