diff options
author | Fred Drake <fdrake@acm.org> | 1999-01-06 22:50:52 (GMT) |
---|---|---|
committer | Fred Drake <fdrake@acm.org> | 1999-01-06 22:50:52 (GMT) |
commit | fcc5910090ff82b1514dc5e0746dbaaab35f8f81 (patch) | |
tree | a11c224c9a9a338556ed7e8d1c3c831d8beac9a6 /Doc/tools/sgmlconv/docfixer.py | |
parent | 5c355201e238c34916aceba597f1d3086ce29a7e (diff) | |
download | cpython-fcc5910090ff82b1514dc5e0746dbaaab35f8f81.zip cpython-fcc5910090ff82b1514dc5e0746dbaaab35f8f81.tar.gz cpython-fcc5910090ff82b1514dc5e0746dbaaab35f8f81.tar.bz2 |
Preliminary code to mark paragraphs. Seems to work, but very slow.
Diffstat (limited to 'Doc/tools/sgmlconv/docfixer.py')
-rwxr-xr-x | Doc/tools/sgmlconv/docfixer.py | 152 |
1 files changed, 150 insertions, 2 deletions
diff --git a/Doc/tools/sgmlconv/docfixer.py b/Doc/tools/sgmlconv/docfixer.py index b23b0f6..802f3b3 100755 --- a/Doc/tools/sgmlconv/docfixer.py +++ b/Doc/tools/sgmlconv/docfixer.py @@ -20,6 +20,9 @@ import xml.dom.core import xml.dom.esis_builder +DEBUG_PARA_FIXER = 0 + + # Workaround to deal with invalid documents (multiple root elements). This # does not indicate a bug in the DOM implementation. # @@ -323,12 +326,157 @@ def cleanup_synopses(doc): create_module_info(doc, node) +FIXUP_PARA_ELEMENTS = ( + "chapter", + "section", "subsection", "subsubsection", + "paragraph", "subparagraph") + +PARA_LEVEL_ELEMENTS = ( + "moduleinfo", "title", "opcodedesc", + "verbatim", "funcdesc", "methoddesc", "excdesc", "datadesc", + "funcdescni", "methoddescni", "excdescni", "datadescni", + "tableii", "tableiii", "tableiv", "localmoduletable", + "sectionauthor", + # include <para>, so we can just do it again to get subsequent paras: + "para", + ) + +PARA_LEVEL_PRECEEDERS = ( + "index", "indexii", "indexiii", "indexiv", + "stindex", "obindex", "COMMENT", "label", + ) + def fixup_paras(doc): - pass + for child in doc.childNodes: + if child.nodeType == xml.dom.core.ELEMENT \ + and child.tagName in FIXUP_PARA_ELEMENTS: + fixup_paras_helper(doc, child) + descriptions = child.getElementsByTagName("description") + for description in descriptions: + if DEBUG_PARA_FIXER: + sys.stderr.write("-- Fixing up <description> element...\n") + fixup_paras_helper(doc, description) + + +def fixup_paras_helper(doc, container): + # document is already normalized + children = container.childNodes + start = 0 + start_fixed = 0 + i = 0 + SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS + for child in children: + if child.nodeType == xml.dom.core.ELEMENT: + if child.tagName in FIXUP_PARA_ELEMENTS: + fixup_paras_helper(doc, child) + break + elif child.tagName in SKIP_ELEMENTS: + if not start_fixed: + start = i + 1 + elif not start_fixed: + start_fixed = 1 + i = i + 1 + else: + if child.nodeType == 
xml.dom.core.TEXT \ + and string.strip(child.data) and not start_fixed: + start_fixed = 1 + i = i + 1 + if DEBUG_PARA_FIXER: + sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n" + % (container.tagName, start, i)) + if i > start: + # the first [start:i] children should be rewritten as <para> elements + # start by breaking text nodes that contain \n\n+ into multiple nodes + nstart, i = skip_leading_nodes(container.childNodes, start, i) + if i > nstart: + build_para(doc, container, nstart, i) + fixup_paras_helper(doc, container) + + +def build_para(doc, parent, start, i): + children = parent.childNodes + # collect all children until \n\n+ is found in a text node or a + # PARA_LEVEL_ELEMENT is found. + after = start + 1 + have_last = 0 + BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS + for j in range(start, i): + after = j + 1 + child = children[j] + nodeType = child.nodeType + if nodeType == xml.dom.core.ELEMENT: + if child.tagName in BREAK_ELEMENTS: + after = j + break + elif nodeType == xml.dom.core.TEXT: + pos = string.find(child.data, "\n\n") + if pos == 0: + after = j + break + if pos >= 1: + child.splitText(pos) + break + else: + have_last = 1 + if children[after - 1].nodeType == xml.dom.core.TEXT: + # we may need to split off trailing white space: + child = children[after - 1] + data = child.data + if string.rstrip(data) != data: + have_last = 0 + child.splitText(len(string.rstrip(data))) + children = parent.childNodes + para = doc.createElement("para") + prev = None + indexes = range(start, after) + indexes.reverse() + for j in indexes: + node = children[j] + parent.removeChild(node) + para.insertBefore(node, prev) + prev = node + if have_last: + parent.appendChild(para) + else: + parent.insertBefore(para, parent.childNodes[start]) + + +def skip_leading_nodes(children, start, i): + i = min(i, len(children)) + while i > start: + # skip over leading comments and whitespace: + try: + child = children[start] + except IndexError: + 
sys.stderr.write( + "skip_leading_nodes() failed at index %d\n" % start) + raise + nodeType = child.nodeType + if nodeType == xml.dom.core.COMMENT: + start = start + 1 + elif nodeType == xml.dom.core.TEXT: + data = child.data + shortened = string.lstrip(data) + if shortened: + if data != shortened: + # break into two nodes: whitespace and non-whitespace + child.splitText(len(data) - len(shortened)) + return start + 1, i + 1 + break + # all whitespace, just skip + start = start + 1 + elif nodeType == xml.dom.core.ELEMENT: + if child.tagName in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS: + start = start + 1 + else: + break + else: + break + return start, i _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$") - + def write_esis(doc, ofp, knownempty): for node in doc.childNodes: nodeType = node.nodeType |