Doc/tools/sgmlconv/docfixer.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674

#! /usr/bin/env python

"""Promote the IDs from <label/> elements to the enclosing section / chapter /
whatever, then remove the <label/> elements.  This allows *ML style internal
linking rather than the bogus LaTeX model.

Note that <label/>s in <title> elements are promoted two steps, since the
<title> elements are artificially created from the section parameter, and the
label really refers to the sectioning construct.
"""
__version__ = '$Revision$'


import errno
import esistools
import re
import string
import sys
import xml.dom.core
import xml.dom.esis_builder


class ConversionError(Exception):
    pass


DEBUG_PARA_FIXER = 0


# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(self):
    docelem = None
    for n in self._node.children:
        if n.type == xml.dom.core.ELEMENT:
            docelem = xml.dom.core.Element(n, self, self)
    return docelem

xml.dom.core.Document.get_documentElement = get_documentElement


# Replace get_childNodes for the Document class; without this, children
# accessed from the Document object via .childNodes (no matter how many
# levels of access are used) will be given an ownerDocument of None.
#
def get_childNodes(self):
    return xml.dom.core.NodeList(self._node.children, self, self)

xml.dom.core.Document.get_childNodes = get_childNodes


def get_first_element(doc, gi):
    for n in doc.childNodes:
        if n.nodeType == xml.dom.core.ELEMENT and n.tagName == gi:
            return n

def extract_first_element(doc, gi):
    node = get_first_element(doc, gi)
    if node is not None:
        doc.removeChild(node)
    return node


def simplify(doc):
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    node = extract_first_element(doc, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    node = extract_first_element(doc, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(doc, "document")
    if node is not None:
        node._node.name = documentclass
    while 1:
        node = extract_first_element(doc, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = doc.documentElement
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    while doc.firstChild.nodeType == xml.dom.core.TEXT:
        doc.removeChild(doc.firstChild)


def cleanup_root_text(doc):
    discards = []
    skip = 0
    for n in doc.childNodes:
        prevskip = skip
        skip = 0
        if n.nodeType == xml.dom.core.TEXT and not prevskip:
            discards.append(n)
        elif n.nodeType == xml.dom.core.ELEMENT and n.tagName == "COMMENT":
            skip = 1
    for node in discards:
        doc.removeChild(node)


def rewrite_desc_entries(doc, argname_gi):
    argnodes = doc.getElementsByTagName(argname_gi)
    for node in argnodes:
        parent = node.parentNode
        nodes = []
        for n in parent.childNodes:
            if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:
                nodes.append(n)
        desc = doc.createElement("description")
        for n in nodes:
            parent.removeChild(n)
            desc.appendChild(n)
        if node.childNodes:
            # keep the <args>...</args>, newline & indent
            parent.insertBefore(doc.createText("\n  "), node)
        else:
            # no arguments, remove the <args/> node
            parent.removeChild(node)
        parent.appendChild(doc.createText("\n  "))
        parent.appendChild(desc)
        parent.appendChild(doc.createText("\n"))

def handle_args(doc):
    rewrite_desc_entries(doc, "args")
    rewrite_desc_entries(doc, "constructor-args")


def handle_appendix(doc):
    # must be called after simplfy() if document is multi-rooted to begin with
    docelem = doc.documentElement
    toplevel = docelem.tagName == "manual" and "chapter" or "section"
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == xml.dom.core.ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        map(docelem.removeChild, nodes)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        while nodes and nodes[0].nodeType == xml.dom.core.TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        map(back.appendChild, nodes)
        docelem.appendChild(doc.createTextNode("\n"))


def handle_labels(doc):
    labels = doc.getElementsByTagName("label")
    for label in labels:
        id = label.getAttribute("id")
        if not id:
            continue
        parent = label.parentNode
        if parent.tagName == "title":
            parent.parentNode.setAttribute("id", id)
        else:
            parent.setAttribute("id", id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)


def fixup_trailing_whitespace(doc, wsmap):
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if node.nodeType == xml.dom.core.ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            children.reverse()
            if children[0].nodeType == xml.dom.core.TEXT:
                data = string.rstrip(children[0].data) + ws
                children[0].data = data
            children.reverse()
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
                node.parentNode.insertBefore(doc.createText("\n  "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)


def normalize(doc):
    for node in doc.childNodes:
        if node.nodeType == xml.dom.core.ELEMENT:
            node.normalize()


def cleanup_trailing_parens(doc, element_names):
    d = {}
    for gi in element_names:
        d[gi] = gi
    rewrite_element = d.has_key
    queue = []
    for node in doc.childNodes:
        if node.nodeType == xml.dom.core.ELEMENT:
            queue.append(node)
    while queue:
        node = queue[0]
        del queue[0]
        if rewrite_element(node.tagName):
            children = node.childNodes
            if len(children) == 1 \
               and children[0].nodeType == xml.dom.core.TEXT:
                data = children[0].data
                if data[-2:] == "()":
                    children[0].data = data[:-2]
        else:
            for child in node.childNodes:
                if child.nodeType == xml.dom.core.ELEMENT:
                    queue.append(child)


def contents_match(left, right):
    left_children = left.childNodes
    right_children = right.childNodes
    if len(left_children) != len(right_children):
        return 0
    for l, r in map(None, left_children, right_children):
        nodeType = l.nodeType
        if nodeType != r.nodeType:
            return 0
        if nodeType == xml.dom.core.ELEMENT:
            if l.tagName != r.tagName:
                return 0
            # should check attributes, but that's not a problem here
            if not contents_match(l, r):
                return 0
        elif nodeType == xml.dom.core.TEXT:
            if l.data != r.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1


def create_module_info(doc, section):
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    node._node.name = "synopsis"
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == xml.dom.core.TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    if section.tagName == "section":
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n    "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n    "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeType == xml.dom.core.ELEMENT \
               and children[0].tagName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                title._node.name = "short-synopsis"
                if children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                sys.stderr.write(
                    "module name in title doesn't match"
                    " <declaremodule>; no <short-synopsis>\n")
        else:
            sys.stderr.write(
                "Unexpected condition: <section> without <title>\n")
        modinfo.appendChild(doc.createTextNode("\n    "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(title)
        modinfo.appendChild(doc.createTextNode("\n  "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n  "), modinfo)


def cleanup_synopses(doc):
    for node in doc.childNodes:
        if node.nodeType == xml.dom.core.ELEMENT \
           and node.tagName == "section":
            create_module_info(doc, node)


def remap_element_names(root, name_map):
    queue = []
    for child in root.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT:
            queue.append(child)
    while queue:
        node = queue.pop()
        tagName = node.tagName
        if name_map.has_key(tagName):
            name, attrs = name_map[tagName]
            node._node.name = name
            for attr, value in attrs.items():
                node.setAttribute(attr, value)
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)


def fixup_table_structures(doc):
    # must be done after remap_element_names(), or the tables won't be found
    for child in doc.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT:
            tables = child.getElementsByTagName("table")
            for table in tables:
                fixup_table(doc, table)

def fixup_table(doc, table):
    # create the table head
    thead = doc.createElement("thead")
    row = doc.createElement("row")
    move_elements_by_name(doc, table, row, "entry")
    thead.appendChild(doc.createTextNode("\n    "))
    thead.appendChild(row)
    thead.appendChild(doc.createTextNode("\n    "))
    # create the table body
    tbody = doc.createElement("tbody")
    prev_row = None
    last_was_hline = 0
    children = table.childNodes
    for child in children:
        if child.nodeType == xml.dom.core.ELEMENT:
            tagName = child.tagName
            if tagName == "hline" and prev_row is not None:
                prev_row.setAttribute("rowsep", "1")
            elif tagName == "row":
                prev_row = child
    # save the rows:
    tbody.appendChild(doc.createTextNode("\n    "))
    move_elements_by_name(doc, table, tbody, "row", sep="\n    ")
    # and toss the rest:
    while children:
        child = children[0]
        nodeType = child.nodeType
        if nodeType == xml.dom.core.TEXT:
            if string.strip(child.data):
                raise ConversionError("unexpected free data in table")
            table.removeChild(child)
            continue
        if nodeType == xml.dom.core.ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
            table.removeChild(child)
            continue
        raise ConversionError(
            "unexpected %s node in table" % child.__class__.__name__)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    tgroup.appendChild(doc.createTextNode("\n  "))
    tgroup.appendChild(thead)
    tgroup.appendChild(doc.createTextNode("\n  "))
    tgroup.appendChild(tbody)
    tgroup.appendChild(doc.createTextNode("\n  "))
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)


def fixup_row(doc, row):
    entries = []
    map(entries.append, row.childNodes[1:])
    for entry in entries:
        row.insertBefore(doc.createTextNode("\n         "), entry)
#    row.appendChild(doc.createTextNode("\n      "))


def move_elements_by_name(doc, source, dest, name, sep=None):
    nodes = []
    for child in source.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT and child.tagName == name:
            nodes.append(child)
    for node in nodes:
        source.removeChild(node)
        dest.appendChild(node)
        if sep:
            dest.appendChild(doc.createTextNode(sep))


FIXUP_PARA_ELEMENTS = (
    "chapter",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph")

PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "opcodedesc",
    "verbatim", "funcdesc", "methoddesc", "excdesc", "datadesc",
    "funcdescni", "methoddescni", "excdescni", "datadescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor",
    # include <para>, so we can just do it again to get subsequent paras:
    "para",
    )

PARA_LEVEL_PRECEEDERS = (
    "index", "indexii", "indexiii", "indexiv",
    "stindex", "obindex", "COMMENT", "label",
    )

def fixup_paras(doc):
    for child in doc.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT \
           and child.tagName in FIXUP_PARA_ELEMENTS:
            fixup_paras_helper(doc, child)
            descriptions = child.getElementsByTagName("description")
            for description in descriptions:
                if DEBUG_PARA_FIXER:
                    sys.stderr.write("-- Fixing up <description> element...\n")
                fixup_paras_helper(doc, description)


def fixup_paras_helper(doc, container):
    # document is already normalized
    children = container.childNodes
    start = 0
    start_fixed = 0
    i = 0
    SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    for child in children:
        if child.nodeType == xml.dom.core.ELEMENT:
            if child.tagName in FIXUP_PARA_ELEMENTS:
                fixup_paras_helper(doc, child)
                break
            elif child.tagName in SKIP_ELEMENTS:
                if not start_fixed:
                    start = i + 1
            elif not start_fixed:
                start_fixed = 1
            i = i + 1
        else:
            if child.nodeType == xml.dom.core.TEXT \
               and string.strip(child.data) and not start_fixed:
                start_fixed = 1
            i = i + 1
    if DEBUG_PARA_FIXER:
        sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n"
                         % (container.tagName, start, i))
    if i > start:
        # the first [start:i] children shoudl be rewritten as <para> elements
        # start by breaking text nodes that contain \n\n+ into multiple nodes
        nstart, i = skip_leading_nodes(container.childNodes, start, i)
        if i > nstart:
            build_para(doc, container, nstart, i)
            fixup_paras_helper(doc, container)


def build_para(doc, parent, start, i):
    children = parent.childNodes
    # collect all children until \n\n+ is found in a text node or a
    # PARA_LEVEL_ELEMENT is found.
    after = start + 1
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == xml.dom.core.TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                after = j
                break
            if pos >= 1:
                child.splitText(pos)
                break
    else:
        have_last = 1
    if children[after - 1].nodeType == xml.dom.core.TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    children = parent.childNodes
    para = doc.createElement("para")
    prev = None
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = children[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
    else:
        parent.insertBefore(para, parent.childNodes[start])


def skip_leading_nodes(children, start, i):
    i = min(i, len(children))
    while i > start:
        # skip over leading comments and whitespace:
        try:
            child = children[start]
        except IndexError:
            sys.stderr.write(
                "skip_leading_nodes() failed at index %d\n" % start)
            raise
        nodeType = child.nodeType
        if nodeType == xml.dom.core.COMMENT:
            start = start + 1
        elif nodeType == xml.dom.core.TEXT:
            data = child.data
            shortened = string.lstrip(data)
            if shortened:
                if data != shortened:
                    # break into two nodes: whitespace and non-whitespace
                    child.splitText(len(data) - len(shortened))
                    return start + 1, i + 1
                break
            # all whitespace, just skip
            start = start + 1
        elif nodeType == xml.dom.core.ELEMENT:
            if child.tagName in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
                start = start + 1
            else:
                break
        else:
            break
    return start, i


_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    for node in doc.childNodes:
        nodeType = node.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError, "declared-empty node has children"
                ofp.write("e\n")
            for k, v in node.attributes.items():
                value = v.value
                if _token_rx.match(value):
                    dtype = "TOKEN"
                else:
                    dtype = "CDATA"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        elif nodeType == xml.dom.core.TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        else:
            raise RuntimeError, "unsupported node type: %s" % nodeType


def convert(ifp, ofp):
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    normalize(doc)
    handle_args(doc)
    simplify(doc)
    handle_labels(doc)
    handle_appendix(doc)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    cleanup_synopses(doc)
    normalize(doc)
    fixup_paras(doc)
    remap_element_names(doc, {
        "tableii": ("table", {"cols": "2"}),
        "tableiii": ("table", {"cols": "3"}),
        "tableiv": ("table", {"cols": "4"}),
        "lineii": ("row", {}),
        "lineiii": ("row", {}),
        "lineiv": ("row", {}),
        })
    fixup_table_structures(doc)
    #
    d = {}
    for gi in p.get_empties():
        d[gi] = gi
    knownempty = d.has_key
    #
    try:
        write_esis(doc, ofp, knownempty)
    except IOError, (err, msg):
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported,
        if err != errno.EPIPE:
            raise


def main():
    if len(sys.argv) == 1:
        ifp = sys.stdin
        ofp = sys.stdout
    elif len(sys.argv) == 2:
        ifp = open(sys.argv[1])
        ofp = sys.stdout
    elif len(sys.argv) == 3:
        ifp = open(sys.argv[1])
        ofp = open(sys.argv[2], "w")
    else:
        usage()
        sys.exit(2)
    convert(ifp, ofp)


if __name__ == "__main__":
    main()