#! /usr/bin/env python

"""Generate ESIS events based on a LaTeX source document and configuration
data.
"""
__version__ = '$Revision$'

import errno
import re
import string
import StringIO
import sys

from esistools import encode


# When true, pushing()/popping() and Conversion.subconvert() write trace
# messages about the element stack to stderr.
DEBUG = 0


class Error(Exception):
    """Base class for the exceptions defined in this module."""

class LaTeXFormatError(Error):
    """Raised when the LaTeX input cannot be converted (unbalanced
    environments, missing macro parameters, unrecognized markup)."""


# Patterns used by Conversion.subconvert(); each is applied with .match(),
# i.e. anchored at the start of the remaining input.

# \begin{name} and \end{name}; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A backslash, a run of ASCII letters with an optional trailing "*"
# (group 1), then either "{" or optional whitespace and newline (group 2).
_begin_macro_rx = re.compile("[\\\\]([a-zA-Z]+[*]?)({|\\s*\n?)")
# One or more "%", an optional single space, the rest of the line (group 1),
# the newline, and any spaces/tabs that start the following line.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of characters containing none of "]", "%", "\", "{" or "}".
_text_rx = re.compile(r"[^]%\\{}]+")
# Optional whitespace followed by [...]; group 1 is the bracketed text.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
# (only one level of nested braces is accepted; group 1 is the content).
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# A letter followed by letters, digits, "." or "-" through the end of the
# string; subconvert() emits matching attribute values as TOKEN, others as
# CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Optional spaces/newlines followed by an opening "{" or "[" respectively.
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")


# Characters which, when preceded by a backslash, are written out by
# subconvert() as literal character data.
ESCAPED_CHARS = "$%#^ {}&~"


def pushing(name, point, depth):
    """Trace the opening of element *name* on stderr when DEBUG is set.

    *point* labels the call site and *depth* is the number of spaces used
    to indent the message.
    """
    if not DEBUG:
        return
    indent = " " * depth
    sys.stderr.write("%s<%s> at %s\n" % (indent, name, point))

def popping(name, point, depth):
    """Trace the closing of element *name* on stderr when DEBUG is set.

    *point* labels the call site and *depth* is the number of spaces used
    to indent the message.
    """
    if not DEBUG:
        return
    indent = " " * depth
    sys.stderr.write("%s</%s> at %s\n" % (indent, name, point))


class Conversion:
    """Convert LaTeX source text into ESIS events written to a file object.

    Instance attributes:
      ofp, write   -- current output file object and its bound write method
      ofp_stack    -- outputs saved by push_output()
      table        -- maps macro name to a conversion entry of the form
                      (params, optional, empty, environ, nocontent)
      discards     -- macro names whose open/close events are suppressed
      autoclosing  -- macro names that are closed implicitly when the same
                      macro is opened again, or at end of input
      line         -- the input that has not been converted yet
      err_write    -- write function used for diagnostics (stderr)
    """

    def __init__(self, ifp, ofp, table=None, discards=(), autoclosing=()):
        """Read all of *ifp* and prepare to write events to *ofp*.

        Trailing whitespace is removed from every input line.  When *table*
        is omitted, every macro gets the default conversion.
        """
        self.ofp_stack = [ofp]
        self.pop_output()
        if table is None:
            # start_macro() and subconvert() call self.table.get()
            table = {}
        self.table = table
        self.discards = discards
        self.autoclosing = autoclosing
        stripped = []
        for text in ifp.readlines():
            stripped.append(text.rstrip())
        self.line = "\n".join(stripped)
        self.err_write = sys.stderr.write
        self.preamble = 1

    def push_output(self, ofp):
        """Save the current output and direct further writes to *ofp*."""
        self.ofp_stack.append(self.ofp)
        self.ofp = ofp
        self.write = ofp.write

    def pop_output(self):
        """Restore the output most recently saved by push_output()."""
        self.ofp = self.ofp_stack.pop()
        self.write = self.ofp.write

    def subconvert(self, endchar=None, depth=0):
        """Convert input starting at self.line, writing events as we go.

        With *endchar* given, conversion stops at the first occurrence of
        that character that is not nested inside an open element or group;
        the character is consumed, the remaining input is stored in
        self.line and returned.  Without *endchar*, the whole input is
        converted and None is returned.  *depth* only affects the
        indentation of DEBUG trace output.

        Raises LaTeXFormatError for unbalanced or unrecognized markup, and
        when the input ends before *endchar* is found.
        """
        stack = []
        line = self.line
        if DEBUG and endchar:
            self.err_write(
                "subconvert(%s)\n  line = %s\n"
                % (repr(endchar), repr(line[:20])))
        while line:
            if line[0] == endchar and not stack:
                # Both callers continue with the text *after* the
                # delimiter, so consume it here.
                if DEBUG:
                    self.err_write(
                        "subconvert() --> %s\n" % repr(line[1:21]))
                self.line = line[1:]
                return self.line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                # re-write to use the macro handler
                line = r"\%s %s" % (m.group(1), line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                if envname == "document":
                    # special magic: everything still open (apart from the
                    # bottom stack entry) must be an autoclosing element
                    for n in stack[1:]:
                        if n not in self.autoclosing:
                            raise LaTeXFormatError(
                                "open element on stack: " + repr(n))
                    # should be more careful, but this is easier to code:
                    stack = []
                    self.write(")document\n")
                elif stack and envname == stack[-1]:
                    self.write(")%s\n" % envname)
                    del stack[-1]
                    popping(envname, "a", len(stack) + depth)
                else:
                    # also reached when nothing is open at all
                    self.err_write("stack: %s\n" % repr(stack))
                    raise LaTeXFormatError(
                        "environment close for %s doesn't match" % envname)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "verbatim":
                    # really magic case!  The body is copied through as
                    # character data without any further parsing.
                    pos = line.find("\\end{verbatim}")
                    if pos < 0:
                        raise LaTeXFormatError(
                            "verbatim environment is not closed")
                    text = line[m.end(1):pos]
                    self.write("(verbatim\n")
                    self.write("-%s\n" % encode(text))
                    self.write(")verbatim\n")
                    line = line[pos + len("\\end{verbatim}"):]
                    continue
                numbered = 1
                opened = 0
                if macroname[-1] == "*":
                    macroname = macroname[:-1]
                    numbered = 0
                if macroname in self.autoclosing and macroname in stack:
                    # close everything down to and including the previous
                    # instance of this macro
                    while stack[-1] != macroname:
                        top = stack.pop()
                        if top and top not in self.discards:
                            self.write(")%s\n-\\n\n" % top)
                        popping(top, "b", len(stack) + depth)
                    if macroname not in self.discards:
                        self.write("-\\n\n)%s\n-\\n\n" % macroname)
                    popping(macroname, "c", len(stack) + depth - 1)
                    del stack[-1]
                #
                # Events for a discarded macro go to a scratch buffer; every
                # path below that does not raise must call pop_output().
                #
                if macroname in self.discards:
                    self.push_output(StringIO.StringIO())
                else:
                    self.push_output(self.ofp)
                #
                params, optional, empty, environ = self.start_macro(macroname)
                if not numbered:
                    self.write("Anumbered TOKEN no\n")
                # rip off the macroname
                if params:
                    if optional and len(params) == 1:
                        line = line[m.end():]
                    else:
                        line = line[m.end(1):]
                elif empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                #
                # Very ugly special case to deal with \item[].  The catch
                # is that this needs to occur outside the for loop that
                # handles attribute parsing so we can 'continue' the outer
                # loop.
                #
                if optional and params and type(params[0]) is type(()):
                    # the attribute name isn't used in this special case
                    pushing(macroname, "a", depth + len(stack))
                    stack.append(macroname)
                    self.write("(%s\n" % macroname)
                    m = _start_optional_rx.match(line)
                    if m:
                        self.line = line[m.end():]
                        line = self.subconvert("]", depth + len(stack))
                    # the synthetic "}" closes the element on the next pass
                    line = "}" + line
                    self.pop_output()
                    continue
                # handle attribute mappings here:
                for attrname in params:
                    if optional:
                        optional = 0
                        if type(attrname) is type(""):
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                self.write("A%s TOKEN %s\n"
                                           % (attrname, encode(m.group(1))))
                    elif type(attrname) is type(()):
                        # This is a sub-element; but don't place the
                        # element we found on the stack (\section-like)
                        pushing(macroname, "b", len(stack) + depth)
                        stack.append(macroname)
                        self.write("(%s\n" % macroname)
                        macroname = attrname[0]
                        m = _start_group_rx.match(line)
                        if m:
                            line = line[m.end():]
                    elif type(attrname) is type([]):
                        # A normal subelement.
                        attrname = attrname[0]
                        if not opened:
                            opened = 1
                            self.write("(%s\n" % macroname)
                            pushing(macroname, "c", len(stack) + depth)
                        self.write("(%s\n" % attrname)
                        pushing(attrname, "sub-elem", len(stack) + depth + 1)
                        # drop the opening "{" and convert up to its "}"
                        self.line = skip_white(line)[1:]
                        line = self.subconvert("}", depth + len(stack) + 2)
                        popping(attrname, "sub-elem", len(stack) + depth + 1)
                        self.write(")%s\n" % attrname)
                    else:
                        m = _parameter_rx.match(line)
                        if not m:
                            raise LaTeXFormatError(
                                "could not extract parameter %s for %s: %s"
                                % (attrname, macroname, repr(line[:100])))
                        value = m.group(1)
                        if _token_rx.match(value):
                            dtype = "TOKEN"
                        else:
                            dtype = "CDATA"
                        self.write("A%s %s %s\n"
                                   % (attrname, dtype, encode(value)))
                        line = line[m.end():]
                if params and type(params[-1]) is type('') \
                   and (not empty) and not environ:
                    # attempt to strip off next '{'
                    m = _start_group_rx.match(line)
                    if not m:
                        raise LaTeXFormatError(
                            "non-empty element '%s' has no content: %s"
                            % (macroname, line[:12]))
                    line = line[m.end():]
                if not opened:
                    self.write("(%s\n" % macroname)
                    pushing(macroname, "d", len(stack) + depth)
                if empty:
                    # the synthetic "}" closes the element on the next pass
                    line = "}" + line
                stack.append(macroname)
                self.pop_output()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unbalanced '}': %s" % repr(line[:100]))
                macroname = stack[-1]
                conversion = self.table.get(macroname)
                if macroname \
                   and macroname not in self.discards \
                   and type(conversion) is not type(""):
                    # otherwise, it was just a bare group
                    self.write(")%s\n" % stack[-1])
                popping(macroname, "d", len(stack) + depth - 1)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "{":
                # bare group; the empty name marks it on the stack
                pushing("", "e", len(stack) + depth)
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and len(line) > 1 \
               and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %s%s"
                                   % (repr(line[:100]), extra))
        while stack and stack[-1] in self.autoclosing:
            self.write("-\\n\n")
            self.write(")%s\n" % stack[-1])
            popping(stack.pop(), "e", len(stack) + depth - 1)
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        if endchar is not None:
            # the callers cannot continue without the remaining input
            raise LaTeXFormatError(
                "input ended while looking for closing %s" % repr(endchar))
        # otherwise we just ran out of input here...

    def convert(self):
        """Convert the complete input."""
        self.subconvert()

    def start_macro(self, name):
        """Look up *name* and return (params, optional, empty, environ).

        Macros missing from the table get ([], 0, 0, 0).  An "e" event is
        written for entries marked empty; entries marked nocontent are
        reported to the caller as empty without that event being written.
        """
        conversion = self.table.get(name, ([], 0, 0, 0, 0))
        params, optional, empty, environ, nocontent = conversion
        if empty:
            self.write("e\n")
        elif nocontent:
            empty = 1
        return params, optional, empty, environ


def convert(ifp, ofp, table=None, discards=(), autoclosing=()):
    """Convert the LaTeX document read from *ifp*, writing events to *ofp*.

    *table*, *discards* and *autoclosing* are passed on to Conversion; an
    omitted *table* is treated as an empty mapping.  An IOError whose errno
    is EPIPE is ignored; any other exception propagates.
    """
    if table is None:
        table = {}
    c = Conversion(ifp, ofp, table, discards, autoclosing)
    try:
        c.convert()
    except IOError:
        # Look at the exception object rather than unpacking it in the
        # except clause: not every IOError carries exactly two arguments.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise


def skip_white(line):
    """Return *line* without its leading whitespace and "%" characters.

    As long as the text starts with a space, tab, newline or "%", that
    character and any whitespace following it are removed.
    """
    while line and line[0] in " %\n\t":
        # str.lstrip() instead of the deprecated string.lstrip() function
        line = line[1:].lstrip()
    return line


def usage():
    """Write a short description of the command line to stderr."""
    sys.stderr.write("usage: %s input-file [output-file]\n" % sys.argv[0])


def main():
    """Command-line entry point.

    With one argument, the named file is converted and the events are
    written to stdout; with two arguments the events are written to the
    second file.  Any other argument count prints a usage message and
    exits with status 2.
    """
    nargs = len(sys.argv)
    if nargs == 2:
        ifp = open(sys.argv[1])
        ofp = sys.stdout
    elif nargs == 3:
        ifp = open(sys.argv[1])
        ofp = open(sys.argv[2], "w")
    else:
        usage()
        sys.exit(2)
    table = {
        # entries have the form:
        # name: ([attribute names], is1stOptional, isEmpty, isEnv, nocontent)
        # attribute names can be:
        #   "string" -- normal attribute
        #   ("string",) -- sub-element with content of macro; like for \section
        #   ["string"] -- sub-element
        "appendix": ([], 0, 1, 0, 0),
        "bifuncindex": (["name"], 0, 1, 0, 0),
        "catcode": ([], 0, 1, 0, 0),
        "cfuncdesc": (["type", "name", ("args",)], 0, 0, 1, 0),
        "chapter": ([("title",)], 0, 0, 0, 0),
        "chapter*": ([("title",)], 0, 0, 0, 0),
        "classdesc": (["name", ("args",)], 0, 0, 1, 0),
        "ctypedesc": (["name"], 0, 0, 1, 0),
        "cvardesc":  (["type", "name"], 0, 0, 1, 0),
        "datadesc":  (["name"], 0, 0, 1, 0),
        "declaremodule": (["id", "type", "name"], 1, 1, 0, 0),
        "deprecated": (["release"], 0, 0, 0, 0),
        "documentclass": (["classname"], 0, 1, 0, 0),
        "excdesc": (["name"], 0, 0, 1, 0),
        "funcdesc": (["name", ("args",)], 0, 0, 1, 0),
        "funcdescni": (["name", ("args",)], 0, 0, 1, 0),
        "funcline": (["name"], 0, 0, 0, 0),
        "funclineni": (["name"], 0, 0, 0, 0),
        "geq": ([], 0, 1, 0, 0),
        "hline": ([], 0, 1, 0, 0),
        "indexii": (["ie1", "ie2"], 0, 1, 0, 0),
        "indexiii": (["ie1", "ie2", "ie3"], 0, 1, 0, 0),
        "indexiv": (["ie1", "ie2", "ie3", "ie4"], 0, 1, 0, 0),
        "indexname": ([], 0, 0, 0, 0),
        "input": (["source"], 0, 1, 0, 0),
        "item": ([("leader",)], 1, 0, 0, 0),
        "label": (["id"], 0, 1, 0, 0),
        "labelwidth": ([], 0, 1, 0, 0),
        "LaTeX": ([], 0, 1, 0, 0),
        "leftmargin": ([], 0, 1, 0, 0),
        "leq": ([], 0, 1, 0, 0),
        "lineii": ([["entry"], ["entry"]], 0, 0, 0, 1),
        "lineiii": ([["entry"], ["entry"], ["entry"]], 0, 0, 0, 1),
        "lineiv": ([["entry"], ["entry"], ["entry"], ["entry"]], 0, 0, 0, 1),
        "localmoduletable": ([], 0, 1, 0, 0),
        "manpage": (["name", "section"], 0, 1, 0, 0),
        "memberdesc": (["class", "name"], 1, 0, 1, 0),
        "methoddesc": (["class", "name", ("args",)], 1, 0, 1, 0),
        "methoddescni": (["class", "name", ("args",)], 1, 0, 1, 0),
        "methodline": (["class", "name"], 1, 0, 0, 0),
        "methodlineni": (["class", "name"], 1, 0, 0, 0),
        "moduleauthor": (["name", "email"], 0, 1, 0, 0),
        "opcodedesc": (["name", "var"], 0, 0, 1, 0),
        "par": ([], 0, 1, 0, 0),
        "paragraph": ([("title",)], 0, 0, 0, 0),
        "renewcommand": (["macro"], 0, 0, 0, 0),
        "rfc": (["num"], 0, 1, 0, 0),
        "section": ([("title",)], 0, 0, 0, 0),
        "sectionauthor": (["name", "email"], 0, 1, 0, 0),
        "seemodule": (["ref", "name"], 1, 0, 0, 0),
        "stindex": (["type"], 0, 1, 0, 0),
        "subparagraph": ([("title",)], 0, 0, 0, 0),
        "subsection": ([("title",)], 0, 0, 0, 0),
        "subsubsection": ([("title",)], 0, 0, 0, 0),
        "list": (["bullet", "init"], 0, 0, 1, 0),
        "tableii": (["colspec", "style",
                     ["entry"], ["entry"]], 0, 0, 1, 0),
        "tableiii": (["colspec", "style",
                      ["entry"], ["entry"], ["entry"]], 0, 0, 1, 0),
        "tableiv": (["colspec", "style",
                     ["entry"], ["entry"], ["entry"], ["entry"]], 0, 0, 1, 0),
        "version": ([], 0, 1, 0, 0),
        "versionadded": (["version"], 0, 1, 0, 0),
        "versionchanged": (["version"], 0, 1, 0, 0),
        "withsubitem": (["text"], 0, 0, 0, 0),
        #
        "ABC": ([], 0, 1, 0, 0),
        "ASCII": ([], 0, 1, 0, 0),
        "C": ([], 0, 1, 0, 0),
        "Cpp": ([], 0, 1, 0, 0),
        "EOF": ([], 0, 1, 0, 0),
        "e": ([], 0, 1, 0, 0),
        "ldots": ([], 0, 1, 0, 0),
        "NULL": ([], 0, 1, 0, 0),
        "POSIX": ([], 0, 1, 0, 0),
        "UNIX": ([], 0, 1, 0, 0),
        #
        # Things that will actually be going away!
        #
        "fi": ([], 0, 1, 0, 0),
        "ifhtml": ([], 0, 1, 0, 0),
        "makeindex": ([], 0, 1, 0, 0),
        "makemodindex": ([], 0, 1, 0, 0),
        "maketitle": ([], 0, 1, 0, 0),
        "noindent": ([], 0, 1, 0, 0),
        "protect": ([], 0, 1, 0, 0),
        "tableofcontents": ([], 0, 1, 0, 0),
        }
    try:
        convert(ifp, ofp, table,
                discards=["fi", "ifhtml", "makeindex", "makemodindex",
                          "maketitle", "noindent", "tableofcontents"],
                autoclosing=["chapter", "section", "subsection",
                             "subsubsection", "paragraph", "subparagraph"])
    finally:
        ifp.close()
        # stdout is not ours to close
        if ofp is not sys.stdout:
            ofp.close()


if __name__ == "__main__":
    main()