summary refs log tree commit diff stats
path: root/Lib/compiler
diff options
context:
space:
mode:
author Jeremy Hylton <jeremy@alum.mit.edu> 2000-02-08 19:01:29 (GMT)
committer Jeremy Hylton <jeremy@alum.mit.edu> 2000-02-08 19:01:29 (GMT)
commit 53187f32ebbb6156fa8e3ddf834dfde74cfa3208 (patch)
tree de9996353e090d4a20420bd89f7d78bbfaf4a465 /Lib/compiler
parent aa9d2d6123a54dc1ed7f782fc935abcfadb54167 (diff)
download cpython-53187f32ebbb6156fa8e3ddf834dfde74cfa3208.zip
cpython-53187f32ebbb6156fa8e3ddf834dfde74cfa3208.tar.gz
cpython-53187f32ebbb6156fa8e3ddf834dfde74cfa3208.tar.bz2
now produces valid pyc files for at least a trivial subset of the
language.

CodeGenerator:
* modify to track stack depth
* add emit method that calls PythonVMCode's makeCodeObject
* thread filenames through in hackish way
* set flags for code objects for modules and functions
  XXX the docs for the flags seem out of date and/or incomplete

PythonVMCode:
* add doc string describing the elements of a real code object

LineAddrTable:
* creates an lnotab (not quite correctly though)
Diffstat (limited to 'Lib/compiler')
-rw-r--r-- Lib/compiler/pycodegen.py 386
1 files changed, 306 insertions, 80 deletions
diff --git a/Lib/compiler/pycodegen.py b/Lib/compiler/pycodegen.py
index 067ebc4..ee0c75b 100644
--- a/Lib/compiler/pycodegen.py
+++ b/Lib/compiler/pycodegen.py
@@ -11,6 +11,10 @@ import misc
import marshal
import new
import string
+import sys
+import os
+import stat
+import struct
def parse(path):
f = open(path)
@@ -60,7 +64,7 @@ class ASTVisitor:
XXX Perhaps I can use a postorder walk for the code generator?
"""
- VERBOSE = 0
+ VERBOSE = 1
def __init__(self):
self.node = None
@@ -101,9 +105,34 @@ class ASTVisitor:
return meth(node)
class CodeGenerator:
- def __init__(self):
- self.code = PythonVMCode()
+ def __init__(self, filename=None):
+ self.filename = filename
+ self.code = PythonVMCode(filename=filename)
+ self.code.setFlags(0)
self.locals = misc.Stack()
+ # track the current and max stack size
+ # XXX does this belong here or in the PythonVMCode?
+ self.curStack = 0
+ self.maxStack = 0
+
+ def emit(self):
+ """Create a Python code object
+
+ XXX It is confusing that this method isn't related to the
+ method named emit in the PythonVMCode.
+ """
+ return self.code.makeCodeObject(self.maxStack)
+
+ def push(self, n):
+ self.curStack = self.curStack + n
+ if self.curStack > self.maxStack:
+ self.maxStack = self.curStack
+
+ def pop(self, n):
+ if n >= self.curStack:
+ self.curStack = self.curStack - n
+ else:
+ self.curStack = 0
def visitDiscard(self, node):
return 1
@@ -112,16 +141,16 @@ class CodeGenerator:
lnf = walk(node.node, LocalNameFinder())
self.locals.push(lnf.getLocals())
self.visit(node.node)
- self.code.emit('LOAD_CONST', 'None')
+ self.code.emit('LOAD_CONST', None)
self.code.emit('RETURN_VALUE')
return 1
def visitFunction(self, node):
- codeBody = NestedCodeGenerator(node.code, node.argnames)
- walk(node.code, codeBody)
+ codeBody = NestedCodeGenerator(node, filename=self.filename)
+ walk(node, codeBody)
self.code.setLineNo(node.lineno)
- self.code.emit('LOAD_CONST', codeBody.code)
- self.code.emit('MAKE_FUNCTION')
+ self.code.emit('LOAD_CONST', codeBody)
+ self.code.emit('MAKE_FUNCTION', 0)
self.code.emit('STORE_NAME', node.name)
return 1
@@ -212,6 +241,7 @@ class CodeGenerator:
self.visit(node.left)
self.visit(node.right)
self.code.emit(op)
+ self.pop(1)
return 1
def visitAdd(self, node):
@@ -232,9 +262,11 @@ class CodeGenerator:
self.code.loadFast(node.name)
else:
self.code.loadGlobal(node.name)
+ self.push(1)
def visitConst(self, node):
self.code.loadConst(node.value)
+ self.push(1)
def visitReturn(self, node):
self.code.setLineNo(node.lineno)
@@ -262,6 +294,7 @@ class CodeGenerator:
for child in node.nodes:
self.visit(child)
self.code.emit('PRINT_ITEM')
+ self.pop(len(node.nodes))
return 1
def visitPrintnl(self, node):
@@ -276,26 +309,38 @@ class NestedCodeGenerator(CodeGenerator):
"""
super_init = CodeGenerator.__init__
- def __init__(self, code, args):
+ def __init__(self, func, filename='<?>'):
"""code and args of function or class being walked
XXX need to separately pass to ASTVisitor. the constructor
only uses the code object to find the local names
+
+ Copies code form parent __init__ rather than calling it.
"""
- self.super_init()
- lnf = walk(code, LocalNameFinder(args))
+ self.name = func.name
+ self.super_init(filename)
+ args = func.argnames
+ self.code = PythonVMCode(len(args), name=func.name,
+ filename=filename)
+ if func.varargs:
+ self.code.setVarArgs()
+ if func.kwargs:
+ self.code.setKWArgs()
+ lnf = walk(func.code, LocalNameFinder(args))
self.locals.push(lnf.getLocals())
+ def __repr__(self):
+ return "<NestedCodeGenerator: %s>" % self.name
+
def visitFunction(self, node):
lnf = walk(node.code, LocalNameFinder(node.argnames))
self.locals.push(lnf.getLocals())
# XXX need to handle def foo((a, b)):
self.code.setLineNo(node.lineno)
self.visit(node.code)
- self.code.emit('LOAD_CONST', 'None')
+ self.code.emit('LOAD_CONST', None)
self.code.emit('RETURN_VALUE')
return 1
-
class LocalNameFinder:
def __init__(self, names=()):
@@ -353,64 +398,86 @@ class ForwardRef:
def resolve(self):
return self.val
-class CompiledModule:
- """Store the code object for a compiled module
+def add_hook(hooks, type, meth):
+ """Helper function for PythonVMCode _emit_hooks"""
+ l = hooks.get(type, [])
+ l.append(meth)
+ hooks[type] = l
- XXX Not clear how the code objects will be stored. Seems possible
- that a single code attribute is sufficient, because it will
- contains references to all the need code objects. That might be
- messy, though.
+class PythonVMCode:
+ """Creates Python code objects
+
+ The new module is used to create the code object. The following
+ attribute definitions are included from the reference manual:
+
+ co_name gives the function name
+ co_argcount is the number of positional arguments (including
+ arguments with default values)
+ co_nlocals is the number of local variables used by the function
+ (including arguments)
+ co_varnames is a tuple containing the names of the local variables
+ (starting with the argument names)
+ co_code is a string representing the sequence of bytecode instructions
+ co_consts is a tuple containing the literals used by the bytecode
+ co_names is a tuple containing the names used by the bytecode
+ co_filename is the filename from which the code was compiled
+ co_firstlineno is the first line number of the function
+ co_lnotab is a string encoding the mapping from byte code offsets
+ to line numbers (for detais see the source code of the
+ interpreter)
+ see code com_set_lineno and com_add_lnotab
+ it's a string with 2bytes per set_lineno
+
+ co_stacksize is the required stack size (including local variables)
+ co_flags is an integer encoding a number of flags for the
+ interpreter.
+
+ The following flag bits are defined for co_flags: bit 2 is set if
+ the function uses the "*arguments" syntax to accept an arbitrary
+ number of positional arguments; bit 3 is set if the function uses
+ the "**keywords" syntax to accept arbitrary keyword arguments;
+ other bits are used internally or reserved for future use.
+
+ If a code object represents a function, the first item in
+ co_consts is the documentation string of the function, or None if
+ undefined.
"""
- MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24))
-
- def __init__(self):
- self.code = None
- def addCode(self, code):
- """addCode(self: SelfType, code: PythonVMCode)"""
-
- def dump(self, path):
- """create a .pyc file"""
- f = open(path, 'wb')
- f.write(self._pyc_header())
- marshal.dump(self.code, f)
- f.close()
-
- def _pyc_header(self, path):
- # compile.c uses marshal to write a long directly, with
- # calling the interface that would also generate a 1-byte code
- # to indicate the type of the value. simplest way to get the
- # same effect is to call marshal and then skip the code.
- buf = marshal.dumps(self.MAGIC)[1:]
- # skip the mtime for now, since I don't have the write
- # structure to pass the filename being compiled into this
- # instance
- return buf + chr(0) * 4
-
-class PythonVMCode:
+ # XXX flag bits
+ VARARGS = 0x04
+ KWARGS = 0x08
- def __init__(self):
+ def __init__(self, argcount=0, name='?', filename='<?>',
+ docstring=None):
+ # XXX why is the default value for flags 3?
self.insts = []
# used by makeCodeObject
- self.argcount = 0
+ self.argcount = argcount
self.code = ''
- self.consts = []
- self.filename = ''
- self.firstlineno = 0
- self.flags = 0
- self.lnotab = None
- self.name = ''
+ self.consts = [docstring]
+ self.filename = filename
+ self.flags = 3
+ self.name = name
self.names = []
- self.nlocals = 0
- self.stacksize = 2
self.varnames = []
+ # lnotab support
+ self.firstlineno = 0
+ self.lastlineno = 0
+ self.last_addr = 0
+ self.lnotab = ''
def __repr__(self):
return "<bytecode: %d instrs>" % len(self.insts)
- def emit(self, *args):
- print "emit", args
- self.insts.append(args)
+ def setFlags(self, val):
+ """XXX for module's function"""
+ self.flags = 0
+
+ def setVarArgs(self):
+ self.flags = self.flags | self.VARARGS
+
+ def setKWArgs(self):
+ self.flags = self.flags | self.KWARGS
def getCurInst(self):
return len(self.insts)
@@ -418,23 +485,70 @@ class PythonVMCode:
def getNextInst(self):
return len(self.insts) + 1
- def convert(self):
- """Convert human-readable names to real bytecode"""
- pass
+ def dump(self, io=sys.stdout):
+ i = 0
+ for inst in self.insts:
+ if inst[0] == 'SET_LINENO':
+ io.write("\n")
+ io.write(" %3d " % i)
+ if len(inst) == 1:
+ io.write("%s\n" % inst)
+ else:
+ io.write("%-15.15s\t%s\n" % inst)
+ i = i + 1
+
+ def makeCodeObject(self, stacksize):
+ """Make a Python code object
+
+ This creates a Python code object using the new module. This
+ seems simpler than reverse-engineering the way marshal dumps
+ code objects into .pyc files. One of the key difficulties is
+ figuring out how to layout references to code objects that
+ appear on the VM stack; e.g.
+ 3 SET_LINENO 1
+ 6 LOAD_CONST 0 (<code object fact at 8115878 [...]
+ 9 MAKE_FUNCTION 0
+ 12 STORE_NAME 0 (fact)
- def makeCodeObject(self):
- """Make a Python code object"""
- code = []
+ """
+
self._findOffsets()
+ lnotab = LineAddrTable()
for t in self.insts:
opname = t[0]
if len(t) == 1:
- code.append(chr(self.opnum[opname]))
+ lnotab.addCode(chr(self.opnum[opname]))
elif len(t) == 2:
oparg = self._convertArg(opname, t[1])
+ if opname == 'SET_LINENO':
+ lnotab.nextLine(oparg)
hi, lo = divmod(oparg, 256)
- code.append(chr(self.opnum[opname]) + chr(lo) + chr(hi))
- return string.join(code, '')
+ lnotab.addCode(chr(self.opnum[opname]) + chr(lo) +
+ chr(hi))
+ # why is a module a special case?
+ if self.flags == 0:
+ nlocals = 0
+ else:
+ nlocals = len(self.varnames)
+ co = new.code(self.argcount, nlocals, stacksize,
+ self.flags, lnotab.getCode(), self._getConsts(),
+ tuple(self.names), tuple(self.varnames),
+ self.filename, self.name, self.firstlineno,
+ lnotab.getTable())
+ return co
+
+ def _getConsts(self):
+ """Return a tuple for the const slot of a code object
+
+ Converts PythonVMCode objects to code objects
+ """
+ l = []
+ for elt in self.consts:
+ if isinstance(elt, CodeGenerator):
+ l.append(elt.emit())
+ else:
+ l.append(elt)
+ return tuple(l)
def _findOffsets(self):
"""Find offsets for use in resolving ForwardRefs"""
@@ -464,7 +578,10 @@ class PythonVMCode:
if op == 'LOAD_CONST':
return self._lookupName(arg, self.consts)
if op == 'LOAD_FAST':
- return self._lookupName(arg, self.varnames, self.names)
+ if arg in self.names:
+ return self._lookupName(arg, self.varnames)
+ else:
+ return self._lookupName(arg, self.varnames, self.names)
if op == 'LOAD_GLOBAL':
return self._lookupName(arg, self.names)
if op == 'STORE_NAME':
@@ -475,7 +592,6 @@ class PythonVMCode:
return self.offsets[arg.resolve()]
if self.hasjabs.has_elt(op):
return self.offsets[arg.resolve()] - arg.__offset
- print op, arg
return arg
def _lookupName(self, name, list, list2=None):
@@ -511,6 +627,11 @@ class PythonVMCode:
# it seems redundant to add a function for each opcode,
# particularly because the method and opcode basically have the
# same name.
+ # on the other hand, we need to track things like stack depth in
+ # order to generator code objects. if we wrap instructions in a
+ # method, we get an easy way to track these. a simpler
+ # approach, however, would be to define hooks that can be called
+ # by emit.
def setLineNo(self, num):
self.emit('SET_LINENO', num)
@@ -557,15 +678,120 @@ class PythonVMCode:
def callFunction(self, num):
self.emit('CALL_FUNCTION', num)
+ # this version of emit + arbitrary hooks might work, but it's damn
+ # messy.
+
+ def emit(self, *args):
+ self._emitDispatch(args[0], args[1:])
+ self.insts.append(args)
+
+ def _emitDispatch(self, type, args):
+ for func in self._emit_hooks.get(type, []):
+ func(self, args)
+
+ _emit_hooks = {}
+
+class LineAddrTable:
+ """lnotab
+
+ This class builds the lnotab, which is undocumented but described
+ by com_set_lineno in compile.c. Here's an attempt at explanation:
+
+ For each SET_LINENO instruction after the first one, two bytes are
+ added to lnotab. (In some cases, multiple two-byte entries are
+ added.) The first byte is the distance in bytes between the
+ instruction for the last SET_LINENO and the current SET_LINENO.
+ The second byte is offset in line numbers. If either offset is
+ greater than 255, multiple two-byte entries are added -- one entry
+ for each factor of 255.
+ """
+
+ def __init__(self):
+ self.code = []
+ self.codeOffset = 0
+ self.firstline = 0
+ self.lastline = 0
+ self.lastoff = 0
+ self.lnotab = []
+
+ def addCode(self, code):
+ self.code.append(code)
+ self.codeOffset = self.codeOffset + len(code)
+
+ def nextLine(self, lineno):
+ if self.firstline == 0:
+ self.firstline = lineno
+ self.lastline = lineno
+ else:
+ # compute deltas
+ addr = self.codeOffset - self.lastoff
+ line = lineno - self.lastline
+ while addr > 0 or line > 0:
+ # write the values in 1-byte chunks that sum
+ # to desired value
+ trunc_addr = addr
+ trunc_line = line
+ if trunc_addr > 255:
+ trunc_addr = 255
+ if trunc_line > 255:
+ trunc_line = 255
+ self.lnotab.append(trunc_addr)
+ self.lnotab.append(trunc_line)
+ addr = addr - trunc_addr
+ line = line - trunc_line
+ self.lastline = lineno
+ self.lastoff = self.codeOffset
+
+ def getCode(self):
+ return string.join(self.code, '')
+
+ def getTable(self):
+ return string.join(map(chr, self.lnotab), '')
+
+class CompiledModule:
+ """Store the code object for a compiled module
+
+ XXX Not clear how the code objects will be stored. Seems possible
+ that a single code attribute is sufficient, because it will
+ contains references to all the need code objects. That might be
+ messy, though.
+ """
+ MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24))
+
+ def __init__(self, source, filename):
+ self.source = source
+ self.filename = filename
+
+ def compile(self):
+ t = transformer.Transformer()
+ self.ast = t.parsesuite(self.source)
+ cg = CodeGenerator(self.filename)
+ walk(self.ast, cg)
+ self.code = cg.emit()
+
+ def dump(self, path):
+ """create a .pyc file"""
+ f = open(path, 'wb')
+ f.write(self._pyc_header())
+ marshal.dump(self.code, f)
+ f.close()
+
+ def _pyc_header(self):
+ # compile.c uses marshal to write a long directly, with
+ # calling the interface that would also generate a 1-byte code
+ # to indicate the type of the value. simplest way to get the
+ # same effect is to call marshal and then skip the code.
+ magic = marshal.dumps(self.MAGIC)[1:]
+ mtime = os.stat(self.filename)[stat.ST_MTIME]
+ mtime = struct.pack('i', mtime)
+ return magic + mtime
+
if __name__ == "__main__":
- tree = parse('test.py')
- cg = CodeGenerator()
- ASTVisitor.VERBOSE = 1
- w = walk(tree, cg)
- w.VERBOSE = 1
- for i in range(len(cg.code.insts)):
- inst = cg.code.insts[i]
- if inst[0] == 'SET_LINENO':
- print
- print "%4d" % i, inst
- code = cg.code.makeCodeObject()
+ if len(sys.argv) > 1:
+ filename = sys.argv[1]
+ else:
+ filename = 'test.py'
+ buf = open(filename).read()
+ mod = CompiledModule(buf, filename)
+ mod.compile()
+ mod.dump(filename + 'c')