summary refs log tree commit diff stats
path: root/Lib/compiler/pyassem.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/compiler/pyassem.py')
-rw-r--r-- Lib/compiler/pyassem.py 624
1 files changed, 341 insertions, 283 deletions
diff --git a/Lib/compiler/pyassem.py b/Lib/compiler/pyassem.py
index 4cb910c..3272419 100644
--- a/Lib/compiler/pyassem.py
+++ b/Lib/compiler/pyassem.py
@@ -1,40 +1,127 @@
-"""Assembler for Python bytecode
-
-The new module is used to create the code object. The following
-attribute definitions are included from the reference manual:
-
-co_name gives the function name
-co_argcount is the number of positional arguments (including
- arguments with default values)
-co_nlocals is the number of local variables used by the function
- (including arguments)
-co_varnames is a tuple containing the names of the local variables
- (starting with the argument names)
-co_code is a string representing the sequence of bytecode instructions
-co_consts is a tuple containing the literals used by the bytecode
-co_names is a tuple containing the names used by the bytecode
-co_filename is the filename from which the code was compiled
-co_firstlineno is the first line number of the function
-co_lnotab is a string encoding the mapping from byte code offsets
- to line numbers. see LineAddrTable below.
-co_stacksize is the required stack size (including local variables)
-co_flags is an integer encoding a number of flags for the
- interpreter. There are four flags:
- CO_OPTIMIZED -- uses load fast
- CO_NEWLOCALS -- everything?
- CO_VARARGS -- use *args
- CO_VARKEYWORDS -- uses **args
-
-If a code object represents a function, the first item in co_consts is
-the documentation string of the function, or None if undefined.
-"""
-
-import sys
+"""A flow graph representation for Python bytecode"""
+
import dis
import new
import string
+import types
+
+from compiler import misc
+
+class FlowGraph:
+ def __init__(self):
+ self.current = self.entry = Block()
+ self.exit = Block("exit")
+ self.blocks = misc.Set()
+ self.blocks.add(self.entry)
+ self.blocks.add(self.exit)
+
+ def startBlock(self, block):
+ self.current = block
+
+ def nextBlock(self, block=None):
+ if block is None:
+ block = self.newBlock()
+ # XXX think we need to specify when there is implicit transfer
+ # from one block to the next
+ #
+ # I think this strategy works: each block has a child
+ # designated as "next" which is returned as the last of the
+ # children. because the nodes in a graph are emitted in
+ # reverse post order, the "next" block will always be emitted
+ # immediately after its parent.
+ # Worry: maintaining this invariant could be tricky
+ self.current.addNext(block)
+ self.startBlock(block)
+
+ def newBlock(self):
+ b = Block()
+ self.blocks.add(b)
+ return b
+
+ def startExitBlock(self):
+ self.startBlock(self.exit)
+
+ def emit(self, *inst):
+ # XXX should jump instructions implicitly call nextBlock?
+ if inst[0] == 'RETURN_VALUE':
+ self.current.addOutEdge(self.exit)
+ self.current.emit(inst)
+
+ def getBlocks(self):
+ """Return the blocks in reverse postorder
+
+ i.e. each node appears before all of its successors (loop back edges excepted)
+ """
+ # XXX make sure every node that doesn't have an explicit next
+ # is set so that next points to exit
+ for b in self.blocks.elements():
+ if b is self.exit:
+ continue
+ if not b.next:
+ b.addNext(self.exit)
+ order = dfs_postorder(self.entry, {})
+ order.reverse()
+ # hack alert
+ if not self.exit in order:
+ order.append(self.exit)
+ return order
+
+def dfs_postorder(b, seen):
+ """Depth-first search of the graph reachable from b, return blocks in postorder"""
+ order = []
+ seen[b] = b
+ for c in b.children():
+ if seen.has_key(c):
+ continue
+ order = order + dfs_postorder(c, seen)
+ order.append(b)
+ return order
+
+class Block:
+ _count = 0
+
+ def __init__(self, label=''):
+ self.insts = []
+ self.inEdges = misc.Set()
+ self.outEdges = misc.Set()
+ self.label = label
+ self.bid = Block._count
+ self.next = []
+ Block._count = Block._count + 1
+
+ def __repr__(self):
+ if self.label:
+ return "<block %s id=%d len=%d>" % (self.label, self.bid,
+ len(self.insts))
+ else:
+ return "<block id=%d len=%d>" % (self.bid, len(self.insts))
+
+ def __str__(self):
+ insts = map(str, self.insts)
+ return "<block %s %d:\n%s>" % (self.label, self.bid,
+ string.join(insts, '\n'))
+
+ def emit(self, inst):
+ op = inst[0]
+ if op[:4] == 'JUMP':
+ self.outEdges.add(inst[1])
+ self.insts.append(inst)
+
+ def getInstructions(self):
+ return self.insts
+
+ def addInEdge(self, block):
+ self.inEdges.add(block)
+
+ def addOutEdge(self, block):
+ self.outEdges.add(block)
+
+ def addNext(self, block):
+ self.next.append(block)
+ assert len(self.next) == 1, map(str, self.next)
-import misc
+ def children(self):
+ return self.outEdges.elements() + self.next
# flags for code objects
CO_OPTIMIZED = 0x0001
@@ -42,224 +129,128 @@ CO_NEWLOCALS = 0x0002
CO_VARARGS = 0x0004
CO_VARKEYWORDS = 0x0008
-class TupleArg:
- def __init__(self, count, names):
- self.count = count
- self.names = names
- def __repr__(self):
- return "TupleArg(%s, %s)" % (self.count, self.names)
- def getName(self):
- return ".nested%d" % self.count
-
-class PyAssembler:
- """Creates Python code objects
- """
-
- # XXX this class needs major refactoring
-
- def __init__(self, args=(), name='?', filename='<?>',
- docstring=None):
- # XXX why is the default value for flags 3?
- self.insts = []
- # used by makeCodeObject
- self._getArgCount(args)
- self.code = ''
- self.consts = [docstring]
- self.filename = filename
- self.flags = CO_NEWLOCALS
- self.name = name
- self.names = []
+# the FlowGraph is transformed in place; it exists in one of these states
+RAW = "RAW"
+FLAT = "FLAT"
+CONV = "CONV"
+DONE = "DONE"
+
+class PyFlowGraph(FlowGraph):
+ super_init = FlowGraph.__init__
+
+ def __init__(self, name, filename, args=(), optimized=0):
+ self.super_init()
+ self.name = name
+ self.filename = filename
+ self.docstring = None
+ self.args = args # XXX
+ self.argcount = getArgCount(args)
+ if optimized:
+ self.flags = CO_OPTIMIZED | CO_NEWLOCALS
+ else:
+ self.flags = 0
+ self.firstlineno = None
+ self.consts = []
+ self.names = []
self.varnames = list(args) or []
for i in range(len(self.varnames)):
var = self.varnames[i]
if isinstance(var, TupleArg):
self.varnames[i] = var.getName()
- # lnotab support
- self.firstlineno = 0
- self.lastlineno = 0
- self.last_addr = 0
- self.lnotab = ''
-
- def _getArgCount(self, args):
- self.argcount = len(args)
- if args:
- for arg in args:
- if isinstance(arg, TupleArg):
- numNames = len(misc.flatten(arg.names))
- self.argcount = self.argcount - numNames
+ self.stage = RAW
- def __repr__(self):
- return "<bytecode: %d instrs>" % len(self.insts)
-
- def setFlags(self, val):
- """XXX for module's function"""
- self.flags = val
-
- def setOptimized(self):
- self.flags = self.flags | CO_OPTIMIZED
-
- def setVarArgs(self):
- if not self.flags & CO_VARARGS:
- self.flags = self.flags | CO_VARARGS
- self.argcount = self.argcount - 1
-
- def setKWArgs(self):
- self.flags = self.flags | CO_VARKEYWORDS
-
- def getCurInst(self):
- return len(self.insts)
+ def setDocstring(self, doc):
+ self.docstring = doc
+ self.consts.insert(0, doc)
- def getNextInst(self):
- return len(self.insts) + 1
+ def setFlag(self, flag):
+ self.flags = self.flags | flag
+ if flag == CO_VARARGS:
+ self.argcount = self.argcount - 1
- def dump(self, io=sys.stdout):
- i = 0
- for inst in self.insts:
- if inst[0] == 'SET_LINENO':
- io.write("\n")
- io.write(" %3d " % i)
- if len(inst) == 1:
- io.write("%s\n" % inst)
- else:
- io.write("%-15.15s\t%s\n" % inst)
- i = i + 1
-
- def makeCodeObject(self):
- """Make a Python code object
-
- This creates a Python code object using the new module. This
- seems simpler than reverse-engineering the way marshal dumps
- code objects into .pyc files. One of the key difficulties is
- figuring out how to layout references to code objects that
- appear on the VM stack; e.g.
- 3 SET_LINENO 1
- 6 LOAD_CONST 0 (<code object fact at 8115878 [...]
- 9 MAKE_FUNCTION 0
- 12 STORE_NAME 0 (fact)
- """
-
- self._findOffsets()
- lnotab = LineAddrTable()
+ def getCode(self):
+ """Get a Python code object"""
+ if self.stage == RAW:
+ self.flattenGraph()
+ if self.stage == FLAT:
+ self.convertArgs()
+ if self.stage == CONV:
+ self.makeByteCode()
+ if self.stage == DONE:
+ return self.newCodeObject()
+ raise RuntimeError, "inconsistent PyFlowGraph state"
+
+ def dump(self, io=None):
+ if io:
+ save = sys.stdout
+ sys.stdout = io
+ pc = 0
for t in self.insts:
opname = t[0]
+ if opname == "SET_LINENO":
+ print
if len(t) == 1:
- lnotab.addCode(self.opnum[opname])
- elif len(t) == 2:
- if opname == 'SET_LINENO':
- oparg = t[1]
- lnotab.nextLine(oparg)
+ print "\t", "%3d" % pc, opname
+ pc = pc + 1
+ else:
+ print "\t", "%3d" % pc, opname, t[1]
+ pc = pc + 3
+ if io:
+ sys.stdout = save
+
+ def flattenGraph(self):
+ """Arrange the blocks in order and resolve jumps"""
+ assert self.stage == RAW
+ self.insts = insts = []
+ pc = 0
+ begin = {}
+ end = {}
+ for b in self.getBlocks():
+ begin[b] = pc
+ for inst in b.getInstructions():
+ insts.append(inst)
+ if len(inst) == 1:
+ pc = pc + 1
else:
- oparg = self._convertArg(opname, t[1])
- try:
- hi, lo = divmod(oparg, 256)
- except TypeError:
- raise TypeError, "untranslated arg: %s, %s" % (opname, oparg)
- lnotab.addCode(self.opnum[opname], lo, hi)
-
- # why is a module a special case?
- if self.flags == 0:
- nlocals = 0
- else:
- nlocals = len(self.varnames)
- # XXX danger! can't pass through here twice
- if self.flags & CO_VARKEYWORDS:
- self.argcount = self.argcount - 1
- stacksize = findDepth(self.insts)
- try:
- co = new.code(self.argcount, nlocals, stacksize,
- self.flags, lnotab.getCode(), self._getConsts(),
- tuple(self.names), tuple(self.varnames),
- self.filename, self.name, self.firstlineno,
- lnotab.getTable())
- except SystemError, err:
- print err
- print repr(self.argcount)
- print repr(nlocals)
- print repr(stacksize)
- print repr(self.flags)
- print repr(lnotab.getCode())
- print repr(self._getConsts())
- print repr(self.names)
- print repr(self.varnames)
- print repr(self.filename)
- print repr(self.name)
- print repr(self.firstlineno)
- print repr(lnotab.getTable())
- raise
- return co
-
- def _getConsts(self):
- """Return a tuple for the const slot of a code object
-
- Converts PythonVMCode objects to code objects
- """
- l = []
- for elt in self.consts:
- # XXX might be clearer to just ask isinstance(CodeGen)
- if hasattr(elt, 'asConst'):
- l.append(elt.asConst())
+ # arg takes 2 bytes
+ pc = pc + 3
+ end[b] = pc
+ pc = 0
+ for i in range(len(insts)):
+ inst = insts[i]
+ if len(inst) == 1:
+ pc = pc + 1
else:
- l.append(elt)
- return tuple(l)
+ pc = pc + 3
+ opname = inst[0]
+ if self.hasjrel.has_elt(opname):
+ oparg = inst[1]
+ offset = begin[oparg] - pc
+ insts[i] = opname, offset
+ elif self.hasjabs.has_elt(opname):
+ insts[i] = opname, begin[inst[1]]
+ self.stacksize = findDepth(self.insts)
+ self.stage = FLAT
- def _findOffsets(self):
- """Find offsets for use in resolving StackRefs"""
- self.offsets = []
- cur = 0
- for t in self.insts:
- self.offsets.append(cur)
- l = len(t)
- if l == 1:
- cur = cur + 1
- elif l == 2:
- cur = cur + 3
- arg = t[1]
- # XXX this is a total hack: for a reference used
- # multiple times, we create a list of offsets and
- # expect that when we pass through the code again
- # to actually generate the offsets, we'll pass in the
- # same order.
- if isinstance(arg, StackRef):
- try:
- arg.__offset.append(cur)
- except AttributeError:
- arg.__offset = [cur]
-
- def _convertArg(self, op, arg):
- """Convert the string representation of an arg to a number
-
- The specific handling depends on the opcode.
-
- XXX This first implementation isn't going to be very
- efficient.
- """
- if op == 'SET_LINENO':
- return arg
- if op == 'LOAD_CONST':
- return self._lookupName(arg, self.consts)
- if op in self.localOps:
- # make sure it's in self.names, but use the bytecode offset
- self._lookupName(arg, self.names)
- return self._lookupName(arg, self.varnames)
- if op in self.globalOps:
- return self._lookupName(arg, self.names)
- if op in self.nameOps:
- return self._lookupName(arg, self.names)
- if op == 'COMPARE_OP':
- return self.cmp_op.index(arg)
- if self.hasjrel.has_elt(op):
- offset = arg.__offset[0]
- del arg.__offset[0]
- return self.offsets[arg.resolve()] - offset
- if self.hasjabs.has_elt(op):
- return self.offsets[arg.resolve()]
- return arg
-
- nameOps = ('STORE_NAME', 'IMPORT_NAME', 'IMPORT_FROM',
- 'STORE_ATTR', 'LOAD_ATTR', 'LOAD_NAME', 'DELETE_NAME',
- 'DELETE_ATTR')
- localOps = ('LOAD_FAST', 'STORE_FAST', 'DELETE_FAST')
- globalOps = ('LOAD_GLOBAL', 'STORE_GLOBAL', 'DELETE_GLOBAL')
+ hasjrel = misc.Set()
+ for i in dis.hasjrel:
+ hasjrel.add(dis.opname[i])
+ hasjabs = misc.Set()
+ for i in dis.hasjabs:
+ hasjabs.add(dis.opname[i])
+
+ def convertArgs(self):
+ """Convert arguments from symbolic to concrete form"""
+ assert self.stage == FLAT
+ for i in range(len(self.insts)):
+ t = self.insts[i]
+ if len(t) == 2:
+ opname = t[0]
+ oparg = t[1]
+ conv = self._converters.get(opname, None)
+ if conv:
+ self.insts[i] = opname, conv(self, oparg)
+ self.stage = CONV
def _lookupName(self, name, list):
"""Return index of name in list, appending if necessary"""
@@ -276,32 +267,124 @@ class PyAssembler:
list.append(name)
return end
- # Convert some stuff from the dis module for local use
-
- cmp_op = list(dis.cmp_op)
- hasjrel = misc.Set()
- for i in dis.hasjrel:
- hasjrel.add(dis.opname[i])
- hasjabs = misc.Set()
- for i in dis.hasjabs:
- hasjabs.add(dis.opname[i])
-
+ _converters = {}
+ def _convert_LOAD_CONST(self, arg):
+ return self._lookupName(arg, self.consts)
+
+ def _convert_LOAD_FAST(self, arg):
+ self._lookupName(arg, self.names)
+ return self._lookupName(arg, self.varnames)
+ _convert_STORE_FAST = _convert_LOAD_FAST
+ _convert_DELETE_FAST = _convert_LOAD_FAST
+
+ def _convert_NAME(self, arg):
+ return self._lookupName(arg, self.names)
+ _convert_LOAD_NAME = _convert_NAME
+ _convert_STORE_NAME = _convert_NAME
+ _convert_DELETE_NAME = _convert_NAME
+ _convert_IMPORT_NAME = _convert_NAME
+ _convert_IMPORT_FROM = _convert_NAME
+ _convert_STORE_ATTR = _convert_NAME
+ _convert_LOAD_ATTR = _convert_NAME
+ _convert_DELETE_ATTR = _convert_NAME
+ _convert_LOAD_GLOBAL = _convert_NAME
+ _convert_STORE_GLOBAL = _convert_NAME
+ _convert_DELETE_GLOBAL = _convert_NAME
+
+ _cmp = list(dis.cmp_op)
+ def _convert_COMPARE_OP(self, arg):
+ return self._cmp.index(arg)
+
+ # similarly for other opcodes...
+
+ for name, obj in locals().items():
+ if name[:9] == "_convert_":
+ opname = name[9:]
+ _converters[opname] = obj
+ del name, obj, opname
+
+ def makeByteCode(self):
+ assert self.stage == CONV
+ self.lnotab = lnotab = LineAddrTable()
+ for t in self.insts:
+ opname = t[0]
+ if len(t) == 1:
+ lnotab.addCode(self.opnum[opname])
+ else:
+ oparg = t[1]
+ if opname == "SET_LINENO":
+ lnotab.nextLine(oparg)
+ if self.firstlineno is None:
+ self.firstlineno = oparg
+ hi, lo = twobyte(oparg)
+ try:
+ lnotab.addCode(self.opnum[opname], lo, hi)
+ except ValueError:
+ print opname, oparg
+ print self.opnum[opname], lo, hi
+ raise
+ self.stage = DONE
+
opnum = {}
for num in range(len(dis.opname)):
opnum[dis.opname[num]] = num
+ del num
- # this version of emit + arbitrary hooks might work, but it's damn
- # messy.
+ def newCodeObject(self):
+ assert self.stage == DONE
+ if self.flags == 0:
+ nlocals = 0
+ else:
+ nlocals = len(self.varnames)
+ argcount = self.argcount
+ if self.flags & CO_VARKEYWORDS:
+ argcount = argcount - 1
+ return new.code(argcount, nlocals, self.stacksize, self.flags,
+ self.lnotab.getCode(), self.getConsts(),
+ tuple(self.names), tuple(self.varnames),
+ self.filename, self.name, self.firstlineno,
+ self.lnotab.getTable())
+
+ def getConsts(self):
+ """Return a tuple for the const slot of the code object
+
+ Must convert references to code (MAKE_FUNCTION) to code
+ objects recursively.
+ """
+ l = []
+ for elt in self.consts:
+ if isinstance(elt, PyFlowGraph):
+ elt = elt.getCode()
+ l.append(elt)
+ return tuple(l)
+
+def isJump(opname):
+ if opname[:4] == 'JUMP':
+ return 1
- def emit(self, *args):
- self._emitDispatch(args[0], args[1:])
- self.insts.append(args)
+class TupleArg:
+ """Helper for marking func defs with nested tuples in arglist"""
+ def __init__(self, count, names):
+ self.count = count
+ self.names = names
+ def __repr__(self):
+ return "TupleArg(%s, %s)" % (self.count, self.names)
+ def getName(self):
+ return ".nested%d" % self.count
- def _emitDispatch(self, type, args):
- for func in self._emit_hooks.get(type, []):
- func(self, args)
+def getArgCount(args):
+ argcount = len(args)
+ if args:
+ for arg in args:
+ if isinstance(arg, TupleArg):
+ numNames = len(misc.flatten(arg.names))
+ argcount = argcount - numNames
+ return argcount
- _emit_hooks = {}
+def twobyte(val):
+ """Convert an int argument into high and low bytes"""
+ assert type(val) == types.IntType
+ return divmod(val, 256)
class LineAddrTable:
"""lnotab
@@ -361,34 +444,9 @@ class LineAddrTable:
def getTable(self):
return string.join(map(chr, self.lnotab), '')
-class StackRef:
- """Manage stack locations for jumps, loops, etc."""
- count = 0
-
- def __init__(self, id=None, val=None):
- if id is None:
- id = StackRef.count
- StackRef.count = StackRef.count + 1
- self.id = id
- self.val = val
-
- def __repr__(self):
- if self.val:
- return "StackRef(val=%d)" % self.val
- else:
- return "StackRef(id=%d)" % self.id
-
- def bind(self, inst):
- self.val = inst
-
- def resolve(self):
- if self.val is None:
- print "UNRESOLVE REF", self
- return 0
- return self.val
-
class StackDepthTracker:
- # XXX need to keep track of stack depth on jumps
+ # XXX 1. need to keep track of stack depth on jumps
+ # XXX 2. at least partly as a result, this code is broken
def findDepth(self, insts):
depth = 0