Diffstat (limited to 'Parser')
32 files changed, 4298 insertions, 3651 deletions
diff --git a/Parser/Python.asdl b/Parser/Python.asdl
index 126d478..9a9b933 100644
--- a/Parser/Python.asdl
+++ b/Parser/Python.asdl
@@ -1,129 +1,115 @@
--- ASDL's 5 builtin types are:
--- identifier, int, string, object, constant
+-- ASDL's five builtin types are identifier, int, string, object, bool
 
-module Python
+module Python version "$Revision$"
 {
-    mod = Module(stmt* body, type_ignore *type_ignores)
-        | Interactive(stmt* body)
-        | Expression(expr body)
-        | FunctionType(expr* argtypes, expr returns)
-
-        -- not really an actual node but useful in Jython's typesystem.
-        | Suite(stmt* body)
-
-    stmt = FunctionDef(identifier name, arguments args,
-                       stmt* body, expr* decorator_list, expr? returns,
-                       string? type_comment)
-         | AsyncFunctionDef(identifier name, arguments args,
-                            stmt* body, expr* decorator_list, expr? returns,
-                            string? type_comment)
-
-         | ClassDef(identifier name,
-                    expr* bases,
-                    keyword* keywords,
-                    stmt* body,
-                    expr* decorator_list)
-         | Return(expr? value)
-
-         | Delete(expr* targets)
-         | Assign(expr* targets, expr value, string? type_comment)
-         | AugAssign(expr target, operator op, expr value)
-         -- 'simple' indicates that we annotate simple name without parens
-         | AnnAssign(expr target, expr annotation, expr? value, int simple)
-
-         -- use 'orelse' because else is a keyword in target languages
-         | For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
-         | AsyncFor(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
-         | While(expr test, stmt* body, stmt* orelse)
-         | If(expr test, stmt* body, stmt* orelse)
-         | With(withitem* items, stmt* body, string? type_comment)
-         | AsyncWith(withitem* items, stmt* body, string? type_comment)
-
-         | Raise(expr? exc, expr? cause)
-         | Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody)
-         | Assert(expr test, expr? msg)
-
-         | Import(alias* names)
-         | ImportFrom(identifier? module, alias* names, int? level)
-
-         | Global(identifier* names)
-         | Nonlocal(identifier* names)
-         | Expr(expr value)
-         | Pass | Break | Continue
-
-         -- XXX Jython will be different
-         -- col_offset is the byte offset in the utf8 string the parser uses
-         attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
-
-         -- BoolOp() can use left & right?
-    expr = BoolOp(boolop op, expr* values)
-         | NamedExpr(expr target, expr value)
-         | BinOp(expr left, operator op, expr right)
-         | UnaryOp(unaryop op, expr operand)
-         | Lambda(arguments args, expr body)
-         | IfExp(expr test, expr body, expr orelse)
-         | Dict(expr* keys, expr* values)
-         | Set(expr* elts)
-         | ListComp(expr elt, comprehension* generators)
-         | SetComp(expr elt, comprehension* generators)
-         | DictComp(expr key, expr value, comprehension* generators)
-         | GeneratorExp(expr elt, comprehension* generators)
-         -- the grammar constrains where yield expressions can occur
-         | Await(expr value)
-         | Yield(expr? value)
-         | YieldFrom(expr value)
-         -- need sequences for compare to distinguish between
-         -- x < 4 < 3 and (x < 4) < 3
-         | Compare(expr left, cmpop* ops, expr* comparators)
-         | Call(expr func, expr* args, keyword* keywords)
-         | FormattedValue(expr value, int? conversion, expr? format_spec)
-         | JoinedStr(expr* values)
-         | Constant(constant value, string? kind)
-
-         -- the following expression can appear in assignment context
-         | Attribute(expr value, identifier attr, expr_context ctx)
-         | Subscript(expr value, slice slice, expr_context ctx)
-         | Starred(expr value, expr_context ctx)
-         | Name(identifier id, expr_context ctx)
-         | List(expr* elts, expr_context ctx)
-         | Tuple(expr* elts, expr_context ctx)
-
-         -- col_offset is the byte offset in the utf8 string the parser uses
-         attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
-
-    expr_context = Load | Store | Del | AugLoad | AugStore | Param
-
-    slice = Slice(expr? lower, expr? upper, expr? step)
-          | ExtSlice(slice* dims)
-          | Index(expr value)
-
-    boolop = And | Or
-
-    operator = Add | Sub | Mult | MatMult | Div | Mod | Pow | LShift
+    mod = Module(stmt* body)
+        | Interactive(stmt* body)
+        | Expression(expr body)
+
+        -- not really an actual node but useful in Jython's typesystem.
+        | Suite(stmt* body)
+
+    stmt = FunctionDef(identifier name, arguments args,
+                       stmt* body, expr* decorator_list)
+         | ClassDef(identifier name, expr* bases, stmt* body, expr* decorator_list)
+         | Return(expr? value)
+
+         | Delete(expr* targets)
+         | Assign(expr* targets, expr value)
+         | AugAssign(expr target, operator op, expr value)
+
+         -- not sure if bool is allowed, can always use int
+         | Print(expr? dest, expr* values, bool nl)
+
+         -- use 'orelse' because else is a keyword in target languages
+         | For(expr target, expr iter, stmt* body, stmt* orelse)
+         | While(expr test, stmt* body, stmt* orelse)
+         | If(expr test, stmt* body, stmt* orelse)
+         | With(expr context_expr, expr? optional_vars, stmt* body)
+
+         -- 'type' is a bad name
+         | Raise(expr? type, expr? inst, expr? tback)
+         | TryExcept(stmt* body, excepthandler* handlers, stmt* orelse)
+         | TryFinally(stmt* body, stmt* finalbody)
+         | Assert(expr test, expr? msg)
+
+         | Import(alias* names)
+         | ImportFrom(identifier? module, alias* names, int? level)
+
+         -- Doesn't capture requirement that locals must be
+         -- defined if globals is
+         -- still supports use as a function!
+         | Exec(expr body, expr? globals, expr? locals)
+
+         | Global(identifier* names)
+         | Expr(expr value)
+         | Pass | Break | Continue
+
+         -- XXX Jython will be different
+         -- col_offset is the byte offset in the utf8 string the parser uses
+         attributes (int lineno, int col_offset)
+
+         -- BoolOp() can use left & right?
+    expr = BoolOp(boolop op, expr* values)
+         | BinOp(expr left, operator op, expr right)
+         | UnaryOp(unaryop op, expr operand)
+         | Lambda(arguments args, expr body)
+         | IfExp(expr test, expr body, expr orelse)
+         | Dict(expr* keys, expr* values)
+         | Set(expr* elts)
+         | ListComp(expr elt, comprehension* generators)
+         | SetComp(expr elt, comprehension* generators)
+         | DictComp(expr key, expr value, comprehension* generators)
+         | GeneratorExp(expr elt, comprehension* generators)
+         -- the grammar constrains where yield expressions can occur
+         | Yield(expr? value)
+         -- need sequences for compare to distinguish between
+         -- x < 4 < 3 and (x < 4) < 3
+         | Compare(expr left, cmpop* ops, expr* comparators)
+         | Call(expr func, expr* args, keyword* keywords,
+                expr? starargs, expr? kwargs)
+         | Repr(expr value)
+         | Num(object n) -- a number as a PyObject.
+         | Str(string s) -- need to specify raw, unicode, etc?
+         -- other literals? bools?
+
+         -- the following expression can appear in assignment context
+         | Attribute(expr value, identifier attr, expr_context ctx)
+         | Subscript(expr value, slice slice, expr_context ctx)
+         | Name(identifier id, expr_context ctx)
+         | List(expr* elts, expr_context ctx)
+         | Tuple(expr* elts, expr_context ctx)
+
+         -- col_offset is the byte offset in the utf8 string the parser uses
+         attributes (int lineno, int col_offset)
+
+    expr_context = Load | Store | Del | AugLoad | AugStore | Param
+
+    slice = Ellipsis | Slice(expr? lower, expr? upper, expr? step)
+          | ExtSlice(slice* dims)
+          | Index(expr value)
+
+    boolop = And | Or
+
+    operator = Add | Sub | Mult | Div | Mod | Pow | LShift
                | RShift | BitOr | BitXor | BitAnd | FloorDiv
 
-    unaryop = Invert | Not | UAdd | USub
+    unaryop = Invert | Not | UAdd | USub
 
-    cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn
+    cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn
 
-    comprehension = (expr target, expr iter, expr* ifs, int is_async)
+    comprehension = (expr target, expr iter, expr* ifs)
 
-    excepthandler = ExceptHandler(expr? type, identifier? name, stmt* body)
-                    attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
+    -- not sure what to call the first argument for raise and except
+    excepthandler = ExceptHandler(expr? type, expr? name, stmt* body)
+                    attributes (int lineno, int col_offset)
 
-    arguments = (arg* posonlyargs, arg* args, arg? vararg, arg* kwonlyargs,
-                 expr* kw_defaults, arg? kwarg, expr* defaults)
+    arguments = (expr* args, identifier? vararg,
+                 identifier? kwarg, expr* defaults)
 
-    arg = (identifier arg, expr? annotation, string? type_comment)
-          attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
+    -- keyword arguments supplied to call
+    keyword = (identifier arg, expr value)
 
-    -- keyword arguments supplied to call (NULL identifier for **kwargs)
-    keyword = (identifier? arg, expr value)
-
-    -- import name with optional 'as' alias.
-    alias = (identifier name, identifier? asname)
-
-    withitem = (expr context_expr, expr? optional_vars)
-
-    type_ignore = TypeIgnore(int lineno, string tag)
+    -- import name with optional 'as' alias.
+    alias = (identifier name, identifier? asname)
 }
diff --git a/Parser/acceler.c b/Parser/acceler.c
index e515833..9b14263 100644
--- a/Parser/acceler.c
+++ b/Parser/acceler.c
@@ -10,21 +10,22 @@
    are not part of the static data structure written on graminit.[ch] by the
   parser generator. */
 
-#include "Python.h"
+#include "pgenheaders.h"
 #include "grammar.h"
 #include "node.h"
 #include "token.h"
 #include "parser.h"
 
 /* Forward references */
-static void fixdfa(grammar *, const dfa *);
+static void fixdfa(grammar *, dfa *);
 static void fixstate(grammar *, state *);
 
 void
 PyGrammar_AddAccelerators(grammar *g)
 {
+    dfa *d;
     int i;
-    const dfa *d = g->g_dfa;
+    d = g->g_dfa;
     for (i = g->g_ndfas; --i >= 0; d++)
         fixdfa(g, d);
     g->g_accel = 1;
@@ -33,9 +34,10 @@ PyGrammar_AddAccelerators(grammar *g)
 void
 PyGrammar_RemoveAccelerators(grammar *g)
 {
+    dfa *d;
     int i;
     g->g_accel = 0;
-    const dfa *d = g->g_dfa;
+    d = g->g_dfa;
     for (i = g->g_ndfas; --i >= 0; d++) {
         state *s;
         int j;
@@ -49,7 +51,7 @@ PyGrammar_RemoveAccelerators(grammar *g)
 }
 
 static void
-fixdfa(grammar *g, const dfa *d)
+fixdfa(grammar *g, dfa *d)
 {
     state *s;
     int j;
@@ -61,7 +63,7 @@ fixdfa(grammar *g, const dfa *d)
 static void
 fixstate(grammar *g, state *s)
 {
-    const arc *a;
+    arc *a;
     int k;
     int *accel;
     int nl = g->g_ll.ll_nlabels;
@@ -76,14 +78,14 @@ fixstate(grammar *g, state *s)
     a = s->s_arc;
     for (k = s->s_narcs; --k >= 0; a++) {
         int lbl = a->a_lbl;
-        const label *l = &g->g_ll.ll_label[lbl];
+        label *l = &g->g_ll.ll_label[lbl];
         int type = l->lb_type;
         if (a->a_arrow >= (1 << 7)) {
             printf("XXX too many states!\n");
             continue;
         }
         if (ISNONTERMINAL(type)) {
-            const dfa *d1 = PyGrammar_FindDFA(g, type);
+            dfa *d1 = PyGrammar_FindDFA(g, type);
             int ibit;
             if (type - NT_OFFSET >= (1 << 7)) {
                 printf("XXX too high nonterminal number!\n");
diff --git a/Parser/asdl.py b/Parser/asdl.py
index 62f5c19..1ddc3f8 100644
--- a/Parser/asdl.py
+++ b/Parser/asdl.py
@@ -1,53 +1,243 @@
-#-------------------------------------------------------------------------------
-# Parser for ASDL [1] definition files. Reads in an ASDL description and parses
-# it into an AST that describes it.
-#
-# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support
-# modules and attributes after a product. Words starting with Capital letters
-# are terminals. Literal tokens are in "double quotes". Others are
-# non-terminals. Id is either TokenId or ConstructorId.
-#
-# module        ::= "module" Id "{" [definitions] "}"
-# definitions   ::= { TypeId "=" type }
-# type          ::= product | sum
-# product       ::= fields ["attributes" fields]
-# fields        ::= "(" { field, "," } field ")"
-# field         ::= TypeId ["?" | "*"] [Id]
-# sum           ::= constructor { "|" constructor } ["attributes" fields]
-# constructor   ::= ConstructorId [fields]
-#
-# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See
-#     http://asdl.sourceforge.net/
-#-------------------------------------------------------------------------------
-from collections import namedtuple
-import re
-
-__all__ = [
-    'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor',
-    'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check']
-
-# The following classes define nodes into which the ASDL description is parsed.
-# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST
-# structure used by a programming language. But ASDL files themselves need to be
-# parsed. This module parses ASDL files and uses a simple AST to represent them.
-# See the EBNF at the top of the file to understand the logical connection
-# between the various node types.
-
-builtin_types = {'identifier', 'string', 'bytes', 'int', 'object', 'singleton',
-                 'constant'}
-
-class AST:
+"""An implementation of the Zephyr Abstract Syntax Definition Language.
+
+See http://asdl.sourceforge.net/ and
+http://www.cs.princeton.edu/research/techreps/TR-554-97
+
+Only supports top level module decl, not view.  I'm guessing that view
+is intended to support the browser and I'm not interested in the
+browser.
+
+Changes for Python: Add support for module versions
+"""
+
+import os
+import traceback
+
+import spark
+
+class Token(object):
+    # spark seems to dispatch in the parser based on a token's
+    # type attribute
+    def __init__(self, type, lineno):
+        self.type = type
+        self.lineno = lineno
+
+    def __str__(self):
+        return self.type
+
     def __repr__(self):
-        raise NotImplementedError
+        return str(self)
+
+class Id(Token):
+    def __init__(self, value, lineno):
+        self.type = 'Id'
+        self.value = value
+        self.lineno = lineno
+
+    def __str__(self):
+        return self.value
+
+class String(Token):
+    def __init__(self, value, lineno):
+        self.type = 'String'
+        self.value = value
+        self.lineno = lineno
+
+class ASDLSyntaxError(Exception):
+
+    def __init__(self, lineno, token=None, msg=None):
+        self.lineno = lineno
+        self.token = token
+        self.msg = msg
+
+    def __str__(self):
+        if self.msg is None:
+            return "Error at '%s', line %d" % (self.token, self.lineno)
+        else:
+            return "%s, line %d" % (self.msg, self.lineno)
+
+class ASDLScanner(spark.GenericScanner, object):
+
+    def tokenize(self, input):
+        self.rv = []
+        self.lineno = 1
+        super(ASDLScanner, self).tokenize(input)
+        return self.rv
+
+    def t_id(self, s):
+        r"[\w\.]+"
+        # XXX doesn't distinguish upper vs. lower, which is
+        # significant for ASDL.
+        self.rv.append(Id(s, self.lineno))
+
+    def t_string(self, s):
+        r'"[^"]*"'
+        self.rv.append(String(s, self.lineno))
+
+    def t_xxx(self, s): # not sure what this production means
+        r"<="
+        self.rv.append(Token(s, self.lineno))
+
+    def t_punctuation(self, s):
+        r"[\{\}\*\=\|\(\)\,\?\:]"
+        self.rv.append(Token(s, self.lineno))
+
+    def t_comment(self, s):
+        r"\-\-[^\n]*"
+        pass
+
+    def t_newline(self, s):
+        r"\n"
+        self.lineno += 1
+
+    def t_whitespace(self, s):
+        r"[ \t]+"
+        pass
+
+    def t_default(self, s):
+        r" . +"
+        raise ValueError, "unmatched input: %s" % `s`
+
+class ASDLParser(spark.GenericParser, object):
+    def __init__(self):
+        super(ASDLParser, self).__init__("module")
+
+    def typestring(self, tok):
+        return tok.type
+
+    def error(self, tok):
+        raise ASDLSyntaxError(tok.lineno, tok)
+
+    def p_module_0(self, (module, name, version, _0, _1)):
+        " module ::= Id Id version { } "
+        if module.value != "module":
+            raise ASDLSyntaxError(module.lineno,
+                                  msg="expected 'module', found %s" % module)
+        return Module(name, None, version)
+
+    def p_module(self, (module, name, version, _0, definitions, _1)):
+        " module ::= Id Id version { definitions } "
+        if module.value != "module":
+            raise ASDLSyntaxError(module.lineno,
+                                  msg="expected 'module', found %s" % module)
+        return Module(name, definitions, version)
+
+    def p_version(self, (version, V)):
+        "version ::= Id String"
+        if version.value != "version":
+            raise ASDLSyntaxError(version.lineno,
+                                  msg="expected 'version', found %s" % version)
+        return V
+
+    def p_definition_0(self, (definition,)):
+        " definitions ::= definition "
+        return definition
+
+    def p_definition_1(self, (definitions, definition)):
+        " definitions ::= definition definitions "
+        return definitions + definition
+
+    def p_definition(self, (id, _, type)):
+        " definition ::= Id = type "
+        return [Type(id, type)]
+
+    def p_type_0(self, (product,)):
+        " type ::= product "
+        return product
+
+    def p_type_1(self, (sum,)):
+        " type ::= sum "
+        return Sum(sum)
+
+    def p_type_2(self, (sum, id, _0, attributes, _1)):
+        " type ::= sum Id ( fields ) "
+        if id.value != "attributes":
+            raise ASDLSyntaxError(id.lineno,
+                                  msg="expected attributes, found %s" % id)
+        if attributes:
+            attributes.reverse()
+        return Sum(sum, attributes)
+
+    def p_product(self, (_0, fields, _1)):
+        " product ::= ( fields ) "
+        # XXX can't I just construct things in the right order?
+        fields.reverse()
+        return Product(fields)
+
+    def p_sum_0(self, (constructor,)):
+        " sum ::= constructor "
+        return [constructor]
+
+    def p_sum_1(self, (constructor, _, sum)):
+        " sum ::= constructor | sum "
+        return [constructor] + sum
+
+    def p_sum_2(self, (constructor, _, sum)):
+        " sum ::= constructor | sum "
+        return [constructor] + sum
+
+    def p_constructor_0(self, (id,)):
+        " constructor ::= Id "
+        return Constructor(id)
+
+    def p_constructor_1(self, (id, _0, fields, _1)):
+        " constructor ::= Id ( fields ) "
+        # XXX can't I just construct things in the right order?
+        fields.reverse()
+        return Constructor(id, fields)
+
+    def p_fields_0(self, (field,)):
+        " fields ::= field "
+        return [field]
+
+    def p_fields_1(self, (field, _, fields)):
+        " fields ::= field , fields "
+        return fields + [field]
+
+    def p_field_0(self, (type,)):
+        " field ::= Id "
+        return Field(type)
+
+    def p_field_1(self, (type, name)):
+        " field ::= Id Id "
+        return Field(type, name)
+
+    def p_field_2(self, (type, _, name)):
+        " field ::= Id * Id "
+        return Field(type, name, seq=True)
+
+    def p_field_3(self, (type, _, name)):
+        " field ::= Id ? Id "
+        return Field(type, name, opt=True)
+
+    def p_field_4(self, (type, _)):
+        " field ::= Id * "
+        return Field(type, seq=True)
+
+    def p_field_5(self, (type, _)):
+        " field ::= Id ? "
+        return Field(type, opt=True)
+
+builtin_types = ("identifier", "string", "int", "bool", "object")
+
+# below is a collection of classes to capture the AST of an AST :-)
+# not sure if any of the methods are useful yet, but I'm adding them
+# piecemeal as they seem helpful
+
+class AST(object):
+    pass # a marker class
 
 class Module(AST):
-    def __init__(self, name, dfns):
+    def __init__(self, name, dfns, version):
         self.name = name
         self.dfns = dfns
-        self.types = {type.name: type.value for type in dfns}
+        self.version = version
+        self.types = {} # maps type name to value (from dfns)
+        for type in dfns:
+            self.types[type.name.value] = type.value
 
     def __repr__(self):
-        return 'Module({0.name}, {0.dfns})'.format(self)
+        return "Module(%s, %s)" % (self.name, self.dfns)
 
 class Type(AST):
     def __init__(self, name, value):
@@ -55,7 +245,7 @@ class Type(AST):
         self.value = value
 
     def __repr__(self):
-        return 'Type({0.name}, {0.value})'.format(self)
+        return "Type(%s, %s)" % (self.name, self.value)
 
 class Constructor(AST):
     def __init__(self, name, fields=None):
@@ -63,7 +253,7 @@ class Constructor(AST):
         self.fields = fields or []
 
     def __repr__(self):
-        return 'Constructor({0.name}, {0.fields})'.format(self)
+        return "Constructor(%s, %s)" % (self.name, self.fields)
 
 class Field(AST):
     def __init__(self, type, name=None, seq=False, opt=False):
@@ -80,9 +270,9 @@ class Field(AST):
         else:
             extra = ""
         if self.name is None:
-            return 'Field({0.type}{1})'.format(self, extra)
+            return "Field(%s%s)" % (self.type, extra)
         else:
-            return 'Field({0.type}, {0.name}{1})'.format(self, extra)
+            return "Field(%s, %s%s)" % (self.type, self.name, extra)
 
 class Sum(AST):
     def __init__(self, types, attributes=None):
@@ -90,54 +280,56 @@ class Sum(AST):
         self.attributes = attributes or []
 
     def __repr__(self):
-        if self.attributes:
-            return 'Sum({0.types}, {0.attributes})'.format(self)
+        if self.attributes is None:
+            return "Sum(%s)" % self.types
         else:
-            return 'Sum({0.types})'.format(self)
+            return "Sum(%s, %s)" % (self.types, self.attributes)
 
 class Product(AST):
-    def __init__(self, fields, attributes=None):
+    def __init__(self, fields):
         self.fields = fields
-        self.attributes = attributes or []
 
     def __repr__(self):
-        if self.attributes:
-            return 'Product({0.fields}, {0.attributes})'.format(self)
-        else:
-            return 'Product({0.fields})'.format(self)
-
-# A generic visitor for the meta-AST that describes ASDL. This can be used by
-# emitters. Note that this visitor does not provide a generic visit method, so a
-# subclass needs to define visit methods from visitModule to as deep as the
-# interesting node.
-# We also define a Check visitor that makes sure the parsed ASDL is well-formed.
+        return "Product(%s)" % self.fields
 
 class VisitorBase(object):
-    """Generic tree visitor for ASTs."""
-    def __init__(self):
+
+    def __init__(self, skip=False):
         self.cache = {}
+        self.skip = skip
 
-    def visit(self, obj, *args):
-        klass = obj.__class__
+    def visit(self, object, *args):
+        meth = self._dispatch(object)
+        if meth is None:
+            return
+        try:
+            meth(object, *args)
+        except Exception, err:
+            print "Error visiting", repr(object)
+            print err
+            traceback.print_exc()
+            # XXX hack
+            if hasattr(self, 'file'):
+                self.file.flush()
+            os._exit(1)
+
+    def _dispatch(self, object):
+        assert isinstance(object, AST), repr(object)
+        klass = object.__class__
         meth = self.cache.get(klass)
         if meth is None:
             methname = "visit" + klass.__name__
-            meth = getattr(self, methname, None)
+            if self.skip:
+                meth = getattr(self, methname, None)
+            else:
+                meth = getattr(self, methname)
             self.cache[klass] = meth
-        if meth:
-            try:
-                meth(obj, *args)
-            except Exception as e:
-                print("Error visiting %r: %s" % (obj, e))
-                raise
+        return meth
 
 class Check(VisitorBase):
-    """A visitor that checks a parsed ASDL tree for correctness.
-    Errors are printed and accumulated.
-    """
     def __init__(self):
-        super(Check, self).__init__()
+        super(Check, self).__init__(skip=True)
         self.cons = {}
         self.errors = 0
         self.types = {}
@@ -159,8 +351,8 @@ class Check(VisitorBase):
         if conflict is None:
             self.cons[key] = name
         else:
-            print('Redefinition of constructor {}'.format(key))
-            print('Defined in {} and {}'.format(conflict, name))
+            print "Redefinition of constructor %s" % key
+            print "Defined in %s and %s" % (conflict, name)
             self.errors += 1
         for f in cons.fields:
             self.visit(f, key)
@@ -175,11 +367,6 @@ class Check(VisitorBase):
             self.visit(f, name)
 
 def check(mod):
-    """Check the parsed ASDL tree for correctness.
-
-    Return True if success. For failure, the errors are printed out and False
-    is returned.
-    """
     v = Check()
     v.visit(mod)
 
@@ -187,190 +374,40 @@ def check(mod):
         if t not in mod.types and not t in builtin_types:
             v.errors += 1
             uses = ", ".join(v.types[t])
-            print('Undefined type {}, used in {}'.format(t, uses))
-    return not v.errors
-
-# The ASDL parser itself comes next. The only interesting external interface
-# here is the top-level parse function.
-
-def parse(filename):
-    """Parse ASDL from the given file and return a Module node describing it."""
-    with open(filename) as f:
-        parser = ASDLParser()
-        return parser.parse(f.read())
-
-# Types for describing tokens in an ASDL specification.
-class TokenKind:
-    """TokenKind is provides a scope for enumerated token kinds."""
-    (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk,
-     LParen, RParen, LBrace, RBrace) = range(11)
+            print "Undefined type %s, used in %s" % (t, uses)
 
-    operator_table = {
-        '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen,
-        ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace}
-
-Token = namedtuple('Token', 'kind value lineno')
-
-class ASDLSyntaxError(Exception):
-    def __init__(self, msg, lineno=None):
-        self.msg = msg
-        self.lineno = lineno or '<unknown>'
-
-    def __str__(self):
-        return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
-
-def tokenize_asdl(buf):
-    """Tokenize the given buffer. Yield Token objects."""
-    for lineno, line in enumerate(buf.splitlines(), 1):
-        for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()):
-            c = m.group(1)
-            if c[0].isalpha():
-                # Some kind of identifier
-                if c[0].isupper():
-                    yield Token(TokenKind.ConstructorId, c, lineno)
-                else:
-                    yield Token(TokenKind.TypeId, c, lineno)
-            elif c[:2] == '--':
-                # Comment
-                break
-            else:
-                # Operators
-                try:
-                    op_kind = TokenKind.operator_table[c]
-                except KeyError:
-                    raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
-                yield Token(op_kind, c, lineno)
-
-class ASDLParser:
-    """Parser for ASDL files.
-
-    Create, then call the parse method on a buffer containing ASDL.
-    This is a simple recursive descent parser that uses tokenize_asdl for the
-    lexing.
-    """
-    def __init__(self):
-        self._tokenizer = None
-        self.cur_token = None
-
-    def parse(self, buf):
-        """Parse the ASDL in the buffer and return an AST with a Module root.
-        """
-        self._tokenizer = tokenize_asdl(buf)
-        self._advance()
-        return self._parse_module()
-
-    def _parse_module(self):
-        if self._at_keyword('module'):
-            self._advance()
-        else:
-            raise ASDLSyntaxError(
-                'Expected "module" (found {})'.format(self.cur_token.value),
-                self.cur_token.lineno)
-        name = self._match(self._id_kinds)
-        self._match(TokenKind.LBrace)
-        defs = self._parse_definitions()
-        self._match(TokenKind.RBrace)
-        return Module(name, defs)
-
-    def _parse_definitions(self):
-        defs = []
-        while self.cur_token.kind == TokenKind.TypeId:
-            typename = self._advance()
-            self._match(TokenKind.Equals)
-            type = self._parse_type()
-            defs.append(Type(typename, type))
-        return defs
-
-    def _parse_type(self):
-        if self.cur_token.kind == TokenKind.LParen:
-            # If we see a (, it's a product
-            return self._parse_product()
-        else:
-            # Otherwise it's a sum. Look for ConstructorId
-            sumlist = [Constructor(self._match(TokenKind.ConstructorId),
-                                   self._parse_optional_fields())]
-            while self.cur_token.kind == TokenKind.Pipe:
-                # More constructors
-                self._advance()
-                sumlist.append(Constructor(
-                                self._match(TokenKind.ConstructorId),
-                                self._parse_optional_fields()))
-            return Sum(sumlist, self._parse_optional_attributes())
-
-    def _parse_product(self):
-        return Product(self._parse_fields(), self._parse_optional_attributes())
-
-    def _parse_fields(self):
-        fields = []
-        self._match(TokenKind.LParen)
-        while self.cur_token.kind == TokenKind.TypeId:
-            typename = self._advance()
-            is_seq, is_opt = self._parse_optional_field_quantifier()
-            id = (self._advance() if self.cur_token.kind in self._id_kinds
-                                  else None)
-            fields.append(Field(typename, id, seq=is_seq, opt=is_opt))
-            if self.cur_token.kind == TokenKind.RParen:
-                break
-            elif self.cur_token.kind == TokenKind.Comma:
-                self._advance()
-        self._match(TokenKind.RParen)
-        return fields
-
-    def _parse_optional_fields(self):
-        if self.cur_token.kind == TokenKind.LParen:
-            return self._parse_fields()
-        else:
-            return None
+    return not v.errors
 
-    def _parse_optional_attributes(self):
-        if self._at_keyword('attributes'):
-            self._advance()
-            return self._parse_fields()
+def parse(file):
+    scanner = ASDLScanner()
+    parser = ASDLParser()
+
+    buf = open(file).read()
+    tokens = scanner.tokenize(buf)
+    try:
+        return parser.parse(tokens)
+    except ASDLSyntaxError, err:
+        print err
+        lines = buf.split("\n")
+        print lines[err.lineno - 1] # lines starts at 0, files at 1
+
+if __name__ == "__main__":
+    import glob
+    import sys
+
+    if len(sys.argv) > 1:
+        files = sys.argv[1:]
+    else:
+        testdir = "tests"
+        files = glob.glob(testdir + "/*.asdl")
+
+    for file in files:
+        print file
+        mod = parse(file)
+        print "module", mod.name
+        print len(mod.dfns), "definitions"
+        if not check(mod):
+            print "Check failed"
         else:
-            return None
-
-    def _parse_optional_field_quantifier(self):
-        is_seq, is_opt = False, False
-        if self.cur_token.kind == TokenKind.Asterisk:
-            is_seq = True
-            self._advance()
-        elif self.cur_token.kind == TokenKind.Question:
-            is_opt = True
-            self._advance()
-        return is_seq, is_opt
-
-    def _advance(self):
-        """ Return the value of the current token and read the next one into
-        self.cur_token.
-        """
-        cur_val = None if self.cur_token is None else self.cur_token.value
-        try:
-            self.cur_token = next(self._tokenizer)
-        except StopIteration:
-            self.cur_token = None
-        return cur_val
-
-    _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId)
-
-    def _match(self, kind):
-        """The 'match' primitive of RD parsers.
-
-        * Verifies that the current token is of the given kind (kind can
-          be a tuple, in which the kind must match one of its members).
-        * Returns the value of the current token
-        * Reads in the next token
-        """
-        if (isinstance(kind, tuple) and self.cur_token.kind in kind or
-            self.cur_token.kind == kind
-            ):
-            value = self.cur_token.value
-            self._advance()
-            return value
-        else:
-            raise ASDLSyntaxError(
-                'Unmatched {} (found {})'.format(kind, self.cur_token.kind),
-                self.cur_token.lineno)
-
-    def _at_keyword(self, keyword):
-        return (self.cur_token.kind == TokenKind.TypeId and
-                self.cur_token.value == keyword)
+            for dfn in mod.dfns:
+                print dfn.type
diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py
index 52495e9..ac61c78 100755
--- a/Parser/asdl_c.py
+++ b/Parser/asdl_c.py
@@ -1,18 +1,25 @@
 #! /usr/bin/env python
 """Generate C code from an ASDL description."""
 
+# TO DO
+# handle fields that have a type but no name
+
 import os, sys
 
 import asdl
 
-TABSIZE = 4
+TABSIZE = 8
 MAX_COL = 80
 
 def get_c_type(name):
     """Return a string for the C name of the type.
 
-    This function special cases the default types provided by asdl.
+    This function special cases the default types provided by asdl:
+    identifier, string, int, bool.
     """
+    # XXX ack! need to figure out where Id is useful and where string
+    if isinstance(name, asdl.Id):
+        name = name.value
     if name in asdl.builtin_types:
         return name
     else:
@@ -77,20 +84,8 @@ class EmitVisitor(asdl.VisitorBase):
 
     def __init__(self, file):
         self.file = file
-        self.identifiers = set()
-        self.singletons = set()
-        self.types = set()
         super(EmitVisitor, self).__init__()
 
-    def emit_identifier(self, name):
-        self.identifiers.add(str(name))
-
-    def emit_singleton(self, name):
-        self.singletons.add(str(name))
-
-    def emit_type(self, name):
-        self.types.add(str(name))
-
     def emit(self, s, depth, reflow=True):
         # XXX reflow long lines?
         if reflow:
@@ -98,9 +93,8 @@
         else:
             lines = [s]
         for line in lines:
-            if line:
-                line = (" " * TABSIZE * depth) + line
-            self.file.write(line + "\n")
+            line = (" " * TABSIZE * depth) + line + "\n"
+            self.file.write(line)
 
 
 class TypeDefVisitor(EmitVisitor):
@@ -186,6 +180,9 @@ class StructVisitor(EmitVisitor):
                 self.visit(f, depth + 1)
             self.emit("} %s;" % cons.name, depth)
             self.emit("", depth)
+        else:
+            # XXX not sure what I want here, nothing is probably fine
+            pass
 
     def visitField(self, field, depth):
         # XXX need to lookup field.type, because it might be something
@@ -193,7 +190,7 @@
         ctype = get_c_type(field.type)
         name = field.name
         if field.seq:
-            if field.type == 'cmpop':
+            if field.type.value in ('cmpop',):
                 self.emit("asdl_int_seq *%(name)s;" % locals(), depth)
             else:
                 self.emit("asdl_seq *%(name)s;" % locals(), depth)
@@ -204,11 +201,6 @@
         self.emit("struct _%(name)s {" % locals(), depth)
         for f in product.fields:
             self.visit(f, depth + 1)
-        for field in product.attributes:
-            # rudimentary attribute handling
-            type = str(field.type)
-            assert type in asdl.builtin_types, type
-            self.emit("%s %s;" % (type, field.name), depth + 1);
         self.emit("};", depth)
         self.emit("", depth)
@@ -248,7 +240,7 @@ class PrototypeVisitor(EmitVisitor):
         name = f.name
         # XXX should extend get_c_type() to handle this
         if f.seq:
-            if f.type == 'cmpop':
+            if f.type.value in ('cmpop',):
                 ctype = "asdl_int_seq *"
             else:
                 ctype = "asdl_seq *"
@@ -280,9 +272,7 @@
 
     def visitProduct(self, prod, name):
         self.emit_function(name, get_c_type(name),
-                           self.get_args(prod.fields),
-                           self.get_args(prod.attributes),
-                           union=False)
+                           self.get_args(prod.fields), [], union=False)
 
 
 class FunctionVisitor(PrototypeVisitor):
@@ -302,7 +292,8 @@
         emit("{")
         emit("%s p;" % ctype, 1)
         for argtype, argname, opt in args:
-            if not opt and argtype != "int":
+            # XXX hack alert: false is allowed for a bool
+            if not opt and not (argtype == "bool" or argtype == "int"):
                 emit("if (!%s) {" % argname, 1)
                 emit("PyErr_SetString(PyExc_ValueError,", 2)
                 msg = "field %s is required for %s" % (argname, name)
@@ -336,8 +327,7 @@
             self.emit(s, depth, reflow)
         for argtype, argname, opt in args:
             emit("p->%s = %s;" % (argname, argname), 1)
-        for argtype, argname, opt in attrs:
-            emit("p->%s = %s;" % (argname, argname), 1)
+        assert not attrs
 
 
 class PickleVisitor(EmitVisitor):
@@ -376,18 +366,20 @@
         self.emit("int", 0)
         self.emit("obj2ast_%s(PyObject* obj, %s* out, PyArena* arena)" % (name, ctype), 0)
         self.emit("{", 0)
+        self.emit("PyObject* tmp = NULL;", 1)
         self.emit("int isinstance;", 1)
         self.emit("", 0)
 
-    def sumTrailer(self, name, add_label=False):
+    def sumTrailer(self, name):
         self.emit("", 0)
+        self.emit("tmp = PyObject_Repr(obj);", 1)
         # there's really nothing more we can do if this fails ...
-        error = "expected some sort of %s, but got %%R" % name
-        format = "PyErr_Format(PyExc_TypeError, \"%s\", obj);"
+        self.emit("if (tmp == NULL) goto failed;", 1)
+        error = "expected some sort of %s, but got %%.400s" % name
+        format = "PyErr_Format(PyExc_TypeError, \"%s\", PyString_AS_STRING(tmp));"
         self.emit(format % error, 1, reflow=False)
-        if add_label:
-            self.emit("failed:", 1)
-            self.emit("Py_XDECREF(tmp);", 1)
+        self.emit("failed:", 0)
+        self.emit("Py_XDECREF(tmp);", 1)
         self.emit("return 1;", 1)
         self.emit("}", 0)
         self.emit("", 0)
@@ -396,7 +388,7 @@
         self.funcHeader(name)
         for t in sum.types:
             line = ("isinstance = PyObject_IsInstance(obj, "
-                    "astmodulestate_global->%s_type);")
+                    "(PyObject *)%s_type);")
             self.emit(line % (t.name,), 1)
             self.emit("if (isinstance == -1) {", 1)
             self.emit("return 1;", 2)
@@ -412,8 +404,6 @@
 
     def complexSum(self, sum, name):
         self.funcHeader(name)
-        self.emit("PyObject *tmp = NULL;", 1)
-        self.emit("PyObject *tp;", 1)
         for a in sum.attributes:
             self.visitAttributeDeclaration(a, name, sum=sum)
         self.emit("", 0)
@@ -425,8 +415,8 @@
         for a in sum.attributes:
             self.visitField(a, name, sum=sum, depth=1)
         for t in sum.types:
-            self.emit("tp = astmodulestate_global->%s_type;" % (t.name,), 1)
-            self.emit("isinstance = PyObject_IsInstance(obj, tp);", 1)
+            line = "isinstance = PyObject_IsInstance(obj, (PyObject*)%s_type);"
+            self.emit(line % (t.name,), 1)
             self.emit("if (isinstance == -1) {", 1)
             self.emit("return 1;", 2)
             self.emit("}", 1)
@@ -436,12 +426,12 @@
             self.emit("", 0)
             for f in t.fields:
                 self.visitField(f, t.name, sum=sum, depth=2)
-            args = [f.name for f in t.fields] + [a.name for a in sum.attributes]
+            args = [f.name.value for f in t.fields] + [a.name.value for a in sum.attributes]
             self.emit("*out = %s(%s);" % (t.name, self.buildArgs(args)), 2)
             self.emit("if (*out == NULL) goto failed;", 2)
             self.emit("return 0;", 2)
             self.emit("}", 1)
-        self.sumTrailer(name, True)
+        self.sumTrailer(name)
 
     def visitAttributeDeclaration(self, a, name, sum=sum):
         ctype = get_c_type(a.type)
@@ -461,15 +451,10 @@
         self.emit("PyObject* tmp = NULL;", 1)
         for f in prod.fields:
             self.visitFieldDeclaration(f, name, prod=prod, depth=1)
-        for a in prod.attributes:
-            self.visitFieldDeclaration(a, name, prod=prod, depth=1)
         self.emit("", 0)
         for f in prod.fields:
             self.visitField(f, name, prod=prod, depth=1)
-        for a in prod.attributes:
-            self.visitField(a, name, prod=prod, depth=1)
-        args = [f.name for f in prod.fields]
-        args.extend([a.name for a in prod.attributes])
+        args = [f.name.value for f in prod.fields]
         self.emit("*out = %s(%s);" % (name, self.buildArgs(args)), 1)
         self.emit("return 0;", 1)
         self.emit("failed:", 0)
@@ -491,8 +476,8 @@
 
     def isSimpleSum(self, field):
         # XXX can the members of this list be determined automatically?
-        return field.type in ('expr_context', 'boolop', 'operator',
-                              'unaryop', 'cmpop')
+        return field.type.value in ('expr_context', 'boolop', 'operator',
+                                    'unaryop', 'cmpop')
 
     def isNumeric(self, field):
         return get_c_type(field.type) in ("int", "bool")
@@ -502,52 +487,31 @@
 
     def visitField(self, field, name, sum=None, prod=None, depth=0):
         ctype = get_c_type(field.type)
-        line = "if (_PyObject_LookupAttr(obj, astmodulestate_global->%s, &tmp) < 0) {"
-        self.emit(line % field.name, depth)
-        self.emit("return 1;", depth+1)
-        self.emit("}", depth)
-        if not field.opt:
-            self.emit("if (tmp == NULL) {", depth)
-            message = "required field \\\"%s\\\" missing from %s" % (field.name, name)
-            format = "PyErr_SetString(PyExc_TypeError, \"%s\");"
-            self.emit(format % message, depth+1, reflow=False)
-            self.emit("return 1;", depth+1)
-        else:
-            self.emit("if (tmp == NULL || tmp == Py_None) {", depth)
-            self.emit("Py_CLEAR(tmp);", depth+1)
-            if self.isNumeric(field):
-                self.emit("%s = 0;" % field.name, depth+1)
-            elif not self.isSimpleType(field):
-                self.emit("%s = NULL;" % field.name, depth+1)
-            else:
-                raise TypeError("could not determine the default value for %s" % field.name)
-        self.emit("}", depth)
-        self.emit("else {", depth)
-
+        self.emit("if (PyObject_HasAttrString(obj, \"%s\")) {" % field.name, depth)
         self.emit("int res;", depth+1)
         if field.seq:
             self.emit("Py_ssize_t len;", depth+1)
            self.emit("Py_ssize_t i;", depth+1)
+        self.emit("tmp = PyObject_GetAttrString(obj, \"%s\");" % field.name, depth+1)
+        self.emit("if (tmp == NULL) goto failed;", depth+1)
+        if field.seq:
            self.emit("if (!PyList_Check(tmp)) {", depth+1)
            self.emit("PyErr_Format(PyExc_TypeError, \"%s field \\\"%s\\\" must "
-                      "be a list, not a %%.200s\", _PyType_Name(Py_TYPE(tmp)));" %
+                      "be a list, not a %%.200s\", tmp->ob_type->tp_name);" %
                      (name, field.name),
                      depth+2, reflow=False)
            self.emit("goto failed;", depth+2)
            self.emit("}", depth+1)
            self.emit("len = PyList_GET_SIZE(tmp);", depth+1)
            if self.isSimpleType(field):
-                self.emit("%s = _Py_asdl_int_seq_new(len, arena);" % field.name, depth+1)
+                self.emit("%s = asdl_int_seq_new(len, arena);" % field.name, depth+1)
            else:
-                self.emit("%s = _Py_asdl_seq_new(len, arena);" % field.name, depth+1)
+                self.emit("%s = asdl_seq_new(len, arena);" % field.name, depth+1)
            self.emit("if (%s == NULL) goto failed;" % field.name, depth+1)
            self.emit("for (i = 0; i < len; i++) {", depth+1)
            self.emit("%s val;" % ctype, depth+2)
-            self.emit("PyObject *tmp2 = PyList_GET_ITEM(tmp, i);", depth+2)
-            self.emit("Py_INCREF(tmp2);", depth+2)
-            self.emit("res = obj2ast_%s(tmp2, &val, arena);" %
+            self.emit("res = obj2ast_%s(PyList_GET_ITEM(tmp, i), &val, arena);" %
                      field.type, depth+2, reflow=False)
-            self.emit("Py_DECREF(tmp2);", depth+2)
            self.emit("if (res != 0) goto failed;", depth+2)
            self.emit("if (len != PyList_GET_SIZE(tmp)) {", depth+2)
            self.emit("PyErr_SetString(PyExc_RuntimeError, \"%s field \\\"%s\\\" "
@@ -563,7 +527,21 @@
                      (field.type, field.name), depth+1)
            self.emit("if (res != 0) goto failed;", depth+1)
 
-        self.emit("Py_CLEAR(tmp);", depth+1)
+        self.emit("Py_XDECREF(tmp);", depth+1)
+        self.emit("tmp = NULL;", depth+1)
+        self.emit("} else {", depth)
+        if not field.opt:
+            message = "required field \\\"%s\\\" missing from %s" % (field.name, name)
+            format = "PyErr_SetString(PyExc_TypeError, \"%s\");"
+            self.emit(format % message, depth+1, reflow=False)
+            self.emit("return 1;", depth+1)
+        else:
+            if self.isNumeric(field):
+                self.emit("%s = 0;" % field.name, depth+1)
+            elif not self.isSimpleType(field):
+                self.emit("%s = NULL;" % field.name, depth+1)
+            else:
+                raise TypeError("could not determine the default value for %s" % field.name)
         self.emit("}", depth)
@@ -580,46 +558,37 @@ class MarshalPrototypeVisitor(PickleVisitor):
 
 class PyTypesDeclareVisitor(PickleVisitor):
 
     def visitProduct(self, prod, name):
-        self.emit_type("%s_type" % name)
+        self.emit("static PyTypeObject *%s_type;" % name, 0)
         self.emit("static PyObject* ast2obj_%s(void*);" % name, 0)
-        if prod.attributes:
-            for a in prod.attributes:
-                self.emit_identifier(a.name)
-            self.emit("static const char * const %s_attributes[] = {" % name, 0)
-            for a in prod.attributes:
-                self.emit('"%s",' % a.name, 1)
-            self.emit("};", 0)
         if prod.fields:
-            for f in prod.fields:
-                self.emit_identifier(f.name)
-            self.emit("static const char * const %s_fields[]={" % name,0)
+            self.emit("static char *%s_fields[]={" % name,0)
             for f in prod.fields:
                 self.emit('"%s",' % f.name, 1)
             self.emit("};", 0)
 
     def visitSum(self, sum, name):
-        self.emit_type("%s_type" % name)
+        self.emit("static PyTypeObject *%s_type;" % name, 0)
         if sum.attributes:
-            for a in sum.attributes:
-                self.emit_identifier(a.name)
-            self.emit("static const char * const %s_attributes[] = {" % name, 0)
+            self.emit("static char *%s_attributes[] = {" % name, 0)
             for a in sum.attributes:
                 self.emit('"%s",' % a.name, 1)
             self.emit("};", 0)
         ptype = "void*"
         if is_simple(sum):
             ptype = get_c_type(name)
+            tnames = []
            for t in sum.types:
-                self.emit_singleton("%s_singleton" % t.name)
+                tnames.append(str(t.name)+"_singleton")
+            tnames = ", *".join(tnames)
+            self.emit("static PyObject *%s;" % tnames, 0)
         self.emit("static PyObject* ast2obj_%s(%s);" % (name, ptype), 0)
         for t in sum.types:
             self.visitConstructor(t, name)
 
     def visitConstructor(self, cons, name):
+        self.emit("static PyTypeObject *%s_type;" % cons.name, 0)
         if cons.fields:
-            for t in cons.fields:
-                self.emit_identifier(t.name)
-            self.emit("static const char * const %s_fields[]={" % cons.name, 0)
+            self.emit("static char *%s_fields[]={" % cons.name, 0)
             for t in cons.fields:
                 self.emit('"%s",' % t.name, 1)
             self.emit("};",0)
@@ -628,74 +597,43 @@ class PyTypesVisitor(PickleVisitor):
 
     def visitModule(self, mod):
         self.emit("""
-
-typedef struct {
-    PyObject_HEAD
-    PyObject *dict;
-} AST_object;
-
-static void
-ast_dealloc(AST_object *self)
-{
-    /* bpo-31095: UnTrack is needed before calling any callbacks */
-    PyTypeObject *tp = Py_TYPE(self);
-    PyObject_GC_UnTrack(self);
-    Py_CLEAR(self->dict);
-    freefunc free_func = PyType_GetSlot(tp, Py_tp_free);
-    assert(free_func != NULL);
-    free_func(self);
-    Py_DECREF(tp);
-}
-
-static int
-ast_traverse(AST_object *self, visitproc visit, void *arg)
-{
-    Py_VISIT(self->dict);
-    return 0;
-}
-
-static int
-ast_clear(AST_object *self)
-{
-    Py_CLEAR(self->dict);
-    return 0;
-}
-
 static int
 ast_type_init(PyObject *self, PyObject *args, PyObject *kw)
 {
     Py_ssize_t i, numfields = 0;
     int res = -1;
     PyObject *key, *value, *fields;
-    if (_PyObject_LookupAttr((PyObject*)Py_TYPE(self), astmodulestate_global->_fields, &fields) < 0) {
-        goto cleanup;
-    }
+    fields = PyObject_GetAttrString((PyObject*)Py_TYPE(self), "_fields");
+    if (!fields)
+        PyErr_Clear();
     if (fields) {
         numfields = PySequence_Size(fields);
         if (numfields == -1)
             goto cleanup;
     }
-    res = 0; /* if no error occurs, this stays 0 to the end */
-    if (numfields < PyTuple_GET_SIZE(args)) {
-        PyErr_Format(PyExc_TypeError, "%.400s constructor takes at most "
-                     "%zd positional argument%s",
-                     _PyType_Name(Py_TYPE(self)),
-                     numfields, numfields == 1 ? "" : "s");
-        res = -1;
-        goto cleanup;
-    }
-    for (i = 0; i < PyTuple_GET_SIZE(args); i++) {
-        /* cannot be reached when fields is NULL */
-        PyObject *name = PySequence_GetItem(fields, i);
-        if (!name) {
+    if (PyTuple_GET_SIZE(args) > 0) {
+        if (numfields != PyTuple_GET_SIZE(args)) {
+            PyErr_Format(PyExc_TypeError, "%.400s constructor takes %s"
+                         "%zd positional argument%s",
+                         Py_TYPE(self)->tp_name,
+                         numfields == 0 ? "" : "either 0 or ",
+                         numfields, numfields == 1 ? "" : "s");
             res = -1;
             goto cleanup;
         }
-        res = PyObject_SetAttr(self, name, PyTuple_GET_ITEM(args, i));
-        Py_DECREF(name);
-        if (res < 0)
-            goto cleanup;
+        for (i = 0; i < PyTuple_GET_SIZE(args); i++) {
+            /* cannot be reached when fields is NULL */
+            PyObject *name = PySequence_GetItem(fields, i);
+            if (!name) {
+                res = -1;
+                goto cleanup;
+            }
+            res = PyObject_SetAttr(self, name, PyTuple_GET_ITEM(args, i));
+            Py_DECREF(name);
+            if (res < 0)
+                goto cleanup;
+        }
     }
     if (kw) {
         i = 0;  /* needed by PyDict_Next */
@@ -714,95 +652,105 @@ ast_type_init(PyObject *self, PyObject *args, PyObject *kw)
 static PyObject *
 ast_type_reduce(PyObject *self, PyObject *unused)
 {
-    PyObject *dict;
-    if (_PyObject_LookupAttr(self, astmodulestate_global->__dict__, &dict) < 0) {
-        return NULL;
+    PyObject *res;
+    PyObject *dict = PyObject_GetAttrString(self, "__dict__");
+    if (dict == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_AttributeError))
+            PyErr_Clear();
+        else
+            return NULL;
     }
     if (dict) {
-        return Py_BuildValue("O()N", Py_TYPE(self), dict);
+        res = Py_BuildValue("O()O", Py_TYPE(self), dict);
+        Py_DECREF(dict);
+        return res;
     }
     return Py_BuildValue("O()", Py_TYPE(self));
 }
 
-static PyMemberDef ast_type_members[] = {
-    {"__dictoffset__", T_PYSSIZET, offsetof(AST_object, dict), READONLY},
-    {NULL}  /* Sentinel */
-};
-
 static PyMethodDef ast_type_methods[] = {
     {"__reduce__", ast_type_reduce, METH_NOARGS, NULL},
     {NULL}
 };
 
-static PyGetSetDef ast_type_getsets[] = {
-    {"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict},
-    {NULL}
-};
-
-static PyType_Slot AST_type_slots[] = {
-    {Py_tp_dealloc, ast_dealloc},
-    {Py_tp_getattro, PyObject_GenericGetAttr},
-    {Py_tp_setattro, PyObject_GenericSetAttr},
-    {Py_tp_traverse, ast_traverse},
-    {Py_tp_clear, ast_clear},
-    {Py_tp_members, ast_type_members},
-    {Py_tp_methods, ast_type_methods},
-    {Py_tp_getset, ast_type_getsets},
-    {Py_tp_init, ast_type_init},
-    {Py_tp_alloc, PyType_GenericAlloc},
-    {Py_tp_new, PyType_GenericNew},
-    {Py_tp_free, PyObject_GC_Del},
-    {0, 0},
-};
-
-static PyType_Spec AST_type_spec = {
+static PyTypeObject AST_type = {
+    PyVarObject_HEAD_INIT(&PyType_Type, 0)
     "_ast.AST",
-    sizeof(AST_object),
+    sizeof(PyObject),
     0,
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC,
-    AST_type_slots
+    0,                       /* tp_dealloc */
+    0,                       /* tp_print */
+    0,                       /* tp_getattr */
+    0,                       /* tp_setattr */
+    0,                       /* tp_compare */
+    0,                       /* tp_repr */
+    0,                       /* tp_as_number */
+    0,                       /* tp_as_sequence */
+    0,                       /* tp_as_mapping */
+    0,                       /* tp_hash */
+    0,                       /* tp_call */
+    0,                       /* tp_str */
+    PyObject_GenericGetAttr, /* tp_getattro */
+    PyObject_GenericSetAttr, /* tp_setattro */
+    0,                       /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+    0,                       /* tp_doc */
+    0,                       /* tp_traverse */
+    0,                       /* tp_clear */
+    0,                       /* tp_richcompare */
+    0,                       /* tp_weaklistoffset */
+    0,                       /* tp_iter */
+    0,                       /* tp_iternext */
+    ast_type_methods,        /* tp_methods */
+    0,                       /* tp_members */
+    0,                       /* tp_getset */
+    0,                       /* tp_base */
+    0,                       /* tp_dict */
+    0,                       /* tp_descr_get */
+    0,                       /* tp_descr_set */
+    0,                       /* tp_dictoffset */
+    (initproc)ast_type_init, /* tp_init */
+    PyType_GenericAlloc,     /* tp_alloc */
+    PyType_GenericNew,       /* tp_new */
+    PyObject_Del,            /* tp_free */
 };
 
-static PyObject *
-make_type(const char *type, PyObject* base, const char* const* fields, int num_fields)
+
+static PyTypeObject* make_type(char *type, PyTypeObject* base, char**fields, int num_fields)
 {
     PyObject *fnames, *result;
     int i;
     fnames = PyTuple_New(num_fields);
     if (!fnames) return NULL;
     for (i = 0; i < num_fields; i++) {
-        PyObject *field = PyUnicode_InternFromString(fields[i]);
+        PyObject *field = PyString_FromString(fields[i]);
         if (!field) {
             Py_DECREF(fnames);
             return NULL;
         }
         PyTuple_SET_ITEM(fnames, i, field);
     }
-    result = PyObject_CallFunction((PyObject*)&PyType_Type, "s(O){OOOO}",
-                    type, base,
-                    astmodulestate_global->_fields, fnames,
-                    astmodulestate_global->__module__,
-                    astmodulestate_global->_ast);
+    result = PyObject_CallFunction((PyObject*)&PyType_Type, "s(O){sOss}",
+                    type, base, "_fields", fnames, "__module__", "_ast");
     Py_DECREF(fnames);
-    return result;
+    return (PyTypeObject*)result;
 }
 
-static int
-add_attributes(PyObject *type, const char * const *attrs, int num_fields)
+static int add_attributes(PyTypeObject* type, char**attrs, int num_fields)
 {
     int i, result;
     PyObject *s, *l = PyTuple_New(num_fields);
     if (!l)
         return 0;
     for (i = 0; i < num_fields; i++) {
-        s = PyUnicode_InternFromString(attrs[i]);
+        s = PyString_FromString(attrs[i]);
         if (!s) {
             Py_DECREF(l);
             return 0;
         }
         PyTuple_SET_ITEM(l, i, s);
     }
-    result = PyObject_SetAttr(type, astmodulestate_global->_attributes, l) >= 0;
+    result = PyObject_SetAttrString((PyObject*)type, "_attributes", l) >= 0;
     Py_DECREF(l);
     return result;
 }
@@ -811,7 +759,7 @@ add_attributes(PyObject *type, const char * const *attrs, int num_fields)
 static PyObject* ast2obj_list(asdl_seq *seq, PyObject* (*func)(void*))
 {
-    Py_ssize_t i, n = asdl_seq_LEN(seq);
+    int i, n = asdl_seq_LEN(seq);
     PyObject *result = PyList_New(n);
     PyObject *value;
     if (!result)
@@ -834,15 +782,16 @@ static PyObject* ast2obj_object(void *o)
     Py_INCREF((PyObject*)o);
     return (PyObject*)o;
 }
-#define ast2obj_singleton ast2obj_object
-#define ast2obj_constant ast2obj_object
 #define ast2obj_identifier ast2obj_object
 #define ast2obj_string ast2obj_object
-#define ast2obj_bytes ast2obj_object
+static PyObject* ast2obj_bool(bool b)
+{
+    return PyBool_FromLong(b);
+}
 
 static PyObject* ast2obj_int(long b)
 {
-    return PyLong_FromLong(b);
+    return PyInt_FromLong(b);
 }
 
 /* Conversion Python -> AST */
@@ -851,32 +800,18 @@
 static int obj2ast_object(PyObject* obj, PyObject** out, PyArena* arena)
 {
     if (obj == Py_None)
         obj = NULL;
-    if (obj) {
-        if (PyArena_AddPyObject(arena, obj) < 0) {
-            *out = NULL;
-            return -1;
-        }
-        Py_INCREF(obj);
-    }
-    *out = obj;
-    return 0;
-}
-
-static int obj2ast_constant(PyObject* obj, PyObject** out, PyArena* arena)
-{
-    if (PyArena_AddPyObject(arena, obj) < 0) {
-        *out = NULL;
-        return -1;
-    }
-    Py_INCREF(obj);
+    if (obj)
+        PyArena_AddPyObject(arena, obj);
+    Py_XINCREF(obj);
     *out = obj;
     return 0;
 }
 
 static int obj2ast_identifier(PyObject* obj, PyObject** out, PyArena* arena)
 {
-    if (!PyUnicode_CheckExact(obj) && obj != Py_None) {
-        PyErr_SetString(PyExc_TypeError, "AST identifier must be of type str");
+    if (!PyString_CheckExact(obj) && obj != Py_None) {
+        PyErr_Format(PyExc_TypeError,
+                     "AST identifier must be of type str");
         return 1;
     }
     return obj2ast_object(obj, out, arena);
@@ -884,8 +819,9 @@
 static int obj2ast_string(PyObject* obj, PyObject** out, PyArena* arena)
 {
-    if (!PyUnicode_CheckExact(obj) && !PyBytes_CheckExact(obj)) {
-        PyErr_SetString(PyExc_TypeError, "AST string must be of type str");
+    if (!PyString_CheckExact(obj) && !PyUnicode_CheckExact(obj)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "AST string must be of type str or unicode");
         return 1;
     }
     return obj2ast_object(obj, out, arena);
@@ -894,25 +830,47 @@
 static int obj2ast_int(PyObject* obj, int* out, PyArena* arena)
 {
     int i;
-    if (!PyLong_Check(obj)) {
-        PyErr_Format(PyExc_ValueError, "invalid integer value: %R", obj);
+    if (!_PyAnyInt_Check(obj)) {
+        PyObject *s = PyObject_Repr(obj);
+        if (s == NULL) return 1;
+        PyErr_Format(PyExc_ValueError, "invalid integer value: %.400s",
+                     PyString_AS_STRING(s));
+        Py_DECREF(s);
         return 1;
     }
-    i = _PyLong_AsInt(obj);
+    i = (int)PyLong_AsLong(obj);
     if (i == -1 && PyErr_Occurred())
         return 1;
     *out = i;
     return 0;
 }
 
+static int obj2ast_bool(PyObject* obj, bool* out, PyArena* arena)
+{
+    if (!PyBool_Check(obj)) {
+        PyObject *s = PyObject_Repr(obj);
+        if (s == NULL) return 1;
+        PyErr_Format(PyExc_ValueError, "invalid boolean value: %.400s",
+                     PyString_AS_STRING(s));
+        Py_DECREF(s);
+        return 1;
+    }
+
+    *out = (obj == Py_True);
+    return 0;
+}
+
 static int add_ast_fields(void)
 {
-    PyObject *empty_tuple;
+    PyObject *empty_tuple, *d;
+    if (PyType_Ready(&AST_type) < 0)
+        return -1;
+    d = AST_type.tp_dict;
     empty_tuple = PyTuple_New(0);
     if (!empty_tuple ||
-        PyObject_SetAttrString(astmodulestate_global->AST_type, "_fields", empty_tuple) < 0 ||
-        PyObject_SetAttrString(astmodulestate_global->AST_type, "_attributes", empty_tuple) < 0) {
+        PyDict_SetItemString(d, "_fields", empty_tuple) < 0 ||
+        PyDict_SetItemString(d, "_attributes", empty_tuple) < 0) {
         Py_XDECREF(empty_tuple);
         return -1;
     }
@@ -924,91 +882,71 @@ static int add_ast_fields(void)
 
         self.emit("static int init_types(void)",0)
         self.emit("{", 0)
-        self.emit("PyObject *m;", 1)
-        self.emit("if (PyState_FindModule(&_astmodule) == NULL) {", 1)
-        self.emit("m = PyModule_Create(&_astmodule);", 2)
-        self.emit("if (!m) return 0;", 2)
-        self.emit("PyState_AddModule(m, &_astmodule);", 2)
-        self.emit("}", 1)
-        self.emit("astmodulestate *state = astmodulestate_global;", 1)
-        self.emit("if (state->initialized) return 1;", 1)
-        self.emit("if (init_identifiers() < 0) return 0;", 1)
-        self.emit("state->AST_type = PyType_FromSpec(&AST_type_spec);", 1)
-        self.emit("if (!state->AST_type) return 0;", 1)
+        self.emit("static int initialized;", 1)
+        self.emit("if (initialized) return 1;", 1)
         self.emit("if (add_ast_fields() < 0) return 0;", 1)
         for dfn in mod.dfns:
             self.visit(dfn)
-        self.emit("state->initialized = 1;", 1)
+        self.emit("initialized = 1;", 1)
         self.emit("return 1;", 1);
         self.emit("}", 0)
 
     def visitProduct(self, prod, name):
         if prod.fields:
-            fields = name+"_fields"
+            fields = name.value+"_fields"
         else:
             fields = "NULL"
-        self.emit('state->%s_type = make_type("%s", state->AST_type, %s, %d);' %
+        self.emit('%s_type = make_type("%s", &AST_type, %s, %d);' %
                         (name, name, fields, len(prod.fields)), 1)
-        self.emit("if (!state->%s_type) return 0;" % name, 1)
-        self.emit_type("AST_type")
-        self.emit_type("%s_type" % name)
-        if prod.attributes:
-            self.emit("if (!add_attributes(state->%s_type, %s_attributes, %d)) return 0;" %
-                            (name, name, len(prod.attributes)), 1)
-        else:
-            self.emit("if (!add_attributes(state->%s_type, NULL, 0)) return 0;" % name, 1)
+        self.emit("if (!%s_type) return 0;" % name, 1)
 
     def visitSum(self, sum, name):
-        self.emit('state->%s_type = make_type("%s", state->AST_type, NULL, 0);' %
+        self.emit('%s_type = make_type("%s", &AST_type, NULL, 0);' %
                         (name, name), 1)
-        self.emit_type("%s_type" % name)
-        self.emit("if (!state->%s_type) return 0;" % name, 1)
+        self.emit("if (!%s_type) return 0;" % name, 1)
         if sum.attributes:
-            self.emit("if (!add_attributes(state->%s_type, %s_attributes, %d)) return 0;" %
+            self.emit("if (!add_attributes(%s_type, %s_attributes, %d)) return 0;" %
                             (name, name, len(sum.attributes)), 1)
        else:
-            self.emit("if (!add_attributes(state->%s_type, NULL, 0)) return 0;" % name, 1)
+            self.emit("if (!add_attributes(%s_type, NULL, 0)) return 0;" % name, 1)
        simple = is_simple(sum)
        for t in sum.types:
            self.visitConstructor(t, name, simple)
 
    def visitConstructor(self, cons, name, simple):
        if cons.fields:
-            fields = cons.name+"_fields"
+            fields = cons.name.value+"_fields"
        else:
            fields = "NULL"
-        self.emit('state->%s_type = make_type("%s", state->%s_type, %s, %d);' %
+        self.emit('%s_type = make_type("%s", %s_type, %s, %d);' %
                            (cons.name, cons.name, name, fields, len(cons.fields)), 1)
-        self.emit("if (!state->%s_type) return 0;" % cons.name, 1)
-        self.emit_type("%s_type" % cons.name)
+        self.emit("if (!%s_type) return 0;" % cons.name, 1)
        if simple:
-            self.emit("state->%s_singleton = PyType_GenericNew((PyTypeObject *)"
-                      "state->%s_type, NULL, NULL);" %
+            self.emit("%s_singleton = PyType_GenericNew(%s_type, NULL, NULL);" %
                             (cons.name, cons.name), 1)
-            self.emit("if (!state->%s_singleton) return 0;" % cons.name, 1)
+            self.emit("if (!%s_singleton) return 0;" % cons.name, 1)
 
 
 class ASTModuleVisitor(PickleVisitor):
 
     def visitModule(self, mod):
         self.emit("PyMODINIT_FUNC", 0)
-        self.emit("PyInit__ast(void)", 0)
+        self.emit("init_ast(void)", 0)
         self.emit("{", 0)
-        self.emit("PyObject *m;", 1)
-        self.emit("if (!init_types()) return NULL;", 1)
-        self.emit('m = PyState_FindModule(&_astmodule);', 1)
-        self.emit("if (!m) return NULL;", 1)
-        self.emit('Py_INCREF(astmodulestate(m)->AST_type);', 1)
-        self.emit('if (PyModule_AddObject(m, "AST", astmodulestate_global->AST_type) < 0) return NULL;', 1)
-        self.emit('if (PyModule_AddIntMacro(m, PyCF_ALLOW_TOP_LEVEL_AWAIT) < 0)', 1)
-        self.emit("return NULL;", 2)
-        self.emit('if (PyModule_AddIntMacro(m, PyCF_ONLY_AST) < 0)', 1)
-        self.emit("return NULL;", 2)
-        self.emit('if (PyModule_AddIntMacro(m, PyCF_TYPE_COMMENTS) < 0)', 1)
-        self.emit("return NULL;", 2)
+        self.emit("PyObject *m, *d;", 1)
+        self.emit("if (!init_types()) return;", 1)
+        self.emit('m = Py_InitModule3("_ast", NULL, NULL);', 1)
+        self.emit("if (!m) return;", 1)
+        self.emit("d = PyModule_GetDict(m);", 1)
+        self.emit('if (PyDict_SetItemString(d, "AST", (PyObject*)&AST_type) < 0) return;', 1)
+        self.emit('if (PyModule_AddIntConstant(m, "PyCF_ONLY_AST", PyCF_ONLY_AST) < 0)', 1)
+        self.emit("return;", 2)
+        # Value of version: "$Revision$"
+        self.emit('if (PyModule_AddStringConstant(m, "__version__", "%s") < 0)'
+                  % mod.version, 1)
+        self.emit("return;", 2)
         for dfn in mod.dfns:
             self.visit(dfn)
-        self.emit("return m;", 1)
         self.emit("}", 0)
 
     def visitProduct(self, prod, name):
@@ -1023,9 +961,7 @@ class ASTModuleVisitor(PickleVisitor):
         self.addObj(cons.name)
 
     def addObj(self, name):
-        self.emit("if (PyModule_AddObject(m, \"%s\", "
-                  "astmodulestate_global->%s_type) < 0) return NULL;" % (name, name), 1)
-        self.emit("Py_INCREF(astmodulestate(m)->%s_type);" % name, 1)
+        self.emit('if (PyDict_SetItemString(d, "%s", (PyObject*)%s_type) < 0) return;' % (name, name), 1)
 
 
 _SPECIALIZED_SEQUENCES = ('stmt', 'expr')
@@ -1063,9 +999,9 @@ class ObjVisitor(PickleVisitor):
         self.emit("{", 0)
         self.emit("%s o = (%s)_o;" % (ctype, ctype), 1)
         self.emit("PyObject *result = NULL, *value = NULL;", 1)
-        self.emit("PyTypeObject *tp;", 1)
         self.emit('if (!o) {', 1)
-        self.emit("Py_RETURN_NONE;", 2)
+        self.emit("Py_INCREF(Py_None);", 2)
+        self.emit('return Py_None;', 2)
         self.emit("}", 1)
         self.emit('', 0)
 
@@ -1091,7 +1027,7 @@ class ObjVisitor(PickleVisitor):
         for a in sum.attributes:
             self.emit("value = ast2obj_%s(o->%s);" % (a.type, a.name), 1)
             self.emit("if (!value) goto failed;", 1)
-            self.emit('if (PyObject_SetAttr(result, astmodulestate_global->%s, value) < 0)' % a.name, 1)
+            self.emit('if (PyObject_SetAttrString(result, "%s", value) < 0)' % a.name, 1)
             self.emit('goto failed;', 2)
             self.emit('Py_DECREF(value);', 1)
         self.func_end()
@@ -1102,8 +1038,8 @@ class ObjVisitor(PickleVisitor):
         self.emit("switch(o) {", 1)
         for t in sum.types:
             self.emit("case %s:" % t.name, 2)
-            self.emit("Py_INCREF(astmodulestate_global->%s_singleton);" % t.name, 3)
-            self.emit("return astmodulestate_global->%s_singleton;" % t.name, 3)
+            self.emit("Py_INCREF(%s_singleton);" % t.name, 3)
+            self.emit("return %s_singleton;" % t.name, 3)
         self.emit("default:", 2)
         self.emit('/* should never happen, but just in case ... */', 3)
         code = "PyErr_Format(PyExc_SystemError, \"unknown %s found\");" % name
@@ -1114,23 +1050,15 @@ class ObjVisitor(PickleVisitor):
 
     def visitProduct(self, prod, name):
         self.func_begin(name)
-        self.emit("tp = (PyTypeObject *)astmodulestate_global->%s_type;" % name, 1)
-        self.emit("result = PyType_GenericNew(tp, NULL, NULL);", 1);
+        self.emit("result = PyType_GenericNew(%s_type, NULL, NULL);" % name, 1);
         self.emit("if (!result) return NULL;", 1)
         for field in prod.fields:
             self.visitField(field, name, 1, True)
-        for a in prod.attributes:
-            self.emit("value = ast2obj_%s(o->%s);" % (a.type, a.name), 1)
-            self.emit("if (!value) goto failed;", 1)
-            self.emit("if (PyObject_SetAttr(result, astmodulestate_global->%s, value) < 0)" % a.name, 1)
-            self.emit('goto failed;', 2)
-            self.emit('Py_DECREF(value);', 1)
         self.func_end()
 
     def visitConstructor(self, cons, enum, name):
         self.emit("case %s_kind:" % cons.name, 1)
-        self.emit("tp = (PyTypeObject *)astmodulestate_global->%s_type;" % cons.name, 2)
-        self.emit("result = PyType_GenericNew(tp, NULL, NULL);", 2);
+        self.emit("result = PyType_GenericNew(%s_type, NULL, NULL);" % cons.name, 2);
         self.emit("if (!result) goto failed;", 2)
         for f in cons.fields:
             self.visitField(f, cons.name, 2, False)
@@ -1145,7 +1073,7 @@ class ObjVisitor(PickleVisitor):
         value = "o->v.%s.%s" % (name, field.name)
         self.set(field, value, depth)
         emit("if (!value) goto failed;", 0)
-        emit("if (PyObject_SetAttr(result, astmodulestate_global->%s, value) == -1)" % field.name, 0)
+        emit('if (PyObject_SetAttrString(result, "%s", value) == -1)' % field.name, 0)
         emit("goto failed;", 1)
         emit("Py_DECREF(value);", 0)
 
@@ -1164,11 +1092,11 @@ class ObjVisitor(PickleVisitor):
     def set(self, field, value, depth):
         if field.seq:
             # XXX should really check for is_simple, but that requires a symbol table
-            if field.type == "cmpop":
+            if field.type.value == "cmpop":
                 # While the sequence elements are stored as void*,
                 # ast2obj_cmpop expects an enum
                 self.emit("{", depth)
-                self.emit("Py_ssize_t i, n = asdl_seq_LEN(%s);" % value, depth+1)
+                self.emit("int i, n = asdl_seq_LEN(%s);" % value, depth+1)
                 self.emit("value = PyList_New(n);", depth+1)
                 self.emit("if (!value) goto failed;", depth+1)
                 self.emit("for(i = 0; i
< n; i++)", depth+1) @@ -1188,41 +1116,38 @@ class PartingShots(StaticVisitor): CODE = """ PyObject* PyAST_mod2obj(mod_ty t) { - if (!init_types()) - return NULL; + init_types(); return ast2obj_mod(t); } /* mode is 0 for "exec", 1 for "eval" and 2 for "single" input */ mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode) { + mod_ty res; PyObject *req_type[3]; - const char * const req_name[] = {"Module", "Expression", "Interactive"}; + char *req_name[3]; int isinstance; - if (PySys_Audit("compile", "OO", ast, Py_None) < 0) { - return NULL; - } + req_type[0] = (PyObject*)Module_type; + req_type[1] = (PyObject*)Expression_type; + req_type[2] = (PyObject*)Interactive_type; - req_type[0] = astmodulestate_global->Module_type; - req_type[1] = astmodulestate_global->Expression_type; - req_type[2] = astmodulestate_global->Interactive_type; + req_name[0] = "Module"; + req_name[1] = "Expression"; + req_name[2] = "Interactive"; assert(0 <= mode && mode <= 2); - if (!init_types()) - return NULL; + init_types(); isinstance = PyObject_IsInstance(ast, req_type[mode]); if (isinstance == -1) return NULL; if (!isinstance) { PyErr_Format(PyExc_TypeError, "expected %s node, got %.400s", - req_name[mode], _PyType_Name(Py_TYPE(ast))); + req_name[mode], Py_TYPE(ast)->tp_name); return NULL; } - - mod_ty res = NULL; if (obj2ast_mod(ast, &res, arena) != 0) return NULL; else @@ -1231,9 +1156,8 @@ mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode) int PyAST_Check(PyObject* obj) { - if (!init_types()) - return -1; - return PyObject_IsInstance(obj, astmodulestate_global->AST_type); + init_types(); + return PyObject_IsInstance(obj, (PyObject*)&AST_type); } """ @@ -1246,172 +1170,81 @@ class ChainOfVisitors: v.visit(object) v.emit("", 0) +common_msg = "/* File automatically generated by %s. 
*/\n\n" -def generate_module_def(f, mod): - # Gather all the data needed for ModuleSpec - visitor_list = set() - with open(os.devnull, "w") as devnull: - visitor = PyTypesDeclareVisitor(devnull) - visitor.visit(mod) - visitor_list.add(visitor) - visitor = PyTypesVisitor(devnull) - visitor.visit(mod) - visitor_list.add(visitor) - - state_strings = set(["__dict__", "_attributes", "_fields", "__module__", "_ast"]) - module_state = set(["__dict__", "_attributes", "_fields", "__module__", "_ast"]) - for visitor in visitor_list: - for identifier in visitor.identifiers: - module_state.add(identifier) - state_strings.add(identifier) - for singleton in visitor.singletons: - module_state.add(singleton) - for tp in visitor.types: - module_state.add(tp) - state_strings = sorted(state_strings) - module_state = sorted(module_state) - f.write('typedef struct {\n') - f.write(' int initialized;\n') - for s in module_state: - f.write(' PyObject *' + s + ';\n') - f.write('} astmodulestate;\n\n') - f.write(""" -#define astmodulestate(o) ((astmodulestate *)PyModule_GetState(o)) - -static int astmodule_clear(PyObject *module) -{ -""") - for s in module_state: - f.write(" Py_CLEAR(astmodulestate(module)->" + s + ');\n') - f.write(""" - return 0; -} - -static int astmodule_traverse(PyObject *module, visitproc visit, void* arg) -{ -""") - for s in module_state: - f.write(" Py_VISIT(astmodulestate(module)->" + s + ');\n') - f.write(""" - return 0; -} - -static void astmodule_free(void* module) { - astmodule_clear((PyObject*)module); -} - -static struct PyModuleDef _astmodule = { - PyModuleDef_HEAD_INIT, - "_ast", - NULL, - sizeof(astmodulestate), - NULL, - NULL, - astmodule_traverse, - astmodule_clear, - astmodule_free, -}; - -#define astmodulestate_global ((astmodulestate *)PyModule_GetState(PyState_FindModule(&_astmodule))) - -""") - f.write('static int init_identifiers(void)\n') - f.write('{\n') - f.write(' astmodulestate *state = astmodulestate_global;\n') - for identifier in state_strings: - f.write(' if ((state->' + identifier) - f.write(' = PyUnicode_InternFromString("') - f.write(identifier + '")) == NULL) return 0;\n') - f.write(' return 1;\n') - f.write('};\n\n') +c_file_msg = """ +/* + __version__ %s. + This module must be committed separately after each AST grammar change; + The __version__ number is set to the revision number of the commit + containing the grammar change. +*/ -common_msg = "/* File automatically generated by %s. 
*/\n\n" +""" -def main(srcfile, dump_module=False): +def main(srcfile): argv0 = sys.argv[0] components = argv0.split(os.sep) argv0 = os.sep.join(components[-2:]) auto_gen_msg = common_msg % argv0 mod = asdl.parse(srcfile) - if dump_module: - print('Parsed Module:') - print(mod) + mod.version = "82160" if not asdl.check(mod): sys.exit(1) - if H_FILE: - with open(H_FILE, "w") as f: - f.write(auto_gen_msg) - f.write('#ifndef Py_PYTHON_AST_H\n') - f.write('#define Py_PYTHON_AST_H\n') - f.write('#ifdef __cplusplus\n') - f.write('extern "C" {\n') - f.write('#endif\n') - f.write('\n') - f.write('#ifndef Py_LIMITED_API\n') - f.write('#include "asdl.h"\n') - f.write('\n') - f.write('#undef Yield /* undefine macro conflicting with <winbase.h> */\n') - f.write('\n') - c = ChainOfVisitors(TypeDefVisitor(f), - StructVisitor(f)) - - c.visit(mod) - f.write("// Note: these macros affect function definitions, not only call sites.\n") - PrototypeVisitor(f).visit(mod) - f.write("\n") - f.write("PyObject* PyAST_mod2obj(mod_ty t);\n") - f.write("mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode);\n") - f.write("int PyAST_Check(PyObject* obj);\n") - f.write("#endif /* !Py_LIMITED_API */\n") - f.write('\n') - f.write('#ifdef __cplusplus\n') - f.write('}\n') - f.write('#endif\n') - f.write('#endif /* !Py_PYTHON_AST_H */\n') - - if C_FILE: - with open(C_FILE, "w") as f: - f.write(auto_gen_msg) - f.write('#include <stddef.h>\n') - f.write('\n') - f.write('#include "Python.h"\n') - f.write('#include "%s-ast.h"\n' % mod.name) - f.write('#include "structmember.h"\n') - f.write('\n') - - generate_module_def(f, mod) - - v = ChainOfVisitors( - PyTypesDeclareVisitor(f), - PyTypesVisitor(f), - Obj2ModPrototypeVisitor(f), - FunctionVisitor(f), - ObjVisitor(f), - Obj2ModVisitor(f), - ASTModuleVisitor(f), - PartingShots(f), - ) - v.visit(mod) + if INC_DIR: + p = "%s/%s-ast.h" % (INC_DIR, mod.name) + f = open(p, "wb") + f.write(auto_gen_msg) + f.write('#include "asdl.h"\n\n') + c = ChainOfVisitors(TypeDefVisitor(f), + StructVisitor(f), + PrototypeVisitor(f), + ) + c.visit(mod) + f.write("PyObject* PyAST_mod2obj(mod_ty t);\n") + f.write("mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode);\n") + f.write("int PyAST_Check(PyObject* obj);\n") + f.close() + + if SRC_DIR: + p = os.path.join(SRC_DIR, str(mod.name) + "-ast.c") + f = open(p, "wb") + f.write(auto_gen_msg) + f.write(c_file_msg % mod.version) + f.write('#include "Python.h"\n') + f.write('#include "%s-ast.h"\n' % mod.name) + f.write('\n') + f.write("static PyTypeObject AST_type;\n") + v = ChainOfVisitors( + PyTypesDeclareVisitor(f), + PyTypesVisitor(f), + Obj2ModPrototypeVisitor(f), + FunctionVisitor(f), + ObjVisitor(f), + Obj2ModVisitor(f), + ASTModuleVisitor(f), + PartingShots(f), + ) + v.visit(mod) + f.close() if __name__ == "__main__": + import sys import getopt - H_FILE = '' - C_FILE = '' - dump_module = False - opts, args = getopt.getopt(sys.argv[1:], "dh:c:") + INC_DIR = '' + SRC_DIR = '' + opts, args = getopt.getopt(sys.argv[1:], "h:c:") + if len(opts) != 1: + print "Must specify exactly one output file" + sys.exit(1) for o, v in opts: if o == '-h': - H_FILE = v - elif o == '-c': - C_FILE = v - elif o == '-d': - dump_module = True - if H_FILE and C_FILE: - print('Must specify exactly one output file') - sys.exit(1) - elif len(args) != 1: - print('Must specify single input file') + INC_DIR = v + if o == '-c': + SRC_DIR = v + if len(args) != 1: + print "Must specify single input file" sys.exit(1) - main(args[0], dump_module) + main(args[0]) diff 
--git a/Parser/bitset.c new file mode 100644 index 0000000..f5bfd41 --- /dev/null +++ b/Parser/bitset.c @@ -0,0 +1,66 @@ + +/* Bitset primitives used by the parser generator */ + +#include "pgenheaders.h" +#include "bitset.h" + +bitset +newbitset(int nbits) +{ + int nbytes = NBYTES(nbits); + bitset ss = (char *)PyObject_MALLOC(sizeof(BYTE) * nbytes); + + if (ss == NULL) + Py_FatalError("no mem for bitset"); + + ss += nbytes; + while (--nbytes >= 0) + *--ss = 0; + return ss; +} + +void +delbitset(bitset ss) +{ + PyObject_FREE(ss); +} + +int +addbit(bitset ss, int ibit) +{ + int ibyte = BIT2BYTE(ibit); + BYTE mask = BIT2MASK(ibit); + + if (ss[ibyte] & mask) + return 0; /* Bit already set */ + ss[ibyte] |= mask; + return 1; +} + +#if 0 /* Now a macro */ +int +testbit(bitset ss, int ibit) +{ + return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0; +} +#endif + +int +samebitset(bitset ss1, bitset ss2, int nbits) +{ + int i; + + for (i = NBYTES(nbits); --i >= 0; ) + if (*ss1++ != *ss2++) + return 0; + return 1; +} + +void +mergebitset(bitset ss1, bitset ss2, int nbits) +{ + int i; + + for (i = NBYTES(nbits); --i >= 0; ) + *ss1++ |= *ss2++; +} diff --git a/Parser/firstsets.c new file mode 100644 index 0000000..ee75d1b --- /dev/null +++ b/Parser/firstsets.c @@ -0,0 +1,113 @@ + +/* Computation of FIRST sets */ + +#include "pgenheaders.h" +#include "grammar.h" +#include "token.h" + +extern int Py_DebugFlag; + +/* Forward */ +static void calcfirstset(grammar *, dfa *); + +void +addfirstsets(grammar *g) +{ + int i; + dfa *d; + + if (Py_DebugFlag) + printf("Adding FIRST sets ...\n"); + for (i = 0; i < g->g_ndfas; i++) { + d = &g->g_dfa[i]; + if (d->d_first == NULL) + calcfirstset(g, d); + } +} + +static void +calcfirstset(grammar *g, dfa *d) +{ + int i, j; + state *s; + arc *a; + int nsyms; + int *sym; + int nbits; + static bitset dummy; + bitset result; + int type; + dfa *d1; + label *l0; + + if (Py_DebugFlag) + printf("Calculate FIRST set for '%s'\n", d->d_name); + + if (dummy == NULL) + dummy = newbitset(1); + if (d->d_first == dummy) { + fprintf(stderr, "Left-recursion for '%s'\n", d->d_name); + return; + } + if (d->d_first != NULL) { + fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n", + d->d_name); + } + d->d_first = dummy; + + l0 = g->g_ll.ll_label; + nbits = g->g_ll.ll_nlabels; + result = newbitset(nbits); + + sym = (int *)PyObject_MALLOC(sizeof(int)); + if (sym == NULL) + Py_FatalError("no mem for new sym in calcfirstset"); + nsyms = 1; + sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL); + + s = &d->d_state[d->d_initial]; + for (i = 0; i < s->s_narcs; i++) { + a = &s->s_arc[i]; + for (j = 0; j < nsyms; j++) { + if (sym[j] == a->a_lbl) + break; + } + if (j >= nsyms) { /* New label */ + sym = (int *)PyObject_REALLOC(sym, + sizeof(int) * (nsyms + 1)); + if (sym == NULL) + Py_FatalError( + "no mem to resize sym in calcfirstset"); + sym[nsyms++] = a->a_lbl; + type = l0[a->a_lbl].lb_type; + if (ISNONTERMINAL(type)) { + d1 = PyGrammar_FindDFA(g, type); + if (d1->d_first == dummy) { + fprintf(stderr, + "Left-recursion below '%s'\n", + d->d_name); + } + else { + if (d1->d_first == NULL) + calcfirstset(g, d1); + mergebitset(result, + d1->d_first, nbits); + } + } + else if (ISTERMINAL(type)) { + addbit(result, a->a_lbl); + } + } + } + d->d_first = result; + if (Py_DebugFlag) { + printf("FIRST set for '%s': {", d->d_name); + for (i = 0; i < nbits; i++) { + if (testbit(result, i)) + printf(" %s", PyGrammar_LabelRepr(&l0[i])); + } + printf(" }\n"); + } + + 
PyObject_FREE(sym); +} diff --git a/Parser/grammar.c b/Parser/grammar.c new file mode 100644 index 0000000..fcd2219 --- /dev/null +++ b/Parser/grammar.c @@ -0,0 +1,272 @@ + +/* Grammar implementation */ + +#include "Python.h" +#include "pgenheaders.h" + +#include <ctype.h> + +#include "token.h" +#include "grammar.h" + +#ifdef RISCOS +#include <unixlib.h> +#endif + +extern int Py_DebugFlag; + +grammar * +newgrammar(int start) +{ + grammar *g; + + g = (grammar *)PyObject_MALLOC(sizeof(grammar)); + if (g == NULL) + Py_FatalError("no mem for new grammar"); + g->g_ndfas = 0; + g->g_dfa = NULL; + g->g_start = start; + g->g_ll.ll_nlabels = 0; + g->g_ll.ll_label = NULL; + g->g_accel = 0; + return g; +} + +void +freegrammar(grammar *g) +{ + int i; + for (i = 0; i < g->g_ndfas; i++) { + int j; + free(g->g_dfa[i].d_name); + for (j = 0; j < g->g_dfa[i].d_nstates; j++) + PyObject_FREE(g->g_dfa[i].d_state[j].s_arc); + PyObject_FREE(g->g_dfa[i].d_state); + } + PyObject_FREE(g->g_dfa); + for (i = 0; i < g->g_ll.ll_nlabels; i++) + free(g->g_ll.ll_label[i].lb_str); + PyObject_FREE(g->g_ll.ll_label); + PyObject_FREE(g); +} + +dfa * +adddfa(grammar *g, int type, char *name) +{ + dfa *d; + + g->g_dfa = (dfa *)PyObject_REALLOC(g->g_dfa, + sizeof(dfa) * (g->g_ndfas + 1)); + if (g->g_dfa == NULL) + Py_FatalError("no mem to resize dfa in adddfa"); + d = &g->g_dfa[g->g_ndfas++]; + d->d_type = type; + d->d_name = strdup(name); + d->d_nstates = 0; + d->d_state = NULL; + d->d_initial = -1; + d->d_first = NULL; + return d; /* Only use while fresh! */ +} + +int +addstate(dfa *d) +{ + state *s; + + d->d_state = (state *)PyObject_REALLOC(d->d_state, + sizeof(state) * (d->d_nstates + 1)); + if (d->d_state == NULL) + Py_FatalError("no mem to resize state in addstate"); + s = &d->d_state[d->d_nstates++]; + s->s_narcs = 0; + s->s_arc = NULL; + s->s_lower = 0; + s->s_upper = 0; + s->s_accel = NULL; + s->s_accept = 0; + return s - d->d_state; +} + +void +addarc(dfa *d, int from, int to, int lbl) +{ + state *s; + arc *a; + + assert(0 <= from && from < d->d_nstates); + assert(0 <= to && to < d->d_nstates); + + s = &d->d_state[from]; + s->s_arc = (arc *)PyObject_REALLOC(s->s_arc, sizeof(arc) * (s->s_narcs + 1)); + if (s->s_arc == NULL) + Py_FatalError("no mem to resize arc list in addarc"); + a = &s->s_arc[s->s_narcs++]; + a->a_lbl = lbl; + a->a_arrow = to; +} + +int +addlabel(labellist *ll, int type, char *str) +{ + int i; + label *lb; + + for (i = 0; i < ll->ll_nlabels; i++) { + if (ll->ll_label[i].lb_type == type && + strcmp(ll->ll_label[i].lb_str, str) == 0) + return i; + } + ll->ll_label = (label *)PyObject_REALLOC(ll->ll_label, + sizeof(label) * (ll->ll_nlabels + 1)); + if (ll->ll_label == NULL) + Py_FatalError("no mem to resize labellist in addlabel"); + lb = &ll->ll_label[ll->ll_nlabels++]; + lb->lb_type = type; + lb->lb_str = strdup(str); + if (Py_DebugFlag) + printf("Label @ %8p, %d: %s\n", ll, ll->ll_nlabels, + PyGrammar_LabelRepr(lb)); + return lb - ll->ll_label; +} + +/* Same, but rather dies than adds */ + +int +findlabel(labellist *ll, int type, char *str) +{ + int i; + + for (i = 0; i < ll->ll_nlabels; i++) { + if (ll->ll_label[i].lb_type == type /*&& + strcmp(ll->ll_label[i].lb_str, str) == 0*/) + return i; + } + fprintf(stderr, "Label %d/'%s' not found\n", type, str); + Py_FatalError("grammar.c:findlabel()"); + return 0; /* Make gcc -Wall happy */ +} + +/* Forward */ +static void translabel(grammar *, label *); + +void +translatelabels(grammar *g) +{ + int i; + +#ifdef Py_DEBUG + printf("Translating labels 
...\n"); +#endif + /* Don't translate EMPTY */ + for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++) + translabel(g, &g->g_ll.ll_label[i]); +} + +static void +translabel(grammar *g, label *lb) +{ + int i; + + if (Py_DebugFlag) + printf("Translating label %s ...\n", PyGrammar_LabelRepr(lb)); + + if (lb->lb_type == NAME) { + for (i = 0; i < g->g_ndfas; i++) { + if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) { + if (Py_DebugFlag) + printf( + "Label %s is non-terminal %d.\n", + lb->lb_str, + g->g_dfa[i].d_type); + lb->lb_type = g->g_dfa[i].d_type; + free(lb->lb_str); + lb->lb_str = NULL; + return; + } + } + for (i = 0; i < (int)N_TOKENS; i++) { + if (strcmp(lb->lb_str, _PyParser_TokenNames[i]) == 0) { + if (Py_DebugFlag) + printf("Label %s is terminal %d.\n", + lb->lb_str, i); + lb->lb_type = i; + free(lb->lb_str); + lb->lb_str = NULL; + return; + } + } + printf("Can't translate NAME label '%s'\n", lb->lb_str); + return; + } + + if (lb->lb_type == STRING) { + if (isalpha(Py_CHARMASK(lb->lb_str[1])) || + lb->lb_str[1] == '_') { + char *p; + char *src; + char *dest; + size_t name_len; + if (Py_DebugFlag) + printf("Label %s is a keyword\n", lb->lb_str); + lb->lb_type = NAME; + src = lb->lb_str + 1; + p = strchr(src, '\''); + if (p) + name_len = p - src; + else + name_len = strlen(src); + dest = (char *)malloc(name_len + 1); + if (!dest) { + printf("Can't alloc dest '%s'\n", src); + return; + } + strncpy(dest, src, name_len); + dest[name_len] = '\0'; + free(lb->lb_str); + lb->lb_str = dest; + } + else if (lb->lb_str[2] == lb->lb_str[0]) { + int type = (int) PyToken_OneChar(lb->lb_str[1]); + if (type != OP) { + lb->lb_type = type; + free(lb->lb_str); + lb->lb_str = NULL; + } + else + printf("Unknown OP label %s\n", + lb->lb_str); + } + else if (lb->lb_str[2] && lb->lb_str[3] == lb->lb_str[0]) { + int type = (int) PyToken_TwoChars(lb->lb_str[1], + lb->lb_str[2]); + if (type != OP) { + lb->lb_type = type; + free(lb->lb_str); + lb->lb_str = NULL; + } + else + printf("Unknown OP label %s\n", + lb->lb_str); + } + else if (lb->lb_str[2] && lb->lb_str[3] && lb->lb_str[4] == lb->lb_str[0]) { + int type = (int) PyToken_ThreeChars(lb->lb_str[1], + lb->lb_str[2], + lb->lb_str[3]); + if (type != OP) { + lb->lb_type = type; + free(lb->lb_str); + lb->lb_str = NULL; + } + else + printf("Unknown OP label %s\n", + lb->lb_str); + } + else + printf("Can't translate STRING label %s\n", + lb->lb_str); + } + else + printf("Can't translate label '%s'\n", + PyGrammar_LabelRepr(lb)); +} diff --git a/Parser/grammar1.c b/Parser/grammar1.c index e0b8fbb..1f7d264 100644 --- a/Parser/grammar1.c +++ b/Parser/grammar1.c @@ -2,21 +2,35 @@ /* Grammar subroutines needed by parser */ #include "Python.h" +#include "pgenheaders.h" #include "grammar.h" #include "token.h" /* Return the DFA for the given type */ -const dfa * -PyGrammar_FindDFA(grammar *g, int type) +dfa * +PyGrammar_FindDFA(grammar *g, register int type) { + register dfa *d; +#if 1 /* Massive speed-up */ - const dfa *d = &g->g_dfa[type - NT_OFFSET]; + d = &g->g_dfa[type - NT_OFFSET]; assert(d->d_type == type); return d; +#else + /* Old, slow version */ + register int i; + + for (i = g->g_ndfas, d = g->g_dfa; --i >= 0; d++) { + if (d->d_type == type) + return d; + } + assert(0); + /* NOTREACHED */ +#endif } -const char * +char * PyGrammar_LabelRepr(label *lb) { static char buf[100]; @@ -31,7 +45,7 @@ PyGrammar_LabelRepr(label *lb) else return lb->lb_str; } - else if (lb->lb_type < N_TOKENS) { + else { if (lb->lb_str == NULL) return _PyParser_TokenNames[lb->lb_type]; else { @@ 
-40,8 +54,4 @@ PyGrammar_LabelRepr(label *lb) return buf; } } - else { - Py_FatalError("invalid label"); - return NULL; - } } diff --git a/Parser/intrcheck.c b/Parser/intrcheck.c new file mode 100644 index 0000000..5844a9a --- /dev/null +++ b/Parser/intrcheck.c @@ -0,0 +1,178 @@ + +/* Check for interrupts */ + +#include "Python.h" +#include "pythread.h" + +#ifdef QUICKWIN + +#include <io.h> + +void +PyOS_InitInterrupts(void) +{ +} + +void +PyOS_FiniInterrupts(void) +{ +} + +int +PyOS_InterruptOccurred(void) +{ + _wyield(); +} + +#define OK + +#endif /* QUICKWIN */ + +#if defined(_M_IX86) && !defined(__QNX__) +#include <io.h> +#endif + +#if defined(MSDOS) && !defined(QUICKWIN) + +#ifdef __GNUC__ + +/* This is for DJGPP's GO32 extender. I don't know how to trap + * control-C (There's no API for ctrl-C, and I don't want to mess with + * the interrupt vectors.) However, this DOES catch control-break. + * --Amrit + */ + +#include <go32.h> + +void +PyOS_InitInterrupts(void) +{ + _go32_want_ctrl_break(1 /* TRUE */); +} + +void +PyOS_FiniInterrupts(void) +{ +} + +int +PyOS_InterruptOccurred(void) +{ + return _go32_was_ctrl_break_hit(); +} + +#else /* !__GNUC__ */ + +/* This might work for MS-DOS (untested though): */ + +void +PyOS_InitInterrupts(void) +{ +} + +void +PyOS_FiniInterrupts(void) +{ +} + +int +PyOS_InterruptOccurred(void) +{ + int interrupted = 0; + while (kbhit()) { + if (getch() == '\003') + interrupted = 1; + } + return interrupted; +} + +#endif /* __GNUC__ */ + +#define OK + +#endif /* MSDOS && !QUICKWIN */ + + +#ifndef OK + +/* Default version -- for real operating systems and for Standard C */ + +#include <stdio.h> +#include <string.h> +#include <signal.h> + +static int interrupted; + +void +PyErr_SetInterrupt(void) +{ + interrupted = 1; +} + +extern int PyErr_CheckSignals(void); + +static int +checksignals_witharg(void * arg) +{ + return PyErr_CheckSignals(); +} + +static void +intcatcher(int sig) +{ + extern void Py_Exit(int); + static char message[] = +"python: to interrupt a truly hanging Python program, interrupt once more.\n"; + switch (interrupted++) { + case 0: + break; + case 1: +#ifdef RISCOS + fprintf(stderr, message); +#else + write(2, message, strlen(message)); +#endif + break; + case 2: + interrupted = 0; + Py_Exit(1); + break; + } + PyOS_setsig(SIGINT, intcatcher); + Py_AddPendingCall(checksignals_witharg, NULL); +} + +static void (*old_siginthandler)(int) = SIG_DFL; + +void +PyOS_InitInterrupts(void) +{ + if ((old_siginthandler = PyOS_setsig(SIGINT, SIG_IGN)) != SIG_IGN) + PyOS_setsig(SIGINT, intcatcher); +} + +void +PyOS_FiniInterrupts(void) +{ + PyOS_setsig(SIGINT, old_siginthandler); +} + +int +PyOS_InterruptOccurred(void) +{ + if (!interrupted) + return 0; + interrupted = 0; + return 1; +} + +#endif /* !OK */ + +void +PyOS_AfterFork(void) +{ +#ifdef WITH_THREAD + PyThread_ReInitTLS(); + PyEval_ReInitThreads(); +#endif +} diff --git a/Parser/listnode.c b/Parser/listnode.c index d431ae5..b5f8ad2 100644 --- a/Parser/listnode.c +++ b/Parser/listnode.c @@ -1,8 +1,7 @@ /* List a node on a file */ -#include "Python.h" -#include "pycore_pystate.h" +#include "pgenheaders.h" #include "token.h" #include "node.h" @@ -16,22 +15,20 @@ PyNode_ListTree(node *n) listnode(stdout, n); } +static int level, atbol; + static void listnode(FILE *fp, node *n) { - PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE(); - - interp->parser.listnode.level = 0; - interp->parser.listnode.atbol = 1; + level = 0; + atbol = 1; list1node(fp, n); } static void list1node(FILE *fp, node *n) 
{ - PyInterpreterState *interp; - - if (n == NULL) + if (n == 0) return; if (ISNONTERMINAL(TYPE(n))) { int i; @@ -39,26 +36,25 @@ list1node(FILE *fp, node *n) list1node(fp, CHILD(n, i)); } else if (ISTERMINAL(TYPE(n))) { - interp = _PyInterpreterState_GET_UNSAFE(); switch (TYPE(n)) { case INDENT: - interp->parser.listnode.level++; + ++level; break; case DEDENT: - interp->parser.listnode.level--; + --level; break; default: - if (interp->parser.listnode.atbol) { + if (atbol) { int i; - for (i = 0; i < interp->parser.listnode.level; ++i) + for (i = 0; i < level; ++i) fprintf(fp, "\t"); - interp->parser.listnode.atbol = 0; + atbol = 0; } if (TYPE(n) == NEWLINE) { if (STR(n) != NULL) fprintf(fp, "%s", STR(n)); fprintf(fp, "\n"); - interp->parser.listnode.atbol = 1; + atbol = 1; } else fprintf(fp, "%s ", STR(n)); diff --git a/Parser/metagrammar.c b/Parser/metagrammar.c new file mode 100644 index 0000000..53810b8 --- /dev/null +++ b/Parser/metagrammar.c @@ -0,0 +1,159 @@ + +#include "pgenheaders.h" +#include "metagrammar.h" +#include "grammar.h" +#include "pgen.h" +static arc arcs_0_0[3] = { + {2, 0}, + {3, 0}, + {4, 1}, +}; +static arc arcs_0_1[1] = { + {0, 1}, +}; +static state states_0[2] = { + {3, arcs_0_0}, + {1, arcs_0_1}, +}; +static arc arcs_1_0[1] = { + {5, 1}, +}; +static arc arcs_1_1[1] = { + {6, 2}, +}; +static arc arcs_1_2[1] = { + {7, 3}, +}; +static arc arcs_1_3[1] = { + {3, 4}, +}; +static arc arcs_1_4[1] = { + {0, 4}, +}; +static state states_1[5] = { + {1, arcs_1_0}, + {1, arcs_1_1}, + {1, arcs_1_2}, + {1, arcs_1_3}, + {1, arcs_1_4}, +}; +static arc arcs_2_0[1] = { + {8, 1}, +}; +static arc arcs_2_1[2] = { + {9, 0}, + {0, 1}, +}; +static state states_2[2] = { + {1, arcs_2_0}, + {2, arcs_2_1}, +}; +static arc arcs_3_0[1] = { + {10, 1}, +}; +static arc arcs_3_1[2] = { + {10, 1}, + {0, 1}, +}; +static state states_3[2] = { + {1, arcs_3_0}, + {2, arcs_3_1}, +}; +static arc arcs_4_0[2] = { + {11, 1}, + {13, 2}, +}; +static arc arcs_4_1[1] = { + {7, 3}, +}; +static arc arcs_4_2[3] = { + {14, 4}, + {15, 4}, + {0, 2}, +}; +static arc arcs_4_3[1] = { + {12, 4}, +}; +static arc arcs_4_4[1] = { + {0, 4}, +}; +static state states_4[5] = { + {2, arcs_4_0}, + {1, arcs_4_1}, + {3, arcs_4_2}, + {1, arcs_4_3}, + {1, arcs_4_4}, +}; +static arc arcs_5_0[3] = { + {5, 1}, + {16, 1}, + {17, 2}, +}; +static arc arcs_5_1[1] = { + {0, 1}, +}; +static arc arcs_5_2[1] = { + {7, 3}, +}; +static arc arcs_5_3[1] = { + {18, 1}, +}; +static state states_5[4] = { + {3, arcs_5_0}, + {1, arcs_5_1}, + {1, arcs_5_2}, + {1, arcs_5_3}, +}; +static dfa dfas[6] = { + {256, "MSTART", 0, 2, states_0, + "\070\000\000"}, + {257, "RULE", 0, 5, states_1, + "\040\000\000"}, + {258, "RHS", 0, 2, states_2, + "\040\010\003"}, + {259, "ALT", 0, 2, states_3, + "\040\010\003"}, + {260, "ITEM", 0, 5, states_4, + "\040\010\003"}, + {261, "ATOM", 0, 4, states_5, + "\040\000\003"}, +}; +static label labels[19] = { + {0, "EMPTY"}, + {256, 0}, + {257, 0}, + {4, 0}, + {0, 0}, + {1, 0}, + {11, 0}, + {258, 0}, + {259, 0}, + {18, 0}, + {260, 0}, + {9, 0}, + {10, 0}, + {261, 0}, + {16, 0}, + {14, 0}, + {3, 0}, + {7, 0}, + {8, 0}, +}; +static grammar _PyParser_Grammar = { + 6, + dfas, + {19, labels}, + 256 +}; + +grammar * +meta_grammar(void) +{ + return &_PyParser_Grammar; +} + +grammar * +Py_meta_grammar(void) +{ + return meta_grammar(); +} diff --git a/Parser/myreadline.c b/Parser/myreadline.c index 43e5583..5376214 100644 --- a/Parser/myreadline.c +++ b/Parser/myreadline.c @@ -10,31 +10,40 @@ */ #include "Python.h" -#include 
"pycore_pystate.h" #ifdef MS_WINDOWS #define WIN32_LEAN_AND_MEAN #include "windows.h" #endif /* MS_WINDOWS */ +#ifdef __VMS +extern char* vms__StdioReadline(FILE *sys_stdin, FILE *sys_stdout, char *prompt); +#endif + -PyThreadState* _PyOS_ReadlineTState = NULL; +PyThreadState* _PyOS_ReadlineTState; +#ifdef WITH_THREAD #include "pythread.h" static PyThread_type_lock _PyOS_ReadlineLock = NULL; +#endif int (*PyOS_InputHook)(void) = NULL; +#ifdef RISCOS +int Py_RISCOSWimpFlag; +#endif + /* This function restarts a fgets() after an EINTR error occurred except if PyOS_InterruptOccurred() returns true. */ static int my_fgets(char *buf, int len, FILE *fp) { + char *p; #ifdef MS_WINDOWS - HANDLE hInterruptEvent; + int i; #endif - char *p; - int err; + while (1) { if (PyOS_InputHook != NULL) (void)(PyOS_InputHook)(); @@ -43,29 +52,24 @@ my_fgets(char *buf, int len, FILE *fp) p = fgets(buf, len, fp); if (p != NULL) return 0; /* No error */ - err = errno; #ifdef MS_WINDOWS /* Ctrl-C anywhere on the line or Ctrl-Z if the only character on a line will set ERROR_OPERATION_ABORTED. Under normal circumstances Ctrl-C will also have caused the SIGINT handler - to fire which will have set the event object returned by - _PyOS_SigintEvent. This signal fires in another thread and - is not guaranteed to have occurred before this point in the - code. - - Therefore: check whether the event is set with a small timeout. - If it is, assume this is a Ctrl-C and reset the event. If it - isn't set assume that this is a Ctrl-Z on its own and drop - through to check for EOF. + to fire. This signal fires in another thread and is not + guaranteed to have occurred before this point in the code. + + Therefore: check in a small loop to see if the trigger has + fired, in which case assume this is a Ctrl-C event. If it + hasn't fired within 10ms assume that this is a Ctrl-Z on its + own or that the signal isn't going to fire for some other + reason and drop through to check for EOF. 
*/ if (GetLastError()==ERROR_OPERATION_ABORTED) { - hInterruptEvent = _PyOS_SigintEvent(); - switch (WaitForSingleObjectEx(hInterruptEvent, 10, FALSE)) { - case WAIT_OBJECT_0: - ResetEvent(hInterruptEvent); - return 1; /* Interrupt */ - case WAIT_FAILED: - return -2; /* Error */ + for (i = 0; i < 10; i++) { + if (PyOS_InterruptOccurred()) + return 1; + Sleep(1); } } #endif /* MS_WINDOWS */ @@ -74,14 +78,18 @@ my_fgets(char *buf, int len, FILE *fp) return -1; /* EOF */ } #ifdef EINTR - if (err == EINTR) { + if (errno == EINTR) { int s; +#ifdef WITH_THREAD PyEval_RestoreThread(_PyOS_ReadlineTState); +#endif s = PyErr_CheckSignals(); +#ifdef WITH_THREAD PyEval_SaveThread(); +#endif if (s < 0) return 1; - /* try again */ + /* try again */ continue; } #endif @@ -93,185 +101,35 @@ my_fgets(char *buf, int len, FILE *fp) /* NOTREACHED */ } -#ifdef MS_WINDOWS -/* Readline implementation using ReadConsoleW */ - -extern char _get_console_type(HANDLE handle); - -char * -_PyOS_WindowsConsoleReadline(HANDLE hStdIn) -{ - static wchar_t wbuf_local[1024 * 16]; - const DWORD chunk_size = 1024; - - DWORD n_read, total_read, wbuflen, u8len; - wchar_t *wbuf; - char *buf = NULL; - int err = 0; - - n_read = (DWORD)-1; - total_read = 0; - wbuf = wbuf_local; - wbuflen = sizeof(wbuf_local) / sizeof(wbuf_local[0]) - 1; - while (1) { - if (PyOS_InputHook != NULL) { - (void)(PyOS_InputHook)(); - } - if (!ReadConsoleW(hStdIn, &wbuf[total_read], wbuflen - total_read, &n_read, NULL)) { - err = GetLastError(); - goto exit; - } - if (n_read == (DWORD)-1 && (err = GetLastError()) == ERROR_OPERATION_ABORTED) { - break; - } - if (n_read == 0) { - int s; - err = GetLastError(); - if (err != ERROR_OPERATION_ABORTED) - goto exit; - err = 0; - HANDLE hInterruptEvent = _PyOS_SigintEvent(); - if (WaitForSingleObjectEx(hInterruptEvent, 100, FALSE) - == WAIT_OBJECT_0) { - ResetEvent(hInterruptEvent); - PyEval_RestoreThread(_PyOS_ReadlineTState); - s = PyErr_CheckSignals(); - PyEval_SaveThread(); - if (s < 0) - goto exit; - } - break; - } - - total_read += n_read; - if (total_read == 0 || wbuf[total_read - 1] == L'\n') { - break; - } - wbuflen += chunk_size; - if (wbuf == wbuf_local) { - wbuf[total_read] = '\0'; - wbuf = (wchar_t*)PyMem_RawMalloc(wbuflen * sizeof(wchar_t)); - if (wbuf) - wcscpy_s(wbuf, wbuflen, wbuf_local); - else { - PyErr_NoMemory(); - goto exit; - } - } - else { - wchar_t *tmp = PyMem_RawRealloc(wbuf, wbuflen * sizeof(wchar_t)); - if (tmp == NULL) { - PyErr_NoMemory(); - goto exit; - } - wbuf = tmp; - } - } - - if (wbuf[0] == '\x1a') { - buf = PyMem_RawMalloc(1); - if (buf) - buf[0] = '\0'; - else { - PyErr_NoMemory(); - } - goto exit; - } - - u8len = WideCharToMultiByte(CP_UTF8, 0, wbuf, total_read, NULL, 0, NULL, NULL); - buf = PyMem_RawMalloc(u8len + 1); - if (buf == NULL) { - PyErr_NoMemory(); - goto exit; - } - u8len = WideCharToMultiByte(CP_UTF8, 0, wbuf, total_read, buf, u8len, NULL, NULL); - buf[u8len] = '\0'; - -exit: - if (wbuf != wbuf_local) - PyMem_RawFree(wbuf); - - if (err) { - PyEval_RestoreThread(_PyOS_ReadlineTState); - PyErr_SetFromWindowsErr(err); - PyEval_SaveThread(); - } - - return buf; -} - -#endif - /* Readline implementation using fgets() */ char * -PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt) +PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, char *prompt) { size_t n; char *p, *pr; - -#ifdef MS_WINDOWS - if (!Py_LegacyWindowsStdioFlag && sys_stdin == stdin) { - HANDLE hStdIn, hStdErr; - - _Py_BEGIN_SUPPRESS_IPH - hStdIn = 
(HANDLE)_get_osfhandle(fileno(sys_stdin)); - hStdErr = (HANDLE)_get_osfhandle(fileno(stderr)); - _Py_END_SUPPRESS_IPH - - if (_get_console_type(hStdIn) == 'r') { - fflush(sys_stdout); - if (prompt) { - if (_get_console_type(hStdErr) == 'w') { - wchar_t *wbuf; - int wlen; - wlen = MultiByteToWideChar(CP_UTF8, 0, prompt, -1, - NULL, 0); - if (wlen) { - wbuf = PyMem_RawMalloc(wlen * sizeof(wchar_t)); - if (wbuf == NULL) { - PyErr_NoMemory(); - return NULL; - } - wlen = MultiByteToWideChar(CP_UTF8, 0, prompt, -1, - wbuf, wlen); - if (wlen) { - DWORD n; - fflush(stderr); - /* wlen includes null terminator, so subtract 1 */ - WriteConsoleW(hStdErr, wbuf, wlen - 1, &n, NULL); - } - PyMem_RawFree(wbuf); - } - } else { - fprintf(stderr, "%s", prompt); - fflush(stderr); - } - } - clearerr(sys_stdin); - return _PyOS_WindowsConsoleReadline(hStdIn); - } - } -#endif - n = 100; - p = (char *)PyMem_RawMalloc(n); - if (p == NULL) { - PyErr_NoMemory(); + if ((p = (char *)PyMem_MALLOC(n)) == NULL) return NULL; - } - fflush(sys_stdout); +#ifndef RISCOS if (prompt) fprintf(stderr, "%s", prompt); +#else + if (prompt) { + if(Py_RISCOSWimpFlag) + fprintf(stderr, "\x0cr%s\x0c", prompt); + else + fprintf(stderr, "%s", prompt); + } +#endif fflush(stderr); - switch (my_fgets(p, (int)n, sys_stdin)) { case 0: /* Normal case */ break; case 1: /* Interrupt */ - PyMem_RawFree(p); + PyMem_FREE(p); return NULL; case -1: /* EOF */ case -2: /* Error */ @@ -283,13 +141,13 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt) while (n > 0 && p[n-1] != '\n') { size_t incr = n+2; if (incr > INT_MAX) { - PyMem_RawFree(p); + PyMem_FREE(p); PyErr_SetString(PyExc_OverflowError, "input line too long"); return NULL; } - pr = (char *)PyMem_RawRealloc(p, n + incr); + pr = (char *)PyMem_REALLOC(p, n + incr); if (pr == NULL) { - PyMem_RawFree(p); + PyMem_FREE(p); PyErr_NoMemory(); return NULL; } @@ -298,9 +156,9 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt) break; n += strlen(p+n); } - pr = (char *)PyMem_RawRealloc(p, n+1); + pr = (char *)PyMem_REALLOC(p, n+1); if (pr == NULL) { - PyMem_RawFree(p); + PyMem_FREE(p); PyErr_NoMemory(); return NULL; } @@ -313,18 +171,17 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt) Note: Python expects in return a buffer allocated with PyMem_Malloc. 
*/ -char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *) = NULL; +char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, char *); /* Interface used by tokenizer.c and bltinmodule.c */ char * -PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt) +PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, char *prompt) { - char *rv, *res; - size_t len; + char *rv; - if (_PyOS_ReadlineTState == _PyThreadState_GET()) { + if (_PyOS_ReadlineTState == PyThreadState_GET()) { PyErr_SetString(PyExc_RuntimeError, "can't re-enter readline"); return NULL; @@ -332,20 +189,24 @@ PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt) if (PyOS_ReadlineFunctionPointer == NULL) { +#ifdef __VMS + PyOS_ReadlineFunctionPointer = vms__StdioReadline; +#else PyOS_ReadlineFunctionPointer = PyOS_StdioReadline; +#endif } +#ifdef WITH_THREAD if (_PyOS_ReadlineLock == NULL) { _PyOS_ReadlineLock = PyThread_allocate_lock(); - if (_PyOS_ReadlineLock == NULL) { - PyErr_SetString(PyExc_MemoryError, "can't allocate lock"); - return NULL; - } } +#endif - _PyOS_ReadlineTState = _PyThreadState_GET(); + _PyOS_ReadlineTState = PyThreadState_GET(); Py_BEGIN_ALLOW_THREADS +#ifdef WITH_THREAD PyThread_acquire_lock(_PyOS_ReadlineLock, 1); +#endif /* This is needed to handle the unlikely case that the * interpreter is in interactive mode *and* stdin/out are not @@ -359,22 +220,11 @@ PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt) prompt); Py_END_ALLOW_THREADS +#ifdef WITH_THREAD PyThread_release_lock(_PyOS_ReadlineLock); +#endif _PyOS_ReadlineTState = NULL; - if (rv == NULL) - return NULL; - - len = strlen(rv) + 1; - res = PyMem_Malloc(len); - if (res != NULL) { - memcpy(res, rv, len); - } - else { - PyErr_NoMemory(); - } - PyMem_RawFree(rv); - - return res; + return rv; } diff --git a/Parser/node.c b/Parser/node.c index f1b70e0..0dea30f 100644 --- a/Parser/node.c +++ b/Parser/node.c @@ -13,8 +13,6 @@ PyNode_New(int type) n->n_type = type; n->n_str = NULL; n->n_lineno = 0; - n->n_end_lineno = 0; - n->n_end_col_offset = -1; n->n_nchildren = 0; n->n_child = NULL; return n; @@ -72,39 +70,19 @@ fancy_roundup(int n) * Note that this would be straightforward if a node stored its current * capacity. The code is tricky to avoid that. */ -#define XXXROUNDUP(n) ((n) <= 1 ? (n) : \ - (n) <= 128 ? (int)_Py_SIZE_ROUND_UP((n), 4) : \ +#define XXXROUNDUP(n) ((n) <= 1 ? (n) : \ + (n) <= 128 ? 
(((n) + 3) & ~3) : \ fancy_roundup(n)) -void -_PyNode_FinalizeEndPos(node *n) -{ - int nch = NCH(n); - node *last; - if (nch == 0) { - return; - } - last = CHILD(n, nch - 1); - _PyNode_FinalizeEndPos(last); - n->n_end_lineno = last->n_end_lineno; - n->n_end_col_offset = last->n_end_col_offset; -} - int -PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset, - int end_lineno, int end_col_offset) +PyNode_AddChild(register node *n1, int type, char *str, int lineno, int col_offset) { const int nch = n1->n_nchildren; int current_capacity; int required_capacity; node *n; - // finalize end position of previous node (if any) - if (nch > 0) { - _PyNode_FinalizeEndPos(CHILD(n1, nch - 1)); - } - if (nch == INT_MAX || nch < 0) return E_OVERFLOW; @@ -113,7 +91,7 @@ PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset, if (current_capacity < 0 || required_capacity < 0) return E_OVERFLOW; if (current_capacity < required_capacity) { - if ((size_t)required_capacity > SIZE_MAX / sizeof(node)) { + if (required_capacity > PY_SIZE_MAX / sizeof(node)) { return E_NOMEM; } n = n1->n_child; @@ -129,8 +107,6 @@ PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset, n->n_str = str; n->n_lineno = lineno; n->n_col_offset = col_offset; - n->n_end_lineno = end_lineno; // this and below will be updates after all children are added. - n->n_end_col_offset = end_col_offset; n->n_nchildren = 0; n->n_child = NULL; return 0; diff --git a/Parser/parser.c b/Parser/parser.c index 227b918..b753a17 100644 --- a/Parser/parser.c +++ b/Parser/parser.c @@ -6,12 +6,12 @@ /* XXX To do: error recovery */ #include "Python.h" +#include "pgenheaders.h" #include "token.h" #include "grammar.h" #include "node.h" #include "parser.h" #include "errcode.h" -#include "graminit.h" #ifdef Py_DEBUG @@ -35,9 +35,9 @@ s_reset(stack *s) #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK]) static int -s_push(stack *s, const dfa *d, node *parent) +s_push(register stack *s, dfa *d, node *parent) { - stackentry *top; + register stackentry *top; if (s->s_top == s->s_base) { fprintf(stderr, "s_push: parser stack overflow\n"); return E_NOMEM; @@ -52,7 +52,7 @@ s_push(stack *s, const dfa *d, node *parent) #ifdef Py_DEBUG static void -s_pop(stack *s) +s_pop(register stack *s) { if (s_empty(s)) Py_FatalError("s_pop: parser stack underflow -- FATAL"); @@ -105,13 +105,11 @@ PyParser_Delete(parser_state *ps) /* PARSER STACK OPERATIONS */ static int -shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset, - int end_lineno, int end_col_offset) +shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset) { int err; assert(!s_empty(s)); - err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset, - end_lineno, end_col_offset); + err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset); if (err) return err; s->s_top->s_state = newstate; @@ -119,15 +117,13 @@ shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset, } static int -push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset, - int end_lineno, int end_col_offset) +push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset) { int err; - node *n; + register node *n; n = s->s_top->s_parent; assert(!s_empty(s)); - err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset, - end_lineno, end_col_offset); + err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset); if (err) return err; 
s->s_top->s_state = newstate; @@ -138,38 +134,34 @@ push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset, /* PARSER PROPER */ static int -classify(parser_state *ps, int type, const char *str) +classify(parser_state *ps, int type, char *str) { grammar *g = ps->p_grammar; - int n = g->g_ll.ll_nlabels; + register int n = g->g_ll.ll_nlabels; if (type == NAME) { - const label *l = g->g_ll.ll_label; - int i; + register char *s = str; + register label *l = g->g_ll.ll_label; + register int i; for (i = n; i > 0; i--, l++) { if (l->lb_type != NAME || l->lb_str == NULL || - l->lb_str[0] != str[0] || - strcmp(l->lb_str, str) != 0) + l->lb_str[0] != s[0] || + strcmp(l->lb_str, s) != 0) continue; #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD -#if 0 - /* Leaving this in as an example */ - if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) { - if (str[0] == 'w' && strcmp(str, "with") == 0) - break; /* not a keyword yet */ - else if (str[0] == 'a' && strcmp(str, "as") == 0) - break; /* not a keyword yet */ + if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION && + s[0] == 'p' && strcmp(s, "print") == 0) { + break; /* no longer a keyword */ } #endif -#endif D(printf("It's a keyword\n")); return n - i; } } { - const label *l = g->g_ll.ll_label; - int i; + register label *l = g->g_ll.ll_label; + register int i; for (i = n; i > 0; i--, l++) { if (l->lb_type == type && l->lb_str == NULL) { D(printf("It's a token we know\n")); @@ -183,8 +175,6 @@ classify(parser_state *ps, int type, const char *str) } #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD -#if 0 -/* Leaving this in as an example */ static void future_hack(parser_state *ps) { @@ -224,16 +214,13 @@ future_hack(parser_state *ps) } } } -#endif #endif /* future keyword */ int -PyParser_AddToken(parser_state *ps, int type, char *str, - int lineno, int col_offset, - int end_lineno, int end_col_offset, - int *expected_ret) +PyParser_AddToken(register parser_state *ps, register int type, char *str, + int lineno, int col_offset, int *expected_ret) { - int ilabel; + register int ilabel; int err; D(printf("Token %s/'%s' ... 
", _PyParser_TokenNames[type], str)); @@ -246,43 +233,34 @@ PyParser_AddToken(parser_state *ps, int type, char *str, /* Loop until the token is shifted or an error occurred */ for (;;) { /* Fetch the current dfa and state */ - const dfa *d = ps->p_stack.s_top->s_dfa; - state *s = &d->d_state[ps->p_stack.s_top->s_state]; + register dfa *d = ps->p_stack.s_top->s_dfa; + register state *s = &d->d_state[ps->p_stack.s_top->s_state]; D(printf(" DFA '%s', state %d:", d->d_name, ps->p_stack.s_top->s_state)); /* Check accelerator */ if (s->s_lower <= ilabel && ilabel < s->s_upper) { - int x = s->s_accel[ilabel - s->s_lower]; + register int x = s->s_accel[ilabel - s->s_lower]; if (x != -1) { if (x & (1<<7)) { /* Push non-terminal */ int nt = (x >> 8) + NT_OFFSET; int arrow = x & ((1<<7)-1); - if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) { - /* When parsing type comments is not requested, - we can provide better errors about bad indentation - by using 'suite' for the body of a funcdef */ - D(printf(" [switch func_body_suite to suite]")); - nt = suite; - } - const dfa *d1 = PyGrammar_FindDFA( + dfa *d1 = PyGrammar_FindDFA( ps->p_grammar, nt); if ((err = push(&ps->p_stack, nt, d1, - arrow, lineno, col_offset, - end_lineno, end_col_offset)) > 0) { + arrow, lineno, col_offset)) > 0) { D(printf(" MemError: push\n")); return err; } - D(printf(" Push '%s'\n", d1->d_name)); + D(printf(" Push ...\n")); continue; } /* Shift the token */ if ((err = shift(&ps->p_stack, type, str, - x, lineno, col_offset, - end_lineno, end_col_offset)) > 0) { + x, lineno, col_offset)) > 0) { D(printf(" MemError: shift.\n")); return err; } @@ -296,13 +274,11 @@ PyParser_AddToken(parser_state *ps, int type, char *str, d->d_name, ps->p_stack.s_top->s_state)); #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD -#if 0 if (d->d_name[0] == 'i' && strcmp(d->d_name, "import_stmt") == 0) future_hack(ps); #endif -#endif s_pop(&ps->p_stack); if (s_empty(&ps->p_stack)) { D(printf(" ACCEPT.\n")); @@ -316,12 +292,10 @@ PyParser_AddToken(parser_state *ps, int type, char *str, if (s->s_accept) { #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD -#if 0 if (d->d_name[0] == 'i' && strcmp(d->d_name, "import_stmt") == 0) future_hack(ps); #endif -#endif /* Pop this dfa and try again */ s_pop(&ps->p_stack); D(printf(" Pop ...\n")); diff --git a/Parser/parser.h b/Parser/parser.h index b16075e..403236d 100644 --- a/Parser/parser.h +++ b/Parser/parser.h @@ -7,42 +7,35 @@ extern "C" { /* Parser interface */ -#define MAXSTACK 1700 +#define MAXSTACK 1500 typedef struct { - int s_state; /* State in current DFA */ - const dfa *s_dfa; /* Current DFA */ - struct _node *s_parent; /* Where to add next node */ + int s_state; /* State in current DFA */ + dfa *s_dfa; /* Current DFA */ + struct _node *s_parent; /* Where to add next node */ } stackentry; typedef struct { - stackentry *s_top; /* Top entry */ - stackentry s_base[MAXSTACK];/* Array of stack entries */ - /* NB The stack grows down */ + stackentry *s_top; /* Top entry */ + stackentry s_base[MAXSTACK];/* Array of stack entries */ + /* NB The stack grows down */ } stack; typedef struct { - stack p_stack; /* Stack of parser states */ - grammar *p_grammar; /* Grammar to use */ - node *p_tree; /* Top of parse tree */ + stack p_stack; /* Stack of parser states */ + grammar *p_grammar; /* Grammar to use */ + node *p_tree; /* Top of parse tree */ #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD - unsigned long p_flags; /* see co_flags in Include/code.h */ + unsigned long p_flags; /* see co_flags in Include/code.h */ #endif 
} parser_state; parser_state *PyParser_New(grammar *g, int start); void PyParser_Delete(parser_state *ps); -int PyParser_AddToken(parser_state *ps, int type, char *str, - int lineno, int col_offset, - int end_lineno, int end_col_offset, +int PyParser_AddToken(parser_state *ps, int type, char *str, int lineno, int col_offset, int *expected_ret); void PyGrammar_AddAccelerators(grammar *g); - -#define showtree _Py_showtree -#define printtree _Py_printtree -#define dumptree _Py_dumptree - #ifdef __cplusplus } #endif diff --git a/Parser/parsetok.c b/Parser/parsetok.c index a5d7897..a5e9222 100644 --- a/Parser/parsetok.c +++ b/Parser/parsetok.c @@ -1,7 +1,7 @@ /* Parser-tokenizer link implementation */ -#include "Python.h" +#include "pgenheaders.h" #include "tokenizer.h" #include "node.h" #include "grammar.h" @@ -10,53 +10,12 @@ #include "errcode.h" #include "graminit.h" +int Py_TabcheckFlag; + /* Forward */ static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *); -static int initerr(perrdetail *err_ret, PyObject * filename); - -typedef struct { - struct { - int lineno; - char *comment; - } *items; - size_t size; - size_t num_items; -} growable_comment_array; - -static int -growable_comment_array_init(growable_comment_array *arr, size_t initial_size) { - assert(initial_size > 0); - arr->items = malloc(initial_size * sizeof(*arr->items)); - arr->size = initial_size; - arr->num_items = 0; - - return arr->items != NULL; -} - -static int -growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) { - if (arr->num_items >= arr->size) { - arr->size *= 2; - arr->items = realloc(arr->items, arr->size * sizeof(*arr->items)); - if (!arr->items) { - return 0; - } - } - - arr->items[arr->num_items].lineno = lineno; - arr->items[arr->num_items].comment = comment; - arr->num_items++; - return 1; -} - -static void -growable_comment_array_deallocate(growable_comment_array *arr) { - for (unsigned i = 0; i < arr->num_items; i++) { - PyObject_FREE(arr->items[i].comment); - } - free(arr->items); -} +static void initerr(perrdetail *err_ret, const char* filename); /* Parse input coming from a string. Return error code, print some errors. */ node * @@ -84,135 +43,74 @@ PyParser_ParseStringFlagsFilename(const char *s, const char *filename, } node * -PyParser_ParseStringObject(const char *s, PyObject *filename, - grammar *g, int start, - perrdetail *err_ret, int *flags) +PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename, + grammar *g, int start, + perrdetail *err_ret, int *flags) { struct tok_state *tok; - int exec_input = start == file_input; - if (initerr(err_ret, filename) < 0) - return NULL; - - if (PySys_Audit("compile", "yO", s, err_ret->filename) < 0) { - err_ret->error = E_ERROR; - return NULL; - } + initerr(err_ret, filename); - if (*flags & PyPARSE_IGNORE_COOKIE) - tok = PyTokenizer_FromUTF8(s, exec_input); - else - tok = PyTokenizer_FromString(s, exec_input); - if (tok == NULL) { + if ((tok = PyTokenizer_FromString(s, start == file_input)) == NULL) { err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM; return NULL; } - if (*flags & PyPARSE_TYPE_COMMENTS) { - tok->type_comments = 1; + + tok->filename = filename ? 
filename : "<string>"; + if (Py_TabcheckFlag || Py_VerboseFlag) { + tok->altwarning = (tok->filename != NULL); + if (Py_TabcheckFlag >= 2) + tok->alterror++; } - Py_INCREF(err_ret->filename); - tok->filename = err_ret->filename; - if (*flags & PyPARSE_ASYNC_HACKS) - tok->async_hacks = 1; return parsetok(tok, g, start, err_ret, flags); } -node * -PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str, - grammar *g, int start, - perrdetail *err_ret, int *flags) -{ - node *n; - PyObject *filename = NULL; - if (filename_str != NULL) { - filename = PyUnicode_DecodeFSDefault(filename_str); - if (filename == NULL) { - err_ret->error = E_ERROR; - return NULL; - } - } - n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags); - Py_XDECREF(filename); - return n; -} - /* Parse input coming from a file. Return error code, print some errors. */ node * PyParser_ParseFile(FILE *fp, const char *filename, grammar *g, int start, - const char *ps1, const char *ps2, - perrdetail *err_ret) + char *ps1, char *ps2, perrdetail *err_ret) { - return PyParser_ParseFileFlags(fp, filename, NULL, - g, start, ps1, ps2, err_ret, 0); + return PyParser_ParseFileFlags(fp, filename, g, start, ps1, ps2, + err_ret, 0); } node * -PyParser_ParseFileFlags(FILE *fp, const char *filename, const char *enc, - grammar *g, int start, - const char *ps1, const char *ps2, - perrdetail *err_ret, int flags) +PyParser_ParseFileFlags(FILE *fp, const char *filename, grammar *g, int start, + char *ps1, char *ps2, perrdetail *err_ret, int flags) { int iflags = flags; - return PyParser_ParseFileFlagsEx(fp, filename, enc, g, start, ps1, - ps2, err_ret, &iflags); + return PyParser_ParseFileFlagsEx(fp, filename, g, start, ps1, ps2, err_ret, &iflags); } node * -PyParser_ParseFileObject(FILE *fp, PyObject *filename, - const char *enc, grammar *g, int start, - const char *ps1, const char *ps2, - perrdetail *err_ret, int *flags) +PyParser_ParseFileFlagsEx(FILE *fp, const char *filename, grammar *g, int start, + char *ps1, char *ps2, perrdetail *err_ret, int *flags) { struct tok_state *tok; - if (initerr(err_ret, filename) < 0) - return NULL; + initerr(err_ret, filename); - if (PySys_Audit("compile", "OO", Py_None, err_ret->filename) < 0) { - return NULL; - } - - if ((tok = PyTokenizer_FromFile(fp, enc, ps1, ps2)) == NULL) { + if ((tok = PyTokenizer_FromFile(fp, ps1, ps2)) == NULL) { err_ret->error = E_NOMEM; return NULL; } - if (*flags & PyPARSE_TYPE_COMMENTS) { - tok->type_comments = 1; + tok->filename = filename; + if (Py_TabcheckFlag || Py_VerboseFlag) { + tok->altwarning = (filename != NULL); + if (Py_TabcheckFlag >= 2) + tok->alterror++; } - Py_INCREF(err_ret->filename); - tok->filename = err_ret->filename; - return parsetok(tok, g, start, err_ret, flags); -} -node * -PyParser_ParseFileFlagsEx(FILE *fp, const char *filename, - const char *enc, grammar *g, int start, - const char *ps1, const char *ps2, - perrdetail *err_ret, int *flags) -{ - node *n; - PyObject *fileobj = NULL; - if (filename != NULL) { - fileobj = PyUnicode_DecodeFSDefault(filename); - if (fileobj == NULL) { - err_ret->error = E_ERROR; - return NULL; - } - } - n = PyParser_ParseFileObject(fp, fileobj, enc, g, - start, ps1, ps2, err_ret, flags); - Py_XDECREF(fileobj); - return n; + return parsetok(tok, g, start, err_ret, flags); } -#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD #if 0 -static const char with_msg[] = +static char with_msg[] = "%s:%d: Warning: 'with' will become a reserved keyword in Python 2.6\n"; -static const char as_msg[] = +static char 
as_msg[] = "%s:%d: Warning: 'as' will become a reserved keyword in Python 2.6\n"; static void @@ -223,7 +121,6 @@ warn(const char *msg, const char *filename, int lineno) PySys_WriteStderr(msg, filename, lineno); } #endif -#endif /* Parse input coming from the given tokenizer structure. Return error code. */ @@ -235,25 +132,21 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, parser_state *ps; node *n; int started = 0; - int col_offset, end_col_offset; - growable_comment_array type_ignores; - - if (!growable_comment_array_init(&type_ignores, 10)) { - err_ret->error = E_NOMEM; - PyTokenizer_Free(tok); - return NULL; - } if ((ps = PyParser_New(g, start)) == NULL) { + fprintf(stderr, "no mem for new parser\n"); err_ret->error = E_NOMEM; PyTokenizer_Free(tok); return NULL; } #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD - if (*flags & PyPARSE_BARRY_AS_BDFL) - ps->p_flags |= CO_FUTURE_BARRY_AS_BDFL; - if (*flags & PyPARSE_TYPE_COMMENTS) - ps->p_flags |= PyCF_TYPE_COMMENTS; + if (*flags & PyPARSE_PRINT_IS_FUNCTION) { + ps->p_flags |= CO_FUTURE_PRINT_FUNCTION; + } + if (*flags & PyPARSE_UNICODE_LITERALS) { + ps->p_flags |= CO_FUTURE_UNICODE_LITERALS; + } + #endif for (;;) { @@ -261,9 +154,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, int type; size_t len; char *str; - col_offset = -1; - int lineno; - const char *line_start; + int col_offset; type = PyTokenizer_Get(tok, &a, &b); if (type == ERRORTOKEN) { @@ -288,6 +179,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, len = (a != NULL && b != NULL) ? b - a : 0; str = (char *) PyObject_MALLOC(len + 1); if (str == NULL) { + fprintf(stderr, "no mem for next token\n"); err_ret->error = E_NOMEM; break; } @@ -296,56 +188,16 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, str[len] = '\0'; #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD - if (type == NOTEQUAL) { - if (!(ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) && - strcmp(str, "!=")) { - PyObject_FREE(str); - err_ret->error = E_SYNTAX; - break; - } - else if ((ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) && - strcmp(str, "<>")) { - PyObject_FREE(str); - err_ret->expected = NOTEQUAL; - err_ret->error = E_SYNTAX; - break; - } - } #endif - - /* Nodes of type STRING, especially multi line strings - must be handled differently in order to get both - the starting line number and the column offset right. - (cf. issue 16806) */ - lineno = type == STRING ? tok->first_lineno : tok->lineno; - line_start = type == STRING ? 
tok->multi_line_start : tok->line_start; - if (a != NULL && a >= line_start) { - col_offset = Py_SAFE_DOWNCAST(a - line_start, - intptr_t, int); + if (a != NULL && a >= tok->line_start) { + col_offset = a - tok->line_start; } else { col_offset = -1; } - if (b != NULL && b >= tok->line_start) { - end_col_offset = Py_SAFE_DOWNCAST(b - tok->line_start, - intptr_t, int); - } - else { - end_col_offset = -1; - } - - if (type == TYPE_IGNORE) { - if (!growable_comment_array_add(&type_ignores, tok->lineno, str)) { - err_ret->error = E_NOMEM; - break; - } - continue; - } - if ((err_ret->error = - PyParser_AddToken(ps, (int)type, str, - lineno, col_offset, tok->lineno, end_col_offset, + PyParser_AddToken(ps, (int)type, str, tok->lineno, col_offset, &(err_ret->expected))) != E_OK) { if (err_ret->error != E_DONE) { PyObject_FREE(str); @@ -358,87 +210,38 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, if (err_ret->error == E_DONE) { n = ps->p_tree; ps->p_tree = NULL; - - if (n->n_type == file_input) { - /* Put type_ignore nodes in the ENDMARKER of file_input. */ - int num; - node *ch; - size_t i; - - num = NCH(n); - ch = CHILD(n, num - 1); - REQ(ch, ENDMARKER); - - for (i = 0; i < type_ignores.num_items; i++) { - int res = PyNode_AddChild(ch, TYPE_IGNORE, type_ignores.items[i].comment, - type_ignores.items[i].lineno, 0, - type_ignores.items[i].lineno, 0); - if (res != 0) { - err_ret->error = res; - PyNode_Free(n); - n = NULL; - break; - } - type_ignores.items[i].comment = NULL; - } - } - - /* Check that the source for a single input statement really - is a single statement by looking at what is left in the - buffer after parsing. Trailing whitespace and comments - are OK. */ - if (err_ret->error == E_DONE && start == single_input) { - char *cur = tok->cur; - char c = *tok->cur; - - for (;;) { - while (c == ' ' || c == '\t' || c == '\n' || c == '\014') - c = *++cur; - - if (!c) - break; - - if (c != '#') { - err_ret->error = E_BADSINGLE; - PyNode_Free(n); - n = NULL; - break; - } - - /* Suck up comment. */ - while (c && c != '\n') - c = *++cur; - } - } } else n = NULL; - growable_comment_array_deallocate(&type_ignores); - #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD *flags = ps->p_flags; #endif PyParser_Delete(ps); if (n == NULL) { - if (tok->done == E_EOF) + if (tok->lineno <= 1 && tok->done == E_EOF) err_ret->error = E_EOF; err_ret->lineno = tok->lineno; if (tok->buf != NULL) { + char *text = NULL; size_t len; assert(tok->cur - tok->buf < INT_MAX); - /* if we've managed to parse a token, point the offset to its start, - * else use the current reading position of the tokenizer - */ - err_ret->offset = col_offset != -1 ? 
col_offset + 1 : ((int)(tok->cur - tok->buf)); + err_ret->offset = (int)(tok->cur - tok->buf); len = tok->inp - tok->buf; - err_ret->text = (char *) PyObject_MALLOC(len + 1); - if (err_ret->text != NULL) { - if (len > 0) - strncpy(err_ret->text, tok->buf, len); - err_ret->text[len] = '\0'; +#ifdef Py_USING_UNICODE + text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset); + +#endif + if (text == NULL) { + text = (char *) PyObject_MALLOC(len + 1); + if (text != NULL) { + if (len > 0) + strncpy(text, tok->buf, len); + text[len] = '\0'; + } } + err_ret->text = text; } } else if (tok->encoding != NULL) { /* 'nodes->n_str' uses PyObject_*, while 'tok->encoding' was @@ -465,31 +268,17 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, done: PyTokenizer_Free(tok); - if (n != NULL) { - _PyNode_FinalizeEndPos(n); - } return n; } -static int -initerr(perrdetail *err_ret, PyObject *filename) +static void +initerr(perrdetail *err_ret, const char *filename) { err_ret->error = E_OK; + err_ret->filename = filename; err_ret->lineno = 0; err_ret->offset = 0; err_ret->text = NULL; err_ret->token = -1; err_ret->expected = -1; - if (filename) { - Py_INCREF(filename); - err_ret->filename = filename; - } - else { - err_ret->filename = PyUnicode_FromString("<string>"); - if (err_ret->filename == NULL) { - err_ret->error = E_ERROR; - return -1; - } - } - return 0; } diff --git a/Parser/pgen.c b/Parser/pgen.c new file mode 100644 index 0000000..b20d976 --- /dev/null +++ b/Parser/pgen.c @@ -0,0 +1,726 @@ +/* Parser generator */ + +/* For a description, see the comments at end of this file */ + +#include "Python.h" +#include "pgenheaders.h" +#include "token.h" +#include "node.h" +#include "grammar.h" +#include "metagrammar.h" +#include "pgen.h" + +extern int Py_DebugFlag; +extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */ + + +/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */ + +typedef struct _nfaarc { + int ar_label; + int ar_arrow; +} nfaarc; + +typedef struct _nfastate { + int st_narcs; + nfaarc *st_arc; +} nfastate; + +typedef struct _nfa { + int nf_type; + char *nf_name; + int nf_nstates; + nfastate *nf_state; + int nf_start, nf_finish; +} nfa; + +/* Forward */ +static void compile_rhs(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); +static void compile_alt(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); +static void compile_item(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); +static void compile_atom(labellist *ll, + nfa *nf, node *n, int *pa, int *pb); + +static int +addnfastate(nfa *nf) +{ + nfastate *st; + + nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state, + sizeof(nfastate) * (nf->nf_nstates + 1)); + if (nf->nf_state == NULL) + Py_FatalError("out of mem"); + st = &nf->nf_state[nf->nf_nstates++]; + st->st_narcs = 0; + st->st_arc = NULL; + return st - nf->nf_state; +} + +static void +addnfaarc(nfa *nf, int from, int to, int lbl) +{ + nfastate *st; + nfaarc *ar; + + st = &nf->nf_state[from]; + st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc, + sizeof(nfaarc) * (st->st_narcs + 1)); + if (st->st_arc == NULL) + Py_FatalError("out of mem"); + ar = &st->st_arc[st->st_narcs++]; + ar->ar_label = lbl; + ar->ar_arrow = to; +} + +static nfa * +newnfa(char *name) +{ + nfa *nf; + static int type = NT_OFFSET; /* All types will be disjunct */ + + nf = (nfa *)PyObject_MALLOC(sizeof(nfa)); + if (nf == NULL) + Py_FatalError("no mem for new nfa"); + nf->nf_type = type++; + nf->nf_name = name; /* XXX strdup(name) ??? 
*/ + nf->nf_nstates = 0; + nf->nf_state = NULL; + nf->nf_start = nf->nf_finish = -1; + return nf; +} + +typedef struct _nfagrammar { + int gr_nnfas; + nfa **gr_nfa; + labellist gr_ll; +} nfagrammar; + +/* Forward */ +static void compile_rule(nfagrammar *gr, node *n); + +static nfagrammar * +newnfagrammar(void) +{ + nfagrammar *gr; + + gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar)); + if (gr == NULL) + Py_FatalError("no mem for new nfa grammar"); + gr->gr_nnfas = 0; + gr->gr_nfa = NULL; + gr->gr_ll.ll_nlabels = 0; + gr->gr_ll.ll_label = NULL; + addlabel(&gr->gr_ll, ENDMARKER, "EMPTY"); + return gr; +} + +static void +freenfagrammar(nfagrammar *gr) +{ + int i; + for (i = 0; i < gr->gr_nnfas; i++) { + PyObject_FREE(gr->gr_nfa[i]->nf_state); + } + PyObject_FREE(gr->gr_nfa); + PyObject_FREE(gr); +} + +static nfa * +addnfa(nfagrammar *gr, char *name) +{ + nfa *nf; + + nf = newnfa(name); + gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa, + sizeof(nfa*) * (gr->gr_nnfas + 1)); + if (gr->gr_nfa == NULL) + Py_FatalError("out of mem"); + gr->gr_nfa[gr->gr_nnfas++] = nf; + addlabel(&gr->gr_ll, NAME, nf->nf_name); + return nf; +} + +#ifdef Py_DEBUG + +static char REQNFMT[] = "metacompile: less than %d children\n"; + +#define REQN(i, count) do { \ + if (i < count) { \ + fprintf(stderr, REQNFMT, count); \ + Py_FatalError("REQN"); \ + } \ +} while (0) + +#else +#define REQN(i, count) /* empty */ +#endif + +static nfagrammar * +metacompile(node *n) +{ + nfagrammar *gr; + int i; + + if (Py_DebugFlag) + printf("Compiling (meta-) parse tree into NFA grammar\n"); + gr = newnfagrammar(); + REQ(n, MSTART); + i = n->n_nchildren - 1; /* Last child is ENDMARKER */ + n = n->n_child; + for (; --i >= 0; n++) { + if (n->n_type != NEWLINE) + compile_rule(gr, n); + } + return gr; +} + +static void +compile_rule(nfagrammar *gr, node *n) +{ + nfa *nf; + + REQ(n, RULE); + REQN(n->n_nchildren, 4); + n = n->n_child; + REQ(n, NAME); + nf = addnfa(gr, n->n_str); + n++; + REQ(n, COLON); + n++; + REQ(n, RHS); + compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish); + n++; + REQ(n, NEWLINE); +} + +static void +compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + int a, b; + + REQ(n, RHS); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + REQ(n, ALT); + compile_alt(ll, nf, n, pa, pb); + if (--i <= 0) + return; + n++; + a = *pa; + b = *pb; + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + for (; --i >= 0; n++) { + REQ(n, VBAR); + REQN(i, 1); + --i; + n++; + REQ(n, ALT); + compile_alt(ll, nf, n, &a, &b); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + } +} + +static void +compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + int a, b; + + REQ(n, ALT); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + REQ(n, ITEM); + compile_item(ll, nf, n, pa, pb); + --i; + n++; + for (; --i >= 0; n++) { + REQ(n, ITEM); + compile_item(ll, nf, n, &a, &b); + addnfaarc(nf, *pb, a, EMPTY); + *pb = b; + } +} + +static void +compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + int a, b; + + REQ(n, ITEM); + i = n->n_nchildren; + REQN(i, 1); + n = n->n_child; + if (n->n_type == LSQB) { + REQN(i, 3); + n++; + REQ(n, RHS); + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, *pb, EMPTY); + compile_rhs(ll, nf, n, &a, &b); + addnfaarc(nf, *pa, a, EMPTY); + addnfaarc(nf, b, *pb, EMPTY); + REQN(i, 1); + n++; + REQ(n, RSQB); + } + else { + compile_atom(ll, nf, n, pa, 
pb); + if (--i <= 0) + return; + n++; + addnfaarc(nf, *pb, *pa, EMPTY); + if (n->n_type == STAR) + *pb = *pa; + else + REQ(n, PLUS); + } +} + +static void +compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb) +{ + int i; + + REQ(n, ATOM); + i = n->n_nchildren; + (void)i; /* Don't warn about set but unused */ + REQN(i, 1); + n = n->n_child; + if (n->n_type == LPAR) { + REQN(i, 3); + n++; + REQ(n, RHS); + compile_rhs(ll, nf, n, pa, pb); + n++; + REQ(n, RPAR); + } + else if (n->n_type == NAME || n->n_type == STRING) { + *pa = addnfastate(nf); + *pb = addnfastate(nf); + addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str)); + } + else + REQ(n, NAME); +} + +static void +dumpstate(labellist *ll, nfa *nf, int istate) +{ + nfastate *st; + int i; + nfaarc *ar; + + printf("%c%2d%c", + istate == nf->nf_start ? '*' : ' ', + istate, + istate == nf->nf_finish ? '.' : ' '); + st = &nf->nf_state[istate]; + ar = st->st_arc; + for (i = 0; i < st->st_narcs; i++) { + if (i > 0) + printf("\n "); + printf("-> %2d %s", ar->ar_arrow, + PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label])); + ar++; + } + printf("\n"); +} + +static void +dumpnfa(labellist *ll, nfa *nf) +{ + int i; + + printf("NFA '%s' has %d states; start %d, finish %d\n", + nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish); + for (i = 0; i < nf->nf_nstates; i++) + dumpstate(ll, nf, i); +} + + +/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */ + +static void +addclosure(bitset ss, nfa *nf, int istate) +{ + if (addbit(ss, istate)) { + nfastate *st = &nf->nf_state[istate]; + nfaarc *ar = st->st_arc; + int i; + + for (i = st->st_narcs; --i >= 0; ) { + if (ar->ar_label == EMPTY) + addclosure(ss, nf, ar->ar_arrow); + ar++; + } + } +} + +typedef struct _ss_arc { + bitset sa_bitset; + int sa_arrow; + int sa_label; +} ss_arc; + +typedef struct _ss_state { + bitset ss_ss; + int ss_narcs; + struct _ss_arc *ss_arc; + int ss_deleted; + int ss_finish; + int ss_rename; +} ss_state; + +typedef struct _ss_dfa { + int sd_nstates; + ss_state *sd_state; +} ss_dfa; + +/* Forward */ +static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits, + labellist *ll, char *msg); +static void simplify(int xx_nstates, ss_state *xx_state); +static void convert(dfa *d, int xx_nstates, ss_state *xx_state); + +static void +makedfa(nfagrammar *gr, nfa *nf, dfa *d) +{ + int nbits = nf->nf_nstates; + bitset ss; + int xx_nstates; + ss_state *xx_state, *yy; + ss_arc *zz; + int istate, jstate, iarc, jarc, ibit; + nfastate *st; + nfaarc *ar; + int i, j; + + ss = newbitset(nbits); + addclosure(ss, nf, nf->nf_start); + xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state)); + if (xx_state == NULL) + Py_FatalError("no mem for xx_state in makedfa"); + xx_nstates = 1; + yy = &xx_state[0]; + yy->ss_ss = ss; + yy->ss_narcs = 0; + yy->ss_arc = NULL; + yy->ss_deleted = 0; + yy->ss_finish = testbit(ss, nf->nf_finish); + if (yy->ss_finish) + printf("Error: nonterminal '%s' may produce empty.\n", + nf->nf_name); + + /* This algorithm is from a book written before + the invention of structured programming... */ + + /* For each unmarked state... */ + for (istate = 0; istate < xx_nstates; ++istate) { + size_t size; + yy = &xx_state[istate]; + ss = yy->ss_ss; + /* For all its states... */ + for (ibit = 0; ibit < nf->nf_nstates; ++ibit) { + if (!testbit(ss, ibit)) + continue; + st = &nf->nf_state[ibit]; + /* For all non-empty arcs from this state... 
*/
+            for (iarc = 0; iarc < st->st_narcs; iarc++) {
+                ar = &st->st_arc[iarc];
+                if (ar->ar_label == EMPTY)
+                    continue;
+                /* Look up in list of arcs from this state */
+                for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
+                    zz = &yy->ss_arc[jarc];
+                    if (ar->ar_label == zz->sa_label)
+                        goto found;
+                }
+                /* Add new arc for this state */
+                size = sizeof(ss_arc) * (yy->ss_narcs + 1);
+                yy->ss_arc = (ss_arc *)PyObject_REALLOC(
+                                            yy->ss_arc, size);
+                if (yy->ss_arc == NULL)
+                    Py_FatalError("out of mem");
+                zz = &yy->ss_arc[yy->ss_narcs++];
+                zz->sa_label = ar->ar_label;
+                zz->sa_bitset = newbitset(nbits);
+                zz->sa_arrow = -1;
+            found: ;
+                /* Add destination */
+                addclosure(zz->sa_bitset, nf, ar->ar_arrow);
+            }
+        }
+        /* Now look up all the arrow states */
+        for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
+            zz = &xx_state[istate].ss_arc[jarc];
+            for (jstate = 0; jstate < xx_nstates; jstate++) {
+                if (samebitset(zz->sa_bitset,
+                               xx_state[jstate].ss_ss, nbits)) {
+                    zz->sa_arrow = jstate;
+                    goto done;
+                }
+            }
+            size = sizeof(ss_state) * (xx_nstates + 1);
+            xx_state = (ss_state *)PyObject_REALLOC(xx_state,
+                                                    size);
+            if (xx_state == NULL)
+                Py_FatalError("out of mem");
+            zz->sa_arrow = xx_nstates;
+            yy = &xx_state[xx_nstates++];
+            yy->ss_ss = zz->sa_bitset;
+            yy->ss_narcs = 0;
+            yy->ss_arc = NULL;
+            yy->ss_deleted = 0;
+            yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
+        done: ;
+        }
+    }
+
+    if (Py_DebugFlag)
+        printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
+                   "before minimizing");
+
+    simplify(xx_nstates, xx_state);
+
+    if (Py_DebugFlag)
+        printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
+                   "after minimizing");
+
+    convert(d, xx_nstates, xx_state);
+
+    for (i = 0; i < xx_nstates; i++) {
+        for (j = 0; j < xx_state[i].ss_narcs; j++)
+            delbitset(xx_state[i].ss_arc[j].sa_bitset);
+        PyObject_FREE(xx_state[i].ss_arc);
+    }
+    PyObject_FREE(xx_state);
+}
+
+static void
+printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
+           labellist *ll, char *msg)
+{
+    int i, ibit, iarc;
+    ss_state *yy;
+    ss_arc *zz;
+
+    printf("Subset DFA %s\n", msg);
+    for (i = 0; i < xx_nstates; i++) {
+        yy = &xx_state[i];
+        if (yy->ss_deleted)
+            continue;
+        printf(" Subset %d", i);
+        if (yy->ss_finish)
+            printf(" (finish)");
+        printf(" { ");
+        for (ibit = 0; ibit < nbits; ibit++) {
+            if (testbit(yy->ss_ss, ibit))
+                printf("%d ", ibit);
+        }
+        printf("}\n");
+        for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
+            zz = &yy->ss_arc[iarc];
+            printf("  Arc to state %d, label %s\n",
+                   zz->sa_arrow,
+                   PyGrammar_LabelRepr(
+                       &ll->ll_label[zz->sa_label]));
+        }
+    }
+}
+
+
+/* PART THREE -- SIMPLIFY DFA */
+
+/* Simplify the DFA by repeatedly eliminating states that are
+   equivalent to another one.  This is NOT Algorithm 3.3 from
+   [Aho&Ullman 77].  It does not always find the minimal DFA,
+   but it does usually make a much smaller one...  (For an example
+   of sub-optimal behavior, try S: x a b+ | y a b+.)
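+
+   (A sketch of why that example stays sub-optimal; this gloss is an
+   editorial addition, not part of the original comment.  The subset
+   construction gives two separate b+ loop states, one reached through
+   x and one through y.  Each carries a single arc on b back to itself,
+   so samestate() below, which compares arcs pairwise by arrow number,
+   sees (b -> i) versus (b -> j) with i != j and never merges them,
+   even though both states accept exactly the same continuations.  A
+   partition-refinement minimizer such as Hopcroft's algorithm would
+   merge them.)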
+*/ + +static int +samestate(ss_state *s1, ss_state *s2) +{ + int i; + + if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish) + return 0; + for (i = 0; i < s1->ss_narcs; i++) { + if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow || + s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label) + return 0; + } + return 1; +} + +static void +renamestates(int xx_nstates, ss_state *xx_state, int from, int to) +{ + int i, j; + + if (Py_DebugFlag) + printf("Rename state %d to %d.\n", from, to); + for (i = 0; i < xx_nstates; i++) { + if (xx_state[i].ss_deleted) + continue; + for (j = 0; j < xx_state[i].ss_narcs; j++) { + if (xx_state[i].ss_arc[j].sa_arrow == from) + xx_state[i].ss_arc[j].sa_arrow = to; + } + } +} + +static void +simplify(int xx_nstates, ss_state *xx_state) +{ + int changes; + int i, j; + + do { + changes = 0; + for (i = 1; i < xx_nstates; i++) { + if (xx_state[i].ss_deleted) + continue; + for (j = 0; j < i; j++) { + if (xx_state[j].ss_deleted) + continue; + if (samestate(&xx_state[i], &xx_state[j])) { + xx_state[i].ss_deleted++; + renamestates(xx_nstates, xx_state, + i, j); + changes++; + break; + } + } + } + } while (changes); +} + + +/* PART FOUR -- GENERATE PARSING TABLES */ + +/* Convert the DFA into a grammar that can be used by our parser */ + +static void +convert(dfa *d, int xx_nstates, ss_state *xx_state) +{ + int i, j; + ss_state *yy; + ss_arc *zz; + + for (i = 0; i < xx_nstates; i++) { + yy = &xx_state[i]; + if (yy->ss_deleted) + continue; + yy->ss_rename = addstate(d); + } + + for (i = 0; i < xx_nstates; i++) { + yy = &xx_state[i]; + if (yy->ss_deleted) + continue; + for (j = 0; j < yy->ss_narcs; j++) { + zz = &yy->ss_arc[j]; + addarc(d, yy->ss_rename, + xx_state[zz->sa_arrow].ss_rename, + zz->sa_label); + } + if (yy->ss_finish) + addarc(d, yy->ss_rename, yy->ss_rename, 0); + } + + d->d_initial = 0; +} + + +/* PART FIVE -- GLUE IT ALL TOGETHER */ + +static grammar * +maketables(nfagrammar *gr) +{ + int i; + nfa *nf; + dfa *d; + grammar *g; + + if (gr->gr_nnfas == 0) + return NULL; + g = newgrammar(gr->gr_nfa[0]->nf_type); + /* XXX first rule must be start rule */ + g->g_ll = gr->gr_ll; + + for (i = 0; i < gr->gr_nnfas; i++) { + nf = gr->gr_nfa[i]; + if (Py_DebugFlag) { + printf("Dump of NFA for '%s' ...\n", nf->nf_name); + dumpnfa(&gr->gr_ll, nf); + printf("Making DFA for '%s' ...\n", nf->nf_name); + } + d = adddfa(g, nf->nf_type, nf->nf_name); + makedfa(gr, gr->gr_nfa[i], d); + } + + return g; +} + +grammar * +pgen(node *n) +{ + nfagrammar *gr; + grammar *g; + + gr = metacompile(n); + g = maketables(gr); + translatelabels(g); + addfirstsets(g); + freenfagrammar(gr); + return g; +} + +grammar * +Py_pgen(node *n) +{ + return pgen(n); +} + +/* + +Description +----------- + +Input is a grammar in extended BNF (using * for repetition, + for +at-least-once repetition, [] for optional parts, | for alternatives and +() for grouping). This has already been parsed and turned into a parse +tree. + +Each rule is considered as a regular expression in its own right. +It is turned into a Non-deterministic Finite Automaton (NFA), which +is then turned into a Deterministic Finite Automaton (DFA), which is then +optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, +or similar compiler books (this technique is more often used for lexical +analyzers). + +The DFA's are used by the parser as parsing tables in a special way +that's probably unique. Before they are usable, the FIRST sets of all +non-terminals are computed. 
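+
+As a small worked illustration (an editorial sketch, not part of the
+original description), take a rule such as
+
+    term: factor ('*' factor)*
+
+Treated as a regular expression over the symbols factor and '*', it
+becomes a three-state DFA, roughly:
+
+    state 0:  factor -> state 1
+    state 1:  '*' -> state 2        (accepting)
+    state 2:  factor -> state 1
+
+and FIRST(term) = FIRST(factor), since every string derived from term
+must begin with something derivable from factor.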
+
+Reference
+---------
+
+[Aho&Ullman 77]
+    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
+    (first edition)
+
+*/
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/Parser/pgen/__init__.py
+++ /dev/null
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
deleted file mode 100644
index bb96e75..0000000
--- a/Parser/pgen/__main__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import argparse
-
-from .pgen import ParserGenerator
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Parser generator main program.")
-    parser.add_argument(
-        "grammar", type=str, help="The file with the grammar definition in EBNF format"
-    )
-    parser.add_argument("tokens", type=str, help="The file with the token definitions")
-    parser.add_argument(
-        "graminit_h",
-        type=argparse.FileType("w"),
-        help="The path to write the grammar's non-terminals as #defines",
-    )
-    parser.add_argument(
-        "graminit_c",
-        type=argparse.FileType("w"),
-        help="The path to write the grammar as initialized data",
-    )
-
-    parser.add_argument("--verbose", "-v", action="count")
-    args = parser.parse_args()
-
-    p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
-    grammar = p.make_grammar()
-    grammar.produce_graminit_h(args.graminit_h.write)
-    grammar.produce_graminit_c(args.graminit_c.write)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py
deleted file mode 100644
index 545a737..0000000
--- a/Parser/pgen/automata.py
+++ /dev/null
@@ -1,371 +0,0 @@
-"""Classes representing state-machine concepts"""
-
-class NFA:
-    """A non-deterministic finite automaton
-
-    A non-deterministic automaton is a form of finite state
-    machine.  An NFA's rules are less restrictive than a DFA's.
-    The NFA rules are:
-
-    * A transition can be non-deterministic and can result in
-      zero, one, or more states.
-
-    * An epsilon transition consuming empty input is valid.
-      Transitions consuming labeled symbols are also permitted.
-
-    This class assumes that there is only one starting state and one
-    accepting (ending) state.
-
-    Attributes:
-        name (str): The name of the rule the NFA is representing.
-        start (NFAState): The starting state.
-        end (NFAState): The ending state.
-    """
-
-    def __init__(self, start, end):
-        self.name = start.rule_name
-        self.start = start
-        self.end = end
-
-    def __repr__(self):
-        return "NFA(start={}, end={})".format(self.start, self.end)
-
-    def dump(self, writer=print):
-        """Dump a graphical representation of the NFA"""
-        todo = [self.start]
-        for i, state in enumerate(todo):
-            writer("  State", i, state is self.end and "(final)" or "")
-            for arc in state.arcs:
-                label = arc.label
-                next = arc.target
-                if next in todo:
-                    j = todo.index(next)
-                else:
-                    j = len(todo)
-                    todo.append(next)
-                if label is None:
-                    writer("    -> %d" % j)
-                else:
-                    writer("    %s -> %d" % (label, j))
-
-
-class NFAArc:
-    """An arc representing a transition between two NFA states.
-
-    NFA states can be connected in two ways:
-
-    * A label transition: An input equal to the label must
-      be consumed to perform the transition.
-    * An epsilon transition: The transition can be taken without
-      consuming any input symbol.
-
-    Attributes:
-        target (NFAState): The ending state of the transition arc.
-        label (Optional[str]): The label that must be consumed to make
-            the transition.  An epsilon transition is represented
-            using `None`.
-    """
-
-    def __init__(self, target, label):
-        self.target = target
-        self.label = label
-
-    def __repr__(self):
-        return "<%s: %s>" % (self.__class__.__name__, self.label)
-
-
-class NFAState:
-    """A state of an NFA (non-deterministic finite automaton).
-
-    Attributes:
-        rule_name (str): The name of the rule this state belongs to.
-        arcs (List[NFAArc]): The arcs representing transitions from
-            this NFA state to other NFA states, each optionally
-            carrying a label.
-    """
-
-    def __init__(self, rule_name):
-        self.rule_name = rule_name
-        self.arcs = []
-
-    def add_arc(self, target, label=None):
-        """Add a new arc to connect the state to a target state within the NFA
-
-        The method adds a new arc to the list of arcs available as transitions
-        from the present state.  An optional label indicates a named transition
-        that consumes an input, while the absence of a label represents an
-        epsilon transition.
-
-        Parameters:
-            target (NFAState): The end of the transition that the arc represents.
-            label (Optional[str]): The label that must be consumed for making
-                the transition.  If the label is not provided, the transition
-                is assumed to be an epsilon transition.
-        """
-        assert label is None or isinstance(label, str)
-        assert isinstance(target, NFAState)
-        self.arcs.append(NFAArc(target, label))
-
-    def __repr__(self):
-        return "<%s: from %s>" % (self.__class__.__name__, self.rule_name)
-
-
-class DFA:
-    """A deterministic finite automaton
-
-    A deterministic finite automaton is a form of finite state machine
-    that obeys the following rules:
-
-    * Each of the transitions is uniquely determined by
-      the source state and input symbol
-    * Reading an input symbol is required for each state
-      transition (no epsilon transitions).
-
-    The finite-state machine will accept or reject a string of symbols
-    and produces a unique computation of the automaton for each input
-    string.  The DFA must have a unique starting state (represented as the
-    first element in the list of states) but can have multiple final states.
-
-    Attributes:
-        name (str): The name of the rule the DFA is representing.
-        states (List[DFAState]): A collection of DFA states.
-    """
-
-    def __init__(self, name, states):
-        self.name = name
-        self.states = states
-
-    @classmethod
-    def from_nfa(cls, nfa):
-        """Constructs a DFA from an NFA using the Rabin–Scott construction algorithm.
-
-        To simulate the operation of a DFA on a given input string, it's
-        necessary to keep track of a single state at any time, or more precisely,
-        the state that the automaton will reach after seeing a prefix of the
-        input.  In contrast, to simulate an NFA, it's necessary to keep track of
-        a set of states: all of the states that the automaton could reach after
-        seeing the same prefix of the input, according to the nondeterministic
-        choices made by the automaton.  There are two possible sources of
-        non-determinism:
-
-        1) Multiple (one or more) transitions with the same label
-
-                          'A'      +-------+
-                 +----------->+ State +----------->+
-                 |            |   2   |
-        +-------+             +-------+
-        | State |
-        |   1   |             +-------+
-        +-------+             | State |
-                 +----------->+   3   +----------->+
-                          'A'      +-------+
-
-        2) Epsilon transitions (transitions that can be taken without consuming any input)
-
-           +-------+            +-------+
-           | State |     ε      | State |
-           |   1   +----------->+   2   +----------->+
-           +-------+            +-------+
-
-        Looking at the first case above, we can't determine which
-        transition should be followed when given an input A; we could
-        follow either one.
-        In the second case the problem is that we can choose whether or not
-        to take the epsilon transition.  To solve this problem we can imagine
-        that we follow all possibilities at the same time and we construct new
-        states from the set of all possible reachable states.  For every case
-        in the previous example:
-
-        1) For multiple transitions with the same label we collapse all of the
-           final states under the same one
-
-           +-------+            +-------+
-           | State |    'A'     | State |
-           |   1   +----------->+  2-3  +----------->+
-           +-------+            +-------+
-
-        2) For epsilon transitions we collapse all epsilon-reachable states
-           into the same one
-
-           +-------+
-           | State |
-           |  1-2  +----------->
-           +-------+
-
-        Because the DFA states consist of sets of NFA states, an n-state NFA
-        may be converted to a DFA with at most 2**n states.  Notice that the
-        constructed DFA is not minimal and can be simplified or reduced
-        afterwards.
-
-        Parameters:
-            nfa (NFA): The NFA to transform into a DFA.
-        """
-        assert isinstance(nfa, NFA)
-
-        def add_closure(nfa_state, base_nfa_set):
-            """Calculate the epsilon-closure of a given state
-
-            Add to the *base_nfa_set* all the states that are
-            reachable from *nfa_state* via epsilon-transitions.
-            """
-            assert isinstance(nfa_state, NFAState)
-            if nfa_state in base_nfa_set:
-                return
-            base_nfa_set.add(nfa_state)
-            for nfa_arc in nfa_state.arcs:
-                if nfa_arc.label is None:
-                    add_closure(nfa_arc.target, base_nfa_set)
-
-        # Calculate the epsilon-closure of the starting state
-        base_nfa_set = set()
-        add_closure(nfa.start, base_nfa_set)
-
-        # Start by visiting the NFA starting state (there is only one).
-        states = [DFAState(nfa.name, base_nfa_set, nfa.end)]
-
-        for state in states:  # NB states grow while we're iterating
-
-            # Find transitions from the current state to other reachable states
-            # and store them in a mapping that correlates the label to all the
-            # possible reachable states that can be obtained by consuming a
-            # token equal to the label.  Each set of all the states that can
-            # be reached after following a label will become a DFA state.
-            arcs = {}
-            for nfa_state in state.nfa_set:
-                for nfa_arc in nfa_state.arcs:
-                    if nfa_arc.label is not None:
-                        nfa_set = arcs.setdefault(nfa_arc.label, set())
-                        # All states that can be reached by epsilon-transitions
-                        # are also included in the set of reachable states.
-                        add_closure(nfa_arc.target, nfa_set)
-
-            # Now create new DFA states by visiting all possible transitions
-            # between the current DFA state and the new power-set states (each
-            # nfa_set) via the different labels.  As the nodes are appended to
-            # *states*, this performs a breadth-first search traversal over the
-            # power set of the states of the original NFA.
-            for label, nfa_set in sorted(arcs.items()):
-                for existing_state in states:
-                    if existing_state.nfa_set == nfa_set:
-                        # The DFA state already exists for this rule.
-                        next_state = existing_state
-                        break
-                else:
-                    next_state = DFAState(nfa.name, nfa_set, nfa.end)
-                    states.append(next_state)
-
-                # Add a transition between the current DFA state and the new
-                # DFA state (the power-set state) via the current label.
-                state.add_arc(next_state, label)
-
-        return cls(nfa.name, states)
-
-    def __iter__(self):
-        return iter(self.states)
-
-    def simplify(self):
-        """Attempt to reduce the number of states of the DFA
-
-        Transform the DFA into an equivalent DFA that has fewer states.
-        To minimize it, two classes of states can be removed or merged
-        from the original DFA without affecting the language it accepts:
-
-        * Unreachable states cannot be reached from the initial
-          state of the DFA by any input string.
-        * Nondistinguishable states are those that cannot be distinguished
-          from one another for any input string.
-
-        This algorithm does not achieve a fully minimized DFA, but it
-        works well enough for the particularities of the Python grammar.  The
-        algorithm repeatedly looks for two states that have the same set of
-        arcs (same labels pointing to the same nodes) and unifies them, until
-        things stop changing.
-        """
-        changes = True
-        while changes:
-            changes = False
-            for i, state_i in enumerate(self.states):
-                for j in range(i + 1, len(self.states)):
-                    state_j = self.states[j]
-                    if state_i == state_j:
-                        del self.states[j]
-                        for state in self.states:
-                            state.unifystate(state_j, state_i)
-                        changes = True
-                        break
-
-    def dump(self, writer=print):
-        """Dump a graphical representation of the DFA"""
-        for i, state in enumerate(self.states):
-            writer("  State", i, state.is_final and "(final)" or "")
-            for label, next in sorted(state.arcs.items()):
-                writer("    %s -> %d" % (label, self.states.index(next)))
-
-
-class DFAState(object):
-    """A state of a DFA
-
-    Attributes:
-        rule_name (str): The name of the DFA rule containing the represented state.
-        nfa_set (Set[NFAState]): The set of NFA states used to create this state.
-        is_final (bool): True if the state represents an accepting state of the
-            DFA containing this state.
-        arcs (Dict[str, DFAState]): A mapping representing transitions between
-            the current DFA state and another DFA state via following a label.
-    """
-
-    def __init__(self, rule_name, nfa_set, final):
-        assert isinstance(nfa_set, set)
-        assert isinstance(next(iter(nfa_set)), NFAState)
-        assert isinstance(final, NFAState)
-        self.rule_name = rule_name
-        self.nfa_set = nfa_set
-        self.arcs = {}  # map from terminals/nonterminals to DFAState
-        self.is_final = final in nfa_set
-
-    def add_arc(self, target, label):
-        """Add a new arc to the current state.
-
-        Parameters:
-            target (DFAState): The DFA state at the end of the arc.
-            label (str): The label representing the token that must be consumed
-                to perform this transition.
-        """
-        assert isinstance(label, str)
-        assert label not in self.arcs
-        assert isinstance(target, DFAState)
-        self.arcs[label] = target
-
-    def unifystate(self, old, new):
-        """Replace all arcs from the current node to *old* with *new*.
-
-        Parameters:
-            old (DFAState): The DFA state to remove from all existing arcs.
-            new (DFAState): The DFA state to replace in all existing arcs.
-        """
-        for label, next_ in self.arcs.items():
-            if next_ is old:
-                self.arcs[label] = new
-
-    def __eq__(self, other):
-        # The nfa_set does not matter for equality
-        assert isinstance(other, DFAState)
-        if self.is_final != other.is_final:
-            return False
-        # We cannot just return self.arcs == other.arcs because that
-        # would invoke this method recursively if there are any cycles.
-        if len(self.arcs) != len(other.arcs):
-            return False
-        for label, next_ in self.arcs.items():
-            if next_ is not other.arcs.get(label):
-                return False
-        return True
-
-    __hash__ = None  # For Py3 compatibility.
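The cycle caveat in DFAState.__eq__ above is easy to see concretely. A
minimal sketch (an illustration only, not part of the removed module) of
how comparing the arc dictionaries by value would recurse on a self-loop:

    class Node:
        def __init__(self):
            self.arcs = {}

        def __eq__(self, other):
            # dict equality compares values with ==, re-entering __eq__
            return self.arcs == other.arcs

    a, b = Node(), Node()
    a.arcs["x"] = a  # self-loop
    b.arcs["x"] = b  # self-loop
    # a == b -> a.arcs == b.arcs -> a == b again ... -> RecursionError

Comparing arc targets by identity (`is`), as the deleted __eq__ does,
sidesteps the cycle entirely.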
- - def __repr__(self): - return "<%s: %s is_final=%s>" % ( - self.__class__.__name__, - self.rule_name, - self.is_final, - ) diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py deleted file mode 100644 index ce40e16..0000000 --- a/Parser/pgen/grammar.py +++ /dev/null @@ -1,147 +0,0 @@ -import collections - - -class Grammar: - """Pgen parsing tables class. - - The instance variables are as follows: - - symbol2number -- a dict mapping symbol names to numbers. Symbol - numbers are always 256 or higher, to distinguish - them from token numbers, which are between 0 and - 255 (inclusive). - - number2symbol -- a dict mapping numbers to symbol names; - these two are each other's inverse. - - states -- a list of DFAs, where each DFA is a list of - states, each state is a list of arcs, and each - arc is a (i, j) pair where i is a label and j is - a state number. The DFA number is the index into - this list. (This name is slightly confusing.) - Final states are represented by a special arc of - the form (0, j) where j is its own state number. - - dfas -- a dict mapping symbol numbers to (DFA, first) - pairs, where DFA is an item from the states list - above, and first is a set of tokens that can - begin this grammar rule. - - labels -- a list of (x, y) pairs where x is either a token - number or a symbol number, and y is either None - or a string; the strings are keywords. The label - number is the index in this list; label numbers - are used to mark state transitions (arcs) in the - DFAs. - - start -- the number of the grammar's start symbol. - - keywords -- a dict mapping keyword strings to arc labels. - - tokens -- a dict mapping token numbers to arc labels. - - """ - - def __init__(self): - self.symbol2number = collections.OrderedDict() - self.number2symbol = collections.OrderedDict() - self.states = [] - self.dfas = collections.OrderedDict() - self.labels = [(0, "EMPTY")] - self.keywords = collections.OrderedDict() - self.tokens = collections.OrderedDict() - self.symbol2label = collections.OrderedDict() - self.start = 256 - - def produce_graminit_h(self, writer): - writer("/* Generated by Parser/pgen */\n\n") - for number, symbol in self.number2symbol.items(): - writer("#define {} {}\n".format(symbol, number)) - - def produce_graminit_c(self, writer): - writer("/* Generated by Parser/pgen */\n\n") - - writer('#include "exports.h"\n') - writer('#include "grammar.h"\n') - writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar;\n") - - self.print_dfas(writer) - self.print_labels(writer) - - writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar = {\n") - writer(" {n_dfas},\n".format(n_dfas=len(self.dfas))) - writer(" dfas,\n") - writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels))) - writer(" {start_number}\n".format(start_number=self.start)) - writer("};\n") - - def print_labels(self, writer): - writer( - "static const label labels[{n_labels}] = {{\n".format( - n_labels=len(self.labels) - ) - ) - for label, name in self.labels: - label_name = '"{}"'.format(name) if name is not None else 0 - writer( - " {{{label}, {label_name}}},\n".format( - label=label, label_name=label_name - ) - ) - writer("};\n") - - def print_dfas(self, writer): - self.print_states(writer) - writer("static const dfa dfas[{}] = {{\n".format(len(self.dfas))) - for dfaindex, dfa_elem in enumerate(self.dfas.items()): - symbol, (dfa, first_sets) = dfa_elem - writer( - ' {{{dfa_symbol}, "{symbol_name}", '.format( - dfa_symbol=symbol, symbol_name=self.number2symbol[symbol] - ) - + "{n_states}, 
states_{dfa_index},\n".format( - n_states=len(dfa), dfa_index=dfaindex - ) - + ' "' - ) - - bitset = bytearray((len(self.labels) >> 3) + 1) - for token in first_sets: - bitset[token >> 3] |= 1 << (token & 7) - for byte in bitset: - writer("\\%03o" % (byte & 0xFF)) - writer('"},\n') - writer("};\n") - - def print_states(self, write): - for dfaindex, dfa in enumerate(self.states): - self.print_arcs(write, dfaindex, dfa) - write( - "static state states_{dfa_index}[{n_states}] = {{\n".format( - dfa_index=dfaindex, n_states=len(dfa) - ) - ) - for stateindex, state in enumerate(dfa): - narcs = len(state) - write( - " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format( - n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex - ) - ) - write("};\n") - - def print_arcs(self, write, dfaindex, states): - for stateindex, state in enumerate(states): - narcs = len(state) - write( - "static const arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format( - dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs - ) - ) - for a, b in state: - write( - " {{{from_label}, {to_state}}},\n".format( - from_label=a, to_state=b - ) - ) - write("};\n") diff --git a/Parser/pgen/keywordgen.py b/Parser/pgen/keywordgen.py deleted file mode 100644 index f0234a8..0000000 --- a/Parser/pgen/keywordgen.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Generate Lib/keyword.py from the Grammar and Tokens files using pgen""" - -import argparse - -from .pgen import ParserGenerator - -TEMPLATE = r''' -"""Keywords (from "Grammar/Grammar") - -This file is automatically generated; please don't muck it up! - -To update the symbols in this file, 'cd' to the top directory of -the python source tree and run: - - python3 -m Parser.pgen.keywordgen Grammar/Grammar \ - Grammar/Tokens \ - Lib/keyword.py - -Alternatively, you can run 'make regen-keyword'. -""" - -__all__ = ["iskeyword", "kwlist"] - -kwlist = [ - {keywords} -] - -iskeyword = frozenset(kwlist).__contains__ -'''.lstrip() - -EXTRA_KEYWORDS = ["async", "await"] - - -def main(): - parser = argparse.ArgumentParser( - description="Generate the Lib/keywords.py " "file from the grammar." 
- ) - parser.add_argument( - "grammar", type=str, help="The file with the grammar definition in EBNF format" - ) - parser.add_argument("tokens", type=str, help="The file with the token definitions") - parser.add_argument( - "keyword_file", - type=argparse.FileType("w"), - help="The path to write the keyword definitions", - ) - args = parser.parse_args() - p = ParserGenerator(args.grammar, args.tokens) - grammar = p.make_grammar() - - with args.keyword_file as thefile: - all_keywords = sorted(list(grammar.keywords) + EXTRA_KEYWORDS) - - keywords = ",\n ".join(map(repr, all_keywords)) - thefile.write(TEMPLATE.format(keywords=keywords)) - - -if __name__ == "__main__": - main() diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py deleted file mode 100644 index 074a083..0000000 --- a/Parser/pgen/metaparser.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Parser for the Python metagrammar""" - -import io -import tokenize # from stdlib - -from .automata import NFA, NFAState - - -class GrammarParser: - """Parser for Python grammar files.""" - - _translation_table = { - tokenize.NAME: "NAME", - tokenize.STRING: "STRING", - tokenize.NEWLINE: "NEWLINE", - tokenize.NL: "NL", - tokenize.OP: "OP", - tokenize.ENDMARKER: "ENDMARKER", - tokenize.COMMENT: "COMMENT", - } - - def __init__(self, grammar): - self.grammar = grammar - grammar_adaptor = io.StringIO(grammar) - self.generator = tokenize.generate_tokens(grammar_adaptor.readline) - self._gettoken() # Initialize lookahead - self._current_rule_name = None - - def parse(self): - """Turn the grammar into a collection of NFAs""" - # grammar: (NEWLINE | rule)* ENDMARKER - while self.type != tokenize.ENDMARKER: - while self.type == tokenize.NEWLINE: - self._gettoken() - # rule: NAME ':' rhs NEWLINE - self._current_rule_name = self._expect(tokenize.NAME) - self._expect(tokenize.OP, ":") - a, z = self._parse_rhs() - self._expect(tokenize.NEWLINE) - - yield NFA(a, z) - - def _parse_rhs(self): - # rhs: items ('|' items)* - a, z = self._parse_items() - if self.value != "|": - return a, z - else: - aa = NFAState(self._current_rule_name) - zz = NFAState(self._current_rule_name) - while True: - # Allow to transit directly to the previous state and connect the end of the - # previous state to the end of the current one, effectively allowing to skip - # the current state. - aa.add_arc(a) - z.add_arc(zz) - if self.value != "|": - break - - self._gettoken() - a, z = self._parse_items() - return aa, zz - - def _parse_items(self): - # items: item+ - a, b = self._parse_item() - while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["): - c, d = self._parse_item() - # Allow a transition between the end of the previous state - # and the beginning of the new one, connecting all the items - # together. In this way we can only reach the end if we visit - # all the items. - b.add_arc(c) - b = d - return a, b - - def _parse_item(self): - # item: '[' rhs ']' | atom ['+' | '*'] - if self.value == "[": - self._gettoken() - a, z = self._parse_rhs() - self._expect(tokenize.OP, "]") - # Make a transition from the beginning to the end so it is possible to - # advance for free to the next state of this item # without consuming - # anything from the rhs. - a.add_arc(z) - return a, z - else: - a, z = self._parse_atom() - value = self.value - if value not in ("+", "*"): - return a, z - self._gettoken() - z.add_arc(a) - if value == "+": - # Create a cycle to the beginning so we go back to the old state in this - # item and repeat. 
-                return a, z
-            else:
-                # The end state is the same as the beginning, so we can cycle
-                # arbitrarily and end in the beginning if necessary.
-                return a, a
-
-    def _parse_atom(self):
-        # atom: '(' rhs ')' | NAME | STRING
-        if self.value == "(":
-            self._gettoken()
-            a, z = self._parse_rhs()
-            self._expect(tokenize.OP, ")")
-            return a, z
-        elif self.type in (tokenize.NAME, tokenize.STRING):
-            a = NFAState(self._current_rule_name)
-            z = NFAState(self._current_rule_name)
-            # We can transition to the next state only if we consume the value.
-            a.add_arc(z, self.value)
-            self._gettoken()
-            return a, z
-        else:
-            self._raise_error(
-                "expected (...) or NAME or STRING, got {} ({})",
-                self._translation_table.get(self.type, self.type),
-                self.value,
-            )
-
-    def _expect(self, type_, value=None):
-        if self.type != type_:
-            self._raise_error(
-                "expected {}, got {} ({})",
-                self._translation_table.get(type_, type_),
-                self._translation_table.get(self.type, self.type),
-                self.value,
-            )
-        if value is not None and self.value != value:
-            self._raise_error("expected {}, got {}", value, self.value)
-        value = self.value
-        self._gettoken()
-        return value
-
-    def _gettoken(self):
-        tup = next(self.generator)
-        while tup[0] in (tokenize.COMMENT, tokenize.NL):
-            tup = next(self.generator)
-        self.type, self.value, self.begin, self.end, self.line = tup
-
-    def _raise_error(self, msg, *args):
-        if args:
-            try:
-                msg = msg.format(*args)
-            except Exception:
-                msg = " ".join([msg] + list(map(str, args)))
-        line = self.grammar.splitlines()[self.begin[0] - 1]
-        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
deleted file mode 100644
index 2f444eb..0000000
--- a/Parser/pgen/pgen.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"""Python parser generator
-
-
-This parser generator transforms a Python grammar file into parsing tables
-that can be consumed by Python's LL(1) parser written in C.
-
-Concepts
---------
-
-* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
-  top-down parser for a subset of context-free languages.  It parses the input
-  from Left to right, performing Leftmost derivation of the sentence, and can
-  only use 1 token of lookahead when parsing a sentence.
-
-* A parsing table is a collection of data that a generic implementation of the
-  LL(1) parser consumes to know how to parse a given context-free grammar.  In
-  this case the collection of data involves Deterministic Finite Automatons,
-  calculated first sets, keywords and transition labels.
-
-* A grammar is defined by production rules (or just 'productions') that specify
-  which symbols may replace which other symbols; these rules may be used to
-  generate strings, or to parse them.  Each such rule has a head, or left-hand
-  side, which consists of the string that may be replaced, and a body, or
-  right-hand side, which consists of a string that may replace it.  In the
-  Python grammar, rules are written in the form
-
-      rule_name: rule_description;
-
-  meaning the rule 'a: b' specifies that a can be replaced by b.  A context-free
-  grammar is a grammar in which the left-hand side of each production rule
-  consists of only a single nonterminal symbol.  Context-free grammars can
-  always be recognized by a non-deterministic pushdown automaton.
-
-* Terminal symbols are literal symbols which may appear in the outputs of the
-  production rules of the grammar and which cannot be changed using the rules
-  of the grammar.
-  Applying the rules recursively to a source string of symbols
-  will usually terminate in a final output string consisting only of terminal
-  symbols.
-
-* Nonterminal symbols are those symbols which can be replaced.  The grammar
-  includes a start symbol, a designated member of the set of nonterminals from
-  which all the strings in the language may be derived by successive
-  applications of the production rules.
-
-* The language defined by the grammar is the set of terminal strings
-  that can be derived using the production rules.
-
-* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
-  that can appear in the first position of any string derived from the rule.
-  This is useful for LL(1) parsers as the parser is only allowed to look at the
-  next token in the input to know which rule it needs to parse.  For example,
-  given this grammar:
-
-      start: '(' A | B ')'
-      A: 'a' '<'
-      B: 'b' '<'
-
-  and the input '(b<)' the parser can only look at 'b' to know if it needs
-  to parse A or B.  Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
-  that it needs to continue parsing rule B because only that rule can start
-  with 'b'.
-
-Description
------------
-
-The input for the parser generator is a grammar in extended BNF form (using *
-for repetition, + for at-least-once repetition, [] for optional parts, | for
-alternatives and () for grouping).
-
-Each rule in the grammar file is considered as a regular expression in its
-own right.  It is turned into a Non-deterministic Finite Automaton (NFA),
-which is then turned into a Deterministic Finite Automaton (DFA), which is
-then optimized to reduce the number of states.  See [Aho&Ullman 77] chapter 3,
-or similar compiler books (this technique is more often used for lexical
-analyzers).
-
-The DFAs are used by the parser as parsing tables in a special way that's
-probably unique.  Before they are usable, the FIRST sets of all non-terminals
-are computed so the LL(1) parser consuming the parsing tables can distinguish
-between different transitions.
-
-Reference
----------
-
-[Aho&Ullman 77]
-    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
-    (first edition)
-"""
-
-from ast import literal_eval
-import collections
-
-from . import grammar, token
-from .automata import DFA
-from .metaparser import GrammarParser
-
-import enum
-
-
-class LabelType(enum.Enum):
-    NONTERMINAL = 0
-    NAMED_TOKEN = 1
-    KEYWORD = 2
-    OPERATOR = 3
-    NONE = 4
-
-
-class Label(str):
-    def __init__(self, value):
-        self.type = self._get_type()
-
-    def _get_type(self):
-        if self[0].isalpha():
-            if self.upper() == self:
-                # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
-                return LabelType.NAMED_TOKEN
-            else:
-                # If it is not uppercase, it must be a nonterminal.
- return LabelType.NONTERMINAL - else: - # Keywords and operators are wrapped in quotes - assert self[0] == self[-1] in ('"', "'"), self - value = literal_eval(self) - if value[0].isalpha(): - return LabelType.KEYWORD - else: - return LabelType.OPERATOR - - def __repr__(self): - return "{}({})".format(self.type, super().__repr__()) - - -class ParserGenerator(object): - def __init__(self, grammar_file, token_file, verbose=False): - with open(grammar_file) as f: - self.grammar = f.read() - with open(token_file) as tok_file: - token_lines = tok_file.readlines() - self.tokens = dict(token.generate_tokens(token_lines)) - self.opmap = dict(token.generate_opmap(token_lines)) - # Manually add <> so it does not collide with != - self.opmap["<>"] = "NOTEQUAL" - self.verbose = verbose - self.filename = grammar_file - self.dfas, self.startsymbol = self.create_dfas() - self.first = {} # map from symbol name to set of tokens - self.calculate_first_sets() - - def create_dfas(self): - rule_to_dfas = collections.OrderedDict() - start_nonterminal = None - for nfa in GrammarParser(self.grammar).parse(): - if self.verbose: - print("Dump of NFA for", nfa.name) - nfa.dump() - dfa = DFA.from_nfa(nfa) - if self.verbose: - print("Dump of DFA for", dfa.name) - dfa.dump() - dfa.simplify() - rule_to_dfas[dfa.name] = dfa - - if start_nonterminal is None: - start_nonterminal = dfa.name - - return rule_to_dfas, start_nonterminal - - def make_grammar(self): - c = grammar.Grammar() - c.all_labels = set() - names = list(self.dfas.keys()) - names.remove(self.startsymbol) - names.insert(0, self.startsymbol) - for name in names: - i = 256 + len(c.symbol2number) - c.symbol2number[Label(name)] = i - c.number2symbol[i] = Label(name) - c.all_labels.add(name) - for name in names: - self.make_label(c, name) - dfa = self.dfas[name] - states = [] - for state in dfa: - arcs = [] - for label, next in sorted(state.arcs.items()): - c.all_labels.add(label) - arcs.append((self.make_label(c, label), dfa.states.index(next))) - if state.is_final: - arcs.append((0, dfa.states.index(state))) - states.append(arcs) - c.states.append(states) - c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name)) - c.start = c.symbol2number[self.startsymbol] - - if self.verbose: - print("") - print("Grammar summary") - print("===============") - - print("- {n_labels} labels".format(n_labels=len(c.labels))) - print("- {n_dfas} dfas".format(n_dfas=len(c.dfas))) - print("- {n_tokens} tokens".format(n_tokens=len(c.tokens))) - print("- {n_keywords} keywords".format(n_keywords=len(c.keywords))) - print( - "- Start symbol: {start_symbol}".format( - start_symbol=c.number2symbol[c.start] - ) - ) - return c - - def make_first_sets(self, c, name): - rawfirst = self.first[name] - first = set() - for label in sorted(rawfirst): - ilabel = self.make_label(c, label) - ##assert ilabel not in first # XXX failed on <> ... 
!= - first.add(ilabel) - return first - - def make_label(self, c, label): - label = Label(label) - ilabel = len(c.labels) - - if label.type == LabelType.NONTERMINAL: - if label in c.symbol2label: - return c.symbol2label[label] - else: - c.labels.append((c.symbol2number[label], None)) - c.symbol2label[label] = ilabel - return ilabel - elif label.type == LabelType.NAMED_TOKEN: - # A named token (NAME, NUMBER, STRING) - itoken = self.tokens.get(label, None) - assert isinstance(itoken, int), label - assert itoken in self.tokens.values(), label - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - elif label.type == LabelType.KEYWORD: - # A keyword - value = literal_eval(label) - if value in c.keywords: - return c.keywords[value] - else: - c.labels.append((self.tokens["NAME"], value)) - c.keywords[value] = ilabel - return ilabel - elif label.type == LabelType.OPERATOR: - # An operator (any non-numeric token) - value = literal_eval(label) - tok_name = self.opmap[value] # Fails if unknown token - itoken = self.tokens[tok_name] - if itoken in c.tokens: - return c.tokens[itoken] - else: - c.labels.append((itoken, None)) - c.tokens[itoken] = ilabel - return ilabel - else: - raise ValueError("Cannot categorize label {}".format(label)) - - def calculate_first_sets(self): - names = list(self.dfas.keys()) - for name in names: - if name not in self.first: - self.calculate_first_sets_for_rule(name) - - if self.verbose: - print("First set for {dfa_name}".format(dfa_name=name)) - for item in self.first[name]: - print(" - {terminal}".format(terminal=item)) - - def calculate_first_sets_for_rule(self, name): - dfa = self.dfas[name] - self.first[name] = None # dummy to detect left recursion - state = dfa.states[0] - totalset = set() - overlapcheck = {} - for label, next in state.arcs.items(): - if label in self.dfas: - if label in self.first: - fset = self.first[label] - if fset is None: - raise ValueError("recursion for rule %r" % name) - else: - self.calculate_first_sets_for_rule(label) - fset = self.first[label] - totalset.update(fset) - overlapcheck[label] = fset - else: - totalset.add(label) - overlapcheck[label] = {label} - inverse = {} - for label, itsfirst in overlapcheck.items(): - for symbol in itsfirst: - if symbol in inverse: - raise ValueError( - "rule %s is ambiguous; %s is in the" - " first sets of %s as well as %s" - % (name, symbol, label, inverse[symbol]) - ) - inverse[symbol] = label - self.first[name] = totalset diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py deleted file mode 100644 index 2cff62c..0000000 --- a/Parser/pgen/token.py +++ /dev/null @@ -1,38 +0,0 @@ -import itertools - - -def generate_tokens(tokens): - numbers = itertools.count(0) - for line in tokens: - line = line.strip() - - if not line or line.startswith("#"): - continue - - name = line.split()[0] - yield (name, next(numbers)) - - yield ("N_TOKENS", next(numbers)) - yield ("NT_OFFSET", 256) - - -def generate_opmap(tokens): - for line in tokens: - line = line.strip() - - if not line or line.startswith("#"): - continue - - pieces = line.split() - - if len(pieces) != 2: - continue - - name, op = pieces - yield (op.strip("'"), name) - - # Yield independently <>. This is needed so it does not collide - # with the token generation in "generate_tokens" because if this - # symbol is included in Grammar/Tokens, it will collide with != - # as it has the same name (NOTEQUAL). 
- yield ("<>", "NOTEQUAL") diff --git a/Parser/pgenmain.c b/Parser/pgenmain.c new file mode 100644 index 0000000..0b47295 --- /dev/null +++ b/Parser/pgenmain.c @@ -0,0 +1,174 @@ + +/* Parser generator main program */ + +/* This expects a filename containing the grammar as argv[1] (UNIX) + or asks the console for such a file name (THINK C). + It writes its output on two files in the current directory: + - "graminit.c" gets the grammar as a bunch of initialized data + - "graminit.h" gets the grammar's non-terminals as #defines. + Error messages and status info during the generation process are + written to stdout, or sometimes to stderr. */ + +/* XXX TO DO: + - check for duplicate definitions of names (instead of fatal err) +*/ + +#include "Python.h" +#include "pgenheaders.h" +#include "grammar.h" +#include "node.h" +#include "parsetok.h" +#include "pgen.h" + +int Py_DebugFlag; +int Py_VerboseFlag; +int Py_IgnoreEnvironmentFlag; + +/* Forward */ +grammar *getgrammar(char *filename); + +void +Py_Exit(int sts) +{ + exit(sts); +} + +int +main(int argc, char **argv) +{ + grammar *g; + FILE *fp; + char *filename, *graminit_h, *graminit_c; + + if (argc != 4) { + fprintf(stderr, + "usage: %s grammar graminit.h graminit.c\n", argv[0]); + Py_Exit(2); + } + filename = argv[1]; + graminit_h = argv[2]; + graminit_c = argv[3]; + g = getgrammar(filename); + fp = fopen(graminit_c, "w"); + if (fp == NULL) { + perror(graminit_c); + Py_Exit(1); + } + if (Py_DebugFlag) + printf("Writing %s ...\n", graminit_c); + printgrammar(g, fp); + fclose(fp); + fp = fopen(graminit_h, "w"); + if (fp == NULL) { + perror(graminit_h); + Py_Exit(1); + } + if (Py_DebugFlag) + printf("Writing %s ...\n", graminit_h); + printnonterminals(g, fp); + fclose(fp); + freegrammar(g); + Py_Exit(0); + return 0; /* Make gcc -Wall happy */ +} + +grammar * +getgrammar(char *filename) +{ + FILE *fp; + node *n; + grammar *g0, *g; + perrdetail err; + + fp = fopen(filename, "r"); + if (fp == NULL) { + perror(filename); + Py_Exit(1); + } + g0 = meta_grammar(); + n = PyParser_ParseFile(fp, filename, g0, g0->g_start, + (char *)NULL, (char *)NULL, &err); + fclose(fp); + if (n == NULL) { + fprintf(stderr, "Parsing error %d, line %d.\n", + err.error, err.lineno); + if (err.text != NULL) { + size_t i; + fprintf(stderr, "%s", err.text); + i = strlen(err.text); + if (i == 0 || err.text[i-1] != '\n') + fprintf(stderr, "\n"); + for (i = 0; i < err.offset; i++) { + if (err.text[i] == '\t') + putc('\t', stderr); + else + putc(' ', stderr); + } + fprintf(stderr, "^\n"); + PyObject_FREE(err.text); + } + Py_Exit(1); + } + g = pgen(n); + if (g == NULL) { + printf("Bad grammar.\n"); + Py_Exit(1); + } + return g; +} + +/* Can't happen in pgen */ +PyObject* +PyErr_Occurred() +{ + return 0; +} + +void +Py_FatalError(const char *msg) +{ + fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg); + Py_Exit(1); +} + +/* No-nonsense my_readline() for tokenizer.c */ + +char * +PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, char *prompt) +{ + size_t n = 1000; + char *p = (char *)PyMem_MALLOC(n); + char *q; + if (p == NULL) + return NULL; + fprintf(stderr, "%s", prompt); + q = fgets(p, n, sys_stdin); + if (q == NULL) { + *p = '\0'; + return p; + } + n = strlen(p); + if (n > 0 && p[n-1] != '\n') + p[n-1] = '\n'; + return (char *)PyMem_REALLOC(p, n+1); +} + +/* No-nonsense fgets */ +char * +Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj) +{ + return fgets(buf, n, stream); +} + + +#include <stdarg.h> + +void +PySys_WriteStderr(const char *format, ...) 
+{ + va_list va; + + va_start(va, format); + vfprintf(stderr, format, va); + va_end(va); +} diff --git a/Parser/printgrammar.c b/Parser/printgrammar.c new file mode 100644 index 0000000..01f552f --- /dev/null +++ b/Parser/printgrammar.c @@ -0,0 +1,117 @@ + +/* Print a bunch of C initializers that represent a grammar */ + +#include "pgenheaders.h" +#include "grammar.h" + +/* Forward */ +static void printarcs(int, dfa *, FILE *); +static void printstates(grammar *, FILE *); +static void printdfas(grammar *, FILE *); +static void printlabels(grammar *, FILE *); + +void +printgrammar(grammar *g, FILE *fp) +{ + fprintf(fp, "/* Generated by Parser/pgen */\n\n"); + fprintf(fp, "#include \"pgenheaders.h\"\n"); + fprintf(fp, "#include \"grammar.h\"\n"); + fprintf(fp, "PyAPI_DATA(grammar) _PyParser_Grammar;\n"); + printdfas(g, fp); + printlabels(g, fp); + fprintf(fp, "grammar _PyParser_Grammar = {\n"); + fprintf(fp, " %d,\n", g->g_ndfas); + fprintf(fp, " dfas,\n"); + fprintf(fp, " {%d, labels},\n", g->g_ll.ll_nlabels); + fprintf(fp, " %d\n", g->g_start); + fprintf(fp, "};\n"); +} + +void +printnonterminals(grammar *g, FILE *fp) +{ + dfa *d; + int i; + + fprintf(fp, "/* Generated by Parser/pgen */\n\n"); + + d = g->g_dfa; + for (i = g->g_ndfas; --i >= 0; d++) + fprintf(fp, "#define %s %d\n", d->d_name, d->d_type); +} + +static void +printarcs(int i, dfa *d, FILE *fp) +{ + arc *a; + state *s; + int j, k; + + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) { + fprintf(fp, "static arc arcs_%d_%d[%d] = {\n", + i, j, s->s_narcs); + a = s->s_arc; + for (k = 0; k < s->s_narcs; k++, a++) + fprintf(fp, " {%d, %d},\n", a->a_lbl, a->a_arrow); + fprintf(fp, "};\n"); + } +} + +static void +printstates(grammar *g, FILE *fp) +{ + state *s; + dfa *d; + int i, j; + + d = g->g_dfa; + for (i = 0; i < g->g_ndfas; i++, d++) { + printarcs(i, d, fp); + fprintf(fp, "static state states_%d[%d] = {\n", + i, d->d_nstates); + s = d->d_state; + for (j = 0; j < d->d_nstates; j++, s++) + fprintf(fp, " {%d, arcs_%d_%d},\n", + s->s_narcs, i, j); + fprintf(fp, "};\n"); + } +} + +static void +printdfas(grammar *g, FILE *fp) +{ + dfa *d; + int i, j; + + printstates(g, fp); + fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas); + d = g->g_dfa; + for (i = 0; i < g->g_ndfas; i++, d++) { + fprintf(fp, " {%d, \"%s\", %d, %d, states_%d,\n", + d->d_type, d->d_name, d->d_initial, d->d_nstates, i); + fprintf(fp, " \""); + for (j = 0; j < NBYTES(g->g_ll.ll_nlabels); j++) + fprintf(fp, "\\%03o", d->d_first[j] & 0xff); + fprintf(fp, "\"},\n"); + } + fprintf(fp, "};\n"); +} + +static void +printlabels(grammar *g, FILE *fp) +{ + label *l; + int i; + + fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels); + l = g->g_ll.ll_label; + for (i = g->g_ll.ll_nlabels; --i >= 0; l++) { + if (l->lb_str == NULL) + fprintf(fp, " {%d, 0},\n", l->lb_type); + else + fprintf(fp, " {%d, \"%s\"},\n", + l->lb_type, l->lb_str); + } + fprintf(fp, "};\n"); +} diff --git a/Parser/spark.py b/Parser/spark.py new file mode 100644 index 0000000..b064d62 --- /dev/null +++ b/Parser/spark.py @@ -0,0 +1,839 @@ +# Copyright (c) 1998-2002 John Aycock +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, 
subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +__version__ = 'SPARK-0.7 (pre-alpha-5)' + +import re +import string + +def _namelist(instance): + namelist, namedict, classlist = [], {}, [instance.__class__] + for c in classlist: + for b in c.__bases__: + classlist.append(b) + for name in c.__dict__.keys(): + if not namedict.has_key(name): + namelist.append(name) + namedict[name] = 1 + return namelist + +class GenericScanner: + def __init__(self, flags=0): + pattern = self.reflect() + self.re = re.compile(pattern, re.VERBOSE|flags) + + self.index2func = {} + for name, number in self.re.groupindex.items(): + self.index2func[number-1] = getattr(self, 't_' + name) + + def makeRE(self, name): + doc = getattr(self, name).__doc__ + rv = '(?P<%s>%s)' % (name[2:], doc) + return rv + + def reflect(self): + rv = [] + for name in _namelist(self): + if name[:2] == 't_' and name != 't_default': + rv.append(self.makeRE(name)) + + rv.append(self.makeRE('t_default')) + return string.join(rv, '|') + + def error(self, s, pos): + print "Lexical error at position %s" % pos + raise SystemExit + + def tokenize(self, s): + pos = 0 + n = len(s) + while pos < n: + m = self.re.match(s, pos) + if m is None: + self.error(s, pos) + + groups = m.groups() + for i in range(len(groups)): + if groups[i] and self.index2func.has_key(i): + self.index2func[i](groups[i]) + pos = m.end() + + def t_default(self, s): + r'( . | \n )+' + print "Specification error: unmatched input" + raise SystemExit + +# +# Extracted from GenericParser and made global so that [un]picking works. +# +class _State: + def __init__(self, stateno, items): + self.T, self.complete, self.items = [], [], items + self.stateno = stateno + +class GenericParser: + # + # An Earley parser, as per J. Earley, "An Efficient Context-Free + # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley, + # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis, + # Carnegie-Mellon University, August 1968. New formulation of + # the parser according to J. Aycock, "Practical Earley Parsing + # and the SPARK Toolkit", Ph.D. thesis, University of Victoria, + # 2001, and J. Aycock and R. N. Horspool, "Practical Earley + # Parsing", unpublished paper, 2001. + # + + def __init__(self, start): + self.rules = {} + self.rule2func = {} + self.rule2name = {} + self.collectRules() + self.augment(start) + self.ruleschanged = 1 + + _NULLABLE = '\e_' + _START = 'START' + _BOF = '|-' + + # + # When pickling, take the time to generate the full state machine; + # some information is then extraneous, too. Unfortunately we + # can't save the rule2func map. 
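The papers cited above describe the algorithm GenericParser implements: Earley items (rule, dot position, origin set) driven by scan, predict, and complete steps. A bare recognizer showing those three steps, with an invented grammar format rather than SPARK's docstring-based rules:

    # Bare Earley recognizer: an item is (lhs, rhs, dot, origin).
    def recognize(grammar, start, tokens):
        sets = [set() for _ in range(len(tokens) + 1)]
        sets[0] = {(start, rhs, 0, 0) for rhs in grammar[start]}
        for i in range(len(tokens) + 1):
            changed = True
            while changed:                       # predict/complete to a fixpoint
                changed = False
                for lhs, rhs, dot, origin in list(sets[i]):
                    if dot < len(rhs) and rhs[dot] in grammar:     # predict
                        new = {(rhs[dot], alt, 0, i) for alt in grammar[rhs[dot]]}
                    elif dot == len(rhs):                          # complete
                        new = {(l, r, d + 1, o) for l, r, d, o in sets[origin]
                               if d < len(r) and r[d] == lhs}
                    else:
                        continue
                    if not new <= sets[i]:
                        sets[i] |= new
                        changed = True
            if i < len(tokens):                                    # scan
                sets[i + 1] = {(l, r, d + 1, o) for l, r, d, o in sets[i]
                               if d < len(r) and r[d] == tokens[i]}
        return any(l == start and d == len(r) and o == 0
                   for l, r, d, o in sets[-1])

    grammar = {"E": [("E", "+", "n"), ("n",)]}
    assert recognize(grammar, "E", ["n", "+", "n"])

SPARK's version precomputes epsilon-kernel states (makeState below) so the inner loops walk a state machine instead of raw items.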
+ # + def __getstate__(self): + if self.ruleschanged: + # + # XXX - duplicated from parse() + # + self.computeNull() + self.newrules = {} + self.new2old = {} + self.makeNewRules() + self.ruleschanged = 0 + self.edges, self.cores = {}, {} + self.states = { 0: self.makeState0() } + self.makeState(0, self._BOF) + # + # XXX - should find a better way to do this.. + # + changes = 1 + while changes: + changes = 0 + for k, v in self.edges.items(): + if v is None: + state, sym = k + if self.states.has_key(state): + self.goto(state, sym) + changes = 1 + rv = self.__dict__.copy() + for s in self.states.values(): + del s.items + del rv['rule2func'] + del rv['nullable'] + del rv['cores'] + return rv + + def __setstate__(self, D): + self.rules = {} + self.rule2func = {} + self.rule2name = {} + self.collectRules() + start = D['rules'][self._START][0][1][1] # Blech. + self.augment(start) + D['rule2func'] = self.rule2func + D['makeSet'] = self.makeSet_fast + self.__dict__ = D + + # + # A hook for GenericASTBuilder and GenericASTMatcher. Mess + # thee not with this; nor shall thee toucheth the _preprocess + # argument to addRule. + # + def preprocess(self, rule, func): return rule, func + + def addRule(self, doc, func, _preprocess=1): + fn = func + rules = string.split(doc) + + index = [] + for i in range(len(rules)): + if rules[i] == '::=': + index.append(i-1) + index.append(len(rules)) + + for i in range(len(index)-1): + lhs = rules[index[i]] + rhs = rules[index[i]+2:index[i+1]] + rule = (lhs, tuple(rhs)) + + if _preprocess: + rule, fn = self.preprocess(rule, func) + + if self.rules.has_key(lhs): + self.rules[lhs].append(rule) + else: + self.rules[lhs] = [ rule ] + self.rule2func[rule] = fn + self.rule2name[rule] = func.__name__[2:] + self.ruleschanged = 1 + + def collectRules(self): + for name in _namelist(self): + if name[:2] == 'p_': + func = getattr(self, name) + doc = func.__doc__ + self.addRule(doc, func) + + def augment(self, start): + rule = '%s ::= %s %s' % (self._START, self._BOF, start) + self.addRule(rule, lambda args: args[1], 0) + + def computeNull(self): + self.nullable = {} + tbd = [] + + for rulelist in self.rules.values(): + lhs = rulelist[0][0] + self.nullable[lhs] = 0 + for rule in rulelist: + rhs = rule[1] + if len(rhs) == 0: + self.nullable[lhs] = 1 + continue + # + # We only need to consider rules which + # consist entirely of nonterminal symbols. + # This should be a savings on typical + # grammars. + # + for sym in rhs: + if not self.rules.has_key(sym): + break + else: + tbd.append(rule) + changes = 1 + while changes: + changes = 0 + for lhs, rhs in tbd: + if self.nullable[lhs]: + continue + for sym in rhs: + if not self.nullable[sym]: + break + else: + self.nullable[lhs] = 1 + changes = 1 + + def makeState0(self): + s0 = _State(0, []) + for rule in self.newrules[self._START]: + s0.items.append((rule, 0)) + return s0 + + def finalState(self, tokens): + # + # Yuck. 
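The __getstate__/__setstate__ pair above is the standard recipe for pickling an object that holds unpicklable members: copy __dict__, drop the offending entries (rule2func holds lambdas), and rebuild them on load. The pattern in isolation, with an illustrative class rather than SPARK's:

    import pickle

    class Machine:
        """Illustrative stand-in for GenericParser's pickle support."""
        def __init__(self):
            self.table = {"rules": 3}                 # picklable state
            self._rebuild()

        def _rebuild(self):
            # Lambdas (like SPARK's rule2func values) cannot be pickled.
            self.dispatch = {"size": lambda: len(self.table)}

        def __getstate__(self):
            state = self.__dict__.copy()
            del state["dispatch"]                     # drop the unpicklable part
            return state

        def __setstate__(self, state):
            self.__dict__.update(state)
            self._rebuild()                           # recreate it on load

    m = pickle.loads(pickle.dumps(Machine()))
    assert m.dispatch["size"]() == 1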
+ # + if len(self.newrules[self._START]) == 2 and len(tokens) == 0: + return 1 + start = self.rules[self._START][0][1][1] + return self.goto(1, start) + + def makeNewRules(self): + worklist = [] + for rulelist in self.rules.values(): + for rule in rulelist: + worklist.append((rule, 0, 1, rule)) + + for rule, i, candidate, oldrule in worklist: + lhs, rhs = rule + n = len(rhs) + while i < n: + sym = rhs[i] + if not self.rules.has_key(sym) or \ + not self.nullable[sym]: + candidate = 0 + i = i + 1 + continue + + newrhs = list(rhs) + newrhs[i] = self._NULLABLE+sym + newrule = (lhs, tuple(newrhs)) + worklist.append((newrule, i+1, + candidate, oldrule)) + candidate = 0 + i = i + 1 + else: + if candidate: + lhs = self._NULLABLE+lhs + rule = (lhs, rhs) + if self.newrules.has_key(lhs): + self.newrules[lhs].append(rule) + else: + self.newrules[lhs] = [ rule ] + self.new2old[rule] = oldrule + + def typestring(self, token): + return None + + def error(self, token): + print "Syntax error at or near `%s' token" % token + raise SystemExit + + def parse(self, tokens): + sets = [ [(1,0), (2,0)] ] + self.links = {} + + if self.ruleschanged: + self.computeNull() + self.newrules = {} + self.new2old = {} + self.makeNewRules() + self.ruleschanged = 0 + self.edges, self.cores = {}, {} + self.states = { 0: self.makeState0() } + self.makeState(0, self._BOF) + + for i in xrange(len(tokens)): + sets.append([]) + + if sets[i] == []: + break + self.makeSet(tokens[i], sets, i) + else: + sets.append([]) + self.makeSet(None, sets, len(tokens)) + + #_dump(tokens, sets, self.states) + + finalitem = (self.finalState(tokens), 0) + if finalitem not in sets[-2]: + if len(tokens) > 0: + self.error(tokens[i-1]) + else: + self.error(None) + + return self.buildTree(self._START, finalitem, + tokens, len(sets)-2) + + def isnullable(self, sym): + # + # For symbols in G_e only. If we weren't supporting 1.5, + # could just use sym.startswith(). + # + return self._NULLABLE == sym[0:len(self._NULLABLE)] + + def skip(self, (lhs, rhs), pos=0): + n = len(rhs) + while pos < n: + if not self.isnullable(rhs[pos]): + break + pos = pos + 1 + return pos + + def makeState(self, state, sym): + assert sym is not None + # + # Compute \epsilon-kernel state's core and see if + # it exists already. + # + kitems = [] + for rule, pos in self.states[state].items: + lhs, rhs = rule + if rhs[pos:pos+1] == (sym,): + kitems.append((rule, self.skip(rule, pos+1))) + core = kitems + + core.sort() + tcore = tuple(core) + if self.cores.has_key(tcore): + return self.cores[tcore] + # + # Nope, doesn't exist. Compute it and the associated + # \epsilon-nonkernel state together; we'll need it right away. + # + k = self.cores[tcore] = len(self.states) + K, NK = _State(k, kitems), _State(k+1, []) + self.states[k] = K + predicted = {} + + edges = self.edges + rules = self.newrules + for X in K, NK: + worklist = X.items + for item in worklist: + rule, pos = item + lhs, rhs = rule + if pos == len(rhs): + X.complete.append(rule) + continue + + nextSym = rhs[pos] + key = (X.stateno, nextSym) + if not rules.has_key(nextSym): + if not edges.has_key(key): + edges[key] = None + X.T.append(nextSym) + else: + edges[key] = None + if not predicted.has_key(nextSym): + predicted[nextSym] = 1 + for prule in rules[nextSym]: + ppos = self.skip(prule) + new = (prule, ppos) + NK.items.append(new) + # + # Problem: we know K needs generating, but we + # don't yet know about NK. Can't commit anything + # regarding NK to self.edges until we're sure. 
Should + # we delay committing on both K and NK to avoid this + # hacky code? This creates other problems.. + # + if X is K: + edges = {} + + if NK.items == []: + return k + + # + # Check for \epsilon-nonkernel's core. Unfortunately we + # need to know the entire set of predicted nonterminals + # to do this without accidentally duplicating states. + # + core = predicted.keys() + core.sort() + tcore = tuple(core) + if self.cores.has_key(tcore): + self.edges[(k, None)] = self.cores[tcore] + return k + + nk = self.cores[tcore] = self.edges[(k, None)] = NK.stateno + self.edges.update(edges) + self.states[nk] = NK + return k + + def goto(self, state, sym): + key = (state, sym) + if not self.edges.has_key(key): + # + # No transitions from state on sym. + # + return None + + rv = self.edges[key] + if rv is None: + # + # Target state isn't generated yet. Remedy this. + # + rv = self.makeState(state, sym) + self.edges[key] = rv + return rv + + def gotoT(self, state, t): + return [self.goto(state, t)] + + def gotoST(self, state, st): + rv = [] + for t in self.states[state].T: + if st == t: + rv.append(self.goto(state, t)) + return rv + + def add(self, set, item, i=None, predecessor=None, causal=None): + if predecessor is None: + if item not in set: + set.append(item) + else: + key = (item, i) + if item not in set: + self.links[key] = [] + set.append(item) + self.links[key].append((predecessor, causal)) + + def makeSet(self, token, sets, i): + cur, next = sets[i], sets[i+1] + + ttype = token is not None and self.typestring(token) or None + if ttype is not None: + fn, arg = self.gotoT, ttype + else: + fn, arg = self.gotoST, token + + for item in cur: + ptr = (item, i) + state, parent = item + add = fn(state, arg) + for k in add: + if k is not None: + self.add(next, (k, parent), i+1, ptr) + nk = self.goto(k, None) + if nk is not None: + self.add(next, (nk, i+1)) + + if parent == i: + continue + + for rule in self.states[state].complete: + lhs, rhs = rule + for pitem in sets[parent]: + pstate, pparent = pitem + k = self.goto(pstate, lhs) + if k is not None: + why = (item, i, rule) + pptr = (pitem, parent) + self.add(cur, (k, pparent), + i, pptr, why) + nk = self.goto(k, None) + if nk is not None: + self.add(cur, (nk, i)) + + def makeSet_fast(self, token, sets, i): + # + # Call *only* when the entire state machine has been built! + # It relies on self.edges being filled in completely, and + # then duplicates and inlines code to boost speed at the + # cost of extreme ugliness. 
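The comment above points at a classic CPython trade-off: in a hot loop, a method call per item costs more than duplicating its body inline, which is exactly what makeSet_fast does by hand-expanding self.add() and self.goto(). A toy measurement of the effect being chased (absolute numbers will vary by machine):

    import timeit

    def add(seq, item):                  # helper, as in the slow makeSet
        if item not in seq:
            seq.append(item)

    def outlined():
        seq = []
        for i in range(1000):
            add(seq, i % 10)

    def inlined():                       # same logic with the call removed
        seq = []
        for i in range(1000):
            item = i % 10
            if item not in seq:
                seq.append(item)

    print("call:  ", timeit.timeit(outlined, number=200))
    print("inline:", timeit.timeit(inlined, number=200))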
+ # + cur, next = sets[i], sets[i+1] + ttype = token is not None and self.typestring(token) or None + + for item in cur: + ptr = (item, i) + state, parent = item + if ttype is not None: + k = self.edges.get((state, ttype), None) + if k is not None: + #self.add(next, (k, parent), i+1, ptr) + #INLINED --v + new = (k, parent) + key = (new, i+1) + if new not in next: + self.links[key] = [] + next.append(new) + self.links[key].append((ptr, None)) + #INLINED --^ + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + #self.add(next, (nk, i+1)) + #INLINED --v + new = (nk, i+1) + if new not in next: + next.append(new) + #INLINED --^ + else: + add = self.gotoST(state, token) + for k in add: + if k is not None: + self.add(next, (k, parent), i+1, ptr) + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + self.add(next, (nk, i+1)) + + if parent == i: + continue + + for rule in self.states[state].complete: + lhs, rhs = rule + for pitem in sets[parent]: + pstate, pparent = pitem + #k = self.goto(pstate, lhs) + k = self.edges.get((pstate, lhs), None) + if k is not None: + why = (item, i, rule) + pptr = (pitem, parent) + #self.add(cur, (k, pparent), + # i, pptr, why) + #INLINED --v + new = (k, pparent) + key = (new, i) + if new not in cur: + self.links[key] = [] + cur.append(new) + self.links[key].append((pptr, why)) + #INLINED --^ + #nk = self.goto(k, None) + nk = self.edges.get((k, None), None) + if nk is not None: + #self.add(cur, (nk, i)) + #INLINED --v + new = (nk, i) + if new not in cur: + cur.append(new) + #INLINED --^ + + def predecessor(self, key, causal): + for p, c in self.links[key]: + if c == causal: + return p + assert 0 + + def causal(self, key): + links = self.links[key] + if len(links) == 1: + return links[0][1] + choices = [] + rule2cause = {} + for p, c in links: + rule = c[2] + choices.append(rule) + rule2cause[rule] = c + return rule2cause[self.ambiguity(choices)] + + def deriveEpsilon(self, nt): + if len(self.newrules[nt]) > 1: + rule = self.ambiguity(self.newrules[nt]) + else: + rule = self.newrules[nt][0] + #print rule + + rhs = rule[1] + attr = [None] * len(rhs) + + for i in range(len(rhs)-1, -1, -1): + attr[i] = self.deriveEpsilon(rhs[i]) + return self.rule2func[self.new2old[rule]](attr) + + def buildTree(self, nt, item, tokens, k): + state, parent = item + + choices = [] + for rule in self.states[state].complete: + if rule[0] == nt: + choices.append(rule) + rule = choices[0] + if len(choices) > 1: + rule = self.ambiguity(choices) + #print rule + + rhs = rule[1] + attr = [None] * len(rhs) + + for i in range(len(rhs)-1, -1, -1): + sym = rhs[i] + if not self.newrules.has_key(sym): + if sym != self._BOF: + attr[i] = tokens[k-1] + key = (item, k) + item, k = self.predecessor(key, None) + #elif self.isnullable(sym): + elif self._NULLABLE == sym[0:len(self._NULLABLE)]: + attr[i] = self.deriveEpsilon(sym) + else: + key = (item, k) + why = self.causal(key) + attr[i] = self.buildTree(sym, why[0], + tokens, why[1]) + item, k = self.predecessor(key, why) + return self.rule2func[self.new2old[rule]](attr) + + def ambiguity(self, rules): + # + # XXX - problem here and in collectRules() if the same rule + # appears in >1 method. Also undefined results if rules + # causing the ambiguity appear in the same method. 
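ambiguity() just below sorts the candidate rules by right-hand-side length and hands the ordered list to resolve(), whose default picks the first, i.e. shortest, entry; since the tree is walked top-down, that behaves like preferring a shift. Reduced to its essence:

    # Default disambiguation: of the rules that could complete here, take
    # the one with the shortest right-hand side.
    def resolve_shortest(rules):                  # rules: [(lhs, rhs), ...]
        return min(rules, key=lambda rule: len(rule[1]))

    rules = [("expr", ("expr", "+", "expr")), ("expr", ("NAME",))]
    assert resolve_shortest(rules) == ("expr", ("NAME",))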
+ # + sortlist = [] + name2index = {} + for i in range(len(rules)): + lhs, rhs = rule = rules[i] + name = self.rule2name[self.new2old[rule]] + sortlist.append((len(rhs), name)) + name2index[name] = i + sortlist.sort() + list = map(lambda (a,b): b, sortlist) + return rules[name2index[self.resolve(list)]] + + def resolve(self, list): + # + # Resolve ambiguity in favor of the shortest RHS. + # Since we walk the tree from the top down, this + # should effectively resolve in favor of a "shift". + # + return list[0] + +# +# GenericASTBuilder automagically constructs a concrete/abstract syntax tree +# for a given input. The extra argument is a class (not an instance!) +# which supports the "__setslice__" and "__len__" methods. +# +# XXX - silently overrides any user code in methods. +# + +class GenericASTBuilder(GenericParser): + def __init__(self, AST, start): + GenericParser.__init__(self, start) + self.AST = AST + + def preprocess(self, rule, func): + rebind = lambda lhs, self=self: \ + lambda args, lhs=lhs, self=self: \ + self.buildASTNode(args, lhs) + lhs, rhs = rule + return rule, rebind(lhs) + + def buildASTNode(self, args, lhs): + children = [] + for arg in args: + if isinstance(arg, self.AST): + children.append(arg) + else: + children.append(self.terminal(arg)) + return self.nonterminal(lhs, children) + + def terminal(self, token): return token + + def nonterminal(self, type, args): + rv = self.AST(type) + rv[:len(args)] = args + return rv + +# +# GenericASTTraversal is a Visitor pattern according to Design Patterns. For +# each node it attempts to invoke the method n_<node type>, falling +# back onto the default() method if the n_* can't be found. The preorder +# traversal also looks for an exit hook named n_<node type>_exit (no default +# routine is called if it's not found). To prematurely halt traversal +# of a subtree, call the prune() method -- this only makes sense for a +# preorder traversal. Node type is determined via the typestring() method. +# + +class GenericASTTraversalPruningException: + pass + +class GenericASTTraversal: + def __init__(self, ast): + self.ast = ast + + def typestring(self, node): + return node.type + + def prune(self): + raise GenericASTTraversalPruningException + + def preorder(self, node=None): + if node is None: + node = self.ast + + try: + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) + except GenericASTTraversalPruningException: + return + + for kid in node: + self.preorder(kid) + + name = name + '_exit' + if hasattr(self, name): + func = getattr(self, name) + func(node) + + def postorder(self, node=None): + if node is None: + node = self.ast + + for kid in node: + self.postorder(kid) + + name = 'n_' + self.typestring(node) + if hasattr(self, name): + func = getattr(self, name) + func(node) + else: + self.default(node) + + + def default(self, node): + pass + +# +# GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__" +# implemented. +# +# XXX - makes assumptions about how GenericParser walks the parse tree. 
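GenericASTTraversal above dispatches each node to an n_<node type> method and falls back to default(); exit hooks and prune() refine the preorder walk. A self-contained miniature of that dispatch (node and visitor classes invented for the sketch, without the exit/prune machinery):

    class Node:
        def __init__(self, type, kids=()):
            self.type, self.kids = type, kids

    class Visitor:
        def preorder(self, node):
            # n_<type> if defined, otherwise the default handler.
            getattr(self, "n_" + node.type, self.default)(node)
            for kid in node.kids:
                self.preorder(kid)
        def default(self, node):
            pass

    class NameCounter(Visitor):
        names = 0
        def n_name(self, node):          # called for every 'name' node
            self.names += 1

    v = NameCounter()
    v.preorder(Node("add", (Node("name"), Node("name"))))
    assert v.names == 2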
+# + +class GenericASTMatcher(GenericParser): + def __init__(self, start, ast): + GenericParser.__init__(self, start) + self.ast = ast + + def preprocess(self, rule, func): + rebind = lambda func, self=self: \ + lambda args, func=func, self=self: \ + self.foundMatch(args, func) + lhs, rhs = rule + rhslist = list(rhs) + rhslist.reverse() + + return (lhs, tuple(rhslist)), rebind(func) + + def foundMatch(self, args, func): + func(args[-1]) + return args[-1] + + def match_r(self, node): + self.input.insert(0, node) + children = 0 + + for child in node: + if children == 0: + self.input.insert(0, '(') + children = children + 1 + self.match_r(child) + + if children > 0: + self.input.insert(0, ')') + + def match(self, ast=None): + if ast is None: + ast = self.ast + self.input = [] + + self.match_r(ast) + self.parse(self.input) + + def resolve(self, list): + # + # Resolve ambiguity in favor of the longest RHS. + # + return list[-1] + +def _dump(tokens, sets, states): + for i in range(len(sets)): + print 'set', i + for item in sets[i]: + print '\t', item + for (lhs, rhs), pos in states[item[0]].items: + print '\t\t', lhs, '::=', + print string.join(rhs[:pos]), + print '.', + print string.join(rhs[pos:]) + if i < len(tokens): + print + print 'token', str(tokens[i]) + print diff --git a/Parser/token.c b/Parser/token.c deleted file mode 100644 index a489668..0000000 --- a/Parser/token.c +++ /dev/null @@ -1,243 +0,0 @@ -/* Auto-generated by Tools/scripts/generate_token.py */ - -#include "Python.h" -#include "token.h" - -/* Token names */ - -const char * const _PyParser_TokenNames[] = { - "ENDMARKER", - "NAME", - "NUMBER", - "STRING", - "NEWLINE", - "INDENT", - "DEDENT", - "LPAR", - "RPAR", - "LSQB", - "RSQB", - "COLON", - "COMMA", - "SEMI", - "PLUS", - "MINUS", - "STAR", - "SLASH", - "VBAR", - "AMPER", - "LESS", - "GREATER", - "EQUAL", - "DOT", - "PERCENT", - "LBRACE", - "RBRACE", - "EQEQUAL", - "NOTEQUAL", - "LESSEQUAL", - "GREATEREQUAL", - "TILDE", - "CIRCUMFLEX", - "LEFTSHIFT", - "RIGHTSHIFT", - "DOUBLESTAR", - "PLUSEQUAL", - "MINEQUAL", - "STAREQUAL", - "SLASHEQUAL", - "PERCENTEQUAL", - "AMPEREQUAL", - "VBAREQUAL", - "CIRCUMFLEXEQUAL", - "LEFTSHIFTEQUAL", - "RIGHTSHIFTEQUAL", - "DOUBLESTAREQUAL", - "DOUBLESLASH", - "DOUBLESLASHEQUAL", - "AT", - "ATEQUAL", - "RARROW", - "ELLIPSIS", - "COLONEQUAL", - "OP", - "AWAIT", - "ASYNC", - "TYPE_IGNORE", - "TYPE_COMMENT", - "<ERRORTOKEN>", - "<COMMENT>", - "<NL>", - "<ENCODING>", - "<N_TOKENS>", -}; - -/* Return the token corresponding to a single character */ - -int -PyToken_OneChar(int c1) -{ - switch (c1) { - case '%': return PERCENT; - case '&': return AMPER; - case '(': return LPAR; - case ')': return RPAR; - case '*': return STAR; - case '+': return PLUS; - case ',': return COMMA; - case '-': return MINUS; - case '.': return DOT; - case '/': return SLASH; - case ':': return COLON; - case ';': return SEMI; - case '<': return LESS; - case '=': return EQUAL; - case '>': return GREATER; - case '@': return AT; - case '[': return LSQB; - case ']': return RSQB; - case '^': return CIRCUMFLEX; - case '{': return LBRACE; - case '|': return VBAR; - case '}': return RBRACE; - case '~': return TILDE; - } - return OP; -} - -int -PyToken_TwoChars(int c1, int c2) -{ - switch (c1) { - case '!': - switch (c2) { - case '=': return NOTEQUAL; - } - break; - case '%': - switch (c2) { - case '=': return PERCENTEQUAL; - } - break; - case '&': - switch (c2) { - case '=': return AMPEREQUAL; - } - break; - case '*': - switch (c2) { - case '*': return DOUBLESTAR; - case '=': return 
STAREQUAL; - } - break; - case '+': - switch (c2) { - case '=': return PLUSEQUAL; - } - break; - case '-': - switch (c2) { - case '=': return MINEQUAL; - case '>': return RARROW; - } - break; - case '/': - switch (c2) { - case '/': return DOUBLESLASH; - case '=': return SLASHEQUAL; - } - break; - case ':': - switch (c2) { - case '=': return COLONEQUAL; - } - break; - case '<': - switch (c2) { - case '<': return LEFTSHIFT; - case '=': return LESSEQUAL; - case '>': return NOTEQUAL; - } - break; - case '=': - switch (c2) { - case '=': return EQEQUAL; - } - break; - case '>': - switch (c2) { - case '=': return GREATEREQUAL; - case '>': return RIGHTSHIFT; - } - break; - case '@': - switch (c2) { - case '=': return ATEQUAL; - } - break; - case '^': - switch (c2) { - case '=': return CIRCUMFLEXEQUAL; - } - break; - case '|': - switch (c2) { - case '=': return VBAREQUAL; - } - break; - } - return OP; -} - -int -PyToken_ThreeChars(int c1, int c2, int c3) -{ - switch (c1) { - case '*': - switch (c2) { - case '*': - switch (c3) { - case '=': return DOUBLESTAREQUAL; - } - break; - } - break; - case '.': - switch (c2) { - case '.': - switch (c3) { - case '.': return ELLIPSIS; - } - break; - } - break; - case '/': - switch (c2) { - case '/': - switch (c3) { - case '=': return DOUBLESLASHEQUAL; - } - break; - } - break; - case '<': - switch (c2) { - case '<': - switch (c3) { - case '=': return LEFTSHIFTEQUAL; - } - break; - } - break; - case '>': - switch (c2) { - case '>': - switch (c3) { - case '=': return RIGHTSHIFTEQUAL; - } - break; - } - break; - } - return OP; -} diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f84093d..8966661 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2,6 +2,7 @@ /* Tokenizer implementation */ #include "Python.h" +#include "pgenheaders.h" #include <ctype.h> #include <assert.h> @@ -9,29 +10,16 @@ #include "tokenizer.h" #include "errcode.h" +#ifndef PGEN #include "unicodeobject.h" -#include "bytesobject.h" +#include "stringobject.h" #include "fileobject.h" #include "codecs.h" #include "abstract.h" +#include "pydebug.h" +#endif /* PGEN */ -/* Alternate tab spacing */ -#define ALTTABSIZE 1 - -#define is_potential_identifier_start(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || c == '_'\ - || (c >= 128)) - -#define is_potential_identifier_char(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || (c >= '0' && c <= '9')\ - || c == '_'\ - || (c >= 128)) - -extern char *PyOS_Readline(FILE *, FILE *, const char *); +extern char *PyOS_Readline(FILE *, FILE *, char *); /* Return malloc'ed string including trailing \n; empty malloc'ed string for EOF; NULL if interrupted */ @@ -44,10 +32,65 @@ static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); static void tok_backup(struct tok_state *tok, int c); - -/* Spaces in this constant are treated as "zero or more spaces or tabs" when - tokenizing. 
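The PyToken_OneChar/TwoChars/ThreeChars ladders deleted above give the tokenizer longest-match operator recognition: try three characters, then two, then one, falling back to a generic OP. The same policy, table-driven, in a short Python sketch (operator table truncated):

    # Longest-match operator lookup in the spirit of PyToken_One/Two/ThreeChars.
    OPS = {"**=": "DOUBLESTAREQUAL", "**": "DOUBLESTAR", "*=": "STAREQUAL",
           "*": "STAR", "<=": "LESSEQUAL", "<": "LESS", "=": "EQUAL"}

    def match_op(text, pos):
        for width in (3, 2, 1):          # widest window first
            name = OPS.get(text[pos:pos + width])
            if name:
                return name, width
        return "OP", 1

    assert match_op("a **= b", 2) == ("DOUBLESTAREQUAL", 3)
    assert match_op("a * b", 2) == ("STAR", 1)

As the deleted file's header notes, the newer token.c was generated from Grammar/Tokens by Tools/scripts/generate_token.py rather than maintained by hand.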
*/ -static const char* type_comment_prefix = "# type: "; +/* Token names */ + +char *_PyParser_TokenNames[] = { + "ENDMARKER", + "NAME", + "NUMBER", + "STRING", + "NEWLINE", + "INDENT", + "DEDENT", + "LPAR", + "RPAR", + "LSQB", + "RSQB", + "COLON", + "COMMA", + "SEMI", + "PLUS", + "MINUS", + "STAR", + "SLASH", + "VBAR", + "AMPER", + "LESS", + "GREATER", + "EQUAL", + "DOT", + "PERCENT", + "BACKQUOTE", + "LBRACE", + "RBRACE", + "EQEQUAL", + "NOTEQUAL", + "LESSEQUAL", + "GREATEREQUAL", + "TILDE", + "CIRCUMFLEX", + "LEFTSHIFT", + "RIGHTSHIFT", + "DOUBLESTAR", + "PLUSEQUAL", + "MINEQUAL", + "STAREQUAL", + "SLASHEQUAL", + "PERCENTEQUAL", + "AMPEREQUAL", + "VBAREQUAL", + "CIRCUMFLEXEQUAL", + "LEFTSHIFTEQUAL", + "RIGHTSHIFTEQUAL", + "DOUBLESTAREQUAL", + "DOUBLESLASH", + "DOUBLESLASHEQUAL", + "AT", + /* This table must match the #defines in token.h! */ + "OP", + "<ERRORTOKEN>", + "<N_TOKENS>" +}; /* Create and initialize a new tok_state structure */ @@ -65,45 +108,61 @@ tok_new(void) tok->tabsize = TABSIZE; tok->indent = 0; tok->indstack[0] = 0; - tok->atbol = 1; tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; tok->lineno = 0; tok->level = 0; + tok->filename = NULL; + tok->altwarning = 0; + tok->alterror = 0; + tok->alttabsize = 1; tok->altindstack[0] = 0; - tok->decoding_state = STATE_INIT; + tok->decoding_state = 0; tok->decoding_erred = 0; tok->read_coding_spec = 0; - tok->enc = NULL; tok->encoding = NULL; tok->cont_line = 0; - tok->filename = NULL; +#ifndef PGEN tok->decoding_readline = NULL; tok->decoding_buffer = NULL; - tok->type_comments = 0; - - tok->async_hacks = 0; - tok->async_def = 0; - tok->async_def_indent = 0; - tok->async_def_nl = 0; - +#endif return tok; } static char * -new_string(const char *s, Py_ssize_t len, struct tok_state *tok) +new_string(const char *s, Py_ssize_t len) { char* result = (char *)PyMem_MALLOC(len + 1); - if (!result) { - tok->done = E_NOMEM; - return NULL; + if (result != NULL) { + memcpy(result, s, len); + result[len] = '\0'; } - memcpy(result, s, len); - result[len] = '\0'; return result; } +#ifdef PGEN + +static char * +decoding_fgets(char *s, int size, struct tok_state *tok) +{ + return fgets(s, size, tok->fp); +} + +static int +decoding_feof(struct tok_state *tok) +{ + return feof(tok->fp); +} + +static char * +decode_str(const char *str, int exec_input, struct tok_state *tok) +{ + return new_string(str, strlen(str)); +} + +#else /* PGEN */ + static char * error_ret(struct tok_state *tok) /* XXX */ { @@ -116,8 +175,8 @@ error_ret(struct tok_state *tok) /* XXX */ } -static const char * -get_normal_name(const char *s) /* for utf-8 and latin-1 */ +static char * +get_normal_name(char *s) /* for utf-8 and latin-1 */ { char buf[13]; int i; @@ -147,18 +206,17 @@ get_normal_name(const char *s) /* for utf-8 and latin-1 */ /* Return the coding spec in S, or NULL if none is found. */ -static int -get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) +static char * +get_coding_spec(const char *s, Py_ssize_t size) { Py_ssize_t i; - *spec = NULL; /* Coding spec must be in a comment, and that comment must be * the only statement on the source code line. 
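get_coding_spec above scans for a PEP 263 coding cookie by hand, under the constraint spelled out in its comment: the cookie's comment must be the only thing on its line, within the first two lines. In Python the whole check fits in a regular expression (the pattern is essentially the stdlib tokenize cookie regex; treat this as a sketch):

    import re

    CODING_RE = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)')

    def find_coding_spec(source):
        for line in source.splitlines()[:2]:
            match = CODING_RE.match(line)
            if match:
                return match.group(1)
            if line.strip() and not line.lstrip().startswith("#"):
                return None          # real code before any cookie: give up
        return None

    assert find_coding_spec("# -*- coding: latin-1 -*-\nx = 1\n") == "latin-1"
    assert find_coding_spec("x = 1\n# coding: utf-8\n") is None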
*/ for (i = 0; i < size - 6; i++) { if (s[i] == '#') break; if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') - return 1; + return NULL; } for (; i < size - 6; i++) { /* XXX inefficient search */ const char* t = s + i; @@ -177,23 +235,20 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t t++; if (begin < t) { - char* r = new_string(begin, t - begin, tok); - const char* q; + char* r = new_string(begin, t - begin); + char* q; if (!r) - return 0; + return NULL; q = get_normal_name(r); if (r != q) { PyMem_FREE(r); - r = new_string(q, strlen(q), tok); - if (!r) - return 0; + r = new_string(q, strlen(q)); } - *spec = r; - break; + return r; } } } - return 1; + return NULL; } /* Check whether the line contains a coding spec. If it does, @@ -205,7 +260,7 @@ static int check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, int set_readline(struct tok_state *, const char *)) { - char *cs; + char * cs; int r = 1; if (tok->cont_line) { @@ -213,8 +268,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, tok->read_coding_spec = 1; return 1; } - if (!get_coding_spec(line, &cs, size, tok)) - return 0; + cs = get_coding_spec(line, size); if (!cs) { Py_ssize_t i; for (i = 0; i < size; i++) { @@ -227,31 +281,40 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, break; } } - return 1; - } - tok->read_coding_spec = 1; - if (tok->encoding == NULL) { - assert(tok->decoding_state == STATE_RAW); - if (strcmp(cs, "utf-8") == 0) { - tok->encoding = cs; - } else { - r = set_readline(tok, cs); - if (r) { + } else { + tok->read_coding_spec = 1; + if (tok->encoding == NULL) { + assert(tok->decoding_state == 1); /* raw */ + if (strcmp(cs, "utf-8") == 0 || + strcmp(cs, "iso-8859-1") == 0) { tok->encoding = cs; - tok->decoding_state = STATE_NORMAL; - } - else { - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s", cs); + } else { +#ifdef Py_USING_UNICODE + r = set_readline(tok, cs); + if (r) { + tok->encoding = cs; + tok->decoding_state = -1; + } + else { + PyErr_Format(PyExc_SyntaxError, + "encoding problem: %s", cs); + PyMem_FREE(cs); + } +#else + /* Without Unicode support, we cannot + process the coding spec. Since there + won't be any Unicode literals, that + won't matter. 
*/ PyMem_FREE(cs); +#endif } + } else { /* then, compare cs with BOM */ + r = (strcmp(tok->encoding, cs) == 0); + if (!r) + PyErr_Format(PyExc_SyntaxError, + "encoding problem: %s with BOM", cs); + PyMem_FREE(cs); } - } else { /* then, compare cs with BOM */ - r = (strcmp(tok->encoding, cs) == 0); - if (!r) - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s with BOM", cs); - PyMem_FREE(cs); } return r; } @@ -268,7 +331,7 @@ check_bom(int get_char(struct tok_state *), { int ch1, ch2, ch3; ch1 = get_char(tok); - tok->decoding_state = STATE_RAW; + tok->decoding_state = 1; if (ch1 == EOF) { return 1; } else if (ch1 == 0xEF) { @@ -297,7 +360,7 @@ check_bom(int get_char(struct tok_state *), } if (!set_readline(tok, "utf-16-be")) return 0; - tok->decoding_state = STATE_NORMAL; + tok->decoding_state = -1; } else if (ch1 == 0xFF) { ch2 = get_char(tok); if (ch2 != 0xFE) { @@ -307,7 +370,7 @@ check_bom(int get_char(struct tok_state *), } if (!set_readline(tok, "utf-16-le")) return 0; - tok->decoding_state = STATE_NORMAL; + tok->decoding_state = -1; #endif } else { unget_char(ch1, tok); @@ -315,10 +378,7 @@ check_bom(int get_char(struct tok_state *), } if (tok->encoding != NULL) PyMem_FREE(tok->encoding); - tok->encoding = new_string("utf-8", 5, tok); - if (!tok->encoding) - return 0; - /* No need to set_readline: input is already utf-8 */ + tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ return 1; } @@ -329,7 +389,7 @@ check_bom(int get_char(struct tok_state *), 1) NULL: need to call tok->decoding_readline to get a new line 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and stored the result in tok->decoding_buffer - 3) PyByteArrayObject *: previous call to fp_readl did not have enough room + 3) PyStringObject *: previous call to fp_readl did not have enough room (in the s buffer) to copy entire contents of the line read by tok->decoding_readline. tok->decoding_buffer has the overflow. In this case, fp_readl is called in a loop (with an expanded buffer) @@ -340,62 +400,58 @@ check_bom(int get_char(struct tok_state *), static char * fp_readl(char *s, int size, struct tok_state *tok) { - PyObject* bufobj; - const char *buf; - Py_ssize_t buflen; +#ifndef Py_USING_UNICODE + /* In a non-Unicode built, this should never be called. 
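check_bom above peeks at the first bytes and, when it finds a byte order mark, strips it and fixes the decoding state before any line is read (the UTF-16 pair sits behind conditional compilation in the C code). The same sniffing in a few lines of Python:

    import codecs

    # The marks the C code knows about, and the codec each one implies.
    BOMS = [
        (codecs.BOM_UTF8, "utf-8"),
        (codecs.BOM_UTF16_BE, "utf-16-be"),
        (codecs.BOM_UTF16_LE, "utf-16-le"),
    ]

    def sniff_bom(data):
        for bom, name in BOMS:
            if data.startswith(bom):
                return name, data[len(bom):]   # strip the mark, like check_bom
        return None, data

    assert sniff_bom(b"\xef\xbb\xbfx = 1\n") == ("utf-8", b"x = 1\n")
    assert sniff_bom(b"x = 1\n") == (None, b"x = 1\n")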
*/ + Py_FatalError("fp_readl should not be called in this build."); + return NULL; /* Keep compiler happy (not reachable) */ +#else + PyObject* utf8 = NULL; + PyObject* buf = tok->decoding_buffer; + char *str; + Py_ssize_t utf8len; /* Ask for one less byte so we can terminate it */ assert(size > 0); size--; - if (tok->decoding_buffer) { - bufobj = tok->decoding_buffer; - Py_INCREF(bufobj); - } - else - { - bufobj = _PyObject_CallNoArg(tok->decoding_readline); - if (bufobj == NULL) - goto error; - } - if (PyUnicode_CheckExact(bufobj)) - { - buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen); - if (buf == NULL) { - goto error; + if (buf == NULL) { + buf = PyObject_CallObject(tok->decoding_readline, NULL); + if (buf == NULL) + return error_ret(tok); + if (!PyUnicode_Check(buf)) { + Py_DECREF(buf); + PyErr_SetString(PyExc_SyntaxError, + "codec did not return a unicode object"); + return error_ret(tok); } + } else { + tok->decoding_buffer = NULL; + if (PyString_CheckExact(buf)) + utf8 = buf; } - else - { - buf = PyByteArray_AsString(bufobj); - if (buf == NULL) { - goto error; - } - buflen = PyByteArray_GET_SIZE(bufobj); + if (utf8 == NULL) { + utf8 = PyUnicode_AsUTF8String(buf); + Py_DECREF(buf); + if (utf8 == NULL) + return error_ret(tok); } - - Py_XDECREF(tok->decoding_buffer); - if (buflen > size) { - /* Too many chars, the rest goes into tok->decoding_buffer */ - tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size, - buflen-size); - if (tok->decoding_buffer == NULL) - goto error; - buflen = size; + str = PyString_AsString(utf8); + utf8len = PyString_GET_SIZE(utf8); + if (utf8len > size) { + tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); + if (tok->decoding_buffer == NULL) { + Py_DECREF(utf8); + return error_ret(tok); + } + utf8len = size; } - else - tok->decoding_buffer = NULL; - - memcpy(s, buf, buflen); - s[buflen] = '\0'; - if (buflen == 0) /* EOF */ - s = NULL; - Py_DECREF(bufobj); + memcpy(s, str, utf8len); + s[utf8len] = '\0'; + Py_DECREF(utf8); + if (utf8len == 0) + return NULL; /* EOF */ return s; - -error: - Py_XDECREF(bufobj); - return error_ret(tok); +#endif } /* Set the readline function for TOK to a StreamReader's @@ -411,48 +467,24 @@ error: static int fp_setreadl(struct tok_state *tok, const char* enc) { - PyObject *readline, *io, *stream; - _Py_IDENTIFIER(open); - _Py_IDENTIFIER(readline); - int fd; - long pos; - - fd = fileno(tok->fp); - /* Due to buffering the file offset for fd can be different from the file - * position of tok->fp. If tok->fp was opened in text mode on Windows, - * its file position counts CRLF as one char and can't be directly mapped - * to the file offset for fd. Instead we step back one byte and read to - * the end of line.*/ - pos = ftell(tok->fp); - if (pos == -1 || - lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) { - PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); - return 0; - } + PyObject *reader, *stream, *readline; - io = PyImport_ImportModuleNoBlock("io"); - if (io == NULL) - return 0; - - stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO", - fd, "r", -1, enc, Py_None, Py_None, Py_False); - Py_DECREF(io); + /* XXX: constify filename argument. 
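fp_setreadl's job, in both versions of this diff, is to replace raw fgets with a decoding readline. In Python terms it amounts to wrapping the byte stream in a codec reader and keeping its readline (a sketch of the idea, not the C API):

    import codecs, io

    raw = io.BytesIO(b"x = 'caf\xe9'\n")                  # e-acute in latin-1
    readline = codecs.getreader("latin-1")(raw).readline  # decoding readline
    assert readline() == "x = 'caf\xe9'\n"                # now a decoded str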
*/ + stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); if (stream == NULL) return 0; - readline = _PyObject_GetAttrId(stream, &PyId_readline); + reader = PyCodec_StreamReader(enc, stream, NULL); Py_DECREF(stream); - if (readline == NULL) + if (reader == NULL) return 0; - Py_XSETREF(tok->decoding_readline, readline); - if (pos > 0) { - PyObject *bufobj = _PyObject_CallNoArg(readline); - if (bufobj == NULL) - return 0; - Py_DECREF(bufobj); - } + readline = PyObject_GetAttrString(reader, "readline"); + Py_DECREF(reader); + if (readline == NULL) + return 0; + tok->decoding_readline = readline; return 1; } @@ -468,34 +500,6 @@ static void fp_ungetc(int c, struct tok_state *tok) { ungetc(c, tok->fp); } -/* Check whether the characters at s start a valid - UTF-8 sequence. Return the number of characters forming - the sequence if yes, 0 if not. */ -static int valid_utf8(const unsigned char* s) -{ - int expected = 0; - int length; - if (*s < 0x80) - /* single-byte code */ - return 1; - if (*s < 0xc0) - /* following byte */ - return 0; - if (*s < 0xE0) - expected = 1; - else if (*s < 0xF0) - expected = 2; - else if (*s < 0xF8) - expected = 3; - else - return 0; - length = expected + 1; - for (; expected; expected--) - if (s[expected] < 0x80 || s[expected] >= 0xC0) - return 0; - return length; -} - /* Read a line of input from TOK. Determine encoding if necessary. */ @@ -505,12 +509,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok) char *line = NULL; int badchar = 0; for (;;) { - if (tok->decoding_state == STATE_NORMAL) { + if (tok->decoding_state < 0) { /* We already have a codec associated with this input. */ line = fp_readl(s, size, tok); break; - } else if (tok->decoding_state == STATE_RAW) { + } else if (tok->decoding_state > 0) { /* We want a 'raw' read. */ line = Py_UniversalNewlineFgets(s, size, tok->fp, NULL); @@ -521,7 +525,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok) reader functions from now on. */ if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) return error_ret(tok); - assert(tok->decoding_state != STATE_INIT); + assert(tok->decoding_state != 0); } } if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { @@ -529,40 +533,43 @@ decoding_fgets(char *s, int size, struct tok_state *tok) return error_ret(tok); } } - /* The default encoding is UTF-8, so make sure we don't have any - non-UTF-8 sequences in it. */ +#ifndef PGEN + /* The default encoding is ASCII, so make sure we don't have any + non-ASCII bytes in it. */ if (line && !tok->encoding) { unsigned char *c; - int length; - for (c = (unsigned char *)line; *c; c += length) - if (!(length = valid_utf8(c))) { + for (c = (unsigned char *)line; *c; c++) + if (*c > 127) { badchar = *c; break; } } if (badchar) { + char buf[500]; /* Need to add 1 to the line number, since this line has not been counted, yet. 
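The valid_utf8 helper removed above classifies a sequence by its lead byte and then verifies the continuation bytes. A direct Python port of its logic (like the original, it checks structure only and does not reject overlong forms):

    def valid_utf8(data, pos=0):
        """Length of the UTF-8 sequence starting at pos, or 0 if invalid."""
        lead = data[pos]
        if lead < 0x80:
            return 1                      # single-byte (ASCII) code
        if lead < 0xC0:
            return 0                      # stray continuation byte
        if lead < 0xE0:
            expected = 1
        elif lead < 0xF0:
            expected = 2
        elif lead < 0xF8:
            expected = 3
        else:
            return 0
        tail = data[pos + 1:pos + 1 + expected]
        if len(tail) != expected or any(not 0x80 <= b < 0xC0 for b in tail):
            return 0                      # wrong count or bad continuation byte
        return expected + 1

    assert valid_utf8(b"a") == 1
    assert valid_utf8(b"\xc3\xa9") == 2   # two-byte sequence
    assert valid_utf8(b"\xff") == 0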
*/ - PyErr_Format(PyExc_SyntaxError, - "Non-UTF-8 code starting with '\\x%.2x' " - "in file %U on line %i, " - "but no encoding declared; " - "see http://python.org/dev/peps/pep-0263/ for details", - badchar, tok->filename, tok->lineno + 1); + sprintf(buf, + "Non-ASCII character '\\x%.2x' " + "in file %.200s on line %i, " + "but no encoding declared; " + "see http://python.org/dev/peps/pep-0263/ for details", + badchar, tok->filename, tok->lineno + 1); + PyErr_SetString(PyExc_SyntaxError, buf); return error_ret(tok); } +#endif return line; } static int decoding_feof(struct tok_state *tok) { - if (tok->decoding_state != STATE_NORMAL) { + if (tok->decoding_state >= 0) { return feof(tok->fp); } else { PyObject* buf = tok->decoding_buffer; if (buf == NULL) { - buf = _PyObject_CallNoArg(tok->decoding_readline); + buf = PyObject_CallObject(tok->decoding_readline, NULL); if (buf == NULL) { error_ret(tok); return 1; @@ -601,6 +608,7 @@ buf_setreadl(struct tok_state *tok, const char* enc) { /* Return a UTF-8 encoding Python string object from the C byte string STR, which is encoded with ENC. */ +#ifdef Py_USING_UNICODE static PyObject * translate_into_utf8(const char* str, const char* enc) { PyObject *utf8; @@ -611,12 +619,12 @@ translate_into_utf8(const char* str, const char* enc) { Py_DECREF(buf); return utf8; } +#endif static char * translate_newlines(const char *s, int exec_input, struct tok_state *tok) { - int skip_next_lf = 0; - size_t needed_length = strlen(s) + 2, final_length; + int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length; char *buf, *current; char c = '\0'; buf = PyMem_MALLOC(needed_length); @@ -680,12 +688,14 @@ decode_str(const char *input, int single, struct tok_state *tok) return error_ret(tok); str = tok->str; /* string after BOM if any */ assert(str); +#ifdef Py_USING_UNICODE if (tok->enc != NULL) { utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); - str = PyBytes_AsString(utf8); + str = PyString_AsString(utf8); } +#endif for (s = str;; s++) { if (*s == '\0') break; else if (*s == '\n') { @@ -707,18 +717,22 @@ decode_str(const char *input, int single, struct tok_state *tok) return error_ret(tok); } } +#ifdef Py_USING_UNICODE if (tok->enc != NULL) { assert(utf8 == NULL); utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); - str = PyBytes_AS_STRING(utf8); + str = PyString_AsString(utf8); } +#endif assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str; } +#endif /* PGEN */ + /* Set up tokenizer for string */ struct tok_state * @@ -727,7 +741,7 @@ PyTokenizer_FromString(const char *str, int exec_input) struct tok_state *tok = tok_new(); if (tok == NULL) return NULL; - str = decode_str(str, exec_input, tok); + str = (char *)decode_str(str, exec_input, tok); if (str == NULL) { PyTokenizer_Free(tok); return NULL; @@ -738,38 +752,11 @@ PyTokenizer_FromString(const char *str, int exec_input) return tok; } -struct tok_state * -PyTokenizer_FromUTF8(const char *str, int exec_input) -{ - struct tok_state *tok = tok_new(); - if (tok == NULL) - return NULL; - tok->input = str = translate_newlines(str, exec_input, tok); - if (str == NULL) { - PyTokenizer_Free(tok); - return NULL; - } - tok->decoding_state = STATE_RAW; - tok->read_coding_spec = 1; - tok->enc = NULL; - tok->str = str; - tok->encoding = (char *)PyMem_MALLOC(6); - if (!tok->encoding) { - PyTokenizer_Free(tok); - return NULL; - } - strcpy(tok->encoding, "utf-8"); - - /* XXX: constify members. 
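translate_newlines, diffed above, canonicalizes line endings before tokenization so the rest of the tokenizer only ever sees \n. Its observable behavior, sketched (the C helper does this byte-wise with a skip_next_lf flag):

    def translate_newlines(s):
        # Collapse \r\n to \n and a lone \r to \n.
        return s.replace("\r\n", "\n").replace("\r", "\n")

    assert translate_newlines("a\r\nb\rc\n") == "a\nb\nc\n"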
*/ - tok->buf = tok->cur = tok->end = tok->inp = (char*)str; - return tok; -} /* Set up tokenizer for file */ struct tok_state * -PyTokenizer_FromFile(FILE *fp, const char* enc, - const char *ps1, const char *ps2) +PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) { struct tok_state *tok = tok_new(); if (tok == NULL) @@ -783,17 +770,6 @@ PyTokenizer_FromFile(FILE *fp, const char* enc, tok->fp = fp; tok->prompt = ps1; tok->nextprompt = ps2; - if (enc != NULL) { - /* Must copy encoding declaration since it - gets copied into the parse tree. */ - tok->encoding = PyMem_MALLOC(strlen(enc)+1); - if (!tok->encoding) { - PyTokenizer_Free(tok); - return NULL; - } - strcpy(tok->encoding, enc); - tok->decoding_state = STATE_NORMAL; - } return tok; } @@ -805,9 +781,10 @@ PyTokenizer_Free(struct tok_state *tok) { if (tok->encoding != NULL) PyMem_FREE(tok->encoding); +#ifndef PGEN Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); - Py_XDECREF(tok->filename); +#endif if (tok->fp != NULL && tok->buf != NULL) PyMem_FREE(tok->buf); if (tok->input) @@ -815,10 +792,74 @@ PyTokenizer_Free(struct tok_state *tok) PyMem_FREE(tok); } +#if !defined(PGEN) && defined(Py_USING_UNICODE) +static int +tok_stdin_decode(struct tok_state *tok, char **inp) +{ + PyObject *enc, *sysstdin, *decoded, *utf8; + const char *encoding; + char *converted; + + if (PySys_GetFile((char *)"stdin", NULL) != stdin) + return 0; + sysstdin = PySys_GetObject("stdin"); + if (sysstdin == NULL || !PyFile_Check(sysstdin)) + return 0; + + enc = ((PyFileObject *)sysstdin)->f_encoding; + if (enc == NULL || !PyString_Check(enc)) + return 0; + Py_INCREF(enc); + + encoding = PyString_AsString(enc); + decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); + if (decoded == NULL) + goto error_clear; + + utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); + Py_DECREF(decoded); + if (utf8 == NULL) + goto error_clear; + + assert(PyString_Check(utf8)); + converted = new_string(PyString_AS_STRING(utf8), + PyString_GET_SIZE(utf8)); + Py_DECREF(utf8); + if (converted == NULL) + goto error_nomem; + + PyMem_FREE(*inp); + *inp = converted; + if (tok->encoding != NULL) + PyMem_FREE(tok->encoding); + tok->encoding = new_string(encoding, strlen(encoding)); + if (tok->encoding == NULL) + goto error_nomem; + + Py_DECREF(enc); + return 0; + +error_nomem: + Py_DECREF(enc); + tok->done = E_NOMEM; + return -1; + +error_clear: + Py_DECREF(enc); + if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { + tok->done = E_ERROR; + return -1; + } + /* Fallback to iso-8859-1: for backward compatibility */ + PyErr_Clear(); + return 0; +} +#endif + /* Get next char, updating state; error code goes into tok->done */ static int -tok_nextc(struct tok_state *tok) +tok_nextc(register struct tok_state *tok) { for (;;) { if (tok->cur != tok->inp) { @@ -846,34 +887,6 @@ tok_nextc(struct tok_state *tok) } if (tok->prompt != NULL) { char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); - if (newtok != NULL) { - char *translated = translate_newlines(newtok, 0, tok); - PyMem_FREE(newtok); - if (translated == NULL) - return EOF; - newtok = translated; - } - if (tok->encoding && newtok && *newtok) { - /* Recode to UTF-8 */ - Py_ssize_t buflen; - const char* buf; - PyObject *u = translate_into_utf8(newtok, tok->encoding); - PyMem_FREE(newtok); - if (!u) { - tok->done = E_DECODE; - return EOF; - } - buflen = PyBytes_GET_SIZE(u); - buf = PyBytes_AS_STRING(u); - newtok = PyMem_MALLOC(buflen+1); - if (newtok == NULL) { - Py_DECREF(u); - tok->done = E_NOMEM; - 
return EOF; - } - strcpy(newtok, buf); - Py_DECREF(u); - } if (tok->nextprompt != NULL) tok->prompt = tok->nextprompt; if (newtok == NULL) @@ -882,6 +895,10 @@ tok_nextc(struct tok_state *tok) PyMem_FREE(newtok); tok->done = E_EOF; } +#if !defined(PGEN) && defined(Py_USING_UNICODE) + else if (tok_stdin_decode(tok, &newtok) != 0) + PyMem_FREE(newtok); +#endif else if (tok->start != NULL) { size_t start = tok->start - tok->buf; size_t oldlen = tok->cur - tok->buf; @@ -956,7 +973,6 @@ tok_nextc(struct tok_state *tok) while (!done) { Py_ssize_t curstart = tok->start == NULL ? -1 : tok->start - tok->buf; - Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; Py_ssize_t curvalid = tok->inp - tok->buf; Py_ssize_t newsize = curvalid + BUFSIZ; char *newbuf = tok->buf; @@ -969,7 +985,6 @@ tok_nextc(struct tok_state *tok) } tok->buf = newbuf; tok->cur = tok->buf + cur; - tok->multi_line_start = tok->buf + cur_multi_line_start; tok->line_start = tok->cur; tok->inp = tok->buf + curvalid; tok->end = tok->buf + newsize; @@ -985,8 +1000,7 @@ tok_nextc(struct tok_state *tok) return EOF; /* Last line does not end in \n, fake one */ - if (tok->inp[-1] != '\n') - strcpy(tok->inp, "\n"); + strcpy(tok->inp, "\n"); } tok->inp = strchr(tok->inp, '\0'); done = tok->inp[-1] == '\n'; @@ -1018,7 +1032,7 @@ tok_nextc(struct tok_state *tok) /* Back-up one character */ static void -tok_backup(struct tok_state *tok, int c) +tok_backup(register struct tok_state *tok, register int c) { if (c != EOF) { if (--tok->cur < tok->buf) @@ -1029,88 +1043,185 @@ tok_backup(struct tok_state *tok, int c) } -static int -syntaxerror(struct tok_state *tok, const char *format, ...) -{ - va_list vargs; -#ifdef HAVE_STDARG_PROTOTYPES - va_start(vargs, format); -#else - va_start(vargs); -#endif - PyErr_FormatV(PyExc_SyntaxError, format, vargs); - va_end(vargs); - PyErr_SyntaxLocationObject(tok->filename, - tok->lineno, - (int)(tok->cur - tok->line_start)); - tok->done = E_ERROR; - return ERRORTOKEN; -} +/* Return the token corresponding to a single character */ -static int -indenterror(struct tok_state *tok) +int +PyToken_OneChar(int c) { - tok->done = E_TABSPACE; - tok->cur = tok->inp; - return ERRORTOKEN; + switch (c) { + case '(': return LPAR; + case ')': return RPAR; + case '[': return LSQB; + case ']': return RSQB; + case ':': return COLON; + case ',': return COMMA; + case ';': return SEMI; + case '+': return PLUS; + case '-': return MINUS; + case '*': return STAR; + case '/': return SLASH; + case '|': return VBAR; + case '&': return AMPER; + case '<': return LESS; + case '>': return GREATER; + case '=': return EQUAL; + case '.': return DOT; + case '%': return PERCENT; + case '`': return BACKQUOTE; + case '{': return LBRACE; + case '}': return RBRACE; + case '^': return CIRCUMFLEX; + case '~': return TILDE; + case '@': return AT; + default: return OP; + } } -/* Verify that the identifier follows PEP 3131. - All identifier strings are guaranteed to be "ready" unicode objects. 
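verify_identifier, removed in this diff, decodes the candidate token and asks PyUnicode_IsIdentifier whether it satisfies the PEP 3131 rules. The same predicate is exposed in Python as str.isidentifier():

    # str.isidentifier() applies the PEP 3131 check the C code relies on.
    assert "über".isidentifier()
    assert not "1abc".isidentifier()
    assert not "a-b".isidentifier()

    # Keywords still pass the identifier test; a tokenizer must filter
    # them in a separate step.
    import keyword
    assert "def".isidentifier() and keyword.iskeyword("def")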
- */ -static int -verify_identifier(struct tok_state *tok) + +int +PyToken_TwoChars(int c1, int c2) { - PyObject *s; - int result; - if (tok->decoding_erred) - return 0; - s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); - if (s == NULL) { - if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { - tok->done = E_ERROR; + switch (c1) { + case '=': + switch (c2) { + case '=': return EQEQUAL; } - return 0; + break; + case '!': + switch (c2) { + case '=': return NOTEQUAL; + } + break; + case '<': + switch (c2) { + case '>': return NOTEQUAL; + case '=': return LESSEQUAL; + case '<': return LEFTSHIFT; + } + break; + case '>': + switch (c2) { + case '=': return GREATEREQUAL; + case '>': return RIGHTSHIFT; + } + break; + case '+': + switch (c2) { + case '=': return PLUSEQUAL; + } + break; + case '-': + switch (c2) { + case '=': return MINEQUAL; + } + break; + case '*': + switch (c2) { + case '*': return DOUBLESTAR; + case '=': return STAREQUAL; + } + break; + case '/': + switch (c2) { + case '/': return DOUBLESLASH; + case '=': return SLASHEQUAL; + } + break; + case '|': + switch (c2) { + case '=': return VBAREQUAL; + } + break; + case '%': + switch (c2) { + case '=': return PERCENTEQUAL; + } + break; + case '&': + switch (c2) { + case '=': return AMPEREQUAL; + } + break; + case '^': + switch (c2) { + case '=': return CIRCUMFLEXEQUAL; + } + break; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) - tok->done = E_IDENTIFIER; - return result; + return OP; } -static int -tok_decimal_tail(struct tok_state *tok) +int +PyToken_ThreeChars(int c1, int c2, int c3) { - int c; - - while (1) { - do { - c = tok_nextc(tok); - } while (isdigit(c)); - if (c != '_') { + switch (c1) { + case '<': + switch (c2) { + case '<': + switch (c3) { + case '=': + return LEFTSHIFTEQUAL; + } break; } - c = tok_nextc(tok); - if (!isdigit(c)) { - tok_backup(tok, c); - syntaxerror(tok, "invalid decimal literal"); - return 0; + break; + case '>': + switch (c2) { + case '>': + switch (c3) { + case '=': + return RIGHTSHIFTEQUAL; + } + break; + } + break; + case '*': + switch (c2) { + case '*': + switch (c3) { + case '=': + return DOUBLESTAREQUAL; + } + break; + } + break; + case '/': + switch (c2) { + case '/': + switch (c3) { + case '=': + return DOUBLESLASHEQUAL; + } + break; } + break; + } + return OP; +} + +static int +indenterror(struct tok_state *tok) +{ + if (tok->alterror) { + tok->done = E_TABSPACE; + tok->cur = tok->inp; + return 1; + } + if (tok->altwarning) { + PySys_WriteStderr("%s: inconsistent use of tabs and spaces " + "in indentation\n", tok->filename); + tok->altwarning = 0; } - return c; + return 0; } /* Get next token, after space stripping etc. 
*/ static int -tok_get(struct tok_state *tok, char **p_start, char **p_end) +tok_get(register struct tok_state *tok, char **p_start, char **p_end) { - int c; - int blankline, nonascii; + register int c; + int blankline; *p_start = *p_end = NULL; nextline: @@ -1119,24 +1230,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Get indentation level */ if (tok->atbol) { - int col = 0; - int altcol = 0; + register int col = 0; + register int altcol = 0; tok->atbol = 0; for (;;) { c = tok_nextc(tok); - if (c == ' ') { + if (c == ' ') col++, altcol++; - } else if (c == '\t') { - col = (col / tok->tabsize + 1) * tok->tabsize; - altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; + col = (col/tok->tabsize + 1) * tok->tabsize; + altcol = (altcol/tok->alttabsize + 1) + * tok->alttabsize; } - else if (c == '\014') {/* Control-L (formfeed) */ + else if (c == '\014') /* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ - } - else { + else break; - } } tok_backup(tok, c); if (c == '#' || c == '\n') { @@ -1145,18 +1254,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ - if (col == 0 && c == '\n' && tok->prompt != NULL) { + if (col == 0 && c == '\n' && tok->prompt != NULL) blankline = 0; /* Let it through */ - } - else if (tok->prompt != NULL && tok->lineno == 1) { - /* In interactive mode, if the first line contains - only spaces and/or a comment, let it through. */ - blankline = 0; - col = altcol = 0; - } - else { + else blankline = 1; /* Ignore completely */ - } /* We can't jump back right here since we still may need to skip to the end of a comment */ } @@ -1164,7 +1265,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } } else if (col > tok->indstack[tok->indent]) { @@ -1175,7 +1277,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol <= tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1194,7 +1297,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } } } @@ -1214,31 +1318,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } } - /* Peek ahead at the next character */ - c = tok_nextc(tok); - tok_backup(tok, c); - /* Check if we are closing an async function */ - if (tok->async_def - && !blankline - /* Due to some implementation artifacts of type comments, - * a TYPE_COMMENT at the start of a function won't set an - * indentation level and it will produce a NEWLINE after it. - * To avoid spuriously ending an async function due to this, - * wait until we have some non-newline char in front of us. */ - && c != '\n' - && tok->level == 0 - /* There was a NEWLINE after ASYNC DEF, - so we're past the signature. 
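The indentation code above measures each line twice: col with the configured tab size and altcol with the alternate tab size (ALTTABSIZE, or tok->alttabsize in the older version). If two lines compare differently under the two rulers, the mix of tabs and spaces is ambiguous and indenterror() fires. A compact model:

    def widths(prefix, tabsize=8, alttabsize=1):
        col = altcol = 0
        for ch in prefix:
            if ch == ' ':
                col += 1; altcol += 1
            elif ch == '\t':
                col = (col // tabsize + 1) * tabsize
                altcol = (altcol // alttabsize + 1) * alttabsize
        return col, altcol

    # One tab vs. eight spaces: equal under tabsize=8, different under the
    # alternate ruler -- exactly the inconsistency indenterror() reports.
    assert widths("\t") == (8, 1)
    assert widths(" " * 8) == (8, 8)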
*/ - && tok->async_def_nl - /* Current indentation level is less than where - the async function was defined */ - && tok->async_def_indent >= tok->indent) - { - tok->async_def = 0; - tok->async_def_indent = 0; - tok->async_def_nl = 0; - } - again: tok->start = NULL; /* Skip spaces */ @@ -1249,63 +1328,40 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Set start of current token */ tok->start = tok->cur - 1; - /* Skip comment, unless it's a type comment */ + /* Skip comment, while looking for tab-setting magic */ if (c == '#') { - const char *prefix, *p, *type_start; - - while (c != EOF && c != '\n') { - c = tok_nextc(tok); - } - - if (tok->type_comments) { - p = tok->start; - prefix = type_comment_prefix; - while (*prefix && p < tok->cur) { - if (*prefix == ' ') { - while (*p == ' ' || *p == '\t') { - p++; - } - } else if (*prefix == *p) { - p++; - } else { - break; - } - - prefix++; - } - - /* This is a type comment if we matched all of type_comment_prefix. */ - if (!*prefix) { - int is_type_ignore = 1; - const char *ignore_end = p + 6; - tok_backup(tok, c); /* don't eat the newline or EOF */ - - type_start = p; - - /* A TYPE_IGNORE is "type: ignore" followed by the end of the token - * or anything ASCII and non-alphanumeric. */ - is_type_ignore = ( - tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 - && !(tok->cur > ignore_end - && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); - - if (is_type_ignore) { - *p_start = (char *) ignore_end; - *p_end = tok->cur; - - /* If this type ignore is the only thing on the line, consume the newline also. */ - if (blankline) { - tok_nextc(tok); - tok->atbol = 1; - } - return TYPE_IGNORE; - } else { - *p_start = (char *) type_start; /* after type_comment_prefix */ - *p_end = tok->cur; - return TYPE_COMMENT; + static char *tabforms[] = { + "tab-width:", /* Emacs */ + ":tabstop=", /* vim, full form */ + ":ts=", /* vim, abbreviated form */ + "set tabsize=", /* will vi never die? */ + /* more templates can be added here to support other editors */ + }; + char cbuf[80]; + char *tp, **cp; + tp = cbuf; + do { + *tp++ = c = tok_nextc(tok); + } while (c != EOF && c != '\n' && + (size_t)(tp - cbuf + 1) < sizeof(cbuf)); + *tp = '\0'; + for (cp = tabforms; + cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); + cp++) { + if ((tp = strstr(cbuf, *cp))) { + int newsize = atoi(tp + strlen(*cp)); + + if (newsize >= 1 && newsize <= 40) { + tok->tabsize = newsize; + if (Py_VerboseFlag) + PySys_WriteStderr( + "Tab size set to %d\n", + newsize); } } } + while (c != EOF && c != '\n') + c = tok_nextc(tok); } /* Check for EOF and errors now */ @@ -1314,108 +1370,49 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } /* Identifier (most frequent token!) */ - nonascii = 0; - if (is_potential_identifier_start(c)) { - /* Process the various legal combinations of b"", r"", u"", and f"". */ - int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; - while (1) { - if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) - saw_b = 1; - /* Since this is a backwards compatibility support literal we don't - want to support it in arbitrary order like byte literals. 
*/ - else if (!(saw_b || saw_u || saw_r || saw_f) - && (c == 'u'|| c == 'U')) { - saw_u = 1; - } - /* ur"" and ru"" are not supported */ - else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { - saw_r = 1; - } - else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { - saw_f = 1; - } - else { - break; - } + if (Py_ISALPHA(c) || c == '_') { + /* Process r"", u"" and ur"" */ + switch (c) { + case 'b': + case 'B': c = tok_nextc(tok); - if (c == '"' || c == '\'') { + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') goto letter_quote; - } + break; + case 'r': + case 'R': + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; + case 'u': + case 'U': + c = tok_nextc(tok); + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; } - while (is_potential_identifier_char(c)) { - if (c >= 128) { - nonascii = 1; - } + while (c != EOF && (Py_ISALNUM(c) || c == '_')) { c = tok_nextc(tok); } tok_backup(tok, c); - if (nonascii && !verify_identifier(tok)) { - return ERRORTOKEN; - } *p_start = tok->start; *p_end = tok->cur; - - /* async/await parsing block. */ - if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { - /* May be an 'async' or 'await' token. For Python 3.7 or - later we recognize them unconditionally. For Python - 3.5 or 3.6 we recognize 'async' in front of 'def', and - either one inside of 'async def'. (Technically we - shouldn't recognize these at all for 3.4 or earlier, - but there's no *valid* Python 3.4 code that would be - rejected, and async functions will be rejected in a - later phase.) */ - if (!tok->async_hacks || tok->async_def) { - /* Always recognize the keywords. */ - if (memcmp(tok->start, "async", 5) == 0) { - return ASYNC; - } - if (memcmp(tok->start, "await", 5) == 0) { - return AWAIT; - } - } - else if (memcmp(tok->start, "async", 5) == 0) { - /* The current token is 'async'. - Look ahead one token to see if that is 'def'. */ - - struct tok_state ahead_tok; - char *ahead_tok_start = NULL, *ahead_tok_end = NULL; - int ahead_tok_kind; - - memcpy(&ahead_tok, tok, sizeof(ahead_tok)); - ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, - &ahead_tok_end); - - if (ahead_tok_kind == NAME - && ahead_tok.cur - ahead_tok.start == 3 - && memcmp(ahead_tok.start, "def", 3) == 0) - { - /* The next token is going to be 'def', so instead of - returning a plain NAME token, return ASYNC. */ - tok->async_def_indent = tok->indent; - tok->async_def = 1; - return ASYNC; - } - } - } - return NAME; } /* Newline */ if (c == '\n') { tok->atbol = 1; - if (blankline || tok->level > 0) { + if (blankline || tok->level > 0) goto nextline; - } *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; - if (tok->async_def) { - /* We're somewhere inside an 'async def' function, and - we've encountered a NEWLINE after its signature. 
*/ - tok->async_def_nl = 1; - } return NEWLINE; } @@ -1424,24 +1421,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) c = tok_nextc(tok); if (isdigit(c)) { goto fraction; - } else if (c == '.') { - c = tok_nextc(tok); - if (c == '.') { - *p_start = tok->start; - *p_end = tok->cur; - return ELLIPSIS; - } - else { - tok_backup(tok, c); - } - tok_backup(tok, '.'); } else { tok_backup(tok, c); + *p_start = tok->start; + *p_end = tok->cur; + return DOT; } - *p_start = tok->start; - *p_end = tok->cur; - return DOT; } /* Number */ @@ -1449,136 +1435,94 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (c == '0') { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); + if (c == '.') + goto fraction; +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') + goto imaginary; +#endif if (c == 'x' || c == 'X') { + /* Hex */ c = tok_nextc(tok); + if (!isxdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } do { - if (c == '_') { - c = tok_nextc(tok); - } - if (!isxdigit(c)) { - tok_backup(tok, c); - return syntaxerror(tok, "invalid hexadecimal literal"); - } - do { - c = tok_nextc(tok); - } while (isxdigit(c)); - } while (c == '_'); + c = tok_nextc(tok); + } while (isxdigit(c)); } else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); - do { - if (c == '_') { - c = tok_nextc(tok); - } - if (c < '0' || c >= '8') { - tok_backup(tok, c); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); - } - else { - return syntaxerror(tok, "invalid octal literal"); - } - } - do { - c = tok_nextc(tok); - } while ('0' <= c && c < '8'); - } while (c == '_'); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + if (c < '0' || c >= '8') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; } + do { + c = tok_nextc(tok); + } while ('0' <= c && c < '8'); } else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); - do { - if (c == '_') { - c = tok_nextc(tok); - } - if (c != '0' && c != '1') { - tok_backup(tok, c); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); - } - else { - return syntaxerror(tok, "invalid binary literal"); - } - } - do { - c = tok_nextc(tok); - } while (c == '0' || c == '1'); - } while (c == '_'); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + if (c != '0' && c != '1') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; } + do { + c = tok_nextc(tok); + } while (c == '0' || c == '1'); } else { - int nonzero = 0; - /* maybe old-style octal; c is first char of it */ - /* in any case, allow '0' as a literal */ - while (1) { - if (c == '_') { - c = tok_nextc(tok); - if (!isdigit(c)) { - tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); - } - } - if (c != '0') { - break; - } + int found_decimal = 0; + /* Octal; c is first char of it */ + /* There's no 'isoctdigit' macro, sigh */ + while ('0' <= c && c < '8') { c = tok_nextc(tok); } if (isdigit(c)) { - nonzero = 1; - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } + found_decimal = 1; + do { + c = tok_nextc(tok); + } while (isdigit(c)); } - if (c == '.') { - c = tok_nextc(tok); + if (c == '.') goto fraction; - } - else if (c == 'e' || c == 'E') { + else if (c == 'e' || c == 'E') goto exponent; - } - else if (c == 'j' || c == 'J') { +#ifndef WITHOUT_COMPLEX + else if (c == 'j' || c == 'J') goto imaginary; - } - else if (nonzero) { - /* 
Old-style octal: now disallowed. */ +#endif + else if (found_decimal) { + tok->done = E_TOKEN; tok_backup(tok, c); - return syntaxerror(tok, - "leading zeros in decimal integer " - "literals are not permitted; " - "use an 0o prefix for octal integers"); + return ERRORTOKEN; } } + if (c == 'l' || c == 'L') + c = tok_nextc(tok); } else { /* Decimal */ - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } - { + do { + c = tok_nextc(tok); + } while (isdigit(c)); + if (c == 'l' || c == 'L') + c = tok_nextc(tok); + else { /* Accept floating point numbers. */ if (c == '.') { - c = tok_nextc(tok); fraction: /* Fraction */ - if (isdigit(c)) { - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } - } + do { + c = tok_nextc(tok); + } while (isdigit(c)); } if (c == 'e' || c == 'E') { int e; @@ -1589,8 +1533,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (c == '+' || c == '-') { c = tok_nextc(tok); if (!isdigit(c)) { + tok->done = E_TOKEN; tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return ERRORTOKEN; } } else if (!isdigit(c)) { tok_backup(tok, c); @@ -1599,16 +1544,16 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_end = tok->cur; return NUMBER; } - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } + do { + c = tok_nextc(tok); + } while (isdigit(c)); } - if (c == 'j' || c == 'J') { +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') /* Imaginary part */ imaginary: c = tok_nextc(tok); - } +#endif } } tok_backup(tok, c); @@ -1620,61 +1565,55 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) letter_quote: /* String */ if (c == '\'' || c == '"') { + Py_ssize_t quote2 = tok->cur - tok->start + 1; int quote = c; - int quote_size = 1; /* 1 or 3 */ - int end_quote_size = 0; - - /* Nodes of type STRING, especially multi line strings - must be handled differently in order to get both - the starting line number and the column offset right. - (cf. issue 16806) */ - tok->first_lineno = tok->lineno; - tok->multi_line_start = tok->line_start; - - /* Find the quote size and start of string */ - c = tok_nextc(tok); - if (c == quote) { - c = tok_nextc(tok); - if (c == quote) { - quote_size = 3; - } - else { - end_quote_size = 1; /* empty string found */ - } - } - if (c != quote) { - tok_backup(tok, c); - } - - /* Get rest of string */ - while (end_quote_size != quote_size) { + int triple = 0; + int tripcount = 0; + for (;;) { c = tok_nextc(tok); - if (c == EOF) { - if (quote_size == 3) { - tok->done = E_EOFS; - } - else { + if (c == '\n') { + if (!triple) { tok->done = E_EOLS; + tok_backup(tok, c); + return ERRORTOKEN; } - tok->cur = tok->inp; - return ERRORTOKEN; + tripcount = 0; + tok->cont_line = 1; /* multiline string. 
*/ } - if (quote_size == 1 && c == '\n') { - tok->done = E_EOLS; + else if (c == EOF) { + if (triple) + tok->done = E_EOFS; + else + tok->done = E_EOLS; tok->cur = tok->inp; return ERRORTOKEN; } - if (c == quote) { - end_quote_size += 1; + else if (c == quote) { + tripcount++; + if (tok->cur - tok->start == quote2) { + c = tok_nextc(tok); + if (c == quote) { + triple = 1; + tripcount = 0; + continue; + } + tok_backup(tok, c); + } + if (!triple || tripcount == 3) + break; } - else { - end_quote_size = 0; - if (c == '\\') { - tok_nextc(tok); /* skip escaped char */ + else if (c == '\\') { + tripcount = 0; + c = tok_nextc(tok); + if (c == EOF) { + tok->done = E_EOLS; + tok->cur = tok->inp; + return ERRORTOKEN; } } + else + tripcount = 0; } - *p_start = tok->start; *p_end = tok->cur; return STRING; @@ -1688,14 +1627,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->cur = tok->inp; return ERRORTOKEN; } - c = tok_nextc(tok); - if (c == EOF) { - tok->done = E_EOF; - tok->cur = tok->inp; - return ERRORTOKEN; - } else { - tok_backup(tok, c); - } tok->cont_line = 1; goto again; /* Read next line */ } @@ -1704,13 +1635,24 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) { int c2 = tok_nextc(tok); int token = PyToken_TwoChars(c, c2); +#ifndef PGEN + if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') { + if (PyErr_WarnExplicit(PyExc_DeprecationWarning, + "<> not supported in 3.x; use !=", + tok->filename, tok->lineno, + NULL, NULL)) { + tok->done = E_ERROR; + tok->cur = tok->inp; + return ERRORTOKEN; + } + } +#endif if (token != OP) { int c3 = tok_nextc(tok); int token3 = PyToken_ThreeChars(c, c2, c3); if (token3 != OP) { token = token3; - } - else { + } else { tok_backup(tok, c3); } *p_start = tok->start; @@ -1725,38 +1667,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) case '(': case '[': case '{': - if (tok->level >= MAXLEVEL) { - return syntaxerror(tok, "too many nested parentheses"); - } - tok->parenstack[tok->level] = c; - tok->parenlinenostack[tok->level] = tok->lineno; tok->level++; break; case ')': case ']': case '}': - if (!tok->level) { - return syntaxerror(tok, "unmatched '%c'", c); - } tok->level--; - int opening = tok->parenstack[tok->level]; - if (!((opening == '(' && c == ')') || - (opening == '[' && c == ']') || - (opening == '{' && c == '}'))) - { - if (tok->parenlinenostack[tok->level] != tok->lineno) { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level]); - } - else { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c'", - c, opening); - } - } break; } @@ -1770,6 +1686,11 @@ int PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) { int result = tok_get(tok, p_start, p_end); + if (tok->fp && ferror(tok->fp)) { + clearerr(tok->fp); + result = ERRORTOKEN; + tok->done = E_IO; + } if (tok->decoding_erred) { result = ERRORTOKEN; tok->done = E_DECODE; @@ -1777,67 +1698,67 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) return result; } -/* Get the encoding of a Python file. Check for the coding cookie and check if - the file starts with a BOM. - - PyTokenizer_FindEncodingFilename() returns NULL when it can't find the - encoding in the first or second line of the file (in which case the encoding - should be assumed to be UTF-8). - - The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed - by the caller. 
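*/

With the ferror() check added to PyTokenizer_Get() above, stdio read failures now surface through the same channel as decoding failures: the caller receives ERRORTOKEN and finds the specific cause in tok->done (E_IO here, E_DECODE for codec errors). A minimal driver sketch, using only the entry points declared in tokenizer.h below; dump_tokens() is a hypothetical helper, and _PyParser_TokenNames comes from token.h:

    #include <stdio.h>
    #include "tokenizer.h"
    #include "errcode.h"

    static int
    dump_tokens(const char *src)
    {
        char *a = NULL, *b = NULL;
        int type;
        struct tok_state *tok = PyTokenizer_FromString(src, 0);

        if (tok == NULL)
            return -1;
        for (;;) {
            type = PyTokenizer_Get(tok, &a, &b);
            if (type == ERRORTOKEN) {
                int err = tok->done;    /* E_IO, E_DECODE, E_TOKEN, ... */
                PyTokenizer_Free(tok);
                return err;
            }
            if (a != NULL && b != NULL)   /* [a, b) points into tok's buffer */
                printf("%-12s %.*s\n", _PyParser_TokenNames[type],
                       (int)(b - a), a);
            if (type == ENDMARKER)
                break;
        }
        PyTokenizer_Free(tok);
        return 0;
    }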
+/* This function is only called from parsetok. However, it cannot live
+   there, as it must be empty for PGEN, and we can check for PGEN only
+   in this file. */
 
-char *
-PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
+#if defined(PGEN) || !defined(Py_USING_UNICODE)
+char*
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
 {
-    struct tok_state *tok;
-    FILE *fp;
-    char *p_start =NULL , *p_end =NULL , *encoding = NULL;
-
-    fd = _Py_dup(fd);
-    if (fd < 0) {
-        return NULL;
-    }
-
-    fp = fdopen(fd, "r");
-    if (fp == NULL) {
-        return NULL;
-    }
-    tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
-    if (tok == NULL) {
-        fclose(fp);
-        return NULL;
-    }
-    if (filename != NULL) {
-        Py_INCREF(filename);
-        tok->filename = filename;
-    }
-    else {
-        tok->filename = PyUnicode_FromString("<string>");
-        if (tok->filename == NULL) {
-            fclose(fp);
-            PyTokenizer_Free(tok);
-            return encoding;
-        }
-    }
-    while (tok->lineno < 2 && tok->done == E_OK) {
-        PyTokenizer_Get(tok, &p_start, &p_end);
+    return NULL;
+}
+#else
+#ifdef Py_USING_UNICODE
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {
+    PyObject *ret = NULL;
+    PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
+    if (unicode_text) {
+        ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
+        Py_DECREF(unicode_text);
     }
-    fclose(fp);
-    if (tok->encoding) {
-        encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
-        if (encoding)
-            strcpy(encoding, tok->encoding);
+    if (!ret) {
+        PyErr_Clear();
     }
-    PyTokenizer_Free(tok);
-    return encoding;
+    return ret;
 }
-
 char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
 {
-    return PyTokenizer_FindEncodingFilename(fd, NULL);
+    char *text = NULL;
+    if (tok->encoding) {
+        /* convert source to original encoding */
+        PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+        if (lineobj != NULL) {
+            int linelen = PyString_Size(lineobj);
+            const char *line = PyString_AsString(lineobj);
+            text = PyObject_MALLOC(linelen + 1);
+            if (text != NULL && line != NULL) {
+                if (linelen)
+                    strncpy(text, line, linelen);
+                text[linelen] = '\0';
+            }
+            Py_DECREF(lineobj);
+
+            /* adjust error offset */
+            if (*offset > 1) {
+                PyObject *offsetobj = dec_utf8(tok->encoding,
+                                               tok->buf, *offset-1);
+                if (offsetobj) {
+                    *offset = PyString_Size(offsetobj) + 1;
+                    Py_DECREF(offsetobj);
+                }
+            }
+
+        }
+    }
+    return text;
+}
+#endif /* defined(Py_USING_UNICODE) */
+#endif
+
 #ifdef Py_DEBUG
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 92669bf..f15e252 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -11,13 +11,6 @@ extern "C" {
 #include "token.h" /* For token types */
 
 #define MAXINDENT 100 /* Max indentation level */
-#define MAXLEVEL 200 /* Max parentheses level */
-
-enum decoding_state {
-    STATE_INIT,
-    STATE_RAW,
-    STATE_NORMAL /* have a codec associated with input */
-};
 
 /* Tokenizer state */
 struct tok_state {
@@ -36,51 +29,40 @@ struct tok_state {
     int indstack[MAXINDENT];    /* Stack of indents */
     int atbol;                  /* Nonzero if at begin of new line */
     int pendin;                 /* Pending indents (if > 0) or dedents (if < 0) */
-    const char *prompt, *nextprompt; /* For interactive prompting */
+    char *prompt, *nextprompt;  /* For interactive prompting */
     int lineno;                 /* Current line number */
-    int first_lineno;           /* First line of a single line or multi line string
-                                   expression (cf. issue 16806) */
     int level;                  /* () [] {} Parentheses nesting level */
                                 /* Used to allow free continuations inside them */
-    char parenstack[MAXLEVEL];
-    int parenlinenostack[MAXLEVEL];
-    PyObject *filename;
     /* Stuff for checking on different tab sizes */
+    const char *filename;       /* For error messages */
+    int altwarning;             /* Issue warning if alternate tabs don't match */
+    int alterror;               /* Issue error if alternate tabs don't match */
+    int alttabsize;             /* Alternate tab spacing */
     int altindstack[MAXINDENT]; /* Stack of alternate indents */
 
     /* Stuff for PEP 0263 */
-    enum decoding_state decoding_state;
+    int decoding_state;         /* -1:decoding, 0:init, 1:raw */
     int decoding_erred;         /* whether erred in decoding */
     int read_coding_spec;       /* whether 'coding:...' has been read */
-    char *encoding;             /* Source encoding. */
+    char *encoding;
     int cont_line;              /* whether we are in a continuation line. */
     const char* line_start;     /* pointer to start of current line */
-    const char* multi_line_start; /* pointer to start of first line of
-                                     a single line or multi line string
-                                     expression (cf. issue 16806) */
-    PyObject *decoding_readline; /* open(...).readline */
+#ifndef PGEN
+    PyObject *decoding_readline; /* codecs.open(...).readline */
     PyObject *decoding_buffer;
-    const char* enc;            /* Encoding for the current str. */
+#endif
+    const char* enc;
     const char* str;
     const char* input;          /* Tokenizer's newline translated copy of the string. */
-
-    int type_comments;          /* Whether to look for type comments */
-
-    /* async/await related fields (still needed depending on feature_version) */
-    int async_hacks;            /* =1 if async/await aren't always keywords */
-    int async_def;              /* =1 if tokens are inside an 'async def' body. */
-    int async_def_indent;       /* Indentation level of the outermost 'async def'. */
-    int async_def_nl;           /* =1 if the outermost 'async def' had at least one
-                                   NEWLINE token after it. */
 };
 
 extern struct tok_state *PyTokenizer_FromString(const char *, int);
-extern struct tok_state *PyTokenizer_FromUTF8(const char *, int);
-extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*,
-                                              const char *, const char *);
+extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
 extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
-
-#define tok_dump _Py_tok_dump
+#if defined(PGEN) || defined(Py_USING_UNICODE)
+extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
+                                          int len, int *offset);
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/Parser/tokenizer_pgen.c b/Parser/tokenizer_pgen.c
new file mode 100644
index 0000000..9cb8492
--- /dev/null
+++ b/Parser/tokenizer_pgen.c
@@ -0,0 +1,2 @@
+#define PGEN
+#include "tokenizer.c"
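The two-line tokenizer_pgen.c above is the whole mechanism by which the parser generator gets its own tokenizer: tokenizer.c is compiled a second time with PGEN defined, selecting the stripped-down paths guarded by the PGEN conditionals above (#ifndef PGEN, #if defined(PGEN) || ...) so that pgen needs no codec machinery and no PyObject support. The pgen tool links tokenizer_pgen.o while the interpreter links tokenizer.o, so the duplicated symbols never meet in one binary. A generic sketch of the idiom, with hypothetical file names:

    /* feature.c -- one source file, two build flavours */
    #ifdef PGEN
    int feature_enabled(void) { return 0; }   /* minimal build for pgen */
    #else
    int feature_enabled(void) { return 1; }   /* full interpreter build */
    #endif

    /* feature_pgen.c -- second translation unit, forcing the PGEN flavour */
    #define PGEN
    #include "feature.c"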