Diffstat (limited to 'Parser')
-rw-r--r--  Parser/Python.asdl        |  222
-rw-r--r--  Parser/acceler.c          |   18
-rw-r--r--  Parser/asdl.py            |  573
-rwxr-xr-x  Parser/asdl_c.py          |  773
-rw-r--r--  Parser/bitset.c           |   66
-rw-r--r--  Parser/firstsets.c        |  113
-rw-r--r--  Parser/grammar.c          |  272
-rw-r--r--  Parser/grammar1.c         |   28
-rw-r--r--  Parser/intrcheck.c        |  178
-rw-r--r--  Parser/listnode.c         |   28
-rw-r--r--  Parser/metagrammar.c      |  159
-rw-r--r--  Parser/myreadline.c       |  280
-rw-r--r--  Parser/node.c             |   32
-rw-r--r--  Parser/parser.c           |   88
-rw-r--r--  Parser/parser.h           |   31
-rw-r--r--  Parser/parsetok.c         |  339
-rw-r--r--  Parser/pgen.c             |  726
-rw-r--r--  Parser/pgen/__init__.py   |    0
-rw-r--r--  Parser/pgen/__main__.py   |   33
-rw-r--r--  Parser/pgen/automata.py   |  371
-rw-r--r--  Parser/pgen/grammar.py    |  147
-rw-r--r--  Parser/pgen/keywordgen.py |   59
-rw-r--r--  Parser/pgen/metaparser.py |  152
-rw-r--r--  Parser/pgen/pgen.py       |  305
-rw-r--r--  Parser/pgen/token.py      |   38
-rw-r--r--  Parser/pgenmain.c         |  174
-rw-r--r--  Parser/printgrammar.c     |  117
-rw-r--r--  Parser/spark.py           |  839
-rw-r--r--  Parser/token.c            |  243
-rw-r--r--  Parser/tokenizer.c        | 1493
-rw-r--r--  Parser/tokenizer.h        |   50
-rw-r--r--  Parser/tokenizer_pgen.c   |    2
32 files changed, 4298 insertions, 3651 deletions
diff --git a/Parser/Python.asdl b/Parser/Python.asdl
index 126d478..9a9b933 100644
--- a/Parser/Python.asdl
+++ b/Parser/Python.asdl
@@ -1,129 +1,115 @@
--- ASDL's 5 builtin types are:
--- identifier, int, string, object, constant
+-- ASDL's five builtin types are identifier, int, string, object, bool
-module Python
+module Python version "$Revision$"
{
- mod = Module(stmt* body, type_ignore *type_ignores)
- | Interactive(stmt* body)
- | Expression(expr body)
- | FunctionType(expr* argtypes, expr returns)
-
- -- not really an actual node but useful in Jython's typesystem.
- | Suite(stmt* body)
-
- stmt = FunctionDef(identifier name, arguments args,
- stmt* body, expr* decorator_list, expr? returns,
- string? type_comment)
- | AsyncFunctionDef(identifier name, arguments args,
- stmt* body, expr* decorator_list, expr? returns,
- string? type_comment)
-
- | ClassDef(identifier name,
- expr* bases,
- keyword* keywords,
- stmt* body,
- expr* decorator_list)
- | Return(expr? value)
-
- | Delete(expr* targets)
- | Assign(expr* targets, expr value, string? type_comment)
- | AugAssign(expr target, operator op, expr value)
- -- 'simple' indicates that we annotate simple name without parens
- | AnnAssign(expr target, expr annotation, expr? value, int simple)
-
- -- use 'orelse' because else is a keyword in target languages
- | For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
- | AsyncFor(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
- | While(expr test, stmt* body, stmt* orelse)
- | If(expr test, stmt* body, stmt* orelse)
- | With(withitem* items, stmt* body, string? type_comment)
- | AsyncWith(withitem* items, stmt* body, string? type_comment)
-
- | Raise(expr? exc, expr? cause)
- | Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody)
- | Assert(expr test, expr? msg)
-
- | Import(alias* names)
- | ImportFrom(identifier? module, alias* names, int? level)
-
- | Global(identifier* names)
- | Nonlocal(identifier* names)
- | Expr(expr value)
- | Pass | Break | Continue
-
- -- XXX Jython will be different
- -- col_offset is the byte offset in the utf8 string the parser uses
- attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
-
- -- BoolOp() can use left & right?
- expr = BoolOp(boolop op, expr* values)
- | NamedExpr(expr target, expr value)
- | BinOp(expr left, operator op, expr right)
- | UnaryOp(unaryop op, expr operand)
- | Lambda(arguments args, expr body)
- | IfExp(expr test, expr body, expr orelse)
- | Dict(expr* keys, expr* values)
- | Set(expr* elts)
- | ListComp(expr elt, comprehension* generators)
- | SetComp(expr elt, comprehension* generators)
- | DictComp(expr key, expr value, comprehension* generators)
- | GeneratorExp(expr elt, comprehension* generators)
- -- the grammar constrains where yield expressions can occur
- | Await(expr value)
- | Yield(expr? value)
- | YieldFrom(expr value)
- -- need sequences for compare to distinguish between
- -- x < 4 < 3 and (x < 4) < 3
- | Compare(expr left, cmpop* ops, expr* comparators)
- | Call(expr func, expr* args, keyword* keywords)
- | FormattedValue(expr value, int? conversion, expr? format_spec)
- | JoinedStr(expr* values)
- | Constant(constant value, string? kind)
-
- -- the following expression can appear in assignment context
- | Attribute(expr value, identifier attr, expr_context ctx)
- | Subscript(expr value, slice slice, expr_context ctx)
- | Starred(expr value, expr_context ctx)
- | Name(identifier id, expr_context ctx)
- | List(expr* elts, expr_context ctx)
- | Tuple(expr* elts, expr_context ctx)
-
- -- col_offset is the byte offset in the utf8 string the parser uses
- attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
-
- expr_context = Load | Store | Del | AugLoad | AugStore | Param
-
- slice = Slice(expr? lower, expr? upper, expr? step)
- | ExtSlice(slice* dims)
- | Index(expr value)
-
- boolop = And | Or
-
- operator = Add | Sub | Mult | MatMult | Div | Mod | Pow | LShift
+ mod = Module(stmt* body)
+ | Interactive(stmt* body)
+ | Expression(expr body)
+
+ -- not really an actual node but useful in Jython's typesystem.
+ | Suite(stmt* body)
+
+ stmt = FunctionDef(identifier name, arguments args,
+ stmt* body, expr* decorator_list)
+ | ClassDef(identifier name, expr* bases, stmt* body, expr* decorator_list)
+ | Return(expr? value)
+
+ | Delete(expr* targets)
+ | Assign(expr* targets, expr value)
+ | AugAssign(expr target, operator op, expr value)
+
+ -- not sure if bool is allowed, can always use int
+ | Print(expr? dest, expr* values, bool nl)
+
+ -- use 'orelse' because else is a keyword in target languages
+ | For(expr target, expr iter, stmt* body, stmt* orelse)
+ | While(expr test, stmt* body, stmt* orelse)
+ | If(expr test, stmt* body, stmt* orelse)
+ | With(expr context_expr, expr? optional_vars, stmt* body)
+
+ -- 'type' is a bad name
+ | Raise(expr? type, expr? inst, expr? tback)
+ | TryExcept(stmt* body, excepthandler* handlers, stmt* orelse)
+ | TryFinally(stmt* body, stmt* finalbody)
+ | Assert(expr test, expr? msg)
+
+ | Import(alias* names)
+ | ImportFrom(identifier? module, alias* names, int? level)
+
+ -- Doesn't capture requirement that locals must be
+ -- defined if globals is
+ -- still supports use as a function!
+ | Exec(expr body, expr? globals, expr? locals)
+
+ | Global(identifier* names)
+ | Expr(expr value)
+ | Pass | Break | Continue
+
+ -- XXX Jython will be different
+ -- col_offset is the byte offset in the utf8 string the parser uses
+ attributes (int lineno, int col_offset)
+
+ -- BoolOp() can use left & right?
+ expr = BoolOp(boolop op, expr* values)
+ | BinOp(expr left, operator op, expr right)
+ | UnaryOp(unaryop op, expr operand)
+ | Lambda(arguments args, expr body)
+ | IfExp(expr test, expr body, expr orelse)
+ | Dict(expr* keys, expr* values)
+ | Set(expr* elts)
+ | ListComp(expr elt, comprehension* generators)
+ | SetComp(expr elt, comprehension* generators)
+ | DictComp(expr key, expr value, comprehension* generators)
+ | GeneratorExp(expr elt, comprehension* generators)
+ -- the grammar constrains where yield expressions can occur
+ | Yield(expr? value)
+ -- need sequences for compare to distinguish between
+ -- x < 4 < 3 and (x < 4) < 3
+ | Compare(expr left, cmpop* ops, expr* comparators)
+ | Call(expr func, expr* args, keyword* keywords,
+ expr? starargs, expr? kwargs)
+ | Repr(expr value)
+ | Num(object n) -- a number as a PyObject.
+ | Str(string s) -- need to specify raw, unicode, etc?
+ -- other literals? bools?
+
+ -- the following expression can appear in assignment context
+ | Attribute(expr value, identifier attr, expr_context ctx)
+ | Subscript(expr value, slice slice, expr_context ctx)
+ | Name(identifier id, expr_context ctx)
+ | List(expr* elts, expr_context ctx)
+ | Tuple(expr* elts, expr_context ctx)
+
+ -- col_offset is the byte offset in the utf8 string the parser uses
+ attributes (int lineno, int col_offset)
+
+ expr_context = Load | Store | Del | AugLoad | AugStore | Param
+
+ slice = Ellipsis | Slice(expr? lower, expr? upper, expr? step)
+ | ExtSlice(slice* dims)
+ | Index(expr value)
+
+ boolop = And | Or
+
+ operator = Add | Sub | Mult | Div | Mod | Pow | LShift
| RShift | BitOr | BitXor | BitAnd | FloorDiv
- unaryop = Invert | Not | UAdd | USub
+ unaryop = Invert | Not | UAdd | USub
- cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn
+ cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn
- comprehension = (expr target, expr iter, expr* ifs, int is_async)
+ comprehension = (expr target, expr iter, expr* ifs)
- excepthandler = ExceptHandler(expr? type, identifier? name, stmt* body)
- attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
+ -- not sure what to call the first argument for raise and except
+ excepthandler = ExceptHandler(expr? type, expr? name, stmt* body)
+ attributes (int lineno, int col_offset)
- arguments = (arg* posonlyargs, arg* args, arg? vararg, arg* kwonlyargs,
- expr* kw_defaults, arg? kwarg, expr* defaults)
+ arguments = (expr* args, identifier? vararg,
+ identifier? kwarg, expr* defaults)
- arg = (identifier arg, expr? annotation, string? type_comment)
- attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
+ -- keyword arguments supplied to call
+ keyword = (identifier arg, expr value)
- -- keyword arguments supplied to call (NULL identifier for **kwargs)
- keyword = (identifier? arg, expr value)
-
- -- import name with optional 'as' alias.
- alias = (identifier name, identifier? asname)
-
- withitem = (expr context_expr, expr? optional_vars)
-
- type_ignore = TypeIgnore(int lineno, string tag)
+ -- import name with optional 'as' alias.
+ alias = (identifier name, identifier? asname)
}
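
The ASDL definitions above are the source from which asdl_c.py (further down in this diff) generates the C struct definitions and the node classes exposed by the _ast module, so a constructor such as BinOp(expr left, operator op, expr right) becomes an AST node whose _fields tuple matches the ASDL field names. A minimal sketch of that correspondence, using only the standard ast module:

    import ast

    # Per the ASDL spec: Module(stmt* body) -> Expr(expr value)
    #                    -> BinOp(expr left, operator op, expr right)
    tree = ast.parse("1 + 2")
    binop = tree.body[0].value

    print(type(binop).__name__)            # BinOp
    print(binop._fields)                   # ('left', 'op', 'right'), the ASDL field names
    print(binop.lineno, binop.col_offset)  # the 'attributes' declared on the expr sum
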
diff --git a/Parser/acceler.c b/Parser/acceler.c
index e515833..9b14263 100644
--- a/Parser/acceler.c
+++ b/Parser/acceler.c
@@ -10,21 +10,22 @@
are not part of the static data structure written on graminit.[ch]
by the parser generator. */
-#include "Python.h"
+#include "pgenheaders.h"
#include "grammar.h"
#include "node.h"
#include "token.h"
#include "parser.h"
/* Forward references */
-static void fixdfa(grammar *, const dfa *);
+static void fixdfa(grammar *, dfa *);
static void fixstate(grammar *, state *);
void
PyGrammar_AddAccelerators(grammar *g)
{
+ dfa *d;
int i;
- const dfa *d = g->g_dfa;
+ d = g->g_dfa;
for (i = g->g_ndfas; --i >= 0; d++)
fixdfa(g, d);
g->g_accel = 1;
@@ -33,9 +34,10 @@ PyGrammar_AddAccelerators(grammar *g)
void
PyGrammar_RemoveAccelerators(grammar *g)
{
+ dfa *d;
int i;
g->g_accel = 0;
- const dfa *d = g->g_dfa;
+ d = g->g_dfa;
for (i = g->g_ndfas; --i >= 0; d++) {
state *s;
int j;
@@ -49,7 +51,7 @@ PyGrammar_RemoveAccelerators(grammar *g)
}
static void
-fixdfa(grammar *g, const dfa *d)
+fixdfa(grammar *g, dfa *d)
{
state *s;
int j;
@@ -61,7 +63,7 @@ fixdfa(grammar *g, const dfa *d)
static void
fixstate(grammar *g, state *s)
{
- const arc *a;
+ arc *a;
int k;
int *accel;
int nl = g->g_ll.ll_nlabels;
@@ -76,14 +78,14 @@ fixstate(grammar *g, state *s)
a = s->s_arc;
for (k = s->s_narcs; --k >= 0; a++) {
int lbl = a->a_lbl;
- const label *l = &g->g_ll.ll_label[lbl];
+ label *l = &g->g_ll.ll_label[lbl];
int type = l->lb_type;
if (a->a_arrow >= (1 << 7)) {
printf("XXX too many states!\n");
continue;
}
if (ISNONTERMINAL(type)) {
- const dfa *d1 = PyGrammar_FindDFA(g, type);
+ dfa *d1 = PyGrammar_FindDFA(g, type);
int ibit;
if (type - NT_OFFSET >= (1 << 7)) {
printf("XXX too high nonterminal number!\n");
diff --git a/Parser/asdl.py b/Parser/asdl.py
index 62f5c19..1ddc3f8 100644
--- a/Parser/asdl.py
+++ b/Parser/asdl.py
@@ -1,53 +1,243 @@
-#-------------------------------------------------------------------------------
-# Parser for ASDL [1] definition files. Reads in an ASDL description and parses
-# it into an AST that describes it.
-#
-# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support
-# modules and attributes after a product. Words starting with Capital letters
-# are terminals. Literal tokens are in "double quotes". Others are
-# non-terminals. Id is either TokenId or ConstructorId.
-#
-# module ::= "module" Id "{" [definitions] "}"
-# definitions ::= { TypeId "=" type }
-# type ::= product | sum
-# product ::= fields ["attributes" fields]
-# fields ::= "(" { field, "," } field ")"
-# field ::= TypeId ["?" | "*"] [Id]
-# sum ::= constructor { "|" constructor } ["attributes" fields]
-# constructor ::= ConstructorId [fields]
-#
-# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See
-# http://asdl.sourceforge.net/
-#-------------------------------------------------------------------------------
-from collections import namedtuple
-import re
-
-__all__ = [
- 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor',
- 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check']
-
-# The following classes define nodes into which the ASDL description is parsed.
-# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST
-# structure used by a programming language. But ASDL files themselves need to be
-# parsed. This module parses ASDL files and uses a simple AST to represent them.
-# See the EBNF at the top of the file to understand the logical connection
-# between the various node types.
-
-builtin_types = {'identifier', 'string', 'bytes', 'int', 'object', 'singleton',
- 'constant'}
-
-class AST:
+"""An implementation of the Zephyr Abstract Syntax Definition Language.
+
+See http://asdl.sourceforge.net/ and
+http://www.cs.princeton.edu/research/techreps/TR-554-97
+
+Only supports top level module decl, not view. I'm guessing that view
+is intended to support the browser and I'm not interested in the
+browser.
+
+Changes for Python: Add support for module versions
+"""
+
+import os
+import traceback
+
+import spark
+
+class Token(object):
+ # spark seems to dispatch in the parser based on a token's
+ # type attribute
+ def __init__(self, type, lineno):
+ self.type = type
+ self.lineno = lineno
+
+ def __str__(self):
+ return self.type
+
def __repr__(self):
- raise NotImplementedError
+ return str(self)
+
+class Id(Token):
+ def __init__(self, value, lineno):
+ self.type = 'Id'
+ self.value = value
+ self.lineno = lineno
+
+ def __str__(self):
+ return self.value
+
+class String(Token):
+ def __init__(self, value, lineno):
+ self.type = 'String'
+ self.value = value
+ self.lineno = lineno
+
+class ASDLSyntaxError(Exception):
+
+ def __init__(self, lineno, token=None, msg=None):
+ self.lineno = lineno
+ self.token = token
+ self.msg = msg
+
+ def __str__(self):
+ if self.msg is None:
+ return "Error at '%s', line %d" % (self.token, self.lineno)
+ else:
+ return "%s, line %d" % (self.msg, self.lineno)
+
+class ASDLScanner(spark.GenericScanner, object):
+
+ def tokenize(self, input):
+ self.rv = []
+ self.lineno = 1
+ super(ASDLScanner, self).tokenize(input)
+ return self.rv
+
+ def t_id(self, s):
+ r"[\w\.]+"
+ # XXX doesn't distinguish upper vs. lower, which is
+ # significant for ASDL.
+ self.rv.append(Id(s, self.lineno))
+
+ def t_string(self, s):
+ r'"[^"]*"'
+ self.rv.append(String(s, self.lineno))
+
+ def t_xxx(self, s): # not sure what this production means
+ r"<="
+ self.rv.append(Token(s, self.lineno))
+
+ def t_punctuation(self, s):
+ r"[\{\}\*\=\|\(\)\,\?\:]"
+ self.rv.append(Token(s, self.lineno))
+
+ def t_comment(self, s):
+ r"\-\-[^\n]*"
+ pass
+
+ def t_newline(self, s):
+ r"\n"
+ self.lineno += 1
+
+ def t_whitespace(self, s):
+ r"[ \t]+"
+ pass
+
+ def t_default(self, s):
+ r" . +"
+ raise ValueError, "unmatched input: %s" % `s`
+
+class ASDLParser(spark.GenericParser, object):
+ def __init__(self):
+ super(ASDLParser, self).__init__("module")
+
+ def typestring(self, tok):
+ return tok.type
+
+ def error(self, tok):
+ raise ASDLSyntaxError(tok.lineno, tok)
+
+ def p_module_0(self, (module, name, version, _0, _1)):
+ " module ::= Id Id version { } "
+ if module.value != "module":
+ raise ASDLSyntaxError(module.lineno,
+ msg="expected 'module', found %s" % module)
+ return Module(name, None, version)
+
+ def p_module(self, (module, name, version, _0, definitions, _1)):
+ " module ::= Id Id version { definitions } "
+ if module.value != "module":
+ raise ASDLSyntaxError(module.lineno,
+ msg="expected 'module', found %s" % module)
+ return Module(name, definitions, version)
+
+ def p_version(self, (version, V)):
+ "version ::= Id String"
+ if version.value != "version":
+ raise ASDLSyntaxError(version.lineno,
+ msg="expected 'version', found %s" % version)
+ return V
+
+ def p_definition_0(self, (definition,)):
+ " definitions ::= definition "
+ return definition
+
+ def p_definition_1(self, (definitions, definition)):
+ " definitions ::= definition definitions "
+ return definitions + definition
+
+ def p_definition(self, (id, _, type)):
+ " definition ::= Id = type "
+ return [Type(id, type)]
+
+ def p_type_0(self, (product,)):
+ " type ::= product "
+ return product
+
+ def p_type_1(self, (sum,)):
+ " type ::= sum "
+ return Sum(sum)
+
+ def p_type_2(self, (sum, id, _0, attributes, _1)):
+ " type ::= sum Id ( fields ) "
+ if id.value != "attributes":
+ raise ASDLSyntaxError(id.lineno,
+ msg="expected attributes, found %s" % id)
+ if attributes:
+ attributes.reverse()
+ return Sum(sum, attributes)
+
+ def p_product(self, (_0, fields, _1)):
+ " product ::= ( fields ) "
+ # XXX can't I just construct things in the right order?
+ fields.reverse()
+ return Product(fields)
+
+ def p_sum_0(self, (constructor,)):
+ " sum ::= constructor "
+ return [constructor]
+
+ def p_sum_1(self, (constructor, _, sum)):
+ " sum ::= constructor | sum "
+ return [constructor] + sum
+
+ def p_sum_2(self, (constructor, _, sum)):
+ " sum ::= constructor | sum "
+ return [constructor] + sum
+
+ def p_constructor_0(self, (id,)):
+ " constructor ::= Id "
+ return Constructor(id)
+
+ def p_constructor_1(self, (id, _0, fields, _1)):
+ " constructor ::= Id ( fields ) "
+ # XXX can't I just construct things in the right order?
+ fields.reverse()
+ return Constructor(id, fields)
+
+ def p_fields_0(self, (field,)):
+ " fields ::= field "
+ return [field]
+
+ def p_fields_1(self, (field, _, fields)):
+ " fields ::= field , fields "
+ return fields + [field]
+
+ def p_field_0(self, (type,)):
+ " field ::= Id "
+ return Field(type)
+
+ def p_field_1(self, (type, name)):
+ " field ::= Id Id "
+ return Field(type, name)
+
+ def p_field_2(self, (type, _, name)):
+ " field ::= Id * Id "
+ return Field(type, name, seq=True)
+
+ def p_field_3(self, (type, _, name)):
+ " field ::= Id ? Id "
+ return Field(type, name, opt=True)
+
+ def p_field_4(self, (type, _)):
+ " field ::= Id * "
+ return Field(type, seq=True)
+
+ def p_field_5(self, (type, _)):
+ " field ::= Id ? "
+ return Field(type, opt=True)
+
+builtin_types = ("identifier", "string", "int", "bool", "object")
+
+# below is a collection of classes to capture the AST of an AST :-)
+# not sure if any of the methods are useful yet, but I'm adding them
+# piecemeal as they seem helpful
+
+class AST(object):
+ pass # a marker class
class Module(AST):
- def __init__(self, name, dfns):
+ def __init__(self, name, dfns, version):
self.name = name
self.dfns = dfns
- self.types = {type.name: type.value for type in dfns}
+ self.version = version
+ self.types = {} # maps type name to value (from dfns)
+ for type in dfns:
+ self.types[type.name.value] = type.value
def __repr__(self):
- return 'Module({0.name}, {0.dfns})'.format(self)
+ return "Module(%s, %s)" % (self.name, self.dfns)
class Type(AST):
def __init__(self, name, value):
@@ -55,7 +245,7 @@ class Type(AST):
self.value = value
def __repr__(self):
- return 'Type({0.name}, {0.value})'.format(self)
+ return "Type(%s, %s)" % (self.name, self.value)
class Constructor(AST):
def __init__(self, name, fields=None):
@@ -63,7 +253,7 @@ class Constructor(AST):
self.fields = fields or []
def __repr__(self):
- return 'Constructor({0.name}, {0.fields})'.format(self)
+ return "Constructor(%s, %s)" % (self.name, self.fields)
class Field(AST):
def __init__(self, type, name=None, seq=False, opt=False):
@@ -80,9 +270,9 @@ class Field(AST):
else:
extra = ""
if self.name is None:
- return 'Field({0.type}{1})'.format(self, extra)
+ return "Field(%s%s)" % (self.type, extra)
else:
- return 'Field({0.type}, {0.name}{1})'.format(self, extra)
+ return "Field(%s, %s%s)" % (self.type, self.name, extra)
class Sum(AST):
def __init__(self, types, attributes=None):
@@ -90,54 +280,56 @@ class Sum(AST):
self.attributes = attributes or []
def __repr__(self):
- if self.attributes:
- return 'Sum({0.types}, {0.attributes})'.format(self)
+ if self.attributes is None:
+ return "Sum(%s)" % self.types
else:
- return 'Sum({0.types})'.format(self)
+ return "Sum(%s, %s)" % (self.types, self.attributes)
class Product(AST):
- def __init__(self, fields, attributes=None):
+ def __init__(self, fields):
self.fields = fields
- self.attributes = attributes or []
def __repr__(self):
- if self.attributes:
- return 'Product({0.fields}, {0.attributes})'.format(self)
- else:
- return 'Product({0.fields})'.format(self)
-
-# A generic visitor for the meta-AST that describes ASDL. This can be used by
-# emitters. Note that this visitor does not provide a generic visit method, so a
-# subclass needs to define visit methods from visitModule to as deep as the
-# interesting node.
-# We also define a Check visitor that makes sure the parsed ASDL is well-formed.
+ return "Product(%s)" % self.fields
class VisitorBase(object):
- """Generic tree visitor for ASTs."""
- def __init__(self):
+
+ def __init__(self, skip=False):
self.cache = {}
+ self.skip = skip
- def visit(self, obj, *args):
- klass = obj.__class__
+ def visit(self, object, *args):
+ meth = self._dispatch(object)
+ if meth is None:
+ return
+ try:
+ meth(object, *args)
+ except Exception, err:
+ print "Error visiting", repr(object)
+ print err
+ traceback.print_exc()
+ # XXX hack
+ if hasattr(self, 'file'):
+ self.file.flush()
+ os._exit(1)
+
+ def _dispatch(self, object):
+ assert isinstance(object, AST), repr(object)
+ klass = object.__class__
meth = self.cache.get(klass)
if meth is None:
methname = "visit" + klass.__name__
- meth = getattr(self, methname, None)
+ if self.skip:
+ meth = getattr(self, methname, None)
+ else:
+ meth = getattr(self, methname)
self.cache[klass] = meth
- if meth:
- try:
- meth(obj, *args)
- except Exception as e:
- print("Error visiting %r: %s" % (obj, e))
- raise
+ return meth
class Check(VisitorBase):
- """A visitor that checks a parsed ASDL tree for correctness.
- Errors are printed and accumulated.
- """
def __init__(self):
- super(Check, self).__init__()
+ super(Check, self).__init__(skip=True)
self.cons = {}
self.errors = 0
self.types = {}
@@ -159,8 +351,8 @@ class Check(VisitorBase):
if conflict is None:
self.cons[key] = name
else:
- print('Redefinition of constructor {}'.format(key))
- print('Defined in {} and {}'.format(conflict, name))
+ print "Redefinition of constructor %s" % key
+ print "Defined in %s and %s" % (conflict, name)
self.errors += 1
for f in cons.fields:
self.visit(f, key)
@@ -175,11 +367,6 @@ class Check(VisitorBase):
self.visit(f, name)
def check(mod):
- """Check the parsed ASDL tree for correctness.
-
- Return True if success. For failure, the errors are printed out and False
- is returned.
- """
v = Check()
v.visit(mod)
@@ -187,190 +374,40 @@ def check(mod):
if t not in mod.types and not t in builtin_types:
v.errors += 1
uses = ", ".join(v.types[t])
- print('Undefined type {}, used in {}'.format(t, uses))
- return not v.errors
-
-# The ASDL parser itself comes next. The only interesting external interface
-# here is the top-level parse function.
-
-def parse(filename):
- """Parse ASDL from the given file and return a Module node describing it."""
- with open(filename) as f:
- parser = ASDLParser()
- return parser.parse(f.read())
-
-# Types for describing tokens in an ASDL specification.
-class TokenKind:
- """TokenKind is provides a scope for enumerated token kinds."""
- (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk,
- LParen, RParen, LBrace, RBrace) = range(11)
+ print "Undefined type %s, used in %s" % (t, uses)
- operator_table = {
- '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen,
- ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace}
-
-Token = namedtuple('Token', 'kind value lineno')
-
-class ASDLSyntaxError(Exception):
- def __init__(self, msg, lineno=None):
- self.msg = msg
- self.lineno = lineno or '<unknown>'
-
- def __str__(self):
- return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
-
-def tokenize_asdl(buf):
- """Tokenize the given buffer. Yield Token objects."""
- for lineno, line in enumerate(buf.splitlines(), 1):
- for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()):
- c = m.group(1)
- if c[0].isalpha():
- # Some kind of identifier
- if c[0].isupper():
- yield Token(TokenKind.ConstructorId, c, lineno)
- else:
- yield Token(TokenKind.TypeId, c, lineno)
- elif c[:2] == '--':
- # Comment
- break
- else:
- # Operators
- try:
- op_kind = TokenKind.operator_table[c]
- except KeyError:
- raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
- yield Token(op_kind, c, lineno)
-
-class ASDLParser:
- """Parser for ASDL files.
-
- Create, then call the parse method on a buffer containing ASDL.
- This is a simple recursive descent parser that uses tokenize_asdl for the
- lexing.
- """
- def __init__(self):
- self._tokenizer = None
- self.cur_token = None
-
- def parse(self, buf):
- """Parse the ASDL in the buffer and return an AST with a Module root.
- """
- self._tokenizer = tokenize_asdl(buf)
- self._advance()
- return self._parse_module()
-
- def _parse_module(self):
- if self._at_keyword('module'):
- self._advance()
- else:
- raise ASDLSyntaxError(
- 'Expected "module" (found {})'.format(self.cur_token.value),
- self.cur_token.lineno)
- name = self._match(self._id_kinds)
- self._match(TokenKind.LBrace)
- defs = self._parse_definitions()
- self._match(TokenKind.RBrace)
- return Module(name, defs)
-
- def _parse_definitions(self):
- defs = []
- while self.cur_token.kind == TokenKind.TypeId:
- typename = self._advance()
- self._match(TokenKind.Equals)
- type = self._parse_type()
- defs.append(Type(typename, type))
- return defs
-
- def _parse_type(self):
- if self.cur_token.kind == TokenKind.LParen:
- # If we see a (, it's a product
- return self._parse_product()
- else:
- # Otherwise it's a sum. Look for ConstructorId
- sumlist = [Constructor(self._match(TokenKind.ConstructorId),
- self._parse_optional_fields())]
- while self.cur_token.kind == TokenKind.Pipe:
- # More constructors
- self._advance()
- sumlist.append(Constructor(
- self._match(TokenKind.ConstructorId),
- self._parse_optional_fields()))
- return Sum(sumlist, self._parse_optional_attributes())
-
- def _parse_product(self):
- return Product(self._parse_fields(), self._parse_optional_attributes())
-
- def _parse_fields(self):
- fields = []
- self._match(TokenKind.LParen)
- while self.cur_token.kind == TokenKind.TypeId:
- typename = self._advance()
- is_seq, is_opt = self._parse_optional_field_quantifier()
- id = (self._advance() if self.cur_token.kind in self._id_kinds
- else None)
- fields.append(Field(typename, id, seq=is_seq, opt=is_opt))
- if self.cur_token.kind == TokenKind.RParen:
- break
- elif self.cur_token.kind == TokenKind.Comma:
- self._advance()
- self._match(TokenKind.RParen)
- return fields
-
- def _parse_optional_fields(self):
- if self.cur_token.kind == TokenKind.LParen:
- return self._parse_fields()
- else:
- return None
+ return not v.errors
- def _parse_optional_attributes(self):
- if self._at_keyword('attributes'):
- self._advance()
- return self._parse_fields()
+def parse(file):
+ scanner = ASDLScanner()
+ parser = ASDLParser()
+
+ buf = open(file).read()
+ tokens = scanner.tokenize(buf)
+ try:
+ return parser.parse(tokens)
+ except ASDLSyntaxError, err:
+ print err
+ lines = buf.split("\n")
+ print lines[err.lineno - 1] # lines starts at 0, files at 1
+
+if __name__ == "__main__":
+ import glob
+ import sys
+
+ if len(sys.argv) > 1:
+ files = sys.argv[1:]
+ else:
+ testdir = "tests"
+ files = glob.glob(testdir + "/*.asdl")
+
+ for file in files:
+ print file
+ mod = parse(file)
+ print "module", mod.name
+ print len(mod.dfns), "definitions"
+ if not check(mod):
+ print "Check failed"
else:
- return None
-
- def _parse_optional_field_quantifier(self):
- is_seq, is_opt = False, False
- if self.cur_token.kind == TokenKind.Asterisk:
- is_seq = True
- self._advance()
- elif self.cur_token.kind == TokenKind.Question:
- is_opt = True
- self._advance()
- return is_seq, is_opt
-
- def _advance(self):
- """ Return the value of the current token and read the next one into
- self.cur_token.
- """
- cur_val = None if self.cur_token is None else self.cur_token.value
- try:
- self.cur_token = next(self._tokenizer)
- except StopIteration:
- self.cur_token = None
- return cur_val
-
- _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId)
-
- def _match(self, kind):
- """The 'match' primitive of RD parsers.
-
- * Verifies that the current token is of the given kind (kind can
- be a tuple, in which the kind must match one of its members).
- * Returns the value of the current token
- * Reads in the next token
- """
- if (isinstance(kind, tuple) and self.cur_token.kind in kind or
- self.cur_token.kind == kind
- ):
- value = self.cur_token.value
- self._advance()
- return value
- else:
- raise ASDLSyntaxError(
- 'Unmatched {} (found {})'.format(kind, self.cur_token.kind),
- self.cur_token.lineno)
-
- def _at_keyword(self, keyword):
- return (self.cur_token.kind == TokenKind.TypeId and
- self.cur_token.value == keyword)
+ for dfn in mod.dfns:
+ print dfn.type
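
Both versions of asdl.py keep the same top-level entry points: a module-level parse() that reads an .asdl file and returns a Module meta-AST, and check() to validate it, which is exactly what the __main__ block above exercises. A minimal usage sketch, assuming the module is importable as asdl and pointed at the grammar file from this directory:

    import asdl

    mod = asdl.parse("Parser/Python.asdl")   # returns an asdl.Module meta-AST
    print("module %s, %d definitions" % (mod.name, len(mod.dfns)))
    if not asdl.check(mod):
        print("Check failed")
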
diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py
index 52495e9..ac61c78 100755
--- a/Parser/asdl_c.py
+++ b/Parser/asdl_c.py
@@ -1,18 +1,25 @@
#! /usr/bin/env python
"""Generate C code from an ASDL description."""
+# TO DO
+# handle fields that have a type but no name
+
import os, sys
import asdl
-TABSIZE = 4
+TABSIZE = 8
MAX_COL = 80
def get_c_type(name):
"""Return a string for the C name of the type.
- This function special cases the default types provided by asdl.
+ This function special cases the default types provided by asdl:
+ identifier, string, int, bool.
"""
+ # XXX ack! need to figure out where Id is useful and where string
+ if isinstance(name, asdl.Id):
+ name = name.value
if name in asdl.builtin_types:
return name
else:
@@ -77,20 +84,8 @@ class EmitVisitor(asdl.VisitorBase):
def __init__(self, file):
self.file = file
- self.identifiers = set()
- self.singletons = set()
- self.types = set()
super(EmitVisitor, self).__init__()
- def emit_identifier(self, name):
- self.identifiers.add(str(name))
-
- def emit_singleton(self, name):
- self.singletons.add(str(name))
-
- def emit_type(self, name):
- self.types.add(str(name))
-
def emit(self, s, depth, reflow=True):
# XXX reflow long lines?
if reflow:
@@ -98,9 +93,8 @@ class EmitVisitor(asdl.VisitorBase):
else:
lines = [s]
for line in lines:
- if line:
- line = (" " * TABSIZE * depth) + line
- self.file.write(line + "\n")
+ line = (" " * TABSIZE * depth) + line + "\n"
+ self.file.write(line)
class TypeDefVisitor(EmitVisitor):
@@ -186,6 +180,9 @@ class StructVisitor(EmitVisitor):
self.visit(f, depth + 1)
self.emit("} %s;" % cons.name, depth)
self.emit("", depth)
+ else:
+ # XXX not sure what I want here, nothing is probably fine
+ pass
def visitField(self, field, depth):
# XXX need to lookup field.type, because it might be something
@@ -193,7 +190,7 @@ class StructVisitor(EmitVisitor):
ctype = get_c_type(field.type)
name = field.name
if field.seq:
- if field.type == 'cmpop':
+ if field.type.value in ('cmpop',):
self.emit("asdl_int_seq *%(name)s;" % locals(), depth)
else:
self.emit("asdl_seq *%(name)s;" % locals(), depth)
@@ -204,11 +201,6 @@ class StructVisitor(EmitVisitor):
self.emit("struct _%(name)s {" % locals(), depth)
for f in product.fields:
self.visit(f, depth + 1)
- for field in product.attributes:
- # rudimentary attribute handling
- type = str(field.type)
- assert type in asdl.builtin_types, type
- self.emit("%s %s;" % (type, field.name), depth + 1);
self.emit("};", depth)
self.emit("", depth)
@@ -248,7 +240,7 @@ class PrototypeVisitor(EmitVisitor):
name = f.name
# XXX should extend get_c_type() to handle this
if f.seq:
- if f.type == 'cmpop':
+ if f.type.value in ('cmpop',):
ctype = "asdl_int_seq *"
else:
ctype = "asdl_seq *"
@@ -280,9 +272,7 @@ class PrototypeVisitor(EmitVisitor):
def visitProduct(self, prod, name):
self.emit_function(name, get_c_type(name),
- self.get_args(prod.fields),
- self.get_args(prod.attributes),
- union=False)
+ self.get_args(prod.fields), [], union=False)
class FunctionVisitor(PrototypeVisitor):
@@ -302,7 +292,8 @@ class FunctionVisitor(PrototypeVisitor):
emit("{")
emit("%s p;" % ctype, 1)
for argtype, argname, opt in args:
- if not opt and argtype != "int":
+ # XXX hack alert: false is allowed for a bool
+ if not opt and not (argtype == "bool" or argtype == "int"):
emit("if (!%s) {" % argname, 1)
emit("PyErr_SetString(PyExc_ValueError,", 2)
msg = "field %s is required for %s" % (argname, name)
@@ -336,8 +327,7 @@ class FunctionVisitor(PrototypeVisitor):
self.emit(s, depth, reflow)
for argtype, argname, opt in args:
emit("p->%s = %s;" % (argname, argname), 1)
- for argtype, argname, opt in attrs:
- emit("p->%s = %s;" % (argname, argname), 1)
+ assert not attrs
class PickleVisitor(EmitVisitor):
@@ -376,18 +366,20 @@ class Obj2ModVisitor(PickleVisitor):
self.emit("int", 0)
self.emit("obj2ast_%s(PyObject* obj, %s* out, PyArena* arena)" % (name, ctype), 0)
self.emit("{", 0)
+ self.emit("PyObject* tmp = NULL;", 1)
self.emit("int isinstance;", 1)
self.emit("", 0)
- def sumTrailer(self, name, add_label=False):
+ def sumTrailer(self, name):
self.emit("", 0)
+ self.emit("tmp = PyObject_Repr(obj);", 1)
# there's really nothing more we can do if this fails ...
- error = "expected some sort of %s, but got %%R" % name
- format = "PyErr_Format(PyExc_TypeError, \"%s\", obj);"
+ self.emit("if (tmp == NULL) goto failed;", 1)
+ error = "expected some sort of %s, but got %%.400s" % name
+ format = "PyErr_Format(PyExc_TypeError, \"%s\", PyString_AS_STRING(tmp));"
self.emit(format % error, 1, reflow=False)
- if add_label:
- self.emit("failed:", 1)
- self.emit("Py_XDECREF(tmp);", 1)
+ self.emit("failed:", 0)
+ self.emit("Py_XDECREF(tmp);", 1)
self.emit("return 1;", 1)
self.emit("}", 0)
self.emit("", 0)
@@ -396,7 +388,7 @@ class Obj2ModVisitor(PickleVisitor):
self.funcHeader(name)
for t in sum.types:
line = ("isinstance = PyObject_IsInstance(obj, "
- "astmodulestate_global->%s_type);")
+ "(PyObject *)%s_type);")
self.emit(line % (t.name,), 1)
self.emit("if (isinstance == -1) {", 1)
self.emit("return 1;", 2)
@@ -412,8 +404,6 @@ class Obj2ModVisitor(PickleVisitor):
def complexSum(self, sum, name):
self.funcHeader(name)
- self.emit("PyObject *tmp = NULL;", 1)
- self.emit("PyObject *tp;", 1)
for a in sum.attributes:
self.visitAttributeDeclaration(a, name, sum=sum)
self.emit("", 0)
@@ -425,8 +415,8 @@ class Obj2ModVisitor(PickleVisitor):
for a in sum.attributes:
self.visitField(a, name, sum=sum, depth=1)
for t in sum.types:
- self.emit("tp = astmodulestate_global->%s_type;" % (t.name,), 1)
- self.emit("isinstance = PyObject_IsInstance(obj, tp);", 1)
+ line = "isinstance = PyObject_IsInstance(obj, (PyObject*)%s_type);"
+ self.emit(line % (t.name,), 1)
self.emit("if (isinstance == -1) {", 1)
self.emit("return 1;", 2)
self.emit("}", 1)
@@ -436,12 +426,12 @@ class Obj2ModVisitor(PickleVisitor):
self.emit("", 0)
for f in t.fields:
self.visitField(f, t.name, sum=sum, depth=2)
- args = [f.name for f in t.fields] + [a.name for a in sum.attributes]
+ args = [f.name.value for f in t.fields] + [a.name.value for a in sum.attributes]
self.emit("*out = %s(%s);" % (t.name, self.buildArgs(args)), 2)
self.emit("if (*out == NULL) goto failed;", 2)
self.emit("return 0;", 2)
self.emit("}", 1)
- self.sumTrailer(name, True)
+ self.sumTrailer(name)
def visitAttributeDeclaration(self, a, name, sum=sum):
ctype = get_c_type(a.type)
@@ -461,15 +451,10 @@ class Obj2ModVisitor(PickleVisitor):
self.emit("PyObject* tmp = NULL;", 1)
for f in prod.fields:
self.visitFieldDeclaration(f, name, prod=prod, depth=1)
- for a in prod.attributes:
- self.visitFieldDeclaration(a, name, prod=prod, depth=1)
self.emit("", 0)
for f in prod.fields:
self.visitField(f, name, prod=prod, depth=1)
- for a in prod.attributes:
- self.visitField(a, name, prod=prod, depth=1)
- args = [f.name for f in prod.fields]
- args.extend([a.name for a in prod.attributes])
+ args = [f.name.value for f in prod.fields]
self.emit("*out = %s(%s);" % (name, self.buildArgs(args)), 1)
self.emit("return 0;", 1)
self.emit("failed:", 0)
@@ -491,8 +476,8 @@ class Obj2ModVisitor(PickleVisitor):
def isSimpleSum(self, field):
# XXX can the members of this list be determined automatically?
- return field.type in ('expr_context', 'boolop', 'operator',
- 'unaryop', 'cmpop')
+ return field.type.value in ('expr_context', 'boolop', 'operator',
+ 'unaryop', 'cmpop')
def isNumeric(self, field):
return get_c_type(field.type) in ("int", "bool")
@@ -502,52 +487,31 @@ class Obj2ModVisitor(PickleVisitor):
def visitField(self, field, name, sum=None, prod=None, depth=0):
ctype = get_c_type(field.type)
- line = "if (_PyObject_LookupAttr(obj, astmodulestate_global->%s, &tmp) < 0) {"
- self.emit(line % field.name, depth)
- self.emit("return 1;", depth+1)
- self.emit("}", depth)
- if not field.opt:
- self.emit("if (tmp == NULL) {", depth)
- message = "required field \\\"%s\\\" missing from %s" % (field.name, name)
- format = "PyErr_SetString(PyExc_TypeError, \"%s\");"
- self.emit(format % message, depth+1, reflow=False)
- self.emit("return 1;", depth+1)
- else:
- self.emit("if (tmp == NULL || tmp == Py_None) {", depth)
- self.emit("Py_CLEAR(tmp);", depth+1)
- if self.isNumeric(field):
- self.emit("%s = 0;" % field.name, depth+1)
- elif not self.isSimpleType(field):
- self.emit("%s = NULL;" % field.name, depth+1)
- else:
- raise TypeError("could not determine the default value for %s" % field.name)
- self.emit("}", depth)
- self.emit("else {", depth)
-
+ self.emit("if (PyObject_HasAttrString(obj, \"%s\")) {" % field.name, depth)
self.emit("int res;", depth+1)
if field.seq:
self.emit("Py_ssize_t len;", depth+1)
self.emit("Py_ssize_t i;", depth+1)
+ self.emit("tmp = PyObject_GetAttrString(obj, \"%s\");" % field.name, depth+1)
+ self.emit("if (tmp == NULL) goto failed;", depth+1)
+ if field.seq:
self.emit("if (!PyList_Check(tmp)) {", depth+1)
self.emit("PyErr_Format(PyExc_TypeError, \"%s field \\\"%s\\\" must "
- "be a list, not a %%.200s\", _PyType_Name(Py_TYPE(tmp)));" %
+ "be a list, not a %%.200s\", tmp->ob_type->tp_name);" %
(name, field.name),
depth+2, reflow=False)
self.emit("goto failed;", depth+2)
self.emit("}", depth+1)
self.emit("len = PyList_GET_SIZE(tmp);", depth+1)
if self.isSimpleType(field):
- self.emit("%s = _Py_asdl_int_seq_new(len, arena);" % field.name, depth+1)
+ self.emit("%s = asdl_int_seq_new(len, arena);" % field.name, depth+1)
else:
- self.emit("%s = _Py_asdl_seq_new(len, arena);" % field.name, depth+1)
+ self.emit("%s = asdl_seq_new(len, arena);" % field.name, depth+1)
self.emit("if (%s == NULL) goto failed;" % field.name, depth+1)
self.emit("for (i = 0; i < len; i++) {", depth+1)
self.emit("%s val;" % ctype, depth+2)
- self.emit("PyObject *tmp2 = PyList_GET_ITEM(tmp, i);", depth+2)
- self.emit("Py_INCREF(tmp2);", depth+2)
- self.emit("res = obj2ast_%s(tmp2, &val, arena);" %
+ self.emit("res = obj2ast_%s(PyList_GET_ITEM(tmp, i), &val, arena);" %
field.type, depth+2, reflow=False)
- self.emit("Py_DECREF(tmp2);", depth+2)
self.emit("if (res != 0) goto failed;", depth+2)
self.emit("if (len != PyList_GET_SIZE(tmp)) {", depth+2)
self.emit("PyErr_SetString(PyExc_RuntimeError, \"%s field \\\"%s\\\" "
@@ -563,7 +527,21 @@ class Obj2ModVisitor(PickleVisitor):
(field.type, field.name), depth+1)
self.emit("if (res != 0) goto failed;", depth+1)
- self.emit("Py_CLEAR(tmp);", depth+1)
+ self.emit("Py_XDECREF(tmp);", depth+1)
+ self.emit("tmp = NULL;", depth+1)
+ self.emit("} else {", depth)
+ if not field.opt:
+ message = "required field \\\"%s\\\" missing from %s" % (field.name, name)
+ format = "PyErr_SetString(PyExc_TypeError, \"%s\");"
+ self.emit(format % message, depth+1, reflow=False)
+ self.emit("return 1;", depth+1)
+ else:
+ if self.isNumeric(field):
+ self.emit("%s = 0;" % field.name, depth+1)
+ elif not self.isSimpleType(field):
+ self.emit("%s = NULL;" % field.name, depth+1)
+ else:
+ raise TypeError("could not determine the default value for %s" % field.name)
self.emit("}", depth)
@@ -580,46 +558,37 @@ class MarshalPrototypeVisitor(PickleVisitor):
class PyTypesDeclareVisitor(PickleVisitor):
def visitProduct(self, prod, name):
- self.emit_type("%s_type" % name)
+ self.emit("static PyTypeObject *%s_type;" % name, 0)
self.emit("static PyObject* ast2obj_%s(void*);" % name, 0)
- if prod.attributes:
- for a in prod.attributes:
- self.emit_identifier(a.name)
- self.emit("static const char * const %s_attributes[] = {" % name, 0)
- for a in prod.attributes:
- self.emit('"%s",' % a.name, 1)
- self.emit("};", 0)
if prod.fields:
- for f in prod.fields:
- self.emit_identifier(f.name)
- self.emit("static const char * const %s_fields[]={" % name,0)
+ self.emit("static char *%s_fields[]={" % name,0)
for f in prod.fields:
self.emit('"%s",' % f.name, 1)
self.emit("};", 0)
def visitSum(self, sum, name):
- self.emit_type("%s_type" % name)
+ self.emit("static PyTypeObject *%s_type;" % name, 0)
if sum.attributes:
- for a in sum.attributes:
- self.emit_identifier(a.name)
- self.emit("static const char * const %s_attributes[] = {" % name, 0)
+ self.emit("static char *%s_attributes[] = {" % name, 0)
for a in sum.attributes:
self.emit('"%s",' % a.name, 1)
self.emit("};", 0)
ptype = "void*"
if is_simple(sum):
ptype = get_c_type(name)
+ tnames = []
for t in sum.types:
- self.emit_singleton("%s_singleton" % t.name)
+ tnames.append(str(t.name)+"_singleton")
+ tnames = ", *".join(tnames)
+ self.emit("static PyObject *%s;" % tnames, 0)
self.emit("static PyObject* ast2obj_%s(%s);" % (name, ptype), 0)
for t in sum.types:
self.visitConstructor(t, name)
def visitConstructor(self, cons, name):
+ self.emit("static PyTypeObject *%s_type;" % cons.name, 0)
if cons.fields:
- for t in cons.fields:
- self.emit_identifier(t.name)
- self.emit("static const char * const %s_fields[]={" % cons.name, 0)
+ self.emit("static char *%s_fields[]={" % cons.name, 0)
for t in cons.fields:
self.emit('"%s",' % t.name, 1)
self.emit("};",0)
@@ -628,74 +597,43 @@ class PyTypesVisitor(PickleVisitor):
def visitModule(self, mod):
self.emit("""
-
-typedef struct {
- PyObject_HEAD
- PyObject *dict;
-} AST_object;
-
-static void
-ast_dealloc(AST_object *self)
-{
- /* bpo-31095: UnTrack is needed before calling any callbacks */
- PyTypeObject *tp = Py_TYPE(self);
- PyObject_GC_UnTrack(self);
- Py_CLEAR(self->dict);
- freefunc free_func = PyType_GetSlot(tp, Py_tp_free);
- assert(free_func != NULL);
- free_func(self);
- Py_DECREF(tp);
-}
-
-static int
-ast_traverse(AST_object *self, visitproc visit, void *arg)
-{
- Py_VISIT(self->dict);
- return 0;
-}
-
-static int
-ast_clear(AST_object *self)
-{
- Py_CLEAR(self->dict);
- return 0;
-}
-
static int
ast_type_init(PyObject *self, PyObject *args, PyObject *kw)
{
Py_ssize_t i, numfields = 0;
int res = -1;
PyObject *key, *value, *fields;
- if (_PyObject_LookupAttr((PyObject*)Py_TYPE(self), astmodulestate_global->_fields, &fields) < 0) {
- goto cleanup;
- }
+ fields = PyObject_GetAttrString((PyObject*)Py_TYPE(self), "_fields");
+ if (!fields)
+ PyErr_Clear();
if (fields) {
numfields = PySequence_Size(fields);
if (numfields == -1)
goto cleanup;
}
-
res = 0; /* if no error occurs, this stays 0 to the end */
- if (numfields < PyTuple_GET_SIZE(args)) {
- PyErr_Format(PyExc_TypeError, "%.400s constructor takes at most "
- "%zd positional argument%s",
- _PyType_Name(Py_TYPE(self)),
- numfields, numfields == 1 ? "" : "s");
- res = -1;
- goto cleanup;
- }
- for (i = 0; i < PyTuple_GET_SIZE(args); i++) {
- /* cannot be reached when fields is NULL */
- PyObject *name = PySequence_GetItem(fields, i);
- if (!name) {
+ if (PyTuple_GET_SIZE(args) > 0) {
+ if (numfields != PyTuple_GET_SIZE(args)) {
+ PyErr_Format(PyExc_TypeError, "%.400s constructor takes %s"
+ "%zd positional argument%s",
+ Py_TYPE(self)->tp_name,
+ numfields == 0 ? "" : "either 0 or ",
+ numfields, numfields == 1 ? "" : "s");
res = -1;
goto cleanup;
}
- res = PyObject_SetAttr(self, name, PyTuple_GET_ITEM(args, i));
- Py_DECREF(name);
- if (res < 0)
- goto cleanup;
+ for (i = 0; i < PyTuple_GET_SIZE(args); i++) {
+ /* cannot be reached when fields is NULL */
+ PyObject *name = PySequence_GetItem(fields, i);
+ if (!name) {
+ res = -1;
+ goto cleanup;
+ }
+ res = PyObject_SetAttr(self, name, PyTuple_GET_ITEM(args, i));
+ Py_DECREF(name);
+ if (res < 0)
+ goto cleanup;
+ }
}
if (kw) {
i = 0; /* needed by PyDict_Next */
@@ -714,95 +652,105 @@ ast_type_init(PyObject *self, PyObject *args, PyObject *kw)
static PyObject *
ast_type_reduce(PyObject *self, PyObject *unused)
{
- PyObject *dict;
- if (_PyObject_LookupAttr(self, astmodulestate_global->__dict__, &dict) < 0) {
- return NULL;
+ PyObject *res;
+ PyObject *dict = PyObject_GetAttrString(self, "__dict__");
+ if (dict == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_AttributeError))
+ PyErr_Clear();
+ else
+ return NULL;
}
if (dict) {
- return Py_BuildValue("O()N", Py_TYPE(self), dict);
+ res = Py_BuildValue("O()O", Py_TYPE(self), dict);
+ Py_DECREF(dict);
+ return res;
}
return Py_BuildValue("O()", Py_TYPE(self));
}
-static PyMemberDef ast_type_members[] = {
- {"__dictoffset__", T_PYSSIZET, offsetof(AST_object, dict), READONLY},
- {NULL} /* Sentinel */
-};
-
static PyMethodDef ast_type_methods[] = {
{"__reduce__", ast_type_reduce, METH_NOARGS, NULL},
{NULL}
};
-static PyGetSetDef ast_type_getsets[] = {
- {"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict},
- {NULL}
-};
-
-static PyType_Slot AST_type_slots[] = {
- {Py_tp_dealloc, ast_dealloc},
- {Py_tp_getattro, PyObject_GenericGetAttr},
- {Py_tp_setattro, PyObject_GenericSetAttr},
- {Py_tp_traverse, ast_traverse},
- {Py_tp_clear, ast_clear},
- {Py_tp_members, ast_type_members},
- {Py_tp_methods, ast_type_methods},
- {Py_tp_getset, ast_type_getsets},
- {Py_tp_init, ast_type_init},
- {Py_tp_alloc, PyType_GenericAlloc},
- {Py_tp_new, PyType_GenericNew},
- {Py_tp_free, PyObject_GC_Del},
- {0, 0},
-};
-
-static PyType_Spec AST_type_spec = {
+static PyTypeObject AST_type = {
+ PyVarObject_HEAD_INIT(&PyType_Type, 0)
"_ast.AST",
- sizeof(AST_object),
+ sizeof(PyObject),
0,
- Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC,
- AST_type_slots
+ 0, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ PyObject_GenericGetAttr, /* tp_getattro */
+ PyObject_GenericSetAttr, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
+ 0, /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ ast_type_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ (initproc)ast_type_init, /* tp_init */
+ PyType_GenericAlloc, /* tp_alloc */
+ PyType_GenericNew, /* tp_new */
+ PyObject_Del, /* tp_free */
};
-static PyObject *
-make_type(const char *type, PyObject* base, const char* const* fields, int num_fields)
+
+static PyTypeObject* make_type(char *type, PyTypeObject* base, char**fields, int num_fields)
{
PyObject *fnames, *result;
int i;
fnames = PyTuple_New(num_fields);
if (!fnames) return NULL;
for (i = 0; i < num_fields; i++) {
- PyObject *field = PyUnicode_InternFromString(fields[i]);
+ PyObject *field = PyString_FromString(fields[i]);
if (!field) {
Py_DECREF(fnames);
return NULL;
}
PyTuple_SET_ITEM(fnames, i, field);
}
- result = PyObject_CallFunction((PyObject*)&PyType_Type, "s(O){OOOO}",
- type, base,
- astmodulestate_global->_fields, fnames,
- astmodulestate_global->__module__,
- astmodulestate_global->_ast);
+ result = PyObject_CallFunction((PyObject*)&PyType_Type, "s(O){sOss}",
+ type, base, "_fields", fnames, "__module__", "_ast");
Py_DECREF(fnames);
- return result;
+ return (PyTypeObject*)result;
}
-static int
-add_attributes(PyObject *type, const char * const *attrs, int num_fields)
+static int add_attributes(PyTypeObject* type, char**attrs, int num_fields)
{
int i, result;
PyObject *s, *l = PyTuple_New(num_fields);
if (!l)
return 0;
for (i = 0; i < num_fields; i++) {
- s = PyUnicode_InternFromString(attrs[i]);
+ s = PyString_FromString(attrs[i]);
if (!s) {
Py_DECREF(l);
return 0;
}
PyTuple_SET_ITEM(l, i, s);
}
- result = PyObject_SetAttr(type, astmodulestate_global->_attributes, l) >= 0;
+ result = PyObject_SetAttrString((PyObject*)type, "_attributes", l) >= 0;
Py_DECREF(l);
return result;
}
@@ -811,7 +759,7 @@ add_attributes(PyObject *type, const char * const *attrs, int num_fields)
static PyObject* ast2obj_list(asdl_seq *seq, PyObject* (*func)(void*))
{
- Py_ssize_t i, n = asdl_seq_LEN(seq);
+ int i, n = asdl_seq_LEN(seq);
PyObject *result = PyList_New(n);
PyObject *value;
if (!result)
@@ -834,15 +782,16 @@ static PyObject* ast2obj_object(void *o)
Py_INCREF((PyObject*)o);
return (PyObject*)o;
}
-#define ast2obj_singleton ast2obj_object
-#define ast2obj_constant ast2obj_object
#define ast2obj_identifier ast2obj_object
#define ast2obj_string ast2obj_object
-#define ast2obj_bytes ast2obj_object
+static PyObject* ast2obj_bool(bool b)
+{
+ return PyBool_FromLong(b);
+}
static PyObject* ast2obj_int(long b)
{
- return PyLong_FromLong(b);
+ return PyInt_FromLong(b);
}
/* Conversion Python -> AST */
@@ -851,32 +800,18 @@ static int obj2ast_object(PyObject* obj, PyObject** out, PyArena* arena)
{
if (obj == Py_None)
obj = NULL;
- if (obj) {
- if (PyArena_AddPyObject(arena, obj) < 0) {
- *out = NULL;
- return -1;
- }
- Py_INCREF(obj);
- }
- *out = obj;
- return 0;
-}
-
-static int obj2ast_constant(PyObject* obj, PyObject** out, PyArena* arena)
-{
- if (PyArena_AddPyObject(arena, obj) < 0) {
- *out = NULL;
- return -1;
- }
- Py_INCREF(obj);
+ if (obj)
+ PyArena_AddPyObject(arena, obj);
+ Py_XINCREF(obj);
*out = obj;
return 0;
}
static int obj2ast_identifier(PyObject* obj, PyObject** out, PyArena* arena)
{
- if (!PyUnicode_CheckExact(obj) && obj != Py_None) {
- PyErr_SetString(PyExc_TypeError, "AST identifier must be of type str");
+ if (!PyString_CheckExact(obj) && obj != Py_None) {
+ PyErr_Format(PyExc_TypeError,
+ "AST identifier must be of type str");
return 1;
}
return obj2ast_object(obj, out, arena);
@@ -884,8 +819,9 @@ static int obj2ast_identifier(PyObject* obj, PyObject** out, PyArena* arena)
static int obj2ast_string(PyObject* obj, PyObject** out, PyArena* arena)
{
- if (!PyUnicode_CheckExact(obj) && !PyBytes_CheckExact(obj)) {
- PyErr_SetString(PyExc_TypeError, "AST string must be of type str");
+ if (!PyString_CheckExact(obj) && !PyUnicode_CheckExact(obj)) {
+ PyErr_SetString(PyExc_TypeError,
+ "AST string must be of type str or unicode");
return 1;
}
return obj2ast_object(obj, out, arena);
@@ -894,25 +830,47 @@ static int obj2ast_string(PyObject* obj, PyObject** out, PyArena* arena)
static int obj2ast_int(PyObject* obj, int* out, PyArena* arena)
{
int i;
- if (!PyLong_Check(obj)) {
- PyErr_Format(PyExc_ValueError, "invalid integer value: %R", obj);
+ if (!_PyAnyInt_Check(obj)) {
+ PyObject *s = PyObject_Repr(obj);
+ if (s == NULL) return 1;
+ PyErr_Format(PyExc_ValueError, "invalid integer value: %.400s",
+ PyString_AS_STRING(s));
+ Py_DECREF(s);
return 1;
}
- i = _PyLong_AsInt(obj);
+ i = (int)PyLong_AsLong(obj);
if (i == -1 && PyErr_Occurred())
return 1;
*out = i;
return 0;
}
+static int obj2ast_bool(PyObject* obj, bool* out, PyArena* arena)
+{
+ if (!PyBool_Check(obj)) {
+ PyObject *s = PyObject_Repr(obj);
+ if (s == NULL) return 1;
+ PyErr_Format(PyExc_ValueError, "invalid boolean value: %.400s",
+ PyString_AS_STRING(s));
+ Py_DECREF(s);
+ return 1;
+ }
+
+ *out = (obj == Py_True);
+ return 0;
+}
+
static int add_ast_fields(void)
{
- PyObject *empty_tuple;
+ PyObject *empty_tuple, *d;
+ if (PyType_Ready(&AST_type) < 0)
+ return -1;
+ d = AST_type.tp_dict;
empty_tuple = PyTuple_New(0);
if (!empty_tuple ||
- PyObject_SetAttrString(astmodulestate_global->AST_type, "_fields", empty_tuple) < 0 ||
- PyObject_SetAttrString(astmodulestate_global->AST_type, "_attributes", empty_tuple) < 0) {
+ PyDict_SetItemString(d, "_fields", empty_tuple) < 0 ||
+ PyDict_SetItemString(d, "_attributes", empty_tuple) < 0) {
Py_XDECREF(empty_tuple);
return -1;
}
@@ -924,91 +882,71 @@ static int add_ast_fields(void)
self.emit("static int init_types(void)",0)
self.emit("{", 0)
- self.emit("PyObject *m;", 1)
- self.emit("if (PyState_FindModule(&_astmodule) == NULL) {", 1)
- self.emit("m = PyModule_Create(&_astmodule);", 2)
- self.emit("if (!m) return 0;", 2)
- self.emit("PyState_AddModule(m, &_astmodule);", 2)
- self.emit("}", 1)
- self.emit("astmodulestate *state = astmodulestate_global;", 1)
- self.emit("if (state->initialized) return 1;", 1)
- self.emit("if (init_identifiers() < 0) return 0;", 1)
- self.emit("state->AST_type = PyType_FromSpec(&AST_type_spec);", 1)
- self.emit("if (!state->AST_type) return 0;", 1)
+ self.emit("static int initialized;", 1)
+ self.emit("if (initialized) return 1;", 1)
self.emit("if (add_ast_fields() < 0) return 0;", 1)
for dfn in mod.dfns:
self.visit(dfn)
- self.emit("state->initialized = 1;", 1)
+ self.emit("initialized = 1;", 1)
self.emit("return 1;", 1);
self.emit("}", 0)
def visitProduct(self, prod, name):
if prod.fields:
- fields = name+"_fields"
+ fields = name.value+"_fields"
else:
fields = "NULL"
- self.emit('state->%s_type = make_type("%s", state->AST_type, %s, %d);' %
+ self.emit('%s_type = make_type("%s", &AST_type, %s, %d);' %
(name, name, fields, len(prod.fields)), 1)
- self.emit("if (!state->%s_type) return 0;" % name, 1)
- self.emit_type("AST_type")
- self.emit_type("%s_type" % name)
- if prod.attributes:
- self.emit("if (!add_attributes(state->%s_type, %s_attributes, %d)) return 0;" %
- (name, name, len(prod.attributes)), 1)
- else:
- self.emit("if (!add_attributes(state->%s_type, NULL, 0)) return 0;" % name, 1)
+ self.emit("if (!%s_type) return 0;" % name, 1)
def visitSum(self, sum, name):
- self.emit('state->%s_type = make_type("%s", state->AST_type, NULL, 0);' %
+ self.emit('%s_type = make_type("%s", &AST_type, NULL, 0);' %
(name, name), 1)
- self.emit_type("%s_type" % name)
- self.emit("if (!state->%s_type) return 0;" % name, 1)
+ self.emit("if (!%s_type) return 0;" % name, 1)
if sum.attributes:
- self.emit("if (!add_attributes(state->%s_type, %s_attributes, %d)) return 0;" %
+ self.emit("if (!add_attributes(%s_type, %s_attributes, %d)) return 0;" %
(name, name, len(sum.attributes)), 1)
else:
- self.emit("if (!add_attributes(state->%s_type, NULL, 0)) return 0;" % name, 1)
+ self.emit("if (!add_attributes(%s_type, NULL, 0)) return 0;" % name, 1)
simple = is_simple(sum)
for t in sum.types:
self.visitConstructor(t, name, simple)
def visitConstructor(self, cons, name, simple):
if cons.fields:
- fields = cons.name+"_fields"
+ fields = cons.name.value+"_fields"
else:
fields = "NULL"
- self.emit('state->%s_type = make_type("%s", state->%s_type, %s, %d);' %
+ self.emit('%s_type = make_type("%s", %s_type, %s, %d);' %
(cons.name, cons.name, name, fields, len(cons.fields)), 1)
- self.emit("if (!state->%s_type) return 0;" % cons.name, 1)
- self.emit_type("%s_type" % cons.name)
+ self.emit("if (!%s_type) return 0;" % cons.name, 1)
if simple:
- self.emit("state->%s_singleton = PyType_GenericNew((PyTypeObject *)"
- "state->%s_type, NULL, NULL);" %
+ self.emit("%s_singleton = PyType_GenericNew(%s_type, NULL, NULL);" %
(cons.name, cons.name), 1)
- self.emit("if (!state->%s_singleton) return 0;" % cons.name, 1)
+ self.emit("if (!%s_singleton) return 0;" % cons.name, 1)
class ASTModuleVisitor(PickleVisitor):
def visitModule(self, mod):
self.emit("PyMODINIT_FUNC", 0)
- self.emit("PyInit__ast(void)", 0)
+ self.emit("init_ast(void)", 0)
self.emit("{", 0)
- self.emit("PyObject *m;", 1)
- self.emit("if (!init_types()) return NULL;", 1)
- self.emit('m = PyState_FindModule(&_astmodule);', 1)
- self.emit("if (!m) return NULL;", 1)
- self.emit('Py_INCREF(astmodulestate(m)->AST_type);', 1)
- self.emit('if (PyModule_AddObject(m, "AST", astmodulestate_global->AST_type) < 0) return NULL;', 1)
- self.emit('if (PyModule_AddIntMacro(m, PyCF_ALLOW_TOP_LEVEL_AWAIT) < 0)', 1)
- self.emit("return NULL;", 2)
- self.emit('if (PyModule_AddIntMacro(m, PyCF_ONLY_AST) < 0)', 1)
- self.emit("return NULL;", 2)
- self.emit('if (PyModule_AddIntMacro(m, PyCF_TYPE_COMMENTS) < 0)', 1)
- self.emit("return NULL;", 2)
+ self.emit("PyObject *m, *d;", 1)
+ self.emit("if (!init_types()) return;", 1)
+ self.emit('m = Py_InitModule3("_ast", NULL, NULL);', 1)
+ self.emit("if (!m) return;", 1)
+ self.emit("d = PyModule_GetDict(m);", 1)
+ self.emit('if (PyDict_SetItemString(d, "AST", (PyObject*)&AST_type) < 0) return;', 1)
+ self.emit('if (PyModule_AddIntConstant(m, "PyCF_ONLY_AST", PyCF_ONLY_AST) < 0)', 1)
+ self.emit("return;", 2)
+ # Value of version: "$Revision$"
+ self.emit('if (PyModule_AddStringConstant(m, "__version__", "%s") < 0)'
+ % mod.version, 1)
+ self.emit("return;", 2)
for dfn in mod.dfns:
self.visit(dfn)
- self.emit("return m;", 1)
self.emit("}", 0)
def visitProduct(self, prod, name):
@@ -1023,9 +961,7 @@ class ASTModuleVisitor(PickleVisitor):
self.addObj(cons.name)
def addObj(self, name):
- self.emit("if (PyModule_AddObject(m, \"%s\", "
- "astmodulestate_global->%s_type) < 0) return NULL;" % (name, name), 1)
- self.emit("Py_INCREF(astmodulestate(m)->%s_type);" % name, 1)
+ self.emit('if (PyDict_SetItemString(d, "%s", (PyObject*)%s_type) < 0) return;' % (name, name), 1)
_SPECIALIZED_SEQUENCES = ('stmt', 'expr')
@@ -1063,9 +999,9 @@ class ObjVisitor(PickleVisitor):
self.emit("{", 0)
self.emit("%s o = (%s)_o;" % (ctype, ctype), 1)
self.emit("PyObject *result = NULL, *value = NULL;", 1)
- self.emit("PyTypeObject *tp;", 1)
self.emit('if (!o) {', 1)
- self.emit("Py_RETURN_NONE;", 2)
+ self.emit("Py_INCREF(Py_None);", 2)
+ self.emit('return Py_None;', 2)
self.emit("}", 1)
self.emit('', 0)
@@ -1091,7 +1027,7 @@ class ObjVisitor(PickleVisitor):
for a in sum.attributes:
self.emit("value = ast2obj_%s(o->%s);" % (a.type, a.name), 1)
self.emit("if (!value) goto failed;", 1)
- self.emit('if (PyObject_SetAttr(result, astmodulestate_global->%s, value) < 0)' % a.name, 1)
+ self.emit('if (PyObject_SetAttrString(result, "%s", value) < 0)' % a.name, 1)
self.emit('goto failed;', 2)
self.emit('Py_DECREF(value);', 1)
self.func_end()
@@ -1102,8 +1038,8 @@ class ObjVisitor(PickleVisitor):
self.emit("switch(o) {", 1)
for t in sum.types:
self.emit("case %s:" % t.name, 2)
- self.emit("Py_INCREF(astmodulestate_global->%s_singleton);" % t.name, 3)
- self.emit("return astmodulestate_global->%s_singleton;" % t.name, 3)
+ self.emit("Py_INCREF(%s_singleton);" % t.name, 3)
+ self.emit("return %s_singleton;" % t.name, 3)
self.emit("default:", 2)
self.emit('/* should never happen, but just in case ... */', 3)
code = "PyErr_Format(PyExc_SystemError, \"unknown %s found\");" % name
@@ -1114,23 +1050,15 @@ class ObjVisitor(PickleVisitor):
def visitProduct(self, prod, name):
self.func_begin(name)
- self.emit("tp = (PyTypeObject *)astmodulestate_global->%s_type;" % name, 1)
- self.emit("result = PyType_GenericNew(tp, NULL, NULL);", 1);
+ self.emit("result = PyType_GenericNew(%s_type, NULL, NULL);" % name, 1);
self.emit("if (!result) return NULL;", 1)
for field in prod.fields:
self.visitField(field, name, 1, True)
- for a in prod.attributes:
- self.emit("value = ast2obj_%s(o->%s);" % (a.type, a.name), 1)
- self.emit("if (!value) goto failed;", 1)
- self.emit("if (PyObject_SetAttr(result, astmodulestate_global->%s, value) < 0)" % a.name, 1)
- self.emit('goto failed;', 2)
- self.emit('Py_DECREF(value);', 1)
self.func_end()
def visitConstructor(self, cons, enum, name):
self.emit("case %s_kind:" % cons.name, 1)
- self.emit("tp = (PyTypeObject *)astmodulestate_global->%s_type;" % cons.name, 2)
- self.emit("result = PyType_GenericNew(tp, NULL, NULL);", 2);
+ self.emit("result = PyType_GenericNew(%s_type, NULL, NULL);" % cons.name, 2);
self.emit("if (!result) goto failed;", 2)
for f in cons.fields:
self.visitField(f, cons.name, 2, False)
@@ -1145,7 +1073,7 @@ class ObjVisitor(PickleVisitor):
value = "o->v.%s.%s" % (name, field.name)
self.set(field, value, depth)
emit("if (!value) goto failed;", 0)
- emit("if (PyObject_SetAttr(result, astmodulestate_global->%s, value) == -1)" % field.name, 0)
+ emit('if (PyObject_SetAttrString(result, "%s", value) == -1)' % field.name, 0)
emit("goto failed;", 1)
emit("Py_DECREF(value);", 0)
@@ -1164,11 +1092,11 @@ class ObjVisitor(PickleVisitor):
def set(self, field, value, depth):
if field.seq:
# XXX should really check for is_simple, but that requires a symbol table
- if field.type == "cmpop":
+ if field.type.value == "cmpop":
# While the sequence elements are stored as void*,
# ast2obj_cmpop expects an enum
self.emit("{", depth)
- self.emit("Py_ssize_t i, n = asdl_seq_LEN(%s);" % value, depth+1)
+ self.emit("int i, n = asdl_seq_LEN(%s);" % value, depth+1)
self.emit("value = PyList_New(n);", depth+1)
self.emit("if (!value) goto failed;", depth+1)
self.emit("for(i = 0; i < n; i++)", depth+1)
@@ -1188,41 +1116,38 @@ class PartingShots(StaticVisitor):
CODE = """
PyObject* PyAST_mod2obj(mod_ty t)
{
- if (!init_types())
- return NULL;
+ init_types();
return ast2obj_mod(t);
}
/* mode is 0 for "exec", 1 for "eval" and 2 for "single" input */
mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode)
{
+ mod_ty res;
PyObject *req_type[3];
- const char * const req_name[] = {"Module", "Expression", "Interactive"};
+ char *req_name[3];
int isinstance;
- if (PySys_Audit("compile", "OO", ast, Py_None) < 0) {
- return NULL;
- }
+ req_type[0] = (PyObject*)Module_type;
+ req_type[1] = (PyObject*)Expression_type;
+ req_type[2] = (PyObject*)Interactive_type;
- req_type[0] = astmodulestate_global->Module_type;
- req_type[1] = astmodulestate_global->Expression_type;
- req_type[2] = astmodulestate_global->Interactive_type;
+ req_name[0] = "Module";
+ req_name[1] = "Expression";
+ req_name[2] = "Interactive";
assert(0 <= mode && mode <= 2);
- if (!init_types())
- return NULL;
+ init_types();
isinstance = PyObject_IsInstance(ast, req_type[mode]);
if (isinstance == -1)
return NULL;
if (!isinstance) {
PyErr_Format(PyExc_TypeError, "expected %s node, got %.400s",
- req_name[mode], _PyType_Name(Py_TYPE(ast)));
+ req_name[mode], Py_TYPE(ast)->tp_name);
return NULL;
}
-
- mod_ty res = NULL;
if (obj2ast_mod(ast, &res, arena) != 0)
return NULL;
else
@@ -1231,9 +1156,8 @@ mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode)
int PyAST_Check(PyObject* obj)
{
- if (!init_types())
- return -1;
- return PyObject_IsInstance(obj, astmodulestate_global->AST_type);
+ init_types();
+ return PyObject_IsInstance(obj, (PyObject*)&AST_type);
}
"""
@@ -1246,172 +1170,81 @@ class ChainOfVisitors:
v.visit(object)
v.emit("", 0)
+common_msg = "/* File automatically generated by %s. */\n\n"
-def generate_module_def(f, mod):
- # Gather all the data needed for ModuleSpec
- visitor_list = set()
- with open(os.devnull, "w") as devnull:
- visitor = PyTypesDeclareVisitor(devnull)
- visitor.visit(mod)
- visitor_list.add(visitor)
- visitor = PyTypesVisitor(devnull)
- visitor.visit(mod)
- visitor_list.add(visitor)
-
- state_strings = set(["__dict__", "_attributes", "_fields", "__module__", "_ast"])
- module_state = set(["__dict__", "_attributes", "_fields", "__module__", "_ast"])
- for visitor in visitor_list:
- for identifier in visitor.identifiers:
- module_state.add(identifier)
- state_strings.add(identifier)
- for singleton in visitor.singletons:
- module_state.add(singleton)
- for tp in visitor.types:
- module_state.add(tp)
- state_strings = sorted(state_strings)
- module_state = sorted(module_state)
- f.write('typedef struct {\n')
- f.write(' int initialized;\n')
- for s in module_state:
- f.write(' PyObject *' + s + ';\n')
- f.write('} astmodulestate;\n\n')
- f.write("""
-#define astmodulestate(o) ((astmodulestate *)PyModule_GetState(o))
-
-static int astmodule_clear(PyObject *module)
-{
-""")
- for s in module_state:
- f.write(" Py_CLEAR(astmodulestate(module)->" + s + ');\n')
- f.write("""
- return 0;
-}
-
-static int astmodule_traverse(PyObject *module, visitproc visit, void* arg)
-{
-""")
- for s in module_state:
- f.write(" Py_VISIT(astmodulestate(module)->" + s + ');\n')
- f.write("""
- return 0;
-}
-
-static void astmodule_free(void* module) {
- astmodule_clear((PyObject*)module);
-}
-
-static struct PyModuleDef _astmodule = {
- PyModuleDef_HEAD_INIT,
- "_ast",
- NULL,
- sizeof(astmodulestate),
- NULL,
- NULL,
- astmodule_traverse,
- astmodule_clear,
- astmodule_free,
-};
-
-#define astmodulestate_global ((astmodulestate *)PyModule_GetState(PyState_FindModule(&_astmodule)))
-
-""")
- f.write('static int init_identifiers(void)\n')
- f.write('{\n')
- f.write(' astmodulestate *state = astmodulestate_global;\n')
- for identifier in state_strings:
- f.write(' if ((state->' + identifier)
- f.write(' = PyUnicode_InternFromString("')
- f.write(identifier + '")) == NULL) return 0;\n')
- f.write(' return 1;\n')
- f.write('};\n\n')
+c_file_msg = """
+/*
+ __version__ %s.
+ This module must be committed separately after each AST grammar change;
+ The __version__ number is set to the revision number of the commit
+ containing the grammar change.
+*/
-common_msg = "/* File automatically generated by %s. */\n\n"
+"""
-def main(srcfile, dump_module=False):
+def main(srcfile):
argv0 = sys.argv[0]
components = argv0.split(os.sep)
argv0 = os.sep.join(components[-2:])
auto_gen_msg = common_msg % argv0
mod = asdl.parse(srcfile)
- if dump_module:
- print('Parsed Module:')
- print(mod)
+ mod.version = "82160"
if not asdl.check(mod):
sys.exit(1)
- if H_FILE:
- with open(H_FILE, "w") as f:
- f.write(auto_gen_msg)
- f.write('#ifndef Py_PYTHON_AST_H\n')
- f.write('#define Py_PYTHON_AST_H\n')
- f.write('#ifdef __cplusplus\n')
- f.write('extern "C" {\n')
- f.write('#endif\n')
- f.write('\n')
- f.write('#ifndef Py_LIMITED_API\n')
- f.write('#include "asdl.h"\n')
- f.write('\n')
- f.write('#undef Yield /* undefine macro conflicting with <winbase.h> */\n')
- f.write('\n')
- c = ChainOfVisitors(TypeDefVisitor(f),
- StructVisitor(f))
-
- c.visit(mod)
- f.write("// Note: these macros affect function definitions, not only call sites.\n")
- PrototypeVisitor(f).visit(mod)
- f.write("\n")
- f.write("PyObject* PyAST_mod2obj(mod_ty t);\n")
- f.write("mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode);\n")
- f.write("int PyAST_Check(PyObject* obj);\n")
- f.write("#endif /* !Py_LIMITED_API */\n")
- f.write('\n')
- f.write('#ifdef __cplusplus\n')
- f.write('}\n')
- f.write('#endif\n')
- f.write('#endif /* !Py_PYTHON_AST_H */\n')
-
- if C_FILE:
- with open(C_FILE, "w") as f:
- f.write(auto_gen_msg)
- f.write('#include <stddef.h>\n')
- f.write('\n')
- f.write('#include "Python.h"\n')
- f.write('#include "%s-ast.h"\n' % mod.name)
- f.write('#include "structmember.h"\n')
- f.write('\n')
-
- generate_module_def(f, mod)
-
- v = ChainOfVisitors(
- PyTypesDeclareVisitor(f),
- PyTypesVisitor(f),
- Obj2ModPrototypeVisitor(f),
- FunctionVisitor(f),
- ObjVisitor(f),
- Obj2ModVisitor(f),
- ASTModuleVisitor(f),
- PartingShots(f),
- )
- v.visit(mod)
+ if INC_DIR:
+ p = "%s/%s-ast.h" % (INC_DIR, mod.name)
+ f = open(p, "wb")
+ f.write(auto_gen_msg)
+ f.write('#include "asdl.h"\n\n')
+ c = ChainOfVisitors(TypeDefVisitor(f),
+ StructVisitor(f),
+ PrototypeVisitor(f),
+ )
+ c.visit(mod)
+ f.write("PyObject* PyAST_mod2obj(mod_ty t);\n")
+ f.write("mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode);\n")
+ f.write("int PyAST_Check(PyObject* obj);\n")
+ f.close()
+
+ if SRC_DIR:
+ p = os.path.join(SRC_DIR, str(mod.name) + "-ast.c")
+ f = open(p, "wb")
+ f.write(auto_gen_msg)
+ f.write(c_file_msg % mod.version)
+ f.write('#include "Python.h"\n')
+ f.write('#include "%s-ast.h"\n' % mod.name)
+ f.write('\n')
+ f.write("static PyTypeObject AST_type;\n")
+ v = ChainOfVisitors(
+ PyTypesDeclareVisitor(f),
+ PyTypesVisitor(f),
+ Obj2ModPrototypeVisitor(f),
+ FunctionVisitor(f),
+ ObjVisitor(f),
+ Obj2ModVisitor(f),
+ ASTModuleVisitor(f),
+ PartingShots(f),
+ )
+ v.visit(mod)
+ f.close()
if __name__ == "__main__":
+ import sys
import getopt
- H_FILE = ''
- C_FILE = ''
- dump_module = False
- opts, args = getopt.getopt(sys.argv[1:], "dh:c:")
+ INC_DIR = ''
+ SRC_DIR = ''
+ opts, args = getopt.getopt(sys.argv[1:], "h:c:")
+ if len(opts) != 1:
+ print "Must specify exactly one output file"
+ sys.exit(1)
for o, v in opts:
if o == '-h':
- H_FILE = v
- elif o == '-c':
- C_FILE = v
- elif o == '-d':
- dump_module = True
- if H_FILE and C_FILE:
- print('Must specify exactly one output file')
- sys.exit(1)
- elif len(args) != 1:
- print('Must specify single input file')
+ INC_DIR = v
+ if o == '-c':
+ SRC_DIR = v
+ if len(args) != 1:
+ print "Must specify single input file"
sys.exit(1)
- main(args[0], dump_module)
+ main(args[0])
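Reviewer note: the generator change above replaces the per-module-state bookkeeping with a plain static flag inside the emitted init_types() (see the "static int initialized;" lines it now emits), and init_types() is re-invoked from PyAST_mod2obj(), PyAST_obj2mod() and PyAST_Check(). A minimal standalone sketch of that one-time-initialization pattern, with a toy do_init() standing in for the generated function (illustrative only, not generator output):

    #include <stdio.h>

    /* Static-flag lazy initialization, as in the emitted init_types(). */
    static int do_init(void)
    {
        static int initialized;
        if (initialized)
            return 1;              /* later calls are no-ops */
        puts("initializing types once");
        initialized = 1;
        return 1;
    }

    int main(void)
    {
        do_init();                 /* prints the message */
        do_init();                 /* second call returns immediately */
        return 0;
    }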
diff --git a/Parser/bitset.c b/Parser/bitset.c
new file mode 100644
index 0000000..f5bfd41
--- /dev/null
+++ b/Parser/bitset.c
@@ -0,0 +1,66 @@
+
+/* Bitset primitives used by the parser generator */
+
+#include "pgenheaders.h"
+#include "bitset.h"
+
+bitset
+newbitset(int nbits)
+{
+ int nbytes = NBYTES(nbits);
+ bitset ss = (char *)PyObject_MALLOC(sizeof(BYTE) * nbytes);
+
+ if (ss == NULL)
+ Py_FatalError("no mem for bitset");
+
+ ss += nbytes;
+ while (--nbytes >= 0)
+ *--ss = 0;
+ return ss;
+}
+
+void
+delbitset(bitset ss)
+{
+ PyObject_FREE(ss);
+}
+
+int
+addbit(bitset ss, int ibit)
+{
+ int ibyte = BIT2BYTE(ibit);
+ BYTE mask = BIT2MASK(ibit);
+
+ if (ss[ibyte] & mask)
+ return 0; /* Bit already set */
+ ss[ibyte] |= mask;
+ return 1;
+}
+
+#if 0 /* Now a macro */
+int
+testbit(bitset ss, int ibit)
+{
+ return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0;
+}
+#endif
+
+int
+samebitset(bitset ss1, bitset ss2, int nbits)
+{
+ int i;
+
+ for (i = NBYTES(nbits); --i >= 0; )
+ if (*ss1++ != *ss2++)
+ return 0;
+ return 1;
+}
+
+void
+mergebitset(bitset ss1, bitset ss2, int nbits)
+{
+ int i;
+
+ for (i = NBYTES(nbits); --i >= 0; )
+ *ss1++ |= *ss2++;
+}
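For reference, a standalone sketch of the byte-addressed bit arithmetic that newbitset()/addbit()/testbit() rely on. The macros are redeclared locally here with the conventional definitions (the real ones live in the pgen headers), so this compiles on its own:

    #include <stdio.h>

    #define BITSPERBYTE 8
    #define NBYTES(nbits)   (((nbits) + BITSPERBYTE - 1) / BITSPERBYTE)
    #define BIT2BYTE(ibit)  ((ibit) / BITSPERBYTE)
    #define BIT2MASK(ibit)  (1 << ((ibit) % BITSPERBYTE))

    int main(void)
    {
        unsigned char set[NBYTES(19)] = {0};   /* 19 bits -> 3 bytes, zeroed */
        set[BIT2BYTE(10)] |= BIT2MASK(10);     /* addbit(set, 10) */
        int has10 = (set[BIT2BYTE(10)] & BIT2MASK(10)) != 0;   /* testbit */
        int has11 = (set[BIT2BYTE(11)] & BIT2MASK(11)) != 0;
        printf("%d %d\n", has10, has11);       /* prints: 1 0 */
        return 0;
    }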
diff --git a/Parser/firstsets.c b/Parser/firstsets.c
new file mode 100644
index 0000000..ee75d1b
--- /dev/null
+++ b/Parser/firstsets.c
@@ -0,0 +1,113 @@
+
+/* Computation of FIRST sets */
+
+#include "pgenheaders.h"
+#include "grammar.h"
+#include "token.h"
+
+extern int Py_DebugFlag;
+
+/* Forward */
+static void calcfirstset(grammar *, dfa *);
+
+void
+addfirstsets(grammar *g)
+{
+ int i;
+ dfa *d;
+
+ if (Py_DebugFlag)
+ printf("Adding FIRST sets ...\n");
+ for (i = 0; i < g->g_ndfas; i++) {
+ d = &g->g_dfa[i];
+ if (d->d_first == NULL)
+ calcfirstset(g, d);
+ }
+}
+
+static void
+calcfirstset(grammar *g, dfa *d)
+{
+ int i, j;
+ state *s;
+ arc *a;
+ int nsyms;
+ int *sym;
+ int nbits;
+ static bitset dummy;
+ bitset result;
+ int type;
+ dfa *d1;
+ label *l0;
+
+ if (Py_DebugFlag)
+ printf("Calculate FIRST set for '%s'\n", d->d_name);
+
+ if (dummy == NULL)
+ dummy = newbitset(1);
+ if (d->d_first == dummy) {
+ fprintf(stderr, "Left-recursion for '%s'\n", d->d_name);
+ return;
+ }
+ if (d->d_first != NULL) {
+ fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n",
+ d->d_name);
+ }
+ d->d_first = dummy;
+
+ l0 = g->g_ll.ll_label;
+ nbits = g->g_ll.ll_nlabels;
+ result = newbitset(nbits);
+
+ sym = (int *)PyObject_MALLOC(sizeof(int));
+ if (sym == NULL)
+ Py_FatalError("no mem for new sym in calcfirstset");
+ nsyms = 1;
+ sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL);
+
+ s = &d->d_state[d->d_initial];
+ for (i = 0; i < s->s_narcs; i++) {
+ a = &s->s_arc[i];
+ for (j = 0; j < nsyms; j++) {
+ if (sym[j] == a->a_lbl)
+ break;
+ }
+ if (j >= nsyms) { /* New label */
+ sym = (int *)PyObject_REALLOC(sym,
+ sizeof(int) * (nsyms + 1));
+ if (sym == NULL)
+ Py_FatalError(
+ "no mem to resize sym in calcfirstset");
+ sym[nsyms++] = a->a_lbl;
+ type = l0[a->a_lbl].lb_type;
+ if (ISNONTERMINAL(type)) {
+ d1 = PyGrammar_FindDFA(g, type);
+ if (d1->d_first == dummy) {
+ fprintf(stderr,
+ "Left-recursion below '%s'\n",
+ d->d_name);
+ }
+ else {
+ if (d1->d_first == NULL)
+ calcfirstset(g, d1);
+ mergebitset(result,
+ d1->d_first, nbits);
+ }
+ }
+ else if (ISTERMINAL(type)) {
+ addbit(result, a->a_lbl);
+ }
+ }
+ }
+ d->d_first = result;
+ if (Py_DebugFlag) {
+ printf("FIRST set for '%s': {", d->d_name);
+ for (i = 0; i < nbits; i++) {
+ if (testbit(result, i))
+ printf(" %s", PyGrammar_LabelRepr(&l0[i]));
+ }
+ printf(" }\n");
+ }
+
+ PyObject_FREE(sym);
+}
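The core of calcfirstset() is the merge step: when an alternative starts with a non-terminal, that non-terminal's FIRST set is OR-ed into the result (mergebitset); when it starts with a terminal, a single bit is added (addbit). A tiny self-contained illustration of that idea using plain bitmasks and a made-up two-rule grammar (not pgen structures):

    #include <stdio.h>

    /* Toy grammar: S -> A 'c' | 'd' ;  A -> 'a' | 'b'
       Terminals a,b,c,d get bits 0..3; FIRST sets are bitmasks. */
    enum { T_a = 1 << 0, T_b = 1 << 1, T_c = 1 << 2, T_d = 1 << 3 };

    int main(void)
    {
        unsigned first_A = T_a | T_b;          /* A's alternatives start with 'a' or 'b' */
        unsigned first_S = T_d;                /* alternative starting with terminal 'd' */
        first_S |= first_A;                    /* alternative starting with A: merge FIRST(A),
                                                  as mergebitset() does */
        printf("FIRST(S) = 0x%x\n", first_S);  /* 0xb: {a, b, d} */
        return 0;
    }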
diff --git a/Parser/grammar.c b/Parser/grammar.c
new file mode 100644
index 0000000..fcd2219
--- /dev/null
+++ b/Parser/grammar.c
@@ -0,0 +1,272 @@
+
+/* Grammar implementation */
+
+#include "Python.h"
+#include "pgenheaders.h"
+
+#include <ctype.h>
+
+#include "token.h"
+#include "grammar.h"
+
+#ifdef RISCOS
+#include <unixlib.h>
+#endif
+
+extern int Py_DebugFlag;
+
+grammar *
+newgrammar(int start)
+{
+ grammar *g;
+
+ g = (grammar *)PyObject_MALLOC(sizeof(grammar));
+ if (g == NULL)
+ Py_FatalError("no mem for new grammar");
+ g->g_ndfas = 0;
+ g->g_dfa = NULL;
+ g->g_start = start;
+ g->g_ll.ll_nlabels = 0;
+ g->g_ll.ll_label = NULL;
+ g->g_accel = 0;
+ return g;
+}
+
+void
+freegrammar(grammar *g)
+{
+ int i;
+ for (i = 0; i < g->g_ndfas; i++) {
+ int j;
+ free(g->g_dfa[i].d_name);
+ for (j = 0; j < g->g_dfa[i].d_nstates; j++)
+ PyObject_FREE(g->g_dfa[i].d_state[j].s_arc);
+ PyObject_FREE(g->g_dfa[i].d_state);
+ }
+ PyObject_FREE(g->g_dfa);
+ for (i = 0; i < g->g_ll.ll_nlabels; i++)
+ free(g->g_ll.ll_label[i].lb_str);
+ PyObject_FREE(g->g_ll.ll_label);
+ PyObject_FREE(g);
+}
+
+dfa *
+adddfa(grammar *g, int type, char *name)
+{
+ dfa *d;
+
+ g->g_dfa = (dfa *)PyObject_REALLOC(g->g_dfa,
+ sizeof(dfa) * (g->g_ndfas + 1));
+ if (g->g_dfa == NULL)
+ Py_FatalError("no mem to resize dfa in adddfa");
+ d = &g->g_dfa[g->g_ndfas++];
+ d->d_type = type;
+ d->d_name = strdup(name);
+ d->d_nstates = 0;
+ d->d_state = NULL;
+ d->d_initial = -1;
+ d->d_first = NULL;
+ return d; /* Only use while fresh! */
+}
+
+int
+addstate(dfa *d)
+{
+ state *s;
+
+ d->d_state = (state *)PyObject_REALLOC(d->d_state,
+ sizeof(state) * (d->d_nstates + 1));
+ if (d->d_state == NULL)
+ Py_FatalError("no mem to resize state in addstate");
+ s = &d->d_state[d->d_nstates++];
+ s->s_narcs = 0;
+ s->s_arc = NULL;
+ s->s_lower = 0;
+ s->s_upper = 0;
+ s->s_accel = NULL;
+ s->s_accept = 0;
+ return s - d->d_state;
+}
+
+void
+addarc(dfa *d, int from, int to, int lbl)
+{
+ state *s;
+ arc *a;
+
+ assert(0 <= from && from < d->d_nstates);
+ assert(0 <= to && to < d->d_nstates);
+
+ s = &d->d_state[from];
+ s->s_arc = (arc *)PyObject_REALLOC(s->s_arc, sizeof(arc) * (s->s_narcs + 1));
+ if (s->s_arc == NULL)
+ Py_FatalError("no mem to resize arc list in addarc");
+ a = &s->s_arc[s->s_narcs++];
+ a->a_lbl = lbl;
+ a->a_arrow = to;
+}
+
+int
+addlabel(labellist *ll, int type, char *str)
+{
+ int i;
+ label *lb;
+
+ for (i = 0; i < ll->ll_nlabels; i++) {
+ if (ll->ll_label[i].lb_type == type &&
+ strcmp(ll->ll_label[i].lb_str, str) == 0)
+ return i;
+ }
+ ll->ll_label = (label *)PyObject_REALLOC(ll->ll_label,
+ sizeof(label) * (ll->ll_nlabels + 1));
+ if (ll->ll_label == NULL)
+ Py_FatalError("no mem to resize labellist in addlabel");
+ lb = &ll->ll_label[ll->ll_nlabels++];
+ lb->lb_type = type;
+ lb->lb_str = strdup(str);
+ if (Py_DebugFlag)
+ printf("Label @ %8p, %d: %s\n", ll, ll->ll_nlabels,
+ PyGrammar_LabelRepr(lb));
+ return lb - ll->ll_label;
+}
+
+/* Same, but rather dies than adds */
+
+int
+findlabel(labellist *ll, int type, char *str)
+{
+ int i;
+
+ for (i = 0; i < ll->ll_nlabels; i++) {
+ if (ll->ll_label[i].lb_type == type /*&&
+ strcmp(ll->ll_label[i].lb_str, str) == 0*/)
+ return i;
+ }
+ fprintf(stderr, "Label %d/'%s' not found\n", type, str);
+ Py_FatalError("grammar.c:findlabel()");
+ return 0; /* Make gcc -Wall happy */
+}
+
+/* Forward */
+static void translabel(grammar *, label *);
+
+void
+translatelabels(grammar *g)
+{
+ int i;
+
+#ifdef Py_DEBUG
+ printf("Translating labels ...\n");
+#endif
+ /* Don't translate EMPTY */
+ for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++)
+ translabel(g, &g->g_ll.ll_label[i]);
+}
+
+static void
+translabel(grammar *g, label *lb)
+{
+ int i;
+
+ if (Py_DebugFlag)
+ printf("Translating label %s ...\n", PyGrammar_LabelRepr(lb));
+
+ if (lb->lb_type == NAME) {
+ for (i = 0; i < g->g_ndfas; i++) {
+ if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) {
+ if (Py_DebugFlag)
+ printf(
+ "Label %s is non-terminal %d.\n",
+ lb->lb_str,
+ g->g_dfa[i].d_type);
+ lb->lb_type = g->g_dfa[i].d_type;
+ free(lb->lb_str);
+ lb->lb_str = NULL;
+ return;
+ }
+ }
+ for (i = 0; i < (int)N_TOKENS; i++) {
+ if (strcmp(lb->lb_str, _PyParser_TokenNames[i]) == 0) {
+ if (Py_DebugFlag)
+ printf("Label %s is terminal %d.\n",
+ lb->lb_str, i);
+ lb->lb_type = i;
+ free(lb->lb_str);
+ lb->lb_str = NULL;
+ return;
+ }
+ }
+ printf("Can't translate NAME label '%s'\n", lb->lb_str);
+ return;
+ }
+
+ if (lb->lb_type == STRING) {
+ if (isalpha(Py_CHARMASK(lb->lb_str[1])) ||
+ lb->lb_str[1] == '_') {
+ char *p;
+ char *src;
+ char *dest;
+ size_t name_len;
+ if (Py_DebugFlag)
+ printf("Label %s is a keyword\n", lb->lb_str);
+ lb->lb_type = NAME;
+ src = lb->lb_str + 1;
+ p = strchr(src, '\'');
+ if (p)
+ name_len = p - src;
+ else
+ name_len = strlen(src);
+ dest = (char *)malloc(name_len + 1);
+ if (!dest) {
+ printf("Can't alloc dest '%s'\n", src);
+ return;
+ }
+ strncpy(dest, src, name_len);
+ dest[name_len] = '\0';
+ free(lb->lb_str);
+ lb->lb_str = dest;
+ }
+ else if (lb->lb_str[2] == lb->lb_str[0]) {
+ int type = (int) PyToken_OneChar(lb->lb_str[1]);
+ if (type != OP) {
+ lb->lb_type = type;
+ free(lb->lb_str);
+ lb->lb_str = NULL;
+ }
+ else
+ printf("Unknown OP label %s\n",
+ lb->lb_str);
+ }
+ else if (lb->lb_str[2] && lb->lb_str[3] == lb->lb_str[0]) {
+ int type = (int) PyToken_TwoChars(lb->lb_str[1],
+ lb->lb_str[2]);
+ if (type != OP) {
+ lb->lb_type = type;
+ free(lb->lb_str);
+ lb->lb_str = NULL;
+ }
+ else
+ printf("Unknown OP label %s\n",
+ lb->lb_str);
+ }
+ else if (lb->lb_str[2] && lb->lb_str[3] && lb->lb_str[4] == lb->lb_str[0]) {
+ int type = (int) PyToken_ThreeChars(lb->lb_str[1],
+ lb->lb_str[2],
+ lb->lb_str[3]);
+ if (type != OP) {
+ lb->lb_type = type;
+ free(lb->lb_str);
+ lb->lb_str = NULL;
+ }
+ else
+ printf("Unknown OP label %s\n",
+ lb->lb_str);
+ }
+ else
+ printf("Can't translate STRING label %s\n",
+ lb->lb_str);
+ }
+ else
+ printf("Can't translate label '%s'\n",
+ PyGrammar_LabelRepr(lb));
+}
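One detail in translabel() worth calling out is the keyword branch: a STRING label such as "'lambda'" is turned into a NAME label by copying the text between the quotes. A standalone sketch of just that quote-stripping step; strip_label is a hypothetical helper name, and the isalpha() check that guards the real branch is omitted:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char *strip_label(const char *lb_str)
    {
        const char *src = lb_str + 1;           /* skip the opening quote */
        const char *p = strchr(src, '\'');      /* find the closing quote */
        size_t name_len = p ? (size_t)(p - src) : strlen(src);
        char *dest = malloc(name_len + 1);
        if (!dest)
            return NULL;
        memcpy(dest, src, name_len);
        dest[name_len] = '\0';
        return dest;
    }

    int main(void)
    {
        char *kw = strip_label("'lambda'");
        if (kw) {
            printf("keyword: %s\n", kw);        /* prints: keyword: lambda */
            free(kw);
        }
        return 0;
    }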
diff --git a/Parser/grammar1.c b/Parser/grammar1.c
index e0b8fbb..1f7d264 100644
--- a/Parser/grammar1.c
+++ b/Parser/grammar1.c
@@ -2,21 +2,35 @@
/* Grammar subroutines needed by parser */
#include "Python.h"
+#include "pgenheaders.h"
#include "grammar.h"
#include "token.h"
/* Return the DFA for the given type */
-const dfa *
-PyGrammar_FindDFA(grammar *g, int type)
+dfa *
+PyGrammar_FindDFA(grammar *g, register int type)
{
+ register dfa *d;
+#if 1
/* Massive speed-up */
- const dfa *d = &g->g_dfa[type - NT_OFFSET];
+ d = &g->g_dfa[type - NT_OFFSET];
assert(d->d_type == type);
return d;
+#else
+ /* Old, slow version */
+ register int i;
+
+ for (i = g->g_ndfas, d = g->g_dfa; --i >= 0; d++) {
+ if (d->d_type == type)
+ return d;
+ }
+ assert(0);
+ /* NOTREACHED */
+#endif
}
-const char *
+char *
PyGrammar_LabelRepr(label *lb)
{
static char buf[100];
@@ -31,7 +45,7 @@ PyGrammar_LabelRepr(label *lb)
else
return lb->lb_str;
}
- else if (lb->lb_type < N_TOKENS) {
+ else {
if (lb->lb_str == NULL)
return _PyParser_TokenNames[lb->lb_type];
else {
@@ -40,8 +54,4 @@ PyGrammar_LabelRepr(label *lb)
return buf;
}
}
- else {
- Py_FatalError("invalid label");
- return NULL;
- }
}
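The "massive speed-up" branch kept above works because DFAs are stored in order of their type numbers, so the lookup is a single array index instead of the linear scan in the disabled #else branch. A standalone sketch of that indexing; toy_dfa and find_dfa are illustrative names, and the table values just mirror the first metagrammar entries:

    #include <stdio.h>
    #include <assert.h>

    #define NT_OFFSET 256
    typedef struct { int d_type; const char *d_name; } toy_dfa;

    static const toy_dfa dfas[3] = {
        {256, "MSTART"}, {257, "RULE"}, {258, "RHS"},
    };

    static const toy_dfa *find_dfa(int type)
    {
        const toy_dfa *d = &dfas[type - NT_OFFSET];   /* direct index */
        assert(d->d_type == type);                    /* same sanity check as above */
        return d;
    }

    int main(void)
    {
        printf("%s\n", find_dfa(257)->d_name);        /* prints: RULE */
        return 0;
    }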
diff --git a/Parser/intrcheck.c b/Parser/intrcheck.c
new file mode 100644
index 0000000..5844a9a
--- /dev/null
+++ b/Parser/intrcheck.c
@@ -0,0 +1,178 @@
+
+/* Check for interrupts */
+
+#include "Python.h"
+#include "pythread.h"
+
+#ifdef QUICKWIN
+
+#include <io.h>
+
+void
+PyOS_InitInterrupts(void)
+{
+}
+
+void
+PyOS_FiniInterrupts(void)
+{
+}
+
+int
+PyOS_InterruptOccurred(void)
+{
+ _wyield();
+}
+
+#define OK
+
+#endif /* QUICKWIN */
+
+#if defined(_M_IX86) && !defined(__QNX__)
+#include <io.h>
+#endif
+
+#if defined(MSDOS) && !defined(QUICKWIN)
+
+#ifdef __GNUC__
+
+/* This is for DJGPP's GO32 extender. I don't know how to trap
+ * control-C (There's no API for ctrl-C, and I don't want to mess with
+ * the interrupt vectors.) However, this DOES catch control-break.
+ * --Amrit
+ */
+
+#include <go32.h>
+
+void
+PyOS_InitInterrupts(void)
+{
+ _go32_want_ctrl_break(1 /* TRUE */);
+}
+
+void
+PyOS_FiniInterrupts(void)
+{
+}
+
+int
+PyOS_InterruptOccurred(void)
+{
+ return _go32_was_ctrl_break_hit();
+}
+
+#else /* !__GNUC__ */
+
+/* This might work for MS-DOS (untested though): */
+
+void
+PyOS_InitInterrupts(void)
+{
+}
+
+void
+PyOS_FiniInterrupts(void)
+{
+}
+
+int
+PyOS_InterruptOccurred(void)
+{
+ int interrupted = 0;
+ while (kbhit()) {
+ if (getch() == '\003')
+ interrupted = 1;
+ }
+ return interrupted;
+}
+
+#endif /* __GNUC__ */
+
+#define OK
+
+#endif /* MSDOS && !QUICKWIN */
+
+
+#ifndef OK
+
+/* Default version -- for real operating systems and for Standard C */
+
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+
+static int interrupted;
+
+void
+PyErr_SetInterrupt(void)
+{
+ interrupted = 1;
+}
+
+extern int PyErr_CheckSignals(void);
+
+static int
+checksignals_witharg(void * arg)
+{
+ return PyErr_CheckSignals();
+}
+
+static void
+intcatcher(int sig)
+{
+ extern void Py_Exit(int);
+ static char message[] =
+"python: to interrupt a truly hanging Python program, interrupt once more.\n";
+ switch (interrupted++) {
+ case 0:
+ break;
+ case 1:
+#ifdef RISCOS
+ fprintf(stderr, message);
+#else
+ write(2, message, strlen(message));
+#endif
+ break;
+ case 2:
+ interrupted = 0;
+ Py_Exit(1);
+ break;
+ }
+ PyOS_setsig(SIGINT, intcatcher);
+ Py_AddPendingCall(checksignals_witharg, NULL);
+}
+
+static void (*old_siginthandler)(int) = SIG_DFL;
+
+void
+PyOS_InitInterrupts(void)
+{
+ if ((old_siginthandler = PyOS_setsig(SIGINT, SIG_IGN)) != SIG_IGN)
+ PyOS_setsig(SIGINT, intcatcher);
+}
+
+void
+PyOS_FiniInterrupts(void)
+{
+ PyOS_setsig(SIGINT, old_siginthandler);
+}
+
+int
+PyOS_InterruptOccurred(void)
+{
+ if (!interrupted)
+ return 0;
+ interrupted = 0;
+ return 1;
+}
+
+#endif /* !OK */
+
+void
+PyOS_AfterFork(void)
+{
+#ifdef WITH_THREAD
+ PyThread_ReInitTLS();
+ PyEval_ReInitThreads();
+#endif
+}
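The default (!OK) branch above uses the classic pattern of a handler that only sets a flag, with PyOS_InterruptOccurred() polling and clearing it; the real handler additionally re-installs itself and schedules PyErr_CheckSignals() via Py_AddPendingCall(). A minimal standalone sketch of just the flag half (catcher and interrupt_occurred are illustrative names):

    #include <signal.h>
    #include <stdio.h>

    static volatile sig_atomic_t interrupted;

    static void catcher(int sig)
    {
        (void)sig;
        interrupted = 1;          /* only set a flag in the handler */
    }

    static int interrupt_occurred(void)
    {
        if (!interrupted)
            return 0;
        interrupted = 0;          /* clear, like PyOS_InterruptOccurred() */
        return 1;
    }

    int main(void)
    {
        signal(SIGINT, catcher);
        raise(SIGINT);                          /* simulate Ctrl-C */
        printf("%d\n", interrupt_occurred());   /* 1 */
        printf("%d\n", interrupt_occurred());   /* 0: flag was cleared */
        return 0;
    }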
diff --git a/Parser/listnode.c b/Parser/listnode.c
index d431ae5..b5f8ad2 100644
--- a/Parser/listnode.c
+++ b/Parser/listnode.c
@@ -1,8 +1,7 @@
/* List a node on a file */
-#include "Python.h"
-#include "pycore_pystate.h"
+#include "pgenheaders.h"
#include "token.h"
#include "node.h"
@@ -16,22 +15,20 @@ PyNode_ListTree(node *n)
listnode(stdout, n);
}
+static int level, atbol;
+
static void
listnode(FILE *fp, node *n)
{
- PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
-
- interp->parser.listnode.level = 0;
- interp->parser.listnode.atbol = 1;
+ level = 0;
+ atbol = 1;
list1node(fp, n);
}
static void
list1node(FILE *fp, node *n)
{
- PyInterpreterState *interp;
-
- if (n == NULL)
+ if (n == 0)
return;
if (ISNONTERMINAL(TYPE(n))) {
int i;
@@ -39,26 +36,25 @@ list1node(FILE *fp, node *n)
list1node(fp, CHILD(n, i));
}
else if (ISTERMINAL(TYPE(n))) {
- interp = _PyInterpreterState_GET_UNSAFE();
switch (TYPE(n)) {
case INDENT:
- interp->parser.listnode.level++;
+ ++level;
break;
case DEDENT:
- interp->parser.listnode.level--;
+ --level;
break;
default:
- if (interp->parser.listnode.atbol) {
+ if (atbol) {
int i;
- for (i = 0; i < interp->parser.listnode.level; ++i)
+ for (i = 0; i < level; ++i)
fprintf(fp, "\t");
- interp->parser.listnode.atbol = 0;
+ atbol = 0;
}
if (TYPE(n) == NEWLINE) {
if (STR(n) != NULL)
fprintf(fp, "%s", STR(n));
fprintf(fp, "\n");
- interp->parser.listnode.atbol = 1;
+ atbol = 1;
}
else
fprintf(fp, "%s ", STR(n));
diff --git a/Parser/metagrammar.c b/Parser/metagrammar.c
new file mode 100644
index 0000000..53810b8
--- /dev/null
+++ b/Parser/metagrammar.c
@@ -0,0 +1,159 @@
+
+#include "pgenheaders.h"
+#include "metagrammar.h"
+#include "grammar.h"
+#include "pgen.h"
+static arc arcs_0_0[3] = {
+ {2, 0},
+ {3, 0},
+ {4, 1},
+};
+static arc arcs_0_1[1] = {
+ {0, 1},
+};
+static state states_0[2] = {
+ {3, arcs_0_0},
+ {1, arcs_0_1},
+};
+static arc arcs_1_0[1] = {
+ {5, 1},
+};
+static arc arcs_1_1[1] = {
+ {6, 2},
+};
+static arc arcs_1_2[1] = {
+ {7, 3},
+};
+static arc arcs_1_3[1] = {
+ {3, 4},
+};
+static arc arcs_1_4[1] = {
+ {0, 4},
+};
+static state states_1[5] = {
+ {1, arcs_1_0},
+ {1, arcs_1_1},
+ {1, arcs_1_2},
+ {1, arcs_1_3},
+ {1, arcs_1_4},
+};
+static arc arcs_2_0[1] = {
+ {8, 1},
+};
+static arc arcs_2_1[2] = {
+ {9, 0},
+ {0, 1},
+};
+static state states_2[2] = {
+ {1, arcs_2_0},
+ {2, arcs_2_1},
+};
+static arc arcs_3_0[1] = {
+ {10, 1},
+};
+static arc arcs_3_1[2] = {
+ {10, 1},
+ {0, 1},
+};
+static state states_3[2] = {
+ {1, arcs_3_0},
+ {2, arcs_3_1},
+};
+static arc arcs_4_0[2] = {
+ {11, 1},
+ {13, 2},
+};
+static arc arcs_4_1[1] = {
+ {7, 3},
+};
+static arc arcs_4_2[3] = {
+ {14, 4},
+ {15, 4},
+ {0, 2},
+};
+static arc arcs_4_3[1] = {
+ {12, 4},
+};
+static arc arcs_4_4[1] = {
+ {0, 4},
+};
+static state states_4[5] = {
+ {2, arcs_4_0},
+ {1, arcs_4_1},
+ {3, arcs_4_2},
+ {1, arcs_4_3},
+ {1, arcs_4_4},
+};
+static arc arcs_5_0[3] = {
+ {5, 1},
+ {16, 1},
+ {17, 2},
+};
+static arc arcs_5_1[1] = {
+ {0, 1},
+};
+static arc arcs_5_2[1] = {
+ {7, 3},
+};
+static arc arcs_5_3[1] = {
+ {18, 1},
+};
+static state states_5[4] = {
+ {3, arcs_5_0},
+ {1, arcs_5_1},
+ {1, arcs_5_2},
+ {1, arcs_5_3},
+};
+static dfa dfas[6] = {
+ {256, "MSTART", 0, 2, states_0,
+ "\070\000\000"},
+ {257, "RULE", 0, 5, states_1,
+ "\040\000\000"},
+ {258, "RHS", 0, 2, states_2,
+ "\040\010\003"},
+ {259, "ALT", 0, 2, states_3,
+ "\040\010\003"},
+ {260, "ITEM", 0, 5, states_4,
+ "\040\010\003"},
+ {261, "ATOM", 0, 4, states_5,
+ "\040\000\003"},
+};
+static label labels[19] = {
+ {0, "EMPTY"},
+ {256, 0},
+ {257, 0},
+ {4, 0},
+ {0, 0},
+ {1, 0},
+ {11, 0},
+ {258, 0},
+ {259, 0},
+ {18, 0},
+ {260, 0},
+ {9, 0},
+ {10, 0},
+ {261, 0},
+ {16, 0},
+ {14, 0},
+ {3, 0},
+ {7, 0},
+ {8, 0},
+};
+static grammar _PyParser_Grammar = {
+ 6,
+ dfas,
+ {19, labels},
+ 256
+};
+
+grammar *
+meta_grammar(void)
+{
+ return &_PyParser_Grammar;
+}
+
+grammar *
+Py_meta_grammar(void)
+{
+ return meta_grammar();
+}
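Reading the tables above: each arc is a {label, arrow} pair, each state lists its arcs, and an accepting state carries an EMPTY (label 0) arc pointing back at itself. A toy, self-contained walk over structures of the same shape; the struct names and values here are illustrative, the real definitions live in grammar.h:

    #include <stdio.h>

    typedef struct { short a_lbl, a_arrow; } arc;
    typedef struct { int s_narcs; const arc *s_arc; } state;

    static const arc arcs_s0[1] = { {5, 1} };   /* on label 5, go to state 1 */
    static const arc arcs_s1[1] = { {0, 1} };   /* EMPTY arc to itself: accepting */
    static const state states[2] = { {1, arcs_s0}, {1, arcs_s1} };

    /* Return the next state for a label, or -1 if no arc matches. */
    static int step(const state *s, int label)
    {
        for (int i = 0; i < s->s_narcs; i++)
            if (s->s_arc[i].a_lbl == label)
                return s->s_arc[i].a_arrow;
        return -1;
    }

    int main(void)
    {
        printf("%d\n", step(&states[0], 5));    /* 1 */
        printf("%d\n", step(&states[0], 9));    /* -1: no transition */
        return 0;
    }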
diff --git a/Parser/myreadline.c b/Parser/myreadline.c
index 43e5583..5376214 100644
--- a/Parser/myreadline.c
+++ b/Parser/myreadline.c
@@ -10,31 +10,40 @@
*/
#include "Python.h"
-#include "pycore_pystate.h"
#ifdef MS_WINDOWS
#define WIN32_LEAN_AND_MEAN
#include "windows.h"
#endif /* MS_WINDOWS */
+#ifdef __VMS
+extern char* vms__StdioReadline(FILE *sys_stdin, FILE *sys_stdout, char *prompt);
+#endif
+
-PyThreadState* _PyOS_ReadlineTState = NULL;
+PyThreadState* _PyOS_ReadlineTState;
+#ifdef WITH_THREAD
#include "pythread.h"
static PyThread_type_lock _PyOS_ReadlineLock = NULL;
+#endif
int (*PyOS_InputHook)(void) = NULL;
+#ifdef RISCOS
+int Py_RISCOSWimpFlag;
+#endif
+
/* This function restarts a fgets() after an EINTR error occurred
except if PyOS_InterruptOccurred() returns true. */
static int
my_fgets(char *buf, int len, FILE *fp)
{
+ char *p;
#ifdef MS_WINDOWS
- HANDLE hInterruptEvent;
+ int i;
#endif
- char *p;
- int err;
+
while (1) {
if (PyOS_InputHook != NULL)
(void)(PyOS_InputHook)();
@@ -43,29 +52,24 @@ my_fgets(char *buf, int len, FILE *fp)
p = fgets(buf, len, fp);
if (p != NULL)
return 0; /* No error */
- err = errno;
#ifdef MS_WINDOWS
/* Ctrl-C anywhere on the line or Ctrl-Z if the only character
on a line will set ERROR_OPERATION_ABORTED. Under normal
circumstances Ctrl-C will also have caused the SIGINT handler
- to fire which will have set the event object returned by
- _PyOS_SigintEvent. This signal fires in another thread and
- is not guaranteed to have occurred before this point in the
- code.
-
- Therefore: check whether the event is set with a small timeout.
- If it is, assume this is a Ctrl-C and reset the event. If it
- isn't set assume that this is a Ctrl-Z on its own and drop
- through to check for EOF.
+ to fire. This signal fires in another thread and is not
+ guaranteed to have occurred before this point in the code.
+
+ Therefore: check in a small loop to see if the trigger has
+ fired, in which case assume this is a Ctrl-C event. If it
+ hasn't fired within 10ms assume that this is a Ctrl-Z on its
+ own or that the signal isn't going to fire for some other
+ reason and drop through to check for EOF.
*/
if (GetLastError()==ERROR_OPERATION_ABORTED) {
- hInterruptEvent = _PyOS_SigintEvent();
- switch (WaitForSingleObjectEx(hInterruptEvent, 10, FALSE)) {
- case WAIT_OBJECT_0:
- ResetEvent(hInterruptEvent);
- return 1; /* Interrupt */
- case WAIT_FAILED:
- return -2; /* Error */
+ for (i = 0; i < 10; i++) {
+ if (PyOS_InterruptOccurred())
+ return 1;
+ Sleep(1);
}
}
#endif /* MS_WINDOWS */
@@ -74,14 +78,18 @@ my_fgets(char *buf, int len, FILE *fp)
return -1; /* EOF */
}
#ifdef EINTR
- if (err == EINTR) {
+ if (errno == EINTR) {
int s;
+#ifdef WITH_THREAD
PyEval_RestoreThread(_PyOS_ReadlineTState);
+#endif
s = PyErr_CheckSignals();
+#ifdef WITH_THREAD
PyEval_SaveThread();
+#endif
if (s < 0)
return 1;
- /* try again */
+ /* try again */
continue;
}
#endif
@@ -93,185 +101,35 @@ my_fgets(char *buf, int len, FILE *fp)
/* NOTREACHED */
}
-#ifdef MS_WINDOWS
-/* Readline implementation using ReadConsoleW */
-
-extern char _get_console_type(HANDLE handle);
-
-char *
-_PyOS_WindowsConsoleReadline(HANDLE hStdIn)
-{
- static wchar_t wbuf_local[1024 * 16];
- const DWORD chunk_size = 1024;
-
- DWORD n_read, total_read, wbuflen, u8len;
- wchar_t *wbuf;
- char *buf = NULL;
- int err = 0;
-
- n_read = (DWORD)-1;
- total_read = 0;
- wbuf = wbuf_local;
- wbuflen = sizeof(wbuf_local) / sizeof(wbuf_local[0]) - 1;
- while (1) {
- if (PyOS_InputHook != NULL) {
- (void)(PyOS_InputHook)();
- }
- if (!ReadConsoleW(hStdIn, &wbuf[total_read], wbuflen - total_read, &n_read, NULL)) {
- err = GetLastError();
- goto exit;
- }
- if (n_read == (DWORD)-1 && (err = GetLastError()) == ERROR_OPERATION_ABORTED) {
- break;
- }
- if (n_read == 0) {
- int s;
- err = GetLastError();
- if (err != ERROR_OPERATION_ABORTED)
- goto exit;
- err = 0;
- HANDLE hInterruptEvent = _PyOS_SigintEvent();
- if (WaitForSingleObjectEx(hInterruptEvent, 100, FALSE)
- == WAIT_OBJECT_0) {
- ResetEvent(hInterruptEvent);
- PyEval_RestoreThread(_PyOS_ReadlineTState);
- s = PyErr_CheckSignals();
- PyEval_SaveThread();
- if (s < 0)
- goto exit;
- }
- break;
- }
-
- total_read += n_read;
- if (total_read == 0 || wbuf[total_read - 1] == L'\n') {
- break;
- }
- wbuflen += chunk_size;
- if (wbuf == wbuf_local) {
- wbuf[total_read] = '\0';
- wbuf = (wchar_t*)PyMem_RawMalloc(wbuflen * sizeof(wchar_t));
- if (wbuf)
- wcscpy_s(wbuf, wbuflen, wbuf_local);
- else {
- PyErr_NoMemory();
- goto exit;
- }
- }
- else {
- wchar_t *tmp = PyMem_RawRealloc(wbuf, wbuflen * sizeof(wchar_t));
- if (tmp == NULL) {
- PyErr_NoMemory();
- goto exit;
- }
- wbuf = tmp;
- }
- }
-
- if (wbuf[0] == '\x1a') {
- buf = PyMem_RawMalloc(1);
- if (buf)
- buf[0] = '\0';
- else {
- PyErr_NoMemory();
- }
- goto exit;
- }
-
- u8len = WideCharToMultiByte(CP_UTF8, 0, wbuf, total_read, NULL, 0, NULL, NULL);
- buf = PyMem_RawMalloc(u8len + 1);
- if (buf == NULL) {
- PyErr_NoMemory();
- goto exit;
- }
- u8len = WideCharToMultiByte(CP_UTF8, 0, wbuf, total_read, buf, u8len, NULL, NULL);
- buf[u8len] = '\0';
-
-exit:
- if (wbuf != wbuf_local)
- PyMem_RawFree(wbuf);
-
- if (err) {
- PyEval_RestoreThread(_PyOS_ReadlineTState);
- PyErr_SetFromWindowsErr(err);
- PyEval_SaveThread();
- }
-
- return buf;
-}
-
-#endif
-
/* Readline implementation using fgets() */
char *
-PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
+PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, char *prompt)
{
size_t n;
char *p, *pr;
-
-#ifdef MS_WINDOWS
- if (!Py_LegacyWindowsStdioFlag && sys_stdin == stdin) {
- HANDLE hStdIn, hStdErr;
-
- _Py_BEGIN_SUPPRESS_IPH
- hStdIn = (HANDLE)_get_osfhandle(fileno(sys_stdin));
- hStdErr = (HANDLE)_get_osfhandle(fileno(stderr));
- _Py_END_SUPPRESS_IPH
-
- if (_get_console_type(hStdIn) == 'r') {
- fflush(sys_stdout);
- if (prompt) {
- if (_get_console_type(hStdErr) == 'w') {
- wchar_t *wbuf;
- int wlen;
- wlen = MultiByteToWideChar(CP_UTF8, 0, prompt, -1,
- NULL, 0);
- if (wlen) {
- wbuf = PyMem_RawMalloc(wlen * sizeof(wchar_t));
- if (wbuf == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- wlen = MultiByteToWideChar(CP_UTF8, 0, prompt, -1,
- wbuf, wlen);
- if (wlen) {
- DWORD n;
- fflush(stderr);
- /* wlen includes null terminator, so subtract 1 */
- WriteConsoleW(hStdErr, wbuf, wlen - 1, &n, NULL);
- }
- PyMem_RawFree(wbuf);
- }
- } else {
- fprintf(stderr, "%s", prompt);
- fflush(stderr);
- }
- }
- clearerr(sys_stdin);
- return _PyOS_WindowsConsoleReadline(hStdIn);
- }
- }
-#endif
-
n = 100;
- p = (char *)PyMem_RawMalloc(n);
- if (p == NULL) {
- PyErr_NoMemory();
+ if ((p = (char *)PyMem_MALLOC(n)) == NULL)
return NULL;
- }
-
fflush(sys_stdout);
+#ifndef RISCOS
if (prompt)
fprintf(stderr, "%s", prompt);
+#else
+ if (prompt) {
+ if(Py_RISCOSWimpFlag)
+ fprintf(stderr, "\x0cr%s\x0c", prompt);
+ else
+ fprintf(stderr, "%s", prompt);
+ }
+#endif
fflush(stderr);
-
switch (my_fgets(p, (int)n, sys_stdin)) {
case 0: /* Normal case */
break;
case 1: /* Interrupt */
- PyMem_RawFree(p);
+ PyMem_FREE(p);
return NULL;
case -1: /* EOF */
case -2: /* Error */
@@ -283,13 +141,13 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
while (n > 0 && p[n-1] != '\n') {
size_t incr = n+2;
if (incr > INT_MAX) {
- PyMem_RawFree(p);
+ PyMem_FREE(p);
PyErr_SetString(PyExc_OverflowError, "input line too long");
return NULL;
}
- pr = (char *)PyMem_RawRealloc(p, n + incr);
+ pr = (char *)PyMem_REALLOC(p, n + incr);
if (pr == NULL) {
- PyMem_RawFree(p);
+ PyMem_FREE(p);
PyErr_NoMemory();
return NULL;
}
@@ -298,9 +156,9 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
break;
n += strlen(p+n);
}
- pr = (char *)PyMem_RawRealloc(p, n+1);
+ pr = (char *)PyMem_REALLOC(p, n+1);
if (pr == NULL) {
- PyMem_RawFree(p);
+ PyMem_FREE(p);
PyErr_NoMemory();
return NULL;
}
@@ -313,18 +171,17 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
Note: Python expects in return a buffer allocated with PyMem_Malloc. */
-char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *) = NULL;
+char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, char *);
/* Interface used by tokenizer.c and bltinmodule.c */
char *
-PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
+PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, char *prompt)
{
- char *rv, *res;
- size_t len;
+ char *rv;
- if (_PyOS_ReadlineTState == _PyThreadState_GET()) {
+ if (_PyOS_ReadlineTState == PyThreadState_GET()) {
PyErr_SetString(PyExc_RuntimeError,
"can't re-enter readline");
return NULL;
@@ -332,20 +189,24 @@ PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
if (PyOS_ReadlineFunctionPointer == NULL) {
+#ifdef __VMS
+ PyOS_ReadlineFunctionPointer = vms__StdioReadline;
+#else
PyOS_ReadlineFunctionPointer = PyOS_StdioReadline;
+#endif
}
+#ifdef WITH_THREAD
if (_PyOS_ReadlineLock == NULL) {
_PyOS_ReadlineLock = PyThread_allocate_lock();
- if (_PyOS_ReadlineLock == NULL) {
- PyErr_SetString(PyExc_MemoryError, "can't allocate lock");
- return NULL;
- }
}
+#endif
- _PyOS_ReadlineTState = _PyThreadState_GET();
+ _PyOS_ReadlineTState = PyThreadState_GET();
Py_BEGIN_ALLOW_THREADS
+#ifdef WITH_THREAD
PyThread_acquire_lock(_PyOS_ReadlineLock, 1);
+#endif
/* This is needed to handle the unlikely case that the
* interpreter is in interactive mode *and* stdin/out are not
@@ -359,22 +220,11 @@ PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
prompt);
Py_END_ALLOW_THREADS
+#ifdef WITH_THREAD
PyThread_release_lock(_PyOS_ReadlineLock);
+#endif
_PyOS_ReadlineTState = NULL;
- if (rv == NULL)
- return NULL;
-
- len = strlen(rv) + 1;
- res = PyMem_Malloc(len);
- if (res != NULL) {
- memcpy(res, rv, len);
- }
- else {
- PyErr_NoMemory();
- }
- PyMem_RawFree(rv);
-
- return res;
+ return rv;
}
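The hook reverted above, PyOS_ReadlineFunctionPointer, is a plain function pointer with the (FILE *, FILE *, char *) signature that embedders may override, falling back to the stdio implementation when unset. A standalone sketch of that pattern outside of CPython; readline_hook and fake_readline are illustrative names only:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char *fake_readline(FILE *in, FILE *out, char *prompt)
    {
        (void)in; (void)out;
        fprintf(stderr, "%s", prompt ? prompt : "");
        char *buf = malloc(16);
        if (buf)
            strcpy(buf, "canned input\n");
        return buf;
    }

    static char *(*readline_hook)(FILE *, FILE *, char *) = NULL;

    int main(void)
    {
        if (readline_hook == NULL)              /* same lazy default as PyOS_Readline() */
            readline_hook = fake_readline;
        char *line = readline_hook(stdin, stdout, ">>> ");
        if (line) {
            printf("got: %s", line);
            free(line);
        }
        return 0;
    }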
diff --git a/Parser/node.c b/Parser/node.c
index f1b70e0..0dea30f 100644
--- a/Parser/node.c
+++ b/Parser/node.c
@@ -13,8 +13,6 @@ PyNode_New(int type)
n->n_type = type;
n->n_str = NULL;
n->n_lineno = 0;
- n->n_end_lineno = 0;
- n->n_end_col_offset = -1;
n->n_nchildren = 0;
n->n_child = NULL;
return n;
@@ -72,39 +70,19 @@ fancy_roundup(int n)
* Note that this would be straightforward if a node stored its current
* capacity. The code is tricky to avoid that.
*/
-#define XXXROUNDUP(n) ((n) <= 1 ? (n) : \
- (n) <= 128 ? (int)_Py_SIZE_ROUND_UP((n), 4) : \
+#define XXXROUNDUP(n) ((n) <= 1 ? (n) : \
+ (n) <= 128 ? (((n) + 3) & ~3) : \
fancy_roundup(n))
-void
-_PyNode_FinalizeEndPos(node *n)
-{
- int nch = NCH(n);
- node *last;
- if (nch == 0) {
- return;
- }
- last = CHILD(n, nch - 1);
- _PyNode_FinalizeEndPos(last);
- n->n_end_lineno = last->n_end_lineno;
- n->n_end_col_offset = last->n_end_col_offset;
-}
-
int
-PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset,
- int end_lineno, int end_col_offset)
+PyNode_AddChild(register node *n1, int type, char *str, int lineno, int col_offset)
{
const int nch = n1->n_nchildren;
int current_capacity;
int required_capacity;
node *n;
- // finalize end position of previous node (if any)
- if (nch > 0) {
- _PyNode_FinalizeEndPos(CHILD(n1, nch - 1));
- }
-
if (nch == INT_MAX || nch < 0)
return E_OVERFLOW;
@@ -113,7 +91,7 @@ PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset,
if (current_capacity < 0 || required_capacity < 0)
return E_OVERFLOW;
if (current_capacity < required_capacity) {
- if ((size_t)required_capacity > SIZE_MAX / sizeof(node)) {
+ if (required_capacity > PY_SIZE_MAX / sizeof(node)) {
return E_NOMEM;
}
n = n1->n_child;
@@ -129,8 +107,6 @@ PyNode_AddChild(node *n1, int type, char *str, int lineno, int col_offset,
n->n_str = str;
n->n_lineno = lineno;
n->n_col_offset = col_offset;
-    n->n_end_lineno = end_lineno;  // this and below will be updated after all children are added.
- n->n_end_col_offset = end_col_offset;
n->n_nchildren = 0;
n->n_child = NULL;
return 0;
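The XXXROUNDUP change above restores the open-coded rounding: sizes from 2 to 128 round up to the next multiple of 4, which is how PyNode_AddChild() infers the current capacity from the child count alone and only reallocates when the rounded value crosses a boundary. A quick standalone check of just that middle branch (n <= 1 returns n and larger sizes go through fancy_roundup(), not shown):

    #include <stdio.h>

    #define ROUNDUP4(n) (((n) + 3) & ~3)

    int main(void)
    {
        printf("%d %d %d\n", ROUNDUP4(1), ROUNDUP4(4), ROUNDUP4(5));  /* 4 4 8 */
        /* So growing from 4 to 5 children crosses a boundary and reallocates,
           while growing from 3 to 4 does not. */
        return 0;
    }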
diff --git a/Parser/parser.c b/Parser/parser.c
index 227b918..b753a17 100644
--- a/Parser/parser.c
+++ b/Parser/parser.c
@@ -6,12 +6,12 @@
/* XXX To do: error recovery */
#include "Python.h"
+#include "pgenheaders.h"
#include "token.h"
#include "grammar.h"
#include "node.h"
#include "parser.h"
#include "errcode.h"
-#include "graminit.h"
#ifdef Py_DEBUG
@@ -35,9 +35,9 @@ s_reset(stack *s)
#define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
static int
-s_push(stack *s, const dfa *d, node *parent)
+s_push(register stack *s, dfa *d, node *parent)
{
- stackentry *top;
+ register stackentry *top;
if (s->s_top == s->s_base) {
fprintf(stderr, "s_push: parser stack overflow\n");
return E_NOMEM;
@@ -52,7 +52,7 @@ s_push(stack *s, const dfa *d, node *parent)
#ifdef Py_DEBUG
static void
-s_pop(stack *s)
+s_pop(register stack *s)
{
if (s_empty(s))
Py_FatalError("s_pop: parser stack underflow -- FATAL");
@@ -105,13 +105,11 @@ PyParser_Delete(parser_state *ps)
/* PARSER STACK OPERATIONS */
static int
-shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
- int end_lineno, int end_col_offset)
+shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
{
int err;
assert(!s_empty(s));
- err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset,
- end_lineno, end_col_offset);
+ err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
if (err)
return err;
s->s_top->s_state = newstate;
@@ -119,15 +117,13 @@ shift(stack *s, int type, char *str, int newstate, int lineno, int col_offset,
}
static int
-push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset,
- int end_lineno, int end_col_offset)
+push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
{
int err;
- node *n;
+ register node *n;
n = s->s_top->s_parent;
assert(!s_empty(s));
- err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset,
- end_lineno, end_col_offset);
+ err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
if (err)
return err;
s->s_top->s_state = newstate;
@@ -138,38 +134,34 @@ push(stack *s, int type, const dfa *d, int newstate, int lineno, int col_offset,
/* PARSER PROPER */
static int
-classify(parser_state *ps, int type, const char *str)
+classify(parser_state *ps, int type, char *str)
{
grammar *g = ps->p_grammar;
- int n = g->g_ll.ll_nlabels;
+ register int n = g->g_ll.ll_nlabels;
if (type == NAME) {
- const label *l = g->g_ll.ll_label;
- int i;
+ register char *s = str;
+ register label *l = g->g_ll.ll_label;
+ register int i;
for (i = n; i > 0; i--, l++) {
if (l->lb_type != NAME || l->lb_str == NULL ||
- l->lb_str[0] != str[0] ||
- strcmp(l->lb_str, str) != 0)
+ l->lb_str[0] != s[0] ||
+ strcmp(l->lb_str, s) != 0)
continue;
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
-#if 0
- /* Leaving this in as an example */
- if (!(ps->p_flags & CO_FUTURE_WITH_STATEMENT)) {
- if (str[0] == 'w' && strcmp(str, "with") == 0)
- break; /* not a keyword yet */
- else if (str[0] == 'a' && strcmp(str, "as") == 0)
- break; /* not a keyword yet */
+ if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION &&
+ s[0] == 'p' && strcmp(s, "print") == 0) {
+ break; /* no longer a keyword */
}
#endif
-#endif
D(printf("It's a keyword\n"));
return n - i;
}
}
{
- const label *l = g->g_ll.ll_label;
- int i;
+ register label *l = g->g_ll.ll_label;
+ register int i;
for (i = n; i > 0; i--, l++) {
if (l->lb_type == type && l->lb_str == NULL) {
D(printf("It's a token we know\n"));
@@ -183,8 +175,6 @@ classify(parser_state *ps, int type, const char *str)
}
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
-#if 0
-/* Leaving this in as an example */
static void
future_hack(parser_state *ps)
{
@@ -224,16 +214,13 @@ future_hack(parser_state *ps)
}
}
}
-#endif
#endif /* future keyword */
int
-PyParser_AddToken(parser_state *ps, int type, char *str,
- int lineno, int col_offset,
- int end_lineno, int end_col_offset,
- int *expected_ret)
+PyParser_AddToken(register parser_state *ps, register int type, char *str,
+ int lineno, int col_offset, int *expected_ret)
{
- int ilabel;
+ register int ilabel;
int err;
D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
@@ -246,43 +233,34 @@ PyParser_AddToken(parser_state *ps, int type, char *str,
/* Loop until the token is shifted or an error occurred */
for (;;) {
/* Fetch the current dfa and state */
- const dfa *d = ps->p_stack.s_top->s_dfa;
- state *s = &d->d_state[ps->p_stack.s_top->s_state];
+ register dfa *d = ps->p_stack.s_top->s_dfa;
+ register state *s = &d->d_state[ps->p_stack.s_top->s_state];
D(printf(" DFA '%s', state %d:",
d->d_name, ps->p_stack.s_top->s_state));
/* Check accelerator */
if (s->s_lower <= ilabel && ilabel < s->s_upper) {
- int x = s->s_accel[ilabel - s->s_lower];
+ register int x = s->s_accel[ilabel - s->s_lower];
if (x != -1) {
if (x & (1<<7)) {
/* Push non-terminal */
int nt = (x >> 8) + NT_OFFSET;
int arrow = x & ((1<<7)-1);
- if (nt == func_body_suite && !(ps->p_flags & PyCF_TYPE_COMMENTS)) {
- /* When parsing type comments is not requested,
- we can provide better errors about bad indentation
- by using 'suite' for the body of a funcdef */
- D(printf(" [switch func_body_suite to suite]"));
- nt = suite;
- }
- const dfa *d1 = PyGrammar_FindDFA(
+ dfa *d1 = PyGrammar_FindDFA(
ps->p_grammar, nt);
if ((err = push(&ps->p_stack, nt, d1,
- arrow, lineno, col_offset,
- end_lineno, end_col_offset)) > 0) {
+ arrow, lineno, col_offset)) > 0) {
D(printf(" MemError: push\n"));
return err;
}
- D(printf(" Push '%s'\n", d1->d_name));
+ D(printf(" Push ...\n"));
continue;
}
/* Shift the token */
if ((err = shift(&ps->p_stack, type, str,
- x, lineno, col_offset,
- end_lineno, end_col_offset)) > 0) {
+ x, lineno, col_offset)) > 0) {
D(printf(" MemError: shift.\n"));
return err;
}
@@ -296,13 +274,11 @@ PyParser_AddToken(parser_state *ps, int type, char *str,
d->d_name,
ps->p_stack.s_top->s_state));
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
-#if 0
if (d->d_name[0] == 'i' &&
strcmp(d->d_name,
"import_stmt") == 0)
future_hack(ps);
#endif
-#endif
s_pop(&ps->p_stack);
if (s_empty(&ps->p_stack)) {
D(printf(" ACCEPT.\n"));
@@ -316,12 +292,10 @@ PyParser_AddToken(parser_state *ps, int type, char *str,
if (s->s_accept) {
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
-#if 0
if (d->d_name[0] == 'i' &&
strcmp(d->d_name, "import_stmt") == 0)
future_hack(ps);
#endif
-#endif
/* Pop this dfa and try again */
s_pop(&ps->p_stack);
D(printf(" Pop ...\n"));
diff --git a/Parser/parser.h b/Parser/parser.h
index b16075e..403236d 100644
--- a/Parser/parser.h
+++ b/Parser/parser.h
@@ -7,42 +7,35 @@ extern "C" {
/* Parser interface */
-#define MAXSTACK 1700
+#define MAXSTACK 1500
typedef struct {
- int s_state; /* State in current DFA */
- const dfa *s_dfa; /* Current DFA */
- struct _node *s_parent; /* Where to add next node */
+ int s_state; /* State in current DFA */
+ dfa *s_dfa; /* Current DFA */
+ struct _node *s_parent; /* Where to add next node */
} stackentry;
typedef struct {
- stackentry *s_top; /* Top entry */
- stackentry s_base[MAXSTACK];/* Array of stack entries */
- /* NB The stack grows down */
+ stackentry *s_top; /* Top entry */
+ stackentry s_base[MAXSTACK];/* Array of stack entries */
+ /* NB The stack grows down */
} stack;
typedef struct {
- stack p_stack; /* Stack of parser states */
- grammar *p_grammar; /* Grammar to use */
- node *p_tree; /* Top of parse tree */
+ stack p_stack; /* Stack of parser states */
+ grammar *p_grammar; /* Grammar to use */
+ node *p_tree; /* Top of parse tree */
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
- unsigned long p_flags; /* see co_flags in Include/code.h */
+ unsigned long p_flags; /* see co_flags in Include/code.h */
#endif
} parser_state;
parser_state *PyParser_New(grammar *g, int start);
void PyParser_Delete(parser_state *ps);
-int PyParser_AddToken(parser_state *ps, int type, char *str,
- int lineno, int col_offset,
- int end_lineno, int end_col_offset,
+int PyParser_AddToken(parser_state *ps, int type, char *str, int lineno, int col_offset,
int *expected_ret);
void PyGrammar_AddAccelerators(grammar *g);
-
-#define showtree _Py_showtree
-#define printtree _Py_printtree
-#define dumptree _Py_dumptree
-
#ifdef __cplusplus
}
#endif
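As the "NB The stack grows down" comment says, s_top starts one past the end of s_base and s_push() stores with a pre-decrement, so an empty stack is s_top == &s_base[MAXSTACK]. A standalone sketch of that layout with a toy element type and a toy MAXSTACK (the real value in this version is 1500):

    #include <stdio.h>

    #define MAXSTACK 8

    int main(void)
    {
        int s_base[MAXSTACK];
        int *s_top = &s_base[MAXSTACK];     /* empty: top points one past the end */
        *--s_top = 42;                      /* push, exactly like s_push() */
        *--s_top = 7;
        printf("depth=%d top=%d\n",
               (int)(&s_base[MAXSTACK] - s_top), *s_top);   /* depth=2 top=7 */
        return 0;
    }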
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index a5d7897..a5e9222 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -1,7 +1,7 @@
/* Parser-tokenizer link implementation */
-#include "Python.h"
+#include "pgenheaders.h"
#include "tokenizer.h"
#include "node.h"
#include "grammar.h"
@@ -10,53 +10,12 @@
#include "errcode.h"
#include "graminit.h"
+int Py_TabcheckFlag;
+
/* Forward */
static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *);
-static int initerr(perrdetail *err_ret, PyObject * filename);
-
-typedef struct {
- struct {
- int lineno;
- char *comment;
- } *items;
- size_t size;
- size_t num_items;
-} growable_comment_array;
-
-static int
-growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
- assert(initial_size > 0);
- arr->items = malloc(initial_size * sizeof(*arr->items));
- arr->size = initial_size;
- arr->num_items = 0;
-
- return arr->items != NULL;
-}
-
-static int
-growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
- if (arr->num_items >= arr->size) {
- arr->size *= 2;
- arr->items = realloc(arr->items, arr->size * sizeof(*arr->items));
- if (!arr->items) {
- return 0;
- }
- }
-
- arr->items[arr->num_items].lineno = lineno;
- arr->items[arr->num_items].comment = comment;
- arr->num_items++;
- return 1;
-}
-
-static void
-growable_comment_array_deallocate(growable_comment_array *arr) {
- for (unsigned i = 0; i < arr->num_items; i++) {
- PyObject_FREE(arr->items[i].comment);
- }
- free(arr->items);
-}
+static void initerr(perrdetail *err_ret, const char* filename);
/* Parse input coming from a string. Return error code, print some errors. */
node *
@@ -84,135 +43,74 @@ PyParser_ParseStringFlagsFilename(const char *s, const char *filename,
}
node *
-PyParser_ParseStringObject(const char *s, PyObject *filename,
- grammar *g, int start,
- perrdetail *err_ret, int *flags)
+PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
+ grammar *g, int start,
+ perrdetail *err_ret, int *flags)
{
struct tok_state *tok;
- int exec_input = start == file_input;
- if (initerr(err_ret, filename) < 0)
- return NULL;
-
- if (PySys_Audit("compile", "yO", s, err_ret->filename) < 0) {
- err_ret->error = E_ERROR;
- return NULL;
- }
+ initerr(err_ret, filename);
- if (*flags & PyPARSE_IGNORE_COOKIE)
- tok = PyTokenizer_FromUTF8(s, exec_input);
- else
- tok = PyTokenizer_FromString(s, exec_input);
- if (tok == NULL) {
+ if ((tok = PyTokenizer_FromString(s, start == file_input)) == NULL) {
err_ret->error = PyErr_Occurred() ? E_DECODE : E_NOMEM;
return NULL;
}
- if (*flags & PyPARSE_TYPE_COMMENTS) {
- tok->type_comments = 1;
+
+ tok->filename = filename ? filename : "<string>";
+ if (Py_TabcheckFlag || Py_VerboseFlag) {
+ tok->altwarning = (tok->filename != NULL);
+ if (Py_TabcheckFlag >= 2)
+ tok->alterror++;
}
- Py_INCREF(err_ret->filename);
- tok->filename = err_ret->filename;
- if (*flags & PyPARSE_ASYNC_HACKS)
- tok->async_hacks = 1;
return parsetok(tok, g, start, err_ret, flags);
}
-node *
-PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
- grammar *g, int start,
- perrdetail *err_ret, int *flags)
-{
- node *n;
- PyObject *filename = NULL;
- if (filename_str != NULL) {
- filename = PyUnicode_DecodeFSDefault(filename_str);
- if (filename == NULL) {
- err_ret->error = E_ERROR;
- return NULL;
- }
- }
- n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags);
- Py_XDECREF(filename);
- return n;
-}
-
/* Parse input coming from a file. Return error code, print some errors. */
node *
PyParser_ParseFile(FILE *fp, const char *filename, grammar *g, int start,
- const char *ps1, const char *ps2,
- perrdetail *err_ret)
+ char *ps1, char *ps2, perrdetail *err_ret)
{
- return PyParser_ParseFileFlags(fp, filename, NULL,
- g, start, ps1, ps2, err_ret, 0);
+ return PyParser_ParseFileFlags(fp, filename, g, start, ps1, ps2,
+ err_ret, 0);
}
node *
-PyParser_ParseFileFlags(FILE *fp, const char *filename, const char *enc,
- grammar *g, int start,
- const char *ps1, const char *ps2,
- perrdetail *err_ret, int flags)
+PyParser_ParseFileFlags(FILE *fp, const char *filename, grammar *g, int start,
+ char *ps1, char *ps2, perrdetail *err_ret, int flags)
{
int iflags = flags;
- return PyParser_ParseFileFlagsEx(fp, filename, enc, g, start, ps1,
- ps2, err_ret, &iflags);
+ return PyParser_ParseFileFlagsEx(fp, filename, g, start, ps1, ps2, err_ret, &iflags);
}
node *
-PyParser_ParseFileObject(FILE *fp, PyObject *filename,
- const char *enc, grammar *g, int start,
- const char *ps1, const char *ps2,
- perrdetail *err_ret, int *flags)
+PyParser_ParseFileFlagsEx(FILE *fp, const char *filename, grammar *g, int start,
+ char *ps1, char *ps2, perrdetail *err_ret, int *flags)
{
struct tok_state *tok;
- if (initerr(err_ret, filename) < 0)
- return NULL;
+ initerr(err_ret, filename);
- if (PySys_Audit("compile", "OO", Py_None, err_ret->filename) < 0) {
- return NULL;
- }
-
- if ((tok = PyTokenizer_FromFile(fp, enc, ps1, ps2)) == NULL) {
+ if ((tok = PyTokenizer_FromFile(fp, ps1, ps2)) == NULL) {
err_ret->error = E_NOMEM;
return NULL;
}
- if (*flags & PyPARSE_TYPE_COMMENTS) {
- tok->type_comments = 1;
+ tok->filename = filename;
+ if (Py_TabcheckFlag || Py_VerboseFlag) {
+ tok->altwarning = (filename != NULL);
+ if (Py_TabcheckFlag >= 2)
+ tok->alterror++;
}
- Py_INCREF(err_ret->filename);
- tok->filename = err_ret->filename;
- return parsetok(tok, g, start, err_ret, flags);
-}
-node *
-PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
- const char *enc, grammar *g, int start,
- const char *ps1, const char *ps2,
- perrdetail *err_ret, int *flags)
-{
- node *n;
- PyObject *fileobj = NULL;
- if (filename != NULL) {
- fileobj = PyUnicode_DecodeFSDefault(filename);
- if (fileobj == NULL) {
- err_ret->error = E_ERROR;
- return NULL;
- }
- }
- n = PyParser_ParseFileObject(fp, fileobj, enc, g,
- start, ps1, ps2, err_ret, flags);
- Py_XDECREF(fileobj);
- return n;
+ return parsetok(tok, g, start, err_ret, flags);
}
-#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
#if 0
-static const char with_msg[] =
+static char with_msg[] =
"%s:%d: Warning: 'with' will become a reserved keyword in Python 2.6\n";
-static const char as_msg[] =
+static char as_msg[] =
"%s:%d: Warning: 'as' will become a reserved keyword in Python 2.6\n";
static void
@@ -223,7 +121,6 @@ warn(const char *msg, const char *filename, int lineno)
PySys_WriteStderr(msg, filename, lineno);
}
#endif
-#endif
/* Parse input coming from the given tokenizer structure.
Return error code. */
@@ -235,25 +132,21 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
parser_state *ps;
node *n;
int started = 0;
- int col_offset, end_col_offset;
- growable_comment_array type_ignores;
-
- if (!growable_comment_array_init(&type_ignores, 10)) {
- err_ret->error = E_NOMEM;
- PyTokenizer_Free(tok);
- return NULL;
- }
if ((ps = PyParser_New(g, start)) == NULL) {
+ fprintf(stderr, "no mem for new parser\n");
err_ret->error = E_NOMEM;
PyTokenizer_Free(tok);
return NULL;
}
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
- if (*flags & PyPARSE_BARRY_AS_BDFL)
- ps->p_flags |= CO_FUTURE_BARRY_AS_BDFL;
- if (*flags & PyPARSE_TYPE_COMMENTS)
- ps->p_flags |= PyCF_TYPE_COMMENTS;
+ if (*flags & PyPARSE_PRINT_IS_FUNCTION) {
+ ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
+ }
+ if (*flags & PyPARSE_UNICODE_LITERALS) {
+ ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
+ }
+
#endif
for (;;) {
@@ -261,9 +154,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
int type;
size_t len;
char *str;
- col_offset = -1;
- int lineno;
- const char *line_start;
+ int col_offset;
type = PyTokenizer_Get(tok, &a, &b);
if (type == ERRORTOKEN) {
@@ -288,6 +179,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
len = (a != NULL && b != NULL) ? b - a : 0;
str = (char *) PyObject_MALLOC(len + 1);
if (str == NULL) {
+ fprintf(stderr, "no mem for next token\n");
err_ret->error = E_NOMEM;
break;
}
@@ -296,56 +188,16 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
str[len] = '\0';
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
- if (type == NOTEQUAL) {
- if (!(ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) &&
- strcmp(str, "!=")) {
- PyObject_FREE(str);
- err_ret->error = E_SYNTAX;
- break;
- }
- else if ((ps->p_flags & CO_FUTURE_BARRY_AS_BDFL) &&
- strcmp(str, "<>")) {
- PyObject_FREE(str);
- err_ret->expected = NOTEQUAL;
- err_ret->error = E_SYNTAX;
- break;
- }
- }
#endif
-
- /* Nodes of type STRING, especially multi line strings
- must be handled differently in order to get both
- the starting line number and the column offset right.
- (cf. issue 16806) */
- lineno = type == STRING ? tok->first_lineno : tok->lineno;
- line_start = type == STRING ? tok->multi_line_start : tok->line_start;
- if (a != NULL && a >= line_start) {
- col_offset = Py_SAFE_DOWNCAST(a - line_start,
- intptr_t, int);
+ if (a != NULL && a >= tok->line_start) {
+ col_offset = a - tok->line_start;
}
else {
col_offset = -1;
}
- if (b != NULL && b >= tok->line_start) {
- end_col_offset = Py_SAFE_DOWNCAST(b - tok->line_start,
- intptr_t, int);
- }
- else {
- end_col_offset = -1;
- }
-
- if (type == TYPE_IGNORE) {
- if (!growable_comment_array_add(&type_ignores, tok->lineno, str)) {
- err_ret->error = E_NOMEM;
- break;
- }
- continue;
- }
-
if ((err_ret->error =
- PyParser_AddToken(ps, (int)type, str,
- lineno, col_offset, tok->lineno, end_col_offset,
+ PyParser_AddToken(ps, (int)type, str, tok->lineno, col_offset,
&(err_ret->expected))) != E_OK) {
if (err_ret->error != E_DONE) {
PyObject_FREE(str);
@@ -358,87 +210,38 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
if (err_ret->error == E_DONE) {
n = ps->p_tree;
ps->p_tree = NULL;
-
- if (n->n_type == file_input) {
- /* Put type_ignore nodes in the ENDMARKER of file_input. */
- int num;
- node *ch;
- size_t i;
-
- num = NCH(n);
- ch = CHILD(n, num - 1);
- REQ(ch, ENDMARKER);
-
- for (i = 0; i < type_ignores.num_items; i++) {
- int res = PyNode_AddChild(ch, TYPE_IGNORE, type_ignores.items[i].comment,
- type_ignores.items[i].lineno, 0,
- type_ignores.items[i].lineno, 0);
- if (res != 0) {
- err_ret->error = res;
- PyNode_Free(n);
- n = NULL;
- break;
- }
- type_ignores.items[i].comment = NULL;
- }
- }
-
- /* Check that the source for a single input statement really
- is a single statement by looking at what is left in the
- buffer after parsing. Trailing whitespace and comments
- are OK. */
- if (err_ret->error == E_DONE && start == single_input) {
- char *cur = tok->cur;
- char c = *tok->cur;
-
- for (;;) {
- while (c == ' ' || c == '\t' || c == '\n' || c == '\014')
- c = *++cur;
-
- if (!c)
- break;
-
- if (c != '#') {
- err_ret->error = E_BADSINGLE;
- PyNode_Free(n);
- n = NULL;
- break;
- }
-
- /* Suck up comment. */
- while (c && c != '\n')
- c = *++cur;
- }
- }
}
else
n = NULL;
- growable_comment_array_deallocate(&type_ignores);
-
#ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
*flags = ps->p_flags;
#endif
PyParser_Delete(ps);
if (n == NULL) {
- if (tok->done == E_EOF)
+ if (tok->lineno <= 1 && tok->done == E_EOF)
err_ret->error = E_EOF;
err_ret->lineno = tok->lineno;
if (tok->buf != NULL) {
+ char *text = NULL;
size_t len;
assert(tok->cur - tok->buf < INT_MAX);
- /* if we've managed to parse a token, point the offset to its start,
- * else use the current reading position of the tokenizer
- */
- err_ret->offset = col_offset != -1 ? col_offset + 1 : ((int)(tok->cur - tok->buf));
+ err_ret->offset = (int)(tok->cur - tok->buf);
len = tok->inp - tok->buf;
- err_ret->text = (char *) PyObject_MALLOC(len + 1);
- if (err_ret->text != NULL) {
- if (len > 0)
- strncpy(err_ret->text, tok->buf, len);
- err_ret->text[len] = '\0';
+#ifdef Py_USING_UNICODE
+ text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
+
+#endif
+ if (text == NULL) {
+ text = (char *) PyObject_MALLOC(len + 1);
+ if (text != NULL) {
+ if (len > 0)
+ strncpy(text, tok->buf, len);
+ text[len] = '\0';
+ }
}
+ err_ret->text = text;
}
} else if (tok->encoding != NULL) {
/* 'nodes->n_str' uses PyObject_*, while 'tok->encoding' was
@@ -465,31 +268,17 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
done:
PyTokenizer_Free(tok);
- if (n != NULL) {
- _PyNode_FinalizeEndPos(n);
- }
return n;
}
-static int
-initerr(perrdetail *err_ret, PyObject *filename)
+static void
+initerr(perrdetail *err_ret, const char *filename)
{
err_ret->error = E_OK;
+ err_ret->filename = filename;
err_ret->lineno = 0;
err_ret->offset = 0;
err_ret->text = NULL;
err_ret->token = -1;
err_ret->expected = -1;
- if (filename) {
- Py_INCREF(filename);
- err_ret->filename = filename;
- }
- else {
- err_ret->filename = PyUnicode_FromString("<string>");
- if (err_ret->filename == NULL) {
- err_ret->error = E_ERROR;
- return -1;
- }
- }
- return 0;
}
diff --git a/Parser/pgen.c b/Parser/pgen.c
new file mode 100644
index 0000000..b20d976
--- /dev/null
+++ b/Parser/pgen.c
@@ -0,0 +1,726 @@
+/* Parser generator */
+
+/* For a description, see the comments at end of this file */
+
+#include "Python.h"
+#include "pgenheaders.h"
+#include "token.h"
+#include "node.h"
+#include "grammar.h"
+#include "metagrammar.h"
+#include "pgen.h"
+
+extern int Py_DebugFlag;
+extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */
+
+
+/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
+
+typedef struct _nfaarc {
+ int ar_label;
+ int ar_arrow;
+} nfaarc;
+
+typedef struct _nfastate {
+ int st_narcs;
+ nfaarc *st_arc;
+} nfastate;
+
+typedef struct _nfa {
+ int nf_type;
+ char *nf_name;
+ int nf_nstates;
+ nfastate *nf_state;
+ int nf_start, nf_finish;
+} nfa;
+
+/* Forward */
+static void compile_rhs(labellist *ll,
+ nfa *nf, node *n, int *pa, int *pb);
+static void compile_alt(labellist *ll,
+ nfa *nf, node *n, int *pa, int *pb);
+static void compile_item(labellist *ll,
+ nfa *nf, node *n, int *pa, int *pb);
+static void compile_atom(labellist *ll,
+ nfa *nf, node *n, int *pa, int *pb);
+
+static int
+addnfastate(nfa *nf)
+{
+ nfastate *st;
+
+ nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state,
+ sizeof(nfastate) * (nf->nf_nstates + 1));
+ if (nf->nf_state == NULL)
+ Py_FatalError("out of mem");
+ st = &nf->nf_state[nf->nf_nstates++];
+ st->st_narcs = 0;
+ st->st_arc = NULL;
+ return st - nf->nf_state;
+}
+
+static void
+addnfaarc(nfa *nf, int from, int to, int lbl)
+{
+ nfastate *st;
+ nfaarc *ar;
+
+ st = &nf->nf_state[from];
+ st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc,
+ sizeof(nfaarc) * (st->st_narcs + 1));
+ if (st->st_arc == NULL)
+ Py_FatalError("out of mem");
+ ar = &st->st_arc[st->st_narcs++];
+ ar->ar_label = lbl;
+ ar->ar_arrow = to;
+}
+
+static nfa *
+newnfa(char *name)
+{
+ nfa *nf;
+ static int type = NT_OFFSET; /* All types will be disjunct */
+
+ nf = (nfa *)PyObject_MALLOC(sizeof(nfa));
+ if (nf == NULL)
+ Py_FatalError("no mem for new nfa");
+ nf->nf_type = type++;
+ nf->nf_name = name; /* XXX strdup(name) ??? */
+ nf->nf_nstates = 0;
+ nf->nf_state = NULL;
+ nf->nf_start = nf->nf_finish = -1;
+ return nf;
+}
+
+typedef struct _nfagrammar {
+ int gr_nnfas;
+ nfa **gr_nfa;
+ labellist gr_ll;
+} nfagrammar;
+
+/* Forward */
+static void compile_rule(nfagrammar *gr, node *n);
+
+static nfagrammar *
+newnfagrammar(void)
+{
+ nfagrammar *gr;
+
+ gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar));
+ if (gr == NULL)
+ Py_FatalError("no mem for new nfa grammar");
+ gr->gr_nnfas = 0;
+ gr->gr_nfa = NULL;
+ gr->gr_ll.ll_nlabels = 0;
+ gr->gr_ll.ll_label = NULL;
+ addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
+ return gr;
+}
+
+static void
+freenfagrammar(nfagrammar *gr)
+{
+ int i;
+ for (i = 0; i < gr->gr_nnfas; i++) {
+ PyObject_FREE(gr->gr_nfa[i]->nf_state);
+ }
+ PyObject_FREE(gr->gr_nfa);
+ PyObject_FREE(gr);
+}
+
+static nfa *
+addnfa(nfagrammar *gr, char *name)
+{
+ nfa *nf;
+
+ nf = newnfa(name);
+ gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa,
+ sizeof(nfa*) * (gr->gr_nnfas + 1));
+ if (gr->gr_nfa == NULL)
+ Py_FatalError("out of mem");
+ gr->gr_nfa[gr->gr_nnfas++] = nf;
+ addlabel(&gr->gr_ll, NAME, nf->nf_name);
+ return nf;
+}
+
+#ifdef Py_DEBUG
+
+static char REQNFMT[] = "metacompile: less than %d children\n";
+
+#define REQN(i, count) do { \
+ if (i < count) { \
+ fprintf(stderr, REQNFMT, count); \
+ Py_FatalError("REQN"); \
+ } \
+} while (0)
+
+#else
+#define REQN(i, count) /* empty */
+#endif
+
+static nfagrammar *
+metacompile(node *n)
+{
+ nfagrammar *gr;
+ int i;
+
+ if (Py_DebugFlag)
+ printf("Compiling (meta-) parse tree into NFA grammar\n");
+ gr = newnfagrammar();
+ REQ(n, MSTART);
+ i = n->n_nchildren - 1; /* Last child is ENDMARKER */
+ n = n->n_child;
+ for (; --i >= 0; n++) {
+ if (n->n_type != NEWLINE)
+ compile_rule(gr, n);
+ }
+ return gr;
+}
+
+static void
+compile_rule(nfagrammar *gr, node *n)
+{
+ nfa *nf;
+
+ REQ(n, RULE);
+ REQN(n->n_nchildren, 4);
+ n = n->n_child;
+ REQ(n, NAME);
+ nf = addnfa(gr, n->n_str);
+ n++;
+ REQ(n, COLON);
+ n++;
+ REQ(n, RHS);
+ compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
+ n++;
+ REQ(n, NEWLINE);
+}
+
+static void
+compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
+{
+ int i;
+ int a, b;
+
+ REQ(n, RHS);
+ i = n->n_nchildren;
+ REQN(i, 1);
+ n = n->n_child;
+ REQ(n, ALT);
+ compile_alt(ll, nf, n, pa, pb);
+ if (--i <= 0)
+ return;
+ n++;
+ a = *pa;
+ b = *pb;
+ *pa = addnfastate(nf);
+ *pb = addnfastate(nf);
+ addnfaarc(nf, *pa, a, EMPTY);
+ addnfaarc(nf, b, *pb, EMPTY);
+ for (; --i >= 0; n++) {
+ REQ(n, VBAR);
+ REQN(i, 1);
+ --i;
+ n++;
+ REQ(n, ALT);
+ compile_alt(ll, nf, n, &a, &b);
+ addnfaarc(nf, *pa, a, EMPTY);
+ addnfaarc(nf, b, *pb, EMPTY);
+ }
+}
+
+static void
+compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
+{
+ int i;
+ int a, b;
+
+ REQ(n, ALT);
+ i = n->n_nchildren;
+ REQN(i, 1);
+ n = n->n_child;
+ REQ(n, ITEM);
+ compile_item(ll, nf, n, pa, pb);
+ --i;
+ n++;
+ for (; --i >= 0; n++) {
+ REQ(n, ITEM);
+ compile_item(ll, nf, n, &a, &b);
+ addnfaarc(nf, *pb, a, EMPTY);
+ *pb = b;
+ }
+}
+
+static void
+compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
+{
+ int i;
+ int a, b;
+
+ REQ(n, ITEM);
+ i = n->n_nchildren;
+ REQN(i, 1);
+ n = n->n_child;
+ if (n->n_type == LSQB) {
+ REQN(i, 3);
+ n++;
+ REQ(n, RHS);
+ *pa = addnfastate(nf);
+ *pb = addnfastate(nf);
+ addnfaarc(nf, *pa, *pb, EMPTY);
+ compile_rhs(ll, nf, n, &a, &b);
+ addnfaarc(nf, *pa, a, EMPTY);
+ addnfaarc(nf, b, *pb, EMPTY);
+ REQN(i, 1);
+ n++;
+ REQ(n, RSQB);
+ }
+ else {
+ compile_atom(ll, nf, n, pa, pb);
+ if (--i <= 0)
+ return;
+ n++;
+ addnfaarc(nf, *pb, *pa, EMPTY);
+ if (n->n_type == STAR)
+ *pb = *pa;
+ else
+ REQ(n, PLUS);
+ }
+}
+
+static void
+compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
+{
+ int i;
+
+ REQ(n, ATOM);
+ i = n->n_nchildren;
+ (void)i; /* Don't warn about set but unused */
+ REQN(i, 1);
+ n = n->n_child;
+ if (n->n_type == LPAR) {
+ REQN(i, 3);
+ n++;
+ REQ(n, RHS);
+ compile_rhs(ll, nf, n, pa, pb);
+ n++;
+ REQ(n, RPAR);
+ }
+ else if (n->n_type == NAME || n->n_type == STRING) {
+ *pa = addnfastate(nf);
+ *pb = addnfastate(nf);
+ addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
+ }
+ else
+ REQ(n, NAME);
+}
+
+static void
+dumpstate(labellist *ll, nfa *nf, int istate)
+{
+ nfastate *st;
+ int i;
+ nfaarc *ar;
+
+ printf("%c%2d%c",
+ istate == nf->nf_start ? '*' : ' ',
+ istate,
+ istate == nf->nf_finish ? '.' : ' ');
+ st = &nf->nf_state[istate];
+ ar = st->st_arc;
+ for (i = 0; i < st->st_narcs; i++) {
+ if (i > 0)
+ printf("\n ");
+ printf("-> %2d %s", ar->ar_arrow,
+ PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
+ ar++;
+ }
+ printf("\n");
+}
+
+static void
+dumpnfa(labellist *ll, nfa *nf)
+{
+ int i;
+
+ printf("NFA '%s' has %d states; start %d, finish %d\n",
+ nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
+ for (i = 0; i < nf->nf_nstates; i++)
+ dumpstate(ll, nf, i);
+}
+
+
+/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
+
+static void
+addclosure(bitset ss, nfa *nf, int istate)
+{
+ if (addbit(ss, istate)) {
+ nfastate *st = &nf->nf_state[istate];
+ nfaarc *ar = st->st_arc;
+ int i;
+
+ for (i = st->st_narcs; --i >= 0; ) {
+ if (ar->ar_label == EMPTY)
+ addclosure(ss, nf, ar->ar_arrow);
+ ar++;
+ }
+ }
+}
+
+typedef struct _ss_arc {
+ bitset sa_bitset;
+ int sa_arrow;
+ int sa_label;
+} ss_arc;
+
+typedef struct _ss_state {
+ bitset ss_ss;
+ int ss_narcs;
+ struct _ss_arc *ss_arc;
+ int ss_deleted;
+ int ss_finish;
+ int ss_rename;
+} ss_state;
+
+typedef struct _ss_dfa {
+ int sd_nstates;
+ ss_state *sd_state;
+} ss_dfa;
+
+/* Forward */
+static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
+ labellist *ll, char *msg);
+static void simplify(int xx_nstates, ss_state *xx_state);
+static void convert(dfa *d, int xx_nstates, ss_state *xx_state);
+
+static void
+makedfa(nfagrammar *gr, nfa *nf, dfa *d)
+{
+ int nbits = nf->nf_nstates;
+ bitset ss;
+ int xx_nstates;
+ ss_state *xx_state, *yy;
+ ss_arc *zz;
+ int istate, jstate, iarc, jarc, ibit;
+ nfastate *st;
+ nfaarc *ar;
+ int i, j;
+
+ ss = newbitset(nbits);
+ addclosure(ss, nf, nf->nf_start);
+ xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state));
+ if (xx_state == NULL)
+ Py_FatalError("no mem for xx_state in makedfa");
+ xx_nstates = 1;
+ yy = &xx_state[0];
+ yy->ss_ss = ss;
+ yy->ss_narcs = 0;
+ yy->ss_arc = NULL;
+ yy->ss_deleted = 0;
+ yy->ss_finish = testbit(ss, nf->nf_finish);
+ if (yy->ss_finish)
+ printf("Error: nonterminal '%s' may produce empty.\n",
+ nf->nf_name);
+
+ /* This algorithm is from a book written before
+ the invention of structured programming... */
+
+ /* For each unmarked state... */
+ for (istate = 0; istate < xx_nstates; ++istate) {
+ size_t size;
+ yy = &xx_state[istate];
+ ss = yy->ss_ss;
+ /* For all its states... */
+ for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
+ if (!testbit(ss, ibit))
+ continue;
+ st = &nf->nf_state[ibit];
+ /* For all non-empty arcs from this state... */
+ for (iarc = 0; iarc < st->st_narcs; iarc++) {
+ ar = &st->st_arc[iarc];
+ if (ar->ar_label == EMPTY)
+ continue;
+ /* Look up in list of arcs from this state */
+ for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
+ zz = &yy->ss_arc[jarc];
+ if (ar->ar_label == zz->sa_label)
+ goto found;
+ }
+ /* Add new arc for this state */
+ size = sizeof(ss_arc) * (yy->ss_narcs + 1);
+ yy->ss_arc = (ss_arc *)PyObject_REALLOC(
+ yy->ss_arc, size);
+ if (yy->ss_arc == NULL)
+ Py_FatalError("out of mem");
+ zz = &yy->ss_arc[yy->ss_narcs++];
+ zz->sa_label = ar->ar_label;
+ zz->sa_bitset = newbitset(nbits);
+ zz->sa_arrow = -1;
+ found: ;
+ /* Add destination */
+ addclosure(zz->sa_bitset, nf, ar->ar_arrow);
+ }
+ }
+ /* Now look up all the arrow states */
+ for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
+ zz = &xx_state[istate].ss_arc[jarc];
+ for (jstate = 0; jstate < xx_nstates; jstate++) {
+ if (samebitset(zz->sa_bitset,
+ xx_state[jstate].ss_ss, nbits)) {
+ zz->sa_arrow = jstate;
+ goto done;
+ }
+ }
+ size = sizeof(ss_state) * (xx_nstates + 1);
+ xx_state = (ss_state *)PyObject_REALLOC(xx_state,
+ size);
+ if (xx_state == NULL)
+ Py_FatalError("out of mem");
+ zz->sa_arrow = xx_nstates;
+ yy = &xx_state[xx_nstates++];
+ yy->ss_ss = zz->sa_bitset;
+ yy->ss_narcs = 0;
+ yy->ss_arc = NULL;
+ yy->ss_deleted = 0;
+ yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
+ done: ;
+ }
+ }
+
+ if (Py_DebugFlag)
+ printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
+ "before minimizing");
+
+ simplify(xx_nstates, xx_state);
+
+ if (Py_DebugFlag)
+ printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
+ "after minimizing");
+
+ convert(d, xx_nstates, xx_state);
+
+ for (i = 0; i < xx_nstates; i++) {
+ for (j = 0; j < xx_state[i].ss_narcs; j++)
+ delbitset(xx_state[i].ss_arc[j].sa_bitset);
+ PyObject_FREE(xx_state[i].ss_arc);
+ }
+ PyObject_FREE(xx_state);
+}
+
+static void
+printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
+ labellist *ll, char *msg)
+{
+ int i, ibit, iarc;
+ ss_state *yy;
+ ss_arc *zz;
+
+ printf("Subset DFA %s\n", msg);
+ for (i = 0; i < xx_nstates; i++) {
+ yy = &xx_state[i];
+ if (yy->ss_deleted)
+ continue;
+ printf(" Subset %d", i);
+ if (yy->ss_finish)
+ printf(" (finish)");
+ printf(" { ");
+ for (ibit = 0; ibit < nbits; ibit++) {
+ if (testbit(yy->ss_ss, ibit))
+ printf("%d ", ibit);
+ }
+ printf("}\n");
+ for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
+ zz = &yy->ss_arc[iarc];
+ printf(" Arc to state %d, label %s\n",
+ zz->sa_arrow,
+ PyGrammar_LabelRepr(
+ &ll->ll_label[zz->sa_label]));
+ }
+ }
+}
+
+
+/* PART THREE -- SIMPLIFY DFA */
+
+/* Simplify the DFA by repeatedly eliminating states that are
+   equivalent to another one.  This is NOT Algorithm 3.3 from
+   [Aho&Ullman 77].  It does not always find the minimal DFA,
+ but it does usually make a much smaller one... (For an example
+ of sub-optimal behavior, try S: x a b+ | y a b+.)
+*/
+
+static int
+samestate(ss_state *s1, ss_state *s2)
+{
+ int i;
+
+ if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
+ return 0;
+ for (i = 0; i < s1->ss_narcs; i++) {
+ if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
+ s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
+ return 0;
+ }
+ return 1;
+}
+
+static void
+renamestates(int xx_nstates, ss_state *xx_state, int from, int to)
+{
+ int i, j;
+
+ if (Py_DebugFlag)
+ printf("Rename state %d to %d.\n", from, to);
+ for (i = 0; i < xx_nstates; i++) {
+ if (xx_state[i].ss_deleted)
+ continue;
+ for (j = 0; j < xx_state[i].ss_narcs; j++) {
+ if (xx_state[i].ss_arc[j].sa_arrow == from)
+ xx_state[i].ss_arc[j].sa_arrow = to;
+ }
+ }
+}
+
+static void
+simplify(int xx_nstates, ss_state *xx_state)
+{
+ int changes;
+ int i, j;
+
+ do {
+ changes = 0;
+ for (i = 1; i < xx_nstates; i++) {
+ if (xx_state[i].ss_deleted)
+ continue;
+ for (j = 0; j < i; j++) {
+ if (xx_state[j].ss_deleted)
+ continue;
+ if (samestate(&xx_state[i], &xx_state[j])) {
+ xx_state[i].ss_deleted++;
+ renamestates(xx_nstates, xx_state,
+ i, j);
+ changes++;
+ break;
+ }
+ }
+ }
+ } while (changes);
+}
+
+
+/* PART FOUR -- GENERATE PARSING TABLES */
+
+/* Convert the DFA into a grammar that can be used by our parser */
+
+static void
+convert(dfa *d, int xx_nstates, ss_state *xx_state)
+{
+ int i, j;
+ ss_state *yy;
+ ss_arc *zz;
+
+ for (i = 0; i < xx_nstates; i++) {
+ yy = &xx_state[i];
+ if (yy->ss_deleted)
+ continue;
+ yy->ss_rename = addstate(d);
+ }
+
+ for (i = 0; i < xx_nstates; i++) {
+ yy = &xx_state[i];
+ if (yy->ss_deleted)
+ continue;
+ for (j = 0; j < yy->ss_narcs; j++) {
+ zz = &yy->ss_arc[j];
+ addarc(d, yy->ss_rename,
+ xx_state[zz->sa_arrow].ss_rename,
+ zz->sa_label);
+ }
+ if (yy->ss_finish)
+ addarc(d, yy->ss_rename, yy->ss_rename, 0);
+ }
+
+ d->d_initial = 0;
+}
+
+
+/* PART FIVE -- GLUE IT ALL TOGETHER */
+
+static grammar *
+maketables(nfagrammar *gr)
+{
+ int i;
+ nfa *nf;
+ dfa *d;
+ grammar *g;
+
+ if (gr->gr_nnfas == 0)
+ return NULL;
+ g = newgrammar(gr->gr_nfa[0]->nf_type);
+ /* XXX first rule must be start rule */
+ g->g_ll = gr->gr_ll;
+
+ for (i = 0; i < gr->gr_nnfas; i++) {
+ nf = gr->gr_nfa[i];
+ if (Py_DebugFlag) {
+ printf("Dump of NFA for '%s' ...\n", nf->nf_name);
+ dumpnfa(&gr->gr_ll, nf);
+ printf("Making DFA for '%s' ...\n", nf->nf_name);
+ }
+ d = adddfa(g, nf->nf_type, nf->nf_name);
+ makedfa(gr, gr->gr_nfa[i], d);
+ }
+
+ return g;
+}
+
+grammar *
+pgen(node *n)
+{
+ nfagrammar *gr;
+ grammar *g;
+
+ gr = metacompile(n);
+ g = maketables(gr);
+ translatelabels(g);
+ addfirstsets(g);
+ freenfagrammar(gr);
+ return g;
+}
+
+grammar *
+Py_pgen(node *n)
+{
+ return pgen(n);
+}
+
+/*
+
+Description
+-----------
+
+Input is a grammar in extended BNF (using * for repetition, + for
+at-least-once repetition, [] for optional parts, | for alternatives and
+() for grouping). This has already been parsed and turned into a parse
+tree.
+
+Each rule is considered as a regular expression in its own right.
+It is turned into a Non-deterministic Finite Automaton (NFA), which
+is then turned into a Deterministic Finite Automaton (DFA), which is then
+optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
+or similar compiler books (this technique is more often used for lexical
+analyzers).
+
+The DFAs are used by the parser as parsing tables in a special way
+that's probably unique. Before they are usable, the FIRST sets of all
+non-terminals are computed.
+
+Reference
+---------
+
+[Aho&Ullman 77]
+ Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
+ (first edition)
+
+*/
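
The state-merging pass described in PART THREE of pgen.c above can be illustrated
in a few lines. The following is a minimal Python sketch of the same fixpoint loop
(repeatedly merge states whose finish flag and arc lists coincide); the list-based
state representation and the concrete states are invented for the example and are
not pgen's ss_state structures.

    # A state is [finish_flag, [(label, target_index), ...]]; indices refer
    # to positions in `states`. Merged states are replaced by None.
    def simplify(states):
        changed = True
        while changed:
            changed = False
            for i in range(1, len(states)):
                if states[i] is None:
                    continue
                for j in range(i):
                    if states[j] is None or states[i] != states[j]:
                        continue
                    # Redirect every arc pointing at i to j, then drop i.
                    for st in states:
                        if st is not None:
                            st[1][:] = [(lbl, j if tgt == i else tgt)
                                        for lbl, tgt in st[1]]
                    states[i] = None
                    changed = True
                    break
        return states

    # States 1 and 2 are equivalent, so state 2 is merged into state 1.
    states = [
        [False, [("a", 1), ("b", 2)]],
        [True,  [("c", 3)]],
        [True,  [("c", 3)]],
        [False, []],
    ]
    print(simplify(states))
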
diff --git a/Parser/pgen/__init__.py b/Parser/pgen/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/Parser/pgen/__init__.py
+++ /dev/null
diff --git a/Parser/pgen/__main__.py b/Parser/pgen/__main__.py
deleted file mode 100644
index bb96e75..0000000
--- a/Parser/pgen/__main__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import argparse
-
-from .pgen import ParserGenerator
-
-
-def main():
- parser = argparse.ArgumentParser(description="Parser generator main program.")
- parser.add_argument(
- "grammar", type=str, help="The file with the grammar definition in EBNF format"
- )
- parser.add_argument("tokens", type=str, help="The file with the token definitions")
- parser.add_argument(
- "graminit_h",
- type=argparse.FileType("w"),
- help="The path to write the grammar's non-terminals as #defines",
- )
- parser.add_argument(
- "graminit_c",
- type=argparse.FileType("w"),
- help="The path to write the grammar as initialized data",
- )
-
- parser.add_argument("--verbose", "-v", action="count")
- args = parser.parse_args()
-
- p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
- grammar = p.make_grammar()
- grammar.produce_graminit_h(args.graminit_h.write)
- grammar.produce_graminit_c(args.graminit_c.write)
-
-
-if __name__ == "__main__":
- main()
diff --git a/Parser/pgen/automata.py b/Parser/pgen/automata.py
deleted file mode 100644
index 545a737..0000000
--- a/Parser/pgen/automata.py
+++ /dev/null
@@ -1,371 +0,0 @@
-"""Classes representing state-machine concepts"""
-
-class NFA:
-    """A non-deterministic finite automaton
-
-    A non-deterministic finite automaton is a form of finite state
-    machine. An NFA's rules are less restrictive than a DFA's.
- The NFA rules are:
-
- * A transition can be non-deterministic and can result in
-      zero, one, or more states.
-
- * An epsilon transition consuming empty input is valid.
- Transitions consuming labeled symbols are also permitted.
-
- This class assumes that there is only one starting state and one
- accepting (ending) state.
-
- Attributes:
- name (str): The name of the rule the NFA is representing.
- start (NFAState): The starting state.
- end (NFAState): The ending state
- """
-
- def __init__(self, start, end):
- self.name = start.rule_name
- self.start = start
- self.end = end
-
- def __repr__(self):
- return "NFA(start={}, end={})".format(self.start, self.end)
-
- def dump(self, writer=print):
- """Dump a graphical representation of the NFA"""
- todo = [self.start]
- for i, state in enumerate(todo):
- writer(" State", i, state is self.end and "(final)" or "")
- for arc in state.arcs:
- label = arc.label
- next = arc.target
- if next in todo:
- j = todo.index(next)
- else:
- j = len(todo)
- todo.append(next)
- if label is None:
- writer(" -> %d" % j)
- else:
- writer(" %s -> %d" % (label, j))
-
-
-class NFAArc:
- """An arc representing a transition between two NFA states.
-
- NFA states can be connected via two ways:
-
- * A label transition: An input equal to the label must
- be consumed to perform the transition.
- * An epsilon transition: The transition can be taken without
- consuming any input symbol.
-
- Attributes:
- target (NFAState): The ending state of the transition arc.
- label (Optional[str]): The label that must be consumed to make
- the transition. An epsilon transition is represented
- using `None`.
- """
-
- def __init__(self, target, label):
- self.target = target
- self.label = label
-
- def __repr__(self):
- return "<%s: %s>" % (self.__class__.__name__, self.label)
-
-
-class NFAState:
-    """A state of an NFA, a non-deterministic finite automaton.
-
- Attributes:
- target (rule_name): The name of the rule used to represent the NFA's
- ending state after a transition.
- arcs (Dict[Optional[str], NFAState]): A mapping representing transitions
- between the current NFA state and another NFA state via following
- a label.
- """
-
- def __init__(self, rule_name):
- self.rule_name = rule_name
- self.arcs = []
-
- def add_arc(self, target, label=None):
- """Add a new arc to connect the state to a target state within the NFA
-
- The method adds a new arc to the list of arcs available as transitions
- from the present state. An optional label indicates a named transition
- that consumes an input while the absence of a label represents an epsilon
- transition.
-
- Attributes:
- target (NFAState): The end of the transition that the arc represents.
- label (Optional[str]): The label that must be consumed for making
- the transition. If the label is not provided the transition is assumed
- to be an epsilon-transition.
- """
- assert label is None or isinstance(label, str)
- assert isinstance(target, NFAState)
- self.arcs.append(NFAArc(target, label))
-
- def __repr__(self):
- return "<%s: from %s>" % (self.__class__.__name__, self.rule_name)
-
-
-class DFA:
-    """A deterministic finite automaton
-
-    A deterministic finite automaton is a form of finite state machine
- that obeys the following rules:
-
- * Each of the transitions is uniquely determined by
- the source state and input symbol
- * Reading an input symbol is required for each state
- transition (no epsilon transitions).
-
- The finite-state machine will accept or reject a string of symbols
- and only produces a unique computation of the automaton for each input
- string. The DFA must have a unique starting state (represented as the first
- element in the list of states) but can have multiple final states.
-
- Attributes:
- name (str): The name of the rule the DFA is representing.
- states (List[DFAState]): A collection of DFA states.
- """
-
- def __init__(self, name, states):
- self.name = name
- self.states = states
-
- @classmethod
- def from_nfa(cls, nfa):
-        """Constructs a DFA from an NFA using the Rabin–Scott construction algorithm.
-
- To simulate the operation of a DFA on a given input string, it's
- necessary to keep track of a single state at any time, or more precisely,
- the state that the automaton will reach after seeing a prefix of the
- input. In contrast, to simulate an NFA, it's necessary to keep track of
- a set of states: all of the states that the automaton could reach after
- seeing the same prefix of the input, according to the nondeterministic
- choices made by the automaton. There are two possible sources of
- non-determinism:
-
- 1) Multiple (one or more) transitions with the same label
-
- 'A' +-------+
- +----------->+ State +----------->+
- | | 2 |
- +-------+ +-------+
- | State |
- | 1 | +-------+
- +-------+ | State |
- +----------->+ 3 +----------->+
- 'A' +-------+
-
- 2) Epsilon transitions (transitions that can be taken without consuming any input)
-
- +-------+ +-------+
- | State | ε | State |
- | 1 +----------->+ 2 +----------->+
- +-------+ +-------+
-
-        Looking at the first case above, we can't determine which transition should be
-        followed when given an input A: we have to choose whether or not to follow the
-        transition. In the second case the problem is that we can choose both to follow
-        the transition or not to follow it. To solve this problem we imagine that we
-        follow all possibilities at the same time and construct new states from the
-        set of all possible reachable states. For every case in the previous example:
-
-
-        1) For multiple transitions with the same label we collapse all of the
- final states under the same one
-
- +-------+ +-------+
- | State | 'A' | State |
- | 1 +----------->+ 2-3 +----------->+
- +-------+ +-------+
-
- 2) For epsilon transitions we collapse all epsilon-reachable states
- into the same one
-
- +-------+
- | State |
- | 1-2 +----------->
- +-------+
-
- Because the DFA states consist of sets of NFA states, an n-state NFA
- may be converted to a DFA with at most 2**n states. Notice that the
- constructed DFA is not minimal and can be simplified or reduced
- afterwards.
-
- Parameters:
- name (NFA): The NFA to transform to DFA.
- """
- assert isinstance(nfa, NFA)
-
- def add_closure(nfa_state, base_nfa_set):
- """Calculate the epsilon-closure of a given state
-
- Add to the *base_nfa_set* all the states that are
- reachable from *nfa_state* via epsilon-transitions.
- """
- assert isinstance(nfa_state, NFAState)
- if nfa_state in base_nfa_set:
- return
- base_nfa_set.add(nfa_state)
- for nfa_arc in nfa_state.arcs:
- if nfa_arc.label is None:
- add_closure(nfa_arc.target, base_nfa_set)
-
-        # Calculate the epsilon-closure of the starting state
- base_nfa_set = set()
- add_closure(nfa.start, base_nfa_set)
-
- # Start by visiting the NFA starting state (there is only one).
- states = [DFAState(nfa.name, base_nfa_set, nfa.end)]
-
- for state in states: # NB states grow while we're iterating
-
- # Find transitions from the current state to other reachable states
-            # and store them in a mapping that correlates the label to all the
-            # possible reachable states that can be obtained by consuming a
-            # token equal to the label. Each set of all the states that can
-            # be reached after following a label will be a DFA state.
- arcs = {}
- for nfa_state in state.nfa_set:
- for nfa_arc in nfa_state.arcs:
- if nfa_arc.label is not None:
- nfa_set = arcs.setdefault(nfa_arc.label, set())
- # All states that can be reached by epsilon-transitions
- # are also included in the set of reachable states.
- add_closure(nfa_arc.target, nfa_set)
-
-            # Now create new DFAs by visiting all possible transitions between
- # the current DFA state and the new power-set states (each nfa_set)
- # via the different labels. As the nodes are appended to *states* this
- # is performing a breadth-first search traversal over the power-set of
- # the states of the original NFA.
- for label, nfa_set in sorted(arcs.items()):
- for exisisting_state in states:
- if exisisting_state.nfa_set == nfa_set:
- # The DFA state already exists for this rule.
- next_state = exisisting_state
- break
- else:
- next_state = DFAState(nfa.name, nfa_set, nfa.end)
- states.append(next_state)
-
- # Add a transition between the current DFA state and the new
- # DFA state (the power-set state) via the current label.
- state.add_arc(next_state, label)
-
- return cls(nfa.name, states)
-
- def __iter__(self):
- return iter(self.states)
-
- def simplify(self):
- """Attempt to reduce the number of states of the DFA
-
- Transform the DFA into an equivalent DFA that has fewer states. Two
- classes of states can be removed or merged from the original DFA without
- affecting the language it accepts to minimize it:
-
- * Unreachable states can not be reached from the initial
- state of the DFA, for any input string.
- * Nondistinguishable states are those that cannot be distinguished
- from one another for any input string.
-
- This algorithm does not achieve the optimal fully-reduced solution, but it
- works well enough for the particularities of the Python grammar. The
- algorithm repeatedly looks for two states that have the same set of
- arcs (same labels pointing to the same nodes) and unifies them, until
- things stop changing.
- """
- changes = True
- while changes:
- changes = False
- for i, state_i in enumerate(self.states):
- for j in range(i + 1, len(self.states)):
- state_j = self.states[j]
- if state_i == state_j:
- del self.states[j]
- for state in self.states:
- state.unifystate(state_j, state_i)
- changes = True
- break
-
- def dump(self, writer=print):
- """Dump a graphical representation of the DFA"""
- for i, state in enumerate(self.states):
- writer(" State", i, state.is_final and "(final)" or "")
- for label, next in sorted(state.arcs.items()):
- writer(" %s -> %d" % (label, self.states.index(next)))
-
-
-class DFAState(object):
- """A state of a DFA
-
- Attributes:
- rule_name (rule_name): The name of the DFA rule containing the represented state.
- nfa_set (Set[NFAState]): The set of NFA states used to create this state.
- final (bool): True if the state represents an accepting state of the DFA
- containing this state.
- arcs (Dict[label, DFAState]): A mapping representing transitions between
- the current DFA state and another DFA state via following a label.
- """
-
- def __init__(self, rule_name, nfa_set, final):
- assert isinstance(nfa_set, set)
- assert isinstance(next(iter(nfa_set)), NFAState)
- assert isinstance(final, NFAState)
- self.rule_name = rule_name
- self.nfa_set = nfa_set
- self.arcs = {} # map from terminals/nonterminals to DFAState
- self.is_final = final in nfa_set
-
- def add_arc(self, target, label):
- """Add a new arc to the current state.
-
- Parameters:
- target (DFAState): The DFA state at the end of the arc.
-            label (str): The label representing the token that must be consumed
- to perform this transition.
- """
- assert isinstance(label, str)
- assert label not in self.arcs
- assert isinstance(target, DFAState)
- self.arcs[label] = target
-
- def unifystate(self, old, new):
- """Replace all arcs from the current node to *old* with *new*.
-
- Parameters:
- old (DFAState): The DFA state to remove from all existing arcs.
- new (DFAState): The DFA state to replace in all existing arcs.
- """
- for label, next_ in self.arcs.items():
- if next_ is old:
- self.arcs[label] = new
-
- def __eq__(self, other):
- # The nfa_set does not matter for equality
- assert isinstance(other, DFAState)
- if self.is_final != other.is_final:
- return False
- # We cannot just return self.arcs == other.arcs because that
- # would invoke this method recursively if there are any cycles.
- if len(self.arcs) != len(other.arcs):
- return False
- for label, next_ in self.arcs.items():
- if next_ is not other.arcs.get(label):
- return False
- return True
-
- __hash__ = None # For Py3 compatibility.
-
- def __repr__(self):
- return "<%s: %s is_final=%s>" % (
- self.__class__.__name__,
- self.rule_name,
- self.is_final,
- )
diff --git a/Parser/pgen/grammar.py b/Parser/pgen/grammar.py
deleted file mode 100644
index ce40e16..0000000
--- a/Parser/pgen/grammar.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import collections
-
-
-class Grammar:
- """Pgen parsing tables class.
-
- The instance variables are as follows:
-
- symbol2number -- a dict mapping symbol names to numbers. Symbol
- numbers are always 256 or higher, to distinguish
- them from token numbers, which are between 0 and
- 255 (inclusive).
-
- number2symbol -- a dict mapping numbers to symbol names;
- these two are each other's inverse.
-
- states -- a list of DFAs, where each DFA is a list of
- states, each state is a list of arcs, and each
- arc is a (i, j) pair where i is a label and j is
- a state number. The DFA number is the index into
- this list. (This name is slightly confusing.)
- Final states are represented by a special arc of
- the form (0, j) where j is its own state number.
-
- dfas -- a dict mapping symbol numbers to (DFA, first)
- pairs, where DFA is an item from the states list
- above, and first is a set of tokens that can
- begin this grammar rule.
-
- labels -- a list of (x, y) pairs where x is either a token
- number or a symbol number, and y is either None
- or a string; the strings are keywords. The label
- number is the index in this list; label numbers
- are used to mark state transitions (arcs) in the
- DFAs.
-
- start -- the number of the grammar's start symbol.
-
- keywords -- a dict mapping keyword strings to arc labels.
-
- tokens -- a dict mapping token numbers to arc labels.
-
- """
-
- def __init__(self):
- self.symbol2number = collections.OrderedDict()
- self.number2symbol = collections.OrderedDict()
- self.states = []
- self.dfas = collections.OrderedDict()
- self.labels = [(0, "EMPTY")]
- self.keywords = collections.OrderedDict()
- self.tokens = collections.OrderedDict()
- self.symbol2label = collections.OrderedDict()
- self.start = 256
-
- def produce_graminit_h(self, writer):
- writer("/* Generated by Parser/pgen */\n\n")
- for number, symbol in self.number2symbol.items():
- writer("#define {} {}\n".format(symbol, number))
-
- def produce_graminit_c(self, writer):
- writer("/* Generated by Parser/pgen */\n\n")
-
- writer('#include "exports.h"\n')
- writer('#include "grammar.h"\n')
- writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar;\n")
-
- self.print_dfas(writer)
- self.print_labels(writer)
-
- writer("Py_EXPORTED_SYMBOL grammar _PyParser_Grammar = {\n")
- writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
- writer(" dfas,\n")
- writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
- writer(" {start_number}\n".format(start_number=self.start))
- writer("};\n")
-
- def print_labels(self, writer):
- writer(
- "static const label labels[{n_labels}] = {{\n".format(
- n_labels=len(self.labels)
- )
- )
- for label, name in self.labels:
- label_name = '"{}"'.format(name) if name is not None else 0
- writer(
- " {{{label}, {label_name}}},\n".format(
- label=label, label_name=label_name
- )
- )
- writer("};\n")
-
- def print_dfas(self, writer):
- self.print_states(writer)
- writer("static const dfa dfas[{}] = {{\n".format(len(self.dfas)))
- for dfaindex, dfa_elem in enumerate(self.dfas.items()):
- symbol, (dfa, first_sets) = dfa_elem
- writer(
- ' {{{dfa_symbol}, "{symbol_name}", '.format(
- dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
- )
- + "{n_states}, states_{dfa_index},\n".format(
- n_states=len(dfa), dfa_index=dfaindex
- )
- + ' "'
- )
-
- bitset = bytearray((len(self.labels) >> 3) + 1)
- for token in first_sets:
- bitset[token >> 3] |= 1 << (token & 7)
- for byte in bitset:
- writer("\\%03o" % (byte & 0xFF))
- writer('"},\n')
- writer("};\n")
-
- def print_states(self, write):
- for dfaindex, dfa in enumerate(self.states):
- self.print_arcs(write, dfaindex, dfa)
- write(
- "static state states_{dfa_index}[{n_states}] = {{\n".format(
- dfa_index=dfaindex, n_states=len(dfa)
- )
- )
- for stateindex, state in enumerate(dfa):
- narcs = len(state)
- write(
- " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
- n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
- )
- )
- write("};\n")
-
- def print_arcs(self, write, dfaindex, states):
- for stateindex, state in enumerate(states):
- narcs = len(state)
- write(
- "static const arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
- dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
- )
- )
- for a, b in state:
- write(
- " {{{from_label}, {to_state}}},\n".format(
- from_label=a, to_state=b
- )
- )
- write("};\n")
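
The first-set bitmap that print_dfas in grammar.py above emits packs one bit per
label into a C string of octal escapes. A short sketch of that packing, with an
invented label count and first set:

    # Pack a first set (a set of label numbers) into a little bitmap,
    # then render it the way print_dfas writes it into graminit.c.
    n_labels = 20                      # invented for the example
    first_set = {1, 9, 14}             # labels that may start the rule

    bitset = bytearray((n_labels >> 3) + 1)
    for token in first_set:
        bitset[token >> 3] |= 1 << (token & 7)

    print("".join("\\%03o" % (byte & 0xFF) for byte in bitset))   # \002\102\000
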
diff --git a/Parser/pgen/keywordgen.py b/Parser/pgen/keywordgen.py
deleted file mode 100644
index f0234a8..0000000
--- a/Parser/pgen/keywordgen.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""Generate Lib/keyword.py from the Grammar and Tokens files using pgen"""
-
-import argparse
-
-from .pgen import ParserGenerator
-
-TEMPLATE = r'''
-"""Keywords (from "Grammar/Grammar")
-
-This file is automatically generated; please don't muck it up!
-
-To update the symbols in this file, 'cd' to the top directory of
-the python source tree and run:
-
- python3 -m Parser.pgen.keywordgen Grammar/Grammar \
- Grammar/Tokens \
- Lib/keyword.py
-
-Alternatively, you can run 'make regen-keyword'.
-"""
-
-__all__ = ["iskeyword", "kwlist"]
-
-kwlist = [
- {keywords}
-]
-
-iskeyword = frozenset(kwlist).__contains__
-'''.lstrip()
-
-EXTRA_KEYWORDS = ["async", "await"]
-
-
-def main():
- parser = argparse.ArgumentParser(
- description="Generate the Lib/keywords.py " "file from the grammar."
- )
- parser.add_argument(
- "grammar", type=str, help="The file with the grammar definition in EBNF format"
- )
- parser.add_argument("tokens", type=str, help="The file with the token definitions")
- parser.add_argument(
- "keyword_file",
- type=argparse.FileType("w"),
- help="The path to write the keyword definitions",
- )
- args = parser.parse_args()
- p = ParserGenerator(args.grammar, args.tokens)
- grammar = p.make_grammar()
-
- with args.keyword_file as thefile:
- all_keywords = sorted(list(grammar.keywords) + EXTRA_KEYWORDS)
-
- keywords = ",\n ".join(map(repr, all_keywords))
- thefile.write(TEMPLATE.format(keywords=keywords))
-
-
-if __name__ == "__main__":
- main()
diff --git a/Parser/pgen/metaparser.py b/Parser/pgen/metaparser.py
deleted file mode 100644
index 074a083..0000000
--- a/Parser/pgen/metaparser.py
+++ /dev/null
@@ -1,152 +0,0 @@
-"""Parser for the Python metagrammar"""
-
-import io
-import tokenize # from stdlib
-
-from .automata import NFA, NFAState
-
-
-class GrammarParser:
- """Parser for Python grammar files."""
-
- _translation_table = {
- tokenize.NAME: "NAME",
- tokenize.STRING: "STRING",
- tokenize.NEWLINE: "NEWLINE",
- tokenize.NL: "NL",
- tokenize.OP: "OP",
- tokenize.ENDMARKER: "ENDMARKER",
- tokenize.COMMENT: "COMMENT",
- }
-
- def __init__(self, grammar):
- self.grammar = grammar
- grammar_adaptor = io.StringIO(grammar)
- self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
- self._gettoken() # Initialize lookahead
- self._current_rule_name = None
-
- def parse(self):
- """Turn the grammar into a collection of NFAs"""
- # grammar: (NEWLINE | rule)* ENDMARKER
- while self.type != tokenize.ENDMARKER:
- while self.type == tokenize.NEWLINE:
- self._gettoken()
- # rule: NAME ':' rhs NEWLINE
- self._current_rule_name = self._expect(tokenize.NAME)
- self._expect(tokenize.OP, ":")
- a, z = self._parse_rhs()
- self._expect(tokenize.NEWLINE)
-
- yield NFA(a, z)
-
- def _parse_rhs(self):
- # rhs: items ('|' items)*
- a, z = self._parse_items()
- if self.value != "|":
- return a, z
- else:
- aa = NFAState(self._current_rule_name)
- zz = NFAState(self._current_rule_name)
- while True:
-                # Allow a direct transition to the previous state and connect the end of the
-                # previous state to the end of the current one, effectively allowing the
-                # current state to be skipped.
- aa.add_arc(a)
- z.add_arc(zz)
- if self.value != "|":
- break
-
- self._gettoken()
- a, z = self._parse_items()
- return aa, zz
-
- def _parse_items(self):
- # items: item+
- a, b = self._parse_item()
- while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
- c, d = self._parse_item()
- # Allow a transition between the end of the previous state
- # and the beginning of the new one, connecting all the items
- # together. In this way we can only reach the end if we visit
- # all the items.
- b.add_arc(c)
- b = d
- return a, b
-
- def _parse_item(self):
- # item: '[' rhs ']' | atom ['+' | '*']
- if self.value == "[":
- self._gettoken()
- a, z = self._parse_rhs()
- self._expect(tokenize.OP, "]")
- # Make a transition from the beginning to the end so it is possible to
-            # advance for free to the next state of this item without consuming
- # anything from the rhs.
- a.add_arc(z)
- return a, z
- else:
- a, z = self._parse_atom()
- value = self.value
- if value not in ("+", "*"):
- return a, z
- self._gettoken()
- z.add_arc(a)
- if value == "+":
- # Create a cycle to the beginning so we go back to the old state in this
- # item and repeat.
- return a, z
- else:
- # The end state is the same as the beginning, so we can cycle arbitrarily
- # and end in the beginning if necessary.
- return a, a
-
- def _parse_atom(self):
- # atom: '(' rhs ')' | NAME | STRING
- if self.value == "(":
- self._gettoken()
- a, z = self._parse_rhs()
- self._expect(tokenize.OP, ")")
- return a, z
- elif self.type in (tokenize.NAME, tokenize.STRING):
- a = NFAState(self._current_rule_name)
- z = NFAState(self._current_rule_name)
- # We can transit to the next state only if we consume the value.
- a.add_arc(z, self.value)
- self._gettoken()
- return a, z
- else:
- self._raise_error(
- "expected (...) or NAME or STRING, got {} ({})",
- self._translation_table.get(self.type, self.type),
- self.value,
- )
-
- def _expect(self, type_, value=None):
- if self.type != type_:
- self._raise_error(
- "expected {}, got {} ({})",
- self._translation_table.get(type_, type_),
- self._translation_table.get(self.type, self.type),
- self.value,
- )
- if value is not None and self.value != value:
- self._raise_error("expected {}, got {}", value, self.value)
- value = self.value
- self._gettoken()
- return value
-
- def _gettoken(self):
- tup = next(self.generator)
- while tup[0] in (tokenize.COMMENT, tokenize.NL):
- tup = next(self.generator)
- self.type, self.value, self.begin, self.end, self.line = tup
-
- def _raise_error(self, msg, *args):
- if args:
- try:
- msg = msg.format(*args)
- except Exception:
- msg = " ".join([msg] + list(map(str, args)))
- line = self.grammar.splitlines()[self.begin[0] - 1]
- raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))
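
GrammarParser above drives the standard tokenize module over the grammar text. The
sketch below (with a one-line grammar invented for the example) shows the raw token
stream the metaparser consumes before it starts building NFA states:

    import io
    import tokenize

    grammar = "term: factor ('*' factor)*\n"
    for tok in tokenize.generate_tokens(io.StringIO(grammar).readline):
        if tok.type in (tokenize.NAME, tokenize.STRING, tokenize.OP):
            print(tokenize.tok_name[tok.type], tok.string)
    # NAME term, OP :, NAME factor, OP (, STRING '*', NAME factor, OP ), OP *
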
diff --git a/Parser/pgen/pgen.py b/Parser/pgen/pgen.py
deleted file mode 100644
index 2f444eb..0000000
--- a/Parser/pgen/pgen.py
+++ /dev/null
@@ -1,305 +0,0 @@
-"""Python parser generator
-
-
-This parser generator transforms a Python grammar file into parsing tables
-that can be consumed by Python's LL(1) parser written in C.
-
-Concepts
---------
-
-* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
- top-down parser for a subset of context-free languages. It parses the input
- from Left to right, performing Leftmost derivation of the sentence, and can
- only use 1 token of lookahead when parsing a sentence.
-
-* A parsing table is a collection of data that a generic implementation of the
- LL(1) parser consumes to know how to parse a given context-free grammar. In
- this case the collection of data involves Deterministic Finite Automatons,
- calculated first sets, keywords and transition labels.
-
-* A grammar is defined by production rules (or just 'productions') that specify
- which symbols may replace which other symbols; these rules may be used to
- generate strings, or to parse them. Each such rule has a head, or left-hand
- side, which consists of the string that may be replaced, and a body, or
- right-hand side, which consists of a string that may replace it. In the
- Python grammar, rules are written in the form
-
- rule_name: rule_description;
-
- meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
- grammar is a grammar in which the left-hand side of each production rule
- consists of only a single nonterminal symbol. Context-free grammars can
-  always be recognized by a Non-Deterministic Automaton.
-
-* Terminal symbols are literal symbols which may appear in the outputs of the
- production rules of the grammar and which cannot be changed using the rules
- of the grammar. Applying the rules recursively to a source string of symbols
- will usually terminate in a final output string consisting only of terminal
- symbols.
-
-* Nonterminal symbols are those symbols which can be replaced. The grammar
-  includes a start symbol, a designated member of the set of nonterminals from
- which all the strings in the language may be derived by successive
- applications of the production rules.
-
-* The language defined by the grammar is defined as the set of terminal strings
- that can be derived using the production rules.
-
-* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
- that can appear in the first position of any string derived from the rule.
- This is useful for LL(1) parsers as the parser is only allowed to look at the
- next token in the input to know which rule needs to parse. For example, given
- this grammar:
-
- start: '(' A | B ')'
- A: 'a' '<'
- B: 'b' '<'
-
- and the input '(b<)' the parser can only look at 'b' to know if it needs
-  to parse A or B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'}, it knows
-  that it needs to continue parsing rule B because only that rule can start
- with 'b'.
-
-Description
------------
-
-The input for the parser generator is a grammar in extended BNF form (using *
-for repetition, + for at-least-once repetition, [] for optional parts, | for
-alternatives and () for grouping).
-
-Each rule in the grammar file is considered as a regular expression in its
-own right. It is turned into a Non-deterministic Finite Automaton (NFA),
-which is then turned into a Deterministic Finite Automaton (DFA), which is
-then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
-or similar compiler books (this technique is more often used for lexical
-analyzers).
-
-The DFAs are used by the parser as parsing tables in a special way that's
-probably unique. Before they are usable, the FIRST sets of all non-terminals
-are computed so the LL(1) parser consuming the parsing tables can distinguish
-between different transitions.
-Reference
----------
-
-[Aho&Ullman 77]
- Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
- (first edition)
-"""
-
-from ast import literal_eval
-import collections
-
-from . import grammar, token
-from .automata import DFA
-from .metaparser import GrammarParser
-
-import enum
-
-
-class LabelType(enum.Enum):
- NONTERMINAL = 0
- NAMED_TOKEN = 1
- KEYWORD = 2
- OPERATOR = 3
- NONE = 4
-
-
-class Label(str):
- def __init__(self, value):
- self.type = self._get_type()
-
- def _get_type(self):
- if self[0].isalpha():
- if self.upper() == self:
- # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
- return LabelType.NAMED_TOKEN
- else:
-                # If it is not uppercase it must be a nonterminal.
- return LabelType.NONTERMINAL
- else:
- # Keywords and operators are wrapped in quotes
- assert self[0] == self[-1] in ('"', "'"), self
- value = literal_eval(self)
- if value[0].isalpha():
- return LabelType.KEYWORD
- else:
- return LabelType.OPERATOR
-
- def __repr__(self):
- return "{}({})".format(self.type, super().__repr__())
-
-
-class ParserGenerator(object):
- def __init__(self, grammar_file, token_file, verbose=False):
- with open(grammar_file) as f:
- self.grammar = f.read()
- with open(token_file) as tok_file:
- token_lines = tok_file.readlines()
- self.tokens = dict(token.generate_tokens(token_lines))
- self.opmap = dict(token.generate_opmap(token_lines))
- # Manually add <> so it does not collide with !=
- self.opmap["<>"] = "NOTEQUAL"
- self.verbose = verbose
- self.filename = grammar_file
- self.dfas, self.startsymbol = self.create_dfas()
- self.first = {} # map from symbol name to set of tokens
- self.calculate_first_sets()
-
- def create_dfas(self):
- rule_to_dfas = collections.OrderedDict()
- start_nonterminal = None
- for nfa in GrammarParser(self.grammar).parse():
- if self.verbose:
- print("Dump of NFA for", nfa.name)
- nfa.dump()
- dfa = DFA.from_nfa(nfa)
- if self.verbose:
- print("Dump of DFA for", dfa.name)
- dfa.dump()
- dfa.simplify()
- rule_to_dfas[dfa.name] = dfa
-
- if start_nonterminal is None:
- start_nonterminal = dfa.name
-
- return rule_to_dfas, start_nonterminal
-
- def make_grammar(self):
- c = grammar.Grammar()
- c.all_labels = set()
- names = list(self.dfas.keys())
- names.remove(self.startsymbol)
- names.insert(0, self.startsymbol)
- for name in names:
- i = 256 + len(c.symbol2number)
- c.symbol2number[Label(name)] = i
- c.number2symbol[i] = Label(name)
- c.all_labels.add(name)
- for name in names:
- self.make_label(c, name)
- dfa = self.dfas[name]
- states = []
- for state in dfa:
- arcs = []
- for label, next in sorted(state.arcs.items()):
- c.all_labels.add(label)
- arcs.append((self.make_label(c, label), dfa.states.index(next)))
- if state.is_final:
- arcs.append((0, dfa.states.index(state)))
- states.append(arcs)
- c.states.append(states)
- c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
- c.start = c.symbol2number[self.startsymbol]
-
- if self.verbose:
- print("")
- print("Grammar summary")
- print("===============")
-
- print("- {n_labels} labels".format(n_labels=len(c.labels)))
- print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
- print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
- print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
- print(
- "- Start symbol: {start_symbol}".format(
- start_symbol=c.number2symbol[c.start]
- )
- )
- return c
-
- def make_first_sets(self, c, name):
- rawfirst = self.first[name]
- first = set()
- for label in sorted(rawfirst):
- ilabel = self.make_label(c, label)
- ##assert ilabel not in first # XXX failed on <> ... !=
- first.add(ilabel)
- return first
-
- def make_label(self, c, label):
- label = Label(label)
- ilabel = len(c.labels)
-
- if label.type == LabelType.NONTERMINAL:
- if label in c.symbol2label:
- return c.symbol2label[label]
- else:
- c.labels.append((c.symbol2number[label], None))
- c.symbol2label[label] = ilabel
- return ilabel
- elif label.type == LabelType.NAMED_TOKEN:
- # A named token (NAME, NUMBER, STRING)
- itoken = self.tokens.get(label, None)
- assert isinstance(itoken, int), label
- assert itoken in self.tokens.values(), label
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
- elif label.type == LabelType.KEYWORD:
- # A keyword
- value = literal_eval(label)
- if value in c.keywords:
- return c.keywords[value]
- else:
- c.labels.append((self.tokens["NAME"], value))
- c.keywords[value] = ilabel
- return ilabel
- elif label.type == LabelType.OPERATOR:
- # An operator (any non-numeric token)
- value = literal_eval(label)
- tok_name = self.opmap[value] # Fails if unknown token
- itoken = self.tokens[tok_name]
- if itoken in c.tokens:
- return c.tokens[itoken]
- else:
- c.labels.append((itoken, None))
- c.tokens[itoken] = ilabel
- return ilabel
- else:
- raise ValueError("Cannot categorize label {}".format(label))
-
- def calculate_first_sets(self):
- names = list(self.dfas.keys())
- for name in names:
- if name not in self.first:
- self.calculate_first_sets_for_rule(name)
-
- if self.verbose:
- print("First set for {dfa_name}".format(dfa_name=name))
- for item in self.first[name]:
- print(" - {terminal}".format(terminal=item))
-
- def calculate_first_sets_for_rule(self, name):
- dfa = self.dfas[name]
- self.first[name] = None # dummy to detect left recursion
- state = dfa.states[0]
- totalset = set()
- overlapcheck = {}
- for label, next in state.arcs.items():
- if label in self.dfas:
- if label in self.first:
- fset = self.first[label]
- if fset is None:
- raise ValueError("recursion for rule %r" % name)
- else:
- self.calculate_first_sets_for_rule(label)
- fset = self.first[label]
- totalset.update(fset)
- overlapcheck[label] = fset
- else:
- totalset.add(label)
- overlapcheck[label] = {label}
- inverse = {}
- for label, itsfirst in overlapcheck.items():
- for symbol in itsfirst:
- if symbol in inverse:
- raise ValueError(
- "rule %s is ambiguous; %s is in the"
- " first sets of %s as well as %s"
- % (name, symbol, label, inverse[symbol])
- )
- inverse[symbol] = label
- self.first[name] = totalset
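
The FIRST-set computation in pgen.py above can be illustrated on the toy grammar
from its docstring (A starts with 'a', B starts with 'b'). The rule representation
below (a dict of alternatives, each a list of symbols) is invented for the example
and ignores empty productions, unlike the real calculate_first_sets.

    # Terminals are quoted strings; anything else names another rule.
    rules = {
        "start": [["A"], ["B"]],
        "A": [["'a'", "'<'"]],
        "B": [["'b'", "'<'"]],
    }

    def first(rule, seen=()):
        if rule in seen:
            raise ValueError("left recursion on %r" % rule)
        result = set()
        for alternative in rules[rule]:
            symbol = alternative[0]
            if symbol in rules:          # nonterminal: recurse into its rule
                result |= first(symbol, seen + (rule,))
            else:                        # terminal: it can start the rule
                result.add(symbol)
        return result

    print(sorted(first("start")))        # ["'a'", "'b'"]
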
diff --git a/Parser/pgen/token.py b/Parser/pgen/token.py
deleted file mode 100644
index 2cff62c..0000000
--- a/Parser/pgen/token.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import itertools
-
-
-def generate_tokens(tokens):
- numbers = itertools.count(0)
- for line in tokens:
- line = line.strip()
-
- if not line or line.startswith("#"):
- continue
-
- name = line.split()[0]
- yield (name, next(numbers))
-
- yield ("N_TOKENS", next(numbers))
- yield ("NT_OFFSET", 256)
-
-
-def generate_opmap(tokens):
- for line in tokens:
- line = line.strip()
-
- if not line or line.startswith("#"):
- continue
-
- pieces = line.split()
-
- if len(pieces) != 2:
- continue
-
- name, op = pieces
- yield (op.strip("'"), name)
-
- # Yield independently <>. This is needed so it does not collide
- # with the token generation in "generate_tokens" because if this
- # symbol is included in Grammar/Tokens, it will collide with !=
- # as it has the same name (NOTEQUAL).
- yield ("<>", "NOTEQUAL")
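
The deleted generate_opmap pairs each operator string from the token file with its token name, and yields "<>" separately so it does not collide with "!=" (both map to NOTEQUAL). A self-contained sketch of the same yield pattern; the sample lines below are made up, not the real Grammar/Tokens:

    def opmap_pairs(lines):
        for line in lines:
            pieces = line.split()
            if len(pieces) != 2:
                continue
            name, op = pieces
            yield (op.strip("'"), name)
        # "<>" is yielded on its own: it has no entry in the token file but
        # must still map to NOTEQUAL, just like "!=".
        yield ("<>", "NOTEQUAL")

    sample = ["ENDMARKER", "LPAR '('", "NOTEQUAL '!='"]
    assert dict(opmap_pairs(sample)) == {"(": "LPAR", "!=": "NOTEQUAL", "<>": "NOTEQUAL"}
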
diff --git a/Parser/pgenmain.c b/Parser/pgenmain.c
new file mode 100644
index 0000000..0b47295
--- /dev/null
+++ b/Parser/pgenmain.c
@@ -0,0 +1,174 @@
+
+/* Parser generator main program */
+
+/* This expects a filename containing the grammar as argv[1] (UNIX)
+ or asks the console for such a file name (THINK C).
+ It writes its output on two files in the current directory:
+ - "graminit.c" gets the grammar as a bunch of initialized data
+ - "graminit.h" gets the grammar's non-terminals as #defines.
+ Error messages and status info during the generation process are
+ written to stdout, or sometimes to stderr. */
+
+/* XXX TO DO:
+ - check for duplicate definitions of names (instead of fatal err)
+*/
+
+#include "Python.h"
+#include "pgenheaders.h"
+#include "grammar.h"
+#include "node.h"
+#include "parsetok.h"
+#include "pgen.h"
+
+int Py_DebugFlag;
+int Py_VerboseFlag;
+int Py_IgnoreEnvironmentFlag;
+
+/* Forward */
+grammar *getgrammar(char *filename);
+
+void
+Py_Exit(int sts)
+{
+ exit(sts);
+}
+
+int
+main(int argc, char **argv)
+{
+ grammar *g;
+ FILE *fp;
+ char *filename, *graminit_h, *graminit_c;
+
+ if (argc != 4) {
+ fprintf(stderr,
+ "usage: %s grammar graminit.h graminit.c\n", argv[0]);
+ Py_Exit(2);
+ }
+ filename = argv[1];
+ graminit_h = argv[2];
+ graminit_c = argv[3];
+ g = getgrammar(filename);
+ fp = fopen(graminit_c, "w");
+ if (fp == NULL) {
+ perror(graminit_c);
+ Py_Exit(1);
+ }
+ if (Py_DebugFlag)
+ printf("Writing %s ...\n", graminit_c);
+ printgrammar(g, fp);
+ fclose(fp);
+ fp = fopen(graminit_h, "w");
+ if (fp == NULL) {
+ perror(graminit_h);
+ Py_Exit(1);
+ }
+ if (Py_DebugFlag)
+ printf("Writing %s ...\n", graminit_h);
+ printnonterminals(g, fp);
+ fclose(fp);
+ freegrammar(g);
+ Py_Exit(0);
+ return 0; /* Make gcc -Wall happy */
+}
+
+grammar *
+getgrammar(char *filename)
+{
+ FILE *fp;
+ node *n;
+ grammar *g0, *g;
+ perrdetail err;
+
+ fp = fopen(filename, "r");
+ if (fp == NULL) {
+ perror(filename);
+ Py_Exit(1);
+ }
+ g0 = meta_grammar();
+ n = PyParser_ParseFile(fp, filename, g0, g0->g_start,
+ (char *)NULL, (char *)NULL, &err);
+ fclose(fp);
+ if (n == NULL) {
+ fprintf(stderr, "Parsing error %d, line %d.\n",
+ err.error, err.lineno);
+ if (err.text != NULL) {
+ size_t i;
+ fprintf(stderr, "%s", err.text);
+ i = strlen(err.text);
+ if (i == 0 || err.text[i-1] != '\n')
+ fprintf(stderr, "\n");
+ for (i = 0; i < err.offset; i++) {
+ if (err.text[i] == '\t')
+ putc('\t', stderr);
+ else
+ putc(' ', stderr);
+ }
+ fprintf(stderr, "^\n");
+ PyObject_FREE(err.text);
+ }
+ Py_Exit(1);
+ }
+ g = pgen(n);
+ if (g == NULL) {
+ printf("Bad grammar.\n");
+ Py_Exit(1);
+ }
+ return g;
+}
+
+/* Can't happen in pgen */
+PyObject*
+PyErr_Occurred()
+{
+ return 0;
+}
+
+void
+Py_FatalError(const char *msg)
+{
+ fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg);
+ Py_Exit(1);
+}
+
+/* No-nonsense my_readline() for tokenizer.c */
+
+char *
+PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, char *prompt)
+{
+ size_t n = 1000;
+ char *p = (char *)PyMem_MALLOC(n);
+ char *q;
+ if (p == NULL)
+ return NULL;
+ fprintf(stderr, "%s", prompt);
+ q = fgets(p, n, sys_stdin);
+ if (q == NULL) {
+ *p = '\0';
+ return p;
+ }
+ n = strlen(p);
+ if (n > 0 && p[n-1] != '\n')
+ p[n-1] = '\n';
+ return (char *)PyMem_REALLOC(p, n+1);
+}
+
+/* No-nonsense fgets */
+char *
+Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj)
+{
+ return fgets(buf, n, stream);
+}
+
+
+#include <stdarg.h>
+
+void
+PySys_WriteStderr(const char *format, ...)
+{
+ va_list va;
+
+ va_start(va, format);
+ vfprintf(stderr, format, va);
+ va_end(va);
+}
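
pgenmain.c makes the generator a standalone tool: it parses the grammar file with the meta-grammar, runs pgen() over the resulting parse tree, and writes the tables out with printgrammar() and printnonterminals(). A hedged sketch of driving it from a build script; the ./pgen binary name and the file paths are assumptions for illustration, not taken from this patch:

    import subprocess

    def regenerate(pgen="./pgen",
                   grammar="Grammar/Grammar",
                   header="Include/graminit.h",
                   source="Python/graminit.c"):
        # Mirrors the usage string in main(): pgen grammar graminit.h graminit.c
        subprocess.run([pgen, grammar, header, source], check=True)
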
diff --git a/Parser/printgrammar.c b/Parser/printgrammar.c
new file mode 100644
index 0000000..01f552f
--- /dev/null
+++ b/Parser/printgrammar.c
@@ -0,0 +1,117 @@
+
+/* Print a bunch of C initializers that represent a grammar */
+
+#include "pgenheaders.h"
+#include "grammar.h"
+
+/* Forward */
+static void printarcs(int, dfa *, FILE *);
+static void printstates(grammar *, FILE *);
+static void printdfas(grammar *, FILE *);
+static void printlabels(grammar *, FILE *);
+
+void
+printgrammar(grammar *g, FILE *fp)
+{
+ fprintf(fp, "/* Generated by Parser/pgen */\n\n");
+ fprintf(fp, "#include \"pgenheaders.h\"\n");
+ fprintf(fp, "#include \"grammar.h\"\n");
+ fprintf(fp, "PyAPI_DATA(grammar) _PyParser_Grammar;\n");
+ printdfas(g, fp);
+ printlabels(g, fp);
+ fprintf(fp, "grammar _PyParser_Grammar = {\n");
+ fprintf(fp, " %d,\n", g->g_ndfas);
+ fprintf(fp, " dfas,\n");
+ fprintf(fp, " {%d, labels},\n", g->g_ll.ll_nlabels);
+ fprintf(fp, " %d\n", g->g_start);
+ fprintf(fp, "};\n");
+}
+
+void
+printnonterminals(grammar *g, FILE *fp)
+{
+ dfa *d;
+ int i;
+
+ fprintf(fp, "/* Generated by Parser/pgen */\n\n");
+
+ d = g->g_dfa;
+ for (i = g->g_ndfas; --i >= 0; d++)
+ fprintf(fp, "#define %s %d\n", d->d_name, d->d_type);
+}
+
+static void
+printarcs(int i, dfa *d, FILE *fp)
+{
+ arc *a;
+ state *s;
+ int j, k;
+
+ s = d->d_state;
+ for (j = 0; j < d->d_nstates; j++, s++) {
+ fprintf(fp, "static arc arcs_%d_%d[%d] = {\n",
+ i, j, s->s_narcs);
+ a = s->s_arc;
+ for (k = 0; k < s->s_narcs; k++, a++)
+ fprintf(fp, " {%d, %d},\n", a->a_lbl, a->a_arrow);
+ fprintf(fp, "};\n");
+ }
+}
+
+static void
+printstates(grammar *g, FILE *fp)
+{
+ state *s;
+ dfa *d;
+ int i, j;
+
+ d = g->g_dfa;
+ for (i = 0; i < g->g_ndfas; i++, d++) {
+ printarcs(i, d, fp);
+ fprintf(fp, "static state states_%d[%d] = {\n",
+ i, d->d_nstates);
+ s = d->d_state;
+ for (j = 0; j < d->d_nstates; j++, s++)
+ fprintf(fp, " {%d, arcs_%d_%d},\n",
+ s->s_narcs, i, j);
+ fprintf(fp, "};\n");
+ }
+}
+
+static void
+printdfas(grammar *g, FILE *fp)
+{
+ dfa *d;
+ int i, j;
+
+ printstates(g, fp);
+ fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas);
+ d = g->g_dfa;
+ for (i = 0; i < g->g_ndfas; i++, d++) {
+ fprintf(fp, " {%d, \"%s\", %d, %d, states_%d,\n",
+ d->d_type, d->d_name, d->d_initial, d->d_nstates, i);
+ fprintf(fp, " \"");
+ for (j = 0; j < NBYTES(g->g_ll.ll_nlabels); j++)
+ fprintf(fp, "\\%03o", d->d_first[j] & 0xff);
+ fprintf(fp, "\"},\n");
+ }
+ fprintf(fp, "};\n");
+}
+
+static void
+printlabels(grammar *g, FILE *fp)
+{
+ label *l;
+ int i;
+
+ fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels);
+ l = g->g_ll.ll_label;
+ for (i = g->g_ll.ll_nlabels; --i >= 0; l++) {
+ if (l->lb_str == NULL)
+ fprintf(fp, " {%d, 0},\n", l->lb_type);
+ else
+ fprintf(fp, " {%d, \"%s\"},\n",
+ l->lb_type, l->lb_str);
+ }
+ fprintf(fp, "};\n");
+}
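
printlabels() emits each (type, string) label pair as a C initializer, writing 0 for labels that carry no string. A small sketch that mirrors that formatting in Python, only to show the shape of the table that ends up in graminit.c; the sample labels are invented:

    def render_labels(labels):
        lines = ["static label labels[%d] = {" % len(labels)]
        for lb_type, lb_str in labels:
            if lb_str is None:
                lines.append("    {%d, 0}," % lb_type)
            else:
                lines.append('    {%d, "%s"},' % (lb_type, lb_str))
        lines.append("};")
        return "\n".join(lines)

    print(render_labels([(0, "EMPTY"), (1, None), (1, "if")]))
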
diff --git a/Parser/spark.py b/Parser/spark.py
new file mode 100644
index 0000000..b064d62
--- /dev/null
+++ b/Parser/spark.py
@@ -0,0 +1,839 @@
+# Copyright (c) 1998-2002 John Aycock
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+__version__ = 'SPARK-0.7 (pre-alpha-5)'
+
+import re
+import string
+
+def _namelist(instance):
+ namelist, namedict, classlist = [], {}, [instance.__class__]
+ for c in classlist:
+ for b in c.__bases__:
+ classlist.append(b)
+ for name in c.__dict__.keys():
+ if not namedict.has_key(name):
+ namelist.append(name)
+ namedict[name] = 1
+ return namelist
+
+class GenericScanner:
+ def __init__(self, flags=0):
+ pattern = self.reflect()
+ self.re = re.compile(pattern, re.VERBOSE|flags)
+
+ self.index2func = {}
+ for name, number in self.re.groupindex.items():
+ self.index2func[number-1] = getattr(self, 't_' + name)
+
+ def makeRE(self, name):
+ doc = getattr(self, name).__doc__
+ rv = '(?P<%s>%s)' % (name[2:], doc)
+ return rv
+
+ def reflect(self):
+ rv = []
+ for name in _namelist(self):
+ if name[:2] == 't_' and name != 't_default':
+ rv.append(self.makeRE(name))
+
+ rv.append(self.makeRE('t_default'))
+ return string.join(rv, '|')
+
+ def error(self, s, pos):
+ print "Lexical error at position %s" % pos
+ raise SystemExit
+
+ def tokenize(self, s):
+ pos = 0
+ n = len(s)
+ while pos < n:
+ m = self.re.match(s, pos)
+ if m is None:
+ self.error(s, pos)
+
+ groups = m.groups()
+ for i in range(len(groups)):
+ if groups[i] and self.index2func.has_key(i):
+ self.index2func[i](groups[i])
+ pos = m.end()
+
+ def t_default(self, s):
+ r'( . | \n )+'
+ print "Specification error: unmatched input"
+ raise SystemExit
+
+#
+# Extracted from GenericParser and made global so that [un]picking works.
+#
+class _State:
+ def __init__(self, stateno, items):
+ self.T, self.complete, self.items = [], [], items
+ self.stateno = stateno
+
+class GenericParser:
+ #
+ # An Earley parser, as per J. Earley, "An Efficient Context-Free
+ # Parsing Algorithm", CACM 13(2), pp. 94-102. Also J. C. Earley,
+ # "An Efficient Context-Free Parsing Algorithm", Ph.D. thesis,
+ # Carnegie-Mellon University, August 1968. New formulation of
+ # the parser according to J. Aycock, "Practical Earley Parsing
+ # and the SPARK Toolkit", Ph.D. thesis, University of Victoria,
+ # 2001, and J. Aycock and R. N. Horspool, "Practical Earley
+ # Parsing", unpublished paper, 2001.
+ #
+
+ def __init__(self, start):
+ self.rules = {}
+ self.rule2func = {}
+ self.rule2name = {}
+ self.collectRules()
+ self.augment(start)
+ self.ruleschanged = 1
+
+ _NULLABLE = '\e_'
+ _START = 'START'
+ _BOF = '|-'
+
+ #
+ # When pickling, take the time to generate the full state machine;
+ # some information is then extraneous, too. Unfortunately we
+ # can't save the rule2func map.
+ #
+ def __getstate__(self):
+ if self.ruleschanged:
+ #
+ # XXX - duplicated from parse()
+ #
+ self.computeNull()
+ self.newrules = {}
+ self.new2old = {}
+ self.makeNewRules()
+ self.ruleschanged = 0
+ self.edges, self.cores = {}, {}
+ self.states = { 0: self.makeState0() }
+ self.makeState(0, self._BOF)
+ #
+ # XXX - should find a better way to do this..
+ #
+ changes = 1
+ while changes:
+ changes = 0
+ for k, v in self.edges.items():
+ if v is None:
+ state, sym = k
+ if self.states.has_key(state):
+ self.goto(state, sym)
+ changes = 1
+ rv = self.__dict__.copy()
+ for s in self.states.values():
+ del s.items
+ del rv['rule2func']
+ del rv['nullable']
+ del rv['cores']
+ return rv
+
+ def __setstate__(self, D):
+ self.rules = {}
+ self.rule2func = {}
+ self.rule2name = {}
+ self.collectRules()
+ start = D['rules'][self._START][0][1][1] # Blech.
+ self.augment(start)
+ D['rule2func'] = self.rule2func
+ D['makeSet'] = self.makeSet_fast
+ self.__dict__ = D
+
+ #
+ # A hook for GenericASTBuilder and GenericASTMatcher. Mess
+ # thee not with this; nor shall thee toucheth the _preprocess
+ # argument to addRule.
+ #
+ def preprocess(self, rule, func): return rule, func
+
+ def addRule(self, doc, func, _preprocess=1):
+ fn = func
+ rules = string.split(doc)
+
+ index = []
+ for i in range(len(rules)):
+ if rules[i] == '::=':
+ index.append(i-1)
+ index.append(len(rules))
+
+ for i in range(len(index)-1):
+ lhs = rules[index[i]]
+ rhs = rules[index[i]+2:index[i+1]]
+ rule = (lhs, tuple(rhs))
+
+ if _preprocess:
+ rule, fn = self.preprocess(rule, func)
+
+ if self.rules.has_key(lhs):
+ self.rules[lhs].append(rule)
+ else:
+ self.rules[lhs] = [ rule ]
+ self.rule2func[rule] = fn
+ self.rule2name[rule] = func.__name__[2:]
+ self.ruleschanged = 1
+
+ def collectRules(self):
+ for name in _namelist(self):
+ if name[:2] == 'p_':
+ func = getattr(self, name)
+ doc = func.__doc__
+ self.addRule(doc, func)
+
+ def augment(self, start):
+ rule = '%s ::= %s %s' % (self._START, self._BOF, start)
+ self.addRule(rule, lambda args: args[1], 0)
+
+ def computeNull(self):
+ self.nullable = {}
+ tbd = []
+
+ for rulelist in self.rules.values():
+ lhs = rulelist[0][0]
+ self.nullable[lhs] = 0
+ for rule in rulelist:
+ rhs = rule[1]
+ if len(rhs) == 0:
+ self.nullable[lhs] = 1
+ continue
+ #
+ # We only need to consider rules which
+ # consist entirely of nonterminal symbols.
+ # This should be a savings on typical
+ # grammars.
+ #
+ for sym in rhs:
+ if not self.rules.has_key(sym):
+ break
+ else:
+ tbd.append(rule)
+ changes = 1
+ while changes:
+ changes = 0
+ for lhs, rhs in tbd:
+ if self.nullable[lhs]:
+ continue
+ for sym in rhs:
+ if not self.nullable[sym]:
+ break
+ else:
+ self.nullable[lhs] = 1
+ changes = 1
+
+ def makeState0(self):
+ s0 = _State(0, [])
+ for rule in self.newrules[self._START]:
+ s0.items.append((rule, 0))
+ return s0
+
+ def finalState(self, tokens):
+ #
+ # Yuck.
+ #
+ if len(self.newrules[self._START]) == 2 and len(tokens) == 0:
+ return 1
+ start = self.rules[self._START][0][1][1]
+ return self.goto(1, start)
+
+ def makeNewRules(self):
+ worklist = []
+ for rulelist in self.rules.values():
+ for rule in rulelist:
+ worklist.append((rule, 0, 1, rule))
+
+ for rule, i, candidate, oldrule in worklist:
+ lhs, rhs = rule
+ n = len(rhs)
+ while i < n:
+ sym = rhs[i]
+ if not self.rules.has_key(sym) or \
+ not self.nullable[sym]:
+ candidate = 0
+ i = i + 1
+ continue
+
+ newrhs = list(rhs)
+ newrhs[i] = self._NULLABLE+sym
+ newrule = (lhs, tuple(newrhs))
+ worklist.append((newrule, i+1,
+ candidate, oldrule))
+ candidate = 0
+ i = i + 1
+ else:
+ if candidate:
+ lhs = self._NULLABLE+lhs
+ rule = (lhs, rhs)
+ if self.newrules.has_key(lhs):
+ self.newrules[lhs].append(rule)
+ else:
+ self.newrules[lhs] = [ rule ]
+ self.new2old[rule] = oldrule
+
+ def typestring(self, token):
+ return None
+
+ def error(self, token):
+ print "Syntax error at or near `%s' token" % token
+ raise SystemExit
+
+ def parse(self, tokens):
+ sets = [ [(1,0), (2,0)] ]
+ self.links = {}
+
+ if self.ruleschanged:
+ self.computeNull()
+ self.newrules = {}
+ self.new2old = {}
+ self.makeNewRules()
+ self.ruleschanged = 0
+ self.edges, self.cores = {}, {}
+ self.states = { 0: self.makeState0() }
+ self.makeState(0, self._BOF)
+
+ for i in xrange(len(tokens)):
+ sets.append([])
+
+ if sets[i] == []:
+ break
+ self.makeSet(tokens[i], sets, i)
+ else:
+ sets.append([])
+ self.makeSet(None, sets, len(tokens))
+
+ #_dump(tokens, sets, self.states)
+
+ finalitem = (self.finalState(tokens), 0)
+ if finalitem not in sets[-2]:
+ if len(tokens) > 0:
+ self.error(tokens[i-1])
+ else:
+ self.error(None)
+
+ return self.buildTree(self._START, finalitem,
+ tokens, len(sets)-2)
+
+ def isnullable(self, sym):
+ #
+ # For symbols in G_e only. If we weren't supporting 1.5,
+ # could just use sym.startswith().
+ #
+ return self._NULLABLE == sym[0:len(self._NULLABLE)]
+
+ def skip(self, (lhs, rhs), pos=0):
+ n = len(rhs)
+ while pos < n:
+ if not self.isnullable(rhs[pos]):
+ break
+ pos = pos + 1
+ return pos
+
+ def makeState(self, state, sym):
+ assert sym is not None
+ #
+ # Compute \epsilon-kernel state's core and see if
+ # it exists already.
+ #
+ kitems = []
+ for rule, pos in self.states[state].items:
+ lhs, rhs = rule
+ if rhs[pos:pos+1] == (sym,):
+ kitems.append((rule, self.skip(rule, pos+1)))
+ core = kitems
+
+ core.sort()
+ tcore = tuple(core)
+ if self.cores.has_key(tcore):
+ return self.cores[tcore]
+ #
+ # Nope, doesn't exist. Compute it and the associated
+ # \epsilon-nonkernel state together; we'll need it right away.
+ #
+ k = self.cores[tcore] = len(self.states)
+ K, NK = _State(k, kitems), _State(k+1, [])
+ self.states[k] = K
+ predicted = {}
+
+ edges = self.edges
+ rules = self.newrules
+ for X in K, NK:
+ worklist = X.items
+ for item in worklist:
+ rule, pos = item
+ lhs, rhs = rule
+ if pos == len(rhs):
+ X.complete.append(rule)
+ continue
+
+ nextSym = rhs[pos]
+ key = (X.stateno, nextSym)
+ if not rules.has_key(nextSym):
+ if not edges.has_key(key):
+ edges[key] = None
+ X.T.append(nextSym)
+ else:
+ edges[key] = None
+ if not predicted.has_key(nextSym):
+ predicted[nextSym] = 1
+ for prule in rules[nextSym]:
+ ppos = self.skip(prule)
+ new = (prule, ppos)
+ NK.items.append(new)
+ #
+ # Problem: we know K needs generating, but we
+ # don't yet know about NK. Can't commit anything
+ # regarding NK to self.edges until we're sure. Should
+ # we delay committing on both K and NK to avoid this
+ # hacky code? This creates other problems..
+ #
+ if X is K:
+ edges = {}
+
+ if NK.items == []:
+ return k
+
+ #
+ # Check for \epsilon-nonkernel's core. Unfortunately we
+ # need to know the entire set of predicted nonterminals
+ # to do this without accidentally duplicating states.
+ #
+ core = predicted.keys()
+ core.sort()
+ tcore = tuple(core)
+ if self.cores.has_key(tcore):
+ self.edges[(k, None)] = self.cores[tcore]
+ return k
+
+ nk = self.cores[tcore] = self.edges[(k, None)] = NK.stateno
+ self.edges.update(edges)
+ self.states[nk] = NK
+ return k
+
+ def goto(self, state, sym):
+ key = (state, sym)
+ if not self.edges.has_key(key):
+ #
+ # No transitions from state on sym.
+ #
+ return None
+
+ rv = self.edges[key]
+ if rv is None:
+ #
+ # Target state isn't generated yet. Remedy this.
+ #
+ rv = self.makeState(state, sym)
+ self.edges[key] = rv
+ return rv
+
+ def gotoT(self, state, t):
+ return [self.goto(state, t)]
+
+ def gotoST(self, state, st):
+ rv = []
+ for t in self.states[state].T:
+ if st == t:
+ rv.append(self.goto(state, t))
+ return rv
+
+ def add(self, set, item, i=None, predecessor=None, causal=None):
+ if predecessor is None:
+ if item not in set:
+ set.append(item)
+ else:
+ key = (item, i)
+ if item not in set:
+ self.links[key] = []
+ set.append(item)
+ self.links[key].append((predecessor, causal))
+
+ def makeSet(self, token, sets, i):
+ cur, next = sets[i], sets[i+1]
+
+ ttype = token is not None and self.typestring(token) or None
+ if ttype is not None:
+ fn, arg = self.gotoT, ttype
+ else:
+ fn, arg = self.gotoST, token
+
+ for item in cur:
+ ptr = (item, i)
+ state, parent = item
+ add = fn(state, arg)
+ for k in add:
+ if k is not None:
+ self.add(next, (k, parent), i+1, ptr)
+ nk = self.goto(k, None)
+ if nk is not None:
+ self.add(next, (nk, i+1))
+
+ if parent == i:
+ continue
+
+ for rule in self.states[state].complete:
+ lhs, rhs = rule
+ for pitem in sets[parent]:
+ pstate, pparent = pitem
+ k = self.goto(pstate, lhs)
+ if k is not None:
+ why = (item, i, rule)
+ pptr = (pitem, parent)
+ self.add(cur, (k, pparent),
+ i, pptr, why)
+ nk = self.goto(k, None)
+ if nk is not None:
+ self.add(cur, (nk, i))
+
+ def makeSet_fast(self, token, sets, i):
+ #
+ # Call *only* when the entire state machine has been built!
+ # It relies on self.edges being filled in completely, and
+ # then duplicates and inlines code to boost speed at the
+ # cost of extreme ugliness.
+ #
+ cur, next = sets[i], sets[i+1]
+ ttype = token is not None and self.typestring(token) or None
+
+ for item in cur:
+ ptr = (item, i)
+ state, parent = item
+ if ttype is not None:
+ k = self.edges.get((state, ttype), None)
+ if k is not None:
+ #self.add(next, (k, parent), i+1, ptr)
+ #INLINED --v
+ new = (k, parent)
+ key = (new, i+1)
+ if new not in next:
+ self.links[key] = []
+ next.append(new)
+ self.links[key].append((ptr, None))
+ #INLINED --^
+ #nk = self.goto(k, None)
+ nk = self.edges.get((k, None), None)
+ if nk is not None:
+ #self.add(next, (nk, i+1))
+ #INLINED --v
+ new = (nk, i+1)
+ if new not in next:
+ next.append(new)
+ #INLINED --^
+ else:
+ add = self.gotoST(state, token)
+ for k in add:
+ if k is not None:
+ self.add(next, (k, parent), i+1, ptr)
+ #nk = self.goto(k, None)
+ nk = self.edges.get((k, None), None)
+ if nk is not None:
+ self.add(next, (nk, i+1))
+
+ if parent == i:
+ continue
+
+ for rule in self.states[state].complete:
+ lhs, rhs = rule
+ for pitem in sets[parent]:
+ pstate, pparent = pitem
+ #k = self.goto(pstate, lhs)
+ k = self.edges.get((pstate, lhs), None)
+ if k is not None:
+ why = (item, i, rule)
+ pptr = (pitem, parent)
+ #self.add(cur, (k, pparent),
+ # i, pptr, why)
+ #INLINED --v
+ new = (k, pparent)
+ key = (new, i)
+ if new not in cur:
+ self.links[key] = []
+ cur.append(new)
+ self.links[key].append((pptr, why))
+ #INLINED --^
+ #nk = self.goto(k, None)
+ nk = self.edges.get((k, None), None)
+ if nk is not None:
+ #self.add(cur, (nk, i))
+ #INLINED --v
+ new = (nk, i)
+ if new not in cur:
+ cur.append(new)
+ #INLINED --^
+
+ def predecessor(self, key, causal):
+ for p, c in self.links[key]:
+ if c == causal:
+ return p
+ assert 0
+
+ def causal(self, key):
+ links = self.links[key]
+ if len(links) == 1:
+ return links[0][1]
+ choices = []
+ rule2cause = {}
+ for p, c in links:
+ rule = c[2]
+ choices.append(rule)
+ rule2cause[rule] = c
+ return rule2cause[self.ambiguity(choices)]
+
+ def deriveEpsilon(self, nt):
+ if len(self.newrules[nt]) > 1:
+ rule = self.ambiguity(self.newrules[nt])
+ else:
+ rule = self.newrules[nt][0]
+ #print rule
+
+ rhs = rule[1]
+ attr = [None] * len(rhs)
+
+ for i in range(len(rhs)-1, -1, -1):
+ attr[i] = self.deriveEpsilon(rhs[i])
+ return self.rule2func[self.new2old[rule]](attr)
+
+ def buildTree(self, nt, item, tokens, k):
+ state, parent = item
+
+ choices = []
+ for rule in self.states[state].complete:
+ if rule[0] == nt:
+ choices.append(rule)
+ rule = choices[0]
+ if len(choices) > 1:
+ rule = self.ambiguity(choices)
+ #print rule
+
+ rhs = rule[1]
+ attr = [None] * len(rhs)
+
+ for i in range(len(rhs)-1, -1, -1):
+ sym = rhs[i]
+ if not self.newrules.has_key(sym):
+ if sym != self._BOF:
+ attr[i] = tokens[k-1]
+ key = (item, k)
+ item, k = self.predecessor(key, None)
+ #elif self.isnullable(sym):
+ elif self._NULLABLE == sym[0:len(self._NULLABLE)]:
+ attr[i] = self.deriveEpsilon(sym)
+ else:
+ key = (item, k)
+ why = self.causal(key)
+ attr[i] = self.buildTree(sym, why[0],
+ tokens, why[1])
+ item, k = self.predecessor(key, why)
+ return self.rule2func[self.new2old[rule]](attr)
+
+ def ambiguity(self, rules):
+ #
+ # XXX - problem here and in collectRules() if the same rule
+ # appears in >1 method. Also undefined results if rules
+ # causing the ambiguity appear in the same method.
+ #
+ sortlist = []
+ name2index = {}
+ for i in range(len(rules)):
+ lhs, rhs = rule = rules[i]
+ name = self.rule2name[self.new2old[rule]]
+ sortlist.append((len(rhs), name))
+ name2index[name] = i
+ sortlist.sort()
+ list = map(lambda (a,b): b, sortlist)
+ return rules[name2index[self.resolve(list)]]
+
+ def resolve(self, list):
+ #
+ # Resolve ambiguity in favor of the shortest RHS.
+ # Since we walk the tree from the top down, this
+ # should effectively resolve in favor of a "shift".
+ #
+ return list[0]
+
+#
+# GenericASTBuilder automagically constructs a concrete/abstract syntax tree
+# for a given input. The extra argument is a class (not an instance!)
+# which supports the "__setslice__" and "__len__" methods.
+#
+# XXX - silently overrides any user code in methods.
+#
+
+class GenericASTBuilder(GenericParser):
+ def __init__(self, AST, start):
+ GenericParser.__init__(self, start)
+ self.AST = AST
+
+ def preprocess(self, rule, func):
+ rebind = lambda lhs, self=self: \
+ lambda args, lhs=lhs, self=self: \
+ self.buildASTNode(args, lhs)
+ lhs, rhs = rule
+ return rule, rebind(lhs)
+
+ def buildASTNode(self, args, lhs):
+ children = []
+ for arg in args:
+ if isinstance(arg, self.AST):
+ children.append(arg)
+ else:
+ children.append(self.terminal(arg))
+ return self.nonterminal(lhs, children)
+
+ def terminal(self, token): return token
+
+ def nonterminal(self, type, args):
+ rv = self.AST(type)
+ rv[:len(args)] = args
+ return rv
+
+#
+# GenericASTTraversal is a Visitor pattern according to Design Patterns. For
+# each node it attempts to invoke the method n_<node type>, falling
+# back onto the default() method if the n_* can't be found. The preorder
+# traversal also looks for an exit hook named n_<node type>_exit (no default
+# routine is called if it's not found). To prematurely halt traversal
+# of a subtree, call the prune() method -- this only makes sense for a
+# preorder traversal. Node type is determined via the typestring() method.
+#
+
+class GenericASTTraversalPruningException:
+ pass
+
+class GenericASTTraversal:
+ def __init__(self, ast):
+ self.ast = ast
+
+ def typestring(self, node):
+ return node.type
+
+ def prune(self):
+ raise GenericASTTraversalPruningException
+
+ def preorder(self, node=None):
+ if node is None:
+ node = self.ast
+
+ try:
+ name = 'n_' + self.typestring(node)
+ if hasattr(self, name):
+ func = getattr(self, name)
+ func(node)
+ else:
+ self.default(node)
+ except GenericASTTraversalPruningException:
+ return
+
+ for kid in node:
+ self.preorder(kid)
+
+ name = name + '_exit'
+ if hasattr(self, name):
+ func = getattr(self, name)
+ func(node)
+
+ def postorder(self, node=None):
+ if node is None:
+ node = self.ast
+
+ for kid in node:
+ self.postorder(kid)
+
+ name = 'n_' + self.typestring(node)
+ if hasattr(self, name):
+ func = getattr(self, name)
+ func(node)
+ else:
+ self.default(node)
+
+
+ def default(self, node):
+ pass
+
+#
+# GenericASTMatcher. AST nodes must have "__getitem__" and "__cmp__"
+# implemented.
+#
+# XXX - makes assumptions about how GenericParser walks the parse tree.
+#
+
+class GenericASTMatcher(GenericParser):
+ def __init__(self, start, ast):
+ GenericParser.__init__(self, start)
+ self.ast = ast
+
+ def preprocess(self, rule, func):
+ rebind = lambda func, self=self: \
+ lambda args, func=func, self=self: \
+ self.foundMatch(args, func)
+ lhs, rhs = rule
+ rhslist = list(rhs)
+ rhslist.reverse()
+
+ return (lhs, tuple(rhslist)), rebind(func)
+
+ def foundMatch(self, args, func):
+ func(args[-1])
+ return args[-1]
+
+ def match_r(self, node):
+ self.input.insert(0, node)
+ children = 0
+
+ for child in node:
+ if children == 0:
+ self.input.insert(0, '(')
+ children = children + 1
+ self.match_r(child)
+
+ if children > 0:
+ self.input.insert(0, ')')
+
+ def match(self, ast=None):
+ if ast is None:
+ ast = self.ast
+ self.input = []
+
+ self.match_r(ast)
+ self.parse(self.input)
+
+ def resolve(self, list):
+ #
+ # Resolve ambiguity in favor of the longest RHS.
+ #
+ return list[-1]
+
+def _dump(tokens, sets, states):
+ for i in range(len(sets)):
+ print 'set', i
+ for item in sets[i]:
+ print '\t', item
+ for (lhs, rhs), pos in states[item[0]].items:
+ print '\t\t', lhs, '::=',
+ print string.join(rhs[:pos]),
+ print '.',
+ print string.join(rhs[pos:])
+ if i < len(tokens):
+ print
+ print 'token', str(tokens[i])
+ print
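
GenericParser.collectRules() gathers grammar productions from the docstrings of p_* methods, so a subclass declares its grammar as "lhs ::= rhs" docstrings and the methods double as reduction actions. A self-contained sketch of that convention; it extracts one production per docstring, whereas the real addRule also handles several per docstring and records the action functions:

    class ExprRules:
        def p_expr_plus(self, args):
            ' expr ::= expr + term '

        def p_term_name(self, args):
            ' term ::= NAME '

    def collect(cls):
        rules = {}
        for name in dir(cls):
            if name.startswith("p_"):
                parts = getattr(cls, name).__doc__.split()
                lhs, rhs = parts[0], tuple(parts[2:])
                rules.setdefault(lhs, []).append((lhs, rhs))
        return rules

    assert collect(ExprRules) == {
        "expr": [("expr", ("expr", "+", "term"))],
        "term": [("term", ("NAME",))],
    }
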
diff --git a/Parser/token.c b/Parser/token.c
deleted file mode 100644
index a489668..0000000
--- a/Parser/token.c
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Auto-generated by Tools/scripts/generate_token.py */
-
-#include "Python.h"
-#include "token.h"
-
-/* Token names */
-
-const char * const _PyParser_TokenNames[] = {
- "ENDMARKER",
- "NAME",
- "NUMBER",
- "STRING",
- "NEWLINE",
- "INDENT",
- "DEDENT",
- "LPAR",
- "RPAR",
- "LSQB",
- "RSQB",
- "COLON",
- "COMMA",
- "SEMI",
- "PLUS",
- "MINUS",
- "STAR",
- "SLASH",
- "VBAR",
- "AMPER",
- "LESS",
- "GREATER",
- "EQUAL",
- "DOT",
- "PERCENT",
- "LBRACE",
- "RBRACE",
- "EQEQUAL",
- "NOTEQUAL",
- "LESSEQUAL",
- "GREATEREQUAL",
- "TILDE",
- "CIRCUMFLEX",
- "LEFTSHIFT",
- "RIGHTSHIFT",
- "DOUBLESTAR",
- "PLUSEQUAL",
- "MINEQUAL",
- "STAREQUAL",
- "SLASHEQUAL",
- "PERCENTEQUAL",
- "AMPEREQUAL",
- "VBAREQUAL",
- "CIRCUMFLEXEQUAL",
- "LEFTSHIFTEQUAL",
- "RIGHTSHIFTEQUAL",
- "DOUBLESTAREQUAL",
- "DOUBLESLASH",
- "DOUBLESLASHEQUAL",
- "AT",
- "ATEQUAL",
- "RARROW",
- "ELLIPSIS",
- "COLONEQUAL",
- "OP",
- "AWAIT",
- "ASYNC",
- "TYPE_IGNORE",
- "TYPE_COMMENT",
- "<ERRORTOKEN>",
- "<COMMENT>",
- "<NL>",
- "<ENCODING>",
- "<N_TOKENS>",
-};
-
-/* Return the token corresponding to a single character */
-
-int
-PyToken_OneChar(int c1)
-{
- switch (c1) {
- case '%': return PERCENT;
- case '&': return AMPER;
- case '(': return LPAR;
- case ')': return RPAR;
- case '*': return STAR;
- case '+': return PLUS;
- case ',': return COMMA;
- case '-': return MINUS;
- case '.': return DOT;
- case '/': return SLASH;
- case ':': return COLON;
- case ';': return SEMI;
- case '<': return LESS;
- case '=': return EQUAL;
- case '>': return GREATER;
- case '@': return AT;
- case '[': return LSQB;
- case ']': return RSQB;
- case '^': return CIRCUMFLEX;
- case '{': return LBRACE;
- case '|': return VBAR;
- case '}': return RBRACE;
- case '~': return TILDE;
- }
- return OP;
-}
-
-int
-PyToken_TwoChars(int c1, int c2)
-{
- switch (c1) {
- case '!':
- switch (c2) {
- case '=': return NOTEQUAL;
- }
- break;
- case '%':
- switch (c2) {
- case '=': return PERCENTEQUAL;
- }
- break;
- case '&':
- switch (c2) {
- case '=': return AMPEREQUAL;
- }
- break;
- case '*':
- switch (c2) {
- case '*': return DOUBLESTAR;
- case '=': return STAREQUAL;
- }
- break;
- case '+':
- switch (c2) {
- case '=': return PLUSEQUAL;
- }
- break;
- case '-':
- switch (c2) {
- case '=': return MINEQUAL;
- case '>': return RARROW;
- }
- break;
- case '/':
- switch (c2) {
- case '/': return DOUBLESLASH;
- case '=': return SLASHEQUAL;
- }
- break;
- case ':':
- switch (c2) {
- case '=': return COLONEQUAL;
- }
- break;
- case '<':
- switch (c2) {
- case '<': return LEFTSHIFT;
- case '=': return LESSEQUAL;
- case '>': return NOTEQUAL;
- }
- break;
- case '=':
- switch (c2) {
- case '=': return EQEQUAL;
- }
- break;
- case '>':
- switch (c2) {
- case '=': return GREATEREQUAL;
- case '>': return RIGHTSHIFT;
- }
- break;
- case '@':
- switch (c2) {
- case '=': return ATEQUAL;
- }
- break;
- case '^':
- switch (c2) {
- case '=': return CIRCUMFLEXEQUAL;
- }
- break;
- case '|':
- switch (c2) {
- case '=': return VBAREQUAL;
- }
- break;
- }
- return OP;
-}
-
-int
-PyToken_ThreeChars(int c1, int c2, int c3)
-{
- switch (c1) {
- case '*':
- switch (c2) {
- case '*':
- switch (c3) {
- case '=': return DOUBLESTAREQUAL;
- }
- break;
- }
- break;
- case '.':
- switch (c2) {
- case '.':
- switch (c3) {
- case '.': return ELLIPSIS;
- }
- break;
- }
- break;
- case '/':
- switch (c2) {
- case '/':
- switch (c3) {
- case '=': return DOUBLESLASHEQUAL;
- }
- break;
- }
- break;
- case '<':
- switch (c2) {
- case '<':
- switch (c3) {
- case '=': return LEFTSHIFTEQUAL;
- }
- break;
- }
- break;
- case '>':
- switch (c2) {
- case '>':
- switch (c3) {
- case '=': return RIGHTSHIFTEQUAL;
- }
- break;
- }
- break;
- }
- return OP;
-}
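
The deleted PyToken_OneChar, PyToken_TwoChars and PyToken_ThreeChars dispatch operator text to token numbers through nested switches, falling back to the generic OP token. The same lookup expressed as a dictionary, reproducing only a handful of the operators from the deleted tables:

    OPMAP = {
        "%": "PERCENT", "(": "LPAR", ")": "RPAR", "*": "STAR", "+": "PLUS",
        "==": "EQEQUAL", "!=": "NOTEQUAL", "<>": "NOTEQUAL", "->": "RARROW",
        "**=": "DOUBLESTAREQUAL", "...": "ELLIPSIS", "//=": "DOUBLESLASHEQUAL",
    }

    def token_for(op):
        # Anything not in the table is the generic OP token, as in the C code.
        return OPMAP.get(op, "OP")

    assert token_for("<>") == "NOTEQUAL"
    assert token_for("$") == "OP"
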
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index f84093d..8966661 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -2,6 +2,7 @@
/* Tokenizer implementation */
#include "Python.h"
+#include "pgenheaders.h"
#include <ctype.h>
#include <assert.h>
@@ -9,29 +10,16 @@
#include "tokenizer.h"
#include "errcode.h"
+#ifndef PGEN
#include "unicodeobject.h"
-#include "bytesobject.h"
+#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
+#include "pydebug.h"
+#endif /* PGEN */
-/* Alternate tab spacing */
-#define ALTTABSIZE 1
-
-#define is_potential_identifier_start(c) (\
- (c >= 'a' && c <= 'z')\
- || (c >= 'A' && c <= 'Z')\
- || c == '_'\
- || (c >= 128))
-
-#define is_potential_identifier_char(c) (\
- (c >= 'a' && c <= 'z')\
- || (c >= 'A' && c <= 'Z')\
- || (c >= '0' && c <= '9')\
- || c == '_'\
- || (c >= 128))
-
-extern char *PyOS_Readline(FILE *, FILE *, const char *);
+extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF;
NULL if interrupted */
@@ -44,10 +32,65 @@ static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
-
-/* Spaces in this constant are treated as "zero or more spaces or tabs" when
- tokenizing. */
-static const char* type_comment_prefix = "# type: ";
+/* Token names */
+
+char *_PyParser_TokenNames[] = {
+ "ENDMARKER",
+ "NAME",
+ "NUMBER",
+ "STRING",
+ "NEWLINE",
+ "INDENT",
+ "DEDENT",
+ "LPAR",
+ "RPAR",
+ "LSQB",
+ "RSQB",
+ "COLON",
+ "COMMA",
+ "SEMI",
+ "PLUS",
+ "MINUS",
+ "STAR",
+ "SLASH",
+ "VBAR",
+ "AMPER",
+ "LESS",
+ "GREATER",
+ "EQUAL",
+ "DOT",
+ "PERCENT",
+ "BACKQUOTE",
+ "LBRACE",
+ "RBRACE",
+ "EQEQUAL",
+ "NOTEQUAL",
+ "LESSEQUAL",
+ "GREATEREQUAL",
+ "TILDE",
+ "CIRCUMFLEX",
+ "LEFTSHIFT",
+ "RIGHTSHIFT",
+ "DOUBLESTAR",
+ "PLUSEQUAL",
+ "MINEQUAL",
+ "STAREQUAL",
+ "SLASHEQUAL",
+ "PERCENTEQUAL",
+ "AMPEREQUAL",
+ "VBAREQUAL",
+ "CIRCUMFLEXEQUAL",
+ "LEFTSHIFTEQUAL",
+ "RIGHTSHIFTEQUAL",
+ "DOUBLESTAREQUAL",
+ "DOUBLESLASH",
+ "DOUBLESLASHEQUAL",
+ "AT",
+ /* This table must match the #defines in token.h! */
+ "OP",
+ "<ERRORTOKEN>",
+ "<N_TOKENS>"
+};
/* Create and initialize a new tok_state structure */
@@ -65,45 +108,61 @@ tok_new(void)
tok->tabsize = TABSIZE;
tok->indent = 0;
tok->indstack[0] = 0;
-
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0;
tok->level = 0;
+ tok->filename = NULL;
+ tok->altwarning = 0;
+ tok->alterror = 0;
+ tok->alttabsize = 1;
tok->altindstack[0] = 0;
- tok->decoding_state = STATE_INIT;
+ tok->decoding_state = 0;
tok->decoding_erred = 0;
tok->read_coding_spec = 0;
- tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
- tok->filename = NULL;
+#ifndef PGEN
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
- tok->type_comments = 0;
-
- tok->async_hacks = 0;
- tok->async_def = 0;
- tok->async_def_indent = 0;
- tok->async_def_nl = 0;
-
+#endif
return tok;
}
static char *
-new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
+new_string(const char *s, Py_ssize_t len)
{
char* result = (char *)PyMem_MALLOC(len + 1);
- if (!result) {
- tok->done = E_NOMEM;
- return NULL;
+ if (result != NULL) {
+ memcpy(result, s, len);
+ result[len] = '\0';
}
- memcpy(result, s, len);
- result[len] = '\0';
return result;
}
+#ifdef PGEN
+
+static char *
+decoding_fgets(char *s, int size, struct tok_state *tok)
+{
+ return fgets(s, size, tok->fp);
+}
+
+static int
+decoding_feof(struct tok_state *tok)
+{
+ return feof(tok->fp);
+}
+
+static char *
+decode_str(const char *str, int exec_input, struct tok_state *tok)
+{
+ return new_string(str, strlen(str));
+}
+
+#else /* PGEN */
+
static char *
error_ret(struct tok_state *tok) /* XXX */
{
@@ -116,8 +175,8 @@ error_ret(struct tok_state *tok) /* XXX */
}
-static const char *
-get_normal_name(const char *s) /* for utf-8 and latin-1 */
+static char *
+get_normal_name(char *s) /* for utf-8 and latin-1 */
{
char buf[13];
int i;
@@ -147,18 +206,17 @@ get_normal_name(const char *s) /* for utf-8 and latin-1 */
/* Return the coding spec in S, or NULL if none is found. */
-static int
-get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
+static char *
+get_coding_spec(const char *s, Py_ssize_t size)
{
Py_ssize_t i;
- *spec = NULL;
/* Coding spec must be in a comment, and that comment must be
* the only statement on the source code line. */
for (i = 0; i < size - 6; i++) {
if (s[i] == '#')
break;
if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
- return 1;
+ return NULL;
}
for (; i < size - 6; i++) { /* XXX inefficient search */
const char* t = s + i;
@@ -177,23 +235,20 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t
t++;
if (begin < t) {
- char* r = new_string(begin, t - begin, tok);
- const char* q;
+ char* r = new_string(begin, t - begin);
+ char* q;
if (!r)
- return 0;
+ return NULL;
q = get_normal_name(r);
if (r != q) {
PyMem_FREE(r);
- r = new_string(q, strlen(q), tok);
- if (!r)
- return 0;
+ r = new_string(q, strlen(q));
}
- *spec = r;
- break;
+ return r;
}
}
}
- return 1;
+ return NULL;
}
/* Check whether the line contains a coding spec. If it does,
@@ -205,7 +260,7 @@ static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
int set_readline(struct tok_state *, const char *))
{
- char *cs;
+ char * cs;
int r = 1;
if (tok->cont_line) {
@@ -213,8 +268,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
tok->read_coding_spec = 1;
return 1;
}
- if (!get_coding_spec(line, &cs, size, tok))
- return 0;
+ cs = get_coding_spec(line, size);
if (!cs) {
Py_ssize_t i;
for (i = 0; i < size; i++) {
@@ -227,31 +281,40 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
break;
}
}
- return 1;
- }
- tok->read_coding_spec = 1;
- if (tok->encoding == NULL) {
- assert(tok->decoding_state == STATE_RAW);
- if (strcmp(cs, "utf-8") == 0) {
- tok->encoding = cs;
- } else {
- r = set_readline(tok, cs);
- if (r) {
+ } else {
+ tok->read_coding_spec = 1;
+ if (tok->encoding == NULL) {
+ assert(tok->decoding_state == 1); /* raw */
+ if (strcmp(cs, "utf-8") == 0 ||
+ strcmp(cs, "iso-8859-1") == 0) {
tok->encoding = cs;
- tok->decoding_state = STATE_NORMAL;
- }
- else {
- PyErr_Format(PyExc_SyntaxError,
- "encoding problem: %s", cs);
+ } else {
+#ifdef Py_USING_UNICODE
+ r = set_readline(tok, cs);
+ if (r) {
+ tok->encoding = cs;
+ tok->decoding_state = -1;
+ }
+ else {
+ PyErr_Format(PyExc_SyntaxError,
+ "encoding problem: %s", cs);
+ PyMem_FREE(cs);
+ }
+#else
+ /* Without Unicode support, we cannot
+ process the coding spec. Since there
+ won't be any Unicode literals, that
+ won't matter. */
PyMem_FREE(cs);
+#endif
}
+ } else { /* then, compare cs with BOM */
+ r = (strcmp(tok->encoding, cs) == 0);
+ if (!r)
+ PyErr_Format(PyExc_SyntaxError,
+ "encoding problem: %s with BOM", cs);
+ PyMem_FREE(cs);
}
- } else { /* then, compare cs with BOM */
- r = (strcmp(tok->encoding, cs) == 0);
- if (!r)
- PyErr_Format(PyExc_SyntaxError,
- "encoding problem: %s with BOM", cs);
- PyMem_FREE(cs);
}
return r;
}
@@ -268,7 +331,7 @@ check_bom(int get_char(struct tok_state *),
{
int ch1, ch2, ch3;
ch1 = get_char(tok);
- tok->decoding_state = STATE_RAW;
+ tok->decoding_state = 1;
if (ch1 == EOF) {
return 1;
} else if (ch1 == 0xEF) {
@@ -297,7 +360,7 @@ check_bom(int get_char(struct tok_state *),
}
if (!set_readline(tok, "utf-16-be"))
return 0;
- tok->decoding_state = STATE_NORMAL;
+ tok->decoding_state = -1;
} else if (ch1 == 0xFF) {
ch2 = get_char(tok);
if (ch2 != 0xFE) {
@@ -307,7 +370,7 @@ check_bom(int get_char(struct tok_state *),
}
if (!set_readline(tok, "utf-16-le"))
return 0;
- tok->decoding_state = STATE_NORMAL;
+ tok->decoding_state = -1;
#endif
} else {
unget_char(ch1, tok);
@@ -315,10 +378,7 @@ check_bom(int get_char(struct tok_state *),
}
if (tok->encoding != NULL)
PyMem_FREE(tok->encoding);
- tok->encoding = new_string("utf-8", 5, tok);
- if (!tok->encoding)
- return 0;
- /* No need to set_readline: input is already utf-8 */
+ tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
return 1;
}
@@ -329,7 +389,7 @@ check_bom(int get_char(struct tok_state *),
1) NULL: need to call tok->decoding_readline to get a new line
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
stored the result in tok->decoding_buffer
- 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
+ 3) PyStringObject *: previous call to fp_readl did not have enough room
(in the s buffer) to copy entire contents of the line read
by tok->decoding_readline. tok->decoding_buffer has the overflow.
In this case, fp_readl is called in a loop (with an expanded buffer)
@@ -340,62 +400,58 @@ check_bom(int get_char(struct tok_state *),
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
- PyObject* bufobj;
- const char *buf;
- Py_ssize_t buflen;
+#ifndef Py_USING_UNICODE
+ /* In a non-Unicode built, this should never be called. */
+ Py_FatalError("fp_readl should not be called in this build.");
+ return NULL; /* Keep compiler happy (not reachable) */
+#else
+ PyObject* utf8 = NULL;
+ PyObject* buf = tok->decoding_buffer;
+ char *str;
+ Py_ssize_t utf8len;
/* Ask for one less byte so we can terminate it */
assert(size > 0);
size--;
- if (tok->decoding_buffer) {
- bufobj = tok->decoding_buffer;
- Py_INCREF(bufobj);
- }
- else
- {
- bufobj = _PyObject_CallNoArg(tok->decoding_readline);
- if (bufobj == NULL)
- goto error;
- }
- if (PyUnicode_CheckExact(bufobj))
- {
- buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
- if (buf == NULL) {
- goto error;
+ if (buf == NULL) {
+ buf = PyObject_CallObject(tok->decoding_readline, NULL);
+ if (buf == NULL)
+ return error_ret(tok);
+ if (!PyUnicode_Check(buf)) {
+ Py_DECREF(buf);
+ PyErr_SetString(PyExc_SyntaxError,
+ "codec did not return a unicode object");
+ return error_ret(tok);
}
+ } else {
+ tok->decoding_buffer = NULL;
+ if (PyString_CheckExact(buf))
+ utf8 = buf;
}
- else
- {
- buf = PyByteArray_AsString(bufobj);
- if (buf == NULL) {
- goto error;
- }
- buflen = PyByteArray_GET_SIZE(bufobj);
+ if (utf8 == NULL) {
+ utf8 = PyUnicode_AsUTF8String(buf);
+ Py_DECREF(buf);
+ if (utf8 == NULL)
+ return error_ret(tok);
}
-
- Py_XDECREF(tok->decoding_buffer);
- if (buflen > size) {
- /* Too many chars, the rest goes into tok->decoding_buffer */
- tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
- buflen-size);
- if (tok->decoding_buffer == NULL)
- goto error;
- buflen = size;
+ str = PyString_AsString(utf8);
+ utf8len = PyString_GET_SIZE(utf8);
+ if (utf8len > size) {
+ tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
+ if (tok->decoding_buffer == NULL) {
+ Py_DECREF(utf8);
+ return error_ret(tok);
+ }
+ utf8len = size;
}
- else
- tok->decoding_buffer = NULL;
-
- memcpy(s, buf, buflen);
- s[buflen] = '\0';
- if (buflen == 0) /* EOF */
- s = NULL;
- Py_DECREF(bufobj);
+ memcpy(s, str, utf8len);
+ s[utf8len] = '\0';
+ Py_DECREF(utf8);
+ if (utf8len == 0)
+ return NULL; /* EOF */
return s;
-
-error:
- Py_XDECREF(bufobj);
- return error_ret(tok);
+#endif
}
/* Set the readline function for TOK to a StreamReader's
@@ -411,48 +467,24 @@ error:
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
- PyObject *readline, *io, *stream;
- _Py_IDENTIFIER(open);
- _Py_IDENTIFIER(readline);
- int fd;
- long pos;
-
- fd = fileno(tok->fp);
- /* Due to buffering the file offset for fd can be different from the file
- * position of tok->fp. If tok->fp was opened in text mode on Windows,
- * its file position counts CRLF as one char and can't be directly mapped
- * to the file offset for fd. Instead we step back one byte and read to
- * the end of line.*/
- pos = ftell(tok->fp);
- if (pos == -1 ||
- lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
- PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
- return 0;
- }
+ PyObject *reader, *stream, *readline;
- io = PyImport_ImportModuleNoBlock("io");
- if (io == NULL)
- return 0;
-
- stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
- fd, "r", -1, enc, Py_None, Py_None, Py_False);
- Py_DECREF(io);
+ /* XXX: constify filename argument. */
+ stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
if (stream == NULL)
return 0;
- readline = _PyObject_GetAttrId(stream, &PyId_readline);
+ reader = PyCodec_StreamReader(enc, stream, NULL);
Py_DECREF(stream);
- if (readline == NULL)
+ if (reader == NULL)
return 0;
- Py_XSETREF(tok->decoding_readline, readline);
- if (pos > 0) {
- PyObject *bufobj = _PyObject_CallNoArg(readline);
- if (bufobj == NULL)
- return 0;
- Py_DECREF(bufobj);
- }
+ readline = PyObject_GetAttrString(reader, "readline");
+ Py_DECREF(reader);
+ if (readline == NULL)
+ return 0;
+ tok->decoding_readline = readline;
return 1;
}
@@ -468,34 +500,6 @@ static void fp_ungetc(int c, struct tok_state *tok) {
ungetc(c, tok->fp);
}
-/* Check whether the characters at s start a valid
- UTF-8 sequence. Return the number of characters forming
- the sequence if yes, 0 if not. */
-static int valid_utf8(const unsigned char* s)
-{
- int expected = 0;
- int length;
- if (*s < 0x80)
- /* single-byte code */
- return 1;
- if (*s < 0xc0)
- /* following byte */
- return 0;
- if (*s < 0xE0)
- expected = 1;
- else if (*s < 0xF0)
- expected = 2;
- else if (*s < 0xF8)
- expected = 3;
- else
- return 0;
- length = expected + 1;
- for (; expected; expected--)
- if (s[expected] < 0x80 || s[expected] >= 0xC0)
- return 0;
- return length;
-}
-
/* Read a line of input from TOK. Determine encoding
if necessary. */
@@ -505,12 +509,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
char *line = NULL;
int badchar = 0;
for (;;) {
- if (tok->decoding_state == STATE_NORMAL) {
+ if (tok->decoding_state < 0) {
/* We already have a codec associated with
this input. */
line = fp_readl(s, size, tok);
break;
- } else if (tok->decoding_state == STATE_RAW) {
+ } else if (tok->decoding_state > 0) {
/* We want a 'raw' read. */
line = Py_UniversalNewlineFgets(s, size,
tok->fp, NULL);
@@ -521,7 +525,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
reader functions from now on. */
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
return error_ret(tok);
- assert(tok->decoding_state != STATE_INIT);
+ assert(tok->decoding_state != 0);
}
}
if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
@@ -529,40 +533,43 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
return error_ret(tok);
}
}
- /* The default encoding is UTF-8, so make sure we don't have any
- non-UTF-8 sequences in it. */
+#ifndef PGEN
+ /* The default encoding is ASCII, so make sure we don't have any
+ non-ASCII bytes in it. */
if (line && !tok->encoding) {
unsigned char *c;
- int length;
- for (c = (unsigned char *)line; *c; c += length)
- if (!(length = valid_utf8(c))) {
+ for (c = (unsigned char *)line; *c; c++)
+ if (*c > 127) {
badchar = *c;
break;
}
}
if (badchar) {
+ char buf[500];
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
- PyErr_Format(PyExc_SyntaxError,
- "Non-UTF-8 code starting with '\\x%.2x' "
- "in file %U on line %i, "
- "but no encoding declared; "
- "see http://python.org/dev/peps/pep-0263/ for details",
- badchar, tok->filename, tok->lineno + 1);
+ sprintf(buf,
+ "Non-ASCII character '\\x%.2x' "
+ "in file %.200s on line %i, "
+ "but no encoding declared; "
+ "see http://python.org/dev/peps/pep-0263/ for details",
+ badchar, tok->filename, tok->lineno + 1);
+ PyErr_SetString(PyExc_SyntaxError, buf);
return error_ret(tok);
}
+#endif
return line;
}
static int
decoding_feof(struct tok_state *tok)
{
- if (tok->decoding_state != STATE_NORMAL) {
+ if (tok->decoding_state >= 0) {
return feof(tok->fp);
} else {
PyObject* buf = tok->decoding_buffer;
if (buf == NULL) {
- buf = _PyObject_CallNoArg(tok->decoding_readline);
+ buf = PyObject_CallObject(tok->decoding_readline, NULL);
if (buf == NULL) {
error_ret(tok);
return 1;
@@ -601,6 +608,7 @@ buf_setreadl(struct tok_state *tok, const char* enc) {
/* Return a UTF-8 encoding Python string object from the
C byte string STR, which is encoded with ENC. */
+#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
PyObject *utf8;
@@ -611,12 +619,12 @@ translate_into_utf8(const char* str, const char* enc) {
Py_DECREF(buf);
return utf8;
}
+#endif
static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
- int skip_next_lf = 0;
- size_t needed_length = strlen(s) + 2, final_length;
+ int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
char *buf, *current;
char c = '\0';
buf = PyMem_MALLOC(needed_length);
@@ -680,12 +688,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
return error_ret(tok);
str = tok->str; /* string after BOM if any */
assert(str);
+#ifdef Py_USING_UNICODE
if (tok->enc != NULL) {
utf8 = translate_into_utf8(str, tok->enc);
if (utf8 == NULL)
return error_ret(tok);
- str = PyBytes_AsString(utf8);
+ str = PyString_AsString(utf8);
}
+#endif
for (s = str;; s++) {
if (*s == '\0') break;
else if (*s == '\n') {
@@ -707,18 +717,22 @@ decode_str(const char *input, int single, struct tok_state *tok)
return error_ret(tok);
}
}
+#ifdef Py_USING_UNICODE
if (tok->enc != NULL) {
assert(utf8 == NULL);
utf8 = translate_into_utf8(str, tok->enc);
if (utf8 == NULL)
return error_ret(tok);
- str = PyBytes_AS_STRING(utf8);
+ str = PyString_AsString(utf8);
}
+#endif
assert(tok->decoding_buffer == NULL);
tok->decoding_buffer = utf8; /* CAUTION */
return str;
}
+#endif /* PGEN */
+
/* Set up tokenizer for string */
struct tok_state *
@@ -727,7 +741,7 @@ PyTokenizer_FromString(const char *str, int exec_input)
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
- str = decode_str(str, exec_input, tok);
+ str = (char *)decode_str(str, exec_input, tok);
if (str == NULL) {
PyTokenizer_Free(tok);
return NULL;
@@ -738,38 +752,11 @@ PyTokenizer_FromString(const char *str, int exec_input)
return tok;
}
-struct tok_state *
-PyTokenizer_FromUTF8(const char *str, int exec_input)
-{
- struct tok_state *tok = tok_new();
- if (tok == NULL)
- return NULL;
- tok->input = str = translate_newlines(str, exec_input, tok);
- if (str == NULL) {
- PyTokenizer_Free(tok);
- return NULL;
- }
- tok->decoding_state = STATE_RAW;
- tok->read_coding_spec = 1;
- tok->enc = NULL;
- tok->str = str;
- tok->encoding = (char *)PyMem_MALLOC(6);
- if (!tok->encoding) {
- PyTokenizer_Free(tok);
- return NULL;
- }
- strcpy(tok->encoding, "utf-8");
-
- /* XXX: constify members. */
- tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
- return tok;
-}
/* Set up tokenizer for file */
struct tok_state *
-PyTokenizer_FromFile(FILE *fp, const char* enc,
- const char *ps1, const char *ps2)
+PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
struct tok_state *tok = tok_new();
if (tok == NULL)
@@ -783,17 +770,6 @@ PyTokenizer_FromFile(FILE *fp, const char* enc,
tok->fp = fp;
tok->prompt = ps1;
tok->nextprompt = ps2;
- if (enc != NULL) {
- /* Must copy encoding declaration since it
- gets copied into the parse tree. */
- tok->encoding = PyMem_MALLOC(strlen(enc)+1);
- if (!tok->encoding) {
- PyTokenizer_Free(tok);
- return NULL;
- }
- strcpy(tok->encoding, enc);
- tok->decoding_state = STATE_NORMAL;
- }
return tok;
}
@@ -805,9 +781,10 @@ PyTokenizer_Free(struct tok_state *tok)
{
if (tok->encoding != NULL)
PyMem_FREE(tok->encoding);
+#ifndef PGEN
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
- Py_XDECREF(tok->filename);
+#endif
if (tok->fp != NULL && tok->buf != NULL)
PyMem_FREE(tok->buf);
if (tok->input)
@@ -815,10 +792,74 @@ PyTokenizer_Free(struct tok_state *tok)
PyMem_FREE(tok);
}
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+static int
+tok_stdin_decode(struct tok_state *tok, char **inp)
+{
+ PyObject *enc, *sysstdin, *decoded, *utf8;
+ const char *encoding;
+ char *converted;
+
+ if (PySys_GetFile((char *)"stdin", NULL) != stdin)
+ return 0;
+ sysstdin = PySys_GetObject("stdin");
+ if (sysstdin == NULL || !PyFile_Check(sysstdin))
+ return 0;
+
+ enc = ((PyFileObject *)sysstdin)->f_encoding;
+ if (enc == NULL || !PyString_Check(enc))
+ return 0;
+ Py_INCREF(enc);
+
+ encoding = PyString_AsString(enc);
+ decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
+ if (decoded == NULL)
+ goto error_clear;
+
+ utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
+ Py_DECREF(decoded);
+ if (utf8 == NULL)
+ goto error_clear;
+
+ assert(PyString_Check(utf8));
+ converted = new_string(PyString_AS_STRING(utf8),
+ PyString_GET_SIZE(utf8));
+ Py_DECREF(utf8);
+ if (converted == NULL)
+ goto error_nomem;
+
+ PyMem_FREE(*inp);
+ *inp = converted;
+ if (tok->encoding != NULL)
+ PyMem_FREE(tok->encoding);
+ tok->encoding = new_string(encoding, strlen(encoding));
+ if (tok->encoding == NULL)
+ goto error_nomem;
+
+ Py_DECREF(enc);
+ return 0;
+
+error_nomem:
+ Py_DECREF(enc);
+ tok->done = E_NOMEM;
+ return -1;
+
+error_clear:
+ Py_DECREF(enc);
+ if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
+ tok->done = E_ERROR;
+ return -1;
+ }
+ /* Fallback to iso-8859-1: for backward compatibility */
+ PyErr_Clear();
+ return 0;
+}
+#endif
+
/* Get next char, updating state; error code goes into tok->done */
static int
-tok_nextc(struct tok_state *tok)
+tok_nextc(register struct tok_state *tok)
{
for (;;) {
if (tok->cur != tok->inp) {
@@ -846,34 +887,6 @@ tok_nextc(struct tok_state *tok)
}
if (tok->prompt != NULL) {
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
- if (newtok != NULL) {
- char *translated = translate_newlines(newtok, 0, tok);
- PyMem_FREE(newtok);
- if (translated == NULL)
- return EOF;
- newtok = translated;
- }
- if (tok->encoding && newtok && *newtok) {
- /* Recode to UTF-8 */
- Py_ssize_t buflen;
- const char* buf;
- PyObject *u = translate_into_utf8(newtok, tok->encoding);
- PyMem_FREE(newtok);
- if (!u) {
- tok->done = E_DECODE;
- return EOF;
- }
- buflen = PyBytes_GET_SIZE(u);
- buf = PyBytes_AS_STRING(u);
- newtok = PyMem_MALLOC(buflen+1);
- if (newtok == NULL) {
- Py_DECREF(u);
- tok->done = E_NOMEM;
- return EOF;
- }
- strcpy(newtok, buf);
- Py_DECREF(u);
- }
if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt;
if (newtok == NULL)
@@ -882,6 +895,10 @@ tok_nextc(struct tok_state *tok)
PyMem_FREE(newtok);
tok->done = E_EOF;
}
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+ else if (tok_stdin_decode(tok, &newtok) != 0)
+ PyMem_FREE(newtok);
+#endif
else if (tok->start != NULL) {
size_t start = tok->start - tok->buf;
size_t oldlen = tok->cur - tok->buf;
@@ -956,7 +973,6 @@ tok_nextc(struct tok_state *tok)
while (!done) {
Py_ssize_t curstart = tok->start == NULL ? -1 :
tok->start - tok->buf;
- Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Py_ssize_t curvalid = tok->inp - tok->buf;
Py_ssize_t newsize = curvalid + BUFSIZ;
char *newbuf = tok->buf;
@@ -969,7 +985,6 @@ tok_nextc(struct tok_state *tok)
}
tok->buf = newbuf;
tok->cur = tok->buf + cur;
- tok->multi_line_start = tok->buf + cur_multi_line_start;
tok->line_start = tok->cur;
tok->inp = tok->buf + curvalid;
tok->end = tok->buf + newsize;
@@ -985,8 +1000,7 @@ tok_nextc(struct tok_state *tok)
return EOF;
/* Last line does not end in \n,
fake one */
- if (tok->inp[-1] != '\n')
- strcpy(tok->inp, "\n");
+ strcpy(tok->inp, "\n");
}
tok->inp = strchr(tok->inp, '\0');
done = tok->inp[-1] == '\n';
@@ -1018,7 +1032,7 @@ tok_nextc(struct tok_state *tok)
/* Back-up one character */
static void
-tok_backup(struct tok_state *tok, int c)
+tok_backup(register struct tok_state *tok, register int c)
{
if (c != EOF) {
if (--tok->cur < tok->buf)
@@ -1029,88 +1043,185 @@ tok_backup(struct tok_state *tok, int c)
}
-static int
-syntaxerror(struct tok_state *tok, const char *format, ...)
-{
- va_list vargs;
-#ifdef HAVE_STDARG_PROTOTYPES
- va_start(vargs, format);
-#else
- va_start(vargs);
-#endif
- PyErr_FormatV(PyExc_SyntaxError, format, vargs);
- va_end(vargs);
- PyErr_SyntaxLocationObject(tok->filename,
- tok->lineno,
- (int)(tok->cur - tok->line_start));
- tok->done = E_ERROR;
- return ERRORTOKEN;
-}
+/* Return the token corresponding to a single character */
-static int
-indenterror(struct tok_state *tok)
+int
+PyToken_OneChar(int c)
{
- tok->done = E_TABSPACE;
- tok->cur = tok->inp;
- return ERRORTOKEN;
+ switch (c) {
+ case '(': return LPAR;
+ case ')': return RPAR;
+ case '[': return LSQB;
+ case ']': return RSQB;
+ case ':': return COLON;
+ case ',': return COMMA;
+ case ';': return SEMI;
+ case '+': return PLUS;
+ case '-': return MINUS;
+ case '*': return STAR;
+ case '/': return SLASH;
+ case '|': return VBAR;
+ case '&': return AMPER;
+ case '<': return LESS;
+ case '>': return GREATER;
+ case '=': return EQUAL;
+ case '.': return DOT;
+ case '%': return PERCENT;
+ case '`': return BACKQUOTE;
+ case '{': return LBRACE;
+ case '}': return RBRACE;
+ case '^': return CIRCUMFLEX;
+ case '~': return TILDE;
+ case '@': return AT;
+ default: return OP;
+ }
}
-/* Verify that the identifier follows PEP 3131.
- All identifier strings are guaranteed to be "ready" unicode objects.
- */
-static int
-verify_identifier(struct tok_state *tok)
+
+int
+PyToken_TwoChars(int c1, int c2)
{
- PyObject *s;
- int result;
- if (tok->decoding_erred)
- return 0;
- s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
- if (s == NULL) {
- if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
- PyErr_Clear();
- tok->done = E_IDENTIFIER;
- } else {
- tok->done = E_ERROR;
+ switch (c1) {
+ case '=':
+ switch (c2) {
+ case '=': return EQEQUAL;
}
- return 0;
+ break;
+ case '!':
+ switch (c2) {
+ case '=': return NOTEQUAL;
+ }
+ break;
+ case '<':
+ switch (c2) {
+ case '>': return NOTEQUAL;
+ case '=': return LESSEQUAL;
+ case '<': return LEFTSHIFT;
+ }
+ break;
+ case '>':
+ switch (c2) {
+ case '=': return GREATEREQUAL;
+ case '>': return RIGHTSHIFT;
+ }
+ break;
+ case '+':
+ switch (c2) {
+ case '=': return PLUSEQUAL;
+ }
+ break;
+ case '-':
+ switch (c2) {
+ case '=': return MINEQUAL;
+ }
+ break;
+ case '*':
+ switch (c2) {
+ case '*': return DOUBLESTAR;
+ case '=': return STAREQUAL;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/': return DOUBLESLASH;
+ case '=': return SLASHEQUAL;
+ }
+ break;
+ case '|':
+ switch (c2) {
+ case '=': return VBAREQUAL;
+ }
+ break;
+ case '%':
+ switch (c2) {
+ case '=': return PERCENTEQUAL;
+ }
+ break;
+ case '&':
+ switch (c2) {
+ case '=': return AMPEREQUAL;
+ }
+ break;
+ case '^':
+ switch (c2) {
+ case '=': return CIRCUMFLEXEQUAL;
+ }
+ break;
}
- result = PyUnicode_IsIdentifier(s);
- Py_DECREF(s);
- if (result == 0)
- tok->done = E_IDENTIFIER;
- return result;
+ return OP;
}
-static int
-tok_decimal_tail(struct tok_state *tok)
+int
+PyToken_ThreeChars(int c1, int c2, int c3)
{
- int c;
-
- while (1) {
- do {
- c = tok_nextc(tok);
- } while (isdigit(c));
- if (c != '_') {
+ switch (c1) {
+ case '<':
+ switch (c2) {
+ case '<':
+ switch (c3) {
+ case '=':
+ return LEFTSHIFTEQUAL;
+ }
break;
}
- c = tok_nextc(tok);
- if (!isdigit(c)) {
- tok_backup(tok, c);
- syntaxerror(tok, "invalid decimal literal");
- return 0;
+ break;
+ case '>':
+ switch (c2) {
+ case '>':
+ switch (c3) {
+ case '=':
+ return RIGHTSHIFTEQUAL;
+ }
+ break;
+ }
+ break;
+ case '*':
+ switch (c2) {
+ case '*':
+ switch (c3) {
+ case '=':
+ return DOUBLESTAREQUAL;
+ }
+ break;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/':
+ switch (c3) {
+ case '=':
+ return DOUBLESLASHEQUAL;
+ }
+ break;
}
+ break;
+ }
+ return OP;
+}
+
+static int
+indenterror(struct tok_state *tok)
+{
+ if (tok->alterror) {
+ tok->done = E_TABSPACE;
+ tok->cur = tok->inp;
+ return 1;
+ }
+ if (tok->altwarning) {
+ PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
+ "in indentation\n", tok->filename);
+ tok->altwarning = 0;
}
- return c;
+ return 0;
}
/* Get next token, after space stripping etc. */
static int
-tok_get(struct tok_state *tok, char **p_start, char **p_end)
+tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
- int c;
- int blankline, nonascii;
+ register int c;
+ int blankline;
*p_start = *p_end = NULL;
nextline:
@@ -1119,24 +1230,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
/* Get indentation level */
if (tok->atbol) {
- int col = 0;
- int altcol = 0;
+ register int col = 0;
+ register int altcol = 0;
tok->atbol = 0;
for (;;) {
c = tok_nextc(tok);
- if (c == ' ') {
+ if (c == ' ')
col++, altcol++;
- }
else if (c == '\t') {
- col = (col / tok->tabsize + 1) * tok->tabsize;
- altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
+ col = (col/tok->tabsize + 1) * tok->tabsize;
+ altcol = (altcol/tok->alttabsize + 1)
+ * tok->alttabsize;
}
- else if (c == '\014') {/* Control-L (formfeed) */
+ else if (c == '\014') /* Control-L (formfeed) */
col = altcol = 0; /* For Emacs users */
- }
- else {
+ else
break;
- }
}
tok_backup(tok, c);
if (c == '#' || c == '\n') {
@@ -1145,18 +1254,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
not passed to the parser as NEWLINE tokens,
except *totally* empty lines in interactive
mode, which signal the end of a command group. */
- if (col == 0 && c == '\n' && tok->prompt != NULL) {
+ if (col == 0 && c == '\n' && tok->prompt != NULL)
blankline = 0; /* Let it through */
- }
- else if (tok->prompt != NULL && tok->lineno == 1) {
- /* In interactive mode, if the first line contains
- only spaces and/or a comment, let it through. */
- blankline = 0;
- col = altcol = 0;
- }
- else {
+ else
blankline = 1; /* Ignore completely */
- }
/* We can't jump back right here since we still
may need to skip to the end of a comment */
}
@@ -1164,7 +1265,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ if (indenterror(tok))
+ return ERRORTOKEN;
}
}
else if (col > tok->indstack[tok->indent]) {
@@ -1175,7 +1277,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
return ERRORTOKEN;
}
if (altcol <= tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ if (indenterror(tok))
+ return ERRORTOKEN;
}
tok->pendin++;
tok->indstack[++tok->indent] = col;
@@ -1194,7 +1297,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
return ERRORTOKEN;
}
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ if (indenterror(tok))
+ return ERRORTOKEN;
}
}
}
@@ -1214,31 +1318,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
}
- /* Peek ahead at the next character */
- c = tok_nextc(tok);
- tok_backup(tok, c);
- /* Check if we are closing an async function */
- if (tok->async_def
- && !blankline
- /* Due to some implementation artifacts of type comments,
- * a TYPE_COMMENT at the start of a function won't set an
- * indentation level and it will produce a NEWLINE after it.
- * To avoid spuriously ending an async function due to this,
- * wait until we have some non-newline char in front of us. */
- && c != '\n'
- && tok->level == 0
- /* There was a NEWLINE after ASYNC DEF,
- so we're past the signature. */
- && tok->async_def_nl
- /* Current indentation level is less than where
- the async function was defined */
- && tok->async_def_indent >= tok->indent)
- {
- tok->async_def = 0;
- tok->async_def_indent = 0;
- tok->async_def_nl = 0;
- }
-
again:
tok->start = NULL;
/* Skip spaces */
@@ -1249,63 +1328,40 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
/* Set start of current token */
tok->start = tok->cur - 1;
- /* Skip comment, unless it's a type comment */
+ /* Skip comment, while looking for tab-setting magic */
if (c == '#') {
- const char *prefix, *p, *type_start;
-
- while (c != EOF && c != '\n') {
- c = tok_nextc(tok);
- }
-
- if (tok->type_comments) {
- p = tok->start;
- prefix = type_comment_prefix;
- while (*prefix && p < tok->cur) {
- if (*prefix == ' ') {
- while (*p == ' ' || *p == '\t') {
- p++;
- }
- } else if (*prefix == *p) {
- p++;
- } else {
- break;
- }
-
- prefix++;
- }
-
- /* This is a type comment if we matched all of type_comment_prefix. */
- if (!*prefix) {
- int is_type_ignore = 1;
- const char *ignore_end = p + 6;
- tok_backup(tok, c); /* don't eat the newline or EOF */
-
- type_start = p;
-
- /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
- * or anything ASCII and non-alphanumeric. */
- is_type_ignore = (
- tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
- && !(tok->cur > ignore_end
- && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
-
- if (is_type_ignore) {
- *p_start = (char *) ignore_end;
- *p_end = tok->cur;
-
- /* If this type ignore is the only thing on the line, consume the newline also. */
- if (blankline) {
- tok_nextc(tok);
- tok->atbol = 1;
- }
- return TYPE_IGNORE;
- } else {
- *p_start = (char *) type_start; /* after type_comment_prefix */
- *p_end = tok->cur;
- return TYPE_COMMENT;
+ static char *tabforms[] = {
+ "tab-width:", /* Emacs */
+ ":tabstop=", /* vim, full form */
+ ":ts=", /* vim, abbreviated form */
+ "set tabsize=", /* will vi never die? */
+ /* more templates can be added here to support other editors */
+ };
+ char cbuf[80];
+ char *tp, **cp;
+ tp = cbuf;
+ do {
+ *tp++ = c = tok_nextc(tok);
+ } while (c != EOF && c != '\n' &&
+ (size_t)(tp - cbuf + 1) < sizeof(cbuf));
+ *tp = '\0';
+ for (cp = tabforms;
+ cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
+ cp++) {
+ if ((tp = strstr(cbuf, *cp))) {
+ int newsize = atoi(tp + strlen(*cp));
+
+ if (newsize >= 1 && newsize <= 40) {
+ tok->tabsize = newsize;
+ if (Py_VerboseFlag)
+ PySys_WriteStderr(
+ "Tab size set to %d\n",
+ newsize);
}
}
}
+ while (c != EOF && c != '\n')
+ c = tok_nextc(tok);
}
/* Check for EOF and errors now */
@@ -1314,108 +1370,49 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
/* Identifier (most frequent token!) */
- nonascii = 0;
- if (is_potential_identifier_start(c)) {
- /* Process the various legal combinations of b"", r"", u"", and f"". */
- int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
- while (1) {
- if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
- saw_b = 1;
- /* Since this is a backwards compatibility support literal we don't
- want to support it in arbitrary order like byte literals. */
- else if (!(saw_b || saw_u || saw_r || saw_f)
- && (c == 'u'|| c == 'U')) {
- saw_u = 1;
- }
- /* ur"" and ru"" are not supported */
- else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
- saw_r = 1;
- }
- else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
- saw_f = 1;
- }
- else {
- break;
- }
+ if (Py_ISALPHA(c) || c == '_') {
+ /* Process r"", u"" and ur"" */
+ switch (c) {
+ case 'b':
+ case 'B':
c = tok_nextc(tok);
- if (c == '"' || c == '\'') {
+ if (c == 'r' || c == 'R')
+ c = tok_nextc(tok);
+ if (c == '"' || c == '\'')
goto letter_quote;
- }
+ break;
+ case 'r':
+ case 'R':
+ c = tok_nextc(tok);
+ if (c == '"' || c == '\'')
+ goto letter_quote;
+ break;
+ case 'u':
+ case 'U':
+ c = tok_nextc(tok);
+ if (c == 'r' || c == 'R')
+ c = tok_nextc(tok);
+ if (c == '"' || c == '\'')
+ goto letter_quote;
+ break;
}
- while (is_potential_identifier_char(c)) {
- if (c >= 128) {
- nonascii = 1;
- }
+ while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
c = tok_nextc(tok);
}
tok_backup(tok, c);
- if (nonascii && !verify_identifier(tok)) {
- return ERRORTOKEN;
- }
*p_start = tok->start;
*p_end = tok->cur;
-
- /* async/await parsing block. */
- if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
- /* May be an 'async' or 'await' token. For Python 3.7 or
- later we recognize them unconditionally. For Python
- 3.5 or 3.6 we recognize 'async' in front of 'def', and
- either one inside of 'async def'. (Technically we
- shouldn't recognize these at all for 3.4 or earlier,
- but there's no *valid* Python 3.4 code that would be
- rejected, and async functions will be rejected in a
- later phase.) */
- if (!tok->async_hacks || tok->async_def) {
- /* Always recognize the keywords. */
- if (memcmp(tok->start, "async", 5) == 0) {
- return ASYNC;
- }
- if (memcmp(tok->start, "await", 5) == 0) {
- return AWAIT;
- }
- }
- else if (memcmp(tok->start, "async", 5) == 0) {
- /* The current token is 'async'.
- Look ahead one token to see if that is 'def'. */
-
- struct tok_state ahead_tok;
- char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
- int ahead_tok_kind;
-
- memcpy(&ahead_tok, tok, sizeof(ahead_tok));
- ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_tok_end);
-
- if (ahead_tok_kind == NAME
- && ahead_tok.cur - ahead_tok.start == 3
- && memcmp(ahead_tok.start, "def", 3) == 0)
- {
- /* The next token is going to be 'def', so instead of
- returning a plain NAME token, return ASYNC. */
- tok->async_def_indent = tok->indent;
- tok->async_def = 1;
- return ASYNC;
- }
- }
- }
-
return NAME;
}
/* Newline */
if (c == '\n') {
tok->atbol = 1;
- if (blankline || tok->level > 0) {
+ if (blankline || tok->level > 0)
goto nextline;
- }
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
- if (tok->async_def) {
- /* We're somewhere inside an 'async def' function, and
- we've encountered a NEWLINE after its signature. */
- tok->async_def_nl = 1;
- }
return NEWLINE;
}
@@ -1424,24 +1421,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c = tok_nextc(tok);
if (isdigit(c)) {
goto fraction;
- } else if (c == '.') {
- c = tok_nextc(tok);
- if (c == '.') {
- *p_start = tok->start;
- *p_end = tok->cur;
- return ELLIPSIS;
- }
- else {
- tok_backup(tok, c);
- }
- tok_backup(tok, '.');
}
else {
tok_backup(tok, c);
+ *p_start = tok->start;
+ *p_end = tok->cur;
+ return DOT;
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return DOT;
}
/* Number */
@@ -1449,136 +1435,94 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (c == '0') {
/* Hex, octal or binary -- maybe. */
c = tok_nextc(tok);
+ if (c == '.')
+ goto fraction;
+#ifndef WITHOUT_COMPLEX
+ if (c == 'j' || c == 'J')
+ goto imaginary;
+#endif
if (c == 'x' || c == 'X') {
+
/* Hex */
c = tok_nextc(tok);
+ if (!isxdigit(c)) {
+ tok->done = E_TOKEN;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
+ }
do {
- if (c == '_') {
- c = tok_nextc(tok);
- }
- if (!isxdigit(c)) {
- tok_backup(tok, c);
- return syntaxerror(tok, "invalid hexadecimal literal");
- }
- do {
- c = tok_nextc(tok);
- } while (isxdigit(c));
- } while (c == '_');
+ c = tok_nextc(tok);
+ } while (isxdigit(c));
}
else if (c == 'o' || c == 'O') {
/* Octal */
c = tok_nextc(tok);
- do {
- if (c == '_') {
- c = tok_nextc(tok);
- }
- if (c < '0' || c >= '8') {
- tok_backup(tok, c);
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
- }
- else {
- return syntaxerror(tok, "invalid octal literal");
- }
- }
- do {
- c = tok_nextc(tok);
- } while ('0' <= c && c < '8');
- } while (c == '_');
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ if (c < '0' || c >= '8') {
+ tok->done = E_TOKEN;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
}
+ do {
+ c = tok_nextc(tok);
+ } while ('0' <= c && c < '8');
}
else if (c == 'b' || c == 'B') {
/* Binary */
c = tok_nextc(tok);
- do {
- if (c == '_') {
- c = tok_nextc(tok);
- }
- if (c != '0' && c != '1') {
- tok_backup(tok, c);
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
- }
- else {
- return syntaxerror(tok, "invalid binary literal");
- }
- }
- do {
- c = tok_nextc(tok);
- } while (c == '0' || c == '1');
- } while (c == '_');
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ if (c != '0' && c != '1') {
+ tok->done = E_TOKEN;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
}
+ do {
+ c = tok_nextc(tok);
+ } while (c == '0' || c == '1');
}
else {
- int nonzero = 0;
- /* maybe old-style octal; c is first char of it */
- /* in any case, allow '0' as a literal */
- while (1) {
- if (c == '_') {
- c = tok_nextc(tok);
- if (!isdigit(c)) {
- tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
- }
- }
- if (c != '0') {
- break;
- }
+ int found_decimal = 0;
+ /* Octal; c is first char of it */
+ /* There's no 'isoctdigit' macro, sigh */
+ while ('0' <= c && c < '8') {
c = tok_nextc(tok);
}
if (isdigit(c)) {
- nonzero = 1;
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
+ found_decimal = 1;
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
}
- if (c == '.') {
- c = tok_nextc(tok);
+ if (c == '.')
goto fraction;
- }
- else if (c == 'e' || c == 'E') {
+ else if (c == 'e' || c == 'E')
goto exponent;
- }
- else if (c == 'j' || c == 'J') {
+#ifndef WITHOUT_COMPLEX
+ else if (c == 'j' || c == 'J')
goto imaginary;
- }
- else if (nonzero) {
- /* Old-style octal: now disallowed. */
+#endif
+ else if (found_decimal) {
+ tok->done = E_TOKEN;
tok_backup(tok, c);
- return syntaxerror(tok,
- "leading zeros in decimal integer "
- "literals are not permitted; "
- "use an 0o prefix for octal integers");
+ return ERRORTOKEN;
}
}
+ if (c == 'l' || c == 'L')
+ c = tok_nextc(tok);
}
else {
/* Decimal */
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
- {
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
+ if (c == 'l' || c == 'L')
+ c = tok_nextc(tok);
+ else {
/* Accept floating point numbers. */
if (c == '.') {
- c = tok_nextc(tok);
fraction:
/* Fraction */
- if (isdigit(c)) {
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
- }
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
}
if (c == 'e' || c == 'E') {
int e;
@@ -1589,8 +1533,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (c == '+' || c == '-') {
c = tok_nextc(tok);
if (!isdigit(c)) {
+ tok->done = E_TOKEN;
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return ERRORTOKEN;
}
} else if (!isdigit(c)) {
tok_backup(tok, c);
@@ -1599,16 +1544,16 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_end = tok->cur;
return NUMBER;
}
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
}
- if (c == 'j' || c == 'J') {
+#ifndef WITHOUT_COMPLEX
+ if (c == 'j' || c == 'J')
/* Imaginary part */
imaginary:
c = tok_nextc(tok);
- }
+#endif
}
}
tok_backup(tok, c);
@@ -1620,61 +1565,55 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
letter_quote:
/* String */
if (c == '\'' || c == '"') {
+ Py_ssize_t quote2 = tok->cur - tok->start + 1;
int quote = c;
- int quote_size = 1; /* 1 or 3 */
- int end_quote_size = 0;
-
- /* Nodes of type STRING, especially multi line strings
- must be handled differently in order to get both
- the starting line number and the column offset right.
- (cf. issue 16806) */
- tok->first_lineno = tok->lineno;
- tok->multi_line_start = tok->line_start;
-
- /* Find the quote size and start of string */
- c = tok_nextc(tok);
- if (c == quote) {
- c = tok_nextc(tok);
- if (c == quote) {
- quote_size = 3;
- }
- else {
- end_quote_size = 1; /* empty string found */
- }
- }
- if (c != quote) {
- tok_backup(tok, c);
- }
-
- /* Get rest of string */
- while (end_quote_size != quote_size) {
+ int triple = 0;
+ int tripcount = 0;
+ for (;;) {
c = tok_nextc(tok);
- if (c == EOF) {
- if (quote_size == 3) {
- tok->done = E_EOFS;
- }
- else {
+ if (c == '\n') {
+ if (!triple) {
tok->done = E_EOLS;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
}
- tok->cur = tok->inp;
- return ERRORTOKEN;
+ tripcount = 0;
+ tok->cont_line = 1; /* multiline string. */
}
- if (quote_size == 1 && c == '\n') {
- tok->done = E_EOLS;
+ else if (c == EOF) {
+ if (triple)
+ tok->done = E_EOFS;
+ else
+ tok->done = E_EOLS;
tok->cur = tok->inp;
return ERRORTOKEN;
}
- if (c == quote) {
- end_quote_size += 1;
+ else if (c == quote) {
+ tripcount++;
+ if (tok->cur - tok->start == quote2) {
+ c = tok_nextc(tok);
+ if (c == quote) {
+ triple = 1;
+ tripcount = 0;
+ continue;
+ }
+ tok_backup(tok, c);
+ }
+ if (!triple || tripcount == 3)
+ break;
}
- else {
- end_quote_size = 0;
- if (c == '\\') {
- tok_nextc(tok); /* skip escaped char */
+ else if (c == '\\') {
+ tripcount = 0;
+ c = tok_nextc(tok);
+ if (c == EOF) {
+ tok->done = E_EOLS;
+ tok->cur = tok->inp;
+ return ERRORTOKEN;
}
}
+ else
+ tripcount = 0;
}
-
*p_start = tok->start;
*p_end = tok->cur;
return STRING;
@@ -1688,14 +1627,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
tok->cur = tok->inp;
return ERRORTOKEN;
}
- c = tok_nextc(tok);
- if (c == EOF) {
- tok->done = E_EOF;
- tok->cur = tok->inp;
- return ERRORTOKEN;
- } else {
- tok_backup(tok, c);
- }
tok->cont_line = 1;
goto again; /* Read next line */
}
@@ -1704,13 +1635,24 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
int c2 = tok_nextc(tok);
int token = PyToken_TwoChars(c, c2);
+#ifndef PGEN
+ if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
+ if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
+ "<> not supported in 3.x; use !=",
+ tok->filename, tok->lineno,
+ NULL, NULL)) {
+ tok->done = E_ERROR;
+ tok->cur = tok->inp;
+ return ERRORTOKEN;
+ }
+ }
+#endif
if (token != OP) {
int c3 = tok_nextc(tok);
int token3 = PyToken_ThreeChars(c, c2, c3);
if (token3 != OP) {
token = token3;
- }
- else {
+ } else {
tok_backup(tok, c3);
}
*p_start = tok->start;
@@ -1725,38 +1667,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
case '(':
case '[':
case '{':
- if (tok->level >= MAXLEVEL) {
- return syntaxerror(tok, "too many nested parentheses");
- }
- tok->parenstack[tok->level] = c;
- tok->parenlinenostack[tok->level] = tok->lineno;
tok->level++;
break;
case ')':
case ']':
case '}':
- if (!tok->level) {
- return syntaxerror(tok, "unmatched '%c'", c);
- }
tok->level--;
- int opening = tok->parenstack[tok->level];
- if (!((opening == '(' && c == ')') ||
- (opening == '[' && c == ']') ||
- (opening == '{' && c == '}')))
- {
- if (tok->parenlinenostack[tok->level] != tok->lineno) {
- return syntaxerror(tok,
- "closing parenthesis '%c' does not match "
- "opening parenthesis '%c' on line %d",
- c, opening, tok->parenlinenostack[tok->level]);
- }
- else {
- return syntaxerror(tok,
- "closing parenthesis '%c' does not match "
- "opening parenthesis '%c'",
- c, opening);
- }
- }
break;
}
@@ -1770,6 +1686,11 @@ int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
int result = tok_get(tok, p_start, p_end);
+ if (tok->fp && ferror(tok->fp)) {
+ clearerr(tok->fp);
+ result = ERRORTOKEN;
+ tok->done = E_IO;
+ }
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
@@ -1777,67 +1698,67 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
return result;
}
-/* Get the encoding of a Python file. Check for the coding cookie and check if
- the file starts with a BOM.
-
- PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
- encoding in the first or second line of the file (in which case the encoding
- should be assumed to be UTF-8).
-
- The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
- by the caller. */
+/* This function is only called from parsetok. However, it cannot live
+ there, as it must be empty for PGEN, and we can check for PGEN only
+ in this file. */
-char *
-PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
+#if defined(PGEN) || !defined(Py_USING_UNICODE)
+char*
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
- struct tok_state *tok;
- FILE *fp;
- char *p_start =NULL , *p_end =NULL , *encoding = NULL;
-
- fd = _Py_dup(fd);
- if (fd < 0) {
- return NULL;
- }
-
- fp = fdopen(fd, "r");
- if (fp == NULL) {
- return NULL;
- }
- tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
- if (tok == NULL) {
- fclose(fp);
- return NULL;
- }
- if (filename != NULL) {
- Py_INCREF(filename);
- tok->filename = filename;
- }
- else {
- tok->filename = PyUnicode_FromString("<string>");
- if (tok->filename == NULL) {
- fclose(fp);
- PyTokenizer_Free(tok);
- return encoding;
- }
- }
- while (tok->lineno < 2 && tok->done == E_OK) {
- PyTokenizer_Get(tok, &p_start, &p_end);
+ return NULL;
+}
+#else
+#ifdef Py_USING_UNICODE
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {
+ PyObject *ret = NULL;
+ PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
+ if (unicode_text) {
+ ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
+ Py_DECREF(unicode_text);
}
- fclose(fp);
- if (tok->encoding) {
- encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
- if (encoding)
- strcpy(encoding, tok->encoding);
+ if (!ret) {
+ PyErr_Clear();
}
- PyTokenizer_Free(tok);
- return encoding;
+ return ret;
}
-
char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
- return PyTokenizer_FindEncodingFilename(fd, NULL);
+ char *text = NULL;
+ if (tok->encoding) {
+        /* convert source to original encoding */
+ PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+ if (lineobj != NULL) {
+ int linelen = PyString_Size(lineobj);
+ const char *line = PyString_AsString(lineobj);
+ text = PyObject_MALLOC(linelen + 1);
+ if (text != NULL && line != NULL) {
+ if (linelen)
+ strncpy(text, line, linelen);
+ text[linelen] = '\0';
+ }
+ Py_DECREF(lineobj);
+
+ /* adjust error offset */
+ if (*offset > 1) {
+ PyObject *offsetobj = dec_utf8(tok->encoding,
+ tok->buf, *offset-1);
+ if (offsetobj) {
+ *offset = PyString_Size(offsetobj) + 1;
+ Py_DECREF(offsetobj);
+ }
+ }
+
+ }
+ }
+ return text;
+
}
+#endif /* defined(Py_USING_UNICODE) */
+#endif
+
#ifdef Py_DEBUG
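The hunk above also brings back the tokenizer's "tab-setting magic": when tok_get() skips a comment it copies up to ~80 characters of it into a buffer and scans for editor directives (Emacs' "tab-width:", vim's ":tabstop="/":ts=", vi's "set tabsize="), updating tok->tabsize when a value in the range 1..40 is found. A minimal standalone sketch of that scan follows; the helper name detect_tab_setting is hypothetical and not part of the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the tab width requested by an editor directive in a comment
   line, clamped to 1..40, or 0 if no recognized directive is present.
   Mirrors the tabforms scan restored in tok_get() above. */
static int
detect_tab_setting(const char *comment)
{
    static const char *tabforms[] = {
        "tab-width:",       /* Emacs */
        ":tabstop=",        /* vim, full form */
        ":ts=",             /* vim, abbreviated form */
        "set tabsize=",     /* vi */
    };
    size_t i;
    for (i = 0; i < sizeof(tabforms) / sizeof(tabforms[0]); i++) {
        const char *tp = strstr(comment, tabforms[i]);
        if (tp != NULL) {
            int newsize = atoi(tp + strlen(tabforms[i]));
            if (newsize >= 1 && newsize <= 40)
                return newsize;
        }
    }
    return 0;   /* no recognized directive */
}

int
main(void)
{
    printf("%d\n", detect_tab_setting("# -*- tab-width: 4 -*-"));  /* 4 */
    printf("%d\n", detect_tab_setting("# vim:ts=8"));              /* 8 */
    printf("%d\n", detect_tab_setting("# plain comment"));         /* 0 */
    return 0;
}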
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 92669bf..f15e252 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -11,13 +11,6 @@ extern "C" {
#include "token.h" /* For token types */
#define MAXINDENT 100 /* Max indentation level */
-#define MAXLEVEL 200 /* Max parentheses level */
-
-enum decoding_state {
- STATE_INIT,
- STATE_RAW,
- STATE_NORMAL /* have a codec associated with input */
-};
/* Tokenizer state */
struct tok_state {
@@ -36,51 +29,40 @@ struct tok_state {
int indstack[MAXINDENT]; /* Stack of indents */
int atbol; /* Nonzero if at begin of new line */
int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
- const char *prompt, *nextprompt; /* For interactive prompting */
+ char *prompt, *nextprompt; /* For interactive prompting */
int lineno; /* Current line number */
- int first_lineno; /* First line of a single line or multi line string
- expression (cf. issue 16806) */
int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */
- char parenstack[MAXLEVEL];
- int parenlinenostack[MAXLEVEL];
- PyObject *filename;
/* Stuff for checking on different tab sizes */
+ const char *filename; /* For error messages */
+ int altwarning; /* Issue warning if alternate tabs don't match */
+ int alterror; /* Issue error if alternate tabs don't match */
+ int alttabsize; /* Alternate tab spacing */
int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */
- enum decoding_state decoding_state;
+ int decoding_state; /* -1:decoding, 0:init, 1:raw */
int decoding_erred; /* whether erred in decoding */
int read_coding_spec; /* whether 'coding:...' has been read */
- char *encoding; /* Source encoding. */
+ char *encoding;
int cont_line; /* whether we are in a continuation line. */
const char* line_start; /* pointer to start of current line */
- const char* multi_line_start; /* pointer to start of first line of
- a single line or multi line string
- expression (cf. issue 16806) */
- PyObject *decoding_readline; /* open(...).readline */
+#ifndef PGEN
+ PyObject *decoding_readline; /* codecs.open(...).readline */
PyObject *decoding_buffer;
- const char* enc; /* Encoding for the current str. */
+#endif
+ const char* enc;
const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */
-
- int type_comments; /* Whether to look for type comments */
-
- /* async/await related fields (still needed depending on feature_version) */
- int async_hacks; /* =1 if async/await aren't always keywords */
- int async_def; /* =1 if tokens are inside an 'async def' body. */
- int async_def_indent; /* Indentation level of the outermost 'async def'. */
- int async_def_nl; /* =1 if the outermost 'async def' had at least one
- NEWLINE token after it. */
};
extern struct tok_state *PyTokenizer_FromString(const char *, int);
-extern struct tok_state *PyTokenizer_FromUTF8(const char *, int);
-extern struct tok_state *PyTokenizer_FromFile(FILE *, const char*,
- const char *, const char *);
+extern struct tok_state *PyTokenizer_FromFile(FILE *, char *, char *);
extern void PyTokenizer_Free(struct tok_state *);
extern int PyTokenizer_Get(struct tok_state *, char **, char **);
-
-#define tok_dump _Py_tok_dump
+#if defined(PGEN) || defined(Py_USING_UNICODE)
+extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
+ int len, int *offset);
+#endif
#ifdef __cplusplus
}
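After this change the header exposes just the constructors, PyTokenizer_Get() and PyTokenizer_Free(). Assuming the prototypes shown above and the token constants from token.h, a caller inside the interpreter drives the tokenizer roughly as in the sketch below; this is internal, non-installed API, so treat it as an illustration of the calling convention rather than a supported interface, and note that dump_tokens is a hypothetical name.

#include <stdio.h>
#include "Python.h"
#include "tokenizer.h"      /* also pulls in token.h for ENDMARKER etc. */

/* Hypothetical helper: tokenize a source string and print each token.
   The second PyTokenizer_FromString argument follows the prototype in
   the header above; some trees declare it with a single argument. */
static int
dump_tokens(const char *source)
{
    struct tok_state *tok = PyTokenizer_FromString(source, 0);
    if (tok == NULL)
        return -1;                              /* out of memory */
    for (;;) {
        char *start = NULL, *end = NULL;
        int type = PyTokenizer_Get(tok, &start, &end);
        if (type == ERRORTOKEN || type == ENDMARKER)
            break;
        /* INDENT/DEDENT carry no text, so start may stay NULL */
        printf("token %3d: %.*s\n", type,
               start ? (int)(end - start) : 0, start ? start : "");
    }
    PyTokenizer_Free(tok);
    return 0;
}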
diff --git a/Parser/tokenizer_pgen.c b/Parser/tokenizer_pgen.c
new file mode 100644
index 0000000..9cb8492
--- /dev/null
+++ b/Parser/tokenizer_pgen.c
@@ -0,0 +1,2 @@
+#define PGEN
+#include "tokenizer.c"
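The new tokenizer_pgen.c is the whole trick: the parser generator gets its own copy of the tokenizer simply by defining PGEN and then textually including tokenizer.c, so every #ifndef PGEN block (the codec-based decoding, the <> deprecation warning, the real PyTokenizer_RestoreEncoding) drops out of that build. The same pattern in miniature, with hypothetical file names that are not part of the patch:

/* widget.c -- single implementation, configured by a macro */
#ifdef SMALL
#define GREETING "small build"
#else
#define GREETING "full build"
#endif
const char *widget_greeting(void) { return GREETING; }

/* widget_small.c -- the entire file, mirroring tokenizer_pgen.c */
#define SMALL
#include "widget.c"

Just as pgen and the interpreter proper are separate binaries, the two translation units above would go into separate programs, since each defines the same symbols.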