8 files changed, 104 insertions, 54 deletions
diff --git a/Parser/Python.asdl b/Parser/Python.asdl
index 9407b2f..8e2e1ac 100644
--- a/Parser/Python.asdl
+++ b/Parser/Python.asdl
@@ -1,6 +1,6 @@
 -- ASDL's four builtin types are identifier, int, string, object
 
-module Python version "$Revision$"
+module Python
 {
 	mod = Module(stmt* body)
 	    | Interactive(stmt* body)
diff --git a/Parser/asdl.py b/Parser/asdl.py
index 7b4e2dc..c63dfa7 100644
--- a/Parser/asdl.py
+++ b/Parser/asdl.py
@@ -114,28 +114,20 @@ class ASDLParser(spark.GenericParser, object):
         raise ASDLSyntaxError(tok.lineno, tok)
 
     def p_module_0(self, info):
-        " module ::= Id Id version { } "
-        module, name, version, _0, _1 = info
+        " module ::= Id Id { } "
+        module, name, _0, _1 = info
         if module.value != "module":
             raise ASDLSyntaxError(module.lineno,
                                   msg="expected 'module', found %s" % module)
-        return Module(name, None, version)
+        return Module(name, None)
 
     def p_module(self, info):
-        " module ::= Id Id version { definitions } "
-        module, name, version, _0, definitions, _1 = info
+        " module ::= Id Id { definitions } "
+        module, name, _0, definitions, _1 = info
         if module.value != "module":
             raise ASDLSyntaxError(module.lineno,
                                   msg="expected 'module', found %s" % module)
-        return Module(name, definitions, version)
-
-    def p_version(self, info):
-        "version ::= Id String"
-        version, V = info
-        if version.value != "version":
-            raise ASDLSyntaxError(version.lineno,
-                                  msg="expected 'version', found %" % version)
-        return V
+        return Module(name, definitions)
 
     def p_definition_0(self, definition):
         " definitions ::= definition "
@@ -246,10 +238,9 @@ class AST(object):
     pass # a marker class
 
 class Module(AST):
-    def __init__(self, name, dfns, version):
+    def __init__(self, name, dfns):
         self.name = name
         self.dfns = dfns
-        self.version = version
         self.types = {} # maps type name to value (from dfns)
         for type in dfns:
             self.types[type.name.value] = type.value
diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py
index d6555d6..cdce5a3 100755
--- a/Parser/asdl_c.py
+++ b/Parser/asdl_c.py
@@ -5,6 +5,7 @@
 # handle fields that have a type but no name
 
 import os, sys
+import subprocess
 
 import asdl
 
@@ -882,9 +883,6 @@ static int add_ast_fields(void)
             self.emit("if (!%s_singleton) return 0;" % cons.name, 1)
 
 
-def parse_version(mod):
-    return mod.version.value[12:-3]
-
 class ASTModuleVisitor(PickleVisitor):
 
     def visitModule(self, mod):
@@ -904,7 +902,7 @@ class ASTModuleVisitor(PickleVisitor):
         self.emit("return NULL;", 2)
         # Value of version: "$Revision$"
         self.emit('if (PyModule_AddStringConstant(m, "__version__", "%s") < 0)'
-                % parse_version(mod), 1)
+                % (mod.version,), 1)
         self.emit("return NULL;", 2)
         for dfn in mod.dfns:
             self.visit(dfn)
@@ -1137,6 +1135,18 @@ c_file_msg = """
 
 """
 
+
+def get_file_revision(f):
+    """Return the short hg node hash of the last changeset touching f."""
+    args = ["hg", "log", "--template", "{node|short}", "--limit", "1", f]
+    p = subprocess.Popen(args, stdout=subprocess.PIPE)
+    out = p.communicate()[0]
+    if p.returncode:
+        print >> sys.stderr, "error return code from hg"
+        sys.exit(1)
+    return out
+
+
 def main(srcfile):
     argv0 = sys.argv[0]
     components = argv0.split(os.sep)
@@ -1145,6 +1155,7 @@ def main(srcfile):
     mod = asdl.parse(srcfile)
     if not asdl.check(mod):
         sys.exit(1)
+    mod.version = get_file_revision(srcfile)
     if INC_DIR:
         p = "%s/%s-ast.h" % (INC_DIR, mod.name)
         f = open(p, "w")
@@ -1164,7 +1175,7 @@ def main(srcfile):
         p = os.path.join(SRC_DIR, str(mod.name) + "-ast.c")
         f = open(p, "w")
         f.write(auto_gen_msg)
-        f.write(c_file_msg % parse_version(mod))
+        f.write(c_file_msg % (mod.version,))
         f.write('#include "Python.h"\n')
         f.write('#include "%s-ast.h"\n' % mod.name)
         f.write('\n')
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index 7636a54..eef650a 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -13,7 +13,7 @@
 
 /* Forward */
 static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int *);
-static void initerr(perrdetail *err_ret, const char* filename);
+static int initerr(perrdetail *err_ret, const char* filename);
 
 /* Parse input coming from a string.  Return error code, print some errors. */
 node *
@@ -48,7 +48,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
     struct tok_state *tok;
     int exec_input = start == file_input;
 
-    initerr(err_ret, filename);
+    if (initerr(err_ret, filename) < 0)
+        return NULL;
 
     if (*flags & PyPARSE_IGNORE_COOKIE)
         tok = PyTokenizer_FromUTF8(s, exec_input);
@@ -59,7 +60,10 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename,
         return NULL;
     }
 
-    tok->filename = filename ? filename : "<string>";
+#ifndef PGEN
+    Py_INCREF(err_ret->filename);
+    tok->filename = err_ret->filename;
+#endif
     return parsetok(tok, g, start, err_ret, flags);
 }
 
@@ -90,13 +94,17 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
 {
     struct tok_state *tok;
 
-    initerr(err_ret, filename);
+    if (initerr(err_ret, filename) < 0)
+        return NULL;
 
     if ((tok = PyTokenizer_FromFile(fp, (char *)enc, ps1, ps2)) == NULL) {
         err_ret->error = E_NOMEM;
         return NULL;
     }
-    tok->filename = filename;
+#ifndef PGEN
+    Py_INCREF(err_ret->filename);
+    tok->filename = err_ret->filename;
+#endif
     return parsetok(tok, g, start, err_ret, flags);
 }
 
@@ -127,7 +135,7 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
 {
     parser_state *ps;
     node *n;
-    int started = 0, handling_import = 0, handling_with = 0;
+    int started = 0;
 
     if ((ps = PyParser_New(g, start)) == NULL) {
         fprintf(stderr, "no mem for new parser\n");
@@ -154,7 +162,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
         }
         if (type == ENDMARKER && started) {
             type = NEWLINE; /* Add an extra newline */
-            handling_with = handling_import = 0;
             started = 0;
             /* Add the right number of dedent tokens,
                except if a certain flag is given --
@@ -268,14 +275,24 @@ done:
     return n;
 }
 
-static void
+static int
 initerr(perrdetail *err_ret, const char *filename)
 {
     err_ret->error = E_OK;
-    err_ret->filename = filename;
     err_ret->lineno = 0;
     err_ret->offset = 0;
     err_ret->text = NULL;
     err_ret->token = -1;
     err_ret->expected = -1;
+#ifndef PGEN
+    if (filename)
+        err_ret->filename = PyUnicode_DecodeFSDefault(filename);
+    else
+        err_ret->filename = PyUnicode_FromString("<string>");
+    if (err_ret->filename == NULL) {
+        err_ret->error = E_ERROR;
+        return -1;
+    }
+#endif
+    return 0;
 }
diff --git a/Parser/parsetok_pgen.c b/Parser/parsetok_pgen.c
new file mode 100644
index 0000000..97b9288
--- /dev/null
+++ b/Parser/parsetok_pgen.c
@@ -0,0 +1,2 @@
+#define PGEN
+#include "parsetok.c"
diff --git a/Parser/pgenmain.c b/Parser/pgenmain.c
index 4b7b55a..52b8380 100644
--- a/Parser/pgenmain.c
+++ b/Parser/pgenmain.c
@@ -29,6 +29,8 @@ int Py_IgnoreEnvironmentFlag;
 /* Forward */
 grammar *getgrammar(char *filename);
 
+void Py_Exit(int) _Py_NO_RETURN;
+
 void
 Py_Exit(int sts)
 {
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 3f6be2f..f4d7e3f 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -128,7 +128,6 @@ tok_new(void)
     tok->prompt = tok->nextprompt = NULL;
     tok->lineno = 0;
     tok->level = 0;
-    tok->filename = NULL;
     tok->altwarning = 1;
     tok->alterror = 1;
     tok->alttabsize = 1;
@@ -140,6 +139,7 @@ tok_new(void)
     tok->encoding = NULL;
     tok->cont_line = 0;
 #ifndef PGEN
+    tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
 #endif
@@ -545,7 +545,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
 {
     char *line = NULL;
     int badchar = 0;
-    PyObject *filename;
     for (;;) {
         if (tok->decoding_state == STATE_NORMAL) {
             /* We already have a codec associated with
@@ -586,16 +585,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
     if (badchar) {
         /* Need to add 1 to the line number, since this line
            has not been counted, yet.  */
-        filename = PyUnicode_DecodeFSDefault(tok->filename);
-        if (filename != NULL) {
-            PyErr_Format(PyExc_SyntaxError,
-                    "Non-UTF-8 code starting with '\\x%.2x' "
-                    "in file %U on line %i, "
-                    "but no encoding declared; "
-                    "see http://python.org/dev/peps/pep-0263/ for details",
-                    badchar, filename, tok->lineno + 1);
-            Py_DECREF(filename);
-        }
+        PyErr_Format(PyExc_SyntaxError,
+                "Non-UTF-8 code starting with '\\x%.2x' "
+                "in file %U on line %i, "
+                "but no encoding declared; "
+                "see http://python.org/dev/peps/pep-0263/ for details",
+                badchar, tok->filename, tok->lineno + 1);
         return error_ret(tok);
     }
 #endif
@@ -853,6 +848,7 @@ PyTokenizer_Free(struct tok_state *tok)
 #ifndef PGEN
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->filename);
 #endif
     if (tok->fp != NULL && tok->buf != NULL)
         PyMem_FREE(tok->buf);
@@ -1247,8 +1243,13 @@ indenterror(struct tok_state *tok)
         return 1;
     }
     if (tok->altwarning) {
-        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
+#ifdef PGEN
+        PySys_WriteStderr("inconsistent use of tabs and spaces "
+                          "in indentation\n");
+#else
+        PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
                           "in indentation\n", tok->filename);
+#endif
         tok->altwarning = 0;
     }
     return 0;
@@ -1689,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
     return result;
 }
 
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+   the file starts with a BOM.
 
-   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
-   the first or second line of the file (in which case the encoding
-   should be assumed to be PyUnicode_GetDefaultEncoding()).
+   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+   encoding in the first or second line of the file (in which case the encoding
+   should be assumed to be UTF-8).
+
+   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+   by the caller. */
 
-   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
-   by the caller.
-*/
 char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
 {
     struct tok_state *tok;
     FILE *fp;
@@ -1718,6 +1720,20 @@ PyTokenizer_FindEncoding(int fd)
         fclose(fp);
         return NULL;
     }
+#ifndef PGEN
+    if (filename != NULL) {
+        Py_INCREF(filename);
+        tok->filename = filename;
+    }
+    else {
+        tok->filename = PyUnicode_FromString("<string>");
+        if (tok->filename == NULL) {
+            fclose(fp);
+            PyTokenizer_Free(tok);
+            return encoding;
+        }
+    }
+#endif
     while (tok->lineno < 2 && tok->done == E_OK) {
         PyTokenizer_Get(tok, &p_start, &p_end);
     }
@@ -1731,6 +1747,12 @@ PyTokenizer_FindEncoding(int fd)
     return encoding;
 }
 
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+    return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
 #ifdef Py_DEBUG
 
 void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 2be3bf2..ed1f3aa 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -40,7 +40,13 @@ struct tok_state {
     int level;          /* () [] {} Parentheses nesting level */
             /* Used to allow free continuations inside them */
     /* Stuff for checking on different tab sizes */
-    const char *filename;   /* encoded to the filesystem encoding */
+#ifndef PGEN
+    /* pgen doesn't have access to Python codecs, so it cannot decode the
+       input filename. The bytes filename could be kept, but it is only used
+       by indenterror() and is not really needed: pgen only compiles one file
+       (Grammar/Grammar). */
+    PyObject *filename;
+#endif
     int altwarning;     /* Issue warning if alternate tabs don't match */
     int alterror;       /* Issue error if alternate tabs don't match */
     int alttabsize;     /* Alternate tab spacing */
@@ -69,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
 extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
                                           int len, int *offset);
-extern char * PyTokenizer_FindEncoding(int);
 
 #ifdef __cplusplus
 }