Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 1493
1 file changed, 707 insertions, 786 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f84093d..8966661 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2,6 +2,7 @@ /* Tokenizer implementation */ #include "Python.h" +#include "pgenheaders.h" #include <ctype.h> #include <assert.h> @@ -9,29 +10,16 @@ #include "tokenizer.h" #include "errcode.h" +#ifndef PGEN #include "unicodeobject.h" -#include "bytesobject.h" +#include "stringobject.h" #include "fileobject.h" #include "codecs.h" #include "abstract.h" +#include "pydebug.h" +#endif /* PGEN */ -/* Alternate tab spacing */ -#define ALTTABSIZE 1 - -#define is_potential_identifier_start(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || c == '_'\ - || (c >= 128)) - -#define is_potential_identifier_char(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || (c >= '0' && c <= '9')\ - || c == '_'\ - || (c >= 128)) - -extern char *PyOS_Readline(FILE *, FILE *, const char *); +extern char *PyOS_Readline(FILE *, FILE *, char *); /* Return malloc'ed string including trailing \n; empty malloc'ed string for EOF; NULL if interrupted */ @@ -44,10 +32,65 @@ static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); static void tok_backup(struct tok_state *tok, int c); - -/* Spaces in this constant are treated as "zero or more spaces or tabs" when - tokenizing. */ -static const char* type_comment_prefix = "# type: "; +/* Token names */ + +char *_PyParser_TokenNames[] = { + "ENDMARKER", + "NAME", + "NUMBER", + "STRING", + "NEWLINE", + "INDENT", + "DEDENT", + "LPAR", + "RPAR", + "LSQB", + "RSQB", + "COLON", + "COMMA", + "SEMI", + "PLUS", + "MINUS", + "STAR", + "SLASH", + "VBAR", + "AMPER", + "LESS", + "GREATER", + "EQUAL", + "DOT", + "PERCENT", + "BACKQUOTE", + "LBRACE", + "RBRACE", + "EQEQUAL", + "NOTEQUAL", + "LESSEQUAL", + "GREATEREQUAL", + "TILDE", + "CIRCUMFLEX", + "LEFTSHIFT", + "RIGHTSHIFT", + "DOUBLESTAR", + "PLUSEQUAL", + "MINEQUAL", + "STAREQUAL", + "SLASHEQUAL", + "PERCENTEQUAL", + "AMPEREQUAL", + "VBAREQUAL", + "CIRCUMFLEXEQUAL", + "LEFTSHIFTEQUAL", + "RIGHTSHIFTEQUAL", + "DOUBLESTAREQUAL", + "DOUBLESLASH", + "DOUBLESLASHEQUAL", + "AT", + /* This table must match the #defines in token.h! 
*/ + "OP", + "<ERRORTOKEN>", + "<N_TOKENS>" +}; /* Create and initialize a new tok_state structure */ @@ -65,45 +108,61 @@ tok_new(void) tok->tabsize = TABSIZE; tok->indent = 0; tok->indstack[0] = 0; - tok->atbol = 1; tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; tok->lineno = 0; tok->level = 0; + tok->filename = NULL; + tok->altwarning = 0; + tok->alterror = 0; + tok->alttabsize = 1; tok->altindstack[0] = 0; - tok->decoding_state = STATE_INIT; + tok->decoding_state = 0; tok->decoding_erred = 0; tok->read_coding_spec = 0; - tok->enc = NULL; tok->encoding = NULL; tok->cont_line = 0; - tok->filename = NULL; +#ifndef PGEN tok->decoding_readline = NULL; tok->decoding_buffer = NULL; - tok->type_comments = 0; - - tok->async_hacks = 0; - tok->async_def = 0; - tok->async_def_indent = 0; - tok->async_def_nl = 0; - +#endif return tok; } static char * -new_string(const char *s, Py_ssize_t len, struct tok_state *tok) +new_string(const char *s, Py_ssize_t len) { char* result = (char *)PyMem_MALLOC(len + 1); - if (!result) { - tok->done = E_NOMEM; - return NULL; + if (result != NULL) { + memcpy(result, s, len); + result[len] = '\0'; } - memcpy(result, s, len); - result[len] = '\0'; return result; } +#ifdef PGEN + +static char * +decoding_fgets(char *s, int size, struct tok_state *tok) +{ + return fgets(s, size, tok->fp); +} + +static int +decoding_feof(struct tok_state *tok) +{ + return feof(tok->fp); +} + +static char * +decode_str(const char *str, int exec_input, struct tok_state *tok) +{ + return new_string(str, strlen(str)); +} + +#else /* PGEN */ + static char * error_ret(struct tok_state *tok) /* XXX */ { @@ -116,8 +175,8 @@ error_ret(struct tok_state *tok) /* XXX */ } -static const char * -get_normal_name(const char *s) /* for utf-8 and latin-1 */ +static char * +get_normal_name(char *s) /* for utf-8 and latin-1 */ { char buf[13]; int i; @@ -147,18 +206,17 @@ get_normal_name(const char *s) /* for utf-8 and latin-1 */ /* Return the coding spec in S, or NULL if none is found. */ -static int -get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) +static char * +get_coding_spec(const char *s, Py_ssize_t size) { Py_ssize_t i; - *spec = NULL; /* Coding spec must be in a comment, and that comment must be * the only statement on the source code line. */ for (i = 0; i < size - 6; i++) { if (s[i] == '#') break; if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') - return 1; + return NULL; } for (; i < size - 6; i++) { /* XXX inefficient search */ const char* t = s + i; @@ -177,23 +235,20 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t t++; if (begin < t) { - char* r = new_string(begin, t - begin, tok); - const char* q; + char* r = new_string(begin, t - begin); + char* q; if (!r) - return 0; + return NULL; q = get_normal_name(r); if (r != q) { PyMem_FREE(r); - r = new_string(q, strlen(q), tok); - if (!r) - return 0; + r = new_string(q, strlen(q)); } - *spec = r; - break; + return r; } } } - return 1; + return NULL; } /* Check whether the line contains a coding spec. 
If it does, @@ -205,7 +260,7 @@ static int check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, int set_readline(struct tok_state *, const char *)) { - char *cs; + char * cs; int r = 1; if (tok->cont_line) { @@ -213,8 +268,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, tok->read_coding_spec = 1; return 1; } - if (!get_coding_spec(line, &cs, size, tok)) - return 0; + cs = get_coding_spec(line, size); if (!cs) { Py_ssize_t i; for (i = 0; i < size; i++) { @@ -227,31 +281,40 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, break; } } - return 1; - } - tok->read_coding_spec = 1; - if (tok->encoding == NULL) { - assert(tok->decoding_state == STATE_RAW); - if (strcmp(cs, "utf-8") == 0) { - tok->encoding = cs; - } else { - r = set_readline(tok, cs); - if (r) { + } else { + tok->read_coding_spec = 1; + if (tok->encoding == NULL) { + assert(tok->decoding_state == 1); /* raw */ + if (strcmp(cs, "utf-8") == 0 || + strcmp(cs, "iso-8859-1") == 0) { tok->encoding = cs; - tok->decoding_state = STATE_NORMAL; - } - else { - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s", cs); + } else { +#ifdef Py_USING_UNICODE + r = set_readline(tok, cs); + if (r) { + tok->encoding = cs; + tok->decoding_state = -1; + } + else { + PyErr_Format(PyExc_SyntaxError, + "encoding problem: %s", cs); + PyMem_FREE(cs); + } +#else + /* Without Unicode support, we cannot + process the coding spec. Since there + won't be any Unicode literals, that + won't matter. */ PyMem_FREE(cs); +#endif } + } else { /* then, compare cs with BOM */ + r = (strcmp(tok->encoding, cs) == 0); + if (!r) + PyErr_Format(PyExc_SyntaxError, + "encoding problem: %s with BOM", cs); + PyMem_FREE(cs); } - } else { /* then, compare cs with BOM */ - r = (strcmp(tok->encoding, cs) == 0); - if (!r) - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s with BOM", cs); - PyMem_FREE(cs); } return r; } @@ -268,7 +331,7 @@ check_bom(int get_char(struct tok_state *), { int ch1, ch2, ch3; ch1 = get_char(tok); - tok->decoding_state = STATE_RAW; + tok->decoding_state = 1; if (ch1 == EOF) { return 1; } else if (ch1 == 0xEF) { @@ -297,7 +360,7 @@ check_bom(int get_char(struct tok_state *), } if (!set_readline(tok, "utf-16-be")) return 0; - tok->decoding_state = STATE_NORMAL; + tok->decoding_state = -1; } else if (ch1 == 0xFF) { ch2 = get_char(tok); if (ch2 != 0xFE) { @@ -307,7 +370,7 @@ check_bom(int get_char(struct tok_state *), } if (!set_readline(tok, "utf-16-le")) return 0; - tok->decoding_state = STATE_NORMAL; + tok->decoding_state = -1; #endif } else { unget_char(ch1, tok); @@ -315,10 +378,7 @@ check_bom(int get_char(struct tok_state *), } if (tok->encoding != NULL) PyMem_FREE(tok->encoding); - tok->encoding = new_string("utf-8", 5, tok); - if (!tok->encoding) - return 0; - /* No need to set_readline: input is already utf-8 */ + tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ return 1; } @@ -329,7 +389,7 @@ check_bom(int get_char(struct tok_state *), 1) NULL: need to call tok->decoding_readline to get a new line 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and stored the result in tok->decoding_buffer - 3) PyByteArrayObject *: previous call to fp_readl did not have enough room + 3) PyStringObject *: previous call to fp_readl did not have enough room (in the s buffer) to copy entire contents of the line read by tok->decoding_readline. tok->decoding_buffer has the overflow. 
In this case, fp_readl is called in a loop (with an expanded buffer) @@ -340,62 +400,58 @@ check_bom(int get_char(struct tok_state *), static char * fp_readl(char *s, int size, struct tok_state *tok) { - PyObject* bufobj; - const char *buf; - Py_ssize_t buflen; +#ifndef Py_USING_UNICODE + /* In a non-Unicode built, this should never be called. */ + Py_FatalError("fp_readl should not be called in this build."); + return NULL; /* Keep compiler happy (not reachable) */ +#else + PyObject* utf8 = NULL; + PyObject* buf = tok->decoding_buffer; + char *str; + Py_ssize_t utf8len; /* Ask for one less byte so we can terminate it */ assert(size > 0); size--; - if (tok->decoding_buffer) { - bufobj = tok->decoding_buffer; - Py_INCREF(bufobj); - } - else - { - bufobj = _PyObject_CallNoArg(tok->decoding_readline); - if (bufobj == NULL) - goto error; - } - if (PyUnicode_CheckExact(bufobj)) - { - buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen); - if (buf == NULL) { - goto error; + if (buf == NULL) { + buf = PyObject_CallObject(tok->decoding_readline, NULL); + if (buf == NULL) + return error_ret(tok); + if (!PyUnicode_Check(buf)) { + Py_DECREF(buf); + PyErr_SetString(PyExc_SyntaxError, + "codec did not return a unicode object"); + return error_ret(tok); } + } else { + tok->decoding_buffer = NULL; + if (PyString_CheckExact(buf)) + utf8 = buf; } - else - { - buf = PyByteArray_AsString(bufobj); - if (buf == NULL) { - goto error; - } - buflen = PyByteArray_GET_SIZE(bufobj); + if (utf8 == NULL) { + utf8 = PyUnicode_AsUTF8String(buf); + Py_DECREF(buf); + if (utf8 == NULL) + return error_ret(tok); } - - Py_XDECREF(tok->decoding_buffer); - if (buflen > size) { - /* Too many chars, the rest goes into tok->decoding_buffer */ - tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size, - buflen-size); - if (tok->decoding_buffer == NULL) - goto error; - buflen = size; + str = PyString_AsString(utf8); + utf8len = PyString_GET_SIZE(utf8); + if (utf8len > size) { + tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); + if (tok->decoding_buffer == NULL) { + Py_DECREF(utf8); + return error_ret(tok); + } + utf8len = size; } - else - tok->decoding_buffer = NULL; - - memcpy(s, buf, buflen); - s[buflen] = '\0'; - if (buflen == 0) /* EOF */ - s = NULL; - Py_DECREF(bufobj); + memcpy(s, str, utf8len); + s[utf8len] = '\0'; + Py_DECREF(utf8); + if (utf8len == 0) + return NULL; /* EOF */ return s; - -error: - Py_XDECREF(bufobj); - return error_ret(tok); +#endif } /* Set the readline function for TOK to a StreamReader's @@ -411,48 +467,24 @@ error: static int fp_setreadl(struct tok_state *tok, const char* enc) { - PyObject *readline, *io, *stream; - _Py_IDENTIFIER(open); - _Py_IDENTIFIER(readline); - int fd; - long pos; - - fd = fileno(tok->fp); - /* Due to buffering the file offset for fd can be different from the file - * position of tok->fp. If tok->fp was opened in text mode on Windows, - * its file position counts CRLF as one char and can't be directly mapped - * to the file offset for fd. Instead we step back one byte and read to - * the end of line.*/ - pos = ftell(tok->fp); - if (pos == -1 || - lseek(fd, (off_t)(pos > 0 ? 
pos - 1 : pos), SEEK_SET) == (off_t)-1) { - PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); - return 0; - } + PyObject *reader, *stream, *readline; - io = PyImport_ImportModuleNoBlock("io"); - if (io == NULL) - return 0; - - stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO", - fd, "r", -1, enc, Py_None, Py_None, Py_False); - Py_DECREF(io); + /* XXX: constify filename argument. */ + stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); if (stream == NULL) return 0; - readline = _PyObject_GetAttrId(stream, &PyId_readline); + reader = PyCodec_StreamReader(enc, stream, NULL); Py_DECREF(stream); - if (readline == NULL) + if (reader == NULL) return 0; - Py_XSETREF(tok->decoding_readline, readline); - if (pos > 0) { - PyObject *bufobj = _PyObject_CallNoArg(readline); - if (bufobj == NULL) - return 0; - Py_DECREF(bufobj); - } + readline = PyObject_GetAttrString(reader, "readline"); + Py_DECREF(reader); + if (readline == NULL) + return 0; + tok->decoding_readline = readline; return 1; } @@ -468,34 +500,6 @@ static void fp_ungetc(int c, struct tok_state *tok) { ungetc(c, tok->fp); } -/* Check whether the characters at s start a valid - UTF-8 sequence. Return the number of characters forming - the sequence if yes, 0 if not. */ -static int valid_utf8(const unsigned char* s) -{ - int expected = 0; - int length; - if (*s < 0x80) - /* single-byte code */ - return 1; - if (*s < 0xc0) - /* following byte */ - return 0; - if (*s < 0xE0) - expected = 1; - else if (*s < 0xF0) - expected = 2; - else if (*s < 0xF8) - expected = 3; - else - return 0; - length = expected + 1; - for (; expected; expected--) - if (s[expected] < 0x80 || s[expected] >= 0xC0) - return 0; - return length; -} - /* Read a line of input from TOK. Determine encoding if necessary. */ @@ -505,12 +509,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok) char *line = NULL; int badchar = 0; for (;;) { - if (tok->decoding_state == STATE_NORMAL) { + if (tok->decoding_state < 0) { /* We already have a codec associated with this input. */ line = fp_readl(s, size, tok); break; - } else if (tok->decoding_state == STATE_RAW) { + } else if (tok->decoding_state > 0) { /* We want a 'raw' read. */ line = Py_UniversalNewlineFgets(s, size, tok->fp, NULL); @@ -521,7 +525,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok) reader functions from now on. */ if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) return error_ret(tok); - assert(tok->decoding_state != STATE_INIT); + assert(tok->decoding_state != 0); } } if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { @@ -529,40 +533,43 @@ decoding_fgets(char *s, int size, struct tok_state *tok) return error_ret(tok); } } - /* The default encoding is UTF-8, so make sure we don't have any - non-UTF-8 sequences in it. */ +#ifndef PGEN + /* The default encoding is ASCII, so make sure we don't have any + non-ASCII bytes in it. */ if (line && !tok->encoding) { unsigned char *c; - int length; - for (c = (unsigned char *)line; *c; c += length) - if (!(length = valid_utf8(c))) { + for (c = (unsigned char *)line; *c; c++) + if (*c > 127) { badchar = *c; break; } } if (badchar) { + char buf[500]; /* Need to add 1 to the line number, since this line has not been counted, yet. 
*/ - PyErr_Format(PyExc_SyntaxError, - "Non-UTF-8 code starting with '\\x%.2x' " - "in file %U on line %i, " - "but no encoding declared; " - "see http://python.org/dev/peps/pep-0263/ for details", - badchar, tok->filename, tok->lineno + 1); + sprintf(buf, + "Non-ASCII character '\\x%.2x' " + "in file %.200s on line %i, " + "but no encoding declared; " + "see http://python.org/dev/peps/pep-0263/ for details", + badchar, tok->filename, tok->lineno + 1); + PyErr_SetString(PyExc_SyntaxError, buf); return error_ret(tok); } +#endif return line; } static int decoding_feof(struct tok_state *tok) { - if (tok->decoding_state != STATE_NORMAL) { + if (tok->decoding_state >= 0) { return feof(tok->fp); } else { PyObject* buf = tok->decoding_buffer; if (buf == NULL) { - buf = _PyObject_CallNoArg(tok->decoding_readline); + buf = PyObject_CallObject(tok->decoding_readline, NULL); if (buf == NULL) { error_ret(tok); return 1; @@ -601,6 +608,7 @@ buf_setreadl(struct tok_state *tok, const char* enc) { /* Return a UTF-8 encoding Python string object from the C byte string STR, which is encoded with ENC. */ +#ifdef Py_USING_UNICODE static PyObject * translate_into_utf8(const char* str, const char* enc) { PyObject *utf8; @@ -611,12 +619,12 @@ translate_into_utf8(const char* str, const char* enc) { Py_DECREF(buf); return utf8; } +#endif static char * translate_newlines(const char *s, int exec_input, struct tok_state *tok) { - int skip_next_lf = 0; - size_t needed_length = strlen(s) + 2, final_length; + int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length; char *buf, *current; char c = '\0'; buf = PyMem_MALLOC(needed_length); @@ -680,12 +688,14 @@ decode_str(const char *input, int single, struct tok_state *tok) return error_ret(tok); str = tok->str; /* string after BOM if any */ assert(str); +#ifdef Py_USING_UNICODE if (tok->enc != NULL) { utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); - str = PyBytes_AsString(utf8); + str = PyString_AsString(utf8); } +#endif for (s = str;; s++) { if (*s == '\0') break; else if (*s == '\n') { @@ -707,18 +717,22 @@ decode_str(const char *input, int single, struct tok_state *tok) return error_ret(tok); } } +#ifdef Py_USING_UNICODE if (tok->enc != NULL) { assert(utf8 == NULL); utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); - str = PyBytes_AS_STRING(utf8); + str = PyString_AsString(utf8); } +#endif assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str; } +#endif /* PGEN */ + /* Set up tokenizer for string */ struct tok_state * @@ -727,7 +741,7 @@ PyTokenizer_FromString(const char *str, int exec_input) struct tok_state *tok = tok_new(); if (tok == NULL) return NULL; - str = decode_str(str, exec_input, tok); + str = (char *)decode_str(str, exec_input, tok); if (str == NULL) { PyTokenizer_Free(tok); return NULL; @@ -738,38 +752,11 @@ PyTokenizer_FromString(const char *str, int exec_input) return tok; } -struct tok_state * -PyTokenizer_FromUTF8(const char *str, int exec_input) -{ - struct tok_state *tok = tok_new(); - if (tok == NULL) - return NULL; - tok->input = str = translate_newlines(str, exec_input, tok); - if (str == NULL) { - PyTokenizer_Free(tok); - return NULL; - } - tok->decoding_state = STATE_RAW; - tok->read_coding_spec = 1; - tok->enc = NULL; - tok->str = str; - tok->encoding = (char *)PyMem_MALLOC(6); - if (!tok->encoding) { - PyTokenizer_Free(tok); - return NULL; - } - strcpy(tok->encoding, "utf-8"); - - /* XXX: constify members. 
*/ - tok->buf = tok->cur = tok->end = tok->inp = (char*)str; - return tok; -} /* Set up tokenizer for file */ struct tok_state * -PyTokenizer_FromFile(FILE *fp, const char* enc, - const char *ps1, const char *ps2) +PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) { struct tok_state *tok = tok_new(); if (tok == NULL) @@ -783,17 +770,6 @@ PyTokenizer_FromFile(FILE *fp, const char* enc, tok->fp = fp; tok->prompt = ps1; tok->nextprompt = ps2; - if (enc != NULL) { - /* Must copy encoding declaration since it - gets copied into the parse tree. */ - tok->encoding = PyMem_MALLOC(strlen(enc)+1); - if (!tok->encoding) { - PyTokenizer_Free(tok); - return NULL; - } - strcpy(tok->encoding, enc); - tok->decoding_state = STATE_NORMAL; - } return tok; } @@ -805,9 +781,10 @@ PyTokenizer_Free(struct tok_state *tok) { if (tok->encoding != NULL) PyMem_FREE(tok->encoding); +#ifndef PGEN Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); - Py_XDECREF(tok->filename); +#endif if (tok->fp != NULL && tok->buf != NULL) PyMem_FREE(tok->buf); if (tok->input) @@ -815,10 +792,74 @@ PyTokenizer_Free(struct tok_state *tok) PyMem_FREE(tok); } +#if !defined(PGEN) && defined(Py_USING_UNICODE) +static int +tok_stdin_decode(struct tok_state *tok, char **inp) +{ + PyObject *enc, *sysstdin, *decoded, *utf8; + const char *encoding; + char *converted; + + if (PySys_GetFile((char *)"stdin", NULL) != stdin) + return 0; + sysstdin = PySys_GetObject("stdin"); + if (sysstdin == NULL || !PyFile_Check(sysstdin)) + return 0; + + enc = ((PyFileObject *)sysstdin)->f_encoding; + if (enc == NULL || !PyString_Check(enc)) + return 0; + Py_INCREF(enc); + + encoding = PyString_AsString(enc); + decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); + if (decoded == NULL) + goto error_clear; + + utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); + Py_DECREF(decoded); + if (utf8 == NULL) + goto error_clear; + + assert(PyString_Check(utf8)); + converted = new_string(PyString_AS_STRING(utf8), + PyString_GET_SIZE(utf8)); + Py_DECREF(utf8); + if (converted == NULL) + goto error_nomem; + + PyMem_FREE(*inp); + *inp = converted; + if (tok->encoding != NULL) + PyMem_FREE(tok->encoding); + tok->encoding = new_string(encoding, strlen(encoding)); + if (tok->encoding == NULL) + goto error_nomem; + + Py_DECREF(enc); + return 0; + +error_nomem: + Py_DECREF(enc); + tok->done = E_NOMEM; + return -1; + +error_clear: + Py_DECREF(enc); + if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { + tok->done = E_ERROR; + return -1; + } + /* Fallback to iso-8859-1: for backward compatibility */ + PyErr_Clear(); + return 0; +} +#endif + /* Get next char, updating state; error code goes into tok->done */ static int -tok_nextc(struct tok_state *tok) +tok_nextc(register struct tok_state *tok) { for (;;) { if (tok->cur != tok->inp) { @@ -846,34 +887,6 @@ tok_nextc(struct tok_state *tok) } if (tok->prompt != NULL) { char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); - if (newtok != NULL) { - char *translated = translate_newlines(newtok, 0, tok); - PyMem_FREE(newtok); - if (translated == NULL) - return EOF; - newtok = translated; - } - if (tok->encoding && newtok && *newtok) { - /* Recode to UTF-8 */ - Py_ssize_t buflen; - const char* buf; - PyObject *u = translate_into_utf8(newtok, tok->encoding); - PyMem_FREE(newtok); - if (!u) { - tok->done = E_DECODE; - return EOF; - } - buflen = PyBytes_GET_SIZE(u); - buf = PyBytes_AS_STRING(u); - newtok = PyMem_MALLOC(buflen+1); - if (newtok == NULL) { - Py_DECREF(u); - tok->done = E_NOMEM; - 
return EOF; - } - strcpy(newtok, buf); - Py_DECREF(u); - } if (tok->nextprompt != NULL) tok->prompt = tok->nextprompt; if (newtok == NULL) @@ -882,6 +895,10 @@ tok_nextc(struct tok_state *tok) PyMem_FREE(newtok); tok->done = E_EOF; } +#if !defined(PGEN) && defined(Py_USING_UNICODE) + else if (tok_stdin_decode(tok, &newtok) != 0) + PyMem_FREE(newtok); +#endif else if (tok->start != NULL) { size_t start = tok->start - tok->buf; size_t oldlen = tok->cur - tok->buf; @@ -956,7 +973,6 @@ tok_nextc(struct tok_state *tok) while (!done) { Py_ssize_t curstart = tok->start == NULL ? -1 : tok->start - tok->buf; - Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; Py_ssize_t curvalid = tok->inp - tok->buf; Py_ssize_t newsize = curvalid + BUFSIZ; char *newbuf = tok->buf; @@ -969,7 +985,6 @@ tok_nextc(struct tok_state *tok) } tok->buf = newbuf; tok->cur = tok->buf + cur; - tok->multi_line_start = tok->buf + cur_multi_line_start; tok->line_start = tok->cur; tok->inp = tok->buf + curvalid; tok->end = tok->buf + newsize; @@ -985,8 +1000,7 @@ tok_nextc(struct tok_state *tok) return EOF; /* Last line does not end in \n, fake one */ - if (tok->inp[-1] != '\n') - strcpy(tok->inp, "\n"); + strcpy(tok->inp, "\n"); } tok->inp = strchr(tok->inp, '\0'); done = tok->inp[-1] == '\n'; @@ -1018,7 +1032,7 @@ tok_nextc(struct tok_state *tok) /* Back-up one character */ static void -tok_backup(struct tok_state *tok, int c) +tok_backup(register struct tok_state *tok, register int c) { if (c != EOF) { if (--tok->cur < tok->buf) @@ -1029,88 +1043,185 @@ tok_backup(struct tok_state *tok, int c) } -static int -syntaxerror(struct tok_state *tok, const char *format, ...) -{ - va_list vargs; -#ifdef HAVE_STDARG_PROTOTYPES - va_start(vargs, format); -#else - va_start(vargs); -#endif - PyErr_FormatV(PyExc_SyntaxError, format, vargs); - va_end(vargs); - PyErr_SyntaxLocationObject(tok->filename, - tok->lineno, - (int)(tok->cur - tok->line_start)); - tok->done = E_ERROR; - return ERRORTOKEN; -} +/* Return the token corresponding to a single character */ -static int -indenterror(struct tok_state *tok) +int +PyToken_OneChar(int c) { - tok->done = E_TABSPACE; - tok->cur = tok->inp; - return ERRORTOKEN; + switch (c) { + case '(': return LPAR; + case ')': return RPAR; + case '[': return LSQB; + case ']': return RSQB; + case ':': return COLON; + case ',': return COMMA; + case ';': return SEMI; + case '+': return PLUS; + case '-': return MINUS; + case '*': return STAR; + case '/': return SLASH; + case '|': return VBAR; + case '&': return AMPER; + case '<': return LESS; + case '>': return GREATER; + case '=': return EQUAL; + case '.': return DOT; + case '%': return PERCENT; + case '`': return BACKQUOTE; + case '{': return LBRACE; + case '}': return RBRACE; + case '^': return CIRCUMFLEX; + case '~': return TILDE; + case '@': return AT; + default: return OP; + } } -/* Verify that the identifier follows PEP 3131. - All identifier strings are guaranteed to be "ready" unicode objects. 
- */ -static int -verify_identifier(struct tok_state *tok) + +int +PyToken_TwoChars(int c1, int c2) { - PyObject *s; - int result; - if (tok->decoding_erred) - return 0; - s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); - if (s == NULL) { - if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { - tok->done = E_ERROR; + switch (c1) { + case '=': + switch (c2) { + case '=': return EQEQUAL; } - return 0; + break; + case '!': + switch (c2) { + case '=': return NOTEQUAL; + } + break; + case '<': + switch (c2) { + case '>': return NOTEQUAL; + case '=': return LESSEQUAL; + case '<': return LEFTSHIFT; + } + break; + case '>': + switch (c2) { + case '=': return GREATEREQUAL; + case '>': return RIGHTSHIFT; + } + break; + case '+': + switch (c2) { + case '=': return PLUSEQUAL; + } + break; + case '-': + switch (c2) { + case '=': return MINEQUAL; + } + break; + case '*': + switch (c2) { + case '*': return DOUBLESTAR; + case '=': return STAREQUAL; + } + break; + case '/': + switch (c2) { + case '/': return DOUBLESLASH; + case '=': return SLASHEQUAL; + } + break; + case '|': + switch (c2) { + case '=': return VBAREQUAL; + } + break; + case '%': + switch (c2) { + case '=': return PERCENTEQUAL; + } + break; + case '&': + switch (c2) { + case '=': return AMPEREQUAL; + } + break; + case '^': + switch (c2) { + case '=': return CIRCUMFLEXEQUAL; + } + break; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) - tok->done = E_IDENTIFIER; - return result; + return OP; } -static int -tok_decimal_tail(struct tok_state *tok) +int +PyToken_ThreeChars(int c1, int c2, int c3) { - int c; - - while (1) { - do { - c = tok_nextc(tok); - } while (isdigit(c)); - if (c != '_') { + switch (c1) { + case '<': + switch (c2) { + case '<': + switch (c3) { + case '=': + return LEFTSHIFTEQUAL; + } break; } - c = tok_nextc(tok); - if (!isdigit(c)) { - tok_backup(tok, c); - syntaxerror(tok, "invalid decimal literal"); - return 0; + break; + case '>': + switch (c2) { + case '>': + switch (c3) { + case '=': + return RIGHTSHIFTEQUAL; + } + break; + } + break; + case '*': + switch (c2) { + case '*': + switch (c3) { + case '=': + return DOUBLESTAREQUAL; + } + break; + } + break; + case '/': + switch (c2) { + case '/': + switch (c3) { + case '=': + return DOUBLESLASHEQUAL; + } + break; } + break; + } + return OP; +} + +static int +indenterror(struct tok_state *tok) +{ + if (tok->alterror) { + tok->done = E_TABSPACE; + tok->cur = tok->inp; + return 1; + } + if (tok->altwarning) { + PySys_WriteStderr("%s: inconsistent use of tabs and spaces " + "in indentation\n", tok->filename); + tok->altwarning = 0; } - return c; + return 0; } /* Get next token, after space stripping etc. 
*/ static int -tok_get(struct tok_state *tok, char **p_start, char **p_end) +tok_get(register struct tok_state *tok, char **p_start, char **p_end) { - int c; - int blankline, nonascii; + register int c; + int blankline; *p_start = *p_end = NULL; nextline: @@ -1119,24 +1230,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Get indentation level */ if (tok->atbol) { - int col = 0; - int altcol = 0; + register int col = 0; + register int altcol = 0; tok->atbol = 0; for (;;) { c = tok_nextc(tok); - if (c == ' ') { + if (c == ' ') col++, altcol++; - } else if (c == '\t') { - col = (col / tok->tabsize + 1) * tok->tabsize; - altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; + col = (col/tok->tabsize + 1) * tok->tabsize; + altcol = (altcol/tok->alttabsize + 1) + * tok->alttabsize; } - else if (c == '\014') {/* Control-L (formfeed) */ + else if (c == '\014') /* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ - } - else { + else break; - } } tok_backup(tok, c); if (c == '#' || c == '\n') { @@ -1145,18 +1254,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ - if (col == 0 && c == '\n' && tok->prompt != NULL) { + if (col == 0 && c == '\n' && tok->prompt != NULL) blankline = 0; /* Let it through */ - } - else if (tok->prompt != NULL && tok->lineno == 1) { - /* In interactive mode, if the first line contains - only spaces and/or a comment, let it through. */ - blankline = 0; - col = altcol = 0; - } - else { + else blankline = 1; /* Ignore completely */ - } /* We can't jump back right here since we still may need to skip to the end of a comment */ } @@ -1164,7 +1265,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } } else if (col > tok->indstack[tok->indent]) { @@ -1175,7 +1277,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol <= tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1194,7 +1297,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } } } @@ -1214,31 +1318,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } } - /* Peek ahead at the next character */ - c = tok_nextc(tok); - tok_backup(tok, c); - /* Check if we are closing an async function */ - if (tok->async_def - && !blankline - /* Due to some implementation artifacts of type comments, - * a TYPE_COMMENT at the start of a function won't set an - * indentation level and it will produce a NEWLINE after it. - * To avoid spuriously ending an async function due to this, - * wait until we have some non-newline char in front of us. */ - && c != '\n' - && tok->level == 0 - /* There was a NEWLINE after ASYNC DEF, - so we're past the signature. 
*/ - && tok->async_def_nl - /* Current indentation level is less than where - the async function was defined */ - && tok->async_def_indent >= tok->indent) - { - tok->async_def = 0; - tok->async_def_indent = 0; - tok->async_def_nl = 0; - } - again: tok->start = NULL; /* Skip spaces */ @@ -1249,63 +1328,40 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Set start of current token */ tok->start = tok->cur - 1; - /* Skip comment, unless it's a type comment */ + /* Skip comment, while looking for tab-setting magic */ if (c == '#') { - const char *prefix, *p, *type_start; - - while (c != EOF && c != '\n') { - c = tok_nextc(tok); - } - - if (tok->type_comments) { - p = tok->start; - prefix = type_comment_prefix; - while (*prefix && p < tok->cur) { - if (*prefix == ' ') { - while (*p == ' ' || *p == '\t') { - p++; - } - } else if (*prefix == *p) { - p++; - } else { - break; - } - - prefix++; - } - - /* This is a type comment if we matched all of type_comment_prefix. */ - if (!*prefix) { - int is_type_ignore = 1; - const char *ignore_end = p + 6; - tok_backup(tok, c); /* don't eat the newline or EOF */ - - type_start = p; - - /* A TYPE_IGNORE is "type: ignore" followed by the end of the token - * or anything ASCII and non-alphanumeric. */ - is_type_ignore = ( - tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 - && !(tok->cur > ignore_end - && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); - - if (is_type_ignore) { - *p_start = (char *) ignore_end; - *p_end = tok->cur; - - /* If this type ignore is the only thing on the line, consume the newline also. */ - if (blankline) { - tok_nextc(tok); - tok->atbol = 1; - } - return TYPE_IGNORE; - } else { - *p_start = (char *) type_start; /* after type_comment_prefix */ - *p_end = tok->cur; - return TYPE_COMMENT; + static char *tabforms[] = { + "tab-width:", /* Emacs */ + ":tabstop=", /* vim, full form */ + ":ts=", /* vim, abbreviated form */ + "set tabsize=", /* will vi never die? */ + /* more templates can be added here to support other editors */ + }; + char cbuf[80]; + char *tp, **cp; + tp = cbuf; + do { + *tp++ = c = tok_nextc(tok); + } while (c != EOF && c != '\n' && + (size_t)(tp - cbuf + 1) < sizeof(cbuf)); + *tp = '\0'; + for (cp = tabforms; + cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); + cp++) { + if ((tp = strstr(cbuf, *cp))) { + int newsize = atoi(tp + strlen(*cp)); + + if (newsize >= 1 && newsize <= 40) { + tok->tabsize = newsize; + if (Py_VerboseFlag) + PySys_WriteStderr( + "Tab size set to %d\n", + newsize); } } } + while (c != EOF && c != '\n') + c = tok_nextc(tok); } /* Check for EOF and errors now */ @@ -1314,108 +1370,49 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } /* Identifier (most frequent token!) */ - nonascii = 0; - if (is_potential_identifier_start(c)) { - /* Process the various legal combinations of b"", r"", u"", and f"". */ - int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; - while (1) { - if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) - saw_b = 1; - /* Since this is a backwards compatibility support literal we don't - want to support it in arbitrary order like byte literals. 
*/ - else if (!(saw_b || saw_u || saw_r || saw_f) - && (c == 'u'|| c == 'U')) { - saw_u = 1; - } - /* ur"" and ru"" are not supported */ - else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { - saw_r = 1; - } - else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { - saw_f = 1; - } - else { - break; - } + if (Py_ISALPHA(c) || c == '_') { + /* Process r"", u"" and ur"" */ + switch (c) { + case 'b': + case 'B': c = tok_nextc(tok); - if (c == '"' || c == '\'') { + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') goto letter_quote; - } + break; + case 'r': + case 'R': + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; + case 'u': + case 'U': + c = tok_nextc(tok); + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; } - while (is_potential_identifier_char(c)) { - if (c >= 128) { - nonascii = 1; - } + while (c != EOF && (Py_ISALNUM(c) || c == '_')) { c = tok_nextc(tok); } tok_backup(tok, c); - if (nonascii && !verify_identifier(tok)) { - return ERRORTOKEN; - } *p_start = tok->start; *p_end = tok->cur; - - /* async/await parsing block. */ - if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { - /* May be an 'async' or 'await' token. For Python 3.7 or - later we recognize them unconditionally. For Python - 3.5 or 3.6 we recognize 'async' in front of 'def', and - either one inside of 'async def'. (Technically we - shouldn't recognize these at all for 3.4 or earlier, - but there's no *valid* Python 3.4 code that would be - rejected, and async functions will be rejected in a - later phase.) */ - if (!tok->async_hacks || tok->async_def) { - /* Always recognize the keywords. */ - if (memcmp(tok->start, "async", 5) == 0) { - return ASYNC; - } - if (memcmp(tok->start, "await", 5) == 0) { - return AWAIT; - } - } - else if (memcmp(tok->start, "async", 5) == 0) { - /* The current token is 'async'. - Look ahead one token to see if that is 'def'. */ - - struct tok_state ahead_tok; - char *ahead_tok_start = NULL, *ahead_tok_end = NULL; - int ahead_tok_kind; - - memcpy(&ahead_tok, tok, sizeof(ahead_tok)); - ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, - &ahead_tok_end); - - if (ahead_tok_kind == NAME - && ahead_tok.cur - ahead_tok.start == 3 - && memcmp(ahead_tok.start, "def", 3) == 0) - { - /* The next token is going to be 'def', so instead of - returning a plain NAME token, return ASYNC. */ - tok->async_def_indent = tok->indent; - tok->async_def = 1; - return ASYNC; - } - } - } - return NAME; } /* Newline */ if (c == '\n') { tok->atbol = 1; - if (blankline || tok->level > 0) { + if (blankline || tok->level > 0) goto nextline; - } *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; - if (tok->async_def) { - /* We're somewhere inside an 'async def' function, and - we've encountered a NEWLINE after its signature. 
*/ - tok->async_def_nl = 1; - } return NEWLINE; } @@ -1424,24 +1421,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) c = tok_nextc(tok); if (isdigit(c)) { goto fraction; - } else if (c == '.') { - c = tok_nextc(tok); - if (c == '.') { - *p_start = tok->start; - *p_end = tok->cur; - return ELLIPSIS; - } - else { - tok_backup(tok, c); - } - tok_backup(tok, '.'); } else { tok_backup(tok, c); + *p_start = tok->start; + *p_end = tok->cur; + return DOT; } - *p_start = tok->start; - *p_end = tok->cur; - return DOT; } /* Number */ @@ -1449,136 +1435,94 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (c == '0') { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); + if (c == '.') + goto fraction; +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') + goto imaginary; +#endif if (c == 'x' || c == 'X') { + /* Hex */ c = tok_nextc(tok); + if (!isxdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } do { - if (c == '_') { - c = tok_nextc(tok); - } - if (!isxdigit(c)) { - tok_backup(tok, c); - return syntaxerror(tok, "invalid hexadecimal literal"); - } - do { - c = tok_nextc(tok); - } while (isxdigit(c)); - } while (c == '_'); + c = tok_nextc(tok); + } while (isxdigit(c)); } else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); - do { - if (c == '_') { - c = tok_nextc(tok); - } - if (c < '0' || c >= '8') { - tok_backup(tok, c); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); - } - else { - return syntaxerror(tok, "invalid octal literal"); - } - } - do { - c = tok_nextc(tok); - } while ('0' <= c && c < '8'); - } while (c == '_'); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + if (c < '0' || c >= '8') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; } + do { + c = tok_nextc(tok); + } while ('0' <= c && c < '8'); } else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); - do { - if (c == '_') { - c = tok_nextc(tok); - } - if (c != '0' && c != '1') { - tok_backup(tok, c); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); - } - else { - return syntaxerror(tok, "invalid binary literal"); - } - } - do { - c = tok_nextc(tok); - } while (c == '0' || c == '1'); - } while (c == '_'); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + if (c != '0' && c != '1') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; } + do { + c = tok_nextc(tok); + } while (c == '0' || c == '1'); } else { - int nonzero = 0; - /* maybe old-style octal; c is first char of it */ - /* in any case, allow '0' as a literal */ - while (1) { - if (c == '_') { - c = tok_nextc(tok); - if (!isdigit(c)) { - tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); - } - } - if (c != '0') { - break; - } + int found_decimal = 0; + /* Octal; c is first char of it */ + /* There's no 'isoctdigit' macro, sigh */ + while ('0' <= c && c < '8') { c = tok_nextc(tok); } if (isdigit(c)) { - nonzero = 1; - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } + found_decimal = 1; + do { + c = tok_nextc(tok); + } while (isdigit(c)); } - if (c == '.') { - c = tok_nextc(tok); + if (c == '.') goto fraction; - } - else if (c == 'e' || c == 'E') { + else if (c == 'e' || c == 'E') goto exponent; - } - else if (c == 'j' || c == 'J') { +#ifndef WITHOUT_COMPLEX + else if (c == 'j' || c == 'J') goto imaginary; - } - else if (nonzero) { - /* 
Old-style octal: now disallowed. */ +#endif + else if (found_decimal) { + tok->done = E_TOKEN; tok_backup(tok, c); - return syntaxerror(tok, - "leading zeros in decimal integer " - "literals are not permitted; " - "use an 0o prefix for octal integers"); + return ERRORTOKEN; } } + if (c == 'l' || c == 'L') + c = tok_nextc(tok); } else { /* Decimal */ - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } - { + do { + c = tok_nextc(tok); + } while (isdigit(c)); + if (c == 'l' || c == 'L') + c = tok_nextc(tok); + else { /* Accept floating point numbers. */ if (c == '.') { - c = tok_nextc(tok); fraction: /* Fraction */ - if (isdigit(c)) { - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } - } + do { + c = tok_nextc(tok); + } while (isdigit(c)); } if (c == 'e' || c == 'E') { int e; @@ -1589,8 +1533,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (c == '+' || c == '-') { c = tok_nextc(tok); if (!isdigit(c)) { + tok->done = E_TOKEN; tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return ERRORTOKEN; } } else if (!isdigit(c)) { tok_backup(tok, c); @@ -1599,16 +1544,16 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_end = tok->cur; return NUMBER; } - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } + do { + c = tok_nextc(tok); + } while (isdigit(c)); } - if (c == 'j' || c == 'J') { +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') /* Imaginary part */ imaginary: c = tok_nextc(tok); - } +#endif } } tok_backup(tok, c); @@ -1620,61 +1565,55 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) letter_quote: /* String */ if (c == '\'' || c == '"') { + Py_ssize_t quote2 = tok->cur - tok->start + 1; int quote = c; - int quote_size = 1; /* 1 or 3 */ - int end_quote_size = 0; - - /* Nodes of type STRING, especially multi line strings - must be handled differently in order to get both - the starting line number and the column offset right. - (cf. issue 16806) */ - tok->first_lineno = tok->lineno; - tok->multi_line_start = tok->line_start; - - /* Find the quote size and start of string */ - c = tok_nextc(tok); - if (c == quote) { - c = tok_nextc(tok); - if (c == quote) { - quote_size = 3; - } - else { - end_quote_size = 1; /* empty string found */ - } - } - if (c != quote) { - tok_backup(tok, c); - } - - /* Get rest of string */ - while (end_quote_size != quote_size) { + int triple = 0; + int tripcount = 0; + for (;;) { c = tok_nextc(tok); - if (c == EOF) { - if (quote_size == 3) { - tok->done = E_EOFS; - } - else { + if (c == '\n') { + if (!triple) { tok->done = E_EOLS; + tok_backup(tok, c); + return ERRORTOKEN; } - tok->cur = tok->inp; - return ERRORTOKEN; + tripcount = 0; + tok->cont_line = 1; /* multiline string. 
*/ } - if (quote_size == 1 && c == '\n') { - tok->done = E_EOLS; + else if (c == EOF) { + if (triple) + tok->done = E_EOFS; + else + tok->done = E_EOLS; tok->cur = tok->inp; return ERRORTOKEN; } - if (c == quote) { - end_quote_size += 1; + else if (c == quote) { + tripcount++; + if (tok->cur - tok->start == quote2) { + c = tok_nextc(tok); + if (c == quote) { + triple = 1; + tripcount = 0; + continue; + } + tok_backup(tok, c); + } + if (!triple || tripcount == 3) + break; } - else { - end_quote_size = 0; - if (c == '\\') { - tok_nextc(tok); /* skip escaped char */ + else if (c == '\\') { + tripcount = 0; + c = tok_nextc(tok); + if (c == EOF) { + tok->done = E_EOLS; + tok->cur = tok->inp; + return ERRORTOKEN; } } + else + tripcount = 0; } - *p_start = tok->start; *p_end = tok->cur; return STRING; @@ -1688,14 +1627,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->cur = tok->inp; return ERRORTOKEN; } - c = tok_nextc(tok); - if (c == EOF) { - tok->done = E_EOF; - tok->cur = tok->inp; - return ERRORTOKEN; - } else { - tok_backup(tok, c); - } tok->cont_line = 1; goto again; /* Read next line */ } @@ -1704,13 +1635,24 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) { int c2 = tok_nextc(tok); int token = PyToken_TwoChars(c, c2); +#ifndef PGEN + if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') { + if (PyErr_WarnExplicit(PyExc_DeprecationWarning, + "<> not supported in 3.x; use !=", + tok->filename, tok->lineno, + NULL, NULL)) { + tok->done = E_ERROR; + tok->cur = tok->inp; + return ERRORTOKEN; + } + } +#endif if (token != OP) { int c3 = tok_nextc(tok); int token3 = PyToken_ThreeChars(c, c2, c3); if (token3 != OP) { token = token3; - } - else { + } else { tok_backup(tok, c3); } *p_start = tok->start; @@ -1725,38 +1667,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) case '(': case '[': case '{': - if (tok->level >= MAXLEVEL) { - return syntaxerror(tok, "too many nested parentheses"); - } - tok->parenstack[tok->level] = c; - tok->parenlinenostack[tok->level] = tok->lineno; tok->level++; break; case ')': case ']': case '}': - if (!tok->level) { - return syntaxerror(tok, "unmatched '%c'", c); - } tok->level--; - int opening = tok->parenstack[tok->level]; - if (!((opening == '(' && c == ')') || - (opening == '[' && c == ']') || - (opening == '{' && c == '}'))) - { - if (tok->parenlinenostack[tok->level] != tok->lineno) { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level]); - } - else { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c'", - c, opening); - } - } break; } @@ -1770,6 +1686,11 @@ int PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) { int result = tok_get(tok, p_start, p_end); + if (tok->fp && ferror(tok->fp)) { + clearerr(tok->fp); + result = ERRORTOKEN; + tok->done = E_IO; + } if (tok->decoding_erred) { result = ERRORTOKEN; tok->done = E_DECODE; @@ -1777,67 +1698,67 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) return result; } -/* Get the encoding of a Python file. Check for the coding cookie and check if - the file starts with a BOM. - - PyTokenizer_FindEncodingFilename() returns NULL when it can't find the - encoding in the first or second line of the file (in which case the encoding - should be assumed to be UTF-8). - - The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed - by the caller. 
*/ +/* This function is only called from parsetok. However, it cannot live + there, as it must be empty for PGEN, and we can check for PGEN only + in this file. */ -char * -PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) +#if defined(PGEN) || !defined(Py_USING_UNICODE) +char* +PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) { - struct tok_state *tok; - FILE *fp; - char *p_start =NULL , *p_end =NULL , *encoding = NULL; - - fd = _Py_dup(fd); - if (fd < 0) { - return NULL; - } - - fp = fdopen(fd, "r"); - if (fp == NULL) { - return NULL; - } - tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL); - if (tok == NULL) { - fclose(fp); - return NULL; - } - if (filename != NULL) { - Py_INCREF(filename); - tok->filename = filename; - } - else { - tok->filename = PyUnicode_FromString("<string>"); - if (tok->filename == NULL) { - fclose(fp); - PyTokenizer_Free(tok); - return encoding; - } - } - while (tok->lineno < 2 && tok->done == E_OK) { - PyTokenizer_Get(tok, &p_start, &p_end); + return NULL; +} +#else +#ifdef Py_USING_UNICODE +static PyObject * +dec_utf8(const char *enc, const char *text, size_t len) { + PyObject *ret = NULL; + PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); + if (unicode_text) { + ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); + Py_DECREF(unicode_text); } - fclose(fp); - if (tok->encoding) { - encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1); - if (encoding) - strcpy(encoding, tok->encoding); + if (!ret) { + PyErr_Clear(); } - PyTokenizer_Free(tok); - return encoding; + return ret; } - char * -PyTokenizer_FindEncoding(int fd) +PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) { - return PyTokenizer_FindEncodingFilename(fd, NULL); + char *text = NULL; + if (tok->encoding) { + /* convert source to original encondig */ + PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); + if (lineobj != NULL) { + int linelen = PyString_Size(lineobj); + const char *line = PyString_AsString(lineobj); + text = PyObject_MALLOC(linelen + 1); + if (text != NULL && line != NULL) { + if (linelen) + strncpy(text, line, linelen); + text[linelen] = '\0'; + } + Py_DECREF(lineobj); + + /* adjust error offset */ + if (*offset > 1) { + PyObject *offsetobj = dec_utf8(tok->encoding, + tok->buf, *offset-1); + if (offsetobj) { + *offset = PyString_Size(offsetobj) + 1; + Py_DECREF(offsetobj); + } + } + + } + } + return text; + } +#endif /* defined(Py_USING_UNICODE) */ +#endif + #ifdef Py_DEBUG |
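
Both sides of this diff keep the PEP 263 logic in get_coding_spec()/check_coding_spec(): the encoding declaration must sit in a comment that is the only statement on the line, and the name after coding[:=] is normalized before being compared against any BOM-derived encoding. Below is a minimal standalone sketch of that comment scan, assuming a hypothetical find_coding_spec() helper that writes into a caller-supplied buffer instead of returning a malloc'ed copy.

    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>

    /* Hypothetical simplification of get_coding_spec(): copy the encoding
     * name from a "# ... coding[:=] <name>" comment into buf, or return
     * NULL if the line carries no declaration. */
    static const char *
    find_coding_spec(const char *line, char *buf, size_t bufsize)
    {
        const char *s = line;
        /* The spec must live in a comment preceded only by whitespace. */
        while (*s == ' ' || *s == '\t' || *s == '\014')
            s++;
        if (*s != '#')
            return NULL;
        for (; *s; s++) {
            if (strncmp(s, "coding", 6) == 0 && (s[6] == ':' || s[6] == '=')) {
                const char *t = s + 7;
                size_t n = 0;
                while (*t == ' ' || *t == '\t')
                    t++;
                /* Encoding names are alphanumerics plus '-', '_' and '.'. */
                while (n + 1 < bufsize &&
                       (isalnum((unsigned char)*t) || *t == '-' ||
                        *t == '_' || *t == '.')) {
                    buf[n++] = *t++;
                }
                buf[n] = '\0';
                return n ? buf : NULL;
            }
        }
        return NULL;
    }

    int
    main(void)
    {
        char enc[64];
        if (find_coding_spec("# -*- coding: iso-8859-1 -*-", enc, sizeof enc))
            printf("declared encoding: %s\n", enc);   /* iso-8859-1 */
        return 0;
    }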
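
check_bom() peeks at the first bytes of the input before any codec is chosen: a UTF-8 BOM is simply skipped and recorded as the encoding, while UTF-16 BOMs switch the reader to the matching codec. The sketch below shows the same detection over an in-memory buffer, using a hypothetical sniff_bom() helper; the real function reads and ungets characters through its get_char/unget_char callbacks instead.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical sniff of a byte-order mark at the start of a buffer.
     * Returns the encoding name, or NULL if no BOM; *bomlen gets its size. */
    static const char *
    sniff_bom(const unsigned char *p, size_t len, size_t *bomlen)
    {
        *bomlen = 0;
        if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) {
            *bomlen = 3;
            return "utf-8";
        }
        if (len >= 2 && p[0] == 0xFE && p[1] == 0xFF) {
            *bomlen = 2;
            return "utf-16-be";
        }
        if (len >= 2 && p[0] == 0xFF && p[1] == 0xFE) {
            *bomlen = 2;
            return "utf-16-le";
        }
        return NULL;
    }

    int
    main(void)
    {
        const unsigned char src[] = { 0xEF, 0xBB, 0xBF, 'p', 'a', 's', 's' };
        size_t skip;
        const char *enc = sniff_bom(src, sizeof src, &skip);
        printf("%s, skip %zu bytes\n", enc ? enc : "none", skip);
        return 0;
    }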
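
The '-' side of the diff drops the valid_utf8() helper that the UTF-8-default tokenizer uses to reject undeclared non-UTF-8 bytes; the '+' side only checks for bytes above 127, since its default source encoding is ASCII. Here is a self-contained sketch of that sequence-length classification with a small driver; utf8_seq_len() is a hypothetical name for the same check.

    #include <stdio.h>

    /* Length of the UTF-8 sequence starting at s, or 0 if s does not start
     * a well-formed sequence (mirrors the removed valid_utf8() helper). */
    static int
    utf8_seq_len(const unsigned char *s)
    {
        int expected, i;
        if (*s < 0x80)
            return 1;               /* single-byte (ASCII) */
        if (*s < 0xC0)
            return 0;               /* stray continuation byte */
        if (*s < 0xE0)
            expected = 1;
        else if (*s < 0xF0)
            expected = 2;
        else if (*s < 0xF8)
            expected = 3;
        else
            return 0;
        for (i = 1; i <= expected; i++)
            if (s[i] < 0x80 || s[i] >= 0xC0)
                return 0;           /* continuation bytes must be 10xxxxxx */
        return expected + 1;
    }

    int
    main(void)
    {
        const unsigned char line[] = "x = '\xc3\xa9'\n";   /* e-acute, valid UTF-8 */
        const unsigned char *c = line;
        int n;
        while (*c) {
            if ((n = utf8_seq_len(c)) == 0) {
                printf("bad byte 0x%02x at offset %ld\n", *c, (long)(c - line));
                return 1;
            }
            c += n;
        }
        printf("line is valid UTF-8\n");
        return 0;
    }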
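
In tok_get(), the indentation pass computes two columns per line: col expands tabs to the next multiple of tok->tabsize, while altcol does the same with the alternate tab size so that tab/space mixes that disagree between the two readings can be flagged by indenterror(). A minimal sketch of just that arithmetic, with a hypothetical measure_indent() helper:

    #include <stdio.h>

    /* Hypothetical helper: compute the indentation column of a line,
     * expanding tabs the way the tokenizer does:
     * col = (col / tabsize + 1) * tabsize advances to the next tab stop. */
    static int
    measure_indent(const char *line, int tabsize)
    {
        int col = 0;
        for (; *line; line++) {
            if (*line == ' ')
                col++;
            else if (*line == '\t')
                col = (col / tabsize + 1) * tabsize;
            else if (*line == '\014')   /* formfeed resets the count */
                col = 0;
            else
                break;
        }
        return col;
    }

    int
    main(void)
    {
        /* "\t  x" with tabsize 8: the tab jumps to column 8, plus two spaces. */
        printf("%d\n", measure_indent("\t  x", 8));   /* prints 10 */
        printf("%d\n", measure_indent("    x", 8));   /* prints 4 */
        return 0;
    }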
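
The '+' side restores the old comment handler that scans every comment for editor modelines ("tab-width:", ":tabstop=", ":ts=", "set tabsize=") and, when one is found with a value between 1 and 40, resets tok->tabsize. A reduced sketch of that scan over a comment string, using a hypothetical comment_tabsize() helper:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical reduction of the comment scan: look for an editor
     * modeline inside a comment and return the tab size it requests,
     * or 0 if none is found (or the value is out of range). */
    static int
    comment_tabsize(const char *comment)
    {
        static const char *forms[] = {
            "tab-width:",    /* Emacs */
            ":tabstop=",     /* vim, full form */
            ":ts=",          /* vim, abbreviated form */
            "set tabsize=",  /* classic vi */
        };
        size_t i;
        for (i = 0; i < sizeof(forms) / sizeof(forms[0]); i++) {
            const char *hit = strstr(comment, forms[i]);
            if (hit != NULL) {
                int n = atoi(hit + strlen(forms[i]));
                if (n >= 1 && n <= 40)
                    return n;
            }
        }
        return 0;
    }

    int
    main(void)
    {
        printf("%d\n", comment_tabsize("# vim:ts=4"));          /* 4 */
        printf("%d\n", comment_tabsize("# -*- tab-width: 8 -*-")); /* 8 */
        printf("%d\n", comment_tabsize("# just a comment"));      /* 0 */
        return 0;
    }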
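
The '-' side's tok_decimal_tail() consumes a run of digits while allowing a single underscore between digit groups, rejecting doubled or trailing underscores. A standalone sketch of the same rule over a plain C string; decimal_tail() here is a hypothetical stand-in that returns a character count instead of the next character:

    #include <stdio.h>
    #include <ctype.h>

    /* Hypothetical sketch of tok_decimal_tail(): consume digits, allowing a
     * single '_' between groups of digits ("1_000_000"); return the number
     * of characters consumed, or -1 if an underscore is not followed by a
     * digit. */
    static long
    decimal_tail(const char *s)
    {
        long i = 0;
        for (;;) {
            while (isdigit((unsigned char)s[i]))
                i++;
            if (s[i] != '_')
                return i;
            if (!isdigit((unsigned char)s[i + 1]))
                return -1;          /* "1__0" or a trailing "_" is an error */
            i++;                    /* step over the underscore and continue */
        }
    }

    int
    main(void)
    {
        printf("%ld\n", decimal_tail("1_000_000+x"));  /* 9 */
        printf("%ld\n", decimal_tail("1__0"));         /* -1 */
        return 0;
    }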
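
For string literals, the '-' side counts consecutive closing quotes (end_quote_size) until the count matches the opening run of one or three, resetting after any other character and skipping the character that follows a backslash; the '+' side tracks the same state with its triple/tripcount variables. Below is a sketch of the counting approach over a buffer, with a hypothetical scan_string_body() that is handed the text just after the opening quotes:

    #include <stdio.h>

    /* Hypothetical sketch of the quote-counting scan: given the quote
     * character and whether the literal is triple-quoted (quote_size 1 or 3),
     * return the offset just past the closing quotes, or -1 if the literal
     * is unterminated. */
    static long
    scan_string_body(const char *s, int quote, int quote_size)
    {
        int end_quote_size = 0;
        long i = 0;
        while (end_quote_size != quote_size) {
            int c = s[i++];
            if (c == '\0')
                return -1;                 /* end of input inside the literal */
            if (c == '\n' && quote_size == 1)
                return -1;                 /* newline only legal in triples */
            if (c == quote) {
                end_quote_size++;
            }
            else {
                end_quote_size = 0;
                if (c == '\\' && s[i] != '\0')
                    i++;                   /* skip the escaped character */
            }
        }
        return i;
    }

    int
    main(void)
    {
        const char *src = "abc\\\"def\" + rest";   /* the source text abc\"def" + rest */
        long end = scan_string_body(src, '"', 1);
        printf("literal body ends at offset %ld\n", end);   /* prints 9 */
        return 0;
    }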