Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 100
1 file changed, 65 insertions, 35 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5ba12a4..93a4a5c 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -128,7 +128,6 @@ tok_new(void)
     tok->prompt = tok->nextprompt = NULL;
     tok->lineno = 0;
     tok->level = 0;
-    tok->filename = NULL;
     tok->altwarning = 1;
     tok->alterror = 1;
     tok->alttabsize = 1;
@@ -140,6 +139,7 @@ tok_new(void)
     tok->encoding = NULL;
     tok->cont_line = 0;
 #ifndef PGEN
+    tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
 #endif
@@ -462,6 +462,8 @@ static int
 fp_setreadl(struct tok_state *tok, const char* enc)
 {
     PyObject *readline = NULL, *stream = NULL, *io = NULL;
+    _Py_IDENTIFIER(open);
+    _Py_IDENTIFIER(readline);
     int fd;
 
     io = PyImport_ImportModuleNoBlock("io");
@@ -474,13 +476,13 @@ fp_setreadl(struct tok_state *tok, const char* enc)
         goto cleanup;
     }
 
-    stream = PyObject_CallMethod(io, "open", "isisOOO",
+    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
     if (stream == NULL)
         goto cleanup;
 
     Py_XDECREF(tok->decoding_readline);
-    readline = PyObject_GetAttrString(stream, "readline");
+    readline = _PyObject_GetAttrId(stream, &PyId_readline);
     tok->decoding_readline = readline;
 
     /* The file has been reopened; parsing will restart from
@@ -545,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
 {
     char *line = NULL;
     int badchar = 0;
-    PyObject *filename;
     for (;;) {
         if (tok->decoding_state == STATE_NORMAL) {
             /* We already have a codec associated with
@@ -586,19 +587,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
     if (badchar) {
         /* Need to add 1 to the line number, since this line
            has not been counted, yet.  */
-        if (tok->filename != NULL)
-            filename = PyUnicode_DecodeFSDefault(tok->filename);
-        else
-            filename = PyUnicode_FromString("<file>");
-        if (filename != NULL) {
-            PyErr_Format(PyExc_SyntaxError,
-                         "Non-UTF-8 code starting with '\\x%.2x' "
-                         "in file %U on line %i, "
-                         "but no encoding declared; "
-                         "see http://python.org/dev/peps/pep-0263/ for details",
-                         badchar, filename, tok->lineno + 1);
-            Py_DECREF(filename);
-        }
+        PyErr_Format(PyExc_SyntaxError,
+                     "Non-UTF-8 code starting with '\\x%.2x' "
+                     "in file %U on line %i, "
+                     "but no encoding declared; "
+                     "see http://python.org/dev/peps/pep-0263/ for details",
+                     badchar, tok->filename, tok->lineno + 1);
         return error_ret(tok);
     }
 #endif
@@ -856,6 +850,7 @@ PyTokenizer_Free(struct tok_state *tok)
 #ifndef PGEN
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->filename);
 #endif
     if (tok->fp != NULL && tok->buf != NULL)
         PyMem_FREE(tok->buf);
@@ -1250,8 +1245,13 @@ indenterror(struct tok_state *tok)
         return 1;
     }
     if (tok->altwarning) {
-        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
+#ifdef PGEN
+        PySys_WriteStderr("inconsistent use of tabs and spaces "
+                          "in indentation\n");
+#else
+        PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
                           "in indentation\n", tok->filename);
+#endif
         tok->altwarning = 0;
     }
     return 0;
@@ -1260,14 +1260,16 @@ indenterror(struct tok_state *tok)
 #ifdef PGEN
 #define verify_identifier(tok) 1
 #else
-/* Verify that the identifier follows PEP 3131. */
+/* Verify that the identifier follows PEP 3131.
+   All identifier strings are guaranteed to be "ready" unicode objects.
+ */
 static int
 verify_identifier(struct tok_state *tok)
 {
     PyObject *s;
     int result;
     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
-    if (s == NULL) {
+    if (s == NULL || PyUnicode_READY(s) == -1) {
         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
             PyErr_Clear();
             tok->done = E_IDENTIFIER;
@@ -1410,13 +1412,20 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
     /* Identifier (most frequent token!) */
     nonascii = 0;
     if (is_potential_identifier_start(c)) {
-        /* Process b"", r"" and br"" */
-        if (c == 'b' || c == 'B') {
-            c = tok_nextc(tok);
-            if (c == '"' || c == '\'')
-                goto letter_quote;
-        }
-        if (c == 'r' || c == 'R') {
+        /* Process b"", r"", u"", br"" and rb"" */
+        int saw_b = 0, saw_r = 0, saw_u = 0;
+        while (1) {
+            if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
+                saw_b = 1;
+            /* Since this is a backwards compatibility support literal we don't
+               want to support it in arbitrary order like byte literals. */
+            else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
+                saw_u = 1;
+            /* ur"" and ru"" are not supported */
+            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
+                saw_r = 1;
+            else
+                break;
             c = tok_nextc(tok);
             if (c == '"' || c == '\'')
                 goto letter_quote;
@@ -1692,17 +1701,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
     return result;
 }
 
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+   the file starts with a BOM.
 
-   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
-   the first or second line of the file (in which case the encoding
-   should be assumed to be PyUnicode_GetDefaultEncoding()).
+   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+   encoding in the first or second line of the file (in which case the encoding
+   should be assumed to be UTF-8).
+
+   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+   by the caller. */
 
-   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
-   by the caller.
-*/
 char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
 {
     struct tok_state *tok;
     FILE *fp;
@@ -1721,6 +1731,20 @@ PyTokenizer_FindEncoding(int fd)
         fclose(fp);
         return NULL;
     }
+#ifndef PGEN
+    if (filename != NULL) {
+        Py_INCREF(filename);
+        tok->filename = filename;
+    }
+    else {
+        tok->filename = PyUnicode_FromString("<string>");
+        if (tok->filename == NULL) {
+            fclose(fp);
+            PyTokenizer_Free(tok);
+            return encoding;
+        }
+    }
+#endif
     while (tok->lineno < 2 && tok->done == E_OK) {
         PyTokenizer_Get(tok, &p_start, &p_end);
     }
@@ -1734,6 +1758,12 @@
     return encoding;
 }
 
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+    return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
 #ifdef Py_DEBUG
 
 void
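Note (illustrative sketch, not part of the patch): fp_setreadl() now goes through the _Py_IDENTIFIER machinery, which caches the attribute/method name as a static identifier instead of rebuilding a Python string from a char* on every call, as PyObject_CallMethod() and PyObject_GetAttrString() do. A minimal usage sketch of that pattern; call_readline_once() is a hypothetical helper, not CPython code:

#include "Python.h"

/* Call stream.readline() once; the name object is created lazily and
   cached in the static PyId_readline for all subsequent calls. */
static PyObject *
call_readline_once(PyObject *stream)
{
    _Py_IDENTIFIER(readline);   /* defines a static _Py_Identifier PyId_readline */
    return _PyObject_CallMethodId(stream, &PyId_readline, NULL);
}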
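Note (reviewer sketch, not part of the patch): the new saw_b/saw_r/saw_u loop in tok_get() accepts b/B and r/R in either order (b"", r"", br"", rb""), accepts u/U only on its own, and keeps rejecting ur"" and ru"". The standalone program below exercises the same acceptance rules outside the tokenizer; accept_prefix() is a hypothetical helper written only for illustration:

#include <stdio.h>

/* Return 1 if the string-literal prefix is accepted under the rules above. */
static int
accept_prefix(const char *p)
{
    int saw_b = 0, saw_r = 0, saw_u = 0;
    for (; *p; p++) {
        if (!(saw_b || saw_u) && (*p == 'b' || *p == 'B'))
            saw_b = 1;
        else if (!(saw_b || saw_u || saw_r) && (*p == 'u' || *p == 'U'))
            saw_u = 1;
        else if (!(saw_r || saw_u) && (*p == 'r' || *p == 'R'))
            saw_r = 1;
        else
            return 0;
    }
    return 1;
}

int
main(void)
{
    const char *samples[] = {"b", "r", "u", "br", "rb", "ur", "ru", "bu"};
    size_t i;
    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("%-2s -> %s\n", samples[i],
               accept_prefix(samples[i]) ? "accepted" : "rejected");
    return 0;
}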
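Note (hypothetical caller sketch, not part of the patch): PyTokenizer_FindEncodingFilename() hands ownership of the returned char* to the caller, who must release it with PyMem_FREE(); when it returns NULL, the encoding should be assumed to be UTF-8, per the updated comment. get_source_encoding() below is an invented helper, and the extern declaration stands in for having Parser/tokenizer.h on the include path:

#include "Python.h"

/* Defined in Parser/tokenizer.c. */
extern char *PyTokenizer_FindEncodingFilename(int fd, PyObject *filename);

static PyObject *
get_source_encoding(int fd, PyObject *filename)
{
    char *enc = PyTokenizer_FindEncodingFilename(fd, filename);
    PyObject *result;

    if (enc == NULL) {
        if (PyErr_Occurred())
            return NULL;                      /* propagate a real error */
        /* No coding cookie or BOM in the first two lines: assume UTF-8. */
        return PyUnicode_FromString("utf-8");
    }
    result = PyUnicode_FromString(enc);
    PyMem_FREE(enc);                          /* the char* is owned by the caller */
    return result;
}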