Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 100
1 file changed, 65 insertions, 35 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5ba12a4..93a4a5c 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -128,7 +128,6 @@ tok_new(void)
     tok->prompt = tok->nextprompt = NULL;
     tok->lineno = 0;
     tok->level = 0;
-    tok->filename = NULL;
     tok->altwarning = 1;
     tok->alterror = 1;
     tok->alttabsize = 1;
@@ -140,6 +139,7 @@ tok_new(void)
     tok->encoding = NULL;
     tok->cont_line = 0;
 #ifndef PGEN
+    tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
 #endif
@@ -462,6 +462,8 @@ static int
 fp_setreadl(struct tok_state *tok, const char* enc)
 {
     PyObject *readline = NULL, *stream = NULL, *io = NULL;
+    _Py_IDENTIFIER(open);
+    _Py_IDENTIFIER(readline);
     int fd;
 
     io = PyImport_ImportModuleNoBlock("io");
@@ -474,13 +476,13 @@ fp_setreadl(struct tok_state *tok, const char* enc)
         goto cleanup;
     }
 
-    stream = PyObject_CallMethod(io, "open", "isisOOO",
+    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
     if (stream == NULL)
         goto cleanup;
 
     Py_XDECREF(tok->decoding_readline);
-    readline = PyObject_GetAttrString(stream, "readline");
+    readline = _PyObject_GetAttrId(stream, &PyId_readline);
     tok->decoding_readline = readline;
 
     /* The file has been reopened; parsing will restart from
@@ -545,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
 {
     char *line = NULL;
     int badchar = 0;
-    PyObject *filename;
     for (;;) {
         if (tok->decoding_state == STATE_NORMAL) {
             /* We already have a codec associated with
@@ -586,19 +587,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
     if (badchar) {
         /* Need to add 1 to the line number, since this line
            has not been counted, yet.  */
-        if (tok->filename != NULL)
-            filename = PyUnicode_DecodeFSDefault(tok->filename);
-        else
-            filename = PyUnicode_FromString("<file>");
-        if (filename != NULL) {
-            PyErr_Format(PyExc_SyntaxError,
-                         "Non-UTF-8 code starting with '\\x%.2x' "
-                         "in file %U on line %i, "
-                         "but no encoding declared; "
-                         "see http://python.org/dev/peps/pep-0263/ for details",
-                         badchar, filename, tok->lineno + 1);
-            Py_DECREF(filename);
-        }
+        PyErr_Format(PyExc_SyntaxError,
+                     "Non-UTF-8 code starting with '\\x%.2x' "
+                     "in file %U on line %i, "
+                     "but no encoding declared; "
+                     "see http://python.org/dev/peps/pep-0263/ for details",
+                     badchar, tok->filename, tok->lineno + 1);
         return error_ret(tok);
     }
 #endif
@@ -856,6 +850,7 @@ PyTokenizer_Free(struct tok_state *tok)
 #ifndef PGEN
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->filename);
 #endif
     if (tok->fp != NULL && tok->buf != NULL)
         PyMem_FREE(tok->buf);
@@ -1250,8 +1245,13 @@ indenterror(struct tok_state *tok)
         return 1;
     }
     if (tok->altwarning) {
-        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
+#ifdef PGEN
+        PySys_WriteStderr("inconsistent use of tabs and spaces "
+                          "in indentation\n");
+#else
+        PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
                           "in indentation\n", tok->filename);
+#endif
         tok->altwarning = 0;
     }
     return 0;
@@ -1260,14 +1260,16 @@ indenterror(struct tok_state *tok)
 #ifdef PGEN
 #define verify_identifier(tok) 1
 #else
-/* Verify that the identifier follows PEP 3131. */
+/* Verify that the identifier follows PEP 3131.
+   All identifier strings are guaranteed to be "ready" unicode objects.
+ */
 static int
 verify_identifier(struct tok_state *tok)
 {
     PyObject *s;
     int result;
     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
-    if (s == NULL) {
+    if (s == NULL || PyUnicode_READY(s) == -1) {
         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
             PyErr_Clear();
             tok->done = E_IDENTIFIER;
@@ -1410,13 +1412,20 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
     /* Identifier (most frequent token!) */
     nonascii = 0;
     if (is_potential_identifier_start(c)) {
-        /* Process b"", r"" and br"" */
-        if (c == 'b' || c == 'B') {
-            c = tok_nextc(tok);
-            if (c == '"' || c == '\'')
-                goto letter_quote;
-        }
-        if (c == 'r' || c == 'R') {
+        /* Process b"", r"", u"", br"" and rb"" */
+        int saw_b = 0, saw_r = 0, saw_u = 0;
+        while (1) {
+            if (!(saw_b || saw_u) && (c == 'b' || c == 'B'))
+                saw_b = 1;
+            /* Since this is a backwards compatibility support literal we don't
+               want to support it in arbitrary order like byte literals. */
+            else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U'))
+                saw_u = 1;
+            /* ur"" and ru"" are not supported */
+            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R'))
+                saw_r = 1;
+            else
+                break;
             c = tok_nextc(tok);
             if (c == '"' || c == '\'')
                 goto letter_quote;
@@ -1692,17 +1701,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
     return result;
 }
 
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+   the file starts with a BOM.
 
-   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
-   the first or second line of the file (in which case the encoding
-   should be assumed to be PyUnicode_GetDefaultEncoding()).
+   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+   encoding in the first or second line of the file (in which case the encoding
+   should be assumed to be UTF-8).
+
+   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+   by the caller. */
 
-   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
-   by the caller.
-*/
 char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
 {
     struct tok_state *tok;
     FILE *fp;
@@ -1721,6 +1731,20 @@ PyTokenizer_FindEncoding(int fd)
         fclose(fp);
         return NULL;
     }
+#ifndef PGEN
+    if (filename != NULL) {
+        Py_INCREF(filename);
+        tok->filename = filename;
+    }
+    else {
+        tok->filename = PyUnicode_FromString("<string>");
+        if (tok->filename == NULL) {
+            fclose(fp);
+            PyTokenizer_Free(tok);
+            return encoding;
+        }
+    }
+#endif
     while (tok->lineno < 2 && tok->done == E_OK) {
         PyTokenizer_Get(tok, &p_start, &p_end);
     }
@@ -1734,6 +1758,12 @@
     return encoding;
 }
 
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+    return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
 #ifdef Py_DEBUG
 
 void
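Note (illustrative sketch, not part of the patch): fp_setreadl() now goes through the _Py_IDENTIFIER machinery, which caches the attribute/method name as a static identifier instead of rebuilding a Python string from a char* on every call, as PyObject_CallMethod() and PyObject_GetAttrString() do. A minimal usage sketch of that pattern; call_readline_once() is a hypothetical helper, not CPython code:

#include "Python.h"

/* Call stream.readline() once; the name object is created lazily and
   cached in the static PyId_readline for all subsequent calls. */
static PyObject *
call_readline_once(PyObject *stream)
{
    _Py_IDENTIFIER(readline);   /* defines a static _Py_Identifier PyId_readline */
    return _PyObject_CallMethodId(stream, &PyId_readline, NULL);
}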
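Note (reviewer sketch, not part of the patch): the new saw_b/saw_r/saw_u loop in tok_get() accepts b/B and r/R in either order (b"", r"", br"", rb""), accepts u/U only on its own, and keeps rejecting ur"" and ru"". The standalone program below exercises the same acceptance rules outside the tokenizer; accept_prefix() is a hypothetical helper written only for illustration:

#include <stdio.h>

/* Return 1 if the string-literal prefix is accepted under the rules above. */
static int
accept_prefix(const char *p)
{
    int saw_b = 0, saw_r = 0, saw_u = 0;
    for (; *p; p++) {
        if (!(saw_b || saw_u) && (*p == 'b' || *p == 'B'))
            saw_b = 1;
        else if (!(saw_b || saw_u || saw_r) && (*p == 'u' || *p == 'U'))
            saw_u = 1;
        else if (!(saw_r || saw_u) && (*p == 'r' || *p == 'R'))
            saw_r = 1;
        else
            return 0;
    }
    return 1;
}

int
main(void)
{
    const char *samples[] = {"b", "r", "u", "br", "rb", "ur", "ru", "bu"};
    size_t i;
    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("%-2s -> %s\n", samples[i],
               accept_prefix(samples[i]) ? "accepted" : "rejected");
    return 0;
}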
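Note (hypothetical caller sketch, not part of the patch): PyTokenizer_FindEncodingFilename() hands ownership of the returned char* to the caller, who must release it with PyMem_FREE(); when it returns NULL, the encoding should be assumed to be UTF-8, per the updated comment. get_source_encoding() below is an invented helper, and the extern declaration stands in for having Parser/tokenizer.h on the include path:

#include "Python.h"

/* Defined in Parser/tokenizer.c. */
extern char *PyTokenizer_FindEncodingFilename(int fd, PyObject *filename);

static PyObject *
get_source_encoding(int fd, PyObject *filename)
{
    char *enc = PyTokenizer_FindEncodingFilename(fd, filename);
    PyObject *result;

    if (enc == NULL) {
        if (PyErr_Occurred())
            return NULL;                      /* propagate a real error */
        /* No coding cookie or BOM in the first two lines: assume UTF-8. */
        return PyUnicode_FromString("utf-8");
    }
    result = PyUnicode_FromString(enc);
    PyMem_FREE(enc);                          /* the char* is owned by the caller */
    return result;
}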