Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 1493
1 file changed, 707 insertions, 786 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f84093d..8966661 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -2,6 +2,7 @@ /* Tokenizer implementation */ #include "Python.h" +#include "pgenheaders.h" #include <ctype.h> #include <assert.h> @@ -9,29 +10,16 @@ #include "tokenizer.h" #include "errcode.h" +#ifndef PGEN #include "unicodeobject.h" -#include "bytesobject.h" +#include "stringobject.h" #include "fileobject.h" #include "codecs.h" #include "abstract.h" +#include "pydebug.h" +#endif /* PGEN */ -/* Alternate tab spacing */ -#define ALTTABSIZE 1 - -#define is_potential_identifier_start(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || c == '_'\ - || (c >= 128)) - -#define is_potential_identifier_char(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || (c >= '0' && c <= '9')\ - || c == '_'\ - || (c >= 128)) - -extern char *PyOS_Readline(FILE *, FILE *, const char *); +extern char *PyOS_Readline(FILE *, FILE *, char *); /* Return malloc'ed string including trailing \n; empty malloc'ed string for EOF; NULL if interrupted */ @@ -44,10 +32,65 @@ static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); static void tok_backup(struct tok_state *tok, int c); - -/* Spaces in this constant are treated as "zero or more spaces or tabs" when - tokenizing. */ -static const char* type_comment_prefix = "# type: "; +/* Token names */ + +char *_PyParser_TokenNames[] = { + "ENDMARKER", + "NAME", + "NUMBER", + "STRING", + "NEWLINE", + "INDENT", + "DEDENT", + "LPAR", + "RPAR", + "LSQB", + "RSQB", + "COLON", + "COMMA", + "SEMI", + "PLUS", + "MINUS", + "STAR", + "SLASH", + "VBAR", + "AMPER", + "LESS", + "GREATER", + "EQUAL", + "DOT", + "PERCENT", + "BACKQUOTE", + "LBRACE", + "RBRACE", + "EQEQUAL", + "NOTEQUAL", + "LESSEQUAL", + "GREATEREQUAL", + "TILDE", + "CIRCUMFLEX", + "LEFTSHIFT", + "RIGHTSHIFT", + "DOUBLESTAR", + "PLUSEQUAL", + "MINEQUAL", + "STAREQUAL", + "SLASHEQUAL", + "PERCENTEQUAL", + "AMPEREQUAL", + "VBAREQUAL", + "CIRCUMFLEXEQUAL", + "LEFTSHIFTEQUAL", + "RIGHTSHIFTEQUAL", + "DOUBLESTAREQUAL", + "DOUBLESLASH", + "DOUBLESLASHEQUAL", + "AT", + /* This table must match the #defines in token.h! 
*/ + "OP", + "<ERRORTOKEN>", + "<N_TOKENS>" +}; /* Create and initialize a new tok_state structure */ @@ -65,45 +108,61 @@ tok_new(void) tok->tabsize = TABSIZE; tok->indent = 0; tok->indstack[0] = 0; - tok->atbol = 1; tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; tok->lineno = 0; tok->level = 0; + tok->filename = NULL; + tok->altwarning = 0; + tok->alterror = 0; + tok->alttabsize = 1; tok->altindstack[0] = 0; - tok->decoding_state = STATE_INIT; + tok->decoding_state = 0; tok->decoding_erred = 0; tok->read_coding_spec = 0; - tok->enc = NULL; tok->encoding = NULL; tok->cont_line = 0; - tok->filename = NULL; +#ifndef PGEN tok->decoding_readline = NULL; tok->decoding_buffer = NULL; - tok->type_comments = 0; - - tok->async_hacks = 0; - tok->async_def = 0; - tok->async_def_indent = 0; - tok->async_def_nl = 0; - +#endif return tok; } static char * -new_string(const char *s, Py_ssize_t len, struct tok_state *tok) +new_string(const char *s, Py_ssize_t len) { char* result = (char *)PyMem_MALLOC(len + 1); - if (!result) { - tok->done = E_NOMEM; - return NULL; + if (result != NULL) { + memcpy(result, s, len); + result[len] = '\0'; } - memcpy(result, s, len); - result[len] = '\0'; return result; } +#ifdef PGEN + +static char * +decoding_fgets(char *s, int size, struct tok_state *tok) +{ + return fgets(s, size, tok->fp); +} + +static int +decoding_feof(struct tok_state *tok) +{ + return feof(tok->fp); +} + +static char * +decode_str(const char *str, int exec_input, struct tok_state *tok) +{ + return new_string(str, strlen(str)); +} + +#else /* PGEN */ + static char * error_ret(struct tok_state *tok) /* XXX */ { @@ -116,8 +175,8 @@ error_ret(struct tok_state *tok) /* XXX */ } -static const char * -get_normal_name(const char *s) /* for utf-8 and latin-1 */ +static char * +get_normal_name(char *s) /* for utf-8 and latin-1 */ { char buf[13]; int i; @@ -147,18 +206,17 @@ get_normal_name(const char *s) /* for utf-8 and latin-1 */ /* Return the coding spec in S, or NULL if none is found. */ -static int -get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok) +static char * +get_coding_spec(const char *s, Py_ssize_t size) { Py_ssize_t i; - *spec = NULL; /* Coding spec must be in a comment, and that comment must be * the only statement on the source code line. */ for (i = 0; i < size - 6; i++) { if (s[i] == '#') break; if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') - return 1; + return NULL; } for (; i < size - 6; i++) { /* XXX inefficient search */ const char* t = s + i; @@ -177,23 +235,20 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t t++; if (begin < t) { - char* r = new_string(begin, t - begin, tok); - const char* q; + char* r = new_string(begin, t - begin); + char* q; if (!r) - return 0; + return NULL; q = get_normal_name(r); if (r != q) { PyMem_FREE(r); - r = new_string(q, strlen(q), tok); - if (!r) - return 0; + r = new_string(q, strlen(q)); } - *spec = r; - break; + return r; } } } - return 1; + return NULL; } /* Check whether the line contains a coding spec. 
If it does, @@ -205,7 +260,7 @@ static int check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, int set_readline(struct tok_state *, const char *)) { - char *cs; + char * cs; int r = 1; if (tok->cont_line) { @@ -213,8 +268,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, tok->read_coding_spec = 1; return 1; } - if (!get_coding_spec(line, &cs, size, tok)) - return 0; + cs = get_coding_spec(line, size); if (!cs) { Py_ssize_t i; for (i = 0; i < size; i++) { @@ -227,31 +281,40 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, break; } } - return 1; - } - tok->read_coding_spec = 1; - if (tok->encoding == NULL) { - assert(tok->decoding_state == STATE_RAW); - if (strcmp(cs, "utf-8") == 0) { - tok->encoding = cs; - } else { - r = set_readline(tok, cs); - if (r) { + } else { + tok->read_coding_spec = 1; + if (tok->encoding == NULL) { + assert(tok->decoding_state == 1); /* raw */ + if (strcmp(cs, "utf-8") == 0 || + strcmp(cs, "iso-8859-1") == 0) { tok->encoding = cs; - tok->decoding_state = STATE_NORMAL; - } - else { - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s", cs); + } else { +#ifdef Py_USING_UNICODE + r = set_readline(tok, cs); + if (r) { + tok->encoding = cs; + tok->decoding_state = -1; + } + else { + PyErr_Format(PyExc_SyntaxError, + "encoding problem: %s", cs); + PyMem_FREE(cs); + } +#else + /* Without Unicode support, we cannot + process the coding spec. Since there + won't be any Unicode literals, that + won't matter. */ PyMem_FREE(cs); +#endif } + } else { /* then, compare cs with BOM */ + r = (strcmp(tok->encoding, cs) == 0); + if (!r) + PyErr_Format(PyExc_SyntaxError, + "encoding problem: %s with BOM", cs); + PyMem_FREE(cs); } - } else { /* then, compare cs with BOM */ - r = (strcmp(tok->encoding, cs) == 0); - if (!r) - PyErr_Format(PyExc_SyntaxError, - "encoding problem: %s with BOM", cs); - PyMem_FREE(cs); } return r; } @@ -268,7 +331,7 @@ check_bom(int get_char(struct tok_state *), { int ch1, ch2, ch3; ch1 = get_char(tok); - tok->decoding_state = STATE_RAW; + tok->decoding_state = 1; if (ch1 == EOF) { return 1; } else if (ch1 == 0xEF) { @@ -297,7 +360,7 @@ check_bom(int get_char(struct tok_state *), } if (!set_readline(tok, "utf-16-be")) return 0; - tok->decoding_state = STATE_NORMAL; + tok->decoding_state = -1; } else if (ch1 == 0xFF) { ch2 = get_char(tok); if (ch2 != 0xFE) { @@ -307,7 +370,7 @@ check_bom(int get_char(struct tok_state *), } if (!set_readline(tok, "utf-16-le")) return 0; - tok->decoding_state = STATE_NORMAL; + tok->decoding_state = -1; #endif } else { unget_char(ch1, tok); @@ -315,10 +378,7 @@ check_bom(int get_char(struct tok_state *), } if (tok->encoding != NULL) PyMem_FREE(tok->encoding); - tok->encoding = new_string("utf-8", 5, tok); - if (!tok->encoding) - return 0; - /* No need to set_readline: input is already utf-8 */ + tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */ return 1; } @@ -329,7 +389,7 @@ check_bom(int get_char(struct tok_state *), 1) NULL: need to call tok->decoding_readline to get a new line 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and stored the result in tok->decoding_buffer - 3) PyByteArrayObject *: previous call to fp_readl did not have enough room + 3) PyStringObject *: previous call to fp_readl did not have enough room (in the s buffer) to copy entire contents of the line read by tok->decoding_readline. tok->decoding_buffer has the overflow. 
In this case, fp_readl is called in a loop (with an expanded buffer) @@ -340,62 +400,58 @@ check_bom(int get_char(struct tok_state *), static char * fp_readl(char *s, int size, struct tok_state *tok) { - PyObject* bufobj; - const char *buf; - Py_ssize_t buflen; +#ifndef Py_USING_UNICODE + /* In a non-Unicode built, this should never be called. */ + Py_FatalError("fp_readl should not be called in this build."); + return NULL; /* Keep compiler happy (not reachable) */ +#else + PyObject* utf8 = NULL; + PyObject* buf = tok->decoding_buffer; + char *str; + Py_ssize_t utf8len; /* Ask for one less byte so we can terminate it */ assert(size > 0); size--; - if (tok->decoding_buffer) { - bufobj = tok->decoding_buffer; - Py_INCREF(bufobj); - } - else - { - bufobj = _PyObject_CallNoArg(tok->decoding_readline); - if (bufobj == NULL) - goto error; - } - if (PyUnicode_CheckExact(bufobj)) - { - buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen); - if (buf == NULL) { - goto error; + if (buf == NULL) { + buf = PyObject_CallObject(tok->decoding_readline, NULL); + if (buf == NULL) + return error_ret(tok); + if (!PyUnicode_Check(buf)) { + Py_DECREF(buf); + PyErr_SetString(PyExc_SyntaxError, + "codec did not return a unicode object"); + return error_ret(tok); } + } else { + tok->decoding_buffer = NULL; + if (PyString_CheckExact(buf)) + utf8 = buf; } - else - { - buf = PyByteArray_AsString(bufobj); - if (buf == NULL) { - goto error; - } - buflen = PyByteArray_GET_SIZE(bufobj); + if (utf8 == NULL) { + utf8 = PyUnicode_AsUTF8String(buf); + Py_DECREF(buf); + if (utf8 == NULL) + return error_ret(tok); } - - Py_XDECREF(tok->decoding_buffer); - if (buflen > size) { - /* Too many chars, the rest goes into tok->decoding_buffer */ - tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size, - buflen-size); - if (tok->decoding_buffer == NULL) - goto error; - buflen = size; + str = PyString_AsString(utf8); + utf8len = PyString_GET_SIZE(utf8); + if (utf8len > size) { + tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size); + if (tok->decoding_buffer == NULL) { + Py_DECREF(utf8); + return error_ret(tok); + } + utf8len = size; } - else - tok->decoding_buffer = NULL; - - memcpy(s, buf, buflen); - s[buflen] = '\0'; - if (buflen == 0) /* EOF */ - s = NULL; - Py_DECREF(bufobj); + memcpy(s, str, utf8len); + s[utf8len] = '\0'; + Py_DECREF(utf8); + if (utf8len == 0) + return NULL; /* EOF */ return s; - -error: - Py_XDECREF(bufobj); - return error_ret(tok); +#endif } /* Set the readline function for TOK to a StreamReader's @@ -411,48 +467,24 @@ error: static int fp_setreadl(struct tok_state *tok, const char* enc) { - PyObject *readline, *io, *stream; - _Py_IDENTIFIER(open); - _Py_IDENTIFIER(readline); - int fd; - long pos; - - fd = fileno(tok->fp); - /* Due to buffering the file offset for fd can be different from the file - * position of tok->fp. If tok->fp was opened in text mode on Windows, - * its file position counts CRLF as one char and can't be directly mapped - * to the file offset for fd. Instead we step back one byte and read to - * the end of line.*/ - pos = ftell(tok->fp); - if (pos == -1 || - lseek(fd, (off_t)(pos > 0 ? 
pos - 1 : pos), SEEK_SET) == (off_t)-1) { - PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); - return 0; - } + PyObject *reader, *stream, *readline; - io = PyImport_ImportModuleNoBlock("io"); - if (io == NULL) - return 0; - - stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO", - fd, "r", -1, enc, Py_None, Py_None, Py_False); - Py_DECREF(io); + /* XXX: constify filename argument. */ + stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL); if (stream == NULL) return 0; - readline = _PyObject_GetAttrId(stream, &PyId_readline); + reader = PyCodec_StreamReader(enc, stream, NULL); Py_DECREF(stream); - if (readline == NULL) + if (reader == NULL) return 0; - Py_XSETREF(tok->decoding_readline, readline); - if (pos > 0) { - PyObject *bufobj = _PyObject_CallNoArg(readline); - if (bufobj == NULL) - return 0; - Py_DECREF(bufobj); - } + readline = PyObject_GetAttrString(reader, "readline"); + Py_DECREF(reader); + if (readline == NULL) + return 0; + tok->decoding_readline = readline; return 1; } @@ -468,34 +500,6 @@ static void fp_ungetc(int c, struct tok_state *tok) { ungetc(c, tok->fp); } -/* Check whether the characters at s start a valid - UTF-8 sequence. Return the number of characters forming - the sequence if yes, 0 if not. */ -static int valid_utf8(const unsigned char* s) -{ - int expected = 0; - int length; - if (*s < 0x80) - /* single-byte code */ - return 1; - if (*s < 0xc0) - /* following byte */ - return 0; - if (*s < 0xE0) - expected = 1; - else if (*s < 0xF0) - expected = 2; - else if (*s < 0xF8) - expected = 3; - else - return 0; - length = expected + 1; - for (; expected; expected--) - if (s[expected] < 0x80 || s[expected] >= 0xC0) - return 0; - return length; -} - /* Read a line of input from TOK. Determine encoding if necessary. */ @@ -505,12 +509,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok) char *line = NULL; int badchar = 0; for (;;) { - if (tok->decoding_state == STATE_NORMAL) { + if (tok->decoding_state < 0) { /* We already have a codec associated with this input. */ line = fp_readl(s, size, tok); break; - } else if (tok->decoding_state == STATE_RAW) { + } else if (tok->decoding_state > 0) { /* We want a 'raw' read. */ line = Py_UniversalNewlineFgets(s, size, tok->fp, NULL); @@ -521,7 +525,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok) reader functions from now on. */ if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) return error_ret(tok); - assert(tok->decoding_state != STATE_INIT); + assert(tok->decoding_state != 0); } } if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) { @@ -529,40 +533,43 @@ decoding_fgets(char *s, int size, struct tok_state *tok) return error_ret(tok); } } - /* The default encoding is UTF-8, so make sure we don't have any - non-UTF-8 sequences in it. */ +#ifndef PGEN + /* The default encoding is ASCII, so make sure we don't have any + non-ASCII bytes in it. */ if (line && !tok->encoding) { unsigned char *c; - int length; - for (c = (unsigned char *)line; *c; c += length) - if (!(length = valid_utf8(c))) { + for (c = (unsigned char *)line; *c; c++) + if (*c > 127) { badchar = *c; break; } } if (badchar) { + char buf[500]; /* Need to add 1 to the line number, since this line has not been counted, yet. 
*/ - PyErr_Format(PyExc_SyntaxError, - "Non-UTF-8 code starting with '\\x%.2x' " - "in file %U on line %i, " - "but no encoding declared; " - "see http://python.org/dev/peps/pep-0263/ for details", - badchar, tok->filename, tok->lineno + 1); + sprintf(buf, + "Non-ASCII character '\\x%.2x' " + "in file %.200s on line %i, " + "but no encoding declared; " + "see http://python.org/dev/peps/pep-0263/ for details", + badchar, tok->filename, tok->lineno + 1); + PyErr_SetString(PyExc_SyntaxError, buf); return error_ret(tok); } +#endif return line; } static int decoding_feof(struct tok_state *tok) { - if (tok->decoding_state != STATE_NORMAL) { + if (tok->decoding_state >= 0) { return feof(tok->fp); } else { PyObject* buf = tok->decoding_buffer; if (buf == NULL) { - buf = _PyObject_CallNoArg(tok->decoding_readline); + buf = PyObject_CallObject(tok->decoding_readline, NULL); if (buf == NULL) { error_ret(tok); return 1; @@ -601,6 +608,7 @@ buf_setreadl(struct tok_state *tok, const char* enc) { /* Return a UTF-8 encoding Python string object from the C byte string STR, which is encoded with ENC. */ +#ifdef Py_USING_UNICODE static PyObject * translate_into_utf8(const char* str, const char* enc) { PyObject *utf8; @@ -611,12 +619,12 @@ translate_into_utf8(const char* str, const char* enc) { Py_DECREF(buf); return utf8; } +#endif static char * translate_newlines(const char *s, int exec_input, struct tok_state *tok) { - int skip_next_lf = 0; - size_t needed_length = strlen(s) + 2, final_length; + int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length; char *buf, *current; char c = '\0'; buf = PyMem_MALLOC(needed_length); @@ -680,12 +688,14 @@ decode_str(const char *input, int single, struct tok_state *tok) return error_ret(tok); str = tok->str; /* string after BOM if any */ assert(str); +#ifdef Py_USING_UNICODE if (tok->enc != NULL) { utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); - str = PyBytes_AsString(utf8); + str = PyString_AsString(utf8); } +#endif for (s = str;; s++) { if (*s == '\0') break; else if (*s == '\n') { @@ -707,18 +717,22 @@ decode_str(const char *input, int single, struct tok_state *tok) return error_ret(tok); } } +#ifdef Py_USING_UNICODE if (tok->enc != NULL) { assert(utf8 == NULL); utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); - str = PyBytes_AS_STRING(utf8); + str = PyString_AsString(utf8); } +#endif assert(tok->decoding_buffer == NULL); tok->decoding_buffer = utf8; /* CAUTION */ return str; } +#endif /* PGEN */ + /* Set up tokenizer for string */ struct tok_state * @@ -727,7 +741,7 @@ PyTokenizer_FromString(const char *str, int exec_input) struct tok_state *tok = tok_new(); if (tok == NULL) return NULL; - str = decode_str(str, exec_input, tok); + str = (char *)decode_str(str, exec_input, tok); if (str == NULL) { PyTokenizer_Free(tok); return NULL; @@ -738,38 +752,11 @@ PyTokenizer_FromString(const char *str, int exec_input) return tok; } -struct tok_state * -PyTokenizer_FromUTF8(const char *str, int exec_input) -{ - struct tok_state *tok = tok_new(); - if (tok == NULL) - return NULL; - tok->input = str = translate_newlines(str, exec_input, tok); - if (str == NULL) { - PyTokenizer_Free(tok); - return NULL; - } - tok->decoding_state = STATE_RAW; - tok->read_coding_spec = 1; - tok->enc = NULL; - tok->str = str; - tok->encoding = (char *)PyMem_MALLOC(6); - if (!tok->encoding) { - PyTokenizer_Free(tok); - return NULL; - } - strcpy(tok->encoding, "utf-8"); - - /* XXX: constify members. 
*/ - tok->buf = tok->cur = tok->end = tok->inp = (char*)str; - return tok; -} /* Set up tokenizer for file */ struct tok_state * -PyTokenizer_FromFile(FILE *fp, const char* enc, - const char *ps1, const char *ps2) +PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2) { struct tok_state *tok = tok_new(); if (tok == NULL) @@ -783,17 +770,6 @@ PyTokenizer_FromFile(FILE *fp, const char* enc, tok->fp = fp; tok->prompt = ps1; tok->nextprompt = ps2; - if (enc != NULL) { - /* Must copy encoding declaration since it - gets copied into the parse tree. */ - tok->encoding = PyMem_MALLOC(strlen(enc)+1); - if (!tok->encoding) { - PyTokenizer_Free(tok); - return NULL; - } - strcpy(tok->encoding, enc); - tok->decoding_state = STATE_NORMAL; - } return tok; } @@ -805,9 +781,10 @@ PyTokenizer_Free(struct tok_state *tok) { if (tok->encoding != NULL) PyMem_FREE(tok->encoding); +#ifndef PGEN Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); - Py_XDECREF(tok->filename); +#endif if (tok->fp != NULL && tok->buf != NULL) PyMem_FREE(tok->buf); if (tok->input) @@ -815,10 +792,74 @@ PyTokenizer_Free(struct tok_state *tok) PyMem_FREE(tok); } +#if !defined(PGEN) && defined(Py_USING_UNICODE) +static int +tok_stdin_decode(struct tok_state *tok, char **inp) +{ + PyObject *enc, *sysstdin, *decoded, *utf8; + const char *encoding; + char *converted; + + if (PySys_GetFile((char *)"stdin", NULL) != stdin) + return 0; + sysstdin = PySys_GetObject("stdin"); + if (sysstdin == NULL || !PyFile_Check(sysstdin)) + return 0; + + enc = ((PyFileObject *)sysstdin)->f_encoding; + if (enc == NULL || !PyString_Check(enc)) + return 0; + Py_INCREF(enc); + + encoding = PyString_AsString(enc); + decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); + if (decoded == NULL) + goto error_clear; + + utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); + Py_DECREF(decoded); + if (utf8 == NULL) + goto error_clear; + + assert(PyString_Check(utf8)); + converted = new_string(PyString_AS_STRING(utf8), + PyString_GET_SIZE(utf8)); + Py_DECREF(utf8); + if (converted == NULL) + goto error_nomem; + + PyMem_FREE(*inp); + *inp = converted; + if (tok->encoding != NULL) + PyMem_FREE(tok->encoding); + tok->encoding = new_string(encoding, strlen(encoding)); + if (tok->encoding == NULL) + goto error_nomem; + + Py_DECREF(enc); + return 0; + +error_nomem: + Py_DECREF(enc); + tok->done = E_NOMEM; + return -1; + +error_clear: + Py_DECREF(enc); + if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { + tok->done = E_ERROR; + return -1; + } + /* Fallback to iso-8859-1: for backward compatibility */ + PyErr_Clear(); + return 0; +} +#endif + /* Get next char, updating state; error code goes into tok->done */ static int -tok_nextc(struct tok_state *tok) +tok_nextc(register struct tok_state *tok) { for (;;) { if (tok->cur != tok->inp) { @@ -846,34 +887,6 @@ tok_nextc(struct tok_state *tok) } if (tok->prompt != NULL) { char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); - if (newtok != NULL) { - char *translated = translate_newlines(newtok, 0, tok); - PyMem_FREE(newtok); - if (translated == NULL) - return EOF; - newtok = translated; - } - if (tok->encoding && newtok && *newtok) { - /* Recode to UTF-8 */ - Py_ssize_t buflen; - const char* buf; - PyObject *u = translate_into_utf8(newtok, tok->encoding); - PyMem_FREE(newtok); - if (!u) { - tok->done = E_DECODE; - return EOF; - } - buflen = PyBytes_GET_SIZE(u); - buf = PyBytes_AS_STRING(u); - newtok = PyMem_MALLOC(buflen+1); - if (newtok == NULL) { - Py_DECREF(u); - tok->done = E_NOMEM; - 
return EOF; - } - strcpy(newtok, buf); - Py_DECREF(u); - } if (tok->nextprompt != NULL) tok->prompt = tok->nextprompt; if (newtok == NULL) @@ -882,6 +895,10 @@ tok_nextc(struct tok_state *tok) PyMem_FREE(newtok); tok->done = E_EOF; } +#if !defined(PGEN) && defined(Py_USING_UNICODE) + else if (tok_stdin_decode(tok, &newtok) != 0) + PyMem_FREE(newtok); +#endif else if (tok->start != NULL) { size_t start = tok->start - tok->buf; size_t oldlen = tok->cur - tok->buf; @@ -956,7 +973,6 @@ tok_nextc(struct tok_state *tok) while (!done) { Py_ssize_t curstart = tok->start == NULL ? -1 : tok->start - tok->buf; - Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; Py_ssize_t curvalid = tok->inp - tok->buf; Py_ssize_t newsize = curvalid + BUFSIZ; char *newbuf = tok->buf; @@ -969,7 +985,6 @@ tok_nextc(struct tok_state *tok) } tok->buf = newbuf; tok->cur = tok->buf + cur; - tok->multi_line_start = tok->buf + cur_multi_line_start; tok->line_start = tok->cur; tok->inp = tok->buf + curvalid; tok->end = tok->buf + newsize; @@ -985,8 +1000,7 @@ tok_nextc(struct tok_state *tok) return EOF; /* Last line does not end in \n, fake one */ - if (tok->inp[-1] != '\n') - strcpy(tok->inp, "\n"); + strcpy(tok->inp, "\n"); } tok->inp = strchr(tok->inp, '\0'); done = tok->inp[-1] == '\n'; @@ -1018,7 +1032,7 @@ tok_nextc(struct tok_state *tok) /* Back-up one character */ static void -tok_backup(struct tok_state *tok, int c) +tok_backup(register struct tok_state *tok, register int c) { if (c != EOF) { if (--tok->cur < tok->buf) @@ -1029,88 +1043,185 @@ tok_backup(struct tok_state *tok, int c) } -static int -syntaxerror(struct tok_state *tok, const char *format, ...) -{ - va_list vargs; -#ifdef HAVE_STDARG_PROTOTYPES - va_start(vargs, format); -#else - va_start(vargs); -#endif - PyErr_FormatV(PyExc_SyntaxError, format, vargs); - va_end(vargs); - PyErr_SyntaxLocationObject(tok->filename, - tok->lineno, - (int)(tok->cur - tok->line_start)); - tok->done = E_ERROR; - return ERRORTOKEN; -} +/* Return the token corresponding to a single character */ -static int -indenterror(struct tok_state *tok) +int +PyToken_OneChar(int c) { - tok->done = E_TABSPACE; - tok->cur = tok->inp; - return ERRORTOKEN; + switch (c) { + case '(': return LPAR; + case ')': return RPAR; + case '[': return LSQB; + case ']': return RSQB; + case ':': return COLON; + case ',': return COMMA; + case ';': return SEMI; + case '+': return PLUS; + case '-': return MINUS; + case '*': return STAR; + case '/': return SLASH; + case '|': return VBAR; + case '&': return AMPER; + case '<': return LESS; + case '>': return GREATER; + case '=': return EQUAL; + case '.': return DOT; + case '%': return PERCENT; + case '`': return BACKQUOTE; + case '{': return LBRACE; + case '}': return RBRACE; + case '^': return CIRCUMFLEX; + case '~': return TILDE; + case '@': return AT; + default: return OP; + } } -/* Verify that the identifier follows PEP 3131. - All identifier strings are guaranteed to be "ready" unicode objects. 
- */ -static int -verify_identifier(struct tok_state *tok) + +int +PyToken_TwoChars(int c1, int c2) { - PyObject *s; - int result; - if (tok->decoding_erred) - return 0; - s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); - if (s == NULL) { - if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { - PyErr_Clear(); - tok->done = E_IDENTIFIER; - } else { - tok->done = E_ERROR; + switch (c1) { + case '=': + switch (c2) { + case '=': return EQEQUAL; } - return 0; + break; + case '!': + switch (c2) { + case '=': return NOTEQUAL; + } + break; + case '<': + switch (c2) { + case '>': return NOTEQUAL; + case '=': return LESSEQUAL; + case '<': return LEFTSHIFT; + } + break; + case '>': + switch (c2) { + case '=': return GREATEREQUAL; + case '>': return RIGHTSHIFT; + } + break; + case '+': + switch (c2) { + case '=': return PLUSEQUAL; + } + break; + case '-': + switch (c2) { + case '=': return MINEQUAL; + } + break; + case '*': + switch (c2) { + case '*': return DOUBLESTAR; + case '=': return STAREQUAL; + } + break; + case '/': + switch (c2) { + case '/': return DOUBLESLASH; + case '=': return SLASHEQUAL; + } + break; + case '|': + switch (c2) { + case '=': return VBAREQUAL; + } + break; + case '%': + switch (c2) { + case '=': return PERCENTEQUAL; + } + break; + case '&': + switch (c2) { + case '=': return AMPEREQUAL; + } + break; + case '^': + switch (c2) { + case '=': return CIRCUMFLEXEQUAL; + } + break; } - result = PyUnicode_IsIdentifier(s); - Py_DECREF(s); - if (result == 0) - tok->done = E_IDENTIFIER; - return result; + return OP; } -static int -tok_decimal_tail(struct tok_state *tok) +int +PyToken_ThreeChars(int c1, int c2, int c3) { - int c; - - while (1) { - do { - c = tok_nextc(tok); - } while (isdigit(c)); - if (c != '_') { + switch (c1) { + case '<': + switch (c2) { + case '<': + switch (c3) { + case '=': + return LEFTSHIFTEQUAL; + } break; } - c = tok_nextc(tok); - if (!isdigit(c)) { - tok_backup(tok, c); - syntaxerror(tok, "invalid decimal literal"); - return 0; + break; + case '>': + switch (c2) { + case '>': + switch (c3) { + case '=': + return RIGHTSHIFTEQUAL; + } + break; + } + break; + case '*': + switch (c2) { + case '*': + switch (c3) { + case '=': + return DOUBLESTAREQUAL; + } + break; + } + break; + case '/': + switch (c2) { + case '/': + switch (c3) { + case '=': + return DOUBLESLASHEQUAL; + } + break; } + break; + } + return OP; +} + +static int +indenterror(struct tok_state *tok) +{ + if (tok->alterror) { + tok->done = E_TABSPACE; + tok->cur = tok->inp; + return 1; + } + if (tok->altwarning) { + PySys_WriteStderr("%s: inconsistent use of tabs and spaces " + "in indentation\n", tok->filename); + tok->altwarning = 0; } - return c; + return 0; } /* Get next token, after space stripping etc. 
*/ static int -tok_get(struct tok_state *tok, char **p_start, char **p_end) +tok_get(register struct tok_state *tok, char **p_start, char **p_end) { - int c; - int blankline, nonascii; + register int c; + int blankline; *p_start = *p_end = NULL; nextline: @@ -1119,24 +1230,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Get indentation level */ if (tok->atbol) { - int col = 0; - int altcol = 0; + register int col = 0; + register int altcol = 0; tok->atbol = 0; for (;;) { c = tok_nextc(tok); - if (c == ' ') { + if (c == ' ') col++, altcol++; - } else if (c == '\t') { - col = (col / tok->tabsize + 1) * tok->tabsize; - altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; + col = (col/tok->tabsize + 1) * tok->tabsize; + altcol = (altcol/tok->alttabsize + 1) + * tok->alttabsize; } - else if (c == '\014') {/* Control-L (formfeed) */ + else if (c == '\014') /* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ - } - else { + else break; - } } tok_backup(tok, c); if (c == '#' || c == '\n') { @@ -1145,18 +1254,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ - if (col == 0 && c == '\n' && tok->prompt != NULL) { + if (col == 0 && c == '\n' && tok->prompt != NULL) blankline = 0; /* Let it through */ - } - else if (tok->prompt != NULL && tok->lineno == 1) { - /* In interactive mode, if the first line contains - only spaces and/or a comment, let it through. */ - blankline = 0; - col = altcol = 0; - } - else { + else blankline = 1; /* Ignore completely */ - } /* We can't jump back right here since we still may need to skip to the end of a comment */ } @@ -1164,7 +1265,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } } else if (col > tok->indstack[tok->indent]) { @@ -1175,7 +1277,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol <= tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1194,7 +1297,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol != tok->altindstack[tok->indent]) { - return indenterror(tok); + if (indenterror(tok)) + return ERRORTOKEN; } } } @@ -1214,31 +1318,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } } - /* Peek ahead at the next character */ - c = tok_nextc(tok); - tok_backup(tok, c); - /* Check if we are closing an async function */ - if (tok->async_def - && !blankline - /* Due to some implementation artifacts of type comments, - * a TYPE_COMMENT at the start of a function won't set an - * indentation level and it will produce a NEWLINE after it. - * To avoid spuriously ending an async function due to this, - * wait until we have some non-newline char in front of us. */ - && c != '\n' - && tok->level == 0 - /* There was a NEWLINE after ASYNC DEF, - so we're past the signature. 
*/ - && tok->async_def_nl - /* Current indentation level is less than where - the async function was defined */ - && tok->async_def_indent >= tok->indent) - { - tok->async_def = 0; - tok->async_def_indent = 0; - tok->async_def_nl = 0; - } - again: tok->start = NULL; /* Skip spaces */ @@ -1249,63 +1328,40 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Set start of current token */ tok->start = tok->cur - 1; - /* Skip comment, unless it's a type comment */ + /* Skip comment, while looking for tab-setting magic */ if (c == '#') { - const char *prefix, *p, *type_start; - - while (c != EOF && c != '\n') { - c = tok_nextc(tok); - } - - if (tok->type_comments) { - p = tok->start; - prefix = type_comment_prefix; - while (*prefix && p < tok->cur) { - if (*prefix == ' ') { - while (*p == ' ' || *p == '\t') { - p++; - } - } else if (*prefix == *p) { - p++; - } else { - break; - } - - prefix++; - } - - /* This is a type comment if we matched all of type_comment_prefix. */ - if (!*prefix) { - int is_type_ignore = 1; - const char *ignore_end = p + 6; - tok_backup(tok, c); /* don't eat the newline or EOF */ - - type_start = p; - - /* A TYPE_IGNORE is "type: ignore" followed by the end of the token - * or anything ASCII and non-alphanumeric. */ - is_type_ignore = ( - tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 - && !(tok->cur > ignore_end - && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); - - if (is_type_ignore) { - *p_start = (char *) ignore_end; - *p_end = tok->cur; - - /* If this type ignore is the only thing on the line, consume the newline also. */ - if (blankline) { - tok_nextc(tok); - tok->atbol = 1; - } - return TYPE_IGNORE; - } else { - *p_start = (char *) type_start; /* after type_comment_prefix */ - *p_end = tok->cur; - return TYPE_COMMENT; + static char *tabforms[] = { + "tab-width:", /* Emacs */ + ":tabstop=", /* vim, full form */ + ":ts=", /* vim, abbreviated form */ + "set tabsize=", /* will vi never die? */ + /* more templates can be added here to support other editors */ + }; + char cbuf[80]; + char *tp, **cp; + tp = cbuf; + do { + *tp++ = c = tok_nextc(tok); + } while (c != EOF && c != '\n' && + (size_t)(tp - cbuf + 1) < sizeof(cbuf)); + *tp = '\0'; + for (cp = tabforms; + cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]); + cp++) { + if ((tp = strstr(cbuf, *cp))) { + int newsize = atoi(tp + strlen(*cp)); + + if (newsize >= 1 && newsize <= 40) { + tok->tabsize = newsize; + if (Py_VerboseFlag) + PySys_WriteStderr( + "Tab size set to %d\n", + newsize); } } } + while (c != EOF && c != '\n') + c = tok_nextc(tok); } /* Check for EOF and errors now */ @@ -1314,108 +1370,49 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } /* Identifier (most frequent token!) */ - nonascii = 0; - if (is_potential_identifier_start(c)) { - /* Process the various legal combinations of b"", r"", u"", and f"". */ - int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; - while (1) { - if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) - saw_b = 1; - /* Since this is a backwards compatibility support literal we don't - want to support it in arbitrary order like byte literals. 
*/ - else if (!(saw_b || saw_u || saw_r || saw_f) - && (c == 'u'|| c == 'U')) { - saw_u = 1; - } - /* ur"" and ru"" are not supported */ - else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { - saw_r = 1; - } - else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { - saw_f = 1; - } - else { - break; - } + if (Py_ISALPHA(c) || c == '_') { + /* Process r"", u"" and ur"" */ + switch (c) { + case 'b': + case 'B': c = tok_nextc(tok); - if (c == '"' || c == '\'') { + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') goto letter_quote; - } + break; + case 'r': + case 'R': + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; + case 'u': + case 'U': + c = tok_nextc(tok); + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; } - while (is_potential_identifier_char(c)) { - if (c >= 128) { - nonascii = 1; - } + while (c != EOF && (Py_ISALNUM(c) || c == '_')) { c = tok_nextc(tok); } tok_backup(tok, c); - if (nonascii && !verify_identifier(tok)) { - return ERRORTOKEN; - } *p_start = tok->start; *p_end = tok->cur; - - /* async/await parsing block. */ - if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { - /* May be an 'async' or 'await' token. For Python 3.7 or - later we recognize them unconditionally. For Python - 3.5 or 3.6 we recognize 'async' in front of 'def', and - either one inside of 'async def'. (Technically we - shouldn't recognize these at all for 3.4 or earlier, - but there's no *valid* Python 3.4 code that would be - rejected, and async functions will be rejected in a - later phase.) */ - if (!tok->async_hacks || tok->async_def) { - /* Always recognize the keywords. */ - if (memcmp(tok->start, "async", 5) == 0) { - return ASYNC; - } - if (memcmp(tok->start, "await", 5) == 0) { - return AWAIT; - } - } - else if (memcmp(tok->start, "async", 5) == 0) { - /* The current token is 'async'. - Look ahead one token to see if that is 'def'. */ - - struct tok_state ahead_tok; - char *ahead_tok_start = NULL, *ahead_tok_end = NULL; - int ahead_tok_kind; - - memcpy(&ahead_tok, tok, sizeof(ahead_tok)); - ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, - &ahead_tok_end); - - if (ahead_tok_kind == NAME - && ahead_tok.cur - ahead_tok.start == 3 - && memcmp(ahead_tok.start, "def", 3) == 0) - { - /* The next token is going to be 'def', so instead of - returning a plain NAME token, return ASYNC. */ - tok->async_def_indent = tok->indent; - tok->async_def = 1; - return ASYNC; - } - } - } - return NAME; } /* Newline */ if (c == '\n') { tok->atbol = 1; - if (blankline || tok->level > 0) { + if (blankline || tok->level > 0) goto nextline; - } *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; - if (tok->async_def) { - /* We're somewhere inside an 'async def' function, and - we've encountered a NEWLINE after its signature. 
*/ - tok->async_def_nl = 1; - } return NEWLINE; } @@ -1424,24 +1421,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) c = tok_nextc(tok); if (isdigit(c)) { goto fraction; - } else if (c == '.') { - c = tok_nextc(tok); - if (c == '.') { - *p_start = tok->start; - *p_end = tok->cur; - return ELLIPSIS; - } - else { - tok_backup(tok, c); - } - tok_backup(tok, '.'); } else { tok_backup(tok, c); + *p_start = tok->start; + *p_end = tok->cur; + return DOT; } - *p_start = tok->start; - *p_end = tok->cur; - return DOT; } /* Number */ @@ -1449,136 +1435,94 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (c == '0') { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); + if (c == '.') + goto fraction; +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') + goto imaginary; +#endif if (c == 'x' || c == 'X') { + /* Hex */ c = tok_nextc(tok); + if (!isxdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } do { - if (c == '_') { - c = tok_nextc(tok); - } - if (!isxdigit(c)) { - tok_backup(tok, c); - return syntaxerror(tok, "invalid hexadecimal literal"); - } - do { - c = tok_nextc(tok); - } while (isxdigit(c)); - } while (c == '_'); + c = tok_nextc(tok); + } while (isxdigit(c)); } else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); - do { - if (c == '_') { - c = tok_nextc(tok); - } - if (c < '0' || c >= '8') { - tok_backup(tok, c); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); - } - else { - return syntaxerror(tok, "invalid octal literal"); - } - } - do { - c = tok_nextc(tok); - } while ('0' <= c && c < '8'); - } while (c == '_'); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in octal literal", c); + if (c < '0' || c >= '8') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; } + do { + c = tok_nextc(tok); + } while ('0' <= c && c < '8'); } else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); - do { - if (c == '_') { - c = tok_nextc(tok); - } - if (c != '0' && c != '1') { - tok_backup(tok, c); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); - } - else { - return syntaxerror(tok, "invalid binary literal"); - } - } - do { - c = tok_nextc(tok); - } while (c == '0' || c == '1'); - } while (c == '_'); - if (isdigit(c)) { - return syntaxerror(tok, - "invalid digit '%c' in binary literal", c); + if (c != '0' && c != '1') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; } + do { + c = tok_nextc(tok); + } while (c == '0' || c == '1'); } else { - int nonzero = 0; - /* maybe old-style octal; c is first char of it */ - /* in any case, allow '0' as a literal */ - while (1) { - if (c == '_') { - c = tok_nextc(tok); - if (!isdigit(c)) { - tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); - } - } - if (c != '0') { - break; - } + int found_decimal = 0; + /* Octal; c is first char of it */ + /* There's no 'isoctdigit' macro, sigh */ + while ('0' <= c && c < '8') { c = tok_nextc(tok); } if (isdigit(c)) { - nonzero = 1; - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } + found_decimal = 1; + do { + c = tok_nextc(tok); + } while (isdigit(c)); } - if (c == '.') { - c = tok_nextc(tok); + if (c == '.') goto fraction; - } - else if (c == 'e' || c == 'E') { + else if (c == 'e' || c == 'E') goto exponent; - } - else if (c == 'j' || c == 'J') { +#ifndef WITHOUT_COMPLEX + else if (c == 'j' || c == 'J') goto imaginary; - } - else if (nonzero) { - /* 
Old-style octal: now disallowed. */ +#endif + else if (found_decimal) { + tok->done = E_TOKEN; tok_backup(tok, c); - return syntaxerror(tok, - "leading zeros in decimal integer " - "literals are not permitted; " - "use an 0o prefix for octal integers"); + return ERRORTOKEN; } } + if (c == 'l' || c == 'L') + c = tok_nextc(tok); } else { /* Decimal */ - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } - { + do { + c = tok_nextc(tok); + } while (isdigit(c)); + if (c == 'l' || c == 'L') + c = tok_nextc(tok); + else { /* Accept floating point numbers. */ if (c == '.') { - c = tok_nextc(tok); fraction: /* Fraction */ - if (isdigit(c)) { - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } - } + do { + c = tok_nextc(tok); + } while (isdigit(c)); } if (c == 'e' || c == 'E') { int e; @@ -1589,8 +1533,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (c == '+' || c == '-') { c = tok_nextc(tok); if (!isdigit(c)) { + tok->done = E_TOKEN; tok_backup(tok, c); - return syntaxerror(tok, "invalid decimal literal"); + return ERRORTOKEN; } } else if (!isdigit(c)) { tok_backup(tok, c); @@ -1599,16 +1544,16 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_end = tok->cur; return NUMBER; } - c = tok_decimal_tail(tok); - if (c == 0) { - return ERRORTOKEN; - } + do { + c = tok_nextc(tok); + } while (isdigit(c)); } - if (c == 'j' || c == 'J') { +#ifndef WITHOUT_COMPLEX + if (c == 'j' || c == 'J') /* Imaginary part */ imaginary: c = tok_nextc(tok); - } +#endif } } tok_backup(tok, c); @@ -1620,61 +1565,55 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) letter_quote: /* String */ if (c == '\'' || c == '"') { + Py_ssize_t quote2 = tok->cur - tok->start + 1; int quote = c; - int quote_size = 1; /* 1 or 3 */ - int end_quote_size = 0; - - /* Nodes of type STRING, especially multi line strings - must be handled differently in order to get both - the starting line number and the column offset right. - (cf. issue 16806) */ - tok->first_lineno = tok->lineno; - tok->multi_line_start = tok->line_start; - - /* Find the quote size and start of string */ - c = tok_nextc(tok); - if (c == quote) { - c = tok_nextc(tok); - if (c == quote) { - quote_size = 3; - } - else { - end_quote_size = 1; /* empty string found */ - } - } - if (c != quote) { - tok_backup(tok, c); - } - - /* Get rest of string */ - while (end_quote_size != quote_size) { + int triple = 0; + int tripcount = 0; + for (;;) { c = tok_nextc(tok); - if (c == EOF) { - if (quote_size == 3) { - tok->done = E_EOFS; - } - else { + if (c == '\n') { + if (!triple) { tok->done = E_EOLS; + tok_backup(tok, c); + return ERRORTOKEN; } - tok->cur = tok->inp; - return ERRORTOKEN; + tripcount = 0; + tok->cont_line = 1; /* multiline string. 
*/ } - if (quote_size == 1 && c == '\n') { - tok->done = E_EOLS; + else if (c == EOF) { + if (triple) + tok->done = E_EOFS; + else + tok->done = E_EOLS; tok->cur = tok->inp; return ERRORTOKEN; } - if (c == quote) { - end_quote_size += 1; + else if (c == quote) { + tripcount++; + if (tok->cur - tok->start == quote2) { + c = tok_nextc(tok); + if (c == quote) { + triple = 1; + tripcount = 0; + continue; + } + tok_backup(tok, c); + } + if (!triple || tripcount == 3) + break; } - else { - end_quote_size = 0; - if (c == '\\') { - tok_nextc(tok); /* skip escaped char */ + else if (c == '\\') { + tripcount = 0; + c = tok_nextc(tok); + if (c == EOF) { + tok->done = E_EOLS; + tok->cur = tok->inp; + return ERRORTOKEN; } } + else + tripcount = 0; } - *p_start = tok->start; *p_end = tok->cur; return STRING; @@ -1688,14 +1627,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->cur = tok->inp; return ERRORTOKEN; } - c = tok_nextc(tok); - if (c == EOF) { - tok->done = E_EOF; - tok->cur = tok->inp; - return ERRORTOKEN; - } else { - tok_backup(tok, c); - } tok->cont_line = 1; goto again; /* Read next line */ } @@ -1704,13 +1635,24 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) { int c2 = tok_nextc(tok); int token = PyToken_TwoChars(c, c2); +#ifndef PGEN + if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') { + if (PyErr_WarnExplicit(PyExc_DeprecationWarning, + "<> not supported in 3.x; use !=", + tok->filename, tok->lineno, + NULL, NULL)) { + tok->done = E_ERROR; + tok->cur = tok->inp; + return ERRORTOKEN; + } + } +#endif if (token != OP) { int c3 = tok_nextc(tok); int token3 = PyToken_ThreeChars(c, c2, c3); if (token3 != OP) { token = token3; - } - else { + } else { tok_backup(tok, c3); } *p_start = tok->start; @@ -1725,38 +1667,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) case '(': case '[': case '{': - if (tok->level >= MAXLEVEL) { - return syntaxerror(tok, "too many nested parentheses"); - } - tok->parenstack[tok->level] = c; - tok->parenlinenostack[tok->level] = tok->lineno; tok->level++; break; case ')': case ']': case '}': - if (!tok->level) { - return syntaxerror(tok, "unmatched '%c'", c); - } tok->level--; - int opening = tok->parenstack[tok->level]; - if (!((opening == '(' && c == ')') || - (opening == '[' && c == ']') || - (opening == '{' && c == '}'))) - { - if (tok->parenlinenostack[tok->level] != tok->lineno) { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level]); - } - else { - return syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c'", - c, opening); - } - } break; } @@ -1770,6 +1686,11 @@ int PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) { int result = tok_get(tok, p_start, p_end); + if (tok->fp && ferror(tok->fp)) { + clearerr(tok->fp); + result = ERRORTOKEN; + tok->done = E_IO; + } if (tok->decoding_erred) { result = ERRORTOKEN; tok->done = E_DECODE; @@ -1777,67 +1698,67 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end) return result; } -/* Get the encoding of a Python file. Check for the coding cookie and check if - the file starts with a BOM. - - PyTokenizer_FindEncodingFilename() returns NULL when it can't find the - encoding in the first or second line of the file (in which case the encoding - should be assumed to be UTF-8). - - The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed - by the caller. 
*/ +/* This function is only called from parsetok. However, it cannot live + there, as it must be empty for PGEN, and we can check for PGEN only + in this file. */ -char * -PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) +#if defined(PGEN) || !defined(Py_USING_UNICODE) +char* +PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset) { - struct tok_state *tok; - FILE *fp; - char *p_start =NULL , *p_end =NULL , *encoding = NULL; - - fd = _Py_dup(fd); - if (fd < 0) { - return NULL; - } - - fp = fdopen(fd, "r"); - if (fp == NULL) { - return NULL; - } - tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL); - if (tok == NULL) { - fclose(fp); - return NULL; - } - if (filename != NULL) { - Py_INCREF(filename); - tok->filename = filename; - } - else { - tok->filename = PyUnicode_FromString("<string>"); - if (tok->filename == NULL) { - fclose(fp); - PyTokenizer_Free(tok); - return encoding; - } - } - while (tok->lineno < 2 && tok->done == E_OK) { - PyTokenizer_Get(tok, &p_start, &p_end); + return NULL; +} +#else +#ifdef Py_USING_UNICODE +static PyObject * +dec_utf8(const char *enc, const char *text, size_t len) { + PyObject *ret = NULL; + PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace"); + if (unicode_text) { + ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace"); + Py_DECREF(unicode_text); } - fclose(fp); - if (tok->encoding) { - encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1); - if (encoding) - strcpy(encoding, tok->encoding); + if (!ret) { + PyErr_Clear(); } - PyTokenizer_Free(tok); - return encoding; + return ret; } - char * -PyTokenizer_FindEncoding(int fd) +PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset) { - return PyTokenizer_FindEncodingFilename(fd, NULL); + char *text = NULL; + if (tok->encoding) { + /* convert source to original encondig */ + PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len); + if (lineobj != NULL) { + int linelen = PyString_Size(lineobj); + const char *line = PyString_AsString(lineobj); + text = PyObject_MALLOC(linelen + 1); + if (text != NULL && line != NULL) { + if (linelen) + strncpy(text, line, linelen); + text[linelen] = '\0'; + } + Py_DECREF(lineobj); + + /* adjust error offset */ + if (*offset > 1) { + PyObject *offsetobj = dec_utf8(tok->encoding, + tok->buf, *offset-1); + if (offsetobj) { + *offset = PyString_Size(offsetobj) + 1; + Py_DECREF(offsetobj); + } + } + + } + } + return text; + } +#endif /* defined(Py_USING_UNICODE) */ +#endif + #ifdef Py_DEBUG |
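
Both sides of this diff keep the PEP 263 logic in get_coding_spec()/check_coding_spec(): the encoding declaration must sit in a comment that is the only statement on the line, and the name after coding[:=] is normalized before being compared against any BOM-derived encoding. Below is a minimal standalone sketch of that comment scan, assuming a hypothetical find_coding_spec() helper that writes into a caller-supplied buffer instead of returning a malloc'ed copy.

    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>

    /* Hypothetical simplification of get_coding_spec(): copy the encoding
     * name from a "# ... coding[:=] <name>" comment into buf, or return
     * NULL if the line carries no declaration. */
    static const char *
    find_coding_spec(const char *line, char *buf, size_t bufsize)
    {
        const char *s = line;
        /* The spec must live in a comment preceded only by whitespace. */
        while (*s == ' ' || *s == '\t' || *s == '\014')
            s++;
        if (*s != '#')
            return NULL;
        for (; *s; s++) {
            if (strncmp(s, "coding", 6) == 0 && (s[6] == ':' || s[6] == '=')) {
                const char *t = s + 7;
                size_t n = 0;
                while (*t == ' ' || *t == '\t')
                    t++;
                /* Encoding names are alphanumerics plus '-', '_' and '.'. */
                while (n + 1 < bufsize &&
                       (isalnum((unsigned char)*t) || *t == '-' ||
                        *t == '_' || *t == '.')) {
                    buf[n++] = *t++;
                }
                buf[n] = '\0';
                return n ? buf : NULL;
            }
        }
        return NULL;
    }

    int
    main(void)
    {
        char enc[64];
        if (find_coding_spec("# -*- coding: iso-8859-1 -*-", enc, sizeof enc))
            printf("declared encoding: %s\n", enc);   /* iso-8859-1 */
        return 0;
    }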
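
check_bom() peeks at the first bytes of the input before any codec is chosen: a UTF-8 BOM is simply skipped and recorded as the encoding, while UTF-16 BOMs switch the reader to the matching codec. The sketch below shows the same detection over an in-memory buffer, using a hypothetical sniff_bom() helper; the real function reads and ungets characters through its get_char/unget_char callbacks instead.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical sniff of a byte-order mark at the start of a buffer.
     * Returns the encoding name, or NULL if no BOM; *bomlen gets its size. */
    static const char *
    sniff_bom(const unsigned char *p, size_t len, size_t *bomlen)
    {
        *bomlen = 0;
        if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) {
            *bomlen = 3;
            return "utf-8";
        }
        if (len >= 2 && p[0] == 0xFE && p[1] == 0xFF) {
            *bomlen = 2;
            return "utf-16-be";
        }
        if (len >= 2 && p[0] == 0xFF && p[1] == 0xFE) {
            *bomlen = 2;
            return "utf-16-le";
        }
        return NULL;
    }

    int
    main(void)
    {
        const unsigned char src[] = { 0xEF, 0xBB, 0xBF, 'p', 'a', 's', 's' };
        size_t skip;
        const char *enc = sniff_bom(src, sizeof src, &skip);
        printf("%s, skip %zu bytes\n", enc ? enc : "none", skip);
        return 0;
    }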
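
The '-' side of the diff drops the valid_utf8() helper that the UTF-8-default tokenizer uses to reject undeclared non-UTF-8 bytes; the '+' side only checks for bytes above 127, since its default source encoding is ASCII. Here is a self-contained sketch of that sequence-length classification with a small driver; utf8_seq_len() is a hypothetical name for the same check.

    #include <stdio.h>

    /* Length of the UTF-8 sequence starting at s, or 0 if s does not start
     * a well-formed sequence (mirrors the removed valid_utf8() helper). */
    static int
    utf8_seq_len(const unsigned char *s)
    {
        int expected, i;
        if (*s < 0x80)
            return 1;               /* single-byte (ASCII) */
        if (*s < 0xC0)
            return 0;               /* stray continuation byte */
        if (*s < 0xE0)
            expected = 1;
        else if (*s < 0xF0)
            expected = 2;
        else if (*s < 0xF8)
            expected = 3;
        else
            return 0;
        for (i = 1; i <= expected; i++)
            if (s[i] < 0x80 || s[i] >= 0xC0)
                return 0;           /* continuation bytes must be 10xxxxxx */
        return expected + 1;
    }

    int
    main(void)
    {
        const unsigned char line[] = "x = '\xc3\xa9'\n";   /* e-acute, valid UTF-8 */
        const unsigned char *c = line;
        int n;
        while (*c) {
            if ((n = utf8_seq_len(c)) == 0) {
                printf("bad byte 0x%02x at offset %ld\n", *c, (long)(c - line));
                return 1;
            }
            c += n;
        }
        printf("line is valid UTF-8\n");
        return 0;
    }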
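
In tok_get(), the indentation pass computes two columns per line: col expands tabs to the next multiple of tok->tabsize, while altcol does the same with the alternate tab size so that tab/space mixes that disagree between the two readings can be flagged by indenterror(). A minimal sketch of just that arithmetic, with a hypothetical measure_indent() helper:

    #include <stdio.h>

    /* Hypothetical helper: compute the indentation column of a line,
     * expanding tabs the way the tokenizer does:
     * col = (col / tabsize + 1) * tabsize advances to the next tab stop. */
    static int
    measure_indent(const char *line, int tabsize)
    {
        int col = 0;
        for (; *line; line++) {
            if (*line == ' ')
                col++;
            else if (*line == '\t')
                col = (col / tabsize + 1) * tabsize;
            else if (*line == '\014')   /* formfeed resets the count */
                col = 0;
            else
                break;
        }
        return col;
    }

    int
    main(void)
    {
        /* "\t  x" with tabsize 8: the tab jumps to column 8, plus two spaces. */
        printf("%d\n", measure_indent("\t  x", 8));   /* prints 10 */
        printf("%d\n", measure_indent("    x", 8));   /* prints 4 */
        return 0;
    }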
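
The '+' side restores the old comment handler that scans every comment for editor modelines ("tab-width:", ":tabstop=", ":ts=", "set tabsize=") and, when one is found with a value between 1 and 40, resets tok->tabsize. A reduced sketch of that scan over a comment string, using a hypothetical comment_tabsize() helper:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical reduction of the comment scan: look for an editor
     * modeline inside a comment and return the tab size it requests,
     * or 0 if none is found (or the value is out of range). */
    static int
    comment_tabsize(const char *comment)
    {
        static const char *forms[] = {
            "tab-width:",    /* Emacs */
            ":tabstop=",     /* vim, full form */
            ":ts=",          /* vim, abbreviated form */
            "set tabsize=",  /* classic vi */
        };
        size_t i;
        for (i = 0; i < sizeof(forms) / sizeof(forms[0]); i++) {
            const char *hit = strstr(comment, forms[i]);
            if (hit != NULL) {
                int n = atoi(hit + strlen(forms[i]));
                if (n >= 1 && n <= 40)
                    return n;
            }
        }
        return 0;
    }

    int
    main(void)
    {
        printf("%d\n", comment_tabsize("# vim:ts=4"));          /* 4 */
        printf("%d\n", comment_tabsize("# -*- tab-width: 8 -*-")); /* 8 */
        printf("%d\n", comment_tabsize("# just a comment"));      /* 0 */
        return 0;
    }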
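
The '-' side's tok_decimal_tail() consumes a run of digits while allowing a single underscore between digit groups, rejecting doubled or trailing underscores. A standalone sketch of the same rule over a plain C string; decimal_tail() here is a hypothetical stand-in that returns a character count instead of the next character:

    #include <stdio.h>
    #include <ctype.h>

    /* Hypothetical sketch of tok_decimal_tail(): consume digits, allowing a
     * single '_' between groups of digits ("1_000_000"); return the number
     * of characters consumed, or -1 if an underscore is not followed by a
     * digit. */
    static long
    decimal_tail(const char *s)
    {
        long i = 0;
        for (;;) {
            while (isdigit((unsigned char)s[i]))
                i++;
            if (s[i] != '_')
                return i;
            if (!isdigit((unsigned char)s[i + 1]))
                return -1;          /* "1__0" or a trailing "_" is an error */
            i++;                    /* step over the underscore and continue */
        }
    }

    int
    main(void)
    {
        printf("%ld\n", decimal_tail("1_000_000+x"));  /* 9 */
        printf("%ld\n", decimal_tail("1__0"));         /* -1 */
        return 0;
    }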
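
For string literals, the '-' side counts consecutive closing quotes (end_quote_size) until the count matches the opening run of one or three, resetting after any other character and skipping the character that follows a backslash; the '+' side tracks the same state with its triple/tripcount variables. Below is a sketch of the counting approach over a buffer, with a hypothetical scan_string_body() that is handed the text just after the opening quotes:

    #include <stdio.h>

    /* Hypothetical sketch of the quote-counting scan: given the quote
     * character and whether the literal is triple-quoted (quote_size 1 or 3),
     * return the offset just past the closing quotes, or -1 if the literal
     * is unterminated. */
    static long
    scan_string_body(const char *s, int quote, int quote_size)
    {
        int end_quote_size = 0;
        long i = 0;
        while (end_quote_size != quote_size) {
            int c = s[i++];
            if (c == '\0')
                return -1;                 /* end of input inside the literal */
            if (c == '\n' && quote_size == 1)
                return -1;                 /* newline only legal in triples */
            if (c == quote) {
                end_quote_size++;
            }
            else {
                end_quote_size = 0;
                if (c == '\\' && s[i] != '\0')
                    i++;                   /* skip the escaped character */
            }
        }
        return i;
    }

    int
    main(void)
    {
        const char *src = "abc\\\"def\" + rest";   /* the source text abc\"def" + rest */
        long end = scan_string_body(src, '"', 1);
        printf("literal body ends at offset %ld\n", end);   /* prints 9 */
        return 0;
    }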