Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r--  Parser/tokenizer.c  1493
1 file changed, 707 insertions(+), 786 deletions(-)
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index f84093d..8966661 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -2,6 +2,7 @@
/* Tokenizer implementation */
#include "Python.h"
+#include "pgenheaders.h"
#include <ctype.h>
#include <assert.h>
@@ -9,29 +10,16 @@
#include "tokenizer.h"
#include "errcode.h"
+#ifndef PGEN
#include "unicodeobject.h"
-#include "bytesobject.h"
+#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
+#include "pydebug.h"
+#endif /* PGEN */
-/* Alternate tab spacing */
-#define ALTTABSIZE 1
-
-#define is_potential_identifier_start(c) (\
- (c >= 'a' && c <= 'z')\
- || (c >= 'A' && c <= 'Z')\
- || c == '_'\
- || (c >= 128))
-
-#define is_potential_identifier_char(c) (\
- (c >= 'a' && c <= 'z')\
- || (c >= 'A' && c <= 'Z')\
- || (c >= '0' && c <= '9')\
- || c == '_'\
- || (c >= 128))
-
-extern char *PyOS_Readline(FILE *, FILE *, const char *);
+extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF;
NULL if interrupted */
@@ -44,10 +32,65 @@ static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
-
-/* Spaces in this constant are treated as "zero or more spaces or tabs" when
- tokenizing. */
-static const char* type_comment_prefix = "# type: ";
+/* Token names */
+
+char *_PyParser_TokenNames[] = {
+ "ENDMARKER",
+ "NAME",
+ "NUMBER",
+ "STRING",
+ "NEWLINE",
+ "INDENT",
+ "DEDENT",
+ "LPAR",
+ "RPAR",
+ "LSQB",
+ "RSQB",
+ "COLON",
+ "COMMA",
+ "SEMI",
+ "PLUS",
+ "MINUS",
+ "STAR",
+ "SLASH",
+ "VBAR",
+ "AMPER",
+ "LESS",
+ "GREATER",
+ "EQUAL",
+ "DOT",
+ "PERCENT",
+ "BACKQUOTE",
+ "LBRACE",
+ "RBRACE",
+ "EQEQUAL",
+ "NOTEQUAL",
+ "LESSEQUAL",
+ "GREATEREQUAL",
+ "TILDE",
+ "CIRCUMFLEX",
+ "LEFTSHIFT",
+ "RIGHTSHIFT",
+ "DOUBLESTAR",
+ "PLUSEQUAL",
+ "MINEQUAL",
+ "STAREQUAL",
+ "SLASHEQUAL",
+ "PERCENTEQUAL",
+ "AMPEREQUAL",
+ "VBAREQUAL",
+ "CIRCUMFLEXEQUAL",
+ "LEFTSHIFTEQUAL",
+ "RIGHTSHIFTEQUAL",
+ "DOUBLESTAREQUAL",
+ "DOUBLESLASH",
+ "DOUBLESLASHEQUAL",
+ "AT",
+ /* This table must match the #defines in token.h! */
+ "OP",
+ "<ERRORTOKEN>",
+ "<N_TOKENS>"
+};
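
The table above is indexed by the token codes defined in token.h, which is why it must match those #defines entry for entry. A minimal sketch, assuming token.h and this array are in scope, of turning a token code back into a printable name:

#include <stdio.h>
#include "token.h"          /* ENDMARKER, NAME, ..., OP, ERRORTOKEN, N_TOKENS */

extern char *_PyParser_TokenNames[];

/* Map a token code to its printable name; anything out of range falls back
   to the catch-all OP slot, mirroring how the tokenizer reports operators
   that have no dedicated code. */
static const char *token_name(int type)
{
    if (type < 0 || type >= N_TOKENS)
        type = OP;
    return _PyParser_TokenNames[type];
}

/* e.g. token_name(NAME) -> "NAME", token_name(PLUS) -> "PLUS" */
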
/* Create and initialize a new tok_state structure */
@@ -65,45 +108,61 @@ tok_new(void)
tok->tabsize = TABSIZE;
tok->indent = 0;
tok->indstack[0] = 0;
-
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0;
tok->level = 0;
+ tok->filename = NULL;
+ tok->altwarning = 0;
+ tok->alterror = 0;
+ tok->alttabsize = 1;
tok->altindstack[0] = 0;
- tok->decoding_state = STATE_INIT;
+ tok->decoding_state = 0;
tok->decoding_erred = 0;
tok->read_coding_spec = 0;
- tok->enc = NULL;
tok->encoding = NULL;
tok->cont_line = 0;
- tok->filename = NULL;
+#ifndef PGEN
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
- tok->type_comments = 0;
-
- tok->async_hacks = 0;
- tok->async_def = 0;
- tok->async_def_indent = 0;
- tok->async_def_nl = 0;
-
+#endif
return tok;
}
static char *
-new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
+new_string(const char *s, Py_ssize_t len)
{
char* result = (char *)PyMem_MALLOC(len + 1);
- if (!result) {
- tok->done = E_NOMEM;
- return NULL;
+ if (result != NULL) {
+ memcpy(result, s, len);
+ result[len] = '\0';
}
- memcpy(result, s, len);
- result[len] = '\0';
return result;
}
+#ifdef PGEN
+
+static char *
+decoding_fgets(char *s, int size, struct tok_state *tok)
+{
+ return fgets(s, size, tok->fp);
+}
+
+static int
+decoding_feof(struct tok_state *tok)
+{
+ return feof(tok->fp);
+}
+
+static char *
+decode_str(const char *str, int exec_input, struct tok_state *tok)
+{
+ return new_string(str, strlen(str));
+}
+
+#else /* PGEN */
+
static char *
error_ret(struct tok_state *tok) /* XXX */
{
@@ -116,8 +175,8 @@ error_ret(struct tok_state *tok) /* XXX */
}
-static const char *
-get_normal_name(const char *s) /* for utf-8 and latin-1 */
+static char *
+get_normal_name(char *s) /* for utf-8 and latin-1 */
{
char buf[13];
int i;
@@ -147,18 +206,17 @@ get_normal_name(const char *s) /* for utf-8 and latin-1 */
/* Return the coding spec in S, or NULL if none is found. */
-static int
-get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
+static char *
+get_coding_spec(const char *s, Py_ssize_t size)
{
Py_ssize_t i;
- *spec = NULL;
/* Coding spec must be in a comment, and that comment must be
* the only statement on the source code line. */
for (i = 0; i < size - 6; i++) {
if (s[i] == '#')
break;
if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
- return 1;
+ return NULL;
}
for (; i < size - 6; i++) { /* XXX inefficient search */
const char* t = s + i;
@@ -177,23 +235,20 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t
t++;
if (begin < t) {
- char* r = new_string(begin, t - begin, tok);
- const char* q;
+ char* r = new_string(begin, t - begin);
+ char* q;
if (!r)
- return 0;
+ return NULL;
q = get_normal_name(r);
if (r != q) {
PyMem_FREE(r);
- r = new_string(q, strlen(q), tok);
- if (!r)
- return 0;
+ r = new_string(q, strlen(q));
}
- *spec = r;
- break;
+ return r;
}
}
}
- return 1;
+ return NULL;
}
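
Concretely, the scan above only accepts a PEP 263 cookie when the comment is the only thing on the line and contains "coding" immediately followed by ':' or '='; the result comes back as a fresh new_string() that the caller must PyMem_FREE(), normalized through get_normal_name(). A small sketch, assuming the functions above (parts of the matching loop are elided by the hunks), of lines it accepts and rejects:

#include <string.h>
#include <assert.h>
/* assumes Python.h and the helpers above are in scope */

/* Hypothetical demo of the cookie scan; the sample lines are examples only. */
static void coding_spec_demo(void)
{
    const char *emacs = "# -*- coding: latin-1 -*-\n";       /* accepted, normalized to "iso-8859-1" */
    const char *vim   = "# vim: set fileencoding=utf-8 :\n"; /* accepted: "coding" + '=' also matches */
    const char *code  = "x = 1  # coding: utf-8\n";          /* rejected: not the only thing on the line */

    char *spec = get_coding_spec(emacs, (Py_ssize_t)strlen(emacs));
    if (spec != NULL) {
        /* spec == "iso-8859-1"; the caller owns it */
        PyMem_FREE(spec);
    }
    assert(get_coding_spec(code, (Py_ssize_t)strlen(code)) == NULL);
    (void)vim;
}
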
/* Check whether the line contains a coding spec. If it does,
@@ -205,7 +260,7 @@ static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
int set_readline(struct tok_state *, const char *))
{
- char *cs;
+ char * cs;
int r = 1;
if (tok->cont_line) {
@@ -213,8 +268,7 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
tok->read_coding_spec = 1;
return 1;
}
- if (!get_coding_spec(line, &cs, size, tok))
- return 0;
+ cs = get_coding_spec(line, size);
if (!cs) {
Py_ssize_t i;
for (i = 0; i < size; i++) {
@@ -227,31 +281,40 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
break;
}
}
- return 1;
- }
- tok->read_coding_spec = 1;
- if (tok->encoding == NULL) {
- assert(tok->decoding_state == STATE_RAW);
- if (strcmp(cs, "utf-8") == 0) {
- tok->encoding = cs;
- } else {
- r = set_readline(tok, cs);
- if (r) {
+ } else {
+ tok->read_coding_spec = 1;
+ if (tok->encoding == NULL) {
+ assert(tok->decoding_state == 1); /* raw */
+ if (strcmp(cs, "utf-8") == 0 ||
+ strcmp(cs, "iso-8859-1") == 0) {
tok->encoding = cs;
- tok->decoding_state = STATE_NORMAL;
- }
- else {
- PyErr_Format(PyExc_SyntaxError,
- "encoding problem: %s", cs);
+ } else {
+#ifdef Py_USING_UNICODE
+ r = set_readline(tok, cs);
+ if (r) {
+ tok->encoding = cs;
+ tok->decoding_state = -1;
+ }
+ else {
+ PyErr_Format(PyExc_SyntaxError,
+ "encoding problem: %s", cs);
+ PyMem_FREE(cs);
+ }
+#else
+ /* Without Unicode support, we cannot
+ process the coding spec. Since there
+ won't be any Unicode literals, that
+ won't matter. */
PyMem_FREE(cs);
+#endif
}
+ } else { /* then, compare cs with BOM */
+ r = (strcmp(tok->encoding, cs) == 0);
+ if (!r)
+ PyErr_Format(PyExc_SyntaxError,
+ "encoding problem: %s with BOM", cs);
+ PyMem_FREE(cs);
}
- } else { /* then, compare cs with BOM */
- r = (strcmp(tok->encoding, cs) == 0);
- if (!r)
- PyErr_Format(PyExc_SyntaxError,
- "encoding problem: %s with BOM", cs);
- PyMem_FREE(cs);
}
return r;
}
@@ -268,7 +331,7 @@ check_bom(int get_char(struct tok_state *),
{
int ch1, ch2, ch3;
ch1 = get_char(tok);
- tok->decoding_state = STATE_RAW;
+ tok->decoding_state = 1;
if (ch1 == EOF) {
return 1;
} else if (ch1 == 0xEF) {
@@ -297,7 +360,7 @@ check_bom(int get_char(struct tok_state *),
}
if (!set_readline(tok, "utf-16-be"))
return 0;
- tok->decoding_state = STATE_NORMAL;
+ tok->decoding_state = -1;
} else if (ch1 == 0xFF) {
ch2 = get_char(tok);
if (ch2 != 0xFE) {
@@ -307,7 +370,7 @@ check_bom(int get_char(struct tok_state *),
}
if (!set_readline(tok, "utf-16-le"))
return 0;
- tok->decoding_state = STATE_NORMAL;
+ tok->decoding_state = -1;
#endif
} else {
unget_char(ch1, tok);
@@ -315,10 +378,7 @@ check_bom(int get_char(struct tok_state *),
}
if (tok->encoding != NULL)
PyMem_FREE(tok->encoding);
- tok->encoding = new_string("utf-8", 5, tok);
- if (!tok->encoding)
- return 0;
- /* No need to set_readline: input is already utf-8 */
+ tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
return 1;
}
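
check_bom() inspects at most the first three bytes: the UTF-8 BOM is stripped and recorded as "utf-8", while the UTF-16 branches sit behind a conditional compile that these hunks only partially show. A standalone sketch of the same byte patterns, using a hypothetical helper name:

#include <stddef.h>

/* classify_bom() is an illustrative helper, not part of tokenizer.c:
   return the encoding implied by a leading byte-order mark, or NULL when
   the first bytes are ordinary source text. */
static const char *classify_bom(const unsigned char *p, size_t n)
{
    if (n >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF)
        return "utf-8";           /* BOM is consumed by check_bom() */
    if (n >= 2 && p[0] == 0xFE && p[1] == 0xFF)
        return "utf-16-be";       /* handled only if compiled in */
    if (n >= 2 && p[0] == 0xFF && p[1] == 0xFE)
        return "utf-16-le";       /* handled only if compiled in */
    return NULL;
}
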
@@ -329,7 +389,7 @@ check_bom(int get_char(struct tok_state *),
1) NULL: need to call tok->decoding_readline to get a new line
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
stored the result in tok->decoding_buffer
- 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
+ 3) PyStringObject *: previous call to fp_readl did not have enough room
(in the s buffer) to copy entire contents of the line read
by tok->decoding_readline. tok->decoding_buffer has the overflow.
In this case, fp_readl is called in a loop (with an expanded buffer)
@@ -340,62 +400,58 @@ check_bom(int get_char(struct tok_state *),
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
- PyObject* bufobj;
- const char *buf;
- Py_ssize_t buflen;
+#ifndef Py_USING_UNICODE
+ /* In a non-Unicode build, this should never be called. */
+ Py_FatalError("fp_readl should not be called in this build.");
+ return NULL; /* Keep compiler happy (not reachable) */
+#else
+ PyObject* utf8 = NULL;
+ PyObject* buf = tok->decoding_buffer;
+ char *str;
+ Py_ssize_t utf8len;
/* Ask for one less byte so we can terminate it */
assert(size > 0);
size--;
- if (tok->decoding_buffer) {
- bufobj = tok->decoding_buffer;
- Py_INCREF(bufobj);
- }
- else
- {
- bufobj = _PyObject_CallNoArg(tok->decoding_readline);
- if (bufobj == NULL)
- goto error;
- }
- if (PyUnicode_CheckExact(bufobj))
- {
- buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
- if (buf == NULL) {
- goto error;
+ if (buf == NULL) {
+ buf = PyObject_CallObject(tok->decoding_readline, NULL);
+ if (buf == NULL)
+ return error_ret(tok);
+ if (!PyUnicode_Check(buf)) {
+ Py_DECREF(buf);
+ PyErr_SetString(PyExc_SyntaxError,
+ "codec did not return a unicode object");
+ return error_ret(tok);
}
+ } else {
+ tok->decoding_buffer = NULL;
+ if (PyString_CheckExact(buf))
+ utf8 = buf;
}
- else
- {
- buf = PyByteArray_AsString(bufobj);
- if (buf == NULL) {
- goto error;
- }
- buflen = PyByteArray_GET_SIZE(bufobj);
+ if (utf8 == NULL) {
+ utf8 = PyUnicode_AsUTF8String(buf);
+ Py_DECREF(buf);
+ if (utf8 == NULL)
+ return error_ret(tok);
}
-
- Py_XDECREF(tok->decoding_buffer);
- if (buflen > size) {
- /* Too many chars, the rest goes into tok->decoding_buffer */
- tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
- buflen-size);
- if (tok->decoding_buffer == NULL)
- goto error;
- buflen = size;
+ str = PyString_AsString(utf8);
+ utf8len = PyString_GET_SIZE(utf8);
+ if (utf8len > size) {
+ tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
+ if (tok->decoding_buffer == NULL) {
+ Py_DECREF(utf8);
+ return error_ret(tok);
+ }
+ utf8len = size;
}
- else
- tok->decoding_buffer = NULL;
-
- memcpy(s, buf, buflen);
- s[buflen] = '\0';
- if (buflen == 0) /* EOF */
- s = NULL;
- Py_DECREF(bufobj);
+ memcpy(s, str, utf8len);
+ s[utf8len] = '\0';
+ Py_DECREF(utf8);
+ if (utf8len == 0)
+ return NULL; /* EOF */
return s;
-
-error:
- Py_XDECREF(bufobj);
- return error_ret(tok);
+#endif
}
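
The delicate part of fp_readl() is the carry-over: when the UTF-8 form of the decoded line is longer than the caller's buffer, the first size-1 bytes are copied out and the tail is parked in tok->decoding_buffer for the next call. A minimal sketch of that pattern in plain C, with a hypothetical struct standing in for the PyString overflow object:

#include <stdlib.h>
#include <string.h>

struct carry {              /* hypothetical stand-in for tok->decoding_buffer */
    char  *data;
    size_t len;
};

/* Copy at most size-1 bytes of line into dst (NUL-terminated) and stash
   whatever did not fit, so a later call can hand it out first -- the same
   contract fp_readl() implements with PyString_FromStringAndSize().
   size must be at least 1, as fp_readl() asserts. */
static size_t copy_with_carry(char *dst, size_t size, const char *line,
                              size_t linelen, struct carry *c)
{
    size_t n = linelen < size - 1 ? linelen : size - 1;

    memcpy(dst, line, n);
    dst[n] = '\0';
    if (linelen > n) {
        c->len = linelen - n;
        c->data = malloc(c->len);
        if (c->data != NULL)
            memcpy(c->data, line + n, c->len);
    }
    else {
        c->data = NULL;
        c->len = 0;
    }
    return n;
}
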
/* Set the readline function for TOK to a StreamReader's
@@ -411,48 +467,24 @@ error:
static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
- PyObject *readline, *io, *stream;
- _Py_IDENTIFIER(open);
- _Py_IDENTIFIER(readline);
- int fd;
- long pos;
-
- fd = fileno(tok->fp);
- /* Due to buffering the file offset for fd can be different from the file
- * position of tok->fp. If tok->fp was opened in text mode on Windows,
- * its file position counts CRLF as one char and can't be directly mapped
- * to the file offset for fd. Instead we step back one byte and read to
- * the end of line.*/
- pos = ftell(tok->fp);
- if (pos == -1 ||
- lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
- PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
- return 0;
- }
+ PyObject *reader, *stream, *readline;
- io = PyImport_ImportModuleNoBlock("io");
- if (io == NULL)
- return 0;
-
- stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
- fd, "r", -1, enc, Py_None, Py_None, Py_False);
- Py_DECREF(io);
+ /* XXX: constify filename argument. */
+ stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
if (stream == NULL)
return 0;
- readline = _PyObject_GetAttrId(stream, &PyId_readline);
+ reader = PyCodec_StreamReader(enc, stream, NULL);
Py_DECREF(stream);
- if (readline == NULL)
+ if (reader == NULL)
return 0;
- Py_XSETREF(tok->decoding_readline, readline);
- if (pos > 0) {
- PyObject *bufobj = _PyObject_CallNoArg(readline);
- if (bufobj == NULL)
- return 0;
- Py_DECREF(bufobj);
- }
+ readline = PyObject_GetAttrString(reader, "readline");
+ Py_DECREF(reader);
+ if (readline == NULL)
+ return 0;
+ tok->decoding_readline = readline;
return 1;
}
@@ -468,34 +500,6 @@ static void fp_ungetc(int c, struct tok_state *tok) {
ungetc(c, tok->fp);
}
-/* Check whether the characters at s start a valid
- UTF-8 sequence. Return the number of characters forming
- the sequence if yes, 0 if not. */
-static int valid_utf8(const unsigned char* s)
-{
- int expected = 0;
- int length;
- if (*s < 0x80)
- /* single-byte code */
- return 1;
- if (*s < 0xc0)
- /* following byte */
- return 0;
- if (*s < 0xE0)
- expected = 1;
- else if (*s < 0xF0)
- expected = 2;
- else if (*s < 0xF8)
- expected = 3;
- else
- return 0;
- length = expected + 1;
- for (; expected; expected--)
- if (s[expected] < 0x80 || s[expected] >= 0xC0)
- return 0;
- return length;
-}
-
/* Read a line of input from TOK. Determine encoding
if necessary. */
@@ -505,12 +509,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
char *line = NULL;
int badchar = 0;
for (;;) {
- if (tok->decoding_state == STATE_NORMAL) {
+ if (tok->decoding_state < 0) {
/* We already have a codec associated with
this input. */
line = fp_readl(s, size, tok);
break;
- } else if (tok->decoding_state == STATE_RAW) {
+ } else if (tok->decoding_state > 0) {
/* We want a 'raw' read. */
line = Py_UniversalNewlineFgets(s, size,
tok->fp, NULL);
@@ -521,7 +525,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
reader functions from now on. */
if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
return error_ret(tok);
- assert(tok->decoding_state != STATE_INIT);
+ assert(tok->decoding_state != 0);
}
}
if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
@@ -529,40 +533,43 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
return error_ret(tok);
}
}
- /* The default encoding is UTF-8, so make sure we don't have any
- non-UTF-8 sequences in it. */
+#ifndef PGEN
+ /* The default encoding is ASCII, so make sure we don't have any
+ non-ASCII bytes in it. */
if (line && !tok->encoding) {
unsigned char *c;
- int length;
- for (c = (unsigned char *)line; *c; c += length)
- if (!(length = valid_utf8(c))) {
+ for (c = (unsigned char *)line; *c; c++)
+ if (*c > 127) {
badchar = *c;
break;
}
}
if (badchar) {
+ char buf[500];
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
- PyErr_Format(PyExc_SyntaxError,
- "Non-UTF-8 code starting with '\\x%.2x' "
- "in file %U on line %i, "
- "but no encoding declared; "
- "see http://python.org/dev/peps/pep-0263/ for details",
- badchar, tok->filename, tok->lineno + 1);
+ sprintf(buf,
+ "Non-ASCII character '\\x%.2x' "
+ "in file %.200s on line %i, "
+ "but no encoding declared; "
+ "see http://python.org/dev/peps/pep-0263/ for details",
+ badchar, tok->filename, tok->lineno + 1);
+ PyErr_SetString(PyExc_SyntaxError, buf);
return error_ret(tok);
}
+#endif
return line;
}
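
With the STATE_* names gone, tok->decoding_state is a plain int again and decoding_fgets() above branches on its sign. The values assigned throughout this version, summarized:

/*
 * tok->decoding_state in this version:
 *    0  undecided -- check_bom() has not run yet          (was STATE_INIT)
 *    1  raw read via Py_UniversalNewlineFgets()           (was STATE_RAW)
 *   -1  a codec is installed; lines come from fp_readl()  (was STATE_NORMAL)
 *
 * decoding_fgets() tests "< 0" for the codec path and "> 0" for the raw
 * path; decoding_feof() treats ">= 0" as "just use feof(tok->fp)".
 */
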
static int
decoding_feof(struct tok_state *tok)
{
- if (tok->decoding_state != STATE_NORMAL) {
+ if (tok->decoding_state >= 0) {
return feof(tok->fp);
} else {
PyObject* buf = tok->decoding_buffer;
if (buf == NULL) {
- buf = _PyObject_CallNoArg(tok->decoding_readline);
+ buf = PyObject_CallObject(tok->decoding_readline, NULL);
if (buf == NULL) {
error_ret(tok);
return 1;
@@ -601,6 +608,7 @@ buf_setreadl(struct tok_state *tok, const char* enc) {
/* Return a UTF-8 encoding Python string object from the
C byte string STR, which is encoded with ENC. */
+#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
PyObject *utf8;
@@ -611,12 +619,12 @@ translate_into_utf8(const char* str, const char* enc) {
Py_DECREF(buf);
return utf8;
}
+#endif
static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
- int skip_next_lf = 0;
- size_t needed_length = strlen(s) + 2, final_length;
+ int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
char *buf, *current;
char c = '\0';
buf = PyMem_MALLOC(needed_length);
@@ -680,12 +688,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
return error_ret(tok);
str = tok->str; /* string after BOM if any */
assert(str);
+#ifdef Py_USING_UNICODE
if (tok->enc != NULL) {
utf8 = translate_into_utf8(str, tok->enc);
if (utf8 == NULL)
return error_ret(tok);
- str = PyBytes_AsString(utf8);
+ str = PyString_AsString(utf8);
}
+#endif
for (s = str;; s++) {
if (*s == '\0') break;
else if (*s == '\n') {
@@ -707,18 +717,22 @@ decode_str(const char *input, int single, struct tok_state *tok)
return error_ret(tok);
}
}
+#ifdef Py_USING_UNICODE
if (tok->enc != NULL) {
assert(utf8 == NULL);
utf8 = translate_into_utf8(str, tok->enc);
if (utf8 == NULL)
return error_ret(tok);
- str = PyBytes_AS_STRING(utf8);
+ str = PyString_AsString(utf8);
}
+#endif
assert(tok->decoding_buffer == NULL);
tok->decoding_buffer = utf8; /* CAUTION */
return str;
}
+#endif /* PGEN */
+
/* Set up tokenizer for string */
struct tok_state *
@@ -727,7 +741,7 @@ PyTokenizer_FromString(const char *str, int exec_input)
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
- str = decode_str(str, exec_input, tok);
+ str = (char *)decode_str(str, exec_input, tok);
if (str == NULL) {
PyTokenizer_Free(tok);
return NULL;
@@ -738,38 +752,11 @@ PyTokenizer_FromString(const char *str, int exec_input)
return tok;
}
-struct tok_state *
-PyTokenizer_FromUTF8(const char *str, int exec_input)
-{
- struct tok_state *tok = tok_new();
- if (tok == NULL)
- return NULL;
- tok->input = str = translate_newlines(str, exec_input, tok);
- if (str == NULL) {
- PyTokenizer_Free(tok);
- return NULL;
- }
- tok->decoding_state = STATE_RAW;
- tok->read_coding_spec = 1;
- tok->enc = NULL;
- tok->str = str;
- tok->encoding = (char *)PyMem_MALLOC(6);
- if (!tok->encoding) {
- PyTokenizer_Free(tok);
- return NULL;
- }
- strcpy(tok->encoding, "utf-8");
-
- /* XXX: constify members. */
- tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
- return tok;
-}
/* Set up tokenizer for file */
struct tok_state *
-PyTokenizer_FromFile(FILE *fp, const char* enc,
- const char *ps1, const char *ps2)
+PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
struct tok_state *tok = tok_new();
if (tok == NULL)
@@ -783,17 +770,6 @@ PyTokenizer_FromFile(FILE *fp, const char* enc,
tok->fp = fp;
tok->prompt = ps1;
tok->nextprompt = ps2;
- if (enc != NULL) {
- /* Must copy encoding declaration since it
- gets copied into the parse tree. */
- tok->encoding = PyMem_MALLOC(strlen(enc)+1);
- if (!tok->encoding) {
- PyTokenizer_Free(tok);
- return NULL;
- }
- strcpy(tok->encoding, enc);
- tok->decoding_state = STATE_NORMAL;
- }
return tok;
}
@@ -805,9 +781,10 @@ PyTokenizer_Free(struct tok_state *tok)
{
if (tok->encoding != NULL)
PyMem_FREE(tok->encoding);
+#ifndef PGEN
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
- Py_XDECREF(tok->filename);
+#endif
if (tok->fp != NULL && tok->buf != NULL)
PyMem_FREE(tok->buf);
if (tok->input)
@@ -815,10 +792,74 @@ PyTokenizer_Free(struct tok_state *tok)
PyMem_FREE(tok);
}
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+static int
+tok_stdin_decode(struct tok_state *tok, char **inp)
+{
+ PyObject *enc, *sysstdin, *decoded, *utf8;
+ const char *encoding;
+ char *converted;
+
+ if (PySys_GetFile((char *)"stdin", NULL) != stdin)
+ return 0;
+ sysstdin = PySys_GetObject("stdin");
+ if (sysstdin == NULL || !PyFile_Check(sysstdin))
+ return 0;
+
+ enc = ((PyFileObject *)sysstdin)->f_encoding;
+ if (enc == NULL || !PyString_Check(enc))
+ return 0;
+ Py_INCREF(enc);
+
+ encoding = PyString_AsString(enc);
+ decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
+ if (decoded == NULL)
+ goto error_clear;
+
+ utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
+ Py_DECREF(decoded);
+ if (utf8 == NULL)
+ goto error_clear;
+
+ assert(PyString_Check(utf8));
+ converted = new_string(PyString_AS_STRING(utf8),
+ PyString_GET_SIZE(utf8));
+ Py_DECREF(utf8);
+ if (converted == NULL)
+ goto error_nomem;
+
+ PyMem_FREE(*inp);
+ *inp = converted;
+ if (tok->encoding != NULL)
+ PyMem_FREE(tok->encoding);
+ tok->encoding = new_string(encoding, strlen(encoding));
+ if (tok->encoding == NULL)
+ goto error_nomem;
+
+ Py_DECREF(enc);
+ return 0;
+
+error_nomem:
+ Py_DECREF(enc);
+ tok->done = E_NOMEM;
+ return -1;
+
+error_clear:
+ Py_DECREF(enc);
+ if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
+ tok->done = E_ERROR;
+ return -1;
+ }
+ /* Fallback to iso-8859-1: for backward compatibility */
+ PyErr_Clear();
+ return 0;
+}
+#endif
+
/* Get next char, updating state; error code goes into tok->done */
static int
-tok_nextc(struct tok_state *tok)
+tok_nextc(register struct tok_state *tok)
{
for (;;) {
if (tok->cur != tok->inp) {
@@ -846,34 +887,6 @@ tok_nextc(struct tok_state *tok)
}
if (tok->prompt != NULL) {
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
- if (newtok != NULL) {
- char *translated = translate_newlines(newtok, 0, tok);
- PyMem_FREE(newtok);
- if (translated == NULL)
- return EOF;
- newtok = translated;
- }
- if (tok->encoding && newtok && *newtok) {
- /* Recode to UTF-8 */
- Py_ssize_t buflen;
- const char* buf;
- PyObject *u = translate_into_utf8(newtok, tok->encoding);
- PyMem_FREE(newtok);
- if (!u) {
- tok->done = E_DECODE;
- return EOF;
- }
- buflen = PyBytes_GET_SIZE(u);
- buf = PyBytes_AS_STRING(u);
- newtok = PyMem_MALLOC(buflen+1);
- if (newtok == NULL) {
- Py_DECREF(u);
- tok->done = E_NOMEM;
- return EOF;
- }
- strcpy(newtok, buf);
- Py_DECREF(u);
- }
if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt;
if (newtok == NULL)
@@ -882,6 +895,10 @@ tok_nextc(struct tok_state *tok)
PyMem_FREE(newtok);
tok->done = E_EOF;
}
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+ else if (tok_stdin_decode(tok, &newtok) != 0)
+ PyMem_FREE(newtok);
+#endif
else if (tok->start != NULL) {
size_t start = tok->start - tok->buf;
size_t oldlen = tok->cur - tok->buf;
@@ -956,7 +973,6 @@ tok_nextc(struct tok_state *tok)
while (!done) {
Py_ssize_t curstart = tok->start == NULL ? -1 :
tok->start - tok->buf;
- Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
Py_ssize_t curvalid = tok->inp - tok->buf;
Py_ssize_t newsize = curvalid + BUFSIZ;
char *newbuf = tok->buf;
@@ -969,7 +985,6 @@ tok_nextc(struct tok_state *tok)
}
tok->buf = newbuf;
tok->cur = tok->buf + cur;
- tok->multi_line_start = tok->buf + cur_multi_line_start;
tok->line_start = tok->cur;
tok->inp = tok->buf + curvalid;
tok->end = tok->buf + newsize;
@@ -985,8 +1000,7 @@ tok_nextc(struct tok_state *tok)
return EOF;
/* Last line does not end in \n,
fake one */
- if (tok->inp[-1] != '\n')
- strcpy(tok->inp, "\n");
+ strcpy(tok->inp, "\n");
}
tok->inp = strchr(tok->inp, '\0');
done = tok->inp[-1] == '\n';
@@ -1018,7 +1032,7 @@ tok_nextc(struct tok_state *tok)
/* Back-up one character */
static void
-tok_backup(struct tok_state *tok, int c)
+tok_backup(register struct tok_state *tok, register int c)
{
if (c != EOF) {
if (--tok->cur < tok->buf)
@@ -1029,88 +1043,185 @@ tok_backup(struct tok_state *tok, int c)
}
-static int
-syntaxerror(struct tok_state *tok, const char *format, ...)
-{
- va_list vargs;
-#ifdef HAVE_STDARG_PROTOTYPES
- va_start(vargs, format);
-#else
- va_start(vargs);
-#endif
- PyErr_FormatV(PyExc_SyntaxError, format, vargs);
- va_end(vargs);
- PyErr_SyntaxLocationObject(tok->filename,
- tok->lineno,
- (int)(tok->cur - tok->line_start));
- tok->done = E_ERROR;
- return ERRORTOKEN;
-}
+/* Return the token corresponding to a single character */
-static int
-indenterror(struct tok_state *tok)
+int
+PyToken_OneChar(int c)
{
- tok->done = E_TABSPACE;
- tok->cur = tok->inp;
- return ERRORTOKEN;
+ switch (c) {
+ case '(': return LPAR;
+ case ')': return RPAR;
+ case '[': return LSQB;
+ case ']': return RSQB;
+ case ':': return COLON;
+ case ',': return COMMA;
+ case ';': return SEMI;
+ case '+': return PLUS;
+ case '-': return MINUS;
+ case '*': return STAR;
+ case '/': return SLASH;
+ case '|': return VBAR;
+ case '&': return AMPER;
+ case '<': return LESS;
+ case '>': return GREATER;
+ case '=': return EQUAL;
+ case '.': return DOT;
+ case '%': return PERCENT;
+ case '`': return BACKQUOTE;
+ case '{': return LBRACE;
+ case '}': return RBRACE;
+ case '^': return CIRCUMFLEX;
+ case '~': return TILDE;
+ case '@': return AT;
+ default: return OP;
+ }
}
-/* Verify that the identifier follows PEP 3131.
- All identifier strings are guaranteed to be "ready" unicode objects.
- */
-static int
-verify_identifier(struct tok_state *tok)
+
+int
+PyToken_TwoChars(int c1, int c2)
{
- PyObject *s;
- int result;
- if (tok->decoding_erred)
- return 0;
- s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
- if (s == NULL) {
- if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
- PyErr_Clear();
- tok->done = E_IDENTIFIER;
- } else {
- tok->done = E_ERROR;
+ switch (c1) {
+ case '=':
+ switch (c2) {
+ case '=': return EQEQUAL;
}
- return 0;
+ break;
+ case '!':
+ switch (c2) {
+ case '=': return NOTEQUAL;
+ }
+ break;
+ case '<':
+ switch (c2) {
+ case '>': return NOTEQUAL;
+ case '=': return LESSEQUAL;
+ case '<': return LEFTSHIFT;
+ }
+ break;
+ case '>':
+ switch (c2) {
+ case '=': return GREATEREQUAL;
+ case '>': return RIGHTSHIFT;
+ }
+ break;
+ case '+':
+ switch (c2) {
+ case '=': return PLUSEQUAL;
+ }
+ break;
+ case '-':
+ switch (c2) {
+ case '=': return MINEQUAL;
+ }
+ break;
+ case '*':
+ switch (c2) {
+ case '*': return DOUBLESTAR;
+ case '=': return STAREQUAL;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/': return DOUBLESLASH;
+ case '=': return SLASHEQUAL;
+ }
+ break;
+ case '|':
+ switch (c2) {
+ case '=': return VBAREQUAL;
+ }
+ break;
+ case '%':
+ switch (c2) {
+ case '=': return PERCENTEQUAL;
+ }
+ break;
+ case '&':
+ switch (c2) {
+ case '=': return AMPEREQUAL;
+ }
+ break;
+ case '^':
+ switch (c2) {
+ case '=': return CIRCUMFLEXEQUAL;
+ }
+ break;
}
- result = PyUnicode_IsIdentifier(s);
- Py_DECREF(s);
- if (result == 0)
- tok->done = E_IDENTIFIER;
- return result;
+ return OP;
}
-static int
-tok_decimal_tail(struct tok_state *tok)
+int
+PyToken_ThreeChars(int c1, int c2, int c3)
{
- int c;
-
- while (1) {
- do {
- c = tok_nextc(tok);
- } while (isdigit(c));
- if (c != '_') {
+ switch (c1) {
+ case '<':
+ switch (c2) {
+ case '<':
+ switch (c3) {
+ case '=':
+ return LEFTSHIFTEQUAL;
+ }
break;
}
- c = tok_nextc(tok);
- if (!isdigit(c)) {
- tok_backup(tok, c);
- syntaxerror(tok, "invalid decimal literal");
- return 0;
+ break;
+ case '>':
+ switch (c2) {
+ case '>':
+ switch (c3) {
+ case '=':
+ return RIGHTSHIFTEQUAL;
+ }
+ break;
+ }
+ break;
+ case '*':
+ switch (c2) {
+ case '*':
+ switch (c3) {
+ case '=':
+ return DOUBLESTAREQUAL;
+ }
+ break;
+ }
+ break;
+ case '/':
+ switch (c2) {
+ case '/':
+ switch (c3) {
+ case '=':
+ return DOUBLESLASHEQUAL;
+ }
+ break;
}
+ break;
+ }
+ return OP;
+}
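
These three lookup functions are meant to be combined greedily: try a two-character operator, keep a three-character one only if it also matches, and back up otherwise, which is exactly what tok_get() does further down. A compact sketch of that longest-match driver, with next()/backup() as hypothetical stand-ins for tok_nextc()/tok_backup():

/* Illustrative driver only; assumes token.h and the PyToken_* functions
   above are visible. */
static int scan_operator(int (*next)(void), void (*backup)(int), int c1)
{
    int c2 = next();
    int tok2 = PyToken_TwoChars(c1, c2);

    if (tok2 != OP) {
        int c3 = next();
        int tok3 = PyToken_ThreeChars(c1, c2, c3);
        if (tok3 != OP)
            return tok3;            /* e.g. "<<=" -> LEFTSHIFTEQUAL */
        backup(c3);
        return tok2;                /* e.g. "<<"  -> LEFTSHIFT */
    }
    backup(c2);
    return PyToken_OneChar(c1);     /* e.g. "<"   -> LESS */
}
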
+
+static int
+indenterror(struct tok_state *tok)
+{
+ if (tok->alterror) {
+ tok->done = E_TABSPACE;
+ tok->cur = tok->inp;
+ return 1;
+ }
+ if (tok->altwarning) {
+ PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
+ "in indentation\n", tok->filename);
+ tok->altwarning = 0;
}
- return c;
+ return 0;
}
/* Get next token, after space stripping etc. */
static int
-tok_get(struct tok_state *tok, char **p_start, char **p_end)
+tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
- int c;
- int blankline, nonascii;
+ register int c;
+ int blankline;
*p_start = *p_end = NULL;
nextline:
@@ -1119,24 +1230,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
/* Get indentation level */
if (tok->atbol) {
- int col = 0;
- int altcol = 0;
+ register int col = 0;
+ register int altcol = 0;
tok->atbol = 0;
for (;;) {
c = tok_nextc(tok);
- if (c == ' ') {
+ if (c == ' ')
col++, altcol++;
- }
else if (c == '\t') {
- col = (col / tok->tabsize + 1) * tok->tabsize;
- altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
+ col = (col/tok->tabsize + 1) * tok->tabsize;
+ altcol = (altcol/tok->alttabsize + 1)
+ * tok->alttabsize;
}
- else if (c == '\014') {/* Control-L (formfeed) */
+ else if (c == '\014') /* Control-L (formfeed) */
col = altcol = 0; /* For Emacs users */
- }
- else {
+ else
break;
- }
}
tok_backup(tok, c);
if (c == '#' || c == '\n') {
@@ -1145,18 +1254,10 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
not passed to the parser as NEWLINE tokens,
except *totally* empty lines in interactive
mode, which signal the end of a command group. */
- if (col == 0 && c == '\n' && tok->prompt != NULL) {
+ if (col == 0 && c == '\n' && tok->prompt != NULL)
blankline = 0; /* Let it through */
- }
- else if (tok->prompt != NULL && tok->lineno == 1) {
- /* In interactive mode, if the first line contains
- only spaces and/or a comment, let it through. */
- blankline = 0;
- col = altcol = 0;
- }
- else {
+ else
blankline = 1; /* Ignore completely */
- }
/* We can't jump back right here since we still
may need to skip to the end of a comment */
}
@@ -1164,7 +1265,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ if (indenterror(tok))
+ return ERRORTOKEN;
}
}
else if (col > tok->indstack[tok->indent]) {
@@ -1175,7 +1277,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
return ERRORTOKEN;
}
if (altcol <= tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ if (indenterror(tok))
+ return ERRORTOKEN;
}
tok->pendin++;
tok->indstack[++tok->indent] = col;
@@ -1194,7 +1297,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
return ERRORTOKEN;
}
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ if (indenterror(tok))
+ return ERRORTOKEN;
}
}
}
@@ -1214,31 +1318,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
}
- /* Peek ahead at the next character */
- c = tok_nextc(tok);
- tok_backup(tok, c);
- /* Check if we are closing an async function */
- if (tok->async_def
- && !blankline
- /* Due to some implementation artifacts of type comments,
- * a TYPE_COMMENT at the start of a function won't set an
- * indentation level and it will produce a NEWLINE after it.
- * To avoid spuriously ending an async function due to this,
- * wait until we have some non-newline char in front of us. */
- && c != '\n'
- && tok->level == 0
- /* There was a NEWLINE after ASYNC DEF,
- so we're past the signature. */
- && tok->async_def_nl
- /* Current indentation level is less than where
- the async function was defined */
- && tok->async_def_indent >= tok->indent)
- {
- tok->async_def = 0;
- tok->async_def_indent = 0;
- tok->async_def_nl = 0;
- }
-
again:
tok->start = NULL;
/* Skip spaces */
@@ -1249,63 +1328,40 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
/* Set start of current token */
tok->start = tok->cur - 1;
- /* Skip comment, unless it's a type comment */
+ /* Skip comment, while looking for tab-setting magic */
if (c == '#') {
- const char *prefix, *p, *type_start;
-
- while (c != EOF && c != '\n') {
- c = tok_nextc(tok);
- }
-
- if (tok->type_comments) {
- p = tok->start;
- prefix = type_comment_prefix;
- while (*prefix && p < tok->cur) {
- if (*prefix == ' ') {
- while (*p == ' ' || *p == '\t') {
- p++;
- }
- } else if (*prefix == *p) {
- p++;
- } else {
- break;
- }
-
- prefix++;
- }
-
- /* This is a type comment if we matched all of type_comment_prefix. */
- if (!*prefix) {
- int is_type_ignore = 1;
- const char *ignore_end = p + 6;
- tok_backup(tok, c); /* don't eat the newline or EOF */
-
- type_start = p;
-
- /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
- * or anything ASCII and non-alphanumeric. */
- is_type_ignore = (
- tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
- && !(tok->cur > ignore_end
- && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
-
- if (is_type_ignore) {
- *p_start = (char *) ignore_end;
- *p_end = tok->cur;
-
- /* If this type ignore is the only thing on the line, consume the newline also. */
- if (blankline) {
- tok_nextc(tok);
- tok->atbol = 1;
- }
- return TYPE_IGNORE;
- } else {
- *p_start = (char *) type_start; /* after type_comment_prefix */
- *p_end = tok->cur;
- return TYPE_COMMENT;
+ static char *tabforms[] = {
+ "tab-width:", /* Emacs */
+ ":tabstop=", /* vim, full form */
+ ":ts=", /* vim, abbreviated form */
+ "set tabsize=", /* will vi never die? */
+ /* more templates can be added here to support other editors */
+ };
+ char cbuf[80];
+ char *tp, **cp;
+ tp = cbuf;
+ do {
+ *tp++ = c = tok_nextc(tok);
+ } while (c != EOF && c != '\n' &&
+ (size_t)(tp - cbuf + 1) < sizeof(cbuf));
+ *tp = '\0';
+ for (cp = tabforms;
+ cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
+ cp++) {
+ if ((tp = strstr(cbuf, *cp))) {
+ int newsize = atoi(tp + strlen(*cp));
+
+ if (newsize >= 1 && newsize <= 40) {
+ tok->tabsize = newsize;
+ if (Py_VerboseFlag)
+ PySys_WriteStderr(
+ "Tab size set to %d\n",
+ newsize);
}
}
}
+ while (c != EOF && c != '\n')
+ c = tok_nextc(tok);
}
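
The scan above changes tok->tabsize only when the comment contains one of the mode-line templates followed by a number between 1 and 40. A few illustrative comments and the size they would set:

/* Examples recognized by the tab-setting scan above:
 *
 *   # -*- tab-width: 4 -*-       Emacs form        -> tok->tabsize = 4
 *   # vim:tabstop=8:             vim, full form    -> tok->tabsize = 8
 *   # vi:ts=2:                   vim, short form   -> tok->tabsize = 2
 *   # set tabsize=40             classic vi        -> tok->tabsize = 40
 *
 * Values outside 1..40, or non-numeric text after the template, leave
 * tok->tabsize unchanged; with Py_VerboseFlag set the new size is
 * reported on stderr.
 */
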
/* Check for EOF and errors now */
@@ -1314,108 +1370,49 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
/* Identifier (most frequent token!) */
- nonascii = 0;
- if (is_potential_identifier_start(c)) {
- /* Process the various legal combinations of b"", r"", u"", and f"". */
- int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
- while (1) {
- if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
- saw_b = 1;
- /* Since this is a backwards compatibility support literal we don't
- want to support it in arbitrary order like byte literals. */
- else if (!(saw_b || saw_u || saw_r || saw_f)
- && (c == 'u'|| c == 'U')) {
- saw_u = 1;
- }
- /* ur"" and ru"" are not supported */
- else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
- saw_r = 1;
- }
- else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
- saw_f = 1;
- }
- else {
- break;
- }
+ if (Py_ISALPHA(c) || c == '_') {
+ /* Process r"", u"" and ur"" */
+ switch (c) {
+ case 'b':
+ case 'B':
c = tok_nextc(tok);
- if (c == '"' || c == '\'') {
+ if (c == 'r' || c == 'R')
+ c = tok_nextc(tok);
+ if (c == '"' || c == '\'')
goto letter_quote;
- }
+ break;
+ case 'r':
+ case 'R':
+ c = tok_nextc(tok);
+ if (c == '"' || c == '\'')
+ goto letter_quote;
+ break;
+ case 'u':
+ case 'U':
+ c = tok_nextc(tok);
+ if (c == 'r' || c == 'R')
+ c = tok_nextc(tok);
+ if (c == '"' || c == '\'')
+ goto letter_quote;
+ break;
}
- while (is_potential_identifier_char(c)) {
- if (c >= 128) {
- nonascii = 1;
- }
+ while (c != EOF && (Py_ISALNUM(c) || c == '_')) {
c = tok_nextc(tok);
}
tok_backup(tok, c);
- if (nonascii && !verify_identifier(tok)) {
- return ERRORTOKEN;
- }
*p_start = tok->start;
*p_end = tok->cur;
-
- /* async/await parsing block. */
- if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
- /* May be an 'async' or 'await' token. For Python 3.7 or
- later we recognize them unconditionally. For Python
- 3.5 or 3.6 we recognize 'async' in front of 'def', and
- either one inside of 'async def'. (Technically we
- shouldn't recognize these at all for 3.4 or earlier,
- but there's no *valid* Python 3.4 code that would be
- rejected, and async functions will be rejected in a
- later phase.) */
- if (!tok->async_hacks || tok->async_def) {
- /* Always recognize the keywords. */
- if (memcmp(tok->start, "async", 5) == 0) {
- return ASYNC;
- }
- if (memcmp(tok->start, "await", 5) == 0) {
- return AWAIT;
- }
- }
- else if (memcmp(tok->start, "async", 5) == 0) {
- /* The current token is 'async'.
- Look ahead one token to see if that is 'def'. */
-
- struct tok_state ahead_tok;
- char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
- int ahead_tok_kind;
-
- memcpy(&ahead_tok, tok, sizeof(ahead_tok));
- ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_tok_end);
-
- if (ahead_tok_kind == NAME
- && ahead_tok.cur - ahead_tok.start == 3
- && memcmp(ahead_tok.start, "def", 3) == 0)
- {
- /* The next token is going to be 'def', so instead of
- returning a plain NAME token, return ASYNC. */
- tok->async_def_indent = tok->indent;
- tok->async_def = 1;
- return ASYNC;
- }
- }
- }
-
return NAME;
}
/* Newline */
if (c == '\n') {
tok->atbol = 1;
- if (blankline || tok->level > 0) {
+ if (blankline || tok->level > 0)
goto nextline;
- }
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
- if (tok->async_def) {
- /* We're somewhere inside an 'async def' function, and
- we've encountered a NEWLINE after its signature. */
- tok->async_def_nl = 1;
- }
return NEWLINE;
}
@@ -1424,24 +1421,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c = tok_nextc(tok);
if (isdigit(c)) {
goto fraction;
- } else if (c == '.') {
- c = tok_nextc(tok);
- if (c == '.') {
- *p_start = tok->start;
- *p_end = tok->cur;
- return ELLIPSIS;
- }
- else {
- tok_backup(tok, c);
- }
- tok_backup(tok, '.');
}
else {
tok_backup(tok, c);
+ *p_start = tok->start;
+ *p_end = tok->cur;
+ return DOT;
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return DOT;
}
/* Number */
@@ -1449,136 +1435,94 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (c == '0') {
/* Hex, octal or binary -- maybe. */
c = tok_nextc(tok);
+ if (c == '.')
+ goto fraction;
+#ifndef WITHOUT_COMPLEX
+ if (c == 'j' || c == 'J')
+ goto imaginary;
+#endif
if (c == 'x' || c == 'X') {
+
/* Hex */
c = tok_nextc(tok);
+ if (!isxdigit(c)) {
+ tok->done = E_TOKEN;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
+ }
do {
- if (c == '_') {
- c = tok_nextc(tok);
- }
- if (!isxdigit(c)) {
- tok_backup(tok, c);
- return syntaxerror(tok, "invalid hexadecimal literal");
- }
- do {
- c = tok_nextc(tok);
- } while (isxdigit(c));
- } while (c == '_');
+ c = tok_nextc(tok);
+ } while (isxdigit(c));
}
else if (c == 'o' || c == 'O') {
/* Octal */
c = tok_nextc(tok);
- do {
- if (c == '_') {
- c = tok_nextc(tok);
- }
- if (c < '0' || c >= '8') {
- tok_backup(tok, c);
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
- }
- else {
- return syntaxerror(tok, "invalid octal literal");
- }
- }
- do {
- c = tok_nextc(tok);
- } while ('0' <= c && c < '8');
- } while (c == '_');
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ if (c < '0' || c >= '8') {
+ tok->done = E_TOKEN;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
}
+ do {
+ c = tok_nextc(tok);
+ } while ('0' <= c && c < '8');
}
else if (c == 'b' || c == 'B') {
/* Binary */
c = tok_nextc(tok);
- do {
- if (c == '_') {
- c = tok_nextc(tok);
- }
- if (c != '0' && c != '1') {
- tok_backup(tok, c);
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
- }
- else {
- return syntaxerror(tok, "invalid binary literal");
- }
- }
- do {
- c = tok_nextc(tok);
- } while (c == '0' || c == '1');
- } while (c == '_');
- if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ if (c != '0' && c != '1') {
+ tok->done = E_TOKEN;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
}
+ do {
+ c = tok_nextc(tok);
+ } while (c == '0' || c == '1');
}
else {
- int nonzero = 0;
- /* maybe old-style octal; c is first char of it */
- /* in any case, allow '0' as a literal */
- while (1) {
- if (c == '_') {
- c = tok_nextc(tok);
- if (!isdigit(c)) {
- tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
- }
- }
- if (c != '0') {
- break;
- }
+ int found_decimal = 0;
+ /* Octal; c is first char of it */
+ /* There's no 'isoctdigit' macro, sigh */
+ while ('0' <= c && c < '8') {
c = tok_nextc(tok);
}
if (isdigit(c)) {
- nonzero = 1;
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
+ found_decimal = 1;
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
}
- if (c == '.') {
- c = tok_nextc(tok);
+ if (c == '.')
goto fraction;
- }
- else if (c == 'e' || c == 'E') {
+ else if (c == 'e' || c == 'E')
goto exponent;
- }
- else if (c == 'j' || c == 'J') {
+#ifndef WITHOUT_COMPLEX
+ else if (c == 'j' || c == 'J')
goto imaginary;
- }
- else if (nonzero) {
- /* Old-style octal: now disallowed. */
+#endif
+ else if (found_decimal) {
+ tok->done = E_TOKEN;
tok_backup(tok, c);
- return syntaxerror(tok,
- "leading zeros in decimal integer "
- "literals are not permitted; "
- "use an 0o prefix for octal integers");
+ return ERRORTOKEN;
}
}
+ if (c == 'l' || c == 'L')
+ c = tok_nextc(tok);
}
else {
/* Decimal */
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
- {
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
+ if (c == 'l' || c == 'L')
+ c = tok_nextc(tok);
+ else {
/* Accept floating point numbers. */
if (c == '.') {
- c = tok_nextc(tok);
fraction:
/* Fraction */
- if (isdigit(c)) {
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
- }
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
}
if (c == 'e' || c == 'E') {
int e;
@@ -1589,8 +1533,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
if (c == '+' || c == '-') {
c = tok_nextc(tok);
if (!isdigit(c)) {
+ tok->done = E_TOKEN;
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return ERRORTOKEN;
}
} else if (!isdigit(c)) {
tok_backup(tok, c);
@@ -1599,16 +1544,16 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_end = tok->cur;
return NUMBER;
}
- c = tok_decimal_tail(tok);
- if (c == 0) {
- return ERRORTOKEN;
- }
+ do {
+ c = tok_nextc(tok);
+ } while (isdigit(c));
}
- if (c == 'j' || c == 'J') {
+#ifndef WITHOUT_COMPLEX
+ if (c == 'j' || c == 'J')
/* Imaginary part */
imaginary:
c = tok_nextc(tok);
- }
+#endif
}
}
tok_backup(tok, c);
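
Taken together, the branches above accept the Python 2 numeric literal forms and flag malformed ones by setting tok->done = E_TOKEN. A few illustrative inputs and their outcome:

/* Accepted by the number scanner above:
 *   0x1F  0X1f          hex
 *   0777                octal, leading-zero form
 *   0b101 0B101         binary
 *   10L   0xffL         long suffix 'l'/'L'
 *   3.14  .5  1e-9      floats (fraction and/or exponent)
 *   3j    0.5J          imaginary suffix (unless WITHOUT_COMPLEX)
 *
 * Rejected with tok->done = E_TOKEN:
 *   0x                  hex prefix with no hex digits
 *   0o8   0b2           digit out of range for the radix
 *   08                  leading-zero decimal (unless it continues as 08.5, 08e1, ...)
 *   1e+                 exponent sign with no digits
 */
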
@@ -1620,61 +1565,55 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
letter_quote:
/* String */
if (c == '\'' || c == '"') {
+ Py_ssize_t quote2 = tok->cur - tok->start + 1;
int quote = c;
- int quote_size = 1; /* 1 or 3 */
- int end_quote_size = 0;
-
- /* Nodes of type STRING, especially multi line strings
- must be handled differently in order to get both
- the starting line number and the column offset right.
- (cf. issue 16806) */
- tok->first_lineno = tok->lineno;
- tok->multi_line_start = tok->line_start;
-
- /* Find the quote size and start of string */
- c = tok_nextc(tok);
- if (c == quote) {
- c = tok_nextc(tok);
- if (c == quote) {
- quote_size = 3;
- }
- else {
- end_quote_size = 1; /* empty string found */
- }
- }
- if (c != quote) {
- tok_backup(tok, c);
- }
-
- /* Get rest of string */
- while (end_quote_size != quote_size) {
+ int triple = 0;
+ int tripcount = 0;
+ for (;;) {
c = tok_nextc(tok);
- if (c == EOF) {
- if (quote_size == 3) {
- tok->done = E_EOFS;
- }
- else {
+ if (c == '\n') {
+ if (!triple) {
tok->done = E_EOLS;
+ tok_backup(tok, c);
+ return ERRORTOKEN;
}
- tok->cur = tok->inp;
- return ERRORTOKEN;
+ tripcount = 0;
+ tok->cont_line = 1; /* multiline string. */
}
- if (quote_size == 1 && c == '\n') {
- tok->done = E_EOLS;
+ else if (c == EOF) {
+ if (triple)
+ tok->done = E_EOFS;
+ else
+ tok->done = E_EOLS;
tok->cur = tok->inp;
return ERRORTOKEN;
}
- if (c == quote) {
- end_quote_size += 1;
+ else if (c == quote) {
+ tripcount++;
+ if (tok->cur - tok->start == quote2) {
+ c = tok_nextc(tok);
+ if (c == quote) {
+ triple = 1;
+ tripcount = 0;
+ continue;
+ }
+ tok_backup(tok, c);
+ }
+ if (!triple || tripcount == 3)
+ break;
}
- else {
- end_quote_size = 0;
- if (c == '\\') {
- tok_nextc(tok); /* skip escaped char */
+ else if (c == '\\') {
+ tripcount = 0;
+ c = tok_nextc(tok);
+ if (c == EOF) {
+ tok->done = E_EOLS;
+ tok->cur = tok->inp;
+ return ERRORTOKEN;
}
}
+ else
+ tripcount = 0;
}
-
*p_start = tok->start;
*p_end = tok->cur;
return STRING;
@@ -1688,14 +1627,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
tok->cur = tok->inp;
return ERRORTOKEN;
}
- c = tok_nextc(tok);
- if (c == EOF) {
- tok->done = E_EOF;
- tok->cur = tok->inp;
- return ERRORTOKEN;
- } else {
- tok_backup(tok, c);
- }
tok->cont_line = 1;
goto again; /* Read next line */
}
@@ -1704,13 +1635,24 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
{
int c2 = tok_nextc(tok);
int token = PyToken_TwoChars(c, c2);
+#ifndef PGEN
+ if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
+ if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
+ "<> not supported in 3.x; use !=",
+ tok->filename, tok->lineno,
+ NULL, NULL)) {
+ tok->done = E_ERROR;
+ tok->cur = tok->inp;
+ return ERRORTOKEN;
+ }
+ }
+#endif
if (token != OP) {
int c3 = tok_nextc(tok);
int token3 = PyToken_ThreeChars(c, c2, c3);
if (token3 != OP) {
token = token3;
- }
- else {
+ } else {
tok_backup(tok, c3);
}
*p_start = tok->start;
@@ -1725,38 +1667,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
case '(':
case '[':
case '{':
- if (tok->level >= MAXLEVEL) {
- return syntaxerror(tok, "too many nested parentheses");
- }
- tok->parenstack[tok->level] = c;
- tok->parenlinenostack[tok->level] = tok->lineno;
tok->level++;
break;
case ')':
case ']':
case '}':
- if (!tok->level) {
- return syntaxerror(tok, "unmatched '%c'", c);
- }
tok->level--;
- int opening = tok->parenstack[tok->level];
- if (!((opening == '(' && c == ')') ||
- (opening == '[' && c == ']') ||
- (opening == '{' && c == '}')))
- {
- if (tok->parenlinenostack[tok->level] != tok->lineno) {
- return syntaxerror(tok,
- "closing parenthesis '%c' does not match "
- "opening parenthesis '%c' on line %d",
- c, opening, tok->parenlinenostack[tok->level]);
- }
- else {
- return syntaxerror(tok,
- "closing parenthesis '%c' does not match "
- "opening parenthesis '%c'",
- c, opening);
- }
- }
break;
}
@@ -1770,6 +1686,11 @@ int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
int result = tok_get(tok, p_start, p_end);
+ if (tok->fp && ferror(tok->fp)) {
+ clearerr(tok->fp);
+ result = ERRORTOKEN;
+ tok->done = E_IO;
+ }
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
@@ -1777,67 +1698,67 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
return result;
}
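
Together with the constructors and PyTokenizer_Free(), PyTokenizer_Get() is the whole interface: a caller loops until ENDMARKER (or ERRORTOKEN) and reads each token's text out of the p_start/p_end pair. A minimal sketch of such a driver, assuming the tokenizer and token headers are included and any interpreter state the decoding path needs is already set up:

#include <stdio.h>
#include "tokenizer.h"
#include "token.h"

/* Hypothetical driver: tokenize a source string and print each token. */
static int dump_tokens(const char *source)
{
    struct tok_state *tok = PyTokenizer_FromString(source, 1); /* 1: exec-style input */
    char *start, *end;
    int type;

    if (tok == NULL)
        return -1;
    do {
        type = PyTokenizer_Get(tok, &start, &end);
        if (type == ERRORTOKEN)
            break;
        printf("%-12s %.*s\n", _PyParser_TokenNames[type],
               start ? (int)(end - start) : 0, start ? start : "");
    } while (type != ENDMARKER);
    PyTokenizer_Free(tok);
    return (type == ENDMARKER) ? 0 : -1;
}
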
-/* Get the encoding of a Python file. Check for the coding cookie and check if
- the file starts with a BOM.
-
- PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
- encoding in the first or second line of the file (in which case the encoding
- should be assumed to be UTF-8).
-
- The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
- by the caller. */
+/* This function is only called from parsetok. However, it cannot live
+ there, as it must be empty for PGEN, and we can check for PGEN only
+ in this file. */
-char *
-PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
+#if defined(PGEN) || !defined(Py_USING_UNICODE)
+char*
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
- struct tok_state *tok;
- FILE *fp;
- char *p_start =NULL , *p_end =NULL , *encoding = NULL;
-
- fd = _Py_dup(fd);
- if (fd < 0) {
- return NULL;
- }
-
- fp = fdopen(fd, "r");
- if (fp == NULL) {
- return NULL;
- }
- tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
- if (tok == NULL) {
- fclose(fp);
- return NULL;
- }
- if (filename != NULL) {
- Py_INCREF(filename);
- tok->filename = filename;
- }
- else {
- tok->filename = PyUnicode_FromString("<string>");
- if (tok->filename == NULL) {
- fclose(fp);
- PyTokenizer_Free(tok);
- return encoding;
- }
- }
- while (tok->lineno < 2 && tok->done == E_OK) {
- PyTokenizer_Get(tok, &p_start, &p_end);
+ return NULL;
+}
+#else
+#ifdef Py_USING_UNICODE
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {
+ PyObject *ret = NULL;
+ PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
+ if (unicode_text) {
+ ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
+ Py_DECREF(unicode_text);
}
- fclose(fp);
- if (tok->encoding) {
- encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
- if (encoding)
- strcpy(encoding, tok->encoding);
+ if (!ret) {
+ PyErr_Clear();
}
- PyTokenizer_Free(tok);
- return encoding;
+ return ret;
}
-
char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
- return PyTokenizer_FindEncodingFilename(fd, NULL);
+ char *text = NULL;
+ if (tok->encoding) {
+ /* convert source to original encoding */
+ PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+ if (lineobj != NULL) {
+ int linelen = PyString_Size(lineobj);
+ const char *line = PyString_AsString(lineobj);
+ text = PyObject_MALLOC(linelen + 1);
+ if (text != NULL && line != NULL) {
+ if (linelen)
+ strncpy(text, line, linelen);
+ text[linelen] = '\0';
+ }
+ Py_DECREF(lineobj);
+
+ /* adjust error offset */
+ if (*offset > 1) {
+ PyObject *offsetobj = dec_utf8(tok->encoding,
+ tok->buf, *offset-1);
+ if (offsetobj) {
+ *offset = PyString_Size(offsetobj) + 1;
+ Py_DECREF(offsetobj);
+ }
+ }
+
+ }
+ }
+ return text;
+
}
+#endif /* defined(Py_USING_UNICODE) */
+#endif
+
#ifdef Py_DEBUG