diff options
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 246 |
1 files changed, 169 insertions, 77 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index eda1542..8961884 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -202,8 +202,8 @@ error_ret(struct tok_state *tok) /* XXX */ } -static char * -get_normal_name(char *s) /* for utf-8 and latin-1 */ +static const char * +get_normal_name(const char *s) /* for utf-8 and latin-1 */ { char buf[13]; int i; @@ -264,7 +264,7 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t if (begin < t) { char* r = new_string(begin, t - begin, tok); - char* q; + const char* q; if (!r) return 0; q = get_normal_name(r); @@ -1335,6 +1335,28 @@ verify_identifier(struct tok_state *tok) } #endif +static int +tok_decimal_tail(struct tok_state *tok) +{ + int c; + + while (1) { + do { + c = tok_nextc(tok); + } while (isdigit(c)); + if (c != '_') { + break; + } + c = tok_nextc(tok); + if (!isdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return 0; + } + } + return c; +} + /* Get next token, after space stripping etc. */ static int @@ -1355,17 +1377,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->atbol = 0; for (;;) { c = tok_nextc(tok); - if (c == ' ') + if (c == ' ') { col++, altcol++; + } else if (c == '\t') { col = (col/tok->tabsize + 1) * tok->tabsize; altcol = (altcol/tok->alttabsize + 1) * tok->alttabsize; } - else if (c == '\014') /* Control-L (formfeed) */ + else if (c == '\014') {/* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ - else + } + else { break; + } } tok_backup(tok, c); if (c == '#' || c == '\n') { @@ -1374,10 +1399,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ - if (col == 0 && c == '\n' && tok->prompt != NULL) + if (col == 0 && c == '\n' && tok->prompt != NULL) { blankline = 0; /* Let it through */ - else + } + else { blankline = 1; /* Ignore completely */ + } /* We can't jump back right here since we still may need to skip to the end of a comment */ } @@ -1385,8 +1412,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (col == tok->indstack[tok->indent]) { /* No change */ if (altcol != tok->altindstack[tok->indent]) { - if (indenterror(tok)) + if (indenterror(tok)) { return ERRORTOKEN; + } } } else if (col > tok->indstack[tok->indent]) { @@ -1397,8 +1425,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol <= tok->altindstack[tok->indent]) { - if (indenterror(tok)) + if (indenterror(tok)) { return ERRORTOKEN; + } } tok->pendin++; tok->indstack[++tok->indent] = col; @@ -1417,8 +1446,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) return ERRORTOKEN; } if (altcol != tok->altindstack[tok->indent]) { - if (indenterror(tok)) + if (indenterror(tok)) { return ERRORTOKEN; + } } } } @@ -1464,9 +1494,11 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->start = tok->cur - 1; /* Skip comment */ - if (c == '#') - while (c != EOF && c != '\n') + if (c == '#') { + while (c != EOF && c != '\n') { c = tok_nextc(tok); + } + } /* Check for EOF and errors now */ if (c == EOF) { @@ -1477,31 +1509,41 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) nonascii = 0; if (is_potential_identifier_start(c)) { /* Process b"", r"", u"", br"" and rb"" */ - int saw_b = 0, saw_r = 0, saw_u = 0; + int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; while (1) { - if (!(saw_b || saw_u) && (c == 'b' || c == 'B')) + if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) saw_b = 1; /* Since this is a backwards compatibility support literal we don't want to support it in arbitrary order like byte literals. */ - else if (!(saw_b || saw_u || saw_r) && (c == 'u' || c == 'U')) + else if (!(saw_b || saw_u || saw_r || saw_f) + && (c == 'u'|| c == 'U')) { saw_u = 1; + } /* ur"" and ru"" are not supported */ - else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) + else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { saw_r = 1; - else + } + else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { + saw_f = 1; + } + else { break; + } c = tok_nextc(tok); - if (c == '"' || c == '\'') + if (c == '"' || c == '\'') { goto letter_quote; + } } while (is_potential_identifier_char(c)) { - if (c >= 128) + if (c >= 128) { nonascii = 1; + } c = tok_nextc(tok); } tok_backup(tok, c); - if (nonascii && !verify_identifier(tok)) + if (nonascii && !verify_identifier(tok)) { return ERRORTOKEN; + } *p_start = tok->start; *p_end = tok->cur; @@ -1510,10 +1552,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Current token length is 5. */ if (tok->async_def) { /* We're inside an 'async def' function. */ - if (memcmp(tok->start, "async", 5) == 0) + if (memcmp(tok->start, "async", 5) == 0) { return ASYNC; - if (memcmp(tok->start, "await", 5) == 0) + } + if (memcmp(tok->start, "await", 5) == 0) { return AWAIT; + } } else if (memcmp(tok->start, "async", 5) == 0) { /* The current token is 'async'. @@ -1546,8 +1590,9 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) /* Newline */ if (c == '\n') { tok->atbol = 1; - if (blankline || tok->level > 0) + if (blankline || tok->level > 0) { goto nextline; + } *p_start = tok->start; *p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; @@ -1570,11 +1615,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_start = tok->start; *p_end = tok->cur; return ELLIPSIS; - } else { + } + else { tok_backup(tok, c); } tok_backup(tok, '.'); - } else { + } + else { tok_backup(tok, c); } *p_start = tok->start; @@ -1587,64 +1634,94 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) if (c == '0') { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); - if (c == '.') - goto fraction; - if (c == 'j' || c == 'J') - goto imaginary; if (c == 'x' || c == 'X') { - /* Hex */ c = tok_nextc(tok); - if (!isxdigit(c)) { - tok->done = E_TOKEN; - tok_backup(tok, c); - return ERRORTOKEN; - } do { - c = tok_nextc(tok); - } while (isxdigit(c)); + if (c == '_') { + c = tok_nextc(tok); + } + if (!isxdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + do { + c = tok_nextc(tok); + } while (isxdigit(c)); + } while (c == '_'); } else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); - if (c < '0' || c >= '8') { - tok->done = E_TOKEN; - tok_backup(tok, c); - return ERRORTOKEN; - } do { - c = tok_nextc(tok); - } while ('0' <= c && c < '8'); + if (c == '_') { + c = tok_nextc(tok); + } + if (c < '0' || c >= '8') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + do { + c = tok_nextc(tok); + } while ('0' <= c && c < '8'); + } while (c == '_'); } else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); - if (c != '0' && c != '1') { - tok->done = E_TOKEN; - tok_backup(tok, c); - return ERRORTOKEN; - } do { - c = tok_nextc(tok); - } while (c == '0' || c == '1'); + if (c == '_') { + c = tok_nextc(tok); + } + if (c != '0' && c != '1') { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + do { + c = tok_nextc(tok); + } while (c == '0' || c == '1'); + } while (c == '_'); } else { int nonzero = 0; /* maybe old-style octal; c is first char of it */ /* in any case, allow '0' as a literal */ - while (c == '0') + while (1) { + if (c == '_') { + c = tok_nextc(tok); + if (!isdigit(c)) { + tok->done = E_TOKEN; + tok_backup(tok, c); + return ERRORTOKEN; + } + } + if (c != '0') { + break; + } c = tok_nextc(tok); - while (isdigit(c)) { + } + if (isdigit(c)) { nonzero = 1; - c = tok_nextc(tok); + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } } - if (c == '.') + if (c == '.') { + c = tok_nextc(tok); goto fraction; - else if (c == 'e' || c == 'E') + } + else if (c == 'e' || c == 'E') { goto exponent; - else if (c == 'j' || c == 'J') + } + else if (c == 'j' || c == 'J') { goto imaginary; + } else if (nonzero) { + /* Old-style octal: now disallowed. */ tok->done = E_TOKEN; tok_backup(tok, c); return ERRORTOKEN; @@ -1653,17 +1730,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) } else { /* Decimal */ - do { - c = tok_nextc(tok); - } while (isdigit(c)); + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } { /* Accept floating point numbers. */ if (c == '.') { + c = tok_nextc(tok); fraction: /* Fraction */ - do { - c = tok_nextc(tok); - } while (isdigit(c)); + if (isdigit(c)) { + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } + } } if (c == 'e' || c == 'E') { int e; @@ -1685,14 +1767,16 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) *p_end = tok->cur; return NUMBER; } - do { - c = tok_nextc(tok); - } while (isdigit(c)); + c = tok_decimal_tail(tok); + if (c == 0) { + return ERRORTOKEN; + } } - if (c == 'j' || c == 'J') + if (c == 'j' || c == 'J') { /* Imaginary part */ imaginary: c = tok_nextc(tok); + } } } tok_backup(tok, c); @@ -1712,22 +1796,27 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) c = tok_nextc(tok); if (c == quote) { c = tok_nextc(tok); - if (c == quote) + if (c == quote) { quote_size = 3; - else + } + else { end_quote_size = 1; /* empty string found */ + } } - if (c != quote) + if (c != quote) { tok_backup(tok, c); + } /* Get rest of string */ while (end_quote_size != quote_size) { c = tok_nextc(tok); if (c == EOF) { - if (quote_size == 3) + if (quote_size == 3) { tok->done = E_EOFS; - else + } + else { tok->done = E_EOLS; + } tok->cur = tok->inp; return ERRORTOKEN; } @@ -1736,12 +1825,14 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) tok->cur = tok->inp; return ERRORTOKEN; } - if (c == quote) + if (c == quote) { end_quote_size += 1; + } else { end_quote_size = 0; - if (c == '\\') - c = tok_nextc(tok); /* skip escaped char */ + if (c == '\\') { + tok_nextc(tok); /* skip escaped char */ + } } } @@ -1771,7 +1862,8 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) int token3 = PyToken_ThreeChars(c, c2, c3); if (token3 != OP) { token = token3; - } else { + } + else { tok_backup(tok, c3); } *p_start = tok->start; |