summary | refs | log | tree | commit | diff | stats
path: root/Parser/tokenizer.c
diff options
context:
space:
mode:
author: Guido van Rossum <guido@python.org> 2007-11-16 00:51:45 (GMT)
committer: Guido van Rossum <guido@python.org> 2007-11-16 00:51:45 (GMT)
commit: cf171a7fbcf4967feb7b4cd01c56250fb3fc8c8a (patch)
tree: 62836362cc9d2346f1c902b4cdd2fe2582d7b257 /Parser/tokenizer.c
parent: 053b4f3a0e3ef404b1c663229fd6ebaf0a1fb0a9 (diff)
download: cpython-cf171a7fbcf4967feb7b4cd01c56250fb3fc8c8a.zip
cpython-cf171a7fbcf4967feb7b4cd01c56250fb3fc8c8a.tar.gz
cpython-cf171a7fbcf4967feb7b4cd01c56250fb3fc8c8a.tar.bz2
Cleanup of tokenizer.c.
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- Parser/tokenizer.c 108
1 files changed, 47 insertions, 61 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 710c566..1c2b8e8 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1269,30 +1269,24 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
/* Identifier (most frequent token!) */
nonascii = 0;
if (is_potential_identifier_start(c)) {
- /* Process r"", u"" and ur"" */
- switch (c) {
- case 'r':
- case 'R':
+ /* Process b"", r"" and br"" */
+ if (c == 'b' || c == 'B') {
c = tok_nextc(tok);
if (c == '"' || c == '\'')
goto letter_quote;
- break;
- case 'b':
- case 'B':
+ }
+ if (c == 'r' || c == 'R') {
c = tok_nextc(tok);
- if (c == 'r' || c == 'R')
- c = tok_nextc(tok);
if (c == '"' || c == '\'')
goto letter_quote;
- break;
- }
+ }
while (is_potential_identifier_char(c)) {
if (c >= 128)
nonascii = 1;
c = tok_nextc(tok);
}
tok_backup(tok, c);
- if (nonascii &&
+ if (nonascii &&
!verify_identifier(tok->start, tok->cur)) {
tok->done = E_IDENTIFIER;
return ERRORTOKEN;
@@ -1322,7 +1316,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
c = tok_nextc(tok);
if (c == '.') {
*p_start = tok->start;
- *p_end = tok->cur;
+ *p_end = tok->cur;
return ELLIPSIS;
} else {
tok_backup(tok, c);
@@ -1436,55 +1430,47 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
letter_quote:
/* String */
if (c == '\'' || c == '"') {
- Py_ssize_t quote2 = tok->cur - tok->start + 1;
- int quote = c;
- int triple = 0;
- int tripcount = 0;
- for (;;) {
- c = tok_nextc(tok);
- if (c == '\n') {
- if (!triple) {
- tok->done = E_EOLS;
- tok_backup(tok, c);
- return ERRORTOKEN;
- }
- tripcount = 0;
- tok->cont_line = 1; /* multiline string. */
- }
- else if (c == EOF) {
- if (triple)
- tok->done = E_EOFS;
- else
- tok->done = E_EOLS;
- tok->cur = tok->inp;
- return ERRORTOKEN;
- }
- else if (c == quote) {
- tripcount++;
- if (tok->cur - tok->start == quote2) {
- c = tok_nextc(tok);
- if (c == quote) {
- triple = 1;
- tripcount = 0;
- continue;
- }
- tok_backup(tok, c);
- }
- if (!triple || tripcount == 3)
- break;
- }
- else if (c == '\\') {
- tripcount = 0;
- c = tok_nextc(tok);
- if (c == EOF) {
- tok->done = E_EOLS;
- tok->cur = tok->inp;
- return ERRORTOKEN;
- }
- }
+ int quote = c;
+ int quote_size = 1; /* 1 or 3 */
+ int end_quote_size = 0;
+
+ /* Find the quote size and start of string */
+ c = tok_nextc(tok);
+ if (c == quote) {
+ c = tok_nextc(tok);
+ if (c == quote)
+ quote_size = 3;
else
- tripcount = 0;
+ end_quote_size = 1; /* empty string found */
}
+ if (c != quote)
+ tok_backup(tok, c);
+
+ /* Get rest of string */
+ while (end_quote_size != quote_size) {
+ c = tok_nextc(tok);
+ if (c == EOF) {
+ if (quote_size == 3)
+ tok->done = E_EOFS;
+ else
+ tok->done = E_EOLS;
+ tok->cur = tok->inp;
+ return ERRORTOKEN;
+ }
+ if (quote_size == 1 && c == '\n') {
+ tok->done = E_EOLS;
+ tok->cur = tok->inp;
+ return ERRORTOKEN;
+ }
+ if (c == quote)
+ end_quote_size += 1;
+ else {
+ end_quote_size = 0;
+ if (c == '\\')
+ c = tok_nextc(tok); /* skip escaped char */
+ }
+ }
+
*p_start = tok->start;
*p_end = tok->cur;
return STRING;
@@ -1619,7 +1605,7 @@ PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
/* Get -*- encoding -*- from a Python file.
PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
- the first or second line of the file (in which case the encoding
+ the first or second line of the file (in which case the encoding
should be assumed to be PyUnicode_GetDefaultEncoding()).
The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed