summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlbert-Jan Nijburg <albertjan@trinket.io>2017-05-31 14:00:21 (GMT)
committerVictor Stinner <victor.stinner@gmail.com>2017-05-31 14:00:21 (GMT)
commitfc354f07855a9197e71f851ad930cbf5652f9160 (patch)
tree5c7c6ad2a4de72993d916c321156572d4fa4635d
parent85aba238e49abd2d5a604102981d28a50f305443 (diff)
downloadcpython-fc354f07855a9197e71f851ad930cbf5652f9160.zip
cpython-fc354f07855a9197e71f851ad930cbf5652f9160.tar.gz
cpython-fc354f07855a9197e71f851ad930cbf5652f9160.tar.bz2
bpo-25324: copy tok_name before changing it (#1608)
* add test to check if we're modifying token * copy list so import tokenize doesn't have side effects on token * shorten line * add tokenize tokens to token.h to get them to show up in token * move ERRORTOKEN back to its previous location, and fix nitpick * copy comments from token.h automatically * fix whitespace and make more pythonic * change to fix comments from @haypo * update token.rst and Misc/NEWS * change wording * some more wording changes
-rw-r--r--Doc/library/token.rst8
-rw-r--r--Include/token.h6
-rw-r--r--Lib/test/test_tokenize.py12
-rw-r--r--Lib/token.py29
-rw-r--r--Lib/tokenize.py11
-rw-r--r--Misc/NEWS4
-rw-r--r--Parser/tokenizer.c3
7 files changed, 52 insertions, 21 deletions
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index effb711..4bf15d5 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -101,6 +101,9 @@ The token constants are:
AWAIT
ASYNC
ERRORTOKEN
+ COMMENT
+ NL
+ ENCODING
N_TOKENS
NT_OFFSET
@@ -108,3 +111,8 @@ The token constants are:
Added :data:`AWAIT` and :data:`ASYNC` tokens. Starting with
Python 3.7, "async" and "await" will be tokenized as :data:`NAME`
tokens, and :data:`AWAIT` and :data:`ASYNC` will be removed.
+
+ .. versionchanged:: 3.7
+ Added :data:`COMMENT`, :data:`NL` and :data:`ENCODING` to bring
+ the tokens in the C code in line with the tokens needed in the
+ :mod:`tokenize` module. These tokens aren't used by the C tokenizer. \ No newline at end of file
diff --git a/Include/token.h b/Include/token.h
index 595afa0..b28830b 100644
--- a/Include/token.h
+++ b/Include/token.h
@@ -67,7 +67,11 @@ extern "C" {
#define AWAIT 54
#define ASYNC 55
#define ERRORTOKEN 56
-#define N_TOKENS 57
+/* These aren't used by the C tokenizer but are needed for tokenize.py */
+#define COMMENT 57
+#define NL 58
+#define ENCODING 59
+#define N_TOKENS 60
/* Special definitions for cooperation with parser */
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index dcaf58f..538612c 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1343,13 +1343,13 @@ class TestTokenize(TestCase):
tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
num_optypes = len(optypes)
self.assertEqual(len(tokens), 2 + num_optypes)
- self.assertEqual(token.tok_name[tokens[0].exact_type],
- token.tok_name[ENCODING])
+ self.assertEqual(tok_name[tokens[0].exact_type],
+ tok_name[ENCODING])
for i in range(num_optypes):
- self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
- token.tok_name[optypes[i]])
- self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
- token.tok_name[token.ENDMARKER])
+ self.assertEqual(tok_name[tokens[i + 1].exact_type],
+ tok_name[optypes[i]])
+ self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+ tok_name[token.ENDMARKER])
def test_exact_type(self):
self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
diff --git a/Lib/token.py b/Lib/token.py
index 5fdb222..091f80b 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -63,11 +63,17 @@ AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
+# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
OP = 53
AWAIT = 54
ASYNC = 55
ERRORTOKEN = 56
-N_TOKENS = 57
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT = 57
+NL = 58
+ENCODING = 59
+N_TOKENS = 60
+# Special definitions for cooperation with parser
NT_OFFSET = 256
#--end constants--
@@ -102,15 +108,26 @@ def _main():
with fp:
lines = fp.read().split("\n")
prog = re.compile(
- "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+ r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
re.IGNORECASE)
+ comment_regex = re.compile(
+ r"^\s*/\*\s*(.+?)\s*\*/\s*$",
+ re.IGNORECASE)
+
tokens = {}
+ prev_val = None
for line in lines:
match = prog.match(line)
if match:
name, val = match.group(1, 2)
val = int(val)
- tokens[val] = name # reverse so we can sort them...
+ tokens[val] = {'token': name} # reverse so we can sort them...
+ prev_val = val
+ else:
+ comment_match = comment_regex.match(line)
+ if comment_match and prev_val is not None:
+ comment = comment_match.group(1)
+ tokens[prev_val]['comment'] = comment
keys = sorted(tokens.keys())
# load the output skeleton from the target:
try:
@@ -127,8 +144,10 @@ def _main():
sys.stderr.write("target does not contain format markers")
sys.exit(3)
lines = []
- for val in keys:
- lines.append("%s = %d" % (tokens[val], val))
+ for key in keys:
+ lines.append("%s = %d" % (tokens[key]["token"], key))
+ if "comment" in tokens[key]:
+ lines.append("# %s" % tokens[key]["comment"])
format[start:end] = lines
try:
fp = open(outFileName, 'w')
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 9017bb1..5fa4152 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -38,17 +38,10 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
- "NL", "untokenize", "ENCODING", "TokenInfo"]
+__all__ = token.__all__ + ["tokenize", "detect_encoding",
+ "untokenize", "TokenInfo"]
del token
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
EXACT_TOKEN_TYPES = {
'(': LPAR,
')': RPAR,
diff --git a/Misc/NEWS b/Misc/NEWS
index 6f90175..cda5ce0 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1?
Core and Builtins
-----------------
+- bpo-25324: The ``COMMENT``, ``NL`` and ``ENCODING`` tokens needed by the
+ tokenize module are now also defined in C (``Include/token.h``) and in the
+ token module, so importing tokenize no longer modifies ``token.tok_name``.
+
- bpo-29104: Fixed parsing backslashes in f-strings.
- bpo-27945: Fixed various segfaults with dict when input collections are
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5cc9533..7f2f3e6 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -106,6 +106,9 @@ const char *_PyParser_TokenNames[] = {
"AWAIT",
"ASYNC",
"<ERRORTOKEN>",
+ "COMMENT",
+ "NL",
+ "ENCODING",
"<N_TOKENS>"
};