author | Albert-Jan Nijburg <albertjan@trinket.io> | 2017-05-31 14:00:21 (GMT) |
---|---|---|
committer | Victor Stinner <victor.stinner@gmail.com> | 2017-05-31 14:00:21 (GMT) |
commit | fc354f07855a9197e71f851ad930cbf5652f9160 (patch) | |
tree | 5c7c6ad2a4de72993d916c321156572d4fa4635d | |
parent | 85aba238e49abd2d5a604102981d28a50f305443 (diff) | |
bpo-25324: copy tok_name before changing it (#1608)
* add test to check if we're modifying token
* copy list so importing tokenize doesn't have side effects on token (a minimal sketch of this check follows this list)
* shorten line
* add tokenize tokens to token.h to get them to show up in token
* move ERRORTOKEN back to its previous location, and fix nitpick
* copy comments from token.h automatically
* fix whitespace and make more pythonic
* address review comments from @haypo
* update token.rst and Misc/NEWS
* change wording
* some more wording changes
| mode | file | changes |
| --- | --- | --- |
| -rw-r--r-- | Doc/library/token.rst | 8 |
| -rw-r--r-- | Include/token.h | 6 |
| -rw-r--r-- | Lib/test/test_tokenize.py | 12 |
| -rw-r--r-- | Lib/token.py | 29 |
| -rw-r--r-- | Lib/tokenize.py | 11 |
| -rw-r--r-- | Misc/NEWS | 4 |
| -rw-r--r-- | Parser/tokenizer.c | 3 |
7 files changed, 52 insertions, 21 deletions
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index effb711..4bf15d5 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -101,6 +101,9 @@ The token constants are:
    AWAIT
    ASYNC
    ERRORTOKEN
+   COMMENT
+   NL
+   ENCODING
    N_TOKENS
    NT_OFFSET
 
@@ -108,3 +111,8 @@ The token constants are:
       Added :data:`AWAIT` and :data:`ASYNC` tokens. Starting with
       Python 3.7, "async" and "await" will be tokenized as :data:`NAME`
       tokens, and :data:`AWAIT` and :data:`ASYNC` will be removed.
+
+   .. versionchanged:: 3.7
+      Added :data:`COMMENT`, :data:`NL` and :data:`ENCODING` to bring
+      the tokens in the C code in line with the tokens needed in
+      :mod:`tokenize` module. These tokens aren't used by the C tokenizer.
\ No newline at end of file
diff --git a/Include/token.h b/Include/token.h
index 595afa0..b28830b 100644
--- a/Include/token.h
+++ b/Include/token.h
@@ -67,7 +67,11 @@ extern "C" {
 #define AWAIT           54
 #define ASYNC           55
 #define ERRORTOKEN      56
-#define N_TOKENS        57
+/* These aren't used by the C tokenizer but are needed for tokenize.py */
+#define COMMENT         57
+#define NL              58
+#define ENCODING        59
+#define N_TOKENS        60
 
 /* Special definitions for cooperation with parser */
 
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index dcaf58f..538612c 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1343,13 +1343,13 @@ class TestTokenize(TestCase):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
         self.assertEqual(len(tokens), 2 + num_optypes)
-        self.assertEqual(token.tok_name[tokens[0].exact_type],
-                         token.tok_name[ENCODING])
+        self.assertEqual(tok_name[tokens[0].exact_type],
+                         tok_name[ENCODING])
         for i in range(num_optypes):
-            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
-                             token.tok_name[optypes[i]])
-        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
-                         token.tok_name[token.ENDMARKER])
+            self.assertEqual(tok_name[tokens[i + 1].exact_type],
+                             tok_name[optypes[i]])
+        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
         self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
diff --git a/Lib/token.py b/Lib/token.py
index 5fdb222..091f80b 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -63,11 +63,17 @@ AT = 49
 ATEQUAL = 50
 RARROW = 51
 ELLIPSIS = 52
+# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
 OP = 53
 AWAIT = 54
 ASYNC = 55
 ERRORTOKEN = 56
-N_TOKENS = 57
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT = 57
+NL = 58
+ENCODING = 59
+N_TOKENS = 60
+# Special definitions for cooperation with parser
 NT_OFFSET = 256
 #--end constants--
 
@@ -102,15 +108,26 @@ def _main():
     with fp:
         lines = fp.read().split("\n")
     prog = re.compile(
-        "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
         re.IGNORECASE)
+    comment_regex = re.compile(
+        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
+        re.IGNORECASE)
+
     tokens = {}
+    prev_val = None
     for line in lines:
         match = prog.match(line)
         if match:
             name, val = match.group(1, 2)
             val = int(val)
-            tokens[val] = name          # reverse so we can sort them...
+            tokens[val] = {'token': name}  # reverse so we can sort them...
+            prev_val = val
+        else:
+            comment_match = comment_regex.match(line)
+            if comment_match and prev_val is not None:
+                comment = comment_match.group(1)
+                tokens[prev_val]['comment'] = comment
     keys = sorted(tokens.keys())
     # load the output skeleton from the target:
     try:
@@ -127,8 +144,10 @@
         sys.stderr.write("target does not contain format markers")
         sys.exit(3)
     lines = []
-    for val in keys:
-        lines.append("%s = %d" % (tokens[val], val))
+    for key in keys:
+        lines.append("%s = %d" % (tokens[key]["token"], key))
+        if "comment" in tokens[key]:
+            lines.append("# %s" % tokens[key]["comment"])
     format[start:end] = lines
     try:
         fp = open(outFileName, 'w')
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 9017bb1..5fa4152 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -38,17 +38,10 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
-                           "NL", "untokenize", "ENCODING", "TokenInfo"]
+__all__ = token.__all__ + ["tokenize", "detect_encoding",
+                           "untokenize", "TokenInfo"]
 del token
 
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
 EXACT_TOKEN_TYPES = {
     '(': LPAR,
     ')': RPAR,
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1?
 Core and Builtins
 -----------------
 
+- bpo-25324: Tokens needed for parsing in Python moved to C. ``COMMENT``,
+  ``NL`` and ``ENCODING``. This way the tokens and tok_names in the token
+  module don't get changed when you import the tokenize module.
+
 - bpo-29104: Fixed parsing backslashes in f-strings.
 
 - bpo-27945: Fixed various segfaults with dict when input collections are
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5cc9533..7f2f3e6 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -106,6 +106,9 @@ const char *_PyParser_TokenNames[] = {
     "AWAIT",
     "ASYNC",
     "<ERRORTOKEN>",
+    "COMMENT",
+    "NL",
+    "ENCODING"
     "<N_TOKENS>"
 };
 
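As a rough usage note (not part of the commit): with this change applied, the three tokenize-only token types are ordinary constants of the token module, numbered 57-59 with N_TOKENS bumped to 60 as shown in the diff above; later CPython releases renumber them, so treat the printed values as specific to this commit.

```python
import token
import tokenize

# COMMENT, NL and ENCODING now live in Lib/token.py (and Include/token.h),
# so they are visible without any help from tokenize.
print(token.COMMENT, token.NL, token.ENCODING, token.N_TOKENS)  # 57 58 59 60

# tokenize re-exports the same constants via `from token import *`,
# so both modules agree on numbering and names.
print(tokenize.ENCODING == token.ENCODING)   # True
print(token.tok_name[token.NL])              # 'NL'
```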
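The other half of the change, in Lib/token.py's _main() helper, remembers the value of the last matched #define read from Include/token.h and, when the following line is a /* ... */ comment, attaches that comment to the previous token so regenerating token.py carries the C comments over. A small standalone illustration of that pairing (regexes adapted from the diff, driver loop simplified, sample header text inlined):

```python
import re

define_re = re.compile(r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)")
comment_re = re.compile(r"^\s*/\*\s*(.+?)\s*\*/\s*$")

header = """\
#define ERRORTOKEN 56
/* These aren't used by the C tokenizer but are needed for tokenize.py */
#define COMMENT 57
"""

tokens, prev_val = {}, None
for line in header.splitlines():
    m = define_re.match(line)
    if m:
        # Record the token and remember its value for a possible comment below.
        prev_val = int(m.group(2))
        tokens[prev_val] = {"token": m.group(1)}
    else:
        c = comment_re.match(line)
        if c and prev_val is not None:
            tokens[prev_val]["comment"] = c.group(1)

print(tokens)
# {56: {'token': 'ERRORTOKEN',
#       'comment': "These aren't used by the C tokenizer but are needed for tokenize.py"},
#  57: {'token': 'COMMENT'}}
```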