summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Doc/library/tokenize.rst 53
-rw-r--r-- Lib/test/test_tokenize.py 75
-rw-r--r-- Lib/tokenize.py 59
-rw-r--r-- Misc/NEWS 3
4 files changed, 187 insertions, 3 deletions
diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst
index 050d74c..37d9f41 100644
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -15,6 +15,11 @@ implemented in Python. The scanner in this module returns comments as tokens
as well, making it useful for implementing "pretty-printers," including
colorizers for on-screen displays.
+To simplify token stream handling, all :ref:`operator <operators>` and
+:ref:`delimiter <delimiters>` tokens are returned using the generic
+:data:`token.OP` token type. The exact type is given by the ``exact_type``
+property of the :term:`named tuple` returned by :func:`tokenize.tokenize`.
+
Tokenizing Input
----------------
@@ -36,9 +41,17 @@ The primary entry point is a :term:`generator`:
returned as a :term:`named tuple` with the field names:
``type string start end line``.
+ The returned :term:`named tuple` has an additional property named
+ ``exact_type`` that contains the exact operator type for
+ :data:`token.OP` tokens. For all other token types ``exact_type``
+ equals the named tuple ``type`` field.
+
.. versionchanged:: 3.1
Added support for named tuples.
+ .. versionchanged:: 3.3
+ Added support for ``exact_type``.
+
:func:`tokenize` determines the source encoding of the file by looking for a
UTF-8 BOM or encoding cookie, according to :pep:`263`.
@@ -131,7 +144,19 @@ It is as simple as:
.. code-block:: sh
- python -m tokenize [filename.py]
+ python -m tokenize [-e] [filename.py]
+
+The following options are accepted:
+
+.. program:: tokenize
+
+.. cmdoption:: -h, --help
+
+ show this help message and exit
+
+.. cmdoption:: -e, --exact
+
+ display token names using the exact type
If :file:`filename.py` is specified its contents are tokenized to stdout.
Otherwise, tokenization is performed on stdin.
@@ -215,3 +240,29 @@ the name of the token, and the final column is the value of the token (if any)
4,10-4,11: OP ')'
4,11-4,12: NEWLINE '\n'
5,0-5,0: ENDMARKER ''
+
+The exact token type names can be displayed using the ``-e`` option:
+
+.. code-block:: sh
+
+ $ python -m tokenize -e hello.py
+ 0,0-0,0: ENCODING 'utf-8'
+ 1,0-1,3: NAME 'def'
+ 1,4-1,13: NAME 'say_hello'
+ 1,13-1,14: LPAR '('
+ 1,14-1,15: RPAR ')'
+ 1,15-1,16: COLON ':'
+ 1,16-1,17: NEWLINE '\n'
+ 2,0-2,4: INDENT ' '
+ 2,4-2,9: NAME 'print'
+ 2,9-2,10: LPAR '('
+ 2,10-2,25: STRING '"Hello, World!"'
+ 2,25-2,26: RPAR ')'
+ 2,26-2,27: NEWLINE '\n'
+ 3,0-3,1: NL '\n'
+ 4,0-4,0: DEDENT ''
+ 4,0-4,9: NAME 'say_hello'
+ 4,9-4,10: LPAR '('
+ 4,10-4,11: RPAR ')'
+ 4,11-4,12: NEWLINE '\n'
+ 5,0-5,0: ENDMARKER ''
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index af2bbf1..dce3c6e 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -567,11 +567,12 @@ Non-ascii identifiers
from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
- STRING, ENDMARKER, tok_name, detect_encoding,
+ STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open)
from io import BytesIO
from unittest import TestCase
import os, sys, glob
+import token
def dump_tokens(s):
"""Print out the tokens in s in a table format.
@@ -922,6 +923,78 @@ class TestTokenize(TestCase):
self.assertTrue(encoding_used, encoding)
+ def assertExactTypeEqual(self, opstr, *optypes):
+ tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
+ num_optypes = len(optypes)
+ self.assertEqual(len(tokens), 2 + num_optypes)
+ self.assertEqual(token.tok_name[tokens[0].exact_type],
+ token.tok_name[ENCODING])
+ for i in range(num_optypes):
+ self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
+ token.tok_name[optypes[i]])
+ self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
+ token.tok_name[token.ENDMARKER])
+
+ def test_exact_type(self):
+ self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
+ self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
+ self.assertExactTypeEqual(':', token.COLON)
+ self.assertExactTypeEqual(',', token.COMMA)
+ self.assertExactTypeEqual(';', token.SEMI)
+ self.assertExactTypeEqual('+', token.PLUS)
+ self.assertExactTypeEqual('-', token.MINUS)
+ self.assertExactTypeEqual('*', token.STAR)
+ self.assertExactTypeEqual('/', token.SLASH)
+ self.assertExactTypeEqual('|', token.VBAR)
+ self.assertExactTypeEqual('&', token.AMPER)
+ self.assertExactTypeEqual('<', token.LESS)
+ self.assertExactTypeEqual('>', token.GREATER)
+ self.assertExactTypeEqual('=', token.EQUAL)
+ self.assertExactTypeEqual('.', token.DOT)
+ self.assertExactTypeEqual('%', token.PERCENT)
+ self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
+ self.assertExactTypeEqual('==', token.EQEQUAL)
+ self.assertExactTypeEqual('!=', token.NOTEQUAL)
+ self.assertExactTypeEqual('<=', token.LESSEQUAL)
+ self.assertExactTypeEqual('>=', token.GREATEREQUAL)
+ self.assertExactTypeEqual('~', token.TILDE)
+ self.assertExactTypeEqual('^', token.CIRCUMFLEX)
+ self.assertExactTypeEqual('<<', token.LEFTSHIFT)
+ self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
+ self.assertExactTypeEqual('**', token.DOUBLESTAR)
+ self.assertExactTypeEqual('+=', token.PLUSEQUAL)
+ self.assertExactTypeEqual('-=', token.MINEQUAL)
+ self.assertExactTypeEqual('*=', token.STAREQUAL)
+ self.assertExactTypeEqual('/=', token.SLASHEQUAL)
+ self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
+ self.assertExactTypeEqual('&=', token.AMPEREQUAL)
+ self.assertExactTypeEqual('|=', token.VBAREQUAL)
+ self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
+ self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
+ self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
+ self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
+ self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
+ self.assertExactTypeEqual('//', token.DOUBLESLASH)
+ self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
+ self.assertExactTypeEqual('@', token.AT)
+
+ self.assertExactTypeEqual('a**2+b**2==c**2',
+ NAME, token.DOUBLESTAR, NUMBER,
+ token.PLUS,
+ NAME, token.DOUBLESTAR, NUMBER,
+ token.EQEQUAL,
+ NAME, token.DOUBLESTAR, NUMBER)
+ self.assertExactTypeEqual('{1, 2, 3}',
+ token.LBRACE,
+ token.NUMBER, token.COMMA,
+ token.NUMBER, token.COMMA,
+ token.NUMBER,
+ token.RBRACE)
+ self.assertExactTypeEqual('^(x & 0x1)',
+ token.CIRCUMFLEX,
+ token.LPAR,
+ token.NAME, token.AMPER, token.NUMBER,
+ token.RPAR)
__test__ = {"doctests" : doctests, 'decistmt': decistmt}
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index f923e17..4c42bbc 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -45,6 +45,51 @@ tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
+EXACT_TOKEN_TYPES = {
+ '(': LPAR,
+ ')': RPAR,
+ '[': LSQB,
+ ']': RSQB,
+ ':': COLON,
+ ',': COMMA,
+ ';': SEMI,
+ '+': PLUS,
+ '-': MINUS,
+ '*': STAR,
+ '/': SLASH,
+ '|': VBAR,
+ '&': AMPER,
+ '<': LESS,
+ '>': GREATER,
+ '=': EQUAL,
+ '.': DOT,
+ '%': PERCENT,
+ '{': LBRACE,
+ '}': RBRACE,
+ '==': EQEQUAL,
+ '!=': NOTEQUAL,
+ '<=': LESSEQUAL,
+ '>=': GREATEREQUAL,
+ '~': TILDE,
+ '^': CIRCUMFLEX,
+ '<<': LEFTSHIFT,
+ '>>': RIGHTSHIFT,
+ '**': DOUBLESTAR,
+ '+=': PLUSEQUAL,
+ '-=': MINEQUAL,
+ '*=': STAREQUAL,
+ '/=': SLASHEQUAL,
+ '%=': PERCENTEQUAL,
+ '&=': AMPEREQUAL,
+ '|=': VBAREQUAL,
+ '^=': CIRCUMFLEXEQUAL,
+ '<<=': LEFTSHIFTEQUAL,
+ '>>=': RIGHTSHIFTEQUAL,
+ '**=': DOUBLESTAREQUAL,
+ '//': DOUBLESLASH,
+ '//=': DOUBLESLASHEQUAL,
+ '@': AT
+}
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
@@ -52,6 +97,13 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
self._replace(type=annotated_type))
+ @property
+ def exact_type(self):
+ if self.type == OP and self.string in EXACT_TOKEN_TYPES:
+ return EXACT_TOKEN_TYPES[self.string]
+ else:
+ return self.type
+
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
@@ -549,6 +601,8 @@ def main():
parser.add_argument(dest='filename', nargs='?',
metavar='filename.py',
help='the file to tokenize; defaults to stdin')
+ parser.add_argument('-e', '--exact', dest='exact', action='store_true',
+ help='display token names using the exact type')
args = parser.parse_args()
try:
@@ -563,9 +617,12 @@ def main():
# Output the tokenization
for token in tokens:
+ token_type = token.type
+ if args.exact:
+ token_type = token.exact_type
token_range = "%d,%d-%d,%d:" % (token.start + token.end)
print("%-20s%-15s%-15r" %
- (token_range, tok_name[token.type], token.string))
+ (token_range, tok_name[token_type], token.string))
except IndentationError as err:
line, column = err.args[1][1:3]
error(err.args[0], filename, (line, column))
diff --git a/Misc/NEWS b/Misc/NEWS
index 2d70565..d7daba4 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -450,6 +450,9 @@ Core and Builtins
Library
-------
+- Issue #2134: A new attribute that specifies the exact type of token.OP
+ tokens has been added to tokenize.TokenInfo.
+
- Issue #13722: Avoid silencing ImportErrors when initializing the codecs
registry.