author    | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2023-05-31 10:11:53 (GMT)
committer | GitHub <noreply@github.com> | 2023-05-31 10:11:53 (GMT)
commit    | c687946f6815a17bc5ceacaf3bbceba5b41e73fd (patch)
tree      | 232c64d0c0190d8da0f3d6b9c3ab4528e4bcba0c /Lib/tokenize.py
parent    | 2f8c22f1d6c22f018c78264937db66d52fb18869 (diff)
[3.12] gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070) (#105119)
gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070)
(cherry picked from commit 9216e69a87d16d871625721ed5a8aa302511f367)
Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
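The public entry points keep their signatures: tokenize() still takes a readline callable returning bytes and generate_tokens() one returning str; what changes is that the callable is now handed to the C tokenizer and consumed one line at a time instead of being joined into a single source string up front. A minimal usage sketch (the sample source string here is made up for illustration):

import io
import tokenize

source = "x = 1\nprint(x)\n"

# generate_tokens(): readline returns str lines; with this change the callable
# is forwarded to the C tokenizer and pulled lazily, line by line.
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    print(tok)

# tokenize(): readline returns bytes lines; the encoding is detected first and
# the remaining lines are then streamed into the C tokenizer.
for tok in tokenize.tokenize(io.BytesIO(source.encode("utf-8")).readline):
    print(tok)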
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r-- | Lib/tokenize.py | 32
1 file changed, 11 insertions, 21 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 4895e94..380dc2a 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -34,6 +34,7 @@ import re
 import sys
 from token import *
 from token import EXACT_TOKEN_TYPES
+import _tokenize
 
 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -443,12 +444,7 @@ def tokenize(readline):
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize(rl_gen, encoding)
-
-def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        yield token
+    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
 
 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
@@ -456,16 +452,7 @@ def generate_tokens(readline):
     This has the same API as tokenize(), except that it expects the *readline*
     callable to return str objects instead of bytes.
     """
-    def _gen():
-        while True:
-            try:
-                line = readline()
-            except StopIteration:
-                return
-            if not line:
-                return
-            yield line.encode()
-    return _tokenize(_gen(), 'utf-8')
+    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
 
 def main():
     import argparse
@@ -502,9 +489,9 @@ def main():
             tokens = list(tokenize(f.readline))
         else:
             filename = "<stdin>"
-            tokens = _tokenize(
+            tokens = _generate_tokens_from_c_tokenizer(
                 (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
-                ), "utf-8")
+                ), "utf-8", extra_tokens=True)
 
         # Output the tokenization
@@ -531,10 +518,13 @@ def main():
         perror("unexpected error: %s" % err)
         raise
 
-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
-    import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+    if encoding is None:
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+    else:
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+    for info in it:
         yield TokenInfo._make(info)
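For reference, the two call shapes the patch leaves behind can be exercised directly. _generate_tokens_from_c_tokenizer is a private helper, so treat this as a sketch that merely mirrors the call sites in the diff above, not a supported API:

import io
from tokenize import _generate_tokens_from_c_tokenizer

src = "a = 1\n"

# generate_tokens() path: the readline callable yields str and no encoding is
# passed, so TokenizerIter receives the lines as-is.
for tok in _generate_tokens_from_c_tokenizer(io.StringIO(src).readline,
                                             extra_tokens=True):
    print(tok)

# tokenize() path: the readline callable yields bytes and the detected encoding
# is passed positionally, so TokenizerIter decodes each line; an empty line
# signals end of input, per the usual readline convention.
for tok in _generate_tokens_from_c_tokenizer(io.BytesIO(src.encode("utf-8")).readline,
                                             "utf-8", extra_tokens=True):
    print(tok)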