author     Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>  2023-05-31 10:11:53 (GMT)
committer  GitHub <noreply@github.com>  2023-05-31 10:11:53 (GMT)
commit     c687946f6815a17bc5ceacaf3bbceba5b41e73fd (patch)
tree       232c64d0c0190d8da0f3d6b9c3ab4528e4bcba0c /Lib/tokenize.py
parent     2f8c22f1d6c22f018c78264937db66d52fb18869 (diff)
[3.12] gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070) (#105119)
gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070)
(cherry picked from commit 9216e69a87d16d871625721ed5a8aa302511f367)
Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r--  Lib/tokenize.py  32
1 file changed, 11 insertions, 21 deletions
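A small usage sketch (not part of the commit, using only the public tokenize API): after this change both entry points hand the readline-style callable straight to the C tokenizer instead of first draining it into a single source string.

import io
import tokenize

source = "x = 1\nprint(x)\n"

# generate_tokens() expects a readline callable that returns str;
# lines are now pulled one at a time by the C tokenizer.
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    print(tok.type, repr(tok.string), tok.start, tok.end)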
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 4895e94..380dc2a 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -34,6 +34,7 @@ import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
+import _tokenize
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -443,12 +444,7 @@ def tokenize(readline):
# BOM will already have been stripped.
encoding = "utf-8"
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
- yield from _tokenize(rl_gen, encoding)
-
-def _tokenize(rl_gen, encoding):
- source = b"".join(rl_gen).decode(encoding)
- for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
- yield token
+ yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
def generate_tokens(readline):
"""Tokenize a source reading Python code as unicode strings.
@@ -456,16 +452,7 @@ def generate_tokens(readline):
This has the same API as tokenize(), except that it expects the *readline*
callable to return str objects instead of bytes.
"""
- def _gen():
- while True:
- try:
- line = readline()
- except StopIteration:
- return
- if not line:
- return
- yield line.encode()
- return _tokenize(_gen(), 'utf-8')
+ return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
def main():
import argparse
@@ -502,9 +489,9 @@ def main():
tokens = list(tokenize(f.readline))
else:
filename = "<stdin>"
- tokens = _tokenize(
+ tokens = _generate_tokens_from_c_tokenizer(
(x.encode('utf-8') for x in iter(sys.stdin.readline, "")
- ), "utf-8")
+ ), "utf-8", extra_tokens=True)
# Output the tokenization
@@ -531,10 +518,13 @@ def main():
perror("unexpected error: %s" % err)
raise
-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
- import _tokenize as c_tokenizer
- for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+ if encoding is None:
+ it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+ else:
+ it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+ for info in it:
yield TokenInfo._make(info)
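For the bytes-level entry point, a similar illustrative sketch: tokenize() still detects the encoding and yields the ENCODING token itself, then, as shown in the hunk above, forwards the readline calls to _generate_tokens_from_c_tokenizer via rl_gen.__next__.

import io
import tokenize

data = b"# -*- coding: utf-8 -*-\ny = 2\n"

# tokenize() expects a readline callable that returns bytes.
for tok in tokenize.tokenize(io.BytesIO(data).readline):
    print(tok)
# The first token printed is the ENCODING token ('utf-8'); the rest come
# from the C tokenizer, which consumes the input line by line.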