author     Marta Gómez Macías <mgmacias@google.com>    2023-05-21 00:03:02 (GMT)
committer  GitHub <noreply@github.com>                 2023-05-21 00:03:02 (GMT)
commit     6715f91edcf6f379f666e18f57b8a0dcb724bf79 (patch)
tree       25724d6eb5b8ff5e713f7bfd8f6c33e5a6d87f62 /Lib/tokenize.py
parent     3ed57e4995d9f8583083483f397ddc3131720953 (diff)
gh-102856: Python tokenizer implementation for PEP 701 (#104323)
This commit replaces the Python implementation of the tokenize module with an implementation
that reuses the real C tokenizer via a private extension module. The tokenize module now implements
a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward
compatibility.
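To illustrate the backward-compatibility claim, here is an editorial sketch (not part of the commit; Python 3.12+ assumed): the public tokenize API is unchanged and still yields TokenInfo tuples with the same fields, even though the tokens now originate in the C tokenizer.

    import io
    import tokenize

    # Same public API as before: TokenInfo keeps the
    # (type, string, start, end, line) shape.
    source = b"x = 1\n"
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))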
As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer. It is currently used only through the extension module that exposes it to the Python layer. This mode forces the C tokenizer to emit the extra tokens and to attach the metadata needed to match the old Python implementation.
Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
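A quick way to observe the extra-tokens mode from the Python layer (again a sketch, not from the commit): COMMENT and the non-semantic NL are exactly the tokens the C tokenizer emits only in this mode, and they still appear through the public API.

    import io
    import tokenize

    # COMMENT and NL are produced only in the new extra-tokens
    # mode of the C tokenizer, preserving the old output.
    source = "x = 1  # a comment\n\n"
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))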
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r--   Lib/tokenize.py   339
1 file changed, 51 insertions, 288 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 46d2224..bfe40c6 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -56,112 +56,11 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
         else:
             return self.type
 
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
-
-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+'
-
-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
-Binnumber = r'0[bB](?:_?[01])+'
-Octnumber = r'0[oO](?:_?[0-7])+'
-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
-Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
-Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
-                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
-Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
-# Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes():
-    # The valid string prefixes. Only contain the lower case versions,
-    # and don't contain any permutations (include 'fr', but not
-    # 'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
-    # if we add binary f-strings, add: ['fb', 'fbr']
-    result = {''}
-    for prefix in _valid_string_prefixes:
-        for t in _itertools.permutations(prefix):
-            # create a list with upper and lower versions of each
-            # character
-            for u in _itertools.product(*[(c, c.upper()) for c in t]):
-                result.add(''.join(u))
-    return result
-
-@functools.lru_cache
-def _compile(expr):
-    return re.compile(expr, re.UNICODE)
-
-# Note that since _all_string_prefixes includes the empty string,
-# StringPrefix can be the empty string (making it optional).
-StringPrefix = group(*_all_string_prefixes())
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-# Single-line ' or " string.
-String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-
-# Sorting in reverse order puts the long operators before their prefixes.
-# Otherwise if = came before ==, == would get recognized as two instances
-# of =.
-Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
-Funny = group(r'\r?\n', Special)
-
-PlainToken = group(Number, Funny, String, Name)
-Token = Ignore + PlainToken
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
-PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-
-# For a given string prefix plus quotes, endpats maps it to a regex
-# to match the remainder of that string. _prefix can be empty, for
-# a normal single or triple quoted string (with no prefix).
-endpats = {}
-for _prefix in _all_string_prefixes():
-    endpats[_prefix + "'"] = Single
-    endpats[_prefix + '"'] = Double
-    endpats[_prefix + "'''"] = Single3
-    endpats[_prefix + '"""'] = Double3
-del _prefix
-
-# A set of all of the single and triple quoted string prefixes,
-# including the opening quotes.
-single_quoted = set()
-triple_quoted = set()
-for t in _all_string_prefixes():
-    for u in (t + '"', t + "'"):
-        single_quoted.add(u)
-    for u in (t + '"""', t + "'''"):
-        triple_quoted.add(u)
-del t, u
-
-tabsize = 8
 
 class TokenError(Exception): pass
 
-class StopTokenizing(Exception): pass
+class StopTokenizing(Exception): pass
 
 class Untokenizer:
@@ -213,6 +112,14 @@ class Untokenizer:
                     self.tokens.append(indent)
                     self.prev_col = len(indent)
                 startline = False
+            elif tok_type == FSTRING_MIDDLE:
+                if '{' in token or '}' in token:
+                    end_line, end_col = end
+                    end = (end_line, end_col + token.count('{') + token.count('}'))
+                    token = re.sub('{', '{{', token)
+                    token = re.sub('}', '}}', token)
+
+
             self.add_whitespace(start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
@@ -255,6 +162,11 @@ class Untokenizer:
             elif startline and indents:
                 toks_append(indents[-1])
                 startline = False
+            elif toknum == FSTRING_MIDDLE:
+                if '{' in tokval or '}' in tokval:
+                    tokval = re.sub('{', '{{', tokval)
+                    tokval = re.sub('}', '}}', tokval)
+
             toks_append(tokval)
 
@@ -404,7 +316,6 @@ def open(filename):
         buffer.close()
         raise
 
-
 def tokenize(readline):
     """
     The tokenize() generator requires one argument, readline, which
@@ -425,192 +336,32 @@ def tokenize(readline):
     which tells you which encoding was used to decode the bytes stream.
     """
     encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
     if encoding is not None:
         if encoding == "utf-8-sig":
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                                strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                                strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                                (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or     # ordinary number
-                        (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                # they're in the single_quoted set. If so, they start
-                # a string.
-                # We're using the first 3, because we're looking for
-                # "rb'" (for example) at the start of the token. If
-                # we switch to longer prefixes, this needs to be
-                # adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                # triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        # token. This is looking for the matching end
-                        # regex for the correct type of quote
-                        # character. So it's really looking for
-                        # endpats["'"] or endpats['"'], by trying to
-                        # skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                                (lnum, pos), (lnum, pos+1), line)
-                pos += 1
-
-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+    yield from _tokenize(rl_gen, encoding)
+
+
+def _tokenize(rl_gen, encoding):
+    source = b"".join(rl_gen).decode(encoding)
+    token = None
+    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+        # TODO: Marta -> clean this up
+        if 6 < token.type <= 54:
+            token = token._replace(type=OP)
+        if token.type in {ASYNC, AWAIT}:
+            token = token._replace(type=NAME)
+        if token.type == NEWLINE:
+            l_start, c_start = token.start
+            l_end, c_end = token.end
+            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
+
+        yield token
+    if token is not None:
+        last_line, _ = token.start
+        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
 
 
 def generate_tokens(readline):
@@ -619,7 +370,16 @@ def generate_tokens(readline):
     This has the same API as tokenize(), except that it expects the *readline*
     callable to return str objects instead of bytes.
     """
-    return _tokenize(readline, None)
+    def _gen():
+        while True:
+            try:
+                line = readline()
+            except StopIteration:
+                return
+            if not line:
+                return
+            yield line.encode()
+    return _tokenize(_gen(), 'utf-8')
 
 def main():
     import argparse
@@ -656,7 +416,10 @@ def main():
             tokens = list(tokenize(f.readline))
         else:
             filename = "<stdin>"
-            tokens = _tokenize(sys.stdin.readline, None)
+            tokens = _tokenize(
+                (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
+            ), "utf-8")
+
 
     # Output the tokenization
     for token in tokens:
@@ -682,10 +445,10 @@ def main():
         perror("unexpected error: %s" % err)
         raise
 
-def _generate_tokens_from_c_tokenizer(source):
+def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source):
+    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
         tok, type, lineno, end_lineno, col_off, end_col_off, line = info
         yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
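One detail worth calling out from the Untokenizer hunks above: FSTRING_MIDDLE tokens carry the parsed text of an f-string segment, so a literal brace written as {{ in the source comes back as a single {, and doubling the braces restores source that round-trips. A standalone sketch of that transform (reescape is a hypothetical helper name, not part of the module):

    import re

    def reescape(tokval: str) -> str:
        # Double each brace so the untokenized source parses back
        # to the same FSTRING_MIDDLE content.
        tokval = re.sub('{', '{{', tokval)
        tokval = re.sub('}', '}}', tokval)
        return tokval

    assert reescape('a{b}c') == 'a{{b}}c'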