Diffstat (limited to 'Tools/peg_generator/pegen/tokenizer.py')
-rw-r--r-- | Tools/peg_generator/pegen/tokenizer.py | 58
1 file changed, 45 insertions, 13 deletions
diff --git a/Tools/peg_generator/pegen/tokenizer.py b/Tools/peg_generator/pegen/tokenizer.py
index 61a28ef..7ee49e1 100644
--- a/Tools/peg_generator/pegen/tokenizer.py
+++ b/Tools/peg_generator/pegen/tokenizer.py
@@ -1,10 +1,10 @@
 import token
 import tokenize
-from typing import List, Iterator
+from typing import Dict, Iterator, List
 
 Mark = int  # NewType('Mark', int)
 
-exact_token_types = token.EXACT_TOKEN_TYPES  # type: ignore
+exact_token_types = token.EXACT_TOKEN_TYPES
 
 
 def shorttok(tok: tokenize.TokenInfo) -> str:
@@ -19,26 +19,22 @@ class Tokenizer:
 
     _tokens: List[tokenize.TokenInfo]
 
-    def __init__(self, tokengen: Iterator[tokenize.TokenInfo], *, verbose: bool = False):
+    def __init__(
+        self, tokengen: Iterator[tokenize.TokenInfo], *, path: str = "", verbose: bool = False
+    ):
         self._tokengen = tokengen
         self._tokens = []
         self._index = 0
         self._verbose = verbose
+        self._lines: Dict[int, str] = {}
+        self._path = path
         if verbose:
             self.report(False, False)
 
     def getnext(self) -> tokenize.TokenInfo:
         """Return the next token and updates the index."""
-        cached = True
-        while self._index == len(self._tokens):
-            tok = next(self._tokengen)
-            if tok.type in (tokenize.NL, tokenize.COMMENT):
-                continue
-            if tok.type == token.ERRORTOKEN and tok.string.isspace():
-                continue
-            self._tokens.append(tok)
-            cached = False
-        tok = self._tokens[self._index]
+        cached = not self._index == len(self._tokens)
+        tok = self.peek()
         self._index += 1
         if self._verbose:
             self.report(cached, False)
@@ -52,7 +48,15 @@ class Tokenizer:
                 continue
             if tok.type == token.ERRORTOKEN and tok.string.isspace():
                 continue
+            if (
+                tok.type == token.NEWLINE
+                and self._tokens
+                and self._tokens[-1].type == token.NEWLINE
+            ):
+                continue
             self._tokens.append(tok)
+            if not self._path:
+                self._lines[tok.start[0]] = tok.line
         return self._tokens[self._index]
 
     def diagnose(self) -> tokenize.TokenInfo:
@@ -60,6 +64,34 @@
             self.getnext()
         return self._tokens[-1]
 
+    def get_last_non_whitespace_token(self) -> tokenize.TokenInfo:
+        for tok in reversed(self._tokens[: self._index]):
+            if tok.type != tokenize.ENDMARKER and (
+                tok.type < tokenize.NEWLINE or tok.type > tokenize.DEDENT
+            ):
+                break
+        return tok
+
+    def get_lines(self, line_numbers: List[int]) -> List[str]:
+        """Retrieve source lines corresponding to line numbers."""
+        if self._lines:
+            lines = self._lines
+        else:
+            n = len(line_numbers)
+            lines = {}
+            count = 0
+            seen = 0
+            with open(self._path) as f:
+                for l in f:
+                    count += 1
+                    if count in line_numbers:
+                        seen += 1
+                        lines[count] = l
+                        if seen == n:
+                            break
+
+        return [lines[n] for n in line_numbers]
+
     def mark(self) -> Mark:
         return self._index
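In the new version, getnext() delegates the cache-filling loop to peek(), so the skip logic (NL, COMMENT, whitespace ERRORTOKEN, and now consecutive NEWLINE tokens) lives in one place. The keyword-only path argument decides where get_lines() finds source text: with an empty path, peek() records each cached token's line attribute in _lines; with a real path, get_lines() reads the file lazily. get_last_non_whitespace_token() walks backwards past NEWLINE, INDENT, DEDENT, and ENDMARKER to find the token an error message should point at.

A minimal usage sketch (not part of the diff; it assumes pegen.tokenizer is importable, e.g. with Tools/peg_generator on sys.path):

    import io
    import tokenize

    from pegen.tokenizer import Tokenizer  # assumes Tools/peg_generator is on sys.path

    source = "x = 1\ny = 2\n"
    tokengen = tokenize.generate_tokens(io.StringIO(source).readline)

    # Empty path: peek()/getnext() cache each token's source line in _lines.
    tokenizer = Tokenizer(tokengen, path="")

    # Drain the stream so the line cache is populated.
    while tokenizer.getnext().type != tokenize.ENDMARKER:
        pass

    print(tokenizer.get_lines([1, 2]))  # -> ['x = 1\n', 'y = 2\n']

With a non-empty path, the same get_lines() call would instead open the file and stop reading after the last requested line number has been seen, which keeps error reporting cheap for large inputs.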