Diffstat (limited to 'Tools/peg_generator/pegen/tokenizer.py')
-rw-r--r--  Tools/peg_generator/pegen/tokenizer.py  58
1 file changed, 45 insertions(+), 13 deletions(-)
diff --git a/Tools/peg_generator/pegen/tokenizer.py b/Tools/peg_generator/pegen/tokenizer.py
index 61a28ef..7ee49e1 100644
--- a/Tools/peg_generator/pegen/tokenizer.py
+++ b/Tools/peg_generator/pegen/tokenizer.py
@@ -1,10 +1,10 @@
import token
import tokenize
-from typing import List, Iterator
+from typing import Dict, Iterator, List
Mark = int # NewType('Mark', int)
-exact_token_types = token.EXACT_TOKEN_TYPES # type: ignore
+exact_token_types = token.EXACT_TOKEN_TYPES
def shorttok(tok: tokenize.TokenInfo) -> str:
@@ -19,26 +19,22 @@ class Tokenizer:
_tokens: List[tokenize.TokenInfo]
- def __init__(self, tokengen: Iterator[tokenize.TokenInfo], *, verbose: bool = False):
+ def __init__(
+ self, tokengen: Iterator[tokenize.TokenInfo], *, path: str = "", verbose: bool = False
+ ):
self._tokengen = tokengen
self._tokens = []
self._index = 0
self._verbose = verbose
+ self._lines: Dict[int, str] = {}
+ self._path = path
if verbose:
self.report(False, False)
def getnext(self) -> tokenize.TokenInfo:
"""Return the next token and updates the index."""
- cached = True
- while self._index == len(self._tokens):
- tok = next(self._tokengen)
- if tok.type in (tokenize.NL, tokenize.COMMENT):
- continue
- if tok.type == token.ERRORTOKEN and tok.string.isspace():
- continue
- self._tokens.append(tok)
- cached = False
- tok = self._tokens[self._index]
+ cached = not self._index == len(self._tokens)
+ tok = self.peek()
self._index += 1
if self._verbose:
self.report(cached, False)
@@ -52,7 +48,15 @@ class Tokenizer:
continue
if tok.type == token.ERRORTOKEN and tok.string.isspace():
continue
+ if (
+ tok.type == token.NEWLINE
+ and self._tokens
+ and self._tokens[-1].type == token.NEWLINE
+ ):
+ continue
self._tokens.append(tok)
+ if not self._path:
+ self._lines[tok.start[0]] = tok.line
return self._tokens[self._index]
def diagnose(self) -> tokenize.TokenInfo:
@@ -60,6 +64,34 @@ class Tokenizer:
self.getnext()
return self._tokens[-1]
+ def get_last_non_whitespace_token(self) -> tokenize.TokenInfo:
+ for tok in reversed(self._tokens[: self._index]):
+ if tok.type != tokenize.ENDMARKER and (
+ tok.type < tokenize.NEWLINE or tok.type > tokenize.DEDENT
+ ):
+ break
+ return tok
+
+ def get_lines(self, line_numbers: List[int]) -> List[str]:
+ """Retrieve source lines corresponding to line numbers."""
+ if self._lines:
+ lines = self._lines
+ else:
+ n = len(line_numbers)
+ lines = {}
+ count = 0
+ seen = 0
+ with open(self._path) as f:
+ for l in f:
+ count += 1
+ if count in line_numbers:
+ seen += 1
+ lines[count] = l
+ if seen == n:
+ break
+
+ return [lines[n] for n in line_numbers]
+
def mark(self) -> Mark:
return self._index
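
Usage sketch (not part of the commit): getnext() now derives its cached flag from whether the index already points into the token cache and delegates fetching to peek(), which still skips NL and COMMENT tokens and additionally collapses consecutive NEWLINE tokens. A minimal demonstration, assuming Tools/peg_generator is on sys.path so that pegen.tokenizer is importable; the input string is illustrative:

    import io
    import tokenize

    from pegen.tokenizer import Tokenizer

    source = "x = 1  # a comment\ny = 2\n"
    gen = tokenize.generate_tokens(io.StringIO(source).readline)
    t = Tokenizer(gen)

    print(t.peek().string)     # 'x' -- peek() fills the cache without advancing
    print(t.getnext().string)  # 'x' again; this call finds it already cached
    t.getnext()                # consumes '='
    t.getnext()                # consumes '1'
    print(t.getnext().type == tokenize.NEWLINE)  # True: the COMMENT was skipped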
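
The new get_lines() serves error reporting in two modes: with the default empty path, peek() memoizes each token's source line in _lines as tokens are fetched (useful when there is no file to reread, e.g. interactive input), while a non-empty path makes get_lines() scan the file once, stopping as soon as every requested line number has been seen. A sketch of the in-memory mode, under the same import assumption:

    import io
    import tokenize

    from pegen.tokenizer import Tokenizer

    source = "a = 1\nb = 2\nc = 3\n"
    t = Tokenizer(tokenize.generate_tokens(io.StringIO(source).readline))

    # Consume the stream so peek() memoizes every source line in _lines.
    while t.getnext().type != tokenize.ENDMARKER:
        pass

    print(t.get_lines([1, 3]))  # ['a = 1\n', 'c = 3\n']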
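
get_last_non_whitespace_token() walks the already-consumed tokens backwards, skipping ENDMARKER and the NEWLINE..DEDENT range, so a caller can point diagnostics at the last significant token rather than at trailing whitespace-like tokens. A small sketch, again assuming pegen is importable:

    import io
    import tokenize

    from pegen.tokenizer import Tokenizer

    t = Tokenizer(tokenize.generate_tokens(io.StringIO("x = 1\n").readline))
    while t.getnext().type != tokenize.ENDMARKER:
        pass

    # The consumed stream ends NUMBER '1', NEWLINE, ENDMARKER; the helper
    # walks back past the trailers to the last significant token.
    print(t.get_last_non_whitespace_token().string)  # '1'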