Diffstat (limited to 'Tools/peg_generator/pegen/tokenizer.py')
-rw-r--r-- | Tools/peg_generator/pegen/tokenizer.py | 58
1 file changed, 45 insertions, 13 deletions
diff --git a/Tools/peg_generator/pegen/tokenizer.py b/Tools/peg_generator/pegen/tokenizer.py
index 61a28ef..7ee49e1 100644
--- a/Tools/peg_generator/pegen/tokenizer.py
+++ b/Tools/peg_generator/pegen/tokenizer.py
@@ -1,10 +1,10 @@
 import token
 import tokenize
-from typing import List, Iterator
+from typing import Dict, Iterator, List
 
 Mark = int  # NewType('Mark', int)
 
-exact_token_types = token.EXACT_TOKEN_TYPES  # type: ignore
+exact_token_types = token.EXACT_TOKEN_TYPES
 
 
 def shorttok(tok: tokenize.TokenInfo) -> str:
@@ -19,26 +19,22 @@ class Tokenizer:
 
     _tokens: List[tokenize.TokenInfo]
 
-    def __init__(self, tokengen: Iterator[tokenize.TokenInfo], *, verbose: bool = False):
+    def __init__(
+        self, tokengen: Iterator[tokenize.TokenInfo], *, path: str = "", verbose: bool = False
+    ):
         self._tokengen = tokengen
         self._tokens = []
         self._index = 0
         self._verbose = verbose
+        self._lines: Dict[int, str] = {}
+        self._path = path
         if verbose:
             self.report(False, False)
 
     def getnext(self) -> tokenize.TokenInfo:
         """Return the next token and updates the index."""
-        cached = True
-        while self._index == len(self._tokens):
-            tok = next(self._tokengen)
-            if tok.type in (tokenize.NL, tokenize.COMMENT):
-                continue
-            if tok.type == token.ERRORTOKEN and tok.string.isspace():
-                continue
-            self._tokens.append(tok)
-            cached = False
-        tok = self._tokens[self._index]
+        cached = not self._index == len(self._tokens)
+        tok = self.peek()
         self._index += 1
         if self._verbose:
             self.report(cached, False)
@@ -52,7 +48,15 @@ class Tokenizer:
                 continue
             if tok.type == token.ERRORTOKEN and tok.string.isspace():
                 continue
+            if (
+                tok.type == token.NEWLINE
+                and self._tokens
+                and self._tokens[-1].type == token.NEWLINE
+            ):
+                continue
             self._tokens.append(tok)
+            if not self._path:
+                self._lines[tok.start[0]] = tok.line
         return self._tokens[self._index]
 
     def diagnose(self) -> tokenize.TokenInfo:
@@ -60,6 +64,34 @@
             self.getnext()
         return self._tokens[-1]
 
+    def get_last_non_whitespace_token(self) -> tokenize.TokenInfo:
+        for tok in reversed(self._tokens[: self._index]):
+            if tok.type != tokenize.ENDMARKER and (
+                tok.type < tokenize.NEWLINE or tok.type > tokenize.DEDENT
+            ):
+                break
+        return tok
+
+    def get_lines(self, line_numbers: List[int]) -> List[str]:
+        """Retrieve source lines corresponding to line numbers."""
+        if self._lines:
+            lines = self._lines
+        else:
+            n = len(line_numbers)
+            lines = {}
+            count = 0
+            seen = 0
+            with open(self._path) as f:
+                for l in f:
+                    count += 1
+                    if count in line_numbers:
+                        seen += 1
+                        lines[count] = l
+                        if seen == n:
+                            break
+
+        return [lines[n] for n in line_numbers]
+
     def mark(self) -> Mark:
         return self._index
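In the new version, getnext() delegates the cache-filling loop to peek(), so the skip logic (NL, COMMENT, whitespace ERRORTOKEN, and now consecutive NEWLINE tokens) lives in one place. The keyword-only path argument decides where get_lines() finds source text: with an empty path, peek() records each cached token's line attribute in _lines; with a real path, get_lines() reads the file lazily. get_last_non_whitespace_token() walks backwards past NEWLINE, INDENT, DEDENT, and ENDMARKER to find the token an error message should point at.

A minimal usage sketch (not part of the diff; it assumes pegen.tokenizer is importable, e.g. with Tools/peg_generator on sys.path):

    import io
    import tokenize

    from pegen.tokenizer import Tokenizer  # assumes Tools/peg_generator is on sys.path

    source = "x = 1\ny = 2\n"
    tokengen = tokenize.generate_tokens(io.StringIO(source).readline)

    # Empty path: peek()/getnext() cache each token's source line in _lines.
    tokenizer = Tokenizer(tokengen, path="")

    # Drain the stream so the line cache is populated.
    while tokenizer.getnext().type != tokenize.ENDMARKER:
        pass

    print(tokenizer.get_lines([1, 2]))  # -> ['x = 1\n', 'y = 2\n']

With a non-empty path, the same get_lines() call would instead open the file and stop reading after the last requested line number has been seen, which keeps error reporting cheap for large inputs.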