From 36860134a9eda8df5af5a38d6c7533437c594c2f Mon Sep 17 00:00:00 2001 From: Tian Gao Date: Sun, 23 Apr 2023 22:03:49 -0700 Subject: gh-103285: Rewrite _splitlines_no_ff to improve performance (#103307) --- Lib/ast.py | 26 +++++++--------------- Lib/test/test_ast.py | 11 +++++++++ .../2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst | 1 + 3 files changed, 20 insertions(+), 18 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst diff --git a/Lib/ast.py b/Lib/ast.py index 2cbc80a..d9733a7 100644 --- a/Lib/ast.py +++ b/Lib/ast.py @@ -25,6 +25,7 @@ :license: Python License. """ import sys +import re from _ast import * from contextlib import contextmanager, nullcontext from enum import IntEnum, auto, _simple_enum @@ -305,28 +306,17 @@ def get_docstring(node, clean=True): return text -def _splitlines_no_ff(source): +_line_pattern = re.compile(r"(.*?(?:\r\n|\n|\r|$))") +def _splitlines_no_ff(source, maxlines=None): """Split a string into lines ignoring form feed and other chars. This mimics how the Python parser splits source code. """ - idx = 0 lines = [] - next_line = '' - while idx < len(source): - c = source[idx] - next_line += c - idx += 1 - # Keep \r\n together - if c == '\r' and idx < len(source) and source[idx] == '\n': - next_line += '\n' - idx += 1 - if c in '\r\n': - lines.append(next_line) - next_line = '' - - if next_line: - lines.append(next_line) + for lineno, match in enumerate(_line_pattern.finditer(source), 1): + if maxlines is not None and lineno > maxlines: + break + lines.append(match[0]) return lines @@ -360,7 +350,7 @@ def get_source_segment(source, node, *, padded=False): except AttributeError: return None - lines = _splitlines_no_ff(source) + lines = _splitlines_no_ff(source, maxlines=end_lineno+1) if end_lineno == lineno: return lines[lineno].encode()[col_offset:end_col_offset].decode() diff --git a/Lib/test/test_ast.py b/Lib/test/test_ast.py index a579bfd..8eef7ba 100644 --- a/Lib/test/test_ast.py +++ b/Lib/test/test_ast.py @@ -2293,6 +2293,17 @@ class EndPositionTests(unittest.TestCase): cdef = ast.parse(s).body[0] self.assertEqual(ast.get_source_segment(s, cdef.body[0], padded=True), s_method) + def test_source_segment_newlines(self): + s = 'def f():\n pass\ndef g():\r pass\r\ndef h():\r\n pass\r\n' + f, g, h = ast.parse(s).body + self._check_content(s, f, 'def f():\n pass') + self._check_content(s, g, 'def g():\r pass') + self._check_content(s, h, 'def h():\r\n pass') + + s = 'def f():\n a = 1\r b = 2\r\n c = 3\n' + f = ast.parse(s).body[0] + self._check_content(s, f, s.rstrip()) + def test_source_segment_missing_info(self): s = 'v = 1\r\nw = 1\nx = 1\n\ry = 1\r\n' v, w, x, y = ast.parse(s).body diff --git a/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst b/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst new file mode 100644 index 0000000..62b4364 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst @@ -0,0 +1 @@ +Improve performance of :func:`ast.get_source_segment`. -- cgit v0.12