summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Tian Gao <gaogaotiantian@hotmail.com> 2023-04-24 05:03:49 (GMT)
committer: GitHub <noreply@github.com> 2023-04-24 05:03:49 (GMT)
commit: 36860134a9eda8df5af5a38d6c7533437c594c2f (patch)
tree: 6b024114fd9f35c525f59add5502bdef45f90051
parent: f0ed293f6aec1c2ed22725301b77d6ccedc2d486 (diff)
download: cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.zip
cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.tar.gz
cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.tar.bz2
gh-103285: Rewrite _splitlines_no_ff to improve performance (#103307)
-rw-r--r-- Lib/ast.py | 26
-rw-r--r-- Lib/test/test_ast.py | 11
-rw-r--r-- Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst | 1
3 files changed, 20 insertions, 18 deletions
diff --git a/Lib/ast.py b/Lib/ast.py
index 2cbc80a..d9733a7 100644
--- a/Lib/ast.py
+++ b/Lib/ast.py
@@ -25,6 +25,7 @@
:license: Python License.
"""
import sys
+import re
from _ast import *
from contextlib import contextmanager, nullcontext
from enum import IntEnum, auto, _simple_enum
@@ -305,28 +306,17 @@ def get_docstring(node, clean=True):
return text
-def _splitlines_no_ff(source):
+_line_pattern = re.compile(r"(.*?(?:\r\n|\n|\r|$))")
+def _splitlines_no_ff(source, maxlines=None):
"""Split a string into lines ignoring form feed and other chars.
This mimics how the Python parser splits source code.
"""
- idx = 0
lines = []
- next_line = ''
- while idx < len(source):
- c = source[idx]
- next_line += c
- idx += 1
- # Keep \r\n together
- if c == '\r' and idx < len(source) and source[idx] == '\n':
- next_line += '\n'
- idx += 1
- if c in '\r\n':
- lines.append(next_line)
- next_line = ''
-
- if next_line:
- lines.append(next_line)
+ for lineno, match in enumerate(_line_pattern.finditer(source), 1):
+ if maxlines is not None and lineno > maxlines:
+ break
+ lines.append(match[0])
return lines
@@ -360,7 +350,7 @@ def get_source_segment(source, node, *, padded=False):
except AttributeError:
return None
- lines = _splitlines_no_ff(source)
+ lines = _splitlines_no_ff(source, maxlines=end_lineno+1)
if end_lineno == lineno:
return lines[lineno].encode()[col_offset:end_col_offset].decode()
diff --git a/Lib/test/test_ast.py b/Lib/test/test_ast.py
index a579bfd..8eef7ba 100644
--- a/Lib/test/test_ast.py
+++ b/Lib/test/test_ast.py
@@ -2293,6 +2293,17 @@ class EndPositionTests(unittest.TestCase):
cdef = ast.parse(s).body[0]
self.assertEqual(ast.get_source_segment(s, cdef.body[0], padded=True), s_method)
+ def test_source_segment_newlines(self):
+ s = 'def f():\n pass\ndef g():\r pass\r\ndef h():\r\n pass\r\n'
+ f, g, h = ast.parse(s).body
+ self._check_content(s, f, 'def f():\n pass')
+ self._check_content(s, g, 'def g():\r pass')
+ self._check_content(s, h, 'def h():\r\n pass')
+
+ s = 'def f():\n a = 1\r b = 2\r\n c = 3\n'
+ f = ast.parse(s).body[0]
+ self._check_content(s, f, s.rstrip())
+
def test_source_segment_missing_info(self):
s = 'v = 1\r\nw = 1\nx = 1\n\ry = 1\r\n'
v, w, x, y = ast.parse(s).body
diff --git a/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst b/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst
new file mode 100644
index 0000000..62b4364
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst
@@ -0,0 +1 @@
+Improve performance of :func:`ast.get_source_segment`.