gh-103285: Rewrite _splitlines_no_ff to improve performance (#103307)

author: Tian Gao <gaogaotiantian@hotmail.com> 2023-04-24 05:03:49 (GMT)
committer: GitHub <noreply@github.com> 2023-04-24 05:03:49 (GMT)
commit: 36860134a9eda8df5af5a38d6c7533437c594c2f (patch)
tree: 6b024114fd9f35c525f59add5502bdef45f90051 /Lib/ast.py
parent: f0ed293f6aec1c2ed22725301b77d6ccedc2d486 (diff)
download: cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.zip cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.tar.gz cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.tar.bz2
1 file changed, 8 insertions, 18 deletions
diff --git a/Lib/ast.py b/Lib/ast.py
index 2cbc80a..d9733a7 100644
--- a/Lib/ast.py
+++ b/Lib/ast.py
@@ -25,6 +25,7 @@
     :license: Python License.
 """
 import sys
+import re
 from _ast import *
 from contextlib import contextmanager, nullcontext
 from enum import IntEnum, auto, _simple_enum
@@ -305,28 +306,17 @@ def get_docstring(node, clean=True):
     return text
 
 
-def _splitlines_no_ff(source):
+_line_pattern = re.compile(r"(.*?(?:\r\n|\n|\r|$))")
+def _splitlines_no_ff(source, maxlines=None):
     """Split a string into lines ignoring form feed and other chars.
 
     This mimics how the Python parser splits source code.
     """
-    idx = 0
     lines = []
-    next_line = ''
-    while idx < len(source):
-        c = source[idx]
-        next_line += c
-        idx += 1
-        # Keep \r\n together
-        if c == '\r' and idx < len(source) and source[idx] == '\n':
-            next_line += '\n'
-            idx += 1
-        if c in '\r\n':
-            lines.append(next_line)
-            next_line = ''
-
-    if next_line:
-        lines.append(next_line)
+    for lineno, match in enumerate(_line_pattern.finditer(source), 1):
+        if maxlines is not None and lineno > maxlines:
+            break
+        lines.append(match[0])
     return lines
 
 
@@ -360,7 +350,7 @@ def get_source_segment(source, node, *, padded=False):
     except AttributeError:
         return None
 
-    lines = _splitlines_no_ff(source)
+    lines = _splitlines_no_ff(source, maxlines=end_lineno+1)
     if end_lineno == lineno:
         return lines[lineno].encode()[col_offset:end_col_offset].decode()
author	Tian Gao <gaogaotiantian@hotmail.com>	2023-04-24 05:03:49 (GMT)
committer	GitHub <noreply@github.com>	2023-04-24 05:03:49 (GMT)
commit	36860134a9eda8df5af5a38d6c7533437c594c2f (patch)
tree	6b024114fd9f35c525f59add5502bdef45f90051 /Lib/ast.py
parent	f0ed293f6aec1c2ed22725301b77d6ccedc2d486 (diff)
download	cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.zip cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.tar.gz cpython-36860134a9eda8df5af5a38d6c7533437c594c2f.tar.bz2