GH-101362: Omit path anchor from `pathlib.PurePath()._parts` (GH-102476)

Improve performance of path construction by skipping the addition of the path anchor (`drive + root`) to the internal `_parts` list. Rename this attribute to `_tail` for clarity.
author: Barney Gale <barney.gale@gmail.com> 2023-04-09 17:40:03 (GMT)
committer: GitHub <noreply@github.com> 2023-04-09 17:40:03 (GMT)
commit: 2c673d5e93cfe2779f27c4e742d7e50f7a94f356 (patch)
tree: 7b5aa0de6865ad59854a78765f31edb3bda6875f
parent: 0a675f4bb57d01a5e69f8f58ae934ad7ca501a8d (diff)
download: cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.zip
cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.tar.gz
cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.tar.bz2
3 files changed, 108 insertions, 67 deletions
diff --git a/Lib/pathlib.py b/Lib/pathlib.py
index 490f89f..4ae1fae 100644
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -210,20 +210,17 @@ class _RecursiveWildcardSelector(_Selector):
 class _PathParents(Sequence):
     """This object provides sequence-like access to the logical ancestors
     of a path.  Don't try to construct it yourself."""
-    __slots__ = ('_pathcls', '_drv', '_root', '_parts')
+    __slots__ = ('_pathcls', '_drv', '_root', '_tail')
 
     def __init__(self, path):
         # We don't store the instance to avoid reference cycles
         self._pathcls = type(path)
         self._drv = path.drive
         self._root = path.root
-        self._parts = path._parts
+        self._tail = path._tail
 
     def __len__(self):
-        if self._drv or self._root:
-            return len(self._parts) - 1
-        else:
-            return len(self._parts)
+        return len(self._tail)
 
     def __getitem__(self, idx):
         if isinstance(idx, slice):
@@ -234,7 +231,7 @@ class _PathParents(Sequence):
         if idx < 0:
             idx += len(self)
         return self._pathcls._from_parsed_parts(self._drv, self._root,
-                                                self._parts[:-idx - 1])
+                                                self._tail[:-idx - 1])
 
     def __repr__(self):
         return "<{}.parents>".format(self._pathcls.__name__)
@@ -249,9 +246,41 @@ class PurePath(object):
     PureWindowsPath object.  You can also instantiate either of these classes
     directly, regardless of your system.
     """
+
     __slots__ = (
-        '_raw_path', '_drv', '_root', '_parts_cached',
-        '_str', '_hash', '_parts_tuple', '_parts_normcase_cached',
+        # The `_raw_path` slot stores an unnormalized string path. This is set
+        # in the `__init__()` method.
+        '_raw_path',
+
+        # The `_drv`, `_root` and `_tail_cached` slots store parsed and
+        # normalized parts of the path. They are set when any of the `drive`,
+        # `root` or `_tail` properties are accessed for the first time. The
+        # three-part division corresponds to the result of
+        # `os.path.splitroot()`, except that the tail is further split on path
+        # separators (i.e. it is a list of strings), and that the root and
+        # tail are normalized.
+        '_drv', '_root', '_tail_cached',
+
+        # The `_str` slot stores the string representation of the path,
+        # computed from the drive, root and tail when `__str__()` is called
+        # for the first time. It's used to implement `_str_normcase`.
+        '_str',
+
+        # The `_str_normcase_cached` slot stores the string path with
+        # normalized case. It is set when the `_str_normcase` property is
+        # accessed for the first time. It's used to implement `__eq__()`,
+        # `__hash__()`, and `_parts_normcase`.
+        '_str_normcase_cached',
+
+        # The `_parts_normcase_cached` slot stores the case-normalized
+        # string path after splitting on path separators. It's set when the
+        # `_parts_normcase` property is accessed for the first time. It's used
+        # to implement comparison methods like `__lt__()`.
+        '_parts_normcase_cached',
+
+        # The `_hash` slot stores the hash of the case-normalized string
+        # path. It's set when `__hash__()` is called for the first time.
+        '_hash',
     )
     _flavour = os.path
 
@@ -277,10 +306,7 @@ class PurePath(object):
             path = os.fspath(args[0])
         else:
             path = self._flavour.join(*args)
-        if isinstance(path, str):
-            # Force-cast str subclasses to str (issue #21127)
-            path = str(path)
-        else:
+        if not isinstance(path, str):
             raise TypeError(
                 "argument should be a str or an os.PathLike "
                 "object where __fspath__ returns a str, "
@@ -299,33 +325,32 @@ class PurePath(object):
         if drv.startswith(sep):
             # pathlib assumes that UNC paths always have a root.
             root = sep
-        unfiltered_parsed = [drv + root] + rel.split(sep)
-        parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.']
+        parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
         return drv, root, parsed
 
     def _load_parts(self):
-        drv, root, parts = self._parse_path(self._raw_path)
+        drv, root, tail = self._parse_path(self._raw_path)
         self._drv = drv
         self._root = root
-        self._parts_cached = parts
+        self._tail_cached = tail
 
     @classmethod
-    def _from_parsed_parts(cls, drv, root, parts):
-        path = cls._format_parsed_parts(drv, root, parts)
+    def _from_parsed_parts(cls, drv, root, tail):
+        path = cls._format_parsed_parts(drv, root, tail)
         self = cls(path)
         self._str = path or '.'
         self._drv = drv
         self._root = root
-        self._parts_cached = parts
+        self._tail_cached = tail
         return self
 
     @classmethod
-    def _format_parsed_parts(cls, drv, root, parts):
+    def _format_parsed_parts(cls, drv, root, tail):
         if drv or root:
-            return drv + root + cls._flavour.sep.join(parts[1:])
-        elif parts and cls._flavour.splitdrive(parts[0])[0]:
-            parts = ['.'] + parts
-        return cls._flavour.sep.join(parts)
+            return drv + root + cls._flavour.sep.join(tail)
+        elif tail and cls._flavour.splitdrive(tail[0])[0]:
+            tail = ['.'] + tail
+        return cls._flavour.sep.join(tail)
 
     def __str__(self):
         """Return the string representation of the path, suitable for
@@ -334,7 +359,7 @@ class PurePath(object):
             return self._str
         except AttributeError:
             self._str = self._format_parsed_parts(self.drive, self.root,
-                                                  self._parts) or '.'
+                                                  self._tail) or '.'
             return self._str
 
     def __fspath__(self):
@@ -375,24 +400,33 @@ class PurePath(object):
         return prefix + urlquote_from_bytes(os.fsencode(path))
 
     @property
+    def _str_normcase(self):
+        # String with normalized case, for hashing and equality checks
+        try:
+            return self._str_normcase_cached
+        except AttributeError:
+            self._str_normcase_cached = self._flavour.normcase(str(self))
+            return self._str_normcase_cached
+
+    @property
     def _parts_normcase(self):
-        # Cached parts with normalized case, for hashing and comparison.
+        # Cached parts with normalized case, for comparisons.
         try:
             return self._parts_normcase_cached
         except AttributeError:
-            self._parts_normcase_cached = [self._flavour.normcase(p) for p in self._parts]
+            self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
             return self._parts_normcase_cached
 
     def __eq__(self, other):
         if not isinstance(other, PurePath):
             return NotImplemented
-        return self._parts_normcase == other._parts_normcase and self._flavour is other._flavour
+        return self._str_normcase == other._str_normcase and self._flavour is other._flavour
 
     def __hash__(self):
         try:
             return self._hash
         except AttributeError:
-            self._hash = hash(tuple(self._parts_normcase))
+            self._hash = hash(self._str_normcase)
             return self._hash
 
     def __lt__(self, other):
@@ -434,12 +468,12 @@ class PurePath(object):
             return self._root
 
     @property
-    def _parts(self):
+    def _tail(self):
         try:
-            return self._parts_cached
+            return self._tail_cached
         except AttributeError:
             self._load_parts()
-            return self._parts_cached
+            return self._tail_cached
 
     @property
     def anchor(self):
@@ -450,10 +484,10 @@ class PurePath(object):
     @property
     def name(self):
         """The final path component, if any."""
-        parts = self._parts
-        if len(parts) == (1 if (self.drive or self.root) else 0):
+        tail = self._tail
+        if not tail:
             return ''
-        return parts[-1]
+        return tail[-1]
 
     @property
     def suffix(self):
@@ -501,7 +535,7 @@ class PurePath(object):
         if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail):
             raise ValueError("Invalid name %r" % (name))
         return self._from_parsed_parts(self.drive, self.root,
-                                       self._parts[:-1] + [name])
+                                       self._tail[:-1] + [name])
 
     def with_stem(self, stem):
         """Return a new path with the stem changed."""
@@ -526,7 +560,7 @@ class PurePath(object):
         else:
             name = name[:-len(old_suffix)] + suffix
         return self._from_parsed_parts(self.drive, self.root,
-                                       self._parts[:-1] + [name])
+                                       self._tail[:-1] + [name])
 
     def relative_to(self, other, /, *_deprecated, walk_up=False):
         """Return the relative path to another path identified by the passed
@@ -551,7 +585,7 @@ class PurePath(object):
             raise ValueError(f"{str(self)!r} and {str(other)!r} have different anchors")
         if step and not walk_up:
             raise ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}")
-        parts = ('..',) * step + self.parts[len(path.parts):]
+        parts = ['..'] * step + self._tail[len(path._tail):]
         return path_cls(*parts)
 
     def is_relative_to(self, other, /, *_deprecated):
@@ -570,13 +604,10 @@ class PurePath(object):
     def parts(self):
         """An object providing sequence-like access to the
         components in the filesystem path."""
-        # We cache the tuple to avoid building a new one each time .parts
-        # is accessed.  XXX is this necessary?
-        try:
-            return self._parts_tuple
-        except AttributeError:
-            self._parts_tuple = tuple(self._parts)
-            return self._parts_tuple
+        if self.drive or self.root:
+            return (self.drive + self.root,) + tuple(self._tail)
+        else:
+            return tuple(self._tail)
 
     def joinpath(self, *args):
         """Combine this path with one or several arguments, and return a
@@ -603,10 +634,10 @@ class PurePath(object):
         """The logical parent of the path."""
         drv = self.drive
         root = self.root
-        parts = self._parts
-        if len(parts) == 1 and (drv or root):
+        tail = self._tail
+        if not tail:
             return self
-        return self._from_parsed_parts(drv, root, parts[:-1])
+        return self._from_parsed_parts(drv, root, tail[:-1])
 
     @property
     def parents(self):
@@ -624,29 +655,29 @@ class PurePath(object):
     def is_reserved(self):
         """Return True if the path contains one of the special names reserved
         by the system, if any."""
-        if self._flavour is posixpath or not self._parts:
+        if self._flavour is posixpath or not self._tail:
             return False
 
         # NOTE: the rules for reserved names seem somewhat complicated
         # (e.g. r"..\NUL" is reserved but not r"foo\NUL" if "foo" does not
         # exist). We err on the side of caution and return True for paths
         # which are not considered reserved by Windows.
-        if self._parts[0].startswith('\\\\'):
+        if self.drive.startswith('\\\\'):
             # UNC paths are never reserved.
             return False
-        name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ')
+        name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ')
         return name.upper() in _WIN_RESERVED_NAMES
 
     def match(self, path_pattern):
         """
         Return True if this path matches the given pattern.
         """
-        path_pattern = self._flavour.normcase(path_pattern)
-        drv, root, pat_parts = self._parse_path(path_pattern)
-        if not pat_parts:
+        pat = type(self)(path_pattern)
+        if not pat.parts:
             raise ValueError("empty pattern")
+        pat_parts = pat._parts_normcase
         parts = self._parts_normcase
-        if drv or root:
+        if pat.drive or pat.root:
             if len(pat_parts) != len(parts):
                 return False
         elif len(pat_parts) > len(parts):
@@ -707,11 +738,21 @@ class Path(PurePath):
             cls = WindowsPath if os.name == 'nt' else PosixPath
         return object.__new__(cls)
 
-    def _make_child_relpath(self, part):
-        # This is an optimization used for dir walking.  `part` must be
-        # a single part relative to this path.
-        parts = self._parts + [part]
-        return self._from_parsed_parts(self.drive, self.root, parts)
+    def _make_child_relpath(self, name):
+        path_str = str(self)
+        tail = self._tail
+        if tail:
+            path_str = f'{path_str}{self._flavour.sep}{name}'
+        elif path_str != '.':
+            path_str = f'{path_str}{name}'
+        else:
+            path_str = name
+        path = type(self)(path_str)
+        path._str = path_str
+        path._drv = self.drive
+        path._root = self.root
+        path._tail_cached = tail + [name]
+        return path
 
     def __enter__(self):
         # In previous versions of pathlib, __exit__() marked this path as
@@ -1196,12 +1237,12 @@ class Path(PurePath):
         (as returned by os.path.expanduser)
         """
         if (not (self.drive or self.root) and
-            self._parts and self._parts[0][:1] == '~'):
-            homedir = self._flavour.expanduser(self._parts[0])
+            self._tail and self._tail[0][:1] == '~'):
+            homedir = self._flavour.expanduser(self._tail[0])
             if homedir[:1] == "~":
                 raise RuntimeError("Could not determine home directory.")
-            drv, root, parts = self._parse_path(homedir)
-            return self._from_parsed_parts(drv, root, parts + self._parts[1:])
+            drv, root, tail = self._parse_path(homedir)
+            return self._from_parsed_parts(drv, root, tail + self._tail[1:])
 
         return self
 
diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py
index fe75f1c..3c6da94 100644
--- a/Lib/test/test_pathlib.py
+++ b/Lib/test/test_pathlib.py
@@ -346,8 +346,6 @@ class _BasePurePathTest(object):
         p = P('a/b')
         parts = p.parts
         self.assertEqual(parts, ('a', 'b'))
-        # The object gets reused.
-        self.assertIs(parts, p.parts)
         # When the path is absolute, the anchor is a separate part.
         p = P('/a/b')
         parts = p.parts
diff --git a/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst
new file mode 100644
index 0000000..87617a5
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst
@@ -0,0 +1,2 @@
+Speed up :class:`pathlib.Path` construction by omitting the path anchor from
+the internal list of path parts.
author	Barney Gale <barney.gale@gmail.com>	2023-04-09 17:40:03 (GMT)
committer	GitHub <noreply@github.com>	2023-04-09 17:40:03 (GMT)
commit	2c673d5e93cfe2779f27c4e742d7e50f7a94f356 (patch)
tree	7b5aa0de6865ad59854a78765f31edb3bda6875f
parent	0a675f4bb57d01a5e69f8f58ae934ad7ca501a8d (diff)
download	cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.zip cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.tar.gz cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.tar.bz2