summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Barney Gale <barney.gale@gmail.com> 2023-04-09 17:40:03 (GMT)
committer GitHub <noreply@github.com> 2023-04-09 17:40:03 (GMT)
commit 2c673d5e93cfe2779f27c4e742d7e50f7a94f356 (patch)
tree 7b5aa0de6865ad59854a78765f31edb3bda6875f
parent 0a675f4bb57d01a5e69f8f58ae934ad7ca501a8d (diff)
download cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.zip
cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.tar.gz
cpython-2c673d5e93cfe2779f27c4e742d7e50f7a94f356.tar.bz2
GH-101362: Omit path anchor from `pathlib.PurePath()._parts` (GH-102476)
Improve performance of path construction by skipping the addition of the path anchor (`drive + root`) to the internal `_parts` list. Rename this attribute to `_tail` for clarity.
-rw-r--r-- Lib/pathlib.py 171
-rw-r--r-- Lib/test/test_pathlib.py 2
-rw-r--r-- Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst 2
3 files changed, 108 insertions, 67 deletions
diff --git a/Lib/pathlib.py b/Lib/pathlib.py
index 490f89f..4ae1fae 100644
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -210,20 +210,17 @@ class _RecursiveWildcardSelector(_Selector):
class _PathParents(Sequence):
"""This object provides sequence-like access to the logical ancestors
of a path. Don't try to construct it yourself."""
- __slots__ = ('_pathcls', '_drv', '_root', '_parts')
+ __slots__ = ('_pathcls', '_drv', '_root', '_tail')
def __init__(self, path):
# We don't store the instance to avoid reference cycles
self._pathcls = type(path)
self._drv = path.drive
self._root = path.root
- self._parts = path._parts
+ self._tail = path._tail
def __len__(self):
- if self._drv or self._root:
- return len(self._parts) - 1
- else:
- return len(self._parts)
+ return len(self._tail)
def __getitem__(self, idx):
if isinstance(idx, slice):
@@ -234,7 +231,7 @@ class _PathParents(Sequence):
if idx < 0:
idx += len(self)
return self._pathcls._from_parsed_parts(self._drv, self._root,
- self._parts[:-idx - 1])
+ self._tail[:-idx - 1])
def __repr__(self):
return "<{}.parents>".format(self._pathcls.__name__)
@@ -249,9 +246,41 @@ class PurePath(object):
PureWindowsPath object. You can also instantiate either of these classes
directly, regardless of your system.
"""
+
__slots__ = (
- '_raw_path', '_drv', '_root', '_parts_cached',
- '_str', '_hash', '_parts_tuple', '_parts_normcase_cached',
+ # The `_raw_path` slot stores an unnormalized string path. This is set
+ # in the `__init__()` method.
+ '_raw_path',
+
+ # The `_drv`, `_root` and `_tail_cached` slots store parsed and
+ # normalized parts of the path. They are set when any of the `drive`,
+ # `root` or `_tail` properties are accessed for the first time. The
+ # three-part division corresponds to the result of
+ # `os.path.splitroot()`, except that the tail is further split on path
+ # separators (i.e. it is a list of strings), and that the root and
+ # tail are normalized.
+ '_drv', '_root', '_tail_cached',
+
+ # The `_str` slot stores the string representation of the path,
+ # computed from the drive, root and tail when `__str__()` is called
+ # for the first time. It's used to implement `_str_normcase`.
+ '_str',
+
+ # The `_str_normcase_cached` slot stores the string path with
+ # normalized case. It is set when the `_str_normcase` property is
+ # accessed for the first time. It's used to implement `__eq__()`,
+ # `__hash__()`, and `_parts_normcase`.
+ '_str_normcase_cached',
+
+ # The `_parts_normcase_cached` slot stores the case-normalized
+ # string path after splitting on path separators. It's set when the
+ # `_parts_normcase` property is accessed for the first time. It's used
+ # to implement comparison methods like `__lt__()`.
+ '_parts_normcase_cached',
+
+ # The `_hash` slot stores the hash of the case-normalized string
+ # path. It's set when `__hash__()` is called for the first time.
+ '_hash',
)
_flavour = os.path
@@ -277,10 +306,7 @@ class PurePath(object):
path = os.fspath(args[0])
else:
path = self._flavour.join(*args)
- if isinstance(path, str):
- # Force-cast str subclasses to str (issue #21127)
- path = str(path)
- else:
+ if not isinstance(path, str):
raise TypeError(
"argument should be a str or an os.PathLike "
"object where __fspath__ returns a str, "
@@ -299,33 +325,32 @@ class PurePath(object):
if drv.startswith(sep):
# pathlib assumes that UNC paths always have a root.
root = sep
- unfiltered_parsed = [drv + root] + rel.split(sep)
- parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.']
+ parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
return drv, root, parsed
def _load_parts(self):
- drv, root, parts = self._parse_path(self._raw_path)
+ drv, root, tail = self._parse_path(self._raw_path)
self._drv = drv
self._root = root
- self._parts_cached = parts
+ self._tail_cached = tail
@classmethod
- def _from_parsed_parts(cls, drv, root, parts):
- path = cls._format_parsed_parts(drv, root, parts)
+ def _from_parsed_parts(cls, drv, root, tail):
+ path = cls._format_parsed_parts(drv, root, tail)
self = cls(path)
self._str = path or '.'
self._drv = drv
self._root = root
- self._parts_cached = parts
+ self._tail_cached = tail
return self
@classmethod
- def _format_parsed_parts(cls, drv, root, parts):
+ def _format_parsed_parts(cls, drv, root, tail):
if drv or root:
- return drv + root + cls._flavour.sep.join(parts[1:])
- elif parts and cls._flavour.splitdrive(parts[0])[0]:
- parts = ['.'] + parts
- return cls._flavour.sep.join(parts)
+ return drv + root + cls._flavour.sep.join(tail)
+ elif tail and cls._flavour.splitdrive(tail[0])[0]:
+ tail = ['.'] + tail
+ return cls._flavour.sep.join(tail)
def __str__(self):
"""Return the string representation of the path, suitable for
@@ -334,7 +359,7 @@ class PurePath(object):
return self._str
except AttributeError:
self._str = self._format_parsed_parts(self.drive, self.root,
- self._parts) or '.'
+ self._tail) or '.'
return self._str
def __fspath__(self):
@@ -375,24 +400,33 @@ class PurePath(object):
return prefix + urlquote_from_bytes(os.fsencode(path))
@property
+ def _str_normcase(self):
+ # String with normalized case, for hashing and equality checks
+ try:
+ return self._str_normcase_cached
+ except AttributeError:
+ self._str_normcase_cached = self._flavour.normcase(str(self))
+ return self._str_normcase_cached
+
+ @property
def _parts_normcase(self):
- # Cached parts with normalized case, for hashing and comparison.
+ # Cached parts with normalized case, for comparisons.
try:
return self._parts_normcase_cached
except AttributeError:
- self._parts_normcase_cached = [self._flavour.normcase(p) for p in self._parts]
+ self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
return self._parts_normcase_cached
def __eq__(self, other):
if not isinstance(other, PurePath):
return NotImplemented
- return self._parts_normcase == other._parts_normcase and self._flavour is other._flavour
+ return self._str_normcase == other._str_normcase and self._flavour is other._flavour
def __hash__(self):
try:
return self._hash
except AttributeError:
- self._hash = hash(tuple(self._parts_normcase))
+ self._hash = hash(self._str_normcase)
return self._hash
def __lt__(self, other):
@@ -434,12 +468,12 @@ class PurePath(object):
return self._root
@property
- def _parts(self):
+ def _tail(self):
try:
- return self._parts_cached
+ return self._tail_cached
except AttributeError:
self._load_parts()
- return self._parts_cached
+ return self._tail_cached
@property
def anchor(self):
@@ -450,10 +484,10 @@ class PurePath(object):
@property
def name(self):
"""The final path component, if any."""
- parts = self._parts
- if len(parts) == (1 if (self.drive or self.root) else 0):
+ tail = self._tail
+ if not tail:
return ''
- return parts[-1]
+ return tail[-1]
@property
def suffix(self):
@@ -501,7 +535,7 @@ class PurePath(object):
if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail):
raise ValueError("Invalid name %r" % (name))
return self._from_parsed_parts(self.drive, self.root,
- self._parts[:-1] + [name])
+ self._tail[:-1] + [name])
def with_stem(self, stem):
"""Return a new path with the stem changed."""
@@ -526,7 +560,7 @@ class PurePath(object):
else:
name = name[:-len(old_suffix)] + suffix
return self._from_parsed_parts(self.drive, self.root,
- self._parts[:-1] + [name])
+ self._tail[:-1] + [name])
def relative_to(self, other, /, *_deprecated, walk_up=False):
"""Return the relative path to another path identified by the passed
@@ -551,7 +585,7 @@ class PurePath(object):
raise ValueError(f"{str(self)!r} and {str(other)!r} have different anchors")
if step and not walk_up:
raise ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}")
- parts = ('..',) * step + self.parts[len(path.parts):]
+ parts = ['..'] * step + self._tail[len(path._tail):]
return path_cls(*parts)
def is_relative_to(self, other, /, *_deprecated):
@@ -570,13 +604,10 @@ class PurePath(object):
def parts(self):
"""An object providing sequence-like access to the
components in the filesystem path."""
- # We cache the tuple to avoid building a new one each time .parts
- # is accessed. XXX is this necessary?
- try:
- return self._parts_tuple
- except AttributeError:
- self._parts_tuple = tuple(self._parts)
- return self._parts_tuple
+ if self.drive or self.root:
+ return (self.drive + self.root,) + tuple(self._tail)
+ else:
+ return tuple(self._tail)
def joinpath(self, *args):
"""Combine this path with one or several arguments, and return a
@@ -603,10 +634,10 @@ class PurePath(object):
"""The logical parent of the path."""
drv = self.drive
root = self.root
- parts = self._parts
- if len(parts) == 1 and (drv or root):
+ tail = self._tail
+ if not tail:
return self
- return self._from_parsed_parts(drv, root, parts[:-1])
+ return self._from_parsed_parts(drv, root, tail[:-1])
@property
def parents(self):
@@ -624,29 +655,29 @@ class PurePath(object):
def is_reserved(self):
"""Return True if the path contains one of the special names reserved
by the system, if any."""
- if self._flavour is posixpath or not self._parts:
+ if self._flavour is posixpath or not self._tail:
return False
# NOTE: the rules for reserved names seem somewhat complicated
# (e.g. r"..\NUL" is reserved but not r"foo\NUL" if "foo" does not
# exist). We err on the side of caution and return True for paths
# which are not considered reserved by Windows.
- if self._parts[0].startswith('\\\\'):
+ if self.drive.startswith('\\\\'):
# UNC paths are never reserved.
return False
- name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ')
+ name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ')
return name.upper() in _WIN_RESERVED_NAMES
def match(self, path_pattern):
"""
Return True if this path matches the given pattern.
"""
- path_pattern = self._flavour.normcase(path_pattern)
- drv, root, pat_parts = self._parse_path(path_pattern)
- if not pat_parts:
+ pat = type(self)(path_pattern)
+ if not pat.parts:
raise ValueError("empty pattern")
+ pat_parts = pat._parts_normcase
parts = self._parts_normcase
- if drv or root:
+ if pat.drive or pat.root:
if len(pat_parts) != len(parts):
return False
elif len(pat_parts) > len(parts):
@@ -707,11 +738,21 @@ class Path(PurePath):
cls = WindowsPath if os.name == 'nt' else PosixPath
return object.__new__(cls)
- def _make_child_relpath(self, part):
- # This is an optimization used for dir walking. `part` must be
- # a single part relative to this path.
- parts = self._parts + [part]
- return self._from_parsed_parts(self.drive, self.root, parts)
+ def _make_child_relpath(self, name):
+ path_str = str(self)
+ tail = self._tail
+ if tail:
+ path_str = f'{path_str}{self._flavour.sep}{name}'
+ elif path_str != '.':
+ path_str = f'{path_str}{name}'
+ else:
+ path_str = name
+ path = type(self)(path_str)
+ path._str = path_str
+ path._drv = self.drive
+ path._root = self.root
+ path._tail_cached = tail + [name]
+ return path
def __enter__(self):
# In previous versions of pathlib, __exit__() marked this path as
@@ -1196,12 +1237,12 @@ class Path(PurePath):
(as returned by os.path.expanduser)
"""
if (not (self.drive or self.root) and
- self._parts and self._parts[0][:1] == '~'):
- homedir = self._flavour.expanduser(self._parts[0])
+ self._tail and self._tail[0][:1] == '~'):
+ homedir = self._flavour.expanduser(self._tail[0])
if homedir[:1] == "~":
raise RuntimeError("Could not determine home directory.")
- drv, root, parts = self._parse_path(homedir)
- return self._from_parsed_parts(drv, root, parts + self._parts[1:])
+ drv, root, tail = self._parse_path(homedir)
+ return self._from_parsed_parts(drv, root, tail + self._tail[1:])
return self
diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py
index fe75f1c..3c6da94 100644
--- a/Lib/test/test_pathlib.py
+++ b/Lib/test/test_pathlib.py
@@ -346,8 +346,6 @@ class _BasePurePathTest(object):
p = P('a/b')
parts = p.parts
self.assertEqual(parts, ('a', 'b'))
- # The object gets reused.
- self.assertIs(parts, p.parts)
# When the path is absolute, the anchor is a separate part.
p = P('/a/b')
parts = p.parts
diff --git a/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst
new file mode 100644
index 0000000..87617a5
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-03-06-18-49-57.gh-issue-101362.eSSy6L.rst
@@ -0,0 +1,2 @@
+Speed up :class:`pathlib.Path` construction by omitting the path anchor from
+the internal list of path parts.