1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16import logging
17import os
18import stat
19from pathlib import Path, PurePosixPath
20from typing import Union
21
22from .client import StorageClient
23from .shortcuts import resolve_storage_client
24from .types import MSC_PROTOCOL, ObjectMetadata, SourceVersionCheckMode
25from .utils import join_paths
26
27logger = logging.getLogger(__name__)
28
29
[docs]
class StatResult:
    """
    Minimal stand-in for ``os.stat_result`` describing an object held by a storage provider.

    The ``st_*`` attributes are derived from the metadata handed to the constructor; fields
    the metadata cannot supply are filled with fixed placeholder values.
    """

    def __init__(self, metadata: "ObjectMetadata"):
        """
        Populate the ``st_*`` attributes from ``metadata``.

        :param metadata: Object metadata; ``type``, ``content_length`` and ``last_modified`` are read.
        """
        # Directories are reported as rwxr-xr-x, anything else as a rw-r--r-- regular file.
        is_directory = metadata.type == "directory"
        self.st_mode = (stat.S_IFDIR | 0o755) if is_directory else (stat.S_IFREG | 0o644)

        self.st_size = metadata.content_length

        # Only a modification time is available, so it doubles as access and change time.
        epoch_seconds = metadata.last_modified.timestamp()
        epoch_nanos = int(epoch_seconds * 1_000_000_000)
        for field in ("st_mtime", "st_atime", "st_ctime"):
            setattr(self, field, epoch_seconds)
            setattr(self, f"{field}_ns", epoch_nanos)

        # Placeholders for values the metadata does not carry.
        self.st_ino = 0
        self.st_dev = 0
        self.st_nlink = 1

        # os.getuid/os.getgid are missing on some platforms; fall back to 0 there.
        uid_getter = getattr(os, "getuid", None)
        gid_getter = getattr(os, "getgid", None)
        self.st_uid = uid_getter() if uid_getter is not None else 0
        self.st_gid = gid_getter() if gid_getter is not None else 0
69
70
[docs]
class MultiStoragePath:
    """
    A path object similar to pathlib.Path that supports both local and remote file systems.

    MultiStoragePath provides a unified interface for working with paths across different storage systems,
    including local files, S3, GCS, Azure Blob Storage, and more. It uses the "msc://" protocol
    prefix to identify remote storage paths.

    This implementation is based on Python 3.9's pathlib.Path interface, providing compatible behavior
    for local filesystem operations while extending support to remote storage systems.

    Paths whose storage client reports the default profile are handled through pathlib on the local
    filesystem; all other paths are handled through the storage client.

    Examples:
        >>> import multistorageclient as msc
        >>> msc.Path("/local/path/file.txt")
        >>> msc.Path("msc://my-profile/data/file.txt")
        >>> msc.Path(pathlib.Path("relative/path"))
    """

    # Path within the resolved storage client (no "msc://" prefix or profile name);
    # rooted at "/" when the client is the default profile.
    _internal_path: PurePosixPath
    # Storage client returned by resolve_storage_client() for the original path string.
    _storage_client: StorageClient
    # String form of the value passed to the constructor; reused to re-resolve the client on unpickling.
    _path: str
92
def __init__(self, path: Union[str, os.PathLike]):
    """
    Build a path object from a string or path-like value.

    The string form of ``path`` is resolved to a storage client plus a client-relative path.
    For the default profile the relative path is anchored at ``/``.

    :param path: String, Path, or MultiStoragePath.
    """
    self._path = str(path)
    client, remainder = resolve_storage_client(self._path)
    rooted = PurePosixPath(remainder)
    if client.is_default_profile():
        # Default-profile paths are always stored with a leading "/".
        rooted = PurePosixPath("/") / rooted
    self._storage_client = client
    self._internal_path = rooted
105
def __str__(self) -> str:
    """
    Return the plain internal path for the default profile, otherwise the
    protocol-and-profile prefix joined with the internal path.
    """
    client = self._storage_client
    internal_text = str(self._internal_path)
    if not client.is_default_profile():
        return join_paths(f"{MSC_PROTOCOL}{client.profile}", internal_text)
    return internal_text
110
def __repr__(self) -> str:
    """Return ``MultiStoragePath(<repr of the string form>)``."""
    rendered = str(self)
    return f"MultiStoragePath({rendered!r})"
113
def __eq__(self, other) -> bool:
    """
    Two paths are equal when both the storage profile and the internal path match.
    Anything that is not a MultiStoragePath compares unequal.
    """
    if isinstance(other, MultiStoragePath):
        mine = (self._storage_client.profile, self._internal_path)
        theirs = (other._storage_client.profile, other._internal_path)
        return mine == theirs
    return False
121
def __hash__(self) -> int:
    """Hash the (profile, internal path) pair, the same fields ``__eq__`` compares."""
    identity = (self._storage_client.profile, self._internal_path)
    return hash(identity)
125
def __fspath__(self) -> str:
    """Return the string form of the path for ``os.fspath()`` consumers."""
    as_text = str(self)
    return as_text
128
[docs]
def joinpath(self, *pathsegments):
    """
    Append the given segments to this path and return the resulting path.

    Delegates to ``with_segments``.
    """
    combined = self.with_segments(*pathsegments)
    return combined
131
def __truediv__(self, key):
    """
    Support ``path / key`` by joining ``key`` onto this path.

    A ``TypeError`` from the join is turned into ``NotImplemented`` so Python can
    try the reflected operation.
    """
    try:
        joined = self.joinpath(key)
    except TypeError:
        return NotImplemented
    return joined
137
def __rtruediv__(self, key):
    """
    Support ``key / path`` where ``key`` is a string or other path-like object.

    A MultiStoragePath is always treated as absolute, and joining an absolute path
    onto anything yields that absolute path. The result is therefore a fresh copy
    of this path. The previous implementation appended ``key`` and the string form
    of this path to the internal path, which produced a nonsensical location for
    remote paths.

    :param key: Left-hand operand of the division.
    :return: A new path equal to this one, or ``NotImplemented`` if ``key`` is not path-like.
    """
    try:
        # Building a pure path validates ``key`` the way pathlib would; the value
        # itself is discarded because the absolute right operand wins.
        PurePosixPath(key)
    except TypeError:
        return NotImplemented
    return self.with_segments()
143
def __getstate__(self):
    """
    Return the picklable state: the original path string and the internal path.

    The storage client is left out and re-resolved in ``__setstate__``.
    """
    state = dict(_path=self._path, _internal_path=self._internal_path)
    return state
146
def __setstate__(self, state):
    """
    Restore pickled state and re-resolve the storage client from the original path string.

    :param state: Mapping with ``_path`` and ``_internal_path`` entries.
    """
    original_path = state["_path"]
    self._path = original_path
    self._internal_path = state["_internal_path"]
    # Only the client is needed; the relative path comes from the saved state.
    client, _ignored = resolve_storage_client(original_path)
    self._storage_client = client
151
@property
def anchor(self) -> str:
    """
    The drive and root of the internal path concatenated, or '' if it has neither.
    """
    internal = self._internal_path
    return internal.anchor
158
@property
def name(self) -> str:
    """
    The last component of the internal path, or '' if there is none.
    """
    internal = self._internal_path
    return internal.name
165
@property
def suffix(self) -> str:
    """
    The final component's last suffix, if any.

    This includes the leading period. For example: '.gz' for 'archive.tar.gz'.
    """
    return self._internal_path.suffix
172
@property
def suffixes(self) -> list[str]:
    """
    All suffixes of the final component, each with its leading period.

    For example: ['.tar', '.gz']
    """
    internal = self._internal_path
    return internal.suffixes
181
@property
def stem(self) -> str:
    """
    The final component with its last suffix removed.
    """
    internal = self._internal_path
    return internal.stem
188
@property
def parent(self) -> "MultiStoragePath":
    """
    The logical parent of the path, as a new MultiStoragePath in the same profile.
    """
    client = self._storage_client
    parent_text = str(self._internal_path.parent)
    if not client.is_default_profile():
        # Remote paths are rebuilt from the protocol prefix and profile name.
        parent_text = join_paths(f"{MSC_PROTOCOL}{client.profile}", parent_text)
    return MultiStoragePath(parent_text)
198
@property
def parents(self) -> list["MultiStoragePath"]:
    """
    The logical ancestors of this path, nearest first, as a list of MultiStoragePath objects.
    """
    client = self._storage_client
    ancestors = self._internal_path.parents
    if client.is_default_profile():
        return [MultiStoragePath(str(ancestor)) for ancestor in ancestors]
    return [
        MultiStoragePath(join_paths(f"{MSC_PROTOCOL}{client.profile}", str(ancestor)))
        for ancestor in ancestors
    ]
211
@property
def parts(self):
    """
    The components of the internal path as a sequence. The msc:// prefix and the
    profile name are not part of the internal path and so are not included.
    """
    internal = self._internal_path
    return internal.parts
219
[docs]
def as_posix(self) -> str:
    """
    Return a filesystem path string with forward slashes.

    For the default profile this is the internal path itself. For a remote path the
    object is opened in binary read mode through the storage client and the value of
    the file object's ``resolve_filesystem_path()`` is returned (presumably a cached
    or temporary local copy; see the storage client for details).
    """
    client = self._storage_client
    if not client.is_default_profile():
        with client.open(str(self._internal_path), mode="rb") as handle:
            return handle.resolve_filesystem_path()
    return self._internal_path.as_posix()
234
[docs]
def is_absolute(self) -> bool:
    """
    Always True: every MultiStoragePath is considered absolute.
    """
    return True
240
[docs]
def is_relative_to(self, other: "MultiStoragePath") -> bool:
    """
    Return True if this path lies under ``other``, otherwise False.

    Both paths must belong to the same storage profile, matching the rule used by
    ``__eq__``. Previously only the internal paths were compared, so paths from
    unrelated profiles could be reported as relative to each other.

    :param other: Candidate ancestor path. Any non-MultiStoragePath value yields False.
    """
    if not isinstance(other, MultiStoragePath):
        return False
    if self._storage_client.profile != other._storage_client.profile:
        return False
    return self._internal_path.is_relative_to(other._internal_path)
246
[docs]
def is_reserved(self) -> bool:
    """
    Return whether the path is reserved, delegating to the internal pure path.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.is_reserved() is unsupported for remote storage paths")
    return self._internal_path.is_reserved()
251
[docs]
def match(self, pattern) -> bool:
    """
    Return True if the internal path matches the given glob-style pattern.

    Matching is delegated to ``pathlib.Path.match``.
    """
    concrete = Path(self._internal_path)
    return concrete.match(pattern)
257
[docs]
def relative_to(self, other: "MultiStoragePath") -> "MultiStoragePath":
    """
    Unsupported operation.

    :raises NotImplementedError: Always.
    """
    raise NotImplementedError("MultiStoragePath.relative_to() is unsupported")
263
[docs]
def with_name(self, name: str) -> "MultiStoragePath":
    """
    Return a new path in the same profile with the final component replaced by ``name``.
    """
    client = self._storage_client
    renamed = str(self._internal_path.with_name(name))
    if client.is_default_profile():
        return MultiStoragePath(renamed)
    return MultiStoragePath(join_paths(f"{MSC_PROTOCOL}{client.profile}", renamed))
274
[docs]
def with_stem(self, stem: str) -> "MultiStoragePath":
    """
    Return a new path in the same profile with the stem of the final component replaced.
    """
    client = self._storage_client
    restemmed = str(self._internal_path.with_stem(stem))
    if client.is_default_profile():
        return MultiStoragePath(restemmed)
    return MultiStoragePath(join_paths(f"{MSC_PROTOCOL}{client.profile}", restemmed))
285
[docs]
def with_suffix(self, suffix: str) -> "MultiStoragePath":
    """
    Return a new path in the same profile with the file suffix replaced.

    A path without a suffix gains the given one; an empty ``suffix`` removes the existing one.
    """
    client = self._storage_client
    resuffixed = str(self._internal_path.with_suffix(suffix))
    if client.is_default_profile():
        return MultiStoragePath(resuffixed)
    return MultiStoragePath(join_paths(f"{MSC_PROTOCOL}{client.profile}", resuffixed))
299
[docs]
def with_segments(self, *pathsegments) -> "MultiStoragePath":
    """
    Join the given path-like segments onto this path's internal path and return
    the result as a new MultiStoragePath in the same profile.
    """
    client = self._storage_client
    extended = str(self._internal_path.joinpath(*pathsegments))
    if not client.is_default_profile():
        extended = join_paths(f"{MSC_PROTOCOL}{client.profile}", extended)
    return MultiStoragePath(extended)
310
311 # Expanding and resolving paths
312
[docs]
@classmethod
def home(cls):
    """
    Return the user's home directory as a ``pathlib.Path``.
    """
    home_directory = Path.home()
    return home_directory
319
[docs]
def expanduser(self):
    """
    Return a ``pathlib.Path`` with ~ and ~user constructs expanded.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.expanduser() is unsupported for remote storage paths")
    return Path(self._internal_path).expanduser()
329
[docs]
@classmethod
def cwd(cls):
    """
    Return the current working directory as a ``pathlib.Path``.
    """
    working_directory = Path.cwd()
    return working_directory
336
[docs]
def absolute(self):
    """
    Return this same object; paths are always treated as absolute.
    """
    return self
342
[docs]
def resolve(self, strict=False):
    """
    Return the resolved path as a new MultiStoragePath.

    Default-profile paths are resolved on the local filesystem via ``pathlib``.
    Remote paths are rebuilt unchanged from the profile and internal path.

    :param strict: Passed to ``pathlib.Path.resolve`` for default-profile paths.
    """
    client = self._storage_client
    if client.is_default_profile():
        resolved = Path(self._internal_path).resolve(strict=strict)
        return MultiStoragePath(str(resolved))
    return MultiStoragePath(join_paths(f"{MSC_PROTOCOL}{client.profile}", str(self._internal_path)))
350
[docs]
def readlink(self):
    """
    Return the target of this symbolic link as a new MultiStoragePath.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.readlink() is unsupported for remote storage paths")
    link_target = Path(self._internal_path).readlink()
    return MultiStoragePath(str(link_target))
360
361 # Querying file type and status
362
[docs]
def stat(self):
    """
    Return stat information for this path, like ``os.stat()``.

    Default-profile paths return the result of ``pathlib.Path.stat``. Remote paths
    return a ``StatResult`` built from the storage client's ``info()`` metadata.
    """
    client = self._storage_client
    if not client.is_default_profile():
        metadata = client.info(str(self._internal_path))
        return StatResult(metadata)
    return Path(self._internal_path).stat()
373
[docs]
def lstat(self):
    """
    Like ``stat()``, but for default-profile paths a symlink itself is described
    rather than its target.

    Remote paths return a ``StatResult`` built from the storage client's ``info()`` metadata.
    """
    client = self._storage_client
    if not client.is_default_profile():
        metadata = client.info(str(self._internal_path))
        return StatResult(metadata)
    return Path(self._internal_path).lstat()
385
[docs]
def exists(self) -> bool:
    """
    Return True if the path exists.

    For remote paths, existence means the storage client's ``info()`` call does not
    raise ``FileNotFoundError``.
    """
    if self._storage_client.is_default_profile():
        return Path(self._internal_path).exists()
    try:
        self._storage_client.info(str(self._internal_path))
    except FileNotFoundError:
        return False
    return True
398
[docs]
def is_file(self, strict: bool = True) -> bool:
    """
    Return True if the path exists and is a regular file.

    For remote paths the storage client's ``info()`` metadata decides the answer.
    A missing object yields False; any other error is logged as a warning and also
    yields False.

    :param strict: Forwarded to the storage client's ``info()`` for remote paths; unused for local paths.
    """
    if self._storage_client.is_default_profile():
        return Path(self._internal_path).is_file()

    try:
        key = str(self._internal_path)
        # A key ending in "/" is taken to denote a directory, never a file.
        if key.endswith("/"):
            return False
        return self._storage_client.info(key, strict=strict).type == "file"
    except FileNotFoundError:
        return False
    except Exception as e:
        logger.warning("Error occurred while fetching file info at %s, caused by: %s", self._internal_path, e)
        return False
419
[docs]
def is_dir(self, strict: bool = True) -> bool:
    """
    Return True if the path exists and is a directory.

    For remote paths the key is queried with a trailing "/" through the storage
    client's ``info()``. A missing object yields False; any other error is logged
    as a warning and also yields False.

    :param strict: Forwarded to the storage client's ``info()`` for remote paths; unused for local paths.
    """
    if self._storage_client.is_default_profile():
        return Path(self._internal_path).is_dir()

    try:
        key = str(self._internal_path)
        # Directory lookups use a key that ends with "/".
        directory_key = key if key.endswith("/") else key + "/"
        return self._storage_client.info(directory_key, strict=strict).type == "directory"
    except FileNotFoundError:
        return False
    except Exception as e:
        logger.warning("Error occurred while fetching file info at %s, caused by: %s", self._internal_path, e)
        return False
440
[docs]
def is_symlink(self):
    """
    Return True if the path exists and is a symbolic link.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.is_symlink() is unsupported for remote storage paths")
    return Path(self._internal_path).is_symlink()
450
[docs]
def is_mount(self):
    """
    Return True if the path exists and is a mount point.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.is_mount() is unsupported for remote storage paths")
    return Path(self._internal_path).is_mount()
460
[docs]
def is_socket(self):
    """
    Return True if the path exists and is a socket.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.is_socket() is unsupported for remote storage paths")
    return Path(self._internal_path).is_socket()
470
[docs]
def is_fifo(self):
    """
    Return True if the path exists and is a FIFO.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.is_fifo() is unsupported for remote storage paths")
    return Path(self._internal_path).is_fifo()
480
[docs]
def is_block_device(self):
    """
    Return True if the path exists and is a block device.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.is_block_device() is unsupported for remote storage paths")
    return Path(self._internal_path).is_block_device()
490
[docs]
def is_char_device(self):
    """
    Return True if the path exists and is a character device.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.is_char_device() is unsupported for remote storage paths")
    return Path(self._internal_path).is_char_device()
500
[docs]
def samefile(self, other_path):
    """
    Return True if both paths point to the same file or directory.

    Default-profile paths are compared on the filesystem via ``pathlib.Path.samefile``.
    Remote paths have no filesystem identity, so they are compared with ``==``.
    """
    if not self._storage_client.is_default_profile():
        return self == other_path
    return Path(self._internal_path).samefile(other_path)
510
511 # Reading and writing files
512
[docs]
def open(
    self,
    mode="r",
    buffering=-1,
    encoding=None,
    errors=None,
    newline=None,
    check_source_version=SourceVersionCheckMode.INHERIT,
):
    """
    Open the file through the storage client and return the resulting file object.

    ``mode``, ``buffering``, ``encoding`` and ``check_source_version`` are forwarded
    to the storage client. ``errors`` and ``newline`` are accepted but not forwarded.
    """
    forwarded = {
        "mode": mode,
        "buffering": buffering,
        "encoding": encoding,
        "check_source_version": check_source_version,
    }
    return self._storage_client.open(str(self._internal_path), **forwarded)
532
[docs]
def read_bytes(self) -> bytes:
    """
    Read the whole object through the storage client and return it as bytes.
    """
    key = str(self._internal_path)
    return self._storage_client.read(key)
538
[docs]
def read_text(self, encoding: str = "utf-8", errors: str = "strict") -> str:
    """
    Read the whole object through the storage client and decode it to text.

    :param encoding: Text encoding; ``None`` falls back to ``"utf-8"``.
    :param errors: Decoding error handler (for example ``"strict"``, ``"ignore"``,
        ``"replace"``); ``None`` falls back to ``"strict"``. This argument was
        previously accepted but ignored.
    :return: The decoded content.
    :raises UnicodeDecodeError: If the content cannot be decoded under the chosen handler.
    """
    raw = self._storage_client.read(str(self._internal_path))
    # pathlib allows None for both arguments, so mirror that with the documented defaults.
    return raw.decode(encoding or "utf-8", errors or "strict")
544
[docs]
def write_bytes(self, data: bytes) -> None:
    """
    Write ``data`` to this path through the storage client.
    """
    key = str(self._internal_path)
    self._storage_client.write(key, data)
550
[docs]
def write_text(self, data: str, encoding: str = "utf-8", errors: str = "strict") -> None:
    """
    Encode ``data`` and write it to this path through the storage client.

    :param data: Text to write.
    :param encoding: Text encoding; ``None`` falls back to ``"utf-8"``.
    :param errors: Encoding error handler (for example ``"strict"``, ``"ignore"``,
        ``"replace"``); ``None`` falls back to ``"strict"``. This argument was
        previously accepted but ignored.
    :raises UnicodeEncodeError: If ``data`` cannot be encoded under the chosen handler.
    """
    # pathlib allows None for both arguments, so mirror that with the documented defaults.
    payload = data.encode(encoding or "utf-8", errors or "strict")
    self._storage_client.write(str(self._internal_path), payload)
556
557 # Reading directories
558
[docs]
def iterdir(self):
    """
    Yield a MultiStoragePath for each entry of this directory.

    Default-profile paths are listed via ``pathlib``. Remote paths are listed
    through the storage client, using a prefix that ends with "/" and including
    directories.
    """
    client = self._storage_client
    if client.is_default_profile():
        for entry in Path(self._internal_path).iterdir():
            yield MultiStoragePath(str(entry))
        return

    prefix = str(self._internal_path)
    if not prefix.endswith("/"):
        prefix += "/"
    for listed in client.list(prefix, include_directories=True, include_url_prefix=True):
        yield MultiStoragePath(listed.key)
572
[docs]
def glob(self, pattern):
    """
    Return a list of paths under this path that match the given relative pattern.

    Default-profile paths are globbed via ``pathlib``. Remote paths hand the
    internal path joined with ``pattern`` to the storage client's ``glob()``.
    """
    client = self._storage_client
    if client.is_default_profile():
        matches = Path(self._internal_path).glob(pattern)
    else:
        matches = client.glob(str(self._internal_path / pattern), include_url_prefix=True)
    return [MultiStoragePath(str(found)) for found in matches]
585
[docs]
def rglob(self, pattern):
    """
    Return a list of paths anywhere under this path that match the given relative pattern.

    Default-profile paths use ``pathlib.Path.rglob``. Remote paths prefix the
    pattern with ``**/`` and hand it to the storage client's ``glob()``.
    """
    client = self._storage_client
    if client.is_default_profile():
        matches = Path(self._internal_path).rglob(pattern)
    else:
        recursive_pattern = f"**/{pattern}"
        matches = client.glob(str(self._internal_path / recursive_pattern), include_url_prefix=True)
    return [MultiStoragePath(str(found)) for found in matches]
601
[docs]
def walk(self, top_down=True, on_error=None, follow_symlinks=False):
    """
    Walk the directory tree rooted at this path by delegating to ``pathlib.Path.walk``.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.walk() is unsupported for remote storage paths")
    local_root = Path(self._internal_path)
    return local_root.walk(top_down, on_error, follow_symlinks)  # pyright: ignore[reportAttributeAccessIssue]
611
612 # Creating files and directories
613
[docs]
def touch(self, mode=0o666, exist_ok=False):
    """
    Create this file if it does not exist.

    Default-profile paths delegate to ``pathlib.Path.touch`` with ``mode`` and
    ``exist_ok``. For remote paths a missing object is created empty; an existing
    object is left untouched and a warning is logged, and both arguments are unused.
    """
    if self._storage_client.is_default_profile():
        Path(self._internal_path).touch(mode, exist_ok)
        return

    if not self.exists():
        self._storage_client.write(str(self._internal_path), b"")
        return

    # Object storage cannot bump the last-modified time without rewriting the object.
    logger.warning("MultiStoragePath.touch() is not supported for remote storage paths")
626
[docs]
def mkdir(self, mode=0o777, parents=False, exist_ok=False) -> None:
    """
    Create a directory at this path.

    Default-profile paths delegate to ``pathlib.Path.mkdir``. For remote storage
    paths nothing is done.
    """
    if not self._storage_client.is_default_profile():
        return
    Path(self._internal_path).mkdir(mode, parents, exist_ok)
635
[docs]
def symlink_to(self, target, target_is_directory=False):
    """
    Make this path a symbolic link pointing to ``target``.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.symlink_to() is unsupported for remote storage paths")
    Path(self._internal_path).symlink_to(target, target_is_directory)
646
647 # Renaming and deleting
648
[docs]
def rename(self, target) -> "MultiStoragePath":
    """
    Rename this path to ``target`` and return the target as a MultiStoragePath.

    Default-profile paths are renamed on the filesystem. Remote paths are copied to
    the target's internal path and then deleted, both through this path's storage
    client. The remote operation is not atomic and the target must be a single file.

    :param target: Destination path; converted to MultiStoragePath if it is not one already.
    """
    destination = target if isinstance(target, MultiStoragePath) else MultiStoragePath(target)

    if self._storage_client.is_default_profile():
        Path(self._internal_path).rename(str(destination._internal_path))
        return destination

    source_key = str(self._internal_path)
    self._storage_client.copy(source_key, str(destination._internal_path))
    self._storage_client.delete(source_key)
    return destination
664
[docs]
def replace(self, target):
    """
    Rename this path to ``target``, overwriting the target if it exists.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.replace() is unsupported for remote storage paths")
    Path(self._internal_path).replace(target)
675
[docs]
def unlink(self, missing_ok: bool = False) -> None:
    """
    Remove this file. Remote paths are deleted through the storage client.

    :param missing_ok: When True, a missing file is not an error.
    :raises FileNotFoundError: If the file does not exist and ``missing_ok`` is False.
    """
    if self._storage_client.is_default_profile():
        Path(self._internal_path).unlink(missing_ok=missing_ok)
        return

    try:
        self._storage_client.delete(str(self._internal_path))
    except FileNotFoundError:
        if missing_ok:
            return
        raise
688
[docs]
def rmdir(self) -> None:
    """
    Remove this directory, which must be empty.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.rmdir() is unsupported for remote storage paths")
    Path(self._internal_path).rmdir()
699
700 # Permissions and ownership
701
[docs]
def owner(self):
    """
    Return the login name of the file owner.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.owner() is unsupported for remote storage paths")
    return Path(self._internal_path).owner()
711
[docs]
def group(self):
    """
    Return the group name of the file's gid.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.group() is unsupported for remote storage paths")
    return Path(self._internal_path).group()
721
[docs]
def chmod(self, mode):
    """
    Change the permissions of the path, like ``os.chmod()``.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.chmod() is unsupported for remote storage paths")
    Path(self._internal_path).chmod(mode)
732
[docs]
def lchmod(self, mode):
    """
    Like ``chmod()``, but if the path is a symlink the link's own permissions are
    changed rather than its target's.

    :raises NotImplementedError: For remote storage paths.
    """
    if not self._storage_client.is_default_profile():
        raise NotImplementedError("MultiStoragePath.lchmod() is unsupported for remote storage paths")
    Path(self._internal_path).lchmod(mode)