Source code for multistorageclient.types

  1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2# SPDX-License-Identifier: Apache-2.0
  3#
  4# Licensed under the Apache License, Version 2.0 (the "License");
  5# you may not use this file except in compliance with the License.
  6# You may obtain a copy of the License at
  7#
  8# http://www.apache.org/licenses/LICENSE-2.0
  9#
 10# Unless required by applicable law or agreed to in writing, software
 11# distributed under the License is distributed on an "AS IS" BASIS,
 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13# See the License for the specific language governing permissions and
 14# limitations under the License.
 15
 16from abc import ABC, abstractmethod
 17from collections.abc import Iterator
 18from dataclasses import asdict, dataclass, field
 19from datetime import datetime, timezone
 20from enum import Enum
 21from typing import IO, Any, Optional, Union
 22
 23from dateutil.parser import parse as dateutil_parser
 24
# URL scheme used to address objects through multistorageclient, e.g. "msc://profile/path".
MSC_PROTOCOL_NAME = "msc"
MSC_PROTOCOL = MSC_PROTOCOL_NAME + "://"

# Defaults for RetryConfig: number of attempts and delay (in seconds) between attempts.
DEFAULT_RETRY_ATTEMPTS = 3
DEFAULT_RETRY_DELAY = 1.0

# datetime.min is a naive datetime.
#
# This creates issues when doing datetime.astimezone(timezone.utc) since it assumes the local timezone for the naive datetime.
# If the local timezone is offset behind UTC, it attempts to subtract off the offset which goes below the representable limit (i.e. an underflow).
# A `ValueError: year 0 is out of range` is thrown as a result.
AWARE_DATETIME_MIN = datetime.min.replace(tzinfo=timezone.utc)
 37
 38
@dataclass
class Credentials:
    """
    A data class representing the credentials needed to access a storage provider.
    """

    #: The access key for authentication.
    access_key: str
    #: The secret key for authentication.
    secret_key: str
    #: An optional security token for temporary credentials.
    token: Optional[str]
    #: The expiration time of the credentials in ISO 8601 format.
    expiration: Optional[str]
    #: A dictionary for storing custom key-value pairs.
    custom_fields: dict[str, Any] = field(default_factory=dict)

    def is_expired(self) -> bool:
        """
        Checks if the credentials are expired based on the expiration time.

        :return: ``True`` if the credentials are expired, ``False`` otherwise.
        """
        if not self.expiration:
            # No expiration time means the credentials never expire.
            return False
        expiry = dateutil_parser(self.expiration)
        if expiry.tzinfo is None:
            # An ISO 8601 timestamp without a timezone parses as a naive datetime;
            # comparing naive vs. aware datetimes raises TypeError. Assume UTC,
            # consistent with the AWARE_DATETIME_MIN note in this module.
            expiry = expiry.replace(tzinfo=timezone.utc)
        return expiry <= datetime.now(tz=timezone.utc)

    def get_custom_field(self, key: str, default: Any = None) -> Any:
        """
        Retrieves a value from custom fields by its key.

        :param key: The key to look up in custom fields.
        :param default: The default value to return if the key is not found.
        :return: The value associated with the key, or the default value if not found.
        """
        return self.custom_fields.get(key, default)
76 77
@dataclass
class ObjectMetadata:
    """
    A data class that represents the metadata associated with an object stored in a cloud storage service. This metadata
    includes both required and optional information about the object.
    """

    #: Relative path of the object.
    key: str
    #: The size of the object in bytes.
    content_length: int
    #: The timestamp indicating when the object was last modified.
    last_modified: datetime
    #: Object kind; defaults to "file".
    type: str = "file"
    #: The MIME type of the object.
    content_type: Optional[str] = field(default=None)
    #: The entity tag (ETag) of the object.
    etag: Optional[str] = field(default=None)
    #: The storage class of the object.
    storage_class: Optional[str] = field(default=None)
    #: Custom key-value attributes attached to the object, if any.
    metadata: Optional[dict[str, Any]] = field(default=None)

    @staticmethod
    def from_dict(data: dict) -> "ObjectMetadata":
        """
        Creates an ObjectMetadata instance from a dictionary (parsed from JSON).

        :param data: Dictionary with at least ``key``, ``content_length`` and ``last_modified``.
        :raises ValueError: If a required field is missing; the message names the field.
        """
        key = data.get("key")
        if key is None:
            raise ValueError("Missing required field: 'key'")
        try:
            return ObjectMetadata(
                key=key,
                content_length=data["content_length"],
                last_modified=dateutil_parser(data["last_modified"]),
                type=data.get("type", "file"),  # default to file
                content_type=data.get("content_type"),
                etag=data.get("etag"),
                storage_class=data.get("storage_class"),
                metadata=data.get("metadata"),
            )
        except KeyError as e:
            # Name the missing field instead of a generic message to ease debugging.
            raise ValueError(f"Missing required field: '{e.args[0]}'") from e

    def to_dict(self) -> dict:
        """
        Serializes to a JSON-compatible dictionary, omitting ``None`` values.

        ``last_modified`` is rendered as a UTC ISO 8601 string so the output
        round-trips through :py:meth:`from_dict`.
        """
        data = asdict(self)
        data["last_modified"] = self.last_modified.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        return {k: v for k, v in data.items() if v is not None}
128 129
class CredentialsProvider(ABC):
    """
    Abstract base class for providing credentials to access a storage provider.
    """

    @abstractmethod
    def get_credentials(self) -> Credentials:
        """
        Returns the credentials currently held by this provider.

        :return: The current credentials used for authentication.
        """
        ...

    @abstractmethod
    def refresh_credentials(self) -> None:
        """
        Refreshes the held credentials when they are expired or close to expiring.
        """
        ...
150 151
[docs] 152@dataclass 153class Range: 154 """ 155 Byte-range read. 156 """ 157 158 offset: int 159 size: int
160 161
class StorageProvider(ABC):
    """
    Abstract base class for interacting with a storage provider.
    """

    @abstractmethod
    def put_object(
        self,
        path: str,
        body: bytes,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        attributes: Optional[dict[str, str]] = None,
    ) -> None:
        """
        Uploads an object to the storage provider.

        :param path: The path where the object will be stored.
        :param body: The content of the object to store.
        :param if_match: Optional if-match value for a conditional write.
        :param if_none_match: Optional if-none-match value for a conditional write.
        :param attributes: The attributes to add to the file.
        """
        ...

    @abstractmethod
    def get_object(self, path: str, byte_range: Optional[Range] = None) -> bytes:
        """
        Retrieves an object from the storage provider.

        :param path: The path where the object is stored.
        :param byte_range: Optional byte range to read instead of the full object.

        :return: The content of the retrieved object.
        """
        ...

    @abstractmethod
    def copy_object(self, src_path: str, dest_path: str) -> None:
        """
        Copies an object from source to destination within the storage provider.

        :param src_path: The path of the source object to copy.
        :param dest_path: The path of the destination.
        """
        ...

    @abstractmethod
    def delete_object(self, path: str, if_match: Optional[str] = None) -> None:
        """
        Deletes an object from the storage provider.

        :param path: The path of the object to delete.
        :param if_match: Optional if-match value to use for conditional deletion.
        """
        ...

    @abstractmethod
    def get_object_metadata(self, path: str, strict: bool = True) -> ObjectMetadata:
        """
        Retrieves metadata or information about an object stored in the provider.

        :param path: The path of the object.
        :param strict: If True, performs additional validation to determine whether the path refers to a directory.

        :return: A metadata object containing the information about the object.
        """
        ...

    @abstractmethod
    def list_objects(
        self,
        path: str,
        start_after: Optional[str] = None,
        end_at: Optional[str] = None,
        include_directories: bool = False,
        attribute_filter_expression: Optional[str] = None,
        show_attributes: bool = False,
    ) -> Iterator[ObjectMetadata]:
        """
        Lists objects stored under the specified path.

        :param path: The path to list objects under. Must be a valid file or subdirectory path; a partial "prefix" is not accepted.
        :param start_after: The key to start after (i.e. exclusive). An object with this key doesn't have to exist.
        :param end_at: The key to end at (i.e. inclusive). An object with this key doesn't have to exist.
        :param include_directories: Whether to include directories in the result. When True, directories are returned alongside objects.
        :param attribute_filter_expression: The attribute filter expression to apply to the result.
        :param show_attributes: Whether to return attributes in the result. This has a performance cost, since object metadata must then be fetched per object.

        :return: An iterator over object metadata under the specified path.
        """
        ...

    @abstractmethod
    def upload_file(self, remote_path: str, f: Union[str, IO], attributes: Optional[dict[str, str]] = None) -> None:
        """
        Uploads a file from the local file system to the storage provider.

        :param remote_path: The path where the object will be stored.
        :param f: The source file to upload, either as a local file path string or a
            file-like object (e.g., an open file handle).
        :param attributes: The attributes to add to the file if a new file is created.
        """
        ...

    @abstractmethod
    def download_file(self, remote_path: str, f: Union[str, IO], metadata: Optional[ObjectMetadata] = None) -> None:
        """
        Downloads a file from the storage provider to the local file system.

        :param remote_path: The path of the file to download.
        :param f: The destination for the downloaded file, either as a local file path
            string where the file will be saved, or a file-like object to write the
            downloaded content into.
        :param metadata: Metadata about the object to download.
        """
        ...

    @abstractmethod
    def glob(self, pattern: str, attribute_filter_expression: Optional[str] = None) -> list[str]:
        """
        Matches and retrieves a list of object keys in the storage provider that match the specified pattern.

        :param pattern: The pattern to match object keys against, supporting wildcards (e.g., ``*.txt``).
        :param attribute_filter_expression: The attribute filter expression to apply to the result.

        :return: A list of object keys that match the specified pattern.
        """
        ...

    @abstractmethod
    def is_file(self, path: str) -> bool:
        """
        Checks whether the specified key in the storage provider points to a file (as opposed to a folder or directory).

        :param path: The path to check.

        :return: ``True`` if the key points to a file, ``False`` if it points to a directory or folder.
        """
        ...
299 300
class MetadataProvider(ABC):
    """
    Abstract base class for accessing file metadata.
    """

    @abstractmethod
    def list_objects(
        self,
        path: str,
        start_after: Optional[str] = None,
        end_at: Optional[str] = None,
        include_directories: bool = False,
        attribute_filter_expression: Optional[str] = None,
        show_attributes: bool = False,
    ) -> Iterator[ObjectMetadata]:
        """
        Lists objects tracked by the metadata provider under the specified path.

        :param path: The path to list objects under. Must be a valid file or subdirectory path; a partial "prefix" is not accepted.
        :param start_after: The key to start after (i.e. exclusive). An object with this key doesn't have to exist.
        :param end_at: The key to end at (i.e. inclusive). An object with this key doesn't have to exist.
        :param include_directories: Whether to include directories in the result. When True, directories are returned alongside objects.
        :param attribute_filter_expression: The attribute filter expression to apply to the result.
        :param show_attributes: Whether to return attributes in the result. Depending on the implementation, enabling this may carry a performance cost.

        :return: An iterator over object metadata under the specified path.
        """
        ...

    @abstractmethod
    def get_object_metadata(self, path: str, include_pending: bool = False) -> ObjectMetadata:
        """
        Retrieves metadata or information about an object stored in the provider.

        :param path: The path of the object.
        :param include_pending: Whether to include metadata that is not yet committed.

        :return: A metadata object containing the information about the object.
        """
        ...

    @abstractmethod
    def glob(self, pattern: str, attribute_filter_expression: Optional[str] = None) -> list[str]:
        """
        Matches and retrieves a list of object keys in the storage provider that match the specified pattern.

        :param pattern: The pattern to match object keys against, supporting wildcards (e.g., ``*.txt``).
        :param attribute_filter_expression: The attribute filter expression to apply to the result.

        :return: A list of object keys that match the specified pattern.
        """
        ...

    @abstractmethod
    def realpath(self, path: str) -> tuple[str, bool]:
        """
        Returns the canonical, full real physical path for use by a
        :py:class:`StorageProvider`, translating user-visible paths into the
        canonical paths a :py:class:`StorageProvider` needs.

        :param path: User-supplied virtual path.

        :return: A canonical physical path and whether the object at the path is valid.
        """
        ...

    @abstractmethod
    def add_file(self, path: str, metadata: ObjectMetadata) -> None:
        """
        Adds a file to be tracked by the :py:class:`MetadataProvider`. The addition does not
        have to be reflected in listings until :py:meth:`MetadataProvider.commit_updates`
        forces a persist. This function must tolerate duplicate calls (idempotent behavior).

        :param path: User-supplied virtual path.
        :param metadata: Physical file metadata from the StorageProvider.
        """
        ...

    @abstractmethod
    def remove_file(self, path: str) -> None:
        """
        Removes a file tracked by the :py:class:`MetadataProvider`. The removal does not
        have to be reflected in listings until :py:meth:`MetadataProvider.commit_updates`
        forces a persist. This function must tolerate duplicate calls (idempotent behavior).

        :param path: User-supplied virtual path.
        """
        ...

    @abstractmethod
    def commit_updates(self) -> None:
        """
        Commits any newly added files, used in conjunction with :py:meth:`MetadataProvider.add_file`.
        The :py:class:`MetadataProvider` will persistently record any metadata changes.
        """
        ...

    @abstractmethod
    def is_writable(self) -> bool:
        """
        Reports whether this provider accepts metadata writes.

        :return: ``True`` if the :py:class:`MetadataProvider` supports writes, else ``False``.
        """
        ...
404 405
[docs] 406@dataclass 407class StorageProviderConfig: 408 """ 409 A data class that represents the configuration needed to initialize a storage provider. 410 """ 411 412 #: The name or type of the storage provider (e.g., ``s3``, ``gcs``, ``oci``, ``azure``). 413 type: str 414 #: Additional options required to configure the storage provider (e.g., endpoint URLs, region, etc.). 415 options: Optional[dict[str, Any]] = None
416 417
[docs] 418class ProviderBundle(ABC): 419 """ 420 Abstract base class that serves as a container for various providers (storage, credentials, and metadata) 421 that interact with a storage service. The :py:class:`ProviderBundle` abstracts access to these providers, allowing for 422 flexible implementations of cloud storage solutions. 423 """ 424 425 @property 426 @abstractmethod 427 def storage_provider_config(self) -> StorageProviderConfig: 428 """ 429 :return: The configuration for the storage provider, which includes the provider 430 name/type and additional options. 431 """ 432 pass 433 434 @property 435 @abstractmethod 436 def credentials_provider(self) -> Optional[CredentialsProvider]: 437 """ 438 :return: The credentials provider responsible for managing authentication credentials 439 required to access the storage service. 440 """ 441 pass 442 443 @property 444 @abstractmethod 445 def metadata_provider(self) -> Optional[MetadataProvider]: 446 """ 447 :return: The metadata provider responsible for retrieving metadata about objects in the storage service. 448 """ 449 pass 450 451 @property 452 @abstractmethod 453 def replicas(self) -> list["Replica"]: 454 """ 455 :return: The replicas configuration for this provider bundle, if any. 456 """ 457 pass
458 459
[docs] 460@dataclass 461class RetryConfig: 462 """ 463 A data class that represents the configuration for retry strategy. 464 """ 465 466 #: The number of attempts before giving up. Must be at least 1. 467 attempts: int = DEFAULT_RETRY_ATTEMPTS 468 #: The delay (in seconds) between retry attempts. Must be a non-negative value. 469 delay: float = DEFAULT_RETRY_DELAY 470 471 def __post_init__(self) -> None: 472 if self.attempts < 1: 473 raise ValueError("Attempts must be at least 1.") 474 if self.delay < 0: 475 raise ValueError("Delay must be a non-negative number.")
476 477
[docs] 478class RetryableError(Exception): 479 """ 480 Exception raised for errors that should trigger a retry. 481 """ 482 483 pass
484 485
[docs] 486class PreconditionFailedError(Exception): 487 """ 488 Exception raised when a precondition fails. e.g. if-match, if-none-match, etc. 489 """ 490 491 pass
492 493
[docs] 494class NotModifiedError(Exception): 495 """Raised when a conditional operation fails because the resource has not been modified. 496 497 This typically occurs when using if-none-match with a specific generation/etag 498 and the resource's current generation/etag matches the specified one. 499 """ 500 501 pass
502 503
[docs] 504class SourceVersionCheckMode(Enum): 505 """Enum for controlling source version checking behavior.""" 506 507 INHERIT = "inherit" # Inherit from configuration (cache config) 508 ENABLE = "enable" # Always check source version 509 DISABLE = "disable" # Never check source version
510 511
[docs] 512@dataclass 513class Replica: 514 """A tier of storage that can be used to store data.""" 515 516 replica_profile: str 517 read_priority: int
518 519
[docs] 520class AutoCommitConfig: 521 """ 522 A data class that represents the configuration for auto commit. 523 """ 524 525 interval_minutes: Optional[float] # The interval in minutes for auto commit. 526 at_exit: bool = False # if True, commit on program exit 527 528 def __init__(self, interval_minutes: Optional[float] = None, at_exit: bool = False) -> None: 529 self.interval_minutes = interval_minutes 530 self.at_exit = at_exit
531 532
[docs] 533class ExecutionMode(Enum): 534 """ 535 Enum for controlling execution mode in sync operations. 536 """ 537 538 LOCAL = "local" 539 RAY = "ray"