Source code for multistorageclient.shortcuts

  1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2# SPDX-License-Identifier: Apache-2.0
  3#
  4# Licensed under the Apache License, Version 2.0 (the "License");
  5# you may not use this file except in compliance with the License.
  6# You may obtain a copy of the License at
  7#
  8# http://www.apache.org/licenses/LICENSE-2.0
  9#
 10# Unless required by applicable law or agreed to in writing, software
 11# distributed under the License is distributed on an "AS IS" BASIS,
 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13# See the License for the specific language governing permissions and
 14# limitations under the License.
 15
 16import os
 17import threading
 18from collections.abc import Iterator
 19from typing import Any, Optional, Union
 20from urllib.parse import ParseResult, urlparse
 21
 22from .client import StorageClient
 23from .config import DEFAULT_POSIX_PROFILE_NAME, SUPPORTED_IMPLICIT_PROFILE_PROTOCOLS, StorageClientConfig
 24from .file import ObjectFile, PosixFile
 25from .telemetry import Telemetry
 26from .types import MSC_PROTOCOL, ObjectMetadata
 27
 28_TELEMETRY: Optional[Telemetry] = None
 29_TELEMETRY_LOCK = threading.Lock()
 30_STORAGE_CLIENT_CACHE: dict[str, StorageClient] = {}
 31_STORAGE_CLIENT_CACHE_LOCK = threading.Lock()
 32
 33
def get_telemetry() -> Optional[Telemetry]:
    """
    Return the :py:class:`Telemetry` instance used for storage clients created by shortcuts.

    :return: The telemetry instance last passed to :py:func:`set_telemetry`, or ``None`` if none was set.
    """
    # Reading a module-level name does not require a ``global`` declaration.
    return _TELEMETRY
43 44
def set_telemetry(telemetry: Optional[Telemetry]) -> None:
    """
    Register the :py:class:`Telemetry` instance to use for storage clients created by shortcuts.

    The value is only consulted when :py:func:`resolve_storage_client` builds a new client, so it should
    be set before the first shortcut call for a given profile.

    :param telemetry: A telemetry instance, or ``None`` to clear the current one.
    """
    # Only the name being rebound needs ``global``; the lock object is merely read.
    global _TELEMETRY

    with _TELEMETRY_LOCK:
        _TELEMETRY = telemetry
56 57 58def _build_full_path(pr: ParseResult) -> str: 59 """ 60 Helper function to construct the full path from a parsed URL, including query and fragment. 61 62 :param pr: The parsed URL result from urlparse 63 :return: The complete path including query and fragment if present 64 """ 65 path = pr.path 66 if pr.query: 67 path += "?" + pr.query 68 if pr.fragment: 69 path += "#" + pr.fragment 70 return path 71 72 73def _resolve_msc_url(url: str) -> tuple[str, str]: 74 """ 75 Resolve an MSC URL to a profile name and path. 76 77 :param url: The MSC URL to resolve (msc://profile/path) 78 :return: A tuple of (profile_name, path) 79 """ 80 pr = urlparse(url) 81 profile = pr.netloc 82 path = _build_full_path(pr) 83 if path.startswith("/"): 84 path = path[1:] 85 return profile, path 86 87 88def _resolve_non_msc_url(url: str) -> tuple[str, str]: 89 """ 90 Resolve a non-MSC URL to a profile name and path. 91 92 Resolution process: 93 1. First check if MSC config exists 94 2. If config exists, check for possible path mapping 95 3. 
If no mapping is found, fall back to default POSIX profile 96 for file paths or create an implicit profile based on URL 97 98 :param url: The non-MSC URL to resolve 99 :return: A tuple of (profile_name, path) 100 """ 101 # Check if we have a valid path mapping, if so check if there is a matching mapping 102 path_mapping = StorageClientConfig.read_path_mapping() 103 if path_mapping: 104 # Look for a matching mapping 105 possible_mapping = path_mapping.find_mapping(url) 106 if possible_mapping: 107 return possible_mapping # return the profile name and path 108 109 # For file paths, use the default POSIX profile 110 if url.startswith("file://"): 111 pr = urlparse(url) 112 return DEFAULT_POSIX_PROFILE_NAME, _build_full_path(pr) 113 elif url.startswith("/"): 114 url = os.path.normpath(url) 115 return DEFAULT_POSIX_PROFILE_NAME, url 116 117 # For other URL protocol, create an implicit profile name 118 pr = urlparse(url) 119 protocol = pr.scheme.lower() 120 121 # Translate relative paths to absolute paths 122 if not protocol: 123 return DEFAULT_POSIX_PROFILE_NAME, os.path.realpath(url) 124 125 # Validate the protocol is supported 126 if protocol not in SUPPORTED_IMPLICIT_PROFILE_PROTOCOLS: 127 supported_protocols = ", ".join([f"{p}://" for p in SUPPORTED_IMPLICIT_PROFILE_PROTOCOLS]) 128 raise ValueError( 129 f'Unknown URL "{url}", expecting "{MSC_PROTOCOL}" or a supported protocol ({supported_protocols}) or a POSIX path' 130 ) 131 132 # Build the implicit profile name using the format _protocol-bucket 133 bucket = pr.netloc 134 if not bucket: 135 raise ValueError(f'Invalid URL "{url}", bucket name is required for {protocol}:// URLs') 136 137 profile_name = f"_{protocol}-{bucket}" 138 139 # Return normalized path with leading slash removed 140 path = pr.path 141 if path.startswith("/"): 142 path = path[1:] 143 144 return profile_name, path 145 146
def resolve_storage_client(url: str) -> tuple[StorageClient, str]:
    """
    Build and return a :py:class:`multistorageclient.StorageClient` instance based on the provided URL or path.

    This function parses the given URL or path and determines the appropriate storage profile and path.
    It supports URLs with the protocol ``msc://``, as well as POSIX paths or ``file://`` URLs for local file
    system access. If the profile has already been instantiated, it returns the cached client. Otherwise,
    it creates a new :py:class:`StorageClient` and caches it.

    The function also supports implicit profiles for non-MSC URLs. When a non-MSC URL is provided (like s3://,
    gs://, ais://, file://), MSC will infer the storage provider based on the URL protocol and create an implicit
    profile with the naming convention "_protocol-bucket" (e.g., "_s3-bucket1", "_gs-bucket1").

    Path mappings defined in the MSC configuration are applied before creating implicit profiles.
    This allows for explicit mappings between source paths and destination MSC profiles.

    :param url: The storage location, which can be:
        - A URL in the format ``msc://profile/path`` for object storage.
        - A local file system path (absolute or relative POSIX path) or a ``file://`` URL.
        - A non-MSC URL with a supported protocol (s3://, gs://, ais://).

    :return: A tuple containing the :py:class:`multistorageclient.StorageClient` instance and the parsed path.

    :raises ValueError: If the URL's protocol is neither ``msc`` nor a valid local file system path
        or a supported non-MSC protocol.
    """
    # A URL that went through pathlib.Path has "msc://" collapsed to "msc:/". Restore the double
    # slash on the prefix only (count=1) so that any later "msc:/" inside the path is left intact.
    if url.startswith("msc:/") and not url.startswith("msc://"):
        url = url.replace("msc:/", "msc://", 1)

    # Resolve the URL to a profile name and path
    if url.startswith(MSC_PROTOCOL):
        profile, path = _resolve_msc_url(url)
    else:
        profile, path = _resolve_non_msc_url(url)

    # Fast path: the profile has already been instantiated, no locking required.
    client = _STORAGE_CLIENT_CACHE.get(profile)
    if client is not None:
        return client, path

    with _STORAGE_CLIENT_CACHE_LOCK:
        # Re-check under the lock: another thread may have created the client in the meantime.
        client = _STORAGE_CLIENT_CACHE.get(profile)
        if client is None:
            client = StorageClient(config=StorageClientConfig.from_file(profile=profile, telemetry=get_telemetry()))
            _STORAGE_CLIENT_CACHE[profile] = client

    return client, path
196 197
def open(url: str, mode: str = "rb", **kwargs: Any) -> Union[PosixFile, ObjectFile]:
    """
    Open the file at the given URL in the requested mode.

    The URL is resolved with :py:func:`resolve_storage_client`, and the call is delegated to the
    ``open`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the file to open. (example: ``msc://profile/prefix/dataset.tar``)
    :param mode: The file mode to open the file in.
    :param kwargs: Additional keyword arguments forwarded unchanged to the client's ``open``.

    :return: A file-like object that allows interaction with the file.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_path = resolve_storage_client(url)
    file_obj = storage_client.open(resolved_path, mode, **kwargs)
    return file_obj
214 215
def glob(pattern: str, attribute_filter_expression: Optional[str] = None) -> list[str]:
    """
    Return a list of files matching a glob-style pattern.

    The pattern is resolved with :py:func:`resolve_storage_client`, and the matching is delegated to the
    ``glob`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param pattern: The glob-style pattern to match files. (example: ``msc://profile/prefix/**/*.tar``)
    :param attribute_filter_expression: The attribute filter expression to apply to the result.

    :return: A list of file paths matching the pattern.

    :raises ValueError: If the pattern cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_pattern = resolve_storage_client(pattern)

    # A non-msc pattern that resolved to the default POSIX profile is the only case where the
    # client is asked not to include the URL prefix; every other case requests it.
    is_plain_posix = not pattern.startswith(MSC_PROTOCOL) and storage_client.profile == DEFAULT_POSIX_PROFILE_NAME

    return storage_client.glob(
        resolved_pattern,
        include_url_prefix=not is_plain_posix,
        attribute_filter_expression=attribute_filter_expression,
    )
236 237
def upload_file(url: str, local_path: str, attributes: Optional[dict[str, str]] = None) -> None:
    """
    Upload a local file to the given URL.

    The URL is resolved with :py:func:`resolve_storage_client`, and the upload is delegated to the
    ``upload_file`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the file. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path of the file.
    :param attributes: Optional attributes forwarded unchanged to the client's ``upload_file``.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, remote_path = resolve_storage_client(url)
    return storage_client.upload_file(
        remote_path=remote_path,
        local_path=local_path,
        attributes=attributes,
    )
253 254
def download_file(url: str, local_path: str) -> None:
    """
    Download the file at the given URL to a local path.

    The URL is resolved with :py:func:`resolve_storage_client`, and the download is delegated to the
    ``download_file`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the file to download. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path where the file should be downloaded.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, remote_path = resolve_storage_client(url)
    return storage_client.download_file(remote_path=remote_path, local_path=local_path)
270 271
def is_empty(url: str) -> bool:
    """
    Check whether the specified URL contains any objects.

    The URL is resolved with :py:func:`resolve_storage_client`, and the check is delegated to the
    ``is_empty`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check, typically pointing to a storage location.
    :return: ``True`` if there are no objects/files under this URL, ``False`` otherwise.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_path = resolve_storage_client(url)
    return storage_client.is_empty(resolved_path)
283 284
def is_file(url: str) -> bool:
    """
    Check whether the specified URL points to a file (rather than a directory or folder).

    The URL is resolved with :py:func:`resolve_storage_client`, and the check is delegated to the
    ``is_file`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check the existence of a file. (example: ``msc://profile/prefix/dataset.tar``)
    :return: The result of the client's ``is_file`` check for the resolved path.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_path = resolve_storage_client(url)
    return storage_client.is_file(path=resolved_path)
297 298
def sync(source_url: str, target_url: str, delete_unmatched_files: bool = False) -> None:
    """
    Sync files from the source storage to the target storage.

    Both URLs are resolved with :py:func:`resolve_storage_client`; the target client then pulls from
    the source client through its ``sync_from`` method.

    :param source_url: The URL for the source storage.
    :param target_url: The URL for the target storage.
    :param delete_unmatched_files: Whether to delete files at the target that are not present at the source.

    :raises ValueError: If either URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    src_client, src_path = resolve_storage_client(source_url)
    dst_client, dst_path = resolve_storage_client(target_url)
    dst_client.sync_from(src_client, src_path, dst_path, delete_unmatched_files)
310 311
def list(
    url: str,
    start_after: Optional[str] = None,
    end_at: Optional[str] = None,
    include_directories: bool = False,
    attribute_filter_expression: Optional[str] = None,
) -> Iterator[ObjectMetadata]:
    """
    List the contents of the specified URL prefix.

    The URL is resolved with :py:func:`resolve_storage_client`, and the listing is delegated to the
    ``list`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The prefix to list objects under.
    :param start_after: The key to start after (i.e. exclusive). An object with this key doesn't have to exist.
    :param end_at: The key to end at (i.e. inclusive). An object with this key doesn't have to exist.
    :param include_directories: Whether to include directories in the result. When True, directories are
        returned alongside objects.
    :param attribute_filter_expression: The attribute filter expression to apply to the result.

    :return: An iterator of :py:class:`ObjectMetadata` objects representing the files (and optionally
        directories) accessible under the specified URL prefix. ``include_url_prefix=True`` is always
        passed to the client, so the returned keys are expected to carry the URL prefix.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_prefix = resolve_storage_client(url)
    return storage_client.list(
        prefix=resolved_prefix,
        start_after=start_after,
        end_at=end_at,
        include_directories=include_directories,
        include_url_prefix=True,
        attribute_filter_expression=attribute_filter_expression,
    )
343 344
def write(url: str, body: bytes, attributes: Optional[dict[str, str]] = None) -> None:
    """
    Write an object to the storage provider at the specified URL.

    The URL is resolved with :py:func:`resolve_storage_client`, and the write is delegated to the
    ``write`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The path where the object should be written.
    :param body: The content to write to the object.
    :param attributes: Optional attributes forwarded unchanged to the client's ``write``.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_path = resolve_storage_client(url)
    storage_client.write(path=resolved_path, body=body, attributes=attributes)
354 355
def delete(url: str, recursive: bool = False) -> None:
    """
    Delete the specified object(s) from the storage provider.

    The URL is resolved with :py:func:`resolve_storage_client`, and the deletion is delegated to the
    ``delete`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the object to delete. (example: ``msc://profile/prefix/file.txt``)
    :param recursive: Whether to delete objects in the path recursively.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_path = resolve_storage_client(url)
    storage_client.delete(resolved_path, recursive=recursive)
368 369
def info(url: str) -> ObjectMetadata:
    """
    Retrieve metadata about the object stored at the specified URL.

    The URL is resolved with :py:func:`resolve_storage_client`, and the lookup is delegated to the
    ``info`` method of the resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the object to retrieve information about. (example: ``msc://profile/prefix/file.txt``)

    :return: An :py:class:`ObjectMetadata` object representing the object's metadata.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_path = resolve_storage_client(url)
    metadata = storage_client.info(resolved_path)
    return metadata
380 381
def commit_metadata(url: str) -> None:
    """
    Commit the metadata updates for the storage client profile that the URL resolves to.

    The URL is resolved with :py:func:`resolve_storage_client`; the resolved path is passed as the
    ``prefix`` argument to the client's ``commit_metadata`` method.

    :param url: The URL of the path to commit metadata for.

    :raises ValueError: If the URL cannot be resolved to a storage profile (e.g. unsupported protocol).
    """
    storage_client, resolved_prefix = resolve_storage_client(url)
    storage_client.commit_metadata(prefix=resolved_prefix)