Source code for multistorageclient.shortcuts

  1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2# SPDX-License-Identifier: Apache-2.0
  3#
  4# Licensed under the Apache License, Version 2.0 (the "License");
  5# you may not use this file except in compliance with the License.
  6# You may obtain a copy of the License at
  7#
  8# http://www.apache.org/licenses/LICENSE-2.0
  9#
 10# Unless required by applicable law or agreed to in writing, software
 11# distributed under the License is distributed on an "AS IS" BASIS,
 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13# See the License for the specific language governing permissions and
 14# limitations under the License.
 15
 16import os
 17import threading
 18from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 19from urllib.parse import ParseResult, urlparse
 20
 21from .client import StorageClient
 22from .config import StorageClientConfig
 23from .file import ObjectFile, PosixFile
 24from .types import DEFAULT_POSIX_PROFILE_NAME, MSC_PROTOCOL, ObjectMetadata
 25
 26_instance_cache: Dict[str, StorageClient] = {}
 27_cache_lock = threading.Lock()
 28
 29
 30def _build_full_path(pr: ParseResult) -> str:
 31    """
 32    Helper function to construct the full path from a parsed URL, including query and fragment.
 33
 34    :param pr: The parsed URL result from urlparse
 35    :return: The complete path including query and fragment if present
 36    """
 37    path = pr.path
 38    if pr.query:
 39        path += "?" + pr.query
 40    if pr.fragment:
 41        path += "#" + pr.fragment
 42    return path
 43
 44
def resolve_storage_client(url: str) -> Tuple[StorageClient, str]:
    """
    Map a URL or path to the :py:class:`multistorageclient.StorageClient` that serves it.

    Three input forms are recognised:

    - ``msc://profile/path``: the network location names the profile; the remainder of the URL,
      minus one leading ``/``, becomes the path.
    - ``file://...``: served by the default POSIX profile; the path is taken from the URL as written.
    - A string starting with ``/``: normalized with :py:func:`os.path.normpath` and served by the
      default POSIX profile.

    One client is kept per profile in a module-level cache. A client is built from
    :py:meth:`StorageClientConfig.from_file` the first time its profile is requested.

    :param url: An ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.

    :return: A tuple containing the :py:class:`multistorageclient.StorageClient` instance and the parsed path.

    :raises ValueError: If ``url`` matches none of the forms above, or if the normalized POSIX path is not absolute.
    """
    if url.startswith(MSC_PROTOCOL):
        parsed = urlparse(url)
        profile_name = parsed.netloc
        resolved_path = _build_full_path(parsed)
        # Drop exactly one leading slash so the path is relative to the profile.
        if resolved_path.startswith("/"):
            resolved_path = resolved_path[1:]
    elif url.startswith("file://"):
        profile_name = DEFAULT_POSIX_PROFILE_NAME
        resolved_path = _build_full_path(urlparse(url))
    elif url.startswith("/"):
        # POSIX paths (only absolute paths are supported)
        normalized = os.path.normpath(url)
        if not os.path.isabs(normalized):
            raise ValueError(f'Invalid POSIX path "{normalized}", only absolute path is allowed')
        profile_name = DEFAULT_POSIX_PROFILE_NAME
        resolved_path = normalized
    else:
        raise ValueError(f'Unknown URL "{url}", expecting "{MSC_PROTOCOL}" or a POSIX path')

    # Fast path: the profile already has a client, no lock needed.
    if profile_name in _instance_cache:
        return _instance_cache[profile_name], resolved_path

    with _cache_lock:
        # Look again while holding the lock in case the entry was added between the
        # check above and acquiring the lock.
        if profile_name not in _instance_cache:
            _instance_cache[profile_name] = StorageClient(
                config=StorageClientConfig.from_file(profile=profile_name)
            )
        return _instance_cache[profile_name], resolved_path
96 97
def open(url: str, mode: str = "rb", **kwargs: Any) -> Union[PosixFile, ObjectFile]:
    """
    Open the file that ``url`` refers to.

    The URL is resolved with :py:func:`resolve_storage_client` and the call is forwarded to
    :py:meth:`multistorageclient.StorageClient.open` on the resulting client.

    :param url: The URL of the file to open. (example: ``msc://profile/prefix/dataset.tar``)
    :param mode: The file mode to open the file in.
    :param kwargs: Extra keyword arguments passed through unchanged to the client's ``open``.

    :return: A file-like object that allows interaction with the file.

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, file_path = resolve_storage_client(url)
    return storage_client.open(file_path, mode, **kwargs)
114 115
def glob(pattern: str) -> List[str]:
    """
    Return the files that match a glob-style pattern.

    The pattern is resolved with :py:func:`resolve_storage_client` and matched by
    :py:meth:`multistorageclient.StorageClient.glob`. The client is asked to omit the URL prefix
    only when the pattern was not given as an ``msc://`` URL and it resolved to the default POSIX
    profile; in every other case the URL prefix is requested.

    :param pattern: The glob-style pattern to match files. (example: ``msc://profile/prefix/**/*.tar``)

    :return: A list of file paths matching the pattern.

    :raises ValueError: If ``pattern`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, glob_path = resolve_storage_client(pattern)
    plain_posix_input = (
        not pattern.startswith(MSC_PROTOCOL) and storage_client.profile == DEFAULT_POSIX_PROFILE_NAME
    )
    return storage_client.glob(glob_path, include_url_prefix=not plain_posix_input)
135 136
def upload_file(url: str, local_path: str) -> None:
    """
    Upload a local file to the location that ``url`` refers to.

    The URL is resolved with :py:func:`resolve_storage_client` and the transfer is delegated to
    :py:meth:`multistorageclient.StorageClient.upload_file`, whose result is returned as-is.

    :param url: The destination URL of the file. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path of the file to upload.

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, remote_path = resolve_storage_client(url)
    return storage_client.upload_file(remote_path=remote_path, local_path=local_path)
152 153
def download_file(url: str, local_path: str) -> None:
    """
    Download the file that ``url`` refers to into a local path.

    The URL is resolved with :py:func:`resolve_storage_client` and the transfer is delegated to
    :py:meth:`multistorageclient.StorageClient.download_file`, whose result is returned as-is.

    :param url: The URL of the file to download. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path where the file should be downloaded.

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, remote_path = resolve_storage_client(url)
    return storage_client.download_file(remote_path=remote_path, local_path=local_path)
169 170
def is_empty(url: str) -> bool:
    """
    Report whether the location that ``url`` refers to contains no objects.

    The URL is resolved with :py:func:`resolve_storage_client` and the check is delegated to
    :py:meth:`multistorageclient.StorageClient.is_empty`.

    :param url: The URL to check, typically pointing to a storage location.
    :return: ``True`` if there are no objects/files under this URL, ``False`` otherwise.

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, location = resolve_storage_client(url)
    return storage_client.is_empty(location)
182 183
def is_file(url: str) -> bool:
    """
    Report whether ``url`` points to a file (rather than a directory or folder).

    The URL is resolved with :py:func:`resolve_storage_client` and the check is delegated to
    :py:meth:`multistorageclient.StorageClient.is_file`.

    :param url: The URL to check the existence of a file. (example: ``msc://profile/prefix/dataset.tar``)
    :return: The result of the client's ``is_file`` check for the resolved path.

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, candidate = resolve_storage_client(url)
    return storage_client.is_file(path=candidate)
196 197
def sync(source_url: str, target_url: str, delete_unmatched_files: bool = False) -> None:
    """
    Sync files from the source storage to the target storage.

    Both URLs are resolved with :py:func:`resolve_storage_client` (source first, then target) and
    the copy is driven by the target client's ``sync_from`` method.

    :param source_url: The URL for the source storage.
    :param target_url: The URL for the target storage.
    :param delete_unmatched_files: Whether to delete files at the target that are not present at the source.

    :raises ValueError: If either URL is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    src_client, src_path = resolve_storage_client(source_url)
    dst_client, dst_path = resolve_storage_client(target_url)
    dst_client.sync_from(src_client, src_path, dst_path, delete_unmatched_files)
209 210
def list(
    url: str, start_after: Optional[str] = None, end_at: Optional[str] = None, include_directories: bool = False
) -> Iterator[ObjectMetadata]:
    """
    List the contents under the given URL prefix.

    The URL is resolved with :py:func:`resolve_storage_client` and the listing is delegated to
    :py:meth:`multistorageclient.StorageClient.list`, always requesting URL-prefixed keys
    (``include_url_prefix=True``).

    :param url: The prefix to list objects under.
    :param start_after: The key to start after (i.e. exclusive). An object with this key doesn't have to exist.
    :param end_at: The key to end at (i.e. inclusive). An object with this key doesn't have to exist.
    :param include_directories: Whether to include directories in the result. When True, directories are returned alongside objects.

    :return: An iterator of :py:class:`ObjectMetadata` objects representing the files (and optionally directories)
        accessible under the specified URL prefix.

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, key_prefix = resolve_storage_client(url)
    return storage_client.list(
        prefix=key_prefix,
        start_after=start_after,
        end_at=end_at,
        include_directories=include_directories,
        include_url_prefix=True,
    )
236 237
def write(url: str, body: bytes) -> None:
    """
    Write an object to the location that ``url`` refers to.

    The URL is resolved with :py:func:`resolve_storage_client` and the write is delegated to
    :py:meth:`multistorageclient.StorageClient.write`.

    :param url: The path where the object should be written.
    :param body: The content to write to the object.

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, destination = resolve_storage_client(url)
    storage_client.write(path=destination, body=body)
247 248
def delete(url: str) -> None:
    """
    Delete the object that ``url`` refers to.

    The URL is resolved with :py:func:`resolve_storage_client` and the deletion is delegated to
    :py:meth:`multistorageclient.StorageClient.delete`.

    :param url: The URL of the object to delete. (example: ``msc://profile/prefix/file.txt``)

    :raises ValueError: If ``url`` is not an ``msc://`` URL, a ``file://`` URL, or an absolute POSIX path.
    """
    storage_client, doomed_path = resolve_storage_client(url)
    storage_client.delete(doomed_path)