Source code for multistorageclient.shortcuts

  1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2# SPDX-License-Identifier: Apache-2.0
  3#
  4# Licensed under the Apache License, Version 2.0 (the "License");
  5# you may not use this file except in compliance with the License.
  6# You may obtain a copy of the License at
  7#
  8# http://www.apache.org/licenses/LICENSE-2.0
  9#
 10# Unless required by applicable law or agreed to in writing, software
 11# distributed under the License is distributed on an "AS IS" BASIS,
 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13# See the License for the specific language governing permissions and
 14# limitations under the License.
 15
 16import threading
 17from pathlib import Path
 18from typing import Any, Dict, List, Tuple, Union
 19from urllib.parse import urlparse
 20
 21from .client import StorageClient
 22from .config import StorageClientConfig
 23from .file import ObjectFile, PosixFile
 24from .types import DEFAULT_POSIX_PROFILE_NAME, MSC_PROTOCOL, MSC_PROTOCOL_NAME
 25
# Module-level cache of StorageClient instances keyed by profile name.
# resolve_storage_client() reads it and adds an entry the first time a profile is requested.
_instance_cache: Dict[str, StorageClient] = {}
# Lock held by resolve_storage_client() while it creates and stores a new client in _instance_cache.
_cache_lock = threading.Lock()
 28
 29
def resolve_storage_client(url: str) -> Tuple[StorageClient, str]:
    """
    Build and return a :py:class:`multistorageclient.StorageClient` instance based on the provided URL or path.

    This function parses the given URL or path and determines the appropriate storage profile and path.
    It supports URLs with the protocol ``msc://``, as well as POSIX paths or ``file://`` URLs for local file
    system access. If the profile has already been instantiated, it returns the cached client. Otherwise,
    it creates a new :py:class:`StorageClient` and caches it.

    The characters ``;``, ``?`` and ``#`` are URL delimiters for :py:func:`urllib.parse.urlparse`, but they
    are also legal in file names, object keys and glob patterns (for example ``msc://profile/shard-?.tar``).
    The params, query and fragment components are therefore appended back onto the returned path instead of
    being dropped. A delimiter followed by nothing (for example a trailing ``?``) cannot be told apart from
    an absent one after parsing and is not restored.

    :param url: The storage location, which can be:
        - A URL in the format ``msc://profile/path`` for object storage.
        - A local file system path (absolute POSIX path) or a ``file://`` URL.

    :return: A tuple containing the :py:class:`multistorageclient.StorageClient` instance and the parsed path.

    :raises ValueError: If the URL's protocol is neither ``msc`` nor ``file``, or if a local path is not
        absolute.
    """
    parsed = urlparse(url)

    # Re-attach everything urlparse() split off the path, in URL order (path;params?query#fragment),
    # so names and patterns containing these characters are not silently truncated.
    full_path = parsed.path
    if parsed.params:
        full_path += ";" + parsed.params
    if parsed.query:
        full_path += "?" + parsed.query
    if parsed.fragment:
        full_path += "#" + parsed.fragment

    if parsed.scheme == MSC_PROTOCOL_NAME:
        profile = parsed.netloc
        # The slash separating the profile from the path is not part of the path itself.
        path = full_path[1:] if full_path.startswith("/") else full_path
    elif parsed.scheme in ("", "file"):
        if not Path(parsed.path).is_absolute():
            raise ValueError(f'Invalid POSIX path "{url}", only absolute path is allowed')
        profile = DEFAULT_POSIX_PROFILE_NAME
        path = full_path
    else:
        raise ValueError(f'Unknown URL "{url}", expecting "{MSC_PROTOCOL}" or a POSIX path')

    # Fast path: an already-cached client is returned without taking the lock.
    if profile in _instance_cache:
        return _instance_cache[profile], path

    with _cache_lock:
        # Check again while holding the lock: another thread may have created the client
        # between the unlocked check above and acquiring the lock.
        if profile not in _instance_cache:
            _instance_cache[profile] = StorageClient(config=StorageClientConfig.from_file(profile=profile))
        return _instance_cache[profile], path
76 77
def open(url: str, mode: str = "rb", **kwargs: Any) -> Union[PosixFile, ObjectFile]:
    """
    Open the file identified by ``url`` and return a file-like object.

    The URL is handed to :py:func:`resolve_storage_client` to obtain the
    :py:class:`multistorageclient.StorageClient` for its profile together with the path inside that
    profile; the actual open is delegated to that client.

    :param url: The URL of the file to open. (example: ``msc://profile/prefix/dataset.tar``)
    :param mode: The file mode to open the file in. Defaults to ``"rb"``.
    :param kwargs: Extra keyword arguments forwarded unchanged to the client's ``open`` method.

    :return: The file-like object returned by the storage client.

    :raises ValueError: If ``url`` is neither an ``msc://`` URL, a ``file://`` URL, nor an absolute
        POSIX path.
    """
    storage_client, file_path = resolve_storage_client(url)
    file_obj = storage_client.open(file_path, mode, **kwargs)
    return file_obj
94 95
def glob(pattern: str) -> List[str]:
    """
    Return the files that match a glob-style pattern.

    The pattern is resolved with :py:func:`resolve_storage_client`, and the matching is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`. Results are requested without a URL prefix
    only when the pattern was not written as an ``msc://`` URL and the client belongs to the default
    POSIX profile; in every other case they are requested with the URL prefix.

    :param pattern: The glob-style pattern to match files. (example: ``msc://profile/prefix/**/*.tar``)

    :return: A list of file paths matching the pattern.

    :raises ValueError: If ``pattern`` is neither an ``msc://`` URL, a ``file://`` URL, nor an absolute
        POSIX path.
    """
    storage_client, glob_path = resolve_storage_client(pattern)
    # The profile is only inspected when the pattern is not an msc:// URL (short-circuit `or`).
    with_url_prefix = pattern.startswith(MSC_PROTOCOL) or storage_client.profile != DEFAULT_POSIX_PROFILE_NAME
    return storage_client.glob(glob_path, include_url_prefix=with_url_prefix)
115 116
def upload_file(url: str, local_path: str) -> None:
    """
    Upload a local file to the location identified by ``url``.

    The URL is resolved with :py:func:`resolve_storage_client`; the upload itself is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The destination URL of the file. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path of the file to upload.

    :raises ValueError: If ``url`` is neither an ``msc://`` URL, a ``file://`` URL, nor an absolute
        POSIX path.
    """
    storage_client, destination = resolve_storage_client(url)
    outcome = storage_client.upload_file(remote_path=destination, local_path=local_path)
    return outcome
132 133
def download_file(url: str, local_path: str) -> None:
    """
    Download the file identified by ``url`` to a local path.

    The URL is resolved with :py:func:`resolve_storage_client`; the download itself is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the file to download. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path where the file should be downloaded.

    :raises ValueError: If ``url`` is neither an ``msc://`` URL, a ``file://`` URL, nor an absolute
        POSIX path.
    """
    storage_client, source = resolve_storage_client(url)
    outcome = storage_client.download_file(remote_path=source, local_path=local_path)
    return outcome
149 150
def is_empty(url: str) -> bool:
    """
    Report whether the location identified by ``url`` contains no objects.

    The URL is resolved with :py:func:`resolve_storage_client`, and the check is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check, typically pointing to a storage location.

    :return: ``True`` if there are no objects/files under this URL, ``False`` otherwise.

    :raises ValueError: If ``url`` is neither an ``msc://`` URL, a ``file://`` URL, nor an absolute
        POSIX path.
    """
    storage_client, location = resolve_storage_client(url)
    empty = storage_client.is_empty(location)
    return empty
162 163
def is_file(url: str) -> bool:
    """
    Report whether ``url`` points to a file (rather than a directory or folder).

    The URL is resolved with :py:func:`resolve_storage_client`, and the check is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check the existence of a file. (example: ``msc://profile/prefix/dataset.tar``)

    :return: The result of the storage client's ``is_file`` check for the resolved path.

    :raises ValueError: If ``url`` is neither an ``msc://`` URL, a ``file://`` URL, nor an absolute
        POSIX path.
    """
    storage_client, candidate = resolve_storage_client(url)
    found = storage_client.is_file(path=candidate)
    return found