1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16import os
17import threading
18from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
19from urllib.parse import ParseResult, urlparse
20
21from .client import StorageClient
22from .config import StorageClientConfig
23from .file import ObjectFile, PosixFile
24from .types import DEFAULT_POSIX_PROFILE_NAME, MSC_PROTOCOL, ObjectMetadata
25
# Module-level cache of StorageClient instances, keyed by profile name.
# Populated lazily by resolve_storage_client().
_instance_cache: Dict[str, StorageClient] = {}
# Held by resolve_storage_client() while it creates and stores a new cache entry.
_cache_lock = threading.Lock()
28
29
def _build_full_path(pr: ParseResult) -> str:
    """
    Reassemble the path portion of a parsed URL, re-attaching the query and fragment.

    ``urlparse`` splits ``?query`` and ``#fragment`` off the path; this helper glues any
    non-empty ones back on so the caller gets the text that followed the network location.

    :param pr: The parsed URL result from urlparse
    :return: ``pr.path`` followed by ``?<query>`` and ``#<fragment>`` when those parts are non-empty
    """
    pieces = [pr.path]
    if pr.query:
        pieces.append(f"?{pr.query}")
    if pr.fragment:
        pieces.append(f"#{pr.fragment}")
    return "".join(pieces)
43
44
[docs]
def resolve_storage_client(url: str) -> Tuple[StorageClient, str]:
    """
    Resolve a URL or path to a :py:class:`multistorageclient.StorageClient` and the path to use with it.

    Three input forms are recognised:

    - URLs starting with ``MSC_PROTOCOL``: the network location is taken as the profile name and the
      rest of the URL (path, query and fragment) becomes the path, with one leading ``/`` removed.
    - ``file://`` URLs: mapped to the default POSIX profile; the path keeps its leading ``/``.
    - Strings starting with ``/``: normalised with :py:func:`os.path.normpath` and mapped to the
      default POSIX profile.

    Clients are cached per profile name in the module-level ``_instance_cache``; a new
    :py:class:`StorageClient` is only built (from ``StorageClientConfig.from_file``) when the profile
    has no cached client yet.

    :param url: The storage location, which can be:
        - A URL in the format ``msc://profile/path`` for object storage.
        - A local file system path (absolute POSIX path) or a ``file://`` URL.

    :return: A tuple containing the :py:class:`multistorageclient.StorageClient` instance and the parsed path.

    :raises ValueError: If ``url`` matches none of the forms above, or if a ``/``-prefixed path is
        not absolute after normalisation.
    """
    if url.startswith(MSC_PROTOCOL):
        parsed = urlparse(url)
        profile = parsed.netloc
        full_path = _build_full_path(parsed)
        # Drop a single leading slash so the path is relative to the profile.
        path = full_path[1:] if full_path.startswith("/") else full_path
    elif url.startswith("file://"):
        profile = DEFAULT_POSIX_PROFILE_NAME
        path = _build_full_path(urlparse(url))
    elif url.startswith("/"):
        # POSIX paths (only absolute paths are supported)
        url = os.path.normpath(url)
        if not os.path.isabs(url):
            raise ValueError(f'Invalid POSIX path "{url}", only absolute path is allowed')
        profile = DEFAULT_POSIX_PROFILE_NAME
        path = url
    else:
        raise ValueError(f'Unknown URL "{url}", expecting "{MSC_PROTOCOL}" or a POSIX path')

    # Fast path: hand back an already-built client without taking the lock.
    try:
        return _instance_cache[profile], path
    except KeyError:
        pass

    with _cache_lock:
        # Look again while holding the lock; the entry may have been added since the check above.
        if profile not in _instance_cache:
            _instance_cache[profile] = StorageClient(config=StorageClientConfig.from_file(profile=profile))
        return _instance_cache[profile], path
96
97
[docs]
def open(url: str, mode: str = "rb", **kwargs: Any) -> Union[PosixFile, ObjectFile]:
    """
    Open the file addressed by ``url`` in the requested mode.

    The URL is resolved through :py:func:`resolve_storage_client`, and the open call is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the file to open. (example: ``msc://profile/prefix/dataset.tar``)
    :param mode: The file mode to open the file in.
    :param kwargs: Extra keyword arguments, forwarded unchanged to the client's ``open`` method.

    :return: A file-like object that allows interaction with the file.

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, file_path = resolve_storage_client(url)
    return storage_client.open(file_path, mode, **kwargs)
114
115
[docs]
def glob(pattern: str) -> List[str]:
    """
    Return the files that match a glob-style pattern.

    The pattern is resolved through :py:func:`resolve_storage_client` and the matching is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`. The client is called with
    ``include_url_prefix=False`` only when the pattern does not start with ``MSC_PROTOCOL`` and resolves
    to the default POSIX profile; in every other case ``include_url_prefix=True`` is passed.

    :param pattern: The glob-style pattern to match files. (example: ``msc://profile/prefix/**/*.tar``)

    :return: A list of file paths matching the pattern.

    :raises ValueError: If ``pattern`` is rejected by :py:func:`resolve_storage_client`.
    """
    client, path = resolve_storage_client(pattern)
    # True when the caller supplied a local path / file:// URL served by the default POSIX profile.
    is_plain_posix = not pattern.startswith(MSC_PROTOCOL) and client.profile == DEFAULT_POSIX_PROFILE_NAME
    return client.glob(path, include_url_prefix=not is_plain_posix)
135
136
[docs]
def upload_file(url: str, local_path: str) -> None:
    """
    Upload a local file to the location addressed by ``url``.

    The URL is resolved through :py:func:`resolve_storage_client` and the upload is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the file. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path of the file.

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, remote_path = resolve_storage_client(url)
    return storage_client.upload_file(remote_path=remote_path, local_path=local_path)
152
153
[docs]
def download_file(url: str, local_path: str) -> None:
    """
    Download the file addressed by ``url`` to a local path.

    The URL is resolved through :py:func:`resolve_storage_client` and the download is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the file to download. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path where the file should be downloaded.

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, remote_path = resolve_storage_client(url)
    return storage_client.download_file(remote_path=remote_path, local_path=local_path)
169
170
[docs]
def is_empty(url: str) -> bool:
    """
    Report whether the location addressed by ``url`` contains no objects.

    The URL is resolved through :py:func:`resolve_storage_client` and the check is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check, typically pointing to a storage location.
    :return: ``True`` if there are no objects/files under this URL, ``False`` otherwise.

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, location = resolve_storage_client(url)
    return storage_client.is_empty(location)
182
183
[docs]
def is_file(url: str) -> bool:
    """
    Report whether ``url`` points to a file (rather than a directory or folder).

    The URL is resolved through :py:func:`resolve_storage_client` and the check is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check the existence of a file. (example: ``msc://profile/prefix/dataset.tar``)
    :return: The result of the client's ``is_file`` check for the resolved path.

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, file_path = resolve_storage_client(url)
    return storage_client.is_file(path=file_path)
196
197
[docs]
def sync(source_url: str, target_url: str, delete_unmatched_files: bool = False) -> None:
    """
    Sync files from the source storage location into the target storage location.

    Both URLs are resolved through :py:func:`resolve_storage_client`; the target client then pulls
    from the source client via its ``sync_from`` method.

    :param source_url: The URL for the source storage.
    :param target_url: The URL for the target storage.
    :param delete_unmatched_files: Whether to delete files at the target that are not present at the source.

    :raises ValueError: If either URL is rejected by :py:func:`resolve_storage_client`.
    """
    src_client, src_path = resolve_storage_client(source_url)
    dst_client, dst_path = resolve_storage_client(target_url)
    dst_client.sync_from(src_client, src_path, dst_path, delete_unmatched_files)
209
210
[docs]
def list(
    url: str, start_after: Optional[str] = None, end_at: Optional[str] = None, include_directories: bool = False
) -> Iterator[ObjectMetadata]:
    """
    List the contents found under the given URL prefix.

    The URL is resolved through :py:func:`resolve_storage_client` and the listing is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`, always with ``include_url_prefix=True``.

    :param url: The prefix to list objects under.
    :param start_after: The key to start after (i.e. exclusive). An object with this key doesn't have to exist.
    :param end_at: The key to end at (i.e. inclusive). An object with this key doesn't have to exist.
    :param include_directories: Whether to include directories in the result. When True, directories are
        returned alongside objects.

    :return: An iterator of :py:class:`ObjectMetadata` objects representing the files (and optionally
        directories) accessible under the specified URL prefix.

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, resolved_prefix = resolve_storage_client(url)
    listing = storage_client.list(
        prefix=resolved_prefix,
        start_after=start_after,
        end_at=end_at,
        include_directories=include_directories,
        include_url_prefix=True,
    )
    return listing
236
237
[docs]
def write(url: str, body: bytes) -> None:
    """
    Write an object to the location addressed by ``url``.

    The URL is resolved through :py:func:`resolve_storage_client` and the write is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The path where the object should be written.
    :param body: The content to write to the object.

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, object_path = resolve_storage_client(url)
    storage_client.write(path=object_path, body=body)
247
248
[docs]
def delete(url: str) -> None:
    """
    Delete the object addressed by ``url``.

    The URL is resolved through :py:func:`resolve_storage_client` and the deletion is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL of the object to delete. (example: ``msc://profile/prefix/file.txt``)

    :raises ValueError: If ``url`` is rejected by :py:func:`resolve_storage_client`.
    """
    storage_client, object_path = resolve_storage_client(url)
    storage_client.delete(object_path)