1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2# SPDX-License-Identifier: Apache-2.0
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16import threading
17from pathlib import Path
18from typing import Any, Dict, List, Tuple, Union
19from urllib.parse import urlparse
20
21from .client import StorageClient
22from .config import StorageClientConfig
23from .file import ObjectFile, PosixFile
24from .types import DEFAULT_POSIX_PROFILE_NAME, MSC_PROTOCOL, MSC_PROTOCOL_NAME
25
# Serializes creation of new clients so each profile is instantiated only once.
_cache_lock = threading.Lock()
# Process-wide cache of storage clients, keyed by profile name.
_instance_cache: Dict[str, StorageClient] = dict()
28
29
[docs]
def _build_full_path(pr) -> str:
    """
    Reassemble the path portion of a parsed URL, including params, query and fragment.

    ``urlparse`` splits everything after ``;``, ``?`` or ``#`` away from ``path``. Those characters
    are legal in file names and object keys (and ``?`` is a glob wildcard), so they are appended
    back to avoid silently truncating the path.

    :param pr: The result of :py:func:`urllib.parse.urlparse`.

    :return: ``pr.path`` followed by ``;params``, ``?query`` and ``#fragment`` when they are non-empty.
    """
    path = pr.path
    if pr.params:
        path += ";" + pr.params
    if pr.query:
        path += "?" + pr.query
    if pr.fragment:
        path += "#" + pr.fragment
    return path


def resolve_storage_client(url: str) -> Tuple[StorageClient, str]:
    """
    Build and return a :py:class:`multistorageclient.StorageClient` instance based on the provided URL or path.

    This function parses the given URL or path and determines the appropriate storage profile and path.
    It supports URLs with the protocol ``msc://``, as well as POSIX paths or ``file://`` URLs for local file
    system access. If the profile has already been instantiated, it returns the cached client. Otherwise,
    it creates a new :py:class:`StorageClient` and caches it.

    Characters that ``urlparse`` treats as URL delimiters (``;``, ``?``, ``#``) are kept as part of the
    returned path, so keys and glob patterns such as ``msc://profile/dir/file?.tar`` are not truncated.

    :param url: The storage location, which can be:
        - A URL in the format ``msc://profile/path`` for object storage.
        - A local file system path (absolute POSIX path) or a ``file://`` URL.

    :return: A tuple containing the :py:class:`multistorageclient.StorageClient` instance and the parsed path.

    :raises ValueError: If the URL's scheme is not ``msc``, ``file`` or empty, or if a local path is not absolute.
    """
    pr = urlparse(url)
    full_path = _build_full_path(pr)

    if pr.scheme == MSC_PROTOCOL_NAME:
        profile = pr.netloc
        # The path inside a profile is relative, so drop the leading slash.
        path = full_path[1:] if full_path.startswith("/") else full_path
    elif pr.scheme in ("", "file"):
        if not Path(pr.path).is_absolute():
            raise ValueError(f'Invalid POSIX path "{url}", only absolute path is allowed')
        profile = DEFAULT_POSIX_PROFILE_NAME
        path = full_path
    else:
        raise ValueError(f'Unknown URL "{url}", expecting "{MSC_PROTOCOL}" or a POSIX path')

    # Fast path: skip the lock when the client for this profile already exists.
    if profile in _instance_cache:
        return _instance_cache[profile], path

    with _cache_lock:
        # Re-check under the lock: another thread may have created the client meanwhile.
        client = _instance_cache.get(profile)
        if client is None:
            client = StorageClient(config=StorageClientConfig.from_file(profile=profile))
            _instance_cache[profile] = client

    return client, path
76
77
[docs]
def open(url: str, mode: str = "rb", **kwargs: Any) -> Union[PosixFile, ObjectFile]:
    """
    Open the file located at ``url`` and return a file-like object.

    The URL is resolved to a :py:class:`multistorageclient.StorageClient` (cached or newly built) and a
    path via :py:func:`resolve_storage_client`; the actual open is delegated to that client.

    :param url: The URL of the file to open. (example: ``msc://profile/prefix/dataset.tar``)
    :param mode: The file mode to open the file in.
    :param kwargs: Extra keyword arguments forwarded unchanged to the client's ``open`` method.

    :return: A file-like object that allows interaction with the file.

    :raises ValueError: If the URL is neither an ``msc://`` URL nor an absolute POSIX path / ``file://`` URL.
    """
    storage_client, resolved_path = resolve_storage_client(url)
    file_obj = storage_client.open(resolved_path, mode, **kwargs)
    return file_obj
94
95
[docs]
def glob(pattern: str) -> List[str]:
    """
    List the files that match a glob-style pattern.

    The pattern is resolved to a :py:class:`multistorageclient.StorageClient` and a path pattern via
    :py:func:`resolve_storage_client`, and the matching is delegated to that client.

    :param pattern: The glob-style pattern to match files. (example: ``msc://profile/prefix/**/*.tar``)

    :return: A list of file paths matching the pattern.

    :raises ValueError: If the pattern is neither an ``msc://`` URL nor an absolute POSIX path / ``file://`` URL.
    """
    storage_client, path_pattern = resolve_storage_client(pattern)
    # Only a non-msc pattern that resolved to the default POSIX profile is
    # globbed without the URL prefix; every other case asks the client to include it.
    with_prefix = pattern.startswith(MSC_PROTOCOL) or storage_client.profile != DEFAULT_POSIX_PROFILE_NAME
    return storage_client.glob(path_pattern, include_url_prefix=with_prefix)
115
116
[docs]
def upload_file(url: str, local_path: str) -> None:
    """
    Upload a local file to the location identified by ``url``.

    The URL is resolved to a :py:class:`multistorageclient.StorageClient` and a remote path via
    :py:func:`resolve_storage_client`; the transfer itself is delegated to that client.

    :param url: The destination URL of the file. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path of the file to upload.

    :raises ValueError: If the URL is neither an ``msc://`` URL nor an absolute POSIX path / ``file://`` URL.
    """
    storage_client, remote_path = resolve_storage_client(url)
    # The client's result is passed through unchanged.
    return storage_client.upload_file(remote_path=remote_path, local_path=local_path)
132
133
[docs]
def download_file(url: str, local_path: str) -> None:
    """
    Download the file identified by ``url`` to a local path.

    The URL is resolved to a :py:class:`multistorageclient.StorageClient` and a remote path via
    :py:func:`resolve_storage_client`; the transfer itself is delegated to that client.

    :param url: The URL of the file to download. (example: ``msc://profile/prefix/dataset.tar``)
    :param local_path: The local path where the file should be downloaded.

    :raises ValueError: If the URL is neither an ``msc://`` URL nor an absolute POSIX path / ``file://`` URL.
    """
    storage_client, remote_path = resolve_storage_client(url)
    # The client's result is passed through unchanged.
    return storage_client.download_file(remote_path=remote_path, local_path=local_path)
149
150
[docs]
def is_empty(url: str) -> bool:
    """
    Report whether the location identified by ``url`` contains no objects.

    The URL is resolved via :py:func:`resolve_storage_client` and the check is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check, typically pointing to a storage location.

    :return: ``True`` if there are no objects/files under this URL, ``False`` otherwise.

    :raises ValueError: If the URL is neither an ``msc://`` URL nor an absolute POSIX path / ``file://`` URL.
    """
    storage_client, prefix = resolve_storage_client(url)
    return storage_client.is_empty(prefix)
162
163
[docs]
def is_file(url: str) -> bool:
    """
    Report whether ``url`` points to a file (rather than a directory or folder).

    The URL is resolved via :py:func:`resolve_storage_client` and the check is delegated to the
    resulting :py:class:`multistorageclient.StorageClient`.

    :param url: The URL to check the existence of a file. (example: ``msc://profile/prefix/dataset.tar``)

    :return: The result of the client's ``is_file`` check for the resolved path.

    :raises ValueError: If the URL is neither an ``msc://`` URL nor an absolute POSIX path / ``file://`` URL.
    """
    storage_client, resolved_path = resolve_storage_client(url)
    return storage_client.is_file(path=resolved_path)