Source code for multistorageclient.providers.huggingface

  1# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2# SPDX-License-Identifier: Apache-2.0
  3#
  4# Licensed under the Apache License, Version 2.0 (the "License");
  5# you may not use this file except in compliance with the License.
  6# You may obtain a copy of the License at
  7#
  8# http://www.apache.org/licenses/LICENSE-2.0
  9#
 10# Unless required by applicable law or agreed to in writing, software
 11# distributed under the License is distributed on an "AS IS" BASIS,
 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13# See the License for the specific language governing permissions and
 14# limitations under the License.
 15
 16import importlib.util
 17import io
 18import os
 19import tempfile
 20from collections.abc import Callable, Iterator
 21from typing import IO, Any, Optional, TypeVar, Union
 22
 23from huggingface_hub import CommitOperationCopy, HfApi
 24from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError
 25from huggingface_hub.hf_api import RepoFile, RepoFolder
 26
 27from ..telemetry import Telemetry
 28from ..types import (
 29    AWARE_DATETIME_MIN,
 30    Credentials,
 31    CredentialsProvider,
 32    ObjectMetadata,
 33    Range,
 34    RetryableError,
 35    SymlinkHandling,
 36)
 37from ..utils import safe_makedirs
 38from .base import BaseStorageProvider
 39
 40_T = TypeVar("_T")
 41
 42PROVIDER = "huggingface"
 43
 44HF_TRANSFER_UNAVAILABLE_ERROR_MESSAGE = (
 45    "Fast transfer using 'hf_transfer' is enabled (HF_HUB_ENABLE_HF_TRANSFER=1) "
 46    "but 'hf_transfer' package is not available in your environment. "
 47    "Either install hf_transfer with 'pip install hf_transfer' or "
 48    "disable it by setting HF_HUB_ENABLE_HF_TRANSFER=0"
 49)
 50
 51
class HuggingFaceCredentialsProvider(CredentialsProvider):
    """
    :py:class:`multistorageclient.types.CredentialsProvider` implementation that serves a fixed HuggingFace access token.
    """

    def __init__(self, access_token: str):
        """
        Stores the access token that will be handed out by :py:meth:`get_credentials`.

        :param access_token: The HuggingFace access token for authentication.
        """
        self.token = access_token

    def get_credentials(self) -> Credentials:
        """
        Wraps the stored token in a :py:class:`multistorageclient.types.Credentials` object.

        Only ``token`` carries information: the access key and secret key are empty strings
        and no expiration is set.

        :return: The credentials used for HuggingFace authentication.
        """
        credential_fields = {
            "access_key": "",
            "secret_key": "",
            "token": self.token,
            "expiration": None,
        }
        return Credentials(**credential_fields)

    def refresh_credentials(self) -> None:
        """
        Does nothing; the stored token is never replaced by this provider.

        Note: HuggingFace tokens typically don't expire, so this is a no-op.
        """
        return None
85 86
class HuggingFaceStorageProvider(BaseStorageProvider):
    """
    A concrete implementation of the :py:class:`multistorageclient.types.StorageProvider` for interacting with HuggingFace Hub repositories.

    Every operation targets a single repository, repository type and revision fixed at construction time.
    Uploads, copies and deletes are issued as commits (each with its own commit message) on that revision.
    """

    def __init__(
        self,
        repository_id: str,
        repo_type: str = "model",
        base_path: str = "",
        repo_revision: str = "main",
        credentials_provider: Optional[CredentialsProvider] = None,
        config_dict: Optional[dict[str, Any]] = None,
        telemetry_provider: Optional[Callable[[], Telemetry]] = None,
    ):
        """
        Initializes the :py:class:`HuggingFaceStorageProvider` with repository information and optional credentials provider.

        :param repository_id: The HuggingFace repository ID (e.g., 'username/repo-name').
        :param repo_type: The type of repository ('dataset', 'model', 'space'). Defaults to 'model'.
        :param base_path: The root prefix path within the repository where all operations will be scoped.
        :param repo_revision: The git revision (branch, tag, or commit) to use. Defaults to 'main'.
        :param credentials_provider: The provider to retrieve HuggingFace credentials.
        :param config_dict: Resolved MSC config.
        :param telemetry_provider: A function that provides a telemetry instance.
        :raises ValueError: If ``repo_type`` is not one of the allowed values, if ``repository_id`` is empty
            or contains no ``/``, or if hf_transfer is enabled but not installed.
        """

        # Validate repo_type
        allowed_repo_types = {"dataset", "model", "space"}
        if repo_type not in allowed_repo_types:
            raise ValueError(f"Invalid repo_type '{repo_type}'. Must be one of: {allowed_repo_types}")

        # Validate repository_id format
        if not repository_id or "/" not in repository_id:
            raise ValueError(f"Invalid repository_id '{repository_id}'. Expected format: 'username/repo-name'")

        # Fail fast on a misconfigured environment before any client is created.
        self._validate_hf_transfer_availability()

        super().__init__(
            base_path=base_path,
            provider_name=PROVIDER,
            config_dict=config_dict,
            telemetry_provider=telemetry_provider,
        )

        self._repository_id = repository_id
        self._repo_type = repo_type
        self._repo_revision = repo_revision
        self._credentials_provider = credentials_provider

        self._hf_client: HfApi = self._create_hf_api_client()

    def _create_hf_api_client(self) -> HfApi:
        """
        Creates and configures the HuggingFace API client.

        Initializes the HfApi client with authentication token if credentials are provided,
        otherwise creates an unauthenticated client for public repositories.

        :return: Configured HfApi client instance.
        """

        token = None
        if self._credentials_provider:
            creds = self._credentials_provider.get_credentials()
            token = creds.token

        return HfApi(token=token)

    def _validate_hf_transfer_availability(self) -> None:
        """
        Validates that hf_transfer is available if it's enabled via environment variables.

        :raises ValueError: If hf_transfer is enabled but not available.
        """
        # Check if hf_transfer is enabled via environment variable
        hf_transfer_enabled = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "").lower() in ("1", "on", "true", "yes")

        # find_spec only locates the package; it does not import it.
        if hf_transfer_enabled and importlib.util.find_spec("hf_transfer") is None:
            raise ValueError(HF_TRANSFER_UNAVAILABLE_ERROR_MESSAGE)

    @staticmethod
    def _parse_int_params(header_value: str, names: tuple[str, ...]) -> dict[str, int]:
        """
        Extracts integer ``name=value`` parameters from a ``;``-separated header value.

        Parts whose name is not in ``names`` or whose value is not an integer are ignored.
        If a name occurs more than once, the last parsable occurrence wins.

        :param header_value: Raw header value, e.g. ``"api";r=0;t=142``.
        :param names: Parameter names to extract.
        :return: Mapping of each found parameter name to its integer value.
        """
        parsed: dict[str, int] = {}
        for part in header_value.split(";"):
            name, separator, raw_value = part.strip().partition("=")
            if not separator or name not in names:
                continue
            try:
                parsed[name] = int(raw_value)
            except ValueError:
                # Malformed number: keep whatever was parsed before for this name.
                continue
        return parsed

    def _parse_rate_limit_headers(self, response: Any) -> str:
        """
        Parses HuggingFace rate limit headers and returns formatted information.

        HuggingFace returns rate limit information in these headers:
        - RateLimit: "api";r=0;t=142
            - r = requests remaining in the current window
            - t = seconds until rate limit resets
        - RateLimit-Policy: "fixed window";"api";q=10000;w=300
            - q = total requests allowed per window
            - w = window size in seconds

        Reference: https://huggingface.co/docs/hub/rate-limits

        :param response: The HTTP response object containing rate limit headers.
        :return: Formatted string with rate limit information, or empty string if headers not found.
        """

        try:
            headers = response.headers
        except Exception:
            # Covers ``response is None`` as well as response objects without headers.
            return ""

        rate_limit_info = []

        # Note: HTTP headers are case-insensitive, but we use the canonical casing from HF docs
        if "RateLimit" in headers:
            usage = self._parse_int_params(headers["RateLimit"], ("r", "t"))
            remaining = usage.get("r")
            reset_seconds = usage.get("t")

            if remaining is not None:
                rate_limit_info.append(f"Requests remaining in current window: {remaining}")
            if reset_seconds is not None:
                rate_limit_info.append(f"Rate limit resets in: {reset_seconds} seconds")

        if "RateLimit-Policy" in headers:
            policy = self._parse_int_params(headers["RateLimit-Policy"], ("q", "w"))
            quota = policy.get("q")
            window_seconds = policy.get("w")

            # The policy line is only meaningful when both quota and window are known.
            if quota is not None and window_seconds is not None:
                window_minutes = window_seconds / 60
                rate_limit_info.append(f"Rate limit policy: {quota} requests per {window_minutes:.0f}-minute window")

        # Joining an empty list yields "", which is the "no headers found" result.
        return " | ".join(rate_limit_info)

    def _translate_errors(
        self,
        func: Callable[[], _T],
        operation: str,
        repo_id: str,
        path: str,
    ) -> _T:
        """
        Runs a HuggingFace operation and translates its errors into standardized exceptions.

        Parses HuggingFace rate limit headers (RateLimit and RateLimit-Policy) to provide
        detailed information about rate limiting to users. See https://huggingface.co/docs/hub/rate-limits

        This method does not retry by itself; it only classifies failures as retryable or not.

        :param func: The function that performs the actual HuggingFace operation.
        :param operation: The type of operation being performed (e.g., "upload", "download", "delete").
        :param repo_id: The HuggingFace repository ID.
        :param path: The path of the object within the repository.
        :return: The result of the HuggingFace operation.
        :raises RetryableError: For errors classified as transient (HTTP 408, 409, 429, 500, 502, 503, 504
            and connection/OS errors).
        :raises FileNotFoundError: When the repository, revision or entry is not found (including HTTP 404).
        :raises RuntimeError: For other non-retryable errors.
        """
        try:
            return func()
        except RepositoryNotFoundError as error:
            raise FileNotFoundError(
                f"Repository not found or access denied: {repo_id}. "
                f"Verify the repository exists and you have access permissions."
            ) from error
        except RevisionNotFoundError as error:
            raise FileNotFoundError(
                f"Revision '{self._repo_revision}' not found in repository {repo_id}. "
                f"Verify the branch, tag, or commit exists."
            ) from error
        except EntryNotFoundError as error:
            raise FileNotFoundError(f"File not found in HuggingFace repository: {path}") from error
        except FileNotFoundError:
            # Already the standardized type (e.g. raised by the wrapped callable itself); must be handled
            # before the OSError clause below, which would otherwise turn it into a RetryableError.
            raise
        except HfHubHTTPError as error:
            # Extract status code and parse rate limit headers
            # Don't use hasattr() - it's unreliable with response objects
            status_code = None
            response = None

            try:
                response = error.response
                if response is not None:
                    status_code = response.status_code
            except AttributeError:
                pass

            rate_limit_info = self._parse_rate_limit_headers(response)
            quota_suffix = f" | {rate_limit_info}" if rate_limit_info else ""

            error_info = f"repo_id: {repo_id}, path: {path}, status_code: {status_code}, error: {error}"

            if status_code == 404:
                raise FileNotFoundError(f"Object {repo_id}/{path} does not exist. {error_info}") from error
            elif status_code == 409:
                raise RetryableError(f"Conflict Error for {repo_id}. {error_info}{quota_suffix}") from error
            elif status_code == 429:
                base_message = f"Rate limit exceeded when {operation} object(s) at {repo_id}/{path}. {error_info}"
                raise RetryableError(f"{base_message}{quota_suffix}") from error
            elif status_code == 503:
                raise RetryableError(
                    f"Service unavailable when {operation} object(s) at {repo_id}/{path}. {error_info}{quota_suffix}"
                ) from error
            elif status_code in (408, 500, 502, 504):
                raise RetryableError(
                    f"Transient error ({status_code}) when {operation} object(s) at {repo_id}/{path}. {error_info}{quota_suffix}"
                ) from error
            else:
                raise RuntimeError(
                    f"HuggingFace API error during {operation} of {path}: {error}{quota_suffix}"
                ) from error
        except (ConnectionError, TimeoutError, OSError) as error:
            # ConnectionError and TimeoutError are OSError subclasses; they are listed to state the intent.
            raise RetryableError(
                f"Connection error when {operation} object(s) at {repo_id}/{path}, error type: {type(error).__name__}"
            ) from error
        except Exception as error:
            raise RuntimeError(f"Unexpected error during {operation} of {path}: {error}") from error

    def _put_object(
        self,
        path: str,
        body: bytes,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        attributes: Optional[dict[str, str]] = None,
    ) -> int:
        """
        Uploads an object to the HuggingFace repository.

        The body is written to a local temporary file which is uploaded by path and removed afterwards.

        :param path: The path where the object will be stored in the repository.
        :param body: The content of the object to store.
        :param if_match: Optional ETag for conditional uploads (not supported by HuggingFace).
        :param if_none_match: Optional ETag for conditional uploads (not supported by HuggingFace).
        :param attributes: Optional attributes for the object (not supported by HuggingFace).
        :return: Data size in bytes.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If client attempts to create a directory.
        :raises ValueError: If conditional upload parameters or attributes are provided (not supported).
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if if_match is not None or if_none_match is not None:
            raise ValueError(
                "HuggingFace provider does not support conditional uploads. "
                "if_match and if_none_match parameters are not supported."
            )

        if attributes is not None:
            raise ValueError(
                "HuggingFace provider does not support custom object attributes. "
                "Use commit messages or repository metadata instead."
            )

        # Checked before normalization, which only strips leading slashes.
        if path.endswith("/"):
            raise ValueError(
                "HuggingFace Storage Provider does not support explicit directory creation. "
                "Directories are created implicitly when files are uploaded to paths within them."
            )

        path = self._normalize_path(path)

        def _invoke_api():
            # delete=False: the file must outlive the ``with`` block so it can be uploaded by path.
            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                temp_file.write(body)
                temp_file_path = temp_file.name

            try:
                self._hf_client.upload_file(
                    path_or_fileobj=temp_file_path,
                    path_in_repo=path,
                    repo_id=self._repository_id,
                    repo_type=self._repo_type,
                    revision=self._repo_revision,
                    commit_message=f"Upload {path}",
                    commit_description=None,
                    create_pr=False,
                )

                return len(body)

            finally:
                # Remove the temporary file whether or not the upload succeeded.
                os.unlink(temp_file_path)

        return self._translate_errors(_invoke_api, "PUT", self._repository_id, path)

    def _get_object(self, path: str, byte_range: Optional[Range] = None) -> bytes:
        """
        Retrieves an object from the HuggingFace repository.

        The file is downloaded into a temporary directory, read fully into memory, and the
        temporary directory is removed before returning.

        :param path: The path of the object to retrieve from the repository.
        :param byte_range: Optional byte range for partial content (not supported by HuggingFace).
        :return: The content of the retrieved object.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If a byte range is requested (HuggingFace doesn't support range reads).
        :raises FileNotFoundError: If the file doesn't exist in the repository.
        """

        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if byte_range is not None:
            raise ValueError(
                "HuggingFace provider does not support partial range reads. "
                f"Requested range: offset={byte_range.offset}, size={byte_range.size}. "
                "To read the entire file, call get_object() without the byte_range parameter."
            )

        path = self._normalize_path(path)

        def _invoke_api():
            with tempfile.TemporaryDirectory() as temp_dir:
                downloaded_path = self._hf_client.hf_hub_download(
                    repo_id=self._repository_id,
                    filename=path,
                    repo_type=self._repo_type,
                    revision=self._repo_revision,
                    local_dir=temp_dir,
                )

                # Read while the temporary directory still exists.
                with open(downloaded_path, "rb") as f:
                    data = f.read()

            return data

        return self._translate_errors(_invoke_api, "GET", self._repository_id, path)

    def _copy_object(self, src_path: str, dest_path: str) -> int:
        """
        Copies an object within the HuggingFace repository using server-side copy.

        .. note::
            Copy behavior is size-dependent: files ≥10MB are copied remotely via
            metadata (LFS), while files <10MB are downloaded and re-uploaded.

        :param src_path: The source path of the object to copy.
        :param dest_path: The destination path for the copied object.
        :return: Data size in bytes (the content length reported for the source before the copy).
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises FileNotFoundError: If the source file doesn't exist.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        src_path = self._normalize_path(src_path)
        dest_path = self._normalize_path(dest_path)

        # Looked up first: provides the returned size and fails early when the source is missing.
        src_object = self._get_object_metadata(src_path)

        def _invoke_api():
            operations = [
                CommitOperationCopy(
                    src_path_in_repo=src_path,
                    path_in_repo=dest_path,
                )
            ]

            self._hf_client.create_commit(
                repo_id=self._repository_id,
                operations=operations,
                commit_message=f"Copy {src_path} to {dest_path}",
                repo_type=self._repo_type,
                revision=self._repo_revision,
            )

            return src_object.content_length

        return self._translate_errors(_invoke_api, "COPY", self._repository_id, f"{src_path} to {dest_path}")

    def _delete_object(self, path: str, if_match: Optional[str] = None) -> None:
        """
        Deletes an object from the HuggingFace repository.

        :param path: The path of the object to delete from the repository.
        :param if_match: Optional ETag for conditional deletion (not supported by HuggingFace).
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If conditional deletion parameters are provided (not supported).
        :raises FileNotFoundError: If the file doesn't exist in the repository.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if if_match is not None:
            raise ValueError(
                "HuggingFace provider does not support conditional deletion. if_match parameter is not supported."
            )

        path = self._normalize_path(path)

        def _invoke_api():
            self._hf_client.delete_file(
                path_in_repo=path,
                repo_id=self._repository_id,
                repo_type=self._repo_type,
                revision=self._repo_revision,
                commit_message=f"Delete {path}",
            )

        self._translate_errors(_invoke_api, "DELETE", self._repository_id, path)

    def _item_to_metadata(self, item: Union[RepoFile, RepoFolder]) -> ObjectMetadata:
        """
        Convert a RepoFile or RepoFolder into ObjectMetadata.

        Files use their ``blob_id`` as ETag and their ``size`` as content length; anything else is treated
        as a directory with ``tree_id`` as ETag and a content length of 0. ``last_modified`` is always
        ``AWARE_DATETIME_MIN``.

        :param item: The RepoFile or RepoFolder item from HuggingFace API.
        :return: ObjectMetadata representing the item.
        """
        last_modified = AWARE_DATETIME_MIN

        if isinstance(item, RepoFile):
            etag = item.blob_id
            return ObjectMetadata(
                key=item.path,
                type="file",
                content_length=item.size,
                last_modified=last_modified,
                etag=etag,
                content_type=None,
                storage_class=None,
                metadata=None,
            )
        else:
            etag = item.tree_id
            return ObjectMetadata(
                key=item.path,
                type="directory",
                content_length=0,
                last_modified=last_modified,
                etag=etag,
                content_type=None,
                storage_class=None,
                metadata=None,
            )

    def _make_symlink(self, path: str, target: str) -> None:
        """
        Not supported. This provider does not create symlinks.

        :param path: Ignored.
        :param target: Ignored.
        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("HuggingFace provider does not support symlink creation.")

    def _get_object_metadata(self, path: str, strict: bool = True) -> ObjectMetadata:
        """
        Retrieves metadata for an object in the HuggingFace repository.

        :param path: The path of the object to get metadata for.
        :param strict: When True and the lookup reports the path as missing, the path is additionally
            probed as a directory before the error is re-raised. When False the error is re-raised directly.
        :return: Metadata about the object.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises FileNotFoundError: If the path is not found (and, with ``strict=True``, is not a directory either).
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        path = self._normalize_path(path)

        def _invoke_api():
            items = self._hf_client.get_paths_info(
                repo_id=self._repository_id,
                paths=[path],
                repo_type=self._repo_type,
                revision=self._repo_revision,
                expand=True,
            )

            # An empty result is treated as "path does not exist".
            if not items:
                raise FileNotFoundError(f"File not found in HuggingFace repository: {path}")

            item = items[0]
            return self._item_to_metadata(item)

        try:
            return self._translate_errors(_invoke_api, "HEAD", self._repository_id, path)
        except FileNotFoundError:
            if strict:
                dir_path = path.rstrip("/") + "/"
                if self._is_dir(dir_path):
                    return ObjectMetadata(
                        key=dir_path,
                        type="directory",
                        content_length=0,
                        last_modified=AWARE_DATETIME_MIN,
                        etag=None,
                        content_type=None,
                        storage_class=None,
                        metadata=None,
                    )
            # Re-raise the original FileNotFoundError with its traceback intact.
            raise

    def _list_objects(
        self,
        path: str,
        start_after: Optional[str] = None,
        end_at: Optional[str] = None,
        include_directories: bool = False,
        symlink_handling: SymlinkHandling = SymlinkHandling.FOLLOW,
    ) -> Iterator[ObjectMetadata]:
        """
        Lists objects in the HuggingFace repository under the specified path.

        If ``path`` itself resolves to a file, only that file's metadata is yielded.

        :param path: The path to list objects under.
        :param start_after: The key to start listing after (exclusive, used as cursor).
        :param end_at: The key to end listing at (inclusive, used as cursor).
        :param include_directories: Whether to include directories in the listing. When True the listing is
            not recursive; when False it is recursive and only files are yielded.
        :param symlink_handling: Accepted for interface compatibility; not read by this implementation.
        :return: An iterator over object metadata for objects under the specified path.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.

        .. note::
            HuggingFace Hub API does not natively support pagination parameters.
            This implementation fetches all items and uses cursor-based filtering,
            which may impact performance for large repositories. The ordering is
            directory-first, then files, with lexicographical ordering within each group.

            Because ``start_after`` is matched by equality against the listed keys, nothing is yielded
            when that key does not appear in the listing.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        path = self._normalize_path(path)

        # Single-file shortcut: listing a file path yields just that file.
        try:
            metadata = self._get_object_metadata(path.rstrip("/"), strict=False)
            if metadata and metadata.type == "file":
                yield metadata
                return
        except FileNotFoundError:
            pass

        def _invoke_api():
            dir_path = path.rstrip("/")

            repo_items = self._hf_client.list_repo_tree(
                repo_id=self._repository_id,
                path_in_repo=dir_path + "/" if dir_path else None,
                repo_type=self._repo_type,
                revision=self._repo_revision,
                expand=True,
                recursive=not include_directories,
            )

            # Materialize inside the wrapper so errors raised while iterating are translated too.
            return list(repo_items)

        try:
            items = self._translate_errors(_invoke_api, "LIST", self._repository_id, path)

            # Use cursor-based pagination because HuggingFace returns items with
            # directory-first ordering (not pure lexicographical).
            seen_start = start_after is None
            seen_end = False

            for item in items:
                if seen_end:
                    break

                metadata = self._item_to_metadata(item)
                key = metadata.key

                if not seen_start:
                    # Skip everything up to and including the start_after key.
                    if key == start_after:
                        seen_start = True
                    continue

                should_yield = False
                if include_directories and isinstance(item, RepoFolder):
                    should_yield = True
                elif isinstance(item, RepoFile):
                    should_yield = True

                if should_yield:
                    yield metadata

                # end_at is inclusive: the matching item was handled above, then iteration stops.
                if end_at is not None and key == end_at:
                    seen_end = True

        except FileNotFoundError:
            # Directory doesn't exist - return empty (matches POSIX behavior)
            pass

    def _upload_file(self, remote_path: str, f: Union[str, IO], attributes: Optional[dict[str, str]] = None) -> int:
        """
        Uploads a file to the HuggingFace repository.

        A string ``f`` is uploaded directly by path. A file object is read fully, written to a
        temporary file (text content is encoded as UTF-8), uploaded by path, and the temporary file is removed.

        :param remote_path: The remote path where the file will be stored in the repository.
        :param f: File path or file object to upload.
        :param attributes: Optional attributes for the file (not supported by HuggingFace).
        :return: Data size in bytes.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If client attempts to create a directory.
        :raises ValueError: If custom attributes are provided (not supported).
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if attributes is not None:
            raise ValueError(
                "HuggingFace provider does not support custom file attributes. "
                "Use commit messages or repository metadata instead."
            )

        # Checked before normalization, which only strips leading slashes.
        if remote_path.endswith("/"):
            raise ValueError(
                "HuggingFace Storage Provider does not support explicit directory creation. "
                "Directories are created implicitly when files are uploaded to paths within them."
            )

        remote_path = self._normalize_path(remote_path)

        def _invoke_api():
            if isinstance(f, str):
                # Size is taken before the upload; a missing local file raises FileNotFoundError here.
                file_size = os.path.getsize(f)

                self._hf_client.upload_file(
                    path_or_fileobj=f,
                    path_in_repo=remote_path,
                    repo_id=self._repository_id,
                    repo_type=self._repo_type,
                    revision=self._repo_revision,
                    commit_message=f"Upload {remote_path}",
                    commit_description=None,
                    create_pr=False,
                )

                return file_size

            else:
                content = f.read()

                if isinstance(content, str):
                    content_bytes = content.encode("utf-8")
                else:
                    content_bytes = content

                # Create temporary file since HfAPI.upload_file requires BinaryIO, not generic IO
                with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                    temp_file.write(content_bytes)
                    temp_file_path = temp_file.name

                try:
                    self._hf_client.upload_file(
                        path_or_fileobj=temp_file_path,
                        path_in_repo=remote_path,
                        repo_id=self._repository_id,
                        repo_type=self._repo_type,
                        revision=self._repo_revision,
                        commit_message=f"Upload {remote_path}",
                        create_pr=False,
                    )

                    return len(content_bytes)

                finally:
                    # Remove the temporary file whether or not the upload succeeded.
                    os.unlink(temp_file_path)

        return self._translate_errors(_invoke_api, "PUT", self._repository_id, remote_path)

    def _download_file(self, remote_path: str, f: Union[str, IO], metadata: Optional[ObjectMetadata] = None) -> int:
        """
        Downloads a file from the HuggingFace repository.

        A string ``f`` is treated as the local destination path: the file is downloaded into the
        destination's parent directory and then moved onto ``f``. A file object receives the full
        content, decoded as UTF-8 when it is a text stream.

        :param remote_path: The remote path of the file to download from the repository.
        :param f: Local file path or file object to write to.
        :param metadata: Optional object metadata (not used in this implementation).
        :return: Data size in bytes.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises FileNotFoundError: If the file doesn't exist in the repository.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        remote_path = self._normalize_path(remote_path)

        def _invoke_api():
            if isinstance(f, str):
                parent_dir = os.path.dirname(f)
                if parent_dir:
                    safe_makedirs(parent_dir)

                target_dir = parent_dir if parent_dir else "."
                downloaded_path = self._hf_client.hf_hub_download(
                    repo_id=self._repository_id,
                    filename=remote_path,
                    repo_type=self._repo_type,
                    revision=self._repo_revision,
                    local_dir=target_dir,
                )

                # The download may land on a different path than requested; move it onto ``f``.
                # os.replace overwrites an existing destination on every platform, whereas os.rename
                # fails on Windows when the destination already exists.
                if os.path.abspath(downloaded_path) != os.path.abspath(f):
                    os.replace(downloaded_path, f)

                return os.path.getsize(f)

            else:
                with tempfile.TemporaryDirectory() as temp_dir:
                    downloaded_path = self._hf_client.hf_hub_download(
                        repo_id=self._repository_id,
                        filename=remote_path,
                        repo_type=self._repo_type,
                        revision=self._repo_revision,
                        local_dir=temp_dir,
                    )

                    with open(downloaded_path, "rb") as src:
                        data = src.read()
                        # Text streams only accept str, so decode for them; binary streams get raw bytes.
                        if isinstance(f, io.TextIOBase):
                            f.write(data.decode("utf-8"))
                        else:
                            f.write(data)

                    return len(data)

        return self._translate_errors(_invoke_api, "GET", self._repository_id, remote_path)

    def _is_dir(self, path: str) -> bool:
        """
        Helper method to check if a path is a directory.

        Failures are routed through :py:meth:`_translate_errors`, so they surface with the same
        exception types as every other operation of this provider.

        :param path: The path to check.
        :return: True if the path is the repository root or is reported as a folder; False if the path
            is not reported at all or is reported as a file.
        :raises FileNotFoundError: If the repository or revision is not found.
        :raises RetryableError: For errors classified as transient.
        :raises RuntimeError: For other errors.
        """
        path = path.rstrip("/")
        if not path:
            # The root of the repo is always a directory
            return True

        def _invoke_api() -> bool:
            path_info = self._hf_client.get_paths_info(
                repo_id=self._repository_id,
                paths=[path],
                repo_type=self._repo_type,
                revision=self._repo_revision,
            )

            # An empty result means the path was not reported, hence not a directory.
            return bool(path_info) and isinstance(path_info[0], RepoFolder)

        return self._translate_errors(_invoke_api, "HEAD", self._repository_id, path)

    def _normalize_path(self, path: str) -> str:
        """
        Normalize path for HuggingFace API by removing leading slashes.
        HuggingFace expects relative paths within repositories.

        :param path: The path to normalize.
        :return: The path without any leading ``/`` characters; trailing slashes are kept.
        """
        return path.lstrip("/")