Source code for multistorageclient.providers.huggingface

# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
 15
 16import importlib.util
 17import io
 18import os
 19import tempfile
 20from collections.abc import Callable, Iterator
 21from typing import IO, Any, Optional, TypeVar, Union
 22
 23from huggingface_hub import CommitOperationCopy, HfApi
 24from huggingface_hub.errors import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError
 25from huggingface_hub.hf_api import RepoFile, RepoFolder
 26
 27from ..telemetry import Telemetry
 28from ..types import AWARE_DATETIME_MIN, Credentials, CredentialsProvider, ObjectMetadata, Range, RetryableError
 29from ..utils import safe_makedirs
 30from .base import BaseStorageProvider
 31
 32_T = TypeVar("_T")
 33
 34PROVIDER = "huggingface"
 35
 36HF_TRANSFER_UNAVAILABLE_ERROR_MESSAGE = (
 37    "Fast transfer using 'hf_transfer' is enabled (HF_HUB_ENABLE_HF_TRANSFER=1) "
 38    "but 'hf_transfer' package is not available in your environment. "
 39    "Either install hf_transfer with 'pip install hf_transfer' or "
 40    "disable it by setting HF_HUB_ENABLE_HF_TRANSFER=0"
 41)
 42
 43
class HuggingFaceCredentialsProvider(CredentialsProvider):
    """
    A concrete implementation of the :py:class:`multistorageclient.types.CredentialsProvider` that provides HuggingFace credentials.
    """

    def __init__(self, access_token: str):
        """
        Initializes the :py:class:`HuggingFaceCredentialsProvider` with the provided access token.

        :param access_token: The HuggingFace access token for authentication.
        """
        self.token = access_token

    def get_credentials(self) -> Credentials:
        """
        Retrieves the current HuggingFace credentials.

        Only the ``token`` field carries information; the access key and secret key
        are always empty strings and no expiration is set.

        :return: The current credentials used for HuggingFace authentication.
        """
        return Credentials(
            access_key="",
            secret_key="",
            token=self.token,
            expiration=None,
        )

    def refresh_credentials(self) -> None:
        """
        Refreshes the credentials if they are expired or about to expire.

        Note: HuggingFace tokens typically don't expire, so this is a no-op.
        """
        pass
77 78
class HuggingFaceStorageProvider(BaseStorageProvider):
    """
    A concrete implementation of the :py:class:`multistorageclient.types.StorageProvider` for interacting with HuggingFace Hub repositories.
    """

    def __init__(
        self,
        repository_id: str,
        repo_type: str = "model",
        base_path: str = "",
        repo_revision: str = "main",
        credentials_provider: Optional[CredentialsProvider] = None,
        config_dict: Optional[dict[str, Any]] = None,
        telemetry_provider: Optional[Callable[[], Telemetry]] = None,
    ):
        """
        Initializes the :py:class:`HuggingFaceStorageProvider` with repository information and optional credentials provider.

        :param repository_id: The HuggingFace repository ID (e.g., 'username/repo-name').
        :param repo_type: The type of repository ('dataset', 'model', 'space'). Defaults to 'model'.
        :param base_path: The root prefix path within the repository where all operations will be scoped.
        :param repo_revision: The git revision (branch, tag, or commit) to use. Defaults to 'main'.
        :param credentials_provider: The provider to retrieve HuggingFace credentials.
        :param config_dict: Resolved MSC config.
        :param telemetry_provider: A function that provides a telemetry instance.
        :raises ValueError: If ``repo_type`` or ``repository_id`` is invalid, or if hf_transfer is
            enabled but not installed.
        """

        # Validate repo_type
        allowed_repo_types = {"dataset", "model", "space"}
        if repo_type not in allowed_repo_types:
            raise ValueError(f"Invalid repo_type '{repo_type}'. Must be one of: {allowed_repo_types}")

        # Validate repository_id format
        if not repository_id or "/" not in repository_id:
            raise ValueError(f"Invalid repository_id '{repository_id}'. Expected format: 'username/repo-name'")

        self._validate_hf_transfer_availability()

        super().__init__(
            base_path=base_path,
            provider_name=PROVIDER,
            config_dict=config_dict,
            telemetry_provider=telemetry_provider,
        )

        self._repository_id = repository_id
        self._repo_type = repo_type
        self._repo_revision = repo_revision
        self._credentials_provider = credentials_provider

        self._hf_client: HfApi = self._create_hf_api_client()

    def _create_hf_api_client(self) -> HfApi:
        """
        Creates and configures the HuggingFace API client.

        Initializes the HfApi client with authentication token if credentials are provided,
        otherwise creates an unauthenticated client for public repositories.

        :return: Configured HfApi client instance.
        """

        token = None
        if self._credentials_provider:
            creds = self._credentials_provider.get_credentials()
            token = creds.token

        return HfApi(token=token)

    def _validate_hf_transfer_availability(self) -> None:
        """
        Validates that hf_transfer is available if it's enabled via environment variables.

        :raises ValueError: If hf_transfer is enabled but not available.
        """
        # Check if hf_transfer is enabled via environment variable
        hf_transfer_enabled = os.environ.get("HF_HUB_ENABLE_HF_TRANSFER", "").lower() in ("1", "on", "true", "yes")

        if hf_transfer_enabled and importlib.util.find_spec("hf_transfer") is None:
            raise ValueError(HF_TRANSFER_UNAVAILABLE_ERROR_MESSAGE)

    @staticmethod
    def _parse_int_params(header_value: str, keys: tuple[str, ...]) -> dict[str, int]:
        """
        Extracts integer ``key=value`` parameters from a ``;``-separated header value.

        Parts that have no ``=``, whose key is not in ``keys``, or whose value is not an
        integer are ignored.

        :param header_value: Raw header value, e.g. ``"api";r=0;t=142``.
        :param keys: Parameter names to extract.
        :return: Mapping of each found parameter name to its integer value.
        """
        values: dict[str, int] = {}
        for part in header_value.split(";"):
            name, separator, raw_value = part.strip().partition("=")
            if not separator or name not in keys:
                continue
            try:
                values[name] = int(raw_value)
            except ValueError:
                # Malformed number: keep whatever was parsed before (if anything).
                continue
        return values

    def _parse_rate_limit_headers(self, response) -> str:
        """
        Parses HuggingFace rate limit headers and returns formatted information.

        HuggingFace returns rate limit information in these headers:
        - RateLimit: "api";r=0;t=142
          - r = requests remaining in the current window
          - t = seconds until rate limit resets
        - RateLimit-Policy: "fixed window";"api";q=10000;w=300
          - q = total requests allowed per window
          - w = window size in seconds

        Reference: https://huggingface.co/docs/hub/rate-limits

        :param response: The HTTP response object containing rate limit headers (may be ``None``).
        :return: Formatted string with rate limit information, or empty string if headers not found.
        """
        # The response may be None (no HTTP response attached to the error) or lack headers.
        headers = getattr(response, "headers", None)
        if headers is None:
            return ""

        rate_limit_info = []

        # Note: HTTP headers are case-insensitive, but we use the canonical casing from HF docs
        if "RateLimit" in headers:
            current = self._parse_int_params(headers["RateLimit"], ("r", "t"))
            if "r" in current:
                rate_limit_info.append(f"Requests remaining in current window: {current['r']}")
            if "t" in current:
                rate_limit_info.append(f"Rate limit resets in: {current['t']} seconds")

        if "RateLimit-Policy" in headers:
            policy = self._parse_int_params(headers["RateLimit-Policy"], ("q", "w"))
            # The policy line is only meaningful when both quota and window are known.
            if "q" in policy and "w" in policy:
                window_minutes = policy["w"] / 60
                rate_limit_info.append(
                    f"Rate limit policy: {policy['q']} requests per {window_minutes:.0f}-minute window"
                )

        return " | ".join(rate_limit_info)

    def _translate_errors(
        self,
        func: Callable[[], _T],
        operation: str,
        repo_id: str,
        path: str,
    ) -> _T:
        """
        Translates HuggingFace errors into standardized exceptions with retry logic.

        Parses HuggingFace rate limit headers (RateLimit and RateLimit-Policy) to provide
        detailed information about rate limiting to users. See https://huggingface.co/docs/hub/rate-limits

        :param func: The function that performs the actual HuggingFace operation.
        :param operation: The type of operation being performed (e.g., "upload", "download", "delete").
        :param repo_id: The HuggingFace repository ID.
        :param path: The path of the object within the repository.
        :return: The result of the HuggingFace operation.
        :raises RetryableError: For transient errors that can be retried (408, 409, 429, 5xx, connection errors).
        :raises FileNotFoundError: When the requested resource is not found.
        :raises RuntimeError: For other non-retryable errors.
        """
        try:
            return func()
        except RepositoryNotFoundError as error:
            raise FileNotFoundError(
                f"Repository not found or access denied: {repo_id}. "
                f"Verify the repository exists and you have access permissions."
            ) from error
        except RevisionNotFoundError as error:
            raise FileNotFoundError(
                f"Revision '{self._repo_revision}' not found in repository {repo_id}. "
                f"Verify the branch, tag, or commit exists."
            ) from error
        except EntryNotFoundError as error:
            raise FileNotFoundError(f"File not found in HuggingFace repository: {path}") from error
        except FileNotFoundError:
            # Already in the standardized form (raised by the wrapped function itself).
            raise
        except HfHubHTTPError as error:
            # Extract status code and parse rate limit headers
            # Don't use hasattr() - it's unreliable with response objects
            status_code = None
            response = None

            try:
                response = error.response
                if response is not None:
                    status_code = response.status_code
            except AttributeError:
                pass

            rate_limit_info = self._parse_rate_limit_headers(response)
            quota_suffix = f" | {rate_limit_info}" if rate_limit_info else ""

            error_info = f"repo_id: {repo_id}, path: {path}, status_code: {status_code}, error: {error}"

            if status_code == 404:
                raise FileNotFoundError(f"Object {repo_id}/{path} does not exist. {error_info}") from error
            elif status_code == 409:
                raise RetryableError(f"Conflict Error for {repo_id}. {error_info}{quota_suffix}") from error
            elif status_code == 429:
                base_message = f"Rate limit exceeded when {operation} object(s) at {repo_id}/{path}. {error_info}"
                raise RetryableError(f"{base_message}{quota_suffix}") from error
            elif status_code == 503:
                raise RetryableError(
                    f"Service unavailable when {operation} object(s) at {repo_id}/{path}. {error_info}{quota_suffix}"
                ) from error
            elif status_code in (408, 500, 502, 504):
                raise RetryableError(
                    f"Transient error ({status_code}) when {operation} object(s) at {repo_id}/{path}. "
                    f"{error_info}{quota_suffix}"
                ) from error
            else:
                raise RuntimeError(
                    f"HuggingFace API error during {operation} of {path}: {error}{quota_suffix}"
                ) from error
        except OSError as error:
            # ConnectionError and TimeoutError are OSError subclasses, so this single clause
            # covers every case the connection-error branch is meant to handle.
            raise RetryableError(
                f"Connection error when {operation} object(s) at {repo_id}/{path}, error type: {type(error).__name__}"
            ) from error
        except Exception as error:
            raise RuntimeError(f"Unexpected error during {operation} of {path}: {error}") from error

    def _upload_local_file(self, local_path: str, remote_path: str) -> None:
        """
        Uploads the local file at ``local_path`` to ``remote_path`` in the repository as a direct commit.

        :param local_path: Path of the file on the local filesystem.
        :param remote_path: Normalized destination path inside the repository.
        """
        self._hf_client.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote_path,
            repo_id=self._repository_id,
            repo_type=self._repo_type,
            revision=self._repo_revision,
            commit_message=f"Upload {remote_path}",
            commit_description=None,
            create_pr=False,
        )

    def _upload_bytes(self, data: bytes, remote_path: str) -> int:
        """
        Stages ``data`` in a temporary file, uploads it to ``remote_path`` and removes the temporary file.

        The temporary file is removed even when writing or uploading fails.

        :param data: The content to upload.
        :param remote_path: Normalized destination path inside the repository.
        :return: Data size in bytes.
        """
        temp_file_path: Optional[str] = None
        try:
            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                # Record the name before writing so a failed write is still cleaned up.
                temp_file_path = temp_file.name
                temp_file.write(data)

            self._upload_local_file(temp_file_path, remote_path)
            return len(data)
        finally:
            if temp_file_path is not None:
                os.unlink(temp_file_path)

    def _download_to_dir(self, remote_path: str, local_dir: str) -> str:
        """
        Downloads ``remote_path`` from the repository into ``local_dir``.

        :param remote_path: Normalized path of the file inside the repository.
        :param local_dir: Local directory handed to ``hf_hub_download`` as ``local_dir``.
        :return: The local path of the downloaded file as reported by ``hf_hub_download``.
        """
        return self._hf_client.hf_hub_download(
            repo_id=self._repository_id,
            filename=remote_path,
            repo_type=self._repo_type,
            revision=self._repo_revision,
            local_dir=local_dir,
        )

    def _put_object(
        self,
        path: str,
        body: bytes,
        if_match: Optional[str] = None,
        if_none_match: Optional[str] = None,
        attributes: Optional[dict[str, str]] = None,
    ) -> int:
        """
        Uploads an object to the HuggingFace repository.

        :param path: The path where the object will be stored in the repository.
        :param body: The content of the object to store.
        :param if_match: Optional ETag for conditional uploads (not supported by HuggingFace).
        :param if_none_match: Optional ETag for conditional uploads (not supported by HuggingFace).
        :param attributes: Optional attributes for the object (not supported by HuggingFace).
        :return: Data size in bytes.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If client attempts to create a directory.
        :raises ValueError: If conditional upload parameters or attributes are provided (not supported).
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if if_match is not None or if_none_match is not None:
            raise ValueError(
                "HuggingFace provider does not support conditional uploads. "
                "if_match and if_none_match parameters are not supported."
            )

        if attributes is not None:
            raise ValueError(
                "HuggingFace provider does not support custom object attributes. "
                "Use commit messages or repository metadata instead."
            )

        if path.endswith("/"):
            raise ValueError(
                "HuggingFace Storage Provider does not support explicit directory creation. "
                "Directories are created implicitly when files are uploaded to paths within them."
            )

        path = self._normalize_path(path)

        def _invoke_api() -> int:
            return self._upload_bytes(body, path)

        return self._translate_errors(_invoke_api, "PUT", self._repository_id, path)

    def _get_object(self, path: str, byte_range: Optional[Range] = None) -> bytes:
        """
        Retrieves an object from the HuggingFace repository.

        :param path: The path of the object to retrieve from the repository.
        :param byte_range: Optional byte range for partial content (not supported by HuggingFace).
        :return: The content of the retrieved object.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If a byte range is requested (HuggingFace doesn't support range reads).
        :raises FileNotFoundError: If the file doesn't exist in the repository.
        """

        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if byte_range is not None:
            raise ValueError(
                "HuggingFace provider does not support partial range reads. "
                f"Requested range: offset={byte_range.offset}, size={byte_range.size}. "
                "To read the entire file, call get_object() without the byte_range parameter."
            )

        path = self._normalize_path(path)

        def _invoke_api() -> bytes:
            # Download into a throwaway directory and read the whole file into memory.
            with tempfile.TemporaryDirectory() as temp_dir:
                downloaded_path = self._download_to_dir(path, temp_dir)

                with open(downloaded_path, "rb") as f:
                    return f.read()

        return self._translate_errors(_invoke_api, "GET", self._repository_id, path)

    def _copy_object(self, src_path: str, dest_path: str) -> int:
        """
        Copies an object within the HuggingFace repository using server-side copy.

        .. note::
            Copy behavior is size-dependent: files of 10MB or more are copied remotely via
            metadata (LFS), while smaller files are downloaded and re-uploaded.

        :param src_path: The source path of the object to copy.
        :param dest_path: The destination path for the copied object.
        :return: Data size in bytes.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises FileNotFoundError: If the source file doesn't exist.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        src_path = self._normalize_path(src_path)
        dest_path = self._normalize_path(dest_path)

        # Looked up first: provides the returned size and fails early if the source is missing.
        src_object = self._get_object_metadata(src_path)

        def _invoke_api() -> int:
            operations = [
                CommitOperationCopy(
                    src_path_in_repo=src_path,
                    path_in_repo=dest_path,
                )
            ]

            self._hf_client.create_commit(
                repo_id=self._repository_id,
                operations=operations,
                commit_message=f"Copy {src_path} to {dest_path}",
                repo_type=self._repo_type,
                revision=self._repo_revision,
            )

            return src_object.content_length

        return self._translate_errors(_invoke_api, "COPY", self._repository_id, f"{src_path} to {dest_path}")

    def _delete_object(self, path: str, if_match: Optional[str] = None) -> None:
        """
        Deletes an object from the HuggingFace repository.

        :param path: The path of the object to delete from the repository.
        :param if_match: Optional ETag for conditional deletion (not supported by HuggingFace).
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If conditional deletion parameters are provided (not supported).
        :raises FileNotFoundError: If the file doesn't exist in the repository.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if if_match is not None:
            raise ValueError(
                "HuggingFace provider does not support conditional deletion. if_match parameter is not supported."
            )

        path = self._normalize_path(path)

        def _invoke_api() -> None:
            self._hf_client.delete_file(
                path_in_repo=path,
                repo_id=self._repository_id,
                repo_type=self._repo_type,
                revision=self._repo_revision,
                commit_message=f"Delete {path}",
            )

        self._translate_errors(_invoke_api, "DELETE", self._repository_id, path)

    def _item_to_metadata(self, item: Union[RepoFile, RepoFolder]) -> ObjectMetadata:
        """
        Convert a RepoFile or RepoFolder into ObjectMetadata.

        Files use their blob id as ETag and report their size; folders use their tree id
        and a content length of 0. ``last_modified`` is always ``AWARE_DATETIME_MIN``.

        :param item: The RepoFile or RepoFolder item from HuggingFace API.
        :return: ObjectMetadata representing the item.
        """
        if isinstance(item, RepoFile):
            return ObjectMetadata(
                key=item.path,
                type="file",
                content_length=item.size,
                last_modified=AWARE_DATETIME_MIN,
                etag=item.blob_id,
                content_type=None,
                storage_class=None,
                metadata=None,
            )

        return ObjectMetadata(
            key=item.path,
            type="directory",
            content_length=0,
            last_modified=AWARE_DATETIME_MIN,
            etag=item.tree_id,
            content_type=None,
            storage_class=None,
            metadata=None,
        )

    def _get_object_metadata(self, path: str, strict: bool = True) -> ObjectMetadata:
        """
        Retrieves metadata for an object in the HuggingFace repository.

        :param path: The path of the object to get metadata for.
        :param strict: When True and the path is not found, additionally checks whether the path
            is a directory before raising.
        :return: Metadata about the object.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises FileNotFoundError: If the path doesn't exist in the repository.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        path = self._normalize_path(path)

        def _invoke_api() -> ObjectMetadata:
            items = self._hf_client.get_paths_info(
                repo_id=self._repository_id,
                paths=[path],
                repo_type=self._repo_type,
                revision=self._repo_revision,
                expand=True,
            )

            if not items:
                raise FileNotFoundError(f"File not found in HuggingFace repository: {path}")

            return self._item_to_metadata(items[0])

        try:
            return self._translate_errors(_invoke_api, "HEAD", self._repository_id, path)
        except FileNotFoundError:
            if strict:
                dir_path = path.rstrip("/") + "/"
                if self._is_dir(dir_path):
                    return ObjectMetadata(
                        key=dir_path,
                        type="directory",
                        content_length=0,
                        last_modified=AWARE_DATETIME_MIN,
                        etag=None,
                        content_type=None,
                        storage_class=None,
                        metadata=None,
                    )
            # Bare raise keeps the original traceback intact.
            raise

    def _list_objects(
        self,
        path: str,
        start_after: Optional[str] = None,
        end_at: Optional[str] = None,
        include_directories: bool = False,
        follow_symlinks: bool = True,
    ) -> Iterator[ObjectMetadata]:
        """
        Lists objects in the HuggingFace repository under the specified path.

        :param path: The path to list objects under.
        :param start_after: The key to start listing after (exclusive, used as cursor).
        :param end_at: The key to end listing at (inclusive, used as cursor).
        :param include_directories: Whether to include directories in the listing. When True the
            listing is not recursive; when False it is recursive and only files are returned.
        :param follow_symlinks: Accepted for interface compatibility; not used by this provider.
        :return: An iterator over object metadata for objects under the specified path.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.

        .. note::
            HuggingFace Hub API does not natively support pagination parameters.
            This implementation fetches all items and uses cursor-based filtering,
            which may impact performance for large repositories. The ordering is
            directory-first, then files, with lexicographical ordering within each group.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        path = self._normalize_path(path)
        dir_path = path.rstrip("/")

        # If the path names a single file, the listing is just that file. The repository
        # root can never be a file, so the extra metadata request is skipped for it.
        if dir_path:
            try:
                metadata = self._get_object_metadata(dir_path, strict=False)
                if metadata and metadata.type == "file":
                    yield metadata
                    return
            except FileNotFoundError:
                pass

        def _invoke_api() -> list:
            repo_items = self._hf_client.list_repo_tree(
                repo_id=self._repository_id,
                path_in_repo=dir_path + "/" if dir_path else None,
                repo_type=self._repo_type,
                revision=self._repo_revision,
                expand=True,
                recursive=not include_directories,
            )

            # Materialize inside the wrapper so errors raised while iterating are translated too.
            return list(repo_items)

        try:
            items = self._translate_errors(_invoke_api, "LIST", self._repository_id, path)
        except FileNotFoundError:
            # Directory doesn't exist - return empty (matches POSIX behavior)
            return

        # Use cursor-based pagination because HuggingFace returns items with
        # directory-first ordering (not pure lexicographical).
        seen_start = start_after is None

        for item in items:
            metadata = self._item_to_metadata(item)
            key = metadata.key

            if not seen_start:
                # Skip everything up to and including the start_after key.
                if key == start_after:
                    seen_start = True
                continue

            if isinstance(item, RepoFile) or (include_directories and isinstance(item, RepoFolder)):
                yield metadata

            # end_at is inclusive: stop right after the matching key has been handled.
            if end_at is not None and key == end_at:
                break

    def _upload_file(self, remote_path: str, f: Union[str, IO], attributes: Optional[dict[str, str]] = None) -> int:
        """
        Uploads a file to the HuggingFace repository.

        :param remote_path: The remote path where the file will be stored in the repository.
        :param f: File path or file object to upload.
        :param attributes: Optional attributes for the file (not supported by HuggingFace).
        :return: Data size in bytes.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises ValueError: If client attempts to create a directory.
        :raises ValueError: If custom attributes are provided (not supported).
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        if attributes is not None:
            raise ValueError(
                "HuggingFace provider does not support custom file attributes. "
                "Use commit messages or repository metadata instead."
            )

        if remote_path.endswith("/"):
            raise ValueError(
                "HuggingFace Storage Provider does not support explicit directory creation. "
                "Directories are created implicitly when files are uploaded to paths within them."
            )

        remote_path = self._normalize_path(remote_path)

        def _invoke_api() -> int:
            if isinstance(f, str):
                file_size = os.path.getsize(f)
                self._upload_local_file(f, remote_path)
                return file_size

            content = f.read()
            # Text streams yield str; the upload needs bytes.
            content_bytes = content.encode("utf-8") if isinstance(content, str) else content

            # Create temporary file since HfAPI.upload_file requires BinaryIO, not generic IO
            return self._upload_bytes(content_bytes, remote_path)

        return self._translate_errors(_invoke_api, "PUT", self._repository_id, remote_path)

    def _download_file(self, remote_path: str, f: Union[str, IO], metadata: Optional[ObjectMetadata] = None) -> int:
        """
        Downloads a file from the HuggingFace repository.

        :param remote_path: The remote path of the file to download from the repository.
        :param f: Local file path or file object to write to.
        :param metadata: Optional object metadata (not used in this implementation).
        :return: Data size in bytes.
        :raises RuntimeError: If HuggingFace client is not initialized or API errors occur.
        :raises FileNotFoundError: If the file doesn't exist in the repository.
        """
        if not self._hf_client:
            raise RuntimeError("HuggingFace client not initialized")

        remote_path = self._normalize_path(remote_path)

        def _invoke_api() -> int:
            if isinstance(f, str):
                parent_dir = os.path.dirname(f)
                if parent_dir:
                    safe_makedirs(parent_dir)

                downloaded_path = self._download_to_dir(remote_path, parent_dir or ".")

                # Move the download to the requested location when it landed elsewhere.
                # os.replace overwrites an existing destination on every platform.
                if os.path.abspath(downloaded_path) != os.path.abspath(f):
                    os.replace(downloaded_path, f)

                return os.path.getsize(f)

            with tempfile.TemporaryDirectory() as temp_dir:
                downloaded_path = self._download_to_dir(remote_path, temp_dir)

                with open(downloaded_path, "rb") as src:
                    data = src.read()

                # Text streams need str; everything else receives the raw bytes.
                if isinstance(f, io.TextIOBase):
                    f.write(data.decode("utf-8"))
                else:
                    f.write(data)

                return len(data)

        return self._translate_errors(_invoke_api, "GET", self._repository_id, remote_path)

    def _is_dir(self, path: str) -> bool:
        """
        Helper method to check if a path is a directory.

        Errors are translated the same way as for every other operation of this provider,
        so transient failures surface as :py:class:`RetryableError`.

        :param path: The path to check.
        :return: True if the path is the repository root or is reported as a folder by the Hub.
        :raises FileNotFoundError: If the repository or revision doesn't exist.
        :raises RetryableError: For transient API or connection errors.
        :raises RuntimeError: For other API errors.
        """
        path = path.rstrip("/")
        if not path:
            # The root of the repo is always a directory
            return True

        def _invoke_api() -> bool:
            path_info = self._hf_client.get_paths_info(
                repo_id=self._repository_id,
                paths=[path],
                repo_type=self._repo_type,
                revision=self._repo_revision,
            )

            # An empty result means the path does not exist at all.
            return bool(path_info) and isinstance(path_info[0], RepoFolder)

        return self._translate_errors(_invoke_api, "HEAD", self._repository_id, path)

    def _normalize_path(self, path: str) -> str:
        """
        Normalize path for HuggingFace API by removing leading slashes.
        HuggingFace expects relative paths within repositories.

        :param path: The path to normalize.
        :return: The path without leading slashes.
        """
        return path.lstrip("/")