Load

`default_ngc_client(use_guest_if_api_key_invalid=True)`

Create a default NGC client.

This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container.

Source code in bionemo/core/data/load.py

def default_ngc_client(use_guest_if_api_key_invalid: bool = True) -> "ngcsdk.Client":
    """Build an NGC client, optionally falling back to guest credentials.

    The client is first configured with no explicit arguments, which should pick up the NGC API key from
    ~/.ngc/config or from environment variables passed to the docker container.

    Args:
        use_guest_if_api_key_invalid: When True, a ValueError raised while configuring the client is logged and a
            client configured with placeholder guest credentials is returned instead. When False, the ValueError
            is re-raised.

    Returns:
        The configured NGC client.

    Raises:
        ValueError: If configuration fails and the guest fallback is disabled.
    """
    import ngcsdk

    configured_client = ngcsdk.Client()

    try:
        configured_client.configure()
    except ValueError as e:
        # Guard clause: without the guest fallback the configuration error is the caller's problem.
        if not use_guest_if_api_key_invalid:
            raise

        logger.error(f"Error configuring NGC client: {e}, signing in as guest.")

        # Start over with a fresh client that carries placeholder credentials only.
        guest_client = ngcsdk.Client("no-apikey")
        guest_client.configure(
            api_key="no-apikey",  # pragma: allowlist secret
            org_name="no-org",
            team_name="no-team",
            ace_name="no-ace",
        )
        return guest_client

    return configured_client

`default_pbss_client()`

Create a default S3 client for PBSS.

Source code in bionemo/core/data/load.py

def default_pbss_client():
    """Create a default S3 client for PBSS.

    Returns:
        A boto3 "s3" client pointed at the PBSS endpoint (https://pbss.s8k.io), created with a retry
        configuration of up to 10 attempts in "standard" mode.

    Raises:
        ImportError: If boto3 cannot be imported. The underlying import failure is chained as the cause.
    """
    try:
        import boto3
    except ImportError as err:
        # Chain explicitly so the traceback reports the real import failure as the direct cause rather than
        # as an unrelated error "during handling of the above exception".
        raise ImportError("boto3 is required to download from PBSS.") from err

    # `Config` comes from the enclosing module's imports (presumably botocore.config.Config -- confirm).
    retry_config = Config(retries={"max_attempts": 10, "mode": "standard"})
    return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config)

`load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)`

Download a resource from PBSS or NGC.

Parameters:

Name	Type	Description	Default
`model_or_data_tag`	`str`	A pointer to the desired resource. Must be a key in the resources dictionary.	required
`source`	`SourceOptions`	Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss".	`DEFAULT_SOURCE`
`resources`	`dict[str, Resource] \| None`	A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.)	`None`
`cache_dir`	`Path \| None`	The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.)	`None`

Raises:

Type	Description
`ValueError`	If the desired tag was not found, or if an NGC url was requested but not provided.

Returns:

Type	Description
`Path`	A Path object pointing either at the downloaded file, or at a decompressed folder containing the file(s).

Examples:

For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:

>>> load("filename/tag")
PosixPath('/tmp/bionemo/downloaded-file-name')

Source code in bionemo/core/data/load.py

def load(
    model_or_data_tag: str,
    source: SourceOptions = DEFAULT_SOURCE,
    resources: dict[str, Resource] | None = None,
    cache_dir: Path | None = None,
) -> Path:
    """Fetch a registered resource from PBSS or NGC and return its local path.

    The download goes through ``pooch.retrieve``: the file is stored in the cache directory under the name
    ``<sha256>-<filename>`` and the resource's SHA-256 is passed as the known hash.

    Args:
        model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary.
        source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to
            DEFAULT_SOURCE.
        resources: A custom dictionary of resources. If None, the result of get_all_resources() is used.
            (Mostly for testing.)
        cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.)

    Raises:
        ValueError: If the tag is not in the resources dictionary, if "ngc" was requested for a resource
            without an NGC URL, or if the source is neither "pbss" nor "ngc".

    Returns:
        A Path pointing at the downloaded file, or, when pooch returns a list of unpacked members, at the
        processor's extraction directory.

    Examples:
        For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:
        >>> load("filename/tag")
        PosixPath('/tmp/bionemo/downloaded-file-name')
    """
    registry = get_all_resources() if resources is None else resources
    target_dir = BIONEMO_CACHE_DIR if cache_dir is None else cache_dir

    if model_or_data_tag not in registry:
        raise ValueError(f"Resource '{model_or_data_tag}' not found.")

    resource = registry[model_or_data_tag]
    if source == "ngc" and resource.ngc is None:
        raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.")

    # The local file name and the processor are both derived from the PBSS path, regardless of the source used.
    filename = str(resource.pbss).rsplit("/", 1)[-1]
    all_suffixes = "".join(Path(filename).suffixes)
    processor = _get_processor(all_suffixes, resource.unpack, resource.decompress)

    # Pick the downloader callable and the remote location for the requested source.
    if source == "pbss":
        downloader, remote_url = _s3_download, resource.pbss
    elif source == "ngc":
        assert resource.ngc_registry is not None
        downloader = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry)
        remote_url = resource.ngc
    else:
        raise ValueError(f"Source '{source}' not supported.")

    download = pooch.retrieve(
        url=str(remote_url),
        fname=f"{resource.sha256}-{filename}",
        known_hash=resource.sha256,
        path=target_dir,
        downloader=downloader,
        processor=processor,
    )

    # Pooch hands back a list of member files when it unpacks a zipped or tarred directory; callers want the
    # parent folder those members were extracted into instead.
    return Path(processor.extract_dir) if isinstance(download, list) else Path(download)  # type: ignore