Load

`NGCDownloader` `dataclass`

A class to download files from NGC in a Pooch-compatible way.

NGC downloads are typically structured as directories, while pooch expects a single file. This class downloads a single file from an NGC directory and moves it to the desired location.

Source code in bionemo/core/data/load.py

@dataclass
class NGCDownloader:
    """Pooch-compatible downloader that fetches one file out of an NGC artifact.

    NGC delivers artifacts as directories, whereas pooch works with a single output file. An instance
    downloads only ``filename`` from the NGC artifact into a scratch directory and then moves that one
    file to the location pooch asked for.
    """

    # Name of the single file to pull out of the NGC artifact directory.
    filename: str
    # Which NGC registry the artifact lives in; selects the download function to use.
    ngc_registry: Literal["model", "resource"]

    def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None:
        """Download ``self.filename`` from the NGC artifact at ``url`` and place it at ``output_file``.

        Args:
            url: NGC artifact identifier passed straight to the registry's ``download_version``.
            output_file: Destination path for the downloaded file; its parent directory is created if needed.
            _: The pooch instance supplied by the downloader protocol (unused).
        """
        ngc = default_ngc_client()
        nest_asyncio.apply()

        # One download function per supported registry, keyed by the ``ngc_registry`` field.
        registry_downloaders = {
            "model": ngc.registry.model.download_version,
            "resource": ngc.registry.resource.download_version,
        }

        destination = Path(output_file)
        destination.parent.mkdir(parents=True, exist_ok=True)

        # NGC picks the download directory name itself: the last component of the url with ":" turned into "_v".
        downloaded_dirname = Path(url).name.replace(":", "_v")

        # Stage the download next to the destination, then move just the requested file into place.
        with tempfile.TemporaryDirectory(dir=destination.parent) as scratch_dir:
            fetch = registry_downloaders[self.ngc_registry]
            fetch(url, scratch_dir, file_patterns=[self.filename])
            staged_file = Path(scratch_dir) / downloaded_dirname / self.filename
            shutil.move(staged_file, destination)

`__call__(url, output_file, _)`

Download a file from NGC.

Source code in bionemo/core/data/load.py

def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None:
    """Download ``self.filename`` from the NGC artifact at ``url`` and place it at ``output_file``.

    Args:
        url: NGC artifact identifier passed straight to the registry's ``download_version``.
        output_file: Destination path for the downloaded file; its parent directory is created if needed.
        _: The pooch instance supplied by the downloader protocol (unused).
    """
    ngc = default_ngc_client()
    nest_asyncio.apply()

    # One download function per supported registry, keyed by the ``ngc_registry`` field.
    registry_downloaders = {
        "model": ngc.registry.model.download_version,
        "resource": ngc.registry.resource.download_version,
    }

    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # NGC picks the download directory name itself: the last component of the url with ":" turned into "_v".
    downloaded_dirname = Path(url).name.replace(":", "_v")

    # Stage the download next to the destination, then move just the requested file into place.
    with tempfile.TemporaryDirectory(dir=destination.parent) as scratch_dir:
        fetch = registry_downloaders[self.ngc_registry]
        fetch(url, scratch_dir, file_patterns=[self.filename])
        staged_file = Path(scratch_dir) / downloaded_dirname / self.filename
        shutil.move(staged_file, destination)

`default_ngc_client(use_guest_if_api_key_invalid=True)`

Create a default NGC client.

This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container.

Source code in bionemo/core/data/load.py

def default_ngc_client(use_guest_if_api_key_invalid: bool = True) -> "ngcsdk.Client":
    """Build and configure an NGC SDK client.

    The client is first configured with no explicit arguments, which is expected to pick up the NGC API key
    from ~/.ngc/config or from environment variables passed to the docker container.

    Args:
        use_guest_if_api_key_invalid: When True, a ``ValueError`` from the default configuration is logged and
            a client configured with placeholder guest credentials is returned instead. When False, the
            ``ValueError`` propagates.

    Returns:
        The configured ``ngcsdk.Client``.

    Raises:
        ValueError: If default configuration fails and ``use_guest_if_api_key_invalid`` is False.
    """
    import ngcsdk

    ngc_client = ngcsdk.Client()

    try:
        ngc_client.configure()
    except ValueError as config_error:
        if not use_guest_if_api_key_invalid:
            raise

        logger.error(f"Error configuring NGC client: {config_error}, signing in as guest.")
        # Start over with a fresh client carrying placeholder (guest) credentials.
        ngc_client = ngcsdk.Client("no-apikey")
        ngc_client.configure(
            api_key="no-apikey",  # pragma: allowlist secret
            org_name="no-org",
            team_name="no-team",
            ace_name="no-ace",
        )

    return ngc_client

`default_pbss_client()`

Create a default S3 client for PBSS.

Source code in bionemo/core/data/load.py

def default_pbss_client():
    """Create a default S3 client for PBSS.

    boto3 is imported lazily so that it is only required when a PBSS download is actually requested.

    Returns:
        A boto3 S3 client pointed at the PBSS endpoint, configured with up to 10 attempts in "standard"
        retry mode.

    Raises:
        ImportError: If boto3 is not installed. The original import failure is chained as the cause.
    """
    try:
        import boto3
    except ImportError as err:
        # Chain explicitly so the traceback shows the underlying import failure as the direct cause.
        raise ImportError("boto3 is required to download from PBSS.") from err

    retry_config = Config(retries={"max_attempts": 10, "mode": "standard"})
    return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config)

`entrypoint()`

Allows a user to get a specific artifact from the command line.

Source code in bionemo/core/data/load.py

def entrypoint():
    """Allows a user to get a specific artifact from the command line.

    Parses the command-line arguments and forwards them to ``main``. If ``main`` returns an error message,
    it is reported through ``parser.error``, which prints usage information and exits.

    Accepted arguments:
        artifact_name: Optional positional name of the artifact whose local path should be printed.
        --list-resources: List all available artifacts and exit (mutually exclusive with ``artifact_name``).
        --source: Download backend, "ngc" (default) or "pbss".
        --all: Download every resource, ignoring all other options.
    """
    parser = argparse.ArgumentParser(
        description="Retrieve the local path to the requested artifact name or list resources."
    )

    # artifact_name and --list-resources stay mutually exclusive, but the group is deliberately NOT required:
    # argparse does not treat an omitted optional positional as "present", so a required group rejects a bare
    # `--all` before main() ever runs. main() already returns an error when nothing at all was requested.
    group = parser.add_mutually_exclusive_group(required=False)

    # Add the argument for artifact name, which is needed unless --list-resources or --all is used
    group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact")

    # Add the --list-resources option
    group.add_argument(
        "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit."
    )

    # Add the --source option
    parser.add_argument(
        "--source",
        type=str,
        choices=["pbss", "ngc"],
        default="ngc",
        help='Backend to use, Internal NVIDIA users can set this to "pbss".',
    )

    parser.add_argument(
        "--all",
        action="store_true",
        default=False,
        help="Download all resources. Ignores all other options.",
    )
    args = parser.parse_args()
    maybe_error = main(
        download_all=args.all,
        list_resources=args.list_resources,
        artifact_name=args.artifact_name,
        source=args.source,
    )
    if maybe_error is not None:
        parser.error(maybe_error)

`load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)`

Download a resource from PBSS or NGC.

Parameters:

Name	Type	Description	Default
`model_or_data_tag`	`str`	A pointer to the desired resource. Must be a key in the resources dictionary.	required
`source`	`SourceOptions`	Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss".	`DEFAULT_SOURCE`
`resources`	`dict[str, Resource] \| None`	A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.)	`None`
`cache_dir`	`Path \| None`	The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.)	`None`

Raises:

Type	Description
`ValueError`	If the desired tag was not found, or if an NGC url was requested but not provided.

Returns:

Type	Description
`Path`	A Path object pointing either at the downloaded file, or at a decompressed folder containing the file(s).

Examples:

For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:

>>> load("filename/tag")
PosixPath(/tmp/bionemo/downloaded-file-name)

Source code in bionemo/core/data/load.py

def load(
    model_or_data_tag: str,
    source: SourceOptions = DEFAULT_SOURCE,
    resources: dict[str, Resource] | None = None,
    cache_dir: Path | None = None,
) -> Path:
    """Fetch a registered resource from PBSS or NGC and return its local path.

    Args:
        model_or_data_tag: Key identifying the desired resource in the resources dictionary.
        source: Backend to download from, either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud).
            Defaults to DEFAULT_SOURCE.
        resources: A custom dictionary of resources. When None, the result of get_all_resources() is used.
            (Mostly for testing.)
        cache_dir: Directory in which downloaded files are stored. When None, BIONEMO_CACHE_DIR is used.
            (Mostly for testing.)

    Raises:
        ValueError: If the tag is not a known resource, if "ngc" was requested but the resource has no NGC URL,
            or if the source is not supported.

    Returns:
        A Path pointing at the downloaded file, or, when the download was unpacked into several files, at the
        folder they were extracted into.

    Examples:
        For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:
        >>> load("filename/tag")
        PosixPath(/tmp/bionemo/downloaded-file-name)
    """
    known_resources = get_all_resources() if resources is None else resources
    target_dir = BIONEMO_CACHE_DIR if cache_dir is None else cache_dir

    if model_or_data_tag not in known_resources:
        raise ValueError(f"Resource '{model_or_data_tag}' not found.")

    resource = known_resources[model_or_data_tag]
    if source == "ngc" and resource.ngc is None:
        raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.")

    # The file name is always taken from the last component of the PBSS location, whichever source is used.
    filename = str(resource.pbss).rsplit("/", 1)[-1]

    # Join every suffix (e.g. ".tar" + ".gz") so the processor is chosen from the full extension.
    full_extension = "".join(Path(filename).suffixes)
    processor = _get_processor(full_extension, resource.unpack, resource.decompress)

    if source == "pbss":
        downloader, url = _s3_download, resource.pbss
    elif source == "ngc":
        assert resource.ngc_registry is not None
        downloader = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry)
        url = resource.ngc
    else:
        raise ValueError(f"Source '{source}' not supported.")

    retrieved = pooch.retrieve(
        url=str(url),
        fname=f"{resource.sha256}-{filename}",
        known_hash=resource.sha256,
        path=target_dir,
        downloader=downloader,
        processor=processor,
    )

    if not isinstance(retrieved, list):
        return Path(retrieved)

    # Pooch hands back a list of files when it unpacked a zipped or tarred directory; callers want the
    # parent folder those files were extracted into instead.
    return Path(processor.extract_dir)  # type: ignore

`main(download_all, list_resources, artifact_name, source)`

Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.

Source code in bionemo/core/data/load.py

def main(
    download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"]
) -> Optional[str]:
    """Run the download script logic; the parameters map 1:1 onto the CLI flags.

    The modes are checked in priority order: download everything, list resources, then fetch one artifact.

    Args:
        download_all: Download every known resource, reporting progress and results on STDERR.
        list_resources: Print the table of available resources to STDOUT.
        artifact_name: Name of a single artifact to fetch; its absolute local path is printed to STDOUT.
        source: Backend to download from, "pbss" or "ngc".

    Returns:
        None on success, or a string describing the error when no mode was selected.
    """
    if download_all:
        separator = "-" * 80
        print("Downloading all resources:", file=sys.stderr)
        print_resources(output_source=sys.stderr)
        print(separator, file=sys.stderr)

        downloaded: dict[str, Path] = {}
        for name in tqdm(sorted(get_all_resources()), desc="Downloading Resources"):
            # Keep STDOUT clean: anything the download prints is sent to STDERR.
            with contextlib.redirect_stdout(sys.stderr):
                downloaded[name] = load(name, source=source)

        print(separator, file=sys.stderr)
        print("All resources downloaded:", file=sys.stderr)
        for name in sorted(downloaded):
            print(f"  {name}: {str(downloaded[name].absolute())}", file=sys.stderr)
        return None

    if list_resources:
        print_resources(output_source=sys.stdout)
        return None

    if not artifact_name:
        return "You must provide an artifact name if --list-resources or --all is not set!"

    # Resolve the local path for the requested artifact, keeping download chatter off STDOUT.
    with contextlib.redirect_stdout(sys.stderr):
        resolved_path = load(artifact_name, source=source)

    # CLI use assumes that the single downloaded resource's path is the only thing on STDOUT.
    print(str(resolved_path.absolute()))
    return None

`print_resources(*, output_source=sys.stdout)`

Prints all available downloadable resources and their sources to `output_source` (STDOUT by default).

Source code in bionemo/core/data/load.py

def print_resources(*, output_source: TextIO = sys.stdout) -> None:
    """Write a tab-separated table of downloadable resources and their available sources.

    A header line is written first, followed by one line per resource in sorted order. Each line holds the
    resource name and a comma-separated list of the sources ("ngc", "pbss") that are set for it.

    Args:
        output_source: Text stream the table is written to; STDOUT by default.
    """
    print("#resource_name\tsource_options", file=output_source)
    catalog = get_all_resources()
    for name, entry in sorted(catalog.items()):
        # A source is listed only when the resource actually carries a location for it.
        candidates = (("ngc", entry.ngc), ("pbss", entry.pbss))
        available = [label for label, location in candidates if location is not None]
        print(f"{name}\t{','.join(available)}", file=output_source)

Load

NGCDownloader dataclass

__call__(url, output_file, _)

default_ngc_client(use_guest_if_api_key_invalid=True)

default_pbss_client()

entrypoint()

load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)

main(download_all, list_resources, artifact_name, source)

print_resources(*, output_source=sys.stdout)

`NGCDownloader` `dataclass`

`__call__(url, output_file, _)`

`default_ngc_client(use_guest_if_api_key_invalid=True)`

`default_pbss_client()`

`entrypoint()`

`load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)`

`main(download_all, list_resources, artifact_name, source)`

`print_resources(*, output_source=sys.stdout)`