Load

`NGCDownloader` `dataclass`

A class to download files from NGC in a Pooch-compatible way.

NGC downloads are typically structured as directories, while pooch expects a single file. This class downloads a single file from an NGC directory and moves it to the desired location.

Source code in bionemo/scdl/data/load.py

@dataclass
class NGCDownloader:
    """A class to download files from NGC in a Pooch-compatible way.

    NGC downloads are typically structured as directories, while pooch expects a single file. This class
    downloads a single file from an NGC directory and moves it to the desired location.

    Attributes:
        filename: Name of the single file to fetch from the NGC directory. It is used both as the
            download file pattern and as the name of the file moved to the output location.
    """

    filename: str

    def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None:
        """Download a file from NGC.

        Args:
            url: The NGC resource identifier to download. Its last path component, with ":" replaced by
                "_v", is taken as the name of the directory NGC creates for the download.
            output_file: Destination path of the downloaded file. Missing parent directories are created.
            _: Third positional argument of the downloader call signature; unused here.

        Raises:
            ImportError: If nest_asyncio is not installed.
        """
        try:
            import nest_asyncio
        except ImportError as err:
            # Chain the original failure so the root cause stays visible in the traceback.
            raise ImportError(
                "nest_asyncio is required for NGC downloads. Please install nest_asyncio or use PBSS source instead."
            ) from err

        client = default_ngc_client()
        nest_asyncio.apply()

        # SCDL only uses NGC resources, never models
        download_fn = client.registry.resource.download_version

        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # NGC seems to always download to a specific directory that we can't specify ourselves.
        ngc_dirname = Path(url).name.replace(":", "_v")

        # Download into a scratch directory next to the target, then move only the requested file out of it;
        # the scratch directory (and anything else NGC placed in it) is removed when the context exits.
        with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir:
            download_fn(url, temp_dir, file_patterns=[self.filename])
            shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file)

`__call__(url, output_file, _)`

Download a file from NGC.

Source code in bionemo/scdl/data/load.py

def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None:
    """Download a file from NGC.

    Args:
        url: The NGC resource identifier to download. Its last path component, with ":" replaced by
            "_v", is taken as the name of the directory NGC creates for the download.
        output_file: Destination path of the downloaded file. Missing parent directories are created.
        _: Third positional argument of the downloader call signature; unused here.

    Raises:
        ImportError: If nest_asyncio is not installed.
    """
    try:
        import nest_asyncio
    except ImportError as err:
        # Chain the original failure so the root cause stays visible in the traceback.
        raise ImportError(
            "nest_asyncio is required for NGC downloads. Please install nest_asyncio or use PBSS source instead."
        ) from err

    client = default_ngc_client()
    nest_asyncio.apply()

    # SCDL only uses NGC resources, never models
    download_fn = client.registry.resource.download_version

    output_file = Path(output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # NGC seems to always download to a specific directory that we can't specify ourselves.
    ngc_dirname = Path(url).name.replace(":", "_v")

    # Download into a scratch directory next to the target, then move only the requested file out of it;
    # the scratch directory (and anything else NGC placed in it) is removed when the context exits.
    with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir:
        download_fn(url, temp_dir, file_patterns=[self.filename])
        shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file)

`Resource` `dataclass`

Class that represents a remote resource for downloading and caching test data.

Source code in bionemo/scdl/data/load.py

@dataclass
class Resource:
    """Class that represents a remote resource for downloading and caching test data."""

    tag: str
    """A unique identifier for the resource."""

    ngc: str | None = None
    """The NGC URL for the resource."""

    pbss: str | None = None
    """The PBSS URL of the resource."""

    sha256: str | None = None
    """The SHA256 checksum of the resource."""

    owner: str = ""
    """The owner or primary point of contact for the resource."""

    description: str | None = None
    """A description of the file(s)."""

    unpack: Literal[False, None] = None
    """Whether the resource should be unpacked after download."""

    decompress: Literal[False, None] = None
    """Whether the resource should be decompressed after download."""

`decompress = None` `class-attribute` `instance-attribute`

Whether the resource should be decompressed after download.

`description = None` `class-attribute` `instance-attribute`

A description of the file(s).

`ngc = None` `class-attribute` `instance-attribute`

The NGC URL for the resource.

`owner = ''` `class-attribute` `instance-attribute`

The owner or primary point of contact for the resource.

`pbss = None` `class-attribute` `instance-attribute`

The PBSS URL of the resource.

`sha256 = None` `class-attribute` `instance-attribute`

The SHA256 checksum of the resource.

`tag` `instance-attribute`

A unique identifier for the resource.

`unpack = None` `class-attribute` `instance-attribute`

Whether the resource should be unpacked after download.

`default_ngc_client(use_guest_if_api_key_invalid=True)`

Create a default NGC client.

This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container.

Source code in bionemo/scdl/data/load.py

def default_ngc_client(use_guest_if_api_key_invalid: bool = True) -> "ngcsdk.Client":
    """Build and configure an NGC client, optionally falling back to guest access.

    The client should pick up the NGC API key from ~/.ngc/config, or from environment variables passed to the
    docker container.

    Args:
        use_guest_if_api_key_invalid: When configuring the client raises ``ValueError``, log the error and
            return a client configured with placeholder guest credentials instead of re-raising.

    Returns:
        The configured NGC client.

    Raises:
        ValueError: If configuration fails and ``use_guest_if_api_key_invalid`` is False.
    """
    import ngcsdk

    ngc_client = ngcsdk.Client()

    try:
        ngc_client.configure()
    except ValueError as e:
        # Guard clause: without the guest fallback the configuration error propagates unchanged.
        if not use_guest_if_api_key_invalid:
            raise

        logger.error(f"Error configuring NGC client: {e}, signing in as guest.")
        guest_settings = {
            "api_key": "no-apikey",  # pragma: allowlist secret
            "org_name": "no-org",
            "team_name": "no-team",
            "ace_name": "no-ace",
        }
        ngc_client = ngcsdk.Client("no-apikey")
        ngc_client.configure(**guest_settings)

    return ngc_client

`default_pbss_client()`

Create a default S3 client for PBSS.

Source code in bionemo/scdl/data/load.py

def default_pbss_client():
    """Create a default S3 client for PBSS.

    Returns:
        A boto3 "s3" client bound to the PBSS endpoint URL, configured with at most 10 attempts in
        "standard" retry mode.

    Raises:
        ImportError: If boto3 or botocore is not installed.
    """
    try:
        import boto3
        from botocore.config import Config
    except ImportError as err:
        # Chain the original failure so the root cause stays visible in the traceback.
        raise ImportError("boto3 and botocore are required to download from PBSS.") from err

    retry_config = Config(retries={"max_attempts": 10, "mode": "standard"})
    return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config)

`get_all_resources(resource_path=None)` `cached`

Return a dictionary of all resources.

Source code in bionemo/scdl/data/load.py

@functools.cache
def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]:
    """Return a dictionary of all resources, keyed by resource tag.

    The result is memoized per ``resource_path`` argument via ``functools.cache``.

    Args:
        resource_path: Directory containing ``*.yaml`` / ``*.yml`` resource files. If not given, the files
            bundled in the ``bionemo.scdl.data.resources`` package are used, falling back to the
            ``resources`` directory next to this module.

    Returns:
        A mapping from each resource's tag to its ``Resource``.

    Raises:
        ValueError: If two or more resources share the same tag.
    """
    if not resource_path:
        # Use importlib.resources to access bundled package resources
        try:
            resource_files = resources.files("bionemo.scdl.data.resources")
            # The Traversable interface only guarantees `.name` (not `.suffix`), so derive the suffix from
            # the name to work with every kind of resource container.
            resources_files = [
                f for f in resource_files.iterdir() if f.is_file() and Path(f.name).suffix in {".yaml", ".yml"}
            ]
        except (ImportError, FileNotFoundError):
            # Fallback to local directory for development/testing
            resource_path = Path(__file__).parent / "resources"
            resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml"))
    else:
        resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml"))

    all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)]

    try:
        import pydantic

        resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources)
    except ImportError:
        # If pydantic is not available, create Resource objects directly
        resource_list = [Resource(**resource) for resource in all_resources]
    resource_dict = {resource.tag: resource for resource in resource_list}

    if len(resource_dict) != len(resource_list):
        # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue.
        tag_counts = Counter(resource.tag for resource in resource_list)
        raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}")

    return resource_dict

`load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)`

Download a resource from PBSS or NGC.

Parameters:

Name	Type	Description	Default
`model_or_data_tag`	`str`	A pointer to the desired resource. Must be a key in the resources dictionary.	required
`source`	`SourceOptions`	Either "pbss" (NVIDIA-internal) or "ngc" (NGC). Defaults to DEFAULT_SOURCE (from environment variable BIONEMO_DATA_SOURCE; defaults to "ngc").	`DEFAULT_SOURCE`
`resources`	`dict[str, Resource] \| None`	A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.)	`None`
`cache_dir`	`Path \| None`	The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.)	`None`

Raises:

Type	Description
`ValueError`	If the desired tag was not found, or if an NGC url was requested but not provided.

Returns:

Type	Description
`Path`	A Path object pointing either at the downloaded file, or at a decompressed folder containing the file(s).

Examples:

For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:

>>> load("filename/tag")
PosixPath('/tmp/bionemo/downloaded-file-name')

Source code in bionemo/scdl/data/load.py

def load(
    model_or_data_tag: str,
    source: SourceOptions = DEFAULT_SOURCE,
    resources: dict[str, Resource] | None = None,
    cache_dir: Path | None = None,
) -> Path:
    """Download a resource from PBSS or NGC.

    Args:
        model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary.
        source: Either "pbss" (NVIDIA-internal) or "ngc" (NGC). Defaults to DEFAULT_SOURCE
            (from environment variable BIONEMO_DATA_SOURCE; defaults to "ngc").
        resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.)
        cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.)

    Raises:
        ValueError: If the desired tag was not found, if an NGC url was requested but not provided, if the
            resource has no PBSS url (the file name is derived from it for every source), or if the source
            is not supported.

    Returns:
        A Path object pointing either at the downloaded file, or at a decompressed folder containing the
        file(s).

    Examples:
        For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:
        >>> load("filename/tag")
        PosixPath('/tmp/bionemo/downloaded-file-name')
    """
    if resources is None:
        # Get resources from the local scdl data directory
        resources = get_all_resources()

    if cache_dir is None:
        cache_dir = BIONEMO_CACHE_DIR

    if model_or_data_tag not in resources:
        raise ValueError(f"Resource '{model_or_data_tag}' not found.")

    resource = resources[model_or_data_tag]

    if source == "ngc" and resource.ngc is None:
        raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.")

    # The file name is always taken from the last segment of the PBSS URL, even for NGC downloads. Without
    # this check a missing PBSS URL would silently turn into the literal file name "None".
    if resource.pbss is None:
        raise ValueError(
            f"Resource '{model_or_data_tag}' does not have a PBSS URL, which is needed to determine its file name."
        )

    filename = resource.pbss.split("/")[-1]

    # Determine the right Pooch processor based on filename suffixes
    processor = _get_processor(filename, resource.unpack, resource.decompress)

    if source == "pbss":
        download_fn = _s3_download
        url = resource.pbss

    elif source == "ngc":
        download_fn = NGCDownloader(filename=filename)
        url = resource.ngc

    else:
        raise ValueError(f"Source '{source}' not supported.")

    download = pooch.retrieve(
        url=str(url),
        fname=f"{resource.sha256}-{filename}",
        known_hash=resource.sha256,
        path=cache_dir,
        downloader=download_fn,
        processor=processor,
    )

    # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we
    # just want the unpacked, parent folder.
    if isinstance(download, list):
        return Path(processor.extract_dir)  # type: ignore

    return Path(download)

Load

NGCDownloader dataclass

__call__(url, output_file, _)

Resource dataclass

decompress = None class-attribute instance-attribute

description = None class-attribute instance-attribute

ngc = None class-attribute instance-attribute

owner = '' class-attribute instance-attribute

pbss = None class-attribute instance-attribute

sha256 = None class-attribute instance-attribute

tag instance-attribute

unpack = None class-attribute instance-attribute

default_ngc_client(use_guest_if_api_key_invalid=True)

default_pbss_client()

get_all_resources(resource_path=None) cached

load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)

`NGCDownloader` `dataclass`

`__call__(url, output_file, _)`

`Resource` `dataclass`

`decompress = None` `class-attribute` `instance-attribute`

`description = None` `class-attribute` `instance-attribute`

`ngc = None` `class-attribute` `instance-attribute`

`owner = ''` `class-attribute` `instance-attribute`

`pbss = None` `class-attribute` `instance-attribute`

`sha256 = None` `class-attribute` `instance-attribute`

`tag` `instance-attribute`

`unpack = None` `class-attribute` `instance-attribute`

`default_ngc_client(use_guest_if_api_key_invalid=True)`

`default_pbss_client()`

`get_all_resources(resource_path=None)` `cached`

`load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)`