Skip to content

Load

NGCDownloader dataclass

A class to download files from NGC in a Pooch-compatible way.

NGC downloads are typically structured as directories, while pooch expects a single file. This class downloads a single file from an NGC directory and moves it to the desired location.

Source code in bionemo/scdl/data/load.py
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
@dataclass
class NGCDownloader:
    """A class to download files from NGC in a Pooch-compatible way.

    NGC downloads are typically structured as directories, while pooch expects a single file. This class
    downloads a single file from an NGC directory and moves it to the desired location.

    Instances are callable with the ``(url, output_file, pooch)`` argument order so that they can be handed to
    pooch as a ``downloader``.
    """

    # Name of the single file to fetch from the NGC resource; also used as the download's file pattern.
    filename: str

    def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None:
        """Download a file from NGC.

        Args:
            url: The NGC resource identifier to download. Its final path component, with ":" replaced by "_v",
                is taken to be the name of the directory that NGC creates for the download.
            output_file: Destination path for the downloaded file. Missing parent directories are created.
            _: Unused; accepted only to match the downloader call signature.

        Raises:
            ImportError: If ``nest_asyncio`` is not installed. The original import failure is chained as the cause.
        """
        try:
            import nest_asyncio
        except ImportError as err:
            # Chain the original failure so the traceback shows why the import did not succeed.
            raise ImportError(
                "nest_asyncio is required for NGC downloads. Please install nest_asyncio or use PBSS source instead."
            ) from err

        client = default_ngc_client()
        # NOTE(review): presumably needed because the NGC SDK drives asyncio and may be called while an event loop
        # is already running — confirm.
        nest_asyncio.apply()

        # SCDL only uses NGC resources, never models
        download_fn = client.registry.resource.download_version

        output_file = Path(output_file)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        # NGC seems to always download to a specific directory that we can't specify ourselves.
        ngc_dirname = Path(url).name.replace(":", "_v")

        # Download into a scratch directory next to the destination; the context manager removes the scratch
        # directory (and anything NGC left in it) once the requested file has been moved out.
        with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir:
            download_fn(url, temp_dir, file_patterns=[self.filename])
            shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file)

__call__(url, output_file, _)

Download a file from NGC.

Source code in bionemo/scdl/data/load.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None:
    """Download a file from NGC.

    Args:
        url: The NGC resource identifier to download. Its final path component, with ":" replaced by "_v",
            is taken to be the name of the directory that NGC creates for the download.
        output_file: Destination path for the downloaded file. Missing parent directories are created.
        _: Unused; accepted only to match the downloader call signature.

    Raises:
        ImportError: If ``nest_asyncio`` is not installed. The original import failure is chained as the cause.
    """
    try:
        import nest_asyncio
    except ImportError as err:
        # Chain the original failure so the traceback shows why the import did not succeed.
        raise ImportError(
            "nest_asyncio is required for NGC downloads. Please install nest_asyncio or use PBSS source instead."
        ) from err

    client = default_ngc_client()
    # NOTE(review): presumably needed because the NGC SDK drives asyncio and may be called while an event loop
    # is already running — confirm.
    nest_asyncio.apply()

    # SCDL only uses NGC resources, never models
    download_fn = client.registry.resource.download_version

    output_file = Path(output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # NGC seems to always download to a specific directory that we can't specify ourselves.
    ngc_dirname = Path(url).name.replace(":", "_v")

    # Download into a scratch directory next to the destination; the context manager removes the scratch
    # directory (and anything NGC left in it) once the requested file has been moved out.
    with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir:
        download_fn(url, temp_dir, file_patterns=[self.filename])
        shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file)

Resource dataclass

Class that represents a remote resource for downloading and caching test data.

Source code in bionemo/scdl/data/load.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@dataclass
class Resource:
    """Class that represents a remote resource for downloading and caching test data.

    Only ``tag`` is required; every other field has a default. A resource can carry a URL for either or both
    download sources (``ngc`` and ``pbss``).
    """

    # Used as the dictionary key when resources are collected, so it must be unique across all resources.
    tag: str
    """A unique identifier for the resource."""

    # load() refuses to download from the "ngc" source when this is None.
    ngc: str | None = None
    """The NGC URL for the resource."""

    # load() also derives the downloaded file's name from the last "/"-separated component of this URL.
    pbss: str | None = None
    """The PBSS URL of the resource."""

    # load() passes this to pooch as ``known_hash`` and uses it as a prefix of the cached file name.
    sha256: str | None = None
    """The SHA256 checksum of the resource."""

    owner: str = ""
    """The owner or primary point of contact for the resource."""

    description: str | None = None
    """A description of the file(s)."""

    # NOTE(review): the annotation only admits False or None (never True). None presumably means "decide from
    # the file name" and False "never unpack" — confirm against _get_processor.
    unpack: Literal[False, None] = None
    """Whether the resource should be unpacked after download."""

    # NOTE(review): same False/None convention as ``unpack`` — confirm against _get_processor.
    decompress: Literal[False, None] = None
    """Whether the resource should be decompressed after download."""

decompress = None class-attribute instance-attribute

Whether the resource should be decompressed after download.

description = None class-attribute instance-attribute

A description of the file(s).

ngc = None class-attribute instance-attribute

The NGC URL for the resource.

owner = '' class-attribute instance-attribute

The owner or primary point of contact for the resource.

pbss = None class-attribute instance-attribute

The PBSS URL of the resource.

sha256 = None class-attribute instance-attribute

The SHA256 checksum of the resource.

tag instance-attribute

A unique identifier for the resource.

unpack = None class-attribute instance-attribute

Whether the resource should be unpacked after download.

default_ngc_client(use_guest_if_api_key_invalid=True)

Create a default NGC client.

This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container.

Source code in bionemo/scdl/data/load.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
def default_ngc_client(use_guest_if_api_key_invalid: bool = True) -> "ngcsdk.Client":
    """Build and configure an NGC client with default settings.

    This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container.

    Args:
        use_guest_if_api_key_invalid: When configuring the client raises ``ValueError``, log the error and return
            a client configured with placeholder guest credentials instead of propagating the error.

    Returns:
        The configured client (either the default one or the guest fallback).

    Raises:
        ValueError: Re-raised from the client configuration when the guest fallback is disabled.
    """
    import ngcsdk

    ngc_client = ngcsdk.Client()

    try:
        ngc_client.configure()
    except ValueError as e:
        # Guard clause: without the guest fallback the configuration error goes straight to the caller.
        if not use_guest_if_api_key_invalid:
            raise

        logger.error(f"Error configuring NGC client: {e}, signing in as guest.")

        guest_settings = {
            "api_key": "no-apikey",  # pragma: allowlist secret
            "org_name": "no-org",
            "team_name": "no-team",
            "ace_name": "no-ace",
        }
        ngc_client = ngcsdk.Client("no-apikey")
        ngc_client.configure(**guest_settings)

    return ngc_client

default_pbss_client()

Create a default S3 client for PBSS.

Source code in bionemo/scdl/data/load.py
229
230
231
232
233
234
235
236
237
238
def default_pbss_client():
    """Create a default S3 client for PBSS.

    Returns:
        A boto3 "s3" client that targets the ``https://pbss.s8k.io`` endpoint and is configured with up to
        10 attempts using the "standard" retry mode.

    Raises:
        ImportError: If boto3 or botocore cannot be imported. The original import failure is chained as the cause.
    """
    try:
        import boto3
        from botocore.config import Config
    except ImportError as err:
        # Chain the original failure so the traceback shows which import was missing.
        raise ImportError("boto3 and botocore are required to download from PBSS.") from err

    retry_config = Config(retries={"max_attempts": 10, "mode": "standard"})
    return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config)

get_all_resources(resource_path=None) cached

Return a dictionary of all resources.

Source code in bionemo/scdl/data/load.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
@functools.cache
def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]:
    """Return a dictionary of all resources, keyed by tag.

    The result is memoized per ``resource_path`` value by ``functools.cache``.

    Args:
        resource_path: Directory containing ``*.yaml`` / ``*.yml`` resource definition files. If not given, the
            files bundled in the ``bionemo.scdl.data.resources`` package are used, falling back to the
            ``resources`` directory next to this module when the package resources cannot be accessed.

    Returns:
        A mapping from each resource's tag to its Resource.

    Raises:
        ValueError: If two or more resources share the same tag.
    """
    yaml_suffixes = {".yaml", ".yml"}
    resources_files = None

    if not resource_path:
        # Use importlib.resources to access bundled package resources
        try:
            package_dir = resources.files("bionemo.scdl.data.resources")
            # Traversable entries are only guaranteed to expose ``name`` (not ``suffix``), so derive the suffix
            # from the name; this keeps non-filesystem containers (e.g. zip imports) working.
            resources_files = [
                entry for entry in package_dir.iterdir() if entry.is_file() and Path(entry.name).suffix in yaml_suffixes
            ]
        except (ImportError, FileNotFoundError):
            # Fallback to local directory for development/testing
            resource_path = Path(__file__).parent / "resources"

    if resources_files is None:
        # Either an explicit directory was given or the packaged lookup failed: glob the directory on disk.
        resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml"))

    all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)]

    try:
        import pydantic

        resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources)
    except ImportError:
        # If pydantic is not available, create Resource objects directly
        resource_list = [Resource(**resource) for resource in all_resources]
    resource_dict = {resource.tag: resource for resource in resource_list}

    # A shorter dict than list means at least one tag overwrote another while building the mapping.
    if len(resource_dict) != len(resource_list):
        # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue.
        tag_counts = Counter(resource.tag for resource in resource_list)
        raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}")

    return resource_dict

load(model_or_data_tag, source=DEFAULT_SOURCE, resources=None, cache_dir=None)

Download a resource from PBSS or NGC.

Parameters:

Name Type Description Default
model_or_data_tag str

A pointer to the desired resource. Must be a key in the resources dictionary.

required
source SourceOptions

Either "pbss" (NVIDIA-internal) or "ngc" (NGC). Defaults to DEFAULT_SOURCE (from environment variable BIONEMO_DATA_SOURCE; defaults to "ngc").

DEFAULT_SOURCE
resources dict[str, Resource] | None

A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.)

None
cache_dir Path | None

The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.)

None

Raises:

Type Description
ValueError

If the desired tag was not found, or if an NGC url was requested but not provided.

Returns:

Type Description
Path

A Path object pointing either at the downloaded file, or at a decompressed folder containing the file(s).

Examples:

For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:

>>> load("filename/tag")
PosixPath('/tmp/bionemo/downloaded-file-name')
Source code in bionemo/scdl/data/load.py
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
def load(
    model_or_data_tag: str,
    source: SourceOptions = DEFAULT_SOURCE,
    resources: dict[str, Resource] | None = None,
    cache_dir: Path | None = None,
) -> Path:
    """Download a resource from PBSS or NGC.

    Args:
        model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary.
        source: Either "pbss" (NVIDIA-internal) or "ngc" (NGC). Defaults to DEFAULT_SOURCE
            (from environment variable BIONEMO_DATA_SOURCE; defaults to "ngc").
        resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.)
        cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.)

    Raises:
        ValueError: If the desired tag was not found, if the requested source ("ngc" or "pbss") has no URL
            configured for the resource, or if the source is not supported.

    Returns:
        A Path object pointing either at the downloaded file, or at a decompressed folder containing the
        file(s).

    Examples:
        For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file:
        >>> load("filename/tag")  # doctest: +SKIP
        PosixPath('/tmp/bionemo/downloaded-file-name')
    """
    if resources is None:
        # Get resources from the local scdl data directory
        resources = get_all_resources()

    if cache_dir is None:
        cache_dir = BIONEMO_CACHE_DIR

    if model_or_data_tag not in resources:
        raise ValueError(f"Resource '{model_or_data_tag}' not found.")

    resource = resources[model_or_data_tag]

    # Fail fast when the requested source has no URL, instead of handing the literal string "None" to the
    # downloader further down.
    if source == "ngc" and resource.ngc is None:
        raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.")

    if source == "pbss" and resource.pbss is None:
        raise ValueError(f"Resource '{model_or_data_tag}' does not have a PBSS URL.")

    # NOTE(review): the file name is taken from the PBSS URL even for NGC downloads, so an NGC-only resource
    # (pbss=None) yields the literal name "None" — confirm whether such resources are expected.
    filename = str(resource.pbss).split("/")[-1]

    # Determine the right Pooch processor based on filename suffixes
    processor = _get_processor(filename, resource.unpack, resource.decompress)

    if source == "pbss":
        download_fn = _s3_download
        url = resource.pbss

    elif source == "ngc":
        download_fn = NGCDownloader(filename=filename)
        url = resource.ngc

    else:
        raise ValueError(f"Source '{source}' not supported.")

    # The cached file name is prefixed with the checksum, and the same checksum is given to pooch as the
    # expected hash of the download.
    download = pooch.retrieve(
        url=str(url),
        fname=f"{resource.sha256}-{filename}",
        known_hash=resource.sha256,
        path=cache_dir,
        downloader=download_fn,
        processor=processor,
    )

    # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we
    # just want the unpacked, parent folder.
    if isinstance(download, list):
        return Path(processor.extract_dir)  # type: ignore

    return Path(download)