Skip to content

Resource

Resource

Bases: BaseModel

Class that represents a remote resource for downloading and caching test data.

Source code in bionemo/core/data/resource.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class Resource(pydantic.BaseModel):
    """Class that represents a remote resource for downloading and caching test data."""

    # With use_attribute_docstrings=True, the string literal that follows each field below is
    # used by pydantic as that field's description, so those strings are model metadata and
    # not merely comments.
    model_config = pydantic.ConfigDict(use_attribute_docstrings=True)

    # Required. The pattern admits exactly one "/" in the whole string.
    tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")]  # Only slash between filename and tag.
    """A unique identifier for the resource. The file(s) will be accessible via load("filename/tag")."""

    # Optional. String values are additionally passed through _validate_ngc_resource
    # (defined outside this class) after the basic str validation.
    ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None
    """The NGC URL for the resource.

    Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC.
    """

    # Optional on its own, but enforced together with `ngc` by _validate_ngc_registry below.
    ngc_registry: Literal["model", "resource"] | None = None
    """The NGC resource type (model or resource) for the data. Must be provided if ngc is not None."""

    # Required. Only URLs with the "s3" scheme are accepted.
    pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])]
    """The PBSS (NVIDIA-internal) URL of the resource."""

    # No default: the key must be supplied explicitly, even when its value is None.
    sha256: str | None
    """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended)."""

    # Required. Parsed as a "Name <email>" pair by pydantic.NameEmail.
    owner: pydantic.NameEmail
    """The owner or primary point of contact for the resource, in the format "Name <email>"."""

    description: str | None = None
    """A description of the file(s)."""

    # Only False or None are accepted here; True is not a permitted value.
    unpack: Literal[False, None] = None
    """Whether the resource should be unpacked after download. If None, will defer to the file extension."""

    # Only False or None are accepted here; True is not a permitted value.
    decompress: Literal[False, None] = None
    """Whether the resource should be decompressed after download. If None, will defer to the file extension."""

    @pydantic.model_validator(mode="after")
    def _validate_ngc_registry(self):
        """Require ``ngc_registry`` whenever ``ngc`` is set.

        Runs after the individual fields have been validated (``mode="after"``), so it
        operates on the constructed model instance.

        Returns:
            The model instance itself, unchanged.

        Raises:
            ValueError: If ``ngc`` is truthy while ``ngc_registry`` is falsy. The message
                includes the resource's ``tag``.
        """
        # Truthiness check: a None (or empty) ngc never triggers the error.
        if self.ngc and not self.ngc_registry:
            raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}")
        return self

decompress = None class-attribute instance-attribute

Whether the resource should be decompressed after download. If None, will defer to the file extension.

description = None class-attribute instance-attribute

A description of the file(s).

ngc = None class-attribute instance-attribute

The NGC URL for the resource.

Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC.

ngc_registry = None class-attribute instance-attribute

The NGC resource type (model or resource) for the data. Must be provided if ngc is not None.

owner instance-attribute

The owner or primary point of contact for the resource, in the format "Name <email>".

pbss instance-attribute

The PBSS (NVIDIA-internal) URL of the resource.

sha256 instance-attribute

The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).

tag instance-attribute

A unique identifier for the resource. The file(s) will be accessible via load("filename/tag").

unpack = None class-attribute instance-attribute

Whether the resource should be unpacked after download. If None, will defer to the file extension.

get_all_resources(resource_path=None) cached

Return a dictionary of all resources.

Source code in bionemo/core/data/resource.py
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@functools.cache
def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]:
    """Load every resource definition and index the results by tag.

    The result is memoized per ``resource_path`` argument by ``functools.cache``, so
    repeated calls with the same argument return the same dictionary object.

    Args:
        resource_path: Directory searched (non-recursively) for ``*.yaml`` and ``*.yml``
            files. When omitted, the ``resources`` directory inside the
            ``bionemo.core.data`` package is used.

    Returns:
        A mapping from each resource's ``tag`` to its validated ``Resource``.

    Raises:
        ValueError: If two or more resources share the same tag; the message lists
            the duplicated tags.
    """
    # Fall back to the resource directory that ships with the package.
    resource_path = resource_path or Path(files("bionemo.core.data").joinpath("resources"))  # type: ignore

    # Gather the raw entries of every YAML file, ".yaml" files first and ".yml" files after.
    raw_entries = []
    for yaml_file in itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")):
        raw_entries.extend(_parse_resource_file(yaml_file))

    # Validate everything in one pass so each raw entry becomes a Resource instance.
    validated = pydantic.TypeAdapter(list[Resource]).validate_python(raw_entries)
    by_tag = {entry.tag: entry for entry in validated}

    # Keying by tag collapses duplicates, so equal sizes mean every tag is unique.
    if len(by_tag) == len(validated):
        return by_tag

    # Report which tags are duplicated so that a user can begin debugging and resolve the issue.
    occurrences = Counter(entry.tag for entry in validated)
    raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in occurrences.items() if count > 1]}")