Example notebook

NOTE: It takes about 10 minutes to deploy this notebook as a Launchable. As of this writing, we are still working on a free tier, so a credit card may be required. You can reach out to your NVIDIA representative for credits.

In [1]:

Copied!





import os
import tempfile

import pooch
from torch.utils.data import DataLoader

from bionemo.core import BIONEMO_CACHE_DIR
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
from bionemo.scdl.util.torch_dataloader_utils import collate_sparse_matrix_batch
import os
import tempfile

import pooch
from torch.utils.data import DataLoader

from bionemo.core import BIONEMO_CACHE_DIR
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
from bionemo.scdl.util.torch_dataloader_utils import collate_sparse_matrix_batch

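First, download the input data. The cell below uses pooch to fetch https://datasets.cellxgene.cziscience.com/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad into a directory named hdf5s inside the BioNeMo cache directory; alternatively, the file can be copied into that directory manually.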

In [2]:

Copied!





# Download the example AnnData (.h5ad) file into the "hdf5s" folder of the
# BioNeMo cache directory. `known_hash` pins the expected file hash so that a
# corrupted or changed download is detected. The result is bound to
# `input_data` and is passed to SingleCellMemMapDataset in a later cell.
input_data = pooch.retrieve(
    "https://datasets.cellxgene.cziscience.com/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad",
    path=BIONEMO_CACHE_DIR / "hdf5s",
    known_hash="a0728e13a421bbcd6b2718e1d32f88d0d5c7cb92289331e3f14a59b7c513b3bc",
)

In [3]:

Copied!

# Create a SingleCellMemMapDataset inside a fresh temporary directory.
# `dataset_temp_dir` is kept as a notebook-level handle (rather than used as a
# context manager) so the directory stays alive across cells and can be removed
# explicitly with `dataset_temp_dir.cleanup()` at the end of the notebook.
dataset_temp_dir = tempfile.TemporaryDirectory()
dataset_dir = os.path.join(dataset_temp_dir.name, "97e_scmm")

# Build the dataset at `dataset_dir` from the downloaded h5ad file. This is
# done exactly once: repeating it would rebind `dataset_temp_dir`, orphan the
# first temporary directory, and redo the conversion.
data = SingleCellMemMapDataset(dataset_dir, input_data)

In [4]:

Copied!

# Save the dataset to the disk. Kept as the last expression of the cell so the
# return value of `save()` is displayed as the cell output.
data.save()

Out[4]:

True

In [5]:

Copied!

# Reload the data from the directory it was just saved to. Only the dataset
# path is passed here (no h5ad file), unlike the initial construction.
reloaded_data = SingleCellMemMapDataset(dataset_dir)

The number of columns varies per observation. However, for a batch size of 1 the data does not need to be collated. It is then output as a torch tensor of shape (1, 2, num_obs). The first row, of length num_obs, contains the column pointers, and the second row contains the corresponding values.

In [6]:

Copied!





# Placeholder "model": the identity function, standing in for a real model so
# the iteration loop below is runnable end to end.
model = lambda x: x  # noqa: E731

# Iterate over the dataset one observation at a time, in shuffled order.
# `collate_sparse_matrix_batch` assembles each batch into a sparse CSR tensor.
dataloader = DataLoader(data, batch_size=1, shuffle=True, collate_fn=collate_sparse_matrix_batch)
n_epochs = 1
for e in range(n_epochs):
    for batch in dataloader:
        model(batch)

/home/pbinder/bionemo-framework/sub-packages/bionemo-scdl/src/bionemo/scdl/util/torch_dataloader_utils.py:39: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at ../aten/src/ATen/SparseCsrTensorImpl.cpp:53.)
  batch_sparse_tensor = torch.sparse_csr_tensor(batch_rows, batch_cols, batch_values, size=(len(batch), max_pointer))

The data can be collated with a batch size of 1 and must be collated with larger batch sizes. This will collate several sparse matrices into the CSR (Compressed Sparse Row) torch tensor format.

In [7]:

Copied!





# Placeholder "model": the identity function, standing in for a real model so
# the iteration loop below is runnable end to end.
model = lambda x: x  # noqa: E731

# With more than one observation per batch the rows must be collated;
# `collate_sparse_matrix_batch` combines them into a single sparse CSR tensor.
dataloader = DataLoader(data, batch_size=8, shuffle=True, collate_fn=collate_sparse_matrix_batch)
n_epochs = 1
for e in range(n_epochs):
    for batch in dataloader:
        model(batch)

For some applications, we might want to also use the features. These can be specified with get_row(index, return_features = True). By default, all features are returned, but the features can be specified with the feature_vars argument in get_row, which corresponds to a list of the feature names to return.

In [21]:

Copied!

# Walk the dataset row by row, requesting the features alongside each row.
# `model` is the placeholder callable defined in an earlier cell.
for index in range(len(data)):
    model(data.get_row(index, return_features=True))

Alternatively, if there are multiple AnnData files, they can be converted into a single SingleCellMemMapDataset. If the hdf5s directory has one or more AnnData files, the SingleCellCollection class crawls the filesystem to recursively find AnnData files (with the h5ad extension). The code below mirrors scripts/convert_h5ad_to_scdl.py. It will create a new dataset in the directory given by output_dir (scdataset_output in this notebook). The same conversion can also be run with the convert_h5ad_to_scdl command.

In [ ]:

Copied!

# Path to the directory holding the h5ad input files (the same cache folder
# the example file was downloaded into earlier in the notebook).
hdf5s = BIONEMO_CACHE_DIR / "hdf5s"

# Path to the output directory where the SCDataset will be stored. This is a
# relative path, so it resolves against the current working directory.
output_dir = "scdataset_output"

In [ ]:

Copied!

from bionemo.scdl.io.single_cell_collection import SingleCellCollection

# Convert every h5ad file found under `hdf5s` into one dataset at `output_dir`.
# The collection is staged in a temporary directory that is removed when the
# `with` block exits.
with tempfile.TemporaryDirectory() as temp_dir:
    coll = SingleCellCollection(temp_dir)
    # Load the h5ad files using up to 4 workers, running as processes.
    coll.load_h5ad_multi(hdf5s, max_workers=4, use_processes=True)
    # Write the combined dataset to `output_dir`.
    # NOTE(review): destroy_on_copy=True presumably removes the staged
    # per-file data as it is copied - confirm against SingleCellCollection.flatten.
    coll.flatten(output_dir, destroy_on_copy=True)

In [ ]:

Copied!

# Remove the temporary directory that holds the example SingleCellMemMapDataset
# created earlier in the notebook.
dataset_temp_dir.cleanup()

In [ ]: