Skip to content

Single cell collection

FileNames

Bases: str, Enum

Names of files that are generated in SingleCellCollection.

Source code in bionemo/scdl/io/single_cell_collection.py
57
58
59
60
61
62
class FileNames(str, Enum):
    """Names of files that are generated in SingleCellCollection."""

    # Version file written by SingleCellCollection.__init__ with the
    # installed bionemo.scdl package version.
    VERSION = "version.json"
    # Collection-level metadata file — presumably serializes the collection's
    # metadata dict; not written in this module, confirm against dataset code.
    METADATA = "metadata.json"
    # Name for stored features — NOTE(review): likely the RowFeatureIndex
    # directory; confirm against the feature-index save code.
    FEATURES = "features"
SingleCellCollection

Bases: SingleCellRowDatasetCore

A collection of one or more SingleCellMemMapDatasets.

SingleCellCollection supports most of the functionality of the SingleCellDataSet API. A SingleCellCollection can be converted to a single SingleCellMemMapDataset. A SingleCellCollection enables the use of heterogeneous datasets, such as those composed of many AnnData files.

Attributes:

Name Type Description
_version str

The version of the dataset

data_path str

The directory where the collection of datasets is stored.

_feature_index RowFeatureIndex

The corresponding RowFeatureIndex where features are

fname_to_mmap Dict[str, SingleCellMemMapDataset]

dictionary to hold each SingleCellMemMapDataset object.

False Dict[str, SingleCellMemMapDataset]

not ragged; all SingleCellMemMapDatasets have the same column dimension

True Dict[str, SingleCellMemMapDataset]

ragged; scmmap column dimensions vary

Source code in bionemo/scdl/io/single_cell_collection.py
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
class SingleCellCollection(SingleCellRowDatasetCore):
    """A collection of one or more SingleCellMemMapDatasets.

    SingleCellCollection supports most of the functionality of the
    SingleCellDataSet API. A SingleCellCollection can be converted
    to a single SingleCellMemMapDataset. A SingleCellCollection
    enables the use of heterogeneous datasets, such as those composed of many
    AnnData files.

    Attributes:
        _version: The version of the dataset.
        data_path: The directory where the collection of datasets is stored.
        _feature_index: The corresponding RowFeatureIndex where features are
            stored.
        fname_to_mmap: dictionary to hold each SingleCellMemMapDataset object,
            keyed by the dataset's output path.
        ragged: A ragged collection is one whose arrays have different lengths.
            False: not ragged; all SingleCellMemMapDatasets have the same
            column dimension.
            True: ragged; scmmap column dimensions vary.
    """

    def __init__(self, data_path: str) -> None:
        """Instantiate the class.

        Args:
            data_path: Where the class will be stored.
        """
        self.data_path: str = data_path
        self._version: str = importlib.metadata.version("bionemo.scdl")
        self.metadata: Dict[str, int] = {}
        self._feature_index: RowFeatureIndex = RowFeatureIndex()
        # Keys are Path objects: the per-dataset directories under data_path.
        self.fname_to_mmap: Dict[Path, SingleCellMemMapDataset] = {}

        Path(self.data_path).mkdir(parents=True, exist_ok=True)

        # Write the version only on first creation; an existing version file
        # is preserved so reopening a collection keeps its original version.
        if not os.path.exists(f"{self.data_path}/{FileNames.VERSION.value}"):
            with open(f"{self.data_path}/{FileNames.VERSION.value}", "w") as vfi:
                json.dump(self.version(), vfi)

    def version(self) -> str:
        """Returns a version number.

        (following <major>.<minor>.<point> convention).
        """
        return self._version

    def load_h5ad(self, h5ad_path: str) -> None:
        """Loads data from an existing AnnData archive.

        This creates and saves a new backing data structure.
        The dataset and the location where it is stored are then recorded.

        Args:
            h5ad_path: the path to AnnData archive
        """
        mmap_path = Path(self.data_path) / Path(h5ad_path).stem
        self.fname_to_mmap[mmap_path] = _create_single_cell_memmap_dataset_from_h5ad(
            h5ad_path=h5ad_path, base_directory_path=self.data_path
        )
        self._feature_index.concat(self.fname_to_mmap[mmap_path]._feature_index)

    def load_h5ad_multi(self, directory_path: str, max_workers: int = 5, use_processes: bool = False) -> None:
        """Loads one or more AnnData files and adds them to the collection.

        Args:
            directory_path: The path to the directory with the AnnData files
            max_workers: the maximal number of workers to use
            use_processes: If True, use ProcessPoolExecutor; otherwise, use
                ThreadPoolExecutor
        Raises:
            FileNotFoundError: If no h5ad files are found in the directory.
            RuntimeError: If an error occurs in the loading of any of the h5ad files.
        """
        directory_path = Path(directory_path)
        ann_data_paths = sorted(directory_path.rglob("*.h5ad"))
        if len(ann_data_paths) == 0:
            raise FileNotFoundError(f"There are no h5ad files in {directory_path}.")
        mmap_paths = [Path(self.data_path) / Path(ann_datapath).stem for ann_datapath in ann_data_paths]
        queue = AsyncWorkQueue(max_workers=max_workers, use_processes=use_processes)
        for ann in ann_data_paths:
            queue.submit_task(_create_single_cell_memmap_dataset_from_h5ad, ann, base_directory_path=self.data_path)
        queue.wait()
        mmaps = queue.get_task_results()

        # Validate every result before registering any of them, so a failure
        # leaves the collection unchanged. The error names the h5ad file that
        # actually failed (previously the last submitted file was reported
        # regardless of which one raised).
        for ann_path, result in zip(ann_data_paths, mmaps):
            if isinstance(result, Exception):
                raise RuntimeError(f"Error in processing file {ann_path}: {result}") from result

        for mmap_path, mmap in zip(mmap_paths, mmaps):
            self.fname_to_mmap[mmap_path] = mmap
            self._feature_index.concat(mmap._feature_index)

    def number_nonzero_values(self) -> int:
        """Sum of the number of non zero entries in each dataset."""
        return sum(mmap.number_nonzero_values() for mmap in self.fname_to_mmap.values())

    def number_of_values(self) -> int:
        """Sum of the number of values in each dataset."""
        return sum(mmap.number_of_values() for mmap in self.fname_to_mmap.values())

    def number_of_rows(self) -> int:
        """The number of rows in the dataset.

        Returns:
            The number of rows in the dataset
        Raises:
            ValueError: If the number of rows in the feature index does not
                correspond to the number of stored rows.
        """
        row_sum_from_datasets = sum(mmap.number_of_rows() for mmap in self.fname_to_mmap.values())
        if len(self._feature_index) > 0 and self._feature_index.number_of_rows() != row_sum_from_datasets:
            # Single-line message: the previous triple-quoted f-string baked
            # source indentation whitespace into the error text.
            raise ValueError(
                f"The number of rows in the feature index {self._feature_index.number_of_rows()} "
                f"does not correspond to the number of rows in the datasets {row_sum_from_datasets}"
            )

        return row_sum_from_datasets

    def number_of_variables(self) -> List[int]:
        """If ragged, returns a list of variable lengths.

        If not ragged, returns a list with one entry. A ragged
        collection is one where the datasets have different lengths.
        """
        if len(self._feature_index) == 0:
            return [0]
        return self._feature_index.column_dims()

    def shape(self) -> Tuple[int, List[int]]:
        """Get the shape of the dataset.

        This is the number of entries by the length of the feature index
        corresponding to that variable.

        Returns:
            The total number of elements across dataset
            A list containing the number of variables for each entry in the
                RowFeatureIndex.
        """
        return self.number_of_rows(), self.number_of_variables()

    def flatten(
        self,
        output_path: str,
        destroy_on_copy: bool = False,
    ) -> None:
        """Flattens the collection into a single SingleCellMemMapDataset.

        Args:
            output_path: location to store new dataset
            destroy_on_copy: Whether to remove the current data_path
        """
        # NOTE(review): num_elements is sized from the row count and num_rows
        # from the nonzero count — confirm against the SingleCellMemMapDataset
        # constructor that this argument pairing is intentional.
        output = SingleCellMemMapDataset(
            output_path,
            num_elements=self.number_of_rows(),
            num_rows=self.number_nonzero_values(),
            mode=Mode.CREATE_APPEND,
        )

        output.concat(list(self.fname_to_mmap.values()))

        # Persist the flattened dataset to disk.
        output.save()

        if destroy_on_copy:
            shutil.rmtree(self.data_path)

__init__(data_path)

Instantiate the class.

Parameters:

Name Type Description Default
data_path str

Where the class will be stored.

required
Source code in bionemo/scdl/io/single_cell_collection.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def __init__(self, data_path: str) -> None:
    """Set up an empty collection rooted at ``data_path``.

    Args:
        data_path: Where the class will be stored.
    """
    self.data_path: str = data_path
    self._version: str = importlib.metadata.version("bionemo.scdl")
    self.metadata: Dict[str, int] = {}
    self._feature_index: RowFeatureIndex = RowFeatureIndex()
    self.fname_to_mmap: Dict[str, SingleCellMemMapDataset] = {}

    Path(self.data_path).mkdir(parents=True, exist_ok=True)

    # Record the package version on first creation only; an existing
    # version file is never overwritten.
    version_file = f"{self.data_path}/{FileNames.VERSION.value}"
    if not os.path.exists(version_file):
        with open(version_file, "w") as vfi:
            json.dump(self.version(), vfi)

flatten(output_path, destroy_on_copy=False)

Flattens the collection into a single SingleCellMemMapDataset.

Parameters:

Name Type Description Default
output_path str

location to store new dataset

required
destroy_on_copy bool

Whether to remove the current data_path

False
Source code in bionemo/scdl/io/single_cell_collection.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
def flatten(
    self,
    output_path: str,
    destroy_on_copy: bool = False,
) -> None:
    """Merge every dataset in the collection into one SingleCellMemMapDataset.

    Args:
        output_path: location to store new dataset
        destroy_on_copy: Whether to remove the current data_path
    """
    merged = SingleCellMemMapDataset(
        output_path,
        num_elements=self.number_of_rows(),
        num_rows=self.number_nonzero_values(),
        mode=Mode.CREATE_APPEND,
    )
    merged.concat(list(self.fname_to_mmap.values()))

    # Persist the merged dataset to disk.
    merged.save()

    if destroy_on_copy:
        shutil.rmtree(self.data_path)

load_h5ad(h5ad_path)

Loads data from an existing AnnData archive.

This creates and saves a new backing data structure. The dataset and the location where it is stored are then recorded.

Parameters:

Name Type Description Default
h5ad_path str

the path to AnnData archive

required
Source code in bionemo/scdl/io/single_cell_collection.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def load_h5ad(self, h5ad_path: str) -> None:
    """Loads data from an existing AnnData archive.

    A new backing data structure is created and saved, then registered
    in the collection together with the path where it is stored.

    Args:
        h5ad_path: the path to AnnData archive
    """
    dataset_dir = Path(self.data_path) / Path(h5ad_path).stem
    dataset = _create_single_cell_memmap_dataset_from_h5ad(
        h5ad_path=h5ad_path, base_directory_path=self.data_path
    )
    self.fname_to_mmap[dataset_dir] = dataset
    self._feature_index.concat(dataset._feature_index)

load_h5ad_multi(directory_path, max_workers=5, use_processes=False)

Loads one or more AnnData files and adds them to the collection.

Parameters:

Name Type Description Default
directory_path str

The path to the directory with the AnnData files

required
max_workers int

the maximal number of workers to use

5
use_processes bool

If True, use ProcessPoolExecutor; otherwise, use ThreadPoolExecutor

False

Raises: FileNotFoundError: If no h5ad files are found in the directory. RuntimeError: If an error occurs in the loading of any of the h5ad files.

Source code in bionemo/scdl/io/single_cell_collection.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def load_h5ad_multi(self, directory_path: str, max_workers: int = 5, use_processes: bool = False) -> None:
    """Loads one or more AnnData files and adds them to the collection.

    Args:
        directory_path: The path to the directory with the AnnData files
        max_workers: the maximal number of workers to use
        use_processes: If True, use ProcessPoolExecutor; otherwise, use
            ThreadPoolExecutor
    Raises:
        FileNotFoundError: If no h5ad files are found in the directory.
        RuntimeError: If an error occurs in the loading of any of the h5ad files.
    """
    directory_path = Path(directory_path)
    ann_data_paths = sorted(directory_path.rglob("*.h5ad"))
    if len(ann_data_paths) == 0:
        raise FileNotFoundError(f"There are no h5ad files in {directory_path}.")
    mmap_paths = [Path(self.data_path) / Path(ann_datapath).stem for ann_datapath in ann_data_paths]
    queue = AsyncWorkQueue(max_workers=max_workers, use_processes=use_processes)
    for ann in ann_data_paths:
        queue.submit_task(_create_single_cell_memmap_dataset_from_h5ad, ann, base_directory_path=self.data_path)
    queue.wait()
    mmaps = queue.get_task_results()

    # Validate every result before registering any of them, so a failure
    # leaves the collection unchanged. The error names the h5ad file that
    # actually failed (previously the last submitted file was reported
    # regardless of which one raised).
    for ann_path, result in zip(ann_data_paths, mmaps):
        if isinstance(result, Exception):
            raise RuntimeError(f"Error in processing file {ann_path}: {result}") from result

    for mmap_path, mmap in zip(mmap_paths, mmaps):
        self.fname_to_mmap[mmap_path] = mmap
        self._feature_index.concat(mmap._feature_index)

number_nonzero_values()

Sum of the number of non zero entries in each dataset.

Source code in bionemo/scdl/io/single_cell_collection.py
162
163
164
def number_nonzero_values(self) -> int:
    """Sum of the number of non zero entries in each dataset."""
    total = 0
    for dataset in self.fname_to_mmap.values():
        total += dataset.number_nonzero_values()
    return total

number_of_rows()

The number of rows in the dataset.

Returns:

Type Description
int

The number of rows in the dataset

Raises: ValueError if the length of the number of rows in the feature index does not correspond to the number of stored rows.

Source code in bionemo/scdl/io/single_cell_collection.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def number_of_rows(self) -> int:
    """The number of rows in the dataset.

    Returns:
        The number of rows in the dataset
    Raises:
        ValueError if the length of the number of rows in the feature
        index does not correspond to the number of stored rows.
    """
    # Total rows across every dataset in the collection.
    per_dataset_rows = [dataset.number_of_rows() for dataset in self.fname_to_mmap.values()]
    row_sum_from_datasets = sum(per_dataset_rows)
    index_is_populated = len(self._feature_index) > 0
    if index_is_populated and self._feature_index.number_of_rows() != row_sum_from_datasets:
        raise ValueError(
            f"""The nuber of rows in the feature index {self._feature_index.number_of_rows()}
                         does not correspond to the number of rows in the datasets {row_sum_from_datasets}"""
        )
    return row_sum_from_datasets

number_of_values()

Sum of the number of values in each dataset.

Source code in bionemo/scdl/io/single_cell_collection.py
166
167
168
def number_of_values(self) -> int:
    """Sum of the number of values in each dataset."""
    per_dataset_counts = (dataset.number_of_values() for dataset in self.fname_to_mmap.values())
    return sum(per_dataset_counts)

number_of_variables()

If ragged, returns a list of variable lengths.

If not ragged, returns a list with one entry. A ragged collection is one where the datasets have different lengths.

Source code in bionemo/scdl/io/single_cell_collection.py
190
191
192
193
194
195
196
197
198
199
200
def number_of_variables(self) -> List[int]:
    """If ragged, returns a list of variable lengths.

    If not ragged, returns a list with one entry. A ragged
    collection is one where the datasets have different lengths.
    """
    # An empty feature index means nothing has been loaded yet.
    if not len(self._feature_index):
        return [0]
    return self._feature_index.column_dims()

shape()

Get the shape of the dataset.

This is the number of entries by the length of the feature index corresponding to that variable.

Returns:

Type Description
int

The total number of elements across dataset

List[int]

A list containing the number of variables for each entry in the RowFeatureIndex.

Source code in bionemo/scdl/io/single_cell_collection.py
202
203
204
205
206
207
208
209
210
211
212
213
def shape(self) -> Tuple[int, List[int]]:
    """Get the shape of the dataset.

    This is the number of entries by the length of the feature index
    corresponding to that variable.

    Returns:
        The total number of elements across dataset
        A list containing the number of variables for each entry in the
            RowFeatureIndex.
    """
    rows = self.number_of_rows()
    variables = self.number_of_variables()
    return rows, variables

version()

Returns a version number.

(following `<major>.<minor>.<point>` convention).

Source code in bionemo/scdl/io/single_cell_collection.py
106
107
108
109
110
111
def version(self) -> str:
    """Return the dataset's version string.

    The string follows the <major>.<minor>.<point> convention.
    """
    return self._version