Skip to content

Row feature index

RowFeatureIndex

Maintains a mapping between a row and its features.

This is a ragged dataset, where the number and dimension of features can be different at every row.

Attributes:

Name Type Description
_cumulative_sum_index array

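Pointer that delineates which entries correspond to a given row. For example, if the array is [-1, 200, 201], rows 0 to 199 correspond to _feature_arr[0] and row 200 corresponds to _feature_arr[1].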

_feature_arr list[dict[str, ndarray]]

list of feature dictionaries for each dataset

_num_genes_per_row list[int]

list that tracks the feature length (number of genes) for each dataset.

_labels list[str]

list of labels

_version

The version of the dataset

Source code in bionemo/scdl/index/row_feature_index.py
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
class RowFeatureIndex:
    """Maintains a mapping between a row and its features.

    This is a ragged dataset, where the number and dimension of features
    can be different at every row. Consecutive rows that share the same
    feature dictionary are stored as a single block.

    Attributes:
        _cumulative_sum_index: Array of cumulative row counts that delineates
            which rows correspond to which block. It always starts with the
            sentinel -1. For example, if the array is [-1, 200, 201], rows 0
            to 199 correspond to _feature_arr[0] and row 200 corresponds to
            _feature_arr[1].
        _feature_arr: list of feature dictionaries, one per block.
        _num_genes_per_row: list that tracks the feature length (number of
            genes) for each block. Extracting this information repeatedly
            from _feature_arr would be cumbersome, which is why it is kept
            as a separate attribute.
        _labels: list of optional labels, one per block.
        _version: The version of the dataset.
    """

    def __init__(self) -> None:
        """Instantiates an empty index."""
        # The leading -1 is a sentinel for "no rows yet"; it is clamped to 0
        # wherever an actual row count is needed.
        self._cumulative_sum_index: np.ndarray = np.array([-1])
        self._feature_arr: list[dict[str, np.ndarray]] = []
        self._num_genes_per_row: list[int] = []
        self._version = importlib.metadata.version("bionemo.scdl")
        self._labels: list[Optional[str]] = []

    def _get_dataset_id(self, row) -> int:
        """Gets the dataset (block) id for a specified row index.

        Args:
            row (int): The index of the row.

        Returns:
            An int representing the dataset id the row belongs to.
        """
        # Count the boundaries that are <= row (the -1 sentinel always counts
        # for non-negative rows) and subtract one to get the containing block.
        return int(np.count_nonzero(self._cumulative_sum_index <= row)) - 1

    def version(self) -> str:
        """Returns a version number.

        (following <major>.<minor>.<point> convention).
        """
        return self._version

    def __len__(self) -> int:
        """Returns the number of feature blocks stored in the index.

        Note that this is the length of _feature_arr, not the number of rows;
        use number_of_rows() for the latter.
        """
        return len(self._feature_arr)

    def append_features(
        self, n_obs: int, features: dict[str, np.ndarray], num_genes: int, label: Optional[str] = None
    ) -> None:
        """Updates the index with the given features.

        The dict is inserted into the feature array by adding a new span to
        the row lookup index. Additionally, the number of genes for the newly
        added block is recorded.

        Args:
            n_obs (int): The number of rows these features apply to.
            features (dict): Corresponding features.
            num_genes (int): the length of the features for each feature key
                in features (i.e., number of genes)
            label (str): Label for the features.

        Raises:
            TypeError: if features is a Pandas DataFrame rather than a dict.
        """
        if isinstance(features, pd.DataFrame):
            raise TypeError("Expected a dictionary, but received a Pandas DataFrame.")
        csum = max(self._cumulative_sum_index[-1], 0)

        # If the new feature dict is identical to the last one, it is not
        # appended. Instead, the last block absorbs the additional n_obs rows
        # (label and num_genes of the existing block are kept).
        if len(self._feature_arr) > 0 and are_dicts_equal(self._feature_arr[-1], features):
            self._cumulative_sum_index[-1] = csum + n_obs
        else:
            self._cumulative_sum_index = np.append(self._cumulative_sum_index, csum + n_obs)
            self._feature_arr.append(features)
            self._num_genes_per_row.append(num_genes)
            self._labels.append(label)

    def lookup(self, row: int, select_features: Optional[list[str]] = None) -> Tuple[list[np.ndarray], str]:
        """Find the features at a given row.

        _cumulative_sum_index contains pointers to which rows correspond to
        given dictionaries. To obtain a specific row, we determine where it is
        located in _cumulative_sum_index and then look up that dictionary in
        _feature_arr.

        Args:
            row (int): The row in the feature index. Must be non-negative.
            select_features (list[str]): a list of features to select

        Returns:
            list[np.ndarray]: list of np arrays with the feature values in
                that row of the specified features
            str: optional label for the row

        Raises:
            IndexError: if the row is negative, if it is not smaller than the
                number of rows in the index, or if the index has no entries.
            ValueError: if a name in select_features is not a feature column.
        """
        if row < 0:
            raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
        if len(self._cumulative_sum_index) < 2:
            raise IndexError("There are no features to lookup.")

        # Valid rows are 0 .. total - 1, so row == total is already out of range.
        if row >= self._cumulative_sum_index[-1]:
            raise IndexError(
                f"Row index {row} is larger than number of rows in FeatureIndex ({self._cumulative_sum_index[-1]})."
            )
        d_id = self._get_dataset_id(row)

        # Retrieve the features for the identified block.
        features_dict = self._feature_arr[d_id]

        # If specific features are to be selected, filter the features.
        if select_features is not None:
            features = []
            for feature in select_features:
                if feature not in features_dict:
                    raise ValueError(f"Provided feature column {feature} in select_features not present in dataset.")
                features.append(features_dict[feature])
        else:
            features = list(features_dict.values())

        return features, self._labels[d_id]

    def number_vars_at_row(self, row: int) -> int:
        """Return number of variables in a given row.

        Args:
            row (int): The row in the feature index.

        Returns:
            The length of the features at the row
        """
        return self._num_genes_per_row[self._get_dataset_id(row)]

    def column_dims(self) -> list[int]:
        """Return the number of columns for every block of rows.

        Returns:
            A list containing the feature length of each block (the internal
            list itself, not a copy).
        """
        return self._num_genes_per_row

    def number_of_values(self) -> list[int]:
        """Get the total number of values in each block.

        For each block, the number of rows is multiplied by the number of genes.

        Returns:
            A list containing the number of values in every block of rows,
            or [0] if the index is empty.
        """
        if len(self._feature_arr) == 0:
            return [0]
        # Clamp the -1 sentinel to 0 so that successive differences are row counts.
        rows_per_block = np.diff(np.maximum(self._cumulative_sum_index, 0))
        return [n_rows * num_genes for n_rows, num_genes in zip(rows_per_block, self._num_genes_per_row)]

    def number_of_rows(self) -> int:
        """The number of rows in the index.

        Returns:
            An integer corresponding to the number of rows in the index
        """
        return int(max(self._cumulative_sum_index[-1], 0))

    def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
        """Concatenates the other FeatureIndex to this one.

        Returns the new, updated index. Warning: modifies this index in-place.

        Args:
            other_row_index: another RowFeatureIndex
            fail_on_empty_index: A boolean flag that sets whether to raise an
                error if an empty row index is passed in.

        Returns:
            self, the RowIndexFeature after the concatenations.

        Raises:
            TypeError if other_row_index is not a RowFeatureIndex
            ValueError if an empty RowFeatureIndex is passed and the function is
            set to fail in this case.
        """
        if not isinstance(other_row_index, self.__class__):
            raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

        if fail_on_empty_index and not len(other_row_index._feature_arr) > 0:
            raise ValueError("Error: Cannot append empty FeatureIndex.")

        # append_features expects the number of rows in each block, not the
        # cumulative boundary, so convert boundaries to per-block spans.
        spans = np.diff(np.maximum(other_row_index._cumulative_sum_index, 0))
        # Snapshot everything up front so that concatenating an index with
        # itself does not iterate over data that is being mutated.
        blocks = list(
            zip(spans, other_row_index._feature_arr, other_row_index._num_genes_per_row, other_row_index._labels)
        )
        for n_rows, feats, num_genes, label in blocks:
            self.append_features(int(n_rows), feats, num_genes, label)

        return self

    def save(self, datapath: str) -> None:
        """Saves the RowFeatureIndex to a given path.

        Each feature block is written as a zero-padded, numbered parquet file;
        the cumulative index, labels and version are written as .npy files.

        Args:
            datapath: path to save the index
        """
        Path(datapath).mkdir(parents=True, exist_ok=True)
        # Zero-pad file names so that lexicographic order equals block order on load.
        num_digits = len(str(len(self._feature_arr)))
        for index, feature_dict in enumerate(self._feature_arr):
            table = pa.table({column: pa.array(values) for column, values in feature_dict.items()})
            dataframe_str_index = f"{index:0{num_digits}d}"
            pq.write_table(table, f"{datapath}/dataframe_{dataframe_str_index}.parquet")

        np.save(Path(datapath) / "cumulative_sum_index.npy", self._cumulative_sum_index)
        np.save(Path(datapath) / "labels.npy", self._labels)
        np.save(Path(datapath) / "version.npy", np.array(self._version))

    @staticmethod
    def load(datapath: str) -> RowFeatureIndex:
        """Loads the data from datapath.

        Args:
            datapath: the path to load from
        Returns:
            An instance of RowFeatureIndex
        """
        new_row_feat_index = RowFeatureIndex()
        root = Path(datapath)
        # File names are zero-padded by save(), so sorting restores block order.
        parquet_data_paths = sorted(root.rglob("*.parquet"))
        data_tables = [pq.read_table(parquet_path) for parquet_path in parquet_data_paths]
        new_row_feat_index._feature_arr = [
            {column: table[column].to_numpy() for column in table.column_names} for table in data_tables
        ]
        # The gene count is the length of any feature column; a block without
        # columns counts as zero genes instead of raising StopIteration.
        new_row_feat_index._num_genes_per_row = [
            len(next(iter(feats.values()), ())) for feats in new_row_feat_index._feature_arr
        ]

        new_row_feat_index._cumulative_sum_index = np.load(root / "cumulative_sum_index.npy")
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
        # load indices from trusted locations.
        # Convert back to a list so that append_features() can keep appending labels.
        new_row_feat_index._labels = np.load(root / "labels.npy", allow_pickle=True).tolist()
        new_row_feat_index._version = np.load(root / "version.npy").item()
        return new_row_feat_index

__init__()

Instantiates the index.

Source code in bionemo/scdl/index/row_feature_index.py
64
65
66
67
68
69
70
def __init__(self) -> None:
    """Instantiates an empty index.

    The cumulative index starts with the sentinel value -1, and the feature,
    gene-count and label lists start empty. The version is read from the
    installed ``bionemo.scdl`` package metadata.
    """
    # np.ndarray is the array type; np.array is only the factory function.
    self._cumulative_sum_index: np.ndarray = np.array([-1])
    self._feature_arr: list[dict[str, np.ndarray]] = []
    self._num_genes_per_row: list[int] = []
    self._version = importlib.metadata.version("bionemo.scdl")
    # Labels are optional, so entries may be None (see append_features' default).
    self._labels: list[Optional[str]] = []

__len__()

The length is the number of feature blocks stored in the index (the length of _feature_arr), not the number of rows.

Source code in bionemo/scdl/index/row_feature_index.py
95
96
97
def __len__(self) -> int:
    """Return how many feature blocks the index holds.

    This is the length of ``_feature_arr``; it is not the number of rows.
    """
    stored_blocks = self._feature_arr
    return len(stored_blocks)

append_features(n_obs, features, num_genes, label=None)

Updates the index with the given features.

The dict is inserted into the feature array by adding a new span to the row lookup index. Additionally, we update the number of genes for the newly added row.

Parameters:

Name Type Description Default
n_obs int

The number of times that these features occur in the class.

required
features dict

Corresponding features.

required
num_genes int

the length of the features for each feature key in features (i.e., number of genes)

required
label str

Label for the features.

None
Source code in bionemo/scdl/index/row_feature_index.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def append_features(
    self, n_obs: int, features: dict[str, np.ndarray], num_genes: int, label: Optional[str] = None
) -> None:
    """Register ``n_obs`` further rows that share the given feature dictionary.

    When ``features`` equals the most recently stored dictionary, the last
    block is simply extended by ``n_obs`` rows. Otherwise a new block is
    opened and its features, gene count and label are recorded.

    Args:
        n_obs (int): The number of rows the features apply to.
        features (dict): Corresponding features.
        num_genes (int): the length of the features for each feature key in features (i.e., number of genes)
        label (str): Label for the features.

    Raises:
        TypeError: if a Pandas DataFrame is passed instead of a dictionary.
    """
    if isinstance(features, pd.DataFrame):
        raise TypeError("Expected a dictionary, but received a Pandas DataFrame.")

    # The initial sentinel is -1, so clamp to zero before adding new rows.
    rows_so_far = max(self._cumulative_sum_index[-1], 0)

    same_as_last = len(self._feature_arr) > 0 and are_dicts_equal(self._feature_arr[-1], features)
    if same_as_last:
        # Identical features: widen the last block instead of opening a new one.
        self._cumulative_sum_index[-1] = rows_so_far + n_obs
        return

    self._cumulative_sum_index = np.append(self._cumulative_sum_index, rows_so_far + n_obs)
    self._feature_arr.append(features)
    self._num_genes_per_row.append(num_genes)
    self._labels.append(label)

column_dims()

Return the number of columns in all rows.

Returns:

Type Description
list[int]

A list containing the lengths of the features in every row

Source code in bionemo/scdl/index/row_feature_index.py
185
186
187
188
189
190
191
192
193
194
def column_dims(self) -> list[int]:
    """Return the feature length recorded for every block of rows.

    Returns:
        The internal list of per-block feature lengths (not a copy).
    """
    dims_per_block = self._num_genes_per_row
    return dims_per_block

concat(other_row_index, fail_on_empty_index=True)

Concatenates the other FeatureIndex to this one.

Returns the new, updated index. Warning: modifies this index in-place.

Parameters:

Name Type Description Default
other_row_index RowFeatureIndex

another RowFeatureIndex

required
fail_on_empty_index bool

A boolean flag that sets whether to raise an error if an empty row index is passed in.

True

Returns:

Type Description
RowFeatureIndex

self, the RowIndexFeature after the concatenations.

Source code in bionemo/scdl/index/row_feature_index.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
def concat(self, other_row_index: RowFeatureIndex, fail_on_empty_index: bool = True) -> RowFeatureIndex:
    """Concatenates the other FeatureIndex to this one.

    Returns the new, updated index. Warning: modifies this index in-place.

    Args:
        other_row_index: another RowFeatureIndex
        fail_on_empty_index: A boolean flag that sets whether to raise an
            error if an empty row index is passed in.

    Returns:
        self, the RowIndexFeature after the concatenations.

    Raises:
        TypeError if other_row_index is not a RowFeatureIndex
        ValueError if an empty RowFeatureIndex is passed and the function is
        set to fail in this case.
    """
    if not isinstance(other_row_index, self.__class__):
        raise TypeError("Error: trying to concatenate something that's not a RowFeatureIndex.")

    if fail_on_empty_index and not len(other_row_index._feature_arr) > 0:
        raise ValueError("Error: Cannot append empty FeatureIndex.")

    # append_features expects the number of rows in each block. The stored
    # values are cumulative boundaries (starting with a -1 sentinel), so clamp
    # the sentinel to 0 and take successive differences to get per-block spans.
    spans = np.diff(np.maximum(other_row_index._cumulative_sum_index, 0))
    # Snapshot everything before appending so that concatenating an index with
    # itself does not read data that is being mutated.
    blocks = list(
        zip(spans, other_row_index._feature_arr, other_row_index._num_genes_per_row, other_row_index._labels)
    )
    for n_rows, feats, num_genes, label in blocks:
        self.append_features(int(n_rows), feats, num_genes, label)

    return self

load(datapath) staticmethod

Loads the data from datapath.

Parameters:

Name Type Description Default
datapath str

the path to load from

required

Returns: An instance of RowFeatureIndex

Source code in bionemo/scdl/index/row_feature_index.py
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
@staticmethod
def load(datapath: str) -> RowFeatureIndex:
    """Loads the data from datapath.

    Reads every ``*.parquet`` file found (recursively) under ``datapath`` as
    one feature block, in sorted file-name order, together with the
    ``cumulative_sum_index.npy``, ``labels.npy`` and ``version.npy`` files.

    Args:
        datapath: the path to load from
    Returns:
        An instance of RowFeatureIndex
    """
    new_row_feat_index = RowFeatureIndex()
    root = Path(datapath)
    parquet_data_paths = sorted(root.rglob("*.parquet"))
    data_tables = [pq.read_table(parquet_path) for parquet_path in parquet_data_paths]
    new_row_feat_index._feature_arr = [
        {column: table[column].to_numpy() for column in table.column_names} for table in data_tables
    ]
    # The gene count is the length of any feature column; a block without
    # columns counts as zero genes instead of raising StopIteration.
    new_row_feat_index._num_genes_per_row = [
        len(next(iter(feats.values()), ())) for feats in new_row_feat_index._feature_arr
    ]

    new_row_feat_index._cumulative_sum_index = np.load(root / "cumulative_sum_index.npy")
    # NOTE(security): allow_pickle=True unpickles arbitrary objects; only load
    # indices from trusted locations.
    # Convert to a list: the labels are appended to with list.append elsewhere,
    # which a numpy array does not support.
    new_row_feat_index._labels = np.load(root / "labels.npy", allow_pickle=True).tolist()
    new_row_feat_index._version = np.load(root / "version.npy").item()
    return new_row_feat_index

lookup(row, select_features=None)

Find the features at a given row.

It is assumed that the row is non-negative. _cumulative_sum_index contains pointers to which rows correspond to given dictionaries. To obtain a specific row, we determine where it is located in _cumulative_sum_index and then look up that dictionary in _feature_arr.

Args: row (int): The row in the feature index. select_features (list[str]): a list of features to select.

Returns: list[np.ndarray]: list of np arrays with the feature values in that row of the specified features. str: optional label for the row.

Raises: IndexError: An error occurred due to the input row being negative or exceeding the largest row in the index. It is also raised if there are no entries in the index yet.

Source code in bionemo/scdl/index/row_feature_index.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def lookup(self, row: int, select_features: Optional[list[str]] = None) -> Tuple[list[np.ndarray], str]:
    """Find the features at a given row.

    _cumulative_sum_index contains pointers to which rows correspond to given
    dictionaries. To obtain a specific row, we determine where it is located
    in _cumulative_sum_index and then look up that dictionary in _feature_arr.

    Args:
        row (int): The row in the feature index. Must be non-negative.
        select_features (list[str]): a list of features to select

    Returns:
        list[np.ndarray]: list of np arrays with the feature values in that row of the specified features
        str: optional label for the row

    Raises:
        IndexError: if the row is negative, if it is not smaller than the
            number of rows in the index, or if the index has no entries yet.
        ValueError: if a name in select_features is not a feature column.
    """
    if row < 0:
        raise IndexError(f"Row index {row} is not valid. It must be non-negative.")
    if len(self._cumulative_sum_index) < 2:
        raise IndexError("There are no features to lookup.")

    total_rows = self._cumulative_sum_index[-1]
    # Valid rows are 0 .. total_rows - 1, so row == total_rows must be rejected
    # here rather than surfacing later as a bare "list index out of range".
    if row >= total_rows:
        raise IndexError(f"Row index {row} is larger than number of rows in FeatureIndex ({total_rows}).")
    d_id = self._get_dataset_id(row)

    # Retrieve the features for the identified block.
    features_dict = self._feature_arr[d_id]

    if select_features is None:
        features = list(features_dict.values())
    else:
        # Keep the caller's ordering and fail on the first unknown column.
        features = []
        for feature in select_features:
            if feature not in features_dict:
                raise ValueError(f"Provided feature column {feature} in select_features not present in dataset.")
            features.append(features_dict[feature])

    return features, self._labels[d_id]

number_of_rows()

The number of rows in the index.

Returns:

Type Description
int

An integer corresponding to the number of rows in the index

Source code in bionemo/scdl/index/row_feature_index.py
214
215
216
217
218
219
220
def number_of_rows(self) -> int:
    """Return the total number of rows covered by the index.

    Returns:
        The last cumulative boundary as an int, or 0 while the index still
        only holds its negative start sentinel.
    """
    last_boundary = self._cumulative_sum_index[-1]
    if last_boundary > 0:
        return int(last_boundary)
    return 0

number_of_values()

Get the total number of values in the array.

For each row, the number of genes is counted.

Returns:

Type Description
list[int]

A list containing the lengths of the features in every block of rows

Source code in bionemo/scdl/index/row_feature_index.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def number_of_values(self) -> list[int]:
    """Compute how many values each block of rows contains.

    A block's value count is its number of rows multiplied by its number of
    genes.

    Returns:
        A list with one value count per block, or ``[0]`` when no features
        have been stored.
    """
    if not len(self._feature_arr):
        return [0]

    boundaries = self._cumulative_sum_index
    totals = []
    for block, upper in enumerate(boundaries[1:]):
        # The leading sentinel is negative, so clamp the lower bound at zero.
        lower = max(boundaries[block], 0)
        totals.append((upper - lower) * self._num_genes_per_row[block])
    return totals

number_vars_at_row(row)

Return number of variables in a given row.

Parameters:

Name Type Description Default
row int

The row in the feature index.

required

Returns:

Type Description
int

The length of the features at the row

Source code in bionemo/scdl/index/row_feature_index.py
174
175
176
177
178
179
180
181
182
183
def number_vars_at_row(self, row: int) -> int:
    """Look up the feature length of the block that contains ``row``.

    Args:
        row (int): The row in the feature index.

    Returns:
        The number of genes recorded for the block the row falls into.
    """
    block_id = self._get_dataset_id(row)
    return self._num_genes_per_row[block_id]

save(datapath)

Saves the RowFeatureIndex to a given path.

Parameters:

Name Type Description Default
datapath str

path to save the index

required
Source code in bionemo/scdl/index/row_feature_index.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def save(self, datapath: str) -> None:
    """Write the index to a directory, creating it if necessary.

    Every feature block becomes one zero-padded, numbered parquet file; the
    cumulative index, the labels and the version are stored as ``.npy`` files.

    Args:
        datapath: path to save the index
    """
    target_dir = Path(datapath)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Pad the block number so that file names sort in block order.
    pad_width = len(str(len(self._feature_arr)))
    for position, block_features in enumerate(self._feature_arr):
        columns = {}
        for name, data in block_features.items():
            columns[name] = pa.array(data)
        suffix = str(position).zfill(pad_width)
        pq.write_table(pa.table(columns), f"{datapath}/dataframe_{suffix}.parquet")

    np.save(target_dir / "cumulative_sum_index.npy", self._cumulative_sum_index)
    np.save(target_dir / "labels.npy", self._labels)
    np.save(target_dir / "version.npy", np.array(self._version))

version()

Returns a version number.

(following &lt;major&gt;.&lt;minor&gt;.&lt;point&gt; convention).

Source code in bionemo/scdl/index/row_feature_index.py
88
89
90
91
92
93
def version(self) -> str:
    """Return the version recorded for this index.

    The value follows the <major>.<minor>.<point> convention.
    """
    recorded_version = self._version
    return recorded_version

are_dicts_equal(dict1, dict2)

Compare two dictionaries with string keys and numpy.ndarray values.

Parameters:

Name Type Description Default
dict1 dict[str, ndarray]

The first dictionary to compare.

required
dict2 dict[str, ndarray]

The second dictionary to compare.

required

Returns:

Name Type Description
bool bool

True if the dictionaries have the same keys and all corresponding numpy arrays are equal; False otherwise.

Source code in bionemo/scdl/index/row_feature_index.py
31
32
33
34
35
36
37
38
39
40
41
42
def are_dicts_equal(dict1: dict[str, np.ndarray], dict2: dict[str, np.ndarray]) -> bool:
    """Check whether two string-keyed dictionaries of numpy arrays match.

    Args:
        dict1 (dict[str, np.ndarray]): The first dictionary to compare.
        dict2 (dict[str, np.ndarray]): The second dictionary to compare.

    Returns:
        bool: True if both dictionaries have the same key set and every pair
              of corresponding arrays is equal; False otherwise.
    """
    # Different key sets can never be equal; this also guards the lookups below.
    if dict1.keys() != dict2.keys():
        return False
    for key, left in dict1.items():
        if not np.array_equal(left, dict2[key]):
            return False
    return True