Source code for accvlab.dali_pipeline_framework.pipeline.sample_data_group

# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Used to enable type hints using a class type inside the implementation of that class itself.
from __future__ import annotations

import copy
import numbers
import warnings

from typing import Union, Tuple, List, Dict, Any, Sequence, Optional

import numpy as np
import cupy
import torch

import nvidia.dali as dali
import nvidia.dali.fn as fn
import nvidia.dali.types as types

from ..internal_helpers import check_type, get_mapped


class SampleDataGroup:
    '''Structured container for sample data. Can also be used as a blueprint to describe the data
    format.

    Data is organized as a tree containing:

    - **Data fields**: Leaf nodes that hold the actual data.
    - **Data group fields**: Non-leaf nodes that group related items.

    Example:
        An example for accessing the data field ``"bounding_boxes"`` inside nested data group
        fields ``"camera"`` and ``"annotations"``:

        >>> bounding_boxes = data["camera"]["annotations"]["bounding_boxes"]

        Note that the data is accessed as in a nested dictionary. Here, the data group fields are
        analogous to :class:`dict` objects and data fields correspond to the actual stored values
        at the leaves.

    Capabilities (see individual method docs for details):

    - Enforce a predefined data format (field names, order, and types). Format changes need to be
      performed explicitly.
    - Inside the input callable/iterable and outside the DALI pipeline, the following can be
      performed (both can be disabled):

      - Apply automatic type conversions (e.g., integers to floats) on assignment.
      - Apply optional custom string-to-numeric mappings on assignment for selected fields (see
        :meth:`add_data_field`, :meth:`add_data_field_array`, and :meth:`set_apply_mapping`).

    - Inside the pipeline: Apply automatic type checks on assignment.
    - Render the tree in a human-readable form via ``print(obj)``.
    - Flatten values to a sequence and reconstruct from a sequence (see :meth:`get_data`,
      :meth:`set_data`, and :meth:`set_data_from_dali_generic_iterator_output`). This is useful
      when passing the data from the input callable/iterable to the pipeline, and when returning
      data from the pipeline, as nested data structures are not supported there. Also see
      :class:`DALIStructuredOutputIterator` for an output iterator which re-assembles the data
      from the flattened output into a :class:`SampleDataGroup` instance or nested dictionaries
      before returning it.
    - Compare formats of two instances (see :meth:`type_matches`). This also ensures that the
      flattened data obtained from one instance can be used to fill the data of another instance.
    - Utilities that facilitate the implementation of pipeline steps: find/remove all occurrences
      of fields with a given name, add/remove/change fields and types, etc. (e.g., see
      :meth:`find_all_occurrences`). Note that the search is performed at DALI graph construction
      time, so there is no overhead during the pipeline execution.
    - Support for passing strings through the DALI pipeline and obtaining them as strings in the
      pipeline output. Note that strings are not supported inside the DALI pipeline. They can be
      accessed/assigned as strings in the input callable/iterable and outside the DALI pipeline,
      but appear as uint8 tensors inside the pipeline itself (alternative: use a mapping to
      numeric values as described above).

    Usage modes:

    - **Blueprint**: describes the data format (fields and types) but contains no values. This
      allows inferring downstream formats without running data processing (e.g., to initialize a
      DALI iterator). When only passing of flattened data is possible, a blueprint can be filled
      from flattened values (see :meth:`get_data`, :meth:`set_data`).
    - **Container**: holds actual values. When accessing the data, behaves similarly to a nested
      dictionary. When assigning data, additional checks/conversions are potentially performed.

    Important:
        **Assigning a Field Value**

        Assignment means using the indexed assignment operator ``obj[name] = value`` or the method
        ``obj.set_item_in_path(path, value)``.

        When assigning data fields, the following holds:

        - Mappings and conversions will be performed on assignment (inside the input
          callable/iterable and outside the DALI pipeline; if not disabled). Inside the DALI
          pipeline itself, no mapping or conversion is applied.
        - Inside the DALI pipeline, type checks are performed on assignment instead, and an error
          is raised if the type is not correct.
        - Assigning strings is only supported in the input callable/iterable and outside the DALI
          pipeline. String fields are handled as uint8 tensors inside the DALI pipeline.

        When assigning to data group fields, the following holds:

        - The assignment succeeds only if the new value's format matches the previous format, i.e.
          if ``obj[name].type_matches(value)`` holds. Otherwise, a :class:`KeyError` is raised.
          This is done to prevent changing the data format implicitly by assigning a different
          type.
        - If the type needs to be changed, this needs to be done explicitly first (e.g., using
          :meth:`change_type_of_data_and_remove_data`).

    Important:
        **Getting a Field Value**

        Getting a field value means using the indexed access operator ``obj[name]`` or the method
        ``obj.get_item_in_path(path)``. Accessing strings inside the DALI pipeline (except for the
        input callable/iterable) will return the underlying uint8 tensor instead. Using strings
        directly is only supported in the input callable/iterable and outside the DALI pipeline.

    Important:
        **Changing the Data Format**

        Changing the data format is always explicit. For example, adding a field and assigning
        values is a two-step process: create the field first, then assign data. When defining a
        blueprint, fields are created but left empty.

    Important:
        **Type Checking**

        Type checking is performed on assignment (inside the DALI pipeline) to ensure that the
        data type is correct. This is useful when developing the pipeline/processing step, but
        adds some overhead. Type checking is enabled by default (see :meth:`set_do_check_type`).

    Note:
        Additional information:

        - When converting a :class:`SampleDataGroup` to a string (e.g., using ``print(obj)``), the
          data format as well as some details (e.g., for which fields a mapping is defined, which
          fields are empty, data types of the fields) are printed. The actual stored values are
          not printed. For a simpler output, see :meth:`get_string_no_details`.
        - When obtaining the length of a :class:`SampleDataGroup` (e.g., using ``len(obj)``), the
          number of direct children (data fields and data group fields) is returned.
    '''

    _type_mapping = {
        types.DALIDataType.BOOL: bool,
        types.DALIDataType.FLOAT: np.float32,
        types.DALIDataType.FLOAT16: np.float16,
        types.DALIDataType.FLOAT64: np.float64,
        types.DALIDataType.INT8: np.int8,
        types.DALIDataType.INT16: np.int16,
        types.DALIDataType.INT32: np.int32,
        types.DALIDataType.INT64: np.int64,
        types.DALIDataType.UINT8: np.uint8,
        types.DALIDataType.UINT16: np.uint16,
        types.DALIDataType.UINT32: np.uint32,
        types.DALIDataType.UINT64: np.uint64,
    }

    def __init__(self):
        self._mappings = {}
        self._value_order = tuple()
        self._types_order = tuple()
        self._values = {}
        self._types = {}
        self._do_apply_mapping = True
        self._do_convert = True
        self._do_check_type = True
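    # Illustrative construction sketch (field names here are hypothetical, not part of the API):
    #
    #   blueprint = SampleDataGroup()
    #   blueprint.add_data_field("image", types.DALIDataType.UINT8)
    #   camera = SampleDataGroup()
    #   camera.add_data_field("id", types.DALIDataType.INT32)
    #   blueprint.add_data_group_field("camera", camera)
    #
    #   sample = blueprint.get_empty_like_self()  # container with the same format
    #   sample["image"] = np.zeros((4, 4), dtype=np.uint8)
    #   sample["camera"]["id"] = 7  # converted to np.int32 on assignment outside the pipeline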
    @staticmethod
    def create_data_field_array(
        type: types.DALIDataType,
        num_fields: int,
        mapping: Optional[Dict[Union[str, None], Union[int, float, np.number, bool]]] = None,
    ) -> SampleDataGroup:
        '''Create a :class:`SampleDataGroup` containing multiple data fields of the same type.

        The data fields have numerical (integer) names in the range ``[0; num_fields - 1]``. This
        means that the returned :class:`SampleDataGroup` behaves as an array of data fields.

        Args:
            type: Type of the fields to add.
            num_fields: Number of fields to add to the array data group field.
            mapping: Optional mapping for the fields (see :meth:`add_data_field` for details on
                mappings).

        See also:
            :meth:`create_data_group_field_array`
            :meth:`add_data_field_array`
            :meth:`add_data_group_field_array`

        Returns:
            Resulting array :class:`SampleDataGroup` object.
        '''
        res = SampleDataGroup()
        for i in range(num_fields):
            res.add_data_field(i, type, mapping)
        return res
    @staticmethod
    def create_data_group_field_array(sample_data_group: SampleDataGroup, num_fields: int) -> SampleDataGroup:
        '''Create a :class:`SampleDataGroup` containing multiple data group fields (themselves
        :class:`SampleDataGroup` instances).

        Note that the created data group fields will be initialized as blueprints, i.e. they will
        not contain any actual data even if ``sample_data_group`` does. This is done to cleanly
        separate this step (defining the data format) from actually filling the data.

        Args:
            sample_data_group: Blueprint representing the element format. Any actual data present
                in ``sample_data_group`` will be ignored; the resulting elements will be empty of
                data.
            num_fields: Number of fields to create.

        See also:
            :meth:`create_data_field_array`
            :meth:`add_data_field_array`
            :meth:`add_data_group_field_array`

        Returns:
            Resulting array :class:`SampleDataGroup` object.
        '''
        res = SampleDataGroup()
        for i in range(num_fields):
            res.add_data_group_field(i, sample_data_group)
        return res
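    # Sketch of both array factories (types and sizes are illustrative):
    #
    #   scores = SampleDataGroup.create_data_field_array(types.DALIDataType.FLOAT, num_fields=3)
    #   scores[1] = 0.25  # fields are named 0, 1, 2
    #
    #   element = SampleDataGroup()
    #   element.add_data_field("score", types.DALIDataType.FLOAT)
    #   cameras = SampleDataGroup.create_data_group_field_array(element, num_fields=2)
    #   cameras[0]["score"] = 0.5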
    def set_apply_mapping(self, apply: bool):
        '''Set whether to apply the string-to-numeric mapping (for data fields where such a
        mapping is defined).

        This setting will be propagated to descendants (data group fields) of the data group field
        for which it is called.

        Note:
            The mapping is applied in the input callable/iterable and outside the DALI pipeline.
            Inside the DALI pipeline itself, the mapping is not applied. If apply mapping is set
            to ``True`` and an assignment is performed inside the pipeline, a warning will be
            issued, and the assignment will be performed without mapping (if it is already in the
            correct format; an error will be raised if the format is not correct).

        Args:
            apply: Whether to apply the mapping (for fields where a mapping is set).
        '''
        self._do_apply_mapping = apply
        # Also set (recursively) in SampleDataGroup elements
        for type, name in zip(self._types_order, self._value_order):
            if type == SampleDataGroup:
                self[name].set_apply_mapping(apply)
    def set_do_convert(self, convert: bool):
        '''Set whether to convert data in the data fields to the types set up when creating those
        fields.

        This setting will be propagated to descendants (data group fields) of the data group field
        for which it is called.

        Note:
            The conversion is applied in the input callable/iterable and outside the DALI
            pipeline. Inside the DALI pipeline itself, the conversion is not applied. Instead,
            type checks are performed (regardless of this setting).

        Args:
            convert: Whether to perform automatic type conversions (e.g., integers to floats) on
                assignment.
        '''
        self._do_convert = convert
        # Also set (recursively) in SampleDataGroup elements
        for type, name in zip(self._types_order, self._value_order):
            if type == SampleDataGroup:
                self[name].set_do_convert(convert)
    def set_do_check_type(self, check_type: bool):
        '''Set whether to perform type checking on assignment.

        This setting will be propagated to descendants (data group fields) of the data group field
        for which it is called.

        Note:
            The type checking is useful when developing the pipeline/processing step, but adds
            some overhead. Therefore, it is advisable to disable it in production.

        Args:
            check_type: Whether to perform type checking (in the DALI pipeline) on assignment.
        '''
        self._do_check_type = check_type
        # Also set (recursively) in SampleDataGroup elements
        for type, name in zip(self._types_order, self._value_order):
            if type == SampleDataGroup:
                self[name].set_do_check_type(check_type)
    def get_empty_like_self(self) -> SampleDataGroup:
        '''Get an object with the same structure (same nested data group fields and data fields),
        but no values.

        Obtain a blueprint either from another blueprint or from a populated object (ignoring
        values and initializing all data fields as empty). This can be regarded as a deep copy of
        the original object, but with the actual data removed.

        Returns:
            Resulting blueprint :class:`SampleDataGroup` object.
        '''
        res = self._get_copy_except_values()
        # Values should be empty (except for those which are SampleDataGroups themselves, see next
        # step), but the correct fields should be set up and filled with 'None'
        res._values = {}
        for key in self._values:
            if res._types[key] == SampleDataGroup:
                # If the element is itself a SampleDataGroup, ensure we get the correct empty
                # blueprint for the element
                res._values[key] = self._values[key].get_empty_like_self()
            else:
                # If the element is a primitive, set it to None
                res._values[key] = None
        return res
    def get_copy(self) -> SampleDataGroup:
        '''Get a copy.

        Create a copy: equivalent to :meth:`get_empty_like_self` followed by filling the data from
        the original object. Note that for the actual data, references to the original data are
        used, i.e. the data itself is not deep-copied. However, the data group fields making up
        the data format are deep-copied. This means that modifying the data in place will modify
        the data in the original. However, assigning new data to fields, adding or deleting
        fields, changing their type, etc. will not affect the original.

        Returns:
            Resulting copy.
        '''
        res = self._get_copy_except_values()
        res._values = {}
        # Values should be re-used, not deep-copied. Build a new dictionary (so that adding or
        # removing keys will not affect the original), but use references to the original objects
        # for the actual values, except for SampleDataGroup elements, which are handled
        # differently (see below)
        for key in self._values:
            if res._types[key] == SampleDataGroup:
                # If the element is itself a SampleDataGroup, ensure it is copied correctly by
                # calling get_copy recursively.
                res._values[key] = self._values[key].get_copy()
            else:
                # If the element is not a SampleDataGroup, set a reference to the original object.
                res._values[key] = self._values[key]
        return res
    def type_matches(self, other: SampleDataGroup) -> bool:
        '''Check whether the data type defined by two objects of :class:`SampleDataGroup` is the
        same.

        The following is not considered when checking for equality, as it is not considered to be
        part of the type described by the object:

        - The actual data stored in the data fields
        - Whether mapping and conversion should be performed
        - Whether mappings are available for the same fields and whether the mappings themselves
          are the same

        Important:
            Note that it is checked whether the fields appear in the same order in the two
            objects. This is the case if the objects are constructed from the same blueprint (or
            if they were constructed by adding the individual fields in the same order). This is
            important as it defines whether the flattened data, e.g. obtained by :meth:`get_data`
            from one of the objects, can be used to fill the data into the other one, e.g. using
            :meth:`set_data`.
        '''
        match = self._value_order == other._value_order
        if match:
            for type, name, i in zip(self._types_order, self._value_order, range(len(self))):
                if type == SampleDataGroup:
                    match = self[name].type_matches(other[name])
                else:
                    match = type == other._types_order[i]
                if not match:
                    break
        return match
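    # Compatibility-check sketch: verify that flattened data from one object can fill another
    # (variable names are illustrative):
    #
    #   if blueprint.type_matches(incoming_sample):
    #       blueprint.set_data(incoming_sample.get_data())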
    def __setitem__(self, name: Union[str, int], value: Any):
        # Documented as part of the class docstring
        assert isinstance(name, str) or isinstance(name, int), f"'name' has unsupported type: `{type(name)}`"
        if not name in self._values:
            raise KeyError(f"No field with name '{name}'")
        if self._types[name] == SampleDataGroup and not self[name].type_matches(value):
            raise KeyError(
                f"Tried to set a data group field '{name}' "
                f"(fields of type SampleDataGroup), but types do not match."
            )
        if self._types[name] == types.DALIDataType.STRING and not isinstance(value, dali.data_node.DataNode):
            self._values[name] = self._convert_from_string(value)
        else:
            self._values[name] = self._apply_mapping_check_and_convert(name, value)
    def set_item_in_path(
        self, path: Union[str, int, Tuple[Union[str, int]], List[Union[str, int]]], value: Any
    ):
        '''Assign a field value at a (nested) path.

        The path is a sequence of field names/keys. For example, if the path is
        ``path = ("name_1", "name_2", "name_3")``, the following are equivalent:

        - ``obj.set_item_in_path(path, value_to_set)``
        - ``obj["name_1"]["name_2"]["name_3"] = value_to_set``

        Important:
            See the class docstring for details on the assignment behavior.

        Args:
            path: Path of the item to be set.
            value: Value to be set.
        '''
        assert (
            isinstance(path, str) or isinstance(path, int) or isinstance(path, tuple) or isinstance(path, list)
        ), "'path' has unsupported type"
        if isinstance(path, list) or isinstance(path, tuple):
            assert (
                len(path) > 0
            ), "Only setting of children is supported. Therefore, 'path' cannot be a tuple/list with length 0."
            if not path[0] in self._values:
                raise KeyError(f"No field with name '{path[0]}'")
            if len(path) == 1:
                # The path is a tuple/list, but has only 1 entry. This is equivalent to using a
                # string/number directly
                self[path[0]] = value
            else:
                # (recursively) use set_item_in_path() and walk the path, until the remaining path
                # is a single name
                self._values[path[0]].set_item_in_path(path[1:], value)
        else:
            self[path] = value
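    # Path-based assignment sketch (equivalent forms; field names are illustrative):
    #
    #   obj.set_item_in_path(("camera", "annotations", "bounding_boxes"), boxes)
    #   obj["camera"]["annotations"]["bounding_boxes"] = boxes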
    def __getitem__(self, name: Union[str, int]) -> Any:
        # Documented as part of the class docstring
        assert isinstance(name, str) or isinstance(name, int), "'name' has unsupported type"
        if not name in self._values:
            raise KeyError(f"No field with name '{name}'")
        value = self._values[name]
        if self._types[name] == types.DALIDataType.STRING and not isinstance(value, dali.data_node.DataNode):
            return self._convert_to_string(value)
        return value
    def get_item_in_path(self, path: Union[str, int, Tuple[Union[str, int]], List[Union[str, int]]]) -> Any:
        '''Get a field value at a nested path.

        The path is a sequence of field names/keys. For example, if
        ``path = ("name_1", "name_2", "name_3")``, the following are equivalent:

        - ``value = obj.get_item_in_path(path)``
        - ``value = obj["name_1"]["name_2"]["name_3"]``

        Note:
            Accessing strings inside the DALI pipeline (except for the input callable/iterable)
            will return the underlying uint8 tensor instead. Using strings directly is only
            supported in the input callable/iterable and outside the DALI pipeline.

        Args:
            path: Path of the item to get.

        Returns:
            Item at ``path``.
        '''
        assert (
            isinstance(path, str) or isinstance(path, int) or isinstance(path, tuple) or isinstance(path, list)
        ), "'path' has unsupported type"
        if isinstance(path, list) or isinstance(path, tuple):
            if len(path) == 0:
                return self
            if not path[0] in self._values:
                raise KeyError(f"No field with name '{path[0]}'")
            if len(path) == 1:
                # The path is a tuple/list, but has only 1 entry. This is equivalent to using a
                # string/number directly
                return self[path[0]]
            else:
                # (recursively) use get_item_in_path() and walk the path, until the remaining path
                # is a single name
                return self._values[path[0]].get_item_in_path(path[1:])
        else:
            return self[path]
    def get_parent_of_path(
        self, path: Union[int, str, Tuple[Union[str, int]], List[Union[str, int]]]
    ) -> SampleDataGroup:
        '''Get the parent of the element described by ``path``.

        The following are equivalent:

        - ``obj.get_parent_of_path(path)``
        - ``obj.get_item_in_path(path[:-1])``

        Note:
            As a parent node cannot be a data field (i.e. a leaf node), the returned value is
            always a :class:`SampleDataGroup` instance.

        Args:
            path: Path for which to get the parent.

        Returns:
            Parent of the path.
        '''
        if self.path_is_single_name(path):
            if not self.has_child(path):
                raise KeyError(f"No element '{path}' is present.")
            return self
        else:
            assert len(path) > 0, (
                "Cannot get parent of element with path len 0, as path len 0 corresponds to the "
                "element for which the method was called."
            )
            return self.get_item_in_path(path[:-1])
    def get_type_of_item_in_path(
        self, path: Union[Tuple[Union[str, int]], List[Union[str, int]]]
    ) -> Union[types.DALIDataType, type]:
        '''Get the type of the item at a nested path.

        Args:
            path: Path to the item.

        See also:
            - :meth:`SampleDataGroup.get_item_in_path` for a description of the `path` parameter.
            - :meth:`SampleDataGroup.get_type_of_field` for a description of how type information
              is returned (which applies to this method as well).

        Returns:
            Data type of the field. For data group fields, :class:`SampleDataGroup`. For data
            fields, the corresponding :class:`nvidia.dali.types.DALIDataType`. If ``path`` is
            empty, returns :class:`SampleDataGroup` (the type of ``self``).
        '''
        assert isinstance(path, tuple) or isinstance(path, list), "'path' has to be tuple or list"
        if len(path) > 0:
            if len(path) > 1:
                to_check_in = self.get_parent_of_path(path)
            else:
                to_check_in = self
            res_type = to_check_in.get_type_of_field(path[-1])
        else:
            # Path is referring to `self`. As this is a SampleDataGroup, the data type is
            # `SampleDataGroup`
            res_type = SampleDataGroup
        return res_type
    @staticmethod
    def path_is_single_name(path: Union[str, int, Tuple[Union[str, int]], List[Union[str, int]]]) -> bool:
        '''Check if the given path is a single name.

        Args:
            path: Path to check. Can be a single name/key or a sequence of names.

        Returns:
            ``True`` if ``path`` is a single name/key (i.e., a string or integer, not a sequence),
            ``False`` otherwise.
        '''
        is_name = isinstance(path, str) or not isinstance(path, Sequence)
        return is_name
    def path_exists(self, path: Union[str, int, Tuple[Union[str, int]], List[Union[str, int]]]) -> bool:
        '''Check if a field with the given path exists.

        Args:
            path: Path to check.

        Returns:
            Whether a field with the given path exists.
        '''
        if self.path_is_single_name(path):
            exists = self.has_child(path)
        elif len(path) == 0:
            exists = True
        elif len(path) == 1:
            exists = self.has_child(path[0])
        else:
            # Descend only if the first path element exists and is a data group field; otherwise
            # the path cannot exist
            if not self.has_child(path[0]) or not self.is_data_group_field(path[0]):
                exists = False
            else:
                remaining_path = path[1:]
                if len(remaining_path) == 1:
                    remaining_path = remaining_path[0]
                exists = self[path[0]].path_exists(remaining_path)
        return exists
    def path_exists_and_is_data_group_field(
        self, path: Union[str, int, Tuple[Union[str, int]], List[Union[str, int]]]
    ) -> bool:
        '''Check if a field with the given path exists and is a data group field.

        Args:
            path: Path to check.

        Returns:
            ``True`` if the field at ``path`` exists and is a data group field, ``False``
            otherwise.
        '''
        exists = self.path_exists(path)
        if not exists:
            return False
        # An empty sequence path refers to 'self', which is a SampleDataGroup and therefore a data
        # group field. Check 'path_is_single_name' first, as len() is not defined for int names.
        if not self.path_is_single_name(path) and len(path) == 0:
            res = True
        else:
            # Get the last name in the path
            if self.path_is_single_name(path):
                last_name = path
            else:
                last_name = path[-1]
            # Get the parent element of the path (a SampleDataGroup, as it contains children and
            # therefore is a data group field)
            parent = self.get_parent_of_path(path)
            # Check whether the element with name `last_name` is a data group field using its
            # parent node (as the node itself may be a data node).
            res = parent.is_data_group_field(last_name)
        return res
    def get_type_of_field(self, name: Union[str, int]) -> Union[types.DALIDataType, type]:
        '''Get the type of a field.

        The type is either expressed as a :class:`nvidia.dali.types.DALIDataType` (data fields) or
        :class:`SampleDataGroup` (data group fields).

        Args:
            name: Name of the field.

        Returns:
            Type of the field. For string fields this returns
            :class:`nvidia.dali.types.DALIDataType.STRING`. Note that this is different from
            flattened contexts (e.g., :attr:`field_types_flat`), where strings are represented as
            :class:`nvidia.dali.types.DALIDataType.UINT8`. The reason is that the flattened data
            is used internally to pass data between :class:`SampleDataGroup` objects where the
            object itself cannot be passed; consequently, the string data is passed as stored
            internally (i.e. as the underlying uint8 tensors). Here, the actual type as configured
            (e.g. by :meth:`add_data_field`) is returned.
        '''
        return self._types[name]
    def get_string_no_details(self) -> str:
        '''Get a string representing the :class:`SampleDataGroup` instance, omitting details.

        Omits per-field details such as whether a value is set and whether a mapping is available.
        '''
        res_str = "{\n" + self._to_string_with_indent(2, False) + "}\n"
        return res_str
    def __str__(self) -> str:
        # Documented as part of the class docstring
        res_str = "{\n" + self._to_string_with_indent(2, True) + "}\n"
        return res_str

    def __len__(self) -> int:
        # Documented as part of the class docstring
        return len(self._value_order)
    def is_array(self, field: Optional[Union[str, int]] = None) -> bool:
        '''Check whether the (self or child) object can be regarded as an array.

        This is the case if all of the following hold:

        - The fields have integer names.
        - Each integer in the range ``[0; len(self) - 1]`` is present as a name.
        - The value order is such that the name increases by 1 from each element to the next, i.e.
          ``self.contained_top_level_field_names == (0, 1, 2, 3, ...)``.

        Args:
            field: If set, perform the check for the named child. Otherwise, check ``self``.

        Returns:
            Whether the object can be considered an array.
        '''
        if field is None:
            for i in range(len(self)):
                if not self._value_order[i] == i:
                    return False
            return True
        else:
            return self[field].is_array()
    def is_data_field_array(self, field: Optional[Union[str, int]] = None) -> bool:
        """Check whether the (self or child) object is an array whose elements are all data fields
        (no data group fields).

        See the documentation of :meth:`is_array` for the conditions under which a data group
        field is regarded as an array.

        Args:
            field: If set, perform the check for the named child. Otherwise, check ``self``.

        Returns:
            Whether the object is an array of data fields.
        """
        if field is None:
            for i in range(len(self)):
                if not self._value_order[i] == i:
                    return False
                if not self.is_data_field(i):
                    return False
            return True
        else:
            if not self.is_data_group_field(field):
                return False
            else:
                return self[field].is_data_field_array()
    def is_data_group_field_array(self, field: Optional[Union[str, int]] = None) -> bool:
        """Check whether the (self or child) object is an array whose elements are all data group
        fields (no data fields).

        See the documentation of :meth:`is_array` for the conditions under which a data group
        field is regarded as an array.

        Args:
            field: If set, perform the check for the named child. Otherwise, check ``self``.

        Returns:
            Whether the object is an array of data group fields.
        """
        if field is None:
            for i in range(len(self)):
                if not self._value_order[i] == i:
                    return False
                if not self.is_data_group_field(i):
                    return False
            return True
        else:
            return self[field].is_data_group_field_array()
    @property
    def contained_top_level_field_names(self) -> Tuple[Union[str, int]]:
        """Get the names of the contained top-level fields.

        The order of the fields corresponds to the order in which they were added.

        Returns:
            Names of contained fields.
        """
        return self._value_order

    @property
    def field_top_level_types(self) -> Tuple[Union[types.DALIDataType, type]]:
        """Types of the top-level fields.

        The order of the fields corresponds to the order in which they were added (and to the
        order of the elements returned by :attr:`contained_top_level_field_names`). The types are
        :class:`nvidia.dali.types.DALIDataType` instances for data fields and
        :class:`SampleDataGroup` for data group fields.
        """
        return self._types_order

    @property
    def field_names_flat(self) -> Tuple[str]:
        """Names of the contained data fields, flattened (all leaf nodes, not only direct
        children).

        Each element corresponds to a data field (leaf node). The original nesting is reflected in
        the names (concatenated with "." between parent and child). Numerical names are converted
        to strings to ensure that they can be used as names in other places (e.g. the DALI generic
        iterator). For example, the numeric name ``5`` would become ``"[5]"``.

        For example, if there is a data field in the original object at the path
        ``object["name_0"][1]["name_2"]``, the name used in the flattened tuple of names would be
        ``"name_0.[1].name_2"``.

        The order of the elements corresponds to the order used in :meth:`get_data`, so that the
        names obtained here correspond to the values obtained there.

        No names are added for data group fields themselves. If they contain descendants which are
        data fields, their name will appear in the names of the descendants (before "."). However,
        if a data group field does not contain any data field descendants, it will not contribute
        a name to the output.

        Note:
            The names themselves reflect the hierarchy of the data, so that the names are unique,
            even if there are multiple fields with the same name in the structure.
        """
        res = tuple(self._get_contained_field_names_flat(""))
        return res

    @property
    def field_types_flat(self) -> Tuple[types.DALIDataType]:
        """Types of the contained data fields, flattened (all leaf nodes, not only direct
        children).

        Each element corresponds to a leaf node. The order of the elements corresponds to the
        order used in :meth:`get_data`, so that the types obtained here correspond to the values
        obtained there.

        No types are added for data group fields themselves. If they contain descendants which are
        data fields, the types of these descendants will be added. However, if a data group field
        does not contain any data field descendants, it will not contribute a type to the output.

        Note:
            As only the leaf nodes containing data are considered, no entries directly
            corresponding to data group fields will be added. String fields are represented as
            :class:`nvidia.dali.types.DALIDataType.UINT8`, matching their in-pipeline
            representation. Note that this is different from e.g. :meth:`get_type_of_field`, but
            consistent with :meth:`get_data` (see :meth:`get_data` for details on the rationale).
        """
        res = tuple(self._get_contained_types_flat())
        return res
    def get_data(self, as_list_type: bool = False) -> Union[tuple, list]:
        """Get the values of all data fields as a flattened sequence (all leaf nodes, not only
        direct children).

        The order of the elements is the order of a depth-first traversal, with the order of the
        children at each node corresponding to the order in which the elements were added
        (consistent with, e.g., :attr:`contained_top_level_field_names`). The order is the same as
        in :attr:`field_names_flat` and :attr:`field_types_flat`, so that these can be used to
        obtain information about the individual elements of the obtained sequence of values.

        Only data fields (leaf nodes that are not :class:`SampleDataGroup`) contribute values.
        Data group fields are not included directly, but their data field descendants contribute
        values.

        Note:
            The tuple returned by this function can be used directly to

            - Pass parameters from an input callable/iterable to the DALI pipeline.
            - Return the final output of the DALI pipeline.

            In these cases, the returned sequence can be used to fill the original data structure
            (using :meth:`set_data` or :meth:`set_data_from_dali_generic_iterator_output`) into a
            :class:`SampleDataGroup` blueprint object with the same format as ``self``.

        Important:
            For string data fields, the values are the underlying uint8 arrays/tensors (or
            DataNodes), not Python ``str`` objects (both inside and outside the DALI pipeline).
            This method is designed to exchange data between :class:`SampleDataGroup` objects and
            directly returns the underlying data, with the encoded strings. The conversion to
            Python ``str`` objects is performed when the data is obtained, e.g. using the indexed
            access operator ``[]`` or :meth:`get_item_in_path`.

        Args:
            as_list_type: If ``True``, return a list (tuple otherwise).

        Returns:
            Sequence of values of all data fields.
        """
        res = []
        for type, name in zip(self.field_top_level_types, self.contained_top_level_field_names):
            if type == SampleDataGroup:
                res_i = self[name].get_data(True)
                res = res + res_i
            else:
                res.append(self._getitem_without_conversions(name))
        if not as_list_type:
            res = tuple(res)
        return res
    def set_data(self, data: Union[tuple, list]):
        '''Set the values of all descendant data fields from a flattened sequence.

        The sequence needs to contain the data in the same order as indicated by
        :attr:`field_names_flat`. If the flat data was obtained by :meth:`get_data` from a
        :class:`SampleDataGroup` object with the same data format as ``self``, this will always be
        the case. The compatibility between the object from which the flattened data was obtained
        and this instance can be checked with :meth:`type_matches`.

        Important:
            When setting data in this way, no conversions or mappings are applied (both inside and
            outside the DALI pipeline). This method is designed to exchange data between
            :class:`SampleDataGroup` objects and expects the data as stored in the
            :class:`SampleDataGroup` object (i.e., already converted and with mappings applied) as
            input.

        Args:
            data: Flat sequence of values to use.
        '''
        self._set_data_and_get_num_used_data_elements(data)
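    # Flatten/refill round-trip sketch: pass data through a boundary that only supports flat
    # sequences (e.g. the pipeline interface) and reconstruct it afterwards:
    #
    #   flat = sample.get_data()               # tuple of leaf values, depth-first order
    #   clone = sample.get_empty_like_self()   # blueprint with the same format
    #   clone.set_data(flat)                   # refill; no conversions/mappings applied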
    def set_data_from_dali_generic_iterator_output(self, data: List[Dict[str, Any]], index: int):
        '''Set values from the output of a DALI generic iterator.

        The DALI generic iterator refers to
        :class:`nvidia.dali.plugin.pytorch.DALIGenericIterator` or any other iterator which
        follows the same interface (tensor types may be from a different framework). The iterator
        (and therefore, the underlying DALI pipeline) must output the flattened data in the same
        format as this instance (using :meth:`get_data`), with the names assigned in the iterator
        to the individual fields matching :attr:`field_names_flat` of this object. The
        compatibility between the object from which the flattened data was obtained and this
        instance can be checked with :meth:`type_matches`.

        See also:
            :meth:`get_like_self_filled_from_iterator_output`

        Note:
            Values for string fields are uint8 arrays/tensors (not Python strings). For details,
            see :meth:`get_data`.

        Args:
            data: Output of the DALI generic iterator.
            index: Index inside ``data`` from which to fill the data.
        '''
        name_order = self.field_names_flat
        data_as_sequence = [data[index][name] for name in name_order]
        self.set_data(data_as_sequence)
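    # Iterator-side sketch, assuming `iterator` is a DALIGenericIterator whose output names were
    # set from `blueprint.field_names_flat` (so the names line up with this object):
    #
    #   for batch in iterator:
    #       sample = blueprint.get_empty_like_self()
    #       sample.set_data_from_dali_generic_iterator_output(batch, index=0)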
    def has_child(self, name: Union[str, int]) -> bool:
        '''Check whether a direct child with the given name exists.

        Args:
            name: Name of the child to check.

        Returns:
            Whether the child exists.
        '''
        res = name in self._values
        return res
    def add_data_field(
        self,
        name: Union[str, int],
        type: types.DALIDataType,
        mapping: Optional[Dict[Union[str, None], Union[int, float, np.number, bool]]] = None,
    ):
        '''Add a data field as a direct child.

        Data field means that the field contains actual data, i.e. it is not another data group
        field (:class:`SampleDataGroup` instance).

        Note:
            If a mapping is defined, it is applied both to strings and to (possibly nested,
            multi-dimensional) sequences of strings (lists/tuples/arrays). The mapping is a
            dictionary from original string values to numeric values. The special key ``None``
            provides a default value for unmatched inputs. The mapping is only applied when data
            is assigned inside the input callable/iterable or outside the DALI pipeline. The
            mapping is not performed for assignments inside the actual DALI pipeline (and setting
            data there is only supported directly using numerical values).

        Note:
            As an alternative to using a mapping, strings can be directly assigned to data fields
            by setting the data type to :class:`nvidia.dali.types.DALIDataType.STRING`. However,

            - String processing in this way is only supported inside the input callable/iterable
              and outside the DALI pipeline, and such strings appear as uint8 tensors inside the
              DALI pipeline.
            - Only single strings can be assigned, not sequences of strings (although outputting
              1D sequences of strings is supported to enable output of batch-wise data).
            - Often, using a mapping is advantageous to meaningfully process the data in the
              pipeline, and it may be needed for other reasons anyway (e.g. to convert class
              labels from strings to integers to be used in the loss computation).

            This way of handling strings is useful, e.g., to pass sample tags or other high-level
            descriptors through the pipeline.

        Args:
            name: Name of the field to add.
            type: Type of (the elements of) the field to add. If a mapping is used, this is the
                type after the mapping is applied.
            mapping: Mapping from input string values to numerical values. The conversion from
                string to numeric happens at data assignment (if applying mappings is not
                disabled). ``None`` can be added as a key to the mapping. In this case, the
                respective value is used if the input string(s) do not match any of the other
                keys. The mapping is applied both if a single string is assigned and for
                (n-dimensional) sequences of strings. Note that if a mapping is set, numeric
                values can still be assigned directly to the data field as an alternative to
                strings.
        '''
        assert not isinstance(type, SampleDataGroup), (
            "The method add_data_field() cannot be used to add data group fields (type: "
            "SampleDataGroup). Use add_data_group_field() instead."
        )
        assert (
            type != types.DALIDataType.STRING or mapping is None
        ), "Cannot set a mapping for data fields of type types.DALIDataType.STRING"
        if name in self._value_order:
            raise KeyError(f"Field '{name}' cannot be added as it already exists.")
        self._value_order = self._value_order + (name,)
        self._types_order = self._types_order + (type,)
        self._values[name] = None
        self._types[name] = type
        if mapping is not None:
            self._mappings[name] = mapping
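    # Mapping sketch (labels and values are illustrative):
    #
    #   obj.add_data_field("class_id", types.DALIDataType.INT32,
    #                      mapping={"car": 0, "person": 1, None: -1})
    #   obj["class_id"] = "person"             # stored as np.int32(1)
    #   obj["class_id"] = ["car", "bicycle"]   # stored as [0, -1]; "bicycle" uses the None key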
    def add_data_group_field(self, name: str, blueprint_sample_data_group: SampleDataGroup):
        '''Add a data group field as a direct child.

        Data group field means a child of the type :class:`SampleDataGroup`, which itself can
        contain data fields and/or data group fields. Data group fields are used to group elements
        together logically.

        ``blueprint_sample_data_group`` acts as a blueprint. A new empty instance with the same
        format is created and added as the child. Values can be assigned later directly (or via
        :meth:`set_item_in_path`).

        Args:
            name: Name of the new field.
            blueprint_sample_data_group: :class:`SampleDataGroup` instance describing the format
                of the field to add.
        '''
        if name in self._value_order:
            raise KeyError(f"Field '{name}' cannot be added as it already exists.")
        self._value_order = self._value_order + (name,)
        self._types_order = self._types_order + (SampleDataGroup,)
        to_add = blueprint_sample_data_group.get_empty_like_self()
        to_add.set_apply_mapping(self._do_apply_mapping)
        to_add.set_do_convert(self._do_convert)
        self._values[name] = to_add
        self._types[name] = SampleDataGroup
    def add_data_field_array(
        self,
        name: str,
        type: types.DALIDataType,
        num_fields: int,
        mapping: Optional[Dict[Union[str, None], Union[int, float, np.number, bool]]] = None,
    ):
        '''Add a data field array.

        Add a child data group field (type :class:`SampleDataGroup`) that contains ``num_fields``
        elements, each with the type and mapping defined here. Elements are added with integer
        names from ``0`` to ``num_fields - 1``, so the child behaves like an array.

        Note:
            If a blueprint of the array has already been created as another, independent
            blueprint, you can use :meth:`add_data_group_field` to add the blueprint to this
            object.

        See also:
            :meth:`add_data_group_field_array`
            :meth:`create_data_field_array`
            :meth:`create_data_group_field_array`

        Args:
            name: Name of the array data group field to add.
            type: Type of the fields to add to the array data group field.
            num_fields: Number of fields to add to the array data group field.
            mapping: Optional mapping for the fields (see :meth:`add_data_field` for details on
                mappings).
        '''
        data_group_to_add = self.create_data_field_array(type, num_fields, mapping)
        self.add_data_group_field(name, data_group_to_add)
    def add_data_group_field_array(
        self, name: str, blueprint_sample_data_group: SampleDataGroup, num_fields: int
    ):
        '''Add a data group field array.

        Add a child data group field (type :class:`SampleDataGroup`) that contains ``num_fields``
        elements, each matching the provided blueprint. Elements are added with integer names from
        ``0`` to ``num_fields - 1``, so the child behaves like an array.

        Note:
            If a blueprint of the array has already been created as another, independent
            blueprint, you can use :meth:`add_data_group_field` to add the blueprint to this
            object.

        See also:
            :meth:`add_data_field_array`
            :meth:`create_data_field_array`
            :meth:`create_data_group_field_array`

        Args:
            name: Name of the array data group field to add.
            blueprint_sample_data_group: :class:`SampleDataGroup` describing the element format
                (each element is initialized from ``get_empty_like_self()`` of the blueprint).
            num_fields: Number of elements to add.
        '''
        data_group_to_add = self.create_data_group_field_array(blueprint_sample_data_group, num_fields)
        self.add_data_group_field(name, data_group_to_add)
    def remove_field(self, name: Union[str, int]):
        '''Delete the direct child with the given name.

        See also:
            :meth:`remove_all_occurrences`

        Args:
            name: Name of the child to remove.
        '''
        if not name in self._value_order:
            raise KeyError(f"Cannot delete field '{name}' as it is not present.")
        index = self._value_order.index(name)
        self._value_order = self._value_order[0:index] + self._value_order[index + 1 :]
        self._types_order = self._types_order[0:index] + self._types_order[index + 1 :]
        if name in self._mappings:
            self._mappings.pop(name)
        self._values.pop(name)
        self._types.pop(name)
    def remove_all_occurrences(self, name_to_remove: Union[str, int]):
        '''Remove all fields with a given name.

        All fields with the given name are removed in the tree of which ``self`` is the root, i.e.
        from this node and its descendants.

        See also:
            :meth:`remove_field`

        Args:
            name_to_remove: Name of the field(s) to remove.
        '''
        # If a child with the matching name exists, remove it
        if self.has_child(name_to_remove):
            self.remove_field(name_to_remove)
        # Also make sure to remove in children (recursively)
        for type, name in zip(self._types_order, self._value_order):
            if type == SampleDataGroup:
                self[name].remove_all_occurrences(name_to_remove)
    def find_all_occurrences(self, name_to_find: Union[str, int]) -> Tuple[Tuple[Union[str, int]]]:
        '''Find all occurrences of fields with a given name.

        The search is performed in the tree of which ``self`` is the root, i.e. in this node and
        its descendants.

        See also:
            :meth:`get_num_occurrences`

        Args:
            name_to_find: Name of the field(s) to find.

        Returns:
            Paths to the found fields. If none were found, an empty tuple is returned. The
            individual paths are themselves tuples. For example, the path
            ``("name_1", "name_2", "name_3")`` would denote the element
            ``self["name_1"]["name_2"]["name_3"]``.
        '''
        res = []
        self._find_all_occurrences_rec(name_to_find, [], res)
        # Convert the individual paths (and the overall result) to tuples, as promised by the
        # documented return type
        return tuple(tuple(r) for r in res)
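    # Occurrence-search sketch (the field name is illustrative): rewrite every field named
    # "bounding_boxes", wherever it appears in the tree:
    #
    #   for path in data.find_all_occurrences("bounding_boxes"):
    #       boxes = data.get_item_in_path(path)
    #       data.set_item_in_path(path, boxes)  # e.g. assign a transformed version instead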
    def get_num_occurrences(self, name_to_find: Union[str, int]) -> int:
        '''Get the number of occurrences of fields with a given name.

        Returns the number of occurrences in the tree of which ``self`` is the root, i.e. in this
        node and its descendants.

        See also:
            :meth:`find_all_occurrences`

        Args:
            name_to_find: Name to search for.

        Returns:
            Number of occurrences.
        '''
        occurrences = self.find_all_occurrences(name_to_find)
        num_occ = len(occurrences)
        return num_occ
    def change_type_of_data_and_remove_data(
        self,
        path: Union[Tuple[Union[str, int]], str, int],
        new_type: Union[types.DALIDataType, SampleDataGroup],
        new_mapping: Optional[Dict[Union[str, None], Union[int, float, np.number, bool]]] = None,
    ):
        """Change the type of a child field and remove its data.

        The data is removed as it is incompatible with the new type. Note that removing the data
        means resetting the reference, not actively deleting the data.

        Example:
            A typical use case would be:

            1) Get the data of which the type should be changed, e.g.:
               ``data = obj["name"]``
            2) Change the data type:

               a) Change the data type as stored in the structure, e.g.:
                  ``obj.change_type_of_data_and_remove_data("name", types.DALIDataType.FLOAT)``
               b) Convert the actual data, e.g.:
                  ``data = fn.cast(data, dtype=types.DALIDataType.FLOAT)``

            3) Write the data back, e.g.: ``obj["name"] = data``

            Note that instead of ``"name"``, a nested path can be used.

        Args:
            path: Either a child name or a nested path (sequence of names).
            new_type: For data fields, a :class:`types.DALIDataType`. For data group fields, a
                :class:`SampleDataGroup` used as a blueprint describing the new format.
            new_mapping: New mapping for data fields (see :meth:`add_data_field`). Must be
                ``None`` for data group fields.
        """
        old_element = self.get_item_in_path(path)
        assert isinstance(new_type, SampleDataGroup) == isinstance(old_element, SampleDataGroup), (
            "Data group field type can only be changed to another data group field type and data "
            "field type only to another data field type."
        )
        parent = self.get_parent_of_path(path)
        if not self.path_is_single_name(path):
            name = path[-1]
        else:
            name = path
        if isinstance(new_type, SampleDataGroup):
            assert new_mapping is None, (
                "When changing the type of a data group field (i.e. a SampleDataGroup node), "
                "`new_mapping` has to be `None`"
            )
            parent._change_data_group_field_type_to(name, new_type)
        else:
            element_idx = parent._value_order.index(name)
            parent._values[name] = None
            type_order_to_set = list(parent._types_order)
            type_order_to_set[element_idx] = new_type
            parent._types_order = tuple(type_order_to_set)
            parent._types[name] = new_type
            if new_mapping is not None:
                parent._mappings[name] = new_mapping
            elif name in parent._mappings:
                del parent._mappings[name]
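    # Type-change sketch, following the three steps from the docstring above (the field name is
    # illustrative; fn.cast runs inside the DALI pipeline):
    #
    #   value = obj.get_item_in_path(("camera", "timestamp"))
    #   obj.change_type_of_data_and_remove_data(("camera", "timestamp"), types.DALIDataType.FLOAT)
    #   value = fn.cast(value, dtype=types.DALIDataType.FLOAT)
    #   obj.set_item_in_path(("camera", "timestamp"), value)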
    def get_flat_index_first_discrepancy_to_other(self, other: SampleDataGroup) -> int:
        """Get the first flat index where two instances differ in field structure, name, or type.

        Compares the flattened field names and types (see :attr:`field_names_flat`,
        :attr:`field_types_flat`). The flattened names include full paths, making structural
        differences visible. Empty sample data group nodes (no data field descendants) are
        ignored.

        Args:
            other: Other SampleDataGroup instance to compare to.

        Returns:
            Index at which the first difference is present, or -1 if there are no differences.
            Note that string fields are compared as :class:`nvidia.dali.types.DALIDataType.UINT8`
            in the flattened types, matching :attr:`field_types_flat`.
        """
        self_types = self.field_types_flat
        self_names = self.field_names_flat
        other_types = other.field_types_flat
        other_names = other.field_names_flat
        types_match = self_types == other_types
        names_match = self_names == other_names
        if types_match and names_match:
            return -1
        length = np.min([len(self_types), len(other_types)])
        for i in range(length):
            if self_names[i] != other_names[i] or self_types[i] != other_types[i]:
                return i
        # If none of the other return statements were executed, the lengths differ. In this case,
        # 'length' is the first index at which the SampleDataGroup instances differ, as it is the
        # length of the shorter one and therefore points to the first element in the longer one
        # for which there is no correspondence in the shorter one.
        return length
    def ensure_uniform_size_in_batch(self, fill_value: Union[int, float]):
        '''For each data field, ensure uniform size in the batch by padding with ``fill_value``.

        This is equivalent to calling ``dali.fn.pad(field_values)`` for all contained data fields
        (in this data group field, and its descendants).

        Warning:
            - This method needs to be called inside the DALI pipeline (except the input
              callable/iterable).
            - Scalar (i.e. 0D) tensors are not supported. If such tensors are present, an error
              will be raised.

        Args:
            fill_value: Fill value to be used for the padded region.
        '''
        for type, name in zip(self._types_order, self._value_order):
            if type == SampleDataGroup:
                # Recursively apply to SampleDataGroup children
                self[name].ensure_uniform_size_in_batch(fill_value)
            else:
                self._values[name] = fn.pad(self._values[name], fill_value=fill_value)
    def ensure_uniform_size_in_batch_for_all_strings(self):
        '''Ensure uniform size in the batch for all string data fields.

        This is useful before outputting from the DALI pipeline in a format that expects uniform
        size. A padding with 0-values is performed for all string data fields. This is done for
        all contained string data fields (in this data group field, and its descendants).

        Note:
            When obtaining the data as strings, the padding is removed and only the actual data is
            returned.
        '''
        for type, name in zip(self._types_order, self._value_order):
            if type == types.DALIDataType.STRING:
                self._values[name] = fn.pad(self._values[name], fill_value=0)
            elif type == SampleDataGroup:
                self[name].ensure_uniform_size_in_batch_for_all_strings()
    def is_data_field(self, name: Union[str, int]) -> bool:
        '''Check whether a child field is a data field.

        Args:
            name: Name of the child field to check.

        Returns:
            Whether the child field is a data field (contains values) as opposed to a data group
            field (field of type :class:`SampleDataGroup`).
        '''
        if not name in self._value_order:
            raise KeyError(f"No element with name '{name}' is present.")
        is_leaf = not (self._types[name] == SampleDataGroup)
        return is_leaf
    def is_data_group_field(self, name: Union[str, int]) -> bool:
        '''Check whether a child field is a data group field.

        Args:
            name: Name of the child field to check.

        Returns:
            Whether the child field is a data group field (field of type
            :class:`SampleDataGroup`).
        '''
        return not self.is_data_field(name)
    def to_dictionary(self) -> dict:
        '''Get a nested dictionary with the same (nested) data structure and the contained values.

        This and descendant :class:`SampleDataGroup` objects are converted to :class:`dict`
        objects. Contained strings are returned as Python strings.

        Returns:
            Resulting dictionary.
        '''
        res = {}
        for name, type in zip(self._value_order, self._types_order):
            if type == SampleDataGroup:
                res[name] = self[name].to_dictionary()
            else:
                res[name] = self[name]
        return res
    @staticmethod
    def get_numpy_type_for_dali_type(dali_type: types.DALIDataType) -> type:
        '''Get the numpy dtype corresponding to a DALI data type.

        Note:
            Only numeric and boolean DALI types are supported. A ``ValueError`` is raised for
            unsupported types.
        '''
        if not dali_type in SampleDataGroup._type_mapping:
            raise ValueError(
                f"The DALI type ({dali_type}) does not have a corresponding numpy type set in "
                "SampleDataGroup"
            )
        res = SampleDataGroup._type_mapping[dali_type]
        return res
    def check_has_children(
        self,
        data_field_children: Optional[Union[Sequence[Union[str, int]], str, int]] = None,
        data_group_field_children: Optional[Union[Sequence[Union[str, int]], str, int]] = None,
        data_field_array_children: Optional[Union[Sequence[Union[str, int]], str, int]] = None,
        data_group_field_array_children: Optional[Union[Sequence[Union[str, int]], str, int]] = None,
        current_name: Optional[str] = None,
    ):
        '''Check that the required children are present; raise ``ValueError`` if not.

        Convenience helper for validating the presence and kinds of children.

        Args:
            data_field_children: Required child names which must be data fields.
            data_group_field_children: Required child names which must be data group fields.
            data_field_array_children: Required child names which must be arrays of data fields.
            data_group_field_array_children: Required child names which must be arrays of data
                group fields.
            current_name: Name of the current element. Optional, only used to provide clearer
                error messages.

        Raises:
            ValueError: If a required child is not present or is not of the expected kind.
        '''
        if current_name is None:
            name_to_insert = ""
        else:
            name_to_insert = f"'{current_name}'"
        if data_field_children is not None:
            if isinstance(data_field_children, (str, int)):
                data_field_children = [data_field_children]
            for dfc in data_field_children:
                if not self.has_child(dfc):
                    raise ValueError(f"Data Group field {name_to_insert} does not have child `{dfc}`.")
                if not self.is_data_field(dfc):
                    raise ValueError(f"Data Group field {name_to_insert}: child `{dfc}` is not a data field.")
        if data_group_field_children is not None:
            if isinstance(data_group_field_children, (str, int)):
                data_group_field_children = [data_group_field_children]
            for dgfc in data_group_field_children:
                if not self.has_child(dgfc):
                    raise ValueError(f"Data Group field {name_to_insert} does not have child `{dgfc}`.")
                if not self.is_data_group_field(dgfc):
                    raise ValueError(
                        f"Data Group field {name_to_insert}: child `{dgfc}` is not a data group field."
                    )
        if data_field_array_children is not None:
            if isinstance(data_field_array_children, (str, int)):
                data_field_array_children = [data_field_array_children]
            for dfca in data_field_array_children:
                if not self.has_child(dfca):
                    raise ValueError(f"Data Group field {name_to_insert} does not have child `{dfca}`.")
                if not (self.is_data_group_field(dfca) and self[dfca].is_data_field_array()):
                    raise ValueError(
                        f"Data Group field {name_to_insert}: child `{dfca}` is not a data field array."
                    )
        if data_group_field_array_children is not None:
            if isinstance(data_group_field_array_children, (str, int)):
                data_group_field_array_children = [data_group_field_array_children]
            for dgfca in data_group_field_array_children:
                if not self.has_child(dgfca):
                    raise ValueError(f"Data Group field {name_to_insert} does not have child `{dgfca}`.")
                if not (self.is_data_group_field(dgfca) and self[dgfca].is_data_group_field_array()):
                    raise ValueError(
                        f"Data Group field {name_to_insert}: child `{dgfca}` is not a data group field array."
                    )
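    # Validation sketch for a pipeline step's expected input (field names are illustrative):
    #
    #   camera_group.check_has_children(
    #       data_field_children=["image"],
    #       data_group_field_children=["annotations"],
    #       data_field_array_children=["scores"],
    #       current_name="camera",
    #   )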
    # ----- Private helper functions from here on -----

    def _to_string_with_indent(self, indent: int, with_details: bool) -> str:
        ident_string = " " * indent
        res_str = ""
        space_details = " " * 2
        for type, name in zip(self._types_order, self._value_order):
            if type == SampleDataGroup:
                res_str += (
                    f"{ident_string}{name}:\n{ident_string}"
                    + "{"
                    + f"\n{self[name]._to_string_with_indent(indent + 2, with_details)}"
                )
                res_str += ident_string + "}\n"
            else:
                res_str += f"{ident_string}{name}: {str(self._types[name])}{space_details} " + (
                    f"(is set: {self._values[name] is not None}; "
                    f"mapping available: {name in self._mappings})\n"
                    if with_details
                    else "\n"
                )
        return res_str

    def _get_contained_field_names_flat(self, prefix: str) -> List[str]:
        res = []
        for type, name in zip(self.field_top_level_types, self.contained_top_level_field_names):
            if type == SampleDataGroup:
                if isinstance(name, str):
                    prefix_to_add = f"{name}."
                else:
                    prefix_to_add = f"[{name}]."
                res_i = self[name]._get_contained_field_names_flat(prefix + prefix_to_add)
                res = res + res_i
            else:
                if isinstance(name, str):
                    name_to_use = prefix + name
                else:
                    name_to_use = prefix + f"[{name}]"
                res.append(name_to_use)
        return res

    def _get_contained_types_flat(self) -> List[types.DALIDataType]:
        res = []
        for type, name in zip(self.field_top_level_types, self.contained_top_level_field_names):
            if type == SampleDataGroup:
                res_i = self[name]._get_contained_types_flat()
                res = res + res_i
            elif type == types.DALIDataType.STRING:
                res.append(types.DALIDataType.UINT8)
            else:
                res.append(type)
        return res

    def _set_data_and_get_num_used_data_elements(self, data: Union[tuple, list]) -> int:
        curr_element = 0
        for type, name in zip(self.field_top_level_types, self.contained_top_level_field_names):
            if type == SampleDataGroup:
                num_elements_used = self[name]._set_data_and_get_num_used_data_elements(data[curr_element:])
                curr_element += num_elements_used
            else:
                self._setitem_without_conversions(name, data[curr_element])
                curr_element += 1
        return curr_element

    def _find_all_occurrences_rec(
        self,
        name_to_find: Union[str, int],
        prefix: List[Union[str, int]],
        results_ref: List[List[Union[str, int]]],
    ):
        if name_to_find in self._value_order:
            # A copy is necessary as otherwise, the prefix used by the outer recursion levels
            # would be altered
            path = copy.deepcopy(prefix)
            path.append(name_to_find)
            results_ref.append(path)
        for type, name in zip(self._types_order, self._value_order):
            if type == SampleDataGroup:
                # Copy the prefix to not modify the original, which is still needed by the caller
                prefix_for_next = copy.deepcopy(prefix)
                # Add the current child, for which we are going to call
                # `_find_all_occurrences_rec(...)`, to the prefix
                prefix_for_next.append(name)
                # Call `_find_all_occurrences_rec(...)` of the current child
                self[name]._find_all_occurrences_rec(name_to_find, prefix_for_next, results_ref)

    def _get_copy_except_values(self) -> SampleDataGroup:
        # First, make a shallow copy to obtain the object itself.
        res = copy.copy(self)
        # Then, deep-copy the individual properties where possible & needed
        res._mappings = copy.deepcopy(self._mappings)
        res._value_order = copy.deepcopy(self._value_order)
        res._types_order = copy.deepcopy(self._types_order)
        res._types = copy.deepcopy(self._types)
        # 'res._values' should not be filled in this function. Set it to 'None' as otherwise, it
        # is a shallow copy of 'self._values'
        res._values = None
        return res

    def _apply_mapping_check_and_convert(self, name: Union[str, int], value: Any) -> Any:
        if self._do_apply_mapping:
            value = self._apply_mapping_if_set(name, value)
        res = self._check_or_convert_types(name, value)
        return res

    def _apply_mapping_if_set(self, name: Union[str, int], data: Any) -> Any:
        if name in self._mappings:
            if isinstance(data, SampleDataGroup):
                warnings.warn(
                    "Mapping cannot be applied inside the DALI pipeline; call "
                    "set_apply_mapping(False) first to disable. Proceeding without mapping."
                )
                res = data
            else:
                res = get_mapped(data, self._mappings[name])
        else:
            res = data
        return res

    def _check_or_convert_types(self, name: Union[str, int], data: Any) -> Any:
        # Get the expected type of the data field
        dali_type = self._types[name]
        # Only perform runtime type checking inside the DALI pipeline when explicitly enabled.
        # Skipping this preserves tensor layout metadata (important for steps like
        # AxesLayoutSetter).
        if self._do_check_type:
            # Support both regular and debug-mode DALI nodes
            is_data_node = isinstance(data, getattr(dali.data_node, "DataNode", ())) or isinstance(
                data, getattr(getattr(dali, "_debug_mode", object()), "DataNodeDebug", ())
            )
            # If we are inside the DALI pipeline, we need to check that the data type is correct
            # (regardless of the `_do_convert` flag).
            if is_data_node:
                # Ensure the check op is part of the graph by using its output
                res = check_type(data, self._type_mapping[dali_type], name)
                return res
        if not self._do_convert:
            return data
        # If the set element is a data group, there is no conversion needed
        if dali_type == SampleDataGroup:
            return data
        np_type = self._type_mapping[dali_type]
        # Note that `numbers.Number` includes Booleans, but not `np.bool_`, so do not check for
        # Python booleans explicitly, but check for `np.bool_`
        if (
            isinstance(data, list)
            or isinstance(data, tuple)
            or isinstance(data, np.ndarray)
            or isinstance(data, np.matrix)
            or isinstance(data, numbers.Number)
            or isinstance(data, np.bool_)
        ):
            data = np.array(data, dtype=np_type)
        elif isinstance(data, cupy.ndarray):
            data = cupy.array(data, dtype=np_type)
        return data

    def _convert_from_string(
        self, data: Union[dali.pipeline.DataNode, str, Sequence[str], None]
    ) -> Union[dali.pipeline.DataNode, np.ndarray, None]:
        if isinstance(data, dali.pipeline.DataNode):
            res = data
        elif isinstance(data, str):
            as_bytes = data.encode("utf-8")
            res = np.frombuffer(as_bytes, dtype=np.uint8)
        elif data is None:
            res = None
        else:
            raise ValueError(f"Expected a string or a DataNode, but got {type(data)}")
        return res

    def _convert_to_string(
        self, data: Union[dali.pipeline.DataNode, np.ndarray, cupy.ndarray, torch.Tensor, None]
    ) -> Union[dali.pipeline.DataNode, str, List[str], None]:
        if isinstance(data, dali.pipeline.DataNode):
            return data
        if isinstance(data, np.ndarray):
            np_data = data
        elif isinstance(data, cupy.ndarray):
            np_data = np.array(data.get())
        elif isinstance(data, torch.Tensor):
            np_data = data.detach().cpu().numpy()
        elif data is None:
            return None
        else:
            raise ValueError(
                f"Expected a numpy array, cupy array, a torch tensor, or a DataNode, but got {type(data)}"
            )
        # If this is an encoded string, the first element will contain a number. If it is a
        # (possibly nested) sequence of strings, the first element will again be a sequence (and
        # the else branch is executed)
        if isinstance(np_data[0], np.number):
            as_bytes = np_data.tobytes().strip(b'\x00')
            res = str(as_bytes.decode("utf-8"))
        else:
            # The elements are themselves (possibly nested) sequences of strings (see the comment
            # above the `if`). In this case, process each entry (recursively, until the actual
            # strings are reached).
            res = [self._convert_to_string(d) for d in np_data]
        return res

    def _change_data_group_field_type_to(self, name: Union[str, int], value: SampleDataGroup):
        if self._types[name] != SampleDataGroup:
            raise ValueError("Called _change_data_group_field_type_to() for a non-SampleDataGroup element.")
        blueprint = value.get_empty_like_self()
        # Propagate the current settings to the new blueprint
        blueprint.set_apply_mapping(self._do_apply_mapping)
        blueprint.set_do_convert(self._do_convert)
        self._values[name] = blueprint

    def _setitem_without_conversions(self, name: Union[str, int], value: Any):
        if not name in self._values:
            raise KeyError(f"No field with name '{name}'")
        if self._types[name] == SampleDataGroup and not self[name].type_matches(value):
            raise KeyError(
                f"Tried to set a data group field '{name}' (fields of type SampleDataGroup), but "
                "types do not match."
            )
        self._values[name] = value

    def _getitem_without_conversions(self, name: Union[str, int]) -> Any:
        if not name in self._values:
            raise KeyError(f"No field with name '{name}'")
        return self._values[name]