Source code for megatron.energon.flavors.crude

# Copyright (c) 2025, NVIDIA CORPORATION.
# SPDX-License-Identifier: BSD-3-Clause

from typing import Any, Callable, Dict, List, Optional, Union

from megatron.energon.epathlib import EPath
from megatron.energon.flavors.webdataset import DefaultDecoderWebdatasetFactory



[docs]
class CrudeSample(dict):
    """Generic, not-yet-processed sample, to be handled later.

    This is a plain ``dict`` subclass that adds no fields or methods of its
    own. It serves as the ``__sample_type__`` of ``CrudeWebdataset``.
    """




[docs]
class CrudeWebdataset(DefaultDecoderWebdatasetFactory[CrudeSample]):
    """Webdataset factory for crude / raw samples.

    The samples are handed through unchanged by the sample loader installed
    here; decoding them is left to the user code, using so-called cookers.

    See the documentation under "Crude Data" for more information.
    """

    # Sample type produced by this dataset.
    __sample_type__ = CrudeSample

    def __init__(
        self,
        path: EPath,
        *,
        subflavors: Optional[Dict[str, Any]] = None,
        part_filter: Union[str, List[str], Callable[[str], bool]] = lambda _: True,
        **kwargs,
    ):
        """
        Constructs a crude webdataset.

        Args:
            path: Root path to the joined datasets.
            subflavors: Subflavors dictionary to set for all loaded samples.
            part_filter: Filter for the tar file parts to load (by dict keys).
                Per the annotation this may be a string, a list of strings, or
                a predicate taking a string. The default accepts everything.
            **kwargs: Additional keyword arguments, forwarded unchanged to the
                parent class constructor. Must not contain ``sample_loader``.

        Raises:
            ValueError: If ``sample_loader`` is passed via ``kwargs``.
        """
        # The sample loader is fixed to the identity function below, so a
        # caller-supplied one is rejected up front with a clear message.
        loader_was_supplied = "sample_loader" in kwargs
        if loader_was_supplied:
            raise ValueError("sample_loader is not allowed to be set when using CrudeWebdataset")

        # Delegate to the parent constructor; the identity loader returns each
        # sample exactly as it was received.
        super().__init__(
            path,
            subflavors=subflavors,
            part_filter=part_filter,
            sample_loader=lambda sample: sample,
            **kwargs,
        )