Programmatic Data Preparation

Below is an example function that prepares a dataset programmatically.

```python
import shutil
from pathlib import Path

import click

from megatron.energon.flavors import BaseWebdatasetFactory


def prepare_one_dataset(path: Path, num_workers: int, template_dir: Path):
    # Skip datasets that were already prepared
    if (path / ".nv-meta" / "dataset.yaml").exists():
        print(f"Dataset {path} already prepared. Skipping.")
        return

    # Fixed settings
    tar_index_only = False
    split_parts_ratio = [("train", 1), ("val", 0), ("test", 0)]
    split_parts_patterns = None

    # Collect all tar files, with paths relative to the dataset root
    all_tars = list(path.glob("**/*.tar")) + list(path.glob("**/*.tgz"))
    all_tars = [str(p.relative_to(path)) for p in sorted(all_tars)]

    if len(all_tars) == 0:
        print("Did not find any tar files. Exiting.")
        return

    print(f"Found {len(all_tars)} tar files in total. The first and last ones are:")
    print(f"- {all_tars[0]}")
    print(f"- {all_tars[-1]}")

    # Wrap the shard-indexing loop in a progress bar
    def progress_fn(els, length=None):
        with click.progressbar(
            els,
            label="Indexing shards",
            show_pos=True,
            length=length,
        ) as bar:
            for el in bar:
                yield el

    found_types, duplicates = BaseWebdatasetFactory.prepare_dataset(
        path,
        all_tars,
        split_parts_ratio=split_parts_ratio,
        split_parts_patterns=split_parts_patterns,
        progress_fn=progress_fn,
        tar_index_only=tar_index_only,
        shuffle_seed=None,
        workers=num_workers,
    )

    # Copy the sample loader and dataset.yaml templates into the generated .nv-meta folder
    for file in template_dir.glob("*"):
        shutil.copy(file, path / ".nv-meta" / file.name)
```

Example usage:

First, create a template directory containing the `dataset.yaml` file and, optionally, a `sample_loader.py` file. Let's call it `template_dir`.
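
For reference, here is a minimal sketch of how such a template directory could be created programmatically. The `write_template` helper and the `dataset.yaml` contents are illustrative assumptions for a captioning-style dataset whose samples pair a `jpg` image with a `txt` caption; adapt `sample_type` and `field_map` to your own data.

```python
from pathlib import Path


def write_template(template_dir: Path) -> None:
    """Hypothetical helper: create a template dir with a minimal dataset.yaml."""
    template_dir.mkdir(parents=True, exist_ok=True)
    # Example dataset.yaml for a captioning dataset (adjust to your sample type)
    (template_dir / "dataset.yaml").write_text(
        "sample_type:\n"
        "  __module__: megatron.energon\n"
        "  __class__: CaptioningSample\n"
        "field_map:\n"
        "  image: jpg\n"
        "  caption: txt\n"
    )
```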

Then, run the script:

```python
if __name__ == "__main__":
    prepare_one_dataset(Path("/path/to/dataset"), 16, Path("/path/to/template_dir"))
```
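
If several datasets live under a common parent directory, the same function can be reused for each of them. A minimal sketch, assuming the hypothetical layout where every immediate subdirectory of `/path/to/datasets` is its own WebDataset:

```python
from pathlib import Path

datasets_root = Path("/path/to/datasets")    # hypothetical parent directory
template_dir = Path("/path/to/template_dir")

# Prepare every immediate subdirectory as a separate dataset;
# already-prepared datasets are skipped by prepare_one_dataset itself.
for dataset_path in sorted(p for p in datasets_root.iterdir() if p.is_dir()):
    prepare_one_dataset(dataset_path, num_workers=16, template_dir=template_dir)
```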