Programmatic Data Preparation
Below, we provide an example function that shows how to prepare a dataset programmatically.
```python
import shutil
from pathlib import Path

import click

from megatron.energon.flavors import BaseWebdatasetFactory


def prepare_one_dataset(path: Path, num_workers: int, template_dir: Path):
    if (path / ".nv-meta" / "dataset.yaml").exists():
        print(f"Dataset {path} already prepared. Skipping.")
        return

    # Fixed settings: index everything into a single "train" split
    tar_index_only = False
    split_parts_ratio = [("train", 1), ("val", 0), ("test", 0)]
    split_parts_patterns = None

    # Collect all tar files below the dataset root
    all_tars = list(path.glob("**/*.tar")) + list(path.glob("**/*.tgz"))
    all_tars = [str(p.relative_to(path)) for p in sorted(all_tars)]
    if len(all_tars) == 0:
        print("Did not find any tar files. Exiting.")
        return

    print(f"Found {len(all_tars)} tar files in total. The first and last ones are:")
    print(f"- {all_tars[0]}")
    print(f"- {all_tars[-1]}")

    def progress_fn(els, length=None):
        # Wrap the iterable in a click progress bar while indexing shards
        with click.progressbar(
            els,
            label="Indexing shards",
            show_pos=True,
            length=length,
        ) as bar:
            for el in bar:
                yield el

    found_types, duplicates = BaseWebdatasetFactory.prepare_dataset(
        path,
        all_tars,
        split_parts_ratio=split_parts_ratio,
        split_parts_patterns=split_parts_patterns,
        progress_fn=progress_fn,
        tar_index_only=tar_index_only,
        shuffle_seed=None,
        workers=num_workers,
    )

    # Copy the dataset.yaml and sample loader templates into the metadata folder
    for file in template_dir.glob("*"):
        shutil.copy(file, path / ".nv-meta" / file.name)
```
Example usage:

First, create a template directory with the `dataset.yaml` file and, optionally, a `sample_loader.py` file. Let's call it `template_dir`.
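A minimal `dataset.yaml` for an image-captioning dataset might look like the following sketch. The `__class__` and the `field_map` entries here are assumptions for illustration; choose the sample type and field names that match the contents of your shards, and add a `sample_loader.py` only if the raw fields need custom decoding:

```yaml
# Hypothetical template: adjust the sample type and field mapping to your data
__module__: megatron.energon
__class__: CaptioningWebdataset
field_map:
  image: jpg
  caption: txt
```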
Then, run the script:
```python
if __name__ == "__main__":
    prepare_one_dataset(Path("/path/to/dataset"), 16, Path("/path/to/template_dir"))
```
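To prepare several datasets in one go, you could replace the `__main__` block above with a small command-line wrapper, reusing the `click` dependency already imported for the progress bar. This is a hypothetical sketch; the command and option names are not part of Megatron Energon:

```python
@click.command()
@click.argument("dataset_paths", nargs=-1, type=click.Path(exists=True, path_type=Path))
@click.option("--num-workers", default=16, help="Number of indexing workers.")
@click.option("--template-dir", type=click.Path(exists=True, path_type=Path), required=True)
def main(dataset_paths, num_workers, template_dir):
    # Prepare each dataset in turn; already-prepared ones are skipped
    # by the existence check inside prepare_one_dataset
    for dataset_path in dataset_paths:
        prepare_one_dataset(dataset_path, num_workers, template_dir)


if __name__ == "__main__":
    main()
```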