Bases: BaseModel
Pydantic model class specifying the configuration schema for a preprocessed IndexedDataset (.bin, .idx).
Source code in bionemo/evo2/utils/config.py
```python
class Evo2PreprocessingConfig(BaseModel):
    """Pydantic model class specifying the configuration schema for a preprocessed IndexedDataset (.bin, .idx)."""

    # Paths
    datapaths: list[Path] = []
    output_dir: None | Path = None
    output_prefix: None | str = None
    # Random train/validation/test data split fractions.
    train_split: float = 0.7
    valid_split: float = 0.2
    test_split: float = 0.1
    # Overwrite existing binaries. Otherwise, skip already preprocessed datasets.
    overwrite: bool = False
    # Raw Preprocessing Transforms
    embed_reverse_complement: bool = False
    random_reverse_complement: float = 0.0
    random_lineage_dropout: float = 0.0
    transcribe: None | Literal["transcribe", "back_transcribe"] = None
    force_uppercase: bool = False
    indexed_dataset_dtype: str = "uint8"
    # Tokenization Transforms
    append_eod: bool = True
    enforce_sample_length: None | int = None
    ftfy: bool = False
    # NeMo Tokenizer Configuration
    tokenizer_type: Literal[
        "Byte-Level",
        "HuggingFace",
        "SentencePiece",
        "Regex",
        "Megatron",
        "Tiktoken",
    ] = "Byte-Level"
    vocab_file: None | Path = None
    vocab_size: None | int = 512
    merges_file: None | Path = None
    tokenizer_model_name: None | str = None
    pretrained_tokenizer_model: None | str = None
    special_tokens: None | dict[str, str] = {}
    fast_hf_tokenizer: bool = False
    # Compute Configuration
    # NOTE: If preprocessing a large number of short individual sequences (< 1000 bp), do NOT use
    # multiprocessing (workers > 1), because sequence-level parallel IPC will dominate the preprocessing time!
    workers: int = 1
    preproc_concurrency: int = 100000
    chunksize: int = 1
    # Filters
    drop_empty_sequences: bool = False
    nnn_filter: bool = False
    # RNG
    seed: None | int = None
    # Evo2 Taxonomic Lineage Tags
    # SeqID substring indexing: a sequence whose ID contains "A" (e.g. "ABC") inherits the taxonomy data keyed by "A".
    taxonomy_data: dict[str, Evo2TaxonomyLineage] = {}
    # Periodicity of injecting phylogenetic lineage tags into the sequence prior to tokenization.
    prompt_spacer_length: int = 131072
```
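For illustration, below is a minimal sketch of constructing this config, both directly and from a YAML file. The field names come from the schema above; the file paths, values, and `preprocess_config.yaml` are hypothetical placeholders, and the import path is assumed from the source location shown above.

```python
# A minimal usage sketch (hypothetical values; assumes Evo2PreprocessingConfig
# is importable from bionemo.evo2.utils.config and PyYAML is installed).
from pathlib import Path

import yaml

from bionemo.evo2.utils.config import Evo2PreprocessingConfig

# Construct directly; Pydantic validates field types and fills in defaults.
config = Evo2PreprocessingConfig(
    datapaths=[Path("data/genomes.fasta")],  # hypothetical input file
    output_dir=Path("preprocessed/"),
    output_prefix="evo2_demo",
    train_split=0.8,
    valid_split=0.1,
    test_split=0.1,
    tokenizer_type="Byte-Level",
    workers=1,
    seed=42,
)

# Or load the same fields from a YAML file.
with open("preprocess_config.yaml") as f:
    config = Evo2PreprocessingConfig(**yaml.safe_load(f))

# The three split fractions are independent floats; the schema above shows no
# validator tying them together, so a sanity check at the call site is prudent.
assert abs(config.train_split + config.valid_split + config.test_split - 1.0) < 1e-9
```

Because the splits are plain floats with independent defaults, nothing in the schema itself forces them to sum to 1.0, which is why the example checks this explicitly.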