Creates a FASTA file with the given number of sequences, sequence length, and repeating DNA pattern. Each contig uses a shifted version of the repeating pattern.
Source code in bionemo/testing/data/fasta.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def create_fasta_file(
    fasta_file_path: Path,
    num_sequences: int,
    sequence_length: int | None = None,
    sequence_lengths: list[int] | None = None,
    repeating_dna_pattern: str = ALU_SEQUENCE,
    max_line_length: int = 80,
) -> Path:
    """Create a FASTA file of synthetic contigs built from a repeating DNA pattern.

    Contig ``i`` is written under the header ``>contig_{i}``. Its sequence is the
    pattern rotated left by ``i`` positions (wrapping around the pattern length),
    tiled and truncated to the requested length, then folded into lines of at
    most ``max_line_length`` characters.

    Args:
        fasta_file_path: Destination path; an existing file is overwritten.
        num_sequences: Number of contigs to write.
        sequence_length: Length used for every contig when ``sequence_lengths``
            is not given.
        sequence_lengths: Per-contig lengths; must contain exactly
            ``num_sequences`` entries. Takes precedence over ``sequence_length``
            when both are given.
        repeating_dna_pattern: Pattern that is rotated and tiled to fill each contig.
        max_line_length: Maximum number of sequence characters per output line.

    Returns:
        ``fasta_file_path``, as passed in.

    Raises:
        AssertionError: If neither length argument is given, if
            ``sequence_lengths`` does not have ``num_sequences`` entries, or if
            any requested length is negative.
        ValueError: If ``max_line_length`` is not positive, or if the pattern is
            empty while a non-empty contig is requested.
    """
    # Validate everything before opening the file so that a bad call never
    # truncates an existing file or leaves a partially written one behind.
    # The pre-existing checks stay as assertions so callers keep seeing
    # AssertionError for them.
    assert sequence_length is not None or sequence_lengths is not None, (
        "either sequence_length or sequence_lengths must be provided"
    )
    if sequence_lengths is not None:
        assert len(sequence_lengths) == num_sequences, (
            f"expected {num_sequences} sequence lengths, got {len(sequence_lengths)}"
        )
        contig_lengths = list(sequence_lengths)
    else:
        contig_lengths = [sequence_length] * num_sequences
    assert all(length >= 0 for length in contig_lengths), "sequence lengths must be non-negative"

    if max_line_length <= 0:
        # A negative step would silently yield no lines at all (empty contigs).
        raise ValueError(f"max_line_length must be positive, got {max_line_length}")

    pattern_length = len(repeating_dna_pattern)
    if pattern_length == 0 and any(contig_lengths):
        raise ValueError("repeating_dna_pattern must not be empty when a non-empty contig is requested")

    with open(fasta_file_path, "w") as f:
        for contig_index, contig_length in enumerate(contig_lengths):
            if contig_length == 0:
                # Nothing to tile; also keeps an empty pattern from dividing by zero.
                contig_sequence = ""
            else:
                # Rotate the pattern for this contig. The modulo keeps the
                # rotation going once the index exceeds the pattern length.
                shift = contig_index % pattern_length
                rotated = repeating_dna_pattern[shift:] + repeating_dna_pattern[:shift]
                # Tile whole copies, then top up with a prefix to hit the exact length.
                full_repeats, remainder = divmod(contig_length, pattern_length)
                contig_sequence = rotated * full_repeats + rotated[:remainder]

            # Fold the sequence into lines of at most max_line_length characters.
            folded = "\n".join(
                contig_sequence[start : start + max_line_length]
                for start in range(0, contig_length, max_line_length)
            )
            f.write(f">contig_{contig_index}\n{folded}\n")
    return fasta_file_path
|