Skip to content

Fasta

create_fasta_file(fasta_file_path, num_sequences, sequence_length=None, sequence_lengths=None, repeating_dna_pattern=ALU_SEQUENCE, max_line_length=80)

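Creates a FASTA file containing the given number of sequences (contigs), each of the requested length and filled with a repeating DNA pattern. Lengths are given either as a single `sequence_length` applied to every contig or as a per-contig `sequence_lengths` list. Each contig uses a shifted version of the repeating pattern.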

Source code in bionemo/testing/data/fasta.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def create_fasta_file(
    fasta_file_path: Path,
    num_sequences: int,
    sequence_length: int | None = None,
    sequence_lengths: list[int] | None = None,
    repeating_dna_pattern: str = ALU_SEQUENCE,
    max_line_length: int = 80,
) -> Path:
    """Create a FASTA file of synthetic contigs built from a repeating DNA pattern.

    Contig ``i`` is written under the header ``>contig_{i}``. Its sequence is the
    pattern rotated left by ``i`` positions (wrapping around the pattern length),
    repeated and truncated to the requested length, then folded into lines of at
    most ``max_line_length`` characters.

    All arguments are validated before the output file is opened, so a rejected
    call never creates or truncates ``fasta_file_path``.

    Args:
        fasta_file_path: Destination file; overwritten if it already exists.
        num_sequences: Number of contigs to write.
        sequence_length: Length used for every contig. Ignored when
            ``sequence_lengths`` is given.
        sequence_lengths: Per-contig lengths; must contain exactly
            ``num_sequences`` entries. Takes precedence over ``sequence_length``.
        repeating_dna_pattern: Pattern that is rotated and tiled to fill each contig.
        max_line_length: Maximum number of sequence characters per line.

    Returns:
        ``fasta_file_path``, unchanged.

    Raises:
        AssertionError: If neither ``sequence_length`` nor ``sequence_lengths`` is
            given, if ``len(sequence_lengths) != num_sequences``, or if any
            requested length is negative.
        ValueError: If ``max_line_length`` is less than 1, or if
            ``repeating_dna_pattern`` is empty while a non-empty contig is requested.
    """
    assert sequence_length is not None or sequence_lengths is not None, (
        "either sequence_length or sequence_lengths must be provided"
    )
    if sequence_lengths is not None:
        assert len(sequence_lengths) == num_sequences, (
            f"expected {num_sequences} sequence lengths, got {len(sequence_lengths)}"
        )
        contig_lengths = list(sequence_lengths)
    else:
        contig_lengths = [sequence_length] * num_sequences
    assert all(length >= 0 for length in contig_lengths), "sequence lengths must be non-negative"

    # A zero step makes range() raise and a negative step yields an empty range,
    # which would silently write headers with no sequence data.
    if max_line_length < 1:
        raise ValueError(f"max_line_length must be a positive integer, got {max_line_length}")

    pattern_length = len(repeating_dna_pattern)
    if pattern_length == 0 and any(length > 0 for length in contig_lengths):
        raise ValueError("repeating_dna_pattern must be non-empty to build non-empty sequences")

    with open(fasta_file_path, "w", encoding="utf-8") as f:
        for contig_index, contig_length in enumerate(contig_lengths):
            if contig_length:
                # Wrap the shift so contigs beyond the pattern length keep rotating
                # instead of falling back to the unshifted pattern (slices clamp).
                shift = contig_index % pattern_length
                rotated_pattern = repeating_dna_pattern[shift:] + repeating_dna_pattern[:shift]
                # Tile whole copies of the rotated pattern, then top up with a prefix.
                full_repeats, remainder = divmod(contig_length, pattern_length)
                contig_sequence = rotated_pattern * full_repeats + rotated_pattern[:remainder]
            else:
                contig_sequence = ""
            # Internal sanity check on the construction above.
            assert len(contig_sequence) == contig_length
            # Fold the sequence into lines of at most max_line_length characters.
            folded_sequence = "\n".join(
                contig_sequence[start : start + max_line_length]
                for start in range(0, contig_length, max_line_length)
            )
            f.write(f">contig_{contig_index}\n{folded_sequence}\n")
    return fasta_file_path