Skip to content

Fasta to jsonl

Convert FASTA files to JSONL format for use with inference --prompt-file.

Each FASTA record becomes one JSONL line::

{"id": "sequence_header", "prompt": "ATCGATCG..."}

Usage::

bionemo_fasta_to_jsonl input.fasta output.jsonl
bionemo_fasta_to_jsonl input.fa output.jsonl --upper

This module is used by multiple recipes via bionemo-recipeutils. It must not import megatron-core, megatron-bridge, or NeMo.

fasta_to_jsonl(input_path, output_path, *, uppercase=False)

Convert a FASTA file to JSONL.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `input_path` | `Path` | Path to input FASTA file (.fasta, .fa, .fna, etc.). | required |
| `output_path` | `Path` | Path to output JSONL file. | required |
| `uppercase` | `bool` | If True, convert sequences to uppercase. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `int` | Number of records written. |

Source code in bionemo/recipeutils/io/fasta_to_jsonl.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def fasta_to_jsonl(input_path: Path, output_path: Path, *, uppercase: bool = False) -> int:
    """Convert a FASTA file to JSONL.

    Each FASTA record is written as one JSON object per line with two keys:
    ``"id"`` (the first whitespace-delimited token of the header line, without
    the leading ``>``) and ``"prompt"`` (all sequence lines of the record
    concatenated, with surrounding whitespace removed from each line).

    Blank lines are skipped. Sequence text that appears before the first
    header line is discarded. A header with no identifier (a bare ``>``)
    produces a record whose ``"id"`` is the empty string.

    Args:
        input_path: Path to input FASTA file (.fasta, .fa, .fna, etc.).
        output_path: Path to output JSONL file.
        uppercase: If True, convert sequences to uppercase.

    Returns:
        Number of records written.
    """
    count = 0
    current_id: str | None = None
    sequence_parts: list[str] = []

    def _flush(f) -> None:
        """Write the pending record (if any) to ``f`` and reset the accumulator."""
        nonlocal count, current_id, sequence_parts
        if current_id is not None:
            seq = "".join(sequence_parts)
            if uppercase:
                seq = seq.upper()
            f.write(json.dumps({"id": current_id, "prompt": seq}) + "\n")
            count += 1
        current_id = None
        sequence_parts = []

    # Pin the encoding so results do not depend on the platform locale.
    with open(input_path, encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
        for raw_line in fin:
            stripped = raw_line.strip()
            if not stripped:
                continue
            if stripped.startswith(">"):
                _flush(fout)
                # Only the first token is the id; the rest is a free-text description.
                # An empty header yields no tokens, so fall back to "" instead of
                # raising IndexError.
                header_tokens = stripped[1:].split(maxsplit=1)
                current_id = header_tokens[0] if header_tokens else ""
            else:
                sequence_parts.append(stripped)
        # The last record has no following header to trigger its flush.
        _flush(fout)

    return count

main()

CLI entry point.

Source code in bionemo/recipeutils/io/fasta_to_jsonl.py
 94
 95
 96
 97
 98
 99
100
101
def main() -> None:
    """Run the command-line converter.

    Parses the CLI arguments, converts the input FASTA file to JSONL, and
    prints how many records were written. If the input path does not exist,
    an error is printed to stderr and the process exits with status 1.
    """
    cli = parse_args()
    source = cli.input
    if source.exists():
        written = fasta_to_jsonl(source, cli.output, uppercase=cli.upper)
        print(f"Wrote {written} record(s) to {cli.output}")
        return
    print(f"Error: input file not found: {source}", file=sys.stderr)
    sys.exit(1)

parse_args()

Parse CLI arguments.

Source code in bionemo/recipeutils/io/fasta_to_jsonl.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments."""
    ap = argparse.ArgumentParser(
        description="Convert a FASTA file to JSONL for use with inference --prompt-file",
    )
    ap.add_argument("input", type=Path, help="Input FASTA file")
    ap.add_argument("output", type=Path, help="Output JSONL file")
    ap.add_argument(
        "--upper",
        action="store_true",
        default=False,
        help="Convert sequences to uppercase",
    )
    return ap.parse_args()