Tutorial 4: RiboNN Translation Efficiency Prediction with Encodon¶
This notebook demonstrates how to predict mRNA translation efficiency by extracting sequence embeddings from pretrained Encodon models and training a Random Forest regressor on top of them.
Overview¶
- Task: Predict translation efficiency from mRNA sequences
- Dataset: RiboNN dataset with human translation efficiency data
- Model: Pretrained Encodon + Random Forest regressor
- Method: EncodonInference wrapper for embedding extraction
1. Import Libraries and Setup¶
In [2]:
import os
import sys
import warnings
from pathlib import Path
import numpy as np
import polars as pl
import torch
from tqdm import tqdm
warnings.filterwarnings("ignore")
# Visualization
import matplotlib.pyplot as plt
# ML libraries
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Add project paths
sys.path.append("..")
# Import Encodon modules
from src.data.metadata import MetadataFields
from src.inference.encodon import EncodonInference
from src.inference.task_types import TaskTypes
# Fix random seed
torch.manual_seed(42)
np.random.seed(42)
print("✅ Libraries imported successfully!")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
✅ Libraries imported successfully!
PyTorch: 2.8.0a0+34c6371d24.nv25.08
CUDA available: True
2. Load Pretrained Encodon Model¶
In [ ]:
from src.utils.load_checkpoint import download_checkpoint
# download models if necessary
download_checkpoint(
    repo_id="nvidia/NV-CodonFM-Encodon-TE-80M-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1"
)
download_checkpoint(
    repo_id="nvidia/NV-CodonFM-Encodon-TE-600M-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1"
)
download_checkpoint(
    repo_id="nvidia/NV-CodonFM-Encodon-TE-1B-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1"
)
In [ ]:
# Define checkpoint paths (matching the local_dir values used in section 2)
checkpoint_paths = [
    "/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1",
    "/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1",
    "/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1",
]
checkpoint_path = checkpoint_paths[0]
model_loaded = False
if os.path.exists(checkpoint_path):
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Create EncodonInference wrapper
        encodon_model = EncodonInference(
            model_path=checkpoint_path, task_type=TaskTypes.EMBEDDING_PREDICTION, use_transformer_engine=True
        )
        # Configure model
        encodon_model.configure_model()
        encodon_model.to(device)
        encodon_model.eval()
        print(f"✅ Model loaded from: {checkpoint_path}")
        print(f"Device: {device}")
        print(f"Parameters: {sum(p.numel() for p in encodon_model.model.parameters()):,}")
        model_loaded = True
    except Exception as e:
        print(f"Failed to load {checkpoint_path}: {e}")
if not model_loaded:
    print("❌ Could not load the model. Please check the checkpoint path.")
3. Load Dataset¶
In [12]:
import os
import urllib.request
# Configurable dataset path
data_path = "/data/validation/processed/data_with_human_TE_cellline_all_NA_plain.csv"
# Source URL for the TE dataset
te_dataset_url = "https://raw.githubusercontent.com/CenikLab/TE_classic_ML/refs/heads/main/data/data_with_human_TE_cellline_all_NA_plain.csv"
# Ensure parent directory exists
Path(os.path.dirname(data_path)).mkdir(parents=True, exist_ok=True)
# Download if missing
if not os.path.exists(data_path):
    print(f"Downloading TE dataset to {data_path} ...")
    urllib.request.urlretrieve(te_dataset_url, data_path)
    print("Download complete.")
else:
    print(f"Found existing dataset at {data_path}.")
Found existing dataset at /data/validation/processed/data_with_human_TE_cellline_all_NA_plain.csv.
In [13]:
data = pl.read_csv(data_path, separator="\t")
data = data.with_columns(
    [
        pl.struct(["utr5_size", "cds_size", "tx_sequence"])
        .map_elements(
            lambda row: row["tx_sequence"][row["utr5_size"] : row["utr5_size"] + row["cds_size"]], return_dtype=pl.Utf8
        )
        .alias("cds_sequence"),
        pl.struct(["utr5_size", "tx_sequence"])
        .map_elements(lambda row: row["tx_sequence"][: row["utr5_size"]], return_dtype=pl.Utf8)
        .alias("utr5_sequence"),
        pl.struct(["utr5_size", "cds_size", "tx_sequence"])
        .map_elements(lambda row: row["tx_sequence"][row["utr5_size"] + row["cds_size"] :], return_dtype=pl.Utf8)
        .alias("utr3_sequence"),
    ]
).with_row_index("id")
output_path = data_path[:-4] + ".processed.csv"
data.write_csv(output_path)
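Since the slicing above is purely positional, a quick sanity check (not part of the original pipeline) is to confirm that the three pieces reconstruct each transcript:
# Rows where UTR5 + CDS + UTR3 does not reconstruct the transcript (expect 0)
n_mismatched = data.filter(
    pl.col("utr5_sequence") + pl.col("cds_sequence") + pl.col("utr3_sequence") != pl.col("tx_sequence")
).height
print(f"Mismatched rows: {n_mismatched}")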
In [14]:
# Load RiboNN dataset
data_loaded = False
if os.path.exists(output_path):
    try:
        data = pl.read_csv(output_path)
        print(f"✅ Loaded {len(data)} sequences from: {output_path}")
        print(f"Shape: {data.shape}")
        print(f"Key columns: {[col for col in ['id', 'cds_sequence', 'mean_te', 'fold'] if col in data.columns]}")
        data_loaded = True
    except Exception as e:
        print(f"Failed to load {output_path}: {e}")
# Show basic statistics
if data_loaded:
    te_stats = data.select(
        [
            pl.col("mean_te").mean().alias("mean"),
            pl.col("mean_te").std().alias("std"),
            pl.col("mean_te").min().alias("min"),
            pl.col("mean_te").max().alias("max"),
        ]
    )
    print("\nTranslation Efficiency stats:")
    print(f"  Mean: {te_stats['mean'][0]:.4f}")
    print(f"  Range: [{te_stats['min'][0]:.4f}, {te_stats['max'][0]:.4f}]")
✅ Loaded 11153 sequences from: /data/validation/processed/data_with_human_TE_cellline_all_NA_plain.processed.csv
Shape: (11153, 106)
Key columns: ['id', 'cds_sequence', 'mean_te', 'fold']

Translation Efficiency stats:
  Mean: 0.2474
  Range: [-2.6395, 3.4892]
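The cross-validation in section 5 relies on the dataset's fold column, so it can also be worth confirming the folds are reasonably balanced; a minimal check:
# Inspect how sequences are distributed across the predefined CV folds
print(data["fold"].value_counts().sort("fold"))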
4. Data Preprocessing¶
In [15]:
demo_size = 500 # set to len(data) for full dataset
batch_size = 16
# Subsample data while maintaining split proportions using sklearn
if data_loaded and demo_size < len(data):
    print("=== SUBSAMPLING DATA ===")
    sample_fraction = demo_size / len(data)
    _, data = train_test_split(data, test_size=sample_fraction, stratify=data["fold"], random_state=42)
else:
    print(f"Using full dataset: {len(data) if data_loaded else 0} samples")
=== SUBSAMPLING DATA ===
In [16]:
if data_loaded and model_loaded:
    print("=== DATA PREPROCESSING ===")
    data = data.to_pandas()
    sequences = data["cds_sequence"].tolist()
    targets = data["mean_te"].values
    print(f"Processing {len(sequences)} sequences (demo mode)")
    # Extract embeddings
    print("\nExtracting embeddings...")
    all_embeddings = []
    for i in tqdm(range(0, len(sequences), batch_size)):
        batch_seqs = sequences[i : i + batch_size]
        # Prepare batch
        batch_items = []
        for seq in batch_seqs:
            seq = seq.upper().replace("U", "T")
            tokens = encodon_model.tokenizer.tokenize(seq)
            input_ids = encodon_model.tokenizer.convert_tokens_to_ids(tokens)
            # Truncate if needed, leaving room for CLS/SEP
            if len(input_ids) > encodon_model.model.hparams.max_position_embeddings - 2:
                input_ids = input_ids[: encodon_model.model.hparams.max_position_embeddings - 2]
            # Add special tokens
            input_ids = [encodon_model.tokenizer.cls_token_id] + input_ids + [encodon_model.tokenizer.sep_token_id]
            attention_mask = [1] * len(input_ids)
            batch_items.append(
                {
                    MetadataFields.INPUT_IDS: input_ids,
                    MetadataFields.ATTENTION_MASK: attention_mask,
                }
            )
        # Pad batch to the model's maximum sequence length
        max_len = encodon_model.model.hparams.max_position_embeddings
        padded_input_ids = []
        padded_attention_masks = []
        for item in batch_items:
            input_ids = item[MetadataFields.INPUT_IDS]
            attention_mask = item[MetadataFields.ATTENTION_MASK]
            pad_len = max_len - len(input_ids)
            input_ids.extend([encodon_model.tokenizer.pad_token_id] * pad_len)
            attention_mask.extend([0] * pad_len)
            padded_input_ids.append(input_ids)
            padded_attention_masks.append(attention_mask)
        # Create batch tensors
        batch = {
            MetadataFields.INPUT_IDS: torch.tensor(padded_input_ids, dtype=torch.long).to(encodon_model.device),
            MetadataFields.ATTENTION_MASK: torch.tensor(padded_attention_masks, dtype=torch.long).to(
                encodon_model.device
            ),
        }
        # Extract embeddings
        output = encodon_model.extract_embeddings(batch)
        all_embeddings.append(output.embeddings)
    # Combine embeddings
    embeddings = np.vstack(all_embeddings)
    print(f"\n✅ Extracted embeddings: {embeddings.shape}")
else:
    print("❌ Skipping preprocessing - data or model not loaded")
=== DATA PREPROCESSING ===
Processing 500 sequences (demo mode)

Extracting embeddings...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.38it/s]
✅ Extracted embeddings: (500, 1024)
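Embedding extraction is the slow step, so caching the arrays lets you rerun the downstream regression without touching the GPU; a minimal sketch (the /tmp paths are placeholders):
# Cache embeddings and targets so later reruns can skip extraction
np.save("/tmp/encodon_te_embeddings.npy", embeddings)
np.save("/tmp/encodon_te_targets.npy", targets)
# Later: embeddings = np.load("/tmp/encodon_te_embeddings.npy")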
5. Train Random Forest¶
In [17]:
if "embeddings" in locals() and "targets" in locals():
print("=== TRAINING RANDOM FOREST ===")
results = {"r2_scores": [], "pearson_scores": [], "mse_scores": []}
folds = data["fold"].unique()
for fold in folds:
train_idx = data[data["fold"] != fold].index
test_idx = data[data["fold"] == fold].index
X_train, X_test = embeddings[train_idx], embeddings[test_idx]
y_train, y_test = targets[train_idx], targets[test_idx]
# Train Random Forest
rf = RandomForestRegressor(n_estimators=500, max_depth=15, min_samples_split=2, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Calculate metrics
r2 = r2_score(y_test, y_pred)
pearson_r, _ = pearsonr(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
results["r2_scores"].append(r2)
results["pearson_scores"].append(pearson_r)
results["mse_scores"].append(mse)
print(f"Fold {fold}: R² = {r2:.4f}, r = {pearson_r:.4f}")
# Summary statistics
mean_r2 = np.mean(results["r2_scores"])
mean_pearson = np.mean(results["pearson_scores"])
mean_mse = np.mean(results["mse_scores"])
print("\n=== CROSS-VALIDATION RESULTS ===")
print(f"Mean R²: {mean_r2:.4f} ± {np.std(results['r2_scores']):.4f}")
print(f"Mean Pearson r: {mean_pearson:.4f} ± {np.std(results['pearson_scores']):.4f}")
print(f"Mean RMSE: {np.sqrt(mean_mse):.4f}")
# Store final model trained on all data
final_rf = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42, n_jobs=-1)
final_rf.fit(embeddings, targets)
final_predictions = final_rf.predict(embeddings)
else:
print("❌ Cannot train - missing embeddings or targets")
if "embeddings" in locals() and "targets" in locals():
print("=== TRAINING RANDOM FOREST ===")
results = {"r2_scores": [], "pearson_scores": [], "mse_scores": []}
folds = data["fold"].unique()
for fold in folds:
train_idx = data[data["fold"] != fold].index
test_idx = data[data["fold"] == fold].index
X_train, X_test = embeddings[train_idx], embeddings[test_idx]
y_train, y_test = targets[train_idx], targets[test_idx]
# Train Random Forest
rf = RandomForestRegressor(n_estimators=500, max_depth=15, min_samples_split=2, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Calculate metrics
r2 = r2_score(y_test, y_pred)
pearson_r, _ = pearsonr(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
results["r2_scores"].append(r2)
results["pearson_scores"].append(pearson_r)
results["mse_scores"].append(mse)
print(f"Fold {fold}: R² = {r2:.4f}, r = {pearson_r:.4f}")
# Summary statistics
mean_r2 = np.mean(results["r2_scores"])
mean_pearson = np.mean(results["pearson_scores"])
mean_mse = np.mean(results["mse_scores"])
print("\n=== CROSS-VALIDATION RESULTS ===")
print(f"Mean R²: {mean_r2:.4f} ± {np.std(results['r2_scores']):.4f}")
print(f"Mean Pearson r: {mean_pearson:.4f} ± {np.std(results['pearson_scores']):.4f}")
print(f"Mean RMSE: {np.sqrt(mean_mse):.4f}")
# Store final model trained on all data
final_rf = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42, n_jobs=-1)
final_rf.fit(embeddings, targets)
final_predictions = final_rf.predict(embeddings)
else:
print("❌ Cannot train - missing embeddings or targets")
=== TRAINING RANDOM FOREST ===
Fold 0: R² = 0.1847, r = 0.5366
Fold 8: R² = 0.3132, r = 0.6646
Fold 5: R² = 0.3702, r = 0.7084
Fold 7: R² = 0.2903, r = 0.5745
Fold 1: R² = 0.3519, r = 0.6480
Fold 3: R² = 0.3723, r = 0.7051
Fold 9: R² = 0.2300, r = 0.5158
Fold 6: R² = 0.3742, r = 0.6456
Fold 4: R² = 0.3871, r = 0.6730
Fold 2: R² = 0.1689, r = 0.6124

=== CROSS-VALIDATION RESULTS ===
Mean R²: 0.3043 ± 0.0784
Mean Pearson r: 0.6284 ± 0.0637
Mean RMSE: 0.6061
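Note that final_predictions comes from a model trained and evaluated on the same sequences, so it should only be used for illustration. For an honest view of predictive quality you can collect out-of-fold predictions instead; a minimal sketch reusing the variables from the loop above:
# Collect out-of-fold predictions: each sequence is scored by a model
# that never saw it during training
oof_pred = np.zeros_like(targets, dtype=float)
for fold in data["fold"].unique():
    train_idx = data[data["fold"] != fold].index
    test_idx = data[data["fold"] == fold].index
    rf = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42, n_jobs=-1)
    rf.fit(embeddings[train_idx], targets[train_idx])
    oof_pred[test_idx] = rf.predict(embeddings[test_idx])
print(f"Out-of-fold Pearson r: {pearsonr(targets, oof_pred)[0]:.4f}")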
6. Plot Results¶
In [18]:
if "final_predictions" in locals():
fig, axes = plt.subplots(1, 2, figsize=(15, 8))
fig.suptitle("RiboNN Translation Efficiency Prediction Results", fontsize=16)
# Cross-validation performance
axes[0].plot(range(len(results["r2_scores"])), results["r2_scores"], "o-", label="R²")
axes[0].plot(range(len(results["pearson_scores"])), results["pearson_scores"], "s-", label="Pearson r")
axes[0].set_xlabel("Fold")
axes[0].set_ylabel("Score")
axes[0].set_title("Cross-Validation Performance")
# Add mean performance lines
mean_r2 = np.mean(results["r2_scores"])
mean_pearson = np.mean(results["pearson_scores"])
axes[0].axhline(mean_r2, color="blue", linestyle="--", alpha=0.7, label=f"Mean R² = {mean_r2:.3f}")
axes[0].axhline(
mean_pearson, color="orange", linestyle="--", alpha=0.7, label=f"Mean Pearson r = {mean_pearson:.3f}"
)
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Target distribution
axes[1].hist(targets, bins=30, alpha=0.7, edgecolor="black")
axes[1].axvline(targets.mean(), color="red", linestyle="--", label=f"Mean = {targets.mean():.3f}")
axes[1].set_xlabel("Translation Efficiency")
axes[1].set_ylabel("Frequency")
axes[1].set_title("Target Distribution")
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
else:
print("❌ No results to plot")
if "final_predictions" in locals():
fig, axes = plt.subplots(1, 2, figsize=(15, 8))
fig.suptitle("RiboNN Translation Efficiency Prediction Results", fontsize=16)
# Cross-validation performance
axes[0].plot(range(len(results["r2_scores"])), results["r2_scores"], "o-", label="R²")
axes[0].plot(range(len(results["pearson_scores"])), results["pearson_scores"], "s-", label="Pearson r")
axes[0].set_xlabel("Fold")
axes[0].set_ylabel("Score")
axes[0].set_title("Cross-Validation Performance")
# Add mean performance lines
mean_r2 = np.mean(results["r2_scores"])
mean_pearson = np.mean(results["pearson_scores"])
axes[0].axhline(mean_r2, color="blue", linestyle="--", alpha=0.7, label=f"Mean R² = {mean_r2:.3f}")
axes[0].axhline(
mean_pearson, color="orange", linestyle="--", alpha=0.7, label=f"Mean Pearson r = {mean_pearson:.3f}"
)
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Target distribution
axes[1].hist(targets, bins=30, alpha=0.7, edgecolor="black")
axes[1].axvline(targets.mean(), color="red", linestyle="--", label=f"Mean = {targets.mean():.3f}")
axes[1].set_xlabel("Translation Efficiency")
axes[1].set_ylabel("Frequency")
axes[1].set_title("Target Distribution")
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
else:
print("❌ No results to plot")
7. Troubleshooting & Optimization Tips¶
Common Issues and Solutions:¶
1. Model Loading Issues¶
- Problem: Checkpoint not found
- Solution: Update checkpoint paths in section 2
- Check: Verify checkpoint files exist and are accessible (see the quick check below)
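A minimal sketch of that check (assuming checkpoint_path is set as in section 2):
# Quick check: confirm the checkpoint directory exists and peek at its contents
from pathlib import Path
ckpt = Path(checkpoint_path)
print(f"Exists: {ckpt.exists()}")
if ckpt.exists():
    print(sorted(p.name for p in ckpt.iterdir())[:5])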
2. Data Loading Issues¶
- Problem: Dataset not found
- Solution: Update data paths in section 3
- Check: Ensure the processed CSV has the columns this notebook uses (id, cds_sequence, mean_te, fold); see the quick check below
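A minimal sketch of that check, run after loading the data in section 3:
# Verify the processed dataset exposes the columns this notebook relies on
required = {"id", "cds_sequence", "mean_te", "fold"}
missing = required - set(data.columns)
print("Missing columns:", missing if missing else "none")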
3. Memory Issues¶
- Problem: CUDA out of memory
- Solution: Reduce batch_size in preprocessing section
- Alternative: Use CPU by setting device='cpu' (see the sketch below)
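A minimal sketch of the CPU fallback (the batch size of 4 is an arbitrary example value):
# Fall back to CPU and a smaller batch size if CUDA runs out of memory
device = "cpu"
batch_size = 4
encodon_model.to(device)
encodon_model.eval()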
4. Performance Issues¶
- Problem: Low R² scores
- Solutions:
- Try larger models (600M or 1B parameters; see the sketch after this list)
- Implement fine-tuning instead of just embeddings
- Tune Random Forest hyperparameters
- Check data quality and preprocessing
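For the first suggestion, swapping in a larger checkpoint only requires changing the path passed to EncodonInference. A sketch, assuming the 600M checkpoint was downloaded in section 2:
# Reload the wrapper with the 600M checkpoint instead of the 80M one
checkpoint_path = "/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1"
encodon_model = EncodonInference(
    model_path=checkpoint_path, task_type=TaskTypes.EMBEDDING_PREDICTION, use_transformer_engine=True
)
encodon_model.configure_model()
encodon_model.to(device)
encodon_model.eval()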
Optimization Strategies:¶
1. Model Architecture¶
- 80M model: Fast, good for initial experiments
- 600M model: Better performance, moderate cost
- 1B model: Best performance, highest computational cost
2. Hyperparameter Tuning¶
# Try these Random Forest parameters:
rf_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
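One way to run that grid is scikit-learn's GridSearchCV; a minimal sketch (cv=3 and R² scoring are illustrative choices, not part of the tutorial):
from sklearn.model_selection import GridSearchCV
# Exhaustive search over rf_params with 3-fold CV on the extracted embeddings
search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=rf_params,
    scoring="r2",
    cv=3,
    n_jobs=-1,
)
search.fit(embeddings, targets)
print("Best params:", search.best_params_)
print(f"Best CV R²: {search.best_score_:.4f}")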