3-Zero-Shot-Mutation-Variant-Clinvar-Synonymous¶
Synonymous ClinVar variants evaluation to assesss models ability to distinguish between synonymous mutations that are pathogenic vs benign.
Prerequisites¶
Run 00-Mutation-Datasets-Preprocessing.ipynb to download required data clinvar_synom.csv.
Dataset Information¶
- Dataset: ClinVar
- Path:
/data/processed/mutation_datasets_latest/clinvar_synom.csv - Task: Zero-shot mutation effect prediction
- Models: Pretrained Encodon models (80M, 600M, 1B)
- Evaluation: Mann-Whitney U test between healthy and disease patients (-log10p) significant test values.
In [1]:
Copied!
#!/usr/bin/env python3
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import torch
from scipy.stats import mannwhitneyu
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader
from tqdm import tqdm
# Add project paths
sys.path.append("..")
# Import Encodon-specific modules
from src.data.metadata import MetadataFields
from src.data.mutation_dataset import MutationDataset, collate_fn
from src.data.preprocess.mutation_pred import mlm_process_item
from src.inference.encodon import EncodonInference
from src.inference.task_types import TaskTypes
print("✅ All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"GPU device: {torch.cuda.get_device_name()}")
#!/usr/bin/env python3
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import torch
from scipy.stats import mannwhitneyu
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader
from tqdm import tqdm
# Add project paths
sys.path.append("..")
# Import Encodon-specific modules
from src.data.metadata import MetadataFields
from src.data.mutation_dataset import MutationDataset, collate_fn
from src.data.preprocess.mutation_pred import mlm_process_item
from src.inference.encodon import EncodonInference
from src.inference.task_types import TaskTypes
print("✅ All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"GPU device: {torch.cuda.get_device_name()}")
/usr/local/lib/python3.12/dist-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you. import pynvml # type: ignore[import]
✅ All libraries imported successfully! PyTorch version: 2.9.0a0+50eac811a6.nv25.09 CUDA available: True GPU device: NVIDIA RTX A6000
In [2]:
Copied!
# Configuration: Data paths and model checkpoints
DATA_INPUT_PATH = "/data/processed/mutation_datasets_latest/clinvar_synom.csv"
OUTPUT_DIR = "/data/validation/encodons_results/clinvar_synom"
# Model checkpoints for inference
MODEL_CHECKPOINTS = {
"80M": "/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1",
"600m": "/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1",
"1B": "/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1",
}
from src.utils.load_checkpoint import download_checkpoint
# download models if necessary
download_checkpoint(
repo_id="nvidia/NV-CodonFM-Encodon-TE-80M-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1"
)
download_checkpoint(
repo_id="nvidia/NV-CodonFM-Encodon-TE-600M-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1"
)
download_checkpoint(
repo_id="nvidia/NV-CodonFM-Encodon-TE-1B-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1"
)
# Inference parameters
BATCH_SIZE = 64
NUM_WORKERS = 4
CONTEXT_LENGTH = 2048
# Configuration: Data paths and model checkpoints
DATA_INPUT_PATH = "/data/processed/mutation_datasets_latest/clinvar_synom.csv"
OUTPUT_DIR = "/data/validation/encodons_results/clinvar_synom"
# Model checkpoints for inference
MODEL_CHECKPOINTS = {
"80M": "/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1",
"600m": "/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1",
"1B": "/data/checkpoints/NV-CodonFM-Encodon-TE-Cdwt-1B-v1",
}
from src.utils.load_checkpoint import download_checkpoint
# download models if necessary
download_checkpoint(
repo_id="nvidia/NV-CodonFM-Encodon-TE-80M-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1"
)
download_checkpoint(
repo_id="nvidia/NV-CodonFM-Encodon-TE-600M-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1"
)
download_checkpoint(
repo_id="nvidia/NV-CodonFM-Encodon-TE-1B-v1", local_dir="/data/checkpoints/NV-CodonFM-Encodon-TE-1B-v1"
)
# Inference parameters
BATCH_SIZE = 64
NUM_WORKERS = 4
CONTEXT_LENGTH = 2048
/home/ubuntu/.local/lib/python3.12/site-packages/huggingface_hub/file_download.py:945: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn(
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
README.md: 0.00B [00:00, ?B/s]
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
README.md: 0.00B [00:00, ?B/s]
Running inference¶
This part can be skipped if the predictions are already in place
In [3]:
Copied!
def load_encodon_inference_model(checkpoint_path: str, device: str = "cuda") -> EncodonInference:
"""
Load pretrained Encodon model using the inference wrapper.
Args:
checkpoint_path: Path to the pretrained checkpoint
device: Device to load model on ('cuda' or 'cpu')
Returns:
EncodonInference object ready for mutation prediction
"""
print(f"Loading Encodon model from: {checkpoint_path}")
# Create inference wrapper
inference_model = EncodonInference(
model_path=checkpoint_path, use_transformer_engine=True, task_type=TaskTypes.MUTATION_PREDICTION
)
# Configure the model (loads checkpoint and tokenizer)
inference_model.configure_model()
inference_model.eval()
print(f"✅ Model loaded successfully on {device}")
print(f"Model parameters: {sum(p.numel() for p in inference_model.model.parameters()):,}")
print(f"Tokenizer vocabulary size: {inference_model.tokenizer.vocab_size}")
return inference_model
model_loaded = False
encodon_models = {}
for size, checkpoint_path in MODEL_CHECKPOINTS.items():
if os.path.exists(checkpoint_path):
try:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_encodon_inference_model(checkpoint_path, device=device)
# Extract model name from path
model_name = os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path)))
display_name = f"EnCodon ({size})"
encodon_models[display_name] = {
"model": model,
"path": checkpoint_path,
"device": device,
"original_name": size,
}
print(f"✅ Successfully loaded {display_name} from: {checkpoint_path}")
model_loaded = True
except Exception as e:
print(f"Failed to load from {checkpoint_path}: {e}")
continue
if not model_loaded:
print("❌ Could not load any Encodon model from the specified paths.")
print("Please ensure a checkpoint exists or update the checkpoint_paths list.")
else:
print(f"\n✅ Loaded {len(encodon_models)} models: {list(encodon_models.keys())}")
def load_encodon_inference_model(checkpoint_path: str, device: str = "cuda") -> EncodonInference:
"""
Load pretrained Encodon model using the inference wrapper.
Args:
checkpoint_path: Path to the pretrained checkpoint
device: Device to load model on ('cuda' or 'cpu')
Returns:
EncodonInference object ready for mutation prediction
"""
print(f"Loading Encodon model from: {checkpoint_path}")
# Create inference wrapper
inference_model = EncodonInference(
model_path=checkpoint_path, use_transformer_engine=True, task_type=TaskTypes.MUTATION_PREDICTION
)
# Configure the model (loads checkpoint and tokenizer)
inference_model.configure_model()
inference_model.eval()
print(f"✅ Model loaded successfully on {device}")
print(f"Model parameters: {sum(p.numel() for p in inference_model.model.parameters()):,}")
print(f"Tokenizer vocabulary size: {inference_model.tokenizer.vocab_size}")
return inference_model
model_loaded = False
encodon_models = {}
for size, checkpoint_path in MODEL_CHECKPOINTS.items():
if os.path.exists(checkpoint_path):
try:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_encodon_inference_model(checkpoint_path, device=device)
# Extract model name from path
model_name = os.path.basename(os.path.dirname(os.path.dirname(checkpoint_path)))
display_name = f"EnCodon ({size})"
encodon_models[display_name] = {
"model": model,
"path": checkpoint_path,
"device": device,
"original_name": size,
}
print(f"✅ Successfully loaded {display_name} from: {checkpoint_path}")
model_loaded = True
except Exception as e:
print(f"Failed to load from {checkpoint_path}: {e}")
continue
if not model_loaded:
print("❌ Could not load any Encodon model from the specified paths.")
print("Please ensure a checkpoint exists or update the checkpoint_paths list.")
else:
print(f"\n✅ Loaded {len(encodon_models)} models: {list(encodon_models.keys())}")
Loading Encodon model from: /data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1 Loading safetensors from: /data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1/model.safetensors ✅ Model loaded successfully on cuda Model parameters: 76,833,861 Tokenizer vocabulary size: 69 ✅ Successfully loaded EnCodon (80M) from: /data/checkpoints/NV-CodonFM-Encodon-TE-80M-v1 Loading Encodon model from: /data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1 Loading safetensors from: /data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1/model.safetensors ✅ Model loaded successfully on cuda Model parameters: 609,032,261 Tokenizer vocabulary size: 69 ✅ Successfully loaded EnCodon (600m) from: /data/checkpoints/NV-CodonFM-Encodon-TE-600M-v1 ✅ Loaded 2 models: ['EnCodon (80M)', 'EnCodon (600m)']
In [4]:
Copied!
# Load ClinVar Synonymous dataset
DATASET_CONFIG = {
"key": "clinvar_synom",
"name": "ClinVar Synonymous",
"data_path": "/data/processed/mutation_datasets_latest/clinvar_synom.csv",
}
def load_dataset(config):
"""Load and inspect the ClinVar Synonymous dataset."""
print(f"Loading {config['name']} dataset...")
print(f"Path: {config['data_path']}")
if not os.path.exists(config["data_path"]):
print(f"❌ Dataset not found: {config['data_path']}")
return None
try:
# Load data using polars then convert to pandas
data = pl.read_csv(config["data_path"], ignore_errors=True).to_pandas()
print(f"✅ Loaded {len(data)} variants")
print(f"Shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
# Check for required columns
required_cols = ["id", "ref_seq", "ref_codon", "alt_codon", "codon_position"]
missing_cols = [col for col in required_cols if col not in data.columns]
if missing_cols:
print(f"⚠️ Missing columns: {missing_cols}")
else:
print("✅ All required columns present")
# Show sample data
display_cols = [col for col in ["id", "ref_codon", "alt_codon", "codon_position"] if col in data.columns]
display_cols.append("label")
print("\nSample data:")
print(data[display_cols].head(3))
# Set dataloader
return data
except Exception as e:
print(f"❌ Failed to load dataset: {e}")
return None
# Load the dataset
dataset = load_dataset(DATASET_CONFIG)
print(f"\n📊 Dataset loaded: {dataset is not None}")
# Load ClinVar Synonymous dataset
DATASET_CONFIG = {
"key": "clinvar_synom",
"name": "ClinVar Synonymous",
"data_path": "/data/processed/mutation_datasets_latest/clinvar_synom.csv",
}
def load_dataset(config):
"""Load and inspect the ClinVar Synonymous dataset."""
print(f"Loading {config['name']} dataset...")
print(f"Path: {config['data_path']}")
if not os.path.exists(config["data_path"]):
print(f"❌ Dataset not found: {config['data_path']}")
return None
try:
# Load data using polars then convert to pandas
data = pl.read_csv(config["data_path"], ignore_errors=True).to_pandas()
print(f"✅ Loaded {len(data)} variants")
print(f"Shape: {data.shape}")
print(f"Columns: {list(data.columns)}")
# Check for required columns
required_cols = ["id", "ref_seq", "ref_codon", "alt_codon", "codon_position"]
missing_cols = [col for col in required_cols if col not in data.columns]
if missing_cols:
print(f"⚠️ Missing columns: {missing_cols}")
else:
print("✅ All required columns present")
# Show sample data
display_cols = [col for col in ["id", "ref_codon", "alt_codon", "codon_position"] if col in data.columns]
display_cols.append("label")
print("\nSample data:")
print(data[display_cols].head(3))
# Set dataloader
return data
except Exception as e:
print(f"❌ Failed to load dataset: {e}")
return None
# Load the dataset
dataset = load_dataset(DATASET_CONFIG)
print(f"\n📊 Dataset loaded: {dataset is not None}")
Loading ClinVar Synonymous dataset... Path: /data/processed/mutation_datasets_latest/clinvar_synom.csv ✅ Loaded 129384 variants Shape: (129384, 27) Columns: ['id', 'chrom', 'pos', 'ref', 'alt', 'var_rel_dist_in_cds', 'codon_position', 'ref_codon', 'alt_codon', 'tx', 'label', 'in_splice_junction', 'ref_seq', 'alt_seq', 'ref_aa', 'alt_aa', 'ref_codon_freq', 'alt_codon_freq', 'codon_freq_ratio', 'gene_name', 'pli', 'pli_bin', 'phylop', 'phylop_bin', 'cds_length', 'cds_offset_frac', 'cds_offset_frac_bin'] ✅ All required columns present Sample data: id ref_codon alt_codon codon_position label 0 0 GAG GAA 313 Likely pathogenic 1 1 AAC AAT 377 Benign 2 2 GTA GTT 398 Pathogenic 📊 Dataset loaded: True
In [ ]:
Copied!
def run_mutation_predictions(models, data, batch_size=32):
"""Run mutation predictions for ClinVar Alphamissense dataset."""
if data is None or not models:
print("❌ No data or models available")
return {}
print("\n=== RUNNING MUTATION PREDICTIONS FOR CLINVAR SYNONYMOUS ===")
# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
data_subset = data.copy()
all_predictions = {}
for model_name, model_info in models.items():
original_model_name = model_info.get("original_name", model_name)
print(f"\n--- Processing {model_name} ---")
output_path = os.path.join(OUTPUT_DIR, f"preds_{original_model_name}.csv")
# Create temporary CSV file
temp_csv_path = f"/tmp/clinvar_synonymous_{original_model_name.replace(' ', '_')}_temp.csv"
data_subset.to_csv(temp_csv_path, index=False)
try:
# Create MutationDataset
mutation_dataset = MutationDataset(
data_path=temp_csv_path,
tokenizer=model_info["model"].tokenizer,
process_item=mlm_process_item,
context_length=2048,
task="mlm",
extract_seq=True,
train_val_test_ratio=None,
)
# Create DataLoader
dataloader = DataLoader(
mutation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=0
)
# Run predictions
all_ids = []
all_likelihood_ratios = []
model_info["model"].eval()
model_info["model"].to(model_info["device"])
with torch.no_grad():
for batch in tqdm(dataloader, desc=f"{model_name} predictions"):
# Move batch to device
for key in batch:
if isinstance(batch[key], torch.Tensor):
batch[key] = batch[key].to(model_info["device"])
# Get predictions
output = model_info["model"].predict_mutation(batch, ids=batch[MetadataFields.ID])
all_ids.extend(output.ids)
all_likelihood_ratios.extend(output.likelihood_ratios)
# Save predictions of current model
out_df = pl.DataFrame({"id": np.array(all_ids), original_model_name: np.array(all_likelihood_ratios)})
out_df = out_df.with_columns(pl.col("id").cast(pl.UInt32))
out_df.write_csv(output_path)
print(f"Predictions saved to {output_path}")
print(f"✅ Completed {len(all_ids)} predictions")
print(
f"Likelihood ratio range: [{np.min(all_likelihood_ratios):.3f}, {np.max(all_likelihood_ratios):.3f}]"
)
except Exception as e:
print(f"❌ Failed predictions for {model_name}: {e}")
continue
finally:
# Clean up temporary file
if os.path.exists(temp_csv_path):
os.remove(temp_csv_path)
# Offload model from GPU to free memory
if "model" in model_info and hasattr(model_info["model"], "cpu"):
model_info["model"].cpu()
print(f"🔄 Offloaded {model_name} from GPU")
# Clear GPU cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
print(f"🧹 Cleared GPU cache after {model_name}")
return all_predictions
# Run predictions if models and dataset are available
if "encodon_models" in locals() and "dataset" in locals() and dataset is not None:
predictions = run_mutation_predictions(encodon_models, dataset)
print(f"\n✅ Predictions completed for {len(predictions)} models")
else:
print("❌ Cannot run predictions - missing models or dataset")
predictions = {}
def run_mutation_predictions(models, data, batch_size=32):
"""Run mutation predictions for ClinVar Alphamissense dataset."""
if data is None or not models:
print("❌ No data or models available")
return {}
print("\n=== RUNNING MUTATION PREDICTIONS FOR CLINVAR SYNONYMOUS ===")
# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
data_subset = data.copy()
all_predictions = {}
for model_name, model_info in models.items():
original_model_name = model_info.get("original_name", model_name)
print(f"\n--- Processing {model_name} ---")
output_path = os.path.join(OUTPUT_DIR, f"preds_{original_model_name}.csv")
# Create temporary CSV file
temp_csv_path = f"/tmp/clinvar_synonymous_{original_model_name.replace(' ', '_')}_temp.csv"
data_subset.to_csv(temp_csv_path, index=False)
try:
# Create MutationDataset
mutation_dataset = MutationDataset(
data_path=temp_csv_path,
tokenizer=model_info["model"].tokenizer,
process_item=mlm_process_item,
context_length=2048,
task="mlm",
extract_seq=True,
train_val_test_ratio=None,
)
# Create DataLoader
dataloader = DataLoader(
mutation_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=0
)
# Run predictions
all_ids = []
all_likelihood_ratios = []
model_info["model"].eval()
model_info["model"].to(model_info["device"])
with torch.no_grad():
for batch in tqdm(dataloader, desc=f"{model_name} predictions"):
# Move batch to device
for key in batch:
if isinstance(batch[key], torch.Tensor):
batch[key] = batch[key].to(model_info["device"])
# Get predictions
output = model_info["model"].predict_mutation(batch, ids=batch[MetadataFields.ID])
all_ids.extend(output.ids)
all_likelihood_ratios.extend(output.likelihood_ratios)
# Save predictions of current model
out_df = pl.DataFrame({"id": np.array(all_ids), original_model_name: np.array(all_likelihood_ratios)})
out_df = out_df.with_columns(pl.col("id").cast(pl.UInt32))
out_df.write_csv(output_path)
print(f"Predictions saved to {output_path}")
print(f"✅ Completed {len(all_ids)} predictions")
print(
f"Likelihood ratio range: [{np.min(all_likelihood_ratios):.3f}, {np.max(all_likelihood_ratios):.3f}]"
)
except Exception as e:
print(f"❌ Failed predictions for {model_name}: {e}")
continue
finally:
# Clean up temporary file
if os.path.exists(temp_csv_path):
os.remove(temp_csv_path)
# Offload model from GPU to free memory
if "model" in model_info and hasattr(model_info["model"], "cpu"):
model_info["model"].cpu()
print(f"🔄 Offloaded {model_name} from GPU")
# Clear GPU cache
if torch.cuda.is_available():
torch.cuda.empty_cache()
print(f"🧹 Cleared GPU cache after {model_name}")
return all_predictions
# Run predictions if models and dataset are available
if "encodon_models" in locals() and "dataset" in locals() and dataset is not None:
predictions = run_mutation_predictions(encodon_models, dataset)
print(f"\n✅ Predictions completed for {len(predictions)} models")
else:
print("❌ Cannot run predictions - missing models or dataset")
predictions = {}
Scoring: Pathogenic vs Benign¶
In [7]:
Copied!
# Load dataset with predictions from inference section
print("Loading dataset...")
dset = pl.read_csv(DATA_INPUT_PATH)
print(f"Dataset shape: {dset.shape}")
print("Label distribution:")
print(dset["label"].value_counts())
# Load predictions from inference paths saved in previous section
print("\nLoading predictions from inference outputs...")
for model_name in MODEL_CHECKPOINTS.keys():
pred_path = os.path.join(OUTPUT_DIR, f"preds_{model_name}.csv")
if os.path.exists(pred_path):
print(f" Loading {model_name} from {pred_path}")
pred_df = pl.read_csv(pred_path)
dset = dset.join(pred_df, on="id", how="left")
print(f" Loaded {len(pred_df)} predictions")
else:
print(f" Warning: Predictions for {model_name} not found at {pred_path}")
# Add pathogenicity label
dset = dset.with_columns(
(pl.col("label").str.to_lowercase().str.contains("pathogenic").cast(pl.Int8)).alias("is_pathogenic")
)
print(f"\nFinal dataset shape: {dset.shape}")
print(f"Columns with predictions: {[col for col in dset.columns if col in MODEL_CHECKPOINTS.keys()]}")
# Load dataset with predictions from inference section
print("Loading dataset...")
dset = pl.read_csv(DATA_INPUT_PATH)
print(f"Dataset shape: {dset.shape}")
print("Label distribution:")
print(dset["label"].value_counts())
# Load predictions from inference paths saved in previous section
print("\nLoading predictions from inference outputs...")
for model_name in MODEL_CHECKPOINTS.keys():
pred_path = os.path.join(OUTPUT_DIR, f"preds_{model_name}.csv")
if os.path.exists(pred_path):
print(f" Loading {model_name} from {pred_path}")
pred_df = pl.read_csv(pred_path)
dset = dset.join(pred_df, on="id", how="left")
print(f" Loaded {len(pred_df)} predictions")
else:
print(f" Warning: Predictions for {model_name} not found at {pred_path}")
# Add pathogenicity label
dset = dset.with_columns(
(pl.col("label").str.to_lowercase().str.contains("pathogenic").cast(pl.Int8)).alias("is_pathogenic")
)
print(f"\nFinal dataset shape: {dset.shape}")
print(f"Columns with predictions: {[col for col in dset.columns if col in MODEL_CHECKPOINTS.keys()]}")
Loading dataset...
Dataset shape: (129384, 27)
Label distribution:
shape: (6, 2)
┌──────────────────────────────┬───────┐
│ label ┆ count │
│ --- ┆ --- │
│ str ┆ u32 │
╞══════════════════════════════╪═══════╡
│ Benign ┆ 21065 │
│ Pathogenic/Likely pathogenic ┆ 142 │
│ Likely benign ┆ 75623 │
│ Pathogenic ┆ 93 │
│ Benign/Likely benign ┆ 32417 │
│ Likely pathogenic ┆ 44 │
└──────────────────────────────┴───────┘
Loading predictions from inference outputs...
Loading 80m from /data/validation/encodons_results/clinvar_synom/preds_80m.csv
Loaded 129453 predictions
Final dataset shape: (129384, 29)
Columns with predictions: ['80m']
In [8]:
Copied!
preds_cols = [col for col in dset.columns if col in MODEL_CHECKPOINTS.keys()]
# Create separate plots for each model
for model_name in preds_cols:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
for j, (in_splice, ax) in enumerate(zip([True, False], [ax1, ax2])):
# Filter data based on splice junction status
filtered_data = dset.filter(pl.col("in_splice_junction") == in_splice).to_pandas()
# Separate pathogenic and benign data (only from filtered data)
pathogenic_data = filtered_data[filtered_data["is_pathogenic"] == 1][model_name].dropna()
benign_data = filtered_data[filtered_data["is_pathogenic"] == 0][model_name].dropna()
# Calculate Mann-Whitney U test
p_value = None
if len(pathogenic_data) > 0 and len(benign_data) > 0:
_, p_value = mannwhitneyu(pathogenic_data, benign_data, alternative="two-sided")
# Create histogram with custom labels
sns.histplot(
data=filtered_data,
x=model_name,
hue="is_pathogenic",
hue_order=[0, 1],
bins=50,
kde=True,
stat="proportion",
common_norm=False,
ax=ax,
)
# Update legend labels to match hue_order [0, 1] = [Benign, Pathogenic]
# hue_order=[0, 1] means: 0=Benign (first), 1=Pathogenic (second)
legend_labels = [f"Benign (n={len(benign_data)})", f"Pathogenic (n={len(pathogenic_data)})"]
# Get the existing legend and update labels
legend = ax.get_legend()
if legend is not None:
for idx, text in enumerate(legend.get_texts()):
if idx < len(legend_labels):
text.set_text(legend_labels[idx])
else:
# If no legend exists, create one manually
ax.legend(labels=legend_labels)
# Set title and labels
splice_status = "IN" if in_splice else "NOT in"
title = f"Histogram of {model_name} by Pathogenicity ({splice_status} Splice Junction)"
if p_value is not None:
title += f"\nMann-Whitney U p-value: {p_value:.2e}"
ax.set_title(title)
ax.set_xlabel(f"{model_name} likelihood ratio")
ax.set_ylabel("Density")
plt.tight_layout()
plt.show()
preds_cols = [col for col in dset.columns if col in MODEL_CHECKPOINTS.keys()]
# Create separate plots for each model
for model_name in preds_cols:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
for j, (in_splice, ax) in enumerate(zip([True, False], [ax1, ax2])):
# Filter data based on splice junction status
filtered_data = dset.filter(pl.col("in_splice_junction") == in_splice).to_pandas()
# Separate pathogenic and benign data (only from filtered data)
pathogenic_data = filtered_data[filtered_data["is_pathogenic"] == 1][model_name].dropna()
benign_data = filtered_data[filtered_data["is_pathogenic"] == 0][model_name].dropna()
# Calculate Mann-Whitney U test
p_value = None
if len(pathogenic_data) > 0 and len(benign_data) > 0:
_, p_value = mannwhitneyu(pathogenic_data, benign_data, alternative="two-sided")
# Create histogram with custom labels
sns.histplot(
data=filtered_data,
x=model_name,
hue="is_pathogenic",
hue_order=[0, 1],
bins=50,
kde=True,
stat="proportion",
common_norm=False,
ax=ax,
)
# Update legend labels to match hue_order [0, 1] = [Benign, Pathogenic]
# hue_order=[0, 1] means: 0=Benign (first), 1=Pathogenic (second)
legend_labels = [f"Benign (n={len(benign_data)})", f"Pathogenic (n={len(pathogenic_data)})"]
# Get the existing legend and update labels
legend = ax.get_legend()
if legend is not None:
for idx, text in enumerate(legend.get_texts()):
if idx < len(legend_labels):
text.set_text(legend_labels[idx])
else:
# If no legend exists, create one manually
ax.legend(labels=legend_labels)
# Set title and labels
splice_status = "IN" if in_splice else "NOT in"
title = f"Histogram of {model_name} by Pathogenicity ({splice_status} Splice Junction)"
if p_value is not None:
title += f"\nMann-Whitney U p-value: {p_value:.2e}"
ax.set_title(title)
ax.set_xlabel(f"{model_name} likelihood ratio")
ax.set_ylabel("Density")
plt.tight_layout()
plt.show()
In [9]:
Copied!
dset.head()
dset.head()
Out[9]:
shape: (5, 29)
| id | chrom | pos | ref | alt | var_rel_dist_in_cds | codon_position | ref_codon | alt_codon | tx | label | in_splice_junction | ref_seq | alt_seq | ref_aa | alt_aa | ref_codon_freq | alt_codon_freq | codon_freq_ratio | gene_name | pli | pli_bin | phylop | phylop_bin | cds_length | cds_offset_frac | cds_offset_frac_bin | 80m | is_pathogenic |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| i64 | str | i64 | str | str | i64 | i64 | str | str | str | str | bool | str | str | str | str | f64 | f64 | f64 | str | f64 | i64 | f64 | i64 | i64 | f64 | i64 | f64 | i8 |
| 0 | "chr1" | 45015006 | "G" | "A" | 941 | 313 | "GAG" | "GAA" | "NM_000374.5" | "Likely pathogenic" | true | "ATGGAAGCGAATGGGTTGGGACCTCAGGGT… | "ATGGAAGCGAATGGGTTGGGACCTCAGGGT… | "E" | "E" | 4.6414453e7 | 3.7827281e7 | 0.20458 | "UROD" | 0.0 | 0 | 7.998 | 8 | 1104 | 0.852355 | 8 | 1.1841149 | 1 |
| 1 | "chr10" | 124400865 | "G" | "A" | 1133 | 377 | "AAC" | "AAT" | "NM_000274.4" | "Benign" | false | "ATGTTTTCCAAACTAGCACATTTGCAGAGG… | "ATGTTTTCCAAACTAGCACATTTGCAGAGG… | "N" | "N" | 2.0900468e7 | 2.0353876e7 | 0.0265 | "OAT" | 0.0 | 0 | -2.351 | -2 | 1320 | 0.858333 | 8 | -1.123766 | 0 |
| 2 | "chr12" | 102843648 | "T" | "A" | 1196 | 398 | "GTA" | "GTT" | "NM_000277.3" | "Pathogenic" | false | "ATGTCCACTGCGGTCCTGGAAAACCCAGGC… | "ATGTCCACTGCGGTCCTGGAAAACCCAGGC… | "V" | "V" | 8.725178e6 | 1.3189619e7 | -0.413217 | "PAH" | 0.0 | 0 | 1.96 | 2 | 1359 | 0.880059 | 8 | 0.160462 | 1 |
| 3 | "chr7" | 65967767 | "G" | "A" | 1616 | 538 | "AGC" | "AGT" | "NM_000181.4" | "Pathogenic/Likely pathogenic" | false | "ATGGCCCGGGGGTCGGCGGTTGCCTGGGCG… | "ATGGCCCGGGGGTCGGCGGTTGCCTGGGCG… | "S" | "S" | 2.2875088e7 | 1.5825387e7 | 0.368433 | "GUSB" | 0.0 | 0 | 2.034 | 2 | 1956 | 0.826176 | 8 | 0.398401 | 1 |
| 4 | "chr9" | 137233722 | "G" | "A" | 845 | 281 | "CCG" | "CCA" | "NM_001177316.2" | "Pathogenic/Likely pathogenic" | true | "ATGCCGAGTTCCCTTCCCGGCAGCCAGGTC… | "ATGCCGAGTTCCCTTCCCGGCAGCCAGGTC… | "P" | "P" | 7.888797e6 | 2.1390054e7 | -0.997482 | "SLC34A3" | 0.0 | 0 | 2.659 | 3 | 1800 | 0.469444 | 4 | 0.8169 | 1 |
In [10]:
Copied!
temp = dset.filter(~pl.col("in_splice_junction")).to_pandas()
temp["is_pathogenic"] = temp["is_pathogenic"].astype(bool)
mwu_scores = []
patho = temp.loc[temp["is_pathogenic"]]
non_patho = temp.loc[~temp["is_pathogenic"]]
for col in preds_cols + ["phylop", "codon_freq_ratio"]:
roc_score = roc_auc_score(temp["is_pathogenic"], temp[col])
mwu = mannwhitneyu(patho[col], non_patho[col])
mwu_scores.append([col, mwu.pvalue, mwu.statistic, roc_score])
mwu_scores_df = pd.DataFrame(mwu_scores, columns=["col", "pvalue", "statistic", "roc_score"])
mwu_scores_df["-log10(pvalue)"] = -np.log10(mwu_scores_df["pvalue"])
plt.figure()
sns.barplot(x="col", y="-log10(pvalue)", data=mwu_scores_df)
plt.title("Pathogenic all vs Benign all\nMann-Whitney U Test -log10(pvalue)")
plt.xticks(rotation=45)
plt.figure()
plt.xticks(rotation=45)
sns.barplot(x="col", y="roc_score", data=mwu_scores_df)
plt.ylim(0.5, 1)
plt.title("Pathogenic all vs Benign all\nROC Score")
temp = dset.filter(~pl.col("in_splice_junction")).to_pandas()
temp["is_pathogenic"] = temp["is_pathogenic"].astype(bool)
mwu_scores = []
patho = temp.loc[temp["is_pathogenic"]]
non_patho = temp.loc[~temp["is_pathogenic"]]
for col in preds_cols + ["phylop", "codon_freq_ratio"]:
roc_score = roc_auc_score(temp["is_pathogenic"], temp[col])
mwu = mannwhitneyu(patho[col], non_patho[col])
mwu_scores.append([col, mwu.pvalue, mwu.statistic, roc_score])
mwu_scores_df = pd.DataFrame(mwu_scores, columns=["col", "pvalue", "statistic", "roc_score"])
mwu_scores_df["-log10(pvalue)"] = -np.log10(mwu_scores_df["pvalue"])
plt.figure()
sns.barplot(x="col", y="-log10(pvalue)", data=mwu_scores_df)
plt.title("Pathogenic all vs Benign all\nMann-Whitney U Test -log10(pvalue)")
plt.xticks(rotation=45)
plt.figure()
plt.xticks(rotation=45)
sns.barplot(x="col", y="roc_score", data=mwu_scores_df)
plt.ylim(0.5, 1)
plt.title("Pathogenic all vs Benign all\nROC Score")
Out[10]:
Text(0.5, 1.0, 'Pathogenic all vs Benign all\nROC Score')
In [11]:
Copied!
temp = dset.filter(~pl.col("in_splice_junction")).to_pandas()
temp["is_pathogenic"] = temp["is_pathogenic"].astype(bool)
temp["match_key"] = (
temp["ref_codon"]
+ ">"
+ temp["alt_codon"]
+ ">"
+ temp["pli_bin"].astype(str)
+ ">"
+ temp["cds_offset_frac_bin"].astype(str)
)
mwu_scores = []
for _ in range(50):
# Split into pathogenic and non-pathogenic
patho = temp.loc[temp["is_pathogenic"]]
non_patho = temp.loc[~temp["is_pathogenic"]]
# Count trinuc_change in pathogenic
patho_trinuc_counts = patho["match_key"].value_counts()
sampled_non_patho = []
for trinuc, count in patho_trinuc_counts.items():
subset = non_patho[non_patho["match_key"] == trinuc]
if len(subset) >= count:
sampled = subset.sample(n=count)
else:
# print(f"Not enough {trinuc} in non-pathogenic data, taking all available")
sampled = subset # take all available if not enough
sampled_non_patho.append(sampled)
sampled_non_patho = pd.concat(sampled_non_patho, ignore_index=True)
# Combine back with pathogenic
temp_sampled = pd.concat([patho, sampled_non_patho], ignore_index=True)
for col in preds_cols + ["phylop", "codon_freq_ratio"]:
roc_score = roc_auc_score(temp_sampled["is_pathogenic"], temp_sampled[col])
mwu = mannwhitneyu(patho[col], sampled_non_patho[col])
mwu_scores.append([col, mwu.pvalue, mwu.statistic, roc_score])
mwu_scores_df = pd.DataFrame(mwu_scores, columns=["col", "pvalue", "statistic", "roc_score"])
mwu_scores_df["-log10(pvalue)"] = -np.log10(mwu_scores_df["pvalue"])
# mwu_scores_df = mwu_scores_df[mwu_scores_df['col'] != '1b_cdswt']
plt.figure()
sns.boxplot(x="col", y="-log10(pvalue)", data=mwu_scores_df)
plt.title("Pathogenic vs Benign matched\nMann-Whitney U Test -log10(pvalue)")
plt.xticks(rotation=45)
plt.figure()
sns.boxplot(x="col", y="roc_score", data=mwu_scores_df)
plt.title("Pathogenic vs Benign matched\nROC Score")
plt.xticks(rotation=45)
temp = dset.filter(~pl.col("in_splice_junction")).to_pandas()
temp["is_pathogenic"] = temp["is_pathogenic"].astype(bool)
temp["match_key"] = (
temp["ref_codon"]
+ ">"
+ temp["alt_codon"]
+ ">"
+ temp["pli_bin"].astype(str)
+ ">"
+ temp["cds_offset_frac_bin"].astype(str)
)
mwu_scores = []
for _ in range(50):
# Split into pathogenic and non-pathogenic
patho = temp.loc[temp["is_pathogenic"]]
non_patho = temp.loc[~temp["is_pathogenic"]]
# Count trinuc_change in pathogenic
patho_trinuc_counts = patho["match_key"].value_counts()
sampled_non_patho = []
for trinuc, count in patho_trinuc_counts.items():
subset = non_patho[non_patho["match_key"] == trinuc]
if len(subset) >= count:
sampled = subset.sample(n=count)
else:
# print(f"Not enough {trinuc} in non-pathogenic data, taking all available")
sampled = subset # take all available if not enough
sampled_non_patho.append(sampled)
sampled_non_patho = pd.concat(sampled_non_patho, ignore_index=True)
# Combine back with pathogenic
temp_sampled = pd.concat([patho, sampled_non_patho], ignore_index=True)
for col in preds_cols + ["phylop", "codon_freq_ratio"]:
roc_score = roc_auc_score(temp_sampled["is_pathogenic"], temp_sampled[col])
mwu = mannwhitneyu(patho[col], sampled_non_patho[col])
mwu_scores.append([col, mwu.pvalue, mwu.statistic, roc_score])
mwu_scores_df = pd.DataFrame(mwu_scores, columns=["col", "pvalue", "statistic", "roc_score"])
mwu_scores_df["-log10(pvalue)"] = -np.log10(mwu_scores_df["pvalue"])
# mwu_scores_df = mwu_scores_df[mwu_scores_df['col'] != '1b_cdswt']
plt.figure()
sns.boxplot(x="col", y="-log10(pvalue)", data=mwu_scores_df)
plt.title("Pathogenic vs Benign matched\nMann-Whitney U Test -log10(pvalue)")
plt.xticks(rotation=45)
plt.figure()
sns.boxplot(x="col", y="roc_score", data=mwu_scores_df)
plt.title("Pathogenic vs Benign matched\nROC Score")
plt.xticks(rotation=45)
Out[11]:
([0, 1, 2], [Text(0, 0, '80m'), Text(1, 0, 'phylop'), Text(2, 0, 'codon_freq_ratio')])