evaluating_aracena_training

Author

Saideep Gona

Published

September 8, 2023

Evaluation of the performance of the aracena models

Code
import h5py
import numpy as np
import torch
import os,sys

import kipoiseq

sys.path.append("./../2023-09-05-ref_to_aracena_mlp")

import ref_to_aracena_models
Code
model_type = "cnn"

cur_individual = "AF20"

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(device)

if model_type == "linear":
    cur_model = ref_to_aracena_models.RefToAracenaMLP(hidden_layer_dims=[])
else:
    cur_model = ref_to_aracena_models.RefToAracenaCNN()
cur_model = cur_model.to(device)
cpu
Code
saved_models_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/saved_models"

trained_model_path = os.path.join(saved_models_path, "train18_ref_to_aracena_cnn_poissonlog_ep99_lr1e-06.pt")

# trained_model_path = os.path.join(saved_models_path, "train1_ref_to_aracena_cnn_poissonlog_ep19_lr1e-06.pt")

saved_model = torch.load(trained_model_path, map_location=device)

cur_model.load_state_dict(saved_model["model_state_dict"])
cur_model.eval()
RuntimeError: Error(s) in loading state_dict for RefToAracenaCNN:
    Missing key(s) in state_dict: "model.0.0.weight", "model.0.0.bias", "model.0.0.running_mean", "model.0.0.running_var", "model.0.2.weight", "model.0.2.bias", "model.1.0.weight", "model.1.0.bias". 
    Unexpected key(s) in state_dict: "model.0.weight", "model.0.bias". 
Code
saved_model_loss = saved_model["loss"]

import matplotlib.pyplot as plt
import seaborn as sns
# plt.plot(loss_tracker)
sns.lineplot(saved_model_loss)
<Axes: >

Code
sns.lineplot(saved_model_loss[-50:])
<Axes: >

Load in some validation datasets for evaluation

Code
validation_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/AF20_valid_aracena.h5"

sequence_ind = 40
track_ind = 1

with h5py.File(validation_path, "r") as f:
    example_valid = f['targets'][:,:,sequence_ind]
    example_valid_reg = f['regions'][:,:,sequence_ind]


example_valid.shape


bigwig_track_readouts = [
    "RNAseq",
    "ATACseq",
    "H3K4me1",
    "H3K4me3",
    "H3K27ac",
    "H3K27me3"
]

conditions = ["Flu","NI"]

bigwig_tracks = []
for condition in conditions:
    for bigwig_track_readout in bigwig_track_readouts:
        bigwig_tracks.append(condition+"_"+bigwig_track_readout)


Code
import matplotlib.pyplot as plt
import seaborn as sns

def plot_tracks(tracks, interval, height=1.5):
  fig, axes = plt.subplots(len(tracks), 1, figsize=(20, height * len(tracks)), sharex=True)
  for ax, (title, y) in zip(axes, tracks.items()):
    ax.fill_between(np.linspace(interval[1], interval[2], num=len(y)), y)
    ax.set_title(title)
    sns.despine(top=True, right=True, bottom=True)
  ax.set_xlabel("_".join([str(x) for x in interval]))
  plt.tight_layout()
Code
tracks = {}
for i,track in enumerate(bigwig_tracks):
    tracks[track] = example_valid[:,i]


target_interval = [str(example_valid_reg[0,0]), example_valid_reg[0,1], example_valid_reg[0,2]]

plot_tracks(tracks, target_interval)

We can now make a prediction on the corresponding input region using our stored model(s)

Code
import pandas as pd

group = "train18"

mapping_table = pd.read_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/index_files/remapped_table_filt.csv", index_col=0)
Code
target_interval_str = "_".join([str(x) for x in target_interval])
print(target_interval_str)

ref_index = mapping_table.loc["ref-epigenome", target_interval_str]
print(ref_index)

print(target_interval_str in mapping_table.columns)

with h5py.File("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/ref-epigenome_valid_aracena.h5", "r") as f:
    example_ref_pred = f['ref_epigenome'][:,:,int(ref_index)]
    example_ref_region = f['regions'][:,:,int(ref_index)]

example_ref_region_str = "_".join([str(x) for x in example_ref_region[0,:]])
print(example_ref_region_str)
print(target_interval_str)

print(example_ref_pred.shape)
2_238660836_238791908
33
True
2_238660836_238791908
2_238660836_238791908
(896, 5313)
Code
def create_tracks(aracena, predictions, track_names):
    tracks = {}
    for i,track_name in enumerate(track_names):
        for track_type in ['aracena', 'model_prediction']:
            if track_type == 'aracena':
                tracks[track_name+"_"+track_type] = aracena[:,i]
            elif track_type == 'model_prediction':
                tracks[track_name+"_"+track_type] = predictions[i,:]
    return tracks

cur_model = cur_model.to(device)

input_tensor = torch.from_numpy(example_ref_pred).to(device).unsqueeze(0).to(device)

print(input_tensor.shape)


if model_type == "linear":
    pass
elif model_type == "cnn":
    input_tensore = input_tensor.swapaxes(1,2)

ex_out_raw = cur_model(input_tensor)

ex_out = ex_out_raw.squeeze().cpu().detach().numpy()[:,:]

print(ex_out.shape)
print(np.sum(ex_out, axis=0))

tracks = create_tracks(example_valid, ex_out.swapaxes(0,1), bigwig_tracks)
torch.Size([1, 896, 5313])
(896, 12)
[ 15.339787  42.597248  60.79495   19.235102  78.37865  903.8718
  98.23487   33.36463   17.024366  30.499329  14.915353  74.11469 ]
Code
plot_tracks(tracks, target_interval)

OK, for this example things are not so promising. It seems like the model was converging on the training set at least, maybe some of those predictions will show promise.

Code
model_type = "cnn"

if model_type == "linear":
    cur_model = ref_to_aracena_models.RefToAracenaMLP(hidden_layer_dims=[])
else:
    cur_model = ref_to_aracena_models.RefToAracenaCNN()
cur_model = cur_model.to(device)

group = "train18"
train_path = f"/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/AF20_{group}_aracena.h5"

saved_models_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/saved_models"

trained_model_path = os.path.join(saved_models_path, "train18_ref_to_aracena_cnn_poissonlog_ep99_lr1e-06.pt")

# trained_model_path = os.path.join(saved_models_path, "train1_ref_to_aracena_cnn_poissonlog_ep19_lr1e-06.pt")

saved_model = torch.load(trained_model_path, map_location=device)

cur_model.load_state_dict(saved_model["model_state_dict"])
cur_model.eval()

sequence_ind = 0

with h5py.File(train_path, "r") as f:
    example_train = f['targets'][:,:,sequence_ind]
    example_train_reg = f['regions'][:,:,sequence_ind]


target_interval = [str(example_train_reg[0,0]), example_train_reg[0,1], example_train_reg[0,2]]


target_interval_str = "_".join([str(x) for x in target_interval])
print(target_interval_str)

ref_index = mapping_table.loc["ref-epigenome", target_interval_str]
print(ref_index)

with h5py.File(f"/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/ref-epigenome_{group}_aracena.h5", "r") as f:
    example_ref_pred = f['ref_epigenome'][:,:,int(ref_index)]
    example_ref_region = f['regions'][:,:,int(ref_index)]

print(example_train_reg, example_ref_region)

ex_out_raw = cur_model(torch.from_numpy(example_ref_pred).to(device).unsqueeze(0).to(device))

ex_out = ex_out_raw.squeeze().cpu().detach().numpy()[:,:].swapaxes(0,1)

tracks = create_tracks(example_train, ex_out, bigwig_tracks)
tracks["CAGE_ref"] = example_ref_pred[:,4766]
cpu
RuntimeError: Error(s) in loading state_dict for RefToAracenaCNN:
    Missing key(s) in state_dict: "model.0.0.weight", "model.0.0.bias", "model.0.0.running_mean", "model.0.0.running_var", "model.0.2.weight", "model.0.2.bias", "model.1.0.weight", "model.1.0.bias". 
    Unexpected key(s) in state_dict: "model.0.weight", "model.0.bias". 
Code
example_ref_pred.shape
(896, 5313)
Code
plot_tracks(tracks, target_interval)