suppressMessages(library(tidyverse))suppressMessages(library(glue))PRE ="/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"## COPY THE DATE AND SLUG fields FROM THE HEADERSLUG="evaluate_tss_center_mean_pred"## copy the slug from the headerbDATE='2023-10-22'## copy the date from the blog's header hereDATA =glue("{PRE}/{bDATE}-{SLUG}")if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))WORK=DATA
Context
Here I trained an elastic net and simple MLP mode to predict TSS RNAseq expression from Flu condition (aracena dataset) from Enformer predictions. Training was done using corpus of TSS sites from a ranmdom mixture of individuals. Here I am evaluating the performance of the trained models on a held out test set.
TODO: Average of individuals TODO: Collect RNAseq from Katie, and use this for DL-EN training AND training PrediXcan TODO: Try different MLP params
predict_on_data_subset <-function(data_subset, glmnet_model_path, nnmodel_path){print(glue("********* Predicting for data subset: {data_subset}"))# Load trained models glmnet_model <-readRDS(glmnet_model_path) nnmodel <-readRDS(nnmodel_path)png("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/glmnet_lambdas.png")plot(glmnet_model)dev.off()# Load dataset to be predicted on data_X <-glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv") data_y <-glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena.csv") X <-as.matrix(t(data.table::fread(data_X))) y <-as.data.frame(t(data.table::fread(data_y))) y_for_cv <- y[,1]# Make predictions and associated plots glmnet_predictions <-predict(glmnet_model, X, s ="lambda.min", type="response")png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_corr_en_pred.png"))plot(glmnet_predictions, y[,1], ylim=c(-1,20), ylab="ground truth flu rnaseq", xlab="glmnet predictions", main =glue("Glmnet predictions vs. ground truth for {data_subset} data subset"))dev.off() pred_corr_en <-cor(glmnet_predictions, y[,1])print(glue("Correlation of {pred_corr_en} for {data_subset} data subset with glmnet predictions")) data_frame_for_training <-data.frame(cbind(X, y_for_cv)) predictions_nn <-predict(nnmodel, data_frame_for_training)png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_corr_nn_pred.png"))plot(predictions_nn, y[,1], ylim=c(-1,20), ylab="ground truth flu rnaseq", xlab="NN predictions", main =glue("NN predictions vs. ground truth for {data_subset} data subset"))dev.off() pred_corr_nn <-cor(predictions_nn, y[,1])print(glue("Correlation of {pred_corr_nn} for {data_subset} data subset with NN predictions"))}
Code
analyze_input_data <-function(data_subset) {print(glue("********* Analyzing data for data subset: {data_subset}")) X <-as.matrix(t(data.table::fread(glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv")))) y <-as.data.frame(t(data.table::fread(glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena.csv"))))print(dim(X))print(dim(y))png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_flu_vs_NI_rnaseq.png"))plot(y[,1], y[,7],ylim=c(-1,20),xlim=c(-1,20), xlab="ground truth flu rnaseq", ylab="ground truth NI rnaseq", main =glue("Flu vs. NI in ground truth for {data_subset} data subset"))dev.off() corr_rnaseq <-cor(y[,1], y[,7])print(glue("Correlation of {corr_rnaseq} for {data_subset} data subset between flu/NI rnaseq"))png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_flu_vs_epi_rnaseq.png"))plot(y[,1], X[,4766],xlim=c(-1,20), xlab="ground truth flu rnaseq", ylab="Reference CAGE epigenome monocyte derived macrophages", main =glue("Reference epi vs. ground truth for {data_subset} data subset"))dev.off() corr_io <-cor(y[,1], X[,4766])print(glue("Correlation of {corr_io} for {data_subset} data subset between aracena flu rnaseq and corresponding enformer prediction"))png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_NI_vs_epi_rnaseq.png"))plot(y[,7], X[,4766],xlim=c(-1,20), xlab="ground truth flu rnaseq", ylab="Reference CAGE epigenome monocyte derived macrophages", main =glue("Reference epi vs. ground truth for {data_subset} data subset"))dev.off() corr_io <-cor(y[,7], X[,4766])print(glue("Correlation of {corr_io} for {data_subset} data subset between aracena NI rnaseq and corresponding enformer prediction"))}
Now we can use the trained models to make predictions and assess model performance.
Code
analyze_input_data("train")
********* Analyzing data for data subset: train
[1] 12232 5313
[1] 12232 12
Correlation of 0.720936539936496 for train data subset between flu/NI rnaseq
Correlation of 0.197374238365873 for train data subset between aracena flu rnaseq and corresponding enformer prediction
Correlation of 0.220588426286466 for train data subset between aracena NI rnaseq and corresponding enformer prediction
Code
analyze_input_data("valid")
********* Analyzing data for data subset: valid
[1] 1577 5313
[1] 1577 12
Correlation of 0.700955195863934 for valid data subset between flu/NI rnaseq
Correlation of 0.385249011868238 for valid data subset between aracena flu rnaseq and corresponding enformer prediction
Correlation of 0.40371591624657 for valid data subset between aracena NI rnaseq and corresponding enformer prediction
Code
analyze_input_data("test")
********* Analyzing data for data subset: test
[1] 2041 5313
[1] 2041 12
Correlation of 0.998305261247481 for test data subset between flu/NI rnaseq
Correlation of 0.0465746240713448 for test data subset between aracena flu rnaseq and corresponding enformer prediction
Correlation of 0.0563540836639804 for test data subset between aracena NI rnaseq and corresponding enformer prediction
********* Collecting data records for data subset: train
[1] 12232
[1] 12232
[1] 12232
[1] 12232
********* Analyzing data for data subset: test
********* Analyzing data for chrom: 13
[1] 319 5313
[1] 319 12
[1] 0.7429337
[1] 0.430552
[1] 0.4427821
[1] 0.7805843
[1] 0.6003512
********* Collecting data records for data subset: test
[1] 2041
[1] 2041
[1] 2041
[1] 2041
********* Analyzing data for data subset: valid
********* Analyzing data for chrom: 12
[1] 990 5313
[1] 990 12
[1] 0.6603037
[1] 0.333304
[1] 0.3614893
[1] 0.6966347
[1] 0.2454961