evaluate_tss_center_mean_pred

Author

Saideep Gona

Published

October 22, 2023

Code
suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"

## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG="evaluate_tss_center_mean_pred" ## copy the slug from the header
bDATE='2023-10-22' ## copy the date from the blog's header here
DATA = glue("{PRE}/{bDATE}-{SLUG}")
if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))
WORK=DATA

Context

Here I trained an elastic net and simple MLP mode to predict TSS RNAseq expression from Flu condition (aracena dataset) from Enformer predictions. Training was done using corpus of TSS sites from a ranmdom mixture of individuals. Here I am evaluating the performance of the trained models on a held out test set.

TODO: Average of individuals TODO: Collect RNAseq from Katie, and use this for DL-EN training AND training PrediXcan TODO: Try different MLP params

Code
library(httpgd)
library(glmnet)
library(glue)
library(data.table)
library(neuralnet)
library(dplyr)
library(ggplot2)
library(patchwork)
library(reshape2)
library(stringr)
Code
predict_on_data_subset <- function(data_subset, glmnet_model_path, nnmodel_path){

  print(glue("********* Predicting for data subset: {data_subset}"))

  # Load trained models
  glmnet_model <- readRDS(glmnet_model_path)
  nnmodel <- readRDS(nnmodel_path)

  png("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/glmnet_lambdas.png")
  plot(glmnet_model)
  dev.off()

  # Load dataset to be predicted on
  data_X <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv")
  data_y <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena.csv")

  X <- as.matrix(t(data.table::fread(data_X)))

  y <- as.data.frame(t(data.table::fread(data_y)))

  y_for_cv <- y[,1]

  # Make predictions and associated plots

  glmnet_predictions <- predict(glmnet_model, X, s = "lambda.min", type="response")

  png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_corr_en_pred.png"))
  plot(glmnet_predictions, y[,1], ylim=c(-1,20), ylab="ground truth flu rnaseq", xlab="glmnet predictions", main = glue("Glmnet predictions vs. ground truth for {data_subset} data subset"))
  dev.off()

  pred_corr_en <- cor(glmnet_predictions, y[,1])
  print(glue("Correlation of {pred_corr_en} for {data_subset} data subset with glmnet predictions"))

  data_frame_for_training <- data.frame(cbind(X, y_for_cv))

  predictions_nn <- predict(nnmodel, data_frame_for_training)

  png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_corr_nn_pred.png"))
  plot(predictions_nn, y[,1], ylim=c(-1,20), ylab="ground truth flu rnaseq", xlab="NN predictions", main = glue("NN predictions vs. ground truth for {data_subset} data subset"))
  dev.off()

  pred_corr_nn <- cor(predictions_nn, y[,1])
  print(glue("Correlation of {pred_corr_nn} for {data_subset} data subset with NN predictions"))
}
Code
analyze_input_data <- function(data_subset) {

  print(glue("********* Analyzing data for data subset: {data_subset}"))

  X <- as.matrix(t(data.table::fread(glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv"))))
  y <- as.data.frame(t(data.table::fread(glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena.csv"))))

  print(dim(X))
  print(dim(y))

  png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_flu_vs_NI_rnaseq.png"))
  plot(y[,1], y[,7],ylim=c(-1,20),xlim=c(-1,20), xlab="ground truth flu rnaseq", ylab="ground truth NI rnaseq", main = glue("Flu vs. NI in ground truth for {data_subset} data subset"))
  dev.off()

  corr_rnaseq <- cor(y[,1], y[,7])
  print(glue("Correlation of {corr_rnaseq} for {data_subset} data subset between flu/NI rnaseq"))

  png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_flu_vs_epi_rnaseq.png"))
  plot(y[,1], X[,4766],xlim=c(-1,20), xlab="ground truth flu rnaseq", ylab="Reference CAGE epigenome monocyte derived macrophages", main = glue("Reference epi vs. ground truth for {data_subset} data subset"))
  dev.off()

  corr_io <- cor(y[,1], X[,4766])
  print(glue("Correlation of {corr_io} for {data_subset} data subset between aracena flu rnaseq and corresponding enformer prediction"))

  png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_NI_vs_epi_rnaseq.png"))
  plot(y[,7], X[,4766],xlim=c(-1,20), xlab="ground truth flu rnaseq", ylab="Reference CAGE epigenome monocyte derived macrophages", main = glue("Reference epi vs. ground truth for {data_subset} data subset"))
  dev.off()

  corr_io <- cor(y[,7], X[,4766])
  print(glue("Correlation of {corr_io} for {data_subset} data subset between aracena NI rnaseq and corresponding enformer prediction"))


}

Now we can use the trained models to make predictions and assess model performance.

Code
analyze_input_data("train")
********* Analyzing data for data subset: train
[1] 12232  5313
[1] 12232    12
Correlation of 0.720936539936496 for train data subset between flu/NI rnaseq
Correlation of 0.197374238365873 for train data subset between aracena flu rnaseq and corresponding enformer prediction
Correlation of 0.220588426286466 for train data subset between aracena NI rnaseq and corresponding enformer prediction
Code
analyze_input_data("valid")
********* Analyzing data for data subset: valid
[1] 1577 5313
[1] 1577   12
Correlation of 0.700955195863934 for valid data subset between flu/NI rnaseq
Correlation of 0.385249011868238 for valid data subset between aracena flu rnaseq and corresponding enformer prediction
Correlation of 0.40371591624657 for valid data subset between aracena NI rnaseq and corresponding enformer prediction
Code
analyze_input_data("test")
********* Analyzing data for data subset: test
[1] 2041 5313
[1] 2041   12
Correlation of 0.998305261247481 for test data subset between flu/NI rnaseq
Correlation of 0.0465746240713448 for test data subset between aracena flu rnaseq and corresponding enformer prediction
Correlation of 0.0563540836639804 for test data subset between aracena NI rnaseq and corresponding enformer prediction
Code
glmnet_model_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_eln_RNAseq_from_HDF5_mean.linear.rds"
nnmodel_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_mlp_RNAseq_from_HDF5_mean"

predict_on_data_subset("train", glmnet_model_path, nnmodel_path)
********* Predicting for data subset: train
Correlation of 0.564373045433215 for train data subset with glmnet predictions
Correlation of 0.461306680081697 for train data subset with NN predictions
Code
predict_on_data_subset("valid", glmnet_model_path, nnmodel_path)
********* Predicting for data subset: valid
Correlation of 0.72381811933344 for valid data subset with glmnet predictions
Correlation of 0.380274064824189 for valid data subset with NN predictions
Code
predict_on_data_subset("test", glmnet_model_path, nnmodel_path)
********* Predicting for data subset: test
Correlation of 0.0824962133319529 for test data subset with glmnet predictions
Correlation of 0.0202890063481298 for test data subset with NN predictions
Code
convert_region_to_gene <- function(region, gene_metadata){

  tss_metadata <- data.table::fread("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")

  tss_metadata$joined_region <- paste(tss_metadata$chromosome_name, tss_metadata$transcription_start_site, tss_metadata$transcription_start_site+2, sep="_")

  tss_metadata[tss_metadata$joined_region == "1_1471765_1471767",]$ensembl_gene_id
}
Code
tss_metadata <- data.table::fread("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")

tss_metadata$joined_region <- paste(tss_metadata$chromosome_name, tss_metadata$transcription_start_site, tss_metadata$transcription_start_site+2, sep="_")

tss_metadata[tss_metadata$joined_region == "1_1471765_1471767",]$ensembl_gene_id
[1] "ENSG00000160072"
Code
split_datasets_chroms <- function(data_subset, mapping_table) {

  data_subset_mapping_table <- mapping_table[,mapping_table["group",]==data_subset]
  regions <- colnames(data_subset_mapping_table)
  chroms <- strsplit(regions, split = "_")
  chroms <- sapply(chroms, function(x) x[1])
  chroms <- sapply(chroms, function(x) substr(x,2,str_length(x)))


  return(chroms)
}
Code
plot_boxplot <- function(df, path) {

    df$is_CAGE <- ifelse(df$variable == "CAGE_epi", "CAGE", "RNAseq")
    png(path)
    g <- ggplot(df, aes(x=variable, y=value)) + geom_boxplot() 
    print(g+facet_wrap(~is_CAGE, scales="free"))
    dev.off()
}
Code
run_cor<- function(x,y, method="pearson") {
  return(cor(x,y, method=method))
}
Code
per_chrom_metrics <- function(data_subset, mapping_table, X_path, y_path, glmnet_model_path, nnmodel_path, cor_method="spearman") {

  print(glue("********* Analyzing data for data subset: {data_subset}"))

  corrs_v <- c()
  corrs_v_spear <- c()
  data_subset_v <- c()
  chrom_v <- c()
  num_genes_v <- c()
  cor_id_v <- c()

  X <- as.matrix(t(data.table::fread(X_path)))
  y <- as.data.frame(t(data.table::fread(y_path)))
  y_for_cv <- y[,1]
  data_frame_for_training <- data.frame(cbind(X, y_for_cv))

  glmnet_model <- readRDS(glmnet_model_path)
  nnmodel <- readRDS(nnmodel_path)

  glmnet_predictions <- predict(glmnet_model, X, s = "lambda.min", type="response")
  predictions_nn <- predict(nnmodel, data_frame_for_training)

  # Record in storage glmnet corr
  data_subset_v <- c(data_subset_v, data_subset)
  corrs_v <- c(corrs_v, run_cor(glmnet_predictions, y[,1]))
  corrs_v_spear <- c(corrs_v_spear, run_cor(glmnet_predictions, y[,1], method="spearman"))
  chrom_v <- c(chrom_v, "all")
  num_genes_v <- c(num_genes_v, dim(X)[1])
  cor_id_v <- c(cor_id_v, "glmnet")

  # Record in storage nn corr
  data_subset_v <- c(data_subset_v, data_subset)
  corrs_v <- c(corrs_v, run_cor(predictions_nn, y[,1]))
  corrs_v_spear <- c(corrs_v_spear, run_cor(predictions_nn, y[,1], method="spearman"))
  chrom_v <- c(chrom_v, "all")
  num_genes_v <- c(num_genes_v, dim(X)[1])
  cor_id_v <- c(cor_id_v, "nn")

  # Record in storage corr between flu and NI
  data_subset_v <- c(data_subset_v, data_subset)
  corrs_v <- c(corrs_v, run_cor(y[,1], y[,7]))
  corrs_v_spear <- c(corrs_v_spear, run_cor(y[,1], y[,7], method="spearman"))
  chrom_v <- c(chrom_v, "all")
  num_genes_v <- c(num_genes_v, dim(X)[1])
  cor_id_v <- c(cor_id_v, "flu_vs_NI")

  # Record in storage corr between flu and epi
  data_subset_v <- c(data_subset_v, data_subset)
  corrs_v <- c(corrs_v, run_cor(y[,1], X[,4766]))
  corrs_v_spear <- c(corrs_v_spear, run_cor(y[,1], X[,4766], method="spearman"))
  chrom_v <- c(chrom_v, "all")
  num_genes_v <- c(num_genes_v, dim(X)[1])
  cor_id_v <- c(cor_id_v, "flu_vs_epi")

  # Done recording
  chroms_all <- split_datasets_chroms(data_subset, mapping_table)
  chroms <- as.vector(chroms_all)
  uniq_chroms <- unique(chroms)

  for (chrom in uniq_chroms) {
    index_of_chrom <- chroms == chrom
    
    X_chrom <- X[index_of_chrom,]
    y_chrom <- y[index_of_chrom,]

    print(glue("********* Analyzing data for chrom: {chrom}"))
    print(dim(X_chrom))
    print(dim(y_chrom))

    print(cor(y_chrom[,1], y_chrom[,7]))
    print(cor(y_chrom[,1], X_chrom[,4766]))
    print(cor(y_chrom[,7], X_chrom[,4766]))

    print(cor(glmnet_predictions[index_of_chrom], y_chrom[,1]))
    print(cor(predictions_nn[index_of_chrom], y_chrom[,1]))

    chrom_df <- data.frame(glmnet_pred=glmnet_predictions[index_of_chrom], flu_rnaseq = y_chrom[,1], NI_rnaseq = y_chrom[,7], CAGE_epi =X_chrom[,4766])
    chrom_df_for_plotting <- melt(chrom_df, measure_vars = colnames(chrom_df))
    plot_boxplot(chrom_df_for_plotting, glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/{data_subset}_chrom_{chrom}_boxplot.png"))

    # Record in storage glmnet corr
    data_subset_v <- c(data_subset_v, data_subset)
    corrs_v <- c(corrs_v, run_cor(glmnet_predictions[index_of_chrom], y_chrom[,1]))
    corrs_v_spear <- c(corrs_v_spear, run_cor(glmnet_predictions[index_of_chrom], y_chrom[,1], method="spearman"))
    chrom_v <- c(chrom_v, chrom)
    num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
    cor_id_v <- c(cor_id_v, "glmnet")

    # Record in storage nn corr
    data_subset_v <- c(data_subset_v, data_subset)
    corrs_v <- c(corrs_v, run_cor(predictions_nn[index_of_chrom], y_chrom[,1]))
    corrs_v_spear <- c(corrs_v_spear, run_cor(predictions_nn[index_of_chrom], y_chrom[,1], method="spearman"))
    chrom_v <- c(chrom_v, chrom)
    num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
    cor_id_v <- c(cor_id_v, "nn")

    # Record in storage corr between flu and NI
    data_subset_v <- c(data_subset_v, data_subset)
    corrs_v <- c(corrs_v, run_cor(y_chrom[,1], y_chrom[,7]))
    corrs_v_spear <- c(corrs_v_spear, run_cor(y_chrom[,1], y_chrom[,7], method="spearman"))
    chrom_v <- c(chrom_v, chrom)
    num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
    cor_id_v <- c(cor_id_v, "flu_vs_NI")

    # Record in storage corr between flu and epi
    data_subset_v <- c(data_subset_v, data_subset)
    corrs_v <- c(corrs_v, run_cor(y_chrom[,1], X_chrom[,4766]))
    corrs_v_spear <- c(corrs_v_spear, run_cor(y_chrom[,1], X_chrom[,4766], method="spearman"))
    chrom_v <- c(chrom_v, chrom)
    num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
    cor_id_v <- c(cor_id_v, "flu_vs_epi")
  }

  storage_df <- data.frame(corrs_v, corrs_v_spear, data_subset_v, chrom_v, num_genes_v, cor_id_v)

  return(storage_df)

}
Code
split_datasets <- function(data_subset, mapping_table) {

  tss_metadata <- data.table::fread("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")
  tss_metadata$joined_region <- paste(tss_metadata$chromosome_name, tss_metadata$transcription_start_site, tss_metadata$transcription_start_site+2, sep="_")

  data_subset_mapping_table <- mapping_table[,mapping_table["group",]==data_subset]
  regions <- colnames(data_subset_mapping_table)
  chroms <- strsplit(regions, split = "_")
  chroms <- sapply(chroms, function(x) x[1])
  chroms <- sapply(chroms, function(x) substr(x,2,str_length(x)))

  ensembl_ids <- rep(NA, length(regions))
  external_gene_names <- rep(NA, length(regions))
  for (x in 1:length(regions)) {
    region <- substr(regions[x],2,str_length(regions[x]))
    ensembl_id <- tss_metadata[tss_metadata$joined_region == region,]$ensembl_gene_id
    external_gene_name <- tss_metadata[tss_metadata$joined_region == region,]$external_gene_name
    ensembl_ids[x]  <- ensembl_id
    external_gene_names[x] <- external_gene_name
  }
  print(length(ensembl_ids))
  print(length(external_gene_names))
  print(length(chroms))
  print(length(regions))
  return_list <- list()
  return_list[["chroms"]] <- chroms
  return_list[["regions"]] <- regions
  return_list[["ensembl_ids"]] <- ensembl_ids
  return_list[["external_gene_names"]] <- external_gene_names

  return(return_list)
}
Code
collect_all_records <- function(data_subset, mapping_table, X_path, y_path, glmnet_model_path, nnmodel_path) {
  print(glue("********* Collecting data records for data subset: {data_subset}"))

  X <- as.matrix(t(data.table::fread(X_path)))
  y <- as.data.frame(t(data.table::fread(y_path)))
  y_for_cv <- y[,1]
  data_frame_for_training <- data.frame(cbind(X, y_for_cv))

  glmnet_model <- readRDS(glmnet_model_path)
  nnmodel <- readRDS(nnmodel_path)

  glmnet_predictions <- as.numeric(predict(glmnet_model, X, s = "lambda.min", type="response"))
  predictions_nn <- predict(nnmodel, data_frame_for_training)

  gene_metadata <- split_datasets(data_subset, mapping_table)

  return_df <- data.frame(data_subset= data_subset, glmnet_pred=glmnet_predictions, nn_pred= predictions_nn, flu_rnaseq = y[,1], NI_rnaseq = y[,7], CAGE_epi =X[,4766], ensembl_id = gene_metadata[["ensembl_ids"]], external_gene_name = gene_metadata[["external_gene_names"]], region = gene_metadata[["regions"]], chroms = gene_metadata[["chroms"]])

  return(return_df)

}
Code
remapped_table_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/remapped_table_filt.csv"

remapped_table <- read.table(remapped_table_path, header = TRUE, sep = ",")
print(dim(remapped_table))
[1]    29 15851
Code
print(as.character(remapped_table[,1]))
 [1] "EU39"          "EU27"          "AF26"          "EU33"         
 [5] "AF18"          "AF38"          "AF20"          "AF22"         
 [9] "AF08"          "AF28"          "EU25"          "AF24"         
[13] "EU05"          "AF34"          "EU47"          "EU13"         
[17] "AF36"          "EU03"          "AF30"          "AF12"         
[21] "EU09"          "EU15"          "EU07"          "EU21"         
[25] "AF16"          "EU19"          "EU41"          "ref-epigenome"
[29] "group"        
Code
rownames(remapped_table) <- as.vector(remapped_table[,1])
remapped_table[,1] <- NULL
Code
glmnet_model_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_eln_RNAseq_from_HDF5_mean.linear.rds"
nnmodel_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_mlp_RNAseq_from_HDF5_mean"
cor_method = "spearman"
df_store <- list()
all_records_df_store <- list()
c=1
for (data_subset in c("train","test","valid")){
  X_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv")
  y_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena.csv")
  cur_out_df_perchrom_metrics <- per_chrom_metrics(data_subset, remapped_table, X_path, y_path, glmnet_model_path, nnmodel_path, cor_method)
  df_store[[c]] <- cur_out_df_perchrom_metrics
  all_records_df_store[[c]] <- collect_all_records(data_subset, remapped_table, X_path, y_path, glmnet_model_path, nnmodel_path)
  c=c+1
}
********* Analyzing data for data subset: train
********* Analyzing data for chrom: 1
[1] 2014 5313
[1] 2014   12
[1] 0.585159
[1] 0.3320535
[1] 0.3200134
[1] 0.7007275
[1] 0.4931243
********* Analyzing data for chrom: 10
[1]  710 5313
[1] 710  12
[1] 0.4914441
[1] 0.2711813
[1] 0.4177587
[1] 0.7732099
[1] 0.6417866
********* Analyzing data for chrom: 11
[1] 1287 5313
[1] 1287   12
[1] 0.4685465
[1] 0.3471125
[1] 0.4118005
[1] 0.773881
[1] 0.5169719
********* Analyzing data for chrom: 16
[1]  839 5313
[1] 839  12
[1] 0.4881721
[1] 0.275333
[1] 0.4377035
[1] 0.5745219
[1] 0.4885602
********* Analyzing data for chrom: 17
[1] 1152 5313
[1] 1152   12
[1] 0.4668504
[1] 0.2194275
[1] 0.3007733
[1] 0.6878922
[1] 0.5306309
********* Analyzing data for chrom: 18
[1]  262 5313
[1] 262  12
[1] 0.7290399
[1] 0.3850517
[1] 0.4533551
[1] 0.6865047
[1] 0.2268427
********* Analyzing data for chrom: 19
[1] 1423 5313
[1] 1423   12
[1] 0.9739646
[1] 0.1549743
[1] 0.1543157
[1] 0.5526522
[1] 0.3117509
********* Analyzing data for chrom: 2
[1] 1225 5313
[1] 1225   12
[1] 0.2905684
[1] 0.1839524
[1] 0.3956614
[1] 0.6216482
[1] 0.6836889
********* Analyzing data for chrom: 20
[1]  533 5313
[1] 533  12
[1] 0.8520808
[1] 0.3117639
[1] 0.3367319
[1] 0.5004507
[1] 0.2347415
********* Analyzing data for chrom: 21
[1]  218 5313
[1] 218  12
[1] 0.6917517
[1] 0.4573482
[1] 0.6141122
[1] 0.7530417
[1] 0.3123373
********* Analyzing data for chrom: 3
[1] 1059 5313
[1] 1059   12
[1] 0.4420113
[1] 0.3056034
[1] 0.413794
[1] 0.6469695
[1] 0.3702832
********* Analyzing data for chrom: 4
[1]  748 5313
[1] 748  12
[1] 0.1806992
[1] 0.05613264
[1] 0.397568
[1] 0.5375329
[1] 0.8132907
********* Analyzing data for chrom: 9
[1]  762 5313
[1] 762  12
[1] 0.7291291
[1] 0.2504148
[1] 0.4520028
[1] 0.5974853
[1] 0.3584159
********* Collecting data records for data subset: train
[1] 12232
[1] 12232
[1] 12232
[1] 12232
********* Analyzing data for data subset: test
********* Analyzing data for chrom: 13
[1]  319 5313
[1] 319  12
[1] 0.7429337
[1] 0.430552
[1] 0.4427821
[1] 0.7805843
[1] 0.6003512
********* Analyzing data for chrom: 14
[1]  600 5313
[1] 600  12
[1] 0.9994539
[1] 0.072076
[1] 0.07470177
[1] 0.1531511
[1] 0.05186287
********* Analyzing data for chrom: 22
[1]  431 5313
[1] 431  12
[1] 0.3050559
[1] 0.2059432
[1] 0.484827
[1] 0.4394194
[1] 0.2096601
********* Analyzing data for chrom: 8
[1]  691 5313
[1] 691  12
[1] 0.6345597
[1] 0.2186081
[1] 0.4057245
[1] 0.5265987
[1] 0.2321117
********* Collecting data records for data subset: test
[1] 2041
[1] 2041
[1] 2041
[1] 2041
********* Analyzing data for data subset: valid
********* Analyzing data for chrom: 12
[1]  990 5313
[1] 990  12
[1] 0.6603037
[1] 0.333304
[1] 0.3614893
[1] 0.6966347
[1] 0.2454961
********* Analyzing data for chrom: 15
[1]  587 5313
[1] 587  12
[1] 0.8018562
[1] 0.4498818
[1] 0.4859656
[1] 0.8063956
[1] 0.4831483
********* Collecting data records for data subset: valid
[1] 1577
[1] 1577
[1] 1577
[1] 1577
Code
full_df <- do.call(rbind, df_store)
full_df$data_subset_v <- factor(full_df$data_subset_v, levels = c("train", "valid", "test"))
full_df$cor_id_v <- factor(full_df$cor_id_v, levels = c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI"))
full_df$chrom_v <- factor(full_df$chrom_v, levels = c("all", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"))
Code
full_df
      corrs_v corrs_v_spear data_subset_v chrom_v num_genes_v   cor_id_v
1  0.56437305     0.4400607         train     all       12232     glmnet
2  0.46130668     0.6149674         train     all       12232         nn
3  0.72093654     0.8981904         train     all       12232  flu_vs_NI
4  0.19737424     0.7190657         train     all       12232 flu_vs_epi
5  0.70072752     0.4759208         train       1        2014     glmnet
6  0.49312427     0.6343898         train       1        2014         nn
7  0.58515902     0.9016380         train       1        2014  flu_vs_NI
8  0.33205353     0.7298823         train       1        2014 flu_vs_epi
9  0.77320992     0.4772985         train      10         710     glmnet
10 0.64178663     0.6487913         train      10         710         nn
11 0.49144409     0.8878139         train      10         710  flu_vs_NI
12 0.27118134     0.7187106         train      10         710 flu_vs_epi
13 0.77388098     0.3879118         train      11        1287     glmnet
14 0.51697192     0.6607209         train      11        1287         nn
15 0.46854652     0.9059549         train      11        1287  flu_vs_NI
16 0.34711246     0.7530519         train      11        1287 flu_vs_epi
17 0.57452192     0.4828096         train      16         839     glmnet
18 0.48856024     0.5234316         train      16         839         nn
19 0.48817208     0.8933155         train      16         839  flu_vs_NI
20 0.27533304     0.6388009         train      16         839 flu_vs_epi
21 0.68789220     0.4667182         train      17        1152     glmnet
22 0.53063095     0.5859384         train      17        1152         nn
23 0.46685038     0.8930906         train      17        1152  flu_vs_NI
24 0.21942752     0.7038228         train      17        1152 flu_vs_epi
25 0.68650467     0.4426711         train      18         262     glmnet
26 0.22684268     0.6387822         train      18         262         nn
27 0.72903990     0.9154724         train      18         262  flu_vs_NI
28 0.38505169     0.7902356         train      18         262 flu_vs_epi
29 0.55265220     0.4056606         train      19        1423     glmnet
30 0.31175090     0.5672843         train      19        1423         nn
31 0.97396459     0.8988924         train      19        1423  flu_vs_NI
32 0.15497429     0.6557346         train      19        1423 flu_vs_epi
33 0.62164822     0.4493787         train       2        1225     glmnet
34 0.68368886     0.6006122         train       2        1225         nn
35 0.29056839     0.9029122         train       2        1225  flu_vs_NI
36 0.18395238     0.7360532         train       2        1225 flu_vs_epi
37 0.50045073     0.4321967         train      20         533     glmnet
38 0.23474151     0.6430574         train      20         533         nn
39 0.85208081     0.9036955         train      20         533  flu_vs_NI
40 0.31176386     0.7297021         train      20         533 flu_vs_epi
41 0.75304174     0.5479164         train      21         218     glmnet
42 0.31233726     0.7711796         train      21         218         nn
43 0.69175174     0.9274325         train      21         218  flu_vs_NI
44 0.45734825     0.8176841         train      21         218 flu_vs_epi
45 0.64696953     0.4364245         train       3        1059     glmnet
46 0.37028322     0.6337275         train       3        1059         nn
47 0.44201128     0.8951994         train       3        1059  flu_vs_NI
48 0.30560340     0.7297525         train       3        1059 flu_vs_epi
49 0.53753285     0.3664512         train       4         748     glmnet
50 0.81329066     0.6168320         train       4         748         nn
51 0.18069922     0.9006577         train       4         748  flu_vs_NI
52 0.05613264     0.7691620         train       4         748 flu_vs_epi
53 0.59748531     0.4263626         train       9         762     glmnet
54 0.35841591     0.5390046         train       9         762         nn
55 0.72912914     0.8530804         train       9         762  flu_vs_NI
56 0.25041484     0.6488148         train       9         762 flu_vs_epi
57 0.08249621     0.4208230          test     all        2041     glmnet
58 0.02028901     0.6142361          test     all        2041         nn
59 0.99830526     0.8965072          test     all        2041  flu_vs_NI
60 0.04657462     0.7382084          test     all        2041 flu_vs_epi
61 0.78058434     0.3894792          test      13         319     glmnet
62 0.60035120     0.6189132          test      13         319         nn
63 0.74293368     0.8793152          test      13         319  flu_vs_NI
64 0.43055203     0.7464936          test      13         319 flu_vs_epi
65 0.15315112     0.4327574          test      14         600     glmnet
66 0.05186287     0.5917317          test      14         600         nn
67 0.99945392     0.9133753          test      14         600  flu_vs_NI
68 0.07207600     0.6967590          test      14         600 flu_vs_epi
69 0.43941944     0.4661211          test      22         431     glmnet
70 0.20966013     0.6194520          test      22         431         nn
71 0.30505592     0.8901027          test      22         431  flu_vs_NI
72 0.20594316     0.7080015          test      22         431 flu_vs_epi
73 0.52659868     0.3980502          test       8         691     glmnet
74 0.23211174     0.6376234          test       8         691         nn
75 0.63455974     0.8866055          test       8         691  flu_vs_NI
76 0.21860805     0.7755985          test       8         691 flu_vs_epi
77 0.72381812     0.4566016         valid     all        1577     glmnet
78 0.38027406     0.6083679         valid     all        1577         nn
79 0.70095520     0.8977642         valid     all        1577  flu_vs_NI
80 0.38524901     0.7205278         valid     all        1577 flu_vs_epi
81 0.69663472     0.4578672         valid      12         990     glmnet
82 0.24549606     0.5868101         valid      12         990         nn
83 0.66030373     0.8924773         valid      12         990  flu_vs_NI
84 0.33330399     0.7015403         valid      12         990 flu_vs_epi
85 0.80639559     0.4517969         valid      15         587     glmnet
86 0.48314833     0.6449347         valid      15         587         nn
87 0.80185617     0.9048721         valid      15         587  flu_vs_NI
88 0.44988177     0.7534411         valid      15         587 flu_vs_epi
Code
full_df$num_genes_v <- as.character(full_df$num_genes_v)
full_df_melt <- melt(full_df, measure_vars=c("corrs_v","corrs_v_spear"), id_vars=c("data_subset_v", "chrom_v", "cor_id_v", "num_genes_v"))
full_df_melt
    data_subset_v chrom_v num_genes_v   cor_id_v      variable      value
1           train     all       12232     glmnet       corrs_v 0.56437305
2           train     all       12232         nn       corrs_v 0.46130668
3           train     all       12232  flu_vs_NI       corrs_v 0.72093654
4           train     all       12232 flu_vs_epi       corrs_v 0.19737424
5           train       1        2014     glmnet       corrs_v 0.70072752
6           train       1        2014         nn       corrs_v 0.49312427
7           train       1        2014  flu_vs_NI       corrs_v 0.58515902
8           train       1        2014 flu_vs_epi       corrs_v 0.33205353
9           train      10         710     glmnet       corrs_v 0.77320992
10          train      10         710         nn       corrs_v 0.64178663
11          train      10         710  flu_vs_NI       corrs_v 0.49144409
12          train      10         710 flu_vs_epi       corrs_v 0.27118134
13          train      11        1287     glmnet       corrs_v 0.77388098
14          train      11        1287         nn       corrs_v 0.51697192
15          train      11        1287  flu_vs_NI       corrs_v 0.46854652
16          train      11        1287 flu_vs_epi       corrs_v 0.34711246
17          train      16         839     glmnet       corrs_v 0.57452192
18          train      16         839         nn       corrs_v 0.48856024
19          train      16         839  flu_vs_NI       corrs_v 0.48817208
20          train      16         839 flu_vs_epi       corrs_v 0.27533304
21          train      17        1152     glmnet       corrs_v 0.68789220
22          train      17        1152         nn       corrs_v 0.53063095
23          train      17        1152  flu_vs_NI       corrs_v 0.46685038
24          train      17        1152 flu_vs_epi       corrs_v 0.21942752
25          train      18         262     glmnet       corrs_v 0.68650467
26          train      18         262         nn       corrs_v 0.22684268
27          train      18         262  flu_vs_NI       corrs_v 0.72903990
28          train      18         262 flu_vs_epi       corrs_v 0.38505169
29          train      19        1423     glmnet       corrs_v 0.55265220
30          train      19        1423         nn       corrs_v 0.31175090
31          train      19        1423  flu_vs_NI       corrs_v 0.97396459
32          train      19        1423 flu_vs_epi       corrs_v 0.15497429
33          train       2        1225     glmnet       corrs_v 0.62164822
34          train       2        1225         nn       corrs_v 0.68368886
35          train       2        1225  flu_vs_NI       corrs_v 0.29056839
36          train       2        1225 flu_vs_epi       corrs_v 0.18395238
37          train      20         533     glmnet       corrs_v 0.50045073
38          train      20         533         nn       corrs_v 0.23474151
39          train      20         533  flu_vs_NI       corrs_v 0.85208081
40          train      20         533 flu_vs_epi       corrs_v 0.31176386
41          train      21         218     glmnet       corrs_v 0.75304174
42          train      21         218         nn       corrs_v 0.31233726
43          train      21         218  flu_vs_NI       corrs_v 0.69175174
44          train      21         218 flu_vs_epi       corrs_v 0.45734825
45          train       3        1059     glmnet       corrs_v 0.64696953
46          train       3        1059         nn       corrs_v 0.37028322
47          train       3        1059  flu_vs_NI       corrs_v 0.44201128
48          train       3        1059 flu_vs_epi       corrs_v 0.30560340
49          train       4         748     glmnet       corrs_v 0.53753285
50          train       4         748         nn       corrs_v 0.81329066
51          train       4         748  flu_vs_NI       corrs_v 0.18069922
52          train       4         748 flu_vs_epi       corrs_v 0.05613264
53          train       9         762     glmnet       corrs_v 0.59748531
54          train       9         762         nn       corrs_v 0.35841591
55          train       9         762  flu_vs_NI       corrs_v 0.72912914
56          train       9         762 flu_vs_epi       corrs_v 0.25041484
57           test     all        2041     glmnet       corrs_v 0.08249621
58           test     all        2041         nn       corrs_v 0.02028901
59           test     all        2041  flu_vs_NI       corrs_v 0.99830526
60           test     all        2041 flu_vs_epi       corrs_v 0.04657462
61           test      13         319     glmnet       corrs_v 0.78058434
62           test      13         319         nn       corrs_v 0.60035120
63           test      13         319  flu_vs_NI       corrs_v 0.74293368
64           test      13         319 flu_vs_epi       corrs_v 0.43055203
65           test      14         600     glmnet       corrs_v 0.15315112
66           test      14         600         nn       corrs_v 0.05186287
67           test      14         600  flu_vs_NI       corrs_v 0.99945392
68           test      14         600 flu_vs_epi       corrs_v 0.07207600
69           test      22         431     glmnet       corrs_v 0.43941944
70           test      22         431         nn       corrs_v 0.20966013
71           test      22         431  flu_vs_NI       corrs_v 0.30505592
72           test      22         431 flu_vs_epi       corrs_v 0.20594316
73           test       8         691     glmnet       corrs_v 0.52659868
74           test       8         691         nn       corrs_v 0.23211174
75           test       8         691  flu_vs_NI       corrs_v 0.63455974
76           test       8         691 flu_vs_epi       corrs_v 0.21860805
77          valid     all        1577     glmnet       corrs_v 0.72381812
78          valid     all        1577         nn       corrs_v 0.38027406
79          valid     all        1577  flu_vs_NI       corrs_v 0.70095520
80          valid     all        1577 flu_vs_epi       corrs_v 0.38524901
81          valid      12         990     glmnet       corrs_v 0.69663472
82          valid      12         990         nn       corrs_v 0.24549606
83          valid      12         990  flu_vs_NI       corrs_v 0.66030373
84          valid      12         990 flu_vs_epi       corrs_v 0.33330399
85          valid      15         587     glmnet       corrs_v 0.80639559
86          valid      15         587         nn       corrs_v 0.48314833
87          valid      15         587  flu_vs_NI       corrs_v 0.80185617
88          valid      15         587 flu_vs_epi       corrs_v 0.44988177
89          train     all       12232     glmnet corrs_v_spear 0.44006067
90          train     all       12232         nn corrs_v_spear 0.61496743
91          train     all       12232  flu_vs_NI corrs_v_spear 0.89819038
92          train     all       12232 flu_vs_epi corrs_v_spear 0.71906572
93          train       1        2014     glmnet corrs_v_spear 0.47592078
94          train       1        2014         nn corrs_v_spear 0.63438978
95          train       1        2014  flu_vs_NI corrs_v_spear 0.90163796
96          train       1        2014 flu_vs_epi corrs_v_spear 0.72988230
97          train      10         710     glmnet corrs_v_spear 0.47729851
98          train      10         710         nn corrs_v_spear 0.64879133
99          train      10         710  flu_vs_NI corrs_v_spear 0.88781386
100         train      10         710 flu_vs_epi corrs_v_spear 0.71871063
101         train      11        1287     glmnet corrs_v_spear 0.38791178
102         train      11        1287         nn corrs_v_spear 0.66072093
103         train      11        1287  flu_vs_NI corrs_v_spear 0.90595495
104         train      11        1287 flu_vs_epi corrs_v_spear 0.75305191
105         train      16         839     glmnet corrs_v_spear 0.48280962
106         train      16         839         nn corrs_v_spear 0.52343163
107         train      16         839  flu_vs_NI corrs_v_spear 0.89331548
108         train      16         839 flu_vs_epi corrs_v_spear 0.63880091
109         train      17        1152     glmnet corrs_v_spear 0.46671819
110         train      17        1152         nn corrs_v_spear 0.58593836
111         train      17        1152  flu_vs_NI corrs_v_spear 0.89309058
112         train      17        1152 flu_vs_epi corrs_v_spear 0.70382284
113         train      18         262     glmnet corrs_v_spear 0.44267108
114         train      18         262         nn corrs_v_spear 0.63878222
115         train      18         262  flu_vs_NI corrs_v_spear 0.91547235
116         train      18         262 flu_vs_epi corrs_v_spear 0.79023561
117         train      19        1423     glmnet corrs_v_spear 0.40566056
118         train      19        1423         nn corrs_v_spear 0.56728434
119         train      19        1423  flu_vs_NI corrs_v_spear 0.89889239
120         train      19        1423 flu_vs_epi corrs_v_spear 0.65573461
121         train       2        1225     glmnet corrs_v_spear 0.44937869
122         train       2        1225         nn corrs_v_spear 0.60061216
123         train       2        1225  flu_vs_NI corrs_v_spear 0.90291218
124         train       2        1225 flu_vs_epi corrs_v_spear 0.73605316
125         train      20         533     glmnet corrs_v_spear 0.43219673
126         train      20         533         nn corrs_v_spear 0.64305741
127         train      20         533  flu_vs_NI corrs_v_spear 0.90369551
128         train      20         533 flu_vs_epi corrs_v_spear 0.72970210
129         train      21         218     glmnet corrs_v_spear 0.54791637
130         train      21         218         nn corrs_v_spear 0.77117959
131         train      21         218  flu_vs_NI corrs_v_spear 0.92743255
132         train      21         218 flu_vs_epi corrs_v_spear 0.81768413
133         train       3        1059     glmnet corrs_v_spear 0.43642452
134         train       3        1059         nn corrs_v_spear 0.63372748
135         train       3        1059  flu_vs_NI corrs_v_spear 0.89519940
136         train       3        1059 flu_vs_epi corrs_v_spear 0.72975246
137         train       4         748     glmnet corrs_v_spear 0.36645123
138         train       4         748         nn corrs_v_spear 0.61683195
139         train       4         748  flu_vs_NI corrs_v_spear 0.90065772
140         train       4         748 flu_vs_epi corrs_v_spear 0.76916201
141         train       9         762     glmnet corrs_v_spear 0.42636257
142         train       9         762         nn corrs_v_spear 0.53900459
143         train       9         762  flu_vs_NI corrs_v_spear 0.85308044
144         train       9         762 flu_vs_epi corrs_v_spear 0.64881475
145          test     all        2041     glmnet corrs_v_spear 0.42082300
146          test     all        2041         nn corrs_v_spear 0.61423606
147          test     all        2041  flu_vs_NI corrs_v_spear 0.89650717
148          test     all        2041 flu_vs_epi corrs_v_spear 0.73820835
149          test      13         319     glmnet corrs_v_spear 0.38947919
150          test      13         319         nn corrs_v_spear 0.61891315
151          test      13         319  flu_vs_NI corrs_v_spear 0.87931524
152          test      13         319 flu_vs_epi corrs_v_spear 0.74649363
153          test      14         600     glmnet corrs_v_spear 0.43275744
154          test      14         600         nn corrs_v_spear 0.59173172
155          test      14         600  flu_vs_NI corrs_v_spear 0.91337526
156          test      14         600 flu_vs_epi corrs_v_spear 0.69675904
157          test      22         431     glmnet corrs_v_spear 0.46612115
158          test      22         431         nn corrs_v_spear 0.61945201
159          test      22         431  flu_vs_NI corrs_v_spear 0.89010271
160          test      22         431 flu_vs_epi corrs_v_spear 0.70800148
161          test       8         691     glmnet corrs_v_spear 0.39805017
162          test       8         691         nn corrs_v_spear 0.63762339
163          test       8         691  flu_vs_NI corrs_v_spear 0.88660553
164          test       8         691 flu_vs_epi corrs_v_spear 0.77559845
165         valid     all        1577     glmnet corrs_v_spear 0.45660163
166         valid     all        1577         nn corrs_v_spear 0.60836790
167         valid     all        1577  flu_vs_NI corrs_v_spear 0.89776421
168         valid     all        1577 flu_vs_epi corrs_v_spear 0.72052783
169         valid      12         990     glmnet corrs_v_spear 0.45786716
170         valid      12         990         nn corrs_v_spear 0.58681013
171         valid      12         990  flu_vs_NI corrs_v_spear 0.89247731
172         valid      12         990 flu_vs_epi corrs_v_spear 0.70154032
173         valid      15         587     glmnet corrs_v_spear 0.45179692
174         valid      15         587         nn corrs_v_spear 0.64493467
175         valid      15         587  flu_vs_NI corrs_v_spear 0.90487214
176         valid      15         587 flu_vs_epi corrs_v_spear 0.75344108

Set color palette

Code
corr_colors <- c("glmnet" = "#F8766D", "nn" = "#00BA38", "flu_vs_epi" = "#619CFF", "flu_vs_NI" = "#F564E3")
subset_colors <- c("train" = "orange", "valid" = "yellow", "test" = "black")
Code
full_df_melt_all_chroms <- full_df_melt[full_df_melt$chrom_v == "all",]

png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_all_chroms_both.png"))
ggplot(full_df_melt_all_chroms) + geom_bar(aes(x=data_subset_v, y=value, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors) + facet_wrap(~variable, nrow=1)
dev.off()
svg 
  2 
Code
library(ggplot2)

full_df_all_chroms <- full_df[full_df$chrom_v == "all",]

png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_all_chroms.png"))
ggplot(full_df_all_chroms) + geom_bar(aes(x=data_subset_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
dev.off()
svg 
  2 
Code
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_each_chrom.png"), width =1000, height=500)
g <- ggplot(full_df) + geom_bar(aes(x=chrom_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
g+facet_wrap(~data_subset_v, ncol=1)
dev.off()
svg 
  2 
Code
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_each_chrom_nonfaceted.png"), width =1000, height=500)
ggplot(full_df) + geom_bar(aes(x=chrom_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
dev.off()
svg 
  2 
Code
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_each_chrom_numgenes.png"), width =1000, height=500)
ggplot(full_df) + geom_bar(aes(x=chrom_v, y=num_genes_v), stat="identity", position="dodge")
dev.off()
svg 
  2 
Code
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_each_chrom_numgenes.png"), width =1000, height=500)
ggplot(full_df[full_df$chrom_v != "all" & full_df$cor_id_v == "glmnet",]) + geom_bar(aes(x=data_subset_v, fill=chrom_v, y=num_genes_v), stat="identity")
dev.off()
svg 
  2 
Code
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_each_chrom_numgenes_long.png"), width =1000, height=500)
ggplot(full_df[full_df$chrom_v != "all" & full_df$cor_id_v == "glmnet",]) + geom_bar(aes(x=chrom_v, fill=data_subset_v, y=num_genes_v), stat="identity") + scale_fill_manual(values = subset_colors)
dev.off()
svg 
  2 
Code
plot_list <- list()
for (id in c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI")) {
  g<-ggplot(full_df[full_df$cor_id_v == id,]) + geom_bar(aes(x=chrom_v, y=corrs_v), stat="identity", position="dodge") + ylim(c(0,1))
  plot_list[[id]] <- g+facet_wrap(~data_subset_v, ncol=1)
}
for (id in c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI")) {
  png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/corrs_each_chrom_{id}.png"), width =1000, height=500)
  print(plot_list[[id]])
  dev.off()
}
Code
all_records_full_df <- do.call(rbind, all_records_df_store)
all_records_full_df$data_subset <- factor(all_records_full_df$data_subset, levels = c("train", "valid", "test"))
all_records_full_df$chroms<- factor(all_records_full_df$chroms, levels = c("all", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"))

colnames(all_records_full_df)
 [1] "data_subset"        "glmnet_pred"        "nn_pred"           
 [4] "flu_rnaseq"         "NI_rnaseq"          "CAGE_epi"          
 [7] "ensembl_id"         "external_gene_name" "region"            
[10] "chroms"            
Code
all_records_full_df_melt <- melt(all_records_full_df, measure_vars = c("glmnet_pred", "nn_pred", "flu_rnaseq", "NI_rnaseq", "CAGE_epi"))

colnames(all_records_full_df_melt)
[1] "data_subset"        "ensembl_id"         "external_gene_name"
[4] "region"             "chroms"             "variable"          
[7] "value"             
Code
corr_colors <- c("glmnet" = "#F8766D", "nn" = "#00BA38", "flu_vs_epi" = "#619CFF", "flu_vs_NI" = "#F564E3")
subset_colors <- c("train" = "orange", "valid" = "yellow", "test" = "black")

png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-22-evaluate_tss_center_mean_pred/figures/all_records_scatterplots.png"), width =900, height=1200)
(ggplot(all_records_full_df) + geom_point(aes(x=glmnet_pred, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Elastic net pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["glmnet"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=nn_pred, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Neural net pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["nn"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=CAGE_epi, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Enformer CAGE pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["flu_vs_epi"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=NI_rnaseq, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("NI RNAseq")+theme(panel.background = element_rect(fill = alpha(corr_colors["flu_vs_NI"],alpha=0.4)))+scale_color_manual(values=subset_colors))
dev.off()
svg 
  2 
Code
# data_subset <- "test"

# X_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv")
# y_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena.csv")

# # glmnet_model <- readRDS(glmnet_model_path)
# # nnmodel <- readRDS(nnmodel_path)

# X <- as.matrix(t(data.table::fread(X_path)))
# y <- as.data.frame(t(data.table::fread(y_path)))
# y_for_cv <- y[,1]
# data_frame_for_training <- data.frame(cbind(X, y_for_cv))

# predictions_nn <- predict(nnmodel, data_frame_for_training)
# glmnet_predictions <- predict(glmnet_model, X, s = "lambda.min", type="response")

# cor(glmnet_predictions, y[,1])