Code
library(data.table)
library(glue)
library(tidyverse)
Saideep Gona
October 25, 2023
After evaluating the performance of my trained models, it seems clear that the spearman correlations are very stable genome-wide, but the pearson correlations are not. This is likely due to the fact that the pearson correlations are sensitive to outliers, of which I have observed many. In particular, chromosome 14 has 2 extreme outliers, and in general there are many outliers in the tails of the distributions. I think the model will perform better on log-normalized data, so I will try that next.
NN training: 8800 min thresh: 0.311676707121647 8817 error: 273.14126 time: 48.73 mins
# path_to_data = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered"
# for (data_subset in c("train", "valid", "test")) {
# inputs_path <- glue("{path_to_data}/{data_subset}_inputs_mean_aracena.csv")
# inputs <- fread(inputs_path)
# log_inputs <- log(inputs+1)
# log_inputs_path <- glue("{path_to_data}/{data_subset}_inputs_mean_aracena_log.csv")
# write_delim(log_inputs, log_inputs_path, delim = "\t")
# targets_path <- glue("{path_to_data}/{data_subset}_targets_mean_aracena.csv")
# targets <- fread(targets_path)
# log_targets <- log(targets+1)
# log_targets_path <- glue("{path_to_data}/{data_subset}_targets_mean_aracena_log.csv")
# write_delim(log_targets, log_targets_path, delim = "\t")
# }
convert_region_to_gene <- function(region, gene_metadata){
tss_metadata <- data.table::fread("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")
tss_metadata$joined_region <- paste(tss_metadata$chromosome_name, tss_metadata$transcription_start_site, tss_metadata$transcription_start_site+2, sep="_")
}
split_datasets_chroms <- function(data_subset, mapping_table) {
data_subset_mapping_table <- mapping_table[,mapping_table["group",]==data_subset]
regions <- colnames(data_subset_mapping_table)
chroms <- strsplit(regions, split = "_")
chroms <- sapply(chroms, function(x) x[1])
chroms <- sapply(chroms, function(x) substr(x,2,str_length(x)))
return(chroms)
}
per_chrom_metrics <- function(data_subset, mapping_table, X_path, y_path, glmnet_model_path, nnmodel_path, cor_method="pearson", primary_ind=1) {
print(glue("********* Analyzing data for data subset: {data_subset}"))
corrs_v <- c()
corrs_v_spear <- c()
data_subset_v <- c()
chrom_v <- c()
num_genes_v <- c()
cor_id_v <- c()
X <- as.matrix(t(data.table::fread(X_path)))
y <- as.data.frame(t(data.table::fread(y_path)))
y_for_cv <- y[,primary_ind]
data_frame_for_training <- data.frame(cbind(X, y_for_cv))
glmnet_model <- readRDS(glmnet_model_path)
nnmodel <- readRDS(nnmodel_path)
glmnet_predictions <- predict(glmnet_model, X, s = "lambda.min", type="response")
predictions_nn <- predict(nnmodel, data_frame_for_training)
# Record in storage glmnet corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(glmnet_predictions, y[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(glmnet_predictions, y[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "glmnet")
# Record in storage nn corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(predictions_nn, y[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(predictions_nn, y[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "nn")
# Record in storage corr between flu and NI
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(y[,1], y[,7], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(y[,1], y[,7], method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "flu_vs_NI")
# Record in storage corr between flu and epi
cur_cor <- run_cor(y[,primary_ind], X[,4766], log=TRUE, method=cor_method)
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, cur_cor)
corrs_v_spear <- c(corrs_v_spear, run_cor(y[,primary_ind], X[,4766], log=TRUE, method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "flu_vs_epi")
# Done recording
chroms_all <- split_datasets_chroms(data_subset, mapping_table)
chroms <- as.vector(chroms_all)
uniq_chroms <- unique(chroms)
for (chrom in uniq_chroms) {
index_of_chrom <- chroms == chrom
X_chrom <- X[index_of_chrom,]
y_chrom <- y[index_of_chrom,]
print(glue("********* Analyzing data for chrom: {chrom}"))
print(dim(X_chrom))
print(dim(y_chrom))
print(cor(y_chrom[,primary_ind], y_chrom[,7]))
print(cor(y_chrom[,primary_ind], X_chrom[,4766]))
print(cor(y_chrom[,7], X_chrom[,4766]))
print(cor(glmnet_predictions[index_of_chrom], y_chrom[,primary_ind]))
print(cor(predictions_nn[index_of_chrom], y_chrom[,primary_ind]))
chrom_df <- data.frame(glmnet_pred=glmnet_predictions[index_of_chrom], flu_rnaseq = y_chrom[,1], NI_rnaseq = y_chrom[,7], CAGE_epi =X_chrom[,4766])
chrom_df_for_plotting <- melt(chrom_df, measure_vars = colnames(chrom_df))
plot_boxplot(chrom_df_for_plotting, glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/{data_subset}_chrom_{chrom}_boxplot.png"))
# Record in storage glmnet corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(glmnet_predictions[index_of_chrom], y_chrom[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(glmnet_predictions[index_of_chrom], y_chrom[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "glmnet")
# Record in storage nn corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(predictions_nn[index_of_chrom], y_chrom[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(predictions_nn[index_of_chrom], y_chrom[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "nn")
# Record in storage corr between flu and NI
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(y_chrom[,1], y_chrom[,7], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(y_chrom[,1], y_chrom[,7], method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "flu_vs_NI")
# Record in storage corr between flu and epi
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(y_chrom[,primary_ind], X_chrom[,4766], log=TRUE, method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(y_chrom[,primary_ind], X_chrom[,4766], log=TRUE, method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "flu_vs_epi")
}
storage_df <- data.frame(corrs_v, corrs_v_spear,data_subset_v, chrom_v, num_genes_v, cor_id_v)
return(storage_df)
}
split_datasets <- function(data_subset, mapping_table) {
tss_metadata <- data.table::fread("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")
tss_metadata$joined_region <- paste(tss_metadata$chromosome_name, tss_metadata$transcription_start_site, tss_metadata$transcription_start_site+2, sep="_")
data_subset_mapping_table <- mapping_table[,mapping_table["group",]==data_subset]
regions <- colnames(data_subset_mapping_table)
chroms <- strsplit(regions, split = "_")
chroms <- sapply(chroms, function(x) x[1])
chroms <- sapply(chroms, function(x) substr(x,2,str_length(x)))
ensembl_ids <- rep(NA, length(regions))
external_gene_names <- rep(NA, length(regions))
for (x in 1:length(regions)) {
region <- substr(regions[x],2,str_length(regions[x]))
ensembl_id <- tss_metadata[tss_metadata$joined_region == region,]$ensembl_gene_id
external_gene_name <- tss_metadata[tss_metadata$joined_region == region,]$external_gene_name
ensembl_ids[x] <- ensembl_id
external_gene_names[x] <- external_gene_name
}
print(length(ensembl_ids))
print(length(external_gene_names))
print(length(chroms))
print(length(regions))
return_list <- list()
return_list[["chroms"]] <- chroms
return_list[["regions"]] <- regions
return_list[["ensembl_ids"]] <- ensembl_ids
return_list[["external_gene_names"]] <- external_gene_names
return(return_list)
}
collect_all_records <- function(data_subset, mapping_table, X_path, y_path, glmnet_model_path, nnmodel_path) {
print(glue("********* Collecting data records for data subset: {data_subset}"))
X <- as.matrix(t(data.table::fread(X_path)))
y <- as.data.frame(t(data.table::fread(y_path)))
y_for_cv <- y[,1]
data_frame_for_training <- data.frame(cbind(X, y_for_cv))
glmnet_model <- readRDS(glmnet_model_path)
nnmodel <- readRDS(nnmodel_path)
glmnet_predictions <- as.numeric(predict(glmnet_model, X, s = "lambda.min", type="response"))
predictions_nn <- predict(nnmodel, data_frame_for_training)
gene_metadata <- split_datasets(data_subset, mapping_table)
return_df <- data.frame(data_subset= data_subset, glmnet_pred=glmnet_predictions, nn_pred= predictions_nn, flu_rnaseq = y[,1], NI_rnaseq = y[,7], CAGE_epi =X[,4766], ensembl_id = gene_metadata[["ensembl_ids"]], external_gene_name = gene_metadata[["external_gene_names"]], region = gene_metadata[["regions"]], chroms = gene_metadata[["chroms"]])
return(return_df)
}
[1] 29 15851
[1] "EU39" "EU27" "AF26" "EU33"
[5] "AF18" "AF38" "AF20" "AF22"
[9] "AF08" "AF28" "EU25" "AF24"
[13] "EU05" "AF34" "EU47" "EU13"
[17] "AF36" "EU03" "AF30" "AF12"
[21] "EU09" "EU15" "EU07" "EU21"
[25] "AF16" "EU19" "EU41" "ref-epigenome"
[29] "group"
primary_ind <- 1
glmnet_model_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_eln_RNAseq_NI_from_HDF5_mean_log.linear.rds"
nnmodel_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_mlp_RNAseq_NI_from_HDF5_mean_log.rds"
cor_method = "pearson"
df_store <- list()
all_records_df_store <- list()
c=1
for (data_subset in c("train","test","valid")){
X_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv")
y_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena_log.csv")
cur_out_df_perchrom_metrics <- per_chrom_metrics(data_subset, remapped_table, X_path, y_path, glmnet_model_path, nnmodel_path, cor_method, primary_ind)
df_store[[c]] <- cur_out_df_perchrom_metrics
all_records_df_store[[c]] <- collect_all_records(data_subset, remapped_table, X_path, y_path, glmnet_model_path, nnmodel_path)
c=c+1
}
********* Analyzing data for data subset: train
********* Analyzing data for chrom: 1
[1] 2014 5313
[1] 2014 12
[1] 0.7738281
[1] 0.4906751
[1] 0.5128315
[1] 0.7047689
[1] 0.5813407
********* Analyzing data for chrom: 10
[1] 710 5313
[1] 710 12
[1] 0.6472775
[1] 0.447759
[1] 0.5692331
[1] 0.5909975
[1] 0.5339694
********* Analyzing data for chrom: 11
[1] 1287 5313
[1] 1287 12
[1] 0.7712791
[1] 0.5456487
[1] 0.5931671
[1] 0.7538166
[1] 0.6369364
********* Analyzing data for chrom: 16
[1] 839 5313
[1] 839 12
[1] 0.6938366
[1] 0.4392715
[1] 0.5129959
[1] 0.63134
[1] 0.5500776
********* Analyzing data for chrom: 17
[1] 1152 5313
[1] 1152 12
[1] 0.6449451
[1] 0.394819
[1] 0.5282789
[1] 0.6056413
[1] 0.5063302
********* Analyzing data for chrom: 18
[1] 262 5313
[1] 262 12
[1] 0.7886703
[1] 0.4710867
[1] 0.5742142
[1] 0.7016001
[1] 0.5409966
********* Analyzing data for chrom: 19
[1] 1423 5313
[1] 1423 12
[1] 0.8067522
[1] 0.3972782
[1] 0.4672016
[1] 0.6908282
[1] 0.4961597
********* Analyzing data for chrom: 2
[1] 1225 5313
[1] 1225 12
[1] 0.6746361
[1] 0.4599844
[1] 0.5473104
[1] 0.6263492
[1] 0.62898
********* Analyzing data for chrom: 20
[1] 533 5313
[1] 533 12
[1] 0.836731
[1] 0.4295909
[1] 0.5013971
[1] 0.6731075
[1] 0.556115
********* Analyzing data for chrom: 21
[1] 218 5313
[1] 218 12
[1] 0.7940787
[1] 0.5481793
[1] 0.6873015
[1] 0.7568527
[1] 0.7324491
********* Analyzing data for chrom: 3
[1] 1059 5313
[1] 1059 12
[1] 0.743747
[1] 0.4979988
[1] 0.5679769
[1] 0.6932378
[1] 0.5565457
********* Analyzing data for chrom: 4
[1] 748 5313
[1] 748 12
[1] 0.5374208
[1] 0.3372035
[1] 0.5504596
[1] 0.4418816
[1] 0.4109451
********* Analyzing data for chrom: 9
[1] 762 5313
[1] 762 12
[1] 0.7774666
[1] 0.4870424
[1] 0.5928558
[1] 0.7243833
[1] 0.5766746
********* Collecting data records for data subset: train
[1] 12232
[1] 12232
[1] 12232
[1] 12232
********* Analyzing data for data subset: test
********* Analyzing data for chrom: 13
[1] 319 5313
[1] 319 12
[1] 0.7747543
[1] 0.5376015
[1] 0.5963206
[1] 0.6713264
[1] 0.6136282
********* Analyzing data for chrom: 14
[1] 600 5313
[1] 600 12
[1] 0.8725071
[1] 0.3379276
[1] 0.3741996
[1] 0.540512
[1] 0.4616933
********* Analyzing data for chrom: 22
[1] 431 5313
[1] 431 12
[1] 0.5932224
[1] 0.3775029
[1] 0.5739591
[1] 0.5127326
[1] 0.4755188
********* Analyzing data for chrom: 8
[1] 691 5313
[1] 691 12
[1] 0.7137821
[1] 0.460958
[1] 0.6211134
[1] 0.6131504
[1] 0.5130661
********* Collecting data records for data subset: test
[1] 2041
[1] 2041
[1] 2041
[1] 2041
********* Analyzing data for data subset: valid
********* Analyzing data for chrom: 12
[1] 990 5313
[1] 990 12
[1] 0.7726486
[1] 0.4867567
[1] 0.5453881
[1] 0.6965704
[1] 0.5466499
********* Analyzing data for chrom: 15
[1] 587 5313
[1] 587 12
[1] 0.7970598
[1] 0.5935388
[1] 0.6312264
[1] 0.7513436
[1] 0.6344261
********* Collecting data records for data subset: valid
[1] 1577
[1] 1577
[1] 1577
[1] 1577
full_df <- do.call(rbind, df_store)
full_df$data_subset_v <- factor(full_df$data_subset_v, levels = c("train", "valid", "test"))
full_df$cor_id_v <- factor(full_df$cor_id_v, levels = c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI"))
full_df$chrom_v <- factor(full_df$chrom_v, levels = c("all", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"))
Set color palette
full_df$num_genes_v <- as.character(full_df$num_genes_v)
full_df_melt <- melt(full_df, measure_vars=c("corrs_v","corrs_v_spear"), id_vars=c("data_subset_v", "chrom_v", "cor_id_v", "num_genes_v"))
full_df_melt$corr_type <- ifelse(full_df_melt$variable == "corrs_v", "pearson", "spearman")
full_df_melt_all_chroms <- full_df_melt[full_df_melt$chrom_v == "all",]
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_all_chroms_both.png"), width=800, height=400)
ggplot(full_df_melt_all_chroms) + geom_bar(aes(x=data_subset_v, y=value, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors) + facet_wrap(~corr_type, nrow=1) + xlab("Data subset") + ylab("Correlation")
dev.off()
svg
2
library(ggplot2)
full_df_all_chroms <- full_df[full_df$chrom_v == "all",]
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_all_chroms.png"))
ggplot(full_df_all_chroms) + geom_bar(aes(x=data_subset_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
dev.off()
svg
2
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom.png"), width =1000, height=500)
g <- ggplot(full_df) + geom_bar(aes(x=chrom_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
g+facet_wrap(~data_subset_v, ncol=1)
dev.off()
svg
2
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_nonfaceted.png"), width =1000, height=500)
ggplot(full_df) + geom_bar(aes(x=chrom_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
dev.off()
svg
2
svg
2
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_numgenes.png"), width =1000, height=500)
ggplot(full_df[full_df$chrom_v != "all" & full_df$cor_id_v == "glmnet",]) + geom_bar(aes(x=data_subset_v, fill=chrom_v, y=num_genes_v), stat="identity")
dev.off()
svg
2
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_numgenes_long.png"), width =1000, height=500)
ggplot(full_df[full_df$chrom_v != "all" & full_df$cor_id_v == "glmnet",]) + geom_bar(aes(x=chrom_v, fill=data_subset_v, y=num_genes_v), stat="identity") + scale_fill_manual(values = subset_colors)
dev.off()
svg
2
plot_list <- list()
for (id in c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI")) {
g<-ggplot(full_df[full_df$cor_id_v == id,]) + geom_bar(aes(x=chrom_v, y=corrs_v), stat="identity", position="dodge") + ylim(c(0,1))
plot_list[[id]] <- g+facet_wrap(~data_subset_v, ncol=1)
}
for (id in c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI")) {
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_{id}.png"), width =1000, height=500)
print(plot_list[[id]])
dev.off()
}
all_records_full_df <- do.call(rbind, all_records_df_store)
all_records_full_df$data_subset <- factor(all_records_full_df$data_subset, levels = c("train", "valid", "test"))
all_records_full_df$chroms<- factor(all_records_full_df$chroms, levels = c("all", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"))
colnames(all_records_full_df)
[1] "data_subset" "glmnet_pred" "nn_pred"
[4] "flu_rnaseq" "NI_rnaseq" "CAGE_epi"
[7] "ensembl_id" "external_gene_name" "region"
[10] "chroms"
[1] "data_subset" "ensembl_id" "external_gene_name"
[4] "region" "chroms" "variable"
[7] "value"
corr_colors <- c("glmnet" = "#F8766D", "nn" = "#00BA38", "flu_vs_epi" = "#619CFF", "flu_vs_NI" = "#F564E3")
subset_colors <- c("train" = "orange", "valid" = "yellow", "test" = "black")
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/all_records_scatterplots.png"), width =900, height=1200)
(ggplot(all_records_full_df) + geom_point(aes(x=glmnet_pred, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Elastic net pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["glmnet"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=nn_pred, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Neural net pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["nn"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=CAGE_epi, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Enformer CAGE pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["flu_vs_epi"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=NI_rnaseq, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("NI RNAseq")+theme(panel.background = element_rect(fill = alpha(corr_colors["flu_vs_NI"],alpha=0.4)))+scale_color_manual(values=subset_colors))
dev.off()
svg
2
---
title: "log_transform_training_tss_predictors"
author: "Saideep Gona"
date: "2023-10-25"
format:
html:
code-fold: true
code-summary: "Show the code"
execute:
freeze: true
warning: false
---
# Context
After evaluating the performance of my trained models, it seems clear that the spearman correlations are very stable genome-wide, but the pearson correlations are not. This is likely due to the fact that the pearson correlations are sensitive to outliers, of which I have observed many. In particular, chromosome 14 has 2 extreme outliers, and in general there are many outliers in the tails of the distributions. I think the model will perform better on log-normalized data, so I will try that next.
NN training: 8800 min thresh: 0.311676707121647 8817 error: 273.14126 time: 48.73 mins
## Create log-transformed training data
```{r}
library(data.table)
library(glue)
library(tidyverse)
```
```{r}
# path_to_data = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered"
# for (data_subset in c("train", "valid", "test")) {
# inputs_path <- glue("{path_to_data}/{data_subset}_inputs_mean_aracena.csv")
# inputs <- fread(inputs_path)
# log_inputs <- log(inputs+1)
# log_inputs_path <- glue("{path_to_data}/{data_subset}_inputs_mean_aracena_log.csv")
# write_delim(log_inputs, log_inputs_path, delim = "\t")
# targets_path <- glue("{path_to_data}/{data_subset}_targets_mean_aracena.csv")
# targets <- fread(targets_path)
# log_targets <- log(targets+1)
# log_targets_path <- glue("{path_to_data}/{data_subset}_targets_mean_aracena_log.csv")
# write_delim(log_targets, log_targets_path, delim = "\t")
# }
```
## Evaluate model performance on log-transformed data
```{r}
library(httpgd)
library(glmnet)
library(glue)
library(data.table)
library(neuralnet)
library(dplyr)
library(ggplot2)
library(patchwork)
library(reshape2)
library(stringr)
```
```{r}
convert_region_to_gene <- function(region, gene_metadata){
tss_metadata <- data.table::fread("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")
tss_metadata$joined_region <- paste(tss_metadata$chromosome_name, tss_metadata$transcription_start_site, tss_metadata$transcription_start_site+2, sep="_")
}
```
```{r}
split_datasets_chroms <- function(data_subset, mapping_table) {
data_subset_mapping_table <- mapping_table[,mapping_table["group",]==data_subset]
regions <- colnames(data_subset_mapping_table)
chroms <- strsplit(regions, split = "_")
chroms <- sapply(chroms, function(x) x[1])
chroms <- sapply(chroms, function(x) substr(x,2,str_length(x)))
return(chroms)
}
```
```{r}
plot_boxplot <- function(df, path) {
df$is_CAGE <- ifelse(df$variable == "CAGE_epi", "CAGE", "RNAseq")
png(path)
g <- ggplot(df, aes(x=variable, y=value)) + geom_boxplot()
print(g+facet_wrap(~is_CAGE, scales="free"))
dev.off()
}
```
```{r}
run_cor <- function(x,y, method="pearson", log_bool=FALSE) {
if (log_bool) {
return(cor(log(x+1),y, method=method))
} else {
return(cor(x,y, method=method))
}
}
```
```{r}
per_chrom_metrics <- function(data_subset, mapping_table, X_path, y_path, glmnet_model_path, nnmodel_path, cor_method="pearson", primary_ind=1) {
print(glue("********* Analyzing data for data subset: {data_subset}"))
corrs_v <- c()
corrs_v_spear <- c()
data_subset_v <- c()
chrom_v <- c()
num_genes_v <- c()
cor_id_v <- c()
X <- as.matrix(t(data.table::fread(X_path)))
y <- as.data.frame(t(data.table::fread(y_path)))
y_for_cv <- y[,primary_ind]
data_frame_for_training <- data.frame(cbind(X, y_for_cv))
glmnet_model <- readRDS(glmnet_model_path)
nnmodel <- readRDS(nnmodel_path)
glmnet_predictions <- predict(glmnet_model, X, s = "lambda.min", type="response")
predictions_nn <- predict(nnmodel, data_frame_for_training)
# Record in storage glmnet corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(glmnet_predictions, y[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(glmnet_predictions, y[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "glmnet")
# Record in storage nn corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(predictions_nn, y[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(predictions_nn, y[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "nn")
# Record in storage corr between flu and NI
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(y[,1], y[,7], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(y[,1], y[,7], method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "flu_vs_NI")
# Record in storage corr between flu and epi
cur_cor <- run_cor(y[,primary_ind], X[,4766], log=TRUE, method=cor_method)
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, cur_cor)
corrs_v_spear <- c(corrs_v_spear, run_cor(y[,primary_ind], X[,4766], log=TRUE, method="spearman"))
chrom_v <- c(chrom_v, "all")
num_genes_v <- c(num_genes_v, dim(X)[1])
cor_id_v <- c(cor_id_v, "flu_vs_epi")
# Done recording
chroms_all <- split_datasets_chroms(data_subset, mapping_table)
chroms <- as.vector(chroms_all)
uniq_chroms <- unique(chroms)
for (chrom in uniq_chroms) {
index_of_chrom <- chroms == chrom
X_chrom <- X[index_of_chrom,]
y_chrom <- y[index_of_chrom,]
print(glue("********* Analyzing data for chrom: {chrom}"))
print(dim(X_chrom))
print(dim(y_chrom))
print(cor(y_chrom[,primary_ind], y_chrom[,7]))
print(cor(y_chrom[,primary_ind], X_chrom[,4766]))
print(cor(y_chrom[,7], X_chrom[,4766]))
print(cor(glmnet_predictions[index_of_chrom], y_chrom[,primary_ind]))
print(cor(predictions_nn[index_of_chrom], y_chrom[,primary_ind]))
chrom_df <- data.frame(glmnet_pred=glmnet_predictions[index_of_chrom], flu_rnaseq = y_chrom[,1], NI_rnaseq = y_chrom[,7], CAGE_epi =X_chrom[,4766])
chrom_df_for_plotting <- melt(chrom_df, measure_vars = colnames(chrom_df))
plot_boxplot(chrom_df_for_plotting, glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/{data_subset}_chrom_{chrom}_boxplot.png"))
# Record in storage glmnet corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(glmnet_predictions[index_of_chrom], y_chrom[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(glmnet_predictions[index_of_chrom], y_chrom[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "glmnet")
# Record in storage nn corr
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(predictions_nn[index_of_chrom], y_chrom[,primary_ind], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(predictions_nn[index_of_chrom], y_chrom[,primary_ind], method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "nn")
# Record in storage corr between flu and NI
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(y_chrom[,1], y_chrom[,7], method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(y_chrom[,1], y_chrom[,7], method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "flu_vs_NI")
# Record in storage corr between flu and epi
data_subset_v <- c(data_subset_v, data_subset)
corrs_v <- c(corrs_v, run_cor(y_chrom[,primary_ind], X_chrom[,4766], log=TRUE, method=cor_method))
corrs_v_spear <- c(corrs_v_spear, run_cor(y_chrom[,primary_ind], X_chrom[,4766], log=TRUE, method="spearman"))
chrom_v <- c(chrom_v, chrom)
num_genes_v <- c(num_genes_v, dim(X_chrom)[1])
cor_id_v <- c(cor_id_v, "flu_vs_epi")
}
storage_df <- data.frame(corrs_v, corrs_v_spear,data_subset_v, chrom_v, num_genes_v, cor_id_v)
return(storage_df)
}
```
```{r}
split_datasets <- function(data_subset, mapping_table) {
tss_metadata <- data.table::fread("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt")
tss_metadata$joined_region <- paste(tss_metadata$chromosome_name, tss_metadata$transcription_start_site, tss_metadata$transcription_start_site+2, sep="_")
data_subset_mapping_table <- mapping_table[,mapping_table["group",]==data_subset]
regions <- colnames(data_subset_mapping_table)
chroms <- strsplit(regions, split = "_")
chroms <- sapply(chroms, function(x) x[1])
chroms <- sapply(chroms, function(x) substr(x,2,str_length(x)))
ensembl_ids <- rep(NA, length(regions))
external_gene_names <- rep(NA, length(regions))
for (x in 1:length(regions)) {
region <- substr(regions[x],2,str_length(regions[x]))
ensembl_id <- tss_metadata[tss_metadata$joined_region == region,]$ensembl_gene_id
external_gene_name <- tss_metadata[tss_metadata$joined_region == region,]$external_gene_name
ensembl_ids[x] <- ensembl_id
external_gene_names[x] <- external_gene_name
}
print(length(ensembl_ids))
print(length(external_gene_names))
print(length(chroms))
print(length(regions))
return_list <- list()
return_list[["chroms"]] <- chroms
return_list[["regions"]] <- regions
return_list[["ensembl_ids"]] <- ensembl_ids
return_list[["external_gene_names"]] <- external_gene_names
return(return_list)
}
```
```{r}
collect_all_records <- function(data_subset, mapping_table, X_path, y_path, glmnet_model_path, nnmodel_path) {
print(glue("********* Collecting data records for data subset: {data_subset}"))
X <- as.matrix(t(data.table::fread(X_path)))
y <- as.data.frame(t(data.table::fread(y_path)))
y_for_cv <- y[,1]
data_frame_for_training <- data.frame(cbind(X, y_for_cv))
glmnet_model <- readRDS(glmnet_model_path)
nnmodel <- readRDS(nnmodel_path)
glmnet_predictions <- as.numeric(predict(glmnet_model, X, s = "lambda.min", type="response"))
predictions_nn <- predict(nnmodel, data_frame_for_training)
gene_metadata <- split_datasets(data_subset, mapping_table)
return_df <- data.frame(data_subset= data_subset, glmnet_pred=glmnet_predictions, nn_pred= predictions_nn, flu_rnaseq = y[,1], NI_rnaseq = y[,7], CAGE_epi =X[,4766], ensembl_id = gene_metadata[["ensembl_ids"]], external_gene_name = gene_metadata[["external_gene_names"]], region = gene_metadata[["regions"]], chroms = gene_metadata[["chroms"]])
return(return_df)
}
```
```{r}
remapped_table_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/remapped_table_filt.csv"
remapped_table <- read.table(remapped_table_path, header = TRUE, sep = ",")
print(dim(remapped_table))
print(as.character(remapped_table[,1]))
rownames(remapped_table) <- as.vector(remapped_table[,1])
remapped_table[,1] <- NULL
```
```{r}
primary_ind <- 1
glmnet_model_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_eln_RNAseq_NI_from_HDF5_mean_log.linear.rds"
nnmodel_path <- "/beagle3/haky/users/saideep/projects/aracena_modeling/elastic_net/trained_mlp_RNAseq_NI_from_HDF5_mean_log.rds"
cor_method = "pearson"
df_store <- list()
all_records_df_store <- list()
c=1
for (data_subset in c("train","test","valid")){
X_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_inputs_mean_aracena.csv")
y_path <- glue("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered/{data_subset}_targets_mean_aracena_log.csv")
cur_out_df_perchrom_metrics <- per_chrom_metrics(data_subset, remapped_table, X_path, y_path, glmnet_model_path, nnmodel_path, cor_method, primary_ind)
df_store[[c]] <- cur_out_df_perchrom_metrics
all_records_df_store[[c]] <- collect_all_records(data_subset, remapped_table, X_path, y_path, glmnet_model_path, nnmodel_path)
c=c+1
}
full_df <- do.call(rbind, df_store)
full_df$data_subset_v <- factor(full_df$data_subset_v, levels = c("train", "valid", "test"))
full_df$cor_id_v <- factor(full_df$cor_id_v, levels = c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI"))
full_df$chrom_v <- factor(full_df$chrom_v, levels = c("all", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"))
```
Set color palette
```{r}
corr_colors <- c("glmnet" = "#F8766D", "nn" = "#00BA38", "flu_vs_epi" = "#619CFF", "flu_vs_NI" = "#F564E3")
subset_colors <- c("train" = "orange", "valid" = "yellow", "test" = "black")
```
```{r}
full_df$num_genes_v <- as.character(full_df$num_genes_v)
full_df_melt <- melt(full_df, measure_vars=c("corrs_v","corrs_v_spear"), id_vars=c("data_subset_v", "chrom_v", "cor_id_v", "num_genes_v"))
full_df_melt$corr_type <- ifelse(full_df_melt$variable == "corrs_v", "pearson", "spearman")
full_df_melt_all_chroms <- full_df_melt[full_df_melt$chrom_v == "all",]
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_all_chroms_both.png"), width=800, height=400)
ggplot(full_df_melt_all_chroms) + geom_bar(aes(x=data_subset_v, y=value, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors) + facet_wrap(~corr_type, nrow=1) + xlab("Data subset") + ylab("Correlation")
dev.off()
```
```{r}
library(ggplot2)
full_df_all_chroms <- full_df[full_df$chrom_v == "all",]
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_all_chroms.png"))
ggplot(full_df_all_chroms) + geom_bar(aes(x=data_subset_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
dev.off()
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom.png"), width =1000, height=500)
g <- ggplot(full_df) + geom_bar(aes(x=chrom_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
g+facet_wrap(~data_subset_v, ncol=1)
dev.off()
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_nonfaceted.png"), width =1000, height=500)
ggplot(full_df) + geom_bar(aes(x=chrom_v, y=corrs_v, fill=cor_id_v), stat="identity", position="dodge") + scale_fill_manual(values = corr_colors)
dev.off()
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_numgenes.png"), width =1000, height=500)
ggplot(full_df) + geom_bar(aes(x=chrom_v, y=num_genes_v), stat="identity", position="dodge")
dev.off()
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_numgenes.png"), width =1000, height=500)
ggplot(full_df[full_df$chrom_v != "all" & full_df$cor_id_v == "glmnet",]) + geom_bar(aes(x=data_subset_v, fill=chrom_v, y=num_genes_v), stat="identity")
dev.off()
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_numgenes_long.png"), width =1000, height=500)
ggplot(full_df[full_df$chrom_v != "all" & full_df$cor_id_v == "glmnet",]) + geom_bar(aes(x=chrom_v, fill=data_subset_v, y=num_genes_v), stat="identity") + scale_fill_manual(values = subset_colors)
dev.off()
plot_list <- list()
for (id in c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI")) {
g<-ggplot(full_df[full_df$cor_id_v == id,]) + geom_bar(aes(x=chrom_v, y=corrs_v), stat="identity", position="dodge") + ylim(c(0,1))
plot_list[[id]] <- g+facet_wrap(~data_subset_v, ncol=1)
}
for (id in c("glmnet", "nn", "flu_vs_epi", "flu_vs_NI")) {
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/corrs_each_chrom_{id}.png"), width =1000, height=500)
print(plot_list[[id]])
dev.off()
}
```
```{r}
all_records_full_df <- do.call(rbind, all_records_df_store)
all_records_full_df$data_subset <- factor(all_records_full_df$data_subset, levels = c("train", "valid", "test"))
all_records_full_df$chroms<- factor(all_records_full_df$chroms, levels = c("all", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"))
colnames(all_records_full_df)
```
```{r}
all_records_full_df_melt <- melt(all_records_full_df, measure_vars = c("glmnet_pred", "nn_pred", "flu_rnaseq", "NI_rnaseq", "CAGE_epi"))
colnames(all_records_full_df_melt)
```
```{r}
corr_colors <- c("glmnet" = "#F8766D", "nn" = "#00BA38", "flu_vs_epi" = "#619CFF", "flu_vs_NI" = "#F564E3")
subset_colors <- c("train" = "orange", "valid" = "yellow", "test" = "black")
png(glue("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-25-log_transform_training_tss_predictors/figures/all_records_scatterplots.png"), width =900, height=1200)
(ggplot(all_records_full_df) + geom_point(aes(x=glmnet_pred, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Elastic net pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["glmnet"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=nn_pred, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Neural net pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["nn"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=CAGE_epi, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("Enformer CAGE pred")+theme(panel.background = element_rect(fill = alpha(corr_colors["flu_vs_epi"],0.4)))+scale_color_manual(values=subset_colors)) /
(ggplot(all_records_full_df) + geom_point(aes(x=NI_rnaseq, y=flu_rnaseq, color=data_subset)) + facet_wrap(~data_subset, nrow=1, scales="free")+ylab("flu RNAseq")+xlab("NI RNAseq")+theme(panel.background = element_rect(fill = alpha(corr_colors["flu_vs_NI"],alpha=0.4)))+scale_color_manual(values=subset_colors))
dev.off()
```