1.4.5_assess_linearization

Author

Saideep Gona

Published

November 22, 2023

Code
library(dplyr)
library(glue)
library(tidyverse)
Code
path_to_predictdb_summaries <- "/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/predixcan_RNAseq_Flu_from_HDF5_mean_log_revised/models/summary"

s_files <- list.files(path_to_predictdb_summaries)

s_dfs <- list()
for (s_file in s_files) {
    s_dfs[[s_file]] <- read_delim(glue("{path_to_predictdb_summaries}/{s_file}"), delim = "\t")
}

full_df <- bind_rows(s_dfs)
Code
hist(full_df$rho_avg, breaks = 100)

Code
charles_linearize_sum_path <- "/beagle3/haky/users/charles/project/singleXcanDL/PredicDB/Corrected_T2D_Geuvadis/work_dir/results/database/Model_summary.txt"

charles_df <- read_delim(charles_linearize_sum_path, delim = "\t")

hist(charles_df$rho_avg, breaks = 100)

Code
full_df_reorder <- full_df %>% 
    arrange(gene_id)

charles_df_reorder <- charles_df %>% 
    arrange(gene)
Code
full_df_reorder
# A tibble: 18,714 x 24
   gene_id        gene_name gene_type     alpha n_snps_in_window n_snps_in_model
   <chr>          <chr>     <chr>         <dbl>            <dbl>           <dbl>
 1 ENSG000000004… DPM1      protein_codi…   0.5             2915              53
 2 ENSG000000004… SCYL3     protein_codi…   0.5             2760             227
 3 ENSG000000004… C1orf112  protein_codi…   0.5             2731             267
 4 ENSG000000009… FGR       protein_codi…   0.5             1139             100
 5 ENSG000000009… CFH       protein_codi…   0.5             1598             172
 6 ENSG000000010… FUCA2     protein_codi…   0.5             2255             290
 7 ENSG000000010… GCLC      protein_codi…   0.5             2861             261
 8 ENSG000000011… NFYA      protein_codi…   0.5             3285             174
 9 ENSG000000014… STPG1     protein_codi…   0.5             2324             156
10 ENSG000000014… NIPAL3    protein_codi…   0.5             2293             106
# … with 18,704 more rows, and 18 more variables: lambda_min_mse <dbl>,
#   test_R2_avg <dbl>, test_R2_sd <dbl>, cv_R2_avg <dbl>, cv_R2_sd <dbl>,
#   in_sample_R2 <dbl>, nested_cv_fisher_pval <dbl>, rho_avg <dbl>,
#   rho_se <dbl>, rho_zscore <dbl>, rho_avg_squared <dbl>, zscore_pval <dbl>,
#   cv_rho_avg <dbl>, cv_rho_se <dbl>, cv_rho_avg_squared <dbl>,
#   cv_zscore_est <dbl>, cv_zscore_pval <dbl>, cv_pval_est <dbl>
Code
charles_df_reorder
# A tibble: 18,714 x 24
   gene           gene_name gene_type     alpha n_snps_in_window n_snps_in_model
   <chr>          <chr>     <chr>         <dbl>            <dbl>           <dbl>
 1 ENSG000000004… DPM1      protein_codi…   0.5             2915              96
 2 ENSG000000004… SCYL3     protein_codi…   0.5             2760             292
 3 ENSG000000004… C1orf112  protein_codi…   0.5             2731             159
 4 ENSG000000009… FGR       protein_codi…   0.5             1139             131
 5 ENSG000000009… CFH       protein_codi…   0.5             1598              52
 6 ENSG000000010… FUCA2     protein_codi…   0.5             2255             155
 7 ENSG000000010… GCLC      protein_codi…   0.5             2860             139
 8 ENSG000000011… NFYA      protein_codi…   0.5             3285             199
 9 ENSG000000014… STPG1     protein_codi…   0.5             2324             141
10 ENSG000000014… NIPAL3    protein_codi…   0.5             2293             264
# … with 18,704 more rows, and 18 more variables: lambda_min_mse <dbl>,
#   test_R2_avg <dbl>, test_R2_sd <dbl>, cv_R2_avg <dbl>, cv_R2_sd <dbl>,
#   in_sample_R2 <dbl>, nested_cv_fisher_pval <dbl>, rho_avg <dbl>,
#   rho_se <dbl>, rho_zscore <dbl>, rho_avg_squared <dbl>, zscore_pval <dbl>,
#   cv_rho_avg <dbl>, cv_rho_se <dbl>, cv_rho_avg_squared <dbl>,
#   cv_zscore_est <dbl>, cv_zscore_pval <dbl>, cv_pval_est <dbl>
Code
plot(full_df_reorder$rho_avg, charles_df_reorder$rho_avg)

Code
rho_df <- data.frame(sai=full_df_reorder$rho_avg, charles=charles_df_reorder$rho_avg)
Code
library(ggplot2)

ggplot(rho_df) + geom_density_2d_filled(aes(x=sai, y=charles))