Code
library(dplyr)
library(glue)
library(tidyverse)
Saideep Gona
November 22, 2023
path_to_predictdb_summaries <- "/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/predixcan_RNAseq_Flu_from_HDF5_mean_log_revised/models/summary"
s_files <- list.files(path_to_predictdb_summaries)
s_dfs <- list()
for (s_file in s_files) {
s_dfs[[s_file]] <- read_delim(glue("{path_to_predictdb_summaries}/{s_file}"), delim = "\t")
}
full_df <- bind_rows(s_dfs)
# A tibble: 18,714 x 24
gene_id gene_name gene_type alpha n_snps_in_window n_snps_in_model
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 ENSG000000004… DPM1 protein_codi… 0.5 2915 53
2 ENSG000000004… SCYL3 protein_codi… 0.5 2760 227
3 ENSG000000004… C1orf112 protein_codi… 0.5 2731 267
4 ENSG000000009… FGR protein_codi… 0.5 1139 100
5 ENSG000000009… CFH protein_codi… 0.5 1598 172
6 ENSG000000010… FUCA2 protein_codi… 0.5 2255 290
7 ENSG000000010… GCLC protein_codi… 0.5 2861 261
8 ENSG000000011… NFYA protein_codi… 0.5 3285 174
9 ENSG000000014… STPG1 protein_codi… 0.5 2324 156
10 ENSG000000014… NIPAL3 protein_codi… 0.5 2293 106
# … with 18,704 more rows, and 18 more variables: lambda_min_mse <dbl>,
# test_R2_avg <dbl>, test_R2_sd <dbl>, cv_R2_avg <dbl>, cv_R2_sd <dbl>,
# in_sample_R2 <dbl>, nested_cv_fisher_pval <dbl>, rho_avg <dbl>,
# rho_se <dbl>, rho_zscore <dbl>, rho_avg_squared <dbl>, zscore_pval <dbl>,
# cv_rho_avg <dbl>, cv_rho_se <dbl>, cv_rho_avg_squared <dbl>,
# cv_zscore_est <dbl>, cv_zscore_pval <dbl>, cv_pval_est <dbl>
# A tibble: 18,714 x 24
gene gene_name gene_type alpha n_snps_in_window n_snps_in_model
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 ENSG000000004… DPM1 protein_codi… 0.5 2915 96
2 ENSG000000004… SCYL3 protein_codi… 0.5 2760 292
3 ENSG000000004… C1orf112 protein_codi… 0.5 2731 159
4 ENSG000000009… FGR protein_codi… 0.5 1139 131
5 ENSG000000009… CFH protein_codi… 0.5 1598 52
6 ENSG000000010… FUCA2 protein_codi… 0.5 2255 155
7 ENSG000000010… GCLC protein_codi… 0.5 2860 139
8 ENSG000000011… NFYA protein_codi… 0.5 3285 199
9 ENSG000000014… STPG1 protein_codi… 0.5 2324 141
10 ENSG000000014… NIPAL3 protein_codi… 0.5 2293 264
# … with 18,704 more rows, and 18 more variables: lambda_min_mse <dbl>,
# test_R2_avg <dbl>, test_R2_sd <dbl>, cv_R2_avg <dbl>, cv_R2_sd <dbl>,
# in_sample_R2 <dbl>, nested_cv_fisher_pval <dbl>, rho_avg <dbl>,
# rho_se <dbl>, rho_zscore <dbl>, rho_avg_squared <dbl>, zscore_pval <dbl>,
# cv_rho_avg <dbl>, cv_rho_se <dbl>, cv_rho_avg_squared <dbl>,
# cv_zscore_est <dbl>, cv_zscore_pval <dbl>, cv_pval_est <dbl>
---
title: "1.4.5_assess_linearization"
author: "Saideep Gona"
date: "2023-11-22"
format:
html:
code-fold: true
code-summary: "Show the code"
execute:
freeze: true
warning: false
---
```{r}
library(dplyr)
library(glue)
library(tidyverse)
```
### Flu linearization
```{r}
path_to_predictdb_summaries <- "/beagle3/haky/users/saideep/projects/aracena_modeling/linearization/predixcan_RNAseq_Flu_from_HDF5_mean_log_revised/models/summary"
s_files <- list.files(path_to_predictdb_summaries)
s_dfs <- list()
for (s_file in s_files) {
s_dfs[[s_file]] <- read_delim(glue("{path_to_predictdb_summaries}/{s_file}"), delim = "\t")
}
full_df_flu <- bind_rows(s_dfs)
```
```{r}
png("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/linearization_plots/flu_linearization.png")
ggplot(full_df_flu) + geom_histogram(aes(x=rho_avg), bins = 100)+ theme_minimal(base_size=15)
# hist(full_df_flu$rho_avg, breaks = 100)
dev.off()
ggplot(full_df_flu) + geom_histogram(aes(x=rho_avg), bins = 100)+ theme_minimal(base_size=15)
```
### NI linearization
```{r}
charles_linearize_sum_path <- "/beagle3/haky/users/charles/project/singleXcanDL/PredicDB/Corrected_T2D_Geuvadis/work_dir/results/database/Model_summary.txt"
charles_df <- read_delim(charles_linearize_sum_path, delim = "\t")
hist(charles_df$rho_avg, breaks = 100)
```
```{r}
hist(full_df_flu$rho_avg, breaks = 100)
```
```{r}
full_df_reorder <- full_df %>%
arrange(gene_id)
charles_df_reorder <- charles_df %>%
arrange(gene)
```
```{r}
full_df_reorder
charles_df_reorder
plot(full_df_reorder$rho_avg, charles_df_reorder$rho_avg)
rho_df <- data.frame(sai=full_df_reorder$rho_avg, charles=charles_df_reorder$rho_avg)
```
```{r}
library(ggplot2)
ggplot(rho_df) + geom_density_2d_filled(aes(x=sai, y=charles))
```
## Evaluate accuracy of personalized predictions
```{r}
```