---
title: "testing_pop_seq"
author: "Saideep Gona"
date: "2023-07-14"
format:
html:
code-fold: true
code-summary: "Show the code"
execute:
freeze: true
warning: false
eval: false
---
```{r}
#| label: Set up box storage directory
suppressMessages (library (tidyverse))
suppressMessages (library (glue))
PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"
## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG= "testing_pop_seq" ## copy the slug from the header
bDATE= '2023-07-14' ## copy the date from the blog's header here
DATA = glue ("{PRE}/{bDATE}-{SLUG}" )
if (! file.exists (DATA)) system (glue:: glue ("mkdir {DATA}" ))
WORK= DATA
```
# Context
I have verified that the pop-seq training datasets are intact (they got out of order, but the training process is intact). Now I can test if the pop-seq training does have an impact on personalized prediction.
## Create input datasets for testing
Testing the personalized prediction doesn't require too many training examples, so I will start with testing 40 individuals for 100 genes. I'll begin by assembling the input files for this inference pass.
### Individuals
I will use 20 EUR and 20 AFR individuals to maintain some haplotypic diversity
```{r}
library (tidyverse)
dset_dir <- file.path ("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis" )
# Load GEUVADIS
geuvadis_full <- read_delim (file.path (dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt" ))
# meta_table <- read_table(meta_path)
meta_path_rwp = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/relationships_w_pops.txt"
meta_table <- read_table (meta_path_rwp)
meta_table <- meta_table %>% filter (IID %in% colnames (geuvadis_full))
CEU <- meta_table %>% filter (population != "YRI" )
CEU_20 <- CEU$ IID[1 : 20 ]
YRI <- meta_table %>% filter (population == "YRI" )
YRI_20 <- YRI$ IID[1 : 20 ]
all_inds <- c (CEU_20,YRI_20)
write.table (all_inds, "40_inds.txt" , row.names = FALSE , quote= FALSE , col.names = FALSE )
```
### Genes
```{r}
gene_meta_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/gene_metadata_full.csv"
gene_meta_path <- "/Users/saideepgona/Documents/PhD/imlab/Github_Repos/Daily-Blog-Sai/posts/2023-06-03-calculating_genetic_distance_from_ref/gene_metadata_w_allele_counts.csv"
gene_metadata <- read_delim (gene_meta_path )
pos_corr <- gene_metadata %>% filter (all_geuvadis_ge_corrs > 0 ) %>% arrange (desc (all_geuvadis_ge_corrs))
pos_corr_50 <- pos_corr[1 : 50 ,]$ gene_name
pos_corr_50_intervals <- paste0 (pos_corr$ chrom,"_" ,pos_corr$ start,"_" ,pos_corr$ end)
neg_corr <- gene_metadata %>% filter (all_geuvadis_ge_corrs < 0 ) %>% arrange (all_geuvadis_ge_corrs)
neg_corr_50 <- neg_corr[1 : 50 ,]$ gene_name
neg_corr_50_intervals <- paste0 (neg_corr$ chrom,"_" ,neg_corr$ start,"_" ,neg_corr$ end)
all_genes <- c (pos_corr_50,neg_corr_50)
write.table (all_genes, "100_corr_genes.txt" , row.names = FALSE , quote= FALSE , col.names = FALSE )
all_genes_intervals <- c (pos_corr_50_intervals,neg_corr_50_intervals)
write.table (all_genes_intervals, "100_corr_intervals.txt" , row.names = FALSE , quote= FALSE , col.names = FALSE )
canonical_tss_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/canonical_TSS_full_metadata.txt"
canonical_tss_metadata <- read_delim (canonical_tss_path)
canonical_tss_metadata <- canonical_tss_metadata %>% distinct (external_gene_name, .keep_all= TRUE ) %>% drop_na ()
rownames (canonical_tss_metadata) <- canonical_tss_metadata$ external_gene_name
all_canonical_tss <- canonical_tss_metadata[all_genes,]$ target_regions
write.table (all_canonical_tss, "100_corr_canonical_tss_intervals.txt" , row.names = FALSE , quote= FALSE , col.names = FALSE )
```
## Checking correctness of pipeline
Using the ERAP2 window on a single individual,
### Calculate correlations with ground truth from standard trained model
The standard trained model was trained to 80 epochs using reference genome sequence and 240 gpus (batch size 480)
```{r}
library (tidyr)
library (ggplot2)
library (ggrepel)
library (ggpmisc)
library (readr)
library (stringr)
library (ggExtra)
library (preprocessCore)
library (reshape2)
library (patchwork)
```
```{r}
dset_dir <- file.path ("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis" )
# Load GEUVADIS
geuvadis_full <- read_delim (file.path (dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt" ))
geuvadis_genes_symbols <- sapply (str_split (geuvadis_full$ Gene_Symbol, " \\ ." ),` [ ` ,1 )
geuvadis <- as.data.frame (geuvadis_full[,5 : ncol (geuvadis_full)])
rownames (geuvadis) <- geuvadis_genes_symbols
# Load predixcan expression
predixcan_full <- read_delim (file.path (dset_dir,"predixcan_expression.txt" ))
predixcan <- as.data.frame (predixcan_full[,3 : ncol (predixcan_full)])
rownames (predixcan) <- predixcan_full$ gene_names_proper
# Load enformer
enformer_haplo1 <- as.data.frame (read_csv (file.path ("CAGE_summed_haplotype_CAGE:LCL_quantification.csv" )))
rownames (enformer_haplo1) <- enformer_haplo1$ ` Unnamed: 0 `
enformer_haplo1$ ` Unnamed: 0 ` <- NULL
enformer_haplo1$ ...1 <- NULL
enf_genes_symbols <- sapply (str_split (rownames (enformer_haplo1), " \\ ." ),` [ ` ,1 )
enformer <- enformer_haplo1
enformer_norm <- as.data.frame (t (scale (t (enformer))))
enformer <- enformer_norm
rownames (enformer) <- enf_genes_symbols
```
```{r}
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"
individual_metadata <- read.delim (file.path (metadata_dir,"igsr_samples.tsv" ))
rownames (individual_metadata) <- individual_metadata$ Sample.name
gene_metadata <- read.csv2 (file.path (metadata_dir,"gene_metadata_full.csv" ),row.names = 1 )
```
```{r}
gp_ind <- intersect (colnames (geuvadis), colnames (predixcan))
ge_ind <- intersect (colnames (geuvadis), colnames (enformer))
all_three_ind <- intersect (gp_ind,ge_ind)
gp_genes <- intersect (rownames (geuvadis), rownames (predixcan))
ge_genes <- intersect (rownames (geuvadis), rownames (enformer))
ep_genes <- intersect (rownames (enformer), rownames (predixcan))
all_three_genes <- intersect (gp_genes, ge_genes)
# USING NON-IMPUTED VALUES FOR PREDIXCAN
#
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]
predixcan_compare <- predixcan[all_three_genes,all_three_ind]
enformer_compare <- enformer[all_three_genes, all_three_ind]
gene_meta_compare <- gene_metadata[all_three_genes,]
individual_metadata_compare <- individual_metadata[rownames (individual_metadata) %in% all_three_ind,]
```
```{r}
corr_gene_across_inds <- function (df_1,df_2) {
df_1 <- scale (df_1)
df_2 <- scale (df_2)
corrs <- sapply (seq.int (dim (df_1)[1 ]), function (i) cor (as.numeric (df_1[i,]), as.numeric (df_2[i,])))
names (corrs) <- rownames (df_1)
return (corrs)
}
gp_corrs <- corr_gene_across_inds (geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds (geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs
cor (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
#
corrs_df <- data.frame (gp= gp_corrs, ge= ge_corrs, ge_abs = abs (ge_corrs), gp_abs = abs (gp_corrs), genes= all_three_genes, gene_names= rownames (gene_meta_compare), gene_h2= gene_meta_compare$ h2_WholeBlood_TS, high_h2 = gene_meta_compare$ h2_WholeBlood_TS > 0.1 , strand = gene_meta_compare$ strand, ref_gene_ancestry= gene_meta_compare$ best_ancestry, pred_vars = as.numeric (apply (predixcan_compare, 1 , var)), enf_vars = as.numeric (apply (enformer_compare, 1 , var)))
#
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
#
#
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
```
```{r}
ggplot (corrs_df,aes (x= gp,y= ge)) + geom_point (aes (x= gp,y= ge), alpha= 0.5 ) + xlim (- 0.7 ,0.7 ) + ylim (- 0.7 ,0.7 ) +
# geom_hline(xintercept=0) + geom_vline(yintercept=0)
xlab ("geuvadis vs. predixcan correlation across individuals" ) + ylab ("geuvadis vs. enformer correlation across individuals" ) +
geom_hline (aes (yintercept = 0 )) +
geom_vline (aes (xintercept = 0 )) +
geom_abline (slope= 1 , intercept = 0 ) +
geom_smooth (method = "lm" , formula = y~ x, se = F, color= "red" )
boxplot (corrs_df$ ge)
corrs_df_standard <- corrs_df
enformer_compare_standard <- enformer_compare
```
### Calculate correlations with ground truth from popseq trained model
```{r}
dset_dir <- file.path ("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis" )
# Load GEUVADIS
geuvadis_full <- read_delim (file.path (dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt" ))
geuvadis_genes_symbols <- sapply (str_split (geuvadis_full$ Gene_Symbol, " \\ ." ),` [ ` ,1 )
geuvadis <- as.data.frame (geuvadis_full[,5 : ncol (geuvadis_full)])
rownames (geuvadis) <- geuvadis_genes_symbols
# Load predixcan expression
predixcan_full <- read_delim (file.path (dset_dir,"predixcan_expression.txt" ))
predixcan <- as.data.frame (predixcan_full[,3 : ncol (predixcan_full)])
rownames (predixcan) <- predixcan_full$ gene_names_proper
# Load enformer
enformer_haplo1 <- as.data.frame (read_csv (file.path ("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq.csv" )))
rownames (enformer_haplo1) <- enformer_haplo1$ ` Unnamed: 0 `
enformer_haplo1$ ` Unnamed: 0 ` <- NULL
enformer_haplo1$ ...1 <- NULL
enf_genes_symbols <- sapply (str_split (rownames (enformer_haplo1), " \\ ." ),` [ ` ,1 )
enformer <- enformer_haplo1
enformer_norm <- as.data.frame (t (scale (t (enformer))))
enformer <- enformer_norm
rownames (enformer) <- enf_genes_symbols
```
```{r}
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"
individual_metadata <- read.delim (file.path (metadata_dir,"igsr_samples.tsv" ))
rownames (individual_metadata) <- individual_metadata$ Sample.name
gene_metadata <- read.csv2 (file.path (metadata_dir,"gene_metadata_full.csv" ),row.names = 1 )
```
```{r}
gp_ind <- intersect (colnames (geuvadis), colnames (predixcan))
ge_ind <- intersect (colnames (geuvadis), colnames (enformer))
all_three_ind <- intersect (gp_ind,ge_ind)
gp_genes <- intersect (rownames (geuvadis), rownames (predixcan))
ge_genes <- intersect (rownames (geuvadis), rownames (enformer))
ep_genes <- intersect (rownames (enformer), rownames (predixcan))
all_three_genes <- intersect (gp_genes, ge_genes)
# USING NON-IMPUTED VALUES FOR PREDIXCAN
#
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]
predixcan_compare <- predixcan[all_three_genes,all_three_ind]
enformer_compare <- enformer[all_three_genes, all_three_ind]
gene_meta_compare <- gene_metadata[all_three_genes,]
individual_metadata_compare <- individual_metadata[rownames (individual_metadata) %in% all_three_ind,]
```
```{r}
corr_gene_across_inds <- function (df_1,df_2) {
df_1 <- scale (df_1)
df_2 <- scale (df_2)
corrs <- sapply (seq.int (dim (df_1)[1 ]), function (i) cor (as.numeric (df_1[i,]), as.numeric (df_2[i,])))
names (corrs) <- rownames (df_1)
return (corrs)
}
gp_corrs <- corr_gene_across_inds (geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds (geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs
cor (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
#
corrs_df <- data.frame (gp= gp_corrs, ge= ge_corrs, ge_abs = abs (ge_corrs), gp_abs = abs (gp_corrs), genes= all_three_genes, gene_names= rownames (gene_meta_compare), gene_h2= gene_meta_compare$ h2_WholeBlood_TS, high_h2 = gene_meta_compare$ h2_WholeBlood_TS > 0.1 , strand = gene_meta_compare$ strand, ref_gene_ancestry= gene_meta_compare$ best_ancestry, pred_vars = as.numeric (apply (predixcan_compare, 1 , var)), enf_vars = as.numeric (apply (enformer_compare, 1 , var)))
#
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
#
#
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
```
```{r}
ggplot (corrs_df,aes (x= gp,y= ge)) + geom_point (aes (x= gp,y= ge), alpha= 0.5 ) + xlim (- 0.7 ,0.7 ) + ylim (- 0.7 ,0.7 ) +
# geom_hline(xintercept=0) + geom_vline(yintercept=0)
xlab ("geuvadis vs. predixcan correlation across individuals" ) + ylab ("geuvadis vs. enformer correlation across individuals" ) +
geom_hline (aes (yintercept = 0 )) +
geom_vline (aes (xintercept = 0 )) +
geom_abline (slope= 1 , intercept = 0 ) +
geom_smooth (method = "lm" , formula = y~ x, se = F, color= "red" )
boxplot (corrs_df$ ge)
corrs_df_popseq <- corrs_df
enformer_compare_popseq <- enformer_compare
```
### Compare the two training predictions
```{r}
plot (corrs_df_standard$ ge,corrs_df_popseq$ ge) + abline (0 ,1 )
qqplot (as.numeric (corrs_df_standard$ ge),as.numeric (corrs_df_popseq$ ge))
abline (0 ,1 )
plot (as.numeric (geuvadis_compare["ENSG00000129055" ,]), as.numeric (enformer_compare_standard["ENSG00000129055" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000100321" ,]), as.numeric (enformer_compare_standard["ENSG00000100321" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000164978" ,]), as.numeric (enformer_compare_standard["ENSG00000164978" ,]))
boxplot (corrs_df_standard$ ge, corrs_df_popseq$ ge)
```
## Test 8 nodes and 20 epochs
```{r}
dset_dir <- file.path ("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis" )
# Load GEUVADIS
geuvadis_full <- read_delim (file.path (dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt" ))
geuvadis_genes_symbols <- sapply (str_split (geuvadis_full$ Gene_Symbol, " \\ ." ),` [ ` ,1 )
geuvadis <- as.data.frame (geuvadis_full[,5 : ncol (geuvadis_full)])
rownames (geuvadis) <- geuvadis_genes_symbols
# Load predixcan expression
predixcan_full <- read_delim (file.path (dset_dir,"predixcan_expression.txt" ))
predixcan <- as.data.frame (predixcan_full[,3 : ncol (predixcan_full)])
rownames (predixcan) <- predixcan_full$ gene_names_proper
# Load enformer
enformer_haplo1 <- as.data.frame (read_csv (file.path ("CAGE_summed_haplotype_CAGE:LCL_quantification_standard_8.csv" )))
rownames (enformer_haplo1) <- enformer_haplo1$ ` Unnamed: 0 `
enformer_haplo1$ ` Unnamed: 0 ` <- NULL
enformer_haplo1$ ...1 <- NULL
enf_genes_symbols <- sapply (str_split (rownames (enformer_haplo1), " \\ ." ),` [ ` ,1 )
enformer <- enformer_haplo1
enformer_norm <- as.data.frame (t (scale (t (enformer))))
enformer <- enformer_norm
rownames (enformer) <- enf_genes_symbols
```
```{r}
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"
individual_metadata <- read.delim (file.path (metadata_dir,"igsr_samples.tsv" ))
rownames (individual_metadata) <- individual_metadata$ Sample.name
gene_metadata <- read.csv2 (file.path (metadata_dir,"gene_metadata_full.csv" ),row.names = 1 )
```
```{r}
gp_ind <- intersect (colnames (geuvadis), colnames (predixcan))
ge_ind <- intersect (colnames (geuvadis), colnames (enformer))
all_three_ind <- intersect (gp_ind,ge_ind)
gp_genes <- intersect (rownames (geuvadis), rownames (predixcan))
ge_genes <- intersect (rownames (geuvadis), rownames (enformer))
ep_genes <- intersect (rownames (enformer), rownames (predixcan))
all_three_genes <- intersect (gp_genes, ge_genes)
# USING NON-IMPUTED VALUES FOR PREDIXCAN
#
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]
predixcan_compare <- predixcan[all_three_genes,all_three_ind]
enformer_compare <- enformer[all_three_genes, all_three_ind]
gene_meta_compare <- gene_metadata[all_three_genes,]
individual_metadata_compare <- individual_metadata[rownames (individual_metadata) %in% all_three_ind,]
```
```{r}
corr_gene_across_inds <- function (df_1,df_2) {
df_1 <- scale (df_1)
df_2 <- scale (df_2)
corrs <- sapply (seq.int (dim (df_1)[1 ]), function (i) cor (as.numeric (df_1[i,]), as.numeric (df_2[i,])))
names (corrs) <- rownames (df_1)
return (corrs)
}
gp_corrs <- corr_gene_across_inds (geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds (geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs
cor (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
#
corrs_df <- data.frame (gp= gp_corrs, ge= ge_corrs, ge_abs = abs (ge_corrs), gp_abs = abs (gp_corrs), genes= all_three_genes, gene_names= rownames (gene_meta_compare), gene_h2= gene_meta_compare$ h2_WholeBlood_TS, high_h2 = gene_meta_compare$ h2_WholeBlood_TS > 0.1 , strand = gene_meta_compare$ strand, ref_gene_ancestry= gene_meta_compare$ best_ancestry, pred_vars = as.numeric (apply (predixcan_compare, 1 , var)), enf_vars = as.numeric (apply (enformer_compare, 1 , var)))
#
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
#
#
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
```
```{r}
ggplot (corrs_df,aes (x= gp,y= ge)) + geom_point (aes (x= gp,y= ge), alpha= 0.5 ) + xlim (- 0.7 ,0.7 ) + ylim (- 0.7 ,0.7 ) +
# geom_hline(xintercept=0) + geom_vline(yintercept=0)
xlab ("geuvadis vs. predixcan correlation across individuals" ) + ylab ("geuvadis vs. enformer correlation across individuals" ) +
geom_hline (aes (yintercept = 0 )) +
geom_vline (aes (xintercept = 0 )) +
geom_abline (slope= 1 , intercept = 0 ) +
geom_smooth (method = "lm" , formula = y~ x, se = F, color= "red" )
boxplot (corrs_df$ ge)
corrs_df_standard <- corrs_df
enformer_compare_standard <- enformer_compare
```
### Calculate correlations with ground truth from popseq trained model
```{r}
dset_dir <- file.path ("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis" )
# Load GEUVADIS
geuvadis_full <- read_delim (file.path (dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt" ))
geuvadis_genes_symbols <- sapply (str_split (geuvadis_full$ Gene_Symbol, " \\ ." ),` [ ` ,1 )
geuvadis <- as.data.frame (geuvadis_full[,5 : ncol (geuvadis_full)])
rownames (geuvadis) <- geuvadis_genes_symbols
# Load predixcan expression
predixcan_full <- read_delim (file.path (dset_dir,"predixcan_expression.txt" ))
predixcan <- as.data.frame (predixcan_full[,3 : ncol (predixcan_full)])
rownames (predixcan) <- predixcan_full$ gene_names_proper
# Load enformer
enformer_haplo1 <- as.data.frame (read_csv (file.path ("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq_8.csv" )))
rownames (enformer_haplo1) <- enformer_haplo1$ ` Unnamed: 0 `
enformer_haplo1$ ` Unnamed: 0 ` <- NULL
enformer_haplo1$ ...1 <- NULL
enf_genes_symbols <- sapply (str_split (rownames (enformer_haplo1), " \\ ." ),` [ ` ,1 )
enformer <- enformer_haplo1
enformer_norm <- as.data.frame (t (scale (t (enformer))))
enformer <- enformer_norm
rownames (enformer) <- enf_genes_symbols
```
```{r}
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"
individual_metadata <- read.delim (file.path (metadata_dir,"igsr_samples.tsv" ))
rownames (individual_metadata) <- individual_metadata$ Sample.name
gene_metadata <- read.csv2 (file.path (metadata_dir,"gene_metadata_full.csv" ),row.names = 1 )
```
```{r}
gp_ind <- intersect (colnames (geuvadis), colnames (predixcan))
ge_ind <- intersect (colnames (geuvadis), colnames (enformer))
all_three_ind <- intersect (gp_ind,ge_ind)
gp_genes <- intersect (rownames (geuvadis), rownames (predixcan))
ge_genes <- intersect (rownames (geuvadis), rownames (enformer))
ep_genes <- intersect (rownames (enformer), rownames (predixcan))
all_three_genes <- intersect (gp_genes, ge_genes)
# USING NON-IMPUTED VALUES FOR PREDIXCAN
#
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]
predixcan_compare <- predixcan[all_three_genes,all_three_ind]
enformer_compare <- enformer[all_three_genes, all_three_ind]
gene_meta_compare <- gene_metadata[all_three_genes,]
individual_metadata_compare <- individual_metadata[rownames (individual_metadata) %in% all_three_ind,]
```
```{r}
corr_gene_across_inds <- function (df_1,df_2) {
df_1 <- scale (df_1)
df_2 <- scale (df_2)
corrs <- sapply (seq.int (dim (df_1)[1 ]), function (i) cor (as.numeric (df_1[i,]), as.numeric (df_2[i,])))
names (corrs) <- rownames (df_1)
return (corrs)
}
gp_corrs <- corr_gene_across_inds (geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds (geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs
cor (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000164308" ,]), as.numeric (enformer_compare["ENSG00000164308" ,]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
#
corrs_df <- data.frame (gp= gp_corrs, ge= ge_corrs, ge_abs = abs (ge_corrs), gp_abs = abs (gp_corrs), genes= all_three_genes, gene_names= rownames (gene_meta_compare), gene_h2= gene_meta_compare$ h2_WholeBlood_TS, high_h2 = gene_meta_compare$ h2_WholeBlood_TS > 0.1 , strand = gene_meta_compare$ strand, ref_gene_ancestry= gene_meta_compare$ best_ancestry, pred_vars = as.numeric (apply (predixcan_compare, 1 , var)), enf_vars = as.numeric (apply (enformer_compare, 1 , var)))
#
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
#
#
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
```
```{r}
ggplot (corrs_df,aes (x= gp,y= ge)) + geom_point (aes (x= gp,y= ge), alpha= 0.5 ) + xlim (- 0.7 ,0.7 ) + ylim (- 0.7 ,0.7 ) +
# geom_hline(xintercept=0) + geom_vline(yintercept=0)
xlab ("geuvadis vs. predixcan correlation across individuals" ) + ylab ("geuvadis vs. enformer correlation across individuals" ) +
geom_hline (aes (yintercept = 0 )) +
geom_vline (aes (xintercept = 0 )) +
geom_abline (slope= 1 , intercept = 0 ) +
geom_smooth (method = "lm" , formula = y~ x, se = F, color= "red" )
boxplot (corrs_df$ ge)
corrs_df_popseq <- corrs_df
enformer_compare_popseq <- enformer_compare
```
### Compare the two training predictions
```{r}
plot (corrs_df_standard$ ge,corrs_df_popseq$ ge)
abline (0 ,1 )
qqplot (as.numeric (corrs_df_standard$ ge),as.numeric (corrs_df_popseq$ ge))
abline (0 ,1 )
plot (as.numeric (geuvadis_compare["ENSG00000129055" ,]), as.numeric (enformer_compare_standard["ENSG00000129055" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000100321" ,]), as.numeric (enformer_compare_standard["ENSG00000100321" ,]))
plot (as.numeric (geuvadis_compare["ENSG00000164978" ,]), as.numeric (enformer_compare_standard["ENSG00000164978" ,]))
boxplot (corrs_df_standard$ ge, corrs_df_popseq$ ge)
```