Author

Saideep Gona

Published

July 14, 2023

Code
suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"

## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG="testing_pop_seq" ## copy the slug from the header
bDATE='2023-07-14' ## copy the date from the blog's header here
DATA = glue("{PRE}/{bDATE}-{SLUG}")
if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))
WORK=DATA

Context

I have verified that the pop-seq training datasets are intact (they got out of order, but the training process is intact). Now I can test if the pop-seq training does have an impact on personalized prediction.

Create input datasets for testing

Testing the personalized prediction doesn’t require too many training examples, so I will start with testing 40 individuals for 100 genes. I’ll begin by assembling the input files for this inference pass.

Individuals

I will use 20 EUR and 20 AFR individuals to maintain some haplotypic diversity

Code
library(tidyverse)

dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
# meta_table <- read_table(meta_path)


meta_path_rwp = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/relationships_w_pops.txt"
meta_table <- read_table(meta_path_rwp)
meta_table <- meta_table %>% filter(IID %in% colnames(geuvadis_full))

CEU <- meta_table %>% filter(population != "YRI") 
CEU_20 <- CEU$IID[1:20]
YRI <- meta_table %>% filter(population == "YRI") 
YRI_20 <- YRI$IID[1:20]

all_inds <- c(CEU_20,YRI_20)

write.table(all_inds, "40_inds.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)

Genes

Code
gene_meta_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/gene_metadata_full.csv"


gene_meta_path <- "/Users/saideepgona/Documents/PhD/imlab/Github_Repos/Daily-Blog-Sai/posts/2023-06-03-calculating_genetic_distance_from_ref/gene_metadata_w_allele_counts.csv"
gene_metadata <- read_delim(gene_meta_path )


pos_corr <- gene_metadata %>% filter(all_geuvadis_ge_corrs > 0) %>% arrange(desc(all_geuvadis_ge_corrs))
pos_corr_50 <- pos_corr[1:50,]$gene_name
pos_corr_50_intervals <- paste0(pos_corr$chrom,"_",pos_corr$start,"_",pos_corr$end)


neg_corr <- gene_metadata %>% filter(all_geuvadis_ge_corrs < 0) %>% arrange(all_geuvadis_ge_corrs)
neg_corr_50 <- neg_corr[1:50,]$gene_name
neg_corr_50_intervals <- paste0(neg_corr$chrom,"_",neg_corr$start,"_",neg_corr$end)

all_genes <- c(pos_corr_50,neg_corr_50)

write.table(all_genes, "100_corr_genes.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)

all_genes_intervals <- c(pos_corr_50_intervals,neg_corr_50_intervals)

write.table(all_genes_intervals, "100_corr_intervals.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)


canonical_tss_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/canonical_TSS_full_metadata.txt"

canonical_tss_metadata <- read_delim(canonical_tss_path)
canonical_tss_metadata <- canonical_tss_metadata %>% distinct(external_gene_name, .keep_all=TRUE) %>% drop_na()
rownames(canonical_tss_metadata) <- canonical_tss_metadata$external_gene_name

all_canonical_tss <- canonical_tss_metadata[all_genes,]$target_regions

write.table(all_canonical_tss, "100_corr_canonical_tss_intervals.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)

Checking correctness of pipeline

Using the ERAP2 window on a single individual,

Calculate correlations with ground truth from standard trained model

The standard trained model was trained to 80 epochs using reference genome sequence and 240 gpus (batch size 480)

Code
library(tidyr)
library(ggplot2)
library(ggrepel)
library(ggpmisc)
library(readr)
library(stringr)
library(ggExtra)
library(preprocessCore)
library(reshape2)
library(patchwork)
Code
dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols
Code
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)
Code
gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]
Code
corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
Code
ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_standard <- corrs_df
enformer_compare_standard <- enformer_compare

Calculate correlations with ground truth from popseq trained model

Code
dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols
Code
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)
Code
gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]
Code
corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
Code
ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_popseq <- corrs_df
enformer_compare_popseq <- enformer_compare

Compare the two training predictions

Code
plot(corrs_df_standard$ge,corrs_df_popseq$ge) + abline(0,1)

qqplot(as.numeric(corrs_df_standard$ge),as.numeric(corrs_df_popseq$ge))
abline(0,1)

plot(as.numeric(geuvadis_compare["ENSG00000129055",]), as.numeric(enformer_compare_standard["ENSG00000129055",]))

plot(as.numeric(geuvadis_compare["ENSG00000100321",]), as.numeric(enformer_compare_standard["ENSG00000100321",]))

plot(as.numeric(geuvadis_compare["ENSG00000164978",]), as.numeric(enformer_compare_standard["ENSG00000164978",]))

boxplot(corrs_df_standard$ge, corrs_df_popseq$ge)

Test 8 nodes and 20 epochs

Code
dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_standard_8.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols
Code
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)
Code
gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]
Code
corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
Code
ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_standard <- corrs_df
enformer_compare_standard <- enformer_compare

Calculate correlations with ground truth from popseq trained model

Code
dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq_8.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols
Code
metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)
Code
gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]
Code
corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))
Code
ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_popseq <- corrs_df
enformer_compare_popseq <- enformer_compare

Compare the two training predictions

Code
plot(corrs_df_standard$ge,corrs_df_popseq$ge)
abline(0,1)

qqplot(as.numeric(corrs_df_standard$ge),as.numeric(corrs_df_popseq$ge))
abline(0,1)

plot(as.numeric(geuvadis_compare["ENSG00000129055",]), as.numeric(enformer_compare_standard["ENSG00000129055",]))

plot(as.numeric(geuvadis_compare["ENSG00000100321",]), as.numeric(enformer_compare_standard["ENSG00000100321",]))

plot(as.numeric(geuvadis_compare["ENSG00000164978",]), as.numeric(enformer_compare_standard["ENSG00000164978",]))

boxplot(corrs_df_standard$ge, corrs_df_popseq$ge)