testing_pop_seq

Author

Saideep Gona

Published

July 14, 2023

Code

suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"

## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG="testing_pop_seq" ## copy the slug from the header
bDATE='2023-07-14' ## copy the date from the blog's header here
DATA = glue("{PRE}/{bDATE}-{SLUG}")
if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))
WORK=DATA

Context

I have verified that the pop-seq training datasets are intact (they got out of order, but the training process is intact). Now I can test if the pop-seq training does have an impact on personalized prediction.

Create input datasets for testing

Testing the personalized prediction doesn’t require too many training examples, so I will start with testing 40 individuals for 100 genes. I’ll begin by assembling the input files for this inference pass.

Individuals

I will use 20 EUR and 20 AFR individuals to maintain some haplotypic diversity

Code

library(tidyverse)

dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
# meta_table <- read_table(meta_path)


meta_path_rwp = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/relationships_w_pops.txt"
meta_table <- read_table(meta_path_rwp)
meta_table <- meta_table %>% filter(IID %in% colnames(geuvadis_full))

CEU <- meta_table %>% filter(population != "YRI") 
CEU_20 <- CEU$IID[1:20]
YRI <- meta_table %>% filter(population == "YRI") 
YRI_20 <- YRI$IID[1:20]

all_inds <- c(CEU_20,YRI_20)

write.table(all_inds, "40_inds.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)

Genes

Code

gene_meta_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/gene_metadata_full.csv"


gene_meta_path <- "/Users/saideepgona/Documents/PhD/imlab/Github_Repos/Daily-Blog-Sai/posts/2023-06-03-calculating_genetic_distance_from_ref/gene_metadata_w_allele_counts.csv"
gene_metadata <- read_delim(gene_meta_path )


pos_corr <- gene_metadata %>% filter(all_geuvadis_ge_corrs > 0) %>% arrange(desc(all_geuvadis_ge_corrs))
pos_corr_50 <- pos_corr[1:50,]$gene_name
pos_corr_50_intervals <- paste0(pos_corr$chrom,"_",pos_corr$start,"_",pos_corr$end)


neg_corr <- gene_metadata %>% filter(all_geuvadis_ge_corrs < 0) %>% arrange(all_geuvadis_ge_corrs)
neg_corr_50 <- neg_corr[1:50,]$gene_name
neg_corr_50_intervals <- paste0(neg_corr$chrom,"_",neg_corr$start,"_",neg_corr$end)

all_genes <- c(pos_corr_50,neg_corr_50)

write.table(all_genes, "100_corr_genes.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)

all_genes_intervals <- c(pos_corr_50_intervals,neg_corr_50_intervals)

write.table(all_genes_intervals, "100_corr_intervals.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)


canonical_tss_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/canonical_TSS_full_metadata.txt"

canonical_tss_metadata <- read_delim(canonical_tss_path)
canonical_tss_metadata <- canonical_tss_metadata %>% distinct(external_gene_name, .keep_all=TRUE) %>% drop_na()
rownames(canonical_tss_metadata) <- canonical_tss_metadata$external_gene_name

all_canonical_tss <- canonical_tss_metadata[all_genes,]$target_regions

write.table(all_canonical_tss, "100_corr_canonical_tss_intervals.txt", row.names = FALSE, quote=FALSE, col.names = FALSE)

Checking correctness of pipeline

Using the ERAP2 window on a single individual,

Calculate correlations with ground truth from standard trained model

The standard trained model was trained to 80 epochs using reference genome sequence and 240 gpus (batch size 480)

Code

library(tidyr)
library(ggplot2)
library(ggrepel)
library(ggpmisc)
library(readr)
library(stringr)
library(ggExtra)
library(preprocessCore)
library(reshape2)
library(patchwork)

Code

dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols

Code

metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)

Code

gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]

Code

corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))

Code

ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_standard <- corrs_df
enformer_compare_standard <- enformer_compare

Calculate correlations with ground truth from popseq trained model

Code

dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols

Code

metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)

Code

gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]

Code

corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))

Code

ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_popseq <- corrs_df
enformer_compare_popseq <- enformer_compare

Compare the two training predictions

Code

plot(corrs_df_standard$ge,corrs_df_popseq$ge) + abline(0,1)

qqplot(as.numeric(corrs_df_standard$ge),as.numeric(corrs_df_popseq$ge))
abline(0,1)

plot(as.numeric(geuvadis_compare["ENSG00000129055",]), as.numeric(enformer_compare_standard["ENSG00000129055",]))

plot(as.numeric(geuvadis_compare["ENSG00000100321",]), as.numeric(enformer_compare_standard["ENSG00000100321",]))

plot(as.numeric(geuvadis_compare["ENSG00000164978",]), as.numeric(enformer_compare_standard["ENSG00000164978",]))

boxplot(corrs_df_standard$ge, corrs_df_popseq$ge)

Test 8 nodes and 20 epochs

Code

dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_standard_8.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols

Code

metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)

Code

gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]

Code

corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))

Code

ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_standard <- corrs_df
enformer_compare_standard <- enformer_compare

Calculate correlations with ground truth from popseq trained model

Code

dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load predixcan expression

predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt"))
predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)])
rownames(predixcan) <- predixcan_full$gene_names_proper

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq_8.csv")))

rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0`
enformer_haplo1$`Unnamed: 0` <- NULL
enformer_haplo1$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- enformer_haplo1

enformer_norm <- as.data.frame(t(scale(t(enformer))))
enformer <- enformer_norm

rownames(enformer) <- enf_genes_symbols

Code

metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv"))
rownames(individual_metadata) <- individual_metadata$Sample.name

gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1)

Code

gp_ind <- intersect(colnames(geuvadis), colnames(predixcan))

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

all_three_ind <- intersect(gp_ind,ge_ind)

gp_genes <- intersect(rownames(geuvadis), rownames(predixcan))

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

ep_genes <- intersect(rownames(enformer), rownames(predixcan))

all_three_genes <- intersect(gp_genes, ge_genes)


# USING NON-IMPUTED VALUES FOR PREDIXCAN
# 
geuvadis_compare <- geuvadis[all_three_genes, all_three_ind]

predixcan_compare <- predixcan[all_three_genes,all_three_ind]

enformer_compare <- enformer[all_three_genes, all_three_ind]

gene_meta_compare <- gene_metadata[all_three_genes,]

individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,]

Code

corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}

gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare)
gp_corrs_all <- gp_corrs
ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)
ge_corrs_all <- ge_corrs

cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",]))
# boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097)
# boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,]))
# 
corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var)))
#   
# corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001)
# 
# 
# summary(lm(pred_vars ~ ge_corrs, corrs_df))

Code

ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) +
  # geom_hline(xintercept=0) + geom_vline(yintercept=0)
   xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") +
geom_hline(aes(yintercept = 0)) +
geom_vline(aes(xintercept = 0)) + 
geom_abline(slope=1, intercept = 0) + 
geom_smooth(method = "lm", formula = y~x, se = F, color="red")

boxplot(corrs_df$ge)

corrs_df_popseq <- corrs_df
enformer_compare_popseq <- enformer_compare

Compare the two training predictions

Code

plot(corrs_df_standard$ge,corrs_df_popseq$ge)
abline(0,1)

qqplot(as.numeric(corrs_df_standard$ge),as.numeric(corrs_df_popseq$ge))
abline(0,1)

plot(as.numeric(geuvadis_compare["ENSG00000129055",]), as.numeric(enformer_compare_standard["ENSG00000129055",]))

plot(as.numeric(geuvadis_compare["ENSG00000100321",]), as.numeric(enformer_compare_standard["ENSG00000100321",]))

plot(as.numeric(geuvadis_compare["ENSG00000164978",]), as.numeric(enformer_compare_standard["ENSG00000164978",]))

boxplot(corrs_df_standard$ge, corrs_df_popseq$ge)

--- title: "testing_pop_seq" author: "Saideep Gona" date: "2023-07-14" format: html: code-fold: true code-summary: "Show the code" execute: freeze: true warning: false eval: false --- ```{r} #| label: Set up box storage directory suppressMessages(library(tidyverse)) suppressMessages(library(glue)) PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai" ## COPY THE DATE AND SLUG fields FROM THE HEADER SLUG="testing_pop_seq" ## copy the slug from the header bDATE='2023-07-14' ## copy the date from the blog's header here DATA = glue("{PRE}/{bDATE}-{SLUG}") if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}")) WORK=DATA ``` # Context I have verified that the pop-seq training datasets are intact (they got out of order, but the training process is intact). Now I can test if the pop-seq training does have an impact on personalized prediction. ## Create input datasets for testing Testing the personalized prediction doesn't require too many training examples, so I will start with testing 40 individuals for 100 genes. I'll begin by assembling the input files for this inference pass. ### Individuals I will use 20 EUR and 20 AFR individuals to maintain some haplotypic diversity ```{r} library(tidyverse) dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis") # Load GEUVADIS geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt")) # meta_table <- read_table(meta_path) meta_path_rwp = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/relationships_w_pops.txt" meta_table <- read_table(meta_path_rwp) meta_table <- meta_table %>% filter(IID %in% colnames(geuvadis_full)) CEU <- meta_table %>% filter(population != "YRI") CEU_20 <- CEU$IID[1:20] YRI <- meta_table %>% filter(population == "YRI") YRI_20 <- YRI$IID[1:20] all_inds <- c(CEU_20,YRI_20) write.table(all_inds, "40_inds.txt", row.names = FALSE, quote=FALSE, col.names = FALSE) ``` ### Genes ```{r} gene_meta_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/gene_metadata_full.csv" gene_meta_path <- "/Users/saideepgona/Documents/PhD/imlab/Github_Repos/Daily-Blog-Sai/posts/2023-06-03-calculating_genetic_distance_from_ref/gene_metadata_w_allele_counts.csv" gene_metadata <- read_delim(gene_meta_path ) pos_corr <- gene_metadata %>% filter(all_geuvadis_ge_corrs > 0) %>% arrange(desc(all_geuvadis_ge_corrs)) pos_corr_50 <- pos_corr[1:50,]$gene_name pos_corr_50_intervals <- paste0(pos_corr$chrom,"_",pos_corr$start,"_",pos_corr$end) neg_corr <- gene_metadata %>% filter(all_geuvadis_ge_corrs < 0) %>% arrange(all_geuvadis_ge_corrs) neg_corr_50 <- neg_corr[1:50,]$gene_name neg_corr_50_intervals <- paste0(neg_corr$chrom,"_",neg_corr$start,"_",neg_corr$end) all_genes <- c(pos_corr_50,neg_corr_50) write.table(all_genes, "100_corr_genes.txt", row.names = FALSE, quote=FALSE, col.names = FALSE) all_genes_intervals <- c(pos_corr_50_intervals,neg_corr_50_intervals) write.table(all_genes_intervals, "100_corr_intervals.txt", row.names = FALSE, quote=FALSE, col.names = FALSE) canonical_tss_path <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/canonical_TSS_full_metadata.txt" canonical_tss_metadata <- read_delim(canonical_tss_path) canonical_tss_metadata <- canonical_tss_metadata %>% distinct(external_gene_name, .keep_all=TRUE) %>% drop_na() rownames(canonical_tss_metadata) <- canonical_tss_metadata$external_gene_name all_canonical_tss <- canonical_tss_metadata[all_genes,]$target_regions write.table(all_canonical_tss, "100_corr_canonical_tss_intervals.txt", row.names = FALSE, quote=FALSE, col.names = FALSE) ``` ## Checking correctness of pipeline Using the ERAP2 window on a single individual, ### Calculate correlations with ground truth from standard trained model The standard trained model was trained to 80 epochs using reference genome sequence and 240 gpus (batch size 480) ```{r} library(tidyr) library(ggplot2) library(ggrepel) library(ggpmisc) library(readr) library(stringr) library(ggExtra) library(preprocessCore) library(reshape2) library(patchwork) ``` ```{r} dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis") # Load GEUVADIS geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt")) geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1) geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)]) rownames(geuvadis) <- geuvadis_genes_symbols # Load predixcan expression predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt")) predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)]) rownames(predixcan) <- predixcan_full$gene_names_proper # Load enformer enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification.csv"))) rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0` enformer_haplo1$`Unnamed: 0` <- NULL enformer_haplo1$...1 <- NULL enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1) enformer <- enformer_haplo1 enformer_norm <- as.data.frame(t(scale(t(enformer)))) enformer <- enformer_norm rownames(enformer) <- enf_genes_symbols ``` ```{r} metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata" individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv")) rownames(individual_metadata) <- individual_metadata$Sample.name gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1) ``` ```{r} gp_ind <- intersect(colnames(geuvadis), colnames(predixcan)) ge_ind <- intersect(colnames(geuvadis), colnames(enformer)) all_three_ind <- intersect(gp_ind,ge_ind) gp_genes <- intersect(rownames(geuvadis), rownames(predixcan)) ge_genes <- intersect(rownames(geuvadis), rownames(enformer)) ep_genes <- intersect(rownames(enformer), rownames(predixcan)) all_three_genes <- intersect(gp_genes, ge_genes) # USING NON-IMPUTED VALUES FOR PREDIXCAN # geuvadis_compare <- geuvadis[all_three_genes, all_three_ind] predixcan_compare <- predixcan[all_three_genes,all_three_ind] enformer_compare <- enformer[all_three_genes, all_three_ind] gene_meta_compare <- gene_metadata[all_three_genes,] individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,] ``` ```{r} corr_gene_across_inds <- function(df_1,df_2) { df_1 <- scale(df_1) df_2 <- scale(df_2) corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,]))) names(corrs) <- rownames(df_1) return(corrs) } gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare) gp_corrs_all <- gp_corrs ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare) ge_corrs_all <- ge_corrs cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) # boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097) # boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,])) # corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var))) # # corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001) # # # summary(lm(pred_vars ~ ge_corrs, corrs_df)) ``` ```{r} ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) + # geom_hline(xintercept=0) + geom_vline(yintercept=0) xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") + geom_hline(aes(yintercept = 0)) + geom_vline(aes(xintercept = 0)) + geom_abline(slope=1, intercept = 0) + geom_smooth(method = "lm", formula = y~x, se = F, color="red") boxplot(corrs_df$ge) corrs_df_standard <- corrs_df enformer_compare_standard <- enformer_compare ``` ### Calculate correlations with ground truth from popseq trained model ```{r} dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis") # Load GEUVADIS geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt")) geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1) geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)]) rownames(geuvadis) <- geuvadis_genes_symbols # Load predixcan expression predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt")) predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)]) rownames(predixcan) <- predixcan_full$gene_names_proper # Load enformer enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq.csv"))) rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0` enformer_haplo1$`Unnamed: 0` <- NULL enformer_haplo1$...1 <- NULL enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1) enformer <- enformer_haplo1 enformer_norm <- as.data.frame(t(scale(t(enformer)))) enformer <- enformer_norm rownames(enformer) <- enf_genes_symbols ``` ```{r} metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata" individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv")) rownames(individual_metadata) <- individual_metadata$Sample.name gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1) ``` ```{r} gp_ind <- intersect(colnames(geuvadis), colnames(predixcan)) ge_ind <- intersect(colnames(geuvadis), colnames(enformer)) all_three_ind <- intersect(gp_ind,ge_ind) gp_genes <- intersect(rownames(geuvadis), rownames(predixcan)) ge_genes <- intersect(rownames(geuvadis), rownames(enformer)) ep_genes <- intersect(rownames(enformer), rownames(predixcan)) all_three_genes <- intersect(gp_genes, ge_genes) # USING NON-IMPUTED VALUES FOR PREDIXCAN # geuvadis_compare <- geuvadis[all_three_genes, all_three_ind] predixcan_compare <- predixcan[all_three_genes,all_three_ind] enformer_compare <- enformer[all_three_genes, all_three_ind] gene_meta_compare <- gene_metadata[all_three_genes,] individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,] ``` ```{r} corr_gene_across_inds <- function(df_1,df_2) { df_1 <- scale(df_1) df_2 <- scale(df_2) corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,]))) names(corrs) <- rownames(df_1) return(corrs) } gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare) gp_corrs_all <- gp_corrs ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare) ge_corrs_all <- ge_corrs cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) # boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097) # boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,])) # corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var))) # # corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001) # # # summary(lm(pred_vars ~ ge_corrs, corrs_df)) ``` ```{r} ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) + # geom_hline(xintercept=0) + geom_vline(yintercept=0) xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") + geom_hline(aes(yintercept = 0)) + geom_vline(aes(xintercept = 0)) + geom_abline(slope=1, intercept = 0) + geom_smooth(method = "lm", formula = y~x, se = F, color="red") boxplot(corrs_df$ge) corrs_df_popseq <- corrs_df enformer_compare_popseq <- enformer_compare ``` ### Compare the two training predictions ```{r} plot(corrs_df_standard$ge,corrs_df_popseq$ge) + abline(0,1) qqplot(as.numeric(corrs_df_standard$ge),as.numeric(corrs_df_popseq$ge)) abline(0,1) plot(as.numeric(geuvadis_compare["ENSG00000129055",]), as.numeric(enformer_compare_standard["ENSG00000129055",])) plot(as.numeric(geuvadis_compare["ENSG00000100321",]), as.numeric(enformer_compare_standard["ENSG00000100321",])) plot(as.numeric(geuvadis_compare["ENSG00000164978",]), as.numeric(enformer_compare_standard["ENSG00000164978",])) boxplot(corrs_df_standard$ge, corrs_df_popseq$ge) ``` ## Test 8 nodes and 20 epochs ```{r} dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis") # Load GEUVADIS geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt")) geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1) geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)]) rownames(geuvadis) <- geuvadis_genes_symbols # Load predixcan expression predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt")) predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)]) rownames(predixcan) <- predixcan_full$gene_names_proper # Load enformer enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_standard_8.csv"))) rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0` enformer_haplo1$`Unnamed: 0` <- NULL enformer_haplo1$...1 <- NULL enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1) enformer <- enformer_haplo1 enformer_norm <- as.data.frame(t(scale(t(enformer)))) enformer <- enformer_norm rownames(enformer) <- enf_genes_symbols ``` ```{r} metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata" individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv")) rownames(individual_metadata) <- individual_metadata$Sample.name gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1) ``` ```{r} gp_ind <- intersect(colnames(geuvadis), colnames(predixcan)) ge_ind <- intersect(colnames(geuvadis), colnames(enformer)) all_three_ind <- intersect(gp_ind,ge_ind) gp_genes <- intersect(rownames(geuvadis), rownames(predixcan)) ge_genes <- intersect(rownames(geuvadis), rownames(enformer)) ep_genes <- intersect(rownames(enformer), rownames(predixcan)) all_three_genes <- intersect(gp_genes, ge_genes) # USING NON-IMPUTED VALUES FOR PREDIXCAN # geuvadis_compare <- geuvadis[all_three_genes, all_three_ind] predixcan_compare <- predixcan[all_three_genes,all_three_ind] enformer_compare <- enformer[all_three_genes, all_three_ind] gene_meta_compare <- gene_metadata[all_three_genes,] individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,] ``` ```{r} corr_gene_across_inds <- function(df_1,df_2) { df_1 <- scale(df_1) df_2 <- scale(df_2) corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,]))) names(corrs) <- rownames(df_1) return(corrs) } gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare) gp_corrs_all <- gp_corrs ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare) ge_corrs_all <- ge_corrs cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) # boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097) # boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,])) # corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var))) # # corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001) # # # summary(lm(pred_vars ~ ge_corrs, corrs_df)) ``` ```{r} ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) + # geom_hline(xintercept=0) + geom_vline(yintercept=0) xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") + geom_hline(aes(yintercept = 0)) + geom_vline(aes(xintercept = 0)) + geom_abline(slope=1, intercept = 0) + geom_smooth(method = "lm", formula = y~x, se = F, color="red") boxplot(corrs_df$ge) corrs_df_standard <- corrs_df enformer_compare_standard <- enformer_compare ``` ### Calculate correlations with ground truth from popseq trained model ```{r} dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis") # Load GEUVADIS geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt")) geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1) geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)]) rownames(geuvadis) <- geuvadis_genes_symbols # Load predixcan expression predixcan_full <- read_delim(file.path(dset_dir,"predixcan_expression.txt")) predixcan <- as.data.frame(predixcan_full[,3:ncol(predixcan_full)]) rownames(predixcan) <- predixcan_full$gene_names_proper # Load enformer enformer_haplo1 <- as.data.frame(read_csv(file.path("CAGE_summed_haplotype_CAGE:LCL_quantification_popseq_8.csv"))) rownames(enformer_haplo1) <- enformer_haplo1$`Unnamed: 0` enformer_haplo1$`Unnamed: 0` <- NULL enformer_haplo1$...1 <- NULL enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1) enformer <- enformer_haplo1 enformer_norm <- as.data.frame(t(scale(t(enformer)))) enformer <- enformer_norm rownames(enformer) <- enf_genes_symbols ``` ```{r} metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata" individual_metadata <- read.delim(file.path(metadata_dir,"igsr_samples.tsv")) rownames(individual_metadata) <- individual_metadata$Sample.name gene_metadata <- read.csv2(file.path(metadata_dir,"gene_metadata_full.csv"),row.names = 1) ``` ```{r} gp_ind <- intersect(colnames(geuvadis), colnames(predixcan)) ge_ind <- intersect(colnames(geuvadis), colnames(enformer)) all_three_ind <- intersect(gp_ind,ge_ind) gp_genes <- intersect(rownames(geuvadis), rownames(predixcan)) ge_genes <- intersect(rownames(geuvadis), rownames(enformer)) ep_genes <- intersect(rownames(enformer), rownames(predixcan)) all_three_genes <- intersect(gp_genes, ge_genes) # USING NON-IMPUTED VALUES FOR PREDIXCAN # geuvadis_compare <- geuvadis[all_three_genes, all_three_ind] predixcan_compare <- predixcan[all_three_genes,all_three_ind] enformer_compare <- enformer[all_three_genes, all_three_ind] gene_meta_compare <- gene_metadata[all_three_genes,] individual_metadata_compare <- individual_metadata[rownames(individual_metadata) %in% all_three_ind,] ``` ```{r} corr_gene_across_inds <- function(df_1,df_2) { df_1 <- scale(df_1) df_2 <- scale(df_2) corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,]))) names(corrs) <- rownames(df_1) return(corrs) } gp_corrs <- corr_gene_across_inds(geuvadis_compare, predixcan_compare) gp_corrs_all <- gp_corrs ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare) ge_corrs_all <- ge_corrs cor(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) plot(as.numeric(geuvadis_compare["ENSG00000164308",]), as.numeric(enformer_compare["ENSG00000164308",])) # boxplot(predixcan_compare$HG00096, predixcan_compare$HG00097) # boxplot(as.numeric(predixcan_compare[1,]), as.numeric(predixcan_compare[2,])) # corrs_df <- data.frame(gp=gp_corrs, ge=ge_corrs, ge_abs = abs(ge_corrs), gp_abs = abs(gp_corrs), genes=all_three_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry, pred_vars = as.numeric(apply(predixcan_compare, 1, var)), enf_vars = as.numeric(apply(enformer_compare, 1, var))) # # corrs_df$var_ratio <- (corrs_df$enf_vars + 0.001)/(corrs_df$pred_vars + 0.001) # # # summary(lm(pred_vars ~ ge_corrs, corrs_df)) ``` ```{r} ggplot(corrs_df,aes(x=gp,y=ge)) + geom_point(aes(x=gp,y=ge), alpha=0.5) + xlim(-0.7,0.7) + ylim(-0.7,0.7) + # geom_hline(xintercept=0) + geom_vline(yintercept=0) xlab("geuvadis vs. predixcan correlation across individuals") + ylab("geuvadis vs. enformer correlation across individuals") + geom_hline(aes(yintercept = 0)) + geom_vline(aes(xintercept = 0)) + geom_abline(slope=1, intercept = 0) + geom_smooth(method = "lm", formula = y~x, se = F, color="red") boxplot(corrs_df$ge) corrs_df_popseq <- corrs_df enformer_compare_popseq <- enformer_compare ``` ### Compare the two training predictions ```{r} plot(corrs_df_standard$ge,corrs_df_popseq$ge) abline(0,1) qqplot(as.numeric(corrs_df_standard$ge),as.numeric(corrs_df_popseq$ge)) abline(0,1) plot(as.numeric(geuvadis_compare["ENSG00000129055",]), as.numeric(enformer_compare_standard["ENSG00000129055",])) plot(as.numeric(geuvadis_compare["ENSG00000100321",]), as.numeric(enformer_compare_standard["ENSG00000100321",])) plot(as.numeric(geuvadis_compare["ENSG00000164978",]), as.numeric(enformer_compare_standard["ENSG00000164978",])) boxplot(corrs_df_standard$ge, corrs_df_popseq$ge) ```