analyzing-ERAP2-personalized-mutagenesis

Author

Saideep Gona

Published

April 4, 2023

Code

suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"

## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG="analyzing-ERAP2-personalized-mutagenesis" ## copy the slug from the header
bDATE='2023-04-04' ## copy the date from the blog's header here
DATA = glue("{PRE}/{bDATE}-{SLUG}")
if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))
WORK=DATA

#TODO

Context

It’s time to analyze the ERAP2 mutagenesis results from reference-based and individual mutagenesis

Load in relevant metadata

Code

metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"

pop_metadata <- read_delim(file.path(metadata_dir,"relationships_w_pops.txt"))
rownames(pop_metadata) <- pop_metadata$IID

gene_annotations <- read_csv(file.path(metadata_dir,"ref_local_ancestry","gene_annotations.csv"))
genes <- as.vector(gene_annotations$...1)
gene_annotations <- gene_annotations[,2:ncol(gene_annotations)]
rownames(gene_annotations) <- genes

gene_annotations["ENSG00000164308",]

# A tibble: 1 × 6
    AFR   AMR   EAS   SAS   EUR  `NA`
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     0     0     0     0     0 43717

Code

sum(rowSums(gene_annotations)==0)

[1] 11

Code

hist(rowSums(gene_annotations))

Code

# Geuvadis expression
dset_dir <- "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis"
geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

Code

data_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/ERAP2_personalized_mutagenesis"

genotype_data <- read_csv(file.path(data_dir,"genotypes_df.csv"))
variant_ids <- as.vector(genotype_data[,1])[[1]]
rownames(genotype_data) <- as.vector(genotype_data[,1])[[1]]

genotype_data[,1] <- NULL
rownames(genotype_data) <- variant_ids

allele_count_df <- ifelse(genotype_data=="1|1",2,ifelse(genotype_data=="0|0",0,1))
rownames(allele_count_df) <- variant_ids

variant_metadata_df <- data.frame(af=rowSums(allele_count_df)/(2*ncol(allele_count_df*2)))

Effect size of reference variants alone

It will help to have the effect size of these variants when using a reference background alone. We can load these in:

Code

data_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/ERAP2_personalized_mutagenesis"

haplo1_ref <- read_csv(file.path(data_dir,"erap2_personalized_LCL_REF_quantification_haplo1.csv"))
haplo1_ref <- haplo1_ref[,3:ncol(haplo1_ref)]

haplo1_ref_rc <- read_csv(file.path(data_dir,"erap2_personalized_LCL_REF_quantification_haplo1_rc.csv"))
haplo1_ref_rc<- haplo1_ref_rc[,3:ncol(haplo1_ref_rc)]

haplo1_ref_ave = data.frame(t((haplo1_ref_rc + haplo1_ref)/2))
colnames(haplo1_ref_ave) <- c("expression")

reference_hap_expression <- as.numeric(haplo1_ref_ave[1,1])


haplo2_ref <- read_csv(file.path(data_dir,"erap2_personalized_LCL_REF_quantification_haplo2.csv"))
haplo2_ref <- haplo2_ref[,3:ncol(haplo2_ref)]

haplo2_ref_rc <- read_csv(file.path(data_dir,"erap2_personalized_LCL_REF_quantification_haplo2_rc.csv"))
haplo2_ref_rc <- haplo2_ref_rc[,3:ncol(haplo2_ref_rc)]

haplo2_ref_ave = data.frame(t((haplo2_ref_rc + haplo2_ref)/2))
colnames(haplo2_ref_ave) <- c("expression")

Code

ref_dif <- haplo2_ref_ave - haplo1_ref_ave
colnames(ref_dif) <- c("dif")

ref_dif$ids <- rownames(ref_dif)
ref_dif <- ref_dif %>% separate(ids,into = c("individual","mutated_variant","ref"))

ref_dif$pos <- as.numeric(ref_dif$mutated_variant)

ref_dif$voi <- ifelse(ref_dif$mutated_variant %in% c("96916885","96916728","96900192"),TRUE,FALSE)
ref_dif$voi_sizes <- as.numeric(ifelse(ref_dif$mutated_variant %in% c("96916885","96916728","96900192"),15,1))

ref_dif$allele_freq <- as.numeric(variant_metadata_df[ref_dif$mutated_variant,1])

Code

ggplot(ref_dif) + geom_point(aes(x=pos,y=dif, color = allele_freq), alpha=0.5) + scale_x_continuous(breaks = seq(96875986-80000, 96875986+80000, by = 20000)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + geom_vline(xintercept = 96875986) + ylab("variant effect magnitude") + xlab("position on chr5") + ggtitle("In-silico mutagenesis of ERAP2 variants")

Code

ggplot(ref_dif) + geom_point(aes(x=allele_freq, y=dif))

Ok, let’s see how well some of these interesting variants correlate with the expression patterns we are seeing:

Calculate true individual-specific expression estimates, this first requires loading the personalized expression:

Code

library(tidyverse)

haplo1 <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo1.csv"))
haplo1 <- haplo1[,3:ncol(haplo1)]

haplo1_rc <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo1_rc.csv"))
haplo1_rc<- haplo1_rc[,3:ncol(haplo1_rc)]

haplo1_ave = data.frame(t((haplo1_rc + haplo1)/2))
colnames(haplo1_ave) <- c("expression")


haplo2 <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo2.csv"))
haplo2 <- haplo2[,3:ncol(haplo2)]

haplo2_rc <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo2_rc.csv"))
haplo2_rc <- haplo2_rc[,3:ncol(haplo2_rc)]

haplo2_ave = data.frame(t((haplo2_rc + haplo2)/2))
colnames(haplo2_ave) <- c("expression")

Code

haplo1_ave$ids <- rownames(haplo1_ave)
haplo1_ave <- haplo1_ave %>% separate(ids,into = c("individual","mutated_variant","mutated_genotype"))
haplo1_ave$haplotype <- "1"

haplo1_ref <- haplo1_ave[haplo1_ave$mutated_genotype == "REF",]
haplo1_alt <- haplo1_ave[haplo1_ave$mutated_genotype == "ALT",]

haplo2_ave$ids <- rownames(haplo2_ave)
haplo2_ave <- haplo2_ave %>% separate(ids,into = c("individual","mutated_variant","mutated_genotype"))
haplo2_ave$haplotype <- "2"

haplo2_ref <- haplo2_ave[haplo2_ave$mutated_genotype == "REF",]
haplo2_alt <- haplo2_ave[haplo2_ave$mutated_genotype == "ALT",]

ref_stack <- rbind(haplo1_ref,haplo2_ref)
alt_stack <- rbind(haplo1_alt,haplo2_alt)

full_df <- data.frame(individual=ref_stack$individual,haplotype=ref_stack$haplotype,mutated_variant=ref_stack$mutated_variant, ref_expression=ref_stack$expression, alt_expression=alt_stack$expression)

Code

erap2_expression <- c()
  
for (ind in colnames(genotype_data)) {
  expression = 0
  cur_geno = str_split(genotype_data["96916885",ind], "|")
  
  if (cur_geno[[1]][2] == "0") {
    id_tag = paste(ind,"96916885","REF", sep="_")
    expression = expression + haplo1_ave[id_tag,]$expression
  } else {
    id_tag = paste(ind,"96916885","ALT", sep="_")
    expression = expression + haplo1_ave[id_tag,]$expression
  }

  if (cur_geno[[1]][4] == "0") {
    id_tag = paste(ind,"96916885","REF", sep="_")
    expression = expression + haplo2_ave[id_tag,]$expression
  } else {
    id_tag = paste(ind,"96916885","ALT", sep="_")
    expression = expression + haplo2_ave[id_tag,]$expression
  }
  
  erap2_expression <- c(erap2_expression, expression)
}

Let’s also grab the large effect size variants from the reference mutagenesis:

Code

ref_dif[ref_dif$dif > 4,]

                            dif individual mutated_variant ref      pos   voi
reference_96876409_REF 5.024243  reference        96876409 REF 96876409 FALSE
reference_96896518_REF 9.473810  reference        96896518 REF 96896518 FALSE
                       voi_sizes allele_freq
reference_96876409_REF         1   0.0200000
reference_96896518_REF         1   0.5628571

Code

erap2_expression_df <- data.frame(enformer_predicted_whole_expression=erap2_expression,
                                  geuvadis_expression = as.numeric(geuvadis["ENSG00000164308",colnames(genotype_data)]),
                                  fm_96916885_geno=allele_count_df["96916885",],
                                  splice_96900192_geno=allele_count_df["96900192",],
                                  fm_96916728_geno=allele_count_df["96916728",],
                                  large5_96876409_geno=allele_count_df["96876409",],
                                  large10_96896518_geno=allele_count_df["96896518",])
rownames(erap2_expression_df) <- colnames(genotype_data)

erap2_expression_df$ancestry <- as.vector(pop_metadata[rownames(erap2_expression_df),]$population)

ggplot(erap2_expression_df) + geom_point(aes(x=geuvadis_expression, y=enformer_predicted_whole_expression, color=ancestry)) + xlab("Geuvadis Expression for ERAP2") + ylab("Enformer predicted ERAP2 expression") + ggtitle("Predicted vs. Observed ERAP2 expression across individuals")

Code

ggplot(erap2_expression_df) + geom_point(aes(x=geuvadis_expression, y=enformer_predicted_whole_expression, color=splice_96900192_geno, shape=ancestry))

Code

ggplot(erap2_expression_df) + geom_point(aes(x=geuvadis_expression, y=enformer_predicted_whole_expression, color=fm_96916885_geno, shape=ancestry))

Code

ggplot(erap2_expression_df) + geom_point(aes(x=geuvadis_expression, y=enformer_predicted_whole_expression, color=fm_96916728_geno, shape=ancestry))

Code

ggplot(erap2_expression_df) + geom_point(aes(x=geuvadis_expression, y=enformer_predicted_whole_expression, color=large5_96876409_geno, shape=ancestry))

Code

ggplot(erap2_expression_df) + geom_point(aes(x=geuvadis_expression, y=enformer_predicted_whole_expression, color=large10_96896518_geno, shape=ancestry))

Let’s generate predicted personalized gene expression by summing up individual variant effects with the reference prediction as a baseline:

Code

allele_count_mat <- as.data.frame(allele_count_df)

dif_mat <- data.frame(difs = ref_dif$dif)
rownames(dif_mat) <- ref_dif$mutated_variant

allele_count_mat <- t(as.matrix(allele_count_mat[rownames(dif_mat),]))
dif_mat <- as.matrix(dif_mat)

linear_predictions <- allele_count_mat %*% dif_mat + reference_hap_expression*2

erap2_expression_df$linear_pred <- as.vector(linear_predictions)

ggplot(erap2_expression_df) + geom_point(aes(x=linear_predictions, y= enformer_predicted_whole_expression, color=ancestry)) + ylab("Enformer predicted ERAP2 expression") + xlab("Sum of variant effects") + ggtitle("Enformer predicted ERAP2 expression vs. sum of individual Enformer variant \n effects for ERAP2")

The above plot plots the individualized expression predictions

Clearly, ERAP2 is well predicted as a linear sum of individual variant effects. This supports the conclusion that the large effect variant is at play here.

personalized mutagenesis

After loading in the personalized expression estimates

We can also annotate each value with data for the individual, haplotype, mutated variant, and genotype of mutated variant.

Code

library(tidyverse)

haplo1 <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo1.csv"))
haplo1 <- haplo1[,3:ncol(haplo1)]

haplo1_rc <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo1_rc.csv"))
haplo1_rc<- haplo1_rc[,3:ncol(haplo1_rc)]

haplo1_ave = data.frame(t((haplo1_rc + haplo1)/2))
colnames(haplo1_ave) <- c("expression")


haplo2 <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo2.csv"))
haplo2 <- haplo2[,3:ncol(haplo2)]

haplo2_rc <- read_csv(file.path(data_dir,"erap2_personalized_LCL_quantification_haplo2_rc.csv"))
haplo2_rc <- haplo2_rc[,3:ncol(haplo2_rc)]

haplo2_ave = data.frame(t((haplo2_rc + haplo2)/2))
colnames(haplo2_ave) <- c("expression")

Code

haplo1_ave$ids <- rownames(haplo1_ave)
haplo1_ave <- haplo1_ave %>% separate(ids,into = c("individual","mutated_variant","mutated_genotype"))
haplo1_ave$haplotype <- "1"

haplo1_ref <- haplo1_ave[haplo1_ave$mutated_genotype == "REF",]
haplo1_alt <- haplo1_ave[haplo1_ave$mutated_genotype == "ALT",]

haplo2_ave$ids <- rownames(haplo2_ave)
haplo2_ave <- haplo2_ave %>% separate(ids,into = c("individual","mutated_variant","mutated_genotype"))
haplo2_ave$haplotype <- "2"

haplo2_ref <- haplo2_ave[haplo2_ave$mutated_genotype == "REF",]
haplo2_alt <- haplo2_ave[haplo2_ave$mutated_genotype == "ALT",]

ref_stack <- rbind(haplo1_ref,haplo2_ref)
alt_stack <- rbind(haplo1_alt,haplo2_alt)

full_df <- data.frame(individual=ref_stack$individual,haplotype=ref_stack$haplotype,mutated_variant=ref_stack$mutated_variant, ref_expression=ref_stack$expression, alt_expression=alt_stack$expression)

We now have collected all our estimates into a concise data frame. Next we can calculate some effect-size related estimates by comparing the alt,ref

Code

full_df$dif <- full_df$alt_expression-full_df$ref_expression
full_df$fc <- full_df$alt_expression/full_df$ref_expression
full_df$logfc <- log2(full_df$fc)

full_df$log_ref <- log(full_df$ref_expression)
full_df$log_alt <- log(full_df$alt_expression)

We can also annotate our results by individual ancestry

Code

full_df$ancestry <- as.vector(pop_metadata[full_df$individual,]$population)

Code

# TODO Plot against allele frequency
# TODO Color by ancestry 

ggplot(full_df) + geom_violin(aes(x=mutated_variant, y=dif, fill=mutated_variant)) + geom_jitter(aes(x=mutated_variant, y=dif))+geom_hline(yintercept = ref_dif[ref_dif$mutated_variant=="96916885","dif"], color="blue") + geom_hline(yintercept = ref_dif[ref_dif$mutated_variant=="96916728","dif"], color="green") + geom_hline(yintercept = ref_dif[ref_dif$mutated_variant=="96900192","dif"], color="red") +scale_color_manual(name='Personalized vs. ref mutagenesis',
                     breaks=c('96916885', '96916728', '96900192'),
                     values=c('96916885'='red', '96916728'='blue', '96900192'='green')) + theme(legend.title=element_text(size=20),
       legend.text=element_text(size=14))

Code

ggplot(full_df) + geom_violin(aes(x=mutated_variant, y=dif, fill=haplotype))

Code

ggplot(full_df) + geom_violin(aes(x=mutated_variant, y=dif, fill=ancestry))

The distributions are certainly interesting.

It would be good to try and identify if the 96916728 “positive” group is an interaction with a specific variant present more frequently in that subgroup.

Code

full_df_fm_96916728_pos <- full_df[(full_df$dif > 0) & (full_df$mutated_variant == 96916728),]

fm_96916728_pos_individuals <- unique(full_df_fm_96916728_pos$individual)


allele_count_sub <- as.data.frame(t(allele_count_df[rownames(dif_mat),]))

allele_count_sub$fm_96916728_pos <- rownames(allele_count_sub) %in% fm_96916728_pos_individuals

fm_96916728_pos_cor <- as.data.frame(cor(allele_count_sub[,colnames(allele_count_sub != "fm_96916728_pos")],allele_count_sub$fm_96916728_pos))

fm_96916728_pos_cor$sites <- as.numeric(rownames(fm_96916728_pos_cor))
colnames(fm_96916728_pos_cor) <- c("cor", "sites")

fm_96916728_pos_cor$allele_freq <- as.numeric(variant_metadata_df[rownames(fm_96916728_pos_cor),1])

ggplot(fm_96916728_pos_cor) + geom_point(aes(x=sites,y=cor, col=allele_freq))