calculating_genetic_distance_from_ref

Author

Saideep Gona

Published

June 3, 2023

Code
suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"

## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG="calculating_genetic_distance_from_ref" ## copy the slug from the header
bDATE='2023-06-03' ## copy the date from the blog's header here
DATA = glue("{PRE}/{bDATE}-{SLUG}")
if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))
WORK=DATA

Context

Previously, I looked for an association between the local ancestry of genes in the refernce genome and their direction of personalized Enformer expression prediction correlation. This found no association. Another useful metric is to look at the genetic distance of the ENCODE tracks used in training against the reference background. This gives a sort of measure of discordance between the training input and output.

Computing the genetic distance b/w reference and genome of track

For this case, I will use the genome of the track pertaining to LCL CAGE expression. This track comes from a common cell line: GM12878 . Roughly, this is a cell line derived from a European Ancestry female. I have downloaded a VCF file from sequencing of the cell line.

I will start by computing the average alternative allele dosage of variants within each gene window. This will give some measure of the sequence deviation from the reference genome.

Load in the VCF and gene regions

Code
library(VariantAnnotation)
library(GenomicRanges)

vcf_file = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai/2023-06-03-calculating_genetic_distance_from_ref/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz"

# vcf <- readVcf(file = vcf_file, genome = "GRCh38")


metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"
gene_metadata <- as.data.frame(read_delim(file.path(metadata_dir,"gene_metadata_full.csv")))
rownames(gene_metadata) <- as.character(gene_metadata$...1)
gene_metadata$...1 <- NULL

Check if the VCF file reference is the reference genome

Code
example_region <- GRanges(seqnames="chr22", ranges=IRanges(
  start=c(50301422, 50989541),
  end=c(50312106, 51001328),
  names=c("gene_79087", "gene_644186")))

vcf_example <- readVcf(file = vcf_file, genome = "GRCh38", param=example_region)

geno(vcf_example)$GT
rowRanges(vcf_example)

Through manual inspection of this region, it is clear that the reference matches the reference genome, rather than being linked to relative population frequency.

Compute genomic distance from reference

We can write and test a function to compute this genetic distance metric

Code
gene_row = 10

example_gene <- GRanges(seqnames=gene_metadata[gene_row,1], ranges=IRanges(
  start=as.numeric(gene_metadata[gene_row,2]),
  end=as.numeric(gene_metadata[gene_row,3]),
  names=gene_metadata[gene_row,4]))

vcf_example_gene <- readVcf(file = vcf_file, genome = "GRCh38", param=example_gene)

length(geno(vcf_example_gene)$GT)

geno(vcf_example_gene)$GT
# rowRanges(vcf_example)
genos = geno(vcf_example_gene)$GT
total_a_count = 0
for (gt in genos) {
  total_a_count = total_a_count + sum(as.numeric(strsplit(gt,"/")[[1]]))
}
average_a_count = total_a_count/width(example_gene)
Code
calc_alleles_single_gene <- function(gene_row, gene_metadata=gene_metadata, vcf_file=vcf_file) {
  
  if (gene_row[1] == "chrM") {
    return (NA)
  }
  
  example_gene <- GRanges(seqnames=gene_row[1], ranges=IRanges(
    start=as.numeric(gene_row[2]),
    end=as.numeric(gene_row[3]),
    names=gene_row[4]))
  
  vcf_example_gene <- readVcf(file = vcf_file, genome = "GRCh38", param=example_gene)
  
  genos = geno(vcf_example_gene)$GT
  total_a_count = 0
  num_het = 0
  num_homo = 0
  for (gt in genos) {
    cur_a_count <- sum(as.numeric(strsplit(gt,"/")[[1]]))
    if (cur_a_count==2) {
      num_homo = num_homo + 1
    } else {
      num_het = num_het + 1
    }
    total_a_count = total_a_count + cur_a_count
  }
   
  return (c(total_a_count, length(genos),num_het,num_homo))
}

Now let’s run it across all genes

Code
gene_metadata_trimmed <- gene_metadata[1:10,]

total_alt_counts <- apply(gene_metadata,1, calc_alleles_single_gene, gene_metadata=gene_metadata, vcf_file=vcf_file)


ref_var_counts <- matrix(0,nrow=length(total_alt_counts),ncol = 4)
for (x in 1:length(total_alt_counts)) {
  if (length(total_alt_counts[[x]])==1) {
    ref_var_counts[x,] <- c(NA,NA,NA,NA)
  } else {
    ref_var_counts[x,] <- total_alt_counts[[x]]
  }
}

ref_var_counts <- as.data.frame(ref_var_counts)

colnames(ref_var_counts) <- c("total_alt_count_LCL","num_variants_LCL","num_het_LCL","num_homo_LCL")

This takes a while to run, so let’s save it just in case:

Code
save(total_alt_counts, file="altcounts.rda")

Reload alt allele counts here

Code
# load("altcounts.rda")

We can now plot the allelic dosage as a proportion of gene length

Code
gene_metadata$total_alt_count_LCL <- ref_var_counts$total_alt_count_LCL
gene_metadata$num_variants_LCL <- ref_var_counts$num_variants_LCL
gene_metadata$num_het_LCL <- ref_var_counts$num_het_LCL
gene_metadata$num_homo_LCL <- ref_var_counts$num_homo_LCL

gene_metadata$alt_counts_len_normalized_LCL <- gene_metadata$total_alt_count_LCL/gene_metadata$length

ggplot(gene_metadata) + geom_bar(stat="count",aes(x=best_ancestry))
ggplot(gene_metadata) + geom_violin(aes(x=best_ancestry,y=alt_counts_len_normalized_LCL)) + ylim(c(0,0.025))
ggplot(gene_metadata) + geom_boxplot(aes(x=best_ancestry,y=alt_counts_len_normalized_LCL)) + ylim(c(0,0.025))
Code
write_csv(gene_metadata, "gene_metadata_w_allele_counts.csv")

The figures above essentially show that the overall alternate allelic dosage from the cell line (normalized for gene length) is enriched among genes with African reference genome ancestry, which supports our assumptions.

Recompute gene correlations

Code
dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path(dset_dir,"CAGE_lcl_enformer_geuvadis_canonical_haplo1_filtered.csv")))
rownames(enformer_haplo1) <- enformer_haplo1$...1
enformer_haplo1$...1 <- NULL
# rownames(enformer_haplo1) <- enformer_haplo1$X
# enformer_haplo1$X <- NULL
enformer_haplo2 <- as.data.frame(read_csv(file.path(dset_dir,"CAGE_lcl_enformer_geuvadis_canonical_haplo2_filtered.csv")))
rownames(enformer_haplo2) <- enformer_haplo2$X
enformer_haplo2$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- (enformer_haplo1 + enformer_haplo2)
rownames(enformer) <- enf_genes_symbols
Code
ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

AFR_ge_ind <- intersect(ge_ind,AFR_ind)
EUR_ge_ind <- intersect(ge_ind,EUR_ind)

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

geuvadis_compare <- geuvadis[ge_genes, ge_ind]
geuvadis_AFR_compare <- geuvadis[ge_genes, AFR_ge_ind]
geuvadis_EUR_compare <- geuvadis[ge_genes, EUR_ge_ind]

enformer_compare <- enformer[ge_genes, ge_ind]
enformer_AFR_compare <- enformer[ge_genes, AFR_ge_ind]
enformer_EUR_compare <- enformer[ge_genes, EUR_ge_ind]

gene_meta_compare <- gene_metadata[ge_genes,]

print(paste0("Number of individuals: ", length(ge_ind)))
print(paste0("Number of individuals AFR: ", length(AFR_ge_ind)))
print(paste0("Number of individuals EUR: ", length(EUR_ge_ind)))


ge_genes_plus <- ge_genes
print(paste0("Number of genes: ", length(gene_meta_compare$best_ancestry)))
print(paste0("Number of genes,AFR: ", length(gene_meta_compare[gene_meta_compare$best_ancestry=="AFR",]$best_ancestry)))
print(paste0("Number of genes,EUR: ", length(gene_meta_compare[gene_meta_compare$best_ancestry=="EUR",]$best_ancestry)))
Code
corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}


ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)

corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry)

gene_metadata$all_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge

corrs_df_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),]

corrs_df_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_AFR_EUR$ref_gene_ancestry == "AFR",1,0))

corrs_df_AFR_EUR$corr_direction <- ifelse(corrs_df_AFR_EUR$ge > 0,"+","-")

ggplot(corrs_df_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge))

plot(table(corrs_df_AFR_EUR$ref_gene_ancestry, corrs_df_AFR_EUR$corr_direction))

summary(glm(AFR~ge, data=corrs_df_AFR_EUR, family="binomial"))
Code
corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}


ge_corrs <- corr_gene_across_inds(geuvadis_AFR_compare, enformer_AFR_compare)

corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry)

gene_metadata$AFR_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge

corrs_df_plus <- corrs_df[corrs_df$strand == "+",]

corrs_df_plus_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),]

corrs_df_plus_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_plus_AFR_EUR$ref_gene_ancestry == "AFR",1,0))

corrs_df_plus_AFR_EUR$corr_direction <- ifelse(corrs_df_plus_AFR_EUR$ge > 0,"+","-")

ggplot(corrs_df_plus_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge))

plot(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction))

summary(glm(AFR~ge, data=corrs_df_plus_AFR_EUR, family = "binomial"))
Code
corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}


ge_corrs <- corr_gene_across_inds(geuvadis_EUR_compare, enformer_EUR_compare)

corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry)

gene_metadata$EUR_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge

corrs_df_plus <- corrs_df[corrs_df$strand == "+",]

corrs_df_plus_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),]

corrs_df_plus_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_plus_AFR_EUR$ref_gene_ancestry == "AFR",1,0))

corrs_df_plus_AFR_EUR$corr_direction <- ifelse(corrs_df_plus_AFR_EUR$ge > 0,"+","-")

ggplot(corrs_df_plus_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge))

# ggplot(data.frame(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction)), aes(x=Var1,y=Freq,fill=Var2, group=Var1)) + geom_bar(stat="identity", position="dodge")

plot(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction))

summary(glm(AFR~ge, data=corrs_df_plus_AFR_EUR, family="binomial"))
Code
write_csv(gene_metadata, "gene_metadata_w_allele_counts.csv")

Incorporate population level information

A potential issue with the above metric is that we haven’t really controlled for the fact that some genes also tend to show more variation in general, so would have stronger alt allele counts by default. If this is roughly even between AFR and EUR genes then we are ok, but it may not be.