calculating_genetic_distance_from_ref

Author

Saideep Gona

Published

June 3, 2023

Code

suppressMessages(library(tidyverse))
suppressMessages(library(glue))
PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"

## COPY THE DATE AND SLUG fields FROM THE HEADER
SLUG="calculating_genetic_distance_from_ref" ## copy the slug from the header
bDATE='2023-06-03' ## copy the date from the blog's header here
DATA = glue("{PRE}/{bDATE}-{SLUG}")
if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))
WORK=DATA

Context

Previously, I looked for an association between the local ancestry of genes in the refernce genome and their direction of personalized Enformer expression prediction correlation. This found no association. Another useful metric is to look at the genetic distance of the ENCODE tracks used in training against the reference background. This gives a sort of measure of discordance between the training input and output.

Computing the genetic distance b/w reference and genome of track

For this case, I will use the genome of the track pertaining to LCL CAGE expression. This track comes from a common cell line: GM12878 . Roughly, this is a cell line derived from a European Ancestry female. I have downloaded a VCF file from sequencing of the cell line.

I will start by computing the average alternative allele dosage of variants within each gene window. This will give some measure of the sequence deviation from the reference genome.

Load in the VCF and gene regions

Code

library(VariantAnnotation)
library(GenomicRanges)

vcf_file = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai/2023-06-03-calculating_genetic_distance_from_ref/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz"

# vcf <- readVcf(file = vcf_file, genome = "GRCh38")


metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata"
gene_metadata <- as.data.frame(read_delim(file.path(metadata_dir,"gene_metadata_full.csv")))
rownames(gene_metadata) <- as.character(gene_metadata$...1)
gene_metadata$...1 <- NULL

Check if the VCF file reference is the reference genome

Code

example_region <- GRanges(seqnames="chr22", ranges=IRanges(
  start=c(50301422, 50989541),
  end=c(50312106, 51001328),
  names=c("gene_79087", "gene_644186")))

vcf_example <- readVcf(file = vcf_file, genome = "GRCh38", param=example_region)

geno(vcf_example)$GT
rowRanges(vcf_example)

Through manual inspection of this region, it is clear that the reference matches the reference genome, rather than being linked to relative population frequency.

Compute genomic distance from reference

We can write and test a function to compute this genetic distance metric

Code

gene_row = 10

example_gene <- GRanges(seqnames=gene_metadata[gene_row,1], ranges=IRanges(
  start=as.numeric(gene_metadata[gene_row,2]),
  end=as.numeric(gene_metadata[gene_row,3]),
  names=gene_metadata[gene_row,4]))

vcf_example_gene <- readVcf(file = vcf_file, genome = "GRCh38", param=example_gene)

length(geno(vcf_example_gene)$GT)

geno(vcf_example_gene)$GT
# rowRanges(vcf_example)
genos = geno(vcf_example_gene)$GT
total_a_count = 0
for (gt in genos) {
  total_a_count = total_a_count + sum(as.numeric(strsplit(gt,"/")[[1]]))
}
average_a_count = total_a_count/width(example_gene)

Code

calc_alleles_single_gene <- function(gene_row, gene_metadata=gene_metadata, vcf_file=vcf_file) {
  
  if (gene_row[1] == "chrM") {
    return (NA)
  }
  
  example_gene <- GRanges(seqnames=gene_row[1], ranges=IRanges(
    start=as.numeric(gene_row[2]),
    end=as.numeric(gene_row[3]),
    names=gene_row[4]))
  
  vcf_example_gene <- readVcf(file = vcf_file, genome = "GRCh38", param=example_gene)
  
  genos = geno(vcf_example_gene)$GT
  total_a_count = 0
  num_het = 0
  num_homo = 0
  for (gt in genos) {
    cur_a_count <- sum(as.numeric(strsplit(gt,"/")[[1]]))
    if (cur_a_count==2) {
      num_homo = num_homo + 1
    } else {
      num_het = num_het + 1
    }
    total_a_count = total_a_count + cur_a_count
  }
   
  return (c(total_a_count, length(genos),num_het,num_homo))
}

Now let’s run it across all genes

Code

gene_metadata_trimmed <- gene_metadata[1:10,]

total_alt_counts <- apply(gene_metadata,1, calc_alleles_single_gene, gene_metadata=gene_metadata, vcf_file=vcf_file)


ref_var_counts <- matrix(0,nrow=length(total_alt_counts),ncol = 4)
for (x in 1:length(total_alt_counts)) {
  if (length(total_alt_counts[[x]])==1) {
    ref_var_counts[x,] <- c(NA,NA,NA,NA)
  } else {
    ref_var_counts[x,] <- total_alt_counts[[x]]
  }
}

ref_var_counts <- as.data.frame(ref_var_counts)

colnames(ref_var_counts) <- c("total_alt_count_LCL","num_variants_LCL","num_het_LCL","num_homo_LCL")

This takes a while to run, so let’s save it just in case:

Code

save(total_alt_counts, file="altcounts.rda")

Reload alt allele counts here

Code

# load("altcounts.rda")

We can now plot the allelic dosage as a proportion of gene length

Code

gene_metadata$total_alt_count_LCL <- ref_var_counts$total_alt_count_LCL
gene_metadata$num_variants_LCL <- ref_var_counts$num_variants_LCL
gene_metadata$num_het_LCL <- ref_var_counts$num_het_LCL
gene_metadata$num_homo_LCL <- ref_var_counts$num_homo_LCL

gene_metadata$alt_counts_len_normalized_LCL <- gene_metadata$total_alt_count_LCL/gene_metadata$length

ggplot(gene_metadata) + geom_bar(stat="count",aes(x=best_ancestry))
ggplot(gene_metadata) + geom_violin(aes(x=best_ancestry,y=alt_counts_len_normalized_LCL)) + ylim(c(0,0.025))
ggplot(gene_metadata) + geom_boxplot(aes(x=best_ancestry,y=alt_counts_len_normalized_LCL)) + ylim(c(0,0.025))

Code

write_csv(gene_metadata, "gene_metadata_w_allele_counts.csv")

The figures above essentially show that the overall alternate allelic dosage from the cell line (normalized for gene length) is enriched among genes with African reference genome ancestry, which supports our assumptions.

Recompute gene correlations

Code

dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis")

# Load GEUVADIS

geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt"))
geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1)
geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)])
rownames(geuvadis) <- geuvadis_genes_symbols

# Load enformer

enformer_haplo1 <- as.data.frame(read_csv(file.path(dset_dir,"CAGE_lcl_enformer_geuvadis_canonical_haplo1_filtered.csv")))
rownames(enformer_haplo1) <- enformer_haplo1$...1
enformer_haplo1$...1 <- NULL
# rownames(enformer_haplo1) <- enformer_haplo1$X
# enformer_haplo1$X <- NULL
enformer_haplo2 <- as.data.frame(read_csv(file.path(dset_dir,"CAGE_lcl_enformer_geuvadis_canonical_haplo2_filtered.csv")))
rownames(enformer_haplo2) <- enformer_haplo2$X
enformer_haplo2$...1 <- NULL


enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1)
enformer <- (enformer_haplo1 + enformer_haplo2)
rownames(enformer) <- enf_genes_symbols

Code

ge_ind <- intersect(colnames(geuvadis), colnames(enformer))

AFR_ge_ind <- intersect(ge_ind,AFR_ind)
EUR_ge_ind <- intersect(ge_ind,EUR_ind)

ge_genes <- intersect(rownames(geuvadis), rownames(enformer))

geuvadis_compare <- geuvadis[ge_genes, ge_ind]
geuvadis_AFR_compare <- geuvadis[ge_genes, AFR_ge_ind]
geuvadis_EUR_compare <- geuvadis[ge_genes, EUR_ge_ind]

enformer_compare <- enformer[ge_genes, ge_ind]
enformer_AFR_compare <- enformer[ge_genes, AFR_ge_ind]
enformer_EUR_compare <- enformer[ge_genes, EUR_ge_ind]

gene_meta_compare <- gene_metadata[ge_genes,]

print(paste0("Number of individuals: ", length(ge_ind)))
print(paste0("Number of individuals AFR: ", length(AFR_ge_ind)))
print(paste0("Number of individuals EUR: ", length(EUR_ge_ind)))


ge_genes_plus <- ge_genes
print(paste0("Number of genes: ", length(gene_meta_compare$best_ancestry)))
print(paste0("Number of genes,AFR: ", length(gene_meta_compare[gene_meta_compare$best_ancestry=="AFR",]$best_ancestry)))
print(paste0("Number of genes,EUR: ", length(gene_meta_compare[gene_meta_compare$best_ancestry=="EUR",]$best_ancestry)))

Code

corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}


ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare)

corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry)

gene_metadata$all_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge

corrs_df_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),]

corrs_df_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_AFR_EUR$ref_gene_ancestry == "AFR",1,0))

corrs_df_AFR_EUR$corr_direction <- ifelse(corrs_df_AFR_EUR$ge > 0,"+","-")

ggplot(corrs_df_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge))

plot(table(corrs_df_AFR_EUR$ref_gene_ancestry, corrs_df_AFR_EUR$corr_direction))

summary(glm(AFR~ge, data=corrs_df_AFR_EUR, family="binomial"))

Code

corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}


ge_corrs <- corr_gene_across_inds(geuvadis_AFR_compare, enformer_AFR_compare)

corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry)

gene_metadata$AFR_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge

corrs_df_plus <- corrs_df[corrs_df$strand == "+",]

corrs_df_plus_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),]

corrs_df_plus_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_plus_AFR_EUR$ref_gene_ancestry == "AFR",1,0))

corrs_df_plus_AFR_EUR$corr_direction <- ifelse(corrs_df_plus_AFR_EUR$ge > 0,"+","-")

ggplot(corrs_df_plus_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge))

plot(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction))

summary(glm(AFR~ge, data=corrs_df_plus_AFR_EUR, family = "binomial"))

Code

corr_gene_across_inds <- function(df_1,df_2) {
  df_1 <- scale(df_1)
  df_2 <- scale(df_2)
  corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,])))
  names(corrs) <- rownames(df_1)
  return(corrs)
}


ge_corrs <- corr_gene_across_inds(geuvadis_EUR_compare, enformer_EUR_compare)

corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry)

gene_metadata$EUR_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge

corrs_df_plus <- corrs_df[corrs_df$strand == "+",]

corrs_df_plus_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),]

corrs_df_plus_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_plus_AFR_EUR$ref_gene_ancestry == "AFR",1,0))

corrs_df_plus_AFR_EUR$corr_direction <- ifelse(corrs_df_plus_AFR_EUR$ge > 0,"+","-")

ggplot(corrs_df_plus_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge))

# ggplot(data.frame(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction)), aes(x=Var1,y=Freq,fill=Var2, group=Var1)) + geom_bar(stat="identity", position="dodge")

plot(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction))

summary(glm(AFR~ge, data=corrs_df_plus_AFR_EUR, family="binomial"))

Code

write_csv(gene_metadata, "gene_metadata_w_allele_counts.csv")

Incorporate population level information

A potential issue with the above metric is that we haven’t really controlled for the fact that some genes also tend to show more variation in general, so would have stronger alt allele counts by default. If this is roughly even between AFR and EUR genes then we are ok, but it may not be.

--- title: "calculating_genetic_distance_from_ref" author: "Saideep Gona" date: "2023-06-03" format: html: code-fold: true code-summary: "Show the code" execute: freeze: true warning: false eval: false --- ```{r} #| label: Set up box storage directory suppressMessages(library(tidyverse)) suppressMessages(library(glue)) PRE = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai" ## COPY THE DATE AND SLUG fields FROM THE HEADER SLUG="calculating_genetic_distance_from_ref" ## copy the slug from the header bDATE='2023-06-03' ## copy the date from the blog's header here DATA = glue("{PRE}/{bDATE}-{SLUG}") if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}")) WORK=DATA ``` # Context Previously, I looked for an association between the local ancestry of genes in the refernce genome and their direction of personalized Enformer expression prediction correlation. This found no association. Another useful metric is to look at the genetic distance of the ENCODE tracks used in training against the reference background. This gives a sort of measure of discordance between the training input and output. ## Computing the genetic distance b/w reference and genome of track For this case, I will use the genome of the track pertaining to LCL CAGE expression. This track comes from a common cell line: GM12878 . Roughly, this is a cell line derived from a European Ancestry female. I have downloaded a VCF file from sequencing of the cell line. I will start by computing the average alternative allele dosage of variants within each gene window. This will give some measure of the sequence deviation from the reference genome. ### Load in the VCF and gene regions ```{r} library(VariantAnnotation) library(GenomicRanges) vcf_file = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai/2023-06-03-calculating_genetic_distance_from_ref/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" # vcf <- readVcf(file = vcf_file, genome = "GRCh38") metadata_dir = "/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata" gene_metadata <- as.data.frame(read_delim(file.path(metadata_dir,"gene_metadata_full.csv"))) rownames(gene_metadata) <- as.character(gene_metadata$...1) gene_metadata$...1 <- NULL ``` ### Check if the VCF file reference is the reference genome ```{r} example_region <- GRanges(seqnames="chr22", ranges=IRanges( start=c(50301422, 50989541), end=c(50312106, 51001328), names=c("gene_79087", "gene_644186"))) vcf_example <- readVcf(file = vcf_file, genome = "GRCh38", param=example_region) geno(vcf_example)$GT rowRanges(vcf_example) ``` Through manual inspection of this region, it is clear that the reference matches the reference genome, rather than being linked to relative population frequency. ### Compute genomic distance from reference We can write and test a function to compute this genetic distance metric ```{r} gene_row = 10 example_gene <- GRanges(seqnames=gene_metadata[gene_row,1], ranges=IRanges( start=as.numeric(gene_metadata[gene_row,2]), end=as.numeric(gene_metadata[gene_row,3]), names=gene_metadata[gene_row,4])) vcf_example_gene <- readVcf(file = vcf_file, genome = "GRCh38", param=example_gene) length(geno(vcf_example_gene)$GT) geno(vcf_example_gene)$GT # rowRanges(vcf_example) genos = geno(vcf_example_gene)$GT total_a_count = 0 for (gt in genos) { total_a_count = total_a_count + sum(as.numeric(strsplit(gt,"/")[[1]])) } average_a_count = total_a_count/width(example_gene) ``` ```{r} calc_alleles_single_gene <- function(gene_row, gene_metadata=gene_metadata, vcf_file=vcf_file) { if (gene_row[1] == "chrM") { return (NA) } example_gene <- GRanges(seqnames=gene_row[1], ranges=IRanges( start=as.numeric(gene_row[2]), end=as.numeric(gene_row[3]), names=gene_row[4])) vcf_example_gene <- readVcf(file = vcf_file, genome = "GRCh38", param=example_gene) genos = geno(vcf_example_gene)$GT total_a_count = 0 num_het = 0 num_homo = 0 for (gt in genos) { cur_a_count <- sum(as.numeric(strsplit(gt,"/")[[1]])) if (cur_a_count==2) { num_homo = num_homo + 1 } else { num_het = num_het + 1 } total_a_count = total_a_count + cur_a_count } return (c(total_a_count, length(genos),num_het,num_homo)) } ``` Now let's run it across all genes ```{r} gene_metadata_trimmed <- gene_metadata[1:10,] total_alt_counts <- apply(gene_metadata,1, calc_alleles_single_gene, gene_metadata=gene_metadata, vcf_file=vcf_file) ref_var_counts <- matrix(0,nrow=length(total_alt_counts),ncol = 4) for (x in 1:length(total_alt_counts)) { if (length(total_alt_counts[[x]])==1) { ref_var_counts[x,] <- c(NA,NA,NA,NA) } else { ref_var_counts[x,] <- total_alt_counts[[x]] } } ref_var_counts <- as.data.frame(ref_var_counts) colnames(ref_var_counts) <- c("total_alt_count_LCL","num_variants_LCL","num_het_LCL","num_homo_LCL") ``` This takes a while to run, so let's save it just in case: ```{r} save(total_alt_counts, file="altcounts.rda") ``` Reload alt allele counts here ```{r} # load("altcounts.rda") ``` We can now plot the allelic dosage as a proportion of gene length ```{r} gene_metadata$total_alt_count_LCL <- ref_var_counts$total_alt_count_LCL gene_metadata$num_variants_LCL <- ref_var_counts$num_variants_LCL gene_metadata$num_het_LCL <- ref_var_counts$num_het_LCL gene_metadata$num_homo_LCL <- ref_var_counts$num_homo_LCL gene_metadata$alt_counts_len_normalized_LCL <- gene_metadata$total_alt_count_LCL/gene_metadata$length ggplot(gene_metadata) + geom_bar(stat="count",aes(x=best_ancestry)) ggplot(gene_metadata) + geom_violin(aes(x=best_ancestry,y=alt_counts_len_normalized_LCL)) + ylim(c(0,0.025)) ggplot(gene_metadata) + geom_boxplot(aes(x=best_ancestry,y=alt_counts_len_normalized_LCL)) + ylim(c(0,0.025)) ``` ```{r} write_csv(gene_metadata, "gene_metadata_w_allele_counts.csv") ``` The figures above essentially show that the overall alternate allelic dosage from the cell line (normalized for gene length) is enriched among genes with African reference genome ancestry, which supports our assumptions. ### Recompute gene correlations ```{r} #| label: Load Datasets dset_dir <- file.path("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/enformer_geuvadis/enformer_portability_analysis") # Load GEUVADIS geuvadis_full <- read_delim(file.path(dset_dir,"GD462.GeneQuantRPKM.50FN.samplename.resk10.txt")) geuvadis_genes_symbols <- sapply(str_split(geuvadis_full$Gene_Symbol, "\\."),`[`,1) geuvadis <- as.data.frame(geuvadis_full[,5:ncol(geuvadis_full)]) rownames(geuvadis) <- geuvadis_genes_symbols # Load enformer enformer_haplo1 <- as.data.frame(read_csv(file.path(dset_dir,"CAGE_lcl_enformer_geuvadis_canonical_haplo1_filtered.csv"))) rownames(enformer_haplo1) <- enformer_haplo1$...1 enformer_haplo1$...1 <- NULL # rownames(enformer_haplo1) <- enformer_haplo1$X # enformer_haplo1$X <- NULL enformer_haplo2 <- as.data.frame(read_csv(file.path(dset_dir,"CAGE_lcl_enformer_geuvadis_canonical_haplo2_filtered.csv"))) rownames(enformer_haplo2) <- enformer_haplo2$X enformer_haplo2$...1 <- NULL enf_genes_symbols <- sapply(str_split(rownames(enformer_haplo1), "\\."),`[`,1) enformer <- (enformer_haplo1 + enformer_haplo2) rownames(enformer) <- enf_genes_symbols ``` ```{r} ge_ind <- intersect(colnames(geuvadis), colnames(enformer)) AFR_ge_ind <- intersect(ge_ind,AFR_ind) EUR_ge_ind <- intersect(ge_ind,EUR_ind) ge_genes <- intersect(rownames(geuvadis), rownames(enformer)) geuvadis_compare <- geuvadis[ge_genes, ge_ind] geuvadis_AFR_compare <- geuvadis[ge_genes, AFR_ge_ind] geuvadis_EUR_compare <- geuvadis[ge_genes, EUR_ge_ind] enformer_compare <- enformer[ge_genes, ge_ind] enformer_AFR_compare <- enformer[ge_genes, AFR_ge_ind] enformer_EUR_compare <- enformer[ge_genes, EUR_ge_ind] gene_meta_compare <- gene_metadata[ge_genes,] print(paste0("Number of individuals: ", length(ge_ind))) print(paste0("Number of individuals AFR: ", length(AFR_ge_ind))) print(paste0("Number of individuals EUR: ", length(EUR_ge_ind))) ge_genes_plus <- ge_genes print(paste0("Number of genes: ", length(gene_meta_compare$best_ancestry))) print(paste0("Number of genes,AFR: ", length(gene_meta_compare[gene_meta_compare$best_ancestry=="AFR",]$best_ancestry))) print(paste0("Number of genes,EUR: ", length(gene_meta_compare[gene_meta_compare$best_ancestry=="EUR",]$best_ancestry))) ``` ```{r} corr_gene_across_inds <- function(df_1,df_2) { df_1 <- scale(df_1) df_2 <- scale(df_2) corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,]))) names(corrs) <- rownames(df_1) return(corrs) } ge_corrs <- corr_gene_across_inds(geuvadis_compare, enformer_compare) corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry) gene_metadata$all_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge corrs_df_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),] corrs_df_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_AFR_EUR$ref_gene_ancestry == "AFR",1,0)) corrs_df_AFR_EUR$corr_direction <- ifelse(corrs_df_AFR_EUR$ge > 0,"+","-") ggplot(corrs_df_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge)) plot(table(corrs_df_AFR_EUR$ref_gene_ancestry, corrs_df_AFR_EUR$corr_direction)) summary(glm(AFR~ge, data=corrs_df_AFR_EUR, family="binomial")) ``` ```{r} corr_gene_across_inds <- function(df_1,df_2) { df_1 <- scale(df_1) df_2 <- scale(df_2) corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,]))) names(corrs) <- rownames(df_1) return(corrs) } ge_corrs <- corr_gene_across_inds(geuvadis_AFR_compare, enformer_AFR_compare) corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry) gene_metadata$AFR_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge corrs_df_plus <- corrs_df[corrs_df$strand == "+",] corrs_df_plus_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),] corrs_df_plus_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_plus_AFR_EUR$ref_gene_ancestry == "AFR",1,0)) corrs_df_plus_AFR_EUR$corr_direction <- ifelse(corrs_df_plus_AFR_EUR$ge > 0,"+","-") ggplot(corrs_df_plus_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge)) plot(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction)) summary(glm(AFR~ge, data=corrs_df_plus_AFR_EUR, family = "binomial")) ``` ```{r} corr_gene_across_inds <- function(df_1,df_2) { df_1 <- scale(df_1) df_2 <- scale(df_2) corrs <- sapply(seq.int(dim(df_1)[1]), function(i) cor(as.numeric(df_1[i,]), as.numeric(df_2[i,]))) names(corrs) <- rownames(df_1) return(corrs) } ge_corrs <- corr_gene_across_inds(geuvadis_EUR_compare, enformer_EUR_compare) corrs_df <- data.frame(ge=ge_corrs, ge_abs = abs(ge_corrs),genes=ge_genes, gene_names=rownames(gene_meta_compare), gene_h2=gene_meta_compare$h2_WholeBlood_TS, high_h2 = gene_meta_compare$h2_WholeBlood_TS > 0.1, strand = gene_meta_compare$strand, ref_gene_ancestry=gene_meta_compare$best_ancestry) gene_metadata$EUR_geuvadis_ge_corrs <- corrs_df[rownames(gene_metadata),]$ge corrs_df_plus <- corrs_df[corrs_df$strand == "+",] corrs_df_plus_AFR_EUR <- corrs_df[corrs_df$ref_gene_ancestry %in% c("AFR","EUR"),] corrs_df_plus_AFR_EUR$AFR <- as.factor(ifelse(corrs_df_plus_AFR_EUR$ref_gene_ancestry == "AFR",1,0)) corrs_df_plus_AFR_EUR$corr_direction <- ifelse(corrs_df_plus_AFR_EUR$ge > 0,"+","-") ggplot(corrs_df_plus_AFR_EUR) + geom_boxplot(aes(x=ref_gene_ancestry,y=ge)) # ggplot(data.frame(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction)), aes(x=Var1,y=Freq,fill=Var2, group=Var1)) + geom_bar(stat="identity", position="dodge") plot(table(corrs_df_plus_AFR_EUR$ref_gene_ancestry, corrs_df_plus_AFR_EUR$corr_direction)) summary(glm(AFR~ge, data=corrs_df_plus_AFR_EUR, family="binomial")) ``` ```{r} write_csv(gene_metadata, "gene_metadata_w_allele_counts.csv") ``` ### Incorporate population level information A potential issue with the above metric is that we haven't really controlled for the fact that some genes also tend to show more variation in general, so would have stronger alt allele counts by default. If this is roughly even between AFR and EUR genes then we are ok, but it may not be.