suppressMessages(library(tidyverse))suppressMessages(library(glue))PRE ="/Users/sgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"## COPY THE DATE AND SLUG fields FROM THE HEADERSLUG="reference-genome-local-ancestry"## copy the slug from the headerbDATE='2023-03-27'## copy the date from the blog's header hereDATA =glue("{PRE}/{bDATE}-{SLUG}")if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))WORK=DATA
Context
One thing we hadn’t yet taken a look at is the local ancestry of genes in the reference genome. This is a useful metric which should be included in our gene metadata.
Other to-dos: Normalize before modeling epistasis Model demographic covariates in epistasis
Assigning local ancestry to genes
Fortunately I was able to acquire some precomputed local ancestry inference from the following link provided by the McCoy lab: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.1/GRCh38/ancestry/
The following steps are for creating assignments of local ancestry to genes based off the files above. First lets concatenate all the bed files and sort them:
---title: "reference-genome-local-ancestry"author: "Saideep Gona"date: "2023-03-27"format: html: code-fold: true code-summary: "Show the code"execute: freeze: true warning: false---```{r}#| label: Set up box storage directorysuppressMessages(library(tidyverse))suppressMessages(library(glue))PRE ="/Users/sgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/Daily-Blog-Sai"## COPY THE DATE AND SLUG fields FROM THE HEADERSLUG="reference-genome-local-ancestry"## copy the slug from the headerbDATE='2023-03-27'## copy the date from the blog's header hereDATA =glue("{PRE}/{bDATE}-{SLUG}")if(!file.exists(DATA)) system(glue::glue("mkdir {DATA}"))WORK=DATA```# ContextOne thing we hadn't yet taken a look at is the local ancestry of genes in the reference genome. This is a useful metric which should be included in our gene metadata.Other to-dos:Normalize before modeling epistasisModel demographic covariates in epistasis## Assigning local ancestry to genesFortunately I was able to acquire some precomputed local ancestry inference from the following link provided by the McCoy lab: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.1/GRCh38/ancestry/The following steps are for creating assignments of local ancestry to genes based off the files above. First lets concatenate all the bed files and sort them:Add an ancestry column to each bed file:```{r}library(tidyverse)ancestries <-c("AFR","AMR","EAS","EUR","SAS")root_dir <-"/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/ref_local_ancestry"for (ancestry in ancestries) { current_bed <-paste0(root_dir,"/GRCh38_ancestry_",ancestry,".bed") tagged_bed <-paste0(root_dir,"/GRCh38_ancestry_",ancestry,"_tagged.bed") current_bed_table <-read.delim(current_bed,skip =1, header =FALSE) current_bed_table$ancestry <- ancestry current_bed_table <- current_bed_table %>%transform(V1=str_replace(V1,"chr",""))write.table(current_bed_table, tagged_bed, sep="\t", col.names =FALSE, quote =FALSE, row.names =FALSE)}``````{python}import osimport pandas as pdos.system("cat /Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/ref_local_ancestry/GRC*_tagged.bed > /Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/ref_local_ancestry/all_tracks_local_ancestry.bed")os.system("sort -n -k 1,1 -k2,2n /Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/ref_local_ancestry/all_tracks_local_ancestry.bed -o /Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/ref_local_ancestry/all_tracks_local_ancestry.bed")``````{python}import pandas as pdancestry_db = []withopen("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/ref_local_ancestry/all_tracks_local_ancestry.bed", "r") asall:for line inall: p_line = line.strip().split("\t") ancestry_db.append(p_line)print(len(ancestry_db))ancestry_ind =0gene_region_ancestry = {}withopen("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/protein_coding_nosex.bed", "r") as genes:for line in genes: p_line = line.strip().split("\t") cur_chrom = p_line[0].lstrip("chr") cur_start =int(p_line[1]) cur_end =int(p_line[2]) cur_gene = p_line[3].split(".")[0]# print(cur_gene)# print(cur_chrom)# print(cur_start,cur_end) local_count = {"AFR":0,"AMR":0,"EAS":0,"SAS":0,"EUR":0,"NA":0} total_anc_count =0 gene_complete=False local_counter =0whilenot gene_complete: local_counter +=1if local_counter %10000==0:# print(cur_chrom,cur_start,cur_end)# print(a_chrom,a_start,a_end) print(ancestry_ind)print("1000") a_start =int(ancestry_db[ancestry_ind][1]) a_end =int(ancestry_db[ancestry_ind][2]) a_ancestry = ancestry_db[ancestry_ind][3]# print(a_chrom,a_start,a_end)if ancestry_ind >= (len(ancestry_db) -1):print("ancestry_ind exceeds db", ancestry_ind,len(ancestry_db))break# print(local_count)# print(ancestry_db[ancestry_ind]) a_chrom = ancestry_db[ancestry_ind][0]if a_chrom in ["X","Y"]: ancestry_ind +=1continueif a_chrom != cur_chrom:print("chrom",cur_chrom,a_chrom)ifint(a_chrom) <int(cur_chrom): ancestry_ind +=1continue gene_complete =Truecontinue a_start =int(ancestry_db[ancestry_ind][1]) a_end =int(ancestry_db[ancestry_ind][2]) a_ancestry = ancestry_db[ancestry_ind][3]# print("cur",cur_chrom,cur_start,cur_end)# print("a",a_chrom,a_start,a_end)# Outside geneif a_end <= cur_start: ancestry_ind +=1continueif a_start >= cur_end: gene_complete =True local_count["NA"] = cur_end-cur_start-total_anc_countcontinue# Left partial overlapif a_start < cur_start:if cur_start < a_end < cur_end: local_count[a_ancestry] += (a_end-cur_start) total_anc_count += (a_end-cur_start) ancestry_ind +=1continue# Complete overlapif a_start >= cur_start:if a_end <= cur_end: local_count[a_ancestry] += (a_end-a_start) total_anc_count += (a_end-a_start) ancestry_ind +=1continue# Right partial overlapif a_end > cur_end:if cur_start < a_start < cur_end: local_count[a_ancestry] += (cur_end-a_start) total_anc_count += (cur_end-a_start) ancestry_ind +=1continue# Larger than geneif a_start < cur_start:if a_end > cur_end: local_count[a_ancestry] += (cur_end-cur_start) total_anc_count += (cur_end-cur_start) ancestry_ind +=1continue gene_region_ancestry[cur_gene] = local_count if ancestry_ind >= (len(ancestry_db) -1):break# print(gene_region_ancestry)gene_df = pd.DataFrame(gene_region_ancestry)gene_df.transpose().to_csv("/Users/saideepgona/Library/CloudStorage/Box-Box/imlab-data/data-Github/analysis-Sai/metadata/ref_local_ancestry/gene_annotations.csv")```