import multiprocessing as mp
import os
import pickle as pkl
import sys

import h5py
import kipoiseq
import matplotlib.pyplot as plt
import numpy as np
import pyBigWig
import pysam
import seaborn as sns
Extract the chromosomes that exist in the Basenji training set (downstream steps subset to these).
Code
# Collect the set of chromosomes that appear in the Basenji training
# sequence BED file, so later steps can restrict themselves to these.
basenji_sequences = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/sequences.bed"

chroms_include = set()
with open(basenji_sequences) as f:
    for line in f:
        # BED is tab-delimited; column 0 is the chromosome name.
        fields = line.strip().split("\t")
        chroms_include.add(fields[0])
Read in the pre-computed set of genes common to the Aracena data and our canonical TSS annotations, and load the canonical TSS annotation table.
Code
import random
import pandas as pd

# Pre-computed gene ids shared by the Aracena expression matrix and our
# canonical TSS annotation; one id per line, trailing empty entry dropped.
with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/common_genes.txt", "r") as f:
    common_genes = set(f.read().split("\n")[:-1])

# Canonical TSS annotation table (one row per gene).
tss_list = "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt"
tss_list_df = pd.read_csv(tss_list)
tss_list_df
len(list(tss_list_df["ensembl_gene_id"]))
19667
Create train, validation, and test datasets from the Aracena expression data.
Code
# Load the fully pre-processed Aracena expression matrix (genes x samples)
# and split its sample columns by infection condition: "Flu" (infected)
# versus "NI" (non-infected).
path_to_aracena_expression = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression.txt"
aracena_expression = pd.read_csv(path_to_aracena_expression, sep=" ", index_col=0, header=0)

aracena_expression_cols = list(aracena_expression.columns)
aracena_expression_cols_flu = [c for c in aracena_expression_cols if "Flu" in c]
aracena_expression_cols_ni = [c for c in aracena_expression_cols if "NI" in c]
aracena_expression_cols_both = {
    "Flu": aracena_expression_cols_flu,
    "NI": aracena_expression_cols_ni,
}
Code
# Chromosome-level hold-out split: genes on these chromosomes form the
# validation / test partitions; everything else is training.
test_val_mapping = {
    "valid": {"chr7", "chr15", "chrX", "chr12"},
    "test": {"chr14", "chr8", "chr22", "chr13"},
}
partitioned_genes = {"train": [], "valid": [], "test": []}

with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/tss_site.txt", "w") as region_file:
    for gene in common_genes:
        # Look up this gene's annotation row ONCE instead of re-filtering
        # the full table for each field (original filtered twice per gene).
        gene_row = tss_list_df[tss_list_df["ensembl_gene_id"] == gene]
        gene_chr = "chr" + gene_row["chromosome_name"].values[0]

        # Sex and mitochondrial chromosomes are excluded entirely.
        if gene_chr in ["chrX", "chrY", "chrM"]:
            continue

        if gene_chr in test_val_mapping["valid"]:
            cur_group = "valid"
        elif gene_chr in test_val_mapping["test"]:
            cur_group = "test"
        else:
            cur_group = "train"
        partitioned_genes[cur_group].append(gene)

        # Write a BED-like row: chrom, start, start+2, partition, gene id.
        tss_site = gene_row["transcription_start_site"].values[0]
        out = [gene_chr, str(tss_site), str(tss_site + 2), cur_group, gene]
        region_file.write("\t".join(out) + "\n")
Code
# For every partition x condition pair, take the partition's genes,
# average expression across that condition's samples, and write the
# resulting per-gene mean vector to disk.
for dset in ["train", "valid", "test"]:
    for cond in ["Flu", "NI"]:
        subset = aracena_expression.loc[
            partitioned_genes[dset], aracena_expression_cols_both[cond]
        ]
        subset.sort_index(inplace=True)
        mean_expr = subset.mean(axis=1)
        mean_expr.to_csv(
            f"/beagle3/haky/users/saideep/projects/aracena_modeling/aracena_predixcan/{dset}_{cond}_aracena.txt",
            sep="\t",
            header=False,
        )
Now we need to create the enformer reference epigenome inputs as well
Code
def query_epigenome(chr_num, center_bp, enfref_dir, num_bins=896, tracks=-1):
    """Extract a window of pre-computed Enformer reference predictions.

    Parameters:
        chr_num (int/str): chromosome number (no "chr" prefix)
        center_bp (int): center base-pair position (1-indexed)
        enfref_dir (str): directory containing the concatenated reference
            Enformer files, one HDF5 per chromosome ("chr{N}_cat.h5")
        num_bins (int): number of 128-bp bins to extract centered around
            center_bp (default: 896). If num_bins is even, the center bin
            falls in the second half of the returned window.
        tracks (int list, or -1): track indices to extract; -1 keeps all tracks.

    Returns:
        epigen (np.array): predictions of shape (num_bins, len(tracks)).
    """
    # Map the 1-indexed base-pair position onto its 128-bp prediction bin.
    center_bin = (center_bp - 1) // 128
    half_window = num_bins // 2
    start_bin = center_bin - half_window
    end_bin = center_bin + half_window
    if num_bins % 2 != 0:
        # Odd window: put the extra bin on the right-hand side.
        end_bin += 1

    # NOTE(review): if center_bp is within 128 * (num_bins // 2) of the
    # chromosome start, start_bin goes negative and Python slicing counts
    # from the end of the dataset — callers should avoid such positions.
    with h5py.File(f"{enfref_dir}/chr{chr_num}_cat.h5", "r") as f:
        dataset = f[f"chr{chr_num}"]
        if tracks == -1:
            return dataset[start_bin:end_bin, :]
        return dataset[start_bin:end_bin, tracks]
Code
# Build per-partition Enformer reference-epigenome feature tables: for each
# TSS kept above, extract the 4 prediction bins around the TSS, average them
# per track, and append one row (gene id + track means) to the partition's
# output file.
ref_dir = "/project2/haky/Data/enformer-reference-epigenome"

with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/tss_site.txt", "r") as rf:
    for line in rf:
        p_line = line.strip().split("\t")
        gene = p_line[-1]

        # Only keep chromosomes present in the Basenji training sequences.
        if p_line[0] not in chroms_include:
            continue

        # BUG FIX: the original indexed test_val_mapping["val"], but the
        # dict (defined with the partition split) uses the key "valid" —
        # that lookup raised KeyError on every row.
        if p_line[0] in test_val_mapping["valid"]:
            cur_group = "valid"
        elif p_line[0] in test_val_mapping["test"]:
            cur_group = "test"
        else:
            cur_group = "train"

        # The +1 presumably converts a 0-based stored start to the 1-based
        # coordinate query_epigenome expects — TODO confirm against the
        # TSS annotation's coordinate convention.
        cur_query = query_epigenome(
            p_line[0].lstrip("chr"),
            int(p_line[1]) + 1,
            ref_dir,
            num_bins=4,
            tracks=-1,
        )

        # One output row: gene id followed by the 4-bin mean of every track.
        query_vec = [gene] + [str(x) for x in list(cur_query.mean(axis=0))]
        with open("/beagle3/haky/users/saideep/projects/aracena_modeling/aracena_predixcan/" + cur_group + "_epigenome.txt", "a") as f:
            f.write("\t".join(query_vec) + "\n")