1.1_generate_data

Author

Saideep Gona

Published

November 22, 2023

Code

import os, sys
import pickle as pkl
import multiprocessing as mp

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import h5py
import pysam
import pyBigWig
import kipoiseq

Extract the chromosomes present in the Basenji training set (we will subset to these)

Code
basenji_sequences = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/sequences.bed"
chroms_include = set()
with open(basenji_sequences) as f:
    for line in f:
        p_line = line.strip().split("\t")
        chroms_include.add(p_line[0])
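
As a quick sanity check (a minimal sketch, assuming sequences.bed uses UCSC-style chromosome names), we can inspect the extracted set:

Code
print(len(chroms_include), "chromosomes in the Basenji training regions")
print(sorted(chroms_include))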

Read in the pre-computed set of genes shared between the Aracena data and our canonical TSS annotations, and read in the canonical TSS annotations themselves

Code
import pandas as pd

with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/common_genes.txt", "r") as f:
    common_genes = set(f.read().split("\n")[:-1])

tss_list = "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-09-20-diagnose_training_tracks_issue/canonical_TSS_full_metadata.txt"
tss_list_df = pd.read_csv(tss_list)
tss_list_df

len(tss_list_df["ensembl_gene_id"])
19667
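
Since the partitioning below assumes every common gene has a TSS annotation, a quick check (a sketch using the objects loaded above) is worthwhile:

Code
# common_genes should be a subset of the annotated genes
annotated = set(tss_list_df["ensembl_gene_id"])
print(len(common_genes - annotated), "common genes lack a TSS annotation")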

Create train, validation, and test datasets from the Aracena expression data

Code
path_to_aracena_expression = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/fully_preprocessed_expression.txt"

aracena_expression = pd.read_csv(path_to_aracena_expression, sep=" ", index_col=0, header=0)
aracena_expression_cols = list(aracena_expression.columns)
aracena_expression_cols_flu = [x for x in aracena_expression_cols if "Flu" in x]
aracena_expression_cols_ni = [x for x in aracena_expression_cols if "NI" in x]
aracena_expression_cols_both = {
    "Flu": aracena_expression_cols_flu,
    "NI": aracena_expression_cols_ni
}
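
A quick check (a sketch, relying on the column-name convention above) that the two conditions cover every donor column exactly once:

Code
assert len(aracena_expression_cols_flu) + len(aracena_expression_cols_ni) == len(aracena_expression_cols)
print(len(aracena_expression_cols_flu), "Flu columns;", len(aracena_expression_cols_ni), "NI columns")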
Code
test_val_mapping = {
    "valid":{"chr7","chr15", "chrX", "chr12"},
    "test":{"chr14","chr8", "chr22", "chr13"}
}

partitioned_genes = {
    "train": [],
    "valid": [],
    "test": []
}


with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/tss_site.txt", "w") as region_file:
    for gene in common_genes:

        gene_chr = "chr"+tss_list_df[tss_list_df["ensembl_gene_id"] == gene]["chromosome_name"].values[0]
        if gene_chr in ["chrX","chrY","chrM"]:
            continue
        if gene_chr in test_val_mapping["valid"]:
            cur_group = "valid"
            partitioned_genes["valid"].append(gene)
        elif gene_chr in test_val_mapping["test"]:
            cur_group = "test"
            partitioned_genes["test"].append(gene)
        else:
            cur_group = "train"
            partitioned_genes["train"].append(gene)

        tss_site = tss_list_df[tss_list_df["ensembl_gene_id"] == gene]["transcription_start_site"].values[0]
 
        out = [
            gene_chr,
            str(tss_site),
            str(tss_site+2),
            cur_group,
            gene
        ]

        region_file.write("\t".join(out)+"\n")
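
Each record in tss_site.txt is BED-like: chromosome, TSS start, TSS start + 2, partition label, and Ensembl gene ID. A minimal sketch for peeking at the first record (assuming the cell above has run):

Code
with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/tss_site.txt") as f:
    print(f.readline().strip().split("\t"))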
Code
for dset in ["train", "valid", "test"]:
    for cond in ["Flu", "NI"]:
        cur_table = aracena_expression.loc[partitioned_genes[dset], aracena_expression_cols_both[cond]]
        cur_table.sort_index(inplace=True)
        cur_table.mean(axis=1).to_csv(f"/beagle3/haky/users/saideep/projects/aracena_modeling/aracena_predixcan/{dset}_{cond}_aracena.txt", sep="\t", header=False)
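
Each output file is a two-column table of gene ID and mean expression across donors for that condition. A minimal sketch of loading one back (file name assumed from the loop above):

Code
train_flu = pd.read_csv(
    "/beagle3/haky/users/saideep/projects/aracena_modeling/aracena_predixcan/train_Flu_aracena.txt",
    sep="\t", header=None, index_col=0, names=["gene", "mean_expr"]
)
train_flu.head()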

Now we need to create the Enformer reference epigenome inputs as well

Code
def query_epigenome(chr_num, center_bp, enfref_dir, num_bins=896, tracks=-1):
    """
    Parameters:
        chr_num (int/str): chromosome number
        center_bp (int): center base pair position (1-indexed)
        enfref_dir (str): path to the directory containing the concatenated reference Enformer files
        num_bins (int): number of bins to extract centered around center_bp (default: 896)
            note: if the number of bins is even, the center bin will be in the second half of the array
        tracks (int list): list of track indices to extract, or -1 for all 5313 tracks (default: -1)

    Returns:
        epigen (np.array): Enformer predictions centered at center_bp, of shape (num_bins, len(tracks))
    """

    # convert the 1-indexed bp position to a 0-indexed coordinate, then to its 128-bp bin
    center_ind = center_bp - 1
    center_bin = center_ind // 128
    
    half_bins = num_bins // 2
    start_bin = center_bin - half_bins
    end_bin = center_bin + half_bins
    if num_bins % 2 != 0: # if num_bins is odd
        end_bin += 1

    with h5py.File(f"{enfref_dir}/chr{chr_num}_cat.h5", "r") as f:
        # read all tracks if tracks == -1, otherwise only the requested subset
        if tracks == -1:
            epigen = f[f'chr{chr_num}'][start_bin:end_bin, :] 
        else:
            epigen = f[f'chr{chr_num}'][start_bin:end_bin, tracks] 

    return epigen
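
A hypothetical usage sketch (the position and expected shape are illustrative, not from the post): for center_bp = 1,000,000, center_ind = 999,999 and center_bin = 999,999 // 128 = 7812, so num_bins=4 reads bins 7810:7814, with the center bin in the second half.

Code
ref_dir = "/project2/haky/Data/enformer-reference-epigenome"
# 4 bins x all 5313 tracks around an (illustrative) position on chr1
epi = query_epigenome("1", 1_000_000, ref_dir, num_bins=4, tracks=-1)
print(epi.shape)  # expected: (4, 5313)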
Code
ref_dir = "/project2/haky/Data/enformer-reference-epigenome"

with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-22-retrain_expression_predictor/tss_site.txt", "r") as rf:
    for line in rf:
        p_line = line.strip().split("\t")

        gene = p_line[-1]
        # print(gene)

        if p_line[0] not in chroms_include:
            continue
        if p_line[0] in test_val_mapping["valid"]:
            cur_group = "valid"
        elif p_line[0] in test_val_mapping["test"]:
            cur_group = "test"
        else:
            cur_group = "train"


        
        # BED start is 0-based; +1 gives the 1-indexed position query_epigenome expects
        cur_query = query_epigenome(p_line[0].removeprefix("chr"), int(p_line[1]) + 1, ref_dir, num_bins=4, tracks=-1)
        # print(cur_query.shape)
        query_vec = [gene] + [str(x) for x in cur_query.mean(axis=0)]

        with open("/beagle3/haky/users/saideep/projects/aracena_modeling/aracena_predixcan/"+cur_group+"_epigenome.txt", "a") as f:
            f.write("\t".join(query_vec)+"\n")