build_peak_EnPACT

Author

Saideep Gona

Published

March 26, 2024

Context

In this notebook, I build linearized Con-EnPACT models for peak data from Aracena et al. The peak data are as follows:

  • ATAC - chromatin accessibility
  • H3K27ac - histone mark; active enhancers and promoters/TSS
  • H3K27me3 - histone mark; Polycomb repression
  • H3K4me1 - histone mark; enhancers (primed and active)
  • H3K4me3 - histone mark; promoters/TSS

Code
import os,sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json

import subprocess

import pyliftover
Code
# Set up modalities

path_to_config_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/example_json/example_config_with_twas.json"
path_to_run_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/run_training.sbatch"
path_to_project_dir = "/beagle3/haky/users/saideep/projects/Con_EnPACT/models"
path_to_count_table = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/normalized_peak_data/"
path_to_annotations = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/annotations/"

modality_pcs = {
    "H3K27ac":{
        "Flu":1,
        "NI":1
    },
    "H3K27me3":{
        "Flu":2,
        "NI":1
    },
    "H3K4me1":{
        "Flu":4,
        "NI":2
    },
    "H3K4me3":{
        "Flu":2,
        "NI":2
    },
    "ATAC":{
        "Flu":3,
        "NI":3
    }
}
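
For reference, the per-modality normalized count tables used later in this notebook follow the naming convention fully_preprocessed_{modality}_{Flu PCs}_{NI PCs}.txt. A quick check of the expected paths implied by the dictionary above:

Code
# List the expected normalized count-table paths implied by modality_pcs
for modality, pcs in modality_pcs.items():
    print(os.path.join(path_to_count_table,
                       f"fully_preprocessed_{modality}_{pcs['Flu']}_{pcs['NI']}.txt"))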

Create Peak annotation files

Code
def liftover_count_table(count_table, from_build="hg19", to_build="hg38", valid_chroms=["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY"]):

    # Liftover index of dataframe to new build, filter rows if not possible 

    lo = pyliftover.LiftOver(from_build, to_build)

    new_regions = []
    new_regions_keep = []
    for region in count_table.index:
        chrom = region.split("_")[0]
        if len(region.split("_")) != 3:
            print("region length not 3")
            new_regions_keep.append(False)
            continue
        if chrom not in valid_chroms:
            print("chromosome not in valid chroms")
            new_regions_keep.append(False)
            continue
        start = int(region.split("_")[1])
        end = int(region.split("_")[2])

        new_start_coords = lo.convert_coordinate(chrom, start)
        if len(new_start_coords) == 0:
            print("No conversion")
            new_regions_keep.append(False)
            continue
        new_start = new_start_coords[0][1]
                                        
        new_end_coords = lo.convert_coordinate(chrom, end)
        if len(new_end_coords) == 0:
            print("No conversion")
            new_regions_keep.append(False)
            continue                                
        new_end = new_end_coords[0][1]

        new_regions.append(chrom + "_" + str(new_start) + "_" + str(new_end))
        new_regions_keep.append(True)

    new_count_table = count_table[new_regions_keep]
    new_count_table.index = new_regions
    return new_count_table

def liftover_predictdb_genotype_file(genotype_file, from_build="hg19", to_build="hg38"):

    lo = pyliftover.LiftOver(from_build, to_build)

    genotypes_df = pd.read_csv(genotype_file, sep="\t")

    chroms = genotypes_df["varID"].apply(lambda x: x.split("_")[0])
    starts = genotypes_df["varID"].apply(lambda x: int(x.split("_")[1]))

    refs = genotypes_df["varID"].apply(lambda x: x.split("_")[2])
    alts = genotypes_df["varID"].apply(lambda x: x.split("_")[3])

    new_starts = []

    unconverted = 0

    for chrom, start in zip(chroms, starts):
        new_coords = lo.convert_coordinate(chrom, start)
        if len(new_coords) == 0:
            unconverted += 1
            new_starts.append("UNCONVERTED")
        else:
            new_starts.append(new_coords[0][1])


    if to_build == "hg19":
        build_tag = "b37"
        from_build_tag = "b38"
    elif to_build == "hg38":
        build_tag = "b38"
        from_build_tag = "b37"

    genotypes_df["varID"] = ["_".join([chrom,str(new_start),str(ref),str(alt),build_tag]) for chrom, new_start, ref, alt in zip(chroms, new_starts, refs, alts)]

    genotypes_df_no_unconverted = genotypes_df[~genotypes_df["varID"].str.contains("UNCONVERTED")]

    genotypes_df_no_unconverted = genotypes_df_no_unconverted.drop_duplicates(subset=['varID'])

    print(genotypes_df.shape)
    print(genotypes_df_no_unconverted.shape)


    out_file = genotype_file.replace(from_build_tag, build_tag)

    genotypes_df_no_unconverted.to_csv(out_file, sep="\t", index=False)

    print("Unconverted: ", unconverted)

def liftover_snp_annotation_file(annotation_file, from_build="hg19", to_build="hg38"):

    lo = pyliftover.LiftOver(from_build, to_build)

    annotation_df = pd.read_csv(annotation_file, sep="\t")

    chroms = annotation_df["chr"].apply(lambda x: "chr"+str(x))
    starts = annotation_df["pos"].apply(lambda x: int(x))

    refs = annotation_df["ref_vcf"]
    alts = annotation_df["alt_vcf"]

    new_starts = []

    unconverted = 0

    for chrom, start in zip(chroms, starts):
        new_coords = lo.convert_coordinate(chrom, start)
        if len(new_coords) == 0:
            unconverted += 1
            new_starts.append("UNCONVERTED")
        else:
            new_starts.append(new_coords[0][1])
    
    if to_build == "hg19":
        build_tag = "b37"
        from_build_tag = "b38"
    elif to_build == "hg38":
        build_tag = "b38"
        from_build_tag = "b37"

    varids = ["_".join([chrom,str(new_start),str(ref),str(alt),build_tag]) for chrom, new_start, ref, alt in zip(chroms, new_starts, refs, alts)]

    annotation_df["varID"] = varids
    annotation_df["rsid"] = varids

    annotation_df["pos"] = new_starts

    annotation_df_no_unconverted = annotation_df[~annotation_df["varID"].str.contains("UNCONVERTED")]

    print(annotation_df.shape)
    print(annotation_df_no_unconverted.shape)

    out_file = annotation_file.replace(from_build_tag, build_tag)
    # Write only the successfully lifted-over rows, mirroring the genotype function above
    annotation_df_no_unconverted.to_csv(out_file, sep="\t", index=False)

    print("Unconverted: ", unconverted)

def make_annotation_from_count(cur_counts, modality, annotation_dir, valid_chroms):

    predictdb_annotation_file_dict = {
        "chr":[],
        "gene_id":[],
        "gene_name":[],
        "start":[],
        "end":[],
        "gene_type":[]
    }

    annotation_file_dict = {"ensembl_gene_id":[],
                            "external_gene_name":[],
                            "chromosome_name":[],
                            "transcript_biotype":[],
                            "transcript_start":[],
                            "transcript_end":[],
                            "transcription_start_site":[],
                            "transcript_is_canonical":[],
                            "transcript_count":[],
                            "target_regions":[]
                            }
    
    for row in cur_counts.iterrows():

        region = row[0]
        if len(region.split("_")) != 3:
            print("region length not 3")
            continue
        chrom = region.split("_")[0]
        if chrom not in valid_chroms:
            print("chromosome not in valid chroms")
            continue
        start = int(region.split("_")[1])
        end = int(region.split("_")[2])

        middle = (start + end) // 2

        predictdb_annotation_file_dict["chr"].append(chrom.strip("chr"))
        predictdb_annotation_file_dict["gene_id"].append(region)
        predictdb_annotation_file_dict["gene_name"].append(region)
        predictdb_annotation_file_dict["start"].append(start)
        predictdb_annotation_file_dict["end"].append(end)
        predictdb_annotation_file_dict["gene_type"].append("protein_coding")


        annotation_file_dict["ensembl_gene_id"].append(region)
        annotation_file_dict["external_gene_name"].append(region)

        annotation_file_dict["chromosome_name"].append(chrom.strip("chr"))
        annotation_file_dict["transcript_biotype"].append("NA")
        annotation_file_dict["transcript_start"].append(start)
        annotation_file_dict["transcript_end"].append(end)
        annotation_file_dict["transcription_start_site"].append(middle)
        annotation_file_dict["transcript_is_canonical"].append(1)
        annotation_file_dict["transcript_count"].append("NA")
        annotation_file_dict["target_regions"].append(region)


    predictdb_annotation_file = pd.DataFrame(predictdb_annotation_file_dict)
    print(predictdb_annotation_file.shape)
    o_file = os.path.join(annotation_dir, f"{modality}_predictdb_annotation.txt")
    predictdb_annotation_file.to_csv(o_file, sep="\t", index=False)

    annotation_file = pd.DataFrame(annotation_file_dict)
    print(annotation_file.shape)
    o_file = os.path.join(annotation_dir, f"{modality}_annotation.txt")
    annotation_file.to_csv(o_file, sep=",", index=False)
        

Reformat normalized count tables, split Flu and NI and subset to genotype file

Code
# # Reformat normalized count tables, split Flu and NI and subset to genotype file

# genotype_file = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes_b37.txt"
# genotype_data = pd.read_csv(genotype_file, sep="\t", index_col=0)
# genotype_data.head()
# genotype_samples = genotype_data.columns

# for modality in modality_pcs.keys():

#     cur_count_file = os.path.join(path_to_count_table,f"fully_preprocessed_{modality}_{modality_pcs[modality]['Flu']}_{modality_pcs[modality]['NI']}.txt")
#     all_expression_ori = pd.read_csv(
#         cur_count_file,
#          sep=" ")
    
    
#     valid_chroms = "chr1,chr2,chr3,chr4,chr5,chr6,chr9,chr10,chr11,chr16,chr17,chr18,chr19,chr20,chr21,chr7,chr15,chrX,chr12,chr14,chr8,chr22,chr13".split(",")
#     # Liftover peaks to hg38
#     all_expression= liftover_count_table(all_expression_ori, from_build="hg19", to_build="hg38", valid_chroms=valid_chroms)

#     # Make annotation file for EnPACT
#     make_annotation_from_count(all_expression, modality, path_to_annotations, valid_chroms)

#     print(all_expression_ori.shape)
#     print(all_expression.shape)

#     samples = all_expression.columns

#     flu_samples = [x for x in samples if "Flu" in x]
#     ni_samples = [x for x in samples if "NI" in x]

#     flu_samples = [x for x in flu_samples if x.split("_")[0] in genotype_samples]
#     ni_samples = [x for x in ni_samples if x.split("_")[0] in genotype_samples]

#     flu_expression = all_expression[flu_samples]
#     ni_expression = all_expression[ni_samples]

#     flu_expression = flu_expression.rename(columns={x:x.replace("_Flu","") for x in flu_samples})
#     ni_expression = ni_expression.rename(columns={x:x.replace("_NI","") for x in ni_samples})

#     print(flu_expression.shape)
#     print(ni_expression.shape)

#     # Write reformated files

#     flu_expression.to_csv(cur_count_file.replace("preprocessed_", "preprocessed_Flu_"), index=True, index_label="NAME", sep="\t")
#     ni_expression.to_csv(cur_count_file.replace("preprocessed_", "preprocessed_NI_"), index=True, index_label="NAME", sep="\t")

Liftover inputs for PredictDB

This isn’t always necessary if your inputs are all already in the same genome build.

Code
genotype_file_to_convert = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/QTL_mapping/SNP_genotypes_b37.txt"
annotation_file_to_convert = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/ref/rsIDs.for.predixcan_b37.txt"


converted_genotype_file = genotype_file_to_convert.replace("b37", "b38")
converted_annotation_file = annotation_file_to_convert.replace("b37", "b38")
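
Before running liftover, it can help to confirm which build tag the genotype varIDs currently carry (they follow the chr_pos_ref_alt_build format assumed by the liftover functions above). A small sanity-check sketch:

Code
# Peek at the build suffix of the first few varIDs (expected: b37 before liftover)
sample_ids = pd.read_csv(genotype_file_to_convert, sep="\t",
                         usecols=["varID"], nrows=5)["varID"]
print(sample_ids.str.rsplit("_", n=1).str[-1].value_counts())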
Code
# Convert genotype file to hg38 (shared among all runs)

# liftover_predictdb_genotype_file(genotype_file_to_convert, from_build="hg19", to_build="hg38")
Code
# Convert SNP annotation file to hg38 (shared among all runs)

# liftover_snp_annotation_file(annotation_file_to_convert, from_build="hg19", to_build="hg38")

Train EnPACT models

Code
window_sizes = [2,4,8,16,32,64,128]
Code
for modality in modality_pcs.keys():
    for context in ["Flu","NI"]:
        for window_size in window_sizes:
            cur_proj_dir = os.path.join(path_to_project_dir,context+"_"+modality+"_ws"+str(window_size))
            os.makedirs(cur_proj_dir,exist_ok=True)

            with open(path_to_config_template,"r") as f:

                config = json.load(f)
                config["general_parameters"]["project_directory"] = cur_proj_dir
                config["general_parameters"]["context"] = context

                config["generate_enpact_training_data"]["input_files"]["normalized_expression_data"] = os.path.join(path_to_count_table,f"fully_preprocessed_{context}_{modality}_{modality_pcs[modality]['Flu']}_{modality_pcs[modality]['NI']}.txt")
                config["generate_enpact_training_data"]["reference_epigenome"]["num_bins"] = window_size 
                config["generate_enpact_training_data"]["input_files"]["gene_annotations"] = os.path.join(path_to_annotations,f"{modality}_annotation.txt")

                with open(os.path.join(cur_proj_dir,"config.json"),"w") as f:
                    json.dump(config,f, indent=4)

            with open(path_to_run_template,"r") as f:
                run = f.read()
                run = run.replace("CONFIG_FILE",os.path.join(cur_proj_dir,"config.json"))
                run = run.replace("JOBNAME",context+"_"+modality)
                run = run.replace("ERROR_LOG", os.path.join(cur_proj_dir,"error_training.log"))
                run = run.replace("OUTPUT_LOG", os.path.join(cur_proj_dir,"output_training.log"))
                with open(os.path.join(cur_proj_dir,"run_training.sbatch"),"w") as f:
                    f.write(run)

            # subprocess.run(["sbatch",
            #                 os.path.join(cur_proj_dir,"run_training.sbatch")],
            #                 cwd=cur_proj_dir)

Personalized prediction evaluation

Before moving on to linearization and TWAS, it is important to examine the raw personalized prediction performance of the EnPACT models. Based on how training performance varied across window sizes, we can also settle on the window size to use for personalized prediction and linearization. The chosen values are stored below:

Code
optimal_window_sizes = {
    "H3K27ac":8,
    "H3K27me3":64,
    "H3K4me1":32,
    "H3K4me3":8,
    "ATAC":8
}

nfolds_per_mode = {
    "H3K27ac":3,
    "H3K27me3":3,
    "H3K4me1":3,
    "H3K4me3":3,
    "ATAC":3
}

closest_enformer_track = {
    "H3K27ac":8,
    "H3K27me3":64,
    "H3K4me1":32,
    "H3K4me3":8,
    "ATAC":517
}

max_features = 500

path_to_pdb_standard_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/run_predictdb_standard.sbatch"
path_to_pp_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/run_personalized_predictions.sbatch"

Prepare VCF files for personalized prediction

Personalized prediction requires a VCF file encoding each individual's genotypes. In this case, the existing VCFs are in hg19, and their sample names do not match the naming used in the other input files. Liftover is not required as long as the other input files are also in hg19, but renaming the VCF samples for consistency will help.

Code
vcf_dir = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/VCF/imputed"

import glob
import cyvcf2

vcf_files = glob.glob(os.path.join(vcf_dir,"*.dose.vcf.gz.chr.vcf.gz.snps.vcf.gz.noimputed.vcf.gz"))

new_vcf_dir = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/VCF/renamed"

reheader_script_dir = "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2024-03-24-run_EnPACT_peak_data/vcf_reheader"

bcftools = "/beagle3/haky/users/saideep/software/bcftools-1.19/bcftools"

# with open(os.path.join(reheader_script_dir,"reheader.sh"), "w") as rs:
#     for vcf_file in vcf_files:


#         new_vcf_file = os.path.join(new_vcf_dir,os.path.basename(vcf_file).replace(".noimputed.vcf.gz",".noimputed.reheader.vcf.gz"))

#         cur_samples = cyvcf2.VCF(vcf_file).samples
#         new_samples = [x.split("_")[1] for x in cur_samples]

#         new_samples_file = os.path.join(reheader_script_dir,os.path.basename(vcf_file).replace(".vcf.gz",".header"))
#         with open(new_samples_file, "w") as f:
#             f.write("\n".join(new_samples))

#         rs.write(f"{bcftools} reheader -s {new_samples_file} -o {new_vcf_file} {vcf_file}\n")
#         rs.write(f"tabix -p vcf {new_vcf_file}\n\n")
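
After reheadering, a quick sanity check is to confirm that the renamed VCFs now carry the shortened sample IDs (the second underscore-delimited field of the original names). A minimal sketch:

Code
# Print the sample header of one reheadered VCF to confirm the renaming worked
renamed_vcfs = glob.glob(os.path.join(new_vcf_dir, "*.reheader.vcf.gz"))
if renamed_vcfs:
    print(cyvcf2.VCF(renamed_vcfs[0]).samples[:5])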

Prepare config file for personalized prediction and run the relevant pipeline step

Personalized prediction requires running PredictDB training directly on the study population. This selects the features used for EnPACT personalized prediction and also provides a model to compare against. The code below therefore generates the run script for standard PredictDB as well. Run that first, and then, once the filtered .db is created, run the personalized prediction run script.
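
If you want to enforce that ordering automatically, one option (a sketch, not what was run here) is to chain the two submissions with a Slurm job dependency inside the loop below, in place of the direct sbatch call:

Code
# Submit standard PredictDB first, then make personalized prediction wait on it
pdb_submit = subprocess.run(
    ["sbatch", "--parsable", os.path.join(cur_proj_dir, "run_predictdb_standard.sbatch")],
    cwd=cur_proj_dir, capture_output=True, text=True)
pdb_job_id = pdb_submit.stdout.strip()  # --parsable prints just the job ID

subprocess.run(
    ["sbatch", f"--dependency=afterok:{pdb_job_id}",
     os.path.join(cur_proj_dir, "run_personalized_predictions.sbatch"), "True"],
    cwd=cur_proj_dir)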

Code

for modality in modality_pcs.keys():
    for context in ["Flu","NI"]:
        cur_window_size = optimal_window_sizes[modality]
        cur_proj_dir = os.path.join(path_to_project_dir,context+"_"+modality+"_ws"+str(cur_window_size))

        with open(os.path.join(cur_proj_dir,"config.json"),"r") as f:

            config = json.load(f)
            
            config["predictDB_standard"]["genotype_file"] = converted_genotype_file
            config["predictDB_standard"]["snp_annotation_file"] = converted_annotation_file
            config["predictDB_standard"]["feature_annotation_file"] = os.path.join(path_to_annotations,f"{modality}_predictdb_annotation.txt")
            config["predictDB_standard"]["nfolds"] = nfolds_per_mode[modality]
            
            config["personalized_predictions"]["max_features"] = max_features
            config["personalized_predictions"]["path_to_epigenome_prediction_pipeline"] = "/beagle3/haky/users/saideep/github_repos/shared_pipelines/enformer_pipeline"
            config["personalized_predictions"]["path_to_epigenome_config"] = "/beagle3/haky/users/saideep/github_repos/shared_pipelines/enformer_pipeline/config_files/run_on_beagle3_personalized.json"
            config["personalized_predictions"]["path_to_vcf"] = new_vcf_dir
            config["personalized_predictions"]["epigenome_config_parameters"]["fasta_file"] = "/beagle3/haky/data/hg_sequences/hg19/raw/genome.fa"
            config["personalized_predictions"]["num_bins"] = optimal_window_sizes[modality]
            config["personalized_predictions"]["date"] = "04-16-2024"

            config["personalized_predictions"]["liftover"] = True
            config["personalized_predictions"]["liftover_target"] = "hg19"


        with open(os.path.join(cur_proj_dir,"config.json"),"w") as f:
            json.dump(config,f, indent=4)

        with open(path_to_pdb_standard_template,"r") as f:
            run = f.read()
            run = run.replace("CONFIG_FILE",os.path.join(cur_proj_dir,"config.json"))
            run = run.replace("JOBNAME",context+"_"+modality)
            run = run.replace("ERROR_LOG", os.path.join(cur_proj_dir,"error_pdb_standard.log"))
            run = run.replace("OUTPUT_LOG", os.path.join(cur_proj_dir,"output_pdb_standard.log"))
            with open(os.path.join(cur_proj_dir,"run_predictdb_standard.sbatch"),"w") as f:
                f.write(run)

        with open(path_to_pp_template,"r") as f:
            run = f.read()
            run = run.replace("CONFIG_FILE",os.path.join(cur_proj_dir,"config.json"))
            run = run.replace("JOBNAME",context+"_"+modality)
            run = run.replace("ERROR_LOG", os.path.join(cur_proj_dir,"error_predictions.log"))
            run = run.replace("OUTPUT_LOG", os.path.join(cur_proj_dir,"output_predictions.log"))
            with open(os.path.join(cur_proj_dir,"run_personalized_predictions.sbatch"),"w") as f:
                f.write(run)

        subprocess.run(["sbatch", os.path.join(cur_proj_dir,"run_personalized_predictions.sbatch"), "True"], cwd=cur_proj_dir)
        # if modality != "ATAC":
        #     if context == "Flu":
        #         subprocess.run(["sbatch", os.path.join(cur_proj_dir,"intermediates","predictDB_standard",
        #                                        "run_predictDB_standard.sbatch")], 
        #                                        cwd=os.path.join(cur_proj_dir,"intermediates","predictDB_standard"))

        
Submitted batch job 19871914
Submitted batch job 19871915
Submitted batch job 19871916
Submitted batch job 19871917
Submitted batch job 19871918
Submitted batch job 19871919
Submitted batch job 19871920
Submitted batch job 19871921
Submitted batch job 19871922
Submitted batch job 19871923
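
Once these jobs finish, the raw personalized prediction performance mentioned above can be examined by correlating predicted and observed values across individuals for each peak. A minimal sketch, assuming hypothetical file paths and that both tables are peaks x individuals with matching labels:

Code
from scipy.stats import spearmanr

# Hypothetical paths: the real matrices live under each model's project directory
predicted = pd.read_csv("predicted_peaks_by_individual.tsv", sep="\t", index_col=0)
observed = pd.read_csv("observed_peaks_by_individual.tsv", sep="\t", index_col=0)

shared_peaks = predicted.index.intersection(observed.index)
shared_inds = predicted.columns.intersection(observed.columns)

# Per-peak Spearman correlation across individuals
per_peak_rho = [
    spearmanr(predicted.loc[p, shared_inds], observed.loc[p, shared_inds]).correlation
    for p in shared_peaks
]
print(pd.Series(per_peak_rho, index=shared_peaks).describe())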

Linearize Peak EnPACT models

Unlike gene expression, peak data are relatively heterogeneous. Different studies can have different sets of peaks, and the total number of peaks often greatly exceeds the number of genes. It therefore makes sense to select a subset of peaks rather than training PredictDB models for every available peak. Since we are doing follow-up TWAS analysis, we can use GWAS loci to guide that selection.

We can start by examining a few GWAS summary statistics and looking for strong signals. These loci make natural targets for linearization: they are likely to carry causal signal and are the regions most likely to show up in downstream TWAS analysis.
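
As a sketch of the selection idea (the actual selection is handled by the pipeline described below), peaks overlapping a BED file of fine-mapped GWAS loci could be pulled out with pybedtools; the file paths here are hypothetical:

Code
import pybedtools

# Keep only peaks that overlap at least one GWAS locus (u=True reports each peak once)
peaks = pybedtools.BedTool("/path/to/H3K27ac_annotation.bed")   # hypothetical path
gwas_loci = pybedtools.BedTool("/path/to/gwas_loci.bed")        # hypothetical path
selected = peaks.intersect(gwas_loci, u=True)
selected.saveas("/path/to/H3K27ac_selected_peaks.bed")
print(len(selected), "peaks overlap GWAS loci")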

Which GWAS to use?:

  • Broad Allergy
  • COVID GWAS
  • Other infectious disease GWAS

GPU-hour costs for Enformer inference: 5hr per 20,000 sites
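
At that rate, the Enformer cost of a candidate locus set is easy to budget; a back-of-envelope calculation (the site count is a made-up example):

Code
# ~5 GPU-hours per 20,000 sites, per the estimate above
n_sites = 8_000  # hypothetical number of selected loci
print(f"~{n_sites / 20_000 * 5:.1f} GPU-hours for {n_sites} sites")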

Set up loci selection

I am using Temi’s repo: https://github.com/hakyimlab/TFXcan-snakemake/tree/main, which performs fine-mapping from GWAS summary stats, selects loci of interest, and then runs Enformer predictions for these regions. Once these personalized predictions are made, they can be used in all downstream EnPACT analyses.

Code
path_to_tfxcan_repo = "/beagle3/haky/users/saideep/github_repos/TFXcan-snakemake"

summ_stats = {
    "AE":"/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/Asthma_eczema_farreira_FORMATTED_hg19.txt",
    "COVID":"",
    "T2D":""
}
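
The next cell, which should wire each EnPACT run into the TFXcan pipeline config, was left unfinished. A rough sketch of what that wiring might look like is below; PyYAML is assumed to be available, and the key names are hypothetical rather than the real keys in pipeline.yaml:

Code
import yaml

cur_proj_dir = "/path/to/project_dir"  # hypothetical; in practice set inside the loop below

# Load the template pipeline config, point it at this run's inputs, and write a per-run copy
with open(os.path.join(path_to_tfxcan_repo, "config", "pipeline.yaml")) as pyml:
    pipeline_cfg = yaml.safe_load(pyml)

pipeline_cfg["sumstats"] = summ_stats["AE"]   # hypothetical key
pipeline_cfg["output_dir"] = cur_proj_dir     # hypothetical key

with open(os.path.join(cur_proj_dir, "pipeline.yaml"), "w") as out:
    yaml.safe_dump(pipeline_cfg, out)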
Code

for modality in modality_pcs.keys():
    for context in ["Flu","NI"]:
        cur_window_size = optimal_window_sizes[modality]
        cur_proj_dir = os.path.join(path_to_project_dir,context+"_"+modality+"_ws"+str(cur_window_size))

        with open(os.path.join(cur_proj_dir,"config.json"),"r") as f:

            config = json.load(f)

        with open(os.path.join(path_to_tfxcan_repo,"config","pipeline.yaml")) as pyml:
            # TODO: wire this run's loci selection / EnPACT settings into the TFXcan pipeline config
            tfxcan_pipeline_yaml = pyml.read()

Peak overlap between modalities

Code
import pybedtools

path_to_annotations = "/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices"

for modality in modality_pcs.keys():
    cur_peak_counts_file = os.path.join(path_to_annotations, f"{modality}_annotation.txt")
    cur_peak_counts = pd.read_csv(cur_peak_counts_file, sep=",")

    with open(os.path.join(path_to_annotations, f"{modality}_annotation.bed"),"w") as f:
        for row in cur_peak_counts.iterrows():
            # Write one BED interval per peak (chromosome_name is stored without the "chr" prefix)
            chrom = "chr" + str(row[1]["chromosome_name"])
            f.write(f"{chrom}\t{row[1]['transcript_start']}\t{row[1]['transcript_end']}\n")
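
Once the BED files are written, a rough summary of pairwise overlap between modalities could look like the sketch below (fractions are relative to the row modality's peak count):

Code
# Pairwise overlap between modality peak sets using the BED files written above
modalities = list(modality_pcs.keys())
beds = {m: pybedtools.BedTool(os.path.join(path_to_annotations, f"{m}_annotation.bed")).sort()
        for m in modalities}

overlap_frac = pd.DataFrame(index=modalities, columns=modalities, dtype=float)
for m1 in modalities:
    for m2 in modalities:
        # u=True reports each m1 peak at most once if it overlaps any m2 peak
        n_overlap = len(beds[m1].intersect(beds[m2], u=True))
        overlap_frac.loc[m1, m2] = n_overlap / len(beds[m1])

print(overlap_frac.round(2))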