test_window_size

Author

Saideep Gona

Published

February 28, 2024

Code
import os
import sys
import json
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Context

I need to test variable input window sizes for Con-EnPACT modeling.

Here I set up a grid of window sizes and normalization strategies for Con-EnPACT training.

Runtime: ~30 minutes

Code
path_to_config_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/example_json/flu_4bin_1milscaling.json"
path_to_run_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/run_training.sbatch"
path_to_project_dir = "/beagle3/haky/users/saideep/projects/Con_EnPACT/models"

# Normalization-strategy suffixes; each one replaces the NORMSTRAT
# placeholder in the expression file paths below
normalization_strategies = {
    "_4_4_lnormTRUE_beforeTRUE_scale1"
    # "_3_7_lnormTRUE_beforeTRUE_scale1",
    # "_3_7_lnormTRUE_beforeFALSE_scale1",
    # "_3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto"
}

contexts = {"Flu":"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDBNORMSTRAT.txt",
            "NI":"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDBNORMSTRAT.txt"}
window_sizes = [4]
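
The expression file paths above carry a literal NORMSTRAT placeholder that gets swapped for a strategy suffix before being written into each config. As a quick illustrative sanity check of that substitution (using the single active strategy):

Code
# Illustrative check: NORMSTRAT in each expression path is replaced with a
# normalization-strategy suffix before the path is written to the config
example_strategy = "_4_4_lnormTRUE_beforeTRUE_scale1"
print(contexts["Flu"].replace("NORMSTRAT", example_strategy))
# -> .../fully_preprocessed_flu_expression_for_predictDB_4_4_lnormTRUE_beforeTRUE_scale1.txt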

Code
for normstrat in normalization_strategies:
    # if normstrat != "_3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto":
    #     continue
    for context in contexts.keys():
        for window_size in window_sizes:
            cur_proj_dir = os.path.join(path_to_project_dir, context + "_ws" + str(window_size) + "_ns" + normstrat)
            os.makedirs(cur_proj_dir, exist_ok=True)

            # Fill in the config template for this context/window size/strategy
            with open(path_to_config_template, "r") as f:
                config = json.load(f)

            config["general_parameters"]["project_directory"] = cur_proj_dir
            config["general_parameters"]["context"] = context
            config["generate_enpact_training_data"]["input_files"]["normalized_expression_data"] = contexts[context].replace("NORMSTRAT", normstrat)
            config["generate_enpact_training_data"]["reference_epigenome"]["num_bins"] = window_size

            with open(os.path.join(cur_proj_dir, "config.json"), "w") as f:
                json.dump(config, f, indent=4)

            # Substitute the placeholder tokens in the sbatch template
            with open(path_to_run_template, "r") as f:
                run = f.read()
            run = run.replace("CONFIG_FILE", os.path.join(cur_proj_dir, "config.json"))
            run = run.replace("JOBNAME", context + "_ws" + str(window_size))
            run = run.replace("ERROR_LOG", os.path.join(cur_proj_dir, "error_training.log"))
            run = run.replace("OUTPUT_LOG", os.path.join(cur_proj_dir, "output_training.log"))
            with open(os.path.join(cur_proj_dir, "run_training.sbatch"), "w") as f:
                f.write(run)

            # Submit the training job
            subprocess.run(["sbatch",
                            os.path.join(cur_proj_dir, "run_training.sbatch")],
                           cwd=cur_proj_dir)
Submitted batch job 18641516
Submitted batch job 18641517
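
For reference, the run templates are assumed to contain literal CONFIG_FILE, JOBNAME, ERROR_LOG, and OUTPUT_LOG tokens that the loop above substitutes. A minimal sketch of what such a template might look like (hypothetical; the actual run_training.sbatch may differ):

Code
# Hypothetical template contents, shown only to illustrate the placeholders
example_template = """#!/bin/bash
#SBATCH --job-name=JOBNAME
#SBATCH --output=OUTPUT_LOG
#SBATCH --error=ERROR_LOG

# (actual training command elided; it consumes CONFIG_FILE)
"""
print(example_template.replace("JOBNAME", "Flu_ws4"))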

Analyze prediction correlations for different window sizes and normalization strategies

Code
corr_tables = []

# Gather the per-model evaluation tables written by the training pipeline
for normstrat in normalization_strategies:
    for context in contexts.keys():
        for window_size in window_sizes:
            cur_proj_dir = os.path.join(path_to_project_dir,context+"_ws"+str(window_size)+"_ns"+normstrat)
            cur_corr_table = pd.read_csv(os.path.join(cur_proj_dir,"intermediates","evaluate_training","corrs_df_for_subsets.tsv"), sep="\t")
            cur_corr_table["normstrat"] = normstrat
            cur_corr_table["context"] = context
            cur_corr_table["window_size"] = window_size

            corr_tables.append(cur_corr_table)

corr_table_all = pd.concat(corr_tables)
corr_table_all.head()
data_subset cor_id cor_type corrs normstrat context window_size
0 train EnPACT_pred_vs_ground_truth spearman 0.848034 _3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto Flu 2
1 train log2_epi_ref_track_vs_ground_truth spearman 0.771574 _3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto Flu 2
2 train EnPACT_pred_vs_ground_truth pearson 0.840772 _3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto Flu 2
3 train log2_epi_ref_track_vs_ground_truth pearson 0.586671 _3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto Flu 2
4 valid EnPACT_pred_vs_ground_truth spearman 0.831945 _3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto Flu 2
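
Before plotting, an aggregate view can be useful. A minimal sketch (assuming the corr_table_all built above) of mean EnPACT correlation by data subset, context, and window size, with one column per correlation type:

Code
# Illustrative summary: mean correlation per subset/context/window size
summary = (corr_table_all[corr_table_all["cor_id"] == "EnPACT_pred_vs_ground_truth"]
           .groupby(["data_subset", "context", "window_size", "cor_type"])["corrs"]
           .mean()
           .unstack("cor_type"))
print(summary)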
Code
import seaborn as sns
sns.set_theme(style="whitegrid")

# Focus on EnPACT prediction vs. ground truth; .copy() avoids a pandas
# SettingWithCopyWarning on the column assignment below
corr_table_enpact = corr_table_all[corr_table_all["cor_id"]=="EnPACT_pred_vs_ground_truth"].copy()

# Shorten the strategy labels for the facet row titles
corr_table_enpact["normstrat_short"] = corr_table_enpact["normstrat"].str.replace("_3_7_","").str.replace("_scale1","")

# Shared y-axis limits so the train/valid/test grids are directly comparable
min_corr = corr_table_enpact["corrs"].min() - 0.1
max_corr = corr_table_enpact["corrs"].max() + 0.1

for data_subset in ["train","valid","test"]:
    corr_table_enpact_subset = corr_table_enpact[corr_table_enpact["data_subset"]==data_subset]
    fg = sns.FacetGrid(data=corr_table_enpact_subset, col="context", row="normstrat_short", margin_titles=True, height=4, aspect=1)
    fg.map_dataframe(sns.scatterplot, x="window_size", y="corrs", hue="cor_type", palette="deep").add_legend().set(ylim=(min_corr,max_corr))

    fg.figure.suptitle(data_subset, y=1.05)

Judging by mean expression prediction on the validation set, expression that is gene-length normalized before TMM normalization is easier to predict than expression that is length normalized after TMM normalization. Window size, by contrast, does not appear to have a large effect on prediction accuracy.
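
To put a number on that impression, one could compare mean validation correlations between the beforeTRUE and beforeFALSE strategies. A minimal sketch (only meaningful when both strategy variants have been loaded into corr_table_enpact above):

Code
# Illustrative comparison of length-normalization order on the validation set
valid = corr_table_enpact[corr_table_enpact["data_subset"] == "valid"].copy()
valid["lnorm_before_tmm"] = valid["normstrat"].str.contains("beforeTRUE")
print(valid.groupby(["lnorm_before_tmm", "cor_type"])["corrs"].mean())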

Run linearization

Runtime: ~8-10 hours per EnPACT model

Code
path_to_config_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/example_json/flu_4bin_1milscaling.json"
path_to_run_linearization_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/run_linearization.sbatch"
path_to_project_dir = "/beagle3/haky/users/saideep/projects/Con_EnPACT/models"

normalization_strategies = {
    "_4_4_lnormTRUE_beforeTRUE_scale1"
    # "_3_7_lnormTRUE_beforeTRUE_scale1"
    # "_3_7_lnormFALSE_beforeFALSE_scale1_sourcekallisto"
}

contexts = {"Flu":"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_flu_expression_for_predictDBNORMSTRAT.txt",
            "NI":"/beagle3/haky/users/saideep/projects/aracena_modeling/Inputs/counts_matrices/final_testing/fully_preprocessed_NI_expression_for_predictDBNORMSTRAT.txt"}
window_sizes = [4]
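
Since linearization depends on a finished training run, it can be worth verifying each model directory before submitting. A minimal check, reusing the evaluation table path read earlier:

Code
# Optional sanity check: flag models whose training output is missing
for normstrat in normalization_strategies:
    for context in contexts.keys():
        for window_size in window_sizes:
            cur_proj_dir = os.path.join(path_to_project_dir, context + "_ws" + str(window_size) + "_ns" + normstrat)
            eval_table = os.path.join(cur_proj_dir, "intermediates", "evaluate_training", "corrs_df_for_subsets.tsv")
            if not os.path.exists(eval_table):
                print("Training incomplete for", cur_proj_dir)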
Code

for normstrat in normalization_strategies:
    for context in contexts.keys():
        for window_size in window_sizes:
            cur_proj_dir = os.path.join(path_to_project_dir, context + "_ws" + str(window_size) + "_ns" + normstrat)
            os.makedirs(cur_proj_dir, exist_ok=True)

            # Regenerate the config for this model (same settings as training)
            with open(path_to_config_template, "r") as f:
                config = json.load(f)

            config["general_parameters"]["project_directory"] = cur_proj_dir
            config["general_parameters"]["context"] = context
            config["generate_enpact_training_data"]["input_files"]["normalized_expression_data"] = contexts[context].replace("NORMSTRAT", normstrat)
            config["generate_enpact_training_data"]["reference_epigenome"]["num_bins"] = window_size

            with open(os.path.join(cur_proj_dir, "config.json"), "w") as f:
                json.dump(config, f, indent=4)

            # Substitute the placeholder tokens in the linearization sbatch template
            with open(path_to_run_linearization_template, "r") as f:
                run = f.read()
            run = run.replace("CONFIG_FILE", os.path.join(cur_proj_dir, "config.json"))
            run = run.replace("JOBNAME", context + "_ws" + str(window_size))
            run = run.replace("ERROR_LOG", os.path.join(cur_proj_dir, "error_linearization.log"))
            run = run.replace("OUTPUT_LOG", os.path.join(cur_proj_dir, "output_linearization.log"))
            with open(os.path.join(cur_proj_dir, "run_linearization.sbatch"), "w") as f:
                f.write(run)

            # Submit the linearization job
            subprocess.run(["sbatch",
                            os.path.join(cur_proj_dir, "run_linearization.sbatch")],
                           cwd=cur_proj_dir)
Submitted batch job 18644266
Submitted batch job 18644267

Run TWAS

Runtime: ~1 hour per TWAS run (S-PrediXcan)

Code
path_to_run_twas_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/run_twas.sbatch"
path_to_project_dir = "/beagle3/haky/users/saideep/projects/Con_EnPACT/models"

for normstrat in normalization_strategies:
    for context in contexts.keys():
        for window_size in window_sizes:
            cur_proj_dir = os.path.join(path_to_project_dir, context + "_ws" + str(window_size) + "_ns" + normstrat)
            os.makedirs(cur_proj_dir, exist_ok=True)

            # Reuse the config.json written above; only the TWAS sbatch template is new
            with open(path_to_run_twas_template, "r") as f:
                run = f.read()
            run = run.replace("CONFIG_FILE", os.path.join(cur_proj_dir, "config.json"))
            run = run.replace("JOBNAME", context + "_ws" + str(window_size))
            run = run.replace("ERROR_LOG", os.path.join(cur_proj_dir, "error_twas.log"))
            run = run.replace("OUTPUT_LOG", os.path.join(cur_proj_dir, "output_twas.log"))
            with open(os.path.join(cur_proj_dir, "run_twas.sbatch"), "w") as f:
                f.write(run)

            # Submit the TWAS job
            subprocess.run(["sbatch",
                            os.path.join(cur_proj_dir, "run_twas.sbatch")],
                           cwd=cur_proj_dir)
Submitted batch job 18680959
Submitted batch job 18680960
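
Each sbatch call prints a job ID as above. To monitor all of the submitted jobs at once, a standard squeue call works (assuming the jobs run under the current user and squeue is on the path):

Code
# Check the status of all submitted jobs for the current user
subprocess.run(["squeue", "-u", os.environ["USER"]])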