enformer_peak_linearization

This notebook creates inputs for Enformer after creating sets of fine-tuned variants from a GWAS
Author

Saideep Gona

Published

June 3, 2024

Context

Having fine-mapped some GWAS variants, we can make predictions at those loci with Enformer utilizing different personalized context. We can then make Con-EnPACT predictions at those loci and linearize against those.

Notes:

GEUVADIS individuals

Code
import os,sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

proj_dir = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds"

optimal_window_sizes = {
    "H3K27ac":8,
    "H3K27me3":64,
    "H3K4me1":32,
    "H3K4me3":8,
    "ATAC":8
}
Code
path_to_fine_mapped_variants = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/all_SNPs_susie_filt.txt"

fine_mapped_variants = pd.read_csv(path_to_fine_mapped_variants, sep="\t")

fine_mapped_variants
variable variable_prob cs SNP PIP ldBlock
0 389 0.938377 3 1_151829204_G_A 0.938377 chr1_151566405_153208353
1 989 0.907326 1 1_152206676_C_T 0.907326 chr1_151566405_153208353
2 1283 0.523677 2 1_152347096_T_C 0.523677 chr1_151566405_153208353
3 4956 0.981685 2 1_172746562_A_G 0.981685 chr1_170588635_173128768
4 4934 0.777214 1 1_172731728_G_A 0.777214 chr1_170588635_173128768
... ... ... ... ... ... ...
60 2559 0.646169 1 19_33230549_C_T 0.646169 chr19_32255614_33772047
61 7373 0.813736 1 20_53568405_C_T 0.813736 chr20_50623121_53856010
62 7507 0.798462 4 20_53642336_C_T 0.798462 chr20_50623121_53856010
63 2329 0.636770 -1 20_51647549_T_C 0.636770 chr20_50623121_53856010
64 2015 0.606744 2 20_51524945_C_T 0.606744 chr20_50623121_53856010

65 rows × 6 columns

Make interval lists

Code

regions = []

chroms_for_annotation = []
starts = []
ends = []

with open(os.path.join(proj_dir, "interval_list.txt"), "w") as f:
    for variant in fine_mapped_variants["SNP"]:

        spl_var = variant.split("_")
        chroms_for_annotation.append(spl_var[0])

        print(variant)
        chrom = "chr"+spl_var[0]
        pos = int(spl_var[1])
        pos_offset = pos + 2

        starts.append(str(pos))
        ends.append(str(pos_offset))

        f.write(f"{chrom}_{pos}_{pos_offset}\n")
        regions.append(f"{chrom}_{pos}_{pos_offset}")


1_151829204_G_A
1_152206676_C_T
1_152347096_T_C
1_172746562_A_G
1_172731728_G_A
1_173177782_C_A
1_226721532_A_G
2_8302118_G_A
2_101962555_G_T
3_72345701_T_C
3_188069858_A_G
4_38797027_C_A
4_38906484_A_G
5_111066174_T_C
5_111134439_C_A
5_132637224_A_G
6_31427086_T_G
6_31383887_G_A
6_31361729_T_C
6_30845798_G_T
6_31354911_T_C
6_31432959_C_T
6_32644302_T_C
6_32616043_G_T
6_32658626_A_G
6_32709636_A_G
6_32633491_T_C
6_32401711_G_A
6_32623421_C_T
6_32669436_A_G
6_32742619_G_A
6_32719304_A_G
6_33007604_A_G
6_32755888_C_T
6_33679281_T_C
7_20521373_T_C
8_127801845_A_G
9_6213468_C_T
9_6532544_G_A
9_5064193_T_C
9_6588227_C_T
9_128850912_G_A
10_9007290_C_T
10_8993291_T_C
10_8894199_C_T
10_8808772_C_T
11_36306032_T_G
11_65784486_A_G
11_76628620_C_T
11_76588387_G_A
11_76582714_G_A
11_118872577_G_A
12_57095926_T_C
15_70271918_T_C
16_11135271_G_T
16_27360218_C_T
17_1492388_C_T
17_40549851_C_T
17_39635234_A_C
17_40740968_C_T
19_33230549_C_T
20_53568405_C_T
20_53642336_C_T
20_51647549_T_C
20_51524945_C_T
Code
path_to_gene_annotation = os.path.join(proj_dir, "peak_annotation.txt")

gene_annotation_dict = {
    "chr":chroms_for_annotation,
    "gene_id":regions,
    "gene_name":regions,
    "start": starts,
    "end": ends
}

gene_annotation_table = pd.DataFrame(gene_annotation_dict)
gene_annotation_table["gene_type"] = "protein_coding"

gene_annotation_table.to_csv(path_to_gene_annotation, sep="\t", index=False)
NameError: name 'chroms_for_annotation' is not defined

We can now make the linearization enformer config

Code
import json

path_to_config_template = "/beagle3/haky/users/saideep/projects/aracena_modeling/enformer_predictions/aracena_preds/aracena_pred.json"

with open(path_to_config_template, "r") as f:
    config = json.load(f)

config["interval_list_file"] = os.path.join(proj_dir, "interval_list.txt")
config["bins_to_save"] = -1
config["project_dir"] = proj_dir

with open(os.path.join(proj_dir, "config.json"), "w") as f:
    json.dump(config, f, indent=4)

Collect predictions together

Code
path_to_predictions = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds/predictions_folder/enformer_aracena_preds_genes/predictions_06-03-2024/enformer_predictions"
collected_preds_dir = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds/collected_preds"

path_to_inds_file = "/beagle3/haky/users/saideep/projects/enformer_all_geuvadis/inputs/all_inds.txt"
inds_pred = []
with open(path_to_inds_file, "r") as f:
    for line in f:
        inds_pred.append(line.strip())
Code
import h5py

for ind in inds_pred:

    print(ind)

    # Check if already complete

    all_exist = True
    for window_size in [8,32,64]:
        if not os.path.exists(os.path.join(collected_preds_dir, ind+"_"+str(window_size)+".txt")):
            all_exist = False
    if all_exist:
        print("all exist for ", ind)
        continue

    if os.path.exists(os.path.join(collected_preds_dir, ind+"_haplo1.txt")):
        if os.path.exists(os.path.join(collected_preds_dir, ind+"_haplo2.txt")):
            continue

    predictions_dicts = {}
    for window_size in [8,32,64]:
        predictions_dicts[window_size] = {}

    for region in regions:

        pred_file_h1 = os.path.join(path_to_predictions,ind,"haplotype1",region+"_predictions.h5")
        pred_file_h2 = os.path.join(path_to_predictions,ind,"haplotype2",region+"_predictions.h5")

        if not (os.path.exists(pred_file_h1) and os.path.exists(pred_file_h2)):
            print("missing",pred_file_h1,pred_file_h2)
            continue

        with h5py.File(pred_file_h1,"r") as f:
            pred_h1 = f[region][:]

        with h5py.File(pred_file_h2,"r") as f:
            pred_h2 = f[region][:]

        mean_pred = (pred_h1 + pred_h2)/2

        for window_size in [8,32,64]:

            center_bin = mean_pred.shape[0]//2
            left_bin = center_bin - window_size//2
            right_bin = center_bin + window_size//2

            # print(mean_pred.shape)
            center_mean = mean_pred[left_bin:right_bin,]

            predictions_dicts[window_size][ind+"."+region] = center_mean.mean(axis=0)

    for window_size in [8,32,64]:
        print(len(predictions_dicts[window_size].keys()))
        predictions_dict = pd.DataFrame(predictions_dicts[window_size]).T
        print(predictions_dict.shape)
        predictions_dict.to_csv(os.path.join(collected_preds_dir, ind+"_"+str(window_size)+".txt"),sep="\t")


    # predictions_dict = pd.DataFrame(predictions_dict_haplo1).T
    # print(predictions_dict.shape)
    # predictions_dict.to_csv(os.path.join(collected_preds_dir, ind+"_haplo1.txt"),sep="\t")
    # predictions_dict = pd.DataFrame(predictions_dict_haplo2).T
    # predictions_dict.to_csv(os.path.join(collected_preds_dir, ind+"_haplo2.txt"),sep="\t")

HG00096
all exist for  HG00096
HG00097
all exist for  HG00097
HG00099
all exist for  HG00099
HG00100
all exist for  HG00100
HG00101
all exist for  HG00101
HG00102
all exist for  HG00102
HG00103
all exist for  HG00103
HG00104
all exist for  HG00104
HG00105
all exist for  HG00105
HG00106
all exist for  HG00106
HG00108
all exist for  HG00108
HG00109
all exist for  HG00109
HG00110
all exist for  HG00110
HG00111
all exist for  HG00111
HG00112
all exist for  HG00112
HG00114
all exist for  HG00114
HG00115
all exist for  HG00115
HG00116
all exist for  HG00116
HG00117
all exist for  HG00117
HG00118
all exist for  HG00118
HG00119
all exist for  HG00119
HG00120
all exist for  HG00120
HG00121
all exist for  HG00121
HG00122
all exist for  HG00122
HG00123
all exist for  HG00123
HG00124
all exist for  HG00124
HG00125
all exist for  HG00125
HG00126
all exist for  HG00126
HG00127
all exist for  HG00127
HG00128
all exist for  HG00128
HG00129
all exist for  HG00129
HG00130
all exist for  HG00130
HG00131
all exist for  HG00131
HG00132
all exist for  HG00132
HG00133
all exist for  HG00133
HG00134
all exist for  HG00134
HG00135
all exist for  HG00135
HG00136
all exist for  HG00136
HG00137
all exist for  HG00137
HG00138
all exist for  HG00138
HG00139
all exist for  HG00139
HG00141
all exist for  HG00141
HG00142
all exist for  HG00142
HG00143
all exist for  HG00143
HG00145
all exist for  HG00145
HG00146
all exist for  HG00146
HG00148
all exist for  HG00148
HG00149
all exist for  HG00149
HG00150
all exist for  HG00150
HG00151
all exist for  HG00151
HG00152
all exist for  HG00152
HG00154
all exist for  HG00154
HG00155
all exist for  HG00155
HG00156
all exist for  HG00156
HG00157
all exist for  HG00157
HG00158
all exist for  HG00158
HG00159
all exist for  HG00159
HG00160
all exist for  HG00160
HG00171
all exist for  HG00171
HG00173
all exist for  HG00173
HG00174
all exist for  HG00174
HG00176
all exist for  HG00176
HG00177
all exist for  HG00177
HG00178
all exist for  HG00178
HG00179
all exist for  HG00179
HG00180
all exist for  HG00180
HG00181
all exist for  HG00181
HG00182
all exist for  HG00182
HG00183
all exist for  HG00183
HG00185
all exist for  HG00185
HG00186
all exist for  HG00186
HG00187
all exist for  HG00187
HG00188
all exist for  HG00188
HG00189
all exist for  HG00189
HG00231
all exist for  HG00231
HG00232
all exist for  HG00232
HG00233
all exist for  HG00233
HG00234
all exist for  HG00234
HG00235
all exist for  HG00235
HG00236
all exist for  HG00236
HG00238
all exist for  HG00238
HG00239
all exist for  HG00239
HG00240
all exist for  HG00240
HG00242
all exist for  HG00242
HG00243
all exist for  HG00243
HG00244
all exist for  HG00244
HG00245
all exist for  HG00245
HG00246
all exist for  HG00246
HG00247
all exist for  HG00247
HG00249
all exist for  HG00249
HG00250
all exist for  HG00250
HG00251
all exist for  HG00251
HG00252
all exist for  HG00252
HG00253
all exist for  HG00253
HG00255
all exist for  HG00255
HG00256
all exist for  HG00256
HG00257
all exist for  HG00257
HG00258
all exist for  HG00258
HG00259
all exist for  HG00259
HG00260
all exist for  HG00260
HG00261
all exist for  HG00261
HG00262
all exist for  HG00262
HG00263
all exist for  HG00263
HG00264
all exist for  HG00264
HG00265
all exist for  HG00265
HG00266
all exist for  HG00266
HG00267
all exist for  HG00267
HG00268
all exist for  HG00268
HG00269
all exist for  HG00269
HG00271
all exist for  HG00271
HG00272
all exist for  HG00272
HG00273
all exist for  HG00273
HG00274
all exist for  HG00274
HG00275
all exist for  HG00275
HG00276
all exist for  HG00276
HG00277
all exist for  HG00277
HG00278
all exist for  HG00278
HG00280
all exist for  HG00280
HG00281
all exist for  HG00281
HG00282
all exist for  HG00282
HG00284
all exist for  HG00284
HG00285
all exist for  HG00285
HG00306
all exist for  HG00306
HG00308
all exist for  HG00308
HG00309
all exist for  HG00309
HG00310
all exist for  HG00310
HG00311
all exist for  HG00311
HG00312
all exist for  HG00312
HG00313
all exist for  HG00313
HG00315
all exist for  HG00315
HG00319
all exist for  HG00319
HG00320
all exist for  HG00320
HG00321
all exist for  HG00321
HG00323
all exist for  HG00323
HG00324
all exist for  HG00324
HG00325
all exist for  HG00325
HG00326
all exist for  HG00326
HG00327
all exist for  HG00327
HG00328
all exist for  HG00328
HG00329
all exist for  HG00329
HG00330
all exist for  HG00330
HG00331
all exist for  HG00331
HG00332
all exist for  HG00332
HG00334
all exist for  HG00334
HG00335
all exist for  HG00335
HG00336
all exist for  HG00336
HG00337
all exist for  HG00337
HG00338
all exist for  HG00338
HG00339
all exist for  HG00339
HG00341
all exist for  HG00341
HG00342
all exist for  HG00342
HG00343
all exist for  HG00343
HG00344
all exist for  HG00344
HG00345
all exist for  HG00345
HG00346
all exist for  HG00346
HG00349
all exist for  HG00349
HG00350
all exist for  HG00350
HG00351
all exist for  HG00351
HG00353
all exist for  HG00353
HG00355
all exist for  HG00355
HG00356
all exist for  HG00356
HG00358
all exist for  HG00358
HG00359
all exist for  HG00359
HG00360
all exist for  HG00360
HG00361
all exist for  HG00361
HG00362
all exist for  HG00362
HG00364
all exist for  HG00364
HG00365
all exist for  HG00365
HG00366
all exist for  HG00366
HG00367
all exist for  HG00367
HG00369
all exist for  HG00369
HG00371
all exist for  HG00371
HG00372
all exist for  HG00372
HG00373
all exist for  HG00373
HG00375
all exist for  HG00375
HG00376
all exist for  HG00376
HG00377
all exist for  HG00377
HG00378
all exist for  HG00378
HG00379
all exist for  HG00379
HG00380
all exist for  HG00380
HG00381
all exist for  HG00381
HG00382
all exist for  HG00382
HG00383
all exist for  HG00383
HG00384
all exist for  HG00384
HG01334
all exist for  HG01334
HG01789
all exist for  HG01789
HG01790
all exist for  HG01790
HG01791
all exist for  HG01791
HG02215
all exist for  HG02215
NA06984
all exist for  NA06984
NA06985
all exist for  NA06985
NA06986
all exist for  NA06986
NA06989
all exist for  NA06989
NA06994
all exist for  NA06994
NA07037
all exist for  NA07037
NA07048
all exist for  NA07048
NA07051
all exist for  NA07051
NA07056
all exist for  NA07056
NA07346
all exist for  NA07346
NA07347
all exist for  NA07347
NA07357
all exist for  NA07357
NA10847
all exist for  NA10847
NA10851
all exist for  NA10851
NA11829
all exist for  NA11829
NA11830
all exist for  NA11830
NA11831
all exist for  NA11831
NA11832
all exist for  NA11832
NA11840
all exist for  NA11840
NA11843
all exist for  NA11843
NA11881
all exist for  NA11881
NA11892
all exist for  NA11892
NA11893
all exist for  NA11893
NA11894
all exist for  NA11894
NA11918
all exist for  NA11918
NA11920
all exist for  NA11920
NA11930
all exist for  NA11930
NA11931
all exist for  NA11931
NA11992
all exist for  NA11992
NA11993
all exist for  NA11993
NA11994
all exist for  NA11994
NA11995
all exist for  NA11995
NA12004
all exist for  NA12004
NA12005
all exist for  NA12005
NA12006
all exist for  NA12006
NA12043
all exist for  NA12043
NA12044
all exist for  NA12044
NA12045
all exist for  NA12045
NA12058
all exist for  NA12058
NA12144
all exist for  NA12144
NA12154
all exist for  NA12154
NA12155
all exist for  NA12155
NA12156
all exist for  NA12156
NA12234
all exist for  NA12234
NA12249
all exist for  NA12249
NA12272
all exist for  NA12272
NA12273
all exist for  NA12273
NA12275
all exist for  NA12275
NA12282
all exist for  NA12282
NA12283
all exist for  NA12283
NA12286
all exist for  NA12286
NA12287
all exist for  NA12287
NA12340
all exist for  NA12340
NA12341
all exist for  NA12341
NA12342
all exist for  NA12342
NA12347
all exist for  NA12347
NA12348
all exist for  NA12348
NA12383
all exist for  NA12383
NA12399
all exist for  NA12399
NA12400
all exist for  NA12400
NA12413
all exist for  NA12413
NA12489
all exist for  NA12489
NA12546
all exist for  NA12546
NA12716
all exist for  NA12716
NA12717
all exist for  NA12717
NA12718
all exist for  NA12718
NA12749
all exist for  NA12749
NA12750
all exist for  NA12750
NA12751
all exist for  NA12751
NA12760
all exist for  NA12760
NA12761
all exist for  NA12761
NA12762
all exist for  NA12762
NA12763
all exist for  NA12763
NA12775
all exist for  NA12775
NA12776
all exist for  NA12776
NA12777
all exist for  NA12777
NA12778
all exist for  NA12778
NA12812
all exist for  NA12812
NA12813
all exist for  NA12813
NA12814
all exist for  NA12814
NA12815
all exist for  NA12815
NA12827
all exist for  NA12827
NA12829
all exist for  NA12829
NA12830
all exist for  NA12830
NA12842
all exist for  NA12842
NA12843
all exist for  NA12843
NA12872
all exist for  NA12872
NA12873
all exist for  NA12873
NA12874
all exist for  NA12874
NA12889
all exist for  NA12889
NA12890
all exist for  NA12890
NA18486
all exist for  NA18486
NA18487
all exist for  NA18487
NA18488
all exist for  NA18488
NA18489
all exist for  NA18489
NA18498
all exist for  NA18498
NA18499
all exist for  NA18499
NA18502
all exist for  NA18502
NA18505
all exist for  NA18505
NA18508
all exist for  NA18508
NA18510
all exist for  NA18510
NA18511
all exist for  NA18511
NA18517
all exist for  NA18517
NA18519
all exist for  NA18519
NA18520
all exist for  NA18520
NA18858
all exist for  NA18858
NA18861
all exist for  NA18861
NA18867
all exist for  NA18867
NA18868
all exist for  NA18868
NA18870
all exist for  NA18870
NA18873
all exist for  NA18873
NA18907
all exist for  NA18907
NA18908
all exist for  NA18908
NA18909
all exist for  NA18909
NA18910
all exist for  NA18910
NA18912
all exist for  NA18912
NA18916
all exist for  NA18916
NA18917
all exist for  NA18917
NA18923
all exist for  NA18923
NA18933
all exist for  NA18933
NA18934
all exist for  NA18934
NA19092
all exist for  NA19092
NA19093
all exist for  NA19093
NA19095
all exist for  NA19095
NA19096
all exist for  NA19096
NA19098
all exist for  NA19098
NA19099
all exist for  NA19099
NA19102
all exist for  NA19102
NA19107
all exist for  NA19107
NA19108
all exist for  NA19108
NA19113
all exist for  NA19113
NA19114
all exist for  NA19114
NA19116
all exist for  NA19116
NA19117
all exist for  NA19117
NA19118
all exist for  NA19118
NA19119
all exist for  NA19119
NA19121
all exist for  NA19121
NA19129
all exist for  NA19129
NA19130
all exist for  NA19130
NA19131
all exist for  NA19131
NA19137
all exist for  NA19137
NA19138
all exist for  NA19138
NA19141
all exist for  NA19141
NA19143
all exist for  NA19143
NA19144
all exist for  NA19144
NA19146
all exist for  NA19146
NA19147
all exist for  NA19147
NA19149
all exist for  NA19149
NA19150
all exist for  NA19150
NA19152
all exist for  NA19152
NA19153
all exist for  NA19153
NA19159
all exist for  NA19159
NA19160
all exist for  NA19160
NA19171
all exist for  NA19171
NA19172
all exist for  NA19172
NA19175
all exist for  NA19175
NA19184
all exist for  NA19184
NA19185
all exist for  NA19185
NA19189
all exist for  NA19189
NA19190
all exist for  NA19190
NA19197
all exist for  NA19197
NA19198
all exist for  NA19198
NA19200
all exist for  NA19200
NA19201
all exist for  NA19201
NA19204
all exist for  NA19204
NA19206
all exist for  NA19206
NA19207
all exist for  NA19207
NA19209
all exist for  NA19209
NA19210
all exist for  NA19210
NA19213
all exist for  NA19213
NA19214
all exist for  NA19214
NA19222
all exist for  NA19222
NA19223
all exist for  NA19223
NA19225
all exist for  NA19225
NA19235
all exist for  NA19235
NA19236
all exist for  NA19236
NA19247
all exist for  NA19247
NA19248
all exist for  NA19248
NA19256
all exist for  NA19256
NA19257
all exist for  NA19257
NA20502
all exist for  NA20502
NA20503
all exist for  NA20503
NA20504
all exist for  NA20504
NA20505
all exist for  NA20505
NA20506
all exist for  NA20506
NA20507
all exist for  NA20507
NA20508
all exist for  NA20508
NA20509
all exist for  NA20509
NA20510
all exist for  NA20510
NA20512
all exist for  NA20512
NA20513
all exist for  NA20513
NA20514
all exist for  NA20514
NA20515
all exist for  NA20515
NA20516
all exist for  NA20516
NA20517
all exist for  NA20517
NA20518
all exist for  NA20518
NA20519
all exist for  NA20519
NA20520
all exist for  NA20520
NA20521
all exist for  NA20521
NA20524
all exist for  NA20524
NA20525
all exist for  NA20525
NA20527
all exist for  NA20527
NA20528
all exist for  NA20528
NA20529
all exist for  NA20529
NA20530
all exist for  NA20530
NA20531
all exist for  NA20531
NA20532
all exist for  NA20532
NA20534
all exist for  NA20534
NA20535
all exist for  NA20535
NA20536
all exist for  NA20536
NA20537
all exist for  NA20537
NA20538
all exist for  NA20538
NA20539
all exist for  NA20539
NA20540
all exist for  NA20540
NA20541
all exist for  NA20541
NA20542
all exist for  NA20542
NA20543
all exist for  NA20543
NA20544
all exist for  NA20544
NA20581
all exist for  NA20581
NA20582
all exist for  NA20582
NA20585
all exist for  NA20585
NA20586
all exist for  NA20586
NA20588
all exist for  NA20588
NA20589
all exist for  NA20589
NA20752
all exist for  NA20752
NA20754
all exist for  NA20754
NA20756
all exist for  NA20756
NA20757
all exist for  NA20757
NA20758
all exist for  NA20758
NA20759
all exist for  NA20759
NA20760
all exist for  NA20760
NA20761
all exist for  NA20761
NA20765
all exist for  NA20765
NA20766
all exist for  NA20766
NA20768
all exist for  NA20768
NA20769
all exist for  NA20769
NA20770
all exist for  NA20770
NA20771
all exist for  NA20771
NA20772
all exist for  NA20772
NA20773
all exist for  NA20773
NA20774
all exist for  NA20774
NA20778
all exist for  NA20778
NA20783
all exist for  NA20783
NA20785
all exist for  NA20785
NA20786
all exist for  NA20786
NA20787
all exist for  NA20787
NA20790
all exist for  NA20790
NA20792
all exist for  NA20792
NA20795
all exist for  NA20795
NA20796
all exist for  NA20796
NA20797
all exist for  NA20797
NA20798
all exist for  NA20798
NA20799
all exist for  NA20799
NA20800
all exist for  NA20800
NA20801
all exist for  NA20801
NA20802
all exist for  NA20802
NA20803
all exist for  NA20803
NA20804
all exist for  NA20804
NA20805
all exist for  NA20805
NA20806
all exist for  NA20806
NA20807
all exist for  NA20807
NA20808
all exist for  NA20808
NA20809
all exist for  NA20809
NA20810
all exist for  NA20810
NA20811
all exist for  NA20811
NA20812
all exist for  NA20812
NA20813
all exist for  NA20813
NA20814
all exist for  NA20814
NA20815
all exist for  NA20815
NA20816
all exist for  NA20816
NA20819
all exist for  NA20819
NA20826
all exist for  NA20826
NA20828
all exist for  NA20828

we can now collect the predictions together and start linearizing

Code
enpact_preds_linear_dir = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds/enpact_linear_preds"
Code
enpact_preds_linear_dir = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds/enpact_linear_preds"


for modality in optimal_window_sizes.keys():
    for cond in ["Flu","NI"]:
        ind_table = {}
        for ind in inds_pred:
            if os.path.exists(os.path.join(enpact_preds_linear_dir,f"{ind}_{modality}_{cond}.txt")):
                cur_table = pd.read_csv(os.path.join(enpact_preds_linear_dir,f"{ind}_{modality}_{cond}.txt"),
                                        sep="\t", skiprows=2, header=None)
                ind_table[ind] = cur_table[1]
        print(modality, cond)
        ind_df = pd.DataFrame(ind_table)
        ind_df.index = [x.split(".")[1] for x in cur_table[0]]
        ind_df.to_csv(os.path.join(enpact_preds_linear_dir, f"{modality}_{cond}_combined_predictions.txt"), sep="\t", index=True, index_label="NAME")


        

                # ind_table[ind] = cur_table[]



            # else:
            #     print(os.path.join(enpact_preds_linear_dir,f"{ind}_{modality}_{cond}.txt"))
      

# paste_com = [
#     "paste",
#     "-d'\t'"
# ]

# for modality in optimal_window_sizes.keys():
#     for cond in ["Flu","NI"]:

#         inds_include = []
#         for ind in inds_pred:
#             if os.path.exists(os.path.join(enpact_preds_linear_dir,f"{ind}_{modality}_{cond}.txt")):
#                 paste_com.append(os.path.join(enpact_preds_linear_dir,f"{ind}_{modality}_{cond}.txt"))
#                 inds_include.append(ind)
#             # else:
#             #     print(os.path.join(enpact_preds_linear_dir,f"{ind}_{modality}_{cond}.txt"))

#         paste_com.append("|")
#         paste_com.append("tail")
#         paste_com.append("-n")
#         paste_com.append("+2")

#         paste_com.append(">")
#         paste_com.append(os.path.join(enpact_preds_linear_dir, f"{modality}_{cond}_combined_predictions.txt"))

#         print(" ".join(paste_com))

        # os.system(" ".join(paste_com))
H3K27ac Flu
H3K27ac NI
H3K27me3 Flu
H3K27me3 NI
H3K4me1 Flu
H3K4me1 NI
H3K4me3 Flu
H3K4me3 NI
ATAC Flu
ATAC NI
Code


# combined_expression_table = pd.read_csv(os.path.join(intermediates_dir, "enpact_personalized_predictions_combined.txt"), delimiter="\t", header=None)

# combined_expression_table.columns = inds_include
Code
import subprocess

predictDB_template = "/beagle3/haky/users/saideep/github_repos/Con-EnPACT/scripts/run_predictDB.sbatch"

predictDB_repo = "/beagle3/haky/users/saideep/github_repos/forked_PredictDB-nextflow/PredictDb-nextflow"

snp_annotation = "/beagle3/haky/users/charles/project/singleXcanDL/PredicDB/T2D_all_DL_PredictDb/meta_data/all_chrs.snp_annot_v1.txt"
genotype_file = "/beagle3/haky/users/charles/project/singleXcanDL/PredicDB/T2D_all_DL_PredictDb/meta_data/all_chrs.geno_v1.txt"
Code
pre_check_path = "/beagle3/haky/users/saideep/github_repos/forked_PredictDB-nextflow/PredictDb-nextflow/bin/pre_flight_checks.R"
modality="ATAC"
cond="Flu"

print(" ".join([
    "Rscript",
    pre_check_path,
    "--snp_annotation", snp_annotation,
    "--genotype_file", genotype_file,
    "--gene_annotation", path_to_gene_annotation,
    "--gene_expression", os.path.join(enpact_preds_linear_dir, f"{modality}_{cond}_combined_predictions.txt")
]))
Rscript /beagle3/haky/users/saideep/github_repos/forked_PredictDB-nextflow/PredictDb-nextflow/bin/pre_flight_checks.R --snp_annotation /beagle3/haky/users/charles/project/singleXcanDL/PredicDB/T2D_all_DL_PredictDb/meta_data/all_chrs.snp_annot_v1.txt --genotype_file /beagle3/haky/users/charles/project/singleXcanDL/PredicDB/T2D_all_DL_PredictDb/meta_data/all_chrs.geno_v1.txt --gene_annotation /beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds/peak_annotation.txt --gene_expression /beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds/enpact_linear_preds/ATAC_Flu_combined_predictions.txt
Code

for modality in optimal_window_sizes.keys():
    for cond in ["Flu","NI"]:

        predictDB_dir = os.path.join(proj_dir,"predictDB",f"{modality}_{cond}")
        os.makedirs(predictDB_dir, exist_ok=True)

        with open(os.path.join(predictDB_template), "r") as f:
            predictDB_script = f.read()

            predictDB_script = predictDB_script.replace("JOBNAME", "EnPACT_Linearization")

            predictDB_script = predictDB_script.replace("PREDICTDB_REPO", predictDB_repo)
            predictDB_script = predictDB_script.replace("EXPRESSION", os.path.join(enpact_preds_linear_dir, f"{modality}_{cond}_combined_predictions.txt"))
            predictDB_script = predictDB_script.replace("OUTPUT_DIR", predictDB_dir)
            predictDB_script = predictDB_script.replace("GENE_ANNOTATION", path_to_gene_annotation)
            predictDB_script = predictDB_script.replace("SNP_ANNOTATION", snp_annotation)
            predictDB_script = predictDB_script.replace("GENOTYPE", genotype_file)
            predictDB_script = predictDB_script.replace("NFOLDS", "10")


            with open(os.path.join(predictDB_dir,"run_predictDB.sbatch"), "w") as rp:
                rp.write(predictDB_script)

            subprocess.run(["sbatch", os.path.join(predictDB_dir,"run_predictDB.sbatch")], cwd = predictDB_dir)
Submitted batch job 20877105
Submitted batch job 20877106
Submitted batch job 20877107
Submitted batch job 20877108
Submitted batch job 20877109
Submitted batch job 20877110
Submitted batch job 20877111
Submitted batch job 20877112
Submitted batch job 20877113
Submitted batch job 20877114

XWAS

Finally, having run the linearization, we can now run XWAS to get the final predictions

Code
for modality in optimal_window_sizes.keys():
    for cond in ["Flu","NI"]:
        
        job_name = f"EnPACT_Linearization_{modality}_{cond}"
        model_db_path = os.path.join(proj_dir,"predictDB",f"{modality}_{cond}", "filtered_db", "predict_db_Model_training_filtered.db")
        cov_path = os.path.join(proj_dir,"predictDB",f"{modality}_{cond}", "filtered_db", "predict_db_Model_training_filtered.txt.gz")
        output_path = os.path.join(proj_dir,"spredixcan",f"spredixcan_results_{modality}_{cond}.txt")

        spredixcan_script = f'''#!/bin/bash
#SBATCH --job-name=SPrediXcan_{job_name}
#SBATCH --account=pi-haky
#SBATCH --partition=caslake
#SBATCH --nodes=1
#SBATCH --cpus-per-task=24
#SBATCH --time=06:00:00
#SBATCH --mem=52G
#SBATCH --error=slurm-%A_%a.err

conda activate /beagle3/haky/users/saideep/envs/enformer

/beagle3/haky/users/saideep/github_repos/MetaXcan/software/SPrediXcan.py \
--model_db_path {model_db_path} \
--covariance {cov_path} \
--gwas_file /beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/Asthma_eczema_farreira_FORMATTED.txt.chromb38 \
--model_db_snp_key varID \
--snp_column panel_variant_id \
--effect_allele_column effect_allele \
--non_effect_allele_column non_effect_allele \
--beta_column effect_size \
--pvalue_column pvalue \
--keep_non_rsid \
--output_file {output_path}
        '''

        with open(os.path.join(proj_dir,"spredixcan",f"spredixcan_{modality}_{cond}.sbatch"), "w") as f:
            f.write(spredixcan_script)

        subprocess.run(["sbatch", os.path.join(proj_dir,"spredixcan",f"spredixcan_{modality}_{cond}.sbatch")], cwd = os.path.join(proj_dir,"spredixcan"))
Submitted batch job 20878939
Submitted batch job 20878940
Submitted batch job 20878941
Submitted batch job 20878942
Submitted batch job 20878943
Submitted batch job 20878944
Submitted batch job 20878945
Submitted batch job 20878946
Submitted batch job 20878947
Submitted batch job 20878948

Now we have the XWAS results! We can collect them into a singular table now.

Code
results_tables = []

for modality in optimal_window_sizes.keys():
    for cond in ["Flu","NI"]:
        path_to_results = os.path.join(proj_dir,"spredixcan",f"spredixcan_results_{modality}_{cond}.txt")
        results = pd.read_csv(path_to_results, sep=",")
        results["modality"] = modality
        results["condition"] = cond

        header=results.iloc[0]

        print(results.head())

        results_tables.append(results)

results_table_full = pd.concat(results_tables)
print(results_table_full.shape)

results_table_full.to_csv(os.path.join(proj_dir,"spredixcan","spredixcan_results_full.txt"), sep="\t", index=False)
                       gene                 gene_name    zscore  effect_size  \
0    chr6_32633491_32633493    chr6_32633491_32633493  9.547982     0.328423   
1   chr11_76582714_76582716   chr11_76582714_76582716 -9.299150    -1.705122   
2      chr2_8302118_8302120      chr2_8302118_8302120 -9.154066    -2.186974   
3  chr5_111066174_111066176  chr5_111066174_111066176 -8.733922    -2.850392   
4   chr16_11135271_11135273   chr16_11135271_11135273 -8.143531    -2.094074   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  1.322473e-21  0.019164      0.909489    0.000000e+00             NaN   
1  1.415723e-20  0.000524      0.799633   1.155488e-195             NaN   
2  5.482984e-20  0.000303      0.939929    0.000000e+00             NaN   
3  2.459898e-18  0.000175      0.808673   9.366178e-205             NaN   
4  3.839157e-16  0.000245      0.894833   7.096157e-307             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0           91            129              129  H3K27ac       Flu  
1          236            282              282  H3K27ac       Flu  
2          137            175              175  H3K27ac       Flu  
3          113            138              138  H3K27ac       Flu  
4          215            267              267  H3K27ac       Flu  
                       gene                 gene_name     zscore  effect_size  \
0    chr4_38797027_38797029    chr4_38797027_38797029 -11.171582    -1.659374   
1   chr16_11135271_11135273   chr16_11135271_11135273 -10.635322    -2.474143   
2    chr6_32633491_32633493    chr6_32633491_32633493  10.121508     0.402041   
3  chr5_111134439_111134441  chr5_111134439_111134441   9.431472     2.982706   
4    chr4_38906484_38906486    chr4_38906484_38906486   8.924140     3.889340   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  5.617130e-29  0.001132      0.923508    0.000000e+00             NaN   
1  2.041223e-26  0.000307      0.941674    0.000000e+00             NaN   
2  4.435293e-24  0.014704      0.907585    0.000000e+00             NaN   
3  4.043731e-21  0.000188      0.920107    0.000000e+00             NaN   
4  4.491741e-19  0.000130      0.603079   4.749972e-114             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0          126            167              167  H3K27ac        NI  
1          170            199              199  H3K27ac        NI  
2           56             90               90  H3K27ac        NI  
3          146            181              181  H3K27ac        NI  
4          154            201              201  H3K27ac        NI  
                       gene                 gene_name     zscore  effect_size  \
0   chr11_76588387_76588389   chr11_76588387_76588389 -12.358268    -3.311811   
1    chr4_38797027_38797029    chr4_38797027_38797029   8.718000     0.683150   
2  chr5_111066174_111066176  chr5_111066174_111066176  -8.522314    -2.328081   
3  chr5_111134439_111134441  chr5_111134439_111134441  -8.515787    -1.976937   
4   chr11_76628620_76628622   chr11_76628620_76628622  -8.085678    -1.862680   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  4.395250e-35  0.000253      0.895698    0.000000e+00             NaN   
1  2.831588e-18  0.001655      0.849409   1.669071e-237             NaN   
2  1.563968e-17  0.000224      0.838334   8.159932e-234             NaN   
3  1.654622e-17  0.000320      0.895191    0.000000e+00             NaN   
4  6.181913e-16  0.000352      0.845098   1.023845e-246             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model  modality condition  
0          152            186              186  H3K27me3       Flu  
1          159            213              213  H3K27me3       Flu  
2          124            149              149  H3K27me3       Flu  
3          123            167              167  H3K27me3       Flu  
4           92            113              113  H3K27me3       Flu  
                       gene                 gene_name     zscore  effect_size  \
0   chr11_76588387_76588389   chr11_76588387_76588389 -11.716066    -3.125000   
1  chr5_111134439_111134441  chr5_111134439_111134441  -8.685853    -2.066755   
2   chr11_76628620_76628622   chr11_76628620_76628622  -8.469822    -1.868597   
3    chr6_32633491_32633493    chr6_32633491_32633493  -8.382416    -0.203040   
4   chr16_11135271_11135273   chr16_11135271_11135273   8.266658     1.335046   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  1.054562e-31  0.000252      0.905802    0.000000e+00             NaN   
1  3.759114e-18  0.000302      0.890662   5.614408e-303             NaN   
2  2.457692e-17  0.000381      0.867941   3.583789e-271             NaN   
3  5.185241e-17  0.041321      0.904682    0.000000e+00             NaN   
4  1.377668e-16  0.000627      0.954031    0.000000e+00             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model  modality condition  
0          152            197              197  H3K27me3        NI  
1          118            165              165  H3K27me3        NI  
2          208            253              253  H3K27me3        NI  
3           99            137              137  H3K27me3        NI  
4          108            125              125  H3K27me3        NI  
                      gene                gene_name    zscore  effect_size  \
0   chr6_32644302_32644304   chr6_32644302_32644304 -9.271755    -0.887774   
1   chr4_38797027_38797029   chr4_38797027_38797029 -8.981647    -0.819981   
2     chr2_8302118_8302120     chr2_8302118_8302120 -8.177436    -0.657194   
3     chr9_6213468_6213470     chr9_6213468_6213470 -7.576102    -1.791899   
4  chr11_76582714_76582716  chr11_76582714_76582716 -7.047422    -1.075773   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  1.831085e-20  0.002952      0.809229   7.246220e-210             NaN   
1  2.667429e-19  0.003083      0.871703   1.384399e-265             NaN   
2  2.899463e-16  0.002660      0.956907    0.000000e+00             NaN   
3  3.560931e-14  0.000306      0.888655   1.387084e-291             NaN   
4  1.822631e-12  0.000782      0.743594   3.000845e-171             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0           44             70               70  H3K4me1       Flu  
1          104            139              139  H3K4me1       Flu  
2          125            159              159  H3K4me1       Flu  
3          124            154              154  H3K4me1       Flu  
4          232            277              277  H3K4me1       Flu  
                      gene                gene_name     zscore  effect_size  \
0   chr4_38797027_38797029   chr4_38797027_38797029 -10.698585    -0.816747   
1   chr6_32644302_32644304   chr6_32644302_32644304  -9.198776    -0.907013   
2   chr4_38906484_38906486   chr4_38906484_38906486   8.424720     3.308340   
3     chr2_8302118_8302120     chr2_8302118_8302120  -8.290471    -0.651513   
4  chr20_53568405_53568407  chr20_53568405_53568407   7.928505     3.577987   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  1.033439e-26  0.003927      0.916797    0.000000e+00             NaN   
1  3.620493e-20  0.002710      0.796436   5.728714e-195             NaN   
2  3.616147e-17  0.000156      0.774963   3.011172e-187             NaN   
3  1.128005e-16  0.002786      0.956841    0.000000e+00             NaN   
4  2.217998e-15  0.000086      0.895434    0.000000e+00             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0          126            169              169  H3K4me1        NI  
1           43             63               63  H3K4me1        NI  
2          190            252              252  H3K4me1        NI  
3          121            163              163  H3K4me1        NI  
4          131            146              146  H3K4me1        NI  
                       gene                 gene_name     zscore  effect_size  \
0   chr11_76588387_76588389   chr11_76588387_76588389 -14.149766    -2.477659   
1   chr11_76582714_76582716   chr11_76582714_76582716 -13.763971    -3.409770   
2   chr16_11135271_11135273   chr16_11135271_11135273 -10.354444    -2.121070   
3      chr2_8302118_8302120      chr2_8302118_8302120 -10.055464    -2.416658   
4  chr5_132637224_132637226  chr5_132637224_132637226  -7.430645    -8.819775   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  1.873788e-45  0.000575      0.935108    0.000000e+00             NaN   
1  4.198175e-43  0.000299      0.859549   1.067125e-252             NaN   
2  3.994999e-25  0.000371      0.901121    0.000000e+00             NaN   
3  8.691102e-24  0.000299      0.880640   4.862282e-289             NaN   
4  1.080698e-13  0.000013      0.420446    1.527513e-58             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0          149            176              176  H3K4me3       Flu  
1          194            228              228  H3K4me3       Flu  
2          198            246              246  H3K4me3       Flu  
3           70             89               89  H3K4me3       Flu  
4           81             98               98  H3K4me3       Flu  
                       gene                 gene_name     zscore  effect_size  \
0  chr5_111134439_111134441  chr5_111134439_111134441  11.161489     3.803763   
1   chr16_11135271_11135273   chr16_11135271_11135273 -10.369018    -1.951014   
2      chr2_8302118_8302120      chr2_8302118_8302120 -10.130860    -1.829771   
3  chr5_111066174_111066176  chr5_111066174_111066176   9.712393     1.819449   
4   chr11_76582714_76582716   chr11_76582714_76582716  -7.326456    -2.826627   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  6.292868e-29  0.000156      0.909274    0.000000e+00             NaN   
1  3.430315e-25  0.000446      0.947267    0.000000e+00             NaN   
2  4.030882e-24  0.000520      0.937519    0.000000e+00             NaN   
3  2.669939e-22  0.000560      0.852033   3.472546e-255             NaN   
4  2.363188e-13  0.000120      0.753139   4.409897e-165             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0           96            148              148  H3K4me3        NI  
1          147            170              170  H3K4me3        NI  
2          137            174              174  H3K4me3        NI  
3          114            150              150  H3K4me3        NI  
4          240            288              288  H3K4me3        NI  
                       gene                 gene_name     zscore  effect_size  \
0   chr11_76582714_76582716   chr11_76582714_76582716 -13.489373    -3.295994   
1  chr5_111134439_111134441  chr5_111134439_111134441  10.204171     3.818776   
2   chr16_11135271_11135273   chr16_11135271_11135273 -10.143527    -2.384734   
3    chr6_32633491_32633493    chr6_32633491_32633493  -9.805530    -0.334582   
4      chr9_6213468_6213470      chr9_6213468_6213470  -8.918155    -1.410380   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  1.806291e-41  0.000306      0.871800   4.608740e-273             NaN   
1  1.899357e-24  0.000124      0.898641    0.000000e+00             NaN   
2  3.540785e-24  0.000297      0.964486    0.000000e+00             NaN   
3  1.065860e-22  0.021027      0.846749   2.442459e-242             NaN   
4  4.741197e-19  0.000682      0.926805    0.000000e+00             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0          171            206              206     ATAC       Flu  
1           98            126              126     ATAC       Flu  
2          147            170              170     ATAC       Flu  
3           68            102              102     ATAC       Flu  
4          143            175              175     ATAC       Flu  
                       gene                 gene_name     zscore  effect_size  \
0  chr5_111134439_111134441  chr5_111134439_111134441  12.496988     5.461482   
1    chr4_38797027_38797029    chr4_38797027_38797029 -11.784354    -1.361173   
2    chr6_32644302_32644304    chr6_32644302_32644304 -10.990890    -1.689367   
3   chr16_11135271_11135273   chr16_11135271_11135273 -10.655064    -1.685421   
4   chr11_76588387_76588389   chr11_76588387_76588389  10.425832     3.058947   

         pvalue     var_g  pred_perf_r2  pred_perf_pval  pred_perf_qval  \
0  7.753331e-36  0.000095      0.942002    0.000000e+00             NaN   
1  4.700087e-32  0.001944      0.859229   1.795906e-253             NaN   
2  4.227356e-28  0.001096      0.760080   7.252959e-173             NaN   
3  1.651306e-26  0.000656      0.973919    0.000000e+00             NaN   
4  1.889973e-25  0.000206      0.825668   1.321652e-216             NaN   

   n_snps_used  n_snps_in_cov  n_snps_in_model modality condition  
0          124            157              157     ATAC        NI  
1           88            115              115     ATAC        NI  
2           32             53               53     ATAC        NI  
3           84             95               95     ATAC        NI  
4          189            244              244     ATAC        NI  
(650, 14)