This notebook creates inputs for Enformer after creating sets of fine-mapped variants from a GWAS
Author
Saideep Gona
Published
June 3, 2024
Context
Having fine-mapped some GWAS variants, we can use Enformer to make predictions at those loci under different personalized sequence contexts. We can then make Con-EnPACT predictions at those loci and linearize against them.
Notes:
GEUVADIS individuals
Code
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root directory for this analysis; the prediction and collection folders
# below live underneath it.
proj_dir = "/beagle3/haky/users/saideep/projects/aracena_modeling/SPrediXcan/sumstats_formatted/asthma_eczema_farreira_susie/enformer_preds"

# Window size chosen per assay/mark.
# NOTE(review): presumably the number of central bins averaged for each
# mark (the values match the window sizes used when collecting
# predictions) -- confirm against the downstream code that reads this.
optimal_window_sizes = {
    "H3K27ac": 8,
    "H3K27me3": 64,
    "H3K4me1": 32,
    "H3K4me3": 8,
    "ATAC": 8,
}

# Per-individual Enformer prediction files are read from here.
path_to_predictions = os.path.join(
    proj_dir,
    "predictions_folder",
    "enformer_aracena_preds_genes",
    "predictions_06-03-2024",
    "enformer_predictions",
)

# Collected (window-averaged) prediction tables are written here.
collected_preds_dir = os.path.join(proj_dir, "collected_preds")

# Text file with one individual ID per line.
path_to_inds_file = "/beagle3/haky/users/saideep/projects/enformer_all_geuvadis/inputs/all_inds.txt"

# Load the individual IDs, skipping blank lines so that a stray empty line
# (e.g. a trailing newline) never becomes an empty-string "individual".
with open(path_to_inds_file, "r") as f:
    inds_pred = [line.strip() for line in f if line.strip()]
Code
import h5py

# Window sizes (number of central bins averaged) for which one output table
# is written per individual.
WINDOW_SIZES = (8, 32, 64)

# For every individual: average the two haplotype predictions per region,
# take the mean over a centered window of bins for each window size, and
# write one tab-separated table per (individual, window size).
# NOTE: `regions` is not defined in this cell; it must already exist in the
# notebook namespace (an iterable of region names used both as the file-name
# prefix and as the dataset key inside each HDF5 file).
for ind in inds_pred:
    print(ind)

    # Skip individuals whose output tables already exist for every window.
    all_exist = all(
        os.path.exists(
            os.path.join(collected_preds_dir, ind + "_" + str(window_size) + ".txt")
        )
        for window_size in WINDOW_SIZES
    )
    if all_exist:
        print("all exist for ", ind)
        continue

    # Legacy skip: individuals that already have both per-haplotype tables
    # are not reprocessed.
    # NOTE(review): this cell no longer writes the *_haplo files, so this
    # check can silently skip an individual whose windowed tables are
    # missing -- confirm it is still wanted.
    if os.path.exists(
        os.path.join(collected_preds_dir, ind + "_haplo1.txt")
    ) and os.path.exists(os.path.join(collected_preds_dir, ind + "_haplo2.txt")):
        continue

    # window size -> {"<ind>.<region>": window-averaged prediction vector}
    predictions_dicts = {window_size: {} for window_size in WINDOW_SIZES}

    for region in regions:
        pred_file_h1 = os.path.join(
            path_to_predictions, ind, "haplotype1", region + "_predictions.h5"
        )
        pred_file_h2 = os.path.join(
            path_to_predictions, ind, "haplotype2", region + "_predictions.h5"
        )

        # Both haplotypes are required; report and skip the region otherwise.
        if not (os.path.exists(pred_file_h1) and os.path.exists(pred_file_h2)):
            print("missing", pred_file_h1, pred_file_h2)
            continue

        # Each file stores the prediction under a dataset named after the region.
        with h5py.File(pred_file_h1, "r") as f:
            pred_h1 = f[region][:]
        with h5py.File(pred_file_h2, "r") as f:
            pred_h2 = f[region][:]

        # Element-wise mean of the two haplotypes.
        mean_pred = (pred_h1 + pred_h2) / 2

        # The first axis is treated as the bin axis; the window is centered
        # on its midpoint. This does not depend on the window size, so it is
        # computed once per region.
        center_bin = mean_pred.shape[0] // 2

        for window_size in WINDOW_SIZES:
            # Clamp at 0: a negative start would be interpreted as an index
            # from the end of the array and select the wrong rows when the
            # prediction has fewer bins than the window.
            left_bin = max(center_bin - window_size // 2, 0)
            right_bin = center_bin + window_size // 2
            center_mean = mean_pred[left_bin:right_bin,]
            # Average over the windowed bins, leaving one value per column
            # of the remaining axis.
            predictions_dicts[window_size][ind + "." + region] = center_mean.mean(axis=0)

    # One table per window size: rows are "<ind>.<region>" keys.
    for window_size in WINDOW_SIZES:
        print(len(predictions_dicts[window_size].keys()))
        predictions_dict = pd.DataFrame(predictions_dicts[window_size]).T
        print(predictions_dict.shape)
        predictions_dict.to_csv(
            os.path.join(collected_preds_dir, ind + "_" + str(window_size) + ".txt"),
            sep="\t",
        )
HG00096
all exist for HG00096
HG00097
all exist for HG00097
HG00099
all exist for HG00099
HG00100
all exist for HG00100
HG00101
all exist for HG00101
HG00102
all exist for HG00102
HG00103
all exist for HG00103
HG00104
all exist for HG00104
HG00105
all exist for HG00105
HG00106
all exist for HG00106
HG00108
all exist for HG00108
HG00109
all exist for HG00109
HG00110
all exist for HG00110
HG00111
all exist for HG00111
HG00112
all exist for HG00112
HG00114
all exist for HG00114
HG00115
all exist for HG00115
HG00116
all exist for HG00116
HG00117
all exist for HG00117
HG00118
all exist for HG00118
HG00119
all exist for HG00119
HG00120
all exist for HG00120
HG00121
all exist for HG00121
HG00122
all exist for HG00122
HG00123
all exist for HG00123
HG00124
all exist for HG00124
HG00125
all exist for HG00125
HG00126
all exist for HG00126
HG00127
all exist for HG00127
HG00128
all exist for HG00128
HG00129
all exist for HG00129
HG00130
all exist for HG00130
HG00131
all exist for HG00131
HG00132
all exist for HG00132
HG00133
all exist for HG00133
HG00134
all exist for HG00134
HG00135
all exist for HG00135
HG00136
all exist for HG00136
HG00137
all exist for HG00137
HG00138
all exist for HG00138
HG00139
all exist for HG00139
HG00141
all exist for HG00141
HG00142
all exist for HG00142
HG00143
all exist for HG00143
HG00145
all exist for HG00145
HG00146
all exist for HG00146
HG00148
all exist for HG00148
HG00149
all exist for HG00149
HG00150
all exist for HG00150
HG00151
all exist for HG00151
HG00152
all exist for HG00152
HG00154
all exist for HG00154
HG00155
all exist for HG00155
HG00156
all exist for HG00156
HG00157
all exist for HG00157
HG00158
all exist for HG00158
HG00159
all exist for HG00159
HG00160
all exist for HG00160
HG00171
all exist for HG00171
HG00173
all exist for HG00173
HG00174
all exist for HG00174
HG00176
all exist for HG00176
HG00177
all exist for HG00177
HG00178
all exist for HG00178
HG00179
all exist for HG00179
HG00180
all exist for HG00180
HG00181
all exist for HG00181
HG00182
all exist for HG00182
HG00183
all exist for HG00183
HG00185
all exist for HG00185
HG00186
all exist for HG00186
HG00187
all exist for HG00187
HG00188
all exist for HG00188
HG00189
all exist for HG00189
HG00231
all exist for HG00231
HG00232
all exist for HG00232
HG00233
all exist for HG00233
HG00234
all exist for HG00234
HG00235
all exist for HG00235
HG00236
all exist for HG00236
HG00238
all exist for HG00238
HG00239
all exist for HG00239
HG00240
all exist for HG00240
HG00242
all exist for HG00242
HG00243
all exist for HG00243
HG00244
all exist for HG00244
HG00245
all exist for HG00245
HG00246
all exist for HG00246
HG00247
all exist for HG00247
HG00249
all exist for HG00249
HG00250
all exist for HG00250
HG00251
all exist for HG00251
HG00252
all exist for HG00252
HG00253
all exist for HG00253
HG00255
all exist for HG00255
HG00256
all exist for HG00256
HG00257
all exist for HG00257
HG00258
all exist for HG00258
HG00259
all exist for HG00259
HG00260
all exist for HG00260
HG00261
all exist for HG00261
HG00262
all exist for HG00262
HG00263
all exist for HG00263
HG00264
all exist for HG00264
HG00265
all exist for HG00265
HG00266
all exist for HG00266
HG00267
all exist for HG00267
HG00268
all exist for HG00268
HG00269
all exist for HG00269
HG00271
all exist for HG00271
HG00272
all exist for HG00272
HG00273
all exist for HG00273
HG00274
all exist for HG00274
HG00275
all exist for HG00275
HG00276
all exist for HG00276
HG00277
all exist for HG00277
HG00278
all exist for HG00278
HG00280
all exist for HG00280
HG00281
all exist for HG00281
HG00282
all exist for HG00282
HG00284
all exist for HG00284
HG00285
all exist for HG00285
HG00306
all exist for HG00306
HG00308
all exist for HG00308
HG00309
all exist for HG00309
HG00310
all exist for HG00310
HG00311
all exist for HG00311
HG00312
all exist for HG00312
HG00313
all exist for HG00313
HG00315
all exist for HG00315
HG00319
all exist for HG00319
HG00320
all exist for HG00320
HG00321
all exist for HG00321
HG00323
all exist for HG00323
HG00324
all exist for HG00324
HG00325
all exist for HG00325
HG00326
all exist for HG00326
HG00327
all exist for HG00327
HG00328
all exist for HG00328
HG00329
all exist for HG00329
HG00330
all exist for HG00330
HG00331
all exist for HG00331
HG00332
all exist for HG00332
HG00334
all exist for HG00334
HG00335
all exist for HG00335
HG00336
all exist for HG00336
HG00337
all exist for HG00337
HG00338
all exist for HG00338
HG00339
all exist for HG00339
HG00341
all exist for HG00341
HG00342
all exist for HG00342
HG00343
all exist for HG00343
HG00344
all exist for HG00344
HG00345
all exist for HG00345
HG00346
all exist for HG00346
HG00349
all exist for HG00349
HG00350
all exist for HG00350
HG00351
all exist for HG00351
HG00353
all exist for HG00353
HG00355
all exist for HG00355
HG00356
all exist for HG00356
HG00358
all exist for HG00358
HG00359
all exist for HG00359
HG00360
all exist for HG00360
HG00361
all exist for HG00361
HG00362
all exist for HG00362
HG00364
all exist for HG00364
HG00365
all exist for HG00365
HG00366
all exist for HG00366
HG00367
all exist for HG00367
HG00369
all exist for HG00369
HG00371
all exist for HG00371
HG00372
all exist for HG00372
HG00373
all exist for HG00373
HG00375
all exist for HG00375
HG00376
all exist for HG00376
HG00377
all exist for HG00377
HG00378
all exist for HG00378
HG00379
all exist for HG00379
HG00380
all exist for HG00380
HG00381
all exist for HG00381
HG00382
all exist for HG00382
HG00383
all exist for HG00383
HG00384
all exist for HG00384
HG01334
all exist for HG01334
HG01789
all exist for HG01789
HG01790
all exist for HG01790
HG01791
all exist for HG01791
HG02215
all exist for HG02215
NA06984
all exist for NA06984
NA06985
all exist for NA06985
NA06986
all exist for NA06986
NA06989
all exist for NA06989
NA06994
all exist for NA06994
NA07037
all exist for NA07037
NA07048
all exist for NA07048
NA07051
all exist for NA07051
NA07056
all exist for NA07056
NA07346
all exist for NA07346
NA07347
all exist for NA07347
NA07357
all exist for NA07357
NA10847
all exist for NA10847
NA10851
all exist for NA10851
NA11829
all exist for NA11829
NA11830
all exist for NA11830
NA11831
all exist for NA11831
NA11832
all exist for NA11832
NA11840
all exist for NA11840
NA11843
all exist for NA11843
NA11881
all exist for NA11881
NA11892
all exist for NA11892
NA11893
all exist for NA11893
NA11894
all exist for NA11894
NA11918
all exist for NA11918
NA11920
all exist for NA11920
NA11930
all exist for NA11930
NA11931
all exist for NA11931
NA11992
all exist for NA11992
NA11993
all exist for NA11993
NA11994
all exist for NA11994
NA11995
all exist for NA11995
NA12004
all exist for NA12004
NA12005
all exist for NA12005
NA12006
all exist for NA12006
NA12043
all exist for NA12043
NA12044
all exist for NA12044
NA12045
all exist for NA12045
NA12058
all exist for NA12058
NA12144
all exist for NA12144
NA12154
all exist for NA12154
NA12155
all exist for NA12155
NA12156
all exist for NA12156
NA12234
all exist for NA12234
NA12249
all exist for NA12249
NA12272
all exist for NA12272
NA12273
all exist for NA12273
NA12275
all exist for NA12275
NA12282
all exist for NA12282
NA12283
all exist for NA12283
NA12286
all exist for NA12286
NA12287
all exist for NA12287
NA12340
all exist for NA12340
NA12341
all exist for NA12341
NA12342
all exist for NA12342
NA12347
all exist for NA12347
NA12348
all exist for NA12348
NA12383
all exist for NA12383
NA12399
all exist for NA12399
NA12400
all exist for NA12400
NA12413
all exist for NA12413
NA12489
all exist for NA12489
NA12546
all exist for NA12546
NA12716
all exist for NA12716
NA12717
all exist for NA12717
NA12718
all exist for NA12718
NA12749
all exist for NA12749
NA12750
all exist for NA12750
NA12751
all exist for NA12751
NA12760
all exist for NA12760
NA12761
all exist for NA12761
NA12762
all exist for NA12762
NA12763
all exist for NA12763
NA12775
all exist for NA12775
NA12776
all exist for NA12776
NA12777
all exist for NA12777
NA12778
all exist for NA12778
NA12812
all exist for NA12812
NA12813
all exist for NA12813
NA12814
all exist for NA12814
NA12815
all exist for NA12815
NA12827
all exist for NA12827
NA12829
all exist for NA12829
NA12830
all exist for NA12830
NA12842
all exist for NA12842
NA12843
all exist for NA12843
NA12872
all exist for NA12872
NA12873
all exist for NA12873
NA12874
all exist for NA12874
NA12889
all exist for NA12889
NA12890
all exist for NA12890
NA18486
all exist for NA18486
NA18487
all exist for NA18487
NA18488
all exist for NA18488
NA18489
all exist for NA18489
NA18498
all exist for NA18498
NA18499
all exist for NA18499
NA18502
all exist for NA18502
NA18505
all exist for NA18505
NA18508
all exist for NA18508
NA18510
all exist for NA18510
NA18511
all exist for NA18511
NA18517
all exist for NA18517
NA18519
all exist for NA18519
NA18520
all exist for NA18520
NA18858
all exist for NA18858
NA18861
all exist for NA18861
NA18867
all exist for NA18867
NA18868
all exist for NA18868
NA18870
all exist for NA18870
NA18873
all exist for NA18873
NA18907
all exist for NA18907
NA18908
all exist for NA18908
NA18909
all exist for NA18909
NA18910
all exist for NA18910
NA18912
all exist for NA18912
NA18916
all exist for NA18916
NA18917
all exist for NA18917
NA18923
all exist for NA18923
NA18933
all exist for NA18933
NA18934
all exist for NA18934
NA19092
all exist for NA19092
NA19093
all exist for NA19093
NA19095
all exist for NA19095
NA19096
all exist for NA19096
NA19098
all exist for NA19098
NA19099
all exist for NA19099
NA19102
all exist for NA19102
NA19107
all exist for NA19107
NA19108
all exist for NA19108
NA19113
all exist for NA19113
NA19114
all exist for NA19114
NA19116
all exist for NA19116
NA19117
all exist for NA19117
NA19118
all exist for NA19118
NA19119
all exist for NA19119
NA19121
all exist for NA19121
NA19129
all exist for NA19129
NA19130
all exist for NA19130
NA19131
all exist for NA19131
NA19137
all exist for NA19137
NA19138
all exist for NA19138
NA19141
all exist for NA19141
NA19143
all exist for NA19143
NA19144
all exist for NA19144
NA19146
all exist for NA19146
NA19147
all exist for NA19147
NA19149
all exist for NA19149
NA19150
all exist for NA19150
NA19152
all exist for NA19152
NA19153
all exist for NA19153
NA19159
all exist for NA19159
NA19160
all exist for NA19160
NA19171
all exist for NA19171
NA19172
all exist for NA19172
NA19175
all exist for NA19175
NA19184
all exist for NA19184
NA19185
all exist for NA19185
NA19189
all exist for NA19189
NA19190
all exist for NA19190
NA19197
all exist for NA19197
NA19198
all exist for NA19198
NA19200
all exist for NA19200
NA19201
all exist for NA19201
NA19204
all exist for NA19204
NA19206
all exist for NA19206
NA19207
all exist for NA19207
NA19209
all exist for NA19209
NA19210
all exist for NA19210
NA19213
all exist for NA19213
NA19214
all exist for NA19214
NA19222
all exist for NA19222
NA19223
all exist for NA19223
NA19225
all exist for NA19225
NA19235
all exist for NA19235
NA19236
all exist for NA19236
NA19247
all exist for NA19247
NA19248
all exist for NA19248
NA19256
all exist for NA19256
NA19257
all exist for NA19257
NA20502
all exist for NA20502
NA20503
all exist for NA20503
NA20504
all exist for NA20504
NA20505
all exist for NA20505
NA20506
all exist for NA20506
NA20507
all exist for NA20507
NA20508
all exist for NA20508
NA20509
all exist for NA20509
NA20510
all exist for NA20510
NA20512
all exist for NA20512
NA20513
all exist for NA20513
NA20514
all exist for NA20514
NA20515
all exist for NA20515
NA20516
all exist for NA20516
NA20517
all exist for NA20517
NA20518
all exist for NA20518
NA20519
all exist for NA20519
NA20520
all exist for NA20520
NA20521
all exist for NA20521
NA20524
all exist for NA20524
NA20525
all exist for NA20525
NA20527
all exist for NA20527
NA20528
all exist for NA20528
NA20529
all exist for NA20529
NA20530
all exist for NA20530
NA20531
all exist for NA20531
NA20532
all exist for NA20532
NA20534
all exist for NA20534
NA20535
all exist for NA20535
NA20536
all exist for NA20536
NA20537
all exist for NA20537
NA20538
all exist for NA20538
NA20539
all exist for NA20539
NA20540
all exist for NA20540
NA20541
all exist for NA20541
NA20542
all exist for NA20542
NA20543
all exist for NA20543
NA20544
all exist for NA20544
NA20581
all exist for NA20581
NA20582
all exist for NA20582
NA20585
all exist for NA20585
NA20586
all exist for NA20586
NA20588
all exist for NA20588
NA20589
all exist for NA20589
NA20752
all exist for NA20752
NA20754
all exist for NA20754
NA20756
all exist for NA20756
NA20757
all exist for NA20757
NA20758
all exist for NA20758
NA20759
all exist for NA20759
NA20760
all exist for NA20760
NA20761
all exist for NA20761
NA20765
all exist for NA20765
NA20766
all exist for NA20766
NA20768
all exist for NA20768
NA20769
all exist for NA20769
NA20770
all exist for NA20770
NA20771
all exist for NA20771
NA20772
all exist for NA20772
NA20773
all exist for NA20773
NA20774
all exist for NA20774
NA20778
all exist for NA20778
NA20783
all exist for NA20783
NA20785
all exist for NA20785
NA20786
all exist for NA20786
NA20787
all exist for NA20787
NA20790
all exist for NA20790
NA20792
all exist for NA20792
NA20795
all exist for NA20795
NA20796
all exist for NA20796
NA20797
all exist for NA20797
NA20798
all exist for NA20798
NA20799
all exist for NA20799
NA20800
all exist for NA20800
NA20801
all exist for NA20801
NA20802
all exist for NA20802
NA20803
all exist for NA20803
NA20804
all exist for NA20804
NA20805
all exist for NA20805
NA20806
all exist for NA20806
NA20807
all exist for NA20807
NA20808
all exist for NA20808
NA20809
all exist for NA20809
NA20810
all exist for NA20810
NA20811
all exist for NA20811
NA20812
all exist for NA20812
NA20813
all exist for NA20813
NA20814
all exist for NA20814
NA20815
all exist for NA20815
NA20816
all exist for NA20816
NA20819
all exist for NA20819
NA20826
all exist for NA20826
NA20828
all exist for NA20828
We can now collect the predictions together and start linearizing.