1.4.1_prepare_geuvadis_predictions

Author

Saideep Gona

Published

November 22, 2023

Context

Now that I have a working model of gene expression prediction in flu from Enformer predictions, I can continue with the linearization process.

1.) Convert gene centered GEUVADIS predictions to single values

Charles has already generated geuvadis predictions at gene TSS sites stored as 4x5313 per gene and individual combo. I need these to be averaged into 1x5313 and in a format which can be easily read to be predicted by my glmnet. The code below accomplishes this.

Code
import os,sys

import h5py
import numpy as np


with open("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-16-linearization/individuals.txt", "r") as inds_f:
    inds = inds_f.read().split()

# gene_list = []
# c=0
# with open("/beagle3/haky/users/charles/project/singleXcanDL/PredicDB/LCL_PredictDb/files/Gene_anno.txt", "r") as genes_f:
#     for line in genes_f:
#         if c==0:
#             c+=1
#             continue
#         gene_list.append("chr"+"_".join([line.split("\t")[x] for x in [0,3,4]]))

# print(gene_list)

enformer_geuvadis_pred_folder = "/beagle3/haky/data/Geuvadis_TSS_enformer_prediction"

with h5py.File(os.path.join(enformer_geuvadis_pred_folder, inds[0]+".h5"), "r") as f:
    genes_dsets = list(f.keys())
    genes_dsets.sort()
Code

# dset_lengths = []
# c=0
# for ind in inds:
#     if os.path.exists(os.path.join(enformer_geuvadis_pred_folder, ind+".h5")):
#         with h5py.File(os.path.join(enformer_geuvadis_pred_folder, ind+".h5"), "r") as f:
#             dset_lengths.append(len(f.keys()))
#             print(c)
#             c+=1
#     else:
#         print(ind)

# print(dset_lengths)



for ind in inds:
    if os.path.exists(os.path.join("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization",ind+".txt")):
        continue
    expression_array = np.zeros((len(genes_dsets),5313))
    if os.path.exists(os.path.join(enformer_geuvadis_pred_folder, ind+".h5")):
        with h5py.File(os.path.join(enformer_geuvadis_pred_folder, ind+".h5"), "r") as f:
            for i, gene in enumerate(genes_dsets):
                # print(i)
                # print(np.mean(f[gene][:,:],axis=0).shape)
                expression_array[i,:] = np.mean(f[gene][:,:],axis=0)
            np.savetxt(os.path.join("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization",ind+".txt"),expression_array)
    else:
        print(ind)


HG00107
HG00113
HG00128
HG00140
HG00190