Now that I have a working model of gene expression prediction in flu from Enformer predictions, I can continue with the linearization process.
1.) Convert gene centered GEUVADIS predictions to single values
Charles has already generated geuvadis predictions at gene TSS sites stored as 4x5313 per gene and individual combo. I need these to be averaged into 1x5313 and in a format which can be easily read to be predicted by my glmnet. The code below accomplishes this.
Code
import os,sysimport h5pyimport numpy as npwithopen("/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-11-16-linearization/individuals.txt", "r") as inds_f: inds = inds_f.read().split()# gene_list = []# c=0# with open("/beagle3/haky/users/charles/project/singleXcanDL/PredicDB/LCL_PredictDb/files/Gene_anno.txt", "r") as genes_f:# for line in genes_f:# if c==0:# c+=1# continue# gene_list.append("chr"+"_".join([line.split("\t")[x] for x in [0,3,4]]))# print(gene_list)enformer_geuvadis_pred_folder ="/beagle3/haky/data/Geuvadis_TSS_enformer_prediction"with h5py.File(os.path.join(enformer_geuvadis_pred_folder, inds[0]+".h5"), "r") as f: genes_dsets =list(f.keys()) genes_dsets.sort()
Code
# dset_lengths = []# c=0# for ind in inds:# if os.path.exists(os.path.join(enformer_geuvadis_pred_folder, ind+".h5")):# with h5py.File(os.path.join(enformer_geuvadis_pred_folder, ind+".h5"), "r") as f:# dset_lengths.append(len(f.keys()))# print(c)# c+=1# else:# print(ind)# print(dset_lengths)for ind in inds:if os.path.exists(os.path.join("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization",ind+".txt")):continue expression_array = np.zeros((len(genes_dsets),5313))if os.path.exists(os.path.join(enformer_geuvadis_pred_folder, ind+".h5")):with h5py.File(os.path.join(enformer_geuvadis_pred_folder, ind+".h5"), "r") as f:for i, gene inenumerate(genes_dsets):# print(i)# print(np.mean(f[gene][:,:],axis=0).shape) expression_array[i,:] = np.mean(f[gene][:,:],axis=0) np.savetxt(os.path.join("/beagle3/haky/users/saideep/projects/aracena_modeling/linearization",ind+".txt"),expression_array)else:print(ind)