Saideep Gona
October 4, 2023

```python
import os, sys

import numpy as np
import h5py
```
```python
def updated_reindexed_dictionary(regions, h5_path, individual, dictionary):
    """Re-index one individual's HDF5 file against the superset of regions."""
    if not os.path.exists(h5_path):
        # print(h5_path, "does not exist")
        return

    with h5py.File(h5_path, 'r') as f:
        stored_regions_raw = f['regions'][:]

    # Reconstruct "chrom_start_end" identifiers from the stored region array
    stored_regions = []
    for i in range(stored_regions_raw.shape[-1]):
        stored_regions.append("_".join(list(stored_regions_raw[0, :, i].astype(str))))

    # Walk the region superset; each time a stored region matches, record its
    # index within this HDF5 file for the current individual
    stored_region_index = 0
    for i, region in enumerate(regions):
        if stored_region_index >= len(stored_regions):
            break  # every stored region has already been assigned
        if region == stored_regions[stored_region_index]:
            # print("Matched", region, stored_regions[stored_region_index])
            dictionary[region][individual] = stored_region_index
            stored_region_index += 1
```
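For reference, a sketch of how this helper could be invoked per individual and data split; it relies on variables (`individuals`, `hdf5_files_path`, `regions`, `remapped_table_dict`) defined in the cells below, which in fact inline equivalent logic rather than calling the function:

```python
# Hypothetical driver loop; the cells below inline this logic instead of calling the helper
for group in ["train", "test", "valid"]:
    for ind in individuals:
        h5_path = os.path.join(hdf5_files_path, ind + "_" + group + "_aracena.h5")
        updated_reindexed_dictionary(regions, h5_path, ind, remapped_table_dict)
```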
```python
# Initialize variables for iterating over
sequences_bed = "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-04-train_aracena_tss_predictor/tss_sequences.bed"
hdf5_files_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered"

samples = ['EU03_NI', 'AF30_NI', 'AF36_NI', 'AF20_Flu', 'EU13_NI', 'EU47_NI', 'EU41_NI', 'EU27_NI', 'AF18_Flu', 'AF22_NI', 'EU21_NI', 'EU15_Flu', 'EU25_Flu', 'AF22_Flu', 'EU25_NI', 'EU05_NI', 'EU15_NI', 'AF34_Flu', 'EU05_Flu', 'EU03_Flu', 'EU07_NI', 'EU07_Flu', 'AF24_NI', 'EU41_Flu', 'EU19_NI', 'EU19_Flu', 'AF16_Flu', 'EU27_Flu', 'EU21_Flu', 'EU09_NI', 'AF24_Flu', 'AF08_NI', 'AF26_Flu', 'AF38_Flu', 'AF34_NI', 'AF08_Flu', 'EU33_NI', 'EU13_Flu', 'AF26_NI', 'EU39_NI', 'AF20_NI', 'AF28_NI', 'AF12_Flu', 'AF18_NI', 'AF30_Flu', 'AF38_NI', 'AF12_NI', 'AF16_NI', 'EU47_Flu', 'EU33_Flu', 'AF36_Flu', 'EU09_Flu', 'AF28_Flu', 'EU39_Flu']

individuals = list(set([x.split("_")[0] for x in samples]))
individuals.append("ref-epigenome")
individuals.append("group")
print(len(individuals))

groups = ["train"]
groups.append("test")
groups.append("valid")
print(groups)

grouped_regions = {}
for group in groups:
    grouped_regions[group] = []
```
```
29
['train', 'test', 'valid']
```
```python
# Collect regions into groups
regions = []
chrom_counts_full = {}
with open(sequences_bed, "r") as f:
    for line in f:
        if "test_small" in line:
            continue
        p_line = line.strip().split("\t")

        # Count regions per chromosome (keys keep the "chr" prefix here)
        if p_line[0] not in chrom_counts_full:
            chrom_counts_full[p_line[0]] = 0
        chrom_counts_full[p_line[0]] += 1

        # Region identifier is "chrom_start_end"; strip the leading "chr" for the superset
        region = "_".join(p_line[0:3])
        regions.append(region.strip("chr"))

        # The fourth BED column encodes the data split (train/test/valid)
        group = p_line[3].split("_")[0]
        grouped_regions[group].append(region)
```
```python
# Initialize remapped table dictionary
remapped_table_dict = {}
for region in regions:
    remapped_table_dict[region] = {}
    for individual in individuals:
        remapped_table_dict[region][individual] = None
```
```python
# Iterate through the region identifiers in each HDF5 file and assign indexes to the remapped table dictionary
regions_set = set(regions)

for group in groups:
    for ind in individuals:
        h5_path = os.path.join(hdf5_files_path, ind + "_" + group + "_aracena.h5")
        if not os.path.exists(h5_path):
            # print(h5_path, "does not exist")
            continue

        # Extract regions for the current HDF5 file (ind, data subset combo)
        with h5py.File(h5_path, 'r') as f:
            stored_regions_raw = f['regions'][:]

        stored_regions = []
        for i in range(stored_regions_raw.shape[-1]):
            stored_regions.append("_".join(list(stored_regions_raw[0, :, i].astype(str))))

        cur_stored_region_count = len(stored_regions)
        if ind == "ref-epigenome":
            print(cur_stored_region_count, group, ind)
        # print(cur_stored_region_count, group, ind)
        # continue

        # Check whether each stored region is in the regions superset, and add its index to the full table
        for i, stored_region in enumerate(stored_regions):
            if stored_region in regions_set:
                remapped_table_dict[stored_region][ind] = int(i)
                remapped_table_dict[stored_region]["group"] = group
```
```
12294 train ref-epigenome
2047 test ref-epigenome
2432 valid ref-epigenome
(29, 19563)
```
OK cool, this table contains the indices for each individual/window combo in their corresponding training dataset. We can subsequently filter out regions and individuals for which data is not present in all intended training subsets. The filtered table can then guide training so that no missing data is iterated over, and so the dataloader can find the correct data at each iteration.
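The shape printed above presumably comes from loading the dictionary into a pandas table with individuals as rows and regions as columns; that construction step isn't shown in this post, but a minimal sketch (assuming pandas and the `remapped_table` name used further down) would be:

```python
import pandas as pd

# Outer keys (regions) become columns and inner keys (individuals, "group") become rows,
# giving a (num_individuals + 2) x num_regions table of HDF5 indexes
remapped_table = pd.DataFrame(remapped_table_dict)
print(remapped_table.shape)
```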
Let’s start by including only individuals with a significant amount of data.
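The filtering code isn't shown in this cell; a minimal sketch of one way it could be done, assuming a hypothetical `individuals_to_keep` list of well-covered individuals and dropping any region (column) that is missing for one of them:

```python
# Hypothetical subset of individuals with enough data to train on
individuals_to_keep = ["EU39", "EU27", "AF26", "EU33"]  # extend as appropriate

# Keep only regions (columns) that have an index for every kept individual
keep_mask = remapped_table.loc[individuals_to_keep].notna().all(axis=0)
remapped_table = remapped_table.loc[:, keep_mask]
print(remapped_table.shape)
```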
```
(29, 15850)
```
  | 1_1471765_1471767 | 1_3069203_3069205 | 1_2412564_2412566 | 1_10474950_10474952 | 1_2476289_2476291 | 1_9292894_9292896 | 1_9069635_9069637 | 1_15684320_15684322 | 1_9943475_9943477 | 1_3752450_3752452 | ... | 8_143080630_143080632 | 8_32548311_32548313 | 8_144462871_144462873 | 8_144465035_144465037 | 8_144465489_144465491 | 8_53251637_53251639 | 8_127736231_127736233 | 8_103415107_103415109 | 8_54457935_54457937 | 8_54616096_54616098 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
EU39 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU27 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF26 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU33 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF18 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF38 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF20 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF22 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF08 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF28 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU25 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF24 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU05 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF34 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU47 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU13 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF36 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU03 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF30 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF12 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU09 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU15 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU07 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU21 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
AF16 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU19 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
EU41 | 2995 | 2996 | 2997 | 2998 | 2999 | 3000 | 3001 | 3002 | 3003 | 3004 | ... | 2035 | 2036 | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 |
ref-epigenome | 3000 | 3001 | 3002 | 3003 | 3004 | 3005 | 3006 | 3007 | 3008 | 3009 | ... | 2037 | 2038 | 2039 | 2040 | 2041 | 2042 | 2043 | 2044 | 2045 | 2046 |
group | train | train | train | train | train | train | train | train | train | train | ... | test | test | test | test | test | test | test | test | test | test |
29 rows × 15850 columns
```python
ref_epi_nonna = remapped_table.loc["ref-epigenome"].dropna()
ref_epi_nonna_list = list(ref_epi_nonna.index)

# Count retained regions per chromosome (these keys have no "chr" prefix)
chrom_counts = {}
for x in range(1, 23):
    chrom_counts[str(x)] = 0

for region in ref_epi_nonna_list:
    if region.split("_")[0] not in chrom_counts:
        chrom_counts[region.split("_")[0]] = 0
    chrom_counts[region.split("_")[0]] += 1

# print(chrom_counts_full)
# print(chrom_counts)
```
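The per-chromosome comparison printed below isn't produced by the cell above as written; a minimal sketch of how it could be generated from `chrom_counts_full` (keys with the "chr" prefix) and `chrom_counts` (keys without it):

```python
# For each autosome, report the total region count, the count retained for
# ref-epigenome, and the retained fraction
for c in range(1, 23):
    full = chrom_counts_full.get("chr" + str(c), 0)
    kept = chrom_counts.get(str(c), 0)
    ratio = kept / full if full else 0.0
    print("chr" + str(c), full, kept, ratio)
```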
```
chr1 2030 2023 0.996551724137931
chr2 1227 1226 0.9991850040749797
chr3 1063 1059 0.9962370649106302
chr4 752 748 0.9946808510638298
chr5 872 0 0.0
chr6 1031 0 0.0
chr7 919 0 0.0
chr8 692 691 0.9985549132947977
chr9 766 764 0.9973890339425587
chr10 715 714 0.9986013986013986
chr11 1291 1287 0.9969016266460109
chr12 1017 990 0.9734513274336283
chr13 319 319 1.0
chr14 602 600 0.9966777408637874
chr15 589 587 0.9966044142614601
chr16 842 840 0.997624703087886
chr17 1158 1155 0.9974093264248705
chr18 262 262 1.0
chr19 1432 1424 0.994413407821229
chr20 537 533 0.9925512104283054
chr21 219 218 0.9954337899543378
chr22 434 433 0.9976958525345622
```
Great. Compared with the earlier run (9/12/23), which left 8 individuals and ~23k regions, the filtered table now covers 27 individuals (plus the reference epigenome) and ~15.8k regions. This is plenty for training purposes for now.