Code
import os,sys
import numpy as np
import h5py
Saideep Gona
September 11, 2023
def updated_reindexed_dictionary(regions, h5_path, individual, dictionary):
    """Re-index one individual's regions against an HDF5 training file.

    Walks ``regions`` in order and, each time a region equals the next
    unconsumed region stored in the HDF5 file, records that running index
    under ``dictionary[region][individual]``.  Assumes the stored regions
    appear in the same relative order as ``regions``.

    Parameters:
        regions: ordered iterable of "chrom_start_end" region names.
        h5_path: path to an "<ind>_<group>_aracena.h5" file; silently
            skipped if it does not exist.
        individual: column key to fill in ``dictionary``.
        dictionary: nested mapping region -> {individual: index or None},
            mutated in place.  Returns None.
    """
    if not os.path.exists(h5_path):
        return
    with h5py.File(h5_path, 'r') as f:
        # NOTE(review): 'regions' appears to be a (1, 3, n) array of
        # chrom/start/end components — confirm against the writer.
        stored_regions_raw = f['regions'][:]
        stored_regions = []
        for i in range(stored_regions_raw.shape[-1]):
            stored_regions.append("_".join(list(stored_regions_raw[0, :, i].astype(str))))
    stored_region_index = 0
    for region in regions:
        # Guard: the original indexed stored_regions unconditionally and
        # raised IndexError once every stored region had been matched but
        # `regions` still had entries.
        if stored_region_index >= len(stored_regions):
            break
        if region == stored_regions[stored_region_index]:
            dictionary[region][individual] = stored_region_index
            stored_region_index += 1
# Input locations: chunked-sequence BED file and per-sample HDF5 training files.
sequences_bed = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/sequences_chunked.bed"
hdf5_files_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training"
samples = ['EU03_NI', 'AF30_NI', 'AF36_NI', 'AF20_Flu', 'EU13_NI', 'EU47_NI', 'EU41_NI', 'EU27_NI', 'AF18_Flu', 'AF22_NI', 'EU21_NI', 'EU15_Flu', 'EU25_Flu', 'AF22_Flu', 'EU25_NI', 'EU05_NI', 'EU15_NI', 'AF34_Flu', 'EU05_Flu', 'EU03_Flu', 'EU07_NI', 'EU07_Flu', 'AF24_NI', 'EU41_Flu', 'EU19_NI', 'EU19_Flu', 'AF16_Flu', 'EU27_Flu', 'EU21_Flu', 'EU09_NI', 'AF24_Flu', 'AF08_NI', 'AF26_Flu', 'AF38_Flu', 'AF34_NI', 'AF08_Flu', 'EU33_NI', 'EU13_Flu', 'AF26_NI', 'EU39_NI', 'AF20_NI', 'AF28_NI', 'AF12_Flu', 'AF18_NI', 'AF30_Flu', 'AF38_NI', 'AF12_NI', 'AF16_NI', 'EU47_Flu', 'EU33_Flu', 'AF36_Flu', 'EU09_Flu', 'AF28_Flu', 'EU39_Flu']
# Unique individual IDs (sample names are "<individual>_<condition>"),
# plus two bookkeeping columns used later in the remapping table.
individuals = list({s.split("_")[0] for s in samples})
individuals.extend(["ref-epigenome", "group"])
# Dataset partitions: train1..train18 plus held-out test/valid splits.
groups = ["train" + str(k) for k in range(1, 19)]
groups.extend(["test", "valid"])
print(groups)
# Parse the chunked-sequence BED file.  Each line is
# chrom<TAB>start<TAB>end<TAB>group; we accumulate:
#   grouped_regions:   group -> list of "chrN_start_end" region names
#   regions:           all region names with the leading "chr" removed
#   chrom_counts_full: per-chromosome line counts over the whole BED
grouped_regions = {group: [] for group in groups}
regions = []
chrom_counts_full = {}
with open(sequences_bed, "r") as f:
    for line in f:
        p_line = line.strip().split("\t")
        chrom = p_line[0]
        chrom_counts_full[chrom] = chrom_counts_full.get(chrom, 0) + 1
        region = "_".join(p_line[0:3])
        # Drop exactly one leading "chr".  The original used
        # region.strip("chr"), which removes ANY of the characters
        # c/h/r from BOTH ends — only coincidentally correct for
        # digit-terminated names like "chr1_100_200".
        regions.append(region[3:] if region.startswith("chr") else region)
        grouped_regions[p_line[3]].append(region)
# Skeleton of the remapping table: one row per region, one column per
# individual; every entry starts as None until matched against an HDF5 file.
remapped_table_dict = {
    region: {individual: None for individual in individuals}
    for region in regions
}
['train1', 'train2', 'train3', 'train4', 'train5', 'train6', 'train7', 'train8', 'train9', 'train10', 'train11', 'train12', 'train13', 'train14', 'train15', 'train16', 'train17', 'train18', 'test', 'valid']
['train1', 'train2', 'train3', 'train4', 'train5', 'train6', 'train7', 'train8', 'train9', 'train10', 'train11', 'train12', 'train13', 'train14', 'train15', 'train16', 'train17', 'train18', 'test', 'valid']
['AF22', 'AF38', 'EU13', 'EU47', 'EU27', 'AF26', 'AF08', 'EU41', 'EU39', 'AF12', 'AF36', 'AF34', 'EU09', 'EU19', 'EU33', 'AF28', 'EU15', 'EU05', 'AF20', 'AF24', 'AF18', 'EU07', 'AF30', 'EU21', 'EU03', 'EU25', 'AF16', 'ref-epigenome', 'group']
# Fill remapped_table_dict by scanning every per-individual, per-group HDF5
# file.  For each stored region that is also in the BED-derived superset,
# record its index within that individual's dataset plus the group it
# belongs to, so a dataloader can later find the right slice.
regions_set = set(regions)
for group in groups:
    for ind in individuals:
        h5_path = os.path.join(hdf5_files_path, ind + "_" + group + "_aracena.h5")
        if not os.path.exists(h5_path):
            # Not every individual has a file for every group (e.g. the
            # missing EU05 rows for train6/train7 in the printed output).
            continue
        with h5py.File(h5_path, 'r') as f:
            # NOTE(review): region names appear stored as a (1, 3, n) array
            # of chrom/start/end components; join them back into
            # "chrom_start_end" strings — confirm layout against the writer.
            stored_regions_raw = f['regions'][:]
            stored_regions = []
            for i in range(stored_regions_raw.shape[-1]):
                stored_regions.append("_".join(list(stored_regions_raw[0, :, i].astype(str))))
        cur_stored_region_count = len(stored_regions)
        print(cur_stored_region_count, group, ind)
        # Check if stored region is in the regions superset, and add the
        # index to the full table.  (A dead `stored_region_index = 0`
        # counter was removed here — the enumerate index is what is stored.)
        for i, stored_region in enumerate(stored_regions):
            if stored_region in regions_set:
                remapped_table_dict[stored_region][ind] = int(i)
                remapped_table_dict[stored_region]["group"] = group
1979 train1 EU47
1979 train1 EU33
1979 train1 AF28
1979 train1 EU05
1998 train1 AF20
1979 train1 AF30
1979 train1 AF16
1594 train1 ref-epigenome
1985 train2 EU47
1985 train2 EU33
1985 train2 AF28
1985 train2 EU05
1999 train2 AF20
1985 train2 AF30
1985 train2 AF16
1601 train2 ref-epigenome
1977 train3 EU47
1977 train3 EU33
1977 train3 AF28
1977 train3 EU05
1997 train3 AF20
1977 train3 AF30
1977 train3 AF16
1626 train3 ref-epigenome
1979 train4 EU47
1979 train4 EU33
1979 train4 AF28
1979 train4 EU05
1997 train4 AF20
1979 train4 AF30
1979 train4 AF16
1584 train4 ref-epigenome
1987 train5 EU47
1987 train5 EU33
1987 train5 AF28
1987 train5 EU05
1994 train5 AF20
1987 train5 AF30
1987 train5 AF16
1600 train5 ref-epigenome
1980 train6 EU47
1980 train6 EU33
1980 train6 AF28
1997 train6 AF20
1980 train6 AF30
1980 train6 AF16
1609 train6 ref-epigenome
1980 train7 EU47
1980 train7 EU33
1980 train7 AF28
1996 train7 AF20
1980 train7 AF30
1980 train7 AF16
1569 train7 ref-epigenome
1987 train8 EU47
1987 train8 EU33
1987 train8 AF28
1987 train8 EU05
1997 train8 AF20
1987 train8 AF30
1987 train8 AF16
1582 train8 ref-epigenome
1992 train9 EU47
1992 train9 EU33
1992 train9 AF28
1992 train9 EU05
1993 train9 AF20
1992 train9 AF30
1992 train9 AF16
1574 train9 ref-epigenome
1981 train10 EU47
1981 train10 EU33
1981 train10 AF28
1981 train10 EU05
2000 train10 AF20
1981 train10 AF30
1981 train10 AF16
1602 train10 ref-epigenome
1989 train11 EU47
1989 train11 EU33
1989 train11 AF28
1989 train11 EU05
1996 train11 AF20
1989 train11 AF30
1989 train11 AF16
1582 train11 ref-epigenome
1981 train12 EU47
1981 train12 EU33
1981 train12 AF28
1981 train12 EU05
1992 train12 AF20
1981 train12 AF30
1981 train12 AF16
1591 train12 ref-epigenome
1987 train13 EU47
1987 train13 EU33
1987 train13 AF28
1987 train13 EU05
1998 train13 AF20
1987 train13 AF30
1987 train13 AF16
1588 train13 ref-epigenome
1984 train14 EU47
1984 train14 EU33
1984 train14 AF28
1984 train14 EU05
1998 train14 AF20
1984 train14 AF30
1984 train14 AF16
1592 train14 ref-epigenome
1980 train15 EU47
1980 train15 EU33
1980 train15 AF28
1980 train15 EU05
1995 train15 AF20
1980 train15 AF30
1980 train15 AF16
1568 train15 ref-epigenome
1977 train16 EU47
1977 train16 EU33
1977 train16 AF28
1977 train16 EU05
1997 train16 AF20
1977 train16 AF30
1977 train16 AF16
1627 train16 ref-epigenome
1986 train17 EU47
1986 train17 EU33
1986 train17 AF28
1986 train17 EU05
1997 train17 AF20
1986 train17 AF30
1986 train17 AF16
1606 train17 ref-epigenome
21 train18 EU47
21 train18 EU33
21 train18 AF28
21 train18 EU05
21 train18 AF20
21 train18 AF30
21 train18 AF16
17 train18 ref-epigenome
1912 test EU47
1912 test EU33
1912 test AF28
1912 test EU05
1933 test AF20
1912 test AF30
1912 test AF16
1923 test ref-epigenome
2173 valid EU47
2173 valid EU33
2173 valid AF28
2173 valid EU05
2209 valid AF20
2173 valid AF30
2173 valid AF16
1915 valid ref-epigenome
(29, 38171)
Ok cool, this table contains the indices for each individual/window combo in their corresponding training dataset. We can subsequently filter out regions and individuals for which data is not present in all intended training subsets. This filtered table can be used to guide training so that no missing data is iterated over, and so the dataloader can find the correct data at each iteration.
Let’s start by including only individuals with a significant amount of data
(9, 26280)
18_928386_1059458 | 4_113630947_113762019 | 11_18427720_18558792 | 16_85805681_85936753 | 3_158386188_158517260 | 8_132158314_132289386 | 21_35639003_35770075 | 16_24521594_24652666 | 8_18647448_18778520 | 4_133441845_133572917 | ... | 14_30965924_31096996 | 11_29225411_29356483 | 14_82805352_82936424 | 14_44040470_44171542 | 15_25685343_25816415 | 19_33204702_33335774 | 14_41861379_41992451 | 19_30681544_30812616 | 14_61473198_61604270 | 2_129664471_129795543 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
EU47 | 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 1899 | 1902 | 1903 | 1905 | 1906 | 1907 | 1908 | 1909 | 1910 | 1911 |
EU33 | 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 1899 | 1902 | 1903 | 1905 | 1906 | 1907 | 1908 | 1909 | 1910 | 1911 |
AF28 | 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 1899 | 1902 | 1903 | 1905 | 1906 | 1907 | 1908 | 1909 | 1910 | 1911 |
EU05 | 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 1899 | 1902 | 1903 | 1905 | 1906 | 1907 | 1908 | 1909 | 1910 | 1911 |
AF20 | 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 1919 | 1922 | 1923 | 1925 | 1926 | 1928 | 1929 | 1930 | 1931 | 1932 |
AF30 | 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 1899 | 1902 | 1903 | 1905 | 1906 | 1907 | 1908 | 1909 | 1910 | 1911 |
AF16 | 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 1899 | 1902 | 1903 | 1905 | 1906 | 1907 | 1908 | 1909 | 1910 | 1911 |
ref-epigenome | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 1909 | 1912 | 1913 | 1915 | 1916 | 1918 | 1919 | 1920 | 1921 | 1922 |
group | train1 | train1 | train1 | train1 | train1 | train1 | train1 | train1 | train1 | train1 | ... | test | test | test | test | test | test | test | test | test | test |
9 rows × 26280 columns
# Count, per chromosome, how many regions still have a non-missing
# ref-epigenome index in the (filtered) remapping table.
ref_epi_nonna = remapped_table.loc["ref-epigenome"].dropna()
ref_epi_nonna_list = list(ref_epi_nonna.index)

# Pre-seed autosomes 1-22 with zero so chromosomes with no surviving
# regions still appear in the counts.
chrom_counts = {str(c): 0 for c in range(1, 23)}
for region in ref_epi_nonna_list:
    chrom = region.split("_")[0]
    chrom_counts[chrom] = chrom_counts.get(chrom, 0) + 1
chr1 3259 168 0.05154955507824486
chr2 3102 151 0.0486782720825274
chr3 2798 172 0.061472480343102216
chr4 2805 169 0.060249554367201426
chr5 2582 0 0.0
chr6 2380 0 0.0
chr7 2143 0 0.0
chr8 2090 125 0.05980861244019139
chr9 1637 103 0.06291997556505803
chr10 1906 129 0.0676810073452256
chr11 1740 84 0.04827586206896552
chr12 1929 109 0.05650596163815448
chr13 1324 56 0.04229607250755287
chr14 753 0 0.0
chr15 1106 40 0.03616636528028933
chr16 1063 63 0.059266227657572904
chr17 1099 54 0.04913557779799818
chr18 1052 75 0.07129277566539924
chr19 577 16 0.02772963604852686
chr20 669 26 0.03886397608370702
chr21 480 33 0.06875
chr22 467 21 0.044967880085653104
Great, in this case (9/12/23) we are left with 8 individuals and ~23k regions. This is plenty for training purposes for now.