harmonizing_training_data

Author

Saideep Gona

Published

September 11, 2023

Code
import os

import h5py
import numpy as np
Code
def updated_reindexed_dictionary(regions, h5_path, individual, dictionary):
    """Record, for one individual, the within-file index of each region
    stored in the given HDF5 file."""

    if not os.path.exists(h5_path):
        return

    # The 'regions' dataset stores chrom/start/end triples; rebuild the
    # "chrom_start_end" string key for each stored region
    with h5py.File(h5_path, 'r') as f:
        stored_regions_raw = f['regions'][:]
    stored_regions = []
    for i in range(stored_regions_raw.shape[-1]):
        stored_regions.append("_".join(stored_regions_raw[0, :, i].astype(str)))

    # Walk the region superset in order; each match consumes the next stored
    # region, so the counter at match time is its index within the file
    stored_region_index = 0
    for region in regions:
        if stored_region_index >= len(stored_regions):
            break
        if region == stored_regions[stored_region_index]:
            dictionary[region][individual] = stored_region_index
            stored_region_index += 1
            
Code
sequences_bed = "/project2/haky/saideep/aracena_data_preprocessing/test_conversion/sequences_chunked.bed"

hdf5_files_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training"

samples = ['EU03_NI', 'AF30_NI', 'AF36_NI', 'AF20_Flu', 'EU13_NI', 'EU47_NI', 'EU41_NI', 'EU27_NI', 'AF18_Flu', 'AF22_NI', 'EU21_NI', 'EU15_Flu', 'EU25_Flu', 'AF22_Flu', 'EU25_NI', 'EU05_NI', 'EU15_NI', 'AF34_Flu', 'EU05_Flu', 'EU03_Flu', 'EU07_NI', 'EU07_Flu', 'AF24_NI', 'EU41_Flu', 'EU19_NI', 'EU19_Flu', 'AF16_Flu', 'EU27_Flu', 'EU21_Flu', 'EU09_NI', 'AF24_Flu', 'AF08_NI', 'AF26_Flu', 'AF38_Flu', 'AF34_NI', 'AF08_Flu', 'EU33_NI', 'EU13_Flu', 'AF26_NI', 'EU39_NI', 'AF20_NI', 'AF28_NI', 'AF12_Flu', 'AF18_NI', 'AF30_Flu', 'AF38_NI', 'AF12_NI', 'AF16_NI', 'EU47_Flu', 'EU33_Flu', 'AF36_Flu', 'EU09_Flu', 'AF28_Flu', 'EU39_Flu']
individuals = list(set([x.split("_")[0] for x in samples]))
individuals.append("ref-epigenome")
individuals.append("group")  # placeholder row for recording each region's train/test/valid group

groups = ["train"+str(x) for x in range(1,19)]
groups.append("test")
groups.append("valid")

print(groups)

grouped_regions = {}
for group in groups:
    grouped_regions[group] = []

regions = []
chrom_counts_full = {}
with open(sequences_bed, "r") as f:
    for line in f:
        p_line = line.strip().split("\t")

        if p_line[0] not in chrom_counts_full:
            chrom_counts_full[p_line[0]] = 0
        chrom_counts_full[p_line[0]] += 1

        region = "_".join(p_line[0:3])
        regions.append(region.strip("chr"))
        group = p_line[3]
        grouped_regions[group].append(region)

remapped_table_dict = {}
for region in regions:
    remapped_table_dict[region] = {}
    for individual in individuals:
        remapped_table_dict[region][individual] = None
['train1', 'train2', 'train3', 'train4', 'train5', 'train6', 'train7', 'train8', 'train9', 'train10', 'train11', 'train12', 'train13', 'train14', 'train15', 'train16', 'train17', 'train18', 'test', 'valid']
Code
print(groups)
print(individuals)
['train1', 'train2', 'train3', 'train4', 'train5', 'train6', 'train7', 'train8', 'train9', 'train10', 'train11', 'train12', 'train13', 'train14', 'train15', 'train16', 'train17', 'train18', 'test', 'valid']
['AF22', 'AF38', 'EU13', 'EU47', 'EU27', 'AF26', 'AF08', 'EU41', 'EU39', 'AF12', 'AF36', 'AF34', 'EU09', 'EU19', 'EU33', 'AF28', 'EU15', 'EU05', 'AF20', 'AF24', 'AF18', 'EU07', 'AF30', 'EU21', 'EU03', 'EU25', 'AF16', 'ref-epigenome', 'group']
Code
regions_set = set(regions)

for group in groups:
    for ind in individuals:

        h5_path = os.path.join(hdf5_files_path, ind + "_" + group + "_aracena.h5")

        if not os.path.exists(h5_path):
            continue

        with h5py.File(h5_path, 'r') as f:
            stored_regions_raw = f['regions'][:]

        # Rebuild "chrom_start_end" keys from the stored chrom/start/end triples
        stored_regions = []
        for i in range(stored_regions_raw.shape[-1]):
            stored_regions.append("_".join(stored_regions_raw[0, :, i].astype(str)))
        print(len(stored_regions), group, ind)

        # Check if each stored region is in the regions superset, and record
        # its within-file index (and its group) in the full table
        for i, stored_region in enumerate(stored_regions):
            if stored_region in regions_set:
                remapped_table_dict[stored_region][ind] = int(i)
                remapped_table_dict[stored_region]["group"] = group
1979 train1 EU47
1979 train1 EU33
1979 train1 AF28
1979 train1 EU05
1998 train1 AF20
1979 train1 AF30
1979 train1 AF16
1594 train1 ref-epigenome
1985 train2 EU47
1985 train2 EU33
1985 train2 AF28
1985 train2 EU05
1999 train2 AF20
1985 train2 AF30
1985 train2 AF16
1601 train2 ref-epigenome
1977 train3 EU47
1977 train3 EU33
1977 train3 AF28
1977 train3 EU05
1997 train3 AF20
1977 train3 AF30
1977 train3 AF16
1626 train3 ref-epigenome
1979 train4 EU47
1979 train4 EU33
1979 train4 AF28
1979 train4 EU05
1997 train4 AF20
1979 train4 AF30
1979 train4 AF16
1584 train4 ref-epigenome
1987 train5 EU47
1987 train5 EU33
1987 train5 AF28
1987 train5 EU05
1994 train5 AF20
1987 train5 AF30
1987 train5 AF16
1600 train5 ref-epigenome
1980 train6 EU47
1980 train6 EU33
1980 train6 AF28
1997 train6 AF20
1980 train6 AF30
1980 train6 AF16
1609 train6 ref-epigenome
1980 train7 EU47
1980 train7 EU33
1980 train7 AF28
1996 train7 AF20
1980 train7 AF30
1980 train7 AF16
1569 train7 ref-epigenome
1987 train8 EU47
1987 train8 EU33
1987 train8 AF28
1987 train8 EU05
1997 train8 AF20
1987 train8 AF30
1987 train8 AF16
1582 train8 ref-epigenome
1992 train9 EU47
1992 train9 EU33
1992 train9 AF28
1992 train9 EU05
1993 train9 AF20
1992 train9 AF30
1992 train9 AF16
1574 train9 ref-epigenome
1981 train10 EU47
1981 train10 EU33
1981 train10 AF28
1981 train10 EU05
2000 train10 AF20
1981 train10 AF30
1981 train10 AF16
1602 train10 ref-epigenome
1989 train11 EU47
1989 train11 EU33
1989 train11 AF28
1989 train11 EU05
1996 train11 AF20
1989 train11 AF30
1989 train11 AF16
1582 train11 ref-epigenome
1981 train12 EU47
1981 train12 EU33
1981 train12 AF28
1981 train12 EU05
1992 train12 AF20
1981 train12 AF30
1981 train12 AF16
1591 train12 ref-epigenome
1987 train13 EU47
1987 train13 EU33
1987 train13 AF28
1987 train13 EU05
1998 train13 AF20
1987 train13 AF30
1987 train13 AF16
1588 train13 ref-epigenome
1984 train14 EU47
1984 train14 EU33
1984 train14 AF28
1984 train14 EU05
1998 train14 AF20
1984 train14 AF30
1984 train14 AF16
1592 train14 ref-epigenome
1980 train15 EU47
1980 train15 EU33
1980 train15 AF28
1980 train15 EU05
1995 train15 AF20
1980 train15 AF30
1980 train15 AF16
1568 train15 ref-epigenome
1977 train16 EU47
1977 train16 EU33
1977 train16 AF28
1977 train16 EU05
1997 train16 AF20
1977 train16 AF30
1977 train16 AF16
1627 train16 ref-epigenome
1986 train17 EU47
1986 train17 EU33
1986 train17 AF28
1986 train17 EU05
1997 train17 AF20
1986 train17 AF30
1986 train17 AF16
1606 train17 ref-epigenome
21 train18 EU47
21 train18 EU33
21 train18 AF28
21 train18 EU05
21 train18 AF20
21 train18 AF30
21 train18 AF16
17 train18 ref-epigenome
1912 test EU47
1912 test EU33
1912 test AF28
1912 test EU05
1933 test AF20
1912 test AF30
1912 test AF16
1923 test ref-epigenome
2173 valid EU47
2173 valid EU33
2173 valid AF28
2173 valid EU05
2209 valid AF20
2173 valid AF30
2173 valid AF16
1915 valid ref-epigenome
Code
import pandas as pd

remapped_table = pd.DataFrame.from_dict(remapped_table_dict)
Code
print(remapped_table.shape)

remapped_table.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/index_files/remapped_table_full.csv")
(29, 38171)

Ok cool, this table contains the index for each individual/window combination within its corresponding training dataset. We can subsequently filter out the regions and individuals for which data is not present in all intended training subsets. The filtered table can then guide training so that no missing data is iterated over, and so the dataloader can find the correct data at each iteration.
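As a rough illustration of that lookup, here is a minimal sketch of how a dataloader might resolve one (individual, region) pair against the (filtered) table; the `targets` dataset name and the slicing layout are assumptions, not something verified against the actual HDF5 files:

Code
import os
import h5py

def load_target(remapped_table, region, individual, hdf5_files_path):
    # Which file (group) holds this region, and at what index within that file?
    group = remapped_table.loc["group", region]
    idx = int(remapped_table.loc[individual, region])
    h5_path = os.path.join(hdf5_files_path, individual + "_" + group + "_aracena.h5")
    with h5py.File(h5_path, "r") as f:
        # 'targets' is a hypothetical dataset name; the real files may differ
        return f["targets"][..., idx]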

Let’s start by keeping only the individuals with a significant amount of data (at least 1000 non-missing regions), then dropping any region that is still missing for one of the remaining rows.

Code
# Keep only rows (individuals) with at least 1000 non-missing region indices
remapped_table_filt = remapped_table.dropna(thresh=1000)
Code
# Drop any region (column) still missing for one of the remaining rows
remapped_table_filt = remapped_table_filt.dropna(axis=1)
Code
print(remapped_table_filt.shape)

remapped_table_filt.to_csv("/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/index_files/remapped_table_filt.csv")
remapped_table_filt
(9, 26280)
18_928386_1059458 4_113630947_113762019 11_18427720_18558792 16_85805681_85936753 3_158386188_158517260 8_132158314_132289386 21_35639003_35770075 16_24521594_24652666 8_18647448_18778520 4_133441845_133572917 ... 14_30965924_31096996 11_29225411_29356483 14_82805352_82936424 14_44040470_44171542 15_25685343_25816415 19_33204702_33335774 14_41861379_41992451 19_30681544_30812616 14_61473198_61604270 2_129664471_129795543
EU47 0 1 2 3 4 6 7 8 9 10 ... 1899 1902 1903 1905 1906 1907 1908 1909 1910 1911
EU33 0 1 2 3 4 6 7 8 9 10 ... 1899 1902 1903 1905 1906 1907 1908 1909 1910 1911
AF28 0 1 2 3 4 6 7 8 9 10 ... 1899 1902 1903 1905 1906 1907 1908 1909 1910 1911
EU05 0 1 2 3 4 6 7 8 9 10 ... 1899 1902 1903 1905 1906 1907 1908 1909 1910 1911
AF20 0 1 2 3 4 6 7 8 9 10 ... 1919 1922 1923 1925 1926 1928 1929 1930 1931 1932
AF30 0 1 2 3 4 6 7 8 9 10 ... 1899 1902 1903 1905 1906 1907 1908 1909 1910 1911
AF16 0 1 2 3 4 6 7 8 9 10 ... 1899 1902 1903 1905 1906 1907 1908 1909 1910 1911
ref-epigenome 0 1 2 3 4 5 6 7 8 9 ... 1909 1912 1913 1915 1916 1918 1919 1920 1921 1922
group train1 train1 train1 train1 train1 train1 train1 train1 train1 train1 ... test test test test test test test test test test

9 rows × 26280 columns

Code
# Regions for which the reference epigenome has data
ref_epi_nonna = remapped_table.loc["ref-epigenome"].dropna()
ref_epi_nonna_list = list(ref_epi_nonna.index)

# Count retained regions per chromosome (region keys look like "18_928386_1059458")
chrom_counts = {str(x): 0 for x in range(1, 23)}

for region in ref_epi_nonna_list:
    chrom = region.split("_")[0]
    if chrom not in chrom_counts:
        chrom_counts[chrom] = 0
    chrom_counts[chrom] += 1
Code
# Fraction of each chromosome's windows that survive the ref-epigenome filter
for x in range(1, 23):
    full_count = chrom_counts_full["chr" + str(x)]
    filt_count = chrom_counts[str(x)]
    print("chr" + str(x), full_count, filt_count, filt_count / full_count)
chr1 3259 168 0.05154955507824486
chr2 3102 151 0.0486782720825274
chr3 2798 172 0.061472480343102216
chr4 2805 169 0.060249554367201426
chr5 2582 0 0.0
chr6 2380 0 0.0
chr7 2143 0 0.0
chr8 2090 125 0.05980861244019139
chr9 1637 103 0.06291997556505803
chr10 1906 129 0.0676810073452256
chr11 1740 84 0.04827586206896552
chr12 1929 109 0.05650596163815448
chr13 1324 56 0.04229607250755287
chr14 753 0 0.0
chr15 1106 40 0.03616636528028933
chr16 1063 63 0.059266227657572904
chr17 1099 54 0.04913557779799818
chr18 1052 75 0.07129277566539924
chr19 577 16 0.02772963604852686
chr20 669 26 0.03886397608370702
chr21 480 33 0.06875
chr22 467 21 0.044967880085653104

Great, in this case (9/12/23) we are left with 7 individuals plus the reference epigenome across ~26k regions. Note from the counts above that the reference epigenome contributes no regions on chr5, chr6, chr7, or chr14, so those chromosomes drop out entirely. This is plenty for training purposes for now.
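For downstream use, a training script can read the filtered table back and iterate only over complete (individual, region) pairs. A minimal sketch, assuming the CSV written above:

Code
import pandas as pd

# Restore the filtered index table (rows are individuals plus the group row,
# columns are "chrom_start_end" region keys)
remapped_table_filt = pd.read_csv(
    "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training/index_files/remapped_table_filt.csv",
    index_col=0,
)

# Restrict iteration to the training groups, leaving test/valid held out
train_regions = [
    region for region in remapped_table_filt.columns
    if str(remapped_table_filt.loc["group", region]).startswith("train")
]
print(len(train_regions))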