harmonizing_training_data_tss_centered

Author

Saideep Gona

Published

October 4, 2023

Code
import os,sys
import numpy as np
import h5py

If the training set is split into chunks, join the chunks together.
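The chunk-joining step itself isn't shown here; below is a minimal sketch of what it could look like, assuming hypothetical chunk file names of the form <prefix>_chunk*.h5 and that every top-level dataset in a chunk file is indexed by region along its last axis (as the regions dataset is in the code that follows).

Code
import glob
import h5py
import numpy as np

def join_h5_chunks(chunk_glob, out_path):
    # Gather the chunk files in order (hypothetical naming scheme)
    chunk_paths = sorted(glob.glob(chunk_glob))

    # Collect each dataset's arrays across all chunks
    joined = {}
    for path in chunk_paths:
        with h5py.File(path, "r") as f:
            for key in f.keys():  # assumes all top-level objects are datasets
                joined.setdefault(key, []).append(f[key][:])

    # Write the concatenated datasets to a single joined file
    with h5py.File(out_path, "w") as out:
        for key, arrays in joined.items():
            out.create_dataset(key, data=np.concatenate(arrays, axis=-1))

# Hypothetical usage:
# join_h5_chunks("AF30_train_chunk*.h5", "AF30_train_aracena.h5")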

Code
def updated_reindexed_dictionary(regions, h5_path, individual, dictionary):
    # Record, for one individual, the within-file index of each region stored in its HDF5 file

    if not os.path.exists(h5_path):
        # print(h5_path, "does not exist")
        return dictionary

    # The 'regions' dataset stores (chromosome, start, end) triplets along its last axis
    with h5py.File(h5_path, 'r') as f:
        stored_regions_raw = f['regions'][:]
    stored_regions = []
    for i in range(stored_regions_raw.shape[-1]):
        stored_regions.append("_".join(list(stored_regions_raw[0,:,i].astype(str))))

    # Walk the full region list in order, matching against this individual's stored
    # regions and recording each match's position in the HDF5 file
    stored_region_index = 0
    for region in regions:
        if stored_region_index >= len(stored_regions):
            break
        if region == stored_regions[stored_region_index]:
            dictionary[region][individual] = stored_region_index
            stored_region_index += 1

    return dictionary
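
The [0,:,i] slicing above suggests the regions dataset holds (chromosome, start, end) triplets along its middle axis, stacked over regions along the last axis. A toy illustration of that decoding (the exact shape and contents here are assumptions, not read from a real file):

Code
# Toy array mimicking the assumed (1, 3, n_regions) layout of the 'regions' dataset
toy_regions = np.array([[[1, 1, 8],
                         [1471765, 3069203, 143080630],
                         [1471767, 3069205, 143080632]]])

decoded = ["_".join(toy_regions[0, :, i].astype(str)) for i in range(toy_regions.shape[-1])]
print(decoded)
# ['1_1471765_1471767', '1_3069203_3069205', '8_143080630_143080632']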
            
Code
# Initialize variables for the iteration below

sequences_bed = "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-04-train_aracena_tss_predictor/tss_sequences.bed"

hdf5_files_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered"

samples = ['EU03_NI', 'AF30_NI', 'AF36_NI', 'AF20_Flu', 'EU13_NI', 'EU47_NI', 'EU41_NI', 'EU27_NI', 'AF18_Flu', 'AF22_NI', 'EU21_NI', 'EU15_Flu', 'EU25_Flu', 'AF22_Flu', 'EU25_NI', 'EU05_NI', 'EU15_NI', 'AF34_Flu', 'EU05_Flu', 'EU03_Flu', 'EU07_NI', 'EU07_Flu', 'AF24_NI', 'EU41_Flu', 'EU19_NI', 'EU19_Flu', 'AF16_Flu', 'EU27_Flu', 'EU21_Flu', 'EU09_NI', 'AF24_Flu', 'AF08_NI', 'AF26_Flu', 'AF38_Flu', 'AF34_NI', 'AF08_Flu', 'EU33_NI', 'EU13_Flu', 'AF26_NI', 'EU39_NI', 'AF20_NI', 'AF28_NI', 'AF12_Flu', 'AF18_NI', 'AF30_Flu', 'AF38_NI', 'AF12_NI', 'AF16_NI', 'EU47_Flu', 'EU33_Flu', 'AF36_Flu', 'EU09_Flu', 'AF28_Flu', 'EU39_Flu']
individuals = list(set([x.split("_")[0] for x in samples]))
individuals.append("ref-epigenome")
individuals.append("group")

print(len(individuals))

groups = ["train", "test", "valid"]

print(groups)

grouped_regions = {}
for group in groups:
    grouped_regions[group] = []
29
['train', 'test', 'valid']
Code
# Collect regions and assign them to train/test/valid groups

regions = []
chrom_counts_full = {}
with open(sequences_bed, "r") as f:
    for line in f:
        if "test_small" in line:
            continue
        p_line = line.strip().split("\t")

        if p_line[0] not in chrom_counts_full:
            chrom_counts_full[p_line[0]] = 0
        chrom_counts_full[p_line[0]] += 1

        region = "_".join(p_line[0:3])
        regions.append(region.strip("chr"))
        group = p_line[3].split("_")[0]
        grouped_regions[group].append(region)

# Initialize the remapped table dictionary: region -> {individual: within-file index}

remapped_table_dict = {}
for region in regions:
    remapped_table_dict[region] = {}
    for individual in individuals:
        remapped_table_dict[region][individual] = None
Code
# Iterate through the region identifiers in each HDF5 file and record each region's index in the remapped table dictionary

regions_set = set(regions)

for group in groups:

    for ind in individuals:

        h5_path = os.path.join(hdf5_files_path,ind+"_"+group+"_aracena.h5")

        if not os.path.exists(h5_path):
            # print(h5_path, "does not exist")
            continue

        # Extract regions for the current HDF5 file (ind, data subset combo)

        with h5py.File(h5_path, 'r') as f:
            stored_regions_raw = f['regions'][:]

        

        stored_regions = []
        for i in range(stored_regions_raw.shape[-1]):
            stored_regions.append("_".join(list(stored_regions_raw[0,:,i].astype(str))))
        cur_stored_region_count = len(stored_regions)
        if ind == "ref-epigenome":
            print(cur_stored_region_count, group, ind)
        # print(cur_stored_region_count, group, ind)
        # continue
        # Check if stored region is in the regions superset, and add the index to the full table
        stored_region_index = 0
        for i, stored_region in enumerate(stored_regions):
            if stored_region in regions_set:
                remapped_table_dict[stored_region][ind] = int(i)
                remapped_table_dict[stored_region]["group"] = group
12294 train ref-epigenome
2047 test ref-epigenome
2432 valid ref-epigenome
Code
import pandas as pd

remapped_table = pd.DataFrame.from_dict(remapped_table_dict)
Code
print(remapped_table.shape)
remapped_table.to_csv(os.path.join(hdf5_files_path, "remapped_table_full.csv"))
(29, 19563)

Ok cool, this table contains the index of each individual/window combo within its corresponding training dataset. We can subsequently filter out regions and individuals for which data is not present in all intended training subsets. This filtered table can then guide training so that no missing data is iterated over, and so the dataloader can find the correct data at each iteration.
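
As a concrete example of that lookup, here is a minimal sketch of how a dataloader could resolve a (region, individual) pair to its HDF5 file and within-file index using this table. The function name, the "targets" dataset name, and the usage snippet are illustrative assumptions; only the file-naming pattern and table layout come from the code above.

Code
# Sketch: resolve one (region, individual) pair using the remapped table
def lookup_example(remapped_table, region, individual, hdf5_files_path):
    group = remapped_table.loc["group", region]         # e.g. "train"
    idx = int(remapped_table.loc[individual, region])   # row index within that individual's HDF5 file
    h5_path = os.path.join(hdf5_files_path, individual + "_" + group + "_aracena.h5")
    return h5_path, idx

# Hypothetical usage inside a dataloader's __getitem__:
# h5_path, idx = lookup_example(remapped_table, "1_1471765_1471767", "EU39", hdf5_files_path)
# with h5py.File(h5_path, "r") as f:
#     target = f["targets"][..., idx]  # "targets" is a placeholder dataset name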

Let’s start by including only individuals with a significant amount of data

Code
# Keep only rows (individuals) with at least 1000 non-missing region indices
remapped_table_filt = remapped_table.dropna(thresh=1000)
Code
# Then drop any region (column) still missing data for a retained individual
remapped_table_filt = remapped_table_filt.dropna(axis=1)
Code
print(remapped_table_filt.shape)

remapped_table_filt.to_csv(os.path.join(hdf5_files_path ,"remapped_table_filt.csv"))
remapped_table_filt
(29, 15850)
1_1471765_1471767 1_3069203_3069205 1_2412564_2412566 1_10474950_10474952 1_2476289_2476291 1_9292894_9292896 1_9069635_9069637 1_15684320_15684322 1_9943475_9943477 1_3752450_3752452 ... 8_143080630_143080632 8_32548311_32548313 8_144462871_144462873 8_144465035_144465037 8_144465489_144465491 8_53251637_53251639 8_127736231_127736233 8_103415107_103415109 8_54457935_54457937 8_54616096_54616098
EU39 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU27 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF26 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU33 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF18 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF38 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF20 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF22 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF08 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF28 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU25 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF24 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU05 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF34 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU47 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU13 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF36 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU03 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF30 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF12 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU09 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU15 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU07 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU21 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
AF16 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU19 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
EU41 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 ... 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
ref-epigenome 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 ... 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046
group train train train train train train train train train train ... test test test test test test test test test test

29 rows × 15850 columns

Code
# Regions for which the reference epigenome has a stored index
ref_epi_nonna = remapped_table.loc["ref-epigenome"].dropna()
ref_epi_nonna_list = list(ref_epi_nonna.index)

# Count retained ref-epigenome regions per chromosome
chrom_counts = {}
for x in range(1,23):
    chrom_counts[str(x)] = 0

for region in ref_epi_nonna_list:
    chrom = region.split("_")[0]
    if chrom not in chrom_counts:
        chrom_counts[chrom] = 0
    chrom_counts[chrom] += 1

# print(chrom_counts_full)
# print(chrom_counts)
Code
# Compare the retained counts to the full per-chromosome counts from the BED file
for x in range(1,23):
    full_count = chrom_counts_full["chr"+str(x)]
    filt_count = chrom_counts[str(x)]
    print("chr"+str(x), full_count, filt_count, filt_count/full_count)
chr1 2030 2023 0.996551724137931
chr2 1227 1226 0.9991850040749797
chr3 1063 1059 0.9962370649106302
chr4 752 748 0.9946808510638298
chr5 872 0 0.0
chr6 1031 0 0.0
chr7 919 0 0.0
chr8 692 691 0.9985549132947977
chr9 766 764 0.9973890339425587
chr10 715 714 0.9986013986013986
chr11 1291 1287 0.9969016266460109
chr12 1017 990 0.9734513274336283
chr13 319 319 1.0
chr14 602 600 0.9966777408637874
chr15 589 587 0.9966044142614601
chr16 842 840 0.997624703087886
chr17 1158 1155 0.9974093264248705
chr18 262 262 1.0
chr19 1432 1424 0.994413407821229
chr20 537 533 0.9925512104283054
chr21 219 218 0.9954337899543378
chr22 434 433 0.9976958525345622

Great, after filtering we are left with 27 individuals (plus the reference epigenome) and ~15.8k regions. This is plenty for training purposes for now.