harmonizing_training_data_tss_centered

Author

Saideep Gona

Published

October 4, 2023

Code

import os,sys
import numpy as np
import h5py

If training set is split into chunks, join the chunks together

Code

def updated_reindexed_dictionary(regions, h5_path, individual, dictionary):
    # Re-index individual

    if not os.path.exists(h5_path):
        # print(h5_path, "does not exist")
        return 

    with h5py.File(h5_path, 'r') as f:
        stored_regions_raw = f['regions'][:]
    stored_regions = []
    for i in range(stored_regions_raw.shape[-1]):
        stored_regions.append("_".join(list(stored_regions_raw[0,:,i].astype(str))))


    stored_region_index = 0
    for i, region in enumerate(regions):
        if region == stored_regions[stored_region_index]:
            # print("Matched", region, stored_regions[stored_region_index])
            dictionary[region][individual] = stored_region_index
            stored_region_index += 1

Code

# Initialize variables for iterating over

sequences_bed = "/beagle3/haky/users/saideep/github_repos/Daily-Blog-Sai/posts/2023-10-04-train_aracena_tss_predictor/tss_sequences.bed"

hdf5_files_path = "/beagle3/haky/users/saideep/projects/aracena_modeling/hdf5_training_tss_centered"

samples = ['EU03_NI', 'AF30_NI', 'AF36_NI', 'AF20_Flu', 'EU13_NI', 'EU47_NI', 'EU41_NI', 'EU27_NI', 'AF18_Flu', 'AF22_NI', 'EU21_NI', 'EU15_Flu', 'EU25_Flu', 'AF22_Flu', 'EU25_NI', 'EU05_NI', 'EU15_NI', 'AF34_Flu', 'EU05_Flu', 'EU03_Flu', 'EU07_NI', 'EU07_Flu', 'AF24_NI', 'EU41_Flu', 'EU19_NI', 'EU19_Flu', 'AF16_Flu', 'EU27_Flu', 'EU21_Flu', 'EU09_NI', 'AF24_Flu', 'AF08_NI', 'AF26_Flu', 'AF38_Flu', 'AF34_NI', 'AF08_Flu', 'EU33_NI', 'EU13_Flu', 'AF26_NI', 'EU39_NI', 'AF20_NI', 'AF28_NI', 'AF12_Flu', 'AF18_NI', 'AF30_Flu', 'AF38_NI', 'AF12_NI', 'AF16_NI', 'EU47_Flu', 'EU33_Flu', 'AF36_Flu', 'EU09_Flu', 'AF28_Flu', 'EU39_Flu']
individuals = list(set([x.split("_")[0] for x in samples]))
individuals.append("ref-epigenome")
individuals.append("group")

print(len(individuals))

groups = ["train"]
groups.append("test")
groups.append("valid")

print(groups)

grouped_regions = {}
for group in groups:
    grouped_regions[group] = []

29
['train', 'test', 'valid']

Code

# Collect regions to groups

regions = []
chrom_counts_full = {}
with open(sequences_bed, "r") as f:
    for line in f:
        if "test_small" in line:
            continue
        p_line = line.strip().split("\t")

        if p_line[0] not in chrom_counts_full:
            chrom_counts_full[p_line[0]] = 0
        chrom_counts_full[p_line[0]] += 1

        region = "_".join(p_line[0:3])
        regions.append(region.strip("chr"))
        group = p_line[3].split("_")[0]
        grouped_regions[group].append(region)

# initialize remapped table dictionary

remapped_table_dict = {}
for region in regions:
    remapped_table_dict[region] = {}
    for individual in individuals:
        remapped_table_dict[region][individual] = None

Code

# Iterate through regions identifiers in each HDF5 file, and assign index to the remapped table dictionary

regions_set = set(regions)

for group in groups:

    for ind in individuals:

        h5_path = os.path.join(hdf5_files_path,ind+"_"+group+"_aracena.h5")

        if not os.path.exists(h5_path):
            # print(h5_path, "does not exist")
            continue

        # Extract regions for the current HDF5 file (ind, data subset combo)

        with h5py.File(h5_path, 'r') as f:
            stored_regions_raw = f['regions'][:]

        

        stored_regions = []
        for i in range(stored_regions_raw.shape[-1]):
            stored_regions.append("_".join(list(stored_regions_raw[0,:,i].astype(str))))
        cur_stored_region_count = len(stored_regions)
        if ind == "ref-epigenome":
            print(cur_stored_region_count, group, ind)
        # print(cur_stored_region_count, group, ind)
        # continue
        # Check if stored region is in the regions superset, and add the index to the full table
        stored_region_index = 0
        for i, stored_region in enumerate(stored_regions):
            if stored_region in regions_set:
                remapped_table_dict[stored_region][ind] = int(i)
                remapped_table_dict[stored_region]["group"] = group

12294 train ref-epigenome
2047 test ref-epigenome
2432 valid ref-epigenome

Code

import pandas as pd

remapped_table = pd.DataFrame.from_dict(remapped_table_dict)

Code

print(remapped_table.shape)
os.path.join(hdf5_files_path ,"remapped_table_filt.csv")
remapped_table.to_csv(os.path.join(hdf5_files_path ,"remapped_table_full.csv"))

(29, 19563)

Ok cool, this table contains the indeces for each individual/window combo in their corresponding training dataset. We can subsequently filter out regions, individuals for which data is not present in all intended training subsets. This filtered table can be used to guide the training so that there is no missing data being iterated over, and so the dataloader can find the correct data at each iteration.

Let’s start by including only individuals with a significant amount of data

Code

remapped_table_filt = remapped_table.dropna(thresh=1000)

Code

remapped_table_filt = remapped_table_filt.dropna(axis=1)

Code

print(remapped_table_filt.shape)

remapped_table_filt.to_csv(os.path.join(hdf5_files_path ,"remapped_table_filt.csv"))
remapped_table_filt

(29, 15850)

	1_1471765_1471767	1_3069203_3069205	1_2412564_2412566	1_10474950_10474952	1_2476289_2476291	1_9292894_9292896	1_9069635_9069637	1_15684320_15684322	1_9943475_9943477	1_3752450_3752452	...	8_143080630_143080632	8_32548311_32548313	8_144462871_144462873	8_144465035_144465037	8_144465489_144465491	8_53251637_53251639	8_127736231_127736233	8_103415107_103415109	8_54457935_54457937	8_54616096_54616098
EU39	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU27	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF26	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU33	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF18	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF38	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF20	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF22	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF08	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF28	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU25	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF24	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU05	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF34	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU47	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU13	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF36	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU03	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF30	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF12	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU09	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU15	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU07	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU21	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
AF16	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU19	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
EU41	2995	2996	2997	2998	2999	3000	3001	3002	3003	3004	...	2035	2036	2037	2038	2039	2040	2041	2042	2043	2044
ref-epigenome	3000	3001	3002	3003	3004	3005	3006	3007	3008	3009	...	2037	2038	2039	2040	2041	2042	2043	2044	2045	2046
group	train	train	train	train	train	train	train	train	train	train	...	test	test	test	test	test	test	test	test	test	test

29 rows × 15850 columns

Code

ref_epi_nonna = remapped_table.loc["ref-epigenome"].dropna()
ref_epi_nonna_list = list(ref_epi_nonna.index)

chrom_counts = {}
for x in range(1,23):
    chrom_counts[str(x)] = 0

for region in ref_epi_nonna_list:
    if region.split("_")[0] not in chrom_counts:
        chrom_counts[region.split("_")[0]] = 0
    chrom_counts[region.split("_")[0]] += 1

# print(chrom_counts_full)
# print(chrom_counts)

Code

for x in range(1,23):
    full_count = chrom_counts_full["chr"+str(x)]
    filt_count = chrom_counts[str(x)]
    print("chr"+str(x), full_count, filt_count, filt_count/full_count)

chr1 2030 2023 0.996551724137931
chr2 1227 1226 0.9991850040749797
chr3 1063 1059 0.9962370649106302
chr4 752 748 0.9946808510638298
chr5 872 0 0.0
chr6 1031 0 0.0
chr7 919 0 0.0
chr8 692 691 0.9985549132947977
chr9 766 764 0.9973890339425587
chr10 715 714 0.9986013986013986
chr11 1291 1287 0.9969016266460109
chr12 1017 990 0.9734513274336283
chr13 319 319 1.0
chr14 602 600 0.9966777408637874
chr15 589 587 0.9966044142614601
chr16 842 840 0.997624703087886
chr17 1158 1155 0.9974093264248705
chr18 262 262 1.0
chr19 1432 1424 0.994413407821229
chr20 537 533 0.9925512104283054
chr21 219 218 0.9954337899543378
chr22 434 433 0.9976958525345622

Great, in this case (9/12/23) we are left with 8 individuals and ~23k regions. This is plenty for training purposes for now.