search_enformer_variant_effects

Author

Saideep Gona

Published

November 12, 2023

Context

I have downloaded precomputed Enformer variant effect sizes from the published repo. They are stored in HDF5 files, but it is unclear how to query from them. Here I will try to figure out how to query from them and make some reusable code to do so.

Code

import h5py
import numpy as np

import os,sys

Code

path_to_data = '/project2/haky/Data/enformer_variant_effects/google-cloud-sdk'

Code

cur_file = os.path.join(path_to_data,"1000G.MAF_threshold=0.005.1.h5")

Code

with h5py.File(cur_file, 'r') as f:
    chroms = f['chr'][0:20]
    pos = f['pos'][0:20]

str(chroms[0])
chroms[0].decode('utf-8')

'chr1'

Code

print(chroms)
print(pos)

[b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1'
 b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1'
 b'chr1' b'chr1']
[10177 10235 10352 10616 10642 11008 11012 11063 13110 13116 13118 13273
 13289 13380 13483 13550 14464 14599 14604 14930]

From some basic investigating, it is clear that the variant effects are organized in ascending order by chromosome and by position. This means it should be fairly easy to create a search scheme indexed by position. The approach will be to determine the cutoff point of each file and store those in the top level search. Given a new variant, this allows the mapping of the variant to the file it would be stored in if it exists. Next, a file-specific mapping of the index of each variant can be used to look for specific sub-variants.

Code

with h5py.File(cur_file, 'r') as f:
    chroms = f['chr'][:]
    chroms = [x.decode('utf-8') for x in chroms]
    pos = f['pos'][:]

print(sys.getsizeof(chroms)/(1024**2))
print(sys.getsizeof(chroms)/(1024**2))

index_mapping = {}
for i in range(len(chroms)):
    index_mapping[(chroms[i],pos[i])] = i

18.376670837402344
18.376670837402344

Code

print(sys.getsizeof(index_mapping)/(1024**2))

80.00008392333984

It looks like it won’t take more than a GB in memory to create just a single lookup file which can be loaded into memory directly. It is probably easier to just create this then.

Code

import json

index_mapping = {}
for file_ind in range(1,23):
    print(file_ind)
    cur_file = os.path.join(path_to_data,f"1000G.MAF_threshold=0.005.{file_ind}.h5")
    with h5py.File(cur_file, 'r') as f:
        chroms = f['chr'][:]
        chroms = [x.decode('utf-8') for x in chroms]
        pos = f['pos'][:]
        alts = f['alt'][:]
        alts = [x.decode('utf-8') for x in alts]

    for i in range(len(chroms)):
        if "_".join([chroms[i],str(pos[i])]) in index_mapping:
            print((chroms[i],pos[i]))
            index_mapping["_".join([chroms[i],str(pos[i])])][alts[i]] = "_".join([str(file_ind),str(i)])
        else:
            index_mapping["_".join([chroms[i],str(pos[i])])] = {alts[i]:"_".join([str(file_ind),str(i)])}

with open("/project2/haky/Data/enformer_variant_effects/efv_index_mapping.json","w") as f:
    json.dump(index_mapping,f)

Code

print(sys.getsizeof(index_mapping)/(1024**2))

1280.0000839233398

With this mapping then, let’s try to query the effect size if it exists

Code

search_tuple = ('chr1', 14599)
print("Query location:", index_mapping[search_tuple])
query_loc = index_mapping[search_tuple]

cur_file = os.path.join(path_to_data,f"1000G.MAF_threshold=0.005.{query_loc[0]}.h5")
with h5py.File(cur_file, 'r') as f:
    # effect_size = 
    ref = f['ref'][query_loc[1]]
    alt = f['alt'][query_loc[1]]
    snp = f['snp'][query_loc[1]]

print("Loc:", query_loc)
# print("Effect size:", effect_size[query_loc[1]])
print("Ref:", ref)
print("Alt:", alt)
print("Snp:", snp)

Query location: (1, 17)
Loc: (1, 17)
Ref: b'T'
Alt: b'A'
Snp: b'rs531646671'

#TLDR

In summary, you can load in the index mapping pickle file generated above and use it to query from the HDF5 files containing variant effects. This is shown below

Code

# Load index mapping

import json
with open("/project2/haky/Data/enformer_variant_effects/efv_index_mapping.json","r") as f:
    index_mapping = json.load(f)


# Path to data

path_to_data = '/project2/haky/Data/enformer_variant_effects/google-cloud-sdk'

Code

# Function to query a variant effect

def query_variant_effect(chrom, pos, index_mapping, path_to_data):

    import h5py
    import numpy as np

    assert chrom.startswith("chr"), "chromosome should start with chr"
    assert isinstance(pos, int), "position should be an int"

    search_string = "_".join([chrom,str(pos)])

    if search_string in index_mapping:
        print("Query location:", index_mapping[search_string])
        query_loc = index_mapping[search_string]
    else:
        print("Variant not found: ", search_string)
        return None


    effect_sizes = {}
    for alt in query_loc.keys():
        cur_query_loc = [int(x) for x in query_loc[alt].split("_")]
        print(cur_query_loc)
        cur_file = os.path.join(path_to_data,f"1000G.MAF_threshold=0.005.{cur_query_loc[0]}.h5")
        with h5py.File(cur_file, 'r') as f:
            effect_sizes[alt] = f["SAD"][cur_query_loc[1]]

    return effect_sizes

Code

v_effects = query_variant_effect('chr1', 15274, index_mapping, path_to_data)
print(v_effects)

Query location: {'T': '1_23', 'G': '1_24'}
[1, 23]
[1, 24]
{'T': array([-7.5698e-05, -8.6308e-05, -1.1545e-04, ..., -1.5974e-05,
       -3.7432e-05,  2.9802e-06], dtype=float16), 'G': array([-4.339e-05, -7.796e-05, -4.411e-05, ..., -2.420e-05, -1.401e-04,
       -1.327e-04], dtype=float16)}

Code

print(v_effects['T'].shape)

(5313,)